(https://www.linkedin.com/pulse/november-data-challenge-kate-strachnyi-/)
(https://github.com/rfordatascience/tidytuesday)
Load libraries
library(tidyverse)
library(lubridate)
library(skimr)
library(RColorBrewer)
library(scales)
Import datasets
# Read both input tables from the working directory.
# stringsAsFactors is spelled out because R >= 4.0.0 no longer converts
# strings to factors by default; the factor columns are what make the
# per-level counts in the summary() calls further down possible.
malaria_deaths <- read.csv("malaria_deaths_by_age.csv", stringsAsFactors = TRUE)
# https://www.gapminder.org/data/geo/
geography <- read.csv("geography.csv", stringsAsFactors = TRUE)
Explore our data
# Column-wise preview (name, type, first values) of the malaria deaths table.
glimpse(malaria_deaths)
Observations: 30,780
Variables: 6
$ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30...
$ entity <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afgha...
$ code <fct> AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG,...
$ year <int> 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2...
$ age_group <fct> Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, U...
$ deaths <dbl> 184.6064, 191.6582, 197.1402, 207.3578, 226.2094, 236.3280, 250.8689, 257.6679, 273.0083, 272.2882, 278.3776,...
# Column-wise preview (name, type, first values) of the geography table.
glimpse(geography)
Observations: 197
Variables: 11
$ geo <fct> afg, alb, dza, and, ago, atg, arg, arm, aus, aut, aze, bhs, bhr, bgd, brb, blr, bel, blz, ...
$ name <fct> Afghanistan, Albania, Algeria, Andorra, Angola, Antigua and Barbuda, Argentina, Armenia, A...
$ four_regions <fct> asia, europe, africa, europe, africa, americas, americas, europe, asia, europe, europe, am...
$ eight_regions <fct> asia_west, europe_east, africa_north, europe_west, africa_sub_saharan, america_north, amer...
$ six_regions <fct> south_asia, europe_central_asia, middle_east_north_africa, europe_central_asia, sub_sahara...
$ members_oecd_g77 <fct> g77, others, g77, others, g77, g77, g77, others, oecd, oecd, others, g77, g77, g77, g77, o...
$ Latitude <dbl> 33.00000, 41.00000, 28.00000, 42.50779, -12.50000, 17.05000, -34.00000, 40.25000, -25.0000...
$ Longitude <dbl> 66.00000, 20.00000, 3.00000, 1.52109, 18.50000, -61.80000, -64.00000, 45.00000, 135.00000,...
$ UN.member.since <fct> 19/11/1946, 14/12/1955, 8/10/62, 28/7/1993, 1/12/76, 11/11/81, 24/10/1945, 2/3/92, 1/11/45...
$ World.bank.region <fct> South Asia, Europe & Central Asia, Middle East & North Africa, Europe & Central Asia, Sub-...
$ World.bank.income.group.2017 <fct> Low income, Upper middle income, Upper middle income, High income, Lower middle income, Hi...
Let’s clean up the data and rename some of the columns
# Discard the row-index column `X`, then relabel the five remaining columns
# (entity, code, year, age_group, deaths) with capitalised names.
malaria_deaths[["X"]] <- NULL
names(malaria_deaths) <- c("Country", "Code", "Year", "Age_Group", "Deaths")
Review our datasets before joining them
# Per-column descriptive statistics for the cleaned malaria deaths table.
malaria_deaths %>%
  summary()
Country Code Year Age_Group Deaths
Afghanistan : 135 AFG : 135 Min. :1990 15-49 :6156 Min. : 0.0
Albania : 135 AGO : 135 1st Qu.:1996 5-14 :6156 1st Qu.: 0.0
Algeria : 135 ALB : 135 Median :2003 50-69 :6156 Median : 0.1
American Samoa : 135 AND : 135 Mean :2003 70 or older:6156 Mean : 3698.6
Andean Latin America: 135 ARE : 135 3rd Qu.:2010 Under 5 :6156 3rd Qu.: 80.5
Andorra : 135 (Other):25785 Max. :2016 Max. :752025.5
(Other) :29970 NA's : 4320
Return the number of countries in “malaria_deaths”
# Count how many different values the Country column holds.
malaria_deaths %>%
  distinct(Country) %>%
  count()
Make the “Code” column lowercase. We do this as eventually we want to merge this one with the “geography” dataset
# Lower-case the codes so they match the lower-case codes used in `geography`.
malaria_deaths <- malaria_deaths %>%
  mutate(Code = tolower(Code))
Return the number of countries in “geography”
# Count the distinct country names in `geography`.
# distinct() keeps this consistent with the count taken on `malaria_deaths`:
# without it the result is the number of rows, which only equals the number of
# countries when no name is repeated.
geography %>%
  distinct(name) %>%
  count()
More data manipulation for “geography”
# Rename the join key and the country name so they line up with the column
# names used in `malaria_deaths`. Renaming by name (rather than by position
# 1:2) keeps this correct even if the column order of the CSV changes.
geography <- geography %>%
  rename(Code = geo, Country = name)
glimpse(geography)
Observations: 197
Variables: 11
$ Code <fct> afg, alb, dza, and, ago, atg, arg, arm, aus, aut, aze, bhs, bhr, bgd, brb, blr, bel, blz, ...
$ Country <fct> Afghanistan, Albania, Algeria, Andorra, Angola, Antigua and Barbuda, Argentina, Armenia, A...
$ four_regions <fct> asia, europe, africa, europe, africa, americas, americas, europe, asia, europe, europe, am...
$ eight_regions <fct> asia_west, europe_east, africa_north, europe_west, africa_sub_saharan, america_north, amer...
$ six_regions <fct> south_asia, europe_central_asia, middle_east_north_africa, europe_central_asia, sub_sahara...
$ members_oecd_g77 <fct> g77, others, g77, others, g77, g77, g77, others, oecd, oecd, others, g77, g77, g77, g77, o...
$ Latitude <dbl> 33.00000, 41.00000, 28.00000, 42.50779, -12.50000, 17.05000, -34.00000, 40.25000, -25.0000...
$ Longitude <dbl> 66.00000, 20.00000, 3.00000, 1.52109, 18.50000, -61.80000, -64.00000, 45.00000, 135.00000,...
$ UN.member.since <fct> 19/11/1946, 14/12/1955, 8/10/62, 28/7/1993, 1/12/76, 11/11/81, 24/10/1945, 2/3/92, 1/11/45...
$ World.bank.region <fct> South Asia, Europe & Central Asia, Middle East & North Africa, Europe & Central Asia, Sub-...
$ World.bank.income.group.2017 <fct> Low income, Upper middle income, Upper middle income, High income, Lower middle income, Hi...
Inner join “malaria_deaths” onto geography
# Attach the geography attributes to every malaria row, matching on Code.
# An inner join keeps only rows whose Code is present in both tables.
malaria_deaths <- inner_join(malaria_deaths, geography, by = "Code")
Column `Code` joining character vector and factor, coercing into character vector
# List the column names of the joined table.
names(malaria_deaths)
[1] "Country.x" "Code" "Year" "Age_Group"
[5] "Deaths" "Country.y" "four_regions" "eight_regions"
[9] "six_regions" "members_oecd_g77" "Latitude" "Longitude"
[13] "UN.member.since" "World.bank.region" "World.bank.income.group.2017"
Return some descriptive stats for the joined dataframe
# Per-column descriptive statistics for the joined table.
malaria_deaths %>%
  summary()
Country.x Code Year Age_Group Deaths Country.y
Afghanistan : 135 Length:25380 Min. :1990 15-49 :5076 Min. : 0.00 Afghanistan : 135
Albania : 135 Class :character 1st Qu.:1996 5-14 :5076 1st Qu.: 0.00 Albania : 135
Algeria : 135 Mode :character Median :2003 50-69 :5076 Median : 0.07 Algeria : 135
Andorra : 135 Mean :2003 70 or older:5076 Mean : 917.75 Andorra : 135
Angola : 135 3rd Qu.:2010 Under 5 :5076 3rd Qu.: 44.40 Angola : 135
Antigua and Barbuda: 135 Max. :2016 Max. :261794.56 Antigua and Barbuda: 135
(Other) :24570 (Other) :24570
four_regions eight_regions six_regions members_oecd_g77 Latitude Longitude
africa :7290 africa_sub_saharan:6480 america :4590 : 135 Min. :-42.00 Min. :-175.00
americas:4590 east_asia_pacific :3780 east_asia_pacific :3780 g77 :17415 1st Qu.: 4.00 1st Qu.: -6.50
asia :7425 asia_west :3645 europe_central_asia :6750 oecd : 4050 Median : 17.27 Median : 21.88
europe :6075 europe_east :3240 middle_east_north_africa:2700 others: 3780 Mean : 18.92 Mean : 21.00
america_north :2970 south_asia :1080 3rd Qu.: 39.75 3rd Qu.: 48.64
europe_west :2835 sub_saharan_africa :6480 Max. : 65.00 Max. : 178.00
(Other) :2430
UN.member.since World.bank.region World.bank.income.group.2017
24/10/1945: 3645 Europe & Central Asia :6615 : 0
14/12/1955: 2160 Sub-Saharan Africa :6480 High income :7020
20/9/1960 : 1890 Latin America & Caribbean :4320 Low income :4185
2/3/92 : 1080 East Asia & Pacific :3780 Lower middle income:7020
17/9/1991 : 945 Middle East & North Africa:2835 Upper middle income:7155
18/9/1962 : 540 South Asia :1080
(Other) :15120 (Other) : 270
More data manipulation
# Give the columns we keep shorter, consistent names.
# The columns are addressed by name instead of by position (1, 15, 7, 9) so
# that a change in the joined table's layout cannot silently rename the wrong
# column.
malaria_deaths <- malaria_deaths %>%
  rename(
    Country = Country.x,
    Income_Group = World.bank.income.group.2017,
    Four_Reg = four_regions,
    Six_Reg = six_regions
  )
Subset the joined dataset to get only the variables we want
# Keep only the variables used in the rest of the analysis.
# Each column is listed exactly once: repeating "Latitude" and "Longitude" in
# the selection would add redundant copies named Latitude.1 and Longitude.1.
malaria_deaths <- malaria_deaths[c(
  "Country", "Year", "Age_Group", "Deaths",
  "Four_Reg", "Six_Reg",
  "Latitude", "Longitude", "Income_Group"
)]
summary(malaria_deaths)
Country Year Age_Group Deaths Four_Reg Six_Reg
Afghanistan : 135 Min. :1990 15-49 :5076 Min. : 0.00 africa :7290 america :4590
Albania : 135 1st Qu.:1996 5-14 :5076 1st Qu.: 0.00 americas:4590 east_asia_pacific :3780
Algeria : 135 Median :2003 50-69 :5076 Median : 0.07 asia :7425 europe_central_asia :6750
Andorra : 135 Mean :2003 70 or older:5076 Mean : 917.75 europe :6075 middle_east_north_africa:2700
Angola : 135 3rd Qu.:2010 Under 5 :5076 3rd Qu.: 44.40 south_asia :1080
Antigua and Barbuda: 135 Max. :2016 Max. :261794.56 sub_saharan_africa :6480
(Other) :24570
Latitude Longitude Income_Group Latitude.1 Longitude.1
Min. :-42.00 Min. :-175.00 : 0 Min. :-42.00 Min. :-175.00
1st Qu.: 4.00 1st Qu.: -6.50 High income :7020 1st Qu.: 4.00 1st Qu.: -6.50
Median : 17.27 Median : 21.88 Low income :4185 Median : 17.27 Median : 21.88
Mean : 18.92 Mean : 21.00 Lower middle income:7020 Mean : 18.92 Mean : 21.00
3rd Qu.: 39.75 3rd Qu.: 48.64 Upper middle income:7155 3rd Qu.: 39.75 3rd Qu.: 48.64
Max. : 65.00 Max. : 178.00 Max. : 65.00 Max. : 178.00
Plot a bar plot to show total number of deaths by age group for each region
# Bar chart of total malaria deaths per world region, split by age group.
# The rows are summed per (region, age group) first: with one row per
# country/year, drawing the raw rows with position = "dodge" overplots the
# bars instead of adding them, so the chart would not show the totals the
# title promises.
# No fixed y limits are set, because rows falling outside hard limits are
# dropped from the plot. The bar width stays below 1 so neighbouring regions
# do not overlap.
malaria_deaths %>%
  group_by(Four_Reg, Age_Group) %>%
  # na.rm = TRUE so a single missing value cannot turn a whole total into NA.
  summarise(Deaths = sum(Deaths, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(x = Four_Reg, y = Deaths, fill = Age_Group)) +
  geom_col(position = "dodge", width = 0.8) +
  # Plain thousands separators instead of scientific notation on the axis.
  scale_y_continuous(labels = comma) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Total Number of Malaria Deaths for Different Age Groups by Region (1990 - 2016)",
    x = "World Regions",
    y = "Number of Deaths"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 12, face = "bold")
  )
It seems Africa exhibits an extremely high number of deaths. To see the picture more clearly, let’s look at Africa and the rest of the world separately
# Rows belonging to the "africa" region only.
africa <- malaria_deaths %>%
  filter(Four_Reg == "africa")

# Bar chart of total malaria deaths in Africa per age group.
# Deaths are summed per age group before plotting; dodged bars built from the
# raw country/year rows would be drawn on top of each other rather than added.
# No fixed y limits are set, because rows above a hard limit are dropped.
africa %>%
  group_by(Four_Reg, Age_Group) %>%
  # na.rm = TRUE so a single missing value cannot turn a whole total into NA.
  summarise(Deaths = sum(Deaths, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(x = Four_Reg, y = Deaths, fill = Age_Group)) +
  geom_col(position = "dodge") +
  # Plain thousands separators instead of scientific notation on the axis.
  scale_y_continuous(labels = comma) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Total Number of Malaria Deaths for Different Age Groups in Africa (1990 - 2016)",
    x = "Africa",
    y = "Number of Deaths"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 12, face = "bold"),
    panel.grid.major = element_blank()
  )
We can see that the group affected the most in Africa is the “Under 5” group. Now, let’s look at the rest of the world.
# Every region except "africa".
world_no_africa <- malaria_deaths %>%
  filter(Four_Reg != "africa")

# Bar chart of total malaria deaths per remaining region, split by age group.
# Deaths are summed per (region, age group) before plotting; dodged bars built
# from the raw country/year rows would be drawn on top of each other rather
# than added. No fixed y limits are set, because rows above a hard limit are
# dropped.
world_no_africa %>%
  group_by(Four_Reg, Age_Group) %>%
  # na.rm = TRUE so a single missing value cannot turn a whole total into NA.
  summarise(Deaths = sum(Deaths, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(x = Four_Reg, y = Deaths, fill = Age_Group)) +
  geom_col(position = "dodge", width = 0.8) +
  # Plain thousands separators instead of scientific notation on the axis.
  scale_y_continuous(labels = comma) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Total Number of Malaria Deaths for Different Age Groups by Region (1990 - 2016)",
    x = "World Regions",
    y = "Number of Deaths"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 12, face = "bold")
  )
We can see that the death toll in Americas and Europe is close to zero. The group affected the most in Asia is also the “Under 5” group.
Now, let’s look at the trend for the country with the highest number of malaria deaths
# Rank countries by their total malaria deaths across all years and age
# groups and show the ten highest.
# Sorting the raw rows would rank single country/year/age-group records rather
# than countries, and would print the whole table.
malaria_deaths %>%
  group_by(Country) %>%
  # na.rm = TRUE so a single missing value cannot turn a whole total into NA.
  summarise(Total_Deaths = sum(Deaths, na.rm = TRUE)) %>%
  arrange(desc(Total_Deaths)) %>%
  head(10)
The country with the highest number of malaria deaths over the years is Nigeria. Let’s zoom in on Nigeria and see the trend over the years
# Keep only the rows for Nigeria and show the first few of them.
nigeria <- filter(malaria_deaths, Country == "Nigeria")
nigeria %>%
  head()
Explore our new filtered dataset
# Column-wise preview (name, type, first values) of the Nigeria subset.
nigeria %>%
  glimpse()
Observations: 135
Variables: 11
$ Country <fct> Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeria, Nigeri...
$ Year <int> 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 200...
$ Age_Group <fct> Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under 5, Under ...
$ Deaths <dbl> 169612.234, 177084.049, 182532.594, 189569.206, 194656.088, 197338.651, 201157.357, 203241.332, 209637.81...
$ Four_Reg <fct> africa, africa, africa, africa, africa, africa, africa, africa, africa, africa, africa, africa, africa, a...
$ Six_Reg <fct> sub_saharan_africa, sub_saharan_africa, sub_saharan_africa, sub_saharan_africa, sub_saharan_africa, sub_s...
$ Latitude <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...
$ Longitude <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...
$ Income_Group <fct> Lower middle income, Lower middle income, Lower middle income, Lower middle income, Lower middle income, ...
$ Latitude.1 <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...
$ Longitude.1 <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...
# Per-column descriptive statistics for the Nigeria subset.
nigeria %>%
  summary()
Country Year Age_Group Deaths Four_Reg Six_Reg
Nigeria :135 Min. :1990 15-49 :27 Min. : 3141 africa :135 america : 0
Afghanistan : 0 1st Qu.:1996 5-14 :27 1st Qu.: 5808 americas: 0 east_asia_pacific : 0
Albania : 0 Median :2003 50-69 :27 Median : 9238 asia : 0 europe_central_asia : 0
Algeria : 0 Mean :2003 70 or older:27 Mean : 50235 europe : 0 middle_east_north_africa: 0
American Samoa : 0 3rd Qu.:2010 Under 5 :27 3rd Qu.: 20377 south_asia : 0
Andean Latin America: 0 Max. :2016 Max. :261795 sub_saharan_africa :135
(Other) : 0
Latitude Longitude Income_Group Latitude.1 Longitude.1
Min. :10 Min. :8 : 0 Min. :10 Min. :8
1st Qu.:10 1st Qu.:8 High income : 0 1st Qu.:10 1st Qu.:8
Median :10 Median :8 Low income : 0 Median :10 Median :8
Mean :10 Mean :8 Lower middle income:135 Mean :10 Mean :8
3rd Qu.:10 3rd Qu.:8 Upper middle income: 0 3rd Qu.:10 3rd Qu.:8
Max. :10 Max. :8 Max. :10 Max. :8
Plot the number of malaria deaths in Nigeria over the period 1990 - 2016 for different age groups
# Line chart of malaria deaths in Nigeria by year, one coloured line per age
# group.
ggplot(nigeria, aes(x = Year, y = Deaths, color = Age_Group)) +
  geom_line() +
  # Ask for roughly 10 year breaks and roughly 5 breaks on the deaths axis.
  scale_x_continuous(breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(breaks = pretty_breaks(n = 5)) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Total Number of Malaria Deaths for Different Age Groups in Nigeria (1990 - 2016)",
    x = "Year",
    y = "Number of Deaths"
  ) +
  # theme_minimal() must come before theme() so the overrides below survive.
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.2),
    panel.grid.major = element_blank()
  )