The average number of years a newborn child would live if current mortality patterns were to stay the same.
Source:- http://www.gapminder.org/data/
Gross Domestic Product per capita by Purchasing Power Parities (in international dollars, fixed 2005 prices). Inflation and differences in the cost of living between countries have been taken into account. Source:- http://www.gapminder.org/data/
# Work from the project's data directory so that the relative CSV paths used
# below resolve. The path is specific to the author's machine: when it does
# not exist, stay in the current directory and warn instead of aborting the
# whole script, so the analysis can still be run from wherever the CSV files
# actually live.
data_dir <- "~/Courses/Exploratory Data Analysis(Udacity)/Life expectancy"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  warning("Data directory not found: ", data_dir,
          "; reading files relative to ", getwd(), call. = FALSE)
}
getwd()
## [1] "/Users/ahada/Courses/Exploratory Data Analysis(Udacity)/Life expectancy"
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
## Loading required package: grid
library(reshape2)
library(ggthemes)
library(scales)
# Make theme_few() the default ggplot2 theme for every plot drawn after this
# point. The 12 is passed as its first argument (presumably the base font
# size -- confirm against the ggthemes documentation).
theme_set(theme_few(12))
Notes:- The final file I created after all the data munging can be accessed from the following link:- https://www.dropbox.com/s/enfsnti9ufkkndk/le_gdp_pop.csv
I have subset the population dataset to only keep data from 1950 onwards, since the data before 1950 is incomplete.
# Read the three indicator tables. check.names = FALSE keeps the column
# headers exactly as they appear in the files instead of converting them to
# syntactic R names.
life_expectancy <- read.csv("life_expectancy_at_birth.csv", header = TRUE,
                            check.names = FALSE)
gdp_per_capita <- read.csv("gdp_per_capita_ppp.csv", header = TRUE,
                           check.names = FALSE)
population <- read.csv("population_total.csv", header = TRUE,
                       check.names = FALSE)

# Drop every row that contains an NA, renumber the remaining rows from 1 and
# rename the first column to "Country".
clean_indicator <- function(raw) {
  complete <- na.omit(raw)
  complete <- data.frame(complete, row.names = NULL, check.names = FALSE)
  colnames(complete)[1] <- "Country"
  complete
}

le <- clean_indicator(life_expectancy)
gdp <- clean_indicator(gdp_per_capita)
pop <- clean_indicator(population)

# Reshape a wide table into long form with one row per Country/column pair.
# melt() names the key column "variable" and the measurement column "value";
# they are renamed to "Year" and to `value_name` respectively.
melt_indicator <- function(wide, value_name) {
  long <- melt(wide, id.vars = "Country")
  names(long)[names(long) == "variable"] <- "Year"
  names(long)[names(long) == "value"] <- value_name
  long
}

le.melted <- melt_indicator(le, "life_expectancy")
gdp.melted <- melt_indicator(gdp, "GDP_per_capita")
pop.melted <- melt_indicator(pop, "Population")
# Country-to-continent lookup table.
contins <- read.csv("country_by_continent.csv", header = TRUE)
head(contins)
## Country Continent
## 1 Algeria Africa
## 2 Angola Africa
## 3 Benin Africa
## 4 Botswana Africa
## 5 Burkina Faso Africa
## 6 Burundi Africa
# all = TRUE makes this a full outer join: countries present in only one of
# the two tables are kept, with NA in the columns coming from the other.
le_contins <- merge(le.melted, contins, by = "Country", all = TRUE)
Notes:- Exported the data frame to a CSV file. I had to edit it manually in Excel to assign continents to the countries not included in the contins data frame. I also had to manually delete some countries for which no life_expectancy, GDP_per_capita and Population information is available.
# Export the joined table, then read back the separately saved
# "le_contins_edited.csv" version of it.
write.csv(le_contins, "le_contins.csv")
le_contins_edited <- read.csv("le_contins_edited.csv", header = TRUE)

# Inner-join GDP, life expectancy/continent and population on Country + Year;
# only Country/Year pairs present in all three tables survive.
le_gdp <- merge(gdp.melted, le_contins_edited, by = c("Country", "Year"))
le_gdp_pop <- merge(le_gdp, pop.melted, by = c("Country", "Year"))

# Keep just the analysis columns, in a fixed order.
le_gdp_pop <- le_gdp_pop[c("Country", "Continent", "Year",
                           "GDP_per_capita", "life_expectancy",
                           "Population")]
head(le_gdp_pop)
## Country Continent Year GDP_per_capita life_expectancy Population
## 1 Afghanistan Asia 1950 757.3 26.67 8151455
## 2 Afghanistan Asia 1951 766.8 26.93 8276820
## 3 Afghanistan Asia 1952 779.4 27.45 8407148
## 4 Afghanistan Asia 1953 812.9 27.96 8542906
## 5 Afghanistan Asia 1954 815.4 28.48 8684494
## 6 Afghanistan Asia 1955 816.4 29.00 8832253
# Save the final merged table, then pull out the 1950 rows and summarise them.
write.csv(le_gdp_pop, "le_gdp_pop.csv")
# which() discards comparisons that evaluate to NA, as subset() does.
data1950 <- le_gdp_pop[which(le_gdp_pop$Year == 1950), ]
summary(data1950)
## Country Continent Year GDP_per_capita
## Afghanistan : 1 Africa :54 1950 :188 Min. : 283
## Albania : 1 Asia :48 1800 : 0 1st Qu.: 973
## Algeria : 1 Europe :42 1801 : 0 Median : 2043
## Angola : 1 N. America:22 1802 : 0 Mean : 3547
## Antigua and Barbuda: 1 Oceania :10 1803 : 0 3rd Qu.: 3498
## Argentina : 1 S. America:12 1804 : 0 Max. :104248
## (Other) :182 (Other): 0
## life_expectancy Population
## Min. :23.4 Min. :2.30e+04
## 1st Qu.:38.9 1st Qu.:6.52e+05
## Median :47.9 Median :2.60e+06
## Mean :48.7 Mean :1.35e+07
## 3rd Qu.:58.7 3rd Qu.:7.67e+06
## Max. :71.6 Max. :5.51e+08
##
# Rows for 2012 only; which() discards comparisons that evaluate to NA, as
# subset() does.
data2012 <- le_gdp_pop[which(le_gdp_pop$Year == 2012), ]
summary(data2012)
## Country Continent Year GDP_per_capita
## Afghanistan : 1 Africa :54 2012 :188 Min. : 403
## Albania : 1 Asia :48 1800 : 0 1st Qu.: 2355
## Algeria : 1 Europe :42 1801 : 0 Median : 6977
## Angola : 1 N. America:22 1802 : 0 Mean :12715
## Antigua and Barbuda: 1 Oceania :10 1803 : 0 3rd Qu.:17196
## Argentina : 1 S. America:12 1804 : 0 Max. :91493
## (Other) :182 (Other): 0
## life_expectancy Population
## Min. :45.3 Min. :5.73e+04
## 1st Qu.:63.9 1st Qu.:2.33e+06
## Median :72.5 Median :8.27e+06
## Mean :70.0 Mean :3.75e+07
## 3rd Qu.:76.3 3rd Qu.:2.56e+07
## Max. :83.4 Max. :1.35e+09
##
Notes:- The trend in this plot is important. We can see a clear increase in life expectancy over the years. The median life expectancy has increased from 47.94 in 1950 to 72.45 in 2012, an increase of 24.51 years.
# One box per year showing the spread of life expectancy across countries.
# The x tick labels are rotated 90 degrees so the year labels do not overlap.
ggplot(le_gdp_pop, aes(x = Year, y = life_expectancy)) +
  geom_boxplot() +
  labs(y = "Life expectancy") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Six colours handed to the manual fill/colour scales of the continent plots.
Palette <- c(
  "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00"
)
Notes:- European countries took the lead in terms of life expectancy in 1950, followed by N. American countries. While the overall life expectancy of all the continents has increased, Europe still tops the list in 2012. African nations have lagged behind.
# Box plot of life expectancy per continent for one year's data.
#   year_data  data frame with Continent and life_expectancy columns
#   plot_title title shown above the panel
#   y_window   y-axis window shared by both panels so they are comparable
plot_le_by_continent <- function(year_data, plot_title,
                                 y_window = c(23.39, 83.42)) {
  ggplot(data = year_data, aes(x = Continent, y = life_expectancy)) +
    geom_boxplot(aes(fill = Continent)) +
    ggtitle(plot_title) +
    xlab("Continent") + ylab("Life expectancy (years)") +
    # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale
    # limits, which discards observations outside the window *before* the
    # box statistics are computed (the source of the former
    # "Removed 1 rows containing non-finite values" warning).
    coord_cartesian(ylim = y_window) +
    scale_fill_manual(values = Palette)
}

p1 <- plot_le_by_continent(data1950,
                           "Life expectancy across continents Year 1950")
p2 <- plot_le_by_continent(data2012,
                           "Life expectancy across continents Year 2012")
# Stack the two years vertically.
grid.arrange(p1, p2, ncol = 1)
# Tick positions for the log-scaled GDP axis.
ybreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)

# Box plot of GDP per capita per continent (log10 y axis) for one year's data.
gdp_boxplot <- function(year_data, heading) {
  ggplot(year_data, aes(x = Continent, y = GDP_per_capita)) +
    geom_boxplot(aes(fill = Continent)) +
    labs(title = heading, x = "Continent", y = "GDP/capita fixed PPP$") +
    scale_y_log10(breaks = ybreaks, labels = comma(ybreaks)) +
    scale_fill_manual(values = Palette)
}

p3 <- gdp_boxplot(data1950, "GDP/capita across continents Year 1950")
p4 <- gdp_boxplot(data2012, "GDP/capita across continents Year 2012")
# Stack the two years vertically.
grid.arrange(p3, p4, ncol = 1)
# Mean life expectancy and mean GDP per capita for every Year/Continent pair.
grouped <- group_by(le_gdp_pop, Year, Continent)
# summarise() only peels off the last grouping level, so ungroup() is applied
# to hand on a plain, ungrouped table.
le_gdp.byYearContinent <- ungroup(
  summarise(grouped,
            le_mean = mean(life_expectancy),
            gdp_mean = mean(GDP_per_capita))
)

# Average life expectancy over time, one line per continent. group = Continent
# tells geom_line() which points belong to the same line.
ggplot(le_gdp.byYearContinent, aes(x = Year, y = le_mean,
                                   colour = Continent,
                                   group = Continent)) +
  geom_line(size = 1.5) +
  scale_color_manual(values = Palette) +
  ylab("Life expectancy (years)") +
  ggtitle("Average life expectancy of continents through time") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Average GDP per capita over time on a log10 axis, one line per continent.
ybreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(le_gdp.byYearContinent, aes(x = Year, y = gdp_mean,
                                   colour = Continent,
                                   group = Continent)) +
  geom_line(size = 1.5) +
  scale_color_manual(values = Palette) +
  scale_y_log10(breaks = ybreaks,
                labels = comma(ybreaks)) +
  ylab("Income per person (GDP/capita fixed PPP$)") +
  ggtitle("Average GDP/capita of continents through time") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
# Life expectancy against GDP per capita (log10 x axis) for 1950.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 1950),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  geom_point() +
  ggtitle("Year 1950") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale limits
  # and silently drops points outside them (the source of the former
  # "Removed 1 rows containing missing values" warning).
  coord_cartesian(ylim = c(23.39, 83.42))
# Life expectancy against GDP per capita (log10 x axis) for 2012.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 2012),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  geom_point() +
  ggtitle("Year 2012") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale limits
  # and would silently drop any point falling outside them.
  coord_cartesian(ylim = c(23.39, 83.42))
Notes:- The size of each bubble scales with the population of the country. N. American and European countries led both in terms of life expectancy and GDP/capita. African and Asian countries lagged behind.
# Bubble chart for 1950: life expectancy against GDP per capita (log10 x
# axis), filled by continent and sized from sqrt(Population / pi).
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 1950),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # Shape 21 is a circle with separate outline and fill colours.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 1950") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" instead of the reassignable, deprecated F)
  # and draw the fill legend keys at a fixed, readable size.
  guides(size = "none", fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale limits
  # and silently drops points outside them (the source of the former
  # "Removed 1 rows containing missing values" warning).
  coord_cartesian(ylim = c(23.39, 83.42))
# Bubble chart for 2012: life expectancy against GDP per capita (log10 x
# axis), filled by continent and sized from sqrt(Population / pi).
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 2012),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # Shape 21 is a circle with separate outline and fill colours.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 2012") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" instead of the reassignable, deprecated F)
  # and draw the fill legend keys at a fixed, readable size.
  guides(size = "none", fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale limits
  # and would silently drop any point falling outside them.
  coord_cartesian(ylim = c(23.39, 83.42))
Notes:- By 2012, Asian countries have moved up the ladder both in terms of life expectancy and GDP/capita.
# Bubble chart for 1950, one panel per continent: life expectancy against GDP
# per capita (log10 x axis), sized from sqrt(Population / pi).
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 1950),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # Shape 21 is a circle with separate outline and fill colours.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 1950") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" instead of the reassignable, deprecated F)
  # and draw the fill legend keys at a fixed, readable size.
  guides(size = "none", fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale limits
  # and silently drops points outside them (the source of the former
  # "Removed 1 rows containing missing values" warning).
  coord_cartesian(ylim = c(23.39, 83.42)) +
  facet_wrap(~Continent) +
  # Tilt the x tick labels so they fit inside the narrower facet panels.
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
# Bubble chart for 2012, one panel per continent: life expectancy against GDP
# per capita (log10 x axis), sized from sqrt(Population / pi).
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 2012),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # Shape 21 is a circle with separate outline and fill colours.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 2012") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" instead of the reassignable, deprecated F)
  # and draw the fill legend keys at a fixed, readable size.
  guides(size = "none", fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  # Zoom with coord_cartesian() rather than ylim(): ylim() sets scale limits
  # and would silently drop any point falling outside them.
  coord_cartesian(ylim = c(23.39, 83.42)) +
  facet_wrap(~Continent) +
  # Tilt the x tick labels so they fit inside the narrower facet panels.
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))