Life expectancy at birth (years)

The average number of years a newborn child would live if current mortality patterns were to stay the same.
Source:- http://www.gapminder.org/data/

Income per person (fixed PPP$)

Gross Domestic Product per capita by Purchasing Power Parities (in international dollars, fixed 2005 prices). Inflation and differences in the cost of living between countries have been taken into account. Source:- http://www.gapminder.org/data/

# NOTE(review): hard-coded, machine-specific working directory. The relative
# CSV paths passed to read.csv()/write.csv() further down resolve against it,
# so the script only runs as-is where this folder exists.
setwd("~/Courses/Exploratory Data Analysis(Udacity)/Life expectancy")
# Print the working directory to confirm the change took effect.
getwd()
## [1] "/Users/ahada/Courses/Exploratory Data Analysis(Udacity)/Life expectancy"
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## Loading required package: grid
library(reshape2)
library(ggthemes)
library(scales) 
# Make the ggthemes "few" theme (base font size 12) the default for all plots.
theme_set(theme_few(base_size = 12))

Notes:- The final file I created after all the data munging can be accessed from the following link:- https://www.dropbox.com/s/enfsnti9ufkkndk/le_gdp_pop.csv

Reading the csv file into a df(original Source:- gapminder.org)

I have subsetted the population dataset to include only data from 1950 onwards, since the data before 1950 is incomplete.

# Load the three Gapminder indicator tables from the working directory.
# check.names = FALSE keeps the column headers exactly as written in the CSV
# files instead of letting read.csv() rewrite them into syntactic R names.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
life_expectancy <- read.csv("life_expectancy_at_birth.csv", header = TRUE,
                            check.names = FALSE)
gdp_per_capita <- read.csv("gdp_per_capita_ppp.csv", header = TRUE,
                           check.names = FALSE)
population <- read.csv("population_total.csv", header = TRUE,
                       check.names = FALSE)

Subsetting the data to have only complete cases

# For each indicator table: drop every row that contains at least one NA,
# rebuild the data frame so its row names are renumbered from 1 (headers are
# left untouched via check.names = FALSE), and call the first column "Country".
le <- data.frame(na.omit(life_expectancy), row.names = NULL,
                 check.names = FALSE)
names(le)[1] <- "Country"

gdp <- data.frame(na.omit(gdp_per_capita), row.names = NULL,
                  check.names = FALSE)
names(gdp)[1] <- "Country"

pop <- data.frame(na.omit(population), row.names = NULL,
                  check.names = FALSE)
names(pop)[1] <- "Country"

Changing the data to long format

# Reshape each wide table (one column per year) into long format with one row
# per Country/Year pair. The argument name id.vars is spelled out in full
# (the original `id =` relied on partial argument matching), and the output
# columns are named directly through variable.name / value.name instead of
# being renamed afterwards.
le.melted <- melt(le, id.vars = "Country",
                  variable.name = "Year", value.name = "life_expectancy")
gdp.melted <- melt(gdp, id.vars = "Country",
                   variable.name = "Year", value.name = "GDP_per_capita")
pop.melted <- melt(pop, id.vars = "Country",
                   variable.name = "Year", value.name = "Population")

Reading the datafile with countries grouped by continent

# Country -> Continent lookup table. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
contins <- read.csv("country_by_continent.csv", header = TRUE)
# Show the first rows as a quick sanity check of the lookup table.
head(contins)
##        Country Continent
## 1      Algeria    Africa
## 2       Angola    Africa
## 3        Benin    Africa
## 4     Botswana    Africa
## 5 Burkina Faso    Africa
## 6      Burundi    Africa

Assigning continents to each country in the life expectancy (le) dataframe

# Full outer join on Country: all = TRUE keeps countries that appear in only
# one of the two tables (their missing side is filled with NA). TRUE is
# spelled out because T is an ordinary variable that can be reassigned.
le_contins <- merge(le.melted, contins, by = "Country", all = TRUE)

Notes:- Exported the dataframe to a csv file. I had to edit it manually in Excel to assign continents to the countries not included in the contins dataframe. I also had to manually delete some countries for which no life_expectancy, GDP_per_capita and Population information is available.

# Export the joined table so the missing continents can be filled in by hand.
write.csv(x = le_contins, file = "le_contins.csv")

Importing the edited file

# Read back the manually edited export. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
le_contins_edited <- read.csv("le_contins_edited.csv", header = TRUE)

Merging dataframes based on Country and Year. After merging the final dataset has complete information for life_expectancy, GDP_per_capita and Population for countries from 1950 onwards.

# Inner joins on the (Country, Year) pair: GDP with the continent-annotated
# life expectancy first, then that result with population. Only pairs present
# in all three tables remain.
le_gdp <- merge(x = gdp.melted, y = le_contins_edited,
                by = c("Country", "Year"))
le_gdp_pop <- merge(x = le_gdp, y = pop.melted,
                    by = c("Country", "Year"))

Reordering the columns in dataframe

# Keep only the six analysis columns, in presentation order, then preview the
# first six rows.
le_gdp_pop <- le_gdp_pop[
  ,
  c("Country", "Continent", "Year",
    "GDP_per_capita", "life_expectancy", "Population"),
  drop = FALSE
]
head(le_gdp_pop, n = 6)
##       Country Continent Year GDP_per_capita life_expectancy Population
## 1 Afghanistan      Asia 1950          757.3           26.67    8151455
## 2 Afghanistan      Asia 1951          766.8           26.93    8276820
## 3 Afghanistan      Asia 1952          779.4           27.45    8407148
## 4 Afghanistan      Asia 1953          812.9           27.96    8542906
## 5 Afghanistan      Asia 1954          815.4           28.48    8684494
## 6 Afghanistan      Asia 1955          816.4           29.00    8832253
# Save the final merged dataset to the working directory.
write.csv(x = le_gdp_pop, file = "le_gdp_pop.csv")

Peeking into the extremes of the data

# Rows for the first year in the study window (1950) and their summary.
# which() discards NA comparisons, matching what subset() does.
data1950 <- le_gdp_pop[which(le_gdp_pop[["Year"]] == 1950), ]
summary(data1950)
##                 Country         Continent       Year     GDP_per_capita  
##  Afghanistan        :  1   Africa    :54   1950   :188   Min.   :   283  
##  Albania            :  1   Asia      :48   1800   :  0   1st Qu.:   973  
##  Algeria            :  1   Europe    :42   1801   :  0   Median :  2043  
##  Angola             :  1   N. America:22   1802   :  0   Mean   :  3547  
##  Antigua and Barbuda:  1   Oceania   :10   1803   :  0   3rd Qu.:  3498  
##  Argentina          :  1   S. America:12   1804   :  0   Max.   :104248  
##  (Other)            :182                   (Other):  0                   
##  life_expectancy   Population      
##  Min.   :23.4    Min.   :2.30e+04  
##  1st Qu.:38.9    1st Qu.:6.52e+05  
##  Median :47.9    Median :2.60e+06  
##  Mean   :48.7    Mean   :1.35e+07  
##  3rd Qu.:58.7    3rd Qu.:7.67e+06  
##  Max.   :71.6    Max.   :5.51e+08  
## 
# Rows for the last year in the study window (2012) and their summary.
# which() discards NA comparisons, matching what subset() does.
data2012 <- le_gdp_pop[which(le_gdp_pop[["Year"]] == 2012), ]
summary(data2012)
##                 Country         Continent       Year     GDP_per_capita 
##  Afghanistan        :  1   Africa    :54   2012   :188   Min.   :  403  
##  Albania            :  1   Asia      :48   1800   :  0   1st Qu.: 2355  
##  Algeria            :  1   Europe    :42   1801   :  0   Median : 6977  
##  Angola             :  1   N. America:22   1802   :  0   Mean   :12715  
##  Antigua and Barbuda:  1   Oceania   :10   1803   :  0   3rd Qu.:17196  
##  Argentina          :  1   S. America:12   1804   :  0   Max.   :91493  
##  (Other)            :182                   (Other):  0                  
##  life_expectancy   Population      
##  Min.   :45.3    Min.   :5.73e+04  
##  1st Qu.:63.9    1st Qu.:2.33e+06  
##  Median :72.5    Median :8.27e+06  
##  Mean   :70.0    Mean   :3.75e+07  
##  3rd Qu.:76.3    3rd Qu.:2.56e+07  
##  Max.   :83.4    Max.   :1.35e+09  
## 

Life expectancy and GDP/capita variation across years

Notes:- The trend in this plot is important. We can see a clear increase in life expectancy over years. The median life expectancy has increased from 47.94 in 1950 to 72.45 in 2012, an increase of 24.51 years.

# One box per year showing the spread of life expectancy across countries;
# the year labels are rotated 90 degrees so they do not overlap.
ggplot(le_gdp_pop, aes(Year, life_expectancy)) +
  geom_boxplot() +
  labs(y = "Life expectancy") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

plot of chunk unnamed-chunk-18

# Six hex colours handed to the manual fill/colour scales of the plots below;
# they are unnamed, so they are assigned to the Continent levels in order.
Palette <- c(
  "#E69F00",
  "#56B4E9",
  "#009E73",
  "#F0E442",
  "#0072B2",
  "#D55E00"
)

Notes:- European countries took the lead in terms of life expectancy, followed by N. American countries in 1950. While the overall life expectancy of all the continents has increased, Europe is still at the top of the list in 2012. African nations have lagged behind.

# Common y-axis limits for both panels, taken from the data that is actually
# plotted. The former hard-coded limits c(23.39, 83.42) were rounded values;
# ylim() turns anything outside the limits into NA, which made ggplot2 drop
# one observation ("Removed 1 rows containing non-finite values").
le_limits <- range(data1950$life_expectancy, data2012$life_expectancy,
                   na.rm = TRUE)

# Life expectancy per continent in 1950.
p1 <- ggplot(data = data1950, aes(x = Continent, y = life_expectancy)) +
  geom_boxplot(aes(fill = Continent)) +
  ggtitle("Life expectancy across continents Year 1950") +
  xlab("Continent") + ylab("Life expectancy (years)") +
  ylim(le_limits) +
  scale_fill_manual(values = Palette)

# Life expectancy per continent in 2012, on the same y-axis as p1.
p2 <- ggplot(data = data2012, aes(x = Continent, y = life_expectancy)) +
  geom_boxplot(aes(fill = Continent)) +
  ggtitle("Life expectancy across continents Year 2012") +
  xlab("Continent") + ylab("Life expectancy (years)") +
  ylim(le_limits) +
  scale_fill_manual(values = Palette)

# Stack the two panels vertically so the years can be compared directly.
grid.arrange(p1, p2, ncol = 1)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-20

# Tick positions for the log10 income axis; labelled with thousands commas.
ybreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)

# GDP per capita per continent in 1950 (log10 y-axis).
p3 <- ggplot(data1950, aes(Continent, GDP_per_capita)) +
  geom_boxplot(aes(fill = Continent)) +
  labs(title = "GDP/capita across continents Year 1950",
       x = "Continent",
       y = "GDP/capita fixed PPP$") +
  scale_y_log10(breaks = ybreaks, labels = comma(ybreaks)) +
  scale_fill_manual(values = Palette)

# GDP per capita per continent in 2012 (log10 y-axis).
p4 <- ggplot(data2012, aes(Continent, GDP_per_capita)) +
  geom_boxplot(aes(fill = Continent)) +
  labs(title = "GDP/capita across continents Year 2012",
       x = "Continent",
       y = "GDP/capita fixed PPP$") +
  scale_y_log10(breaks = ybreaks, labels = comma(ybreaks)) +
  scale_fill_manual(values = Palette)

# Stack the two panels vertically.
grid.arrange(p3, p4, ncol = 1)

plot of chunk unnamed-chunk-21

# Unweighted mean life expectancy and mean GDP per capita of the countries in
# each Year/Continent cell.
grouped <- group_by(le_gdp_pop, Year, Continent)
le_gdp.byYearContinent <- summarise(
  grouped,
  le_mean = mean(life_expectancy),
  gdp_mean = mean(GDP_per_capita)
)

# One line per continent; group = Continent tells geom_line which points to
# connect along the Year axis.
ggplot(le_gdp.byYearContinent,
       aes(x = Year, y = le_mean, group = Continent, colour = Continent)) +
  geom_line(size = 1.5) +
  scale_colour_manual(values = Palette) +
  labs(title = "Average life expectancy of continents through time",
       y = "Life expectancy (years)") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

plot of chunk unnamed-chunk-22

# Tick positions for the log10 income axis; labelled with thousands commas.
ybreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)

# Mean GDP per capita through time, one line per continent, log10 y-axis.
ggplot(le_gdp.byYearContinent,
       aes(x = Year, y = gdp_mean, group = Continent, colour = Continent)) +
  geom_line(size = 1.5) +
  scale_colour_manual(values = Palette) +
  scale_y_log10(breaks = ybreaks, labels = comma(ybreaks)) +
  labs(title = "Average GDP/capita of continents through time",
       y = "Income per person (GDP/capita fixed PPP$)") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

plot of chunk unnamed-chunk-22

Correlation between Life expectancy and GDP/capita

# Tick positions for the log10 income axis; labelled with thousands commas.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
# y-axis limits shared by the 1950 and 2012 scatter plots, computed from the
# life expectancy values of those two years. The former hard-coded limits
# c(23.39, 83.42) were rounded and made ggplot2 drop one 1950 observation
# ("Removed 1 rows containing missing values").
le_limits <- range(
  subset(le_gdp_pop, Year %in% c(1950, 2012))$life_expectancy,
  na.rm = TRUE
)
# Life expectancy against income per person, one point per country, 1950.
ggplot(data = subset(le_gdp_pop, Year == 1950),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  geom_point() +
  ggtitle("Year 1950") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  ylim(le_limits)
## Warning: Removed 1 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-23

# Tick positions for the log10 income axis; labelled with thousands commas.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
# y-axis limits shared by the 1950 and 2012 scatter plots, computed from the
# life expectancy values of those two years instead of the rounded magic
# numbers c(23.39, 83.42), so no observation can fall outside the axis.
le_limits <- range(
  subset(le_gdp_pop, Year %in% c(1950, 2012))$life_expectancy,
  na.rm = TRUE
)
# Life expectancy against income per person, one point per country, 2012.
ggplot(data = subset(le_gdp_pop, Year == 2012),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  geom_point() +
  ggtitle("Year 2012") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  ylim(le_limits)

plot of chunk unnamed-chunk-24

Correlation between Life expectancy and GDP/capita. Population of countries taken into account

Notes:- The size of the bubble is directly proportional to the Population of the country. N. American and European countries led both in terms of life expectancy and GDP/capita. African and Asian countries lagged behind.

# Tick positions for the log10 income axis; labelled with thousands commas.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
# y-axis limits shared by the 1950 and 2012 bubble charts, computed from the
# life expectancy values of those two years. The former hard-coded limits
# c(23.39, 83.42) were rounded and made ggplot2 drop one 1950 observation
# ("Removed 1 rows containing missing values").
le_limits <- range(
  subset(le_gdp_pop, Year %in% c(1950, 2012))$life_expectancy,
  na.rm = TRUE
)
# Bubble chart for 1950: one bubble per country, filled by continent and
# sized from its population.
ggplot(data = subset(le_gdp_pop, Year == 1950),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # pch = 21 is a circle with a separate fill, so `fill` carries the colour.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 1950") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" replaces the reassignable shorthand F) and
  # draw the continent legend keys at a fixed, readable size.
  guides(size = "none",
         fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  ylim(le_limits)
## Warning: Removed 1 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-25

xbreaks = c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
ggplot(data = subset(le_gdp_pop, Year == 2012), 
       aes(x = GDP_per_capita, y = life_expectancy)) + 
  geom_point(aes(fill = Continent, size = sqrt(Population/pi)), 
             pch = 21) + 
  ggtitle("Year 2012") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range=c(1,30)) +
  guides(size = F, fill = guide_legend(override.aes = list(size=5))) +
  scale_fill_manual(values= Palette) +
  ylim(c(23.39, 83.42))

plot of chunk unnamed-chunk-26

Notes:- By 2012 Asian countries have made their way up the ladder both in terms of life expectancy and GDP/capita.

# Tick positions for the log10 income axis; labelled with thousands commas.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
# y-axis limits shared by the 1950 and 2012 faceted charts, computed from the
# life expectancy values of those two years. The former hard-coded limits
# c(23.39, 83.42) were rounded and made ggplot2 drop one 1950 observation
# ("Removed 1 rows containing missing values").
le_limits <- range(
  subset(le_gdp_pop, Year %in% c(1950, 2012))$life_expectancy,
  na.rm = TRUE
)
# 1950 bubble chart split into one panel per continent.
ggplot(data = subset(le_gdp_pop, Year == 1950),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # pch = 21 is a circle with a separate fill, so `fill` carries the colour.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 1950") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" replaces the reassignable shorthand F) and
  # draw the continent legend keys at a fixed, readable size.
  guides(size = "none",
         fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  ylim(le_limits) +
  facet_wrap(~Continent) +
  # Tilt the income labels so they fit inside the narrower facet panels.
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
## Warning: Removed 1 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-27

# Tick positions for the log10 income axis; labelled with thousands commas.
xbreaks <- c(200, 400, 1000, 2000, 4000, 10000, 20000, 40000)
# y-axis limits shared by the 1950 and 2012 faceted charts, computed from the
# life expectancy values of those two years instead of the rounded magic
# numbers c(23.39, 83.42), so no observation can fall outside the axis.
le_limits <- range(
  subset(le_gdp_pop, Year %in% c(1950, 2012))$life_expectancy,
  na.rm = TRUE
)
# 2012 bubble chart split into one panel per continent.
ggplot(data = subset(le_gdp_pop, Year == 2012),
       aes(x = GDP_per_capita, y = life_expectancy)) +
  # pch = 21 is a circle with a separate fill, so `fill` carries the colour.
  geom_point(aes(fill = Continent, size = sqrt(Population / pi)),
             pch = 21) +
  ggtitle("Year 2012") +
  xlab("Income per person (GDP/capita fixed PPP$)") +
  ylab("Life expectancy (years)") +
  scale_x_log10(breaks = xbreaks,
                labels = comma(xbreaks)) +
  scale_size_continuous(range = c(1, 30)) +
  # Hide the size legend ("none" replaces the reassignable shorthand F) and
  # draw the continent legend keys at a fixed, readable size.
  guides(size = "none",
         fill = guide_legend(override.aes = list(size = 5))) +
  scale_fill_manual(values = Palette) +
  ylim(le_limits) +
  facet_wrap(~Continent) +
  # Tilt the income labels so they fit inside the narrower facet panels.
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

plot of chunk unnamed-chunk-28