This analysis explores the relationship between population indicators and GDP per capita during the years 1960 to 2010. Population is characterised by two indicators: the percentage of children aged 0-4 with respect to the total population, and life expectancy at birth.


Read the data, then gather and rearrange it using functions from the tidyr library. Finally, filter the data so that every variable covers the same years and countries.

 library(ggplot2)
 library(tidyr)
 library(gridExtra) 
## Loading required package: grid
 # Point the session at the folder that holds the CSV files read below.
 # NOTE(review): this is an absolute, machine-specific path; it has to be
 # adjusted before the script can run on another computer.
 setwd("C:/Shmuel/Nanodegree/Explorer_Data_R")

 # Load the three indicator tables (GDP, life expectancy at birth, and the
 # percentage of children aged 0-4). The reads are independent of each other.
 gdp      <- read.csv(file = "GDPC.csv")
 life_exp <- read.csv(file = "life_expectancy_at_birth.csv")
 per_chil <- read.csv(file = "indicator_total0-4percen.csv")


# Reshape each table to long form: columns 2..k are gathered into a `year`
# key column and an `n` value column, leaving the first column as identifier.
GDP      <- gather(gdp,      key = "year", value = "n", 2:53)
Life_exp <- gather(life_exp, key = "year", value = "n", 2:206)
Per_chil <- gather(per_chil, key = "year", value = "n", 2:22)

# Remove the "X" from the gathered year labels and store the year as a number.
GDP[["year"]]      <- as.numeric(gsub("X", "", GDP[["year"]]))
Life_exp[["year"]] <- as.numeric(gsub("X", "", Life_exp[["year"]]))
Per_chil[["year"]] <- as.numeric(gsub("X", "", Per_chil[["year"]]))

# Copy the first column of these two tables into a column named `Total`,
# the name the later filtering and merging steps key on.
GDP[["Total"]]      <- GDP[[1]]
Life_exp[["Total"]] <- Life_exp[[1]]

# Distinct years present in each long table, taken as factor levels
# (character strings).
A1 <- levels(factor(Per_chil[["year"]]))
A2 <- levels(factor(Life_exp[["year"]]))
A3 <- levels(factor(GDP[["year"]]))

# Distinct countries (the `Total` column) present in each long table.
B1 <- levels(factor(Per_chil[["Total"]]))
B2 <- levels(factor(Life_exp[["Total"]]))
B3 <- levels(factor(GDP[["Total"]]))

# Years and countries shared by all three tables: intersect the first and
# third sets, then intersect the result with the second.
AT <- intersect(intersect(A1, A3), A2)
BT <- intersect(intersect(B1, B3), B2)

# Keep only the rows whose country and year are common to all three tables.
# For the life-expectancy and GDP tables only columns 2 to 4 are retained.
Per_chil_A <- Per_chil[
  Per_chil[["Total"]] %in% BT & Per_chil[["year"]] %in% AT,
  ,
  drop = FALSE
]
Life_exp_A <- Life_exp[
  Life_exp[["Total"]] %in% BT & Life_exp[["year"]] %in% AT,
  2:4,
  drop = FALSE
]
GDP_A <- GDP[
  GDP[["Total"]] %in% BT & GDP[["year"]] %in% AT,
  2:4,
  drop = FALSE
]

# Join the three filtered tables on (year, Total), folding left to right:
# children percentage first, then life expectancy, then GDP.
total <- Reduce(
  function(left, right) merge(left, right, by = c("year", "Total")),
  list(Per_chil_A, Life_exp_A, GDP_A)
)

# Give the value columns readable names. The first merge suffixes the two
# clashing `n` columns as `n.x` / `n.y`; the GDP value keeps the name `n`.
total[["chil"]] <- total[["n.x"]]
total[["Life"]] <- total[["n.y"]]
total[["GDP"]]  <- total[["n"]]

# Remove NA values: keep only rows in which all three indicators are present.
# Testing `n` alone only checked the GDP column, so rows with a missing
# children percentage or life expectancy were left in the merged table.
total <- subset(total, !is.na(chil) & !is.na(Life) & !is.na(GDP))

Plot each parameter as a function of year.

# Box plot of the percentage of children aged 0-4, one box per year.
# The line inside each box is the median; the per-year mean is overlaid as a
# cross (point shape 4).
ggplot(data = total, mapping = aes(x = factor(year), y = chil)) +
  geom_boxplot() +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", shape = 4) +
  xlab("Years") +
  ylab("Percent childrens [%]") +
  ggtitle("Percent of childrens age 0-4, years 1960-2010")

The percentage of children aged 0-4, averaged across all countries (median (-) and mean (x)), decreases over the years.

# Box plot of GDP, one box per year, drawn on a log-transformed y axis.
# The line inside each box is the median; the per-year mean is overlaid as a
# cross (point shape 4).
ggplot(data = total, mapping = aes(x = factor(year), y = GDP)) +
  geom_boxplot() +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", shape = 4) +
  # coord_trans() transforms the coordinate system only, so the mean above is
  # computed on the untransformed GDP values.
  coord_trans(y = "log") +
  xlab("Years") +
  ylab("log GDP [$]") +
  ggtitle("Total log GDP $, years 1960-2010")

The average GDP across all countries (median (-) and mean (x)) increases over the years, but only slightly.

# Box plot of life expectancy, one box per year.
# The line inside each box is the median; the per-year mean is overlaid as a
# cross (point shape 4).
ggplot(data = total, mapping = aes(x = factor(year), y = Life)) +
  geom_boxplot() +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", shape = 4) +
  xlab("Years") +
  ylab("life expectancy[Years]") +
  ggtitle("Life expectancy, years 1960-2010")

The average life expectancy across all countries (median (-) and mean (x)) increases over the years.


Scatter plots for each pair of parameters, together with their correlation values.

# Scatter plot of GDP (log-transformed y axis) against life expectancy,
# one point per row of `total`.
ggplot(data = total, mapping = aes(x = Life, y = GDP)) +
  coord_trans(y = "log") +
  geom_point(size = 2)

# Pearson correlation test between GDP and life expectancy, computed on the
# untransformed values (the log applies to the plot axis only).
cor.test(total$GDP, total$Life, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  total$GDP and total$Life
## t = 25.5536, df = 1362, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5322725 0.6040920
## sample estimates:
##       cor 
## 0.5692673

Life expectancy is positively correlated with GDP (r ≈ 0.57).

# Scatter plot of the percentage of children aged 0-4 (log-transformed y axis)
# against life expectancy, one point per row of `total`.
ggplot(data = total, mapping = aes(x = Life, y = chil)) +
  coord_trans(y = "log") +
  geom_point(size = 2)

# Pearson correlation test between the children percentage and life
# expectancy, computed on the untransformed values.
cor.test(total$chil, total$Life, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  total$chil and total$Life
## t = -49.008, df = 1362, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8172561 -0.7787726
## sample estimates:
##        cor 
## -0.7988302

The percentage of children aged 0-4 is negatively correlated with life expectancy (r ≈ -0.80).

# Scatter plot of GDP (log-transformed y axis) against the percentage of
# children aged 0-4, one point per row of `total`.
ggplot(data = total, mapping = aes(x = chil, y = GDP)) +
  coord_trans(y = "log") +
  geom_point(size = 2)

# Pearson correlation test between the children percentage and GDP, computed
# on the untransformed values.
cor.test(total$chil, total$GDP, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  total$chil and total$GDP
## t = -26.2949, df = 1362, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6144269 -0.5439492
## sample estimates:
##        cor 
## -0.5802734

The percentage of children aged 0-4 is also negatively correlated with GDP (r ≈ -0.58).

Data obtained from http://www.gapminder.org/data/

.

.