Exploring the relationship between life expectancy and GNI per Capita

MATH1324 - Applied Analytics Assignment 2

Mitchell Pasmans s3853065

Last updated: 25 October, 2020

Introduction

Problem Statement

Data

Variables.

Processing data

# Reading the Data
gniLife <- read_excel("GNILifeExpectancy.xlsx", sheet = "Data")

# Excluding unwanted rows and columns
gniLife <- gniLife[1:217, 3:6]

# Reading Country metadata and joining Country metadata
metadata <- read_xlsx("Data_Extract_From_World_Development_Indicators_Metadata.xlsx", sheet = "Country - Metadata")
gniLife <- left_join(gniLife, metadata[c(1, 3, 4)], by = c("Country Code" = "Code")) 

# Simplifying column names
colnames(gniLife) <- c("Country", "Code", "GNI.per.capita.USD", "Life.Expectancy", "Income.Group", "Region")

# Changing missing values to NA
gniLife$`Life.Expectancy`[gniLife$`Life.Expectancy` == ".."] <- NA
gniLife$`GNI.per.capita.USD`[gniLife$`GNI.per.capita.USD` == ".."] <- NA

# converting to numerics
gniLife$`Life.Expectancy` <- as.numeric(gniLife$`Life.Expectancy`)
gniLife$`GNI.per.capita.USD` <- as.numeric(gniLife$`GNI.per.capita.USD`)

# Excluding NA's
gniLife <- na.omit(gniLife) 

# Converting to Factors
gniLife$Region <- factor(gniLife$Region)
gniLife$`Income.Group` <- factor(gniLife$`Income.Group`,
                                           levels = c("Low income", "Lower middle income", "Upper middle income", "High income"),
                                           labels = c("Low", "Lower Middle", "Upper Middle", "High"),
                                           ordered = TRUE)

# Head of complete dataset
head(gniLife)

Descriptive Statistics

# GNI Descriptive Statistics
summaryGNI <- gniLife %>% summarise(Variable = "GNI per Capita",
                                        Min = min(`GNI.per.capita.USD`),
                                        Q1 = quantile(`GNI.per.capita.USD`,probs = .25),
                                        Median = median(`GNI.per.capita.USD`),
                                        Q3 = quantile(`GNI.per.capita.USD`,probs = .75),
                                        Max = max(`GNI.per.capita.USD`),
                                        Mean = mean(`GNI.per.capita.USD`),
                                        IQR = IQR(`GNI.per.capita.USD`),
                                        SD = sd(`GNI.per.capita.USD`),
                                        n = n())
                                      

# Life Expectancy Descriptive Statistics
summaryLife <- gniLife %>% summarise(Variable = "Life.Expectancy",
                                        Min = min(`Life.Expectancy`),
                                        Q1 = quantile(`Life.Expectancy`,probs = .25),
                                        Median = median(`Life.Expectancy`),
                                        Q3 = quantile(`Life.Expectancy`,probs = .75),
                                        Max = max(`Life.Expectancy`),
                                        Mean = mean(`Life.Expectancy`),
                                        IQR = IQR(`Life.Expectancy`),
                                        SD = sd(`Life.Expectancy`),
                                        n = n())

summary <- rbind(summaryGNI, summaryLife)
knitr::kable(summary, align = "c", digits = 2, caption = "Summary Statistics")
Summary Statistics
Variable Min Q1 Median Q3 Max Mean IQR SD n
GNI per Capita 280.0 2017.50 5340.00 16145.00 84430.00 13916.94 14127.50 18771.78 180
Life.Expectancy 52.8 67.28 73.82 77.67 84.93 72.48 10.38 7.51 180

life expectancy Descriptive Statistics grouped by Income Group

# Life Expectancy grouped by Income Group descriptive statistics
summaryLifeByIncome <- gniLife %>% group_by(gniLife$`Income.Group`) %>% 
                                        summarise(Min = min(`Life.Expectancy`),
                                        Q1 = quantile(`Life.Expectancy`,probs = .25,na.rm = TRUE),
                                        Median = median(`Life.Expectancy`, na.rm = TRUE),
                                        Q3 = quantile(`Life.Expectancy`,probs = .75,na.rm = TRUE),
                                        Max = max(`Life.Expectancy`,na.rm = TRUE),
                                        Mean = mean(`Life.Expectancy`, na.rm = TRUE),
                                        IQR = IQR(`Life.Expectancy`),
                                        SD = sd(`Life.Expectancy`, na.rm = TRUE),
                                        n = n())

knitr::kable(summaryLifeByIncome, align = "c", digits = 2, caption = "Life Expectancy Summary Statistics by Income Group")
Life Expectancy Summary Statistics by Income Group
gniLife$Income.Group Min Q1 Median Q3 Max Mean IQR SD n
Low 52.80 60.32 61.88 64.64 70.88 62.04 4.32 4.42 24
Lower Middle 53.70 64.39 69.49 71.82 76.81 68.24 7.43 5.70 50
Upper Middle 58.40 72.20 74.18 76.52 80.10 73.42 4.32 4.26 49
High 72.84 77.60 80.89 82.45 84.93 79.79 4.85 3.20 57

Boxplots

# GNI Histograms
par(mfrow=c(1,2))  
boxplot(gniLife$`GNI.per.capita.USD`,
      main = "GNI per Capita", 
      ylab = "GNI per Capita USD",
      col = I("thistle3"))

boxplot(gniLife$`Life.Expectancy`,
      main = "Life Expectancy", 
      ylab = "Life Expectancy at Birth (Years)",
      col = I("paleturquoise3"))

Histograms

# GNI Histograms
par(mfrow=c(2,2))
plot1 <- ggplot() + geom_histogram(aes(gniLife$`GNI.per.capita.USD`, y = ..density..), bins = 20, fill = "thistle3", color = "Black" ) +
    labs(title = "GNI per Capita Histogram", x = "GNI.per.capita.USD")

log10GNI <- log10(gniLife$`GNI.per.capita.USD`)
plot2 <- ggplot() + geom_histogram(aes(log10GNI, y = ..density..), bins = 20, fill = "thistle3", color = "Black" ) +
    labs(title = "log10(GNI per Capita) Histogram", x = "log10(GNI.per.capita.USD)") +
    stat_function(fun = dnorm, args = list(mean = mean(log10GNI), sd = sd(log10GNI)))

# Life Expectancy Histograms
plot3 <- ggplot() + geom_histogram(aes(gniLife$`Life.Expectancy`, y = ..density..), bins = 20, fill = "paleturquoise3", color = "Black" ) +
    labs(title = "Life Expectancy", x = "Life.Expectancy")

lifeCubed <- gniLife$`Life.Expectancy`^ 3
plot4 <- ggplot() + geom_histogram(aes(lifeCubed, y = ..density..), bins = 20, fill = "paleturquoise3", color = "Black" ) +
    labs(title = "Life Expectancy^3 Histogram", x = "Life.Expectancy^3") +
    stat_function(fun = dnorm, args = list(mean = mean(lifeCubed), sd = sd(lifeCubed)))

grid.arrange(plot1, plot2, plot3, plot4, ncol=2)

Q-Q plots of transformed variables

# qqplots to test normality
par(mfrow=c(1,2))
log10GNI %>% qqPlot(dist="norm",
                    main = "log10(GNI per capita)",
                    col.lines = ("blue"))
## [1]  27 100
lifeCubed %>% qqPlot(dist="norm",
                      main = "Life Expectancy^3", 
                      col.lines = ("red"))

## [1] 32 93

Hypothesis Testing

# Preparing line of best fit
sum_x <- sum(log10GNI)
sum_y <- sum(lifeCubed)
sum_x_sq <- sum(log10GNI^2)
sum_y_sq <- sum(lifeCubed^2)
sum_xy <- sum(log10GNI*lifeCubed)
n <- length(log10GNI)
Lxx <-  sum_x_sq-((sum_x^2)/n)
Lyy <- sum_y_sq-((sum_y^2)/n)
Lxy = sum_xy - (((sum_x)*(sum_y))/n)
b = Lxy/Lxx
a = mean(lifeCubed - b*mean(log10GNI))

# Plotting the line
plot(lifeCubed ~ log10GNI, main = "Line of Best Fit", 
     xlab = "log10 GNI per Capita", 
     ylab = " Life Expectancy^3")
abline(a = a, b = b, col = "red")

Linear regression - F test and Multiple R-Squared

gniLifeModel <- lm(lifeCubed ~ log10GNI)
gniLifeModel %>% summary()
## 
## Call:
## lm(formula = lifeCubed ~ log10GNI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -203643  -29571    4764   36552  102128 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -214909      26178   -8.21 4.36e-14 ***
## log10GNI      161595       6872   23.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56240 on 178 degrees of freedom
## Multiple R-squared:  0.7565, Adjusted R-squared:  0.7551 
## F-statistic:   553 on 1 and 178 DF,  p-value: < 2.2e-16

Linear regression - Intercept and slope

# Slope And intercept
gniLifeModel %>% summary() %>% coef()
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -214908.7  26177.520 -8.209664 4.358542e-14
## log10GNI     161594.9   6871.854 23.515475 1.721335e-56
# Testing confidence intervals
gniLifeModel %>% confint()
##                 2.5 %    97.5 %
## (Intercept) -266566.9 -163250.4
## log10GNI     148034.1  175155.7

Linear regression - Diagnostics

# Diagnostics of fitted regression model
par(mfrow = c(2, 2))
gniLifeModel %>% plot(which=1)
gniLifeModel %>% plot(which=2)
gniLifeModel %>% plot(which=3)
gniLifeModel %>% plot(which=5)

Pearson Correlation

# Pearson Correlation
bivariate <- as.matrix(cbind(lifeCubed,log10GNI))
rcorr(bivariate, type = "pearson")
##           lifeCubed log10GNI
## lifeCubed      1.00     0.87
## log10GNI       0.87     1.00
## 
## n= 180 
## 
## 
## P
##           lifeCubed log10GNI
## lifeCubed            0      
## log10GNI   0
# Testing Confidence interval
r <- cor(lifeCubed,log10GNI)
CIr(r = r, n = 180, level = .95)
## [1] 0.8289567 0.9013615

Discussion

Discussion Continued

References

Data was taken from the https://databank.worldbank.org/source/world-development-indicators#