# Embed the report's banner image; the path is an absolute location on the
# author's machine.
knitr::include_graphics(path = "/Users/Nusrat/Desktop/Life.png")
Life expectancy is the average period that a person may expect to live. It is one of the most important measures of health. Statistics from the World Health Organization (WHO) suggest that human beings currently have an average life span of 64.3 years. The purpose of this assignment is to study various factors, at both the individual level and the country level, which might influence life expectancy across the globe. Additionally, I will look at the trend in life expectancy over fifteen years (2000 to 2015). The dataset I will be using for this study comes from the World Health Organization. The data can be accessed through the following link: https://www.kaggle.com/kumarajarshi/life-expectancy-who.
Variable Index:
Research Questions:
The research questions of this project are as follows: 1. Is life expectancy significantly impacted by the prevalence of thinness among children and adolescents aged 10 to 19 years?; 2. Does life expectancy depend on how much the government spends on public health care? To be more specific, should a country with a low life expectancy increase its health care expenditure in order to improve its average lifespan?; 3. What is the historical trend of life expectancy?; 4. Does life expectancy differ between developed and developing countries?
Hypothesis:
The hypotheses for this study are as follows: 1. Life expectancy is significantly impacted by the prevalence of thinness among children and adolescents aged 10 to 19 years; 2. Countries with a low life expectancy should increase their health care expenditure in order to improve their average lifespan; 3. Life expectancy has increased over the course of the past fifteen years; 4. Life expectancy is greater in developed countries than in developing countries.
library(dplyr)
library(magrittr)
library(tidyr)
library(haven)
library(ggplot2)
library(texreg)
library(gridExtra)
library(readr)
# Read the life-expectancy CSV from its local path into a tibble, then preview
# the first six rows.
LifeExpectancy <- read_csv(
  file = "C:/Users/Nusrat/Desktop/MA - 3rd semester, Spring 19/SOC 712 - Advanced Analytics (R)/Assignment 9 - Advanced VIsualization/LifeExpectancy.csv"
)
head(LifeExpectancy, n = 6L)
## # A tibble: 6 x 24
## Country Country_Rec Year Status Status_Rec Life_expectancy
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Afghan~ 1 2015 Devel~ 0 65
## 2 Afghan~ 1 2014 Devel~ 0 59.9
## 3 Afghan~ 1 2013 Devel~ 0 59.9
## 4 Afghan~ 1 2012 Devel~ 0 59.5
## 5 Afghan~ 1 2011 Devel~ 0 59.2
## 6 Afghan~ 1 2010 Devel~ 0 58.8
## # ... with 18 more variables: `Adult Mortality` <dbl>, `infant
## # deaths` <dbl>, Alcohol <dbl>, `percentage expenditure` <dbl>,
## # `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five
## # deaths` <dbl>, Polio <dbl>, Total_expenditure <dbl>, Diphtheria <dbl>,
## # `HIV/AIDS` <dbl>, GDP <dbl>, Population <dbl>, Thinness <dbl>,
## # `thinness 5-9 years` <dbl>, `Income composition of resources` <dbl>,
## # Schooling <dbl>
# Model 1: simple linear regression of life expectancy on thinness prevalence.
# NOTE(review): `LifeExpectancyy` (double "y") is not created anywhere in the
# visible code, which only loads `LifeExpectancy` -- confirm it is defined
# elsewhere (presumably a cleaned copy of the data) before running.
m1 <- lm(formula = Life_expectancy ~ Thinness, data = LifeExpectancyy)
summary(object = m1)
##
## Call:
## lm(formula = Life_expectancy ~ Thinness, data = LifeExpectancyy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.699 -4.607 1.501 5.466 19.650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.5500 0.2800 262.6 <2e-16 ***
## Thinness -0.8757 0.0419 -20.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.823 on 1647 degrees of freedom
## Multiple R-squared: 0.2096, Adjusted R-squared: 0.2091
## F-statistic: 436.8 on 1 and 1647 DF, p-value: < 2.2e-16
# Model 2: adds total health expenditure as a second predictor alongside
# thinness.
m2 <- lm(
  formula = Life_expectancy ~ Thinness + Total_expenditure,
  data = LifeExpectancyy
)
summary(object = m2)
##
## Call:
## lm(formula = Life_expectancy ~ Thinness + Total_expenditure,
## data = LifeExpectancyy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.759 -4.192 1.444 5.393 19.878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.51563 0.61853 115.621 < 2e-16 ***
## Thinness -0.84268 0.04269 -19.739 < 2e-16 ***
## Total_expenditure 0.31468 0.08539 3.685 0.000236 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.793 on 1646 degrees of freedom
## Multiple R-squared: 0.2161, Adjusted R-squared: 0.2151
## F-statistic: 226.9 on 2 and 1646 DF, p-value: < 2.2e-16
# Model 3: thinness, the numeric status indicator (Status_Rec) and total
# health expenditure as additive predictors.
m3 <- lm(
  formula = Life_expectancy ~ Thinness + Status_Rec + Total_expenditure,
  data = LifeExpectancyy
)
summary(object = m3)
##
## Call:
## lm(formula = Life_expectancy ~ Thinness + Status_Rec + Total_expenditure,
## data = LifeExpectancyy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.826 -4.219 1.106 5.121 18.720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.46746 0.58393 120.677 <2e-16 ***
## Thinness -0.66707 0.04167 -16.008 <2e-16 ***
## Status_Rec 8.15116 0.53949 15.109 <2e-16 ***
## Total_expenditure 0.14680 0.08081 1.817 0.0695 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.305 on 1645 degrees of freedom
## Multiple R-squared: 0.3116, Adjusted R-squared: 0.3104
## F-statistic: 248.2 on 3 and 1645 DF, p-value: < 2.2e-16
# Model 4: lets the effect of health expenditure differ by status through a
# Total_expenditure x Status_Rec interaction, controlling for thinness.
m4 <- lm(
  formula = Life_expectancy ~ Thinness + Total_expenditure * Status_Rec,
  data = LifeExpectancyy
)
summary(object = m4)
##
## Call:
## lm(formula = Life_expectancy ~ Thinness + Total_expenditure *
## Status_Rec, data = LifeExpectancyy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.815 -4.185 1.173 5.134 18.627
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.59107 0.63472 111.216 < 2e-16 ***
## Thinness -0.66825 0.04175 -16.006 < 2e-16 ***
## Total_expenditure 0.12650 0.09055 1.397 0.163
## Status_Rec 7.47508 1.46224 5.112 3.56e-07 ***
## Total_expenditure:Status_Rec 0.09921 0.19942 0.497 0.619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.307 on 1644 degrees of freedom
## Multiple R-squared: 0.3117, Adjusted R-squared: 0.31
## F-statistic: 186.1 on 4 and 1644 DF, p-value: < 2.2e-16
# Model 5: interacts thinness with health expenditure, controlling for the
# status indicator.
m5 <- lm(
  formula = Life_expectancy ~ Thinness * Total_expenditure + Status_Rec,
  data = LifeExpectancyy
)
summary(object = m5)
##
## Call:
## lm(formula = Life_expectancy ~ Thinness * Total_expenditure +
## Status_Rec, data = LifeExpectancyy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.2183 -3.8125 0.9259 5.0646 25.9413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.85283 0.71752 93.172 < 2e-16 ***
## Thinness 0.09231 0.09972 0.926 0.355
## Total_expenditure 0.79274 0.11071 7.160 1.21e-12 ***
## Status_Rec 7.48753 0.53451 14.008 < 2e-16 ***
## Thinness:Total_expenditure -0.14317 0.01715 -8.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.157 on 1644 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.338
## F-statistic: 211.4 on 4 and 1644 DF, p-value: < 2.2e-16
library(texreg)
# Side-by-side HTML regression table for the five fitted models;
# doctype = FALSE omits the DOCTYPE header so the table can be embedded.
htmlreg(
  l = list(m1, m2, m3, m4, m5),
  doctype = FALSE
)
| Model 1 | Model 2 | Model 3 | Model 4 | Model 5 | ||
|---|---|---|---|---|---|---|
| (Intercept) | 73.55*** | 71.52*** | 70.47*** | 70.59*** | 66.85*** | |
| (0.28) | (0.62) | (0.58) | (0.63) | (0.72) | ||
| Thinness | -0.88*** | -0.84*** | -0.67*** | -0.67*** | 0.09 | |
| (0.04) | (0.04) | (0.04) | (0.04) | (0.10) | ||
| Total_expenditure | 0.31*** | 0.15 | 0.13 | 0.79*** | ||
| (0.09) | (0.08) | (0.09) | (0.11) | |||
| Status_Rec | 8.15*** | 7.48*** | 7.49*** | |||
| (0.54) | (1.46) | (0.53) | ||||
| Total_expenditure:Status_Rec | 0.10 | |||||
| (0.20) | ||||||
| Thinness:Total_expenditure | -0.14*** | |||||
| (0.02) | ||||||
| R2 | 0.21 | 0.22 | 0.31 | 0.31 | 0.34 | |
| Adj. R2 | 0.21 | 0.22 | 0.31 | 0.31 | 0.34 | |
| Num. obs. | 1649 | 1649 | 1649 | 1649 | 1649 | |
| RMSE | 7.82 | 7.79 | 7.31 | 7.31 | 7.16 | |
| p < 0.001, p < 0.01, p < 0.05 | ||||||
# Compare the five fitted models by Akaike information criterion; the printed
# row labels are taken from the argument names, so the call is left as written.
AIC(m1,m2,m3,m4,m5)
df AIC m1 3 11467.90 m2 4 11456.35 m3 5 11244.06 m4 6 11245.81 m5 6 11177.61
# Compare the five fitted models by Bayesian information criterion; the printed
# row labels are taken from the argument names, so the call is left as written.
BIC(m1,m2,m3,m4,m5)
df BIC m1 3 11484.12 m2 4 11477.98 m3 5 11271.10 m4 6 11278.26 m5 6 11210.06
Relationship between Life Expectancy and Thinness
# Relationship between thinness prevalence and life expectancy.
# geom_line() joins observations in order of x, which for independent
# country-year rows draws a meaningless zigzag. A scatter plot with a fitted
# linear trend (the same relationship model m1 estimates) shows the association.
Lplot <- ggplot(
  data = LifeExpectancyy,
  aes(x = Thinness, y = Life_expectancy)
) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x)
Lplot
Distribution of Average Life Expectancy Across Countries
# Distribution of country-level average life expectancy.
#
# The previous version fitted lm(Life_expectancy ~ Status_Rec) separately for
# each country with the superseded do() verb and kept only the intercept.
# NOTE(review): Status_Rec appears to be constant within a country (confirm);
# if so, its slope is not estimable and the intercept is simply the country's
# mean life expectancy, which is computed directly here.
# The result is also no longer stored under the name `coef`, which shadowed
# stats::coef(). The column name `intc` is kept.
country_means <- LifeExpectancyy %>%
  group_by(Country_Rec) %>%
  summarise(intc = mean(Life_expectancy, na.rm = TRUE)) %>%
  ungroup()

# bins = 30 matches the previous default and silences the stat_bin() message.
ggplot(country_means, aes(x = intc)) +
  geom_histogram(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Historical Trend in Life Expectancy
library(ggplot2)
# Overall trend: a smoothed curve of life expectancy by year, with the
# individual observations drawn semi-transparently on top.
g <- ggplot(
  data = LifeExpectancy,
  mapping = aes(x = Year, y = Life_expectancy)
)
g1 <- g +
  geom_smooth(colour = "green", size = 1.5) +
  geom_point(alpha = 0.3) +
  theme_classic() +
  labs(title = "Overall Life Expectancy Over Time")
g1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Historical Trend in Life Expectancy in a Developed Country: United States
# Trend for the United States, read from a separate CSV (presumably holding
# only the US rows -- confirm against the file).
LifeExpectancyUS <- read_csv("C:/Users/Nusrat/Desktop/MA - 3rd semester, Spring 19/SOC 712 - Advanced Analytics (R)/Assignment 9 - Advanced VIsualization/LifeExpectancyUS.csv")
gg <- ggplot(LifeExpectancyUS, aes(x = Year, y = Life_expectancy))
# The second layer was a duplicated geom_smooth(alpha = .3), which redrew the
# same curve and showed no data. It is now geom_point(alpha = .3), matching
# the layering used for the overall plot g1.
g2 <- gg +
  geom_smooth(color = "blue", size = 1.5) +
  geom_point(alpha = .3) +
  theme_classic() +
  ggtitle("Life Expectancy Over Time in the US")
g2
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Historical Trend in Life Expectancy in a Developing Country: Bangladesh
# Trend for Bangladesh, read from a separate CSV (presumably holding only the
# Bangladesh rows -- confirm against the file).
LifeExpectancyBD <- read_csv("C:/Users/Nusrat/Desktop/MA - 3rd semester, Spring 19/SOC 712 - Advanced Analytics (R)/Assignment 9 - Advanced VIsualization/LifeExpectancyBD.csv")
ggg <- ggplot(LifeExpectancyBD, aes(x = Year, y = Life_expectancy))
# The second layer was a duplicated geom_smooth(alpha = .3), which redrew the
# same curve and showed no data. It is now geom_point(alpha = .3), matching
# the layering used for the overall plot g1.
g3 <- ggg +
  geom_smooth(color = "blue", size = 1.5) +
  geom_point(alpha = .3) +
  theme_classic() +
  ggtitle("Life Expectancy Over Time in BD")
g3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Overall trend next to the US trend, in two columns.
grid.arrange(grobs = list(g1, g2), ncol = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Overall trend next to the Bangladesh trend, in two columns.
grid.arrange(grobs = list(g1, g3), ncol = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# US trend next to the Bangladesh trend, in two columns.
grid.arrange(grobs = list(g2, g3), ncol = 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'