Set import libraries, working directory, and read csv

library(nlme)
library(ggplot2)
setwd("/Users/annapeterson/Desktop/Classes/GEOG6000/Lab06")
gap_mind = read.csv("/Users/annapeterson/Desktop/Classes/GEOG6000/Lab06/gapminderData5.csv")

tapply for mean life expectancy by country

µ = tapply(gap_mind$lifeExp, gap_mind$country, mean)

GGplot of all the countries and their life expectancy I ran into a lot of issues with overplotting like in this example and when i color coordinate it, the legend takes up too much space and I can no longer see the chart We can also try using facet wrap to put them all in their own rows, telling facet wrap not to shrink, and call out the fig height and width in the r markdown:



Scatter plots for Japan and France

gap_sub = subset(gap_mind, country == 'Japan')
gap_sub_2 = subset(gap_mind, country == 'France')
plot(gap_sub$year, gap_sub$lifeExp, 
     xlab = "Year", ylab = "Life expectancy (yr)",
     type ='l', col = 'blue')
lines(gap_sub_2$year, gap_sub_2$lifeExp,
      xlab = "Year", ylab = "Life Expectancy (yr)",
      type = 'l', col = 'green')
legend("bottomright", 
       legend = c("Japan", "France"),
       lty = 1, 
       col = c('blue','green'))

Centering the year around 1952, where 1952 is now year 0

gap_mind$year_center = gap_mind$year - 1952

Linear model

gap_lm = lm(lifeExp ~ year_center, data = gap_mind)
summary(gap_lm)
## 
## Call:
## lm(formula = lifeExp ~ year_center, data = gap_mind)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.949  -9.651   1.697  10.335  22.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50.51208    0.53000   95.31   <2e-16 ***
## year_center  0.32590    0.01632   19.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared:  0.1898, Adjusted R-squared:  0.1893 
## F-statistic: 398.6 on 1 and 1702 DF,  p-value: < 2.2e-16

The F-statistic is 398.6 with a p-value of 2.2x10-16. The life expectancy [intercept] in year 0 [1952] was 50.5 with a p-value of <2.2x10-16. The life expectancy will increase every year after year 0 [1952] by 0.326 with a p-value of 2x10-16.

Random intercept model

gap_mind$country = factor(gap_mind$country)
gap_lme = lme(lifeExp ~ year_center,
              random = ~ 1  | country,
              data = gap_mind)
summary(gap_lme)
## Linear mixed-effects model fit by REML
##   Data: gap_mind 
##        AIC      BIC    logLik
##   9874.263 9896.022 -4933.132
## 
## Random effects:
##  Formula: ~1 | country
##         (Intercept) Residual
## StdDev:    11.09716 3.584196
## 
## Fixed effects:  lifeExp ~ year_center 
##                Value Std.Error   DF  t-value p-value
## (Intercept) 50.51208 0.9454671 1561 53.42553       0
## year_center  0.32590 0.0050305 1561 64.78581       0
##  Correlation: 
##             (Intr)
## year_center -0.146
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -6.1691513 -0.4910339  0.1216455  0.6030002  2.4194109 
## 
## Number of Observations: 1704
## Number of Groups: 142

Fixed effect coefficients: 50.5 intercept with p-value of 0. Slope (year_center) 0.326 with p-value of 0. Standard dev: 11.097, residual standard dev: 3.58.

Calculating the ICC for the random intercept

VarCorr(gap_lme)
## country = pdLogChol(1) 
##             Variance  StdDev   
## (Intercept) 123.14688 11.097156
## Residual     12.84646  3.584196

Intercept ICC is the variance of the intercept/sum of variances

123.14688/135.9933
## [1] 0.9055364

This ICC is close to 1 which means it has similarity for the values within the same group.

Model with random intercept and slope

gap_lme_2 = lme(lifeExp ~ year_center,
               random = ~ year_center | country,
               data = gap_mind)
summary(gap_lme_2)
## Linear mixed-effects model fit by REML
##   Data: gap_mind 
##        AIC      BIC    logLik
##   8747.277 8779.915 -4367.639
## 
## Random effects:
##  Formula: ~year_center | country
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev     Corr  
## (Intercept) 12.3027957 (Intr)
## year_center  0.1582867 -0.434
## Residual     2.1807917       
## 
## Fixed effects:  lifeExp ~ year_center 
##                Value Std.Error   DF  t-value p-value
## (Intercept) 50.51208 1.0371995 1561 48.70045       0
## year_center  0.32590 0.0136312 1561 23.90865       0
##  Correlation: 
##             (Intr)
## year_center -0.439
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -8.08192929 -0.34891405  0.02690066  0.39742349  4.67388085 
## 
## Number of Observations: 1704
## Number of Groups: 142

Now to figure out which is the best model by using AIC

AIC(gap_lm, gap_lme, gap_lme_2)
## Warning in AIC.default(gap_lm, gap_lme, gap_lme_2): models are not all fitted to
## the same number of observations
##           df       AIC
## gap_lm     3 13201.732
## gap_lme    4  9874.263
## gap_lme_2  6  8747.277

The second model gap_lme_2 is the best model (8747.28) and the worst is the first model, gap_lm (13201.732)