Data prep

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
load_chi2018 <- function(var_select = c("water_score", "sanitation_score", "child_mortality", "Year", "ISO3", "CountryName")) {
  
  
  library(tidyr)
  library(dplyr)
  library(data.table)
  
  
  set.seed(1234567890)
 
  df = read.csv("https://raw.githubusercontent.com/davidblumenstiel/CUNY-MSDS-DATA-621/main/Final_Project/chi-2018.csv")
  
  #Took some code from: https://stackoverflow.com/questions/50010196/replacing-na-values-from-another-dataframe-by-id
  #and https://stackoverflow.com/questions/25908772/r-column-mean-by-factor
  
  x1 = df %>%
    pivot_longer(
      cols = starts_with("wat_"), 
      names_to = "Year",
      names_prefix = "wat_",
      values_to = "water_score"
    ) 
    x1 = x1 %>% 
      left_join(setDT(x1)[, mean(water_score, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
      mutate(water_score = ifelse(is.na(water_score), V1, water_score)) %>% #Replace NA with mean of values if available
      select("Year", "water_score", "CountryName")   

  
  x2 = df %>%
    pivot_longer(
      cols = starts_with("san_"), 
      names_to = "Year",
      names_prefix = "san_",
      values_to = "sanitation_score"
    ) 
  x2 = x2 %>%
    left_join(setDT(x2)[, mean(sanitation_score, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
    mutate(sanitation_score = ifelse(is.na(sanitation_score), V1, sanitation_score)) %>% #Replace NA with mean of values if available
    select("Year", "sanitation_score", "CountryName")    
  
    
  
  x3 = df %>%
    pivot_longer(
      cols = starts_with("chmort_"), 
      names_to = "Year",
      names_prefix = "chmort_",
      values_to = "child_mortality"
    ) 
  x3 = x3 %>%
    left_join(setDT(x3)[, mean(child_mortality, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
    mutate(child_mortality = ifelse(is.na(child_mortality), V1, child_mortality)) %>% #Replace NA with mean of values if available
    select("Year", "child_mortality", "CountryName")   
  
  
    
  x4 = df %>%
    pivot_longer(
      cols = starts_with("mortality_"), 
      names_to = "Year",
      names_prefix = "mortality_",
      values_to = "mortality_score"
    ) 
  x4 = x4 %>%
    left_join(setDT(x4)[, mean(mortality_score, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
    mutate(mortality_score = ifelse(is.na(mortality_score), V1, mortality_score)) %>% #Replace NA with mean of values if available
    select("Year", "mortality_score", "CountryName")   
    
  x5 = df %>%
    pivot_longer(
      cols = starts_with("CHI_v2018_"), 
      names_to = "Year",
      names_prefix = "CHI_v2018_",
      values_to = "CHI_v2018"
    ) 
  x5 = x5 %>%
    left_join(setDT(x5)[, mean(CHI_v2018, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
    mutate(CHI_v2018 = ifelse(is.na(CHI_v2018), V1, CHI_v2018)) %>% #Replace NA with mean of values if available
    select("Year", "CHI_v2018", "CountryName")   
  
  out = x1 %>% merge(x2, by = c("CountryName", "Year")) %>%
    merge(x3, by = c("CountryName", "Year")) %>%
    merge(x4, by = c("CountryName", "Year")) %>%
    merge(x5, by = c("CountryName", "Year"))
  
  out = as.data.frame(out)
 
 
  #Adds back ISO3 abbreviations 
  out <- out %>% merge(x = out, y = df[,1:2], by.x = "CountryName", by.y = "CountryName")
  colnames(out)[8] <- "ISO3"
  
  #NA dropping
  out <- data.frame(out[,var_select]) %>% drop_na()
  
 
  return(out)

}

#Basic EDA

df <- load_chi2018(var_select=c("water_score", "sanitation_score", "child_mortality", "Year", "ISO3", "CountryName", "mortality_score", "CHI_v2018"))

summary(df)
##   water_score     sanitation_score child_mortality      Year          
##  Min.   : 18.14   Min.   :  5.69   Min.   : 0.390   Length:1782       
##  1st Qu.: 77.80   1st Qu.: 49.29   1st Qu.: 1.310   Class :character  
##  Median : 94.75   Median : 88.36   Median : 3.285   Mode  :character  
##  Mean   : 86.05   Mean   : 73.80   Mean   : 9.902                     
##  3rd Qu.: 99.30   3rd Qu.: 97.61   3rd Qu.:13.000                     
##  Max.   :100.00   Max.   :100.00   Max.   :68.810                     
##                                                                       
##       ISO3                   CountryName   mortality_score   CHI_v2018    
##  ABW    :   9   Afghanistan        :   9   Min.   : 0.00   Min.   :17.28  
##  AFG    :   9   Albania            :   9   1st Qu.:78.96   1st Qu.:66.38  
##  AGO    :   9   Algeria            :   9   Median :94.43   Median :92.81  
##  ALB    :   9   Angola             :   9   Mean   :83.68   Mean   :81.18  
##  ARE    :   9   Antigua and Barbuda:   9   3rd Qu.:97.82   3rd Qu.:97.78  
##  ARG    :   9   Argentina          :   9   Max.   :99.30   Max.   :99.74  
##  (Other):1728   (Other)            :1728
hist(df$water_score)

hist(df$sanitation_score)

hist(df$child_mortality)

hist(df$mortality_score)

hist(df$CHI_v2018)

df <- df[complete.cases(df),]

All very exponentialy distributed.

library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(df[,c(1,2,3,7,8)], use = "pairwise.complete.obs"), type = "upper")

Also alot of collinearity.

plot(df$child_mortality ~ df$water_score)

plot(df$child_mortality ~ df$sanitation_score)

Going to need transformations to avoid problems with residuals and the assumptions of linear regression

Modeling

Let’s first explore the individual relationships between water and sanitation with child mortality.

df <- load_chi2018()

df$child_mortality <- log(df$child_mortality)
df$water_score <- df$water_score^6

splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]

fit <- lm(child_mortality ~ water_score, data = train)

summary(fit)
## 
## Call:
## lm(formula = child_mortality ~ water_score, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.63399 -0.52890 -0.05428  0.47099  2.07863 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.396e+00  3.335e-02  101.82   <2e-16 ***
## water_score -3.345e-12  4.715e-14  -70.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6566 on 1425 degrees of freedom
## Multiple R-squared:  0.7794, Adjusted R-squared:  0.7792 
## F-statistic:  5034 on 1 and 1425 DF,  p-value: < 2.2e-16
plot(fit)

plot(predict(fit, val) ~ val$child_mortality)

hist(fit$residuals, breaks = 20)

print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2:  0.781173223506275"

after variable transformation, water-score alone will account for about 0.78 of the variation in child mortality

df <- load_chi2018()

df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^2.5

splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]

fit <- lm(child_mortality ~ sanitation_score, data = train)

summary(fit)
## 
## Call:
## lm(formula = child_mortality ~ sanitation_score, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.29133 -0.47940 -0.06311  0.45987  2.12226 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.343e+00  3.253e-02  102.77   <2e-16 ***
## sanitation_score -3.291e-05  4.610e-07  -71.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6534 on 1425 degrees of freedom
## Multiple R-squared:  0.7815, Adjusted R-squared:  0.7813 
## F-statistic:  5096 on 1 and 1425 DF,  p-value: < 2.2e-16
plot(fit)

plot(predict(fit, val) ~ val$child_mortality)

hist(fit$residuals, breaks = 20)

print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2:  0.771396313076661"

after variable transformation, sanitation-score alone can also account for about 0.77-0.78 of the variation in child mortality

df <- load_chi2018()


splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]

fit <- lm(log(child_mortality)^1.1 ~ I(water_score * sanitation_score), data = train)

summary(fit)
## 
## Call:
## lm(formula = log(child_mortality)^1.1 ~ I(water_score * sanitation_score), 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85708 -0.45924  0.03039  0.37639  1.56119 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        4.294e+00  3.844e-02  111.72   <2e-16 ***
## I(water_score * sanitation_score) -3.800e-04  5.584e-06  -68.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6148 on 1139 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.8026, Adjusted R-squared:  0.8025 
## F-statistic:  4632 on 1 and 1139 DF,  p-value: < 2.2e-16
plot(fit)

plot((exp(predict(fit, val)))^(1/1.1) ~ val$child_mortality)

hist(fit$residuals, breaks = 20)

Basic linear regression with log and slight exponential transformaton. Uses an interaction term to get around collinearity issues. Likely meets the criteria for linear regression althought there is slight heteroskedascity; residuals are still fairly normal.

Claims to have a fit of r^2=0.8, but the predictions vs fitted plot insicates this may not hold. Also looks heteroskedastic.

Final model 1

Also linear regression using an interactin term to avoid collinearity issues, but with exponenial transformations of the response and predictor variables. It’s a somewhat better fit, with holds true when predictions are plotted against residuals. Still has somewhat heteroskedastic residuals.

df <- load_chi2018()

df$child_mortality <- log(df$child_mortality) 
df$sanitation_score <- df$sanitation_score^4
df$water_score <- df$water_score  ^4


splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]


fit <- lm(child_mortality~ I(water_score + sanitation_score), data = train)

summary(fit)
## 
## Call:
## lm(formula = child_mortality ~ I(water_score + sanitation_score), 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88427 -0.42300 -0.05869  0.44431  1.64620 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.533e+00  3.044e-02  116.05   <2e-16 ***
## I(water_score + sanitation_score) -1.802e-08  2.193e-10  -82.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5834 on 1425 degrees of freedom
## Multiple R-squared:  0.8258, Adjusted R-squared:  0.8257 
## F-statistic:  6756 on 1 and 1425 DF,  p-value: < 2.2e-16
plot(fit)

plot(predict(fit, val) ~ val$child_mortality)

hist(fit$residuals, breaks = 20)

print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2:  0.82110038572662"

Final Model 2

A ridge regression model (handles collinearity) with exponential transformations on the variables. Fits about as well as the previous model, but maybe less heteroskedascity? Residuals are kinda normally distributed, but maybe a little bimodal.

df <- load_chi2018()

df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^5
df$water_score <- df$water_score  ^5


splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]




library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-1
train_X <- model.matrix(~ water_score + sanitation_score , data=train)  
train_Y <- train$child_mortality

val_X = model.matrix(~ water_score + sanitation_score ,data=val)


#Makes a series of crossvalidated glmnet models for 100 lambda values (default)
#lamba values are constants that define coefficient shrinkage.  
ridge_model <- cv.glmnet(x = train_X,   #Predictor variables
                      y = train_Y,
                      family = "gaussian", 
                      nfolds = 10, #k fold cv
                      type.measure = "mse",  #uses mse as loss
                      alpha = 0) #Alpha = 0 is ridge.

#setting lambda.min uses the lambda value with the minimum mean cv error (picks the best model)
predictions <- predict(ridge_model, 
                       newx = val_X,
                       type = "response",
                       s=ridge_model$lambda.min) 
#Print's the coefficients the model uses
print(coef.glmnet(ridge_model, s = ridge_model$lambda.min))
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       3.311455e+00
## (Intercept)       .           
## water_score      -1.900025e-10
## sanitation_score -1.474588e-10
#r^2 : https://stats.stackexchange.com/questions/266592/how-to-calculate-r2-for-lasso-glmnet
r2 <- ridge_model$glmnet.fit$dev.ratio[which(ridge_model$glmnet.fit$lambda == ridge_model$lambda.min)]
print(paste("R^2: ",r2))
## [1] "R^2:  0.823713138474955"
#Correct for  transformation
predictions <- predictions

residuals <- val$child_mortality - predictions

plot(predictions ~ val$child_mortality)

plot(residuals ~ predictions)

hist(residuals, breaks = 30)

print(paste("Validation R^2: ", cor(predictions, val$child_mortality)^2))
## [1] "Validation R^2:  0.82197433922245"

Adding “world” data

#Checking for collinearity

library(rnaturalearth)

#Adds new data
world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()

df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3") #Preserved all of the countries in the origional set

corrplot(cor(df[c("water_score", "sanitation_score", "child_mortality", 
                  "pop_est", "gdp_md_est")], use = "pairwise.complete.obs"), type = "upper")

population and and the other new variable seem uncorrelated to child mortality, but somehwat correlated to eachother; probably best to just use one if either. We can also an economy variable (categorical), of which there are two which are presumably fairly correlated

world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()

df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3")

df$child_mortality <- log10(df$child_mortality) #Still needs a transformation


splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]

fit <- lm(child_mortality ~ economy, data = train)

summary(fit)
## 
## Call:
## lm(formula = child_mortality ~ economy, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87658 -0.23899 -0.01321  0.21417  1.10094 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -0.15767    0.05324  -2.961  0.00312 ** 
## economy2. Developed region: nonG7  0.04011    0.05857   0.685  0.49352    
## economy3. Emerging region: BRIC    0.62350    0.08755   7.122 1.71e-12 ***
## economy4. Emerging region: MIKT    0.66849    0.08391   7.966 3.40e-15 ***
## economy5. Emerging region: G20     0.90721    0.06111  14.846  < 2e-16 ***
## economy6. Developing region        0.67770    0.05533  12.249  < 2e-16 ***
## economy7. Least developed region   1.47538    0.05674  26.002  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3611 on 1377 degrees of freedom
## Multiple R-squared:  0.6414, Adjusted R-squared:  0.6398 
## F-statistic: 410.5 on 6 and 1377 DF,  p-value: < 2.2e-16
plot(fit) 

plot(predict(fit, val) ~ val$child_mortality)

hist(fit$residuals, breaks = 20)

Economy seems to be fairly well correlated with child_mortality. Pop_est was also tested and proved to be highly unpredictive.

Let’s see what economy + the previous variables can acomplish

Final model 3

world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()

df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3")


df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^4
df$water_score <- df$water_score  ^4


splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]

fit <- lm(child_mortality ~ economy + I(water_score + sanitation_score), data = train)

summary(fit)
## 
## Call:
## lm(formula = child_mortality ~ economy + I(water_score + sanitation_score), 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6255 -0.3096  0.0059  0.3397  1.4619 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.555e+00  9.504e-02  26.888  < 2e-16 ***
## economy2. Developed region: nonG7 -6.282e-03  8.147e-02  -0.077   0.9386    
## economy3. Emerging region: BRIC    2.131e-01  1.243e-01   1.715   0.0866 .  
## economy4. Emerging region: MIKT    7.780e-01  1.177e-01   6.609 5.53e-11 ***
## economy5. Emerging region: G20     9.315e-01  8.820e-02  10.561  < 2e-16 ***
## economy6. Developing region        6.477e-01  7.916e-02   8.182 6.30e-16 ***
## economy7. Least developed region   9.349e-01  9.356e-02   9.993  < 2e-16 ***
## I(water_score + sanitation_score) -1.494e-08  3.050e-10 -48.983  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5022 on 1376 degrees of freedom
## Multiple R-squared:  0.8693, Adjusted R-squared:  0.8686 
## F-statistic:  1307 on 7 and 1376 DF,  p-value: < 2.2e-16
plot(fit) 

plot(predict(fit, val) ~ val$child_mortality)

hist(fit$residuals, breaks = 20)

print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2:  0.851456500809887"

Very well fitting; adding economy helps. Maybe slighltly heteroscedastic residuals, although they are still normally distributed.

Final model 4

world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()

df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3")


df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^3
df$water_score <- df$water_score  ^3


splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]




library(glmnet)

train_X <- model.matrix(~ water_score + sanitation_score + economy, data=train)  
train_Y <- train$child_mortality

val_X = model.matrix(~ water_score + sanitation_score + economy,data=val)

#Makes a series of crossvalidated glmnet models for 100 lambda values (default)
#lamba values are constants that define coefficient shrinkage.  
ridge_model <- cv.glmnet(x = train_X,   #Predictor variables
                      y = train_Y,
                      family = "gaussian", 
                      nfolds = 10, #k fold cv
                      type.measure = "mse",  #uses mse as loss
                      alpha = .5) #Alpha = 0 is ridge.

#setting lambda.min uses the lambda value with the minimum mean cv error (picks the best model)
predictions <- predict(ridge_model, 
                       newx = val_X,
                       type = "response",
                       s=ridge_model$lambda.min) 
#Print's the coefficients the model uses
print(coef.glmnet(ridge_model, s = ridge_model$lambda.min))
## 10 x 1 sparse Matrix of class "dgCMatrix"
##                                               1
## (Intercept)                        2.803998e+00
## (Intercept)                        .           
## water_score                       -1.594785e-06
## sanitation_score                  -1.578504e-06
## economy2. Developed region: nonG7 -3.830656e-02
## economy3. Emerging region: BRIC    2.735977e-01
## economy4. Emerging region: MIKT    8.095908e-01
## economy5. Emerging region: G20     9.636074e-01
## economy6. Developing region        6.803328e-01
## economy7. Least developed region   8.861855e-01
#r^2 : https://stats.stackexchange.com/questions/266592/how-to-calculate-r2-for-lasso-glmnet
r2 <- ridge_model$glmnet.fit$dev.ratio[which(ridge_model$glmnet.fit$lambda == ridge_model$lambda.min)]
print(r2)
## [1] 0.8711848
#Correct for  transformation
predictions <- predictions

residuals <- val$child_mortality - predictions

mean(residuals)
## [1] 0.03276305
plot(predictions ~ val$child_mortality)

plot(residuals ~ predictions)

#ridge_model$glmnet.fit


hist(residuals, breaks = 30)

print(paste("Validation R^2: ", cor(predictions, val$child_mortality)^2))
## [1] "Validation R^2:  0.855320916749198"

elastic net regression produces a similar result to linear. Performs slightly better than ridge alone.