library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
load_chi2018 <- function(var_select = c("water_score", "sanitation_score", "child_mortality", "Year", "ISO3", "CountryName")) {
library(tidyr)
library(dplyr)
library(data.table)
set.seed(1234567890)
df = read.csv("https://raw.githubusercontent.com/davidblumenstiel/CUNY-MSDS-DATA-621/main/Final_Project/chi-2018.csv")
#Took some code from: https://stackoverflow.com/questions/50010196/replacing-na-values-from-another-dataframe-by-id
#and https://stackoverflow.com/questions/25908772/r-column-mean-by-factor
x1 = df %>%
pivot_longer(
cols = starts_with("wat_"),
names_to = "Year",
names_prefix = "wat_",
values_to = "water_score"
)
x1 = x1 %>%
left_join(setDT(x1)[, mean(water_score, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
mutate(water_score = ifelse(is.na(water_score), V1, water_score)) %>% #Replace NA with mean of values if available
select("Year", "water_score", "CountryName")
x2 = df %>%
pivot_longer(
cols = starts_with("san_"),
names_to = "Year",
names_prefix = "san_",
values_to = "sanitation_score"
)
x2 = x2 %>%
left_join(setDT(x2)[, mean(sanitation_score, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
mutate(sanitation_score = ifelse(is.na(sanitation_score), V1, sanitation_score)) %>% #Replace NA with mean of values if available
select("Year", "sanitation_score", "CountryName")
x3 = df %>%
pivot_longer(
cols = starts_with("chmort_"),
names_to = "Year",
names_prefix = "chmort_",
values_to = "child_mortality"
)
x3 = x3 %>%
left_join(setDT(x3)[, mean(child_mortality, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
mutate(child_mortality = ifelse(is.na(child_mortality), V1, child_mortality)) %>% #Replace NA with mean of values if available
select("Year", "child_mortality", "CountryName")
x4 = df %>%
pivot_longer(
cols = starts_with("mortality_"),
names_to = "Year",
names_prefix = "mortality_",
values_to = "mortality_score"
)
x4 = x4 %>%
left_join(setDT(x4)[, mean(mortality_score, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
mutate(mortality_score = ifelse(is.na(mortality_score), V1, mortality_score)) %>% #Replace NA with mean of values if available
select("Year", "mortality_score", "CountryName")
x5 = df %>%
pivot_longer(
cols = starts_with("CHI_v2018_"),
names_to = "Year",
names_prefix = "CHI_v2018_",
values_to = "CHI_v2018"
)
x5 = x5 %>%
left_join(setDT(x5)[, mean(CHI_v2018, na.rm = TRUE), by=CountryName], by = "CountryName") %>%
mutate(CHI_v2018 = ifelse(is.na(CHI_v2018), V1, CHI_v2018)) %>% #Replace NA with mean of values if available
select("Year", "CHI_v2018", "CountryName")
out = x1 %>% merge(x2, by = c("CountryName", "Year")) %>%
merge(x3, by = c("CountryName", "Year")) %>%
merge(x4, by = c("CountryName", "Year")) %>%
merge(x5, by = c("CountryName", "Year"))
out = as.data.frame(out)
#Adds back ISO3 abbreviations
out <- out %>% merge(x = out, y = df[,1:2], by.x = "CountryName", by.y = "CountryName")
colnames(out)[8] <- "ISO3"
#NA dropping
out <- data.frame(out[,var_select]) %>% drop_na()
return(out)
}
#Basic EDA
df <- load_chi2018(var_select=c("water_score", "sanitation_score", "child_mortality", "Year", "ISO3", "CountryName", "mortality_score", "CHI_v2018"))
summary(df)
## water_score sanitation_score child_mortality Year
## Min. : 18.14 Min. : 5.69 Min. : 0.390 Length:1782
## 1st Qu.: 77.80 1st Qu.: 49.29 1st Qu.: 1.310 Class :character
## Median : 94.75 Median : 88.36 Median : 3.285 Mode :character
## Mean : 86.05 Mean : 73.80 Mean : 9.902
## 3rd Qu.: 99.30 3rd Qu.: 97.61 3rd Qu.:13.000
## Max. :100.00 Max. :100.00 Max. :68.810
##
## ISO3 CountryName mortality_score CHI_v2018
## ABW : 9 Afghanistan : 9 Min. : 0.00 Min. :17.28
## AFG : 9 Albania : 9 1st Qu.:78.96 1st Qu.:66.38
## AGO : 9 Algeria : 9 Median :94.43 Median :92.81
## ALB : 9 Angola : 9 Mean :83.68 Mean :81.18
## ARE : 9 Antigua and Barbuda: 9 3rd Qu.:97.82 3rd Qu.:97.78
## ARG : 9 Argentina : 9 Max. :99.30 Max. :99.74
## (Other):1728 (Other) :1728
hist(df$water_score)
hist(df$sanitation_score)
hist(df$child_mortality)
hist(df$mortality_score)
hist(df$CHI_v2018)
df <- df[complete.cases(df),]
All very exponentialy distributed.
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(df[,c(1,2,3,7,8)], use = "pairwise.complete.obs"), type = "upper")
Also alot of collinearity.
plot(df$child_mortality ~ df$water_score)
plot(df$child_mortality ~ df$sanitation_score)
Going to need transformations to avoid problems with residuals and the assumptions of linear regression
Let’s first explore the individual relationships between water and sanitation with child mortality.
df <- load_chi2018()
df$child_mortality <- log(df$child_mortality)
df$water_score <- df$water_score^6
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
fit <- lm(child_mortality ~ water_score, data = train)
summary(fit)
##
## Call:
## lm(formula = child_mortality ~ water_score, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63399 -0.52890 -0.05428 0.47099 2.07863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.396e+00 3.335e-02 101.82 <2e-16 ***
## water_score -3.345e-12 4.715e-14 -70.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6566 on 1425 degrees of freedom
## Multiple R-squared: 0.7794, Adjusted R-squared: 0.7792
## F-statistic: 5034 on 1 and 1425 DF, p-value: < 2.2e-16
plot(fit)
plot(predict(fit, val) ~ val$child_mortality)
hist(fit$residuals, breaks = 20)
print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2: 0.781173223506275"
after variable transformation, water-score alone will account for about 0.78 of the variation in child mortality
df <- load_chi2018()
df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^2.5
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
fit <- lm(child_mortality ~ sanitation_score, data = train)
summary(fit)
##
## Call:
## lm(formula = child_mortality ~ sanitation_score, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29133 -0.47940 -0.06311 0.45987 2.12226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.343e+00 3.253e-02 102.77 <2e-16 ***
## sanitation_score -3.291e-05 4.610e-07 -71.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6534 on 1425 degrees of freedom
## Multiple R-squared: 0.7815, Adjusted R-squared: 0.7813
## F-statistic: 5096 on 1 and 1425 DF, p-value: < 2.2e-16
plot(fit)
plot(predict(fit, val) ~ val$child_mortality)
hist(fit$residuals, breaks = 20)
print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2: 0.771396313076661"
after variable transformation, sanitation-score alone can also account for about 0.77-0.78 of the variation in child mortality
df <- load_chi2018()
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
fit <- lm(log(child_mortality)^1.1 ~ I(water_score * sanitation_score), data = train)
summary(fit)
##
## Call:
## lm(formula = log(child_mortality)^1.1 ~ I(water_score * sanitation_score),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85708 -0.45924 0.03039 0.37639 1.56119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.294e+00 3.844e-02 111.72 <2e-16 ***
## I(water_score * sanitation_score) -3.800e-04 5.584e-06 -68.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6148 on 1139 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.8026, Adjusted R-squared: 0.8025
## F-statistic: 4632 on 1 and 1139 DF, p-value: < 2.2e-16
plot(fit)
plot((exp(predict(fit, val)))^(1/1.1) ~ val$child_mortality)
hist(fit$residuals, breaks = 20)
Basic linear regression with log and slight exponential transformaton. Uses an interaction term to get around collinearity issues. Likely meets the criteria for linear regression althought there is slight heteroskedascity; residuals are still fairly normal.
Claims to have a fit of r^2=0.8, but the predictions vs fitted plot insicates this may not hold. Also looks heteroskedastic.
Also linear regression using an interactin term to avoid collinearity issues, but with exponenial transformations of the response and predictor variables. It’s a somewhat better fit, with holds true when predictions are plotted against residuals. Still has somewhat heteroskedastic residuals.
df <- load_chi2018()
df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^4
df$water_score <- df$water_score ^4
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
fit <- lm(child_mortality~ I(water_score + sanitation_score), data = train)
summary(fit)
##
## Call:
## lm(formula = child_mortality ~ I(water_score + sanitation_score),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88427 -0.42300 -0.05869 0.44431 1.64620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.533e+00 3.044e-02 116.05 <2e-16 ***
## I(water_score + sanitation_score) -1.802e-08 2.193e-10 -82.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5834 on 1425 degrees of freedom
## Multiple R-squared: 0.8258, Adjusted R-squared: 0.8257
## F-statistic: 6756 on 1 and 1425 DF, p-value: < 2.2e-16
plot(fit)
plot(predict(fit, val) ~ val$child_mortality)
hist(fit$residuals, breaks = 20)
print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2: 0.82110038572662"
A ridge regression model (handles collinearity) with exponential transformations on the variables. Fits about as well as the previous model, but maybe less heteroskedascity? Residuals are kinda normally distributed, but maybe a little bimodal.
df <- load_chi2018()
df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^5
df$water_score <- df$water_score ^5
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
train_X <- model.matrix(~ water_score + sanitation_score , data=train)
train_Y <- train$child_mortality
val_X = model.matrix(~ water_score + sanitation_score ,data=val)
#Makes a series of crossvalidated glmnet models for 100 lambda values (default)
#lamba values are constants that define coefficient shrinkage.
ridge_model <- cv.glmnet(x = train_X, #Predictor variables
y = train_Y,
family = "gaussian",
nfolds = 10, #k fold cv
type.measure = "mse", #uses mse as loss
alpha = 0) #Alpha = 0 is ridge.
#setting lambda.min uses the lambda value with the minimum mean cv error (picks the best model)
predictions <- predict(ridge_model,
newx = val_X,
type = "response",
s=ridge_model$lambda.min)
#Print's the coefficients the model uses
print(coef.glmnet(ridge_model, s = ridge_model$lambda.min))
## 4 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.311455e+00
## (Intercept) .
## water_score -1.900025e-10
## sanitation_score -1.474588e-10
#r^2 : https://stats.stackexchange.com/questions/266592/how-to-calculate-r2-for-lasso-glmnet
r2 <- ridge_model$glmnet.fit$dev.ratio[which(ridge_model$glmnet.fit$lambda == ridge_model$lambda.min)]
print(paste("R^2: ",r2))
## [1] "R^2: 0.823713138474955"
#Correct for transformation
predictions <- predictions
residuals <- val$child_mortality - predictions
plot(predictions ~ val$child_mortality)
plot(residuals ~ predictions)
hist(residuals, breaks = 30)
print(paste("Validation R^2: ", cor(predictions, val$child_mortality)^2))
## [1] "Validation R^2: 0.82197433922245"
#Checking for collinearity
library(rnaturalearth)
#Adds new data
world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()
df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3") #Preserved all of the countries in the origional set
corrplot(cor(df[c("water_score", "sanitation_score", "child_mortality",
"pop_est", "gdp_md_est")], use = "pairwise.complete.obs"), type = "upper")
population and and the other new variable seem uncorrelated to child mortality, but somehwat correlated to eachother; probably best to just use one if either. We can also an economy variable (categorical), of which there are two which are presumably fairly correlated
world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()
df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3")
df$child_mortality <- log10(df$child_mortality) #Still needs a transformation
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
fit <- lm(child_mortality ~ economy, data = train)
summary(fit)
##
## Call:
## lm(formula = child_mortality ~ economy, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87658 -0.23899 -0.01321 0.21417 1.10094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15767 0.05324 -2.961 0.00312 **
## economy2. Developed region: nonG7 0.04011 0.05857 0.685 0.49352
## economy3. Emerging region: BRIC 0.62350 0.08755 7.122 1.71e-12 ***
## economy4. Emerging region: MIKT 0.66849 0.08391 7.966 3.40e-15 ***
## economy5. Emerging region: G20 0.90721 0.06111 14.846 < 2e-16 ***
## economy6. Developing region 0.67770 0.05533 12.249 < 2e-16 ***
## economy7. Least developed region 1.47538 0.05674 26.002 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3611 on 1377 degrees of freedom
## Multiple R-squared: 0.6414, Adjusted R-squared: 0.6398
## F-statistic: 410.5 on 6 and 1377 DF, p-value: < 2.2e-16
plot(fit)
plot(predict(fit, val) ~ val$child_mortality)
hist(fit$residuals, breaks = 20)
Economy seems to be fairly well correlated with child_mortality. Pop_est was also tested and proved to be highly unpredictive.
Let’s see what economy + the previous variables can acomplish
world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()
df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3")
df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^4
df$water_score <- df$water_score ^4
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
fit <- lm(child_mortality ~ economy + I(water_score + sanitation_score), data = train)
summary(fit)
##
## Call:
## lm(formula = child_mortality ~ economy + I(water_score + sanitation_score),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6255 -0.3096 0.0059 0.3397 1.4619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.555e+00 9.504e-02 26.888 < 2e-16 ***
## economy2. Developed region: nonG7 -6.282e-03 8.147e-02 -0.077 0.9386
## economy3. Emerging region: BRIC 2.131e-01 1.243e-01 1.715 0.0866 .
## economy4. Emerging region: MIKT 7.780e-01 1.177e-01 6.609 5.53e-11 ***
## economy5. Emerging region: G20 9.315e-01 8.820e-02 10.561 < 2e-16 ***
## economy6. Developing region 6.477e-01 7.916e-02 8.182 6.30e-16 ***
## economy7. Least developed region 9.349e-01 9.356e-02 9.993 < 2e-16 ***
## I(water_score + sanitation_score) -1.494e-08 3.050e-10 -48.983 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5022 on 1376 degrees of freedom
## Multiple R-squared: 0.8693, Adjusted R-squared: 0.8686
## F-statistic: 1307 on 7 and 1376 DF, p-value: < 2.2e-16
plot(fit)
plot(predict(fit, val) ~ val$child_mortality)
hist(fit$residuals, breaks = 20)
print(paste("Validation R^2: ", cor(predict(fit, val), val$child_mortality)^2))
## [1] "Validation R^2: 0.851456500809887"
Very well fitting; adding economy helps. Maybe slighltly heteroscedastic residuals, although they are still normally distributed.
world <- ne_countries(scale = "medium", returnclass = "sf")
df <- load_chi2018()
df <- merge(df, world, by.x = "ISO3", by.y = "iso_a3")
df$child_mortality <- log(df$child_mortality)
df$sanitation_score <- df$sanitation_score^3
df$water_score <- df$water_score ^3
splitdex <- createDataPartition(df$child_mortality, p = 0.8, list = FALSE)
train <- df[splitdex,]
val <- df[-splitdex,]
library(glmnet)
train_X <- model.matrix(~ water_score + sanitation_score + economy, data=train)
train_Y <- train$child_mortality
val_X = model.matrix(~ water_score + sanitation_score + economy,data=val)
#Makes a series of crossvalidated glmnet models for 100 lambda values (default)
#lamba values are constants that define coefficient shrinkage.
ridge_model <- cv.glmnet(x = train_X, #Predictor variables
y = train_Y,
family = "gaussian",
nfolds = 10, #k fold cv
type.measure = "mse", #uses mse as loss
alpha = .5) #Alpha = 0 is ridge.
#setting lambda.min uses the lambda value with the minimum mean cv error (picks the best model)
predictions <- predict(ridge_model,
newx = val_X,
type = "response",
s=ridge_model$lambda.min)
#Print's the coefficients the model uses
print(coef.glmnet(ridge_model, s = ridge_model$lambda.min))
## 10 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.803998e+00
## (Intercept) .
## water_score -1.594785e-06
## sanitation_score -1.578504e-06
## economy2. Developed region: nonG7 -3.830656e-02
## economy3. Emerging region: BRIC 2.735977e-01
## economy4. Emerging region: MIKT 8.095908e-01
## economy5. Emerging region: G20 9.636074e-01
## economy6. Developing region 6.803328e-01
## economy7. Least developed region 8.861855e-01
#r^2 : https://stats.stackexchange.com/questions/266592/how-to-calculate-r2-for-lasso-glmnet
r2 <- ridge_model$glmnet.fit$dev.ratio[which(ridge_model$glmnet.fit$lambda == ridge_model$lambda.min)]
print(r2)
## [1] 0.8711848
#Correct for transformation
predictions <- predictions
residuals <- val$child_mortality - predictions
mean(residuals)
## [1] 0.03276305
plot(predictions ~ val$child_mortality)
plot(residuals ~ predictions)
#ridge_model$glmnet.fit
hist(residuals, breaks = 30)
print(paste("Validation R^2: ", cor(predictions, val$child_mortality)^2))
## [1] "Validation R^2: 0.855320916749198"
elastic net regression produces a similar result to linear. Performs slightly better than ridge alone.