require(ggthemes)
library(tidyverse)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(fpp2)
library(forecast)
library(ggpubr)
library(boot)
library(plotly)
url <- "https://raw.githubusercontent.com/ssufian/DAT-605/master/who.csv"
#Reading CSV file from Github
df <- read.csv(file=url, sep=",",na.strings = c("NA"," ",""),strip.white = T, stringsAsFactors = F, header=T)
head(df)
## Country LifeExp InfantSurvival Under5Survival TBFree PropMD
## 1 Afghanistan 42 0.835 0.743 0.99769 0.000228841
## 2 Albania 71 0.985 0.983 0.99974 0.001143127
## 3 Algeria 71 0.967 0.962 0.99944 0.001060478
## 4 Andorra 82 0.997 0.996 0.99983 0.003297297
## 5 Angola 41 0.846 0.740 0.99656 0.000070400
## 6 Antigua and Barbuda 73 0.990 0.989 0.99991 0.000142857
## PropRN PersExp GovtExp TotExp
## 1 0.000572294 20 92 112
## 2 0.004614439 169 3128 3297
## 3 0.002091362 108 5184 5292
## 4 0.003500000 2589 169725 172314
## 5 0.001146162 36 1620 1656
## 6 0.002773810 503 12543 13046
#fitting a linear regression model
lm1 <- lm(LifeExp~ TotExp, data = df)
summary(lm1)
##
## Call:
## lm(formula = LifeExp ~ TotExp, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.764 -4.778 3.154 7.116 13.292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.475e+01 7.535e-01 85.933 < 2e-16 ***
## TotExp 6.297e-05 7.795e-06 8.079 7.71e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.371 on 188 degrees of freedom
## Multiple R-squared: 0.2577, Adjusted R-squared: 0.2537
## F-statistic: 65.26 on 1 and 188 DF, p-value: 7.714e-14
# Compute the analysis of variance
res.aov <- aov(LifeExp~ TotExp, data = df)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## TotExp 1 5731 5731 65.26 7.71e-14 ***
## Residuals 188 16509 88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res.aov, 1)
# 2. Normality
plot(res.aov, 2)
Observation 1:
Statistical Diagnostics
But simply based on F-Stat and T-Stat of the coefficient, the simple linear regression “appears” to be met
df1 <- df %>% mutate(LifeEXPpower4.6 = LifeExp**4.6)
df2 <- df1 %>% mutate(TotExppower0.6 = TotExp**.06)
head(df2)
## Country LifeExp InfantSurvival Under5Survival TBFree PropMD
## 1 Afghanistan 42 0.835 0.743 0.99769 0.000228841
## 2 Albania 71 0.985 0.983 0.99974 0.001143127
## 3 Algeria 71 0.967 0.962 0.99944 0.001060478
## 4 Andorra 82 0.997 0.996 0.99983 0.003297297
## 5 Angola 41 0.846 0.740 0.99656 0.000070400
## 6 Antigua and Barbuda 73 0.990 0.989 0.99991 0.000142857
## PropRN PersExp GovtExp TotExp LifeEXPpower4.6 TotExppower0.6
## 1 0.000572294 20 92 112 29305338 1.327251
## 2 0.004614439 169 3128 3297 327935478 1.625875
## 3 0.002091362 108 5184 5292 327935478 1.672697
## 4 0.003500000 2589 169725 172314 636126841 2.061481
## 5 0.001146162 36 1620 1656 26230450 1.560068
## 6 0.002773810 503 12543 13046 372636298 1.765748
fig1 <- plot_ly(data = df2, x = ~TotExppower0.6, y = ~LifeEXPpower4.6,type='scatter',mode='markers',
marker = list(size = 10,
color = 'rgba(255, 182, 193, .9)',
line = list(color = 'rgba(152, 0, 0, .8)',
width = 2)))
fig1<- fig1 %>% layout(title = 'Fig2: Scatter Plot of LifeExp^4.6 as function of TotExp^0.06',
yaxis = list(zeroline = FALSE),
xaxis = list(zeroline = FALSE))
fig1
#fitting a linear regression model on transformed variables
lm2 <- lm(LifeEXPpower4.6~ TotExppower0.6, data = df2)
summary(lm2)
##
## Call:
## lm(formula = LifeEXPpower4.6 ~ TotExppower0.6, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -308616089 -53978977 13697187 59139231 211951764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -736527910 46817945 -15.73 <2e-16 ***
## TotExppower0.6 620060216 27518940 22.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90490000 on 188 degrees of freedom
## Multiple R-squared: 0.7298, Adjusted R-squared: 0.7283
## F-statistic: 507.7 on 1 and 188 DF, p-value: < 2.2e-16
# Compute the analysis of variance on transformed variables
res1.aov <- aov(LifeEXPpower4.6~ TotExppower0.6, data = df2)
# Summary of the analysis
summary(res1.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## TotExppower0.6 1 4.157e+18 4.157e+18 507.7 <2e-16 ***
## Residuals 188 1.540e+18 8.189e+15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res1.aov, 1)
# 2. Normality
plot(res1.aov, 2)
Observation 2:
Statistical Diagnostics
Based on the transformed variables, the linear regression is back in play and has passed all the statistical diagnostic tests
#Prediction zone based on transformed variables
intercept <- -736527910
slope <- 620060216
#, forecast life expectancy when:
TotExppower0.6_1.5 <- 1.5
LifeEXPpower4.6_1.5 <- TotExppower0.6_1.5 *slope+intercept
LifeEXPpower4.6_1.5
## [1] 193562414
#, forecast life expectancy when:
TotExppower0.6_2.5 <- 2.5
LifeEXPpower4.6_2.5 <- TotExppower0.6_2.5 *slope+intercept
LifeEXPpower4.6_2.5
## [1] 813622630
df3 <- df2 %>% mutate(ProdMDxTotExp=PropMD*TotExp)
head(df3)
## Country LifeExp InfantSurvival Under5Survival TBFree PropMD
## 1 Afghanistan 42 0.835 0.743 0.99769 0.000228841
## 2 Albania 71 0.985 0.983 0.99974 0.001143127
## 3 Algeria 71 0.967 0.962 0.99944 0.001060478
## 4 Andorra 82 0.997 0.996 0.99983 0.003297297
## 5 Angola 41 0.846 0.740 0.99656 0.000070400
## 6 Antigua and Barbuda 73 0.990 0.989 0.99991 0.000142857
## PropRN PersExp GovtExp TotExp LifeEXPpower4.6 TotExppower0.6
## 1 0.000572294 20 92 112 29305338 1.327251
## 2 0.004614439 169 3128 3297 327935478 1.625875
## 3 0.002091362 108 5184 5292 327935478 1.672697
## 4 0.003500000 2589 169725 172314 636126841 2.061481
## 5 0.001146162 36 1620 1656 26230450 1.560068
## 6 0.002773810 503 12543 13046 372636298 1.765748
## ProdMDxTotExp
## 1 0.02563019
## 2 3.76888972
## 3 5.61204958
## 4 568.17043526
## 5 0.11658240
## 6 1.86371242
#Each variable is paired up with each of the variable. A scatterplot is plotted for each pair.
pairs(~LifeExp+ PropMD +TotExp+ProdMDxTotExp,data = df3 ,main = "Scatterplot Matrix")
input <- df3[,c("LifeExp","PropMD","TotExp","ProdMDxTotExp")]
# Create the relationship model.
model <- lm(LifeExp~ PropMD +TotExp+ProdMDxTotExp,data = df3)
# Show the model.
print(model)
##
## Call:
## lm(formula = LifeExp ~ PropMD + TotExp + ProdMDxTotExp, data = df3)
##
## Coefficients:
## (Intercept) PropMD TotExp ProdMDxTotExp
## 6.277e+01 1.497e+03 7.233e-05 -6.026e-03
# Get the Intercept and coefficients as vector elements.
cat("# # # # The Coefficient Values # # # ","\n")
## # # # # The Coefficient Values # # #
a <- coef(model)[1]
print(a)
## (Intercept)
## 62.7727
XPropMD <- coef(model)[2]
XTotExp <- coef(model)[3]
XProdMDxTotExp <- coef(model)[4]
print(XPropMD )
## PropMD
## 1497.494
print(XTotExp)
## TotExp
## 7.233324e-05
print(XProdMDxTotExp)
## ProdMDxTotExp
## -0.006025686
# summary of the ANOVA Table
summary(model)
##
## Call:
## lm(formula = LifeExp ~ PropMD + TotExp + ProdMDxTotExp, data = df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.320 -4.132 2.098 6.540 13.074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.277e+01 7.956e-01 78.899 < 2e-16 ***
## PropMD 1.497e+03 2.788e+02 5.371 2.32e-07 ***
## TotExp 7.233e-05 8.982e-06 8.053 9.39e-14 ***
## ProdMDxTotExp -6.026e-03 1.472e-03 -4.093 6.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.765 on 186 degrees of freedom
## Multiple R-squared: 0.3574, Adjusted R-squared: 0.3471
## F-statistic: 34.49 on 3 and 186 DF, p-value: < 2.2e-16
Statistical Diagnostics:
LifeExp <- 62.7727+1497.494*XPropMD + 7.233324e-05*XTotExp +-0.006025686 *XProdMDxTotExp
# Q-Q Plot to check for normality of residuals
# linear model — distance as a function of speed from base R “cars” dataset
qqPropMD <- glm(LifeExp~ PropMD, data = input, family = gaussian)
# diagnostic plot of model
glm.diag.plots(qqPropMD)
qqTotExp <- glm(LifeExp~TotExp, data = input, family = gaussian)
# diagnostic plot of model
glm.diag.plots(qqTotExp)
qqProdMDxTotExp <- glm(LifeExp~ProdMDxTotExp, data = input, family = gaussian)
# diagnostic plot of model
glm.diag.plots(qqProdMDxTotExp)
-Forecast LifeExp when PropMD=.03 and TotExp = 14.
XPropMD <-0.03
XTotExp <- 14
XProdMDxTotExp <- XPropMD*XTotExp
LifeExp <- 62.7727+1497.494*XPropMD + 7.233324e-05*XTotExp +-0.006025686 *XProdMDxTotExp
LifeExp
## [1] 107.696
Summary and Conclusion:
- The data is NOT an ideal candidate for a Multiple linear regression model; at least not these set of predictor variables.
- The scatter plots showed that there's really no good linear relationships between the predictors vs. the response variable.
- The statistical diagnostics via F-stat, T-stat and p-values revealed the signifcance of the slope and it all met the conditions for a linear regression model.
However, all that ends there as upon further analysis, it was revealed that none of residuals are normal and for the purist, there is a lack of homegeneity on the variances as well. This suggested that the model is poorly specified and further analysis such as variable transforms can help or simply using other better suited predictor variables.
- The multiple regression (given the x variables) suggested Life Exp to be 108 years old!