require(ggthemes)
library(tidyverse)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(fpp2)   
library(forecast)
library(ggpubr)
library(boot)
library(plotly)

Loading Database

url <- "https://raw.githubusercontent.com/ssufian/DAT-605/master/who.csv"
#Reading CSV file from Github
df <- read.csv(file=url, sep=",",na.strings = c("NA"," ",""),strip.white = T, stringsAsFactors = F, header=T) 

head(df)
##               Country LifeExp InfantSurvival Under5Survival  TBFree      PropMD
## 1         Afghanistan      42          0.835          0.743 0.99769 0.000228841
## 2             Albania      71          0.985          0.983 0.99974 0.001143127
## 3             Algeria      71          0.967          0.962 0.99944 0.001060478
## 4             Andorra      82          0.997          0.996 0.99983 0.003297297
## 5              Angola      41          0.846          0.740 0.99656 0.000070400
## 6 Antigua and Barbuda      73          0.990          0.989 0.99991 0.000142857
##        PropRN PersExp GovtExp TotExp
## 1 0.000572294      20      92    112
## 2 0.004614439     169    3128   3297
## 3 0.002091362     108    5184   5292
## 4 0.003500000    2589  169725 172314
## 5 0.001146162      36    1620   1656
## 6 0.002773810     503   12543  13046

Problem 1

#fitting a linear regression model
lm1 <- lm(LifeExp~ TotExp, data = df)
summary(lm1)
## 
## Call:
## lm(formula = LifeExp ~ TotExp, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.764  -4.778   3.154   7.116  13.292 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.475e+01  7.535e-01  85.933  < 2e-16 ***
## TotExp      6.297e-05  7.795e-06   8.079 7.71e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.371 on 188 degrees of freedom
## Multiple R-squared:  0.2577, Adjusted R-squared:  0.2537 
## F-statistic: 65.26 on 1 and 188 DF,  p-value: 7.714e-14
# Compute the analysis of variance
res.aov <- aov(LifeExp~ TotExp, data = df)
# Summary of the analysis
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## TotExp        1   5731    5731   65.26 7.71e-14 ***
## Residuals   188  16509      88                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res.aov, 1)

# 2. Normality
plot(res.aov, 2)


Observation 1:

Statistical Diagnostics

But simply based on F-Stat and T-Stat of the coefficient, the simple linear regression “appears” to be met


Problem 2: Data Transformations


df1 <- df %>% mutate(LifeEXPpower4.6 = LifeExp**4.6)

df2 <- df1 %>% mutate(TotExppower0.6 = TotExp**.06)

head(df2)
##               Country LifeExp InfantSurvival Under5Survival  TBFree      PropMD
## 1         Afghanistan      42          0.835          0.743 0.99769 0.000228841
## 2             Albania      71          0.985          0.983 0.99974 0.001143127
## 3             Algeria      71          0.967          0.962 0.99944 0.001060478
## 4             Andorra      82          0.997          0.996 0.99983 0.003297297
## 5              Angola      41          0.846          0.740 0.99656 0.000070400
## 6 Antigua and Barbuda      73          0.990          0.989 0.99991 0.000142857
##        PropRN PersExp GovtExp TotExp LifeEXPpower4.6 TotExppower0.6
## 1 0.000572294      20      92    112        29305338       1.327251
## 2 0.004614439     169    3128   3297       327935478       1.625875
## 3 0.002091362     108    5184   5292       327935478       1.672697
## 4 0.003500000    2589  169725 172314       636126841       2.061481
## 5 0.001146162      36    1620   1656        26230450       1.560068
## 6 0.002773810     503   12543  13046       372636298       1.765748
fig1 <- plot_ly(data = df2, x = ~TotExppower0.6, y = ~LifeEXPpower4.6,type='scatter',mode='markers',
               marker = list(size = 10,
                             color = 'rgba(255, 182, 193, .9)',
                             line = list(color = 'rgba(152, 0, 0, .8)',
                                         width = 2)))
fig1<- fig1 %>% layout(title = 'Fig2: Scatter Plot of LifeExp^4.6 as function of TotExp^0.06',
         yaxis = list(zeroline = FALSE),
         xaxis = list(zeroline = FALSE))

fig1
#fitting a linear regression model on transformed variables
lm2 <- lm(LifeEXPpower4.6~ TotExppower0.6, data = df2)
summary(lm2)
## 
## Call:
## lm(formula = LifeEXPpower4.6 ~ TotExppower0.6, data = df2)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -308616089  -53978977   13697187   59139231  211951764 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -736527910   46817945  -15.73   <2e-16 ***
## TotExppower0.6  620060216   27518940   22.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 90490000 on 188 degrees of freedom
## Multiple R-squared:  0.7298, Adjusted R-squared:  0.7283 
## F-statistic: 507.7 on 1 and 188 DF,  p-value: < 2.2e-16
# Compute the analysis of variance on transformed variables
res1.aov <- aov(LifeEXPpower4.6~ TotExppower0.6, data = df2)
# Summary of the analysis
summary(res1.aov)
##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## TotExppower0.6   1 4.157e+18 4.157e+18   507.7 <2e-16 ***
## Residuals      188 1.540e+18 8.189e+15                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res1.aov, 1)

# 2. Normality
plot(res1.aov, 2)


Observation 2:

Statistical Diagnostics

Based on the transformed variables, the linear regression is back in play and has passed all the statistical diagnostic tests


Problem 3

#Prediction zone based on transformed variables
intercept <- -736527910 
slope <- 620060216
#, forecast life expectancy when:
TotExppower0.6_1.5 <- 1.5
 
LifeEXPpower4.6_1.5 <- TotExppower0.6_1.5 *slope+intercept 
 
LifeEXPpower4.6_1.5  
## [1] 193562414
#, forecast life expectancy when:
TotExppower0.6_2.5 <- 2.5
 
LifeEXPpower4.6_2.5 <- TotExppower0.6_2.5 *slope+intercept 
 
LifeEXPpower4.6_2.5
## [1] 813622630

Problem 4 - Multiple Regression Model

df3 <- df2 %>% mutate(ProdMDxTotExp=PropMD*TotExp)
head(df3)
##               Country LifeExp InfantSurvival Under5Survival  TBFree      PropMD
## 1         Afghanistan      42          0.835          0.743 0.99769 0.000228841
## 2             Albania      71          0.985          0.983 0.99974 0.001143127
## 3             Algeria      71          0.967          0.962 0.99944 0.001060478
## 4             Andorra      82          0.997          0.996 0.99983 0.003297297
## 5              Angola      41          0.846          0.740 0.99656 0.000070400
## 6 Antigua and Barbuda      73          0.990          0.989 0.99991 0.000142857
##        PropRN PersExp GovtExp TotExp LifeEXPpower4.6 TotExppower0.6
## 1 0.000572294      20      92    112        29305338       1.327251
## 2 0.004614439     169    3128   3297       327935478       1.625875
## 3 0.002091362     108    5184   5292       327935478       1.672697
## 4 0.003500000    2589  169725 172314       636126841       2.061481
## 5 0.001146162      36    1620   1656        26230450       1.560068
## 6 0.002773810     503   12543  13046       372636298       1.765748
##   ProdMDxTotExp
## 1    0.02563019
## 2    3.76888972
## 3    5.61204958
## 4  568.17043526
## 5    0.11658240
## 6    1.86371242
#Each variable is paired up with each of the variable. A scatterplot is plotted for each pair.
pairs(~LifeExp+ PropMD +TotExp+ProdMDxTotExp,data = df3 ,main = "Scatterplot Matrix")

input <- df3[,c("LifeExp","PropMD","TotExp","ProdMDxTotExp")]

# Create the relationship model.
model <- lm(LifeExp~ PropMD +TotExp+ProdMDxTotExp,data = df3)

# Show the model.
print(model)
## 
## Call:
## lm(formula = LifeExp ~ PropMD + TotExp + ProdMDxTotExp, data = df3)
## 
## Coefficients:
##   (Intercept)         PropMD         TotExp  ProdMDxTotExp  
##     6.277e+01      1.497e+03      7.233e-05     -6.026e-03
# Get the Intercept and coefficients as vector elements.
cat("# # # # The Coefficient Values # # # ","\n")
## # # # # The Coefficient Values # # #
a <- coef(model)[1]
print(a)
## (Intercept) 
##     62.7727
XPropMD <- coef(model)[2]
XTotExp <- coef(model)[3]
XProdMDxTotExp <- coef(model)[4]

print(XPropMD )
##   PropMD 
## 1497.494
print(XTotExp)
##       TotExp 
## 7.233324e-05
print(XProdMDxTotExp)
## ProdMDxTotExp 
##  -0.006025686
# summary of the ANOVA Table
summary(model)
## 
## Call:
## lm(formula = LifeExp ~ PropMD + TotExp + ProdMDxTotExp, data = df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.320  -4.132   2.098   6.540  13.074 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.277e+01  7.956e-01  78.899  < 2e-16 ***
## PropMD         1.497e+03  2.788e+02   5.371 2.32e-07 ***
## TotExp         7.233e-05  8.982e-06   8.053 9.39e-14 ***
## ProdMDxTotExp -6.026e-03  1.472e-03  -4.093 6.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.765 on 186 degrees of freedom
## Multiple R-squared:  0.3574, Adjusted R-squared:  0.3471 
## F-statistic: 34.49 on 3 and 186 DF,  p-value: < 2.2e-16

Create Equation for Regression Model

Statistical Diagnostics:

LifeExp <- 62.7727+1497.494*XPropMD + 7.233324e-05*XTotExp +-0.006025686 *XProdMDxTotExp

Lets see what the Residuals w.r.t. to LifeExp (Response Variable) tell us in order to decide if this is a good model or not?

# Q-Q Plot to check for normality of residuals
# linear model — distance as a function of speed from base R “cars” dataset
qqPropMD <- glm(LifeExp~ PropMD, data = input, family = gaussian)
# diagnostic plot of model
glm.diag.plots(qqPropMD)

qqTotExp <- glm(LifeExp~TotExp, data = input, family = gaussian)
# diagnostic plot of model
glm.diag.plots(qqTotExp)

qqProdMDxTotExp <- glm(LifeExp~ProdMDxTotExp, data = input, family = gaussian)
# diagnostic plot of model
glm.diag.plots(qqProdMDxTotExp)


Problem 5 - Forecasting

-Forecast LifeExp when PropMD=.03 and TotExp = 14.


XPropMD <-0.03
XTotExp <- 14
XProdMDxTotExp <- XPropMD*XTotExp 
LifeExp <- 62.7727+1497.494*XPropMD + 7.233324e-05*XTotExp +-0.006025686 *XProdMDxTotExp
LifeExp 
## [1] 107.696

Summary and Conclusion:


- The data is NOT an ideal candidate for a Multiple linear regression model; at least not these set of predictor variables.
- The scatter plots showed that there's really no good linear relationships between the predictors vs. the response variable.
- The statistical diagnostics via F-stat, T-stat and p-values revealed the signifcance of the slope and it all met the conditions for a linear regression model. 

However, all that ends there as upon further analysis, it was revealed that none of residuals are normal and for the purist, there is a lack of homegeneity on the variances as well.  This suggested that the model is poorly specified and further analysis such as variable transforms can help or simply using other better suited predictor variables.  

- The multiple regression (given the x variables) suggested Life Exp to be 108 years old!