The attached who.csv dataset contains real-world data from 2008. The variables included follow. Country: name of the country LifeExp: average life expectancy for the country in years InfantSurvival: proportion of those surviving to one year or more Under5Survival: proportion of those surviving to five years or more TBFree: proportion of the population without TB. PropMD: proportion of the population who are MDs PropRN: proportion of the population who are RNs PersExp: mean personal expenditures on healthcare in US dollars at average exchange rate GovtExp: mean government expenditures per capita on healthcare, US dollars at average exchange rate TotExp: sum of personal and government expenditures. 1. Provide a scatterplot of LifeExp~TotExp, and run simple linear regression. Do not transform the variables. Provide and interpret the F statistics, R^2, standard error,and p-values only. Discuss whether the assumptions of simple linear regression met.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.0.5
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.0.5
## ResourceSelection 0.3-5 2019-07-22
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
Life<-"C:/Users/Lisa/Documents/CUNY/605/week12/who.csv"
life_use<-read.table(file=Life,header=TRUE, sep=",")
attach(life_use)
dim(life_use)
## [1] 140 10
head(life_use)
## Country LifeExp InfantSurvival Under5Survival TBFree PropMD
## 1 Afghanistan 42 0.835 0.743 0.99769 0.000228841
## 2 Albania 71 0.985 0.983 0.99974 0.001143127
## 3 Algeria 71 0.967 0.962 0.99944 0.001060478
## 4 Andorra 82 0.997 0.996 0.99983 0.003297297
## 5 Angola 41 0.846 0.740 0.99656 0.000070400
## 6 Antigua and Barbuda 73 0.990 0.989 0.99991 0.000142857
## PropRN PersExp GovtExp TotExp
## 1 0.000572294 20 92 112
## 2 0.004614439 169 3128 3297
## 3 0.002091362 108 5184 5292
## 4 0.003500000 2589 169725 172314
## 5 0.001146162 36 1620 1656
## 6 0.002773810 503 12543 13046
class(life_use)
## [1] "data.frame"
#linear regression
model <- lm(LifeExp ~ TotExp, data = life_use)
model
##
## Call:
## lm(formula = LifeExp ~ TotExp, data = life_use)
##
## Coefficients:
## (Intercept) TotExp
## 6.466e+01 6.093e-05
summary(model)
##
## Call:
## lm(formula = LifeExp ~ TotExp, data = life_use)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.669 -4.096 3.443 7.260 13.450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.466e+01 9.070e-01 71.291 < 2e-16 ***
## TotExp 6.093e-05 9.484e-06 6.424 1.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.763 on 138 degrees of freedom
## Multiple R-squared: 0.2302, Adjusted R-squared: 0.2246
## F-statistic: 41.27 on 1 and 138 DF, p-value: 1.993e-09
# SE should be 5-10x smaller than coeff
#.00006/.000009=6.6 so ok
#64.7/.9= 71.9 intercept may vary
# beta0,beta1 coeff are significant
# Residual std error=9.763
#1Q*1.5=4.1*1.5=6.15 not close to 9.76 not ok
#3Q*1.5=7.2*1.5=10.8 close to 9.76 so ok
#model accounts for 65% of variation
# F statistic gives a global testing of the model. # and p value associated. p value is significant
# There is a relationship
plot(TotExp,LifeExp)
abline(model)
plot(fitted(model),resid(model))
#residuals are NOT uniformly scattered, Not a good #fit, try to transform variables
qqnorm(resid(model))
#QQ plot is not ok, transform variables
life_use_r<-life_use %>% select(LifeExp, PropMD, TotExp) %>%
mutate (LifeExp_r=LifeExp^4.6, TotExp_r=TotExp^.06)
detach(life_use)
attach(life_use_r)
head(life_use_r)
## LifeExp PropMD TotExp LifeExp_r TotExp_r
## 1 42 0.000228841 112 29305338 1.327251
## 2 71 0.001143127 3297 327935478 1.625875
## 3 71 0.001060478 5292 327935478 1.672697
## 4 82 0.003297297 172314 636126841 2.061481
## 5 41 0.000070400 1656 26230450 1.560068
## 6 73 0.000142857 13046 372636298 1.765748
model2 <- lm(LifeExp_r ~ TotExp_r, data = life_use_r)
model2
##
## Call:
## lm(formula = LifeExp_r ~ TotExp_r, data = life_use_r)
##
## Coefficients:
## (Intercept) TotExp_r
## -720554749 609874086
summary(model2)
##
## Call:
## lm(formula = LifeExp_r ~ TotExp_r, data = life_use_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -306134143 -57556675 17946146 62597037 210341349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -720554749 56186881 -12.82 <2e-16 ***
## TotExp_r 609874086 33137634 18.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92060000 on 138 degrees of freedom
## Multiple R-squared: 0.7105, Adjusted R-squared: 0.7084
## F-statistic: 338.7 on 1 and 138 DF, p-value: < 2.2e-16
# beta0,beta1 coeff are significant
#model accounts for 71% of variation
# F statistic gives a global testing of the model. # and p value associated. p value is significant
# There is a relationship
plot(TotExp_r,LifeExp_r)
abline(model2)
plot(fitted(model2),resid(model2))
#residuals are uniformly scattered, This assumption is met
qqnorm(resid(model2))
# QQ plot is close normal distribution, assumption is close met
# may investigate some points on the left end
# here is some code that gives more diagnostic #plots
r4_2=model2$residuals #residuals
r5_2=shapiro.test(r4_2) #test for normality of residuals
r5_2
##
## Shapiro-Wilk normality test
##
## data: r4_2
## W = 0.9746, p-value = 0.01032
r6_2=bptest(model2) #Breusch-Pagan-Godfrey-Test for homoscedasticity, lmtest
r6_2
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 0.64322, df = 1, p-value = 0.4225
r7_2=dwtest(model2) #independence of #residuals #test for independence of errors
r7_2
##
## Durbin-Watson test
##
## data: model2
## DW = 1.8667, p-value = 0.2125
## alternative hypothesis: true autocorrelation is greater than 0
aic_2=AIC(model2)
aic_2
## [1] 5535.921
bic_2=BIC(model2)
bic_2
## [1] 5544.746
model.diag.metrics <- augment(model2)
head(model.diag.metrics)
## # A tibble: 6 x 8
## LifeExp_r TotExp_r .fitted .resid .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29305338. 1.33 88901280. -59595942. 0.0232 9.23e7 5.09e-3 -0.655
## 2 327935478. 1.63 271024244. 56911235. 0.00751 9.23e7 1.46e-3 0.621
## 3 327935478. 1.67 299579599. 28355879. 0.00715 9.24e7 3.44e-4 0.309
## 4 636126841. 2.06 536688795. 99438046. 0.0261 9.20e7 1.60e-2 1.09
## 5 26230450. 1.56 230890497. -204660047. 0.00898 9.07e7 2.26e-2 -2.23
## 6 372636298. 1.77 356328999. 16307298. 0.00811 9.24e7 1.29e-4 0.178
ggplot(model.diag.metrics, aes(TotExp_r,LifeExp_r)) +
geom_point() +
stat_smooth(method = lm, se = FALSE) +
geom_segment(aes(xend = TotExp_r, yend = .fitted), color = "red", size = 0.3)
## `geom_smooth()` using formula 'y ~ x'
autoplot(model2)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#the shows no pattern in residual v fitted, so ok
#normal QQ plot pretty close to normal
#scale location addresses heteroscadesicity is ok
#Leverage - is good no leverage points
# This model with transformed variables is a #better model than the first, because the #assumptions for a valid model are met as stated #above.
The equation for this SLR: LifeExp_r = -720554749 + 609875086 TotExp_r
Recall: LifeExp_r=LifeExp4.7 TotExp_r=TotExp.06
Find LifeExp: TotExp^.06 =1.5
(-720554749 + 609875086*1.5)^(1/4.6)
## [1] 63.36092
#predicted Life Exp for TotExp^.06 =1.5 above
Find LifeExp: TotExp^.06 =2.5
(-720554749 + 609875086*2.5)^(1/4.6)
## [1] 86.2861
#predicted Life Exp for TotExp^.06 =2.5 above
4a. Build the following multiple regression model and interpret the F Statistics, R^2, standard error, and p-values. How good is the model? LifeExp = b0+b1 x PropMd + b2 x TotExp +b3 x PropMD x TotExp
DONT TRANSFORM
model3 <- lm(LifeExp ~ TotExp*PropMD , data = life_use_r)
model3
##
## Call:
## lm(formula = LifeExp ~ TotExp * PropMD, data = life_use_r)
##
## Coefficients:
## (Intercept) TotExp PropMD TotExp:PropMD
## 5.925e+01 8.190e-05 4.825e+03 -1.724e-02
summary(model3)
##
## Call:
## lm(formula = LifeExp ~ TotExp * PropMD, data = life_use_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.404 -5.133 1.200 6.303 12.524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.925e+01 1.060e+00 55.899 < 2e-16 ***
## TotExp 8.190e-05 1.003e-05 8.164 1.96e-13 ***
## PropMD 4.825e+03 6.496e+02 7.428 1.10e-11 ***
## TotExp:PropMD -1.724e-02 2.477e-03 -6.962 1.30e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.293 on 136 degrees of freedom
## Multiple R-squared: 0.4526, Adjusted R-squared: 0.4405
## F-statistic: 37.48 on 3 and 136 DF, p-value: < 2.2e-16
# beta0,beta1 coeff are significant
#model accounts for 44% of variation
# F statistic gives a global testing of the model. # and p value associated. p value is significant
# There is a relationship
plot(fitted(model3),resid(model3))
#residuals are not uniformly scattered, This assumption is NOT met
qqnorm(resid(model3))
# QQ plot does NOT follows normal distribution, assumption is NOT met
# here is some code that gives more diagnostic #plots
model.diag.metrics <- augment(model3)
head(model.diag.metrics)
## # A tibble: 6 x 9
## LifeExp TotExp PropMD .fitted .resid .hat .sigma .cooksd .std.resid
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 42 112 0.000229 60.4 -18.4 0.0135 8.17 0.0170 -2.23
## 2 71 3297 0.00114 65.0 6.03 0.00841 8.31 0.00113 0.730
## 3 71 5292 0.00106 64.7 6.30 0.00828 8.31 0.00121 0.762
## 4 82 172314 0.00330 79.5 2.52 0.0302 8.32 0.000743 0.309
## 5 41 1656 0.0000704 59.7 -18.7 0.0152 8.16 0.0200 -2.28
## 6 73 13046 0.000143 61.0 12.0 0.0131 8.26 0.00709 1.46
autoplot(model3)
#the shows atypcal pattern in residual v fitted, so not ok
#normal QQ plot is not ok
#scale location addresses heteroscadesicity is not ok
#Leverage - There is a leverage point that needs further investigation, possible error
# This model with non transformed variables, another predicator and interactions term. Not a good model.
4b. Build the following multiple regression model and interpret the F Statistics, R^2, standard error, and p-values. How good is the model? LifeExp^4.6 = b0+b1 x PropMd + b2 x TotExp$.06 +b3 x PropMD x TotExp
TRANSFORM
model4 <- lm(LifeExp_r ~ TotExp_r*PropMD , data = life_use_r)
model4
##
## Call:
## lm(formula = LifeExp_r ~ TotExp_r * PropMD, data = life_use_r)
##
## Coefficients:
## (Intercept) TotExp_r PropMD TotExp_r:PropMD
## -7.115e+08 5.935e+08 7.418e+10 -3.346e+10
summary(model4)
##
## Call:
## lm(formula = LifeExp_r ~ TotExp_r * PropMD, data = life_use_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -290713042 -53057522 8076393 57590796 210311396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.115e+08 6.043e+07 -11.773 <2e-16 ***
## TotExp_r 5.935e+08 3.618e+07 16.402 <2e-16 ***
## PropMD 7.418e+10 3.481e+10 2.131 0.0349 *
## TotExp_r:PropMD -3.346e+10 1.659e+10 -2.016 0.0458 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90600000 on 136 degrees of freedom
## Multiple R-squared: 0.7237, Adjusted R-squared: 0.7176
## F-statistic: 118.7 on 3 and 136 DF, p-value: < 2.2e-16
# beta0,beta1 coeff are significant
#model accounts for 72% of variation
# F statistic gives a global testing of the model. # and p value associated. p value is significant
# There is a relationship
plot(fitted(model4),resid(model4))
#residuals are uniformly scattered, This assumption is met
qqnorm(resid(model4))
# QQ plot follows closeto normal distribution, #assumption is close to straight line
#possible to investigate points at the lower end
# here is some code that gives more diagnostic #plots
r4_4=model4$residuals #residuals
r5_4=shapiro.test(r4_4) #test for normality of residuals
r5_4
##
## Shapiro-Wilk normality test
##
## data: r4_4
## W = 0.98054, p-value = 0.04338
r6_4=bptest(model4) #Breusch-Pagan-Godfrey-Test for homoscedasticity, lmtest
r6_4
##
## studentized Breusch-Pagan test
##
## data: model4
## BP = 2.0001, df = 3, p-value = 0.5724
r7_4=dwtest(model4) #independence of #residuals #test for independence of errors
r7_4
##
## Durbin-Watson test
##
## data: model4
## DW = 1.9451, p-value = 0.3707
## alternative hypothesis: true autocorrelation is greater than 0
aic_4=AIC(model4)
aic_4
## [1] 5533.394
bic_4=BIC(model4)
bic_4
## [1] 5548.102
model.diag.metrics <- augment(model4)
head(model.diag.metrics)
## # A tibble: 6 x 9
## LifeExp_r TotExp_r PropMD .fitted .resid .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29305338. 1.33 2.29e-4 8.30e7 -5.37e7 0.0257 9.08e7 2.38e-3 -0.601
## 2 327935478. 1.63 1.14e-3 2.76e8 5.19e7 0.00851 9.08e7 7.10e-4 0.575
## 3 327935478. 1.67 1.06e-3 3.01e8 2.74e7 0.00766 9.09e7 1.78e-4 0.304
## 4 636126841. 2.06 3.30e-3 5.29e8 1.07e8 0.0272 9.05e7 1.00e-2 1.20
## 5 26230450. 1.56 7.04e-5 2.16e8 -1.90e8 0.0135 8.94e7 1.52e-2 -2.11
## 6 372636298. 1.77 1.43e-4 3.39e8 3.40e7 0.0142 9.09e7 5.16e-4 0.378
autoplot(model4)
#the shows no pattern in residual v fitted, so ok
#normal QQ plot is ok
#scale location addresses heteroscadesicity is ok
#Leverage - There is a leverage point that needs further investigation, possible error
# This model with transformed variables is a #better model than the second, however, we added a #variable and an interaction term with not much #improvement over the second model. The #assumptions for a valid model are met as stated #above.
The equation for this SLR: LifeExp_r = -7.115e+08 + (-7.115e+08TotExp_r) + (7.418e+10PopMD) + -3.346e+10(TotExp_r PopMD)
Recall: LifeExp_r=LifeExp4.7 TotExp_r=TotExp.06 PropMD=.03 and TotExp = 14
Pred=(-7.115e+08 + (-7.115e+08*14) +
(7.418e+10*.03) + -3.346e+10*(14* .03))
Pred
## [1] -22500300000
# need to take pred^(1/4.6)
#Negative value cannot take to ^power(1/4.6).