The attached who.csv dataset contains real-world data from 2008. The variables included follow.
library(knitr)
# Load the WHO dataset directly from the course repository.
# read.csv() already assumes a header row and comma-separated fields.
whodf <- read.csv(
  "https://raw.githubusercontent.com/monuchacko/cuny_msds/master/data_605/who.csv"
)

# Preview the first rows as a table, rounding numeric columns to 2 decimals.
kable(
  head(whodf),
  digits = 2,
  align = rep(c("l", "c", "r"), each = 4)
)
| Country | LifeExp | InfantSurvival | Under5Survival | TBFree | PropMD | PropRN | PersExp | GovtExp | TotExp |
|---|---|---|---|---|---|---|---|---|---|
| Afghanistan | 42 | 0.84 | 0.74 | 1 | 0 | 0 | 20 | 92 | 112 |
| Albania | 71 | 0.98 | 0.98 | 1 | 0 | 0 | 169 | 3128 | 3297 |
| Algeria | 71 | 0.97 | 0.96 | 1 | 0 | 0 | 108 | 5184 | 5292 |
| Andorra | 82 | 1.00 | 1.00 | 1 | 0 | 0 | 2589 | 169725 | 172314 |
| Angola | 41 | 0.85 | 0.74 | 1 | 0 | 0 | 36 | 1620 | 1656 |
| Antigua and Barbuda | 73 | 0.99 | 0.99 | 1 | 0 | 0 | 503 | 12543 | 13046 |
# Simple linear regression of life expectancy on total expenditure.
# The model is fitted once and reused for both the trend line and the summary.
m1 <- lm(LifeExp ~ TotExp, data = whodf)

# Scatterplot of the raw variables with the fitted line overlaid in red.
plot(
  LifeExp ~ TotExp,
  data = whodf,
  main = "LifeExp vs TotExp",
  xlab = "Pers and gov expenditures",
  ylab = "Average life expectancy"
)
abline(m1, col = "red") # fitted regression line (y ~ x)

summary(m1)
##
## Call:
## lm(formula = LifeExp ~ TotExp, data = whodf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.764 -4.778 3.154 7.116 13.292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.475e+01 7.535e-01 85.933 < 2e-16 ***
## TotExp 6.297e-05 7.795e-06 8.079 7.71e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.371 on 188 degrees of freedom
## Multiple R-squared: 0.2577, Adjusted R-squared: 0.2537
## F-statistic: 65.26 on 1 and 188 DF, p-value: 7.714e-14
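From the output above, the F-statistic is 65.26 and its p-value is close to 0, so there is a high likelihood that TotExp has a real relationship with LifeExp. Note, however, that the R-squared is only 0.2577, so the model accounts for only about a quarter of the variation in life expectancy.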
# Normal Q-Q plot of the m1 residuals, with a reference line, to check
# how closely the residuals follow a normal distribution.
m1_resid <- resid(m1)
qqnorm(m1_resid)
qqline(m1_resid)
# Build a transformed copy of the data: LifeExp raised to the 4.6 power and
# TotExp raised to the 0.6 power. All other columns are carried over as-is.
# NOTE(review): the later forecasts use transformed TotExp values of 1.5 and
# 2.5, far below the values this 0.6 exponent produces -- confirm the
# exponent is the intended one.
whodf2 <- transform(
  whodf,
  LifeExp = LifeExp^4.6,
  TotExp = TotExp^0.6
)

# Preview the transformed data.
kable(
  head(whodf2),
  digits = 2,
  align = rep(c("l", "c", "r"), each = 4)
)
| Country | LifeExp | InfantSurvival | Under5Survival | TBFree | PropMD | PropRN | PersExp | GovtExp | TotExp |
|---|---|---|---|---|---|---|---|---|---|
| Afghanistan | 29305338 | 0.84 | 0.74 | 1 | 0 | 0 | 20 | 92 | 16.96 |
| Albania | 327935478 | 0.98 | 0.98 | 1 | 0 | 0 | 169 | 3128 | 129.08 |
| Algeria | 327935478 | 0.97 | 0.96 | 1 | 0 | 0 | 108 | 5184 | 171.46 |
| Andorra | 636126841 | 1.00 | 1.00 | 1 | 0 | 0 | 2589 | 169725 | 1386.09 |
| Angola | 26230450 | 0.85 | 0.74 | 1 | 0 | 0 | 36 | 1620 | 85.40 |
| Antigua and Barbuda | 372636298 | 0.99 | 0.99 | 1 | 0 | 0 | 503 | 12543 | 294.64 |
# Linear regression on the transformed variables; fitted once and reused
# for the trend line and the summary.
m2 <- lm(LifeExp ~ TotExp, data = whodf2)

# Scatterplot of the transformed variables with the fitted line in red.
plot(
  LifeExp ~ TotExp,
  data = whodf2,
  main = "LifeExpTransformed vs TotExpTransformed",
  xlab = "Pers and gov expenditures",
  ylab = "Average life expectancy"
)
abline(m2, col = "red") # fitted regression line (y ~ x)

summary(m2)
##
## Call:
## lm(formula = LifeExp ~ TotExp, data = whodf2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -257351739 -82599957 14030425 93896945 237720335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211907647 10234512 20.70 <2e-16 ***
## TotExp 238461 15021 15.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 113800000 on 188 degrees of freedom
## Multiple R-squared: 0.5728, Adjusted R-squared: 0.5705
## F-statistic: 252 on 1 and 188 DF, p-value: < 2.2e-16
# Forecast life expectancy when the transformed total expenditure is 1.5.
# Coefficients are read from the fitted model m2 instead of being copied from
# the printed summary, so the forecast keeps full precision and stays in sync
# if the model is ever refitted.
TExp <- 1.5
m2_coef <- coef(m2)
LExp <- m2_coef[["TotExp"]] * TExp + m2_coef[["(Intercept)"]]
# m2 predicts LifeExp^4.6, so take the 4.6th root to get back to years.
round(LExp^(1 / 4.6), 1)
## [1] 64.6
# Forecast life expectancy when the transformed total expenditure is 2.5.
# Coefficients are read from the fitted model m2 instead of being copied from
# the printed summary, so the forecast keeps full precision and stays in sync
# if the model is ever refitted.
TExp <- 2.5
m2_coef <- coef(m2)
LExp <- m2_coef[["TotExp"]] * TExp + m2_coef[["(Intercept)"]]
# m2 predicts LifeExp^4.6, so take the 4.6th root to get back to years.
round(LExp^(1 / 4.6), 1)
## [1] 64.6
$$\text{LifeExp} = b_0 + b_1 \times \text{PropMD} + b_2 \times \text{TotExp} + b_3 \times \text{PropMD} \times \text{TotExp}$$
# Multiple regression of LifeExp on PropMD, TotExp and their interaction,
# fitted on the untransformed data.
# In an R formula `PropMD * TotExp` already expands to both main effects plus
# the interaction, so the explicit `PropMD + TotExp` terms are redundant but
# harmless; the formula is left as written so the Call printed by summary()
# is unchanged.
m3 <- lm(LifeExp ~ PropMD + TotExp + PropMD*TotExp, data = whodf)
summary(m3)
##
## Call:
## lm(formula = LifeExp ~ PropMD + TotExp + PropMD * TotExp, data = whodf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.320 -4.132 2.098 6.540 13.074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.277e+01 7.956e-01 78.899 < 2e-16 ***
## PropMD 1.497e+03 2.788e+02 5.371 2.32e-07 ***
## TotExp 7.233e-05 8.982e-06 8.053 9.39e-14 ***
## PropMD:TotExp -6.026e-03 1.472e-03 -4.093 6.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.765 on 186 degrees of freedom
## Multiple R-squared: 0.3574, Adjusted R-squared: 0.3471
## F-statistic: 34.49 on 3 and 186 DF, p-value: < 2.2e-16
# Normal Q-Q plot of the m3 residuals, with a reference line, to check
# how closely the residuals follow a normal distribution.
m3_resid <- resid(m3)
qqnorm(m3_resid)
qqline(m3_resid)
# Forecast life expectancy from the interaction model m3 at PropMD = 0.03 and
# TotExp = 14. Coefficients are read from the fitted model instead of being
# copied from the printed summary, so the forecast keeps full precision and
# stays in sync if the model is ever refitted.
# NOTE(review): the recorded result (about 107.7 years) is implausibly high,
# which suggests these inputs fall outside the range the model was fitted
# on -- TODO confirm against range(whodf$PropMD) and range(whodf$TotExp).
TExp <- 14
PrMD <- 0.03
m3_coef <- coef(m3)
LExp <- m3_coef[["(Intercept)"]] +
  m3_coef[["PropMD"]] * PrMD +
  m3_coef[["TotExp"]] * TExp +
  m3_coef[["PropMD:TotExp"]] * PrMD * TExp
round(LExp, 1)
## [1] 107.7