library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
## Warning: package 'scales' was built under R version 3.3.3
who_df <- read.csv(file="hw12_who.csv", head=TRUE, sep=",", stringsAsFactors = FALSE)
colnames(who_df)
## [1] "Country" "LifeExp" "InfantSurvival" "Under5Survival" "TBFree" "PropMD" "PropRN" "PersExp" "GovtExp" "TotExp"
nrow(who_df)
## [1] 190
ncol(who_df)
## [1] 10
head(who_df, 10)
## Country LifeExp InfantSurvival Under5Survival TBFree PropMD PropRN PersExp GovtExp TotExp
## 1 Afghanistan 42 0.835 0.743 0.99769 0.000228841 0.000572294 20 92 112
## 2 Albania 71 0.985 0.983 0.99974 0.001143127 0.004614439 169 3128 3297
## 3 Algeria 71 0.967 0.962 0.99944 0.001060478 0.002091362 108 5184 5292
## 4 Andorra 82 0.997 0.996 0.99983 0.003297297 0.003500000 2589 169725 172314
## 5 Angola 41 0.846 0.740 0.99656 0.000070400 0.001146162 36 1620 1656
## 6 Antigua and Barbuda 73 0.990 0.989 0.99991 0.000142857 0.002773810 503 12543 13046
## 7 Argentina 75 0.986 0.983 0.99952 0.002780191 0.000741044 484 19170 19654
## 8 Armenia 69 0.979 0.976 0.99920 0.003698671 0.004918937 88 1856 1944
## 9 Australia 82 0.995 0.994 0.99993 0.002331953 0.009149391 3181 187616 190797
## 10 Austria 80 0.996 0.996 0.99990 0.003610904 0.006458749 3788 189354 193142
who_df1 = arrange(who_df, desc(LifeExp))
head(who_df1, 10)
## Country LifeExp InfantSurvival Under5Survival TBFree PropMD PropRN PersExp GovtExp TotExp
## 1 Japan 83 0.997 0.996 0.99971 0.002113049 0.009461544 2936 159192 162128
## 2 Andorra 82 0.997 0.996 0.99983 0.003297297 0.003500000 2589 169725 172314
## 3 Australia 82 0.995 0.994 0.99993 0.002331953 0.009149391 3181 187616 190797
## 4 Monaco 82 0.997 0.996 0.99998 0.005636364 0.014060606 6128 458700 464828
## 5 San Marino 82 0.997 0.997 0.99995 0.035129032 0.070838710 3490 278163 281653
## 6 Switzerland 82 0.996 0.995 0.99995 0.003864789 0.010617438 5694 258248 263942
## 7 Canada 81 0.995 0.994 0.99996 0.001912607 0.010044633 3430 192800 196230
## 8 France 81 0.996 0.995 0.99989 0.003379700 0.007924442 3819 234850 238669
## 9 Iceland 81 0.998 0.997 0.99997 0.003758389 0.009932886 5154 395622 400776
## 10 Israel 81 0.996 0.995 0.99994 0.003691336 0.006256828 1533 93748 95281
# Function to calculate the correlation
findCorrelation <- function(x, y) {
corr = round(cor(x, y),4)
print (paste0("Correlation = ",corr))
return (corr)
}
# Function to calculate the intercept
getIntercept <- function(m) {
int = round(m$coefficients[1], 4)
return (int)
}
# Function to calculate the slope
getSlope <- function(m) {
slp = round(m$coefficients[2], 4)
return (slp)
}
m1 = lm(LifeExp ~ TotExp, data=who_df)
summary(m1)
##
## Call:
## lm(formula = LifeExp ~ TotExp, data = who_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.764 -4.778 3.154 7.116 13.292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.475e+01 7.535e-01 85.933 < 2e-16 ***
## TotExp 6.297e-05 7.795e-06 8.079 7.71e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.371 on 188 degrees of freedom
## Multiple R-squared: 0.2577, Adjusted R-squared: 0.2537
## F-statistic: 65.26 on 1 and 188 DF, p-value: 7.714e-14
ggplot(who_df, aes(TotExp, LifeExp)) + geom_point(colour="blue", size=2) +
geom_abline(aes(slope=round(m1$coefficients[2], 4), intercept=round(m1$coefficients[1], 4))) +
labs(title = "Total Expenditures vs. Life Expetancy") +
xlab("Total Expenditures") +
ylab("Life Expectancy")
\[ \hat{LifeExp} = 64.7534 + (10^{-4} * TotExp) \]
ggplot(m1, aes(.fitted, .resid)) +
geom_point(color = "blue", size=2) +
labs(title = "Fitted Values vs Residuals") +
labs(x = "Fitted Values") +
labs(y = "Residuals")
c1 = findCorrelation(who_df$TotExp, who_df$LifeExp)
## [1] "Correlation = 0.5076"
c1
## [1] 0.5076
\(F-Statistic\) is \(65.26\) and the \(Standard\) \(Error\) is \(9.371\). The \(p-value\) is almost \(0\). The correlation is \(0.5076\) and \(R^2\) is \(0.2577\). The data points are not around the abline. This shows a week relationship between variables \(TotExp\) and \(LifeExp\).
who_df2 <- who_df
who_df2$LifeExp2 <- who_df$LifeExp^4.6
who_df2$TotExp2 <- who_df$TotExp^0.06
m2 = lm(LifeExp2 ~ TotExp2, data=who_df2)
summary(m2)
##
## Call:
## lm(formula = LifeExp2 ~ TotExp2, data = who_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -308616089 -53978977 13697187 59139231 211951764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -736527910 46817945 -15.73 <2e-16 ***
## TotExp2 620060216 27518940 22.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90490000 on 188 degrees of freedom
## Multiple R-squared: 0.7298, Adjusted R-squared: 0.7283
## F-statistic: 507.7 on 1 and 188 DF, p-value: < 2.2e-16
ggplot(who_df2, aes(TotExp2, LifeExp2)) + geom_point(colour="red", size=2) +
geom_abline(aes(slope=round(m2$coefficients[2], 4), intercept=round(m2$coefficients[1], 4))) +
labs(title = "Total Expenditures vs. Life Expetancy (Transformed)") +
xlab("Total Expenditures") +
ylab("Life Expectancy")
\[
\hat{LifeExp2} = -7.3652791\times 10^{8} + (6.2006022\times 10^{8} * TotExp2)
\]
ggplot(m2, aes(.fitted, .resid)) +
geom_point(color = "red", size=2) +
labs(title = "Fitted Values vs Residuals") +
labs(x = "Fitted Values") +
labs(y = "Residuals")
c2 = findCorrelation(who_df2$TotExp2, who_df2$LifeExp2)
## [1] "Correlation = 0.8543"
c2
## [1] 0.8543
\(F-Statistic\) is \(507.7\) and the \(Standard\) \(Error\) is \(90490000\). The \(p-value\) is again nearly \(0\). The correlation is \(0.8543\) which is much better than the previous case and \(R^2\) is \(0.7298\). In this new model, we notice that the data points are around the abline. This shows a strong relationship between transformed variables \(TotExp\) and \(LifeExp\). Therefore, we can say this transformed model is better than the previous model.
x = 1.5
y = round(m2$coefficients[1], 4) + round(m2$coefficients[2], 4) * x
le = round(y^(1/4.6), 2)
print(le)
## (Intercept)
## 63.31
x = 2.5
y = round(m2$coefficients[1], 4) + round(m2$coefficients[2], 4) * x
le = round(y^(1/4.6), 2)
print(le)
## (Intercept)
## 86.51
When \(TotExp = 1.5\), the forecast life expectancy is \(63.31\) years and when the \(TotExp = 2.5\), the life expectancy is \(86.51\) years.
m3 = lm(LifeExp ~ PropMD + TotExp + PropMD*TotExp, data = who_df)
summary(m3)
##
## Call:
## lm(formula = LifeExp ~ PropMD + TotExp + PropMD * TotExp, data = who_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.320 -4.132 2.098 6.540 13.074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.277e+01 7.956e-01 78.899 < 2e-16 ***
## PropMD 1.497e+03 2.788e+02 5.371 2.32e-07 ***
## TotExp 7.233e-05 8.982e-06 8.053 9.39e-14 ***
## PropMD:TotExp -6.026e-03 1.472e-03 -4.093 6.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.765 on 186 degrees of freedom
## Multiple R-squared: 0.3574, Adjusted R-squared: 0.3471
## F-statistic: 34.49 on 3 and 186 DF, p-value: < 2.2e-16
\[ \hat{LifeExp} = 62.7727 + (1497.494 * PropMD) + (10^{-4} * TotExp) + (-0.006 * PropMD * TotExp) \]
ggplot(m3, aes(.fitted, .resid)) +
geom_point(color = "brown", size=2) +
labs(title = "Fitted Values vs Residuals") +
labs(x = "Fitted Values") +
labs(y = "Residuals")
\(F-Statistic\) is \(34.49\) and the \(Standard\) \(Error\) is \(8.765\). The \(p-value\) is again nearly \(0\). The \(R^2\) is \(0.3574\). The model explains only 35.74% of variability. In this new model, we notice that the residuals are not normally distributed. This model is not a good model to describe the relationships between variables \(TotExp\), \(PropMd\) and \(LifeExp\).
propmd = 0.03
totexp = 14
y = round(m3$coefficients[1], 4) + (round(m3$coefficients[2], 4) * propmd) +
(round(m3$coefficients[3], 4) * totexp) + (round(m3$coefficients[4], 4) * propmd * totexp)
print(y)
## (Intercept)
## 107.6964
When \(PropMd = 0.03\) and \(TotExp = 14\), the forecast value of \(LifeExp\) is \(107.69\) years which is unrealistic because the highest life expectancy in the dataset is \(83\) years. Therefore, we conclude that this is unrealistic.