# set the working directory
# NOTE(review): this is an absolute, machine-specific path; it is kept because
# code outside this section may rely on the working directory being set here.
setwd("c:/Users/mws16/OneDrive/Desktop/OPIM-5603-Statistics in Business Analytics/Project/life-expectancy-who")
# read the data file into a data frame
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads strings as
# character by default, while the analysis below (and the recorded str() and
# summary() output) treats Country and Status as factors
led <- read.csv("led.csv", stringsAsFactors = TRUE)
# led
options(scipen = 999) # to avoid scientific notation
# check the data types of all the variables
str(led)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : Factor w/ 193 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : Factor w/ 2 levels "Developed","Developing": 2 2 2 2 2 2 2 2 2 2 ...
## $ Lifeexpectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ AdultMortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infantdeaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentageexpenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ HepatitisB : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.fivedeaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Totalexpenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness1.19years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness5.9years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Incomecompositionofresources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# check the distributions: print a per-column summary of the raw data
# (the recorded output below also lists the NA count of each column that has any)
summary(led)
## Country Year Status Lifeexpectancy
## Afghanistan : 16 Min. :2000 Developed : 512 Min. :36.30
## Albania : 16 1st Qu.:2004 Developing:2426 1st Qu.:63.10
## Algeria : 16 Median :2008 Median :72.10
## Angola : 16 Mean :2008 Mean :69.22
## AntiguaandBarbuda: 16 3rd Qu.:2012 3rd Qu.:75.70
## Argentina : 16 Max. :2015 Max. :89.00
## (Other) :2842 NA's :10
## AdultMortality infantdeaths Alcohol percentageexpenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913
## Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251
## 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912
## NA's :10 NA's :194
## HepatitisB Measles BMI under.fivedeaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04
## 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## NA's :553 NA's :34
## Polio Totalexpenditure Diphtheria HIV.AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## NA's :19 NA's :226 NA's :19
## GDP Population thinness1.19years
## Min. : 1.68 Min. : 34 Min. : 0.10
## 1st Qu.: 463.94 1st Qu.: 195793 1st Qu.: 1.60
## Median : 1766.95 Median : 1386542 Median : 3.30
## Mean : 7483.16 Mean : 12753375 Mean : 4.84
## 3rd Qu.: 5910.81 3rd Qu.: 7420359 3rd Qu.: 7.20
## Max. :119172.74 Max. :1293859294 Max. :27.70
## NA's :448 NA's :652 NA's :34
## thinness5.9years Incomecompositionofresources Schooling
## Min. : 0.10 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
## Median : 3.30 Median :0.6770 Median :12.30
## Mean : 4.87 Mean :0.6276 Mean :11.99
## 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
## Max. :28.60 Max. :0.9480 Max. :20.70
## NA's :34 NA's :167 NA's :163
# check for missing values: count the NA entries in every column of the raw data
colSums(is.na(led))
## Country Year
## 0 0
## Status Lifeexpectancy
## 0 10
## AdultMortality infantdeaths
## 10 0
## Alcohol percentageexpenditure
## 194 0
## HepatitisB Measles
## 553 0
## BMI under.fivedeaths
## 34 0
## Polio Totalexpenditure
## 19 226
## Diphtheria HIV.AIDS
## 19 0
## GDP Population
## 448 652
## thinness1.19years thinness5.9years
## 34 34
## Incomecompositionofresources Schooling
## 167 163
# keep only the rows that have no missing value in any column
led_clean <- led[complete.cases(led), ]
# View(led_clean)
# confirm that the cleaned data has no NA left in any column
colSums(is.na(led_clean))
## Country Year
## 0 0
## Status Lifeexpectancy
## 0 0
## AdultMortality infantdeaths
## 0 0
## Alcohol percentageexpenditure
## 0 0
## HepatitisB Measles
## 0 0
## BMI under.fivedeaths
## 0 0
## Polio Totalexpenditure
## 0 0
## Diphtheria HIV.AIDS
## 0 0
## GDP Population
## 0 0
## thinness1.19years thinness5.9years
## 0 0
## Incomecompositionofresources Schooling
## 0 0
# recheck the distributions: per-column summary of the data after the
# incomplete rows were removed
summary(led_clean)
## Country Year Status Lifeexpectancy
## Afghanistan: 16 Min. :2000 Developed : 242 Min. :44.0
## Albania : 16 1st Qu.:2005 Developing:1407 1st Qu.:64.4
## Armenia : 15 Median :2008 Median :71.7
## Austria : 15 Mean :2008 Mean :69.3
## Belarus : 15 3rd Qu.:2011 3rd Qu.:75.0
## Belgium : 15 Max. :2015 Max. :89.0
## (Other) :1557
## AdultMortality infantdeaths Alcohol percentageexpenditure
## Min. : 1.0 Min. : 0.00 Min. : 0.010 Min. : 0.00
## 1st Qu.: 77.0 1st Qu.: 1.00 1st Qu.: 0.810 1st Qu.: 37.44
## Median :148.0 Median : 3.00 Median : 3.790 Median : 145.10
## Mean :168.2 Mean : 32.55 Mean : 4.533 Mean : 698.97
## 3rd Qu.:227.0 3rd Qu.: 22.00 3rd Qu.: 7.340 3rd Qu.: 509.39
## Max. :723.0 Max. :1600.00 Max. :17.870 Max. :18961.35
##
## HepatitisB Measles BMI under.fivedeaths
## Min. : 2.00 Min. : 0 Min. : 2.00 Min. : 0.00
## 1st Qu.:74.00 1st Qu.: 0 1st Qu.:19.50 1st Qu.: 1.00
## Median :89.00 Median : 15 Median :43.70 Median : 4.00
## Mean :79.22 Mean : 2224 Mean :38.13 Mean : 44.22
## 3rd Qu.:96.00 3rd Qu.: 373 3rd Qu.:55.80 3rd Qu.: 29.00
## Max. :99.00 Max. :131441 Max. :77.10 Max. :2100.00
##
## Polio Totalexpenditure Diphtheria HIV.AIDS
## Min. : 3.00 Min. : 0.740 Min. : 2.00 Min. : 0.100
## 1st Qu.:81.00 1st Qu.: 4.410 1st Qu.:82.00 1st Qu.: 0.100
## Median :93.00 Median : 5.840 Median :92.00 Median : 0.100
## Mean :83.56 Mean : 5.956 Mean :84.16 Mean : 1.984
## 3rd Qu.:97.00 3rd Qu.: 7.470 3rd Qu.:97.00 3rd Qu.: 0.700
## Max. :99.00 Max. :14.390 Max. :99.00 Max. :50.600
##
## GDP Population thinness1.19years
## Min. : 1.68 Min. : 34 Min. : 0.100
## 1st Qu.: 462.15 1st Qu.: 191897 1st Qu.: 1.600
## Median : 1592.57 Median : 1419631 Median : 3.000
## Mean : 5566.03 Mean : 14653626 Mean : 4.851
## 3rd Qu.: 4718.51 3rd Qu.: 7658972 3rd Qu.: 7.100
## Max. :119172.74 Max. :1293859294 Max. :27.200
##
## thinness5.9years Incomecompositionofresources Schooling
## Min. : 0.100 Min. :0.0000 Min. : 4.20
## 1st Qu.: 1.700 1st Qu.:0.5090 1st Qu.:10.30
## Median : 3.200 Median :0.6730 Median :12.30
## Mean : 4.908 Mean :0.6316 Mean :12.12
## 3rd Qu.: 7.100 3rd Qu.:0.7510 3rd Qu.:14.00
## Max. :28.200 Max. :0.9360 Max. :20.70
##
# group the data by the country and status to get average statistics for each country
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Average every numeric variable per country; Status is kept as a second
# grouping key so it survives the aggregation.
# - Year is dropped by name instead of by column position, so the step does
#   not silently pick the wrong columns if the column order changes.
# - summarise_if() on two grouping keys leaves the result grouped by Country;
#   ungroup() returns a plain tibble so later steps are not run per group.
led_grouped <- led_clean %>%
  select(-Year) %>%
  group_by(Country, Status) %>%
  summarise_if(is.numeric, mean) %>%
  ungroup()
# check the number of rows
cat("Number of countries retained (# of rows): ", length(unique(led_grouped$Country)), "\n\n")
## Number of countries retained (# of rows): 133
# recheck the distribution: per-column summary of the per-country averages
summary(led_grouped)
## Country Status Lifeexpectancy AdultMortality
## Afghanistan: 1 Developed : 19 Min. :48.42 Min. : 19.93
## Albania : 1 Developing:114 1st Qu.:62.74 1st Qu.:110.33
## Algeria : 1 Median :71.15 Median :151.27
## Angola : 1 Mean :68.62 Mean :172.84
## Argentina : 1 3rd Qu.:74.21 3rd Qu.:230.14
## Armenia : 1 Max. :83.44 Max. :547.17
## (Other) :127
## infantdeaths Alcohol percentageexpenditure
## Min. : 0.0000 Min. : 0.010 Min. : 5.046
## 1st Qu.: 0.7333 1st Qu.: 1.055 1st Qu.: 56.628
## Median : 3.2667 Median : 3.597 Median : 211.139
## Mean : 34.5899 Mean : 4.330 Mean : 683.877
## 3rd Qu.: 23.0000 3rd Qu.: 7.334 3rd Qu.: 551.083
## Max. :1268.8182 Max. :13.497 Max. :8722.748
##
## HepatitisB Measles BMI under.fivedeaths
## Min. : 2.00 Min. : 0.00 Min. :14.44 Min. : 0.00
## 1st Qu.:65.92 1st Qu.: 9.25 1st Qu.:19.30 1st Qu.: 1.00
## Median :79.20 Median : 168.40 Median :41.42 Median : 4.00
## Mean :76.93 Mean : 2140.89 Mean :36.95 Mean : 47.60
## 3rd Qu.:91.47 3rd Qu.: 1355.80 3rd Qu.:49.75 3rd Qu.: 33.64
## Max. :98.87 Max. :67424.40 Max. :68.89 Max. :1681.82
##
## Polio Totalexpenditure Diphtheria HIV.AIDS
## Min. :24.00 Min. : 1.070 Min. : 2.00 Min. : 0.1000
## 1st Qu.:73.36 1st Qu.: 4.705 1st Qu.:72.50 1st Qu.: 0.1000
## Median :85.80 Median : 5.959 Median :87.25 Median : 0.1067
## Mean :81.81 Mean : 5.945 Mean :82.21 Mean : 1.9974
## 3rd Qu.:95.20 3rd Qu.: 7.103 3rd Qu.:95.27 3rd Qu.: 1.2000
## Max. :98.60 Max. :11.850 Max. :98.93 Max. :34.6667
##
## GDP Population thinness1.19years thinness5.9years
## Min. : 82.19 Min. : 279 Min. : 0.100 Min. : 0.100
## 1st Qu.: 727.21 1st Qu.: 1063396 1st Qu.: 1.820 1st Qu.: 1.887
## Median : 2119.73 Median : 3674746 Median : 3.793 Median : 3.823
## Mean : 5536.09 Mean : 14679700 Mean : 5.091 Mean : 5.122
## 3rd Qu.: 5036.08 3rd Qu.: 12049556 3rd Qu.: 7.507 3rd Qu.: 7.454
## Max. :56727.49 Max. :594387175 Max. :27.000 Max. :27.818
##
## Incomecompositionofresources Schooling
## Min. :0.1559 Min. : 4.867
## 1st Qu.:0.4850 1st Qu.:10.123
## Median :0.6485 Median :12.140
## Mean :0.6206 Mean :11.894
## 3rd Qu.:0.7419 3rd Qu.:13.771
## Max. :0.9193 Max. :19.986
##
# recheck the data types; the tibble is converted to a plain data frame only
# for this str() display, led_grouped itself is not modified
str(as.data.frame(led_grouped))
## 'data.frame': 133 obs. of 21 variables:
## $ Country : Factor w/ 193 levels "Afghanistan",..: 1 2 3 4 6 7 8 9 10 13 ...
## $ Status : Factor w/ 2 levels "Developed","Developing": 2 2 2 2 2 2 1 1 2 2 ...
## $ Lifeexpectancy : num 58.2 75.2 74.2 50.7 75.2 ...
## $ AdultMortality : num 269.1 45.1 102.8 362.8 100.4 ...
## $ infantdeaths : num 78.25 0.688 20.273 76.625 10 ...
## $ Alcohol : num 0.0144 4.8487 0.4473 7.62 8.0046 ...
## $ percentageexpenditure : num 35 193 300 174 838 ...
## $ HepatitisB : num 64.6 98 76.5 71 80.3 ...
## $ Measles : num 2362.2 53.4 647 3925.6 2 ...
## $ BMI : num 15.5 49.1 48.9 18.4 54.5 ...
## $ under.fivedeaths : num 107.562 0.938 23.364 118.75 11.231 ...
## $ Polio : num 48.4 98.1 93.2 70.9 94.5 ...
## $ Totalexpenditure : num 8.25 5.95 4.95 3.65 6.55 ...
## $ Diphtheria : num 52.3 98.1 93.4 64 93.7 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 2.45 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 340 2120 3261 2936 6933 ...
## $ Population : num 9972260 696912 24124739 10107848 20847454 ...
## $ thinness1.19years : num 16.58 1.62 6 9.04 1.06 ...
## $ thinness5.9years : num 15.581 1.7 5.891 8.925 0.931 ...
## $ Incomecompositionofresources: num 0.415 0.71 0.707 0.493 0.796 ...
## $ Schooling : num 8.21 12.14 13.16 9.47 16.63 ...
# View(led_grouped)
# report which columns of the grouped data are categorical and which are numeric
cat("In the data, there are two categorical variables (Factors), Country and Status. All the remaining variables are numeric.\n")
## In the data, there are two categorical variables (Factors), Country and Status. All the remaining variables are numeric.
# attach the dataframe to plots
# NOTE(review): attach() is not needed by any code visible here (every column
# is referenced as led_grouped$...), and the attached copy does not pick up
# columns added later. It is kept only because code outside this section may
# rely on it; consider removing it.
attach(led_grouped)
# two panels side by side: histogram and kernel density of the target
par(mfrow = c(1, 2))
# target variable
# histogram
hist(led_grouped$Lifeexpectancy,
  main = "Life Expectancy Distribution",
  xlab = "Life Expectancy (yrs)"
)
# kernel density plot with a vertical indication of location of the mean
plot(density(led_grouped$Lifeexpectancy),
  main = "Distribution of Life Expectancy",
  xlab = "Life Expectancy (yrs)"
)
abline(v = mean(led_grouped$Lifeexpectancy))
cat("The target variable is Life Expectancy. As it can be seen in the kernel density plot and the histogram, the target variable is not distributed perfectly normally, it is a little left-skewed. \nThe unit of Life Expectancy is number of years")
## The target variable is Life Expectancy. As it can be seen in the kernel density plot and the histogram, the target variable is not distributed perfectly normally, it is a little left-skewed.
## The unit of Life Expectancy is number of years
# attach(led_grouped)
# Checking distributions of a few predictor variables
# variable adultmortality
summary(led_grouped$AdultMortality) # distribution indicators
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.93 110.33 151.27 172.84 230.14 547.17
par(mfrow = c(2, 2))
# kernel density plot of the raw values
plot(density(led_grouped$AdultMortality),
  main = "Distribution of Adult Mortality / 1000",
  xlab = "Adult Mortality(cnt)"
)
# box plot of the raw values
boxplot(led_grouped$AdultMortality,
  main = "Adult Mortality"
)
# square-root transform to reduce the right skew; the title and axis label
# say so explicitly so this panel is not mistaken for the raw distribution
plot(density(led_grouped$AdultMortality^0.5),
  main = "Distribution of Adult Mortality / 1000 (square root)",
  xlab = "sqrt(Adult Mortality)"
)
cat("The predictor variable Adult Mortality is not normally distributed. In fact, it is right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries being extremely underdeveloped with high population, while some countries having a very low population. Thus, they cannot be eliminated.")
## The predictor variable Adult Mortality is not normally distributed. In fact, it is right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries being extremely underdeveloped with high population, while some countries having a very low population. Thus, they cannot be eliminated.
# variable infantdeaths
summary(led_grouped$infantdeaths) # distribution indicators
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.7333 3.2667 34.5899 23.0000 1268.8182
par(mfrow = c(2, 2))
# kernel density plot of the raw values
plot(density(led_grouped$infantdeaths),
  main = "Distribution of Infant deaths / 1000",
  xlab = "Infant deaths(cnt)"
)
# box plot of the raw values
boxplot(led_grouped$infantdeaths,
  main = "Infant Deaths"
)
# square-root transform to reduce the right skew; labelled as such because
# this panel previously carried the same title and axis label as the raw plot
plot(density(led_grouped$infantdeaths^0.5),
  main = "Distribution of Infant deaths / 1000 (square root)",
  xlab = "sqrt(Infant deaths)"
)
cat("The predictor variable Infant deaths is not normally distributed. In fact, it is highly right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries being extremely underdeveloped. Thus, they cannot be eliminated.")
## The predictor variable Infant deaths is not normally distributed. In fact, it is highly right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries being extremely underdeveloped. Thus, they cannot be eliminated.
# variable alcohol
summary(led_grouped$Alcohol) # distribution indicators
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.010 1.055 3.597 4.330 7.334 13.497
par(mfrow = c(2, 2))
# kernel density plot of the raw values
plot(density(led_grouped$Alcohol),
  main = "Distribution of Alcohol consumed",
  xlab = "Alcohol(litres)"
)
# box plot of the raw values
boxplot(led_grouped$Alcohol,
  main = "Alcohol consumption"
)
# square-root transform to reduce the right skew; labelled as such because
# this panel previously carried the same title and axis label as the raw plot
plot(density(led_grouped$Alcohol^0.5),
  main = "Distribution of Alcohol consumed (square root)",
  xlab = "sqrt(Alcohol)"
)
cat("The predictor variable Alcohol is not normally distributed. In fact, it is a little right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries being having high population, whereas some countries having a very low population. Thus, they cannot be eliminated.")
## The predictor variable Alcohol is not normally distributed. In fact, it is a little right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries being having high population, whereas some countries having a very low population. Thus, they cannot be eliminated.
# variable percentageexpenditure
summary(led_grouped$percentageexpenditure) # quartiles, mean and range
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.046 56.628 211.139 683.877 551.083 8722.748
# two panels side by side: kernel density and box plot of the raw values
par(mfrow = c(1, 2))
plot(
  density(led_grouped$percentageexpenditure),
  xlab = "Percentage expenditure(%)",
  main = "% Expenditure on health"
)
boxplot(
  led_grouped$percentageexpenditure,
  main = "Percentage expenditure"
)
cat("The predictor variable Percentage expenditure is not normally distributed. In fact, it is heavily right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries having a very low GDP and being highly populated. Thus, they cannot be eliminated.")
## The predictor variable Percentage expenditure is not normally distributed. In fact, it is heavily right-skewed. Hence, we checked for any outliers, but looking at them it is clear that these outliers are not due to any data error, but just abnormal values due to some countries having a very low GDP and being highly populated. Thus, they cannot be eliminated.
library(psych)
# Scatterplot matrices of the target variable (column 3, Lifeexpectancy)
# against the predictors, a handful at a time: columns 4-8, 9-13, 14-18 and
# finally 19-21. Each group is drawn twice, first with Pearson and then with
# Spearman correlation, using the same display options throughout.
panel_columns <- list(3:8, c(3, 9:13), c(3, 14:18), c(3, 19:21))
for (cols in panel_columns) {
  for (corr_method in c("pearson", "spearman")) {
    pairs.panels(led_grouped[, cols],
      method = corr_method, # correlation method
      hist.col = "green",
      density = TRUE, # show density plots
      ellipses = TRUE # show correlation ellipses
    )
  }
}
# print the written interpretation of the Pearson/Spearman scatterplot matrices
cat("\n\nIt can be seen in the scatterplot matrices that:\n1) The target variable Life Expectancy is strongly correlated to AdultMortality, BMI, Income composition of resources and Schooling directly as indicated by both the Pearson and Spearman correlation.\n2) Variables percentageExpenditure, HIV.AIDS, GDP, thinness1.19years and thinness5.9years have a moderate Pearson correlation, whereas a stronger Spearman correlation with the target variable. This indicates that these variables might not have a purely linear correlation with the target.\n3) The variables Alcohol, Polio and Diphtheria have a moderate correlation with Life Expectancy, according to both Pearson and Spearman correlation.\n4) Infant deaths and under 5 deaths have a weak Pearson correlation with the target variable whereas a moderately strong Spearman correlation with Life Expectancy.\n5) The remaining variables, namely Hepatitis B, Measles, TotalExpenditure and Population have a very weak correlation with the target variable according to both the correlation indices.")
##
##
## It can be seen in the scatterplot matrices that:
## 1) The target variable Life Expectancy is strongly correlated to AdultMortality, BMI, Income composition of resources and Schooling directly as indicated by both the Pearson and Spearman correlation.
## 2) Variables percentageExpenditure, HIV.AIDS, GDP, thinness1.19years and thinness5.9years have a moderate Pearson correlation, whereas a stronger Spearman correlation with the target variable. This indicates that these variables might not have a purely linear correlation with the target.
## 3) The variables Alcohol, Polio and Diphtheria have a moderate correlation with Life Expectancy, according to both Pearson and Spearman correlation.
## 4) Infant deaths and under 5 deaths have a weak Pearson correlation with the target variable whereas a moderately strong Spearman correlation with Life Expectancy.
## 5) The remaining variables, namely Hepatitis B, Measles, TotalExpenditure and Population have a very weak correlation with the target variable according to both the correlation indices.
# introduce the three individual scatter plots that follow
cat("\nWe have plotted three of the predictor variables to show how the variables relate with the target variable overall.\n")
##
## We have plotted three of the predictor variables to show how the variables relate with the target variable overall.
par(mfrow = c(2, 2))
# life expectancy vs. income composition - positively correlated
plot(
  y = led_grouped$Lifeexpectancy,
  x = led_grouped$Incomecompositionofresources,
  main = "Life Expectancy vs. Income compositions",
  xlab = "Income composition of resources",
  ylab = "Life Expectancy",
  pch = 19,
  col = "green"
)
# Least-squares fit of the plotted data. A fixed line with intercept 60 and
# slope 1 says nothing about correlation: its slope depends on the axis units,
# and the x values here are fractions below 1.
abline(lm(Lifeexpectancy ~ Incomecompositionofresources, data = led_grouped),
  col = "red"
)
cat("It can be seen that Life Expectancy and Income composition are positively correlated. The red line in the plot is the least-squares regression line. The points scatter around it, thus it is clear that the correlation is less than 1.")
## It can be seen that Life Expectancy and Income composition are positively correlated. The red line in the plot is the least-squares regression line. The points scatter around it, thus it is clear that the correlation is less than 1.
# life expectancy vs. adult mortality - negatively correlated
plot(
  y = led_grouped$Lifeexpectancy,
  x = led_grouped$AdultMortality,
  main = "Life Expectancy vs. Adult Mortality",
  xlab = "Adult Mortality",
  ylab = "Life Expectancy",
  pch = 19,
  col = "light green"
)
# Least-squares fit of the plotted data. A fixed line with intercept 80 and
# slope -1 says nothing about correlation, because its slope depends on the
# axis units rather than on how tightly the points follow a line.
abline(lm(Lifeexpectancy ~ AdultMortality, data = led_grouped),
  col = "red"
)
cat("It can be seen that Life Expectancy and Adult Mortality are negatively correlated. The red line in the plot is the least-squares regression line. The points scatter around it, thus it is clear that the correlation is not perfectly -1.")
## It can be seen that Life Expectancy and Adult Mortality are negatively correlated. The red line in the plot is the least-squares regression line. The points scatter around it, thus it is clear that the correlation is not perfectly -1.
# life expectancy vs population - no correlation
plot(
  y = led_grouped$Lifeexpectancy,
  x = led_grouped$Population,
  main = "Life Expectancy vs. Population",
  xlab = "Population",
  ylab = "Life Expectancy",
  pch = 19,
  col = "orange"
)
# Least-squares fit of the plotted data. A fixed line with intercept 50 and
# slope 1 leaves the plotting area almost immediately on a population-sized
# x axis and says nothing about correlation.
abline(lm(Lifeexpectancy ~ Population, data = led_grouped),
  col = "red"
)
cat("It can be seen that Life Expectancy and Population are not really correlated. The red line in the plot is the least-squares regression line. The points show no clear trend along it, thus the correlation is negligible.")
## It can be seen that Life Expectancy and Population are not really correlated. The red line in the plot is the least-squares regression line. The points show no clear trend along it, thus the correlation is negligible.
# It would be especially interesting to use your target variable here (just recode it if it's numeric).
# creating a dummy variable for GDP: "high GDP" when a country's average GDP
# is above the mean over all countries, "low GDP" otherwise.
# The column is built in a single assignment; filling a column that does not
# exist yet piece by piece made the tibble warn about an
# "Unknown or uninitialised column".
led_grouped$dummygdp <- ifelse(
  led_grouped$GDP > mean(led_grouped$GDP),
  "high GDP",
  "low GDP"
)
# creating table to check the totals of categories in the column GDP with respect to Status
tableGDP_Status <- table(led_grouped$Status, led_grouped$dummygdp)
# creating probability table to get joint probabilities
pt1 <- prop.table(tableGDP_Status)
addmargins(pt1)
##
## high GDP low GDP Sum
## Developed 0.12781955 0.01503759 0.14285714
## Developing 0.11278195 0.74436090 0.85714286
## Sum 0.24060150 0.75939850 1.00000000
# creating a dummy variable for target variable-Life Expectancy:
# "high life expectancy" when a country's average is above the mean over all
# countries, "low life expectancy" otherwise.
# The column is built in a single assignment; filling a column that does not
# exist yet piece by piece made the tibble warn about an
# "Unknown or uninitialised column".
led_grouped$dummyLifeExpectancy <- ifelse(
  led_grouped$Lifeexpectancy > mean(led_grouped$Lifeexpectancy),
  "high life expectancy",
  "low life expectancy"
)
# creating table to check the totals of categories in the column Life Expectancy with respect to GDP
tableLE_GDP <- table(led_grouped$dummyLifeExpectancy, led_grouped$dummygdp)
# creating probability table to get joint probabilities
pt2 <- prop.table(tableLE_GDP)
addmargins(pt2)
##
## high GDP low GDP Sum
## high life expectancy 0.22556391 0.33082707 0.55639098
## low life expectancy 0.01503759 0.42857143 0.44360902
## Sum 0.24060150 0.75939850 1.00000000
# cross-tabulate the life-expectancy category (rows) against Status (columns)
tableLE_Status <- table(
  led_grouped$dummyLifeExpectancy,
  led_grouped$Status
)
# turn the counts into joint probabilities (each cell divided by the grand total)
pt3 <- prop.table(tableLE_Status)
# show the joint probabilities together with the row and column sums
addmargins(pt3)
##
## Developed Developing Sum
## high life expectancy 0.1428571 0.4135338 0.5563910
## low life expectancy 0.0000000 0.4436090 0.4436090
## Sum 0.1428571 0.8571429 1.0000000
# chi-square test on table of LifeExpectancy and dummygdp
# (correct = FALSE switches off the continuity correction; FALSE is spelled
# out because F is an ordinary variable that can be reassigned)
chisq.test(led_grouped$dummyLifeExpectancy, led_grouped$dummygdp, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: led_grouped$dummyLifeExpectancy and led_grouped$dummygdp
## X-squared = 24.797, df = 1, p-value = 0.000000637
# the p-value is compared against the 0.05 significance level (the message
# previously said 0.5)
cat("The p-value here is 0.000000637 which is much less than 0.05. Hence it can be said that there is a strong relation between LifeExpectancy and GDP. Hence it can be said that GDP is statistically significant in predicting Life Expectancy.\n")
## The p-value here is 0.000000637 which is much less than 0.05. Hence it can be said that there is a strong relation between LifeExpectancy and GDP. Hence it can be said that GDP is statistically significant in predicting Life Expectancy.
# chi-square test on table of LifeExpectancy and status
# (correct = FALSE switches off the continuity correction; FALSE is spelled
# out because F is an ordinary variable that can be reassigned)
chisq.test(led_grouped$dummyLifeExpectancy, led_grouped$Status, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: led_grouped$dummyLifeExpectancy and led_grouped$Status
## X-squared = 17.673, df = 1, p-value = 0.00002623
# the p-value is compared against the 0.05 significance level (the message
# previously said 0.5)
cat("The p-value here is 0.00002623 which is much less than 0.05. Hence it can be said that there is a strong relation between LifeExpectancy and Status. Hence it can be said that Status is statistically significant in predicting Life Expectancy.")
## The p-value here is 0.00002623 which is much less than 0.05. Hence it can be said that there is a strong relation between LifeExpectancy and Status. Hence it can be said that Status is statistically significant in predicting Life Expectancy.
# Manual chi-square test of independence, to cross-check chisq.test().
# H0 - LifeExpectancy is unrelated to Status of the Country
# H1 - LifeExpectancy is related to Status of the Country
ct1 <- tableLE_Status
addmargins(ct1)
##
## Developed Developing Sum
## high life expectancy 19 55 74
## low life expectancy 0 59 59
## Sum 19 114 133
# Observed counts, read row by row from the table instead of being typed in
# by hand, so they cannot drift out of sync with the data.
observedCount1 <- as.vector(t(ct1))
# expected count = RowTotal * ColumnTotal / Total n, for every cell at once
expectedMatrix1 <- outer(rowSums(ct1), colSums(ct1)) / sum(ct1)
expectedCount1 <- as.vector(t(expectedMatrix1))
# the individual cell values keep their previous names (row-by-row order)
expectedCount_hdd <- expectedCount1[1]
expectedCount_hdg <- expectedCount1[2]
expectedCount_ldd <- expectedCount1[3]
expectedCount_ldg <- expectedCount1[4]
csStatistic1 <- sum((observedCount1 - expectedCount1)^2 / expectedCount1)
# Degree of freedom = (rows - 1) * (columns - 1), which is 1 for a 2 x 2 table
degreesOfFreedom1 <- (nrow(ct1) - 1) * (ncol(ct1) - 1)
# upper-tail probability of the chi-square distribution
pValue1 <- pchisq(csStatistic1, degreesOfFreedom1, lower.tail = FALSE)
# pValue < 0.05 - We reject H0
if (pValue1 < 0.05) {
  cat("\nLife Expectancy has a strong relation with Status of a Country!\nThis matches with the result we obtained using the chisq.test()\n")
} else {
  cat("\nLife Expectancy does not have a strong relation with Status of a Country!\nThis does not match with the result we obtained using the chisq.test()\n")
}
##
## Life Expectancy has a strong relation with Status of a Country!
## This matches with the result we obtained using the chisq.test()
# Manual chi-square test of independence between the life-expectancy category
# and the GDP category, to cross-check chisq.test().
ct2 <- tableLE_GDP
addmargins(ct2)
##
## high GDP low GDP Sum
## high life expectancy 30 44 74
## low life expectancy 2 57 59
## Sum 32 101 133
# Observed counts, read row by row from the table instead of being typed in
# by hand, so they cannot drift out of sync with the data.
observedCount2 <- as.vector(t(ct2))
# expected count = RowTotal * ColumnTotal / Total n, for every cell at once
expectedMatrix2 <- outer(rowSums(ct2), colSums(ct2)) / sum(ct2)
expectedCount2 <- as.vector(t(expectedMatrix2))
# the individual cell values keep their previous names (row-by-row order)
expectedCount_hh <- expectedCount2[1]
expectedCount_hl <- expectedCount2[2]
expectedCount_lh <- expectedCount2[3]
expectedCount_ll <- expectedCount2[4]
csStatistic2 <- sum((observedCount2 - expectedCount2)^2 / expectedCount2)
# Degree of freedom = (rows - 1) * (columns - 1), which is 1 for a 2 x 2 table
degreesOfFreedom2 <- (nrow(ct2) - 1) * (ncol(ct2) - 1)
# upper-tail probability of the chi-square distribution
pValue2 <- pchisq(csStatistic2, degreesOfFreedom2, lower.tail = FALSE)
# pValue < 0.05 - We reject H0
# (this decision must use pValue2; it previously re-tested pValue1 from the
# Status test, so the GDP result was never actually checked)
if (pValue2 < 0.05) {
  cat("\nLife Expectancy has a strong relation with GDP of a Country!\nThis matches with the result we obtained using the chisq.test()\n")
} else {
  cat("\nLife Expectancy does not have a strong relation with GDP of a Country!\nThis does not match with the result we obtained using the chisq.test()\n")
}
##
## Life Expectancy has a strong relation with GDP of a Country!
## This matches with the result we obtained using the chisq.test()
# print the written summary of the data-preparation steps
# NOTE(review): the recorded summaries show 2938 rows before and 1649
# (242 + 1407) after the complete-case filter; confirm what the "15%" figure
# in this text refers to.
# NOTE(review): the text mentions both "a span of 16 years" (2000 to 2015) and
# "over 15 years"; confirm which is intended.
cat("To prepare the data for analysis, we excluded(deleted) the rows with missing values as out of all, 10 were missing values in the target variable, and the total number of missing values constituted only 15% of the data set. \nAlso, originally our dataset was a set of records for 193 countries over a span of 16 years. It was the Life Expectancy records of countries from year 2000 to 2015. We wanted to group the data according to country for predicting an average Life expectancy for every country. However, most of the missing values belonged to the same group of countries, thus eliminating these missing values eliminated the countries with inadequate data. \nFinally, we have complete data of 133 countries which we have grouped according to the country and aggregated to the mean of every variable over 15 years.\n")
## To prepare the data for analysis, we excluded(deleted) the rows with missing values as out of all, 10 were missing values in the target variable, and the total number of missing values constituted only 15% of the data set.
## Also, originally our dataset was a set of records for 193 countries over a span of 16 years. It was the Life Expectancy records of countries from year 2000 to 2015. We wanted to group the data according to country for predicting an average Life expectancy for every country. However, most of the missing values belonged to the same group of countries, thus eliminating these missing values eliminated the countries with inadequate data.
## Finally, we have complete data of 133 countries which we have grouped according to the country and aggregated to the mean of every variable over 15 years.
# explain why the predictors are rescaled before modelling
cat("As it can be seen in the data, the variables have a variety of scales, with population lying in millions to Alcohol lying in tens. Hence, we need to normalize the data.")
## As it can be seen in the data, the variables have a variety of scales, with population lying in millions to Alcohol lying in tens. Hence, we need to normalize the data.
# Min-max normalization: rescale a numeric vector linearly so that its
# smallest value becomes 0 and its largest value becomes 1.
#
# x      numeric vector to rescale.
# na.rm  if TRUE, NAs are ignored when finding the range and stay NA in the
#        result. The default FALSE keeps the previous behaviour, where a
#        single NA makes the whole result NA.
#
# Returns a numeric vector of the same length as x.
# - A vector whose values are all equal is mapped to zeros; dividing by the
#   zero range used to give NaN for every element.
# - An empty vector gives an empty numeric vector without the min()/max()
#   warnings.
min_max_norm <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # constant input: every value equals the minimum
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Build the modelling data set in one step: columns 1 to 3 of led_grouped
# (the categorical columns and the target) are kept as they are, and columns
# 4 to 21 (the numerical predictors) are each rescaled with min_max_norm.
led_normal <- cbind.data.frame(
  led_grouped[1:3],
  as.data.frame(lapply(X = led_grouped[4:21], FUN = min_max_norm))
)
# View(led_normal)
# Report narrative: prints the modelling choice (linear regression, because
# the target variable is numeric).
cat("The target variable is LifeExpectancy which is a numeric variable. Hence, we will be using Linear Regression and build a linear regression model to predict the life expectancy for a country.")
## The target variable is LifeExpectancy which is a numeric variable. Hence, we will be using Linear Regression and build a linear regression model to predict the life expectancy for a country.
# Full model: regress Lifeexpectancy on every column of led_normal except
# Country. Country is left out because it only labels the rows (the data are
# aggregated per country), so it carries no generalizable information.
fullModel <- lm(formula = Lifeexpectancy ~ . - Country, data = led_normal)
# inspect coefficients and fit statistics
summary(fullModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ . - Country, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9653 -1.4823 0.1677 1.5897 5.8006
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.7196 2.3315 25.185
## StatusDeveloping 0.2523 0.9880 0.255
## AdultMortality -19.5355 2.2958 -8.509
## infantdeaths 14.9031 45.9627 0.324
## Alcohol -1.2084 1.4623 -0.826
## percentageexpenditure 4.5966 4.2042 1.093
## HepatitisB -0.6402 2.3585 -0.271
## Measles 3.1787 3.7936 0.838
## BMI 5.3145 1.6956 3.134
## under.fivedeaths -22.7974 39.8954 -0.571
## Polio -2.8431 2.9474 -0.965
## Totalexpenditure 0.1715 1.6193 0.106
## Diphtheria 10.5015 3.9026 2.691
## HIV.AIDS -8.8120 2.2692 -3.883
## GDP 1.6674 4.7548 0.351
## Population 3.8322 7.9135 0.484
## thinness1.19years 1.5632 6.8102 0.230
## thinness5.9years 0.5989 7.0202 0.085
## Incomecompositionofresources 7.7276 2.4092 3.208
## Schooling 5.1190 2.8931 1.769
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.798925
## AdultMortality 0.0000000000000837 ***
## infantdeaths 0.746353
## Alcohol 0.410339
## percentageexpenditure 0.276574
## HepatitisB 0.786546
## Measles 0.403850
## BMI 0.002195 **
## under.fivedeaths 0.568844
## Polio 0.336799
## Totalexpenditure 0.915851
## Diphtheria 0.008207 **
## HIV.AIDS 0.000174 ***
## GDP 0.726483
## Population 0.629136
## thinness1.19years 0.818867
## thinness5.9years 0.932161
## Incomecompositionofresources 0.001742 **
## Schooling 0.079523 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.581 on 113 degrees of freedom
## Multiple R-squared: 0.9273, Adjusted R-squared: 0.915
## F-statistic: 75.83 on 19 and 113 DF, p-value: < 0.00000000000000022
# In-sample fitted values of the full model (predicted on the same data it
# was trained on).
fullModelPreds <- predict(object = fullModel, newdata = led_normal, type = "response")
# Baseline: intercept-only model, i.e. no predictors at all. It serves as
# the starting point for the forward and mixed stepwise searches.
nullModel <- lm(formula = Lifeexpectancy ~ 1, data = led_normal)
# inspect the baseline fit
summary(nullModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ 1, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.197 -5.884 2.525 5.588 14.818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.6216 0.7677 89.38 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.854 on 132 degrees of freedom
# Save the in-sample predictions of the null (intercept-only) model.
# Fix: this previously called predict() on fullModel, so nullModelPreds was
# just a copy of fullModelPreds instead of the baseline predictions.
nullModelPreds <- predict(nullModel, # baseline model
newdata = led_normal, # dataset
type = "response") # to get predicted values
# Backward stepwise selection: start from the full model and drop terms one
# at a time. step() prints the AIC table of every step as it goes.
cat("\nBackward Step Model:\n")
##
## Backward Step Model:
backwardStepModel <- step(object = fullModel, direction = "backward")
## Start: AIC=270.51
## Lifeexpectancy ~ (Country + Status + AdultMortality + infantdeaths +
## Alcohol + percentageexpenditure + HepatitisB + Measles +
## BMI + under.fivedeaths + Polio + Totalexpenditure + Diphtheria +
## HIV.AIDS + GDP + Population + thinness1.19years + thinness5.9years +
## Incomecompositionofresources + Schooling) - Country
##
## Df Sum of Sq RSS AIC
## - thinness5.9years 1 0.05 752.60 268.51
## - Totalexpenditure 1 0.07 752.63 268.52
## - thinness1.19years 1 0.35 752.90 268.57
## - Status 1 0.43 752.99 268.58
## - HepatitisB 1 0.49 753.04 268.59
## - infantdeaths 1 0.70 753.25 268.63
## - GDP 1 0.82 753.37 268.65
## - Population 1 1.56 754.11 268.78
## - under.fivedeaths 1 2.17 754.73 268.89
## - Alcohol 1 4.55 757.10 269.31
## - Measles 1 4.68 757.23 269.33
## - Polio 1 6.20 758.75 269.60
## - percentageexpenditure 1 7.96 760.51 269.90
## <none> 752.55 270.51
## - Schooling 1 20.85 773.40 272.14
## - Diphtheria 1 48.22 800.78 276.77
## - BMI 1 65.42 817.98 279.59
## - Incomecompositionofresources 1 68.52 821.07 280.09
## - HIV.AIDS 1 100.43 852.98 285.17
## - AdultMortality 1 482.21 1234.76 334.36
##
## Step: AIC=268.51
## Lifeexpectancy ~ Status + AdultMortality + infantdeaths + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + under.fivedeaths +
## Polio + Totalexpenditure + Diphtheria + HIV.AIDS + GDP +
## Population + thinness1.19years + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## - Totalexpenditure 1 0.07 752.67 266.53
## - Status 1 0.44 753.04 266.59
## - HepatitisB 1 0.52 753.12 266.60
## - infantdeaths 1 0.72 753.32 266.64
## - GDP 1 0.79 753.39 266.65
## - Population 1 1.64 754.24 266.80
## - under.fivedeaths 1 2.22 754.82 266.91
## - thinness1.19years 1 4.26 756.86 267.26
## - Alcohol 1 4.52 757.12 267.31
## - Measles 1 4.66 757.26 267.34
## - Polio 1 6.16 758.76 267.60
## - percentageexpenditure 1 8.00 760.60 267.92
## <none> 752.60 268.51
## - Schooling 1 21.24 773.84 270.22
## - Diphtheria 1 48.21 800.81 274.77
## - BMI 1 67.28 819.88 277.90
## - Incomecompositionofresources 1 68.72 821.32 278.13
## - HIV.AIDS 1 100.73 853.33 283.22
## - AdultMortality 1 485.09 1237.69 332.68
##
## Step: AIC=266.53
## Lifeexpectancy ~ Status + AdultMortality + infantdeaths + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + under.fivedeaths +
## Polio + Diphtheria + HIV.AIDS + GDP + Population + thinness1.19years +
## Incomecompositionofresources + Schooling
##
## Df Sum of Sq RSS AIC
## - Status 1 0.41 753.08 264.60
## - HepatitisB 1 0.52 753.19 264.62
## - infantdeaths 1 0.70 753.37 264.65
## - GDP 1 0.77 753.44 264.66
## - Population 1 1.65 754.32 264.82
## - under.fivedeaths 1 2.19 754.86 264.91
## - thinness1.19years 1 4.27 756.94 265.28
## - Alcohol 1 4.48 757.14 265.31
## - Measles 1 4.68 757.35 265.35
## - Polio 1 6.23 758.90 265.62
## - percentageexpenditure 1 8.38 761.04 266.00
## <none> 752.67 266.53
## - Schooling 1 21.25 773.92 268.23
## - Diphtheria 1 49.27 801.94 272.96
## - Incomecompositionofresources 1 69.15 821.82 276.22
## - BMI 1 71.61 824.28 276.61
## - HIV.AIDS 1 103.06 855.73 281.59
## - AdultMortality 1 486.13 1238.80 330.80
##
## Step: AIC=264.6
## Lifeexpectancy ~ AdultMortality + infantdeaths + Alcohol + percentageexpenditure +
## HepatitisB + Measles + BMI + under.fivedeaths + Polio + Diphtheria +
## HIV.AIDS + GDP + Population + thinness1.19years + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## - GDP 1 0.51 753.60 262.69
## - infantdeaths 1 0.65 753.73 262.71
## - HepatitisB 1 0.77 753.85 262.73
## - Population 1 1.70 754.78 262.90
## - under.fivedeaths 1 2.10 755.19 262.97
## - thinness1.19years 1 4.21 757.29 263.34
## - Measles 1 4.83 757.91 263.45
## - Polio 1 6.14 759.23 263.68
## - Alcohol 1 6.35 759.43 263.72
## - percentageexpenditure 1 8.78 761.87 264.14
## <none> 753.08 264.60
## - Schooling 1 21.20 774.29 266.29
## - Diphtheria 1 50.73 803.81 271.27
## - Incomecompositionofresources 1 70.15 823.23 274.44
## - BMI 1 71.91 824.99 274.73
## - HIV.AIDS 1 103.98 857.06 279.80
## - AdultMortality 1 494.05 1247.13 329.69
##
## Step: AIC=262.69
## Lifeexpectancy ~ AdultMortality + infantdeaths + Alcohol + percentageexpenditure +
## HepatitisB + Measles + BMI + under.fivedeaths + Polio + Diphtheria +
## HIV.AIDS + Population + thinness1.19years + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## - infantdeaths 1 0.68 754.27 260.81
## - HepatitisB 1 1.19 754.79 260.90
## - Population 1 1.55 755.15 260.96
## - under.fivedeaths 1 2.13 755.72 261.06
## - thinness1.19years 1 4.02 757.61 261.40
## - Measles 1 4.62 758.21 261.50
## - Polio 1 5.87 759.47 261.72
## - Alcohol 1 6.17 759.77 261.77
## <none> 753.60 262.69
## - Schooling 1 22.88 776.48 264.67
## - Diphtheria 1 51.62 805.22 269.50
## - percentageexpenditure 1 57.22 810.82 270.42
## - Incomecompositionofresources 1 71.42 825.01 272.73
## - BMI 1 71.43 825.03 272.73
## - HIV.AIDS 1 104.34 857.94 277.94
## - AdultMortality 1 494.71 1248.30 327.81
##
## Step: AIC=260.81
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## HepatitisB + Measles + BMI + under.fivedeaths + Polio + Diphtheria +
## HIV.AIDS + Population + thinness1.19years + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## - HepatitisB 1 1.22 755.49 259.02
## - thinness1.19years 1 4.87 759.14 259.66
## - Population 1 5.01 759.28 259.69
## - Polio 1 5.90 760.17 259.84
## - Alcohol 1 7.08 761.35 260.05
## - Measles 1 9.82 764.09 260.53
## <none> 754.27 260.81
## - under.fivedeaths 1 14.98 769.25 261.42
## - Schooling 1 22.26 776.53 262.68
## - Diphtheria 1 53.36 807.64 267.90
## - percentageexpenditure 1 57.22 811.49 268.53
## - Incomecompositionofresources 1 73.11 827.38 271.11
## - BMI 1 74.49 828.76 271.33
## - HIV.AIDS 1 103.68 857.95 275.94
## - AdultMortality 1 510.32 1264.59 327.54
##
## Step: AIC=259.02
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## Measles + BMI + under.fivedeaths + Polio + Diphtheria + HIV.AIDS +
## Population + thinness1.19years + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## - thinness1.19years 1 4.67 760.16 257.84
## - Population 1 5.72 761.21 258.03
## - Polio 1 7.09 762.58 258.27
## - Alcohol 1 7.64 763.13 258.36
## - Measles 1 10.27 765.76 258.82
## <none> 755.49 259.02
## - under.fivedeaths 1 15.06 770.55 259.65
## - Schooling 1 22.76 778.25 260.97
## - Diphtheria 1 56.14 811.64 266.56
## - percentageexpenditure 1 64.72 820.21 267.95
## - BMI 1 73.55 829.04 269.38
## - Incomecompositionofresources 1 76.00 831.49 269.77
## - HIV.AIDS 1 102.47 857.96 273.94
## - AdultMortality 1 517.38 1272.87 326.40
##
## Step: AIC=257.84
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## Measles + BMI + under.fivedeaths + Polio + Diphtheria + HIV.AIDS +
## Population + Incomecompositionofresources + Schooling
##
## Df Sum of Sq RSS AIC
## - Polio 1 5.91 766.07 256.87
## - Population 1 7.44 767.59 257.14
## - Measles 1 7.88 768.04 257.21
## - Alcohol 1 10.72 770.87 257.70
## <none> 760.16 257.84
## - under.fivedeaths 1 12.70 772.86 258.05
## - Schooling 1 25.67 785.83 260.26
## - Diphtheria 1 53.47 813.63 264.88
## - percentageexpenditure 1 62.58 822.74 266.36
## - Incomecompositionofresources 1 74.80 834.96 268.33
## - BMI 1 84.28 844.44 269.83
## - HIV.AIDS 1 101.69 861.85 272.54
## - AdultMortality 1 523.38 1283.54 325.51
##
## Step: AIC=256.87
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## Measles + BMI + under.fivedeaths + Diphtheria + HIV.AIDS +
## Population + Incomecompositionofresources + Schooling
##
## Df Sum of Sq RSS AIC
## - Population 1 6.29 772.35 255.96
## - Measles 1 7.20 773.26 256.12
## - under.fivedeaths 1 11.00 777.07 256.77
## <none> 766.07 256.87
## - Alcohol 1 11.62 777.69 256.88
## - Schooling 1 23.98 790.05 258.97
## - percentageexpenditure 1 64.92 830.99 265.69
## - Incomecompositionofresources 1 76.54 842.60 267.54
## - BMI 1 82.28 848.35 268.44
## - HIV.AIDS 1 103.19 869.26 271.68
## - Diphtheria 1 115.48 881.55 273.55
## - AdultMortality 1 517.88 1283.95 323.56
##
## Step: AIC=255.96
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## Measles + BMI + under.fivedeaths + Diphtheria + HIV.AIDS +
## Incomecompositionofresources + Schooling
##
## Df Sum of Sq RSS AIC
## - Measles 1 4.06 776.41 254.66
## - under.fivedeaths 1 5.33 777.68 254.87
## <none> 772.35 255.96
## - Alcohol 1 14.73 787.08 256.47
## - Schooling 1 27.30 799.65 258.58
## - percentageexpenditure 1 64.47 836.82 264.62
## - Incomecompositionofresources 1 77.93 850.28 266.74
## - BMI 1 84.54 856.89 267.77
## - HIV.AIDS 1 102.34 874.69 270.51
## - Diphtheria 1 127.94 900.29 274.34
## - AdultMortality 1 523.84 1296.19 322.82
##
## Step: AIC=254.66
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## BMI + under.fivedeaths + Diphtheria + HIV.AIDS + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## - under.fivedeaths 1 1.71 778.12 252.95
## <none> 776.41 254.66
## - Alcohol 1 14.34 790.75 255.09
## - Schooling 1 27.63 804.04 257.31
## - percentageexpenditure 1 62.07 838.48 262.89
## - Incomecompositionofresources 1 79.66 856.06 265.65
## - BMI 1 80.72 857.13 265.81
## - HIV.AIDS 1 100.51 876.92 268.85
## - Diphtheria 1 131.22 907.63 273.43
## - AdultMortality 1 539.81 1316.22 322.86
##
## Step: AIC=252.95
## Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## BMI + Diphtheria + HIV.AIDS + Incomecompositionofresources +
## Schooling
##
## Df Sum of Sq RSS AIC
## <none> 778.12 252.95
## - Alcohol 1 15.92 794.03 253.64
## - Schooling 1 28.43 806.54 255.72
## - percentageexpenditure 1 64.13 842.25 261.48
## - Incomecompositionofresources 1 78.48 856.59 263.73
## - BMI 1 94.35 872.46 266.17
## - HIV.AIDS 1 100.53 878.65 267.11
## - Diphtheria 1 141.09 919.21 273.11
## - AdultMortality 1 539.44 1317.55 320.99
# coefficients and fit statistics of the model kept by backward selection
summary(backwardStepModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## BMI + Diphtheria + HIV.AIDS + Incomecompositionofresources +
## Schooling, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9872 -1.3479 0.2182 1.6534 5.6030
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 59.360 1.659 35.785
## AdultMortality -19.710 2.126 -9.272
## Alcohol -1.975 1.240 -1.593
## percentageexpenditure 5.873 1.837 3.197
## BMI 4.491 1.158 3.877
## Diphtheria 7.502 1.582 4.742
## HIV.AIDS -8.574 2.142 -4.003
## Incomecompositionofresources 8.055 2.278 3.536
## Schooling 5.715 2.685 2.128
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality 0.000000000000000729 ***
## Alcohol 0.113782
## percentageexpenditure 0.001763 **
## BMI 0.000170 ***
## Diphtheria 0.000005715102005088 ***
## HIV.AIDS 0.000107 ***
## Incomecompositionofresources 0.000571 ***
## Schooling 0.035285 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.505 on 124 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9199
## F-statistic: 190.6 on 8 and 124 DF, p-value: < 0.00000000000000022
# In-sample fitted values of the backward-selected model.
bsmPreds <- predict(object = backwardStepModel, newdata = led_normal, type = "response")
# Forward stepwise selection: start from the intercept-only model and add
# terms one at a time, bounded above by the terms of the full model.
cat("\nForward Step Model:\n")
##
## Forward Step Model:
forwardStepModel <- step(
  object = nullModel,
  scope = list(
    lower = formula(nullModel),
    upper = formula(fullModel)
  ),
  direction = "forward"
)
## Start: AIC=581.1
## Lifeexpectancy ~ 1
##
## Df Sum of Sq RSS AIC
## + AdultMortality 1 7927.3 2420.0 389.85
## + Incomecompositionofresources 1 6782.1 3565.1 441.38
## + Schooling 1 6065.5 4281.7 465.74
## + BMI 1 5485.7 4861.6 482.64
## + GDP 1 3671.8 6675.5 524.81
## + HIV.AIDS 1 3649.5 6697.8 525.25
## + Polio 1 3209.1 7138.2 533.72
## + percentageexpenditure 1 3081.9 7265.4 536.07
## + Diphtheria 1 3048.4 7298.8 536.68
## + thinness1.19years 1 2701.7 7645.6 542.85
## + thinness5.9years 1 2684.7 7662.5 543.15
## + Alcohol 1 2427.5 7919.8 547.54
## + Status 1 2420.5 7926.7 547.66
## + HepatitisB 1 825.4 9521.9 572.04
## + Totalexpenditure 1 733.9 9613.3 573.31
## + under.fivedeaths 1 370.3 9976.9 578.25
## + infantdeaths 1 281.7 10065.5 579.43
## <none> 10347.3 581.10
## + Measles 1 71.1 10276.2 582.18
## + Population 1 6.6 10340.6 583.01
##
## Step: AIC=389.85
## Lifeexpectancy ~ AdultMortality
##
## Df Sum of Sq RSS AIC
## + Schooling 1 1168.91 1251.1 304.11
## + Incomecompositionofresources 1 1164.50 1255.5 304.57
## + BMI 1 786.25 1633.7 339.60
## + Alcohol 1 670.94 1749.0 348.67
## + Diphtheria 1 665.62 1754.4 349.07
## + Polio 1 584.33 1835.7 355.10
## + GDP 1 574.40 1845.6 355.82
## + percentageexpenditure 1 476.33 1943.7 362.70
## + thinness1.19years 1 444.92 1975.1 364.83
## + thinness5.9years 1 402.83 2017.2 367.64
## + Status 1 282.47 2137.5 375.35
## + HepatitisB 1 252.29 2167.7 377.21
## + Totalexpenditure 1 202.22 2217.8 380.25
## + under.fivedeaths 1 145.73 2274.2 383.59
## + infantdeaths 1 137.17 2282.8 384.09
## + Measles 1 63.73 2356.2 388.31
## + Population 1 37.81 2382.2 389.76
## <none> 2420.0 389.85
## + HIV.AIDS 1 0.29 2419.7 391.84
##
## Step: AIC=304.11
## Lifeexpectancy ~ AdultMortality + Schooling
##
## Df Sum of Sq RSS AIC
## + Diphtheria 1 131.361 1119.7 291.35
## + Incomecompositionofresources 1 118.686 1132.4 292.85
## + BMI 1 96.229 1154.8 295.46
## + Polio 1 81.521 1169.5 297.14
## + HIV.AIDS 1 79.510 1171.6 297.37
## + HepatitisB 1 55.898 1195.2 300.03
## + percentageexpenditure 1 51.500 1199.6 300.52
## + thinness1.19years 1 49.648 1201.4 300.72
## + thinness5.9years 1 49.032 1202.0 300.79
## + GDP 1 45.180 1205.9 301.21
## + under.fivedeaths 1 28.061 1223.0 303.09
## + infantdeaths 1 24.523 1226.5 303.47
## <none> 1251.1 304.11
## + Totalexpenditure 1 13.368 1237.7 304.68
## + Alcohol 1 12.412 1238.7 304.78
## + Measles 1 10.155 1240.9 305.02
## + Population 1 9.472 1241.6 305.10
## + Status 1 3.561 1247.5 305.73
##
## Step: AIC=291.35
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria
##
## Df Sum of Sq RSS AIC
## + BMI 1 108.262 1011.4 279.83
## + Incomecompositionofresources 1 105.789 1013.9 280.15
## + HIV.AIDS 1 78.196 1041.5 283.72
## + percentageexpenditure 1 55.466 1064.2 286.60
## + GDP 1 48.028 1071.7 287.52
## + thinness1.19years 1 43.586 1076.1 288.07
## + thinness5.9years 1 43.403 1076.3 288.10
## <none> 1119.7 291.35
## + under.fivedeaths 1 13.800 1105.9 291.70
## + infantdeaths 1 12.707 1107.0 291.83
## + Measles 1 7.654 1112.0 292.44
## + Alcohol 1 6.368 1113.3 292.59
## + Polio 1 5.641 1114.1 292.68
## + Population 1 4.906 1114.8 292.77
## + Totalexpenditure 1 4.868 1114.8 292.77
## + HepatitisB 1 3.918 1115.8 292.89
## + Status 1 2.780 1116.9 293.02
##
## Step: AIC=279.83
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI
##
## Df Sum of Sq RSS AIC
## + Incomecompositionofresources 1 82.017 929.43 270.58
## + HIV.AIDS 1 74.496 936.95 271.65
## + percentageexpenditure 1 63.041 948.40 273.27
## + GDP 1 59.058 952.39 273.83
## <none> 1011.44 279.83
## + HepatitisB 1 11.440 1000.00 280.32
## + Polio 1 9.580 1001.86 280.56
## + Status 1 3.649 1007.80 281.35
## + Alcohol 1 2.331 1009.11 281.52
## + under.fivedeaths 1 1.310 1010.13 281.66
## + infantdeaths 1 0.946 1010.50 281.70
## + Totalexpenditure 1 0.342 1011.10 281.78
## + thinness1.19years 1 0.081 1011.36 281.82
## + Population 1 0.052 1011.39 281.82
## + Measles 1 0.029 1011.42 281.82
## + thinness5.9years 1 0.013 1011.43 281.83
##
## Step: AIC=270.58
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources
##
## Df Sum of Sq RSS AIC
## + HIV.AIDS 1 85.037 844.39 259.82
## + percentageexpenditure 1 40.277 889.15 266.69
## + GDP 1 34.222 895.20 267.59
## <none> 929.43 270.58
## + Polio 1 7.570 921.86 271.49
## + under.fivedeaths 1 3.908 925.52 272.02
## + HepatitisB 1 3.804 925.62 272.04
## + infantdeaths 1 3.284 926.14 272.11
## + Alcohol 1 1.075 928.35 272.43
## + Status 1 0.775 928.65 272.47
## + Population 1 0.732 928.69 272.48
## + Measles 1 0.406 929.02 272.52
## + thinness5.9years 1 0.107 929.32 272.57
## + thinness1.19years 1 0.096 929.33 272.57
## + Totalexpenditure 1 0.004 929.42 272.58
##
## Step: AIC=259.82
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources + HIV.AIDS
##
## Df Sum of Sq RSS AIC
## + percentageexpenditure 1 50.356 794.03 253.64
## + GDP 1 42.463 801.93 254.96
## <none> 844.39 259.82
## + HepatitisB 1 8.981 835.41 260.40
## + Polio 1 6.337 838.05 260.82
## + under.fivedeaths 1 4.362 840.03 261.13
## + infantdeaths 1 3.470 840.92 261.27
## + Totalexpenditure 1 2.771 841.62 261.38
## + Alcohol 1 2.143 842.25 261.48
## + Status 1 1.417 842.97 261.60
## + Population 1 0.837 843.55 261.69
## + Measles 1 0.172 844.22 261.79
## + thinness1.19years 1 0.152 844.24 261.80
## + thinness5.9years 1 0.074 844.32 261.81
##
## Step: AIC=253.64
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources + HIV.AIDS + percentageexpenditure
##
## Df Sum of Sq RSS AIC
## + Alcohol 1 15.9173 778.12 252.95
## <none> 794.03 253.64
## + Polio 1 4.8080 789.23 254.83
## + Status 1 4.3295 789.70 254.91
## + under.fivedeaths 1 3.2811 790.75 255.09
## + HepatitisB 1 2.5290 791.50 255.22
## + infantdeaths 1 2.3067 791.73 255.25
## + thinness5.9years 1 1.9149 792.12 255.32
## + thinness1.19years 1 1.8228 792.21 255.34
## + Population 1 0.3033 793.73 255.59
## + GDP 1 0.1021 793.93 255.62
## + Measles 1 0.0553 793.98 255.63
## + Totalexpenditure 1 0.0049 794.03 255.64
##
## Step: AIC=252.95
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources + HIV.AIDS + percentageexpenditure +
## Alcohol
##
## Df Sum of Sq RSS AIC
## <none> 778.12 252.95
## + Polio 1 4.2414 773.87 254.22
## + HepatitisB 1 1.7842 776.33 254.64
## + under.fivedeaths 1 1.7077 776.41 254.66
## + infantdeaths 1 1.1712 776.95 254.75
## + thinness5.9years 1 0.7978 777.32 254.81
## + thinness1.19years 1 0.6647 777.45 254.83
## + Measles 1 0.4320 777.68 254.87
## + Status 1 0.4009 777.72 254.88
## + GDP 1 0.2950 777.82 254.90
## + Population 1 0.1359 777.98 254.93
## + Totalexpenditure 1 0.0688 778.05 254.94
# coefficients and fit statistics of the model kept by forward selection
summary(forwardStepModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria +
## BMI + Incomecompositionofresources + HIV.AIDS + percentageexpenditure +
## Alcohol, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9872 -1.3479 0.2182 1.6534 5.6030
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 59.360 1.659 35.785
## AdultMortality -19.710 2.126 -9.272
## Schooling 5.715 2.685 2.128
## Diphtheria 7.502 1.582 4.742
## BMI 4.491 1.158 3.877
## Incomecompositionofresources 8.055 2.278 3.536
## HIV.AIDS -8.574 2.142 -4.003
## percentageexpenditure 5.873 1.837 3.197
## Alcohol -1.975 1.240 -1.593
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality 0.000000000000000729 ***
## Schooling 0.035285 *
## Diphtheria 0.000005715102005088 ***
## BMI 0.000170 ***
## Incomecompositionofresources 0.000571 ***
## HIV.AIDS 0.000107 ***
## percentageexpenditure 0.001763 **
## Alcohol 0.113782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.505 on 124 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9199
## F-statistic: 190.6 on 8 and 124 DF, p-value: < 0.00000000000000022
# In-sample fitted values of the forward-selected model.
fsmPreds <- predict(object = forwardStepModel, newdata = led_normal, type = "response")
# Mixed (both directions) stepwise selection: start from the intercept-only
# model; at each step a term may be added or dropped, bounded above by the
# terms of the full model.
cat("\nMixed Step Model:\n")
##
## Mixed Step Model:
bothStepModel <- step(
  object = nullModel,
  scope = list(
    lower = formula(nullModel),
    upper = formula(fullModel)
  ),
  direction = "both"
)
## Start: AIC=581.1
## Lifeexpectancy ~ 1
##
## Df Sum of Sq RSS AIC
## + AdultMortality 1 7927.3 2420.0 389.85
## + Incomecompositionofresources 1 6782.1 3565.1 441.38
## + Schooling 1 6065.5 4281.7 465.74
## + BMI 1 5485.7 4861.6 482.64
## + GDP 1 3671.8 6675.5 524.81
## + HIV.AIDS 1 3649.5 6697.8 525.25
## + Polio 1 3209.1 7138.2 533.72
## + percentageexpenditure 1 3081.9 7265.4 536.07
## + Diphtheria 1 3048.4 7298.8 536.68
## + thinness1.19years 1 2701.7 7645.6 542.85
## + thinness5.9years 1 2684.7 7662.5 543.15
## + Alcohol 1 2427.5 7919.8 547.54
## + Status 1 2420.5 7926.7 547.66
## + HepatitisB 1 825.4 9521.9 572.04
## + Totalexpenditure 1 733.9 9613.3 573.31
## + under.fivedeaths 1 370.3 9976.9 578.25
## + infantdeaths 1 281.7 10065.5 579.43
## <none> 10347.3 581.10
## + Measles 1 71.1 10276.2 582.18
## + Population 1 6.6 10340.6 583.01
##
## Step: AIC=389.85
## Lifeexpectancy ~ AdultMortality
##
## Df Sum of Sq RSS AIC
## + Schooling 1 1168.9 1251.1 304.11
## + Incomecompositionofresources 1 1164.5 1255.5 304.57
## + BMI 1 786.2 1633.7 339.60
## + Alcohol 1 670.9 1749.0 348.67
## + Diphtheria 1 665.6 1754.4 349.07
## + Polio 1 584.3 1835.6 355.10
## + GDP 1 574.4 1845.6 355.82
## + percentageexpenditure 1 476.3 1943.7 362.70
## + thinness1.19years 1 444.9 1975.1 364.83
## + thinness5.9years 1 402.8 2017.2 367.64
## + Status 1 282.5 2137.5 375.35
## + HepatitisB 1 252.3 2167.7 377.21
## + Totalexpenditure 1 202.2 2217.8 380.25
## + under.fivedeaths 1 145.7 2274.3 383.59
## + infantdeaths 1 137.2 2282.8 384.09
## + Measles 1 63.7 2356.3 388.31
## + Population 1 37.8 2382.2 389.76
## <none> 2420.0 389.85
## + HIV.AIDS 1 0.3 2419.7 391.84
## - AdultMortality 1 7927.3 10347.3 581.10
##
## Step: AIC=304.11
## Lifeexpectancy ~ AdultMortality + Schooling
##
## Df Sum of Sq RSS AIC
## + Diphtheria 1 131.36 1119.7 291.35
## + Incomecompositionofresources 1 118.69 1132.4 292.85
## + BMI 1 96.23 1154.8 295.46
## + Polio 1 81.52 1169.5 297.14
## + HIV.AIDS 1 79.51 1171.6 297.37
## + HepatitisB 1 55.90 1195.2 300.03
## + percentageexpenditure 1 51.50 1199.6 300.52
## + thinness1.19years 1 49.65 1201.4 300.72
## + thinness5.9years 1 49.03 1202.0 300.79
## + GDP 1 45.18 1205.9 301.21
## + under.fivedeaths 1 28.06 1223.0 303.09
## + infantdeaths 1 24.52 1226.5 303.47
## <none> 1251.1 304.11
## + Totalexpenditure 1 13.37 1237.7 304.68
## + Alcohol 1 12.41 1238.7 304.78
## + Measles 1 10.15 1240.9 305.02
## + Population 1 9.47 1241.6 305.10
## + Status 1 3.56 1247.5 305.73
## - Schooling 1 1168.91 2420.0 389.85
## - AdultMortality 1 3030.65 4281.7 465.74
##
## Step: AIC=291.35
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria
##
## Df Sum of Sq RSS AIC
## + BMI 1 108.26 1011.4 279.83
## + Incomecompositionofresources 1 105.79 1013.9 280.15
## + HIV.AIDS 1 78.20 1041.5 283.72
## + percentageexpenditure 1 55.47 1064.2 286.60
## + GDP 1 48.03 1071.7 287.52
## + thinness1.19years 1 43.59 1076.1 288.07
## + thinness5.9years 1 43.40 1076.3 288.09
## <none> 1119.7 291.35
## + under.fivedeaths 1 13.80 1105.9 291.70
## + infantdeaths 1 12.71 1107.0 291.83
## + Measles 1 7.65 1112.1 292.44
## + Alcohol 1 6.37 1113.3 292.59
## + Polio 1 5.64 1114.1 292.68
## + Population 1 4.91 1114.8 292.77
## + Totalexpenditure 1 4.87 1114.8 292.77
## + HepatitisB 1 3.92 1115.8 292.89
## + Status 1 2.78 1116.9 293.02
## - Diphtheria 1 131.36 1251.1 304.11
## - Schooling 1 634.65 1754.4 349.07
## - AdultMortality 1 2940.52 4060.2 460.68
##
## Step: AIC=279.83
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI
##
## Df Sum of Sq RSS AIC
## + Incomecompositionofresources 1 82.02 929.4 270.58
## + HIV.AIDS 1 74.50 936.9 271.65
## + percentageexpenditure 1 63.04 948.4 273.27
## + GDP 1 59.06 952.4 273.83
## <none> 1011.4 279.83
## + HepatitisB 1 11.44 1000.0 280.32
## + Polio 1 9.58 1001.9 280.56
## + Status 1 3.65 1007.8 281.35
## + Alcohol 1 2.33 1009.1 281.52
## + under.fivedeaths 1 1.31 1010.1 281.66
## + infantdeaths 1 0.95 1010.5 281.70
## + Totalexpenditure 1 0.34 1011.1 281.78
## + thinness1.19years 1 0.08 1011.4 281.82
## + Population 1 0.05 1011.4 281.82
## + Measles 1 0.03 1011.4 281.82
## + thinness5.9years 1 0.01 1011.4 281.83
## - BMI 1 108.26 1119.7 291.35
## - Diphtheria 1 143.39 1154.8 295.46
## - Schooling 1 228.66 1240.1 304.94
## - AdultMortality 1 2350.31 3361.8 437.57
##
## Step: AIC=270.58
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources
##
## Df Sum of Sq RSS AIC
## + HIV.AIDS 1 85.04 844.39 259.82
## + percentageexpenditure 1 40.28 889.15 266.69
## + GDP 1 34.22 895.20 267.59
## <none> 929.43 270.58
## + Polio 1 7.57 921.86 271.49
## - Schooling 1 22.54 951.97 271.77
## + under.fivedeaths 1 3.91 925.52 272.02
## + HepatitisB 1 3.80 925.62 272.04
## + infantdeaths 1 3.28 926.14 272.11
## + Alcohol 1 1.08 928.35 272.43
## + Status 1 0.78 928.65 272.47
## + Population 1 0.73 928.69 272.48
## + Measles 1 0.41 929.02 272.52
## + thinness5.9years 1 0.11 929.32 272.57
## + thinness1.19years 1 0.10 929.33 272.57
## + Totalexpenditure 1 0.00 929.42 272.58
## - Incomecompositionofresources 1 82.02 1011.44 279.83
## - BMI 1 84.49 1013.92 280.15
## - Diphtheria 1 129.92 1059.35 285.98
## - AdultMortality 1 1894.51 2823.94 416.39
##
## Step: AIC=259.82
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources + HIV.AIDS
##
## Df Sum of Sq RSS AIC
## + percentageexpenditure 1 50.36 794.03 253.64
## + GDP 1 42.46 801.93 254.96
## <none> 844.39 259.82
## + HepatitisB 1 8.98 835.41 260.40
## + Polio 1 6.34 838.05 260.82
## + under.fivedeaths 1 4.36 840.03 261.13
## + infantdeaths 1 3.47 840.92 261.27
## + Totalexpenditure 1 2.77 841.62 261.38
## + Alcohol 1 2.14 842.25 261.48
## + Status 1 1.42 842.97 261.60
## + Population 1 0.84 843.55 261.69
## + Measles 1 0.17 844.22 261.79
## + thinness1.19years 1 0.15 844.24 261.80
## + thinness5.9years 1 0.07 844.32 261.81
## - Schooling 1 33.36 877.75 262.97
## - BMI 1 79.69 924.08 269.81
## - HIV.AIDS 1 85.04 929.43 270.58
## - Incomecompositionofresources 1 92.56 936.95 271.65
## - Diphtheria 1 127.52 971.91 276.53
## - AdultMortality 1 700.50 1544.89 338.16
##
## Step: AIC=253.64
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources + HIV.AIDS + percentageexpenditure
##
## Df Sum of Sq RSS AIC
## + Alcohol 1 15.92 778.12 252.95
## <none> 794.03 253.64
## + Polio 1 4.81 789.23 254.83
## + Status 1 4.33 789.70 254.91
## + under.fivedeaths 1 3.28 790.75 255.09
## - Schooling 1 20.96 815.00 255.11
## + HepatitisB 1 2.53 791.50 255.22
## + infantdeaths 1 2.31 791.73 255.25
## + thinness5.9years 1 1.91 792.12 255.32
## + thinness1.19years 1 1.82 792.21 255.34
## + Population 1 0.30 793.73 255.59
## + GDP 1 0.10 793.93 255.62
## + Measles 1 0.06 793.98 255.63
## + Totalexpenditure 1 0.00 794.03 255.64
## - percentageexpenditure 1 50.36 844.39 259.82
## - Incomecompositionofresources 1 66.44 860.47 262.33
## - BMI 1 88.25 882.28 265.66
## - HIV.AIDS 1 95.12 889.15 266.69
## - Diphtheria 1 133.37 927.41 272.29
## - AdultMortality 1 653.43 1447.46 331.50
##
## Step: AIC=252.95
## Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria + BMI +
## Incomecompositionofresources + HIV.AIDS + percentageexpenditure +
## Alcohol
##
## Df Sum of Sq RSS AIC
## <none> 778.12 252.95
## - Alcohol 1 15.92 794.03 253.64
## + Polio 1 4.24 773.87 254.22
## + HepatitisB 1 1.78 776.33 254.64
## + under.fivedeaths 1 1.71 776.41 254.66
## + infantdeaths 1 1.17 776.95 254.75
## + thinness5.9years 1 0.80 777.32 254.81
## + thinness1.19years 1 0.66 777.45 254.83
## + Measles 1 0.43 777.68 254.87
## + Status 1 0.40 777.72 254.88
## + GDP 1 0.29 777.82 254.90
## + Population 1 0.14 777.98 254.92
## + Totalexpenditure 1 0.07 778.05 254.94
## - Schooling 1 28.43 806.54 255.72
## - percentageexpenditure 1 64.13 842.25 261.48
## - Incomecompositionofresources 1 78.48 856.59 263.73
## - BMI 1 94.35 872.46 266.17
## - HIV.AIDS 1 100.53 878.65 267.11
## - Diphtheria 1 141.09 919.21 273.11
## - AdultMortality 1 539.44 1317.55 320.99
# coefficients and fit statistics of the model kept by mixed selection
summary(bothStepModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria +
## BMI + Incomecompositionofresources + HIV.AIDS + percentageexpenditure +
## Alcohol, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9872 -1.3479 0.2182 1.6534 5.6030
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 59.360 1.659 35.785
## AdultMortality -19.710 2.126 -9.272
## Schooling 5.715 2.685 2.128
## Diphtheria 7.502 1.582 4.742
## BMI 4.491 1.158 3.877
## Incomecompositionofresources 8.055 2.278 3.536
## HIV.AIDS -8.574 2.142 -4.003
## percentageexpenditure 5.873 1.837 3.197
## Alcohol -1.975 1.240 -1.593
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality 0.000000000000000729 ***
## Schooling 0.035285 *
## Diphtheria 0.000005715102005088 ***
## BMI 0.000170 ***
## Incomecompositionofresources 0.000571 ***
## HIV.AIDS 0.000107 ***
## percentageexpenditure 0.001763 **
## Alcohol 0.113782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.505 on 124 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9199
## F-statistic: 190.6 on 8 and 124 DF, p-value: < 0.00000000000000022
# In-sample fitted values of the mixed-selection model.
msmPreds <- predict(object = bothStepModel, newdata = led_normal, type = "response")
# to reduce the model according to the Variance Inflation Factors, starting with the full model
# fitModel begins as a copy of fullModel; the VIF loop further down overwrites it with successively smaller refits
fitModel <- fullModel
# baseline summary of the starting model, before any predictor has been removed
summary(fitModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ . - Country, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9653 -1.4823 0.1677 1.5897 5.8006
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.7196 2.3315 25.185
## StatusDeveloping 0.2523 0.9880 0.255
## AdultMortality -19.5355 2.2958 -8.509
## infantdeaths 14.9031 45.9627 0.324
## Alcohol -1.2084 1.4623 -0.826
## percentageexpenditure 4.5966 4.2042 1.093
## HepatitisB -0.6402 2.3585 -0.271
## Measles 3.1787 3.7936 0.838
## BMI 5.3145 1.6956 3.134
## under.fivedeaths -22.7974 39.8954 -0.571
## Polio -2.8431 2.9474 -0.965
## Totalexpenditure 0.1715 1.6193 0.106
## Diphtheria 10.5015 3.9026 2.691
## HIV.AIDS -8.8120 2.2692 -3.883
## GDP 1.6674 4.7548 0.351
## Population 3.8322 7.9135 0.484
## thinness1.19years 1.5632 6.8102 0.230
## thinness5.9years 0.5989 7.0202 0.085
## Incomecompositionofresources 7.7276 2.4092 3.208
## Schooling 5.1190 2.8931 1.769
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.798925
## AdultMortality 0.0000000000000837 ***
## infantdeaths 0.746353
## Alcohol 0.410339
## percentageexpenditure 0.276574
## HepatitisB 0.786546
## Measles 0.403850
## BMI 0.002195 **
## under.fivedeaths 0.568844
## Polio 0.336799
## Totalexpenditure 0.915851
## Diphtheria 0.008207 **
## HIV.AIDS 0.000174 ***
## GDP 0.726483
## Population 0.629136
## thinness1.19years 0.818867
## thinness5.9years 0.932161
## Incomecompositionofresources 0.001742 **
## Schooling 0.079523 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.581 on 113 degrees of freedom
## Multiple R-squared: 0.9273, Adjusted R-squared: 0.915
## F-statistic: 75.83 on 19 and 113 DF, p-value: < 0.00000000000000022
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# check VIF for predictors
# vif() is presumably car::vif (car is attached just above); the pruning loop that follows uses 5 as its cutoff
vif(fitModel)
## Status AdultMortality
## 2.386941 3.825999
## infantdeaths Alcohol
## 412.807759 3.275004
## percentageexpenditure HepatitisB
## 8.617462 3.663407
## Measles BMI
## 3.412736 4.753556
## under.fivedeaths Polio
## 323.927717 7.544519
## Totalexpenditure Diphtheria
## 1.406280 8.383679
## HIV.AIDS GDP
## 2.169314 11.502237
## Population thinness1.19years
## 10.043489 23.913603
## thinness5.9years Incomecompositionofresources
## 24.351331 5.707713
## Schooling
## 5.994323
# Prune multicollinear predictors: while the largest VIF is above 5, drop that
# predictor from fitModel, refit, and recompute the VIFs of the smaller model.
# sort() is ascending, so the last element of temp is always the worst offender.
temp <- sort(vif(fitModel))
# reduce models until no included predictor has a VIF above 5
while (temp[length(temp)] > 5) {
  worstVar <- names(temp)[length(temp)] # name of the predictor with the highest VIF
  cat("\nVariable with highest VIF - ", worstVar)
  # update formula ". ~ . - <worstVar>": same response, same predictors minus worstVar
  frm <- as.formula(paste(".~.-", worstVar))
  cat("\nRemoving variable - ", worstVar)
  fitModel <- update(fitModel, frm) # refit without the dropped predictor
  cat("\n")
  print(summary(fitModel)) # show the refitted model
  # vif() needs at least two model terms; if pruning has left a single
  # predictor, stop here with a clear warning instead of failing inside vif().
  # temp then still holds the VIFs of the previous (two-predictor) model.
  if (length(attr(terms(fitModel), "term.labels")) < 2) {
    warning("Stopped VIF pruning: fewer than 2 predictors remain in fitModel",
            call. = FALSE)
    break
  }
  temp <- sort(vif(fitModel)) # recheck the VIFs for the new model
}
##
## Variable with highest VIF - infantdeaths
## Removing variable - infantdeaths
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + under.fivedeaths +
## Polio + Totalexpenditure + Diphtheria + HIV.AIDS + GDP +
## Population + thinness1.19years + thinness5.9years + Incomecompositionofresources +
## Schooling, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9169 -1.4633 0.1477 1.5710 5.8426
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.6384 2.3089 25.397
## StatusDeveloping 0.2352 0.9827 0.239
## AdultMortality -19.6373 2.2653 -8.669
## Alcohol -1.2909 1.4343 -0.900
## percentageexpenditure 4.5809 4.1874 1.094
## HepatitisB -0.6512 2.3490 -0.277
## Measles 3.8634 3.1392 1.231
## BMI 5.3938 1.6713 3.227
## under.fivedeaths -10.0455 6.6785 -1.504
## Polio -2.8522 2.9356 -0.972
## Totalexpenditure 0.1496 1.6116 0.093
## Diphtheria 10.6380 3.8645 2.753
## HIV.AIDS -8.7554 2.2536 -3.885
## GDP 1.6801 4.7359 0.355
## Population 5.3995 6.2410 0.865
## thinness1.19years 1.6413 6.7791 0.242
## thinness5.9years 0.6964 6.9862 0.100
## Incomecompositionofresources 7.7883 2.3925 3.255
## Schooling 4.9992 2.8581 1.749
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.811299
## AdultMortality 0.0000000000000342 ***
## Alcohol 0.369989
## percentageexpenditure 0.276266
## HepatitisB 0.782095
## Measles 0.220958
## BMI 0.001632 **
## under.fivedeaths 0.135305
## Polio 0.333321
## Totalexpenditure 0.926178
## Diphtheria 0.006878 **
## HIV.AIDS 0.000172 ***
## GDP 0.723427
## Population 0.388769
## thinness1.19years 0.809125
## thinness5.9years 0.920774
## Incomecompositionofresources 0.001492 **
## Schooling 0.082958 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.571 on 114 degrees of freedom
## Multiple R-squared: 0.9272, Adjusted R-squared: 0.9157
## F-statistic: 80.67 on 18 and 114 DF, p-value: < 0.00000000000000022
##
##
## Variable with highest VIF - thinness5.9years
## Removing variable - thinness5.9years
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + under.fivedeaths +
## Polio + Totalexpenditure + Diphtheria + HIV.AIDS + GDP +
## Population + thinness1.19years + Incomecompositionofresources +
## Schooling, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9198 -1.4696 0.1556 1.6270 5.8400
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.6487 2.2967 25.536
## StatusDeveloping 0.2355 0.9784 0.241
## AdultMortality -19.6186 2.2478 -8.728
## Alcohol -1.2866 1.4274 -0.901
## percentageexpenditure 4.5927 4.1676 1.102
## HepatitisB -0.6686 2.3325 -0.287
## Measles 3.8669 3.1254 1.237
## BMI 5.3620 1.6336 3.282
## under.fivedeaths -10.1163 6.6120 -1.530
## Polio -2.8401 2.9205 -0.972
## Totalexpenditure 0.1391 1.6012 0.087
## Diphtheria 10.6372 3.8478 2.764
## HIV.AIDS -8.7629 2.2426 -3.907
## GDP 1.6449 4.7024 0.350
## Population 5.5068 6.1209 0.900
## thinness1.19years 2.2669 2.5537 0.888
## Incomecompositionofresources 7.7970 2.3806 3.275
## Schooling 5.0254 2.8337 1.773
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.810242
## AdultMortality 0.0000000000000236 ***
## Alcohol 0.369291
## percentageexpenditure 0.272764
## HepatitisB 0.774903
## Measles 0.218513
## BMI 0.001364 **
## under.fivedeaths 0.128764
## Polio 0.332851
## Totalexpenditure 0.930901
## Diphtheria 0.006643 **
## HIV.AIDS 0.000158 ***
## GDP 0.727124
## Population 0.370169
## thinness1.19years 0.376574
## Incomecompositionofresources 0.001395 **
## Schooling 0.078806 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.559 on 115 degrees of freedom
## Multiple R-squared: 0.9272, Adjusted R-squared: 0.9164
## F-statistic: 86.15 on 17 and 115 DF, p-value: < 0.00000000000000022
##
##
## Variable with highest VIF - GDP
## Removing variable - GDP
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + under.fivedeaths +
## Polio + Totalexpenditure + Diphtheria + HIV.AIDS + Population +
## thinness1.19years + Incomecompositionofresources + Schooling,
## data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9162 -1.4487 0.1641 1.6877 5.8270
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.7432 2.2721 25.855
## StatusDeveloping 0.1318 0.9289 0.142
## AdultMortality -19.5958 2.2384 -8.755
## Alcohol -1.3154 1.4196 -0.927
## percentageexpenditure 5.8493 2.1051 2.779
## HepatitisB -0.9126 2.2172 -0.412
## Measles 3.7949 3.1068 1.221
## BMI 5.3179 1.6226 3.277
## under.fivedeaths -9.9412 6.5680 -1.514
## Polio -2.7461 2.8971 -0.948
## Totalexpenditure 0.1022 1.5916 0.064
## Diphtheria 10.7730 3.8137 2.825
## HIV.AIDS -8.7810 2.2335 -3.931
## Population 5.3156 6.0733 0.875
## thinness1.19years 2.1996 2.5368 0.867
## Incomecompositionofresources 7.8650 2.3636 3.328
## Schooling 5.1762 2.7901 1.855
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.887417
## AdultMortality 0.0000000000000193 ***
## Alcohol 0.356068
## percentageexpenditure 0.006370 **
## HepatitisB 0.681395
## Measles 0.224383
## BMI 0.001383 **
## under.fivedeaths 0.132853
## Polio 0.345162
## Totalexpenditure 0.948890
## Diphtheria 0.005571 **
## HIV.AIDS 0.000144 ***
## Population 0.383253
## thinness1.19years 0.387702
## Incomecompositionofresources 0.001174 **
## Schooling 0.066107 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.55 on 116 degrees of freedom
## Multiple R-squared: 0.9271, Adjusted R-squared: 0.9171
## F-statistic: 92.23 on 16 and 116 DF, p-value: < 0.00000000000000022
##
##
## Variable with highest VIF - under.fivedeaths
## Removing variable - under.fivedeaths
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + Polio +
## Totalexpenditure + Diphtheria + HIV.AIDS + Population + thinness1.19years +
## Incomecompositionofresources + Schooling, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9495 -1.4275 0.2545 1.7212 5.9808
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.3269 2.2678 25.720
## StatusDeveloping 0.1669 0.9338 0.179
## AdultMortality -19.7138 2.2493 -8.764
## Alcohol -1.7060 1.4037 -1.215
## percentageexpenditure 5.8197 2.1166 2.750
## HepatitisB -0.9340 2.2294 -0.419
## Measles 1.0247 2.5243 0.406
## BMI 5.2093 1.6299 3.196
## Polio -2.1881 2.8893 -0.757
## Totalexpenditure 0.1216 1.6003 0.076
## Diphtheria 10.7541 3.8347 2.804
## HIV.AIDS -8.6832 2.2449 -3.868
## Population -2.2387 3.4795 -0.643
## thinness1.19years 1.5462 2.5136 0.615
## Incomecompositionofresources 7.9121 2.3764 3.329
## Schooling 5.7448 2.7799 2.067
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.858442
## AdultMortality 0.0000000000000173 ***
## Alcohol 0.226672
## percentageexpenditure 0.006916 **
## HepatitisB 0.676021
## Measles 0.685543
## BMI 0.001792 **
## Polio 0.450386
## Totalexpenditure 0.939545
## Diphtheria 0.005903 **
## HIV.AIDS 0.000181 ***
## Population 0.521225
## thinness1.19years 0.539652
## Incomecompositionofresources 0.001164 **
## Schooling 0.040984 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.564 on 117 degrees of freedom
## Multiple R-squared: 0.9257, Adjusted R-squared: 0.9162
## F-statistic: 97.15 on 15 and 117 DF, p-value: < 0.00000000000000022
##
##
## Variable with highest VIF - Diphtheria
## Removing variable - Diphtheria
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + Polio +
## Totalexpenditure + HIV.AIDS + Population + thinness1.19years +
## Incomecompositionofresources + Schooling, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9640 -1.5306 0.2494 1.6470 5.5327
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 60.1698 2.2327 26.949
## StatusDeveloping 0.3712 0.9576 0.388
## AdultMortality -19.7788 2.3137 -8.549
## Alcohol -1.7587 1.4438 -1.218
## percentageexpenditure 6.4106 2.1665 2.959
## HepatitisB 1.5927 2.0977 0.759
## Measles 1.0993 2.5965 0.423
## BMI 4.4170 1.6513 2.675
## Polio 3.6812 2.0492 1.796
## Totalexpenditure 0.5914 1.6372 0.361
## HIV.AIDS -8.6811 2.3092 -3.759
## Population -1.2079 3.5593 -0.339
## thinness1.19years 0.7744 2.5701 0.301
## Incomecompositionofresources 8.5716 2.4326 3.524
## Schooling 6.1657 2.8555 2.159
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.698960
## AdultMortality 0.0000000000000522 ***
## Alcohol 0.225621
## percentageexpenditure 0.003731 **
## HepatitisB 0.449226
## Measles 0.672798
## BMI 0.008537 **
## Polio 0.074983 .
## Totalexpenditure 0.718561
## HIV.AIDS 0.000267 ***
## Population 0.734925
## thinness1.19years 0.763695
## Incomecompositionofresources 0.000606 ***
## Schooling 0.032852 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.637 on 118 degrees of freedom
## Multiple R-squared: 0.9207, Adjusted R-squared: 0.9113
## F-statistic: 97.84 on 14 and 118 DF, p-value: < 0.00000000000000022
##
##
## Variable with highest VIF - Schooling
## Removing variable - Schooling
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + Polio +
## Totalexpenditure + HIV.AIDS + Population + thinness1.19years +
## Incomecompositionofresources, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9981 -1.5542 0.2724 1.9165 6.8223
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 60.2167 2.2667 26.566
## StatusDeveloping 0.2806 0.9713 0.289
## AdultMortality -20.0204 2.3463 -8.533
## Alcohol -1.1543 1.4380 -0.803
## percentageexpenditure 6.6659 2.1963 3.035
## HepatitisB 1.4491 2.1287 0.681
## Measles 1.0355 2.6360 0.393
## BMI 5.3533 1.6177 3.309
## Polio 4.4843 2.0459 2.192
## Totalexpenditure 0.6133 1.6622 0.369
## HIV.AIDS -8.1783 2.3326 -3.506
## Population -1.5258 3.6105 -0.423
## thinness1.19years 1.3032 2.5975 0.502
## Incomecompositionofresources 11.4121 2.0774 5.493
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.773192
## AdultMortality 0.000000000000054 ***
## Alcohol 0.423749
## percentageexpenditure 0.002955 **
## HepatitisB 0.497355
## Measles 0.695161
## BMI 0.001238 **
## Polio 0.030336 *
## Totalexpenditure 0.712781
## HIV.AIDS 0.000642 ***
## Population 0.673355
## thinness1.19years 0.616801
## Incomecompositionofresources 0.000000227478569 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.678 on 119 degrees of freedom
## Multiple R-squared: 0.9175, Adjusted R-squared: 0.9085
## F-statistic: 101.9 on 13 and 119 DF, p-value: < 0.00000000000000022
# check the final model
# fitModel now holds the last refit produced by the VIF pruning loop, so this repeats the summary the loop printed last
summary(fitModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + Polio +
## Totalexpenditure + HIV.AIDS + Population + thinness1.19years +
## Incomecompositionofresources, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9981 -1.5542 0.2724 1.9165 6.8223
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 60.2167 2.2667 26.566
## StatusDeveloping 0.2806 0.9713 0.289
## AdultMortality -20.0204 2.3463 -8.533
## Alcohol -1.1543 1.4380 -0.803
## percentageexpenditure 6.6659 2.1963 3.035
## HepatitisB 1.4491 2.1287 0.681
## Measles 1.0355 2.6360 0.393
## BMI 5.3533 1.6177 3.309
## Polio 4.4843 2.0459 2.192
## Totalexpenditure 0.6133 1.6622 0.369
## HIV.AIDS -8.1783 2.3326 -3.506
## Population -1.5258 3.6105 -0.423
## thinness1.19years 1.3032 2.5975 0.502
## Incomecompositionofresources 11.4121 2.0774 5.493
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.773192
## AdultMortality 0.000000000000054 ***
## Alcohol 0.423749
## percentageexpenditure 0.002955 **
## HepatitisB 0.497355
## Measles 0.695161
## BMI 0.001238 **
## Polio 0.030336 *
## Totalexpenditure 0.712781
## HIV.AIDS 0.000642 ***
## Population 0.673355
## thinness1.19years 0.616801
## Incomecompositionofresources 0.000000227478569 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.678 on 119 degrees of freedom
## Multiple R-squared: 0.9175, Adjusted R-squared: 0.9085
## F-statistic: 101.9 on 13 and 119 DF, p-value: < 0.00000000000000022
# check the VIFs of variables included in the final model
# after the pruning loop none of these should exceed the loop's cutoff of 5
vif(fitModel)
## Status AdultMortality
## 2.143093 3.712070
## Alcohol percentageexpenditure
## 2.942337 2.184693
## HepatitisB Measles
## 2.772146 1.530626
## BMI Polio
## 4.019142 3.376876
## Totalexpenditure HIV.AIDS
## 1.376386 2.129194
## Population thinness1.19years
## 1.942131 3.231656
## Incomecompositionofresources
## 3.942421
# keep the VIF-reduced model's predicted life expectancy for every row of led_normal
fitModelPreds <- predict(
  object = fitModel,
  type = "response", # predicted values on the response scale
  newdata = led_normal
)
# Can create a reduced model from the final model, after removing variables showing no significance
# banner so the console output marks where the reduced model starts
cat("\nReduced Model:\n")
##
## Reduced Model:
# keep only the six predictors whose p-value is below 0.05 in the summary of the final VIF-pruned model (fitModel)
tempModel <- lm(Lifeexpectancy ~ AdultMortality +
percentageexpenditure +
BMI +
Polio +
HIV.AIDS +
Incomecompositionofresources,
data = led_normal)
# check reduced model
summary(tempModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + percentageexpenditure +
## BMI + Polio + HIV.AIDS + Incomecompositionofresources, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3671 -1.5829 0.2918 1.8367 7.0570
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 62.140 1.452 42.808
## AdultMortality -20.551 2.130 -9.649
## percentageexpenditure 5.535 1.797 3.080
## BMI 4.808 1.146 4.195
## Polio 5.415 1.296 4.177
## HIV.AIDS -8.036 2.227 -3.608
## Incomecompositionofresources 10.297 1.802 5.713
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality < 0.0000000000000002 ***
## percentageexpenditure 0.002542 **
## BMI 0.0000511236 ***
## Polio 0.0000546972 ***
## HIV.AIDS 0.000443 ***
## Incomecompositionofresources 0.0000000758 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.624 on 126 degrees of freedom
## Multiple R-squared: 0.9161, Adjusted R-squared: 0.9121
## F-statistic: 229.4 on 6 and 126 DF, p-value: < 0.00000000000000022
# keep the reduced model's predicted life expectancy for every row of led_normal
tempModelPreds <- predict(
  object = tempModel,
  type = "response", # predicted values on the response scale
  newdata = led_normal
)
# reprint the full-model summary so the interpretation printed right after it can be read against the numbers
summary(fullModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ . - Country, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9653 -1.4823 0.1677 1.5897 5.8006
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.7196 2.3315 25.185
## StatusDeveloping 0.2523 0.9880 0.255
## AdultMortality -19.5355 2.2958 -8.509
## infantdeaths 14.9031 45.9627 0.324
## Alcohol -1.2084 1.4623 -0.826
## percentageexpenditure 4.5966 4.2042 1.093
## HepatitisB -0.6402 2.3585 -0.271
## Measles 3.1787 3.7936 0.838
## BMI 5.3145 1.6956 3.134
## under.fivedeaths -22.7974 39.8954 -0.571
## Polio -2.8431 2.9474 -0.965
## Totalexpenditure 0.1715 1.6193 0.106
## Diphtheria 10.5015 3.9026 2.691
## HIV.AIDS -8.8120 2.2692 -3.883
## GDP 1.6674 4.7548 0.351
## Population 3.8322 7.9135 0.484
## thinness1.19years 1.5632 6.8102 0.230
## thinness5.9years 0.5989 7.0202 0.085
## Incomecompositionofresources 7.7276 2.4092 3.208
## Schooling 5.1190 2.8931 1.769
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.798925
## AdultMortality 0.0000000000000837 ***
## infantdeaths 0.746353
## Alcohol 0.410339
## percentageexpenditure 0.276574
## HepatitisB 0.786546
## Measles 0.403850
## BMI 0.002195 **
## under.fivedeaths 0.568844
## Polio 0.336799
## Totalexpenditure 0.915851
## Diphtheria 0.008207 **
## HIV.AIDS 0.000174 ***
## GDP 0.726483
## Population 0.629136
## thinness1.19years 0.818867
## thinness5.9years 0.932161
## Incomecompositionofresources 0.001742 **
## Schooling 0.079523 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.581 on 113 degrees of freedom
## Multiple R-squared: 0.9273, Adjusted R-squared: 0.915
## F-statistic: 75.83 on 19 and 113 DF, p-value: < 0.00000000000000022
# print the written interpretation of the full model
# NOTE(review): in the full-model summary under.fivedeaths is not significant (p = 0.568844) and its VIF is
# about 324, so reading its estimate as the strongest effect (point 1) is questionable — confirm the wording
cat("\nInterpreting Full Model:\n1) In the full model, the largest parameter estimate is that for under.fivedeaths which is -22.7974, followed by AdultMortality which is -19.5355, indicating that the value of under.fivedeaths will affect the LifeExpectancy the most in a negative direction, followed by AdultMortality which will also pull the value of LifeExpeactancy down as it increases.\n2) On the other hand, variable Incomecompositionofresources will positively affect the LifeExpectancy.\n3) The p-value for AdultMortality can be seen to be very small, in fact the least among all other variables, thus indicating it is a very significant predictor for LifeExpectancy.\n")
##
## Interpreting Full Model:
## 1) In the full model, the largest parameter estimate is that for under.fivedeaths which is -22.7974, followed by AdultMortality which is -19.5355, indicating that the value of under.fivedeaths will affect the LifeExpectancy the most in a negative direction, followed by AdultMortality which will also pull the value of LifeExpeactancy down as it increases.
## 2) On the other hand, variable Incomecompositionofresources will positively affect the LifeExpectancy.
## 3) The p-value for AdultMortality can be seen to be very small, in fact the least among all other variables, thus indicating it is a very significant predictor for LifeExpectancy.
# reprint the VIF-pruned model's summary so the interpretation printed right after it can be read against the numbers
summary(fitModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ Status + AdultMortality + Alcohol +
## percentageexpenditure + HepatitisB + Measles + BMI + Polio +
## Totalexpenditure + HIV.AIDS + Population + thinness1.19years +
## Incomecompositionofresources, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9981 -1.5542 0.2724 1.9165 6.8223
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 60.2167 2.2667 26.566
## StatusDeveloping 0.2806 0.9713 0.289
## AdultMortality -20.0204 2.3463 -8.533
## Alcohol -1.1543 1.4380 -0.803
## percentageexpenditure 6.6659 2.1963 3.035
## HepatitisB 1.4491 2.1287 0.681
## Measles 1.0355 2.6360 0.393
## BMI 5.3533 1.6177 3.309
## Polio 4.4843 2.0459 2.192
## Totalexpenditure 0.6133 1.6622 0.369
## HIV.AIDS -8.1783 2.3326 -3.506
## Population -1.5258 3.6105 -0.423
## thinness1.19years 1.3032 2.5975 0.502
## Incomecompositionofresources 11.4121 2.0774 5.493
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.773192
## AdultMortality 0.000000000000054 ***
## Alcohol 0.423749
## percentageexpenditure 0.002955 **
## HepatitisB 0.497355
## Measles 0.695161
## BMI 0.001238 **
## Polio 0.030336 *
## Totalexpenditure 0.712781
## HIV.AIDS 0.000642 ***
## Population 0.673355
## thinness1.19years 0.616801
## Incomecompositionofresources 0.000000227478569 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.678 on 119 degrees of freedom
## Multiple R-squared: 0.9175, Adjusted R-squared: 0.9085
## F-statistic: 101.9 on 13 and 119 DF, p-value: < 0.00000000000000022
# print the written interpretation of the VIF-pruned model (fitModel)
# points 2 and 3 compare p-values with the full-model summary: Incomecompositionofresources goes from
# 0.001742 to 0.000000227 and percentageexpenditure from 0.276574 to 0.002955
cat("\nInterpreting Fit Model:\n1) As seen in the fit model, AdultMortality again has a large negative parameter estimate, indicating it will affect the LifeExpectancy in a negative direction.\n2) The significance of variable Incomecompositionofresources has increased as compared to that in the full model. Now, this variable has a very low p-value, at the same time has a high parameter estimate, indicating it significantly affects the value of LifeExpectancy positively.\n3) Variable percentageexpenditure also has an increased significance as compared to the full model.\n")
##
## Interpreting Fit Model:
## 1) As seen in the fit model, AdultMortality again has a large negative parameter estimate, indicating it will affect the LifeExpectancy in a negative direction.
## 2) The significance of variable Incomecompositionofresources has increased as compared to that in the full model. Now, this variable has a very low p-value, at the same time has a high parameter estimate, indicating it significantly affects the value of LifeExpectancy positively.
## 3) Variable percentageexpenditure also has an increased significance as compared to the full model.
# reprint the reduced model's summary so the interpretation printed right after it can be read against the numbers
summary(tempModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + percentageexpenditure +
## BMI + Polio + HIV.AIDS + Incomecompositionofresources, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3671 -1.5829 0.2918 1.8367 7.0570
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 62.140 1.452 42.808
## AdultMortality -20.551 2.130 -9.649
## percentageexpenditure 5.535 1.797 3.080
## BMI 4.808 1.146 4.195
## Polio 5.415 1.296 4.177
## HIV.AIDS -8.036 2.227 -3.608
## Incomecompositionofresources 10.297 1.802 5.713
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality < 0.0000000000000002 ***
## percentageexpenditure 0.002542 **
## BMI 0.0000511236 ***
## Polio 0.0000546972 ***
## HIV.AIDS 0.000443 ***
## Incomecompositionofresources 0.0000000758 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.624 on 126 degrees of freedom
## Multiple R-squared: 0.9161, Adjusted R-squared: 0.9121
## F-statistic: 229.4 on 6 and 126 DF, p-value: < 0.00000000000000022
# print the written interpretation of the reduced model (tempModel)
cat("\nInterpreting temp Model:\nThe output of this model is similar to the fit model, the only difference being this model is built considering only the significant variables from the fit model.\n")
##
## Interpreting temp Model:
## The output of this model is similar to the fit model, the only difference being this model is built considering only the significant variables from the fit model.
# first of three stepwise summaries printed back to back for comparison: the backward model
summary(backwardStepModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + Alcohol + percentageexpenditure +
## BMI + Diphtheria + HIV.AIDS + Incomecompositionofresources +
## Schooling, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9872 -1.3479 0.2182 1.6534 5.6030
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 59.360 1.659 35.785
## AdultMortality -19.710 2.126 -9.272
## Alcohol -1.975 1.240 -1.593
## percentageexpenditure 5.873 1.837 3.197
## BMI 4.491 1.158 3.877
## Diphtheria 7.502 1.582 4.742
## HIV.AIDS -8.574 2.142 -4.003
## Incomecompositionofresources 8.055 2.278 3.536
## Schooling 5.715 2.685 2.128
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality 0.000000000000000729 ***
## Alcohol 0.113782
## percentageexpenditure 0.001763 **
## BMI 0.000170 ***
## Diphtheria 0.000005715102005088 ***
## HIV.AIDS 0.000107 ***
## Incomecompositionofresources 0.000571 ***
## Schooling 0.035285 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.505 on 124 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9199
## F-statistic: 190.6 on 8 and 124 DF, p-value: < 0.00000000000000022
# second stepwise summary: the forward model (the recorded output lists the same eight predictors and estimates as the backward model, in a different order)
summary(forwardStepModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria +
## BMI + Incomecompositionofresources + HIV.AIDS + percentageexpenditure +
## Alcohol, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9872 -1.3479 0.2182 1.6534 5.6030
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 59.360 1.659 35.785
## AdultMortality -19.710 2.126 -9.272
## Schooling 5.715 2.685 2.128
## Diphtheria 7.502 1.582 4.742
## BMI 4.491 1.158 3.877
## Incomecompositionofresources 8.055 2.278 3.536
## HIV.AIDS -8.574 2.142 -4.003
## percentageexpenditure 5.873 1.837 3.197
## Alcohol -1.975 1.240 -1.593
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality 0.000000000000000729 ***
## Schooling 0.035285 *
## Diphtheria 0.000005715102005088 ***
## BMI 0.000170 ***
## Incomecompositionofresources 0.000571 ***
## HIV.AIDS 0.000107 ***
## percentageexpenditure 0.001763 **
## Alcohol 0.113782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.505 on 124 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9199
## F-statistic: 190.6 on 8 and 124 DF, p-value: < 0.00000000000000022
# third stepwise summary: the mixed (both-direction) model, whose recorded output matches the forward model's
summary(bothStepModel)
##
## Call:
## lm(formula = Lifeexpectancy ~ AdultMortality + Schooling + Diphtheria +
## BMI + Incomecompositionofresources + HIV.AIDS + percentageexpenditure +
## Alcohol, data = led_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9872 -1.3479 0.2182 1.6534 5.6030
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 59.360 1.659 35.785
## AdultMortality -19.710 2.126 -9.272
## Schooling 5.715 2.685 2.128
## Diphtheria 7.502 1.582 4.742
## BMI 4.491 1.158 3.877
## Incomecompositionofresources 8.055 2.278 3.536
## HIV.AIDS -8.574 2.142 -4.003
## percentageexpenditure 5.873 1.837 3.197
## Alcohol -1.975 1.240 -1.593
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## AdultMortality 0.000000000000000729 ***
## Schooling 0.035285 *
## Diphtheria 0.000005715102005088 ***
## BMI 0.000170 ***
## Incomecompositionofresources 0.000571 ***
## HIV.AIDS 0.000107 ***
## percentageexpenditure 0.001763 **
## Alcohol 0.113782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.505 on 124 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9199
## F-statistic: 190.6 on 8 and 124 DF, p-value: < 0.00000000000000022
# print the written interpretation shared by the backward, forward and mixed stepwise models
cat("\nInterpreting Stepwise Models:\n1) AdultMortality is the most significant variable with the numerically largest parameter estimate in all three stepwise model, indicating it has the maximum effect on the target variable, but in a negative direction.\n2) All the step models have the same model output, where Alcohol has the least significance among all variables.\n")
##
## Interpreting Stepwise Models:
## 1) AdultMortality is the most significant variable with the numerically largest parameter estimate in all three stepwise model, indicating it has the maximum effect on the target variable, but in a negative direction.
## 2) All the step models have the same model output, where Alcohol has the least significance among all variables.
# actual vs predicted
par(mfrow = c(2, 2)) # 2 x 2 grid: one panel per model

# Draw one actual-vs-predicted panel for a fitted linear model.
#   model: a fitted lm object
#   title: main title of the panel
#   ylab:  label for the y axis (predicted values)
# The observed response is read from the model frame, so it always has the
# same length as the fitted values, and the scatter and the trend line are
# drawn from the same data.
plotActualVsPredicted <- function(model, title, ylab) {
  actual <- model.response(model.frame(model))
  predicted <- model$fitted.values
  plot(x = actual,
       y = predicted,
       main = title,
       xlab = "Actual",
       ylab = ylab,
       pch = 19)
  abline(0, 1, lwd = 2) # this is a perfect prediction - 45 degree line
  # add the regression line of predicted on actual
  abline(lm(predicted ~ actual), col = "red", lwd = 2)
  invisible(NULL)
}

# For Full model
plotActualVsPredicted(fullModel,
                      "Actual vs Predicted using Full model",
                      "Predicted(fullModel)")
# for stepwise model
plotActualVsPredicted(bothStepModel,
                      "Actual vs Predicted using Stepwise Model",
                      "Predicted(stepModel)")
# for Fit model
plotActualVsPredicted(fitModel,
                      "Actual vs Predicted using FitModel",
                      "Predicted(fitModel)")
# for temp model
plotActualVsPredicted(tempModel,
                      "Actual vs Predicted using tempModel",
                      "Predicted(tempModel)")
# print the written reading of the four actual-vs-predicted panels (the string has no trailing newline)
cat("It can be seen in all the plots that the fitted values are quite tight around the regression line. This indicates that the model fit pretty well and the residual spread is not very wide.")
## It can be seen in all the plots that the fitted values are quite tight around the regression line. This indicates that the model fit pretty well and the residual spread is not very wide.
# To add Confidence Interval and Prediction Interval
# One ggplot per model of predicted vs. actual life expectancy:
#   * predict(..., interval = "prediction") yields the columns fit, lwr, upr
#   * blue line + grey band : linear smoother of fit on the actual values and
#                             its confidence band (stat_smooth, se on by default)
#   * dashed red lines      : lwr / upr, i.e. the prediction interval
# predict() is called without newdata, so for every model R warns that
# "predictions on current data refer to _future_ responses".

# par(mfrow) only affects base graphics, so it does not arrange the ggplot2
# plots below; the call is kept so the graphics state matches the original.
par(mfrow = c(2, 2))

library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha

# Plot title -> fitted model, in the order the plots are produced.
interval_plot_models <- list(
  "Actual vs. Predicted for FullModel" = fullModel,
  "Actual vs. Predicted for Stepwise Model" = bothStepModel,
  "Actual vs. Predicted for FitModel" = fitModel,
  "Actual vs. Predicted for TempModel" = tempModel
)

# A top-level for loop assigns in the global environment, so predictedLE,
# mydata and p_ci end up holding the tempModel results exactly as before.
for (plot_title in names(interval_plot_models)) {
  # predict Life expectancy
  predictedLE <- predict(
    interval_plot_models[[plot_title]],
    interval = "prediction"
  )
  mydata <- cbind.data.frame(led_normal, predictedLE)

  # points + regression line with its confidence band
  p_ci <- ggplot(mydata, aes(Lifeexpectancy, fit)) +
    geom_point() +
    stat_smooth(method = lm)

  # add the prediction interval; print() is required inside a loop
  print(
    p_ci +
      geom_line(aes(y = lwr), color = "red", linetype = "dashed") +
      geom_line(aes(y = upr), color = "red", linetype = "dashed") +
      ggtitle(plot_title) +
      xlab("Actual Life Expectancy") +
      ylab("Predicted Life Expectancy")
  )
}
cat("The plot above shows the actual vs. predicted plot with the regression line. Additionally, it also shows the prediction interval and the confidence interval for the predictions.\nThe blue line is the regression line, surrounding which in grey shade is the confidence interval.\nThe prediction interval is indicated by the dashed red lines both above and below the regression line. It can be seen that almost all the data points lie well within the prediction interval of 95%.")
## The plot above shows the actual vs. predicted plot with the regression line. Additionally, it also shows the prediction interval and the confidence interval for the predictions.
## The blue line is the regression line, surrounding which in grey shade is the confidence interval.
## The prediction interval is indicated by the dashed red lines both above and below the regression line. It can be seen that almost all the data points lie well within the prediction interval of 95%.
# analyzing model fit and residuals
# Goodness-of-fit summary for the full model: hydroGOF::gof() compares the
# predictions in fullModelPreds (defined earlier in the script) with the
# observed led_normal$Lifeexpectancy. The result auto-prints as the
# one-column table of measures (ME, MAE, MSE, RMSE, ..., R2, KGE, VE) below.
cat("\nFull Model accuracy : \n")
##
## Full Model accuracy :
hydroGOF::gof(sim = fullModelPreds,
obs = led_normal$Lifeexpectancy)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## [,1]
## ME 0.00
## MAE 1.87
## MSE 5.66
## RMSE 2.38
## NRMSE % 26.90
## PBIAS % 0.00
## RSR 0.27
## rSD 0.96
## NSE 0.93
## mNSE 0.75
## rNSE 0.91
## d 0.98
## md 0.87
## rd 0.98
## cp 0.96
## r 0.96
## R2 0.93
## bR2 0.93
## KGE 0.95
## VE 0.97
# Goodness-of-fit summary labelled "Null Model": compares nullModelPreds with
# the observed led_normal$Lifeexpectancy.
# NOTE(review): the table recorded below is identical, at the printed
# precision, to the Full Model table, while AIC(nullModel) reported further
# down (960.54) is far from the full model's (649.94). nullModelPreds does not
# look like predictions from nullModel - verify where it is computed.
cat("\nNull Model accuracy : \n")
##
## Null Model accuracy :
hydroGOF::gof(sim = nullModelPreds,
obs = led_normal$Lifeexpectancy)
## [,1]
## ME 0.00
## MAE 1.87
## MSE 5.66
## RMSE 2.38
## NRMSE % 26.90
## PBIAS % 0.00
## RSR 0.27
## rSD 0.96
## NSE 0.93
## mNSE 0.75
## rNSE 0.91
## d 0.98
## md 0.87
## rd 0.98
## cp 0.96
## r 0.96
## R2 0.93
## bR2 0.93
## KGE 0.95
## VE 0.97
# Goodness-of-fit summary for the fit model: compares fitModelPreds (defined
# earlier in the script) with the observed led_normal$Lifeexpectancy.
cat("\nFit Model accuracy : \n")
##
## Fit Model accuracy :
hydroGOF::gof(sim = fitModelPreds,
obs = led_normal$Lifeexpectancy)
## [,1]
## ME 0.00
## MAE 1.99
## MSE 6.41
## RMSE 2.53
## NRMSE % 28.60
## PBIAS % 0.00
## RSR 0.29
## rSD 0.96
## NSE 0.92
## mNSE 0.73
## rNSE 0.90
## d 0.98
## md 0.86
## rd 0.97
## cp 0.96
## r 0.96
## R2 0.92
## bR2 0.92
## KGE 0.94
## VE 0.97
# Goodness-of-fit summary for the temp model: compares tempModelPreds (defined
# earlier in the script) with the observed led_normal$Lifeexpectancy.
cat("\nTemp Model accuracy : \n")
##
## Temp Model accuracy :
hydroGOF::gof(sim = tempModelPreds,
obs = led_normal$Lifeexpectancy)
## [,1]
## ME 0.00
## MAE 2.01
## MSE 6.52
## RMSE 2.55
## NRMSE % 28.80
## PBIAS % 0.00
## RSR 0.29
## rSD 0.96
## NSE 0.92
## mNSE 0.73
## rNSE 0.89
## d 0.98
## md 0.86
## rd 0.97
## cp 0.96
## r 0.96
## R2 0.92
## bR2 0.91
## KGE 0.94
## VE 0.97
# Goodness-of-fit summary for the backward stepwise model: compares bsmPreds
# (presumably that model's predictions; defined earlier in the script) with
# the observed led_normal$Lifeexpectancy.
cat("\nBackward Step Model accuracy : \n")
##
## Backward Step Model accuracy :
hydroGOF::gof(sim = bsmPreds,
obs = led_normal$Lifeexpectancy)
## [,1]
## ME 0.00
## MAE 1.89
## MSE 5.85
## RMSE 2.42
## NRMSE % 27.30
## PBIAS % 0.00
## RSR 0.27
## rSD 0.96
## NSE 0.92
## mNSE 0.74
## rNSE 0.91
## d 0.98
## md 0.87
## rd 0.98
## cp 0.96
## r 0.96
## R2 0.92
## bR2 0.92
## KGE 0.95
## VE 0.97
# Goodness-of-fit summary for the forward stepwise model: compares fsmPreds
# (presumably that model's predictions; defined earlier in the script) with
# the observed led_normal$Lifeexpectancy.
cat("\nForward Step Model accuracy : \n")
##
## Forward Step Model accuracy :
hydroGOF::gof(sim = fsmPreds,
obs = led_normal$Lifeexpectancy)
## [,1]
## ME 0.00
## MAE 1.89
## MSE 5.85
## RMSE 2.42
## NRMSE % 27.30
## PBIAS % 0.00
## RSR 0.27
## rSD 0.96
## NSE 0.92
## mNSE 0.74
## rNSE 0.91
## d 0.98
## md 0.87
## rd 0.98
## cp 0.96
## r 0.96
## R2 0.92
## bR2 0.92
## KGE 0.95
## VE 0.97
# Goodness-of-fit summary for the mixed (both-direction) stepwise model:
# compares msmPreds (presumably that model's predictions; defined earlier in
# the script) with the observed led_normal$Lifeexpectancy.
cat("\nMixed Step Model accuracy : \n")
##
## Mixed Step Model accuracy :
hydroGOF::gof(sim = msmPreds,
obs = led_normal$Lifeexpectancy)
## [,1]
## ME 0.00
## MAE 1.89
## MSE 5.85
## RMSE 2.42
## NRMSE % 27.30
## PBIAS % 0.00
## RSR 0.27
## rSD 0.96
## NSE 0.92
## mNSE 0.74
## rNSE 0.91
## d 0.98
## md 0.87
## rd 0.98
## cp 0.96
## r 0.96
## R2 0.92
## bR2 0.92
## KGE 0.95
## VE 0.97
# Printed conclusion of the goodness-of-fit comparison.
# NOTE(review): every table above compares predictions with
# led_normal$Lifeexpectancy. If that is the data the models were fitted on,
# these are in-sample errors; for nested least-squares models the residual sum
# of squares cannot increase as predictors are added, so the full model
# ranking first would be expected - confirm whether a hold-out comparison was
# intended.
cat("Looking at the Performance metrics of all the models, based on the error metrics (RMSE, ME, MAE, MSE and %NRMSE) and the R-Squares, it is evident that the Full Model is the best model here.")
## Looking at the Performance metrics of all the models, based on the error metrics (RMSE, ME, MAE, MSE and %NRMSE) and the R-Squares, it is evident that the Full Model is the best model here.
# comparing models using Anova
# anova() on two lm fits runs a partial F-test. The test is only meaningful
# for nested models, i.e. when every term of the smaller model also appears in
# the larger one, so each comparison below pairs a reduced model with
# fullModel, which contains all of its terms.

# fitModel (13 predictors) vs. fullModel
anova(fitModel, fullModel)
## Analysis of Variance Table
##
## Model 1: Lifeexpectancy ~ Status + AdultMortality + Alcohol + percentageexpenditure +
## HepatitisB + Measles + BMI + Polio + Totalexpenditure + HIV.AIDS +
## Population + thinness1.19years + Incomecompositionofresources
## Model 2: Lifeexpectancy ~ (Country + Status + AdultMortality + infantdeaths +
## Alcohol + percentageexpenditure + HepatitisB + Measles +
## BMI + under.fivedeaths + Polio + Totalexpenditure + Diphtheria +
## HIV.AIDS + GDP + Population + thinness1.19years + thinness5.9years +
## Incomecompositionofresources + Schooling) - Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 119 853.14
## 2 113 752.55 6 100.58 2.5172 0.02526 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p = 0.025: the six terms fitModel leaves out improve the fit significantly
# at the 5% level.

# tempModel (6 predictors) vs. fullModel
anova(tempModel, fullModel)
## Analysis of Variance Table
##
## Model 1: Lifeexpectancy ~ AdultMortality + percentageexpenditure + BMI +
## Polio + HIV.AIDS + Incomecompositionofresources
## Model 2: Lifeexpectancy ~ (Country + Status + AdultMortality + infantdeaths +
## Alcohol + percentageexpenditure + HepatitisB + Measles +
## BMI + under.fivedeaths + Polio + Totalexpenditure + Diphtheria +
## HIV.AIDS + GDP + Population + thinness1.19years + thinness5.9years +
## Incomecompositionofresources + Schooling) - Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 126 867.73
## 2 113 752.55 13 115.18 1.3303 0.2059
# p = 0.21: the full model is not significantly better than tempModel.

# Stepwise model (AdultMortality, Schooling, Diphtheria, BMI,
# Incomecompositionofresources, HIV.AIDS, percentageexpenditure, Alcohol)
# vs. fullModel. All eight predictors are in fullModel, so this is a valid
# nested comparison. It replaces anova(tempModel, bothStepModel): those two
# models are not nested (tempModel contains Polio; the stepwise model contains
# Schooling, Diphtheria and Alcohol instead), so that F-test was not valid.
# tempModel and the stepwise model are compared through AIC/BIC instead.
anova(bothStepModel, fullModel)
# Recorded output must be regenerated on the next run. From the residual sums
# of squares of the earlier run (stepwise: 778.12 on 124 df; full: 752.55 on
# 113 df) the table should show Df = 11, Sum of Sq = 25.57 and F of about 0.35,
# i.e. the full model is not significantly better than the stepwise model.
cat("Looking at the results of anova test to compare models, the Stepwise model is turning out to be the best among all.")
## Looking at the results of anova test to compare models, the Stepwise model is turning out to be the best among all.
# checking the AIC and the BIC of each model
# Both criteria are printed for the same seven fits, in the same order. The
# labels are part of the printed report and are kept verbatim (including the
# "Null Model" / "Null model" spelling and the missing leading newline on the
# very first label).
ic_fits <- list(
  fullModel, nullModel, fitModel, tempModel,
  backwardStepModel, forwardStepModel, bothStepModel
)
aic_labels <- c(
  "AIC of Full model: ",
  "\nAIC of Null Model: ",
  "\nAIC of fit model: ",
  "\nAIC of temp model: ",
  "\nAIC of backward step model: ",
  "\nAIC of forward step model: ",
  "\nAIC of mixed step model: "
)
bic_labels <- c(
  "\nBIC of Full model: ",
  "\nBIC of Null model: ",
  "\nBIC of fit model: ",
  "\nBIC of temp model: ",
  "\nBIC of backward step model: ",
  "\nBIC of forward step model: ",
  "\nBIC of mixed step model: "
)

# AIC first ...
for (i in seq_along(ic_fits)) {
  cat(aic_labels[i], AIC(ic_fits[[i]]))
}
# ... then BIC
for (i in seq_along(ic_fits)) {
  cat(bic_labels[i], BIC(ic_fits[[i]]))
}
## AIC of Full model: 649.9428
## AIC of Null Model: 960.5366
## AIC of fit model: 654.6274
## AIC of temp model: 642.8832
## AIC of backward step model: 632.3857
## AIC of forward step model: 632.3857
## AIC of mixed step model: 632.3857
## BIC of Full model: 710.6402
## BIC of Null model: 966.3173
## BIC of fit model: 697.9826
## BIC of temp model: 666.006
## BIC of backward step model: 661.2892
## BIC of forward step model: 661.2892
## BIC of mixed step model: 661.2892

# Printed conclusion of the AIC/BIC comparison.
cat("Looking at the AICs and BICs for all the models, based on this metric, the stepwise models created (forward step, backward step and mixed) turn out to be the best among all the models.")
## Looking at the AICs and BICs for all the models, based on this metric, the stepwise models created (forward step, backward step and mixed) turn out to be the best among all the models.
# Closing remarks of the model comparison, printed verbatim.
# method_summary recaps which model each of the three comparison methods
# favoured; final_recommendation states when to use which model.
method_summary <- "No!\nThere are various methods that can be used to assess a model and compare multiple models to decide on the best model. We have used three popular methods to compare our models, and each method reached to a different conclusion.\nMethod 1: Checking Goodness of fit:\nUsing this method, we came to a conclusion that the Full model is the best among all models.\nMethod 2: Comparing models using Anova:\nAfter comparing all the models using anova, the Stepwise Model turned out to be the best.\nMethod 3: Checking AIC and BIC:\nAfter checking AIC and BIC for each model, the Stepwise Model turned out to be the best again."
cat(method_summary)
## No!
## There are various methods that can be used to assess a model and compare multiple models to decide on the best model. We have used three popular methods to compare our models, and each method reached to a different conclusion.
## Method 1: Checking Goodness of fit:
## Using this method, we came to a conclusion that the Full model is the best among all models.
## Method 2: Comparing models using Anova:
## After comparing all the models using anova, the Stepwise Model turned out to be the best.
## Method 3: Checking AIC and BIC:
## After checking AIC and BIC for each model, the Stepwise Model turned out to be the best again.
final_recommendation <- "As the models are compared based on different metrics in these methods, the final conclusion that can be drawn is:\n1) The Full model should be used when the model requires to have least error, and precision is of high importance.\n2) In case of requirement for a less complex model, the Stepwise Model should be used.\n"
cat(final_recommendation)
## As the models are compared based on different metrics in these methods, the final conclusion that can be drawn is:
## 1) The Full model should be used when the model requires to have least error, and precision is of high importance.
## 2) In case of requirement for a less complex model, the Stepwise Model should be used.