# Climate change
setwd("C:/Users/jzchen/Documents/Courses/Analytics edge/Unit 2")
climate <- read.csv("climate_change.csv")
str(climate)
## 'data.frame': 308 obs. of 11 variables:
## $ Year : int 1983 1983 1983 1983 1983 1983 1983 1983 1984 1984 ...
## $ Month : int 5 6 7 8 9 10 11 12 1 2 ...
## $ MEI : num 2.556 2.167 1.741 1.13 0.428 ...
## $ CO2 : num 346 346 344 342 340 ...
## $ CH4 : num 1639 1634 1633 1631 1648 ...
## $ N2O : num 304 304 304 304 304 ...
## $ CFC.11 : num 191 192 193 194 194 ...
## $ CFC.12 : num 350 352 354 356 357 ...
## $ TSI : num 1366 1366 1366 1366 1366 ...
## $ Aerosols: num 0.0863 0.0794 0.0731 0.0673 0.0619 0.0569 0.0524 0.0486 0.0451 0.0416 ...
## $ Temp : num 0.109 0.118 0.137 0.176 0.149 0.093 0.232 0.078 0.089 0.013 ...
climate_train <- subset(climate, Year <= 2006)
climate_test <- subset(climate, Year > 2006)
TempReg <- lm(Temp ~ MEI + CO2+ CH4 + N2O + CFC.11 + CFC.12 + TSI + Aerosols, data = climate_train)
summary(TempReg)
##
## Call:
## lm(formula = Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 +
## TSI + Aerosols, data = climate_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25888 -0.05913 -0.00082 0.05649 0.32433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.246e+02 1.989e+01 -6.265 1.43e-09 ***
## MEI 6.421e-02 6.470e-03 9.923 < 2e-16 ***
## CO2 6.457e-03 2.285e-03 2.826 0.00505 **
## CH4 1.240e-04 5.158e-04 0.240 0.81015
## N2O -1.653e-02 8.565e-03 -1.930 0.05467 .
## CFC.11 -6.631e-03 1.626e-03 -4.078 5.96e-05 ***
## CFC.12 3.808e-03 1.014e-03 3.757 0.00021 ***
## TSI 9.314e-02 1.475e-02 6.313 1.10e-09 ***
## Aerosols -1.538e+00 2.133e-01 -7.210 5.41e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09171 on 275 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7436
## F-statistic: 103.6 on 8 and 275 DF, p-value: < 2.2e-16
cor(climate_train)
## Year Month MEI CO2 CH4
## Year 1.00000000 -0.0279419602 -0.0369876842 0.98274939 0.91565945
## Month -0.02794196 1.0000000000 0.0008846905 -0.10673246 0.01856866
## MEI -0.03698768 0.0008846905 1.0000000000 -0.04114717 -0.03341930
## CO2 0.98274939 -0.1067324607 -0.0411471651 1.00000000 0.87727963
## CH4 0.91565945 0.0185686624 -0.0334193014 0.87727963 1.00000000
## N2O 0.99384523 0.0136315303 -0.0508197755 0.97671982 0.89983864
## CFC.11 0.56910643 -0.0131112236 0.0690004387 0.51405975 0.77990402
## CFC.12 0.89701166 0.0006751102 0.0082855443 0.85268963 0.96361625
## TSI 0.17030201 -0.0346061935 -0.1544919227 0.17742893 0.24552844
## Aerosols -0.34524670 0.0148895406 0.3402377871 -0.35615480 -0.26780919
## Temp 0.78679714 -0.0998567411 0.1724707512 0.78852921 0.70325502
## N2O CFC.11 CFC.12 TSI Aerosols
## Year 0.99384523 0.56910643 0.8970116635 0.17030201 -0.34524670
## Month 0.01363153 -0.01311122 0.0006751102 -0.03460619 0.01488954
## MEI -0.05081978 0.06900044 0.0082855443 -0.15449192 0.34023779
## CO2 0.97671982 0.51405975 0.8526896272 0.17742893 -0.35615480
## CH4 0.89983864 0.77990402 0.9636162478 0.24552844 -0.26780919
## N2O 1.00000000 0.52247732 0.8679307757 0.19975668 -0.33705457
## CFC.11 0.52247732 1.00000000 0.8689851828 0.27204596 -0.04392120
## CFC.12 0.86793078 0.86898518 1.0000000000 0.25530281 -0.22513124
## TSI 0.19975668 0.27204596 0.2553028138 1.00000000 0.05211651
## Aerosols -0.33705457 -0.04392120 -0.2251312440 0.05211651 1.00000000
## Temp 0.77863893 0.40771029 0.6875575483 0.24338269 -0.38491375
## Temp
## Year 0.78679714
## Month -0.09985674
## MEI 0.17247075
## CO2 0.78852921
## CH4 0.70325502
## N2O 0.77863893
## CFC.11 0.40771029
## CFC.12 0.68755755
## TSI 0.24338269
## Aerosols -0.38491375
## Temp 1.00000000
TempReg_1 <- lm(Temp ~ MEI + N2O + TSI + Aerosols, data = climate_train)
summary(TempReg_1)
##
## Call:
## lm(formula = Temp ~ MEI + N2O + TSI + Aerosols, data = climate_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27916 -0.05975 -0.00595 0.05672 0.34195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.162e+02 2.022e+01 -5.747 2.37e-08 ***
## MEI 6.419e-02 6.652e-03 9.649 < 2e-16 ***
## N2O 2.532e-02 1.311e-03 19.307 < 2e-16 ***
## TSI 7.949e-02 1.487e-02 5.344 1.89e-07 ***
## Aerosols -1.702e+00 2.180e-01 -7.806 1.19e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09547 on 279 degrees of freedom
## Multiple R-squared: 0.7261, Adjusted R-squared: 0.7222
## F-statistic: 184.9 on 4 and 279 DF, p-value: < 2.2e-16
# use step function to automate
TempReg_step <- step(TempReg)
## Start: AIC=-1348.16
## Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
##
## Df Sum of Sq RSS AIC
## - CH4 1 0.00049 2.3135 -1350.1
## <none> 2.3130 -1348.2
## - N2O 1 0.03132 2.3443 -1346.3
## - CO2 1 0.06719 2.3802 -1342.0
## - CFC.12 1 0.11874 2.4318 -1335.9
## - CFC.11 1 0.13986 2.4529 -1333.5
## - TSI 1 0.33516 2.6482 -1311.7
## - Aerosols 1 0.43727 2.7503 -1301.0
## - MEI 1 0.82823 3.1412 -1263.2
##
## Step: AIC=-1350.1
## Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
##
## Df Sum of Sq RSS AIC
## <none> 2.3135 -1350.1
## - N2O 1 0.03133 2.3448 -1348.3
## - CO2 1 0.06672 2.3802 -1344.0
## - CFC.12 1 0.13023 2.4437 -1336.5
## - CFC.11 1 0.13938 2.4529 -1335.5
## - TSI 1 0.33500 2.6485 -1313.7
## - Aerosols 1 0.43987 2.7534 -1302.7
## - MEI 1 0.83118 3.1447 -1264.9
summary(TempReg_step)
##
## Call:
## lm(formula = Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI +
## Aerosols, data = climate_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25770 -0.05994 -0.00104 0.05588 0.32203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.245e+02 1.985e+01 -6.273 1.37e-09 ***
## MEI 6.407e-02 6.434e-03 9.958 < 2e-16 ***
## CO2 6.402e-03 2.269e-03 2.821 0.005129 **
## N2O -1.602e-02 8.287e-03 -1.933 0.054234 .
## CFC.11 -6.609e-03 1.621e-03 -4.078 5.95e-05 ***
## CFC.12 3.868e-03 9.812e-04 3.942 0.000103 ***
## TSI 9.312e-02 1.473e-02 6.322 1.04e-09 ***
## Aerosols -1.540e+00 2.126e-01 -7.244 4.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09155 on 276 degrees of freedom
## Multiple R-squared: 0.7508, Adjusted R-squared: 0.7445
## F-statistic: 118.8 on 7 and 276 DF, p-value: < 2.2e-16
Temp_predict <- predict(TempReg_step, newdata = climate_test)
Temp_predict
## 285 286 287 288 289 290 291
## 0.4677808 0.4435404 0.4265541 0.4299162 0.4455113 0.4151422 0.4097367
## 292 293 294 295 296 297 298
## 0.3839390 0.3255595 0.3274147 0.3231401 0.3316704 0.3522134 0.3313129
## 299 300 301 302 303 304 305
## 0.3142112 0.3703410 0.4162213 0.4391458 0.4237965 0.3913679 0.3587615
## 306 307 308
## 0.3451991 0.3607087 0.3638076
SSE <- sum((Temp_predict- climate_test$Temp)^2)
SST <- sum((mean(climate_train$Temp)-climate_test$Temp)^2)
R2 <- 1 - SSE/SST
R2
## [1] 0.6286051
# reading test scores
pisa_train <- read.csv("pisa2009train.csv")
pisa_test <- read.csv("pisa2009test.csv")
str(pisa_train)
## 'data.frame': 3663 obs. of 24 variables:
## $ grade : int 11 11 9 10 10 10 10 10 9 10 ...
## $ male : int 1 1 1 0 1 1 0 0 0 1 ...
## $ raceeth : Factor w/ 7 levels "American Indian/Alaska Native",..: NA 7 7 3 4 3 2 7 7 5 ...
## $ preschool : int NA 0 1 1 1 1 0 1 1 1 ...
## $ expectBachelors : int 0 0 1 1 0 1 1 1 0 1 ...
## $ motherHS : int NA 1 1 0 1 NA 1 1 1 1 ...
## $ motherBachelors : int NA 1 1 0 0 NA 0 0 NA 1 ...
## $ motherWork : int 1 1 1 1 1 1 1 0 1 1 ...
## $ fatherHS : int NA 1 1 1 1 1 NA 1 0 0 ...
## $ fatherBachelors : int NA 0 NA 0 0 0 NA 0 NA 0 ...
## $ fatherWork : int 1 1 1 1 0 1 NA 1 1 1 ...
## $ selfBornUS : int 1 1 1 1 1 1 0 1 1 1 ...
## $ motherBornUS : int 0 1 1 1 1 1 1 1 1 1 ...
## $ fatherBornUS : int 0 1 1 1 0 1 NA 1 1 1 ...
## $ englishAtHome : int 0 1 1 1 1 1 1 1 1 1 ...
## $ computerForSchoolwork: int 1 1 1 1 1 1 1 1 1 1 ...
## $ read30MinsADay : int 0 1 0 1 1 0 0 1 0 0 ...
## $ minutesPerWeekEnglish: int 225 450 250 200 250 300 250 300 378 294 ...
## $ studentsInEnglish : int NA 25 28 23 35 20 28 30 20 24 ...
## $ schoolHasLibrary : int 1 1 1 1 1 1 1 1 0 1 ...
## $ publicSchool : int 1 1 1 1 1 1 1 1 1 1 ...
## $ urban : int 1 0 0 1 1 0 1 0 1 0 ...
## $ schoolSize : int 673 1173 1233 2640 1095 227 2080 1913 502 899 ...
## $ readingScore : num 476 575 555 458 614 ...
# what is the average reading test score of males?
tapply(pisa_train$readingScore, pisa_train$male,mean)
## 0 1
## 512.9406 483.5325
summary(pisa_train)
## grade male raceeth
## Min. : 8.00 Min. :0.0000 White :2015
## 1st Qu.:10.00 1st Qu.:0.0000 Hispanic : 834
## Median :10.00 Median :1.0000 Black : 444
## Mean :10.09 Mean :0.5111 Asian : 143
## 3rd Qu.:10.00 3rd Qu.:1.0000 More than one race: 124
## Max. :12.00 Max. :1.0000 (Other) : 68
## NA's : 35
## preschool expectBachelors motherHS motherBachelors
## Min. :0.0000 Min. :0.0000 Min. :0.00 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.00 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.00 Median :0.0000
## Mean :0.7228 Mean :0.7859 Mean :0.88 Mean :0.3481
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.00 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00 Max. :1.0000
## NA's :56 NA's :62 NA's :97 NA's :397
## motherWork fatherHS fatherBachelors fatherWork
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.7345 Mean :0.8593 Mean :0.3319 Mean :0.8531
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :93 NA's :245 NA's :569 NA's :233
## selfBornUS motherBornUS fatherBornUS englishAtHome
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.9313 Mean :0.7725 Mean :0.7668 Mean :0.8717
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :69 NA's :71 NA's :113 NA's :71
## computerForSchoolwork read30MinsADay minutesPerWeekEnglish
## Min. :0.0000 Min. :0.0000 Min. : 0.0
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.: 225.0
## Median :1.0000 Median :0.0000 Median : 250.0
## Mean :0.8994 Mean :0.2899 Mean : 266.2
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 300.0
## Max. :1.0000 Max. :1.0000 Max. :2400.0
## NA's :65 NA's :34 NA's :186
## studentsInEnglish schoolHasLibrary publicSchool urban
## Min. : 1.0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:20.0 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :25.0 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :24.5 Mean :0.9676 Mean :0.9339 Mean :0.3849
## 3rd Qu.:30.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :75.0 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :249 NA's :143
## schoolSize readingScore
## Min. : 100 Min. :168.6
## 1st Qu.: 712 1st Qu.:431.7
## Median :1212 Median :499.7
## Mean :1369 Mean :497.9
## 3rd Qu.:1900 3rd Qu.:566.2
## Max. :6694 Max. :746.0
## NA's :162
# Linear regression discards observations with missing data, so we will
# remove all such observations from the training and testing sets. Later
# in the course, we will learn about imputation, which deals with missing
# data by filling in missing values with plausible information.
pisa_train <- na.omit(pisa_train)
pisa_test <- na.omit(pisa_test)
nrow(pisa_test)
## [1] 990
lmScore <- lm(readingScore ~., data = pisa_train)
summary(lmScore)
##
## Call:
## lm(formula = readingScore ~ ., data = pisa_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -247.44 -48.86 1.86 49.77 217.18
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 76.489006 37.302678
## grade 29.542707 2.937399
## male -14.521653 3.155926
## raceethAsian 63.167002 18.972648
## raceethBlack 0.264980 17.369507
## raceethHispanic 28.301842 17.258860
## raceethMore than one race 50.354805 18.570123
## raceethNative Hawaiian/Other Pacific Islander 62.175726 23.782766
## raceethWhite 67.277327 16.786935
## preschool -4.463670 3.486055
## expectBachelors 55.267080 4.293893
## motherHS 6.058774 6.091423
## motherBachelors 12.638068 3.861457
## motherWork -2.809101 3.521827
## fatherHS 4.018214 5.579269
## fatherBachelors 16.929755 3.995253
## fatherWork 5.842798 4.395978
## selfBornUS -3.806278 7.323718
## motherBornUS -8.798153 6.587621
## fatherBornUS 4.306994 6.263875
## englishAtHome 8.035685 6.859492
## computerForSchoolwork 22.500232 5.702562
## read30MinsADay 34.871924 3.408447
## minutesPerWeekEnglish 0.012788 0.010712
## studentsInEnglish -0.286631 0.227819
## schoolHasLibrary 12.215085 9.264884
## publicSchool -16.857475 6.725614
## urban -0.110132 3.962724
## schoolSize 0.006540 0.002197
## t value Pr(>|t|)
## (Intercept) 2.050 0.040425 *
## grade 10.057 < 2e-16 ***
## male -4.601 4.42e-06 ***
## raceethAsian 3.329 0.000884 ***
## raceethBlack 0.015 0.987830
## raceethHispanic 1.640 0.101169
## raceethMore than one race 2.712 0.006744 **
## raceethNative Hawaiian/Other Pacific Islander 2.614 0.008997 **
## raceethWhite 4.008 6.32e-05 ***
## preschool -1.280 0.200516
## expectBachelors 12.871 < 2e-16 ***
## motherHS 0.995 0.320012
## motherBachelors 3.273 0.001080 **
## motherWork -0.798 0.425167
## fatherHS 0.720 0.471470
## fatherBachelors 4.237 2.35e-05 ***
## fatherWork 1.329 0.183934
## selfBornUS -0.520 0.603307
## motherBornUS -1.336 0.181821
## fatherBornUS 0.688 0.491776
## englishAtHome 1.171 0.241527
## computerForSchoolwork 3.946 8.19e-05 ***
## read30MinsADay 10.231 < 2e-16 ***
## minutesPerWeekEnglish 1.194 0.232644
## studentsInEnglish -1.258 0.208460
## schoolHasLibrary 1.318 0.187487
## publicSchool -2.506 0.012261 *
## urban -0.028 0.977830
## schoolSize 2.977 0.002942 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.81 on 2385 degrees of freedom
## Multiple R-squared: 0.3251, Adjusted R-squared: 0.3172
## F-statistic: 41.04 on 28 and 2385 DF, p-value: < 2.2e-16
# the model will break down the raceeth variable to different variables, raceethAsian,
# raceethBlack, etc.
# To include unordered factors in a linear regression model, we define one
# level as the "reference level" and add a binary variable for each of the
# remaining levels. In this way, a factor with n levels is replaced by n-1
# binary variables. The reference level is typically selected to be the most
# frequently occurring level in the dataset.
#
# As an example, consider the unordered factor variable "color", with levels
# "red", "green", and "blue". If "green" were the reference level, then we
# would add binary variables "colorred" and "colorblue" to a linear regression
# problem. All red examples would have colorred=1 and colorblue=0. All blue
# examples would have colorred=0 and colorblue=1. All green examples would have
# colorred=0 and colorblue=0.
# Because the race variable takes on text values, it was loaded as a factor
# variable when we read in the dataset with read.csv() -- you can see this when
# you run str(pisaTrain) or str(pisaTest). However, by default R selects the first
# level alphabetically ("American Indian/Alaska Native") as the reference level of
# our factor instead of the most common level ("White"). Set the reference level of
# the factor by typing the following two lines in your R console:
pisa_train$raceeth <- relevel(pisa_train$raceeth, "White")
pisa_test$raceeth <- relevel(pisa_test$raceeth, "White")
lmScore <- lm(readingScore ~., data = pisa_train)
summary(lmScore)
##
## Call:
## lm(formula = readingScore ~ ., data = pisa_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -247.44 -48.86 1.86 49.77 217.18
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 143.766333 33.841226
## grade 29.542707 2.937399
## male -14.521653 3.155926
## raceethAmerican Indian/Alaska Native -67.277327 16.786935
## raceethAsian -4.110325 9.220071
## raceethBlack -67.012347 5.460883
## raceethHispanic -38.975486 5.177743
## raceethMore than one race -16.922522 8.496268
## raceethNative Hawaiian/Other Pacific Islander -5.101601 17.005696
## preschool -4.463670 3.486055
## expectBachelors 55.267080 4.293893
## motherHS 6.058774 6.091423
## motherBachelors 12.638068 3.861457
## motherWork -2.809101 3.521827
## fatherHS 4.018214 5.579269
## fatherBachelors 16.929755 3.995253
## fatherWork 5.842798 4.395978
## selfBornUS -3.806278 7.323718
## motherBornUS -8.798153 6.587621
## fatherBornUS 4.306994 6.263875
## englishAtHome 8.035685 6.859492
## computerForSchoolwork 22.500232 5.702562
## read30MinsADay 34.871924 3.408447
## minutesPerWeekEnglish 0.012788 0.010712
## studentsInEnglish -0.286631 0.227819
## schoolHasLibrary 12.215085 9.264884
## publicSchool -16.857475 6.725614
## urban -0.110132 3.962724
## schoolSize 0.006540 0.002197
## t value Pr(>|t|)
## (Intercept) 4.248 2.24e-05 ***
## grade 10.057 < 2e-16 ***
## male -4.601 4.42e-06 ***
## raceethAmerican Indian/Alaska Native -4.008 6.32e-05 ***
## raceethAsian -0.446 0.65578
## raceethBlack -12.271 < 2e-16 ***
## raceethHispanic -7.528 7.29e-14 ***
## raceethMore than one race -1.992 0.04651 *
## raceethNative Hawaiian/Other Pacific Islander -0.300 0.76421
## preschool -1.280 0.20052
## expectBachelors 12.871 < 2e-16 ***
## motherHS 0.995 0.32001
## motherBachelors 3.273 0.00108 **
## motherWork -0.798 0.42517
## fatherHS 0.720 0.47147
## fatherBachelors 4.237 2.35e-05 ***
## fatherWork 1.329 0.18393
## selfBornUS -0.520 0.60331
## motherBornUS -1.336 0.18182
## fatherBornUS 0.688 0.49178
## englishAtHome 1.171 0.24153
## computerForSchoolwork 3.946 8.19e-05 ***
## read30MinsADay 10.231 < 2e-16 ***
## minutesPerWeekEnglish 1.194 0.23264
## studentsInEnglish -1.258 0.20846
## schoolHasLibrary 1.318 0.18749
## publicSchool -2.506 0.01226 *
## urban -0.028 0.97783
## schoolSize 2.977 0.00294 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.81 on 2385 degrees of freedom
## Multiple R-squared: 0.3251, Adjusted R-squared: 0.3172
## F-statistic: 41.04 on 28 and 2385 DF, p-value: < 2.2e-16
SSE <- sum(lmScore$residuals^2)
RMSE <- sqrt(SSE/nrow(pisa_train))
RMSE
## [1] 73.36555
# predicting
predTest <- predict(lmScore, newdata = pisa_test)
summary(predTest)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 353.2 482.0 524.0 516.7 555.7 637.7
SSE <- sum((predTest - pisa_test$readingScore)^2)
SSE
## [1] 5762082
RMSE <- sqrt(SSE/nrow(pisa_test))
RMSE
## [1] 76.29079
# What is the predicted test score used in the baseline model?
mean(pisa_train$readingScore)
## [1] 517.9629
# What is the sum of squared errors of the baseline model on the testing set?
# HINT: We call the sum of squared errors for the baseline model the total
# sum of squares (SST).
SST <- sum((mean(pisa_train$readingScore) - pisa_test$readingScore)^2)
SST
## [1] 7802354
R2 <- 1-SSE/SST
R2
## [1] 0.2614944
# DETECTING FLU EPIDEMICS VIA SEARCH ENGINE QUERY DATA
FluTrain <- read.csv("FluTrain.csv")
str(FluTrain)
## 'data.frame': 417 obs. of 3 variables:
## $ Week : Factor w/ 417 levels "2004-01-04 - 2004-01-10",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ILI : num 2.42 1.81 1.71 1.54 1.44 ...
## $ Queries: num 0.238 0.22 0.226 0.238 0.224 ...
which.max(FluTrain$ILI)
## [1] 303
FluTrain$Week[303]
## [1] 2009-10-18 - 2009-10-24
## 417 Levels: 2004-01-04 - 2004-01-10 ... 2011-12-25 - 2011-12-31
which.max(FluTrain$Queries)
## [1] 303
hist(FluTrain$ILI)

plot(log(FluTrain$ILI),FluTrain$Queries)
FluTrend1 <- lm(log(ILI)~Queries, data = FluTrain)
summary(FluTrend1)
##
## Call:
## lm(formula = log(ILI) ~ Queries, data = FluTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76003 -0.19696 -0.01657 0.18685 1.06450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.49934 0.03041 -16.42 <2e-16 ***
## Queries 2.96129 0.09312 31.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2995 on 415 degrees of freedom
## Multiple R-squared: 0.709, Adjusted R-squared: 0.7083
## F-statistic: 1011 on 1 and 415 DF, p-value: < 2.2e-16
correlation <- cor(log(FluTrain$ILI), FluTrain$Queries)
log(1/correlation)
## [1] 0.1719357
exp(-0.5*correlation)
## [1] 0.6563792
correlation^2
## [1] 0.7090201
# The csv file FluTest.csv provides the 2012 weekly data of the
# ILI-related search queries and the observed weekly percentage of
# ILI-related physician visits.
FluTest <- read.csv("FluTest.csv")
PreTest1 <- exp(predict(FluTrend1, newdata = FluTest))
# What is our estimate for the percentage of ILI-related physician visits
# for the week of March 11, 2012?
FluTest$Week
## [1] 2012-01-01 - 2012-01-07 2012-01-08 - 2012-01-14
## [3] 2012-01-15 - 2012-01-21 2012-01-22 - 2012-01-28
## [5] 2012-01-29 - 2012-02-04 2012-02-05 - 2012-02-11
## [7] 2012-02-12 - 2012-02-18 2012-02-19 - 2012-02-25
## [9] 2012-02-26 - 2012-03-03 2012-03-04 - 2012-03-10
## [11] 2012-03-11 - 2012-03-17 2012-03-18 - 2012-03-24
## [13] 2012-03-25 - 2012-03-31 2012-04-01 - 2012-04-07
## [15] 2012-04-08 - 2012-04-14 2012-04-15 - 2012-04-21
## [17] 2012-04-22 - 2012-04-28 2012-04-29 - 2012-05-05
## [19] 2012-05-06 - 2012-05-12 2012-05-13 - 2012-05-19
## [21] 2012-05-20 - 2012-05-26 2012-05-27 - 2012-06-02
## [23] 2012-06-03 - 2012-06-09 2012-06-10 - 2012-06-16
## [25] 2012-06-17 - 2012-06-23 2012-06-24 - 2012-06-30
## [27] 2012-07-01 - 2012-07-07 2012-07-08 - 2012-07-14
## [29] 2012-07-15 - 2012-07-21 2012-07-22 - 2012-07-28
## [31] 2012-07-29 - 2012-08-04 2012-08-05 - 2012-08-11
## [33] 2012-08-12 - 2012-08-18 2012-08-19 - 2012-08-25
## [35] 2012-08-26 - 2012-09-01 2012-09-02 - 2012-09-08
## [37] 2012-09-09 - 2012-09-15 2012-09-16 - 2012-09-22
## [39] 2012-09-23 - 2012-09-29 2012-09-30 - 2012-10-06
## [41] 2012-10-07 - 2012-10-13 2012-10-14 - 2012-10-20
## [43] 2012-10-21 - 2012-10-27 2012-10-28 - 2012-11-03
## [45] 2012-11-04 - 2012-11-10 2012-11-11 - 2012-11-17
## [47] 2012-11-18 - 2012-11-24 2012-11-25 - 2012-12-01
## [49] 2012-12-02 - 2012-12-08 2012-12-09 - 2012-12-15
## [51] 2012-12-16 - 2012-12-22 2012-12-23 - 2012-12-29
## 52 Levels: 2012-01-01 - 2012-01-07 ... 2012-12-23 - 2012-12-29
FluTest$Week[11]
## [1] 2012-03-11 - 2012-03-17
## 52 Levels: 2012-01-01 - 2012-01-07 ... 2012-12-23 - 2012-12-29
PreTest1[11]
## 11
## 2.187378
(FluTest$ILI[11] - PreTest1[11])/FluTest$ILI[11]
## 11
## 0.04623827
SSE <- sum((FluTest$ILI - PreTest1)^2)
SSE
## [1] 29.17708
RMSE <- sqrt(SSE/nrow(FluTest))
RMSE
## [1] 0.7490645
# TRAINING A TIME SERIES MODEL
# The observations in this dataset are consecutive weekly measurements of
# the dependent and independent variables. This sort of dataset is called
# a "time series."
# Because the ILI variable is reported with a 1- or 2-week lag, a decision
# maker cannot rely on the previous week's ILI value to predict the current
# week's value. Instead, the decision maker will only have data available
# from 2 or more weeks ago. We will build a variable called ILILag2 that
# contains the ILI value from 2 weeks before the current observation.
# To do so, we will use the "zoo" package, which provides a number of helpful
# methods for time series models.
# install.packages("zoo")
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric

# create the ILILag2 variable in the training set:
ILILag2 <- lag(zoo(FluTrain$ILI), -2,na.pad = TRUE)
FluTrain$ILILag2 <- coredata(ILILag2)
# In these commands, the value of -2 passed to lag means to return 2 observations
# before the current one; a positive value would have returned future observations.
# The parameter na.pad=TRUE means to add missing values for the first two weeks of
# our dataset, where we can't compute the data from 2 weeks earlier.
FluTrain$ILILag2
## [1] NA NA 2.4183312 1.8090560 1.7120239 1.5424951 1.4378683
## [8] 1.3242740 1.3072567 1.0369770 1.0103204 1.0524925 1.0200901 0.9244187
## [15] 0.7906450 0.8026098 0.8361300 0.7924358 0.6835877 0.7574523 0.7885854
## [22] 0.8121710 0.8044629 0.8777009 0.7414530 0.6610222 0.7151092 0.5622412
## [29] 0.7868082 0.8606578 0.6899440 0.7796912 0.6281439 0.9024586 0.8064432
## [36] 0.8748878 0.9932130 0.8761408 0.9480916 0.9269426 0.9716430 0.8971591
## [43] 1.0224828 1.0629632 1.1469570 1.2049501 1.3051655 1.2869916 1.5946756
## [50] 1.3971432 1.4499567 1.6174545 2.1911192 2.5664893 2.1764491 2.2017121
## [57] 2.5301211 3.0652381 3.9806083 4.5956803 4.7519706 4.1796206 3.4535851
## [64] 3.1585224 2.6732010 2.3516104 1.8924285 1.5249048 1.4113441 1.2506826
## [71] 1.2070250 1.0789550 1.1452080 1.0612426 1.0567977 1.2519310 1.0141893
## [78] 1.0419693 0.9540274 0.8482299 0.8418715 0.7308936 0.7134316 0.6706772
## [85] 0.6892776 0.7049290 0.6159033 0.6094256 0.6802587 0.7754884 0.6834214
## [92] 0.7810748 0.8069435 1.0763468 1.0586890 1.1152326 1.1238125 1.2548892
## [99] 1.3366090 1.3786364 1.6082900 1.4831056 1.6537399 2.0067892 2.5685716
## [106] 3.0527762 2.4250373 2.0019506 2.0586902 2.2127697 2.3222001 2.4927920
## [113] 2.7948942 2.9691114 2.8395905 2.7779902 2.4728693 2.1806146 2.0167951
## [120] 1.6410133 1.3582865 1.1427983 1.0403125 0.9643469 0.9379817 0.9474493
## [127] 0.8919182 0.8646427 0.9703199 0.8443901 0.7748704 0.8213725 0.8727445
## [134] 0.9226345 0.8994868 0.8430824 0.8818244 0.8171452 0.8715001 0.7386205
## [141] 0.7979660 1.0139373 0.8809358 0.9433663 0.8915462 1.2032228 1.0578822
## [148] 1.1305354 1.1255230 1.2080820 1.3495244 1.4689004 1.8276716 1.6656012
## [155] 1.8596834 2.3889130 2.7897759 3.1154858 2.2694245 1.8635464 1.9998635
## [162] 2.4406044 2.8301821 3.1234256 3.2701949 3.1775688 2.7236366 2.5020140
## [169] 2.4271992 1.9604132 1.5913980 1.3697835 1.3631668 1.1736951 1.0635756
## [176] 0.9697111 0.9653617 0.8567489 0.8633465 0.9353695 0.7455694 0.7404281
## [183] 0.6728965 0.6662820 0.6627473 0.5456190 0.5862306 0.6606867 0.5340928
## [190] 0.5855491 0.6180750 0.6874647 0.7156961 0.8293131 0.8009115 0.9184839
## [197] 0.8142590 1.0719708 1.2178574 1.2457554 1.3598449 1.4467085 1.5328638
## [204] 1.6665324 1.9748773 1.6730547 1.6340509 1.7459475 1.9364319 2.4890534
## [211] 2.2540484 2.0914715 2.3593428 3.3233143 4.4338100 5.3454714 5.4225751
## [218] 5.3030330 4.2445550 3.6280001 3.0346275 2.5359536 2.0573015 1.7415035
## [225] 1.4065217 1.2686070 1.0771887 0.9934452 0.9112119 0.9721091 0.9932575
## [232] 1.0913202 0.8884460 0.8876915 0.8831874 0.8267564 0.7832014 0.7806103
## [239] 0.7690726 0.7212979 0.7525273 0.7527210 0.7927660 0.7438962 0.8141663
## [246] 0.8384009 0.8511236 1.1097575 1.0311436 1.0228436 1.0301739 1.0124478
## [253] 1.0835911 1.1657765 1.1912964 1.2807470 1.2705251 1.5957825 1.4584994
## [260] 1.4992072 1.6298157 2.1556121 2.0205270 1.5456623 1.6422367 1.9652378
## [267] 2.3436784 2.8605744 3.3421049 3.2056588 3.1004908 2.9581850 2.4638058
## [274] 2.1927224 1.8739459 1.6481690 1.4987776 1.2923267 1.2716411 2.9815890
## [281] 2.4370224 2.2813011 3.8157199 4.2131523 3.1783224 2.5097162 2.0663177
## [288] 1.7180460 1.5596467 1.3085629 1.1869460 1.1379623 1.1500523 1.1126189
## [295] 1.1614188 1.6410714 2.4716598 3.7196936 3.9497480 4.0875636 4.0189724
## [302] 4.6036164 5.6608671 6.8152222 7.6188921 7.3883586 6.3392723 4.9434950
## [309] 3.8099612 3.4410588 2.6677306 2.4718250 2.3449995 2.7143498 2.6766718
## [316] 1.9828382 1.8274862 1.9260563 1.9249472 2.0887684 2.0343408 1.9764946
## [323] 1.9936177 1.8538260 1.8673036 1.6998677 1.4974082 1.4511188 1.2071478
## [330] 1.1741508 1.1620668 1.1721343 1.1216765 1.1498116 1.1332758 1.0817133
## [337] 1.1995860 0.9528083 0.9160321 0.9265822 0.8696197 0.9031331 0.7737757
## [344] 0.7427744 0.7309345 0.7868818 0.7630507 0.8410432 0.7915728 0.9127318
## [351] 1.0339765 0.9340091 1.0818888 1.0656260 1.1350529 1.2525629 1.2456956
## [358] 1.2677380 1.4372295 1.5334125 1.6944544 1.9915024 1.8130453 2.0142579
## [365] 2.5565913 3.3818486 3.4317231 2.6915111 2.9106289 3.4923189 4.0036963
## [372] 4.4353368 4.2421482 4.3971861 3.9025565 3.1507275 2.7242234 2.3333563
## [379] 1.9250003 1.7524260 1.5770365 1.3576558 1.3122310 1.1493747 1.1145057
## [386] 1.1098449 1.0524026 1.0353647 1.1177658 0.9829495 0.9251944 0.8355311
## [393] 0.8323927 0.8555910 0.7069494 0.6943868 0.6879762 0.6447430 0.6753299
## [400] 0.7282297 0.8065263 0.8604084 0.9360754 0.9666827 0.9960071 1.1084635
## [407] 1.2030858 1.2369566 1.2525865 1.3054612 1.4528432 1.4408922 1.4622115
## [414] 1.6554147 1.4657230 1.5181061 1.6639544
plot(log(FluTrain$ILILag2), log(FluTrain$ILI))

# Train a linear regression model on the FluTrain dataset to predict the log of
# the ILI variable using the Queries variable as well as the log of the ILILag2
# variable.
FluTrend2 <- lm(log(ILI) ~ Queries + log(ILILag2), data = FluTrain)
summary(FluTrend2)
##
## Call:
## lm(formula = log(ILI) ~ Queries + log(ILILag2), data = FluTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52209 -0.11082 -0.01819 0.08143 0.76785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.24064 0.01953 -12.32 <2e-16 ***
## Queries 1.25578 0.07910 15.88 <2e-16 ***
## log(ILILag2) 0.65569 0.02251 29.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1703 on 412 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.9063, Adjusted R-squared: 0.9059
## F-statistic: 1993 on 2 and 412 DF, p-value: < 2.2e-16
# So far, we have only added the ILILag2 variable to the FluTrain data frame.
# To make predictions with our FluTrend2 model, we will also need to add ILILag2
# to the FluTest data frame (note that adding variables before splitting into a
# training and testing set can prevent this duplication of effort).
ILILag2 <- lag(zoo(FluTest$ILI), -2,na.pad = TRUE)
FluTest$ILILag2 <- coredata(ILILag2)
FluTest$ILILag2
## [1] NA NA 1.7667069 1.5434005 1.6476154 1.6842972 1.8635415
## [8] 1.8640793 2.0199272 2.1038510 2.0955493 2.1039832 2.2934216 1.9222345
## [15] 1.7423860 1.6760138 1.4963706 1.3811978 1.2884929 1.3578416 1.3076279
## [22] 1.2669188 1.2773098 1.2990197 1.1623532 1.0861211 1.0567958 1.0787129
## [29] 1.0467024 0.9281519 0.9019358 0.9160412 0.9524073 0.9164542 0.9017871
## [36] 1.0331514 1.0519866 1.1271337 1.1864886 1.2141531 1.2519805 1.1918833
## [43] 1.3238891 1.3085430 1.3605903 1.4860247 1.6109150 1.7332935 2.3046254
## [50] 2.2259966 2.9780471 3.6002297
# In this problem, the training and testing sets are split sequentially -- the
# training set contains all observations from 2004-2011 and the testing set
# contains all observations from 2012. There is no time gap between the two
# datasets, meaning the first observation in FluTest was recorded one week after
# the last observation in FluTrain. From this, we can identify how to fill in the
# missing values for the ILILag2 variable in FluTest.
# Fill in the missing values for ILILag2 in FluTest.
FluTest$ILILag2[1] = FluTrain$ILI[416]
FluTest$ILILag2[2] = FluTrain$ILI[417]
FluTest$ILILag2
## [1] 1.8527356 2.1241299 1.7667069 1.5434005 1.6476154 1.6842972 1.8635415
## [8] 1.8640793 2.0199272 2.1038510 2.0955493 2.1039832 2.2934216 1.9222345
## [15] 1.7423860 1.6760138 1.4963706 1.3811978 1.2884929 1.3578416 1.3076279
## [22] 1.2669188 1.2773098 1.2990197 1.1623532 1.0861211 1.0567958 1.0787129
## [29] 1.0467024 0.9281519 0.9019358 0.9160412 0.9524073 0.9164542 0.9017871
## [36] 1.0331514 1.0519866 1.1271337 1.1864886 1.2141531 1.2519805 1.1918833
## [43] 1.3238891 1.3085430 1.3605903 1.4860247 1.6109150 1.7332935 2.3046254
## [50] 2.2259966 2.9780471 3.6002297
ILI_predict <- exp(predict(FluTrend2, newdata = FluTest))
SSE <- sum((ILI_predict - FluTest$ILI)^2)
RMSE <- sqrt(SSE/nrow(FluTest))
RMSE
## [1] 0.2942029
data(state)
state.x77
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20
## Alaska 365 6315 1.5 69.31 11.3 66.7 152
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15
## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65
## California 21198 5114 1.1 71.71 10.3 62.6 20
## Colorado 2541 4884 0.7 72.06 6.8 63.9 166
## Connecticut 3100 5348 1.1 72.48 3.1 56.0 139
## Delaware 579 4809 0.9 70.06 6.2 54.6 103
## Florida 8277 4815 1.3 70.66 10.7 52.6 11
## Georgia 4931 4091 2.0 68.54 13.9 40.6 60
## Hawaii 868 4963 1.9 73.60 6.2 61.9 0
## Idaho 813 4119 0.6 71.87 5.3 59.5 126
## Illinois 11197 5107 0.9 70.14 10.3 52.6 127
## Indiana 5313 4458 0.7 70.88 7.1 52.9 122
## Iowa 2861 4628 0.5 72.56 2.3 59.0 140
## Kansas 2280 4669 0.6 72.58 4.5 59.9 114
## Kentucky 3387 3712 1.6 70.10 10.6 38.5 95
## Louisiana 3806 3545 2.8 68.76 13.2 42.2 12
## Maine 1058 3694 0.7 70.39 2.7 54.7 161
## Maryland 4122 5299 0.9 70.22 8.5 52.3 101
## Massachusetts 5814 4755 1.1 71.83 3.3 58.5 103
## Michigan 9111 4751 0.9 70.63 11.1 52.8 125
## Minnesota 3921 4675 0.6 72.96 2.3 57.6 160
## Mississippi 2341 3098 2.4 68.09 12.5 41.0 50
## Missouri 4767 4254 0.8 70.69 9.3 48.8 108
## Montana 746 4347 0.6 70.56 5.0 59.2 155
## Nebraska 1544 4508 0.6 72.60 2.9 59.3 139
## Nevada 590 5149 0.5 69.03 11.5 65.2 188
## New Hampshire 812 4281 0.7 71.23 3.3 57.6 174
## New Jersey 7333 5237 1.1 70.93 5.2 52.5 115
## New Mexico 1144 3601 2.2 70.32 9.7 55.2 120
## New York 18076 4903 1.4 70.55 10.9 52.7 82
## North Carolina 5441 3875 1.8 69.21 11.1 38.5 80
## North Dakota 637 5087 0.8 72.78 1.4 50.3 186
## Ohio 10735 4561 0.8 70.82 7.4 53.2 124
## Oklahoma 2715 3983 1.1 71.42 6.4 51.6 82
## Oregon 2284 4660 0.6 72.13 4.2 60.0 44
## Pennsylvania 11860 4449 1.0 70.43 6.1 50.2 126
## Rhode Island 931 4558 1.3 71.90 2.4 46.4 127
## South Carolina 2816 3635 2.3 67.96 11.6 37.8 65
## South Dakota 681 4167 0.5 72.08 1.7 53.3 172
## Tennessee 4173 3821 1.7 70.11 11.0 41.8 70
## Texas 12237 4188 2.2 70.90 12.2 47.4 35
## Utah 1203 4022 0.6 72.90 4.5 67.3 137
## Vermont 472 3907 0.6 71.64 5.5 57.1 168
## Virginia 4981 4701 1.4 70.08 9.5 47.8 85
## Washington 3559 4864 0.6 71.72 4.3 63.5 32
## West Virginia 1799 3617 1.4 69.48 6.7 41.6 100
## Wisconsin 4589 4468 0.7 72.48 3.0 54.5 149
## Wyoming 376 4566 0.6 70.29 6.9 62.9 173
## Area
## Alabama 50708
## Alaska 566432
## Arizona 113417
## Arkansas 51945
## California 156361
## Colorado 103766
## Connecticut 4862
## Delaware 1982
## Florida 54090
## Georgia 58073
## Hawaii 6425
## Idaho 82677
## Illinois 55748
## Indiana 36097
## Iowa 55941
## Kansas 81787
## Kentucky 39650
## Louisiana 44930
## Maine 30920
## Maryland 9891
## Massachusetts 7826
## Michigan 56817
## Minnesota 79289
## Mississippi 47296
## Missouri 68995
## Montana 145587
## Nebraska 76483
## Nevada 109889
## New Hampshire 9027
## New Jersey 7521
## New Mexico 121412
## New York 47831
## North Carolina 48798
## North Dakota 69273
## Ohio 40975
## Oklahoma 68782
## Oregon 96184
## Pennsylvania 44966
## Rhode Island 1049
## South Carolina 30225
## South Dakota 75955
## Tennessee 41328
## Texas 262134
## Utah 82096
## Vermont 9267
## Virginia 39780
## Washington 66570
## West Virginia 24070
## Wisconsin 54464
## Wyoming 97203
# combine columns
statedata <- cbind(data.frame(state.x77), state.abb, state.area, state.center, state.division, state.name, state.region)
str(statedata)
## 'data.frame': 50 obs. of 15 variables:
## $ Population : num 3615 365 2212 2110 21198 ...
## $ Income : num 3624 6315 4530 3378 5114 ...
## $ Illiteracy : num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life.Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS.Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : num 50708 566432 113417 51945 156361 ...
## $ state.abb : Factor w/ 50 levels "AK","AL","AR",..: 2 1 4 3 5 6 7 8 9 10 ...
## $ state.area : num 51609 589757 113909 53104 158693 ...
## $ x : num -86.8 -127.2 -111.6 -92.3 -119.8 ...
## $ y : num 32.6 49.2 34.2 34.7 36.5 ...
## $ state.division: Factor w/ 9 levels "New England",..: 4 9 8 5 9 8 1 3 3 3 ...
## $ state.name : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ state.region : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
plot(statedata$x, statedata$y)

# determine which region of the US (West, North Central, South, or Northeast) has
# the highest average high school graduation rate of all the states in the region
tapply(statedata$HS.Grad,statedata$state.region,mean)
## Northeast South North Central West
## 53.96667 44.34375 54.51667 62.00000
boxplot(statedata$Murder ~ statedata$state.region)

# there is an outlier in the Northeast region of the boxplot you just generated.
# Which state does this correspond to?
subset(statedata$Murder, state.region == "Northeast")
## [1] 3.1 2.7 3.3 3.3 5.2 10.9 6.1 2.4 5.5
statedata$state.name[which(statedata$Murder == 10.9)]
## [1] New York
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
# PREDICTING LIFE EXPECTANCY - AN INITIAL MODEL
lifeReg <- lm(Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost + Area, data = statedata)
summary(lifeReg)
##
## Call:
## lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder +
## HS.Grad + Frost + Area, data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+01 1.748e+00 40.586 < 2e-16 ***
## Population 5.180e-05 2.919e-05 1.775 0.0832 .
## Income -2.180e-05 2.444e-04 -0.089 0.9293
## Illiteracy 3.382e-02 3.663e-01 0.092 0.9269
## Murder -3.011e-01 4.662e-02 -6.459 8.68e-08 ***
## HS.Grad 4.893e-02 2.332e-02 2.098 0.0420 *
## Frost -5.735e-03 3.143e-03 -1.825 0.0752 .
## Area -7.383e-08 1.668e-06 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 2.534e-10
# the coefficient of Income is negative, but if we make the following plot,
# we will actually see a positive relationship between income and life expectancy.
plot(statedata$Income, statedata$Life.Exp)

cor(statedata$Income, statedata$Life.Exp)
## [1] 0.3402553
# meaning there is multicollinearity in the model.
# Refine the model
# remove area
lifeReg2 <- lm(Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost, data = statedata)
summary(lifeReg2)
##
## Call:
## lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder +
## HS.Grad + Frost, data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.49047 -0.52533 -0.02546 0.57160 1.50374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.099e+01 1.387e+00 51.165 < 2e-16 ***
## Population 5.188e-05 2.879e-05 1.802 0.0785 .
## Income -2.444e-05 2.343e-04 -0.104 0.9174
## Illiteracy 2.846e-02 3.416e-01 0.083 0.9340
## Murder -3.018e-01 4.334e-02 -6.963 1.45e-08 ***
## HS.Grad 4.847e-02 2.067e-02 2.345 0.0237 *
## Frost -5.776e-03 2.970e-03 -1.945 0.0584 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7361 on 43 degrees of freedom
## Multiple R-squared: 0.7361, Adjusted R-squared: 0.6993
## F-statistic: 19.99 on 6 and 43 DF, p-value: 5.362e-11
# remove Illiteracy
lifeReg3 <- lm(Life.Exp ~ Population + Income + Murder + HS.Grad + Frost, data = statedata)
summary(lifeReg3)
##
## Call:
## lm(formula = Life.Exp ~ Population + Income + Murder + HS.Grad +
## Frost, data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4892 -0.5122 -0.0329 0.5645 1.5166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.107e+01 1.029e+00 69.067 < 2e-16 ***
## Population 5.115e-05 2.709e-05 1.888 0.0657 .
## Income -2.477e-05 2.316e-04 -0.107 0.9153
## Murder -3.000e-01 3.704e-02 -8.099 2.91e-10 ***
## HS.Grad 4.776e-02 1.859e-02 2.569 0.0137 *
## Frost -5.910e-03 2.468e-03 -2.395 0.0210 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7277 on 44 degrees of freedom
## Multiple R-squared: 0.7361, Adjusted R-squared: 0.7061
## F-statistic: 24.55 on 5 and 44 DF, p-value: 1.019e-11
# remove income
lifeReg4 <- lm(Life.Exp ~ Population + Murder + HS.Grad + Frost, data = statedata)
summary(lifeReg4)
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.103e+01 9.529e-01 74.542 < 2e-16 ***
## Population 5.014e-05 2.512e-05 1.996 0.05201 .
## Murder -3.001e-01 3.661e-02 -8.199 1.77e-10 ***
## HS.Grad 4.658e-02 1.483e-02 3.142 0.00297 **
## Frost -5.943e-03 2.421e-03 -2.455 0.01802 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 1.696e-12
life_predict <- predict(lifeReg4)
which.min(life_predict)
## Alabama
## 1
statedata$state.name[which.min(statedata$Life.Exp)]
## [1] South Carolina
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
which.max(life_predict)
## Washington
## 47
statedata$state.name[which.max(statedata$Life.Exp)]
## [1] Hawaii
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
# For which state do we make the smallest absolute error?
which.min(abs(lifeReg4$residuals))
## Indiana
## 14