Unit2_HW_TimeSeries.R

# Climate change

setwd("C:/Users/jzchen/Documents/Courses/Analytics edge/Unit 2")
climate <- read.csv("climate_change.csv")
str(climate)

## 'data.frame':    308 obs. of  11 variables:
##  $ Year    : int  1983 1983 1983 1983 1983 1983 1983 1983 1984 1984 ...
##  $ Month   : int  5 6 7 8 9 10 11 12 1 2 ...
##  $ MEI     : num  2.556 2.167 1.741 1.13 0.428 ...
##  $ CO2     : num  346 346 344 342 340 ...
##  $ CH4     : num  1639 1634 1633 1631 1648 ...
##  $ N2O     : num  304 304 304 304 304 ...
##  $ CFC.11  : num  191 192 193 194 194 ...
##  $ CFC.12  : num  350 352 354 356 357 ...
##  $ TSI     : num  1366 1366 1366 1366 1366 ...
##  $ Aerosols: num  0.0863 0.0794 0.0731 0.0673 0.0619 0.0569 0.0524 0.0486 0.0451 0.0416 ...
##  $ Temp    : num  0.109 0.118 0.137 0.176 0.149 0.093 0.232 0.078 0.089 0.013 ...

climate_train <- subset(climate, Year <= 2006)
climate_test <- subset(climate, Year > 2006)
TempReg <- lm(Temp ~ MEI + CO2+ CH4 + N2O + CFC.11 + CFC.12 + TSI + Aerosols, data = climate_train)
summary(TempReg)

## 
## Call:
## lm(formula = Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 + 
##     TSI + Aerosols, data = climate_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25888 -0.05913 -0.00082  0.05649  0.32433 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.246e+02  1.989e+01  -6.265 1.43e-09 ***
## MEI          6.421e-02  6.470e-03   9.923  < 2e-16 ***
## CO2          6.457e-03  2.285e-03   2.826  0.00505 ** 
## CH4          1.240e-04  5.158e-04   0.240  0.81015    
## N2O         -1.653e-02  8.565e-03  -1.930  0.05467 .  
## CFC.11      -6.631e-03  1.626e-03  -4.078 5.96e-05 ***
## CFC.12       3.808e-03  1.014e-03   3.757  0.00021 ***
## TSI          9.314e-02  1.475e-02   6.313 1.10e-09 ***
## Aerosols    -1.538e+00  2.133e-01  -7.210 5.41e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09171 on 275 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7436 
## F-statistic: 103.6 on 8 and 275 DF,  p-value: < 2.2e-16

cor(climate_train)

##                 Year         Month           MEI         CO2         CH4
## Year      1.00000000 -0.0279419602 -0.0369876842  0.98274939  0.91565945
## Month    -0.02794196  1.0000000000  0.0008846905 -0.10673246  0.01856866
## MEI      -0.03698768  0.0008846905  1.0000000000 -0.04114717 -0.03341930
## CO2       0.98274939 -0.1067324607 -0.0411471651  1.00000000  0.87727963
## CH4       0.91565945  0.0185686624 -0.0334193014  0.87727963  1.00000000
## N2O       0.99384523  0.0136315303 -0.0508197755  0.97671982  0.89983864
## CFC.11    0.56910643 -0.0131112236  0.0690004387  0.51405975  0.77990402
## CFC.12    0.89701166  0.0006751102  0.0082855443  0.85268963  0.96361625
## TSI       0.17030201 -0.0346061935 -0.1544919227  0.17742893  0.24552844
## Aerosols -0.34524670  0.0148895406  0.3402377871 -0.35615480 -0.26780919
## Temp      0.78679714 -0.0998567411  0.1724707512  0.78852921  0.70325502
##                  N2O      CFC.11        CFC.12         TSI    Aerosols
## Year      0.99384523  0.56910643  0.8970116635  0.17030201 -0.34524670
## Month     0.01363153 -0.01311122  0.0006751102 -0.03460619  0.01488954
## MEI      -0.05081978  0.06900044  0.0082855443 -0.15449192  0.34023779
## CO2       0.97671982  0.51405975  0.8526896272  0.17742893 -0.35615480
## CH4       0.89983864  0.77990402  0.9636162478  0.24552844 -0.26780919
## N2O       1.00000000  0.52247732  0.8679307757  0.19975668 -0.33705457
## CFC.11    0.52247732  1.00000000  0.8689851828  0.27204596 -0.04392120
## CFC.12    0.86793078  0.86898518  1.0000000000  0.25530281 -0.22513124
## TSI       0.19975668  0.27204596  0.2553028138  1.00000000  0.05211651
## Aerosols -0.33705457 -0.04392120 -0.2251312440  0.05211651  1.00000000
## Temp      0.77863893  0.40771029  0.6875575483  0.24338269 -0.38491375
##                 Temp
## Year      0.78679714
## Month    -0.09985674
## MEI       0.17247075
## CO2       0.78852921
## CH4       0.70325502
## N2O       0.77863893
## CFC.11    0.40771029
## CFC.12    0.68755755
## TSI       0.24338269
## Aerosols -0.38491375
## Temp      1.00000000

TempReg_1 <- lm(Temp ~ MEI + N2O + TSI + Aerosols, data = climate_train)
summary(TempReg_1)

## 
## Call:
## lm(formula = Temp ~ MEI + N2O + TSI + Aerosols, data = climate_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27916 -0.05975 -0.00595  0.05672  0.34195 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.162e+02  2.022e+01  -5.747 2.37e-08 ***
## MEI          6.419e-02  6.652e-03   9.649  < 2e-16 ***
## N2O          2.532e-02  1.311e-03  19.307  < 2e-16 ***
## TSI          7.949e-02  1.487e-02   5.344 1.89e-07 ***
## Aerosols    -1.702e+00  2.180e-01  -7.806 1.19e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09547 on 279 degrees of freedom
## Multiple R-squared:  0.7261, Adjusted R-squared:  0.7222 
## F-statistic: 184.9 on 4 and 279 DF,  p-value: < 2.2e-16

# use step function to automate
TempReg_step <- step(TempReg)

## Start:  AIC=-1348.16
## Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
## 
##            Df Sum of Sq    RSS     AIC
## - CH4       1   0.00049 2.3135 -1350.1
## <none>                  2.3130 -1348.2
## - N2O       1   0.03132 2.3443 -1346.3
## - CO2       1   0.06719 2.3802 -1342.0
## - CFC.12    1   0.11874 2.4318 -1335.9
## - CFC.11    1   0.13986 2.4529 -1333.5
## - TSI       1   0.33516 2.6482 -1311.7
## - Aerosols  1   0.43727 2.7503 -1301.0
## - MEI       1   0.82823 3.1412 -1263.2
## 
## Step:  AIC=-1350.1
## Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
## 
##            Df Sum of Sq    RSS     AIC
## <none>                  2.3135 -1350.1
## - N2O       1   0.03133 2.3448 -1348.3
## - CO2       1   0.06672 2.3802 -1344.0
## - CFC.12    1   0.13023 2.4437 -1336.5
## - CFC.11    1   0.13938 2.4529 -1335.5
## - TSI       1   0.33500 2.6485 -1313.7
## - Aerosols  1   0.43987 2.7534 -1302.7
## - MEI       1   0.83118 3.1447 -1264.9

summary(TempReg_step)

## 
## Call:
## lm(formula = Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI + 
##     Aerosols, data = climate_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25770 -0.05994 -0.00104  0.05588  0.32203 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.245e+02  1.985e+01  -6.273 1.37e-09 ***
## MEI          6.407e-02  6.434e-03   9.958  < 2e-16 ***
## CO2          6.402e-03  2.269e-03   2.821 0.005129 ** 
## N2O         -1.602e-02  8.287e-03  -1.933 0.054234 .  
## CFC.11      -6.609e-03  1.621e-03  -4.078 5.95e-05 ***
## CFC.12       3.868e-03  9.812e-04   3.942 0.000103 ***
## TSI          9.312e-02  1.473e-02   6.322 1.04e-09 ***
## Aerosols    -1.540e+00  2.126e-01  -7.244 4.36e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09155 on 276 degrees of freedom
## Multiple R-squared:  0.7508, Adjusted R-squared:  0.7445 
## F-statistic: 118.8 on 7 and 276 DF,  p-value: < 2.2e-16

Temp_predict <- predict(TempReg_step, newdata = climate_test)
Temp_predict

##       285       286       287       288       289       290       291 
## 0.4677808 0.4435404 0.4265541 0.4299162 0.4455113 0.4151422 0.4097367 
##       292       293       294       295       296       297       298 
## 0.3839390 0.3255595 0.3274147 0.3231401 0.3316704 0.3522134 0.3313129 
##       299       300       301       302       303       304       305 
## 0.3142112 0.3703410 0.4162213 0.4391458 0.4237965 0.3913679 0.3587615 
##       306       307       308 
## 0.3451991 0.3607087 0.3638076

SSE <- sum((Temp_predict- climate_test$Temp)^2)
SST <- sum((mean(climate_train$Temp)-climate_test$Temp)^2)
R2 <- 1 - SSE/SST
R2

## [1] 0.6286051

# reading test scores
pisa_train <- read.csv("pisa2009train.csv")
pisa_test <- read.csv("pisa2009test.csv")
str(pisa_train)

## 'data.frame':    3663 obs. of  24 variables:
##  $ grade                : int  11 11 9 10 10 10 10 10 9 10 ...
##  $ male                 : int  1 1 1 0 1 1 0 0 0 1 ...
##  $ raceeth              : Factor w/ 7 levels "American Indian/Alaska Native",..: NA 7 7 3 4 3 2 7 7 5 ...
##  $ preschool            : int  NA 0 1 1 1 1 0 1 1 1 ...
##  $ expectBachelors      : int  0 0 1 1 0 1 1 1 0 1 ...
##  $ motherHS             : int  NA 1 1 0 1 NA 1 1 1 1 ...
##  $ motherBachelors      : int  NA 1 1 0 0 NA 0 0 NA 1 ...
##  $ motherWork           : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ fatherHS             : int  NA 1 1 1 1 1 NA 1 0 0 ...
##  $ fatherBachelors      : int  NA 0 NA 0 0 0 NA 0 NA 0 ...
##  $ fatherWork           : int  1 1 1 1 0 1 NA 1 1 1 ...
##  $ selfBornUS           : int  1 1 1 1 1 1 0 1 1 1 ...
##  $ motherBornUS         : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ fatherBornUS         : int  0 1 1 1 0 1 NA 1 1 1 ...
##  $ englishAtHome        : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ computerForSchoolwork: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ read30MinsADay       : int  0 1 0 1 1 0 0 1 0 0 ...
##  $ minutesPerWeekEnglish: int  225 450 250 200 250 300 250 300 378 294 ...
##  $ studentsInEnglish    : int  NA 25 28 23 35 20 28 30 20 24 ...
##  $ schoolHasLibrary     : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ publicSchool         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ urban                : int  1 0 0 1 1 0 1 0 1 0 ...
##  $ schoolSize           : int  673 1173 1233 2640 1095 227 2080 1913 502 899 ...
##  $ readingScore         : num  476 575 555 458 614 ...

# what is the average reading test score of males?
tapply(pisa_train$readingScore, pisa_train$male,mean)

##        0        1 
## 512.9406 483.5325

summary(pisa_train)

##      grade            male                      raceeth    
##  Min.   : 8.00   Min.   :0.0000   White             :2015  
##  1st Qu.:10.00   1st Qu.:0.0000   Hispanic          : 834  
##  Median :10.00   Median :1.0000   Black             : 444  
##  Mean   :10.09   Mean   :0.5111   Asian             : 143  
##  3rd Qu.:10.00   3rd Qu.:1.0000   More than one race: 124  
##  Max.   :12.00   Max.   :1.0000   (Other)           :  68  
##                                   NA's              :  35  
##    preschool      expectBachelors     motherHS    motherBachelors 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:1.00   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.00   Median :0.0000  
##  Mean   :0.7228   Mean   :0.7859   Mean   :0.88   Mean   :0.3481  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.00   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00   Max.   :1.0000  
##  NA's   :56       NA's   :62       NA's   :97     NA's   :397     
##    motherWork        fatherHS      fatherBachelors    fatherWork    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :1.0000   Median :1.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.7345   Mean   :0.8593   Mean   :0.3319   Mean   :0.8531  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :93       NA's   :245      NA's   :569      NA's   :233     
##    selfBornUS      motherBornUS     fatherBornUS    englishAtHome   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   :0.9313   Mean   :0.7725   Mean   :0.7668   Mean   :0.8717  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :69       NA's   :71       NA's   :113      NA's   :71      
##  computerForSchoolwork read30MinsADay   minutesPerWeekEnglish
##  Min.   :0.0000        Min.   :0.0000   Min.   :   0.0       
##  1st Qu.:1.0000        1st Qu.:0.0000   1st Qu.: 225.0       
##  Median :1.0000        Median :0.0000   Median : 250.0       
##  Mean   :0.8994        Mean   :0.2899   Mean   : 266.2       
##  3rd Qu.:1.0000        3rd Qu.:1.0000   3rd Qu.: 300.0       
##  Max.   :1.0000        Max.   :1.0000   Max.   :2400.0       
##  NA's   :65            NA's   :34       NA's   :186          
##  studentsInEnglish schoolHasLibrary  publicSchool        urban       
##  Min.   : 1.0      Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:20.0      1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :25.0      Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :24.5      Mean   :0.9676   Mean   :0.9339   Mean   :0.3849  
##  3rd Qu.:30.0      3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :75.0      Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :249       NA's   :143                                       
##    schoolSize    readingScore  
##  Min.   : 100   Min.   :168.6  
##  1st Qu.: 712   1st Qu.:431.7  
##  Median :1212   Median :499.7  
##  Mean   :1369   Mean   :497.9  
##  3rd Qu.:1900   3rd Qu.:566.2  
##  Max.   :6694   Max.   :746.0  
##  NA's   :162

# Linear regression discards observations with missing data, so we will 
# remove all such observations from the training and testing sets. Later
# in the course, we will learn about imputation, which deals with missing
# data by filling in missing values with plausible information.
pisa_train <- na.omit(pisa_train)
pisa_test <- na.omit(pisa_test)
nrow(pisa_test)

## [1] 990

lmScore <- lm(readingScore ~., data = pisa_train)
summary(lmScore)

## 
## Call:
## lm(formula = readingScore ~ ., data = pisa_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -247.44  -48.86    1.86   49.77  217.18 
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                    76.489006  37.302678
## grade                                          29.542707   2.937399
## male                                          -14.521653   3.155926
## raceethAsian                                   63.167002  18.972648
## raceethBlack                                    0.264980  17.369507
## raceethHispanic                                28.301842  17.258860
## raceethMore than one race                      50.354805  18.570123
## raceethNative Hawaiian/Other Pacific Islander  62.175726  23.782766
## raceethWhite                                   67.277327  16.786935
## preschool                                      -4.463670   3.486055
## expectBachelors                                55.267080   4.293893
## motherHS                                        6.058774   6.091423
## motherBachelors                                12.638068   3.861457
## motherWork                                     -2.809101   3.521827
## fatherHS                                        4.018214   5.579269
## fatherBachelors                                16.929755   3.995253
## fatherWork                                      5.842798   4.395978
## selfBornUS                                     -3.806278   7.323718
## motherBornUS                                   -8.798153   6.587621
## fatherBornUS                                    4.306994   6.263875
## englishAtHome                                   8.035685   6.859492
## computerForSchoolwork                          22.500232   5.702562
## read30MinsADay                                 34.871924   3.408447
## minutesPerWeekEnglish                           0.012788   0.010712
## studentsInEnglish                              -0.286631   0.227819
## schoolHasLibrary                               12.215085   9.264884
## publicSchool                                  -16.857475   6.725614
## urban                                          -0.110132   3.962724
## schoolSize                                      0.006540   0.002197
##                                               t value Pr(>|t|)    
## (Intercept)                                     2.050 0.040425 *  
## grade                                          10.057  < 2e-16 ***
## male                                           -4.601 4.42e-06 ***
## raceethAsian                                    3.329 0.000884 ***
## raceethBlack                                    0.015 0.987830    
## raceethHispanic                                 1.640 0.101169    
## raceethMore than one race                       2.712 0.006744 ** 
## raceethNative Hawaiian/Other Pacific Islander   2.614 0.008997 ** 
## raceethWhite                                    4.008 6.32e-05 ***
## preschool                                      -1.280 0.200516    
## expectBachelors                                12.871  < 2e-16 ***
## motherHS                                        0.995 0.320012    
## motherBachelors                                 3.273 0.001080 ** 
## motherWork                                     -0.798 0.425167    
## fatherHS                                        0.720 0.471470    
## fatherBachelors                                 4.237 2.35e-05 ***
## fatherWork                                      1.329 0.183934    
## selfBornUS                                     -0.520 0.603307    
## motherBornUS                                   -1.336 0.181821    
## fatherBornUS                                    0.688 0.491776    
## englishAtHome                                   1.171 0.241527    
## computerForSchoolwork                           3.946 8.19e-05 ***
## read30MinsADay                                 10.231  < 2e-16 ***
## minutesPerWeekEnglish                           1.194 0.232644    
## studentsInEnglish                              -1.258 0.208460    
## schoolHasLibrary                                1.318 0.187487    
## publicSchool                                   -2.506 0.012261 *  
## urban                                          -0.028 0.977830    
## schoolSize                                      2.977 0.002942 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.81 on 2385 degrees of freedom
## Multiple R-squared:  0.3251, Adjusted R-squared:  0.3172 
## F-statistic: 41.04 on 28 and 2385 DF,  p-value: < 2.2e-16

# the model will break down the raceeth variable to different variables, raceethAsian,
# raceethBlack, etc.



# To include unordered factors in a linear regression model, we define one 
# level as the "reference level" and add a binary variable for each of the 
# remaining levels. In this way, a factor with n levels is replaced by n-1 
# binary variables. The reference level is typically selected to be the most
# frequently occurring level in the dataset.
# 
# As an example, consider the unordered factor variable "color", with levels 
# "red", "green", and "blue". If "green" were the reference level, then we 
# would add binary variables "colorred" and "colorblue" to a linear regression
# problem. All red examples would have colorred=1 and colorblue=0. All blue 
# examples would have colorred=0 and colorblue=1. All green examples would have
# colorred=0 and colorblue=0.

# Because the race variable takes on text values, it was loaded as a factor 
# variable when we read in the dataset with read.csv() -- you can see this when 
# you run str(pisaTrain) or str(pisaTest). However, by default R selects the first
# level alphabetically ("American Indian/Alaska Native") as the reference level of
# our factor instead of the most common level ("White"). Set the reference level of
# the factor by typing the following two lines in your R console:
pisa_train$raceeth <- relevel(pisa_train$raceeth, "White")
pisa_test$raceeth <- relevel(pisa_test$raceeth, "White")

lmScore <- lm(readingScore ~., data = pisa_train)
summary(lmScore)

## 
## Call:
## lm(formula = readingScore ~ ., data = pisa_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -247.44  -48.86    1.86   49.77  217.18 
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                   143.766333  33.841226
## grade                                          29.542707   2.937399
## male                                          -14.521653   3.155926
## raceethAmerican Indian/Alaska Native          -67.277327  16.786935
## raceethAsian                                   -4.110325   9.220071
## raceethBlack                                  -67.012347   5.460883
## raceethHispanic                               -38.975486   5.177743
## raceethMore than one race                     -16.922522   8.496268
## raceethNative Hawaiian/Other Pacific Islander  -5.101601  17.005696
## preschool                                      -4.463670   3.486055
## expectBachelors                                55.267080   4.293893
## motherHS                                        6.058774   6.091423
## motherBachelors                                12.638068   3.861457
## motherWork                                     -2.809101   3.521827
## fatherHS                                        4.018214   5.579269
## fatherBachelors                                16.929755   3.995253
## fatherWork                                      5.842798   4.395978
## selfBornUS                                     -3.806278   7.323718
## motherBornUS                                   -8.798153   6.587621
## fatherBornUS                                    4.306994   6.263875
## englishAtHome                                   8.035685   6.859492
## computerForSchoolwork                          22.500232   5.702562
## read30MinsADay                                 34.871924   3.408447
## minutesPerWeekEnglish                           0.012788   0.010712
## studentsInEnglish                              -0.286631   0.227819
## schoolHasLibrary                               12.215085   9.264884
## publicSchool                                  -16.857475   6.725614
## urban                                          -0.110132   3.962724
## schoolSize                                      0.006540   0.002197
##                                               t value Pr(>|t|)    
## (Intercept)                                     4.248 2.24e-05 ***
## grade                                          10.057  < 2e-16 ***
## male                                           -4.601 4.42e-06 ***
## raceethAmerican Indian/Alaska Native           -4.008 6.32e-05 ***
## raceethAsian                                   -0.446  0.65578    
## raceethBlack                                  -12.271  < 2e-16 ***
## raceethHispanic                                -7.528 7.29e-14 ***
## raceethMore than one race                      -1.992  0.04651 *  
## raceethNative Hawaiian/Other Pacific Islander  -0.300  0.76421    
## preschool                                      -1.280  0.20052    
## expectBachelors                                12.871  < 2e-16 ***
## motherHS                                        0.995  0.32001    
## motherBachelors                                 3.273  0.00108 ** 
## motherWork                                     -0.798  0.42517    
## fatherHS                                        0.720  0.47147    
## fatherBachelors                                 4.237 2.35e-05 ***
## fatherWork                                      1.329  0.18393    
## selfBornUS                                     -0.520  0.60331    
## motherBornUS                                   -1.336  0.18182    
## fatherBornUS                                    0.688  0.49178    
## englishAtHome                                   1.171  0.24153    
## computerForSchoolwork                           3.946 8.19e-05 ***
## read30MinsADay                                 10.231  < 2e-16 ***
## minutesPerWeekEnglish                           1.194  0.23264    
## studentsInEnglish                              -1.258  0.20846    
## schoolHasLibrary                                1.318  0.18749    
## publicSchool                                   -2.506  0.01226 *  
## urban                                          -0.028  0.97783    
## schoolSize                                      2.977  0.00294 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.81 on 2385 degrees of freedom
## Multiple R-squared:  0.3251, Adjusted R-squared:  0.3172 
## F-statistic: 41.04 on 28 and 2385 DF,  p-value: < 2.2e-16

SSE <- sum(lmScore$residuals^2)
RMSE <- sqrt(SSE/nrow(pisa_train))
RMSE

## [1] 73.36555

# predicting
predTest <- predict(lmScore, newdata = pisa_test)
summary(predTest)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   353.2   482.0   524.0   516.7   555.7   637.7

SSE <- sum((predTest - pisa_test$readingScore)^2)
SSE

## [1] 5762082

RMSE <- sqrt(SSE/nrow(pisa_test))
RMSE

## [1] 76.29079

# What is the predicted test score used in the baseline model?
mean(pisa_train$readingScore)

## [1] 517.9629

# What is the sum of squared errors of the baseline model on the testing set? 
# HINT: We call the sum of squared errors for the baseline model the total 
# sum of squares (SST).
SST <- sum((mean(pisa_train$readingScore) - pisa_test$readingScore)^2)
SST

## [1] 7802354

R2 <- 1-SSE/SST
R2

## [1] 0.2614944

# DETECTING FLU EPIDEMICS VIA SEARCH ENGINE QUERY DATA 
FluTrain <- read.csv("FluTrain.csv")
str(FluTrain)

## 'data.frame':    417 obs. of  3 variables:
##  $ Week   : Factor w/ 417 levels "2004-01-04 - 2004-01-10",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ILI    : num  2.42 1.81 1.71 1.54 1.44 ...
##  $ Queries: num  0.238 0.22 0.226 0.238 0.224 ...

which.max(FluTrain$ILI)

## [1] 303

FluTrain$Week[303]

## [1] 2009-10-18 - 2009-10-24
## 417 Levels: 2004-01-04 - 2004-01-10 ... 2011-12-25 - 2011-12-31

which.max(FluTrain$Queries)

## [1] 303

hist(FluTrain$ILI)

plot(log(FluTrain$ILI),FluTrain$Queries)

FluTrend1 <- lm(log(ILI)~Queries, data = FluTrain)
summary(FluTrend1)

## 
## Call:
## lm(formula = log(ILI) ~ Queries, data = FluTrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76003 -0.19696 -0.01657  0.18685  1.06450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.49934    0.03041  -16.42   <2e-16 ***
## Queries      2.96129    0.09312   31.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2995 on 415 degrees of freedom
## Multiple R-squared:  0.709,  Adjusted R-squared:  0.7083 
## F-statistic:  1011 on 1 and 415 DF,  p-value: < 2.2e-16

correlation <- cor(log(FluTrain$ILI), FluTrain$Queries)
log(1/correlation)

## [1] 0.1719357

exp(-0.5*correlation)

## [1] 0.6563792

correlation^2

## [1] 0.7090201

# The csv file FluTest.csv provides the 2012 weekly data of the 
# ILI-related search queries and the observed weekly percentage of 
# ILI-related physician visits.
FluTest <- read.csv("FluTest.csv")
PreTest1 <- exp(predict(FluTrend1, newdata = FluTest))
# What is our estimate for the percentage of ILI-related physician visits
# for the week of March 11, 2012?
FluTest$Week

##  [1] 2012-01-01 - 2012-01-07 2012-01-08 - 2012-01-14
##  [3] 2012-01-15 - 2012-01-21 2012-01-22 - 2012-01-28
##  [5] 2012-01-29 - 2012-02-04 2012-02-05 - 2012-02-11
##  [7] 2012-02-12 - 2012-02-18 2012-02-19 - 2012-02-25
##  [9] 2012-02-26 - 2012-03-03 2012-03-04 - 2012-03-10
## [11] 2012-03-11 - 2012-03-17 2012-03-18 - 2012-03-24
## [13] 2012-03-25 - 2012-03-31 2012-04-01 - 2012-04-07
## [15] 2012-04-08 - 2012-04-14 2012-04-15 - 2012-04-21
## [17] 2012-04-22 - 2012-04-28 2012-04-29 - 2012-05-05
## [19] 2012-05-06 - 2012-05-12 2012-05-13 - 2012-05-19
## [21] 2012-05-20 - 2012-05-26 2012-05-27 - 2012-06-02
## [23] 2012-06-03 - 2012-06-09 2012-06-10 - 2012-06-16
## [25] 2012-06-17 - 2012-06-23 2012-06-24 - 2012-06-30
## [27] 2012-07-01 - 2012-07-07 2012-07-08 - 2012-07-14
## [29] 2012-07-15 - 2012-07-21 2012-07-22 - 2012-07-28
## [31] 2012-07-29 - 2012-08-04 2012-08-05 - 2012-08-11
## [33] 2012-08-12 - 2012-08-18 2012-08-19 - 2012-08-25
## [35] 2012-08-26 - 2012-09-01 2012-09-02 - 2012-09-08
## [37] 2012-09-09 - 2012-09-15 2012-09-16 - 2012-09-22
## [39] 2012-09-23 - 2012-09-29 2012-09-30 - 2012-10-06
## [41] 2012-10-07 - 2012-10-13 2012-10-14 - 2012-10-20
## [43] 2012-10-21 - 2012-10-27 2012-10-28 - 2012-11-03
## [45] 2012-11-04 - 2012-11-10 2012-11-11 - 2012-11-17
## [47] 2012-11-18 - 2012-11-24 2012-11-25 - 2012-12-01
## [49] 2012-12-02 - 2012-12-08 2012-12-09 - 2012-12-15
## [51] 2012-12-16 - 2012-12-22 2012-12-23 - 2012-12-29
## 52 Levels: 2012-01-01 - 2012-01-07 ... 2012-12-23 - 2012-12-29

FluTest$Week[11]

## [1] 2012-03-11 - 2012-03-17
## 52 Levels: 2012-01-01 - 2012-01-07 ... 2012-12-23 - 2012-12-29

PreTest1[11]

##       11 
## 2.187378

(FluTest$ILI[11] - PreTest1[11])/FluTest$ILI[11]

##         11 
## 0.04623827

SSE <- sum((FluTest$ILI - PreTest1)^2)
SSE

## [1] 29.17708

RMSE <- sqrt(SSE/nrow(FluTest))
RMSE

## [1] 0.7490645

# TRAINING A TIME SERIES MODEL
# The observations in this dataset are consecutive weekly measurements of 
# the dependent and independent variables. This sort of dataset is called 
# a "time series." 
# Because the ILI variable is reported with a 1- or 2-week lag, a decision 
# maker cannot rely on the previous week's ILI value to predict the current
# week's value. Instead, the decision maker will only have data available
# from 2 or more weeks ago. We will build a variable called ILILag2 that 
# contains the ILI value from 2 weeks before the current observation.
# To do so, we will use the "zoo" package, which provides a number of helpful
# methods for time series models. 
# install.packages("zoo")
library(zoo)

## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# create the ILILag2 variable in the training set:
ILILag2 <- lag(zoo(FluTrain$ILI), -2,na.pad = TRUE)
FluTrain$ILILag2 <- coredata(ILILag2)
# In these commands, the value of -2 passed to lag means to return 2 observations
# before the current one; a positive value would have returned future observations.
# The parameter na.pad=TRUE means to add missing values for the first two weeks of 
# our dataset, where we can't compute the data from 2 weeks earlier.
FluTrain$ILILag2

##   [1]        NA        NA 2.4183312 1.8090560 1.7120239 1.5424951 1.4378683
##   [8] 1.3242740 1.3072567 1.0369770 1.0103204 1.0524925 1.0200901 0.9244187
##  [15] 0.7906450 0.8026098 0.8361300 0.7924358 0.6835877 0.7574523 0.7885854
##  [22] 0.8121710 0.8044629 0.8777009 0.7414530 0.6610222 0.7151092 0.5622412
##  [29] 0.7868082 0.8606578 0.6899440 0.7796912 0.6281439 0.9024586 0.8064432
##  [36] 0.8748878 0.9932130 0.8761408 0.9480916 0.9269426 0.9716430 0.8971591
##  [43] 1.0224828 1.0629632 1.1469570 1.2049501 1.3051655 1.2869916 1.5946756
##  [50] 1.3971432 1.4499567 1.6174545 2.1911192 2.5664893 2.1764491 2.2017121
##  [57] 2.5301211 3.0652381 3.9806083 4.5956803 4.7519706 4.1796206 3.4535851
##  [64] 3.1585224 2.6732010 2.3516104 1.8924285 1.5249048 1.4113441 1.2506826
##  [71] 1.2070250 1.0789550 1.1452080 1.0612426 1.0567977 1.2519310 1.0141893
##  [78] 1.0419693 0.9540274 0.8482299 0.8418715 0.7308936 0.7134316 0.6706772
##  [85] 0.6892776 0.7049290 0.6159033 0.6094256 0.6802587 0.7754884 0.6834214
##  [92] 0.7810748 0.8069435 1.0763468 1.0586890 1.1152326 1.1238125 1.2548892
##  [99] 1.3366090 1.3786364 1.6082900 1.4831056 1.6537399 2.0067892 2.5685716
## [106] 3.0527762 2.4250373 2.0019506 2.0586902 2.2127697 2.3222001 2.4927920
## [113] 2.7948942 2.9691114 2.8395905 2.7779902 2.4728693 2.1806146 2.0167951
## [120] 1.6410133 1.3582865 1.1427983 1.0403125 0.9643469 0.9379817 0.9474493
## [127] 0.8919182 0.8646427 0.9703199 0.8443901 0.7748704 0.8213725 0.8727445
## [134] 0.9226345 0.8994868 0.8430824 0.8818244 0.8171452 0.8715001 0.7386205
## [141] 0.7979660 1.0139373 0.8809358 0.9433663 0.8915462 1.2032228 1.0578822
## [148] 1.1305354 1.1255230 1.2080820 1.3495244 1.4689004 1.8276716 1.6656012
## [155] 1.8596834 2.3889130 2.7897759 3.1154858 2.2694245 1.8635464 1.9998635
## [162] 2.4406044 2.8301821 3.1234256 3.2701949 3.1775688 2.7236366 2.5020140
## [169] 2.4271992 1.9604132 1.5913980 1.3697835 1.3631668 1.1736951 1.0635756
## [176] 0.9697111 0.9653617 0.8567489 0.8633465 0.9353695 0.7455694 0.7404281
## [183] 0.6728965 0.6662820 0.6627473 0.5456190 0.5862306 0.6606867 0.5340928
## [190] 0.5855491 0.6180750 0.6874647 0.7156961 0.8293131 0.8009115 0.9184839
## [197] 0.8142590 1.0719708 1.2178574 1.2457554 1.3598449 1.4467085 1.5328638
## [204] 1.6665324 1.9748773 1.6730547 1.6340509 1.7459475 1.9364319 2.4890534
## [211] 2.2540484 2.0914715 2.3593428 3.3233143 4.4338100 5.3454714 5.4225751
## [218] 5.3030330 4.2445550 3.6280001 3.0346275 2.5359536 2.0573015 1.7415035
## [225] 1.4065217 1.2686070 1.0771887 0.9934452 0.9112119 0.9721091 0.9932575
## [232] 1.0913202 0.8884460 0.8876915 0.8831874 0.8267564 0.7832014 0.7806103
## [239] 0.7690726 0.7212979 0.7525273 0.7527210 0.7927660 0.7438962 0.8141663
## [246] 0.8384009 0.8511236 1.1097575 1.0311436 1.0228436 1.0301739 1.0124478
## [253] 1.0835911 1.1657765 1.1912964 1.2807470 1.2705251 1.5957825 1.4584994
## [260] 1.4992072 1.6298157 2.1556121 2.0205270 1.5456623 1.6422367 1.9652378
## [267] 2.3436784 2.8605744 3.3421049 3.2056588 3.1004908 2.9581850 2.4638058
## [274] 2.1927224 1.8739459 1.6481690 1.4987776 1.2923267 1.2716411 2.9815890
## [281] 2.4370224 2.2813011 3.8157199 4.2131523 3.1783224 2.5097162 2.0663177
## [288] 1.7180460 1.5596467 1.3085629 1.1869460 1.1379623 1.1500523 1.1126189
## [295] 1.1614188 1.6410714 2.4716598 3.7196936 3.9497480 4.0875636 4.0189724
## [302] 4.6036164 5.6608671 6.8152222 7.6188921 7.3883586 6.3392723 4.9434950
## [309] 3.8099612 3.4410588 2.6677306 2.4718250 2.3449995 2.7143498 2.6766718
## [316] 1.9828382 1.8274862 1.9260563 1.9249472 2.0887684 2.0343408 1.9764946
## [323] 1.9936177 1.8538260 1.8673036 1.6998677 1.4974082 1.4511188 1.2071478
## [330] 1.1741508 1.1620668 1.1721343 1.1216765 1.1498116 1.1332758 1.0817133
## [337] 1.1995860 0.9528083 0.9160321 0.9265822 0.8696197 0.9031331 0.7737757
## [344] 0.7427744 0.7309345 0.7868818 0.7630507 0.8410432 0.7915728 0.9127318
## [351] 1.0339765 0.9340091 1.0818888 1.0656260 1.1350529 1.2525629 1.2456956
## [358] 1.2677380 1.4372295 1.5334125 1.6944544 1.9915024 1.8130453 2.0142579
## [365] 2.5565913 3.3818486 3.4317231 2.6915111 2.9106289 3.4923189 4.0036963
## [372] 4.4353368 4.2421482 4.3971861 3.9025565 3.1507275 2.7242234 2.3333563
## [379] 1.9250003 1.7524260 1.5770365 1.3576558 1.3122310 1.1493747 1.1145057
## [386] 1.1098449 1.0524026 1.0353647 1.1177658 0.9829495 0.9251944 0.8355311
## [393] 0.8323927 0.8555910 0.7069494 0.6943868 0.6879762 0.6447430 0.6753299
## [400] 0.7282297 0.8065263 0.8604084 0.9360754 0.9666827 0.9960071 1.1084635
## [407] 1.2030858 1.2369566 1.2525865 1.3054612 1.4528432 1.4408922 1.4622115
## [414] 1.6554147 1.4657230 1.5181061 1.6639544

plot(log(FluTrain$ILILag2), log(FluTrain$ILI))

# Train a linear regression model on the FluTrain dataset to predict the log of 
# the ILI variable using the Queries variable as well as the log of the ILILag2 
# variable. 
FluTrend2 <- lm(log(ILI) ~ Queries + log(ILILag2), data = FluTrain)
summary(FluTrend2)

## 
## Call:
## lm(formula = log(ILI) ~ Queries + log(ILILag2), data = FluTrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52209 -0.11082 -0.01819  0.08143  0.76785 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.24064    0.01953  -12.32   <2e-16 ***
## Queries       1.25578    0.07910   15.88   <2e-16 ***
## log(ILILag2)  0.65569    0.02251   29.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1703 on 412 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.9063, Adjusted R-squared:  0.9059 
## F-statistic:  1993 on 2 and 412 DF,  p-value: < 2.2e-16

# So far, we have only added the ILILag2 variable to the FluTrain data frame.
# To make predictions with our FluTrend2 model, we will also need to add ILILag2
# to the FluTest data frame (note that adding variables before splitting into a 
# training and testing set can prevent this duplication of effort).
ILILag2 <- lag(zoo(FluTest$ILI), -2,na.pad = TRUE)
FluTest$ILILag2 <- coredata(ILILag2)
FluTest$ILILag2

##  [1]        NA        NA 1.7667069 1.5434005 1.6476154 1.6842972 1.8635415
##  [8] 1.8640793 2.0199272 2.1038510 2.0955493 2.1039832 2.2934216 1.9222345
## [15] 1.7423860 1.6760138 1.4963706 1.3811978 1.2884929 1.3578416 1.3076279
## [22] 1.2669188 1.2773098 1.2990197 1.1623532 1.0861211 1.0567958 1.0787129
## [29] 1.0467024 0.9281519 0.9019358 0.9160412 0.9524073 0.9164542 0.9017871
## [36] 1.0331514 1.0519866 1.1271337 1.1864886 1.2141531 1.2519805 1.1918833
## [43] 1.3238891 1.3085430 1.3605903 1.4860247 1.6109150 1.7332935 2.3046254
## [50] 2.2259966 2.9780471 3.6002297

# In this problem, the training and testing sets are split sequentially -- the 
# training set contains all observations from 2004-2011 and the testing set 
# contains all observations from 2012. There is no time gap between the two 
# datasets, meaning the first observation in FluTest was recorded one week after
# the last observation in FluTrain. From this, we can identify how to fill in the
# missing values for the ILILag2 variable in FluTest.
# Fill in the missing values for ILILag2 in FluTest. 
FluTest$ILILag2[1] = FluTrain$ILI[416]
FluTest$ILILag2[2] = FluTrain$ILI[417]
FluTest$ILILag2

##  [1] 1.8527356 2.1241299 1.7667069 1.5434005 1.6476154 1.6842972 1.8635415
##  [8] 1.8640793 2.0199272 2.1038510 2.0955493 2.1039832 2.2934216 1.9222345
## [15] 1.7423860 1.6760138 1.4963706 1.3811978 1.2884929 1.3578416 1.3076279
## [22] 1.2669188 1.2773098 1.2990197 1.1623532 1.0861211 1.0567958 1.0787129
## [29] 1.0467024 0.9281519 0.9019358 0.9160412 0.9524073 0.9164542 0.9017871
## [36] 1.0331514 1.0519866 1.1271337 1.1864886 1.2141531 1.2519805 1.1918833
## [43] 1.3238891 1.3085430 1.3605903 1.4860247 1.6109150 1.7332935 2.3046254
## [50] 2.2259966 2.9780471 3.6002297

ILI_predict <- exp(predict(FluTrend2, newdata = FluTest))
SSE <- sum((ILI_predict - FluTest$ILI)^2)
RMSE <- sqrt(SSE/nrow(FluTest))
RMSE

## [1] 0.2942029

data(state)
state.x77

##                Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama              3615   3624        2.1    69.05   15.1    41.3    20
## Alaska                365   6315        1.5    69.31   11.3    66.7   152
## Arizona              2212   4530        1.8    70.55    7.8    58.1    15
## Arkansas             2110   3378        1.9    70.66   10.1    39.9    65
## California          21198   5114        1.1    71.71   10.3    62.6    20
## Colorado             2541   4884        0.7    72.06    6.8    63.9   166
## Connecticut          3100   5348        1.1    72.48    3.1    56.0   139
## Delaware              579   4809        0.9    70.06    6.2    54.6   103
## Florida              8277   4815        1.3    70.66   10.7    52.6    11
## Georgia              4931   4091        2.0    68.54   13.9    40.6    60
## Hawaii                868   4963        1.9    73.60    6.2    61.9     0
## Idaho                 813   4119        0.6    71.87    5.3    59.5   126
## Illinois            11197   5107        0.9    70.14   10.3    52.6   127
## Indiana              5313   4458        0.7    70.88    7.1    52.9   122
## Iowa                 2861   4628        0.5    72.56    2.3    59.0   140
## Kansas               2280   4669        0.6    72.58    4.5    59.9   114
## Kentucky             3387   3712        1.6    70.10   10.6    38.5    95
## Louisiana            3806   3545        2.8    68.76   13.2    42.2    12
## Maine                1058   3694        0.7    70.39    2.7    54.7   161
## Maryland             4122   5299        0.9    70.22    8.5    52.3   101
## Massachusetts        5814   4755        1.1    71.83    3.3    58.5   103
## Michigan             9111   4751        0.9    70.63   11.1    52.8   125
## Minnesota            3921   4675        0.6    72.96    2.3    57.6   160
## Mississippi          2341   3098        2.4    68.09   12.5    41.0    50
## Missouri             4767   4254        0.8    70.69    9.3    48.8   108
## Montana               746   4347        0.6    70.56    5.0    59.2   155
## Nebraska             1544   4508        0.6    72.60    2.9    59.3   139
## Nevada                590   5149        0.5    69.03   11.5    65.2   188
## New Hampshire         812   4281        0.7    71.23    3.3    57.6   174
## New Jersey           7333   5237        1.1    70.93    5.2    52.5   115
## New Mexico           1144   3601        2.2    70.32    9.7    55.2   120
## New York            18076   4903        1.4    70.55   10.9    52.7    82
## North Carolina       5441   3875        1.8    69.21   11.1    38.5    80
## North Dakota          637   5087        0.8    72.78    1.4    50.3   186
## Ohio                10735   4561        0.8    70.82    7.4    53.2   124
## Oklahoma             2715   3983        1.1    71.42    6.4    51.6    82
## Oregon               2284   4660        0.6    72.13    4.2    60.0    44
## Pennsylvania        11860   4449        1.0    70.43    6.1    50.2   126
## Rhode Island          931   4558        1.3    71.90    2.4    46.4   127
## South Carolina       2816   3635        2.3    67.96   11.6    37.8    65
## South Dakota          681   4167        0.5    72.08    1.7    53.3   172
## Tennessee            4173   3821        1.7    70.11   11.0    41.8    70
## Texas               12237   4188        2.2    70.90   12.2    47.4    35
## Utah                 1203   4022        0.6    72.90    4.5    67.3   137
## Vermont               472   3907        0.6    71.64    5.5    57.1   168
## Virginia             4981   4701        1.4    70.08    9.5    47.8    85
## Washington           3559   4864        0.6    71.72    4.3    63.5    32
## West Virginia        1799   3617        1.4    69.48    6.7    41.6   100
## Wisconsin            4589   4468        0.7    72.48    3.0    54.5   149
## Wyoming               376   4566        0.6    70.29    6.9    62.9   173
##                  Area
## Alabama         50708
## Alaska         566432
## Arizona        113417
## Arkansas        51945
## California     156361
## Colorado       103766
## Connecticut      4862
## Delaware         1982
## Florida         54090
## Georgia         58073
## Hawaii           6425
## Idaho           82677
## Illinois        55748
## Indiana         36097
## Iowa            55941
## Kansas          81787
## Kentucky        39650
## Louisiana       44930
## Maine           30920
## Maryland         9891
## Massachusetts    7826
## Michigan        56817
## Minnesota       79289
## Mississippi     47296
## Missouri        68995
## Montana        145587
## Nebraska        76483
## Nevada         109889
## New Hampshire    9027
## New Jersey       7521
## New Mexico     121412
## New York        47831
## North Carolina  48798
## North Dakota    69273
## Ohio            40975
## Oklahoma        68782
## Oregon          96184
## Pennsylvania    44966
## Rhode Island     1049
## South Carolina  30225
## South Dakota    75955
## Tennessee       41328
## Texas          262134
## Utah            82096
## Vermont          9267
## Virginia        39780
## Washington      66570
## West Virginia   24070
## Wisconsin       54464
## Wyoming         97203

# combine columns
statedata <- cbind(data.frame(state.x77), state.abb, state.area, state.center, state.division, state.name, state.region)
str(statedata)

## 'data.frame':    50 obs. of  15 variables:
##  $ Population    : num  3615 365 2212 2110 21198 ...
##  $ Income        : num  3624 6315 4530 3378 5114 ...
##  $ Illiteracy    : num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
##  $ Life.Exp      : num  69 69.3 70.5 70.7 71.7 ...
##  $ Murder        : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
##  $ HS.Grad       : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
##  $ Frost         : num  20 152 15 65 20 166 139 103 11 60 ...
##  $ Area          : num  50708 566432 113417 51945 156361 ...
##  $ state.abb     : Factor w/ 50 levels "AK","AL","AR",..: 2 1 4 3 5 6 7 8 9 10 ...
##  $ state.area    : num  51609 589757 113909 53104 158693 ...
##  $ x             : num  -86.8 -127.2 -111.6 -92.3 -119.8 ...
##  $ y             : num  32.6 49.2 34.2 34.7 36.5 ...
##  $ state.division: Factor w/ 9 levels "New England",..: 4 9 8 5 9 8 1 3 3 3 ...
##  $ state.name    : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ state.region  : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...

plot(statedata$x, statedata$y)

# determine which region of the US (West, North Central, South, or Northeast) has
# the highest average high school graduation rate of all the states in the region
tapply(statedata$HS.Grad,statedata$state.region,mean)

##     Northeast         South North Central          West 
##      53.96667      44.34375      54.51667      62.00000

boxplot(statedata$Murder ~ statedata$state.region)

# there is an outlier in the Northeast region of the boxplot you just generated. 
# Which state does this correspond to? 
subset(statedata$Murder, state.region == "Northeast")

## [1]  3.1  2.7  3.3  3.3  5.2 10.9  6.1  2.4  5.5

statedata$state.name[which(statedata$Murder == 10.9)]

## [1] New York
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming

# PREDICTING LIFE EXPECTANCY - AN INITIAL MODEL  
lifeReg <- lm(Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost + Area, data = statedata)
summary(lifeReg)

## 
## Call:
## lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder + 
##     HS.Grad + Frost + Area, data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48895 -0.51232 -0.02747  0.57002  1.49447 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.094e+01  1.748e+00  40.586  < 2e-16 ***
## Population   5.180e-05  2.919e-05   1.775   0.0832 .  
## Income      -2.180e-05  2.444e-04  -0.089   0.9293    
## Illiteracy   3.382e-02  3.663e-01   0.092   0.9269    
## Murder      -3.011e-01  4.662e-02  -6.459 8.68e-08 ***
## HS.Grad      4.893e-02  2.332e-02   2.098   0.0420 *  
## Frost       -5.735e-03  3.143e-03  -1.825   0.0752 .  
## Area        -7.383e-08  1.668e-06  -0.044   0.9649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared:  0.7362, Adjusted R-squared:  0.6922 
## F-statistic: 16.74 on 7 and 42 DF,  p-value: 2.534e-10

# the coefficient of Income is negative, but if we make the following plot,
# we will actually see a positive relationship between income and life expectancy.

plot(statedata$Income, statedata$Life.Exp)

cor(statedata$Income, statedata$Life.Exp)

## [1] 0.3402553

# meaning there is multicollinearity in the model.

# Refine the model
# remove area
lifeReg2 <- lm(Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost, data = statedata)
summary(lifeReg2)

## 
## Call:
## lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder + 
##     HS.Grad + Frost, data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.49047 -0.52533 -0.02546  0.57160  1.50374 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.099e+01  1.387e+00  51.165  < 2e-16 ***
## Population   5.188e-05  2.879e-05   1.802   0.0785 .  
## Income      -2.444e-05  2.343e-04  -0.104   0.9174    
## Illiteracy   2.846e-02  3.416e-01   0.083   0.9340    
## Murder      -3.018e-01  4.334e-02  -6.963 1.45e-08 ***
## HS.Grad      4.847e-02  2.067e-02   2.345   0.0237 *  
## Frost       -5.776e-03  2.970e-03  -1.945   0.0584 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7361 on 43 degrees of freedom
## Multiple R-squared:  0.7361, Adjusted R-squared:  0.6993 
## F-statistic: 19.99 on 6 and 43 DF,  p-value: 5.362e-11

# remove Illiteracy
lifeReg3 <- lm(Life.Exp ~ Population + Income + Murder + HS.Grad + Frost, data = statedata)
summary(lifeReg3)

## 
## Call:
## lm(formula = Life.Exp ~ Population + Income + Murder + HS.Grad + 
##     Frost, data = statedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4892 -0.5122 -0.0329  0.5645  1.5166 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.107e+01  1.029e+00  69.067  < 2e-16 ***
## Population   5.115e-05  2.709e-05   1.888   0.0657 .  
## Income      -2.477e-05  2.316e-04  -0.107   0.9153    
## Murder      -3.000e-01  3.704e-02  -8.099 2.91e-10 ***
## HS.Grad      4.776e-02  1.859e-02   2.569   0.0137 *  
## Frost       -5.910e-03  2.468e-03  -2.395   0.0210 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7277 on 44 degrees of freedom
## Multiple R-squared:  0.7361, Adjusted R-squared:  0.7061 
## F-statistic: 24.55 on 5 and 44 DF,  p-value: 1.019e-11

# remove income
lifeReg4 <- lm(Life.Exp ~ Population + Murder + HS.Grad + Frost, data = statedata)
summary(lifeReg4)

## 
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost, 
##     data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47095 -0.53464 -0.03701  0.57621  1.50683 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.103e+01  9.529e-01  74.542  < 2e-16 ***
## Population   5.014e-05  2.512e-05   1.996  0.05201 .  
## Murder      -3.001e-01  3.661e-02  -8.199 1.77e-10 ***
## HS.Grad      4.658e-02  1.483e-02   3.142  0.00297 ** 
## Frost       -5.943e-03  2.421e-03  -2.455  0.01802 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared:  0.736,  Adjusted R-squared:  0.7126 
## F-statistic: 31.37 on 4 and 45 DF,  p-value: 1.696e-12

life_predict <- predict(lifeReg4)
which.min(life_predict)

## Alabama 
##       1

statedata$state.name[which.min(statedata$Life.Exp)]

## [1] South Carolina
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming

which.max(life_predict)

## Washington 
##         47

statedata$state.name[which.max(statedata$Life.Exp)]

## [1] Hawaii
## 50 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming

# For which state do we make the smallest absolute error?
which.min(abs(lifeReg4$residuals))

## Indiana 
##      14

Unit2_HW_TimeSeries.R

jzchen

Fri Apr 03 21:52:02 2015