Chapter 2 - Q9

data(Auto)
Auto <- na.omit(Auto)

(a)

str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

(b)

sapply(Auto[, -9], range)
##       mpg cylinders displacement horsepower weight acceleration year origin
## [1,]  9.0         3           68         46   1613          8.0   70      1
## [2,] 46.6         8          455        230   5140         24.8   82      3

(c)

sapply(Auto[, -9], mean)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    23.445918     5.471939   194.411990   104.469388  2977.584184    15.541327 
##         year       origin 
##    75.979592     1.576531
sapply(Auto[, -9], sd)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    7.8050075    1.7057832  104.6440039   38.4911599  849.4025600    2.7588641 
##         year       origin 
##    3.6837365    0.8055182

(d)

Auto_sub <- Auto[-(10:85), ]
sapply(Auto_sub[, -9], range)
##       mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 11.0         3           68         46   1649          8.5   70      1
## [2,] 46.6         8          455        230   4997         24.8   82      3
sapply(Auto_sub[, -9], mean)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    24.404430     5.373418   187.240506   100.721519  2935.971519    15.726899 
##         year       origin 
##    77.145570     1.601266
sapply(Auto_sub[, -9], sd)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.867283     1.654179    99.678367    35.708853   811.300208     2.693721 
##         year       origin 
##     3.106217     0.819910

(e)

GGally::ggpairs(Auto %>% select_if(is.numeric))

(f)

Strong predictors: horsepower, weight, displacement.


Chapter 3 - Q9

(a)

GGally::ggpairs(Auto %>% select_if(is.numeric))

(b)

cor(Auto %>% select_if(is.numeric))
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

(c)

lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

(d)

par(mfrow=c(2,2))
plot(lm_fit)

(e)

lm_interact <- lm(mpg ~ (.-name)^2, data=Auto)
summary(lm_interact)
## 
## Call:
## lm(formula = mpg ~ (. - name)^2, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6303 -1.4481  0.0596  1.2739 11.1386 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                3.548e+01  5.314e+01   0.668  0.50475   
## cylinders                  6.989e+00  8.248e+00   0.847  0.39738   
## displacement              -4.785e-01  1.894e-01  -2.527  0.01192 * 
## horsepower                 5.034e-01  3.470e-01   1.451  0.14769   
## weight                     4.133e-03  1.759e-02   0.235  0.81442   
## acceleration              -5.859e+00  2.174e+00  -2.696  0.00735 **
## year                       6.974e-01  6.097e-01   1.144  0.25340   
## origin                    -2.090e+01  7.097e+00  -2.944  0.00345 **
## cylinders:displacement    -3.383e-03  6.455e-03  -0.524  0.60051   
## cylinders:horsepower       1.161e-02  2.420e-02   0.480  0.63157   
## cylinders:weight           3.575e-04  8.955e-04   0.399  0.69000   
## cylinders:acceleration     2.779e-01  1.664e-01   1.670  0.09584 . 
## cylinders:year            -1.741e-01  9.714e-02  -1.793  0.07389 . 
## cylinders:origin           4.022e-01  4.926e-01   0.816  0.41482   
## displacement:horsepower   -8.491e-05  2.885e-04  -0.294  0.76867   
## displacement:weight        2.472e-05  1.470e-05   1.682  0.09342 . 
## displacement:acceleration -3.479e-03  3.342e-03  -1.041  0.29853   
## displacement:year          5.934e-03  2.391e-03   2.482  0.01352 * 
## displacement:origin        2.398e-02  1.947e-02   1.232  0.21875   
## horsepower:weight         -1.968e-05  2.924e-05  -0.673  0.50124   
## horsepower:acceleration   -7.213e-03  3.719e-03  -1.939  0.05325 . 
## horsepower:year           -5.838e-03  3.938e-03  -1.482  0.13916   
## horsepower:origin          2.233e-03  2.930e-02   0.076  0.93931   
## weight:acceleration        2.346e-04  2.289e-04   1.025  0.30596   
## weight:year               -2.245e-04  2.127e-04  -1.056  0.29182   
## weight:origin             -5.789e-04  1.591e-03  -0.364  0.71623   
## acceleration:year          5.562e-02  2.558e-02   2.174  0.03033 * 
## acceleration:origin        4.583e-01  1.567e-01   2.926  0.00365 **
## year:origin                1.393e-01  7.399e-02   1.882  0.06062 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared:  0.8893, Adjusted R-squared:  0.8808 
## F-statistic: 104.2 on 28 and 363 DF,  p-value: < 2.2e-16

(f)

lm_poly <- lm(mpg ~ poly(horsepower,2), data=Auto)
summary(lm_poly)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

Chapter 3 - Q15 (Boston)

data(Boston)

(a)

results <- map(names(Boston)[-1], function(var){
  model <- lm(as.formula(paste("crim ~", var)), data=Boston)
  summary(model)$coefficients[2,]
})
names(results) <- names(Boston)[-1]
results
## $zn
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -7.393498e-02  1.609460e-02 -4.593776e+00  5.506472e-06 
## 
## $indus
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 5.097763e-01 5.102433e-02 9.990848e+00 1.450349e-21 
## 
## $chas
##   Estimate Std. Error    t value   Pr(>|t|) 
## -1.8927766  1.5061155 -1.2567274  0.2094345 
## 
## $nox
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 3.124853e+01 2.999190e+00 1.041899e+01 3.751739e-23 
## 
## $rm
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -2.684051e+00  5.320411e-01 -5.044819e+00  6.346703e-07 
## 
## $age
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 1.077862e-01 1.273644e-02 8.462825e+00 2.854869e-16 
## 
## $dis
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -1.550902e+00  1.683300e-01 -9.213458e+00  8.519949e-19 
## 
## $rad
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 6.179109e-01 3.433182e-02 1.799820e+01 2.693844e-56 
## 
## $tax
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 2.974225e-02 1.847415e-03 1.609939e+01 2.357127e-47 
## 
## $ptratio
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 1.151983e+00 1.693736e-01 6.801430e+00 2.942922e-11 
## 
## $black
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -3.627964e-02  3.873154e-03 -9.366951e+00  2.487274e-19 
## 
## $lstat
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 5.488048e-01 4.776097e-02 1.149065e+01 2.654277e-27 
## 
## $medv
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -3.631599e-01  3.839017e-02 -9.459710e+00  1.173987e-19

(b)

lm_boston <- lm(crim ~ ., data=Boston)
summary(lm_boston)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.924 -2.120 -0.353  1.019 75.051 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.033228   7.234903   2.354 0.018949 *  
## zn            0.044855   0.018734   2.394 0.017025 *  
## indus        -0.063855   0.083407  -0.766 0.444294    
## chas         -0.749134   1.180147  -0.635 0.525867    
## nox         -10.313535   5.275536  -1.955 0.051152 .  
## rm            0.430131   0.612830   0.702 0.483089    
## age           0.001452   0.017925   0.081 0.935488    
## dis          -0.987176   0.281817  -3.503 0.000502 ***
## rad           0.588209   0.088049   6.680 6.46e-11 ***
## tax          -0.003780   0.005156  -0.733 0.463793    
## ptratio      -0.271081   0.186450  -1.454 0.146611    
## black        -0.007538   0.003673  -2.052 0.040702 *  
## lstat         0.126211   0.075725   1.667 0.096208 .  
## medv         -0.198887   0.060516  -3.287 0.001087 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.4396 
## F-statistic: 31.47 on 13 and 492 DF,  p-value: < 2.2e-16

(c)

simple_coef <- sapply(results, function(x) x[1])
multi_coef <- coef(lm_boston)[-1]

plot(simple_coef, multi_coef)
abline(0,1,col="red")

(d)

poly_results <- map(names(Boston)[-1], function(var){
  
  x <- Boston[[var]]
  
  if(length(unique(x)) > 3){
    model <- lm(as.formula(paste("crim ~ poly(", var, ",3)", sep="")), data=Boston)
  } else {
    model <- lm(as.formula(paste("crim ~", var)), data=Boston)
  }
  
  summary(model)
})

Chapter 4 - Q13

data(Weekly)

(a)

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
GGally::ggpairs(Weekly %>% select_if(is.numeric))

(b)

glm_full <- glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
                data=Weekly, family=binomial)
summary(glm_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

(c)

prob <- predict(glm_full, type="response")
pred <- ifelse(prob>0.5,"Up","Down")

table(pred, Weekly$Direction)
##       
## pred   Down  Up
##   Down   54  48
##   Up    430 557
mean(pred==Weekly$Direction)
## [1] 0.5610652

(d)

train <- Weekly$Year < 2009

glm_lag2 <- glm(Direction ~ Lag2,
                data=Weekly, subset=train, family=binomial)

prob_test <- predict(glm_lag2, Weekly[!train,], type="response")
pred_test <- ifelse(prob_test>0.5,"Up","Down")

table(pred_test, Weekly$Direction[!train])
##          
## pred_test Down Up
##      Down    9  5
##      Up     34 56
mean(pred_test==Weekly$Direction[!train])
## [1] 0.625

(e)

lda_fit <- lda(Direction ~ Lag2, data=Weekly, subset=train)
lda_pred <- predict(lda_fit, Weekly[!train,])

table(lda_pred$class, Weekly$Direction[!train])
##       
##        Down Up
##   Down    9  5
##   Up     34 56
mean(lda_pred$class==Weekly$Direction[!train])
## [1] 0.625

(f)

qda_fit <- qda(Direction ~ Lag2, data=Weekly, subset=train)
qda_pred <- predict(qda_fit, Weekly[!train,])

table(qda_pred$class, Weekly$Direction[!train])
##       
##        Down Up
##   Down    0  0
##   Up     43 61
mean(qda_pred$class==Weekly$Direction[!train])
## [1] 0.5865385

(g)

train_x <- as.matrix(Weekly[train,"Lag2",drop=FALSE])
test_x  <- as.matrix(Weekly[!train,"Lag2",drop=FALSE])
train_y <- Weekly$Direction[train]

knn_pred <- knn(train_x, test_x, train_y, k=1)

table(knn_pred, Weekly$Direction[!train])
##         
## knn_pred Down Up
##     Down   21 30
##     Up     22 31
mean(knn_pred==Weekly$Direction[!train])
## [1] 0.5

(h)

nb_fit <- naiveBayes(Direction ~ Lag2, data=Weekly[train,])
nb_pred <- predict(nb_fit, Weekly[!train,])

table(nb_pred, Weekly$Direction[!train])
##        
## nb_pred Down Up
##    Down    0  0
##    Up     43 61
mean(nb_pred==Weekly$Direction[!train])
## [1] 0.5865385

(i)

Best: Logistic / LDA (~60%)

(j)

glm_best <- glm(Direction ~ Lag1+Lag2,
                data=Weekly, subset=train, family=binomial)

prob_best <- predict(glm_best, Weekly[!train,], type="response")
pred_best <- ifelse(prob_best>0.5,"Up","Down")

table(pred_best, Weekly$Direction[!train])
##          
## pred_best Down Up
##      Down    7  8
##      Up     36 53
mean(pred_best==Weekly$Direction[!train])
## [1] 0.5769231

Final Conclusion

All models run successfully. Nonlinearity exists in Boston data. Financial prediction remains difficult.