Chapter 04 (page 168): 10, 11, 13

Chapter 4 ex 10:

library(ISLR)
#fix(Weekly) # interactive data editor; commented out so the script runs non-interactively
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(Weekly)

par(mfrow=c(2,2))
plot(Weekly$Lag1, Weekly$Year)
plot(Weekly$Lag2, Weekly$Year)
plot(Weekly$Lag3, Weekly$Year)
plot(Weekly$Lag4, Weekly$Year)

plot(Weekly$Lag1, Weekly$Lag2)
plot(Weekly$Lag2, Weekly$Lag3)
plot(Weekly$Lag3, Weekly$Lag4)
plot(Weekly$Lag4, Weekly$Lag5)

par(mfrow=c(1,1))
plot(Weekly$Today, Weekly$Volume)

Lovaas answer ex 10a: there aren’t terribly strong linear patterns, and I’m not sure what we’re going to do with this data. Volume tends to increase a bit when returns are high, and vice versa.
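To put a number on that impression, a correlation matrix is a quick check (added here as a sketch; column 9 of Weekly is the qualitative Direction factor, so it has to be dropped). It should show that the only strong pairwise correlation is between Year and Volume.

round(cor(Weekly[, -9]), 2) # drop Direction (column 9), which is a factor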

glm.fits=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fits)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Lovaas answer ex 10b: Only Lag2 appears to be statistically significant (p ≈ 0.03), which is a bummer.

glm.probs = predict(glm.fits, type = "response")
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up"
attach(Weekly)
table(glm.preds, Direction)#confusion matrix
##          Direction
## glm.preds Down  Up
##      Down   54  48
##      Up    430 557
(54+557)/(54+48+430+557) #56% accuracy.
## [1] 0.5610652

The confusion matrix shows how the model’s predicted Down/Up classifications line up with the actual market direction. The market was actually down 54 + 430 = 484 times, but the model correctly predicted Down only 54 of those times. The model was much better at catching Up weeks: the market was actually up 48 + 557 = 605 times, and the model predicted Up for 557 of them.
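The same table also gives sensitivity and specificity directly; this is a small added sketch reusing the objects above.

cm = table(glm.preds, Direction)
cm["Up", "Up"] / sum(cm[, "Up"]) # sensitivity: 557/605, about 92% of Up weeks caught
cm["Down", "Down"] / sum(cm[, "Down"]) # specificity: 54/484, only about 11% of Down weeks caught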

train = (Year < 2009) #train dataset
Weekly.test = Weekly[!train, ] #just 2009 and 2010
glm.train = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train) #note subset
glm.probs = predict(glm.train, Weekly.test, type = "response")#predictions
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up" #classification
Direction.test = Direction[!train] #test-set responses (2009 and 2010)
table(glm.preds, Direction.test) #Confusion matrix
##          Direction.test
## glm.preds Down Up
##      Down    9  5
##      Up     34 56
(9+56)/(9+5+34+56) #fraction of correct predictions
## [1] 0.625
library(MASS)
lda.train = lda(Direction ~ Lag2, data = Weekly, subset = train) #note subset
lda.pred = predict(lda.train, Weekly.test)
table(lda.pred$class, Direction.test)
##       Direction.test
##        Down Up
##   Down    9  5
##   Up     34 56
(9+56)/(9+5+34+56)#fraction of correct predictions
## [1] 0.625

Lovaas note ex 10e: very suspicious how the confusion matrix was identical to 10d. (Though with a single predictor, logistic regression and LDA both fit a linear decision boundary in Lag2, so matching predictions are actually plausible.)
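A quick cross-tabulation of the two sets of predictions (added here as a check, reusing the objects above) would show whether the models agree week by week; off-diagonal counts are disagreements.

table(glm.preds, lda.pred$class) # rows: logistic regression, columns: LDA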

qda.train = qda(Direction ~ Lag2, data = Weekly, subset = train) #note subset
qda.pred = predict(qda.train, Weekly.test)
table(qda.pred$class, Direction.test)
##       Direction.test
##        Down Up
##   Down    0  0
##   Up     43 61
(0+61)/(0+0+43+61)#fraction of correct predictions
## [1] 0.5865385

Lovaas note ex 10f: I don’t love how the model predicted 0 “down” weeks.
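A look at the posterior probabilities (a quick added check) explains this: if the fitted posterior for Down never exceeds 0.5 on the test set, qda’s predict() classifies every week as Up.

summary(qda.pred$posterior[, "Down"]) # a maximum below 0.5 means no Down predictions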

library(class)
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(21)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.test)
##         Direction.test
## knn.pred Down Up
##     Down   21 29
##     Up     22 32
(21+32)/(21+29+22+32)#fraction of correct predictions
## [1] 0.5096154

51% accuracy; the exact number can vary a bit depending on set.seed(), since knn() breaks ties at random.

  1. Which of these methods appears to provide the best results on this data?
    Lovaas answer: logistic regression (GLM) and LDA are tied for the best accuracy at 62.5%.
  2. Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
#Playing around with different values of k

train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(21)
knn.pred2 = knn(train.X, test.X, train.Direction, k = 2)
table(knn.pred2, Direction.test)
##          Direction.test
## knn.pred2 Down Up
##      Down   23 33
##      Up     20 28
(23+28)/(23+33+20+28)#fraction of correct predictions
## [1] 0.4903846
knn.pred3 = knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred3, Direction.test)
##          Direction.test
## knn.pred3 Down Up
##      Down   16 19
##      Up     27 42
(16+42)/(16+19+27+42)#fraction of correct predictions
## [1] 0.5576923
knn.pred4 = knn(train.X, test.X, train.Direction, k = 4)
table(knn.pred4, Direction.test)
##          Direction.test
## knn.pred4 Down Up
##      Down   17 23
##      Up     26 38
(17+38)/(17+23+26+38)#fraction of correct predictions
## [1] 0.5288462
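Rather than copy-pasting a block per k, the scan can be wrapped in sapply(); a sketch using the same objects as above (exact numbers will depend on the seed):

set.seed(21)
knn.acc = sapply(1:10, function(k) {
  pred = knn(train.X, test.X, train.Direction, k = k)
  mean(pred == Direction.test) # fraction of correct predictions
})
round(knn.acc, 3)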
#Using Volume in place of Lag2
glm.train = glm(Direction ~ Volume, data = Weekly, family = binomial, subset = train) #note subset
glm.probs = predict(glm.train, Weekly.test, type = "response")#predictions
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up" #classification
Direction.test = Direction[!train] #test-set responses (2009 and 2010)
table(glm.preds, Direction.test)#Confusion matrix
##          Direction.test
## glm.preds Down Up
##      Down   31 47
##      Up     12 14
(31+14)/(31+47+12+14)#fraction of correct predictions
## [1] 0.4326923
lda.train = lda(Direction ~ Volume, data = Weekly, subset = train) #note subset
lda.pred = predict(lda.train, Weekly.test)
table(lda.pred$class, Direction.test)
##       Direction.test
##        Down Up
##   Down   33 47
##   Up     10 14
(33+14)/(33+47+10+14)#fraction of correct predictions
## [1] 0.4519231

Lovaas notes: KNN with k = 3 has the highest success rate of the KNN runs, but it is still lower than the logistic regression. Using Volume in place of Lag2 made prediction accuracy noticeably worse (43–45%).
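The exercise also asks about transformations and interactions, which none of the runs above attempt. A sketch of one such fit, an interaction between Lag1 and Lag2 evaluated on the same held-out years, is below; I haven’t verified that it beats Lag2 alone.

glm.int = glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)
int.probs = predict(glm.int, Weekly.test, type = "response")
int.preds = ifelse(int.probs > 0.5, "Up", "Down")
table(int.preds, Direction.test) # confusion matrix for the interaction model
mean(int.preds == Direction.test) # fraction of correct predictions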

Chapter 4 ex 11:

mpg01 = rep(0, length(Auto$mpg))
Auto1<- Auto
mpg01[Auto$mpg > median(Auto$mpg)] = 1
Auto1 = data.frame(Auto1, mpg01)
par(mfrow=c(2,2))
plot(Auto1$mpg01, Auto1$year)
plot(Auto1$mpg01, Auto1$cylinders)
plot(Auto1$mpg01, Auto1$acceleration)
plot(Auto1$mpg01, Auto1$horsepower)

Lovaas answer 11b: honestly, I didn’t find too much. We’ll be able to set up a model, but I doubt the R-squared values will be above 50%.
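Scatterplots against a 0/1 variable are hard to read; boxplots of each predictor split by mpg01 (an alternative view, sketched here) may show the separation more clearly.

par(mfrow = c(2, 2))
boxplot(year ~ mpg01, data = Auto1, main = "year")
boxplot(cylinders ~ mpg01, data = Auto1, main = "cylinders")
boxplot(acceleration ~ mpg01, data = Auto1, main = "acceleration")
boxplot(horsepower ~ mpg01, data = Auto1, main = "horsepower")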

attach(Auto1)
## The following object is masked _by_ .GlobalEnv:
## 
##     mpg01
test = (year %% 5 == 0) # years divisible by 5; this holds out about 20% of the data for testing
train = !test
train.x = Auto1[train, ]
test.x = Auto1[test, ]
mpg01.test = mpg01[test]
lda.fit=lda(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train)
lda.fit
## Call:
## lda(mpg01 ~ year + cylinders + acceleration + horsepower, data = Auto1, 
##     subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.503268 0.496732 
## 
## Group means:
##       year cylinders acceleration horsepower
## 0 74.91558  6.675325     14.74416   127.4545
## 1 77.67763  4.217105     16.48092    78.3750
## 
## Coefficients of linear discriminants:
##                      LD1
## year          0.11329929
## cylinders    -0.57588930
## acceleration -0.10862945
## horsepower   -0.01686372
lda.pred=predict(lda.fit, test.x)
mean(lda.pred$class != mpg01.test)
## [1] 0.01162791

Lovaas answer ex 11d: the test error with this training/test split is 0.0116, which is lower than I expected. I guess since we’re only predicting whether MPG is above or below the median, it’s an easier problem to model.

qda.fit=qda(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train)
qda.fit
## Call:
## qda(mpg01 ~ year + cylinders + acceleration + horsepower, data = Auto1, 
##     subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.503268 0.496732 
## 
## Group means:
##       year cylinders acceleration horsepower
## 0 74.91558  6.675325     14.74416   127.4545
## 1 77.67763  4.217105     16.48092    78.3750
qda.pred=predict(qda.fit,test.x)
mean(qda.pred$class !=mpg01.test)
## [1] 0.06976744

Lovaas answer 11e: Error is 0.0698, which is higher than in 11d.

glm.fit=glm(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = mpg01 ~ year + cylinders + acceleration + horsepower, 
##     family = binomial, data = Auto1, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0399  -0.1944  -0.0004   0.2986   3.2065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.54409    5.11283  -0.889  0.37413    
## year          0.32944    0.06788   4.854 1.21e-06 ***
## cylinders    -0.68342    0.23622  -2.893  0.00381 ** 
## acceleration -0.37853    0.11931  -3.173  0.00151 ** 
## horsepower   -0.11651    0.02209  -5.274 1.34e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 424.19  on 305  degrees of freedom
## Residual deviance: 151.74  on 301  degrees of freedom
## AIC: 161.74
## 
## Number of Fisher Scoring iterations: 7
glm.prob = predict(glm.fit, test.x, type = "response")
glm.pred = rep(0, length(glm.prob))
glm.pred[glm.prob > 0.5] = 1
mean(glm.pred != mpg01.test)
## [1] 0.127907
glm.pred = rep(0, length(glm.prob)) # reset to all zeros before trying a different cutoff
glm.pred[glm.prob > 0.6] = 1
mean(glm.pred != mpg01.test)
## [1] 0.127907

Lovaas answer ex 11f: test error is 12.8% at the 0.5 cutoff. Note that glm.pred has to be reset to zeros before applying a different cutoff: raising the cutoff can only remove predicted 1s, so any 1s left over from a previous run make the two cutoffs look identical.
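To compare cutoffs cleanly, a small helper that rebuilds the 0/1 vector each time avoids the stale-vector problem; a sketch using the objects above:

cutoff_error = function(cutoff) {
  pred = as.numeric(glm.prob > cutoff) # fresh 0/1 vector for every cutoff
  mean(pred != mpg01.test)
}
sapply(c(0.4, 0.5, 0.6), cutoff_error)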

library(class)
train.X2 = cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X2 = cbind(cylinders, weight, displacement, horsepower)[test, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 1)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 10)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 50)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 100)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 200)
mean(knn.pred != mpg01.test)
## [1] 0.09302326

Lovaas answer ex 11g: many test errors, see above. k = 1 through 100 all got the same result! I also got an error when I mistakenly asked for 500 nearest neighbors, since there are only 306 observations in my training dataset. k = 200 gave the lowest error here, but with 306 training points that is close to a majority vote over the whole training set, which is underfitting rather than overfitting, so I wouldn’t trust it.
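One caveat: KNN is distance-based, and unscaled predictors like weight (in the thousands) dominate the distance calculation. A sketch of the same fit on standardized predictors (scaling the full matrix for simplicity, which leaks a little test information) might change the picture:

X2s = scale(cbind(cylinders, weight, displacement, horsepower)) # each column: mean 0, sd 1
set.seed(1)
knn.scaled = knn(X2s[train, ], X2s[test, ], train.mpg01, k = 10)
mean(knn.scaled != mpg01.test) # test error with scaled predictors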

Chapter 4 ex 13:

# Set up
library(MASS)
#fix(Boston)
attach(Boston)
crime1 = rep(0, length(crim))
crime1[crim > median(crim)] = 1 #1 if the crime rate is above the median
Boston1 = data.frame(Boston, crime1)

train = 1:(dim(Boston1)[1]/2) #train data 
test = (dim(Boston1)[1]/2 + 1):dim(Boston1)[1] #test data
Boston1.train = Boston1[train, ]
Boston1.test = Boston1[test, ]
crime1.test = crime1[test]
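Note that this split takes the first 253 rows for training and the last 253 for testing, and Boston’s rows are not in random order. A random split (sketched here but commented out, since all the outputs below use the sequential split) might be more representative:

# set.seed(2)
# idx = sample(nrow(Boston1), nrow(Boston1) / 2)
# Boston1.train = Boston1[idx, ]
# Boston1.test = Boston1[-idx, ]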
# Logistic Regression
glm.fit=glm(crime1~medv + tax + ptratio + black,data=Boston1,subset=train,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = crime1 ~ medv + tax + ptratio + black, family = binomial, 
##     data = Boston1, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3005  -0.6995  -0.5220   0.9033   1.9041  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.772593   3.374992   0.822 0.411355    
## medv         0.019422   0.020704   0.938 0.348204    
## tax          0.011496   0.002359   4.873  1.1e-06 ***
## ptratio      0.177576   0.079955   2.221 0.026354 *  
## black       -0.028428   0.008171  -3.479 0.000503 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 329.37  on 252  degrees of freedom
## Residual deviance: 260.52  on 248  degrees of freedom
## AIC: 270.52
## 
## Number of Fisher Scoring iterations: 6
glm.prob = predict(glm.fit, Boston1.test, type = "response") # predict on the Boston test set (not test.x, which is the Auto data from ex 11)
glm.pred = rep(0, length(glm.prob))
glm.pred[glm.prob > 0.5] = 1
mean(glm.pred != crime1.test)
## [1] 0.326087

I used median home value (medv), property tax rate (tax), pupil-teacher ratio (ptratio), and black (a Boston variable derived from the proportion of Black residents). All were statistically significant at the 0.05 level except median home value. The test error rate was still about 33%, which is pretty high.

# LDA
lda.fit=lda(crime1~medv + rad + ptratio + black,data=Boston1,subset=train)
lda.pred=predict(lda.fit, Boston1.test) # predict on the Boston test set
mean(lda.pred$class != crime1.test)
## [1] 0.3557312

35% misclassification; not great. I used median home value (medv), access to radial highways (rad), pupil-teacher ratio (ptratio), and black (derived from the proportion of Black residents).

# KNN
train.X2 = cbind(age,dis,ptratio,tax)[train, ]
test.X2 = cbind(age,dis,ptratio,tax)[test, ]
train.crim1 = crime1[train]
set.seed(1)
knn.pred = knn(train.X2, test.X2, train.crim1, k = 1)
mean(knn.pred != crime1.test)
## [1] 0.6363636
knn.pred = knn(train.X2, test.X2, train.crim1, k = 10)
mean(knn.pred != crime1.test)
## [1] 0.1225296
knn.pred = knn(train.X2, test.X2, train.crim1, k = 50)
mean(knn.pred != crime1.test)
## [1] 0.1304348
knn.pred = knn(train.X2, test.X2, train.crim1, k = 20)
mean(knn.pred != crime1.test)
## [1] 0.1185771
knn.pred = knn(train.X2, test.X2, train.crim1, k = 15)
mean(knn.pred != crime1.test)
## [1] 0.1185771

I used home age (age), distance to employment centers (dis), pupil-teacher ratio (ptratio), and property tax rate (tax). The optimal model is around k = 15 to 20 nearest neighbors. That makes sense; anything much higher seems like an extreme number of neighbors.