library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
#fix(Weekly)
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(Weekly)
par(mfrow=c(2,2))
plot(Weekly$Lag1, Weekly$Year)
plot(Weekly$Lag2, Weekly$Year)
plot(Weekly$Lag3, Weekly$Year)
plot(Weekly$Lag4, Weekly$Year)
plot(Weekly$Lag1, Weekly$Lag2)
plot(Weekly$Lag2, Weekly$Lag3)
plot(Weekly$Lag3, Weekly$Lag4)
plot(Weekly$Lag4, Weekly$Lag5)
par(mfrow=c(1,1))
plot(Weekly$Today, Weekly$Volume)
Lovaas answer ex 10a: there aren't terribly strong linear patterns, and I'm not sure what we're going to do with this data. Volume tends to increase a bit when returns are high, and vice versa.
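A quick numeric check of the same idea (a sketch; Direction is column 9 and has to be dropped, since cor() needs numeric input):
cor(Weekly[, -9]) #pairwise correlations among Year, the lags, Volume, and Today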
glm.fits=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fits)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
Lovaas answer ex 10b: only Lag2 appears to be significant, which is a bummer.
glm.probs = predict(glm.fits, type = "response")
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up"
attach(Weekly)
table(glm.preds, Direction)#confusion matrix
##          Direction
## glm.preds Down  Up
##      Down   54  48
##      Up    430 557
(54+557)/(54+48+430+557) #56% accuracy.
## [1] 0.5610652
The confusion matrix shows the model's predicted direction (rows) against the actual direction (columns). The market was actually down 54 + 430 = 484 times, but the model got only 54 of those weeks right. It did better on up weeks: the market was actually up 48 + 557 = 605 times, and the model called 557 of those correctly.
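The same arithmetic can be done programmatically (a sketch, reusing glm.preds and the attached Direction):
conf = table(glm.preds, Direction)
sum(diag(conf))/sum(conf) #overall accuracy
conf["Down", "Down"]/sum(conf[, "Down"]) #fraction of actual Down weeks caught
conf["Up", "Up"]/sum(conf[, "Up"]) #fraction of actual Up weeks caught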
train = (Year < 2009) #train dataset
Weekly.test = Weekly[!train, ] #just 2009 and 2010
glm.train = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train) #note subset
glm.probs = predict(glm.train, Weekly.test, type = "response")#predictions
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up" #classification
Direction.test = Direction[!train] #Removed training data
table(glm.preds, Direction.test) #Confusion matrix
##          Direction.test
## glm.preds Down Up
##      Down    9  5
##      Up     34 56
(9+56)/(9+5+34+56) #fraction of correct predictions
## [1] 0.625
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
lda.train = lda(Direction ~ Lag2, data = Weekly, subset = train) #note subset
lda.pred = predict(lda.train, Weekly.test)
table(lda.pred$class, Direction.test)
##       Direction.test
##        Down Up
##   Down    9  5
##   Up     34 56
(9+56)/(9+5+34+56)#fraction of correct predictions
## [1] 0.625
Lovaas note ex 10e: the confusion matrix is identical to 10d. That's less suspicious than it looks: logistic regression and LDA both fit linear decision boundaries, so with a single predictor they can end up classifying every test week the same way.
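A quick way to confirm the two models really do agree on every test week (a sketch, reusing glm.preds and lda.pred from above):
mean(glm.preds == lda.pred$class) #1 means the logistic and LDA predictions match everywhere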
qda.train = qda(Direction ~ Lag2, data = Weekly, subset = train) #note subset
qda.pred = predict(qda.train, Weekly.test)
table(qda.pred$class, Direction.test)
##       Direction.test
##        Down Up
##   Down    0  0
##   Up     43 61
(0+61)/(0+0+43+61)#fraction of correct predictions
## [1] 0.5865385
Lovaas note ex 10f: Don’t love how the model predicted 0 “down” weeks.
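One way to see why (a sketch): predict() for qda also returns posterior probabilities, and since every test week was classified "Up", the posterior for "Down" never reaches 0.5 here.
summary(qda.pred$posterior[, "Down"]) #the max should be below 0.5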
library(class)
## Warning: package 'class' was built under R version 4.0.5
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(21)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.test)
##         Direction.test
## knn.pred Down Up
##     Down   21 29
##     Up     22 32
(21+32)/(21+29+22+32)#fraction of correct predictions
## [1] 0.5096154
51% accuracy; that can vary a tiny bit with the set.seed() value, since knn() breaks voting ties at random.
#playing around with different values of k (train.X, test.X, and train.Direction are unchanged from above)
set.seed(21)
knn.pred2 = knn(train.X, test.X, train.Direction, k = 2)
table(knn.pred2, Direction.test)
##          Direction.test
## knn.pred2 Down Up
##      Down   23 33
##      Up     20 28
(23+28)/(23+33+20+28)#fraction of correct predictions
## [1] 0.4903846
knn.pred3 = knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred3, Direction.test)
##          Direction.test
## knn.pred3 Down Up
##      Down   16 19
##      Up     27 42
(16+42)/(16+19+27+42)#fraction of correct predictions
## [1] 0.5576923
knn.pred4 = knn(train.X, test.X, train.Direction, k = 4)
table(knn.pred4, Direction.test)
##          Direction.test
## knn.pred4 Down Up
##      Down   17 23
##      Up     26 38
(17+38)/(17+23+26+38)#fraction of correct predictions
## [1] 0.5288462
#Using Volume in place of Lag2
glm.train = glm(Direction ~ Volume, data = Weekly, family = binomial, subset = train) #note subset
glm.probs = predict(glm.train, Weekly.test, type = "response")#predictions
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up" #classification
Direction.test = Direction[!train] #Removed training data
table(glm.preds, Direction.test)#Confusion matrix
##          Direction.test
## glm.preds Down Up
##      Down   31 47
##      Up     12 14
(31+14)/(31+47+12+14)#fraction of correct predictions
## [1] 0.4326923
lda.train = lda(Direction ~ Volume, data = Weekly, subset = train) #note subset
lda.pred = predict(lda.train, Weekly.test)
table(lda.pred$class, Direction.test)
##       Direction.test
##        Down Up
##   Down   33 47
##   Up     10 14
(33+14)/(33+47+10+14)#fraction of correct predictions
## [1] 0.4519231
Lovaas notes: KNN with k = 3 has the highest success rate of all KNN, but still lower than GLM. Using volume in lieu of Lag2 didn’t improve prediction accuracy.
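For reference, the test-set accuracies reported above, collected in one place (values copied from the outputs shown earlier, rounded to four decimals):
accuracy = c(logistic_Lag2 = 0.6250, LDA_Lag2 = 0.6250, QDA_Lag2 = 0.5865,
             KNN3_Lag2 = 0.5577, logistic_Volume = 0.4327, LDA_Volume = 0.4519)
sort(accuracy, decreasing = TRUE)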
mpg01 = rep(0, length(Auto$mpg))
Auto1<- Auto
mpg01[Auto$mpg > median(Auto$mpg)] = 1
Auto1 = data.frame(Auto1, mpg01)
par(mfrow=c(2,2))
plot(Auto1$mpg01, Auto1$year)
plot(Auto1$mpg01, Auto1$cylinders)
plot(Auto1$mpg01, Auto1$acceleration)
plot(Auto1$mpg01, Auto1$horsepower)
Lovaas answer 11b: honestly, I didn't find too much. We'll be able to set up a model, but I doubt the R-squared values will be above 50%.
attach(Auto1)
## The following object is masked _by_ .GlobalEnv:
## 
##     mpg01
test = (year %% 5 == 0) # hold out years divisible by 5; this leaves roughly 20% of the data for testing
train = !test
train.x = Auto1[train, ]
test.x = Auto1[test, ]
mpg01.test = mpg01[test]
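A quick sanity check on the year-based split (a sketch):
mean(test) #fraction of observations held out, roughly 0.2
table(year[test]) #the held-out model years: those divisible by 5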
lda.fit=lda(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train)
lda.fit
## Call:
## lda(mpg01 ~ year + cylinders + acceleration + horsepower, data = Auto1, 
##     subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.503268 0.496732 
## 
## Group means:
##       year cylinders acceleration horsepower
## 0 74.91558  6.675325     14.74416   127.4545
## 1 77.67763  4.217105     16.48092    78.3750
## 
## Coefficients of linear discriminants:
##                      LD1
## year          0.11329929
## cylinders    -0.57588930
## acceleration -0.10862945
## horsepower   -0.01686372
lda.pred=predict(lda.fit, test.x)
mean(lda.pred$class != mpg01.test)
## [1] 0.01162791
Lovaas answer ex 11d: test error with this train/test split is 0.0116, which is lower than I expected. I guess since we're just predicting whether MPG is above or below the median, it's an easier problem to fit a model to.
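The error rate alone hides where the mistakes fall; the confusion matrix behind it (reusing lda.pred and mpg01.test) shows both error types:
table(lda.pred$class, mpg01.test) #rows are predictions, columns are the truth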
qda.fit=qda(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train)
qda.fit
## Call:
## qda(mpg01 ~ year + cylinders + acceleration + horsepower, data = Auto1, 
##     subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.503268 0.496732 
## 
## Group means:
##       year cylinders acceleration horsepower
## 0 74.91558  6.675325     14.74416   127.4545
## 1 77.67763  4.217105     16.48092    78.3750
qda.pred=predict(qda.fit,test.x)
mean(qda.pred$class !=mpg01.test)
## [1] 0.06976744
Lovaas answer 11e: Error is 0.0698, which is higher than in 11d.
glm.fit=glm(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = mpg01 ~ year + cylinders + acceleration + horsepower, 
##     family = binomial, data = Auto1, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0399  -0.1944  -0.0004   0.2986   3.2065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.54409    5.11283  -0.889  0.37413    
## year          0.32944    0.06788   4.854 1.21e-06 ***
## cylinders    -0.68342    0.23622  -2.893  0.00381 ** 
## acceleration -0.37853    0.11931  -3.173  0.00151 ** 
## horsepower   -0.11651    0.02209  -5.274 1.34e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 424.19  on 305  degrees of freedom
## Residual deviance: 151.74  on 301  degrees of freedom
## AIC: 161.74
## 
## Number of Fisher Scoring iterations: 7
glm.prob = predict(glm.fit, test.x, type = "response")
glm.pred = rep(0, length(glm.prob))
glm.pred[glm.prob > 0.5] = 1
mean(glm.pred != mpg01.test)
## [1] 0.127907
glm.pred[glm.prob > 0.6] = 1 # note: glm.pred is not reset to 0 first, so this cannot change the earlier result
mean(glm.pred != mpg01.test)
## [1] 0.127907
Lovaas answer ex 11f: test error is 12.8%. It comes out the same at the 0.6 cutoff, but only because glm.pred was not reset before the new threshold was applied, so no prediction could actually flip; see the corrected sketch below.
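To compare cutoffs properly, the prediction vector has to be rebuilt from scratch at each threshold; a minimal sketch, reusing glm.prob and mpg01.test:
for (cut in c(0.4, 0.5, 0.6)) {
  pred = as.numeric(glm.prob > cut) #fresh 0/1 predictions at this cutoff
  cat("cutoff", cut, "-> test error", mean(pred != mpg01.test), "\n")
}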
library(class)
train.X2 = cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X2 = cbind(cylinders, weight, displacement, horsepower)[test, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 1)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 10)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 50)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 100)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 200)
mean(knn.pred != mpg01.test)
## [1] 0.09302326
Lovaas answer ex 11g: many test errors, see above. k = 1 through 100 all got the same result! I also hit an error when I mistakenly asked for 500 nearest neighbors, since there are only 306 observations in my training dataset. k = 200 had the lowest error, but with a training set that small it amounts to a near-majority vote, which seems more like oversmoothing than a trustworthy optimum; see the guarded scan below.
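A guarded scan avoids both the copy-paste and the k = 500 error (a sketch; knn() fails when k exceeds the number of training rows):
set.seed(1)
ks = c(1, 5, 10, 25, 50, 100, 200, 500)
ks = ks[ks <= nrow(train.X2)] #drop any k larger than the training set
errs = sapply(ks, function(k) mean(knn(train.X2, test.X2, train.mpg01, k = k) != mpg01.test))
data.frame(k = ks, test_error = errs)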
# Set up
library(MASS)
#fix(Boston)
attach(Boston)
crime1 = rep(0, length(crim))
crime1[crim > median(crim)] = 1 #above or below median.
Boston1 = data.frame(Boston, crime1)
train = 1:(dim(Boston1)[1]/2) #train data 
test = (dim(Boston1)[1]/2 + 1):dim(Boston1)[1] #test data
Boston1.train = Boston1[train, ]
Boston1.test = Boston1[test, ]
crime1.test = crime1[test]
# Logistic Regression
glm.fit=glm(crime1~medv + tax + ptratio + black,data=Boston1,subset=train,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = crime1 ~ medv + tax + ptratio + black, family = binomial, 
##     data = Boston1, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3005  -0.6995  -0.5220   0.9033   1.9041  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.772593   3.374992   0.822 0.411355    
## medv         0.019422   0.020704   0.938 0.348204    
## tax          0.011496   0.002359   4.873  1.1e-06 ***
## ptratio      0.177576   0.079955   2.221 0.026354 *  
## black       -0.028428   0.008171  -3.479 0.000503 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 329.37  on 252  degrees of freedom
## Residual deviance: 260.52  on 248  degrees of freedom
## AIC: 270.52
## 
## Number of Fisher Scoring iterations: 6
glm.prob = predict(glm.fit, Boston1.test, type = "response")
glm.pred = rep(0, length(glm.prob))
glm.pred[glm.prob > 0.5] = 1
mean(glm.pred != crime1.test)
## [1] 0.326087
I used median home value, property tax, pupil-to-teacher ratio, and race (proportion of African American residents). All were statistically significant at the 0.05 level except median home value. The error rate was still about 33%, though, which is pretty high.
# LDA
lda.fit=lda(crime1~medv + rad + ptratio + black,data=Boston1,subset=train)
lda.pred=predict(lda.fit, Boston1.test)
mean(lda.pred$class != crime1.test)
## [1] 0.3557312
35% misclassification; not great. I used median home value, access to radial highways, pupil-to-teacher ratio, and race (proportion of African American residents).
# KNN
train.X2 = cbind(age,dis,ptratio,tax)[train, ]
test.X2 = cbind(age,dis,ptratio,tax)[test, ]
train.crim1 = crime1[train]
set.seed(1)
knn.pred = knn(train.X2, test.X2, train.crim1, k = 1)
mean(knn.pred != crime1.test)
## [1] 0.6363636
knn.pred = knn(train.X2, test.X2, train.crim1, k = 10)
mean(knn.pred != crime1.test)
## [1] 0.1225296
knn.pred = knn(train.X2, test.X2, train.crim1, k = 50)
mean(knn.pred != crime1.test)
## [1] 0.1304348
knn.pred = knn(train.X2, test.X2, train.crim1, k = 20)
mean(knn.pred != crime1.test)
## [1] 0.1185771
knn.pred = knn(train.X2, test.X2, train.crim1, k = 15)
mean(knn.pred != crime1.test)
## [1] 0.1185771
I used home age, distance to employment centers, pupil-to-teacher ratio, and property tax. The optimal model is around k = 15 to 20 nearest neighbors. That makes sense; anything much higher seems like an extreme number of neighbors. One more check worth doing is standardizing the predictors; see the sketch below.
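KNN is distance-based, so predictors on large scales (tax runs into the hundreds) dominate those on small scales (ptratio) unless the columns are standardized first. A sketch of the same comparison with scaled predictors; whether it actually helps here is an empirical question.
X = scale(cbind(age, dis, ptratio, tax)) #standardize all four predictors
set.seed(1)
knn.scaled = knn(X[train, ], X[test, ], crime1[train], k = 15)
mean(knn.scaled != crime1.test) #test error with standardized predictors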