library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
fix(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
par(mfrow=c(2,2))
plot(Weekly$Lag1, Weekly$Year)
plot(Weekly$Lag2, Weekly$Year)
plot(Weekly$Lag3, Weekly$Year)
plot(Weekly$Lag4, Weekly$Year)
plot(Weekly$Lag1, Weekly$Lag2)
plot(Weekly$Lag2, Weekly$Lag3)
plot(Weekly$Lag3, Weekly$Lag4)
plot(Weekly$Lag4, Weekly$Lag5)
par(mfrow=c(1,1))
plot(Weekly$Today, Weekly$Volume)
Lovaas answer ex 10a: there aren’t terribly strong linear patterns, and I’m not sure what we’re going to do with this data. Volume tends to increase a bit when returns are high, and vise versa.
glm.fits=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lovaas answer ex 10b Only lag2 appears to be significant, which is a bummer.
glm.probs = predict(glm.fits, type = "response")
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up"
attach(Weekly)
table(glm.preds, Direction)#confusion matrix
## Direction
## glm.preds Down Up
## Down 54 48
## Up 430 557
(54+557)/(54+48+430+557) #56% accuracy.
## [1] 0.5610652
The confusion matrix is telling us which observations were classified by the model as down or up vs which were actually down and up. The market was down (54 + 430) times, but the model only predicted it being down 54 times. The model was better at predicting market up - the market was actually up (48 + 557) times and the model predicted that 557 times.
train = (Year < 2009) #train dataset
Weekly.test = Weekly[!train, ] #just 2009 and 2010
glm.train = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train) #note subset
glm.probs = predict(glm.train, Weekly.test, type = "response")#predictions
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up" #classification
Direction.test = Direction[!train] #Removed training data
table(glm.preds, Direction.test) #Confusion matrix
## Direction.test
## glm.preds Down Up
## Down 9 5
## Up 34 56
(9+56)/(9+5+34+56) #fraction of correct predictions
## [1] 0.625
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
lda.train = lda(Direction ~ Lag2, data = Weekly, subset = train) #note subset
lda.pred = predict(lda.train, Weekly.test)
table(lda.pred$class, Direction.test)
## Direction.test
## Down Up
## Down 9 5
## Up 34 56
(9+56)/(9+5+34+56)#fraction of correct predictions
## [1] 0.625
Lovaas note ex 10e: very suspicious how the confusion matrix was identical to 10d.
qda.train = qda(Direction ~ Lag2, data = Weekly, subset = train) #note subset
qda.pred = predict(qda.train, Weekly.test)
table(qda.pred$class, Direction.test)
## Direction.test
## Down Up
## Down 0 0
## Up 43 61
(0+61)/(0+0+43+61)#fraction of correct predictions
## [1] 0.5865385
Lovaas note ex 10f: Don’t love how the model predicted 0 “down” weeks.
library(class)
## Warning: package 'class' was built under R version 4.0.5
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(21)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.test)
## Direction.test
## knn.pred Down Up
## Down 21 29
## Up 22 32
(21+32)/(21+29+22+32)#fraction of correct predictions
## [1] 0.5096154
51% accuracy; that can vary a tiny bit based on the set.seed()
#playing around with diff values of
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(21)
knn.pred2 = knn(train.X, test.X, train.Direction, k = 2)
table(knn.pred2, Direction.test)
## Direction.test
## knn.pred2 Down Up
## Down 23 33
## Up 20 28
(23+28)/(23+33+20+28)#fraction of correct predictions
## [1] 0.4903846
knn.pred3 = knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred3, Direction.test)
## Direction.test
## knn.pred3 Down Up
## Down 16 19
## Up 27 42
(16+42)/(16+19+27+42)#fraction of correct predictions
## [1] 0.5576923
knn.pred4 = knn(train.X, test.X, train.Direction, k = 4)
table(knn.pred4, Direction.test)
## Direction.test
## knn.pred4 Down Up
## Down 17 23
## Up 26 38
(17+38)/(17+23+26+38)#fraction of correct predictions
## [1] 0.5288462
#Using volume in place of Lag2
glm.train = glm(Direction ~ Volume, data = Weekly, family = binomial, subset = train) #note subset
glm.probs = predict(glm.train, Weekly.test, type = "response")#predictions
glm.preds = rep("Down", length(glm.probs))
glm.preds[glm.probs > 0.5] = "Up" #classification
Direction.test = Direction[!train] #Removed training data
table(glm.preds, Direction.test)#Confusion matrix
## Direction.test
## glm.preds Down Up
## Down 31 47
## Up 12 14
(31+14)/(31+47+12+14)#fraction of correct predictions
## [1] 0.4326923
lda.train = lda(Direction ~ Volume, data = Weekly, subset = train) #note subset
lda.pred = predict(lda.train, Weekly.test)
table(lda.pred$class, Direction.test)
## Direction.test
## Down Up
## Down 33 47
## Up 10 14
(33+14)/(33+47+10+14)#fraction of correct predictions
## [1] 0.4519231
Lovaas notes: KNN with k = 3 has the highest success rate of all KNN, but still lower than GLM. Using volume in lieu of Lag2 didn’t improve prediction accuracy.
mpg01 = rep(0, length(Auto$mpg))
Auto1<- Auto
mpg01[Auto$mpg > median(Auto$mpg)] = 1
Auto1 = data.frame(Auto1, mpg01)
par(mfrow=c(2,2))
plot(Auto1$mpg01, Auto1$Year)
plot(Auto1$mpg01, Auto1$Cylinders)
plot(Auto1$mpg01, Auto1$Acceleration)
plot(Auto1$mpg01, Auto1$Horsepower)
Lovaas answer 11b: honestly, I didn’t find too much. We’ll be able to set up a model but I doubt the rsquared values will be above 50%.
attach(Auto1)
## The following object is masked _by_ .GlobalEnv:
##
## mpg01
test = (year%%5 == 0) # if the year is divisible by 5; this leaves 20% of the data for testing.
train = !test
train.x = Auto1[train, ]
test.x = Auto1[test, ]
mpg01.test = mpg01[test]
lda.fit=lda(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train)
lda.fit
## Call:
## lda(mpg01 ~ year + cylinders + acceleration + horsepower, data = Auto1,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.503268 0.496732
##
## Group means:
## year cylinders acceleration horsepower
## 0 74.91558 6.675325 14.74416 127.4545
## 1 77.67763 4.217105 16.48092 78.3750
##
## Coefficients of linear discriminants:
## LD1
## year 0.11329929
## cylinders -0.57588930
## acceleration -0.10862945
## horsepower -0.01686372
lda.pred=predict(lda.fit, test.x)
mean(lda.pred$class != mpg01.test)
## [1] 0.01162791
Lovaas answer ex 11d: test error using this size training/test data is 0.0116, which is lower than I expected. I guess since we’re just testing whether MPG is below or above the median it’s easier to fit a model.
qda.fit=qda(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train)
qda.fit
## Call:
## qda(mpg01 ~ year + cylinders + acceleration + horsepower, data = Auto1,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.503268 0.496732
##
## Group means:
## year cylinders acceleration horsepower
## 0 74.91558 6.675325 14.74416 127.4545
## 1 77.67763 4.217105 16.48092 78.3750
qda.pred=predict(qda.fit,test.x)
mean(qda.pred$class !=mpg01.test)
## [1] 0.06976744
Lovaas answer 11e: Error is 0.0698, which is higher than in 11d.
glm.fit=glm(mpg01~year + cylinders + acceleration + horsepower,data=Auto1,subset=train,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ year + cylinders + acceleration + horsepower,
## family = binomial, data = Auto1, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0399 -0.1944 -0.0004 0.2986 3.2065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.54409 5.11283 -0.889 0.37413
## year 0.32944 0.06788 4.854 1.21e-06 ***
## cylinders -0.68342 0.23622 -2.893 0.00381 **
## acceleration -0.37853 0.11931 -3.173 0.00151 **
## horsepower -0.11651 0.02209 -5.274 1.34e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 424.19 on 305 degrees of freedom
## Residual deviance: 151.74 on 301 degrees of freedom
## AIC: 161.74
##
## Number of Fisher Scoring iterations: 7
glm.prob = predict(glm.fit, test.x, type = "response")
glm.pred = rep(0, length(glm.prob))
glm.pred[glm.prob > 0.5] = 1
mean(glm.pred != mpg01.test)
## [1] 0.127907
glm.pred[glm.prob > 0.6] = 1
mean(glm.pred != mpg01.test)
## [1] 0.127907
Lovaas answer ex 11f: test error is 12.8%; regardless of where I set the classification (0.5 or 0.6)
library(class)
train.X2 = cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X2 = cbind(cylinders, weight, displacement, horsepower)[test, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 1)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 10)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 50)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 100)
mean(knn.pred != mpg01.test)
## [1] 0.1046512
knn.pred = knn(train.X2, test.X2, train.mpg01, k = 200)
mean(knn.pred != mpg01.test)
## [1] 0.09302326
Lovaas answer ex 11g: many test errors, see above. 1 through 100 all got the same result! I got an error where I mistakenly selected 500 nearest neighbors since there are only 306 data items in my training dataset. k=200 seemed to work best but I feel like that’s overfitting.
# Set up
library(MASS)
#fix(Boston)
attach(Boston)
crime1 = rep(0, length(crim))
crime1[crim > median(crim)] = 1 #above or below median.
Boston1 = data.frame(Boston, crime1)
train = 1:(dim(Boston1)[1]/2) #train data
test = (dim(Boston1)[1]/2 + 1):dim(Boston1)[1] #test data
Boston1.train = Boston1[train, ]
Boston1.test = Boston1[test, ]
crime1.test = crime1[test]
# Logistic Regression
glm.fit=glm(crime1~medv + tax + ptratio + black,data=Boston1,subset=train,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = crime1 ~ medv + tax + ptratio + black, family = binomial,
## data = Boston1, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3005 -0.6995 -0.5220 0.9033 1.9041
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.772593 3.374992 0.822 0.411355
## medv 0.019422 0.020704 0.938 0.348204
## tax 0.011496 0.002359 4.873 1.1e-06 ***
## ptratio 0.177576 0.079955 2.221 0.026354 *
## black -0.028428 0.008171 -3.479 0.000503 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.37 on 252 degrees of freedom
## Residual deviance: 260.52 on 248 degrees of freedom
## AIC: 270.52
##
## Number of Fisher Scoring iterations: 6
glm.prob = predict(glm.fit, test.x, type = "response")
## Warning: 'newdata' had 86 rows but variables found have 506 rows
glm.pred = rep(0, length(glm.prob))
glm.pred[glm.prob > 0.5] = 1
mean(glm.pred != crime1.test)
## [1] 0.326087
I used median home value, property tax, pupil to teacher ratio, and race (proportion of african american citizens). All were statistically significant (assuming p = 0.05) except for median home value. The error rate was still 33% though, which is pretty high.
# LDA
lda.fit=lda(crime1~medv + rad + ptratio + black,data=Boston1,subset=train)
lda.pred=predict(lda.fit, test.x)
## Warning: 'newdata' had 86 rows but variables found have 506 rows
mean(lda.pred$class != crime1.test)
## [1] 0.3557312
35% misclassification; not great. I used median home value, access to radial highways, pupil to teacher ratio, and race (proportion of african american citizens).
# KNN
train.X2 = cbind(age,dis,ptratio,tax)[train, ]
test.X2 = cbind(age,dis,ptratio,tax)[test, ]
train.crim1 = crime1[train]
set.seed(1)
knn.pred = knn(train.X2, test.X2, train.crim1, k = 1)
mean(knn.pred != crime1.test)
## [1] 0.6363636
knn.pred = knn(train.X2, test.X2, train.crim1, k = 10)
mean(knn.pred != crime1.test)
## [1] 0.1225296
knn.pred = knn(train.X2, test.X2, train.crim1, k = 50)
mean(knn.pred != crime1.test)
## [1] 0.1304348
knn.pred = knn(train.X2, test.X2, train.crim1, k = 20)
mean(knn.pred != crime1.test)
## [1] 0.1185771
knn.pred = knn(train.X2, test.X2, train.crim1, k = 15)
mean(knn.pred != crime1.test)
## [1] 0.1185771
I used home age,distance to employment centers, pupil to teach ratio, and property tax. The optimal model is around k = 15 to 20 (nearest neighbors)That makes sense; anything higher seems like an extreme number of neighbors.