library(ISLR)
- Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
cor(subset(Weekly, select = -Direction))
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Year and Volume appear to have a relationship.
- Do any of the predictors appear to be statistically significant ? If so, which ones ?
logit_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 seems to be the only variable to have some statistical significance with a p-value of less than 0.05
- Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
- Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
dim(Weekly)
## [1] 1089 9
train=Weekly$Year <= 2008
Weekly.test=Weekly[!train,]
logit.fit = glm(Direction ~ Lag2, family=binomial, data=Weekly, subset=train)
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
summary(logit.fit)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
glm.probs=predict(logit.fit,Weekly.test,type="response")
glm.pred=rep("Down",nrow(Weekly.test))
glm.pred[glm.probs > 0.50]="Up"
table(glm.pred,Weekly.test$Direction)
##
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred==Weekly.test$Direction)
## [1] 0.625
The percentage of correct predictions is 62.5%, 37.5% is the test error rate.
- Repeat (d) using LDA.
library(MASS)
fit.lda <- lda(Direction ~ Lag2, data = Weekly, subset = train)
fit.lda
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
pred.lda <- predict(fit.lda, Weekly)
- Repeat (d) using QDA.
fit.qda <- qda(Direction ~ Lag2, data = Weekly, subset = train)
fit.qda
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
- Repeat (d) using KNN with K = 1.
library(class)
train.X=Weekly[train,"Lag2",drop=F]
test.X=Weekly[!train,"Lag2",drop=F]
train.Direction=Weekly[train,"Direction",drop=T]
test.Direction=Weekly[!train,"Direction",drop=T]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,test.Direction)
## test.Direction
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==test.Direction)
## [1] 0.5
- Which of these methods appears to provide the best results on this data?
Logistic Regression and Linear discriminant analysis have the highest accuracy rates, both have rates of 62.5%.
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=4)
table(knn.pred,test.Direction)
## test.Direction
## knn.pred Down Up
## Down 20 17
## Up 23 44
mean(knn.pred==test.Direction)
## [1] 0.6153846
qda.fit = qda(Direction ~ Lag2, data=Weekly, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit,Weekly.test)$class
table(qda.class,Weekly.test$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Weekly.test$Direction)
## [1] 0.5865385
train=Weekly$Year <= 2008
Weekly.test=Weekly[!train,]
logit.fit = glm(Direction ~ Lag1+Lag2+Volume, family=binomial, data=Weekly, subset=train)
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
summary(logit.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Volume, family = binomial,
## data = Weekly, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4681 -1.2581 0.9929 1.0840 1.5339
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.29792 0.09136 3.261 0.00111 **
## Lag1 -0.05975 0.02917 -2.048 0.04054 *
## Lag2 0.04774 0.02941 1.624 0.10446
## Volume -0.07093 0.05263 -1.348 0.17777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1345.1 on 981 degrees of freedom
## AIC: 1353.1
##
## Number of Fisher Scoring iterations: 4
glm.probs=predict(logit.fit,Weekly.test,type="response")
glm.pred=rep("Down",nrow(Weekly.test))
glm.pred[glm.probs > 0.50]="Up"
table(glm.pred,Weekly.test$Direction)
##
## glm.pred Down Up
## Down 27 33
## Up 16 28
mean(glm.pred==Weekly.test$Direction)
## [1] 0.5288462
library(ISLR)
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto = data.frame(Auto, mpg01)
cor(Auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
some association exists between “mpg01” and “cylinders”, “weight”, “displacement” and “horsepower”.
- Split the data into a training set and a test set
train <- (year %% 2 == 0)
train.auto <- Auto[train,]
test.auto <- Auto[-train,]
(d)What is the test error of the model obtained?
autolda.fit <- lda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto)
autolda.pred <- predict(autolda.fit, test.auto)
table(autolda.pred$class, test.auto$mpg01)
##
## 0 1
## 0 169 7
## 1 26 189
test error rate of 8.44%
(e)What is the test error of the model obtained?
autoqda.fit <- qda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto)
autoqda.pred <- predict(autoqda.fit, test.auto)
table(autoqda.pred$class, test.auto$mpg01)
##
## 0 1
## 0 176 20
## 1 19 176
test error rate of 9.97%
(f)What is the test error of the model obtained?
auto.fit<-glm(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto,family=binomial)
auto.probs = predict(auto.fit, test.auto, type = "response")
auto.pred = rep(0, length(auto.probs))
auto.pred[auto.probs > 0.5] = 1
table(auto.pred, test.auto$mpg01)
##
## auto.pred 0 1
## 0 174 12
## 1 21 184
test error rate of 8.44%
(g)What test errors do you obtain? Which value of K seems to perform the best on this data set?
k=1
train.K= cbind(displacement,horsepower,weight,cylinders,year, origin)[train,]
test.K=cbind(displacement,horsepower,weight,cylinders, year, origin)[-train,]
set.seed(1)
autok.pred=knn(train.K,test.K,train.auto$mpg01,k=1)
mean(autok.pred != test.auto$mpg01)
## [1] 0.07161125
k=5
autok.pred=knn(train.K,test.K,train.auto$mpg01,k=5)
mean(autok.pred != test.auto$mpg01)
## [1] 0.112532
k=10
autok.pred=knn(train.K,test.K,train.auto$mpg01,k=10)
mean(autok.pred != test.auto$mpg01)
## [1] 0.1253197
K=1 has the lowest error rate of 7.16%. When k increases so does the error rate for this particular model.
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
crime01 <- rep(0, length(crim))
crime01[crim > median(crim)] <- 1
Boston= data.frame(Boston,crime01)
train = 1:(dim(Boston)[1]/2)
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
Boston$crim01 <- as.numeric(Boston$crim > median(Boston$crim))
set.seed(1)
Boston.fit <-glm(crime01~ indus+nox+age+dis+rad+tax, data=Boston.train,family=binomial)
Boston.probs = predict(Boston.fit, Boston.test, type = "response")
Boston.pred = rep(0, length(Boston.probs))
Boston.pred[Boston.probs > 0.5] = 1
summary(Boston.fit)
##
## Call:
## glm(formula = crime01 ~ indus + nox + age + dis + rad + tax,
## family = binomial, data = Boston.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.97810 -0.21406 -0.03454 0.47107 3.04502
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.214032 7.617440 -5.542 2.99e-08 ***
## indus -0.213126 0.073236 -2.910 0.00361 **
## nox 80.868029 16.066473 5.033 4.82e-07 ***
## age 0.003397 0.012032 0.282 0.77772
## dis 0.307145 0.190502 1.612 0.10690
## rad 0.847236 0.183767 4.610 4.02e-06 ***
## tax -0.013760 0.004956 -2.777 0.00549 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.37 on 252 degrees of freedom
## Residual deviance: 144.44 on 246 degrees of freedom
## AIC: 158.44
##
## Number of Fisher Scoring iterations: 8
Logistic Regression
fit.glm1 <- glm(crim01 ~ . - crim01 - crim, data = Boston, family = binomial)
## Warning: glm.fit: algorithm did not converge
fit.glm1
##
## Call: glm(formula = crim01 ~ . - crim01 - crim, family = binomial,
## data = Boston)
##
## Coefficients:
## (Intercept) zn indus chas nox rm
## -2.657e+01 -1.142e-09 -2.471e-09 -8.194e-08 -6.590e-06 1.148e-07
## age dis rad tax ptratio black
## 3.250e-09 -7.410e-08 2.814e-08 2.812e-10 7.009e-08 3.904e-10
## lstat medv crime01
## 1.393e-08 6.958e-10 5.313e+01
##
## Degrees of Freedom: 505 Total (i.e. Null); 491 Residual
## Null Deviance: 701.5
## Residual Deviance: 2.936e-09 AIC: 30
test error rate of 12.5%
Linear Discriminant Analysis
Boston.ldafit <-lda(crime01~ indus+nox+age+dis+rad+tax, data=Boston.train,family=binomial)
Bostonlda.pred = predict(Boston.ldafit, Boston.test)
fit.lda <- lda(crim01 ~ nox + indus + age + rad , data = Boston)
pred.lda <- predict(fit.lda, Boston.test)
test error rate of 15.1%
KNN
data = scale(Boston[,-c(1,15)])
set.seed(1234)
train <- sample(1:dim(Boston)[1], dim(Boston)[1]*.7, rep=FALSE)
test <- -train
training_data = data[train, c("nox" , "indus" , "age" , "rad")]
testing_data = data[test, c("nox" , "indus" , "age" , "rad")]
train.crime01 = Boston$crim01[train]
test.crime01= Boston$crim01[test]
library(class)
set.seed(1234)
knn_pred_y = knn(training_data, testing_data, train.crime01, k = 1)
table(knn_pred_y, test.crime01)
## test.crime01
## knn_pred_y 0 1
## 0 67 6
## 1 8 71
mean(knn_pred_y != test.crime01)
## [1] 0.09210526
Test error rate 9.21% for K=1 Logistic Regression has the lowest test error rate.