library(ISLR)
library(corrplot)
## corrplot 0.90 loaded
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
corrplot(cor(Weekly[,-9]), method="circle")
From these summaries and the correlation plot, the only notable pattern is a strong positive relationship between Year and Volume: trading volume grows over time. The lag variables show little correlation with each other or with Today.
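A quick numeric check of that impression (a minimal sketch; output not shown):
cor(Weekly$Year, Weekly$Volume)  # correlation between Year and Volume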
attach(Weekly)
part10b_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(part10b_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The only predictor that appears statistically significant is Lag2, with a p-value of 0.0296.
partc.prob= predict(part10b_model, type='response')
partc.pred =rep("Down", length(partc.prob))
partc.pred[partc.prob > 0.5] = "Up"
table(partc.pred, Direction)
## Direction
## partc.pred Down Up
## Down 54 48
## Up 430 557
## Calculation of the percentage of correct predictions
(54+557)/(54+48+430+557)
## [1] 0.5610652
This model classifies 56.11% of the weeks correctly on the training data, though the confusion matrix shows it predicts "Up" for the large majority of weeks.
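For context, the market went up in 605 of the 1,089 weeks, so always predicting "Up" would already be correct about 55.6% of the time:
605 / (484 + 605)  # accuracy of the naive always-"Up" rule
## [1] 0.5555556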
train = (Year < 2009)                   # training data: weeks from 1990-2008
weekly_partd_model <- Weekly[!train, ]  # held-out test data: weeks from 2009-2010
partd_model<-glm(Direction~Lag2, data=Weekly,family=binomial, subset=train)
partd.prob= predict(partd_model, weekly_partd_model, type = "response")
partd.pred = rep("Down", length(partd.prob))
partd.pred[partd.prob > 0.5] = "Up"
Direction.data = Direction[!train]
table(partd.pred, Direction.data)
## Direction.data
## partd.pred Down Up
## Down 9 5
## Up 34 56
mean(partd.pred == Direction.data)
## [1] 0.625
This yields an accuracy of 62.5%.
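For comparison, the naive always-"Up" rule applied to the 2009-2010 test weeks:
mean(Direction.data == "Up")  # test-set base rate
## [1] 0.5865385
so logistic regression on Lag2 beats the base rate by only a few points.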
library(MASS)
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)  # note: lda() takes no family argument
lda.pred<-predict(lda_model, weekly_partd_model)
table(lda.pred$class, Direction.data)
## Direction.data
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class==Direction.data)
## [1] 0.625
This also yields 62.5% accuracy; the confusion matrix is identical to the one from logistic regression.
qda_model = qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred = predict(qda_model, weekly_partd_model)$class
table(qda.pred, Direction.data)
## Direction.data
## qda.pred Down Up
## Down 0 0
## Up 43 61
mean(qda.pred==Direction.data)
## [1] 0.5865385
This yields 58.65% accuracy, but the table shows QDA predicts "Up" for every test week, so this is just the test-set base rate.
library(class)
week_train = as.matrix(Lag2[train])
week_test = as.matrix(Lag2[!train])
train_direction = Direction[train]
set.seed(1)
knn.pred = knn(week_train, week_test, train_direction, k=1)
table(knn.pred, Direction.data)
## Direction.data
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == Direction.data)
## [1] 0.5
This yields an accuracy of 50%, no better than random guessing.
Of these methods, logistic regression and LDA yield the highest accuracy (62.5%).
## Regression with Interaction Lag1 and Lag2
part_j_model<-glm(Direction ~ Lag1:Lag2 + Lag2, data=Weekly,family=binomial, subset=train)
partj.prob= predict(part_j_model, weekly_partd_model, type = "response")
partj.pred = rep("Down", length(partj.prob))
partj.pred[partj.prob > 0.5] = "Up"
Direction.data = Direction[!train]
table(partj.pred, Direction.data)
## Direction.data
## partj.pred Down Up
## Down 3 3
## Up 40 58
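No accuracy was computed for this model, but it follows directly from the confusion matrix above:
(3 + 58) / (3 + 3 + 40 + 58)  # correct predictions over all test weeks
## [1] 0.5865385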
## LDA Interaction with Lag1:Lag2
ldaint_model <- lda(Direction ~ Lag1:Lag2 + Lag1, data = Weekly, subset = train)
ldaint.pred<-predict(ldaint_model, weekly_partd_model)
table(ldaint.pred$class, Direction.data)
## Direction.data
## Down Up
## Down 4 4
## Up 39 57
mean(ldaint.pred$class==Direction.data)
## [1] 0.5865385
## QDA Interaction with Lag1:Lag2
qdaint_model <- qda(Direction ~ Lag1:Lag2 + Lag1, data = Weekly, subset = train)
qdaint.pred<-predict(qdaint_model, weekly_partd_model)
table(qdaint.pred$class, Direction.data)
## Direction.data
## Down Up
## Down 19 36
## Up 24 25
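Again the accuracy follows from the confusion matrix:
(19 + 25) / (19 + 36 + 24 + 25)  # correct predictions over all test weeks
## [1] 0.4230769
which is worse than random guessing on this test set.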
## k = 10
knn_train=as.matrix(Lag2[train])
knn_test=as.matrix(Lag2[!train])
knn_train_direction =Direction[train]
set.seed(1)
knn35.pred=knn(knn_train, knn_test,knn_train_direction,k=10)
table(knn35.pred,Direction.data)
## Direction.data
## knn35.pred Down Up
## Down 17 21
## Up 26 40
mean(knn35.pred == Direction.data)
## [1] 0.5480769
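Rather than testing one k at a time, a short loop can scan several values; this sketch reuses the objects defined in the chunk above (output not shown):
set.seed(1)
for (k in c(1, 5, 10, 35, 50)) {
  pred <- knn(knn_train, knn_test, knn_train_direction, k = k)
  cat("k =", k, "accuracy =", mean(pred == Direction.data), "\n")
}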
library(ISLR)
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto = data.frame(Auto, mpg01)
corrplot(cor(Auto[,-9]), method="circle")
Cylinders, displacement, and weight all appear to correlate strongly and negatively with mpg01; horsepower also seems to correlate negatively.
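A numeric check of these impressions (a minimal sketch; output not shown) pulls the mpg01 row out of the correlation matrix, dropping the non-numeric name column:
round(cor(Auto[, -9])["mpg01", ], 2)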
train_split <- (year %% 2 == 0)
train_auto <- Auto[train_split,]
test_auto <- Auto[!train_split, ]  # a logical vector is negated with !, not -
autolda <- lda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train_auto)
autolda_pred <- predict(autolda, test_auto)
table(autolda_pred$class, test_auto$mpg01)
##
## 0 1
## 0 169 7
## 1 26 189
mean(autolda_pred$class != test_auto$mpg01)
## [1] 0.08439898
The test error is 8.44%.
autoqda <- qda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train_auto)
autoqda_pred <- predict(autoqda, test_auto)
table(autoqda_pred$class, test_auto$mpg01)
##
## 0 1
## 0 176 20
## 1 19 176
mean(autoqda_pred$class != test_auto$mpg01)
## [1] 0.09974425
The test error is 9.97%.
auto_log_model<-glm(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train_auto,family=binomial)
auto_probs = predict(auto_log_model, test_auto, type = "response")
auto_pred = rep(0, length(auto_probs))
auto_pred[auto_probs > 0.5] = 1
table(auto_pred, test_auto$mpg01)
##
## auto_pred 0 1
## 0 174 12
## 1 21 184
mean(auto_pred != test_auto$mpg01)
## [1] 0.08439898
The test error rate is 8.44%.
# K = 5
train_k5 <- cbind(displacement, horsepower, weight, cylinders, year, origin)[train_split, ]
test_k5 <- cbind(displacement, horsepower, weight, cylinders, year, origin)[!train_split, ]
set.seed(1)
autok5_pred <- knn(train_k5, test_k5, train_auto$mpg01, k = 5)
mean(autok5_pred != test_auto$mpg01)
This chunk originally failed because it indexed with train, the length-1089 logical vector left over from the Weekly exercise, and used - instead of ! to negate it; indexing with train_split instead fixes both problems.
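One refinement worth sketching: KNN relies on Euclidean distance, so predictors on large scales such as weight can dominate the distance, and standardizing the predictors first often helps. The scaled_X object below is new, not part of the analysis above:
# Standardize predictors so no single variable dominates the distance
scaled_X <- scale(cbind(displacement, horsepower, weight, cylinders, year, origin))
set.seed(1)
autok5_scaled <- knn(scaled_X[train_split, ], scaled_X[!train_split, ], train_auto$mpg01, k = 5)
mean(autok5_scaled != test_auto$mpg01)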
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
Creating a binary variable, crime01, from crim:
crime01 <- rep(0, length(crim))
crime01[crim > median(crim)] <- 1
Boston= data.frame(Boston,crime01)
Splitting the data into training and test sets:
train = 1:(dim(Boston)[1]/2)
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
train_boston = Boston[train, ]
test_boston = Boston[test, ]
crime01_test = crime01[test]
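One caveat: this split takes the first 253 rows for training, and the rows of Boston may not be in random order. A random split, sketched below with a hypothetical rand_train index, would guard against that; the analysis below keeps the original split:
set.seed(1)
rand_train <- sample(nrow(Boston), nrow(Boston) / 2)  # random half of the row indices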
Correlation plot of the variables:
corrplot(cor(Boston), method="circle")
Logistic Regression
set.seed(100)
boston_log <-glm(crime01~ indus+nox+age+dis+rad+tax, data= train_boston,family=binomial)
boston_probs = predict(boston_log, test_boston, type = "response")
boston_pred = rep(0, length(boston_probs))
boston_pred[boston_probs > 0.5] = 1
table(boston_pred, crime01_test)
## crime01_test
## boston_pred 0 1
## 0 75 8
## 1 15 155
mean(boston_pred != crime01_test)
## [1] 0.09090909
LDA
boston_lda <- lda(crime01 ~ indus + nox + age + dis + rad + tax, data = train_boston)
bostonlda_pred = predict(boston_lda, test_boston)
table(bostonlda_pred$class, crime01_test)
## crime01_test
## 0 1
## 0 81 18
## 1 9 145
mean(bostonlda_pred$class != crime01_test)
## [1] 0.1067194
K Nearest Neighbors
### k = 10
train_k10=cbind(indus,nox,age,dis,rad,tax)[train,]
test_k10=cbind(indus,nox,age,dis,rad,tax)[test,]
set.seed(1)
boston_knn_pred <- knn(train_k10, test_k10, crime01[train], k = 10)  # cl = labels of the training rows
table(boston_knn_pred,crime01_test)
## crime01_test
## boston_knn_pred 0 1
## 0 39 10
## 1 51 153
mean(boston_knn_pred !=crime01_test)
## [1] 0.2411067
### k = 20
train_k20=cbind(indus,nox,age,dis,rad,tax)[train,]
test_k20=cbind(indus,nox,age,dis,rad,tax)[test,]
set.seed(1)
boston_knn_pred20 <- knn(train_k20, test_k20, crime01[train], k = 20)  # cl = labels of the training rows
table(boston_knn_pred20,crime01_test)
## crime01_test
## boston_knn_pred20 0 1
## 0 41 17
## 1 49 146
mean(boston_knn_pred20 !=crime01_test)
## [1] 0.2608696
### k = 50
train_k50=cbind(indus,nox,age,dis,rad,tax)[train,]
test_k50=cbind(indus,nox,age,dis,rad,tax)[test,]
set.seed(1)
boston_knn_pred50 <- knn(train_k50, test_k50, crime01[train], k = 50)  # cl = labels of the training rows
table(boston_knn_pred50,crime01_test)
## crime01_test
## boston_knn_pred50 0 1
## 0 38 13
## 1 52 150
mean(boston_knn_pred50 !=crime01_test)
## [1] 0.256917
Of the models examined, logistic regression yielded the lowest test error, 9.09%, with LDA close behind at 10.67%. The KNN models yielded the highest errors, around 24-26%, and increasing k from 10 to 50 did not help.