Weekly Dataset
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
corrplot(cor(Weekly[,-9]))
glm.fit1 = glm(Direction~.-Year - Today, Weekly , family = "binomial")
summary(glm.fit1)
##
## Call:
## glm(formula = Direction ~ . - Year - Today, family = "binomial",
## data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
glm.prob1 = predict(glm.fit1, type="response")
glm.pred1=rep("Down",1089)
glm.pred1[glm.prob1>.5]="Up"
table(glm.pred1,Weekly$Direction)
##
## glm.pred1 Down Up
## Down 54 48
## Up 430 557
mean(glm.pred1==Weekly$Direction)
## [1] 0.5610652
Logistic regression correctly predicted the movement of the market 56.3% of the time. It is a little bit better than random guessing and the training error rate is 47.7%. This error is resulted from training and testing the model on a same data and is optimistic as it tends to underestimate the test error rate.
The confussion matrix also tells us that when the markers are up, the model appear to predict it correctly most of the time (557/(557+48) = 92%). On the other hand, when markets are down, the model gets it right only about 11%.
train = (Weekly$Year<2008)
Weekly.2009 = Weekly[!train,] # test
Direction.2009 = Weekly$Direction[!train] # overall fraction of correct predictions
glm.fit2 = glm(Direction ~ Lag2, data=Weekly ,family =binomial , subset =train )
glm2.probs = predict(glm.fit2,newdata = Weekly.2009, type="response")
glm2.pred = rep("Down", 156)
glm2.pred[glm2.probs > 0.5] = "Up"
table(glm2.pred,Direction.2009)
## Direction.2009
## glm2.pred Down Up
## Down 7 5
## Up 65 79
mean(glm2.pred ==Direction.2009) # overall fraction of correct predictions
## [1] 0.5512821
lda.fit = lda(Direction ~ Lag2, data=Weekly , subset =train)
lda.pred = predict(lda.fit,Weekly.2009)
lda.class = lda.pred$class
table(lda.class,Direction.2009)
## Direction.2009
## lda.class Down Up
## Down 6 5
## Up 66 79
mean(lda.class ==Direction.2009)
## [1] 0.5448718
qda.fit = qda(Direction ~ Lag2, data=Weekly, subset =train)
qda.pred = predict(qda.fit,Weekly.2009)
qda.class = qda.pred$class
table(qda.class,Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 0 0
## Up 72 84
mean(qda.class ==Direction.2009)
## [1] 0.5384615
train.X = data.frame(Weekly[train,]$Lag2)
test.X = data.frame(Weekly[!train,]$Lag2)
train.Direction = Weekly$Direction[train]
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=1)
table(knn.pred,Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 32 38
## Up 40 46
mean(knn.pred ==Direction.2009)
## [1] 0.5
The logistic regression provided the best the overall fraction of correct predictions on the holdout data at 55.13%.
glm.fit3 = glm(Direction ~ Lag1*Lag2, data=Weekly ,family =binomial , subset =train )
glm3.probs = predict(glm.fit3, newdata = Weekly.2009, type="response")
glm3.pred = rep("Down", 156)
glm3.pred[glm3.probs > 0.5] = "Up"
table(glm3.pred, Direction.2009)
## Direction.2009
## glm3.pred Down Up
## Down 10 11
## Up 62 73
mean(glm3.pred == Direction.2009)
## [1] 0.5320513
lda.fit1 = lda(Direction ~ Lag1:Lag2, data=Weekly , subset =train)
lda1.pred = predict(lda.fit1,Weekly.2009)
lda1.class = lda1.pred$class
table(lda1.class,Direction.2009)
## Direction.2009
## lda1.class Down Up
## Down 0 1
## Up 72 83
mean(lda1.class == Direction.2009)
## [1] 0.5320513
qda.fit1 = qda(Direction ~ Lag1 + poly(Lag2,3), data=Weekly, subset =train)
qda1.pred = predict(qda.fit1,Weekly.2009)
qda1.class = qda1.pred$class
table(qda1.class,Direction.2009)
## Direction.2009
## qda1.class Down Up
## Down 55 64
## Up 17 20
mean(qda1.class == Direction.2009)
## [1] 0.4807692
set.seed(1)
error = rep(1,5) # Initiate an empty vector to store computed test errors
for (i in seq(1,5,1)) { # Set up for loop to increment from 1 to ten
auto.knn.fit=knn(train.X, test.X, train.Direction, k=1) # Fit knn
error[i]=mean(auto.knn.fit==Direction.2009)
}
error # Print out the resulting test error
## [1] 0.5000000 0.5000000 0.4871795 0.4935897 0.4807692
According to my experiment with different variable combinations, transformation, interaction, and k-values, the best results on the held out data appears to come from logistic regression and LDA models with Lag1 and Lag2 interaction at 53.21% correct prediction of the market direction. It is however lower than the correct market direction predicted percentage by the logistic regression model with Lag2 as the only predictor.
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
corrplot(cor(Auto[,-9]))
ggplot(Auto,aes(weight,mpg01)) +
geom_point(aes(color=as.factor(mpg01)))
ggplot(Auto,aes(horsepower,mpg01)) +
geom_point(aes(color=as.factor(mpg01)))
ggplot(Auto,aes(horsepower)) + geom_boxplot(aes(color=as.factor(mpg01)))
The variables cylinders, displacement, horsepower and weight seem to be most likely to be useful in predicting mpg01.
set.seed(1)
atrain = sample(nrow(Auto), 0.5*nrow(Auto)) #split data in half
autotest = Auto[-atrain,] #train set
autotrain = Auto[atrain,] #test set
mpg01.test = autotest$mpg01
auto.lda.fit = lda(mpg01 ~ displacement + weight + horsepower + cylinders,
data=autotrain)
auto.lda.pred = predict(auto.lda.fit, autotest)
auto.lda.class = auto.lda.pred$class
table(auto.lda.class, mpg01.test)
## mpg01.test
## auto.lda.class 0 1
## 0 83 6
## 1 19 88
mean(auto.lda.class != mpg01.test)
## [1] 0.127551
auto.qda.fit = qda(mpg01 ~ displacement + weight + horsepower + cylinders,
data=autotrain)
auto.qda.pred = predict(auto.qda.fit, autotest)
auto.qda.class = auto.qda.pred$class
table(auto.qda.class, mpg01.test)
## mpg01.test
## auto.qda.class 0 1
## 0 89 10
## 1 13 84
mean(auto.qda.class != mpg01.test)
## [1] 0.1173469
auto.glm.fit = glm(mpg01 ~ displacement + weight + horsepower + cylinders,
data=Auto ,family = binomial, subset = atrain)
auto.glm.prob = predict(auto.glm.fit, autotest, type="response")
auto.glm.pred = rep("0",nrow(Auto))
auto.glm.pred[auto.glm.prob>.5] = "1"
mean(auto.glm.pred != mpg01.test)
## [1] 0.122449
atrain.X = data.frame(autotrain[,2:5])
atest.X = data.frame(autotest[,2:5])
I will iterate 10 k-values in to check on the k value that performs best with this data.
set.seed(1)
error_k = rep(1,10)
for (i in seq(1,10,1)) {
auto.knn.fit=knn(atrain.X, atest.X, mpg01.test, k = i)
error_k[i]= mean(auto.knn.fit != mpg01.test)
}
error_k
## [1] 0.5102041 0.5051020 0.5204082 0.5000000 0.4846939 0.5102041 0.4336735
## [8] 0.4489796 0.4132653 0.4234694
k=9 seems to perform best with this data set among the KNN models with test error of 41.32%.
Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
summary(Boston)
Boston$crim.med <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
boston = Boston
set.seed(1)
btrain = sample(nrow(Boston), 0.5*nrow(Boston), ) #split data in half
bostontest = Boston[-btrain,] #train set
bostontrain = Boston[btrain,] #test set
crim.test = bostontest$crim.med
boston.glm = glm(crim.med~.-crim, data = bostontrain, family = binomial())
summary(boston.glm)
##
## Call:
## glm(formula = crim.med ~ . - crim, family = binomial(), data = bostontrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6596 -0.1582 -0.0002 0.0022 3.7586
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -40.760938 9.177609 -4.441 8.94e-06 ***
## zn -0.119686 0.054901 -2.180 0.029255 *
## indus -0.001286 0.061838 -0.021 0.983414
## chas 0.799622 1.100037 0.727 0.467284
## nox 46.304574 10.423302 4.442 8.90e-06 ***
## rm -0.870265 1.082365 -0.804 0.421374
## age 0.041053 0.017749 2.313 0.020725 *
## dis 1.206786 0.349609 3.452 0.000557 ***
## rad 0.650511 0.225016 2.891 0.003841 **
## tax -0.003316 0.003640 -0.911 0.362317
## ptratio 0.476177 0.193469 2.461 0.013845 *
## black -0.007975 0.005461 -1.460 0.144249
## lstat 0.043148 0.074055 0.583 0.560135
## medv 0.242090 0.109941 2.202 0.027665 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 350.73 on 252 degrees of freedom
## Residual deviance: 105.48 on 239 degrees of freedom
## AIC: 133.48
##
## Number of Fisher Scoring iterations: 9
boston.glm.prob = predict(boston.glm, bostontest, type="response")
boston.glm.pred = rep("0",nrow(bostontest))
boston.glm.pred[boston.glm.prob>.5] = "1"
table(boston.glm.pred, crim.test)
## crim.test
## boston.glm.pred 0 1
## 0 116 15
## 1 10 112
mean(boston.glm.pred != crim.test)
## [1] 0.09881423
boston.lda.fit = lda(crim.med ~ .-crim, data=bostontrain)
boston.lda.pred = predict(boston.lda.fit, bostontest)
boston.lda.class = boston.lda.pred$class
table(boston.lda.class, crim.test)
## crim.test
## boston.lda.class 0 1
## 0 118 33
## 1 8 94
mean(boston.lda.class != crim.test)
## [1] 0.1620553
boston.qda.fit = qda(crim.med ~ .-crim, data=Boston)
boston.qda.pred = predict(boston.qda.fit, bostontest)
boston.qda.class = boston.qda.pred$class
table(boston.qda.class, crim.test)
## crim.test
## boston.qda.class 0 1
## 0 126 20
## 1 0 107
mean(boston.qda.class != crim.test)
## [1] 0.07905138
btrain.X = data.frame(bostontrain[,-1])
btest.X = data.frame(bostontest[,-1])
train.crim.med = btest.X$crim.med
set.seed(1)
knn.pred.bost=knn(btrain.X, btest.X, train.crim.med, k=1)
table(knn.pred.bost,train.crim.med)
## train.crim.med
## knn.pred.bost 0 1
## 0 57 54
## 1 69 73
mean(knn.pred.bost != crim.test)
## [1] 0.486166
Based on the fitted classification models in order to predict whether a given suburb has a crime rate above or below the median, the QDA model appears to provide the best (lowest) error rate at 7.91%. The KNN model at k=1 have the highest error rate.