library(ISLR)
attach(Weekly)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly, panel = panel.smooth)
cor(Weekly[,1:8])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Year and Volume appear to have a strong, roughly linear relationship (correlation of about 0.84); the remaining pairwise correlations are all close to zero.
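A quick scatterplot of Volume against Year (a minimal sketch using base graphics) shows this trend more directly than the full pairs plot:
plot(Year, Volume)   # average weekly trading volume grows over time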
glm.weekly <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial )
summary(glm.weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The smallest p-value here is associated with Lag2. The positive coefficient for this predictor (0.05844) suggests that if the market had a positive return two weeks ago, the market is more likely to go up this week; each one-unit increase in Lag2 raises the log-odds of an Up week by 0.05844.
Using a significance level of 0.05, Lag2 is the only predictor that is statistically significant.
We use the coef() function in order to access just the coefficients for this fitted model. We can also use the summary() function to access particular aspects of the fitted model, such as the p-values for the coefficients.
coef(glm.weekly)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## 0.26686414 -0.04126894 0.05844168 -0.01606114 -0.02779021 -0.01447206
## Volume
## -0.02274153
summary(glm.weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
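As noted above, the p-values can also be pulled directly out of the summary object rather than read off the printed table; a minimal sketch:
summary(glm.weekly)$coefficients[, "Pr(>|z|)"]   # p-value for each coefficient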
glm.probs <- predict(glm.weekly)
glm.probs[1:10]
## 1 2 3 4 5 6
## 0.44153590 0.40976461 0.35392867 -0.07346677 0.47641635 0.27540369
## 7 8 9 10
## 0.31706873 0.06080770 0.28805531 0.22262994
contrasts(Direction)
## Up
## Down 0
## Up 1
glm.pred <- rep("Down" ,length(glm.probs))
glm.pred[glm.probs >.5] = "Up"
table(glm.pred,Weekly$Direction)
##
## glm.pred Down Up
## Down 465 563
## Up 19 42
(563+19) /1089
## [1] 0.5344353
mean(glm.pred == Direction)
## [1] 0.4655647
The diagonal elements of the confusion matrix indicate correct predictions, while the off-diagonals represent incorrect predictions. Reading the table above (rows are predictions, columns are the true direction), the model correctly predicted an Up week 42 times and a Down week 465 times, for a total of 465 + 42 = 507 correct predictions; the off-diagonal counts 563 + 19 = 582 are the incorrect ones, giving the 53.4% error rate computed above. The mean() function can be used to compute the fraction of weeks for which the prediction was correct: in this case, logistic regression correctly predicted the movement of the market only 46.6% of the time.
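One caveat worth flagging: predict() on a glm object returns values on the link (log-odds) scale unless type = "response" is requested, so the 0.5 cutoff above is applied to log-odds rather than probabilities (a log-odds value of 0.5 corresponds to a probability of roughly 0.62). A sketch of the probability-scale version, using new names (probs.resp, pred.resp) so the objects above are left untouched:
probs.resp <- predict(glm.weekly, type = "response")   # fitted probabilities of an Up week
pred.resp <- rep("Down", length(probs.resp))
pred.resp[probs.resp > .5] <- "Up"                     # threshold the probabilities at 0.5
table(pred.resp, Weekly$Direction)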
train <- (Year < 2009)
weekly.2009.2010<- Weekly[!train, ]
dim(weekly.2009.2010)
## [1] 104 9
direction.2009.2010 <- Direction[!train]
glm.weekly.data <- glm(Direction~Lag2, data=weekly.2009.2010,family=binomial, subset = train )
summary(glm.weekly.data)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = weekly.2009.2010,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5767 -1.3052 0.9242 1.0419 1.2978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.32377 0.20136 1.608 0.108
## Lag2 0.08562 0.06707 1.277 0.202
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 141.04 on 103 degrees of freedom
## Residual deviance: 139.37 on 102 degrees of freedom
## (881 observations deleted due to missingness)
## AIC: 143.37
##
## Number of Fisher Scoring iterations: 4
glm.probs2 <- predict(glm.weekly.data, weekly.2009.2010, type = "response")
glm.pred2 <- rep("Down",104)
glm.pred2[glm.probs2 >.5] = "Up"
table(glm.pred2,direction.2009.2010)
## direction.2009.2010
## glm.pred2 Down Up
## Down 8 4
## Up 35 57
(4+35) /104
## [1] 0.375
mean(glm.pred2 == direction.2009.2010)
## [1] 0.625
mean(glm.pred2 != direction.2009.2010)
## [1] 0.375
The diagonal elements of the confusion matrix indicate correct predictions, while the off-diagonals represent incorrect predictions. On the held-out 2009-2010 data the model correctly predicted an Up week 57 times and a Down week 8 times, for a total of 57 + 8 = 65 correct predictions; the off-diagonal counts 4 + 35 = 39 are incorrect. The mean() function can be used to compute the fraction of weeks for which the prediction was correct: logistic regression correctly predicted the movement of the market 62.5% of the time, for a test error rate of 37.5%.
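The accuracy and error rate can also be computed directly from the confusion matrix with diag() and sum(); a small sketch (conf is a new name used only here):
conf <- table(glm.pred2, direction.2009.2010)
sum(diag(conf)) / sum(conf)       # fraction of weeks predicted correctly
1 - sum(diag(conf)) / sum(conf)   # test error rate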
library(MASS)
lda.fit <- lda(Direction~Lag2, data=weekly.2009.2010,family=binomial, subset = train )
lda.fit
## Call:
## lda(Direction ~ Lag2, data = weekly.2009.2010, family = binomial,
## subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4134615 0.5865385
##
## Group means:
## Lag2
## Down -0.08904651
## Up 0.69591803
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.3262636
lda.pred<-predict(lda.fit,weekly.2009.2010)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class<-lda.pred$class
table(lda.class, direction.2009.2010)
## direction.2009.2010
## lda.class Down Up
## Down 8 4
## Up 35 57
mean(lda.class == direction.2009.2010)
## [1] 0.625
Using linear discriminant analysis to develop a classifier yields essentially the same results as the logistic regression model created in part D: an identical confusion matrix and a 62.5% accuracy rate on the held-out data.
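The LDA class labels come from thresholding the posterior probability of "Up" at 0.5; a sketch reconstructing them from lda.pred$posterior (post.up and lda.class.manual are illustrative names):
post.up <- lda.pred$posterior[, "Up"]                    # posterior P(Up) for each held-out week
lda.class.manual <- ifelse(post.up > 0.5, "Up", "Down")
table(lda.class.manual, direction.2009.2010)             # should reproduce the table above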
qda.fit <- qda(Direction~Lag2, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class <- predict(qda.fit, weekly.2009.2010)$class
table(qda.class,direction.2009.2010)
## direction.2009.2010
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == direction.2009.2010)
## [1] 0.5865385
Quadratic discriminant analysis produced a model with an accuracy of 58.65%, which is lower than the previous methods; note from the confusion matrix that it predicts "Up" for every held-out week.
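Because QDA always predicts "Up" here, its accuracy is simply the proportion of Up weeks in the 2009-2010 data; a one-line sanity check (sketch):
mean(direction.2009.2010 == "Up")   # baseline accuracy of always predicting "Up"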
library(class)
train.X <- as.matrix(Lag2[train])
test.X <- as.matrix(Lag2[!train])
train.direction <- Direction[train]
set.seed(1)
knn.pred <- knn(train.X,test.X,train.direction ,k=1)
table(knn.pred,direction.2009.2010)
## direction.2009.2010
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == direction.2009.2010)
## [1] 0.5
K-nearest neighbors with K = 1 results in a classifier with an accuracy rate of 50% on the held-out data.
Which of these methods appears to provide the best results on this data? The methods with the highest accuracy rates are logistic regression and linear discriminant analysis; both achieve 62.5%.
Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
I will rerun all models using the full data set with the variables Lag1 and Lag2.
glm.weekly2 <- glm(Direction~Lag1+Lag2, data=Weekly,family=binomial )
summary(glm.weekly2)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.623 -1.261 1.001 1.083 1.506
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
glm.probs2 <- predict(glm.weekly2)
glm.pred2 <- rep("Down" ,length(glm.probs2))
glm.pred2[glm.probs2 >.5] = "Up"
table(glm.pred2,Weekly$Direction)
##
## glm.pred2 Down Up
## Down 470 576
## Up 14 29
mean(glm.pred2 == Direction)
## [1] 0.4582185
This model attains an accuracy rate of 45.8% on the full data set, which is much lower than the 62.5% test accuracy of the Lag2-only model above. Adding the additional variable reduced the accuracy.
Testing LDA next:
lda.fit2 <- lda(Direction~Lag2 + Lag1, data=Weekly)
lda.fit2
## Call:
## lda(Direction ~ Lag2 + Lag1, data = Weekly)
##
## Prior probabilities of groups:
## Down Up
## 0.4444444 0.5555556
##
## Group means:
## Lag2 Lag1
## Down -0.04042355 0.28229545
## Up 0.30428099 0.04521653
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.3458272
## Lag1 -0.2235194
lda.pred2<-predict(lda.fit2,Weekly)
names(lda.pred2)
## [1] "class" "posterior" "x"
lda.class2<-lda.pred2$class
table(lda.class2, Weekly$Direction)
##
## lda.class2 Down Up
## Down 37 37
## Up 447 568
mean(lda.class2 == Weekly$Direction)
## [1] 0.5555556
Linear discriminant analysis yields a higher accuracy rate than this logistic regression model: 55.6% on the full data set, which is still lower than the 62.5% achieved by the model with just Lag2 as a variable.
QDA:
qda.fit2 <- qda(Direction~Lag2 +Lag1, data = Weekly)
qda.fit2
## Call:
## qda(Direction ~ Lag2 + Lag1, data = Weekly)
##
## Prior probabilities of groups:
## Down Up
## 0.4444444 0.5555556
##
## Group means:
## Lag2 Lag1
## Down -0.04042355 0.28229545
## Up 0.30428099 0.04521653
qda.class2 <- predict(qda.fit2, Weekly)$class
table(qda.class2,Weekly$Direction)
##
## qda.class2 Down Up
## Down 61 63
## Up 423 542
mean(qda.class2 == Weekly$Direction)
## [1] 0.553719
QDA yielded results very similar to LDA (55.4% accuracy), and both remain well below the model with just Lag2 as a variable.
Next, rerun our original KNN with different values of K:
knn.pred5 <- knn(train.X,test.X,train.direction ,k=5)
table(knn.pred5,direction.2009.2010)
## direction.2009.2010
## knn.pred5 Down Up
## Down 15 20
## Up 28 41
mean(knn.pred5 == direction.2009.2010)
## [1] 0.5384615
With K = 5 we obtain a test accuracy rate of 53.8%, which is slightly higher than our KNN model with K = 1; that model produced an accuracy rate of 50%.
Try k=10
knn.pred10 <- knn(train.X,test.X,train.direction ,k=10)
table(knn.pred10,direction.2009.2010)
## direction.2009.2010
## knn.pred10 Down Up
## Down 17 19
## Up 26 42
mean(knn.pred10 == direction.2009.2010)
## [1] 0.5673077
Our accuracy rate increases yet again, to 56.7%. Thus, increasing the value of K increases our test accuracy on this data.
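Rather than fitting each value of K by hand, a small loop can sweep several candidates at once; a sketch (ks and acc are names introduced here for illustration):
ks <- c(1, 3, 5, 10, 20, 50)
acc <- sapply(ks, function(k) {
  set.seed(1)                                         # keep tie-breaking reproducible
  pred <- knn(train.X, test.X, train.direction, k = k)
  mean(pred == direction.2009.2010)                   # held-out accuracy for this k
})
data.frame(k = ks, accuracy = acc)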
library(ISLR)
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
Auto$mpg01 <- ifelse(Auto$mpg>median(Auto$mpg),1,0)
pairs(Auto[,-9], panel = panel.smooth)
cor(Auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
mpg01 has a strong negative correlation with cylinders, displacement, horsepower, and weight, and a positive relationship with acceleration, year, and origin.
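Boxplots of the strongly associated predictors split by mpg01 are another way to investigate these relationships graphically; a minimal sketch for weight and horsepower:
par(mfrow = c(1, 2))
boxplot(weight ~ mpg01, data = Auto, main = "weight by mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "horsepower by mpg01")
par(mfrow = c(1, 1))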
test = 1:100 # test will contain 100 observations - train will contain the remaining 292 (392 total obs in Auto)
train.X= Auto[-test ,]
test.X= Auto[test ,]
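Since Auto is roughly ordered by model year, taking the first 100 rows as the test set yields mostly early-1970s cars; a random split is a reasonable alternative (test.idx, train.rand, and test.rand are illustrative names and are not used in what follows):
set.seed(1)
test.idx <- sample(nrow(Auto), 100)   # 100 randomly chosen test rows
train.rand <- Auto[-test.idx, ]
test.rand <- Auto[test.idx, ]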
#LDA on train to predict mpg01 using values associated
lda.auto <- lda(mpg01~cylinders+displacement+horsepower+weight+acceleration+year+origin, data=train.X)
lda.auto
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight +
## acceleration + year + origin, data = train.X)
##
## Prior probabilities of groups:
## 0 1
## 0.4178082 0.5821918
##
## Group means:
## cylinders displacement horsepower weight acceleration year origin
## 0 6.549180 252.5000 120.22131 3541.566 15.33443 76.21311 1.221311
## 1 4.205882 117.7647 78.36471 2361.071 16.48176 78.55882 1.958824
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.500462233
## displacement 0.002835733
## horsepower -0.003165444
## weight -0.001158075
## acceleration -0.022867850
## year 0.163478784
## origin 0.074994146
lda.auto.pred<-predict(lda.auto,test.X)
lda.auto.class<-lda.auto.pred$class
table(lda.auto.class, test.X$mpg01)
##
## lda.auto.class 0 1
## 0 68 1
## 1 6 25
mean(lda.auto.class != test.X$mpg01)
## [1] 0.07
The test error of the model is 7%.
qda.auto <- qda(mpg01~cylinders+displacement+horsepower+weight+acceleration+year+origin, data=train.X)
qda.auto
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight +
## acceleration + year + origin, data = train.X)
##
## Prior probabilities of groups:
## 0 1
## 0.4178082 0.5821918
##
## Group means:
## cylinders displacement horsepower weight acceleration year origin
## 0 6.549180 252.5000 120.22131 3541.566 15.33443 76.21311 1.221311
## 1 4.205882 117.7647 78.36471 2361.071 16.48176 78.55882 1.958824
qda.auto.class <- predict(qda.auto, test.X)$class
table(qda.auto.class,test.X$mpg01)
##
## qda.auto.class 0 1
## 0 68 2
## 1 6 24
mean(qda.auto.class != test.X$mpg01)
## [1] 0.08
The test error of the model is 8%.
glm.auto <- glm(mpg01~cylinders+displacement+horsepower+weight+acceleration+year+origin, data=train.X, family = binomial)
auto.probs <- predict(glm.auto, test.X, type = "response")
auto.pred = rep(0, length(auto.probs))
auto.pred[auto.probs > 0.5] = 1
table(auto.pred, test.X$mpg01)
##
## auto.pred 0 1
## 0 73 10
## 1 1 16
mean(auto.pred != test.X$mpg01)
## [1] 0.11
The test error for this model is 11%.
train.Z <- cbind(displacement,horsepower,weight,cylinders,year,weight, origin)[-test,]
test.Z <- cbind(displacement,horsepower,weight,cylinders, year,weight, origin)[test,]
set.seed(1)
auto.knn<-knn(train.Z,test.Z,train.X$mpg01,k=1)
mean(auto.knn != test.X$mpg01)
## [1] 0.13
The test error for KNN with k=1 is 13%.
#k=4
auto.knn4<-knn(train.Z,test.Z,train.X$mpg01,k=4)
mean(auto.knn4 != test.X$mpg01)
## [1] 0.12
The test error for KNN with k=4 is 12%.
#k=8
auto.knn8<-knn(train.Z,test.Z,train.X$mpg01,k=8)
mean(auto.knn8 != test.X$mpg01)
## [1] 0.12
The test error for KNN with k=8 is 12%.
#k=15
auto.knn15<-knn(train.Z,test.Z,train.X$mpg01,k=15)
mean(auto.knn15 != test.X$mpg01)
## [1] 0.14
The test error for KNN with k=15 is 14%.
#k=30
auto.knn30<-knn(train.Z,test.Z,train.X$mpg01,k=30)
mean(auto.knn30 != test.X$mpg01)
## [1] 0.11
The test error for KNN with k=30 is 11%.
The value of K that performs best on this data set is k = 30, which gives a test error of 11% (an accuracy rate of 89%).
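One caveat for KNN: the predictors are on very different scales (weight is in the thousands while cylinders and origin are single digits), so weight dominates the Euclidean distances. Standardizing the columns with scale() before calling knn() is a common variation; a sketch using new names (Z.all, train.Zs, test.Zs, auto.knn.s), with the scaling applied to all rows for simplicity rather than to training-set statistics only:
Z.all <- scale(cbind(displacement, horsepower, weight, cylinders, year, origin))
train.Zs <- Z.all[-test, ]             # standardized training predictors
test.Zs <- Z.all[test, ]               # standardized test predictors
set.seed(1)
auto.knn.s <- knn(train.Zs, test.Zs, train.X$mpg01, k = 30)
mean(auto.knn.s != test.X$mpg01)       # test error with standardized predictors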
detach(Auto)
library(MASS)
attach(Boston)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
Boston$crim01 <- ifelse(Boston$crim>median(Boston$crim),1,0)
Train/test split:
test <- 1:200 # test will contain 200 obs - train will contain the remaining 306 (506 total obs in Boston)
bostontrain.X= Boston[-test ,]
bostontest.X= Boston[test ,]
pairs(Boston, panel = panel.smooth)
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim01
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 -0.35121093 0.4532627 -0.2630167 1.00000000
The variables with the strongest correlations with crim01 are nox, indus, rad, tax, age, and dis, with nox, rad, dis, and age having the largest absolute correlations.
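A quick way to rank the predictors by their association with crim01 is to sort the absolute correlations from the matrix above; a one-line sketch:
sort(abs(cor(Boston)["crim01", ]), decreasing = TRUE)   # strongest associations first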
Logistic Regression 1:
glm.boston <- glm(crim01~nox+indus+rad+tax+age+dis, data=bostontrain.X, family = binomial)
glm.boston
##
## Call: glm(formula = crim01 ~ nox + indus + rad + tax + age + dis, family = binomial,
## data = bostontrain.X)
##
## Coefficients:
## (Intercept) nox indus rad tax age
## -10.505418 17.708214 -0.054260 0.628094 -0.005775 0.013095
## dis
## -0.250230
##
## Degrees of Freedom: 305 Total (i.e. Null); 299 Residual
## Null Deviance: 410.7
## Residual Deviance: 125.6 AIC: 139.6
boston.probs <- predict(glm.boston, bostontest.X, type = "response")
boston.pred <- rep(0, length(boston.probs))
boston.pred[boston.probs > 0.5] = 1
table(boston.pred, bostontest.X$crim01)
##
## boston.pred 0 1
## 0 119 25
## 1 13 43
mean(boston.pred != bostontest.X$crim01)
## [1] 0.19
This logistic regression model produces a test error rate of 19%.
Logistic Regression 2:
glm.boston2 <- glm(crim01~rad+indus+age+dis, data=bostontrain.X, family = binomial)
glm.boston2
##
## Call: glm(formula = crim01 ~ rad + indus + age + dis, family = binomial,
## data = bostontrain.X)
##
## Coefficients:
## (Intercept) rad indus age dis
## -0.51105 0.50174 -0.13104 0.02139 -0.71684
##
## Degrees of Freedom: 305 Total (i.e. Null); 301 Residual
## Null Deviance: 410.7
## Residual Deviance: 132.6 AIC: 142.6
boston.probs2 <- predict(glm.boston2, bostontest.X, type = "response")
boston.pred2 <- rep(0, length(boston.probs2))
boston.pred2[boston.probs2 > 0.5] = 1
table(boston.pred2, bostontest.X$crim01)
##
## boston.pred2 0 1
## 0 101 47
## 1 31 21
mean(boston.pred2!= bostontest.X$crim01)
## [1] 0.39
This logistic regression model produces a test error rate of 39%. Reducing the number of variables increased the error rate.
LDA:
#LDA on train to predict crim01 using values associated
lda.boston <- lda(crim01~nox+indus+rad+tax+age+dis, data=bostontrain.X)
lda.boston.pred <- predict(lda.boston,bostontest.X)
lda.boston.class <- lda.boston.pred$class
table(lda.boston.class,bostontest.X$crim01)
##
## lda.boston.class 0 1
## 0 104 52
## 1 28 16
mean(lda.boston.class != bostontest.X$crim01)
## [1] 0.4
The linear discriminant analysis produces a test error rate of 40%.
#LDA on train to predict crim01 using values associated
lda.boston2 <- lda(crim01~rad+indus+age+dis, data=bostontrain.X)
lda.boston.pred2 <- predict(lda.boston2,bostontest.X)
lda.boston.class2 <- lda.boston.pred2$class
table(lda.boston.class2,bostontest.X$crim01)
##
## lda.boston.class2 0 1
## 0 98 67
## 1 34 1
mean(lda.boston.class2 != bostontest.X$crim01)
## [1] 0.505
This linear discriminant analysis produces a test error rate of 51%. Reducing the number of variables again increased the error rate.
KNN:
bostontrain.Y <- cbind(nox,indus,rad,tax,age,dis)[-test,]
bostontest.Y <- cbind(nox,indus,rad,tax,age,dis)[test,]
set.seed(1)
boston.knn<-knn(bostontrain.Y,bostontest.Y,bostontrain.X$crim01,k=1)
mean(boston.knn != bostontest.X$crim01)
## [1] 0.37
#knn = 6
boston.knn6<-knn(bostontrain.Y,bostontest.Y,bostontrain.X$crim01,k=6)
mean(boston.knn6 != bostontest.X$crim01)
## [1] 0.32
The K-nearest neighbors analysis with k = 1 produces a test error rate of 37%, and k = 6 produces a test error rate of 32%. We will now reduce the number of variables used and run KNN again.
bostontrain.Y2 <- cbind(indus,rad,age,dis)[-test,]
bostontest.Y2 <- cbind(indus,tax,age,dis)[test,]
set.seed(1)
boston.knn.2<-knn(bostontrain.Y2,bostontest.Y2,bostontrain.X$crim01,k=1)
mean(boston.knn.2 != bostontest.X$crim01)
## [1] 0.66
#knn = 6
boston.knn.6<-knn(bostontrain.Y2,bostontest.Y2,bostontrain.X$crim01,k=6)
mean(boston.knn.6 != bostontest.X$crim01)
## [1] 0.66
Both KNN analyses above produce a test error rate of 66%; reducing the variables has again increased the test error rate. (Note that the training matrix was built with rad while the test matrix was built with tax, so the KNN distances compare mismatched columns, which likely contributes to the very poor performance.)
Among all the models tested, the KNN models using only four variables produced the highest error rate, at 66%.
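For reference, the held-out error rates computed above can be collected into a single table; a sketch reusing the objects already created:
data.frame(
  model = c("logistic (6 vars)", "logistic (4 vars)", "LDA (6 vars)", "LDA (4 vars)",
            "KNN k=1 (6 vars)", "KNN k=6 (6 vars)", "KNN k=1 (4 vars)", "KNN k=6 (4 vars)"),
  test.error = c(mean(boston.pred != bostontest.X$crim01),
                 mean(boston.pred2 != bostontest.X$crim01),
                 mean(lda.boston.class != bostontest.X$crim01),
                 mean(lda.boston.class2 != bostontest.X$crim01),
                 mean(boston.knn != bostontest.X$crim01),
                 mean(boston.knn6 != bostontest.X$crim01),
                 mean(boston.knn.2 != bostontest.X$crim01),
                 mean(boston.knn.6 != bostontest.X$crim01)))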