Problem 10 - Part A

library(ISLR)   # Weekly and Auto data sets
library(MASS)   # lda(), qda(), and the Boston data set
library(class)  # knn()

weekly <- Weekly
attach(weekly)
summary(weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 

The pairs plot below suggests that there may be a relationship between the Volume and Year variables. At a glance, there is no easily identifiable relationship among any of the other variables.

pairs(weekly)

plot(Volume)

The correlation table provides further evidence of a relationship between Volume and Year: their correlation of about 0.842 is far larger in magnitude than any other pairwise correlation in this data.

cor(weekly[,-9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

Problem 10 - Part B

The only predictor in the logistic regression model that appears to be statistically significant is Lag2, which has a relatively small p-value (0.0296). The remaining predictors all have large p-values and are not statistically significant.

weeklyGLM <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=weekly, family=binomial)
summary(weeklyGLM)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
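
The p-values quoted above can also be pulled from the fitted model programmatically; a minimal sketch, assuming the weeklyGLM object from the chunk above:

# Coefficient table of the fitted logistic regression
coef(summary(weeklyGLM))
# Just the p-values; only Lag2 falls below the 0.05 threshold
coef(summary(weeklyGLM))[, "Pr(>|z|)"]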

Problem 10 - Part C

weeklyProb.1 <- predict(weeklyGLM, type="response")
weeklyPred.1 <- rep("Down", length(weeklyProb.1))
weeklyPred.1[weeklyProb.1 > 0.5] = "Up"

The model makes 611 correct predictions out of 1089 total, a classification accuracy of about 56.1% and a training error rate of about 43.9% (these predictions are made on the same data used to fit the model). Breaking the confusion matrix down by class shows that the predictions are mostly correct for weeks when the market goes up but mostly wrong for weeks when it goes down: the accuracy is about 92.1% for "Up" weeks and only about 11.2% for "Down" weeks.

table(weeklyPred.1, Direction)
##             Direction
## weeklyPred.1 Down  Up
##         Down   54  48
##         Up    430 557
54+557
## [1] 611
54+48+430+557
## [1] 1089
(54+557)/(54+48+430+557)
## [1] 0.5610652
557/(557+48)
## [1] 0.9206612
54/(430+54)
## [1] 0.1115702
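
The rates computed by hand above can also be read off the confusion matrix programmatically; a small sketch, assuming the objects from this chunk:

confMat <- table(weeklyPred.1, Direction)
# Overall training accuracy: correct predictions sit on the diagonal
sum(diag(confMat)) / sum(confMat)
# Accuracy within each true class ("Up" and "Down" columns)
confMat["Up", "Up"] / sum(confMat[, "Up"])
confMat["Down", "Down"] / sum(confMat[, "Down"])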

Problem 10 - Part D

weeklyTrain <- (Year < 2009)
weekly.0910 <- weekly[!weeklyTrain, ]
direction.0910 <- Direction[!weeklyTrain]
weeklyGLM.2 <- glm(Direction~Lag2, data=weekly, family=binomial, subset=weeklyTrain)
weeklyProb.2 <- predict(weeklyGLM.2, weekly.0910, type="response")
weeklyPred.2 <- rep("Down", length(weeklyProb.2))
weeklyPred.2[weeklyProb.2 > 0.5] = "Up"
summary(weeklyGLM.2)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = weekly, 
##     subset = weeklyTrain)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4

Fit on the 1990-2008 training data and evaluated on the held-out 2009/2010 data, the logistic regression with Lag2 as the only predictor makes 65 correct predictions out of 104, a classification accuracy of exactly 62.5%.

table(weeklyPred.2, direction.0910)
##             direction.0910
## weeklyPred.2 Down Up
##         Down    9  5
##         Up     34 56
9+56
## [1] 65
9+56+5+34
## [1] 104
mean(weeklyPred.2 == direction.0910)
## [1] 0.625

Problem 10 - Part E

weeklyLDA <- lda(Direction~Lag2, data=weekly, subset=weeklyTrain)
weeklyPred.3 <- predict(weeklyLDA, weekly.0910)

As with the logistic regression above, the LDA model evaluated on the 2009/2010 data makes 65 correct predictions out of 104, a classification accuracy of exactly 62.5%.

table(weeklyPred.3$class, direction.0910)
##       direction.0910
##        Down Up
##   Down    9  5
##   Up     34 56
mean(weeklyPred.3$class == direction.0910)
## [1] 0.625

Problem 10 - Part F

weeklyQDA <- qda(Direction~Lag2, data=weekly, subset=weeklyTrain)
weeklyPred.4 <- predict(weeklyQDA, weekly.0910)

The QDA model evaluated on the 2009/2010 data makes 61 correct predictions out of 104, a classification accuracy of approximately 58.65%. Notably, the confusion matrix shows that QDA predicts "Up" for every week in the test period.

table(weeklyPred.4$class, direction.0910)
##       direction.0910
##        Down Up
##   Down    0  0
##   Up     43 61
mean(weeklyPred.4$class == direction.0910)
## [1] 0.5865385

Problem 10 - Part G

The KNN model with k = 1 evaluated on the 2009/2010 data makes 52 correct predictions out of 104, a classification accuracy of exactly 50%.

weeklyTrain.KNN <- as.matrix(Lag2[weeklyTrain])
weeklyTest.KNN <- as.matrix(Lag2[!weeklyTrain])
trainDirection.1 <- Direction[weeklyTrain]
set.seed(1)
weeklyPred.5 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k = 1)
table(weeklyPred.5, direction.0910)
##             direction.0910
## weeklyPred.5 Down Up
##         Down   21 30
##         Up     22 31
mean(weeklyPred.5 == direction.0910)
## [1] 0.5

Problem 10 - Part H

The methods that provide the best results on this data are actually tied: logistic regression and LDA both achieve a test classification accuracy of 62.5% and a test error rate of 37.5%. Both exceed the QDA and KNN models, which have accuracy rates of about 58.7% and 50%, respectively.
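
To see the comparison at a glance, the four test accuracies can be collected into a single named vector; a small sketch, assuming the prediction objects from Parts D through G are still in the workspace:

# Test accuracy of each method on the 2009/2010 data
testAccuracy <- c(
  logistic = mean(weeklyPred.2 == direction.0910),
  lda      = mean(weeklyPred.3$class == direction.0910),
  qda      = mean(weeklyPred.4$class == direction.0910),
  knn1     = mean(weeklyPred.5 == direction.0910))
# Accuracies in decreasing order, and the corresponding test error rates
sort(testAccuracy, decreasing = TRUE)
1 - testAccuracy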

Problem 10 - Part I

The classification accuracy rate for the LDA model with Lag1 and Lag3 as predictors is about 59.62%.

weeklyLDA.2 <- lda(Direction~Lag1+Lag3, data=weekly, subset=weeklyTrain)
weeklyPred.6 <- predict(weeklyLDA.2, weekly.0910)
table(weeklyPred.6$class, direction.0910)
##       direction.0910
##        Down Up
##   Down    5  4
##   Up     38 57
mean(weeklyPred.6$class == direction.0910)
## [1] 0.5961538

The classification accuracy rate for the QDA model with the Lag1:Lag3 interaction as the only predictor is about 58.65%.

weeklyQDA.2 <- qda(Direction~Lag1:Lag3, data=weekly, subset=weeklyTrain)
weeklyPred.7 <- predict(weeklyQDA.2, weekly.0910)
table(weeklyPred.7$class, direction.0910)
##       direction.0910
##        Down Up
##   Down    0  0
##   Up     43 61
mean(weeklyPred.7$class == direction.0910)
## [1] 0.5865385

The classification accuracy rate for the LDA model with Lag1 and the square of Lag3 as predictors is about 57.69%.

weeklyLDA.3 <- lda(Direction~Lag1+I(Lag3^2), data=weekly, subset=weeklyTrain)
weeklyPred.8 <- predict(weeklyLDA.3, weekly.0910)
table(weeklyPred.8$class, direction.0910)
##       direction.0910
##        Down Up
##   Down    4  5
##   Up     39 56
mean(weeklyPred.8$class == direction.0910)
## [1] 0.5769231

The classification accuracy rate for the logistic regression (GLM) model with the Lag1:Lag3 interaction as the only predictor is about 58.65%.

weeklyGLM.3 <- glm(Direction~Lag1:Lag3, data=weekly, family=binomial, subset=weeklyTrain)
weeklyProb.3 <- predict(weeklyGLM.3, weekly.0910, type="response")
weeklyPred.9 <- rep("Down", length(weeklyProb.3))
weeklyPred.9[weeklyProb.3 > 0.5] = "Up"
direction.0910.2 <- Direction[!weeklyTrain]
table(weeklyPred.9, direction.0910.2)
##             direction.0910.2
## weeklyPred.9 Down Up
##         Down    3  3
##         Up     40 58
mean(weeklyPred.9 == direction.0910.2)
## [1] 0.5865385

The classification accuracy rate for the KNN model with k = 5 is about 53.85%.

set.seed(1)
weeklyPred.10 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=5)
table(weeklyPred.10, direction.0910)
##              direction.0910
## weeklyPred.10 Down Up
##          Down   16 21
##          Up     27 40
mean(weeklyPred.10 == direction.0910)
## [1] 0.5384615

The classification accuracy rate for the KNN model with k = 25 is about 52.88%.

set.seed(1)
weeklyPred.11 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=25)
table(weeklyPred.11, direction.0910)
##              direction.0910
## weeklyPred.11 Down Up
##          Down   19 25
##          Up     24 36
mean(weeklyPred.11 == direction.0910)
## [1] 0.5288462

The classification accuracy rate for the KNN model with k = 40 is about 55.77%.

set.seed(1)
weeklyPred.12 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=40)
table(weeklyPred.12, direction.0910)
##              direction.0910
## weeklyPred.12 Down Up
##          Down   21 24
##          Up     22 37
mean(weeklyPred.12 == direction.0910)
## [1] 0.5576923
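
Rather than refitting KNN separately for each k, a range of values can be scanned in one pass; a minimal sketch over an arbitrary grid of k values, assuming the training and test matrices defined in Part G:

# Scan several candidate k values and record the test accuracy of each
set.seed(1)
kGrid <- c(1, 5, 10, 25, 40, 60, 80)
knnAccuracy <- sapply(kGrid, function(k) {
  pred <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k = k)
  mean(pred == direction.0910)
})
names(knnAccuracy) <- kGrid
knnAccuracy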

Among the models tried in this part, the first LDA model (with Lag1 and Lag3 as predictors) provides the best results, with a test classification accuracy of about 59.62% and a test error rate of about 40.38%. None of these combinations improves on the Lag2-only logistic regression and LDA models from Parts D and E.

Problem 11 - Part A

In this chunk, I create the mpg01 binary variable: mpg01 equals 1 when mpg is above its median and 0 when mpg is at or below its median.

auto <- Auto
attach(auto)
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
auto <- data.frame(auto, mpg01)
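
As a side note, the same indicator can be built in a single step with a vectorized comparison; a brief equivalent sketch:

# One-line alternative: logical comparison converted to 0/1
auto$mpg01 <- as.numeric(auto$mpg > median(auto$mpg))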

Problem 11 - Part B

The scatterplot matrix and correlation table suggest that cylinders, displacement, horsepower and weight are the most useful variables for predicting mpg01. The mpg variable is not considered because mpg01 is derived directly from it. The origin and cylinders variables are not explored in the boxplots below because they are essentially categorical.

pairs(auto)

cor(auto[,-9])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

The boxplot suggests that horsepower tends to be higher for cars with mpg01 = 0 (mpg at or below the median).

boxplot(horsepower~mpg01, auto)

The boxplot suggests that displacement tends to be higher for cars with mpg01 = 0.

boxplot(displacement~mpg01, auto)

The boxplot suggests that weight tends to be higher for cars with mpg01 = 0.

boxplot(weight~mpg01, auto)

The boxplot suggests that acceleration tends to be slightly lower for cars with mpg01 = 0.

boxplot(acceleration~mpg01, auto)

The boxplot suggests that cars with mpg01 = 0 tend to come from earlier model years.

boxplot(year~mpg01, auto)
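
The same comparisons can be drawn in a single panel; a short sketch, assuming the auto data frame built above:

# One boxplot per candidate predictor, split by mpg01
par(mfrow = c(2, 3))
for (v in c("horsepower", "displacement", "weight", "acceleration", "year")) {
  boxplot(auto[[v]] ~ auto$mpg01, xlab = "mpg01", ylab = v, main = v)
}
par(mfrow = c(1, 1))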

Problem 11 - Part C

In this chunk, I split the data into a 70% training set and a 30% test set.

set.seed(1)
split <- sample(nrow(auto), nrow(auto) * 0.7)
autoTrain <- auto[split,]
autoTest <- auto[-split,]

Problem 11 - Part D

The test error rate for the LDA model with the variables cylinders, displacement, horsepower and weight is 11.86%.

autoLDA <- lda(mpg01~cylinders+displacement+horsepower+weight, data=autoTrain)
autoPred.1 <- predict(autoLDA, autoTest)
table(autoPred.1$class, autoTest$mpg01)
##    
##      0  1
##   0 50  3
##   1 11 54
mean(autoPred.1$class != autoTest$mpg01)
## [1] 0.1186441

Problem 11 - Part E

The test error rate for the QDA model with the variables cylinders, displacement, horsepower and weight is also 11.86%.

autoQDA <- qda(mpg01~cylinders+displacement+horsepower+weight, data=autoTrain)
autoPred.2 <- predict(autoQDA, autoTest)
table(autoPred.2$class, autoTest$mpg01)
##    
##      0  1
##   0 52  5
##   1  9 52
mean(autoPred.2$class != autoTest$mpg01)
## [1] 0.1186441

Problem 11 - Part F

The test error rate for the logistic regression model with the variables cylinders, displacement, horsepower and weight is about 9.32%.

autoGLM <- glm(mpg01~cylinders+displacement+horsepower+weight, data=autoTrain, family=binomial)
autoProb <- predict(autoGLM, autoTest, type="response")
autoPred.3 <- rep(0, length(autoProb))
autoPred.3[autoProb > 0.5] = 1
table(autoPred.3, autoTest$mpg01)
##           
## autoPred.3  0  1
##          0 53  3
##          1  8 54
mean(autoPred.3 != autoTest$mpg01)
## [1] 0.09322034

Problem 11 - Part G

The test error rate for the KNN model with K = 1, using cylinders, displacement, horsepower and weight as predictors, is about 13.56%.

autoTrain.KNN <- cbind(autoTrain$cylinders, autoTrain$weight, autoTrain$displacement, autoTrain$horsepower)
autoTest.KNN <- cbind(autoTest$cylinders, autoTest$weight, autoTest$displacement, autoTest$horsepower)
set.seed(1)
autoPred.4 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=1)
table(autoPred.4, autoTest$mpg01)
##           
## autoPred.4  0  1
##          0 51  6
##          1 10 51
mean(autoPred.4 != autoTest$mpg01)
## [1] 0.1355932

The test error rate for the KNN model with K = 5 is about 12.71%.

set.seed(1)
autoPred.5 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=5)
table(autoPred.5, autoTest$mpg01)
##           
## autoPred.5  0  1
##          0 51  5
##          1 10 52
mean(autoPred.5 != autoTest$mpg01)
## [1] 0.1271186

The test error rate for the KNN model with K = 10 is about 14.41%.

set.seed(1)
autoPred.6 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=10)
table(autoPred.6, autoTest$mpg01)
##           
## autoPred.6  0  1
##          0 49  5
##          1 12 52
mean(autoPred.6 != autoTest$mpg01)
## [1] 0.1440678

The test error rate for the KNN model with K = 3 is about 11.02%, the lowest of the K values tried on this split.

set.seed(1)
autoPred.7 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=3)
table(autoPred.7, autoTest$mpg01)
##           
## autoPred.7  0  1
##          0 52  4
##          1  9 53
mean(autoPred.7 != autoTest$mpg01)
## [1] 0.1101695
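
One caveat worth noting: KNN uses Euclidean distance, so predictors on large scales (such as weight) dominate those on small scales (such as cylinders). A sketch of one way to standardize the predictors using the training-set means and standard deviations, assuming the matrices defined above; the resulting error rate is not reported here:

# Standardize training columns, then apply the same centering/scaling to the test set
autoTrain.KNN.std <- scale(autoTrain.KNN)
autoTest.KNN.std <- scale(autoTest.KNN,
                          center = attr(autoTrain.KNN.std, "scaled:center"),
                          scale  = attr(autoTrain.KNN.std, "scaled:scale"))
set.seed(1)
autoPred.std <- knn(autoTrain.KNN.std, autoTest.KNN.std, autoTrain$mpg01, k = 3)
mean(autoPred.std != autoTest$mpg01)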

Problem 13

boston <- Boston
attach(boston)

In this chunk, I create the crim01 binary variable: crim01 equals 1 when crim is above its median and 0 when crim is at or below its median.

crim01 <- rep(0, length(crim))
crim01[crim > median(crim)] = 1
boston <- data.frame(boston, crim01)

In this chunk, I split the data into a 70% training set and a 30% test set.

set.seed(1)
split.2 <- sample(nrow(boston), nrow(boston) * 0.7)
bostonTrain <- boston[split.2,]
bostonTest <- boston[-split.2,]
summary(boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv           crim01   
##  Min.   : 1.73   Min.   : 5.00   Min.   :0.0  
##  1st Qu.: 6.95   1st Qu.:17.02   1st Qu.:0.0  
##  Median :11.36   Median :21.20   Median :0.5  
##  Mean   :12.65   Mean   :22.53   Mean   :0.5  
##  3rd Qu.:16.95   3rd Qu.:25.00   3rd Qu.:1.0  
##  Max.   :37.97   Max.   :50.00   Max.   :1.0

The classification accuracy rate for the LDA model with the predictors nox, age, dis, ptratio, lstat and medv is 84.87%.

bostonLDA <- lda(crim01~nox+age+dis+ptratio+lstat+medv, data=bostonTrain)
bostonPred.1 <- predict(bostonLDA, bostonTest)
table(bostonPred.1$class, bostonTest$crim01)
##    
##      0  1
##   0 66 16
##   1  7 63
mean(bostonPred.1$class == bostonTest$crim01)
## [1] 0.8486842

The classification accuracy rate for the GLM model with the predictors nox, age, tax and dis is 81.58%.

bostonGLM <- glm(crim01~nox+age+tax+dis, data=bostonTrain, family=binomial)
bostonProb.1 = predict(bostonGLM, bostonTest, type="response")
bostonPred.2 = rep(0, length(bostonProb.1))
bostonPred.2[bostonProb.1 > 0.5] = 1
table(bostonPred.2, bostonTest$crim01)
##             
## bostonPred.2  0  1
##            0 56 11
##            1 17 68
mean(bostonPred.2 == bostonTest$crim01)
## [1] 0.8157895

The classification accuracy rate for the GLM model with the predictors nox, age, dis, ptratio, lstat and medv is 85.53%.

bostonGLM.2 <- glm(crim01~nox+age+dis+ptratio+lstat+medv, data=bostonTrain, family=binomial)
bostonProb.2 = predict(bostonGLM.2, bostonTest, type="response")
bostonPred.3 = rep(0, length(bostonProb.2))
bostonPred.3[bostonProb.2 > 0.5] = 1
table(bostonPred.3, bostonTest$crim01)
##             
## bostonPred.3  0  1
##            0 61 10
##            1 12 69
mean(bostonPred.3 == bostonTest$crim01)
## [1] 0.8552632

The classification accuracy rate for the LDA model with the predictors nox and age is 84.87%.

bostonLDA.2 <- lda(crim01~nox+age, data=bostonTrain)
bostonPred.4 <- predict(bostonLDA.2, bostonTest)
table(bostonPred.4$class, bostonTest$crim01)
##    
##      0  1
##   0 62 12
##   1 11 67
mean(bostonPred.4$class == bostonTest$crim01)
## [1] 0.8486842

The classification accuracy rate for the KNN model with k = 1 and the predictors age, dis and medv is 78.29%.

bostonTrain.KNN <- cbind(bostonTrain$age, bostonTrain$dis, bostonTrain$medv)
bostonTest.KNN <- cbind(bostonTest$age, bostonTest$dis, bostonTest$medv)
set.seed(1)
bostonPred.5 <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=1)
table(bostonPred.5, bostonTest$crim01)
##             
## bostonPred.5  0  1
##            0 55 15
##            1 18 64
mean(bostonPred.5 == bostonTest$crim01)
## [1] 0.7828947

The classification accuracy rate for the KNN model with k = 10 and the same predictors is 80.26%.

set.seed(1)
bostonPred.6 <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=10)
table(bostonPred.6, bostonTest$crim01)
##             
## bostonPred.6  0  1
##            0 59 16
##            1 14 63
mean(bostonPred.6 == bostonTest$crim01)
## [1] 0.8026316

The classification accuracy rate for the KNN model with k = 20 and the same predictors is 82.89%.

set.seed(1)
bostonPred.7 <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=20)
table(bostonPred.7, bostonTest$crim01)
##             
## bostonPred.7  0  1
##            0 60 13
##            1 13 66
mean(bostonPred.7 == bostonTest$crim01)
## [1] 0.8289474
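
Among the models fit above, the logistic regression with nox, age, dis, ptratio, lstat and medv gives the highest test accuracy (about 85.5%), followed by the two LDA fits; the four-predictor logistic regression and the KNN models do somewhat worse on this split. A small sketch that gathers the reported accuracies for a side-by-side view, assuming the prediction objects above:

# Test-set accuracy of each Boston model, for comparison
bostonAccuracy <- c(
  lda_6pred = mean(bostonPred.1$class == bostonTest$crim01),
  glm_4pred = mean(bostonPred.2 == bostonTest$crim01),
  glm_6pred = mean(bostonPred.3 == bostonTest$crim01),
  lda_2pred = mean(bostonPred.4$class == bostonTest$crim01),
  knn_k1    = mean(bostonPred.5 == bostonTest$crim01),
  knn_k10   = mean(bostonPred.6 == bostonTest$crim01),
  knn_k20   = mean(bostonPred.7 == bostonTest$crim01))
sort(bostonAccuracy, decreasing = TRUE)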