This question should be answered using the [Weekly](https://rdrr.io/cran/ISLR/man/Weekly.html) data set, which is part of the ISLR package. This data is similar in nature to the [Smarket](https://rdrr.io/cran/ISLR/man/Smarket.html) data from this chapter's lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
(a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
As seen from the summaries and graphs below, there do not appear to be any major patterns in the data. The only relationship I was able to pick out is that Volume increases as Year increases, meaning the two have a positive correlation.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
attach(Weekly)
plot(Volume) # Volume trends upward over time, consistent with its strong positive correlation with Year
(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
The only predictor that appears to be significant is Lag2, which has a p-value of 0.0296.
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
The confusion matrix compares the classifications produced by our glm.fit with the actual data in order to see how well our model did at predicting the classification. Overall, our model was about 56% accurate, which can be calculated as (54+557)/1089. If we look at Down and Up separately, our model predicted Down correctly only 11% of the time, as calculated by 54/(54+430), while it predicted Up correctly 92% of the time, as calculated by 557/(48+557).
glm.probs = predict(glm.fit, type='response')
contrasts(Direction)
## Up
## Down 0
## Up 1
#we want to assign each observation a classification of Up or Down
glm.pred=rep("Down",1089) # creates a vector with Down 1089 times
glm.pred[glm.probs>0.5]="Up" #if glm.probs is greater than 0.5, the obs gets reassigned as Up
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 54 48
## Up 430 557
mean(glm.pred==Direction) #outputs the overall percentage of correct predictions
## [1] 0.5610652
mean(glm.pred!=Direction) #outputs the misclassification rate
## [1] 0.4389348
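The rates quoted above can also be pulled directly from the table object rather than typed by hand; a small sketch using the objects already created (the numbers match the 56%, 11%, and 92% figures):
conf = table(glm.pred, Direction)                 # confusion matrix saved as an object
sum(diag(conf))/sum(conf)                         # overall accuracy: (54+557)/1089 = 0.561
conf["Down","Down"]/sum(Direction=="Down")        # true Down weeks caught: 54/484 = 0.112
conf["Up","Up"]/sum(Direction=="Up")              # true Up weeks caught: 557/605 = 0.921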
(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
Once the model was refit with Lag2 as the only predictor, we got a higher percentage of correct classifications, 62.5%, on the held-out test data.
#create an index for the training period (1990-2008)
train =(Year<2009)
head(train)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
tail(train)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
#to check numbers in train versus original
Weekly.2009=Weekly[!train,] #this is our testing dataset
dim(Weekly)
## [1] 1089 9
dim(Weekly.2009)
## [1] 104 9
#gets Direction for the testing dataset
Direction.2009=Direction[!train]
glm.fits=glm(Direction~Lag2, data=Weekly, subset=train, family=binomial) #fits model using only `Lag2`
glm.probs=predict(glm.fits, Weekly.2009, type='response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred==Direction.2009) #did what I guessed match what really happened?
## [1] 0.625
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.375
(e) Repeat (d) using LDA.
As seen in the confusion matrix below, LDA produces the same correct classification rate of 62.5% as our logistic regression did.
##LDA Model - will this help us to get a better fit?
library(MASS)
lda.fit=lda(Direction~Lag2, data = Weekly, subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)
lda.pred=predict(lda.fit, Weekly.2009)
lda.class=lda.pred$class
table(lda.class, Direction.2009)
## Direction.2009
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class==Direction.2009)
## [1] 0.625
(f) Repeat (d) using QDA.
The percentage of correct classifications went down with QDA, at 58.65 percent. One thing I found interesting is that, looking at the confusion matrix, QDA picked Up every time.
qda.fit=qda(Direction~Lag2, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit, Weekly.2009)$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Direction.2009)
## [1] 0.5865385
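Because every prediction is Up, that 58.65% accuracy is simply the share of Up weeks in the 2009-2010 test set (61 of 104). A quick check, reusing the objects above:
all(qda.class == "Up")        # TRUE: QDA predicts Up for every test week
mean(Direction.2009 == "Up")  # 0.5865, the proportion of Up weeks in the test set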
(g) Repeat (d) using KNN with K = 1.
KNN with K = 1 classified only 50% of the test observations correctly.
library(class)
train.X=as.matrix(Lag2[train])
test.X=as.matrix(Lag2[!train])
train.Direction=Direction[train]
dim(train.X)
## [1] 985 1
dim(test.X)
## [1] 104 1
dim(train.Direction)
## NULL
length(train.Direction)
## [1] 985
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==Direction.2009)
## [1] 0.5
(h) Which of these methods appears to provide the best results on this data?
Logistic regression and LDA yielded the highest percentage of correct classifications, at 62.5%.
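For a side-by-side comparison, the held-out accuracies from (d)-(g) can be collected into one small data frame (values copied from the outputs above):
data.frame(method = c("Logistic (Lag2)", "LDA", "QDA", "KNN, K=1"),
           accuracy = c(0.625, 0.625, 0.5865, 0.500))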
(i) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
I ran several different models below and found that the logistic regression and LDA models with Lag1^2 and Lag2 gave the best results, at 64.42% correct on the held-out data.
#taking the square root of Lag2 was not beneficial
glm.fit2=glm(Direction~(sqrt(Lag2)), data=Weekly, family=binomial)
## Warning in sqrt(Lag2): NaNs produced
glm.probs=predict(glm.fit2, Weekly.2009, type='response')
## Warning in sqrt(Lag2): NaNs produced
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 21 28
## Up 22 33
mean(glm.pred==Direction.2009) #correctly predicted
## [1] 0.5192308
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.4807692
#Lag1 and Lag2 and the interaction effect
glm.fit3=glm(Direction~Lag1*Lag2, data=Weekly, family=binomial)
glm.probs=predict(glm.fit3, Weekly.2009, type='response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 5 6
## Up 38 55
mean(glm.pred==Direction.2009) #correctly predicted
## [1] 0.5769231
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.4230769
#Lag1^2 and Lag2
glm.fit4=glm(Direction~Lag2+I(Lag1^2), data=Weekly, family=binomial)
glm.probs=predict(glm.fit4, Weekly.2009, type='response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 8 2
## Up 35 59
mean(glm.pred==Direction.2009) #correctly predicted
## [1] 0.6442308
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.3557692
#LDA with Lag1^2 and Lag2
lda.fit2=lda(Direction~Lag2 + I(Lag1^2), data = Weekly, subset = train)
lda.fit2
## Call:
## lda(Direction ~ Lag2 + I(Lag1^2), data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 I(Lag1^2)
## Down -0.03568254 4.964397
## Up 0.26036581 5.318940
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.45006708
## I(Lag1^2) 0.02767883
lda.pred2=predict(lda.fit2, Weekly.2009)
lda.class2=lda.pred2$class
table(lda.class2, Direction.2009)
## Direction.2009
## lda.class2 Down Up
## Down 8 2
## Up 35 59
mean(lda.class2==Direction.2009)
## [1] 0.6442308
mean(lda.class2!=Direction.2009)
## [1] 0.3557692
#QDA with Lag1^2 and Lag2
qda.fit2=qda(Direction~Lag2 + I(Lag1^2), data = Weekly, subset = train)
qda.fit2
## Call:
## qda(Direction ~ Lag2 + I(Lag1^2), data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 I(Lag1^2)
## Down -0.03568254 4.964397
## Up 0.26036581 5.318940
qda.pred=predict(qda.fit2, Weekly.2009)
qda.class=qda.pred$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 32 40
## Up 11 21
mean(qda.class==Direction.2009)
## [1] 0.5096154
mean(qda.class!=Direction.2009)
## [1] 0.4903846
#KNN with different values of K
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=5)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 16 21
## Up 27 40
mean(knn.pred==Direction.2009)
## [1] 0.5384615
mean(knn.pred!=Direction.2009)
## [1] 0.4615385
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=10)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 17 21
## Up 26 40
mean(knn.pred==Direction.2009)
## [1] 0.5480769
mean(knn.pred!=Direction.2009)
## [1] 0.4519231
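For reference, the held-out accuracies from these experiments, gathered into one table (values copied from the outputs above):
data.frame(model = c("logistic: sqrt(Lag2)", "logistic: Lag1*Lag2", "logistic: Lag2 + Lag1^2",
                     "LDA: Lag2 + Lag1^2", "QDA: Lag2 + Lag1^2", "KNN, K=5", "KNN, K=10"),
           accuracy = c(0.519, 0.577, 0.644, 0.644, 0.510, 0.538, 0.548))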
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the [Auto](https://rdrr.io/cran/ISLR/man/Auto.html) data set.
(a) Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
library(ISLR)
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## amc matador : 5
## ford pinto : 5
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
detach(Weekly)
attach(Auto)
mpg01=rep(0, length(mpg)) #creates a vector `mpg01` filled with 0s, one per observation
mpg01[mpg > median(mpg)] = 1 #any row with `mpg` greater than the median is reassigned a value of 1
Auto2 = data.frame(Auto, mpg01) # creates new dataset `Auto2` with added column of `mpg01`
summary(Auto2)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name mpg01
## amc matador : 5 Min. :0.0
## ford pinto : 5 1st Qu.:0.0
## toyota corolla : 5 Median :0.5
## amc gremlin : 4 Mean :0.5
## amc hornet : 4 3rd Qu.:1.0
## chevrolet chevette: 4 Max. :1.0
## (Other) :365
(b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
Using the correlation values and comparing them to the boxplots, the predictors that look like they may be most useful when predicting mpg01 are cylinders, displacement, horsepower, and weight.
pairs(Auto2[,-9]) #not very helpful since mpg01 is binary
cor(Auto2[,-9]) #looks like `cylinders`, `displacement`, `horsepower`, and `weight` may be correlated to `mpg01`
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
par(mfrow=c(2,2))
boxplot(cylinders~mpg01)
boxplot(displacement~mpg01)
boxplot(horsepower~mpg01)
boxplot(weight~mpg01)
(c) Split the data into a training set and a test set.
set.seed(1)
inTrain=sample(1:nrow(Auto2), nrow(Auto2)*0.7) #inTrain is a random 70% of the row indices of Auto2
inTrain
## [1] 324 167 129 299 270 187 307 85 277 362 330 263 329 79 213 37 105
## [18] 217 366 165 290 383 89 289 340 326 382 42 111 20 44 343 70 121
## [35] 40 172 25 248 198 39 298 280 160 14 130 45 22 206 230 193 104
## [52] 367 255 341 342 103 331 13 296 375 176 279 110 84 29 141 252 221
## [69] 108 304 33 347 149 287 102 145 118 323 107 64 224 337 51 325 372
## [86] 138 390 389 282 143 285 170 48 204 295 24 181 214 225 163 43 1
## [103] 328 78 284 116 233 61 86 374 49 242 246 247 239 219 135 364 363
## [120] 310 53 348 65 376 124 77 218 98 194 19 31 174 237 75 16 358
## [137] 9 50 92 122 152 386 207 244 229 350 355 391 223 373 309 140 126
## [154] 349 344 319 258 15 271 388 195 201 318 17 212 127 133 41 384 392
## [171] 159 117 72 36 315 294 157 378 313 306 272 106 185 88 281 228 238
## [188] 368 80 30 93 234 220 240 369 164 168 243 200 184 260 100 113 359
## [205] 73 27 333 235 38 62 134 132 35 125 99 267 269 71 153 262 377
## [222] 28 183 148 308 227 365 60 171 354 173 12 202 305 371 265 26 322
## [239] 334 208 288 297 357 249 210 278 82 97 264 250 56 216 101 336 259
## [256] 2 192 131 275 169 292 291 317 83 90 23 68 222 190 91 57 311
## [273] 345 52
training = Auto2[inTrain, -1] #training dataset: the rows in inTrain (random 70%) and all columns except mpg (first column)
summary(training)
## cylinders displacement horsepower weight
## Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1649
## 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 78.0 1st Qu.:2224
## Median :4.000 Median :151.0 Median : 93.5 Median :2790
## Mean :5.464 Mean :193.2 Mean :103.8 Mean :2967
## 3rd Qu.:8.000 3rd Qu.:261.5 3rd Qu.:123.8 3rd Qu.:3612
## Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin
## Min. : 8.00 Min. :70.00 Min. :1.00
## 1st Qu.:14.00 1st Qu.:73.00 1st Qu.:1.00
## Median :15.50 Median :76.00 Median :1.00
## Mean :15.66 Mean :76.06 Mean :1.54
## 3rd Qu.:17.27 3rd Qu.:79.00 3rd Qu.:2.00
## Max. :24.80 Max. :82.00 Max. :3.00
##
## name mpg01
## amc gremlin : 4 Min. :0.0000
## amc hornet : 4 1st Qu.:0.0000
## amc matador : 3 Median :1.0000
## chevrolet caprice classic: 3 Mean :0.5073
## chevrolet chevette : 3 3rd Qu.:1.0000
## chevrolet impala : 3 Max. :1.0000
## (Other) :254
dim(training)
## [1] 274 9
testing = Auto2[-inTrain, -1] #testing dataset - everything not in training dataset
dim(testing)
## [1] 118 9
dim(Auto2)
## [1] 392 10
test.mpg01=mpg01[-inTrain] #vector of mpg01 for the test dataset
train.mpg01=mpg01[inTrain] #vector of mpg01 for the training dataset
length(test.mpg01)
## [1] 118
length(train.mpg01)
## [1] 274
(d) Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
This LDA model yields an 11.86% misclassification rate, which means it predicted about 88% of the test observations correctly, which seems pretty good.
library(MASS)
lda.fit=lda(mpg01~cylinders+displacement+horsepower+weight, data = Auto2, subset=inTrain)
lda.fit
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2,
## subset = inTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4927007 0.5072993
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.777778 271.9333 129.13333 3611.052
## 1 4.187050 116.8129 79.27338 2342.165
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.3962357999
## displacement -0.0047630097
## horsepower 0.0061919395
## weight -0.0008321338
plot(lda.fit)
lda.pred=predict(lda.fit, testing)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class #assigns classification for our predictions
table(lda.class, test.mpg01) #compares our predictions with the real results for our test data
## test.mpg01
## lda.class 0 1
## 0 50 3
## 1 11 54
mean(lda.class==test.mpg01) #correctly predicted
## [1] 0.8813559
mean(lda.class!=test.mpg01) #misclassification rate
## [1] 0.1186441
(e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
The results from QDA were very much like LDA, with the same 11.86% misclassification rate (though the confusion matrices differ slightly).
qda.fit=qda(mpg01~cylinders+displacement+horsepower+weight, data = Auto2, subset=inTrain)
qda.fit
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2,
## subset = inTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4927007 0.5072993
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.777778 271.9333 129.13333 3611.052
## 1 4.187050 116.8129 79.27338 2342.165
qda.class=predict(qda.fit,testing)$class
#names(qda.pred)
table(qda.class, test.mpg01)
## test.mpg01
## qda.class 0 1
## 0 52 5
## 1 9 52
mean(qda.class==test.mpg01) #correctly predicted
## [1] 0.8813559
mean(qda.class!=test.mpg01) #misclassification rate
## [1] 0.1186441
(f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
Logistic regression resulted in a 9.3% misclassification rate, which means about 91% were predicted correctly - this is the best so far.
glm.fit=glm(mpg01~cylinders + displacement + horsepower + weight, data = Auto2, subset=inTrain, family=binomial)
glm.probs=predict(glm.fit, testing, type='response')
glm.pred=rep(0,length(glm.probs))
glm.pred[glm.probs>0.5]=1
table(glm.pred, test.mpg01)
## test.mpg01
## glm.pred 0 1
## 0 53 3
## 1 8 54
mean(glm.pred==test.mpg01) #correctly predicted
## [1] 0.9067797
mean(glm.pred!=test.mpg01) #misclassification rate
## [1] 0.09322034
(g) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?
I ran KNN for several values of K, with the best result being K = 5 at a 12.71% misclassification rate. K = 1 gave a 13.56% misclassification rate, K = 10 gave 14.41%, and K = 100 gave 15.25%.
train.X=cbind(training$cylinders, training$weight, training$displacement, training$horsepower)
test.X=cbind(testing$cylinders, testing$weight, testing$displacement, testing$horsepower)
#k=1
knn.pred=knn(train.X, test.X, train.mpg01)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8644068
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1355932
#k=5
knn.pred=knn(train.X, test.X, train.mpg01, k=5)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8728814
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1271186
#k=10
knn.pred=knn(train.X, test.X, train.mpg01, k=10)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8559322
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1440678
#k=100
knn.pred=knn(train.X, test.X, train.mpg01, k=100)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8474576
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1525424
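The same sweep over K can be written more compactly as a loop; this is just a sketch reusing train.X, test.X, and train.mpg01 from above (without a fixed seed the errors may differ slightly from run to run, since ties in KNN are broken at random):
ks = c(1, 5, 10, 100)
err = sapply(ks, function(k) {
  pred = knn(train.X, test.X, train.mpg01, k = k)  # refit KNN for each K
  mean(pred != test.mpg01)                         # test misclassification rate
})
data.frame(K = ks, error = err)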
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
Below are the results for logistic regression, LDA, QDA, and KNN models. I found that LDA and KNN with K = 1 yielded the lowest error rate of 13.82%, followed by QDA at 14.47%, logistic regression at 15.79%, and then by the other KNN models with various values for K.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## ptratio black lstat medv
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046
## zn -0.3916785 0.17552032 -0.4129946 0.3604453
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252
## chas -0.1215152 0.04878848 -0.0539293 0.1752602
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208
## rm -0.3555015 0.12806864 -0.6138083 0.6953599
## age 0.2615150 -0.27353398 0.6023385 -0.3769546
## dis -0.2324705 0.29151167 -0.4969958 0.2499287
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867
## black -0.1773833 1.00000000 -0.3660869 0.3334608
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627
## medv -0.5077867 0.33346082 -0.7376627 1.0000000
Setting up the new variable crim_ind and the new Boston2 dataset.
attach(Boston)
crim_ind=rep(0, length(crim)) #create a binary crime indicator
crim_ind[crim>median(crim)]=1 # reassign rows with crime rate greater than the median to 1
Boston2 = data.frame(Boston, crim_ind) #Boston2 dataset has the new crim_ind variable added
summary(Boston2)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim_ind
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
**Create training and testing datasets.**
set.seed(1)
train = sample(1:nrow(Boston2), 0.7*nrow(Boston2)) #train is a random 70% of the Boston2 row indices, used for training
Boston2.train = Boston2[train,] #training dataset
Boston2.test = Boston2[-train,] #testing dataset
dim(Boston2.train)
## [1] 354 15
dim(Boston2.test)
## [1] 152 15
crim_ind.test=crim_ind[-train] # create the crim indicator vector for the Boston2.test testing dataset
crim_ind.test
## [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1
## [71] 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 0 0 1 0 0 0
Logistic Regression with all predictors
#fit for all predictors
glm.fit=glm(crim_ind~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data=Boston2, subset=train, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = crim_ind ~ zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat + medv, family = binomial,
## data = Boston2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1363 -0.1504 -0.0009 0.0018 3.5797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.731502 8.009551 -5.335 9.55e-08 ***
## zn -0.105323 0.045585 -2.310 0.020862 *
## indus -0.066308 0.057649 -1.150 0.250057
## chas 0.187896 0.810675 0.232 0.816711
## nox 56.344292 10.073420 5.593 2.23e-08 ***
## rm -0.234402 0.900292 -0.260 0.794585
## age 0.022052 0.014304 1.542 0.123141
## dis 1.143918 0.303909 3.764 0.000167 ***
## rad 0.657976 0.184373 3.569 0.000359 ***
## tax -0.006299 0.003239 -1.944 0.051849 .
## ptratio 0.292158 0.155755 1.876 0.060689 .
## black -0.008703 0.005212 -1.670 0.094933 .
## lstat 0.112414 0.060168 1.868 0.061715 .
## medv 0.191745 0.087816 2.184 0.028999 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.65 on 353 degrees of freedom
## Residual deviance: 140.41 on 340 degrees of freedom
## AIC: 168.41
##
## Number of Fisher Scoring iterations: 9
Logistic Regression with significant predictors
# fit with only the predictors that are significant - zn, nox, dis, rad and medv
glm.fit2=glm(crim_ind~ zn+nox+dis+rad+medv, data=Boston2, subset=train, family=binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = crim_ind ~ zn + nox + dis + rad + medv, family = binomial,
## data = Boston2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0766 -0.3226 -0.0034 0.0069 3.2715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.10487 4.91615 -6.124 9.14e-10 ***
## zn -0.09780 0.03537 -2.765 0.005698 **
## nox 42.65837 7.07436 6.030 1.64e-09 ***
## dis 0.84586 0.23756 3.561 0.000370 ***
## rad 0.49353 0.13194 3.741 0.000184 ***
## medv 0.08028 0.02906 2.763 0.005730 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.65 on 353 degrees of freedom
## Residual deviance: 164.99 on 348 degrees of freedom
## AIC: 176.99
##
## Number of Fisher Scoring iterations: 8
glm.probs=predict(glm.fit2, Boston2.test, type='response')
glm.pred=rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crim_ind.test)
## crim_ind.test
## glm.pred 0 1
## 0 60 11
## 1 13 68
mean(glm.pred == crim_ind.test)
## [1] 0.8421053
mean(glm.pred != crim_ind.test)
## [1] 0.1578947
LDA
#LDA with the significant predictors
lda.fit=lda(crim_ind~zn+nox+dis+rad+medv, data=Boston2, subset=train)
lda.fit
## Call:
## lda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5084746 0.4915254
##
## Group means:
## zn nox dis rad medv
## 0 21.188889 0.4695150 5.070315 4.127778 25.37056
## 1 1.172414 0.6377989 2.537681 14.873563 20.44770
##
## Coefficients of linear discriminants:
## LD1
## zn -0.010534928
## nox 9.134260498
## dis 0.008656437
## rad 0.076273497
## medv 0.029024616
lda.pred = predict(lda.fit, Boston2.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class, crim_ind.test)
## crim_ind.test
## lda.class 0 1
## 0 71 19
## 1 2 60
mean(lda.class == crim_ind.test)
## [1] 0.8618421
mean(lda.class != crim_ind.test)
## [1] 0.1381579
QDA
#QDA
qda.fit=qda(crim_ind~zn+nox+dis+rad+medv, data=Boston2, subset=train)
qda.fit
## Call:
## qda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5084746 0.4915254
##
## Group means:
## zn nox dis rad medv
## 0 21.188889 0.4695150 5.070315 4.127778 25.37056
## 1 1.172414 0.6377989 2.537681 14.873563 20.44770
qda.pred = predict(qda.fit, Boston2.test)
names(qda.pred)
## [1] "class" "posterior"
qda.class=qda.pred$class
table(qda.class, crim_ind.test)
## crim_ind.test
## qda.class 0 1
## 0 66 15
## 1 7 64
mean(qda.class== crim_ind.test)
## [1] 0.8552632
mean(qda.class!=crim_ind.test)
## [1] 0.1447368
#KNN classification
library(class)
train.X=cbind(nox,rad,ptratio,medv)[train,]
test.X=cbind(nox,rad,ptratio,medv)[-train,]
train.crim_ind = crim_ind[train]
dim(train.X)
## [1] 354 4
dim(test.X)
## [1] 152 4
dim(train.crim_ind)
## NULL
set.seed(1)
#KNN=1
knn.pred=knn(train.X, test.X, train.crim_ind, k=1)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 58 6
## 1 15 73
mean(knn.pred== crim_ind.test)
## [1] 0.8618421
mean(knn.pred!=crim_ind.test)
## [1] 0.1381579
#KNN=5
knn.pred=knn(train.X, test.X, train.crim_ind, k=5)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 61 12
## 1 12 67
mean(knn.pred== crim_ind.test)
## [1] 0.8421053
mean(knn.pred!=crim_ind.test)
## [1] 0.1578947
#KNN=10
knn.pred=knn(train.X, test.X, train.crim_ind, k=10)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 67 20
## 1 6 59
mean(knn.pred== crim_ind.test)
## [1] 0.8289474
mean(knn.pred!=crim_ind.test)
## [1] 0.1710526
#KNN=100
knn.pred=knn(train.X, test.X, train.crim_ind, k=100)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 73 39
## 1 0 40
mean(knn.pred== crim_ind.test)
## [1] 0.7434211
mean(knn.pred!=crim_ind.test)
## [1] 0.2565789
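To summarize, the test error rates for the Boston crime models above, collected into one table (values copied from the outputs):
data.frame(model = c("logistic (zn, nox, dis, rad, medv)", "LDA", "QDA",
                     "KNN, K=1", "KNN, K=5", "KNN, K=10", "KNN, K=100"),
           test.error = c(0.1579, 0.1382, 0.1447, 0.1382, 0.1579, 0.1711, 0.2566))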