Assignment #3

library(ISLR)
library(MASS)
library(class)

# Fix the random seed so later steps that draw random numbers are repeatable.
set.seed(1)
# Load the Weekly data set and print summary statistics for every column.
data("Weekly")
summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

# Correlation matrix of the numeric columns; Direction (a factor) is excluded.
cor(Weekly[, names(Weekly) != "Direction"])

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

# Scatterplot matrix of every pair of columns in Weekly.
pairs(Weekly)

summary() shows that the five lag variables and Today have nearly identical distributions. The pairwise correlations are all weak, with one exception: Year and Volume are strongly positively correlated (approximately 0.84).

# Logistic regression of Direction on the five lags and Volume, fitted on the
# full Weekly data set; summary() prints the coefficient table.
lg_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(lg_fit)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Lag2 is the only predictor that is statistically significant at the 5% level (p ≈ 0.030, marked with *).

# Fitted probabilities for the same observations the model was trained on.
lg_prediction <- predict(lg_fit, type = "response")
# Classify as "Up" when the fitted probability exceeds 0.5, otherwise "Down".
lg_prediction_results <- unname(ifelse(lg_prediction > 0.5, "Up", "Down"))
# Confusion matrix: rows are predictions, columns are the observed directions.
table(lg_prediction_results, Weekly$Direction)

##                      
## lg_prediction_results Down  Up
##                  Down   54  48
##                  Up    430 557

The confusion matrix tells us that the training error rate is approximately 43.89%, as computed below (the model is evaluated on the same data it was fitted to, so this is not a test error rate). It can also be observed that when the market went “Up” the predictions were right approximately 92.07% (557 / 605) of the time; conversely, when the market went “Down” the predictions were right approximately 11.16% (54 / 484) of the time.

# Fraction of observations misclassified. The predictions come from the same
# data the model was fitted on, so this is a training error rate.
mean(lg_prediction_results != Weekly$Direction)

## [1] 0.4389348

# Split by year: observations before 2009 are used for fitting, the remaining
# years are held out for testing.
weekly_training_data <- Weekly[Weekly$Year < 2009, ]
weekly_testing_data <- Weekly[Weekly$Year > 2008, ]

# Logistic regression with Lag2 as the only predictor, fitted on the training rows.
lg_fit_2 <- glm(Direction ~ Lag2, family = "binomial", data = weekly_training_data)

# Predicted probabilities on the held-out rows, classified with a 0.5 cutoff.
lg_prediction_2 <- predict(lg_fit_2, newdata = weekly_testing_data, type = "response")
lg_prediction_results_2 <- unname(ifelse(lg_prediction_2 > 0.5, "Up", "Down"))

# Confusion matrix: rows are predictions, columns are the observed directions.
table(lg_prediction_results_2, weekly_testing_data$Direction)

##                        
## lg_prediction_results_2 Down Up
##                    Down    9  5
##                    Up     34 56

# Overall fraction of held-out observations predicted correctly.
mean(lg_prediction_results_2 == weekly_testing_data$Direction)

## [1] 0.625

# LDA with Lag2 as the only predictor, fitted on the training rows and
# evaluated on the held-out rows.
lda_fit <- lda(Direction ~ Lag2, data = weekly_training_data)
# predict() for lda objects has no `type` argument; the value previously passed
# was silently ignored, so it is omitted here.
lda_prediction <- predict(lda_fit, weekly_testing_data)
# Confusion matrix: rows are predicted classes, columns are observed directions.
table(lda_prediction$class, weekly_testing_data$Direction)

##       
##        Down Up
##   Down    9  5
##   Up     34 56

# Overall fraction of held-out observations the LDA model predicted correctly.
mean(lda_prediction$class == weekly_testing_data$Direction)

## [1] 0.625

As can be seen, the output resembles that of the logistic regression model.

# QDA with Lag2 as the only predictor, fitted on the training rows and
# evaluated on the held-out rows.
qda_fit <- qda(Direction ~ Lag2, data = weekly_training_data)
# predict() for qda objects has no `type` argument; the value previously passed
# was silently ignored, so it is omitted here.
qda_prediction <- predict(qda_fit, weekly_testing_data)
# Confusion matrix: rows are predicted classes, columns are observed directions.
table(qda_prediction$class, weekly_testing_data$Direction)

##       
##        Down Up
##   Down    0  0
##   Up     43 61

# Overall fraction of held-out observations the QDA model predicted correctly.
mean(qda_prediction$class == weekly_testing_data$Direction)

## [1] 0.5865385

The overall fraction of correct predictions decreased with this model.

# knn() takes matrices of predictors, so Lag2 is wrapped as a one-column matrix
# for both the training and the held-out rows.
knn_weekly_training_data <- as.matrix(weekly_training_data$Lag2)
knn_weekly_testing_data <- as.matrix(weekly_testing_data$Lag2)
# Class labels for the training rows.
knn_weekly_training_data_direction <- as.matrix(weekly_training_data$Direction)

# 1-nearest-neighbour classification of the held-out rows.
knn_weekly_prediction <- knn(
  train = knn_weekly_training_data,
  test = knn_weekly_testing_data,
  cl = knn_weekly_training_data_direction,
  k = 1
)
# Confusion matrix: rows are predictions, columns are observed directions.
table(knn_weekly_prediction, weekly_testing_data$Direction)

##                      
## knn_weekly_prediction Down Up
##                  Down   21 30
##                  Up     22 31

# Overall fraction of held-out observations the K = 1 KNN model predicted correctly.
mean(knn_weekly_prediction == weekly_testing_data$Direction)

## [1] 0.5

While the model was right only 50% of the time, a decrease from the prior models, it can be seen that the model improved when the market went “Down” (21 of 43 correct).

If we base it on the overall fraction of correct predictions, it can be observed that logistic regression and LDA gave a higher percentage than QDA, which in turn did better than KNN, which came in last. Again, it should be noted that KNN showed improvement over the others when the market went “Down”.

# LDA with a Lag2:Volume interaction term.
# NOTE: despite the `_lr_` in the variable names, this model is fitted with
# lda(), not glm(); the `$class` element used below comes from the LDA prediction.
fit_interaction_lr_1 <- lda(Direction ~ Lag2:Volume, data = weekly_training_data)
# predict() for lda objects has no `type` argument; the value previously passed
# was silently ignored, so it is omitted here.
prediction_interaction_lr_1 <- predict(fit_interaction_lr_1, weekly_testing_data)
# Confusion matrix: rows are predicted classes, columns are observed directions.
table(prediction_interaction_lr_1$class, weekly_testing_data$Direction)

##       
##        Down Up
##   Down    8  6
##   Up     35 55

# Overall fraction of held-out observations the interaction model predicted correctly.
mean(prediction_interaction_lr_1$class == weekly_testing_data$Direction)

## [1] 0.6057692

# QDA with transformation
# The single predictor is sqrt(abs(Lag2)); abs() keeps the square root defined
# for negative Lag2 values.
fit_qda_transformation <- qda(Direction ~ sqrt(abs(Lag2)), data = weekly_training_data)
prediction_qda_transformation <- predict(fit_qda_transformation, weekly_testing_data)
# Confusion matrix: rows are predicted classes, columns are observed directions.
table(prediction_qda_transformation$class, weekly_testing_data$Direction)

##       
##        Down Up
##   Down    0  0
##   Up     43 61

# Overall fraction of held-out observations the transformed QDA model predicted correctly.
mean(prediction_qda_transformation$class == weekly_testing_data$Direction)

## [1] 0.5865385

Of the interaction model (fitted with lda() above) and QDA with transformation, the interaction model had the higher overall fraction of correct predictions, approximately 60.58%.

# KNN with different values of K
# Test error rate for each K from 1 to 50, using the Lag2 matrices and class
# labels prepared earlier. vapply() replaces the index loop and guarantees one
# numeric value per K.
errors <- vapply(
  seq_len(50),
  function(k) {
    predicted <- knn(
      knn_weekly_training_data,
      knn_weekly_testing_data,
      knn_weekly_training_data_direction,
      k = k
    )
    mean(predicted != weekly_testing_data$Direction)
  },
  numeric(1)
)

# Points with a connecting line. `pch` has no effect in lines(), so it is dropped.
plot(errors, xlab = "K value", ylab = "Test error")
lines(errors)

# Lowest test error rate among the K values tried.
min(errors)

## [1] 0.3846154

# Every K whose test error equals the minimum (ties are all reported).
which(errors == min(errors))

## [1] 47

As can be seen, K = 47 gave the lowest test error rate of approximately 38.46%.

# Load the Auto data set and print summary statistics for every column.
data("Auto")

summary(Auto)

##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365

# Binary response: 1 when mpg is above the median mpg, 0 otherwise.
mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))
# Auto with the new indicator appended as an extra column.
Auto_part_a <- data.frame(Auto, mpg01 = mpg01)
summary(Auto_part_a)

##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365  
##      mpg01    
##  Min.   :0.0  
##  1st Qu.:0.0  
##  Median :0.5  
##  Mean   :0.5  
##  3rd Qu.:1.0  
##  Max.   :1.0  
##

# Correlation matrix of the numeric columns; the `name` factor is excluded.
cor(Auto_part_a[, names(Auto_part_a) != "name"])

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

# Scatterplot matrix of every pair of columns, including mpg01.
pairs(Auto_part_a)

# One boxplot per candidate predictor, split by mpg01. The vector names are the
# column names and the values are the plot titles.
boxplot_titles <- c(
  displacement = "Displacement",
  weight = "Weight",
  cylinders = "Cylinders",
  horsepower = "Horsepower"
)
for (predictor in names(boxplot_titles)) {
  boxplot(
    reformulate("mpg01", response = predictor),
    data = Auto_part_a,
    main = boxplot_titles[[predictor]]
  )
}

Based off the outputs above, some associations between mpg01 and cylinders, weight, horsepower, and displacement can be observed.

# Reproducible 75% / 25% random split of Auto_part_a into training and test rows.
set.seed(1)
training_amount <- 0.75 * nrow(Auto_part_a)
training_index <- sample.int(nrow(Auto_part_a), size = training_amount)
# Negative indexing selects every row that was not drawn for training.
training_data <- Auto_part_a[training_index, ]
testing_data <- Auto_part_a[-training_index, ]

library(MASS)

# LDA for mpg01 on the four selected predictors, fitted on the training split.
# Note: this reassigns `lda_fit`, replacing the Weekly model stored under that name.
lda_fit <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
# Print priors, group means and discriminant coefficients.
lda_fit

## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4863946 0.5136054 
## 
## Group means:
##   cylinders   weight displacement horsepower
## 0  6.804196 3625.434     273.8881  129.60839
## 1  4.198675 2347.728     118.0265   79.66225
## 
## Coefficients of linear discriminants:
##                        LD1
## cylinders    -0.3975941148
## weight       -0.0008601142
## displacement -0.0038389582
## horsepower    0.0048106758

# Classify the test split with the LDA model.
lda_prediction <- predict(lda_fit, newdata = testing_data)
# Confusion matrix: rows are predicted classes, columns are the observed mpg01.
table(lda_prediction$class, testing_data$mpg01)

##    
##      0  1
##   0 42  2
##   1 11 43

# LDA test error rate: fraction of test rows misclassified.
mean(lda_prediction$class != testing_data$mpg01)

## [1] 0.1326531

As shown, the test error rate is approximately 13.27%.

# QDA for mpg01 on the same four predictors, fitted on the training split.
# Note: this reassigns `qda_fit`, replacing the Weekly model stored under that name.
qda_fit <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
# Print priors and group means.
qda_fit

## Call:
## qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4863946 0.5136054 
## 
## Group means:
##   cylinders   weight displacement horsepower
## 0  6.804196 3625.434     273.8881  129.60839
## 1  4.198675 2347.728     118.0265   79.66225

# Classify the test split with the QDA model.
qda_prediction <- predict(qda_fit, newdata = testing_data)
# Confusion matrix: rows are predicted classes, columns are the observed mpg01.
table(qda_prediction$class, testing_data$mpg01)

##    
##      0  1
##   0 45  4
##   1  8 41

# QDA test error rate: fraction of test rows misclassified.
mean(qda_prediction$class != testing_data$mpg01)

## [1] 0.122449

As shown, the given test error rate is approximately 12.24%.

# Logistic regression for mpg01 on the same four predictors, fitted on the
# training split.
lr_fit <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, family = binomial, data = training_data)
# Print the call, coefficients and deviance summary.
lr_fit

## 
## Call:  glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower, 
##     family = binomial, data = training_data)
## 
## Coefficients:
##  (Intercept)     cylinders        weight  displacement    horsepower  
##    11.114795      0.018886     -0.001956     -0.012320     -0.035758  
## 
## Degrees of Freedom: 293 Total (i.e. Null);  289 Residual
## Null Deviance:       407.4 
## Residual Deviance: 161.4     AIC: 171.4

# Predicted probabilities that mpg01 == 1 for the test split.
lr_prediction <- predict(lr_fit, newdata = testing_data, type = "response")
# Rounding to the nearest integer turns each probability into a 0/1 label.
lr_prediction_final <- round(lr_prediction, digits = 0)
# Confusion matrix: rows are predicted labels, columns are the observed mpg01.
table(lr_prediction_final, testing_data$mpg01)

##                    
## lr_prediction_final  0  1
##                   0 45  2
##                   1  8 43

# Logistic regression test error rate: fraction of test rows misclassified.
mean(lr_prediction_final != testing_data$mpg01)

## [1] 0.1020408

It can be seen that the test error rate is approximately 10.20%.

# KNN (K = 1) for mpg01 on the Auto training split.
# NOTE(review): this predictor set uses acceleration where the LDA, QDA and
# logistic fits use displacement, and the columns are not standardized, so the
# large-valued weight column likely dominates the distances. Confirm this is
# intended before comparing error rates across models.
knn_predictors <- c("cylinders", "horsepower", "weight", "acceleration")
knn_training_data <- training_data[, knn_predictors]
knn_testing_data <- testing_data[, knn_predictors]
knn_prediction <- knn(
  train = knn_training_data,
  test = knn_testing_data,
  cl = training_data$mpg01,
  k = 1
)
# Confusion matrix: rows are predicted labels, columns are the observed mpg01.
table(knn_prediction, testing_data$mpg01)

##               
## knn_prediction  0  1
##              0 44  8
##              1  9 37

# KNN (K = 1) test error rate: fraction of test rows misclassified.
mean(knn_prediction != mpg01[-training_index])

## [1] 0.1734694

Here, the test error rate is approximately 17.35% for K=1. Compared to previous model outputs, this is an increase in test error rate which is not desirable. In turn, we conduct a loop below to find a better value for K.

# Test error rate for each K from 1 to 50 on the Auto split. vapply() replaces
# the index loop and guarantees one numeric value per K.
errors <- vapply(
  seq_len(50),
  function(k) {
    predicted <- knn(knn_training_data, knn_testing_data, mpg01[training_index], k = k)
    mean(predicted != mpg01[-training_index])
  },
  numeric(1)
)

# Points with a connecting line. `pch` has no effect in lines(), so it is dropped.
plot(errors, xlab = "K value", ylab = "Test error")
lines(errors)

# Lowest test error rate among the K values tried.
min(errors)

## [1] 0.1020408

# Every K whose test error equals the minimum (ties are all reported).
which(errors == min(errors))

## [1] 3

As shown, K = 3 gave the lowest test error rate of approximately 10.20%. As depicted in the plot above, the next lowest test error rate was given by K = 5 of approximately 11.22% (outputted below).

# Test error rate recorded for K = 5.
errors[5]

## [1] 0.1122449

# Load the Boston data set and print summary statistics for every column.
data("Boston")

summary(Boston)

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

# Binary response: 1 when crim is above the median crime rate, 0 otherwise.
crim01 <- as.numeric(Boston$crim > median(Boston$crim))
boston_df <- data.frame(Boston, crim01)

# Reproducible 75% / 25% random split. 75% of the row count is fractional for
# this data set, so it is floored explicitly instead of relying on sample()
# truncating a non-integer size.
set.seed(1)
boston_training_amount <- floor(nrow(boston_df) * 0.75)
boston_training_index <- sample(nrow(boston_df), size = boston_training_amount)
# Negative indexing selects every row that was not drawn for training.
boston_training_data <- boston_df[boston_training_index, ]
boston_testing_data <- boston_df[-boston_training_index, ]

# Correlation matrix of all columns, including the crim01 indicator.
cor(boston_df)

##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
## crim01   0.40939545 -0.43615103  0.60326017  0.070096774  0.72323480
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01  -0.15637178  0.61393992 -0.61634164  0.619786249  0.60874128  0.2535684
##               black      lstat       medv      crim01
## crim    -0.38506394  0.4556215 -0.3883046  0.40939545
## zn       0.17552032 -0.4129946  0.3604453 -0.43615103
## indus   -0.35697654  0.6037997 -0.4837252  0.60326017
## chas     0.04878848 -0.0539293  0.1752602  0.07009677
## nox     -0.38005064  0.5908789 -0.4273208  0.72323480
## rm       0.12806864 -0.6138083  0.6953599 -0.15637178
## age     -0.27353398  0.6023385 -0.3769546  0.61393992
## dis      0.29151167 -0.4969958  0.2499287 -0.61634164
## rad     -0.44441282  0.4886763 -0.3816262  0.61978625
## tax     -0.44180801  0.5439934 -0.4685359  0.60874128
## ptratio -0.17738330  0.3740443 -0.5077867  0.25356836
## black    1.00000000 -0.3660869  0.3334608 -0.35121093
## lstat   -0.36608690  1.0000000 -0.7376627  0.45326273
## medv     0.33346082 -0.7376627  1.0000000 -0.26301673
## crim01  -0.35121093  0.4532627 -0.2630167  1.00000000

# LDA using every column except the two crime variables (crim and the response
# crim01 derived from it).
boston_lda_fit <- lda(crim01 ~ . - crim01 - crim, data = boston_training_data)
boston_prediction_lda <- predict(boston_lda_fit, newdata = boston_testing_data)
# Confusion matrix: rows are predicted classes, columns are the observed crim01.
table(boston_prediction_lda$class, boston_testing_data$crim01)

##    
##      0  1
##   0 59 15
##   1  4 49

# Print priors, group means and discriminant coefficients of the LDA fit.
boston_lda_fit

## Call:
## lda(crim01 ~ . - crim01 - crim, data = boston_training_data)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5013193 0.4986807 
## 
## Group means:
##          zn     indus       chas       nox       rm      age      dis       rad
## 0 21.757895  7.103263 0.06315789 0.4694826 6.443258 50.56105 5.060515  4.157895
## 1  1.079365 15.403016 0.08994709 0.6363175 6.174714 85.66190 2.540708 14.968254
##        tax  ptratio    black     lstat     medv
## 0 307.4632 17.87579 388.0490  9.241211 25.34895
## 1 512.4550 19.10317 320.9546 15.866455 20.26032
## 
## Coefficients of linear discriminants:
##                   LD1
## zn      -0.0085230246
## indus    0.0150302363
## chas    -0.1728111352
## nox      8.6054129758
## rm       0.1119278355
## age      0.0121565634
## dis      0.1284274358
## rad      0.0745769280
## tax     -0.0010170190
## ptratio  0.0657333031
## black   -0.0009543915
## lstat    0.0279785001
## medv     0.0460937756

# Test error rate of the all-predictor LDA model.
mean(boston_prediction_lda$class != boston_testing_data$crim01)

## [1] 0.1496063

For the LDA model after only removing the crime variables, we arrived at a test error rate of approximately 14.96%.

# LDA on a reduced predictor set: chas, nox, rm, ptratio and medv.
boston_lda_fit_2 <- lda(
  crim01 ~ chas + nox + rm + ptratio + medv,
  data = boston_training_data
)
boston_prediction_lda_2 <- predict(boston_lda_fit_2, newdata = boston_testing_data)
# Confusion matrix: rows are predicted classes, columns are the observed crim01.
table(boston_prediction_lda_2$class, boston_testing_data$crim01)

##    
##      0  1
##   0 56 16
##   1  7 48

# Test error rate of the five-predictor LDA model.
mean(boston_prediction_lda_2$class != boston_testing_data$crim01)

## [1] 0.1811024

For the LDA model after removing some of the predictors, we arrived at a test error rate of approximately 18.11%.

# LDA on four predictors: chas, nox, dis and tax.
boston_lda_fit_3 <- lda(
  crim01 ~ chas + nox + dis + tax,
  data = boston_training_data
)
boston_prediction_lda_3 <- predict(boston_lda_fit_3, newdata = boston_testing_data)
# Confusion matrix: rows are predicted classes, columns are the observed crim01.
table(boston_prediction_lda_3$class, boston_testing_data$crim01)

##    
##      0  1
##   0 59 15
##   1  4 49

# Test error rate of the four-predictor LDA model.
mean(boston_prediction_lda_3$class != boston_testing_data$crim01)

## [1] 0.1496063

The LDA model here was able to be reduced to 4 predictors while giving the same test error rate of approximately 14.96% as when only the crime variables were removed.

# Logistic regression using every column except the two crime variables.
boston_lr_fit_1 <- glm(crim01 ~ . - crim01 - crim, data = boston_training_data, family = binomial)
# Print the call, coefficients and deviance summary.
boston_lr_fit_1

## 
## Call:  glm(formula = crim01 ~ . - crim01 - crim, family = binomial, 
##     data = boston_training_data)
## 
## Coefficients:
## (Intercept)           zn        indus         chas          nox           rm  
##  -44.511620    -0.115935    -0.085428     0.204896    60.075029    -0.422718  
##         age          dis          rad          tax      ptratio        black  
##    0.027567     1.262936     0.641510    -0.006109     0.309168    -0.009583  
##       lstat         medv  
##    0.111155     0.210806  
## 
## Degrees of Freedom: 378 Total (i.e. Null);  365 Residual
## Null Deviance:       525.4 
## Residual Deviance: 147.5     AIC: 175.5

# Predicted probabilities that crim01 == 1 for the test split.
boston_prediction_lr_1 <- predict(boston_lr_fit_1, newdata = boston_testing_data, type = "response")
# Rounding to the nearest integer turns each probability into a 0/1 label.
lr_prediction_final_1 <- round(boston_prediction_lr_1, digits = 0)
# Confusion matrix: rows are predicted labels, columns are the observed crim01.
table(lr_prediction_final_1, boston_testing_data$crim01)

##                      
## lr_prediction_final_1  0  1
##                     0 51  5
##                     1 12 59

# Test error rate of the all-predictor logistic regression model.
mean(lr_prediction_final_1 != boston_testing_data$crim01)

## [1] 0.1338583

For the logistic regression model after only removing the crime variables, the test error rate was improved from that of the LDA model to approximately 13.39%.

# Logistic regression on the reduced predictor set: chas, nox, rm, ptratio, medv.
boston_lr_fit_2 <- glm(
  crim01 ~ chas + nox + rm + ptratio + medv,
  family = binomial,
  data = boston_training_data
)
# Predicted probabilities on the test split, rounded to 0/1 labels.
boston_prediction_lr_2 <- predict(boston_lr_fit_2, newdata = boston_testing_data, type = "response")
lr_prediction_final_2 <- round(boston_prediction_lr_2, digits = 0)
# Confusion matrix: rows are predicted labels, columns are the observed crim01.
table(lr_prediction_final_2, boston_testing_data$crim01)

##                      
## lr_prediction_final_2  0  1
##                     0 57 12
##                     1  6 52

# Test error rate of the five-predictor logistic regression model.
mean(lr_prediction_final_2 != boston_testing_data$crim01)

## [1] 0.1417323

For the logistic regression model after removing some of the predictors, an improvement over that of the corresponding LDA model was observed giving a test error rate of approximately 14.17%.

# Logistic regression on four predictors: chas, nox, dis and tax.
boston_lr_fit_3 <- glm(
  crim01 ~ chas + nox + dis + tax,
  family = binomial,
  data = boston_training_data
)
# Predicted probabilities on the test split, rounded to 0/1 labels.
boston_prediction_lr_3 <- predict(boston_lr_fit_3, newdata = boston_testing_data, type = "response")
lr_prediction_final_3 <- round(boston_prediction_lr_3, digits = 0)
# Confusion matrix: rows are predicted labels, columns are the observed crim01.
table(lr_prediction_final_3, boston_testing_data$crim01)

##                      
## lr_prediction_final_3  0  1
##                     0 49 10
##                     1 14 54

# Test error rate of the four-predictor logistic regression model.
mean(lr_prediction_final_3 != boston_testing_data$crim01)

## [1] 0.1889764

Interestingly, the test error rate of this logistic regression model was higher than that of its LDA counterpart, at approximately 18.90%.

# KNN (K = 1) for crim01 on four predictors: chas, nox, dis and tax.
# NOTE(review): the columns are not standardized, so tax (by far the largest in
# magnitude) likely dominates the distance computation. Confirm this is intended.
knn_boston_predictors_1 <- c("chas", "nox", "dis", "tax")
knn_boston_training_data_1 <- boston_training_data[, knn_boston_predictors_1]
knn_boston_testing_data_1 <- boston_testing_data[, knn_boston_predictors_1]
knn_boston_prediction_1 <- knn(
  train = knn_boston_training_data_1,
  test = knn_boston_testing_data_1,
  cl = boston_training_data$crim01,
  k = 1
)
# Confusion matrix: rows are predicted labels, columns are the observed crim01.
table(knn_boston_prediction_1, boston_testing_data$crim01)

##                        
## knn_boston_prediction_1  0  1
##                       0 58  2
##                       1  5 62

# Test error rate of the four-predictor KNN model with K = 1.
mean(knn_boston_prediction_1 != crim01[-boston_training_index])

## [1] 0.05511811

This model with K = 1 and reducing the predictors to the 4 used above gave the best test error rate thus far of approximately 5.51%.

# KNN (K = 1) using every column except the two crime variables.
dropping <- c("crim", "crim01")
knn_boston_training_data_2 <- boston_training_data[, !names(boston_training_data) %in% dropping]
# Select the test columns by the test frame's own names; previously the
# training frame's names were used, which only worked because both frames
# share the same column layout.
knn_boston_testing_data_2 <- boston_testing_data[, !names(boston_testing_data) %in% dropping]
knn_boston_prediction_2 <- knn(
  knn_boston_training_data_2,
  knn_boston_testing_data_2,
  crim01[boston_training_index],
  k = 1
)
# Confusion matrix: rows are predicted labels, columns are the observed crim01.
table(knn_boston_prediction_2, crim01[-boston_training_index])

##                        
## knn_boston_prediction_2  0  1
##                       0 55  5
##                       1  8 59

# Test error rate of the all-predictor KNN model with K = 1.
mean(knn_boston_prediction_2 != boston_testing_data$crim01)

## [1] 0.1023622

While this model, with K = 1 and only the crim predictors removed, gave a better test error rate (approximately 10.24%) than the LDA and logistic regression models, the previous KNN model gave a better test error rate still, so we explore additional K values for the first KNN model below.

# Test error rate for each K from 1 to 50, using the four-predictor KNN inputs.
# vapply() replaces the index loop and guarantees one numeric value per K.
errors <- vapply(
  seq_len(50),
  function(k) {
    predicted <- knn(
      knn_boston_training_data_1,
      knn_boston_testing_data_1,
      crim01[boston_training_index],
      k = k
    )
    mean(predicted != crim01[-boston_training_index])
  },
  numeric(1)
)

# Points with a connecting line. `pch` has no effect in lines(), so it is dropped.
plot(errors, xlab = "K value", ylab = "Test error")
lines(errors)

# Lowest test error rate among the K values tried.
min(errors)

## [1] 0.05511811

# Every K whose test error equals the minimum (ties are all reported).
which(errors == min(errors))

## [1] 1

As shown above, K = 1 that we previously utilized gave the lowest test error rate of all the models covered of approximately 5.51%.

Assignment #3

3/5/2021