\[ Predicting the Class of "2" & "3" from Zipcode dataset \]

Loading the Zipcode dataset and keeping only the class labels Y = 2 and Y = 3 (out of the digits 0 to 9) for classification.

set.seed(123)
digits_train=read.table(gzfile("zip.train.gz"))
digits_test=read.table(gzfile("zip.test.gz"))

#digits_train

#filter(digits_train, V2 == 3)

#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
train= digits_train %>% filter(V1 == 2 | V1 == 3)
test= digits_test %>% filter(V1 == 2 | V1 == 3)
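
Each row is the digit label (V1) followed by 256 grey-scale pixel values of a 16 x 16 image. As a quick check of the filtered data, one row can be rendered as an image; a minimal sketch (the object name pixels is illustrative, and the orientation may need flipping depending on how image() lays the matrix out):

# Sketch: view the first filtered training row as its 16 x 16 grey-scale image.
pixels <- matrix(unlist(train[1, -1]), nrow = 16, byrow = TRUE)
image(pixels, col = grey.colors(256), axes = FALSE)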

KNN (k=1)

##KNN-1

# Installing Packages
#install.packages("e1071")
#install.packages("caTools")
#install.packages("class")

# Loading package
library(e1071)
library(caTools)
library(class)


# Fit a 1-nearest-neighbour classifier on the training set.
# Note: `train` and `test` here still include the label column V1 among the
# features; strictly, the predictors should be train[, -1] and test[, -1].
classifier_knn <- knn(train = train,
                      test = test,
                      cl = train$V1,
                      k = 1)
classifier_knn
##   [1] 3 2 2 3 2 2 2 3 3 2 3 2 2 2 2 3 2 2 2 2 2 2 3 2 3 3 3 3 3 2 2 2 3 2 3 3 2
##  [38] 3 3 2 3 2 3 2 2 2 2 2 2 3 2 3 2 2 3 2 3 2 2 3 3 3 2 2 2 3 3 3 3 2 2 2 2 2
##  [75] 3 2 2 2 2 3 2 3 3 3 2 2 3 3 3 3 3 2 2 3 2 3 2 3 2 2 3 2 3 2 3 2 2 3 2 2 2
## [112] 2 3 2 3 2 2 2 2 2 3 2 2 2 2 2 3 3 3 3 3 2 2 2 2 2 3 3 3 3 3 3 3 3 2 3 3 2
## [149] 3 3 2 2 3 3 3 2 3 2 2 2 2 2 3 2 2 3 3 3 3 2 2 2 2 3 2 2 2 2 2 2 3 3 3 2 3
## [186] 2 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 2 2 2 3 2 2 2 2 2 3 3 2 2 3 2 3 2
## [223] 2 3 3 3 2 3 2 2 3 3 3 2 2 2 3 2 2 2 2 3 3 2 3 2 2 3 2 3 2 2 3 3 3 2 3 3 2
## [260] 3 3 3 3 3 2 3 3 3 3 3 2 3 3 3 3 2 2 2 3 3 2 2 3 3 2 2 2 3 3 3 3 2 2 2 2 2
## [297] 2 2 2 2 2 3 2 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2 3 2 3 2 2 2 3 3 3 3 3 3 2 2 2
## [334] 3 2 2 3 2 2 2 2 2 3 3 3 3 2 2 2 3 2 2 3 3 2 2 2 3 3 3 2 3 2 3
## Levels: 2 3
cm <- table(test$V1, classifier_knn)
cm
##    classifier_knn
##       2   3
##   2 192   6
##   3   3 163
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.975274725274725"

KNN (k=3)

classifier_knn3 <- knn(train = train,
                      test = test,
                      cl = train$V1,
                      k = 3)
#classifier_knn



cm <- table(test$V1, classifier_knn3)
cm
##    classifier_knn3
##       2   3
##   2 191   7
##   3   4 162
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn3 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.96978021978022"

KNN (k=5)

classifier_knn5 <- knn(train = train,
                      test = test,
                      cl = train$V1,
                      k = 5)
#classifier_knn



cm <- table(test$V1, classifier_knn5)
cm
##    classifier_knn5
##       2   3
##   2 191   7
##   3   4 162
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn5 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.96978021978022"

KNN (k=7)

classifier_knn7 <- knn(train = train,
                      test = test,
                      cl = train$V1,
                      k = 7)
#classifier_knn



cm <- table(test$V1, classifier_knn7)
cm
##    classifier_knn7
##       2   3
##   2 189   9
##   3   2 164
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn7 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.96978021978022"

KNN (k=15)

## KNN, k = 15
classifier_knn15 <- knn(train = train,
                      test = test,
                      cl = train$V1,
                      k = 15)
#classifier_knn



cm <- table(test$V1, classifier_knn15)
cm
##    classifier_knn15
##       2   3
##   2 187  11
##   3   3 163
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn15 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.961538461538462"

Regression for Classification

set.seed(12)
library(caTools)
library(ROCR) 
   
# lm() ignores the family argument (see the warning below), so this is ordinary
# linear regression on the class label used as a classifier, not logistic regression.
logistic_model <- lm(V1 ~., data = train, family = "binomial")
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'family' will be disregarded
summary(logistic_model)
## 
## Call:
## lm(formula = V1 ~ ., data = train, family = "binomial")
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59281 -0.09679 -0.00057  0.09365  0.70122 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.359e+01  1.042e+02   0.226  0.82090    
## V2          -2.889e-01  1.463e-01  -1.974  0.04860 *  
## V3           5.930e-02  6.753e-02   0.878  0.38007    
## V4           2.406e-02  3.827e-02   0.629  0.52976    
## V5          -2.644e-02  2.721e-02  -0.972  0.33140    
## V6           1.168e-02  2.328e-02   0.502  0.61604    
## V7           3.749e-03  2.260e-02   0.166  0.86825    
## V8          -3.795e-03  2.271e-02  -0.167  0.86733    
## V9          -3.584e-03  2.229e-02  -0.161  0.87228    
## V10         -1.366e-02  2.222e-02  -0.615  0.53881    
## V11          2.426e-02  2.480e-02   0.978  0.32825    
## V12         -5.717e-02  3.220e-02  -1.776  0.07608 .  
## V13          7.696e-02  4.284e-02   1.796  0.07269 .  
## V14         -9.824e-03  6.383e-02  -0.154  0.87772    
## V15          1.736e-01  1.220e-01   1.423  0.15509    
## V16         -2.109e-01  4.220e-01  -0.500  0.61740    
## V17          2.125e+01  1.044e+02   0.204  0.83873    
## V18         -5.654e-02  7.946e-02  -0.711  0.47692    
## V19         -5.524e-02  4.052e-02  -1.363  0.17302    
## V20          9.906e-03  2.500e-02   0.396  0.69198    
## V21         -3.085e-03  1.961e-02  -0.157  0.87504    
## V22          3.690e-03  1.868e-02   0.198  0.84345    
## V23         -1.208e-02  2.327e-02  -0.519  0.60391    
## V24          5.070e-02  2.860e-02   1.773  0.07649 .  
## V25         -3.083e-02  2.729e-02  -1.129  0.25896    
## V26         -1.087e-02  2.446e-02  -0.444  0.65688    
## V27         -1.533e-04  2.257e-02  -0.007  0.99458    
## V28          4.249e-02  2.191e-02   1.939  0.05273 .  
## V29          3.745e-02  2.716e-02   1.379  0.16819    
## V30         -2.430e-02  3.744e-02  -0.649  0.51642    
## V31         -7.055e-03  6.460e-02  -0.109  0.91306    
## V32         -3.003e-01  1.425e-01  -2.107  0.03533 *  
## V33          8.525e-01  5.674e-01   1.503  0.13323    
## V34         -1.079e-02  5.103e-02  -0.211  0.83259    
## V35         -2.298e-02  3.081e-02  -0.746  0.45591    
## V36          2.882e-02  2.205e-02   1.307  0.19146    
## V37          1.026e-02  1.780e-02   0.577  0.56436    
## V38         -4.103e-02  1.734e-02  -2.366  0.01815 *  
## V39          8.164e-03  1.952e-02   0.418  0.67582    
## V40          1.561e-03  2.149e-02   0.073  0.94209    
## V41         -2.097e-03  2.142e-02  -0.098  0.92202    
## V42          1.850e-02  2.052e-02   0.901  0.36755    
## V43          4.566e-03  1.916e-02   0.238  0.81163    
## V44          1.446e-03  1.863e-02   0.078  0.93813    
## V45         -2.673e-02  2.204e-02  -1.212  0.22559    
## V46         -1.844e-02  3.064e-02  -0.602  0.54730    
## V47         -4.644e-02  4.975e-02  -0.933  0.35082    
## V48          8.354e-02  8.009e-02   1.043  0.29715    
## V49          7.162e-02  1.815e-01   0.395  0.69317    
## V50         -1.420e-02  4.604e-02  -0.308  0.75784    
## V51         -2.603e-02  3.109e-02  -0.837  0.40259    
## V52          1.751e-02  2.248e-02   0.779  0.43618    
## V53         -1.896e-02  1.862e-02  -1.018  0.30873    
## V54          1.514e-02  1.783e-02   0.849  0.39585    
## V55          1.397e-02  2.087e-02   0.670  0.50327    
## V56         -2.504e-02  2.240e-02  -1.118  0.26392    
## V57          2.974e-02  2.121e-02   1.402  0.16124    
## V58         -1.804e-03  1.809e-02  -0.100  0.92058    
## V59         -4.102e-02  1.668e-02  -2.458  0.01410 *  
## V60          2.757e-02  1.828e-02   1.508  0.13185    
## V61          4.684e-03  2.101e-02   0.223  0.82362    
## V62          6.673e-03  2.872e-02   0.232  0.81631    
## V63          8.091e-02  4.414e-02   1.833  0.06707 .  
## V64         -1.846e-02  7.289e-02  -0.253  0.80015    
## V65          2.570e-01  2.041e-01   1.259  0.20818    
## V66          1.462e-02  5.442e-02   0.269  0.78822    
## V67          7.517e-03  3.419e-02   0.220  0.82602    
## V68         -3.749e-03  2.579e-02  -0.145  0.88448    
## V69          2.808e-03  2.309e-02   0.122  0.90322    
## V70         -2.304e-02  2.221e-02  -1.037  0.29975    
## V71          1.092e-02  2.397e-02   0.455  0.64886    
## V72          3.041e-04  2.449e-02   0.012  0.99009    
## V73         -3.523e-02  2.185e-02  -1.612  0.10721    
## V74          2.355e-02  1.845e-02   1.276  0.20215    
## V75         -6.711e-03  1.734e-02  -0.387  0.69884    
## V76         -5.136e-03  1.790e-02  -0.287  0.77425    
## V77         -2.787e-02  2.140e-02  -1.302  0.19316    
## V78          4.452e-03  2.760e-02   0.161  0.87186    
## V79         -1.076e-01  4.149e-02  -2.593  0.00965 ** 
## V80          7.553e-02  7.257e-02   1.041  0.29820    
## V81         -2.398e-01  1.821e-01  -1.317  0.18816    
## V82         -4.837e-02  6.438e-02  -0.751  0.45262    
## V83          6.218e-02  5.235e-02   1.188  0.23517    
## V84         -2.446e-02  3.622e-02  -0.675  0.49961    
## V85          6.275e-03  2.897e-02   0.217  0.82855    
## V86          1.109e-02  2.718e-02   0.408  0.68342    
## V87          3.097e-02  2.722e-02   1.138  0.25545    
## V88          5.655e-03  2.403e-02   0.235  0.81403    
## V89          3.318e-02  2.129e-02   1.559  0.11928    
## V90         -2.013e-03  2.021e-02  -0.100  0.92065    
## V91          9.206e-03  1.851e-02   0.497  0.61903    
## V92          1.669e-02  1.742e-02   0.958  0.33826    
## V93         -1.156e-02  2.006e-02  -0.576  0.56470    
## V94          2.353e-02  2.546e-02   0.924  0.35560    
## V95          2.604e-02  3.945e-02   0.660  0.50944    
## V96          1.318e-02  6.705e-02   0.197  0.84421    
## V97         -1.234e-01  1.241e-01  -0.995  0.32014    
## V98         -8.955e-02  1.123e-01  -0.798  0.42518    
## V99         -9.562e-02  7.489e-02  -1.277  0.20195    
## V100        -1.489e-02  5.289e-02  -0.281  0.77842    
## V101        -1.111e-02  3.468e-02  -0.320  0.74878    
## V102        -1.996e-02  2.702e-02  -0.739  0.46020    
## V103        -5.023e-02  2.362e-02  -2.126  0.03368 *  
## V104         4.274e-02  2.252e-02   1.898  0.05799 .  
## V105         4.326e-02  2.054e-02   2.107  0.03535 *  
## V106         5.432e-02  2.045e-02   2.656  0.00802 ** 
## V107         4.746e-02  1.946e-02   2.438  0.01491 *  
## V108         2.607e-02  1.764e-02   1.478  0.13982    
## V109         2.328e-02  1.895e-02   1.228  0.21952    
## V110         2.916e-02  2.330e-02   1.252  0.21100    
## V111        -4.302e-03  3.609e-02  -0.119  0.90513    
## V112         2.354e-02  5.477e-02   0.430  0.66746    
## V113         7.671e-02  9.288e-02   0.826  0.40905    
## V114        -8.837e-02  1.068e-01  -0.827  0.40829    
## V115         1.454e-01  7.153e-02   2.032  0.04239 *  
## V116        -1.779e-02  4.652e-02  -0.382  0.70228    
## V117         9.317e-03  2.991e-02   0.312  0.75546    
## V118         2.405e-02  2.360e-02   1.019  0.30846    
## V119         1.311e-03  2.135e-02   0.061  0.95103    
## V120         3.010e-03  2.023e-02   0.149  0.88177    
## V121         1.710e-02  1.951e-02   0.876  0.38104    
## V122         3.737e-02  1.881e-02   1.986  0.04725 *  
## V123         7.454e-03  1.822e-02   0.409  0.68248    
## V124         2.268e-02  1.721e-02   1.318  0.18780    
## V125         2.557e-02  1.907e-02   1.341  0.18010    
## V126        -9.931e-03  2.205e-02  -0.450  0.65254    
## V127         4.140e-02  3.059e-02   1.353  0.17621    
## V128         1.245e-02  4.112e-02   0.303  0.76204    
## V129         3.814e-02  6.081e-02   0.627  0.53068    
## V130        -2.789e-01  6.913e-02  -4.035 5.83e-05 ***
## V131         2.209e-02  4.801e-02   0.460  0.64550    
## V132        -5.811e-02  3.457e-02  -1.681  0.09304 .  
## V133         3.075e-02  2.814e-02   1.093  0.27472    
## V134        -1.930e-02  2.452e-02  -0.787  0.43156    
## V135        -9.028e-03  2.281e-02  -0.396  0.69234    
## V136        -1.558e-02  2.252e-02  -0.692  0.48920    
## V137        -1.661e-02  2.034e-02  -0.816  0.41446    
## V138         1.144e-02  1.881e-02   0.608  0.54304    
## V139         4.130e-02  1.783e-02   2.316  0.02072 *  
## V140         5.062e-03  1.689e-02   0.300  0.76444    
## V141         6.826e-03  1.830e-02   0.373  0.70917    
## V142         5.325e-03  2.158e-02   0.247  0.80517    
## V143         2.539e-03  2.624e-02   0.097  0.92295    
## V144        -1.334e-02  3.434e-02  -0.388  0.69772    
## V145        -6.290e-02  4.904e-02  -1.283  0.19993    
## V146         1.343e-02  4.447e-02   0.302  0.76276    
## V147         3.679e-03  3.236e-02   0.114  0.90951    
## V148        -3.145e-02  2.749e-02  -1.144  0.25289    
## V149        -4.165e-02  2.807e-02  -1.484  0.13821    
## V150        -3.280e-02  2.661e-02  -1.232  0.21809    
## V151         1.203e-02  2.554e-02   0.471  0.63762    
## V152        -1.694e-02  2.473e-02  -0.685  0.49345    
## V153         1.040e-02  2.260e-02   0.460  0.64552    
## V154        -2.884e-02  2.109e-02  -1.368  0.17172    
## V155         1.762e-04  1.854e-02   0.009  0.99242    
## V156         1.821e-02  1.754e-02   1.038  0.29947    
## V157         2.869e-02  1.856e-02   1.546  0.12248    
## V158         3.160e-02  1.984e-02   1.593  0.11152    
## V159         2.046e-03  2.366e-02   0.086  0.93110    
## V160         4.416e-02  2.904e-02   1.521  0.12866    
## V161         1.881e-02  3.428e-02   0.549  0.58331    
## V162        -6.361e-02  3.021e-02  -2.105  0.03547 *  
## V163        -5.579e-03  2.463e-02  -0.226  0.82086    
## V164         1.738e-02  2.232e-02   0.779  0.43634    
## V165        -9.026e-03  2.228e-02  -0.405  0.68551    
## V166        -4.395e-02  2.456e-02  -1.789  0.07385 .  
## V167        -2.628e-02  2.664e-02  -0.986  0.32410    
## V168        -3.095e-02  2.775e-02  -1.115  0.26499    
## V169        -2.619e-02  2.612e-02  -1.003  0.31627    
## V170        -1.907e-02  2.366e-02  -0.806  0.42030    
## V171         4.063e-02  2.054e-02   1.978  0.04813 *  
## V172        -1.497e-02  1.780e-02  -0.841  0.40051    
## V173         2.930e-02  1.844e-02   1.589  0.11225    
## V174        -3.264e-02  1.898e-02  -1.719  0.08585 .  
## V175         1.517e-02  2.063e-02   0.735  0.46227    
## V176         7.702e-03  2.518e-02   0.306  0.75980    
## V177         4.147e-02  3.003e-02   1.381  0.16755    
## V178         4.390e-02  2.692e-02   1.631  0.10324    
## V179        -5.062e-02  2.069e-02  -2.446  0.01459 *  
## V180        -7.380e-03  1.761e-02  -0.419  0.67525    
## V181        -1.602e-02  1.840e-02  -0.871  0.38417    
## V182         1.752e-03  2.061e-02   0.085  0.93228    
## V183        -2.106e-02  2.279e-02  -0.924  0.35561    
## V184        -4.552e-02  2.487e-02  -1.830  0.06748 .  
## V185        -2.354e-02  2.441e-02  -0.965  0.33497    
## V186        -2.812e-03  2.243e-02  -0.125  0.90022    
## V187        -2.401e-02  2.104e-02  -1.141  0.25401    
## V188         9.361e-03  1.855e-02   0.505  0.61394    
## V189        -2.083e-03  1.708e-02  -0.122  0.90297    
## V190         2.197e-02  1.773e-02   1.239  0.21557    
## V191        -2.111e-02  1.922e-02  -1.098  0.27238    
## V192         1.703e-02  2.276e-02   0.748  0.45454    
## V193         2.691e-03  2.492e-02   0.108  0.91401    
## V194        -4.660e-02  2.839e-02  -1.641  0.10098    
## V195         2.146e-02  1.939e-02   1.107  0.26855    
## V196        -1.316e-02  1.664e-02  -0.791  0.42910    
## V197        -1.417e-02  1.636e-02  -0.866  0.38668    
## V198        -3.229e-02  1.841e-02  -1.755  0.07961 .  
## V199         2.610e-02  2.035e-02   1.283  0.19980    
## V200        -4.369e-02  2.332e-02  -1.874  0.06124 .  
## V201         1.376e-02  2.430e-02   0.566  0.57140    
## V202        -5.078e-02  2.343e-02  -2.167  0.03044 *  
## V203         4.756e-02  2.084e-02   2.281  0.02271 *  
## V204         7.793e-03  1.835e-02   0.425  0.67112    
## V205        -3.428e-03  1.704e-02  -0.201  0.84064    
## V206         2.721e-02  1.759e-02   1.547  0.12208    
## V207         1.214e-02  1.930e-02   0.629  0.52931    
## V208        -8.417e-03  2.073e-02  -0.406  0.68481    
## V209         3.454e-02  2.450e-02   1.410  0.15893    
## V210         3.092e-02  3.107e-02   0.995  0.31985    
## V211        -3.030e-02  2.145e-02  -1.413  0.15800    
## V212        -3.098e-02  1.689e-02  -1.834  0.06690 .  
## V213         9.083e-03  1.664e-02   0.546  0.58528    
## V214        -2.459e-02  1.913e-02  -1.285  0.19894    
## V215        -1.368e-02  2.157e-02  -0.634  0.52614    
## V216         2.497e-02  2.431e-02   1.027  0.30467    
## V217        -1.457e-02  2.697e-02  -0.540  0.58923    
## V218        -6.586e-02  2.604e-02  -2.530  0.01155 *  
## V219         3.389e-02  2.367e-02   1.432  0.15253    
## V220        -1.033e-02  2.161e-02  -0.478  0.63286    
## V221         3.008e-02  1.967e-02   1.529  0.12654    
## V222        -9.658e-03  1.890e-02  -0.511  0.60938    
## V223         5.407e-03  1.966e-02   0.275  0.78340    
## V224        -6.273e-03  2.234e-02  -0.281  0.77892    
## V225         3.752e-02  2.619e-02   1.433  0.15214    
## V226        -5.258e-02  4.128e-02  -1.274  0.20299    
## V227        -2.386e-02  2.609e-02  -0.915  0.36059    
## V228         1.558e-02  2.083e-02   0.748  0.45451    
## V229        -1.749e-02  1.905e-02  -0.918  0.35890    
## V230        -1.886e-02  2.274e-02  -0.830  0.40693    
## V231         1.590e-02  2.760e-02   0.576  0.56475    
## V232        -5.505e-02  3.320e-02  -1.658  0.09762 .  
## V233        -3.491e-03  3.387e-02  -0.103  0.91793    
## V234         3.095e-02  3.291e-02   0.941  0.34709    
## V235        -2.589e-02  3.059e-02  -0.846  0.39752    
## V236         2.768e-02  2.539e-02   1.090  0.27586    
## V237        -2.828e-02  2.263e-02  -1.249  0.21175    
## V238         3.263e-02  2.166e-02   1.507  0.13221    
## V239        -5.134e-02  2.370e-02  -2.167  0.03048 *  
## V240         6.830e-02  2.777e-02   2.459  0.01407 *  
## V241        -3.051e-02  3.175e-02  -0.961  0.33678    
## V242        -4.480e-02  8.777e-02  -0.510  0.60982    
## V243         3.869e-02  4.678e-02   0.827  0.40842    
## V244        -1.646e-02  3.536e-02  -0.466  0.64166    
## V245         1.400e-02  2.852e-02   0.491  0.62350    
## V246        -3.738e-02  2.746e-02  -1.361  0.17377    
## V247         2.522e-02  2.958e-02   0.853  0.39409    
## V248         4.456e-02  3.411e-02   1.306  0.19174    
## V249        -1.026e-01  3.574e-02  -2.870  0.00418 ** 
## V250         9.799e-02  3.891e-02   2.519  0.01192 *  
## V251         2.830e-02  3.921e-02   0.722  0.47054    
## V252         1.048e-02  3.511e-02   0.299  0.76533    
## V253        -1.313e-02  3.177e-02  -0.413  0.67939    
## V254        -2.396e-02  2.789e-02  -0.859  0.39045    
## V255        -6.728e-04  2.894e-02  -0.023  0.98146    
## V256         1.846e-02  3.473e-02   0.532  0.59517    
## V257         1.072e-02  5.534e-02   0.194  0.84638    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1745 on 1132 degrees of freedom
## Multiple R-squared:  0.9005, Adjusted R-squared:  0.878 
## F-statistic: 40.01 on 256 and 1132 DF,  p-value: < 2.2e-16
plot(logistic_model)
## Warning: not plotting observations with leverage one:
##   681
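
As the warning after the lm() call shows, the family argument is ignored, so the fit above is linear regression on the class label. If a genuine logistic regression is wanted, glm() can be used instead; a minimal sketch (glm_train, logit_model and glm_pred are illustrative names; the label is recoded to 0/1, and with 256 pixel predictors the classes are close to separable, so glm() may warn about fitted probabilities of 0 or 1):

# Sketch: actual logistic regression via glm(), with the label recoded to 0/1.
glm_train <- data.frame(y = as.integer(train$V1 == 3), train[, -1])
logit_model <- glm(y ~ ., data = glm_train, family = binomial)
glm_pred <- ifelse(predict(logit_model, newdata = test[, -1], type = "response") > 0.5, 3, 2)
mean(glm_pred == test$V1)   # test-set accuracy of the logistic fit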

Prediction

If the fitted value of Y is larger than 2.5, classify the observation as 3; otherwise classify it as 2.

# Note: predict.lm() expects `newdata =`; the `data.frame = test` argument below
# is silently ignored, so these are fitted values for the training data.
pred= predict(logistic_model, data.frame= test)
head(pred)
##        1        2        3        4        5        6 
## 2.885693 3.024018 2.955483 2.927301 3.001351 1.953989

npred = ifelse(pred > 2.5,'3','2')

head(npred)                  
##   1   2   3   4   5   6 
## "3" "3" "3" "3" "3" "2"

Confusion Matrix & Accuracy of Logistic Regression to Predict “2” & “3” Classes

# Confusion Matrix 
table(npred, train$V1)
##      
## npred   2   3
##     2 728   5
##     3   3 653
# Accuracy
Reg_accuracy=(728+653)/(728+5+3+653)

Reg_accuracy
## [1] 0.9942405
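
Because predict() above was called without newdata, the fitted values (and the accuracy just computed) are for the training set. A minimal sketch of the corresponding test-set evaluation, using the same 2.5 threshold (test_pred and test_label are illustrative names):

# Sketch: evaluate the regression classifier on the held-out test set.
test_pred  <- predict(logistic_model, newdata = test)   # predict.lm() needs `newdata =`
test_label <- ifelse(test_pred > 2.5, "3", "2")
table(test_label, test$V1)        # test-set confusion matrix
mean(test_label == test$V1)       # test-set accuracy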

Comparison of Accuracies of K-NN vs. Classification using Regression

Accuracy=c(97.5,96.9, 96.9, 96.1, 99.4)
Classification_Method= c("KNN_k=1","KNN_k=5", "KNN_k=7", "KNN_k=15", "Regression")
Result= data.frame(Classification_Method, Accuracy)
Result
##   Classification_Method Accuracy
## 1               KNN_k=1     97.5
## 2               KNN_k=5     96.9
## 3               KNN_k=7     96.9
## 4              KNN_k=15     96.1
## 5            Regression     99.4
ggplot(data = Result) + 
  geom_point(mapping = aes(x = Classification_Method, y = Accuracy, color = Classification_Method), size = 3)

Result

The regression approach shows the highest accuracy among all the methods compared; note, however, that its 99.4% figure is an in-sample (training-set) accuracy, whereas the KNN accuracies are test-set accuracies, so the comparison flatters the regression classifier.

KNN with k = 15 has the lowest accuracy among the KNN models.

Q2.

\[Best Subset Linear Regression Analysis\]

#install.packages('ISLR')

prostate=read.table("prostate.data")
head(prostate)
##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE

Splitting Train/Test

if(interactive())par(ask=TRUE)
str( prostate )
## 'data.frame':    97 obs. of  10 variables:
##  $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
##  $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
##  $ age    : int  50 58 74 58 62 50 64 58 47 63 ...
##  $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ gleason: int  6 6 7 6 6 6 6 6 6 6 ...
##  $ pgg45  : int  0 0 20 0 0 0 0 0 0 0 ...
##  $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
##  $ train  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
cor( prostate[,1:8] )
##            lcavol    lweight       age         lbph         svi          lcp
## lcavol  1.0000000 0.28052138 0.2249999  0.027349703  0.53884500  0.675310484
## lweight 0.2805214 1.00000000 0.3479691  0.442264399  0.15538490  0.164537142
## age     0.2249999 0.34796911 1.0000000  0.350185896  0.11765804  0.127667752
## lbph    0.0273497 0.44226440 0.3501859  1.000000000 -0.08584324 -0.006999431
## svi     0.5388450 0.15538490 0.1176580 -0.085843238  1.00000000  0.673111185
## lcp     0.6753105 0.16453714 0.1276678 -0.006999431  0.67311118  1.000000000
## gleason 0.4324171 0.05688209 0.2688916  0.077820447  0.32041222  0.514830063
## pgg45   0.4336522 0.10735379 0.2761124  0.078460018  0.45764762  0.631528246
##            gleason      pgg45
## lcavol  0.43241706 0.43365225
## lweight 0.05688209 0.10735379
## age     0.26889160 0.27611245
## lbph    0.07782045 0.07846002
## svi     0.32041222 0.45764762
## lcp     0.51483006 0.63152825
## gleason 1.00000000 0.75190451
## pgg45   0.75190451 1.00000000
#pairs( prostate[,1:9], col="violet" )
train <- subset( prostate, train==TRUE )[,1:9]
test  <- subset( prostate, train==FALSE )[,1:9]

Computing best subsets regression

#install.packages("leaps")
library(leaps)
models <- regsubsets(lcavol  ~., data = prostate, nvmax = 5)
summary(models)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate, nvmax = 5)
## 9 Variables  (and intercept)
##           Forced in Forced out
## lweight       FALSE      FALSE
## age           FALSE      FALSE
## lbph          FALSE      FALSE
## svi           FALSE      FALSE
## lcp           FALSE      FALSE
## gleason       FALSE      FALSE
## pgg45         FALSE      FALSE
## lpsa          FALSE      FALSE
## trainTRUE     FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          lweight age lbph svi lcp gleason pgg45 lpsa trainTRUE
## 1  ( 1 ) " "     " " " "  " " " " " "     " "   "*"  " "      
## 2  ( 1 ) " "     " " " "  " " "*" " "     " "   "*"  " "      
## 3  ( 1 ) " "     "*" " "  " " "*" " "     " "   "*"  " "      
## 4  ( 1 ) " "     "*" "*"  " " "*" " "     " "   "*"  " "      
## 5  ( 1 ) " "     "*" "*"  " " "*" " "     "*"   "*"  " "

The summary(models) function suggests that:

The best 1-Variable model contains lcavol ~ lpsa

The best 2-Variable model contains lcavol ~ lpsa + lcp

The best 3-Variable model contains lcavol ~ lpsa + lcp + age

The best 4-Variable model contains lcavol ~ lpsa + lcp + age + lbph

The best 5-Variable model contains lcavol ~ lpsa + lcp + age + lbph + pgg45
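
The coefficients of each of these best subsets can be pulled out directly from the regsubsets object; a minimal sketch:

# Sketch: coefficient vectors of the best subset of each size 1 to 5.
lapply(1:5, function(i) coef(models, i))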

\(Choosing the optimal model\)

Model selection criteria: Adjusted R2, Cp and BIC

res.sum <- summary(models)
data.frame(
  Adj.R2 = which.max(res.sum$adjr2),
  CP = which.min(res.sum$cp),
  BIC = which.min(res.sum$bic)
)
##   Adj.R2 CP BIC
## 1      5  4   2
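
To see how close the criteria actually are across subset sizes, the raw values stored in res.sum can be tabulated; a minimal sketch:

# Sketch: Adjusted R2, Cp and BIC for the best model of each size 1 to 5.
data.frame(size   = 1:5,
           Adj.R2 = res.sum$adjr2,
           Cp     = res.sum$cp,
           BIC    = res.sum$bic)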

There is no single correct solution to model selection; each of these criteria leads to a slightly different model. As the saying goes, “all models are wrong, but some are useful.”

\(K-fold cross-validation\)

# id: model id
# object: regsubsets object
# data: data used to fit regsubsets
# outcome: outcome variable
get_model_formula <- function(id, object, outcome){
  # get models data
  models <- summary(object)$which[id,-1]
  # Get outcome variable
  #form <- as.formula(object$call[[2]])
  #outcome <- all.vars(form)[1]
  # Get model predictors
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  # Build model formula
  as.formula(paste0(outcome, "~", predictors))
}

To obtain the best 3-variable model formula, we can use:

get_model_formula(3, models, "lcavol")
## lcavol ~ age + lcp + lpsa
## <environment: 0x0000020dc89a1e30>

Cross-validation (CV) error for a given model (10-fold CV)

get_cv_error <- function(model.formula, data){
  set.seed(1)
  train.control <- trainControl(method = "cv", number = 10)
  cv <- train(model.formula, data = data, method = "lm",
              trControl = train.control)
  cv$results$RMSE
}

Compute cross-validation error

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
model.ids <- 1:5
cv.errors <-  map(model.ids, get_model_formula, models, "lcavol") %>%
  map(get_cv_error, data = prostate) %>%
  unlist()
cv.errors
## [1] 0.7951732 0.7000323 0.7005508 0.6978059 0.6973657

Selecting the model that minimizes the CV error

which.min(cv.errors)
## [1] 5
coef(models, 3)
## (Intercept)         age         lcp        lpsa 
## -0.78389039  0.01419994  0.32463191  0.51857698

which.min() points to the 5-variable model (CV RMSE ≈ 0.697), but the 2- to 4-variable models are essentially tied with it; the coefficients shown above are those of the more parsimonious 3-variable model, lcavol ~ age + lcp + lpsa.
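
Either candidate can be refit with lm() for a conventional summary; a minimal sketch for the 3-variable subset (fit3 is an illustrative name):

# Sketch: refit the 3-variable subset with lm() to inspect it in the usual way.
fit3 <- lm(lcavol ~ age + lcp + lpsa, data = prostate)
summary(fit3)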

\(Bootstrapping\)

# Define training control
train.control <- trainControl(method = "boot", number = 100)
# Train the model
model <- train(lcavol ~., data = prostate, method = "lm",
               trControl = train.control)
# Summarize the results
print(model)
## Linear Regression 
## 
## 97 samples
##  9 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (100 reps) 
## Summary of sample sizes: 97, 97, 97, 97, 97, 97, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.7660011  0.599168  0.6218542
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
model_coef <- function(data, index){
  coef(lm(lcavol ~., data = data, subset = index))
}
model_coef(prostate, 1:47)
##  (Intercept)      lweight          age         lbph          svi          lcp 
## -0.510380563 -0.387696845  0.040172761 -0.097792007  0.415637032  0.504957399 
##      gleason        pgg45         lpsa    trainTRUE 
## -0.040976729 -0.003923027  0.537951095 -0.265304310
#install.packages("boot")
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
boot(prostate, model_coef, 500)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = prostate, statistic = model_coef, R = 500)
## 
## 
## Bootstrap Statistics :
##          original        bias    std. error
## t1*  -2.372185636 -0.1551835932  1.34190105
## t2*  -0.027959007 -0.0012411248  0.21784897
## t3*   0.022728547  0.0008333528  0.01173151
## t4*  -0.094381577 -0.0035134304  0.06373317
## t5*  -0.150633965 -0.0027424909  0.22672945
## t6*   0.364920411 -0.0018152176  0.08410792
## t7*   0.189025885  0.0150782552  0.14930361
## t8*  -0.006859374 -0.0003834078  0.00404993
## t9*   0.565415923  0.0063836946  0.09796613
## t10* -0.044701905  0.0004045562  0.15262520
summary(lm(lcavol ~., data = prostate))$coef
##                 Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept) -2.372185636 1.315776196 -1.8028793 7.486958e-02
## lweight     -0.027959007 0.213067115 -0.1312216 8.959030e-01
## age          0.022728547 0.011362056  2.0003902 4.857650e-02
## lbph        -0.094381577 0.058540554 -1.6122426 1.105311e-01
## svi         -0.150633965 0.255776491 -0.5889281 5.574356e-01
## lcp          0.364920411 0.082658408  4.4148009 2.891665e-05
## gleason      0.189025885 0.157891204  1.1971907 2.344841e-01
## pgg45       -0.006859374 0.004454136 -1.5400011 1.271895e-01
## lpsa         0.565415923 0.088543473  6.3857436 8.079318e-09
## trainTRUE   -0.044701905 0.162846170 -0.2745039 7.843482e-01

The bootstrap approach does not rely on the distributional assumptions behind the linear-model standard errors (e.g., normally distributed, homoscedastic errors), so it likely gives a more accurate estimate of the coefficients’ standard errors than the summary() function does.
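
To make this comparison explicit, the bootstrap standard errors and the lm() standard errors can be lined up side by side; a minimal sketch (boot_out and lm_se are illustrative names; it re-runs the bootstrap, so the numbers will vary slightly with the random resamples):

# Sketch: bootstrap vs. model-based standard errors for the lcavol regression.
boot_out <- boot(prostate, model_coef, R = 500)
lm_se <- summary(lm(lcavol ~ ., data = prostate))$coef[, "Std. Error"]
data.frame(term    = names(lm_se),
           boot_se = apply(boot_out$t, 2, sd),
           lm_se   = lm_se)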