\[ Predicting the Classes "2" & "3" from the Zipcode Dataset \]
Loading the Zipcode dataset and selecting the class labels Y = 2 and Y = 3 for
classification, out of the digits 0 to 9.
set.seed(123)
digits_train=read.table(gzfile("zip.train.gz"))
digits_test=read.table(gzfile("zip.test.gz"))
#digits_train
#filter(digits_train, V2 == 3)
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
train= digits_train %>% filter(V1 == 2 | V1 == 3)
test= digits_test %>% filter(V1 == 2 | V1 == 3)
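As a quick sanity check, the filtered sets should now contain only the digits 2 and 3; a minimal sketch (class counts not reproduced here):
table(train$V1)   # class counts in the training set
table(test$V1)    # class counts in the test set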
KNN (k=1)
##KNN-1
# Installing Packages
#install.packages("e1071")
#install.packages("caTools")
#install.packages("class")
# Loading package
library(e1071)
library(caTools)
library(class)
# Fitting the KNN classifier on the training set and predicting the test set
classifier_knn <- knn(train = train,
                      test = test,
                      cl = train$V1,
                      k = 1)
classifier_knn
## [1] 3 2 2 3 2 2 2 3 3 2 3 2 2 2 2 3 2 2 2 2 2 2 3 2 3 3 3 3 3 2 2 2 3 2 3 3 2
## [38] 3 3 2 3 2 3 2 2 2 2 2 2 3 2 3 2 2 3 2 3 2 2 3 3 3 2 2 2 3 3 3 3 2 2 2 2 2
## [75] 3 2 2 2 2 3 2 3 3 3 2 2 3 3 3 3 3 2 2 3 2 3 2 3 2 2 3 2 3 2 3 2 2 3 2 2 2
## [112] 2 3 2 3 2 2 2 2 2 3 2 2 2 2 2 3 3 3 3 3 2 2 2 2 2 3 3 3 3 3 3 3 3 2 3 3 2
## [149] 3 3 2 2 3 3 3 2 3 2 2 2 2 2 3 2 2 3 3 3 3 2 2 2 2 3 2 2 2 2 2 2 3 3 3 2 3
## [186] 2 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 2 2 2 3 2 2 2 2 2 3 3 2 2 3 2 3 2
## [223] 2 3 3 3 2 3 2 2 3 3 3 2 2 2 3 2 2 2 2 3 3 2 3 2 2 3 2 3 2 2 3 3 3 2 3 3 2
## [260] 3 3 3 3 3 2 3 3 3 3 3 2 3 3 3 3 2 2 2 3 3 2 2 3 3 2 2 2 3 3 3 3 2 2 2 2 2
## [297] 2 2 2 2 2 3 2 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2 3 2 3 2 2 2 3 3 3 3 3 3 2 2 2
## [334] 3 2 2 3 2 2 2 2 2 3 3 3 3 2 2 2 3 2 2 3 3 2 2 2 3 3 3 2 3 2 3
## Levels: 2 3
cm <- table(test$V1, classifier_knn)
cm
## classifier_knn
## 2 3
## 2 192 6
## 3 3 163
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.975274725274725"
KNN (k=3)
classifier_knn3 <- knn(train = train,
                       test = test,
                       cl = train$V1,
                       k = 3)
#classifier_knn
cm <- table(test$V1, classifier_knn3)
cm
## classifier_knn3
## 2 3
## 2 191 7
## 3 4 162
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn3 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.96978021978022"
KNN (k=5)
classifier_knn5 <- knn(train = train,
                       test = test,
                       cl = train$V1,
                       k = 5)
#classifier_knn
cm <- table(test$V1, classifier_knn5)
cm
## classifier_knn5
## 2 3
## 2 191 7
## 3 4 162
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn5 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.96978021978022"
KNN (k=7)
classifier_knn7 <- knn(train = train,
                       test = test,
                       cl = train$V1,
                       k = 7)
#classifier_knn
cm <- table(test$V1, classifier_knn7)
cm
## classifier_knn7
## 2 3
## 2 189 9
## 3 2 164
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn7 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.96978021978022"
KNN (k=15)
## KNN, k = 15
classifier_knn15 <- knn(train = train,
                        test = test,
                        cl = train$V1,
                        k = 15)
#classifier_knn
cm <- table(test$V1, classifier_knn15)
cm
## classifier_knn15
## 2 3
## 2 187 11
## 3 3 163
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn15 != test$V1)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.961538461538462"
Linear Regression for Classification
set.seed(12)
library(caTools)
library(ROCR)
# Note: lm() ignores the family argument (see the warning below); this fits an
# ordinary least-squares regression of the class label V1 on the pixel features
logistic_model <- lm(V1 ~., data = train, family = "binomial")
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'family' will be disregarded
summary(logistic_model)
##
## Call:
## lm(formula = V1 ~ ., data = train, family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59281 -0.09679 -0.00057 0.09365 0.70122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.359e+01 1.042e+02 0.226 0.82090
## V2 -2.889e-01 1.463e-01 -1.974 0.04860 *
## V3 5.930e-02 6.753e-02 0.878 0.38007
## V4 2.406e-02 3.827e-02 0.629 0.52976
## V5 -2.644e-02 2.721e-02 -0.972 0.33140
## V6 1.168e-02 2.328e-02 0.502 0.61604
## V7 3.749e-03 2.260e-02 0.166 0.86825
## V8 -3.795e-03 2.271e-02 -0.167 0.86733
## V9 -3.584e-03 2.229e-02 -0.161 0.87228
## V10 -1.366e-02 2.222e-02 -0.615 0.53881
## V11 2.426e-02 2.480e-02 0.978 0.32825
## V12 -5.717e-02 3.220e-02 -1.776 0.07608 .
## V13 7.696e-02 4.284e-02 1.796 0.07269 .
## V14 -9.824e-03 6.383e-02 -0.154 0.87772
## V15 1.736e-01 1.220e-01 1.423 0.15509
## V16 -2.109e-01 4.220e-01 -0.500 0.61740
## V17 2.125e+01 1.044e+02 0.204 0.83873
## V18 -5.654e-02 7.946e-02 -0.711 0.47692
## V19 -5.524e-02 4.052e-02 -1.363 0.17302
## V20 9.906e-03 2.500e-02 0.396 0.69198
## V21 -3.085e-03 1.961e-02 -0.157 0.87504
## V22 3.690e-03 1.868e-02 0.198 0.84345
## V23 -1.208e-02 2.327e-02 -0.519 0.60391
## V24 5.070e-02 2.860e-02 1.773 0.07649 .
## V25 -3.083e-02 2.729e-02 -1.129 0.25896
## V26 -1.087e-02 2.446e-02 -0.444 0.65688
## V27 -1.533e-04 2.257e-02 -0.007 0.99458
## V28 4.249e-02 2.191e-02 1.939 0.05273 .
## V29 3.745e-02 2.716e-02 1.379 0.16819
## V30 -2.430e-02 3.744e-02 -0.649 0.51642
## V31 -7.055e-03 6.460e-02 -0.109 0.91306
## V32 -3.003e-01 1.425e-01 -2.107 0.03533 *
## V33 8.525e-01 5.674e-01 1.503 0.13323
## V34 -1.079e-02 5.103e-02 -0.211 0.83259
## V35 -2.298e-02 3.081e-02 -0.746 0.45591
## V36 2.882e-02 2.205e-02 1.307 0.19146
## V37 1.026e-02 1.780e-02 0.577 0.56436
## V38 -4.103e-02 1.734e-02 -2.366 0.01815 *
## V39 8.164e-03 1.952e-02 0.418 0.67582
## V40 1.561e-03 2.149e-02 0.073 0.94209
## V41 -2.097e-03 2.142e-02 -0.098 0.92202
## V42 1.850e-02 2.052e-02 0.901 0.36755
## V43 4.566e-03 1.916e-02 0.238 0.81163
## V44 1.446e-03 1.863e-02 0.078 0.93813
## V45 -2.673e-02 2.204e-02 -1.212 0.22559
## V46 -1.844e-02 3.064e-02 -0.602 0.54730
## V47 -4.644e-02 4.975e-02 -0.933 0.35082
## V48 8.354e-02 8.009e-02 1.043 0.29715
## V49 7.162e-02 1.815e-01 0.395 0.69317
## V50 -1.420e-02 4.604e-02 -0.308 0.75784
## V51 -2.603e-02 3.109e-02 -0.837 0.40259
## V52 1.751e-02 2.248e-02 0.779 0.43618
## V53 -1.896e-02 1.862e-02 -1.018 0.30873
## V54 1.514e-02 1.783e-02 0.849 0.39585
## V55 1.397e-02 2.087e-02 0.670 0.50327
## V56 -2.504e-02 2.240e-02 -1.118 0.26392
## V57 2.974e-02 2.121e-02 1.402 0.16124
## V58 -1.804e-03 1.809e-02 -0.100 0.92058
## V59 -4.102e-02 1.668e-02 -2.458 0.01410 *
## V60 2.757e-02 1.828e-02 1.508 0.13185
## V61 4.684e-03 2.101e-02 0.223 0.82362
## V62 6.673e-03 2.872e-02 0.232 0.81631
## V63 8.091e-02 4.414e-02 1.833 0.06707 .
## V64 -1.846e-02 7.289e-02 -0.253 0.80015
## V65 2.570e-01 2.041e-01 1.259 0.20818
## V66 1.462e-02 5.442e-02 0.269 0.78822
## V67 7.517e-03 3.419e-02 0.220 0.82602
## V68 -3.749e-03 2.579e-02 -0.145 0.88448
## V69 2.808e-03 2.309e-02 0.122 0.90322
## V70 -2.304e-02 2.221e-02 -1.037 0.29975
## V71 1.092e-02 2.397e-02 0.455 0.64886
## V72 3.041e-04 2.449e-02 0.012 0.99009
## V73 -3.523e-02 2.185e-02 -1.612 0.10721
## V74 2.355e-02 1.845e-02 1.276 0.20215
## V75 -6.711e-03 1.734e-02 -0.387 0.69884
## V76 -5.136e-03 1.790e-02 -0.287 0.77425
## V77 -2.787e-02 2.140e-02 -1.302 0.19316
## V78 4.452e-03 2.760e-02 0.161 0.87186
## V79 -1.076e-01 4.149e-02 -2.593 0.00965 **
## V80 7.553e-02 7.257e-02 1.041 0.29820
## V81 -2.398e-01 1.821e-01 -1.317 0.18816
## V82 -4.837e-02 6.438e-02 -0.751 0.45262
## V83 6.218e-02 5.235e-02 1.188 0.23517
## V84 -2.446e-02 3.622e-02 -0.675 0.49961
## V85 6.275e-03 2.897e-02 0.217 0.82855
## V86 1.109e-02 2.718e-02 0.408 0.68342
## V87 3.097e-02 2.722e-02 1.138 0.25545
## V88 5.655e-03 2.403e-02 0.235 0.81403
## V89 3.318e-02 2.129e-02 1.559 0.11928
## V90 -2.013e-03 2.021e-02 -0.100 0.92065
## V91 9.206e-03 1.851e-02 0.497 0.61903
## V92 1.669e-02 1.742e-02 0.958 0.33826
## V93 -1.156e-02 2.006e-02 -0.576 0.56470
## V94 2.353e-02 2.546e-02 0.924 0.35560
## V95 2.604e-02 3.945e-02 0.660 0.50944
## V96 1.318e-02 6.705e-02 0.197 0.84421
## V97 -1.234e-01 1.241e-01 -0.995 0.32014
## V98 -8.955e-02 1.123e-01 -0.798 0.42518
## V99 -9.562e-02 7.489e-02 -1.277 0.20195
## V100 -1.489e-02 5.289e-02 -0.281 0.77842
## V101 -1.111e-02 3.468e-02 -0.320 0.74878
## V102 -1.996e-02 2.702e-02 -0.739 0.46020
## V103 -5.023e-02 2.362e-02 -2.126 0.03368 *
## V104 4.274e-02 2.252e-02 1.898 0.05799 .
## V105 4.326e-02 2.054e-02 2.107 0.03535 *
## V106 5.432e-02 2.045e-02 2.656 0.00802 **
## V107 4.746e-02 1.946e-02 2.438 0.01491 *
## V108 2.607e-02 1.764e-02 1.478 0.13982
## V109 2.328e-02 1.895e-02 1.228 0.21952
## V110 2.916e-02 2.330e-02 1.252 0.21100
## V111 -4.302e-03 3.609e-02 -0.119 0.90513
## V112 2.354e-02 5.477e-02 0.430 0.66746
## V113 7.671e-02 9.288e-02 0.826 0.40905
## V114 -8.837e-02 1.068e-01 -0.827 0.40829
## V115 1.454e-01 7.153e-02 2.032 0.04239 *
## V116 -1.779e-02 4.652e-02 -0.382 0.70228
## V117 9.317e-03 2.991e-02 0.312 0.75546
## V118 2.405e-02 2.360e-02 1.019 0.30846
## V119 1.311e-03 2.135e-02 0.061 0.95103
## V120 3.010e-03 2.023e-02 0.149 0.88177
## V121 1.710e-02 1.951e-02 0.876 0.38104
## V122 3.737e-02 1.881e-02 1.986 0.04725 *
## V123 7.454e-03 1.822e-02 0.409 0.68248
## V124 2.268e-02 1.721e-02 1.318 0.18780
## V125 2.557e-02 1.907e-02 1.341 0.18010
## V126 -9.931e-03 2.205e-02 -0.450 0.65254
## V127 4.140e-02 3.059e-02 1.353 0.17621
## V128 1.245e-02 4.112e-02 0.303 0.76204
## V129 3.814e-02 6.081e-02 0.627 0.53068
## V130 -2.789e-01 6.913e-02 -4.035 5.83e-05 ***
## V131 2.209e-02 4.801e-02 0.460 0.64550
## V132 -5.811e-02 3.457e-02 -1.681 0.09304 .
## V133 3.075e-02 2.814e-02 1.093 0.27472
## V134 -1.930e-02 2.452e-02 -0.787 0.43156
## V135 -9.028e-03 2.281e-02 -0.396 0.69234
## V136 -1.558e-02 2.252e-02 -0.692 0.48920
## V137 -1.661e-02 2.034e-02 -0.816 0.41446
## V138 1.144e-02 1.881e-02 0.608 0.54304
## V139 4.130e-02 1.783e-02 2.316 0.02072 *
## V140 5.062e-03 1.689e-02 0.300 0.76444
## V141 6.826e-03 1.830e-02 0.373 0.70917
## V142 5.325e-03 2.158e-02 0.247 0.80517
## V143 2.539e-03 2.624e-02 0.097 0.92295
## V144 -1.334e-02 3.434e-02 -0.388 0.69772
## V145 -6.290e-02 4.904e-02 -1.283 0.19993
## V146 1.343e-02 4.447e-02 0.302 0.76276
## V147 3.679e-03 3.236e-02 0.114 0.90951
## V148 -3.145e-02 2.749e-02 -1.144 0.25289
## V149 -4.165e-02 2.807e-02 -1.484 0.13821
## V150 -3.280e-02 2.661e-02 -1.232 0.21809
## V151 1.203e-02 2.554e-02 0.471 0.63762
## V152 -1.694e-02 2.473e-02 -0.685 0.49345
## V153 1.040e-02 2.260e-02 0.460 0.64552
## V154 -2.884e-02 2.109e-02 -1.368 0.17172
## V155 1.762e-04 1.854e-02 0.009 0.99242
## V156 1.821e-02 1.754e-02 1.038 0.29947
## V157 2.869e-02 1.856e-02 1.546 0.12248
## V158 3.160e-02 1.984e-02 1.593 0.11152
## V159 2.046e-03 2.366e-02 0.086 0.93110
## V160 4.416e-02 2.904e-02 1.521 0.12866
## V161 1.881e-02 3.428e-02 0.549 0.58331
## V162 -6.361e-02 3.021e-02 -2.105 0.03547 *
## V163 -5.579e-03 2.463e-02 -0.226 0.82086
## V164 1.738e-02 2.232e-02 0.779 0.43634
## V165 -9.026e-03 2.228e-02 -0.405 0.68551
## V166 -4.395e-02 2.456e-02 -1.789 0.07385 .
## V167 -2.628e-02 2.664e-02 -0.986 0.32410
## V168 -3.095e-02 2.775e-02 -1.115 0.26499
## V169 -2.619e-02 2.612e-02 -1.003 0.31627
## V170 -1.907e-02 2.366e-02 -0.806 0.42030
## V171 4.063e-02 2.054e-02 1.978 0.04813 *
## V172 -1.497e-02 1.780e-02 -0.841 0.40051
## V173 2.930e-02 1.844e-02 1.589 0.11225
## V174 -3.264e-02 1.898e-02 -1.719 0.08585 .
## V175 1.517e-02 2.063e-02 0.735 0.46227
## V176 7.702e-03 2.518e-02 0.306 0.75980
## V177 4.147e-02 3.003e-02 1.381 0.16755
## V178 4.390e-02 2.692e-02 1.631 0.10324
## V179 -5.062e-02 2.069e-02 -2.446 0.01459 *
## V180 -7.380e-03 1.761e-02 -0.419 0.67525
## V181 -1.602e-02 1.840e-02 -0.871 0.38417
## V182 1.752e-03 2.061e-02 0.085 0.93228
## V183 -2.106e-02 2.279e-02 -0.924 0.35561
## V184 -4.552e-02 2.487e-02 -1.830 0.06748 .
## V185 -2.354e-02 2.441e-02 -0.965 0.33497
## V186 -2.812e-03 2.243e-02 -0.125 0.90022
## V187 -2.401e-02 2.104e-02 -1.141 0.25401
## V188 9.361e-03 1.855e-02 0.505 0.61394
## V189 -2.083e-03 1.708e-02 -0.122 0.90297
## V190 2.197e-02 1.773e-02 1.239 0.21557
## V191 -2.111e-02 1.922e-02 -1.098 0.27238
## V192 1.703e-02 2.276e-02 0.748 0.45454
## V193 2.691e-03 2.492e-02 0.108 0.91401
## V194 -4.660e-02 2.839e-02 -1.641 0.10098
## V195 2.146e-02 1.939e-02 1.107 0.26855
## V196 -1.316e-02 1.664e-02 -0.791 0.42910
## V197 -1.417e-02 1.636e-02 -0.866 0.38668
## V198 -3.229e-02 1.841e-02 -1.755 0.07961 .
## V199 2.610e-02 2.035e-02 1.283 0.19980
## V200 -4.369e-02 2.332e-02 -1.874 0.06124 .
## V201 1.376e-02 2.430e-02 0.566 0.57140
## V202 -5.078e-02 2.343e-02 -2.167 0.03044 *
## V203 4.756e-02 2.084e-02 2.281 0.02271 *
## V204 7.793e-03 1.835e-02 0.425 0.67112
## V205 -3.428e-03 1.704e-02 -0.201 0.84064
## V206 2.721e-02 1.759e-02 1.547 0.12208
## V207 1.214e-02 1.930e-02 0.629 0.52931
## V208 -8.417e-03 2.073e-02 -0.406 0.68481
## V209 3.454e-02 2.450e-02 1.410 0.15893
## V210 3.092e-02 3.107e-02 0.995 0.31985
## V211 -3.030e-02 2.145e-02 -1.413 0.15800
## V212 -3.098e-02 1.689e-02 -1.834 0.06690 .
## V213 9.083e-03 1.664e-02 0.546 0.58528
## V214 -2.459e-02 1.913e-02 -1.285 0.19894
## V215 -1.368e-02 2.157e-02 -0.634 0.52614
## V216 2.497e-02 2.431e-02 1.027 0.30467
## V217 -1.457e-02 2.697e-02 -0.540 0.58923
## V218 -6.586e-02 2.604e-02 -2.530 0.01155 *
## V219 3.389e-02 2.367e-02 1.432 0.15253
## V220 -1.033e-02 2.161e-02 -0.478 0.63286
## V221 3.008e-02 1.967e-02 1.529 0.12654
## V222 -9.658e-03 1.890e-02 -0.511 0.60938
## V223 5.407e-03 1.966e-02 0.275 0.78340
## V224 -6.273e-03 2.234e-02 -0.281 0.77892
## V225 3.752e-02 2.619e-02 1.433 0.15214
## V226 -5.258e-02 4.128e-02 -1.274 0.20299
## V227 -2.386e-02 2.609e-02 -0.915 0.36059
## V228 1.558e-02 2.083e-02 0.748 0.45451
## V229 -1.749e-02 1.905e-02 -0.918 0.35890
## V230 -1.886e-02 2.274e-02 -0.830 0.40693
## V231 1.590e-02 2.760e-02 0.576 0.56475
## V232 -5.505e-02 3.320e-02 -1.658 0.09762 .
## V233 -3.491e-03 3.387e-02 -0.103 0.91793
## V234 3.095e-02 3.291e-02 0.941 0.34709
## V235 -2.589e-02 3.059e-02 -0.846 0.39752
## V236 2.768e-02 2.539e-02 1.090 0.27586
## V237 -2.828e-02 2.263e-02 -1.249 0.21175
## V238 3.263e-02 2.166e-02 1.507 0.13221
## V239 -5.134e-02 2.370e-02 -2.167 0.03048 *
## V240 6.830e-02 2.777e-02 2.459 0.01407 *
## V241 -3.051e-02 3.175e-02 -0.961 0.33678
## V242 -4.480e-02 8.777e-02 -0.510 0.60982
## V243 3.869e-02 4.678e-02 0.827 0.40842
## V244 -1.646e-02 3.536e-02 -0.466 0.64166
## V245 1.400e-02 2.852e-02 0.491 0.62350
## V246 -3.738e-02 2.746e-02 -1.361 0.17377
## V247 2.522e-02 2.958e-02 0.853 0.39409
## V248 4.456e-02 3.411e-02 1.306 0.19174
## V249 -1.026e-01 3.574e-02 -2.870 0.00418 **
## V250 9.799e-02 3.891e-02 2.519 0.01192 *
## V251 2.830e-02 3.921e-02 0.722 0.47054
## V252 1.048e-02 3.511e-02 0.299 0.76533
## V253 -1.313e-02 3.177e-02 -0.413 0.67939
## V254 -2.396e-02 2.789e-02 -0.859 0.39045
## V255 -6.728e-04 2.894e-02 -0.023 0.98146
## V256 1.846e-02 3.473e-02 0.532 0.59517
## V257 1.072e-02 5.534e-02 0.194 0.84638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1745 on 1132 degrees of freedom
## Multiple R-squared: 0.9005, Adjusted R-squared: 0.878
## F-statistic: 40.01 on 256 and 1132 DF, p-value: < 2.2e-16
plot(logistic_model)
## Warning: not plotting observations with leverage one:
## 681
(Diagnostic plots from plot(logistic_model): Residuals vs Fitted, Normal Q-Q, Scale-Location, and Residuals vs Leverage.)
Prediction
If the fitted value of Y is larger than 2.5, classify the observation as "3";
otherwise classify it as "2".
# Note: data.frame is not an argument of predict(), so it is ignored and predict()
# returns the fitted values on the training data (in-sample predictions)
pred = predict(logistic_model, data.frame = test)
head(pred)
## 1 2 3 4 5 6
## 2.885693 3.024018 2.955483 2.927301 3.001351 1.953989
## If the fitted value of Y is larger than 2.5, classify the label as 3; otherwise classify it as 2
npred = ifelse(pred > 2.5,'3','2')
head(npred)
## 1 2 3 4 5 6
## "3" "3" "3" "3" "3" "2"
Confusion Matrix & Accuracy of Linear Regression for Predicting the
"2" & "3" Classes
# Confusion Matrix (in-sample: fitted-value predictions vs. the training labels)
table(npred, train$V1)
##
## npred 2 3
## 2 728 5
## 3 3 653
# Accuracy
Reg_accuracy = (728 + 653) / (728 + 5 + 3 + 653)
Reg_accuracy
## [1] 0.9942405
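Because pred above contains in-sample fitted values, a minimal sketch of the corresponding out-of-sample evaluation, supplying the held-out digits through newdata (pred_test and npred_test are illustrative names; results not reproduced here):
pred_test  <- predict(logistic_model, newdata = test)
npred_test <- ifelse(pred_test > 2.5, '3', '2')
table(npred_test, test$V1)      # test-set confusion matrix
mean(npred_test != test$V1)     # test-set misclassification error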
Comparison of Accuracies of K-NN vs. Classification using
Regression
Accuracy=c(97.5,96.9, 96.9, 96.1, 99.4)
Classification_Method= c("KNN_k=1","KNN_k=5", "KNN_k=7", "KNN_k=15", "Regression")
Result= data.frame(Classification_Method, Accuracy)
Result
## Classification_Method Accuracy
## 1 KNN_k=1 97.5
## 2 KNN_k=5 96.9
## 3 KNN_k=7 96.9
## 4 KNN_k=15 96.1
## 5 Regression 99.4
ggplot(data = Result) +
geom_point(mapping = aes(x= Classification_Method , y =Accuracy, color =Classification_Method ,size= 3))
(Plot: accuracy of each classification method, colored by method.)
Result
The regression method has the highest accuracy among all of the classification
methods considered (note that its accuracy above was computed on the training fit),
while KNN with k=15 has the lowest accuracy.
Q2.
\[Best Subset Linear Regression Analysis\]
#install.packages('ISLR')
prostate=read.table("prostate.data")
head(prostate)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.769459 50 -1.386294 0 -1.386294 6 0 -0.4307829
## 2 -0.9942523 3.319626 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 3 -0.5108256 2.691243 74 -1.386294 0 -1.386294 7 20 -0.1625189
## 4 -1.2039728 3.282789 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 5 0.7514161 3.432373 62 -1.386294 0 -1.386294 6 0 0.3715636
## 6 -1.0498221 3.228826 50 -1.386294 0 -1.386294 6 0 0.7654678
## train
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
Splitting Test/Train
if (interactive()) par(ask = TRUE)
str( prostate )
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
cor( prostate[,1:8] )
## lcavol lweight age lbph svi lcp
## lcavol 1.0000000 0.28052138 0.2249999 0.027349703 0.53884500 0.675310484
## lweight 0.2805214 1.00000000 0.3479691 0.442264399 0.15538490 0.164537142
## age 0.2249999 0.34796911 1.0000000 0.350185896 0.11765804 0.127667752
## lbph 0.0273497 0.44226440 0.3501859 1.000000000 -0.08584324 -0.006999431
## svi 0.5388450 0.15538490 0.1176580 -0.085843238 1.00000000 0.673111185
## lcp 0.6753105 0.16453714 0.1276678 -0.006999431 0.67311118 1.000000000
## gleason 0.4324171 0.05688209 0.2688916 0.077820447 0.32041222 0.514830063
## pgg45 0.4336522 0.10735379 0.2761124 0.078460018 0.45764762 0.631528246
## gleason pgg45
## lcavol 0.43241706 0.43365225
## lweight 0.05688209 0.10735379
## age 0.26889160 0.27611245
## lbph 0.07782045 0.07846002
## svi 0.32041222 0.45764762
## lcp 0.51483006 0.63152825
## gleason 1.00000000 0.75190451
## pgg45 0.75190451 1.00000000
#pairs( prostate[,1:9], col="violet" )
train <- subset(prostate, train == TRUE)[, 1:9]
test  <- subset(prostate, train == FALSE)[, 1:9]
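A quick check of the resulting split sizes; with the usual ESL train indicator this should give 67 training and 30 test observations (output not shown here):
dim(train)   # expected: 67 rows, 9 columns
dim(test)    # expected: 30 rows, 9 columns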
Computing best subsets regression
#install.packages("leaps")
library(leaps)
models <- regsubsets(lcavol ~., data = prostate, nvmax = 5)
summary(models)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate, nvmax = 5)
## 9 Variables (and intercept)
## Forced in Forced out
## lweight FALSE FALSE
## age FALSE FALSE
## lbph FALSE FALSE
## svi FALSE FALSE
## lcp FALSE FALSE
## gleason FALSE FALSE
## pgg45 FALSE FALSE
## lpsa FALSE FALSE
## trainTRUE FALSE FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
## lweight age lbph svi lcp gleason pgg45 lpsa trainTRUE
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " "
## 2 ( 1 ) " " " " " " " " "*" " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " "*" " " " " "*" " "
## 4 ( 1 ) " " "*" "*" " " "*" " " " " "*" " "
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" "*" " "
The summary(models) output suggests that:
The best 1-variable model is lcavol ~ lpsa
The best 2-variable model is lcavol ~ lpsa + lcp
The best 3-variable model is lcavol ~ lpsa + lcp + age
The best 4-variable model is lcavol ~ lpsa + lcp + age + lbph
The best 5-variable model is lcavol ~ lpsa + lcp + age + lbph + pgg45
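The fitted coefficients of any of these subset models can be inspected with coef() applied to the regsubsets object, as is done for the 3-variable model further below; for example (values not reproduced here):
coef(models, 1)   # coefficients of the best 1-variable model
coef(models, 5)   # coefficients of the best 5-variable model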
\(Choosing the optimal model\)
Model selection criteria: Adjusted R2, Cp and BIC
res.sum <- summary(models)
data.frame(
  Adj.R2 = which.max(res.sum$adjr2),
  CP = which.min(res.sum$cp),
  BIC = which.min(res.sum$bic)
)
## Adj.R2 CP BIC
## 1 5 4 2
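The three criteria can also be compared visually against model size; a minimal sketch using the res.sum object from above (plots not shown here):
par(mfrow = c(1, 3))
plot(res.sum$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "b")
plot(res.sum$cp,    xlab = "Number of variables", ylab = "Cp",  type = "b")
plot(res.sum$bic,   xlab = "Number of variables", ylab = "BIC", type = "b")
par(mfrow = c(1, 1))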
There is no single correct solution to model selection; each of these criteria
can lead to a slightly different model. As the saying goes, "All models are
wrong, but some are useful."
\(K-fold cross-validation\)
# id:      model id (number of variables)
# object:  regsubsets object
# outcome: name of the outcome variable
get_model_formula <- function(id, object, outcome){
  # Logical vector indicating which predictors the id-th best model uses
  models <- summary(object)$which[id, -1]
  # Get outcome variable (alternative: recover it from the regsubsets call)
  #form <- as.formula(object$call[[2]])
  #outcome <- all.vars(form)[1]
  # Get model predictors
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  # Build the model formula
  as.formula(paste0(outcome, "~", predictors))
}
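For example, the formula for the best 3-variable model can be retrieved as follows; per the selection table above it should correspond to lcavol ~ age + lcp + lpsa (output not reproduced here):
get_model_formula(3, models, "lcavol")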
Cross-validation (CV) error for a given model (10-fold CV)
get_cv_error <- function(model.formula, data){
  set.seed(1)
  train.control <- trainControl(method = "cv", number = 10)
  cv <- train(model.formula, data = data, method = "lm",
              trControl = train.control)
  cv$results$RMSE
}
Compute cross-validation error
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
model.ids <- 1:5
cv.errors <- map(model.ids, get_model_formula, models, "lcavol") %>%
  map(get_cv_error, data = prostate) %>%
  unlist()
cv.errors
## [1] 0.7951732 0.7000323 0.7005508 0.6978059 0.6973657
Selecting the model that minimizes the CV error
which.min(cv.errors)
## [1] 5
coef(models, 3)
## (Intercept) age lcp lpsa
## -0.78389039 0.01419994 0.32463191 0.51857698
The 5-variable model has the lowest CV error, but the errors for the 3-, 4-, and
5-variable models are nearly identical, so the more parsimonious 3-variable model
lcavol ~ age + lcp + lpsa (coefficients shown above) is a reasonable choice.
\(Bootstrapping\)
# Define training control
train.control <- trainControl(method = "boot", number = 100)
# Train the model
model <- train(lcavol ~., data = prostate, method = "lm",
               trControl = train.control)
# Summarize the results
print(model)
## Linear Regression
##
## 97 samples
## 9 predictor
##
## No pre-processing
## Resampling: Bootstrapped (100 reps)
## Summary of sample sizes: 97, 97, 97, 97, 97, 97, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.7660011 0.599168 0.6218542
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Statistic function for boot(): fit the full linear model on the rows given by
# index and return the estimated coefficients
model_coef <- function(data, index){
  coef(lm(lcavol ~ ., data = data, subset = index))
}
model_coef(prostate, 1:47)
## (Intercept) lweight age lbph svi lcp
## -0.510380563 -0.387696845 0.040172761 -0.097792007 0.415637032 0.504957399
## gleason pgg45 lpsa trainTRUE
## -0.040976729 -0.003923027 0.537951095 -0.265304310
#install.packages("boot")
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
boot(prostate, model_coef, 500)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = prostate, statistic = model_coef, R = 500)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -2.372185636 -0.1551835932 1.34190105
## t2* -0.027959007 -0.0012411248 0.21784897
## t3* 0.022728547 0.0008333528 0.01173151
## t4* -0.094381577 -0.0035134304 0.06373317
## t5* -0.150633965 -0.0027424909 0.22672945
## t6* 0.364920411 -0.0018152176 0.08410792
## t7* 0.189025885 0.0150782552 0.14930361
## t8* -0.006859374 -0.0003834078 0.00404993
## t9* 0.565415923 0.0063836946 0.09796613
## t10* -0.044701905 0.0004045562 0.15262520
summary(lm(lcavol ~., data = prostate))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.372185636 1.315776196 -1.8028793 7.486958e-02
## lweight -0.027959007 0.213067115 -0.1312216 8.959030e-01
## age 0.022728547 0.011362056 2.0003902 4.857650e-02
## lbph -0.094381577 0.058540554 -1.6122426 1.105311e-01
## svi -0.150633965 0.255776491 -0.5889281 5.574356e-01
## lcp 0.364920411 0.082658408 4.4148009 2.891665e-05
## gleason 0.189025885 0.157891204 1.1971907 2.344841e-01
## pgg45 -0.006859374 0.004454136 -1.5400011 1.271895e-01
## lpsa 0.565415923 0.088543473 6.3857436 8.079318e-09
## trainTRUE -0.044701905 0.162846170 -0.2745039 7.843482e-01
The bootstrap approach does not rely on the assumptions made by the linear model
(such as a correctly specified model and constant error variance), and so it
likely gives a more accurate estimate of the coefficients' standard errors than
the summary() function does.
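To make that comparison concrete, the bootstrap replicates can be stored and their standard errors and percentile confidence intervals extracted; a minimal sketch using the model_coef function defined above (boot_out is an illustrative name; results not reproduced here):
set.seed(1)
boot_out <- boot(prostate, model_coef, R = 500)
apply(boot_out$t, 2, sd)                      # bootstrap standard error of each coefficient
boot.ci(boot_out, type = "perc", index = 9)   # percentile CI for the lpsa coefficient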