suppressMessages(library(ISLR))
# Preview the first six rows of the Smarket data set loaded with ISLR.
head(Smarket, n = 6L)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up
# Work on a copy named smarket; the column names are changed on the copy only.
smarket <- Smarket
colnames(smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
# Lower-case the column names so later code can refer to lag1, volume, ...
colnames(smarket) <- tolower(colnames(smarket))

# Per-column summary statistics (printed output follows).
summary(smarket)
##       year           lag1                lag2          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500  
##  Median :2003   Median : 0.039000   Median : 0.039000  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000  
##       lag3                lag4                lag5         
##  Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200  
##  1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000  
##  Median : 0.038500   Median : 0.038500   Median : 0.03850  
##  Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561  
##  3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700  
##  Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300  
##      volume           today           direction 
##  Min.   :0.3561   Min.   :-4.922000   Down:602  
##  1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648  
##  Median :1.4229   Median : 0.038500             
##  Mean   :1.4783   Mean   : 0.003138             
##  3rd Qu.:1.6417   3rd Qu.: 0.596750             
##  Max.   :3.1525   Max.   : 5.733000
# Pairwise correlations of the numeric columns; the factor column `direction`
# (column 9) is dropped by name because cor() needs numeric input.
cor(subset(smarket, select = -direction))
##              year         lag1         lag2         lag3         lag4
## year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                lag5      volume        today
## year    0.029787995  0.53900647  0.030095229
## lag1   -0.005674606  0.04090991 -0.026155045
## lag2   -0.003557949 -0.04338321 -0.010250033
## lag3   -0.018808338 -0.04182369 -0.002447647
## lag4   -0.027083641 -0.04841425 -0.006899527
## lag5    1.000000000 -0.02200231 -0.034860083
## volume -0.022002315  1.00000000  0.014591823
## today  -0.034860083  0.01459182  1.000000000
# Volume plotted against row index.
plot(smarket$volume, ylab = "volume")

# Volume plotted against year.
plot(volume ~ year, data = smarket)

# Hold out the 2005 rows as the test set; all other years form the training set.
is_2005 <- smarket$year == 2005
test <- smarket[is_2005, ]
train <- smarket[!is_2005, ]
# Predictor matrices built from the first two lags only.
train.x <- cbind(lag1 = train$lag1, lag2 = train$lag2)
test.x <- cbind(lag1 = test$lag1, lag2 = test$lag2)
train.direction <- train$direction # class labels of the training rows

knn() is part of the “class” package.

K=1

library(class) # provides knn()

# Fix the RNG state first so the run can be reproduced.
set.seed(1)

# 1-nearest-neighbour prediction of direction for the 2005 test rows.
knn.pred <- knn(train = train.x, test = test.x, cl = train.direction, k = 1)
# Confusion matrix: rows are predictions, columns are the observed direction.
table(knn.pred, test$direction)
##         
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
# Share of test rows whose direction was predicted correctly.
mean(test$direction == knn.pred)
## [1] 0.5

K=3

# Same fit as above, but voting over the 3 nearest neighbours.
knn.pred2 <- knn(train = train.x, test = test.x, cl = train.direction, k = 3)
# Confusion matrix: rows are predictions, columns are the observed direction.
table(knn.pred2, test$direction)
##          
## knn.pred2 Down Up
##      Down   48 54
##      Up     63 87
# Share of test rows whose direction was predicted correctly.
mean(test$direction == knn.pred2)
## [1] 0.5357143

Example of KNN - Caravan Insurance Data

library(ISLR)
# Number of rows and columns in the Caravan data set.
dim(Caravan)
## [1] 5822   86
# Working copy of Caravan with lower-cased column names.
caravan <- setNames(Caravan, tolower(names(Caravan)))

# First six rows of the working copy (printed output follows).
head(caravan)
##   mostype maanthui mgemomv mgemleef moshoofd mgodrk mgodpr mgodov mgodge
## 1      33        1       3        2        8      0      5      1      3
## 2      37        1       2        2        8      1      4      1      4
## 3      37        1       2        2        8      0      4      2      4
## 4       9        1       3        3        3      2      3      2      4
## 5      40        1       4        2       10      1      4      1      4
## 6      23        1       2        1        5      0      5      0      5
##   mrelge mrelsa mrelov mfalleen mfgekind mfwekind moplhoog moplmidd
## 1      7      0      2        1        2        6        1        2
## 2      6      2      2        0        4        5        0        5
## 3      3      2      4        4        4        2        0        5
## 4      5      2      2        2        3        4        3        4
## 5      7      1      2        2        4        4        5        4
## 6      0      6      3        3        5        2        0        5
##   mopllaag mberhoog mberzelf mberboer mbermidd mberarbg mberarbo mska
## 1        7        1        0        1        2        5        2    1
## 2        4        0        0        0        5        0        4    0
## 3        4        0        0        0        7        0        2    0
## 4        2        4        0        0        3        1        2    3
## 5        0        0        5        4        0        0        0    9
## 6        4        2        0        0        4        2        2    2
##   mskb1 mskb2 mskc mskd mhhuur mhkoop maut1 maut2 maut0 mzfonds mzpart
## 1     1     2    6    1      1      8     8     0     1       8      1
## 2     2     3    5    0      2      7     7     1     2       6      3
## 3     5     0    4    0      7      2     7     0     2       9      0
## 4     2     1    4    0      5      4     9     0     0       7      2
## 5     0     0    0    0      4      5     6     2     1       5      4
## 6     2     2    4    2      9      0     5     3     3       9      0
##   minkm30 mink3045 mink4575 mink7512 mink123m minkgem mkoopkla pwapart
## 1       0        4        5        0        0       4        3       0
## 2       2        0        5        2        0       5        4       2
## 3       4        5        0        0        0       3        4       2
## 4       1        5        3        0        0       4        4       0
## 5       0        0        9        0        0       6        3       0
## 6       5        2        3        0        0       3        3       0
##   pwabedr pwaland ppersaut pbesaut pmotsco pvraaut paanhang ptractor
## 1       0       0        6       0       0       0        0        0
## 2       0       0        0       0       0       0        0        0
## 3       0       0        6       0       0       0        0        0
## 4       0       0        6       0       0       0        0        0
## 5       0       0        0       0       0       0        0        0
## 6       0       0        6       0       0       0        0        0
##   pwerkt pbrom pleven ppersong pgezong pwaoreg pbrand pzeilpl pplezier
## 1      0     0      0        0       0       0      5       0        0
## 2      0     0      0        0       0       0      2       0        0
## 3      0     0      0        0       0       0      2       0        0
## 4      0     0      0        0       0       0      2       0        0
## 5      0     0      0        0       0       0      6       0        0
## 6      0     0      0        0       0       0      0       0        0
##   pfiets pinboed pbystand awapart awabedr awaland apersaut abesaut amotsco
## 1      0       0        0       0       0       0        1       0       0
## 2      0       0        0       2       0       0        0       0       0
## 3      0       0        0       1       0       0        1       0       0
## 4      0       0        0       0       0       0        1       0       0
## 5      0       0        0       0       0       0        0       0       0
## 6      0       0        0       0       0       0        1       0       0
##   avraaut aaanhang atractor awerkt abrom aleven apersong agezong awaoreg
## 1       0        0        0      0     0      0        0       0       0
## 2       0        0        0      0     0      0        0       0       0
## 3       0        0        0      0     0      0        0       0       0
## 4       0        0        0      0     0      0        0       0       0
## 5       0        0        0      0     0      0        0       0       0
## 6       0        0        0      0     0      0        0       0       0
##   abrand azeilpl aplezier afiets ainboed abystand purchase
## 1      1       0        0      0       0        0       No
## 2      1       0        0      0       0        0       No
## 3      1       0        0      0       0        0       No
## 4      1       0        0      0       0        0       No
## 5      1       0        0      0       0        0       No
## 6      0       0        0      0       0        0       No
# List all column names of the working copy (printed output follows).
colnames(caravan)
##  [1] "mostype"  "maanthui" "mgemomv"  "mgemleef" "moshoofd" "mgodrk"  
##  [7] "mgodpr"   "mgodov"   "mgodge"   "mrelge"   "mrelsa"   "mrelov"  
## [13] "mfalleen" "mfgekind" "mfwekind" "moplhoog" "moplmidd" "mopllaag"
## [19] "mberhoog" "mberzelf" "mberboer" "mbermidd" "mberarbg" "mberarbo"
## [25] "mska"     "mskb1"    "mskb2"    "mskc"     "mskd"     "mhhuur"  
## [31] "mhkoop"   "maut1"    "maut2"    "maut0"    "mzfonds"  "mzpart"  
## [37] "minkm30"  "mink3045" "mink4575" "mink7512" "mink123m" "minkgem" 
## [43] "mkoopkla" "pwapart"  "pwabedr"  "pwaland"  "ppersaut" "pbesaut" 
## [49] "pmotsco"  "pvraaut"  "paanhang" "ptractor" "pwerkt"   "pbrom"   
## [55] "pleven"   "ppersong" "pgezong"  "pwaoreg"  "pbrand"   "pzeilpl" 
## [61] "pplezier" "pfiets"   "pinboed"  "pbystand" "awapart"  "awabedr" 
## [67] "awaland"  "apersaut" "abesaut"  "amotsco"  "avraaut"  "aaanhang"
## [73] "atractor" "awerkt"   "abrom"    "aleven"   "apersong" "agezong" 
## [79] "awaoreg"  "abrand"   "azeilpl"  "aplezier" "afiets"   "ainboed" 
## [85] "abystand" "purchase"
# Bind the response to its own variable instead of attach()-ing the whole data
# frame: attach() would place all 86 columns on the search path, where they can
# silently mask (or be masked by) other objects. Later code only reads
# `purchase`, so this single binding is sufficient.
purchase <- caravan$purchase
summary(purchase)
##   No  Yes 
## 5474  348
# Baseline buyer rate: the share of "Yes" among all customers, computed from
# the data rather than from counts copied by hand out of the summary above.
random_guess_rate <- mean(purchase == "Yes")
round(random_guess_rate * 100, 2)
## [1] 5.98

So the insurance company's goal is: can we predict who the likely buyers of an insurance policy are, so that we can target only those people and increase our sales efficiently? Let’s call the fraction of targeted people who actually buy the “success rate”.

Based on the current buyer rate of 5.98%, randomly guessing among the general population will result in a success rate of roughly 6%. Can we do better by classifying the general population into buyer = yes or no, given the predictor variables we know about them?

# Centre and scale every predictor; the response column `purchase` (column 86)
# is left out by name. KNN is distance-based, so putting the predictors on a
# common scale keeps any one of them from dominating purely through its units.
stand.x <- scale(caravan[, names(caravan) != "purchase"])

# Variances of the first two predictors on their original scales ...
var(caravan[[1]])
## [1] 165.0378
var(caravan[[2]])
## [1] 0.1647078
# ... and after scaling, where each has variance 1.
var(stand.x[, 1])
## [1] 1
var(stand.x[, 2])
## [1] 1
# Row numbers of the held-out test set: the first 1000 customers.
index <- seq_len(1000)

# Split the standardised predictors into training and test matrices.
train.x <- stand.x[-index, ]
test.x <- stand.x[index, ]
# Size checks: the 5822 rows divide into 4822 training and 1000 test rows.
dim(stand.x)
## [1] 5822   85
dim(train.x)
## [1] 4822   85
dim(test.x)
## [1] 1000   85
# Split the response the same way so the labels line up with the rows above.
train.y <- purchase[-index]
test.y <- purchase[index]
length(purchase)
## [1] 5822
length(train.y)
## [1] 4822
length(test.y)
## [1] 1000

k = 1

set.seed(1)
knn.pred.car <- knn(train = train.x, test = test.x, cl = train.y, k = 1)
# Confusion counts in long format: one row per (prediction, truth) pair.
df_k1 <- as.data.frame(table(knn.pred.car, test.y))
df_k1
##   knn.pred.car test.y Freq
## 1           No     No  873
## 2          Yes     No   68
## 3           No    Yes   50
## 4          Yes    Yes    9
# Prediction accuracy: share of test customers classified correctly.
mean(test.y == knn.pred.car)
## [1] 0.882
# Prediction error rate: share of test customers misclassified.
mean(test.y != knn.pred.car)
## [1] 0.118
# Success rate among likely buyers: of the customers predicted "Yes", the
# fraction who actually bought. Rows are selected by label, not by position.
flagged_yes <- df_k1$knn.pred.car == "Yes"
k1_success_rate <- sum(df_k1$Freq[flagged_yes & df_k1$test.y == "Yes"]) /
  sum(df_k1$Freq[flagged_yes])
k1_success_rate
## [1] 0.1168831

k = 3

set.seed(1)
knn.pred.car <- knn(train = train.x, test = test.x, cl = train.y, k = 3)
# Confusion counts in long format: one row per (prediction, truth) pair.
df_k3 <- as.data.frame(table(knn.pred.car, test.y))
df_k3
##   knn.pred.car test.y Freq
## 1           No     No  921
## 2          Yes     No   20
## 3           No    Yes   54
## 4          Yes    Yes    5
# Prediction accuracy: share of test customers classified correctly.
mean(test.y == knn.pred.car)
## [1] 0.926
# Prediction error rate: share of test customers misclassified.
mean(test.y != knn.pred.car)
## [1] 0.074
# Success rate among likely buyers: of the customers predicted "Yes", the
# fraction who actually bought. Rows are selected by label, not by position.
flagged_yes <- df_k3$knn.pred.car == "Yes"
k3_success_rate <- sum(df_k3$Freq[flagged_yes & df_k3$test.y == "Yes"]) /
  sum(df_k3$Freq[flagged_yes])
k3_success_rate
## [1] 0.2

k = 5

set.seed(1)
knn.pred.car <- knn(train = train.x, test = test.x, cl = train.y, k = 5)
# Confusion counts in long format: one row per (prediction, truth) pair.
df_k5 <- as.data.frame(table(knn.pred.car, test.y))
df_k5
##   knn.pred.car test.y Freq
## 1           No     No  930
## 2          Yes     No   11
## 3           No    Yes   55
## 4          Yes    Yes    4
# Prediction accuracy: share of test customers classified correctly.
mean(test.y == knn.pred.car)
## [1] 0.934
# Prediction error rate: share of test customers misclassified.
mean(test.y != knn.pred.car)
## [1] 0.066
# Success rate among likely buyers: of the customers predicted "Yes", the
# fraction who actually bought. Rows are selected by label, not by position.
flagged_yes <- df_k5$knn.pred.car == "Yes"
k5_success_rate <- sum(df_k5$Freq[flagged_yes & df_k5$test.y == "Yes"]) /
  sum(df_k5$Freq[flagged_yes])
k5_success_rate
## [1] 0.2666667