suppressMessages(library(ISLR))
# Preview the first six rows of the stock-market data set.
head(Smarket, n = 6L)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
# Work on a local copy called `smarket`; show its column names as shipped.
smarket <- Smarket
colnames(smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
# Lower-case every column name, then summarise each column.
colnames(smarket) <- tolower(colnames(smarket))
summary(smarket)
## year lag1 lag2
## Min. :2001 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500
## Median :2003 Median : 0.039000 Median : 0.039000
## Mean :2003 Mean : 0.003834 Mean : 0.003919
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000
## lag3 lag4 lag5
## Min. :-4.922000 Min. :-4.922000 Min. :-4.92200
## 1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000
## Median : 0.038500 Median : 0.038500 Median : 0.03850
## Mean : 0.001716 Mean : 0.001636 Mean : 0.00561
## 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700
## Max. : 5.733000 Max. : 5.733000 Max. : 5.73300
## volume today direction
## Min. :0.3561 Min. :-4.922000 Down:602
## 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
## Median :1.4229 Median : 0.038500
## Mean :1.4783 Mean : 0.003138
## 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. :3.1525 Max. : 5.733000
# Pairwise correlations between the numeric columns. The factor column
# `direction` is left out by type rather than by its position (column 9),
# so the call keeps working if columns are added or reordered.
cor(smarket[, vapply(smarket, is.numeric, logical(1))])
## year lag1 lag2 lag3 lag4
## year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## lag5 volume today
## year 0.029787995 0.53900647 0.030095229
## lag1 -0.005674606 0.04090991 -0.026155045
## lag2 -0.003557949 -0.04338321 -0.010250033
## lag3 -0.018808338 -0.04182369 -0.002447647
## lag4 -0.027083641 -0.04841425 -0.006899527
## lag5 1.000000000 -0.02200231 -0.034860083
## volume -0.022002315 1.00000000 0.014591823
## today -0.034860083 0.01459182 1.000000000
# Trading volume in row order (x axis is the row index).
plot(smarket$volume, ylab = "volume")

# Trading volume against calendar year.
plot(volume ~ year, data = smarket)

# Hold out 2005 as the test set; every other year is used for training.
train <- smarket[smarket$year != 2005, ]
test <- smarket[smarket$year == 2005, ]
# Predictor matrices for knn(): only lag1 and lag2 are used.
train.x <- cbind(lag1 = train$lag1, lag2 = train$lag2)
test.x <- cbind(lag1 = test$lag1, lag2 = test$lag2)
# Known class labels of the training rows, passed to knn() as `cl`.
train.direction <- train$direction
The knn() function is provided by the “class” package.
K=1
library(class)
# Fix the RNG seed first so the classification below is reproducible.
set.seed(1)
knn.pred <- knn(train = train.x, test = test.x, cl = train.direction, k = 1)
# Confusion matrix: rows are predictions, columns are the true 2005 directions.
table(knn.pred, test$direction)
##
## knn.pred Down Up
## Down 43 58
## Up 68 83
# Fraction of test days classified correctly.
mean(knn.pred == test$direction)
## [1] 0.5
K=3
# Same classifier, now voting over the three nearest neighbours.
knn.pred2 <- knn(train = train.x, test = test.x, cl = train.direction, k = 3)
# Confusion matrix: rows are predictions, columns are the true 2005 directions.
table(knn.pred2, test$direction)
##
## knn.pred2 Down Up
## Down 48 54
## Up 63 87
# Fraction of test days classified correctly.
mean(knn.pred2 == test$direction)
## [1] 0.5357143
Example of KNN - Caravan Insurance Data
library(ISLR)
# Rows x columns of the packaged data set.
dim(Caravan)
## [1] 5822 86
# Local copy of the data with lower-case column names.
caravan <- setNames(Caravan, tolower(names(Caravan)))
head(caravan)
## mostype maanthui mgemomv mgemleef moshoofd mgodrk mgodpr mgodov mgodge
## 1 33 1 3 2 8 0 5 1 3
## 2 37 1 2 2 8 1 4 1 4
## 3 37 1 2 2 8 0 4 2 4
## 4 9 1 3 3 3 2 3 2 4
## 5 40 1 4 2 10 1 4 1 4
## 6 23 1 2 1 5 0 5 0 5
## mrelge mrelsa mrelov mfalleen mfgekind mfwekind moplhoog moplmidd
## 1 7 0 2 1 2 6 1 2
## 2 6 2 2 0 4 5 0 5
## 3 3 2 4 4 4 2 0 5
## 4 5 2 2 2 3 4 3 4
## 5 7 1 2 2 4 4 5 4
## 6 0 6 3 3 5 2 0 5
## mopllaag mberhoog mberzelf mberboer mbermidd mberarbg mberarbo mska
## 1 7 1 0 1 2 5 2 1
## 2 4 0 0 0 5 0 4 0
## 3 4 0 0 0 7 0 2 0
## 4 2 4 0 0 3 1 2 3
## 5 0 0 5 4 0 0 0 9
## 6 4 2 0 0 4 2 2 2
## mskb1 mskb2 mskc mskd mhhuur mhkoop maut1 maut2 maut0 mzfonds mzpart
## 1 1 2 6 1 1 8 8 0 1 8 1
## 2 2 3 5 0 2 7 7 1 2 6 3
## 3 5 0 4 0 7 2 7 0 2 9 0
## 4 2 1 4 0 5 4 9 0 0 7 2
## 5 0 0 0 0 4 5 6 2 1 5 4
## 6 2 2 4 2 9 0 5 3 3 9 0
## minkm30 mink3045 mink4575 mink7512 mink123m minkgem mkoopkla pwapart
## 1 0 4 5 0 0 4 3 0
## 2 2 0 5 2 0 5 4 2
## 3 4 5 0 0 0 3 4 2
## 4 1 5 3 0 0 4 4 0
## 5 0 0 9 0 0 6 3 0
## 6 5 2 3 0 0 3 3 0
## pwabedr pwaland ppersaut pbesaut pmotsco pvraaut paanhang ptractor
## 1 0 0 6 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 6 0 0 0 0 0
## 4 0 0 6 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 6 0 0 0 0 0
## pwerkt pbrom pleven ppersong pgezong pwaoreg pbrand pzeilpl pplezier
## 1 0 0 0 0 0 0 5 0 0
## 2 0 0 0 0 0 0 2 0 0
## 3 0 0 0 0 0 0 2 0 0
## 4 0 0 0 0 0 0 2 0 0
## 5 0 0 0 0 0 0 6 0 0
## 6 0 0 0 0 0 0 0 0 0
## pfiets pinboed pbystand awapart awabedr awaland apersaut abesaut amotsco
## 1 0 0 0 0 0 0 1 0 0
## 2 0 0 0 2 0 0 0 0 0
## 3 0 0 0 1 0 0 1 0 0
## 4 0 0 0 0 0 0 1 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0
## avraaut aaanhang atractor awerkt abrom aleven apersong agezong awaoreg
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## abrand azeilpl aplezier afiets ainboed abystand purchase
## 1 1 0 0 0 0 0 No
## 2 1 0 0 0 0 0 No
## 3 1 0 0 0 0 0 No
## 4 1 0 0 0 0 0 No
## 5 1 0 0 0 0 0 No
## 6 0 0 0 0 0 0 No
# List all column names; the response `purchase` is the last one.
colnames(caravan)
## [1] "mostype" "maanthui" "mgemomv" "mgemleef" "moshoofd" "mgodrk"
## [7] "mgodpr" "mgodov" "mgodge" "mrelge" "mrelsa" "mrelov"
## [13] "mfalleen" "mfgekind" "mfwekind" "moplhoog" "moplmidd" "mopllaag"
## [19] "mberhoog" "mberzelf" "mberboer" "mbermidd" "mberarbg" "mberarbo"
## [25] "mska" "mskb1" "mskb2" "mskc" "mskd" "mhhuur"
## [31] "mhkoop" "maut1" "maut2" "maut0" "mzfonds" "mzpart"
## [37] "minkm30" "mink3045" "mink4575" "mink7512" "mink123m" "minkgem"
## [43] "mkoopkla" "pwapart" "pwabedr" "pwaland" "ppersaut" "pbesaut"
## [49] "pmotsco" "pvraaut" "paanhang" "ptractor" "pwerkt" "pbrom"
## [55] "pleven" "ppersong" "pgezong" "pwaoreg" "pbrand" "pzeilpl"
## [61] "pplezier" "pfiets" "pinboed" "pbystand" "awapart" "awabedr"
## [67] "awaland" "apersaut" "abesaut" "amotsco" "avraaut" "aaanhang"
## [73] "atractor" "awerkt" "abrom" "aleven" "apersong" "agezong"
## [79] "awaoreg" "abrand" "azeilpl" "aplezier" "afiets" "ainboed"
## [85] "abystand" "purchase"
# NOTE(review): attach() is kept only because later code refers to `purchase`
# as a bare name; prefer caravan$purchase in new code.
attach(caravan)
# Class balance of the response.
summary(caravan$purchase)
## No Yes
## 5474 348
# Baseline success rate: the share of customers who bought. It is computed
# from the data rather than from counts copied by hand out of the summary.
random_guess_rate <- mean(caravan$purchase == "Yes")
round(random_guess_rate * 100, 2)
## [1] 5.98
For the insurance company, the goal is to predict which customers are likely to buy a policy, so that sales effort can be targeted at those people only. Let’s call the fraction of targeted customers who actually buy the “success rate”.
Based on the current buyer rate of 5.98%, targeting people at random from the general population gives a success rate of roughly 6%. Can we do better by classifying each person as buyer = yes or no, given the predictor variables we know about them?
# Standardize every predictor column. The response is excluded by name
# instead of by its position (column 86), so the call does not depend on
# column order.
stand.x <- scale(caravan[, names(caravan) != "purchase"])
# before standardizing
var(caravan[, 1])
## [1] 165.0378
var(caravan[, 2])
## [1] 0.1647078
# after standardizing
var(stand.x[, 1])
## [1] 1
var(stand.x[, 2])
## [1] 1
# The first 1000 rows form the test set; all remaining rows are for training.
index <- seq_len(1000)
test.x <- stand.x[index, ]
train.x <- stand.x[-index, ]
# Sanity-check the sizes of the predictor matrices.
dim(stand.x)
## [1] 5822 85
dim(train.x)
## [1] 4822 85
dim(test.x)
## [1] 1000 85
# Split the response the same way, then check the lengths.
test.y <- purchase[index]
train.y <- purchase[-index]
length(purchase)
## [1] 5822
length(train.y)
## [1] 4822
length(test.y)
## [1] 1000
k = 1
# Fix the RNG seed first so the classification below is reproducible.
set.seed(1)
knn.pred.car <- knn(train.x, test.x, train.y, k = 1)
# Confusion counts in long format: one row per (prediction, truth) pair.
df_k1 <- as.data.frame(table(knn.pred.car, test.y))
df_k1
## knn.pred.car test.y Freq
## 1 No No 873
## 2 Yes No 68
## 3 No Yes 50
## 4 Yes Yes 9
# prediction accuracy rate:
mean(knn.pred.car == test.y)
## [1] 0.882
# prediction error rate:
mean(knn.pred.car != test.y)
## [1] 0.118
# success rate among predicted buyers: of the customers predicted "Yes", the
# fraction who actually bought. Rows are selected by label, not by position,
# so the result does not depend on the row order of df_k1.
k1_success_rate <- with(
  df_k1,
  sum(Freq[knn.pred.car == "Yes" & test.y == "Yes"]) /
    sum(Freq[knn.pred.car == "Yes"])
)
k1_success_rate
## [1] 0.1168831
k = 3
# Fix the RNG seed first so the classification below is reproducible.
set.seed(1)
knn.pred.car <- knn(train.x, test.x, train.y, k = 3)
# Confusion counts in long format: one row per (prediction, truth) pair.
df_k3 <- as.data.frame(table(knn.pred.car, test.y))
df_k3
## knn.pred.car test.y Freq
## 1 No No 921
## 2 Yes No 20
## 3 No Yes 54
## 4 Yes Yes 5
# prediction accuracy rate:
mean(knn.pred.car == test.y)
## [1] 0.926
# prediction error rate:
mean(knn.pred.car != test.y)
## [1] 0.074
# success rate among predicted buyers: of the customers predicted "Yes", the
# fraction who actually bought. Rows are selected by label, not by position,
# so the result does not depend on the row order of df_k3.
k3_success_rate <- with(
  df_k3,
  sum(Freq[knn.pred.car == "Yes" & test.y == "Yes"]) /
    sum(Freq[knn.pred.car == "Yes"])
)
k3_success_rate
## [1] 0.2
k = 5
# Fix the RNG seed first so the classification below is reproducible.
set.seed(1)
knn.pred.car <- knn(train.x, test.x, train.y, k = 5)
# Confusion counts in long format: one row per (prediction, truth) pair.
df_k5 <- as.data.frame(table(knn.pred.car, test.y))
df_k5
## knn.pred.car test.y Freq
## 1 No No 930
## 2 Yes No 11
## 3 No Yes 55
## 4 Yes Yes 4
# prediction accuracy rate:
mean(knn.pred.car == test.y)
## [1] 0.934
# prediction error rate:
mean(knn.pred.car != test.y)
## [1] 0.066
# success rate among predicted buyers: of the customers predicted "Yes", the
# fraction who actually bought. Rows are selected by label, not by position,
# so the result does not depend on the row order of df_k5.
k5_success_rate <- with(
  df_k5,
  sum(Freq[knn.pred.car == "Yes" & test.y == "Yes"]) /
    sum(Freq[knn.pred.car == "Yes"])
)
k5_success_rate
## [1] 0.2666667