suppressMessages(library(ISLR))
## Warning: package 'ISLR' was built under R version 3.3.1
head(Smarket)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
smarket = Smarket
names(smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
names(smarket) = tolower(names(smarket))
summary(smarket)
## year lag1 lag2
## Min. :2001 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500
## Median :2003 Median : 0.039000 Median : 0.039000
## Mean :2003 Mean : 0.003834 Mean : 0.003919
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000
## lag3 lag4 lag5
## Min. :-4.922000 Min. :-4.922000 Min. :-4.92200
## 1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000
## Median : 0.038500 Median : 0.038500 Median : 0.03850
## Mean : 0.001716 Mean : 0.001636 Mean : 0.00561
## 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700
## Max. : 5.733000 Max. : 5.733000 Max. : 5.73300
## volume today direction
## Min. :0.3561 Min. :-4.922000 Down:602
## 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
## Median :1.4229 Median : 0.038500
## Mean :1.4783 Mean : 0.003138
## 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. :3.1525 Max. : 5.733000
cor(smarket[,-9])
## year lag1 lag2 lag3 lag4
## year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## lag5 volume today
## year 0.029787995 0.53900647 0.030095229
## lag1 -0.005674606 0.04090991 -0.026155045
## lag2 -0.003557949 -0.04338321 -0.010250033
## lag3 -0.018808338 -0.04182369 -0.002447647
## lag4 -0.027083641 -0.04841425 -0.006899527
## lag5 1.000000000 -0.02200231 -0.034860083
## volume -0.022002315 1.00000000 0.014591823
## today -0.034860083 0.01459182 1.000000000
with(smarket, plot(volume))

with(smarket, plot(volume ~ year))

glm.fit <- glm(direction ~ lag1+lag2+lag3+lag4+lag5+volume, data=smarket, family=binomial)
glm.fit
##
## Call: glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 +
## volume, family = binomial, data = smarket)
##
## Coefficients:
## (Intercept) lag1 lag2 lag3 lag4
## -0.126000 -0.073074 -0.042301 0.011085 0.009359
## lag5 volume
## 0.010313 0.135441
##
## Degrees of Freedom: 1249 Total (i.e. Null); 1243 Residual
## Null Deviance: 1731
## Residual Deviance: 1728 AIC: 1742
summary(glm.fit)
##
## Call:
## glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 +
## volume, family = binomial, data = smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.446 -1.203 1.065 1.145 1.326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## lag1 -0.073074 0.050167 -1.457 0.145
## lag2 -0.042301 0.050086 -0.845 0.398
## lag3 0.011085 0.049939 0.222 0.824
## lag4 0.009359 0.049974 0.187 0.851
## lag5 0.010313 0.049511 0.208 0.835
## volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
coef(glm.fit)
## (Intercept) lag1 lag2 lag3 lag4
## -0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938
## lag5 volume
## 0.010313068 0.135440659
summary(glm.fit)$coef[,4] #-p-value column only
## (Intercept) lag1 lag2 lag3 lag4 lag5
## 0.6006983 0.1452272 0.3983491 0.8243333 0.8514445 0.8349974
## volume
## 0.3924004
contrasts(smarket$direction) #dummy variables created by R for "direction"; all coeffcients in model are for baseline variable which is Down:0
## Up
## Down 0
## Up 1
glm.probs = predict(glm.fit, type="response") # predict probabilities of P(Y=1|X). here as per contrasts up=1 so these probs are for direction UP.
glm.probs[1:10] # first 10 just ot check
## 1 2 3 4 5 6 7
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509
## 8 9 10
## 0.5092292 0.5176135 0.4888378
glm.pred = rep("Down",1250) # create a vector of all DOWNs 1250 times.
glm.pred[glm.probs > 0.5] = "Up"
plot(smarket$lag1, glm.fit$fitted.values)

plot(smarket$lag2, glm.fit$fitted.values)

plot(smarket$lag3, glm.fit$fitted.values)

plot(smarket$lag4, glm.fit$fitted.values)

plot(smarket$lag5, glm.fit$fitted.values)

plot(smarket$volume, glm.fit$fitted.values)

table(glm.pred, smarket$direction) #confusion-matrix of model output vs. actual data from training-set
##
## glm.pred Down Up
## Down 145 141
## Up 457 507
mean(glm.pred == smarket$direction) # correct prediction-rate of the model on training-set
## [1] 0.5216
# logit (log-odds) ratios (see if includes 0, means, not significant)
confint(glm.fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.59907825 0.34544102
## lag1 -0.17191967 0.02499864
## lag2 -0.14082062 0.05578504
## lag3 -0.08689851 0.10912653
## lag4 -0.08870645 0.10745613
## lag5 -0.08681511 0.10753778
## volume -0.17448194 0.44696774
# all CIs include 0, meaning, all predcitors are not significant. (we can get the same conclusion by looking at individual p-values from summary(glm.fit) output).
# exponential ratios (see if includes 1, means, not significant)
exp(confint(glm.fit))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.5493177 1.412613
## lag1 0.8420468 1.025314
## lag2 0.8686451 1.057370
## lag3 0.9167701 1.115303
## lag4 0.9151142 1.113442
## lag5 0.9168466 1.113533
## volume 0.8398920 1.563564
# all CIs include "1" so all predictors are not significant.
# log-odds ratio and odds-ratio are not probabilities , they are function of prebabilities.
Instead of using entire set for both fitting the model and then prediction, let’s split the dataset in training and test sets.
test = smarket[smarket$year == 2005,]
train = smarket[smarket$year != 2005,]
glm.fit2 = glm(direction ~ lag1+lag2+lag3+lag4+lag5+volume, data=train, family=binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 +
## volume, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.302 -1.190 1.079 1.160 1.350
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.191213 0.333690 0.573 0.567
## lag1 -0.054178 0.051785 -1.046 0.295
## lag2 -0.045805 0.051797 -0.884 0.377
## lag3 0.007200 0.051644 0.139 0.889
## lag4 0.006441 0.051706 0.125 0.901
## lag5 -0.004223 0.051138 -0.083 0.934
## volume -0.116257 0.239618 -0.485 0.628
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1383.3 on 997 degrees of freedom
## Residual deviance: 1381.1 on 991 degrees of freedom
## AIC: 1395.1
##
## Number of Fisher Scoring iterations: 3
glm.probs2 = predict(glm.fit2, test, type="response")
# converting prediction probabilities into categorical variable of up or down.
glm.pred2 = rep("Down",252)
glm.pred2[glm.probs2 > 0.5] = "Up"
# glm.pred2 = ifelse(glm.probs2 > 0.5, "Up","Down") ....another way of doing it
table(glm.pred2, test$direction)
##
## glm.pred2 Down Up
## Down 77 97
## Up 34 44
mean(glm.pred2 == test$direction) # correct prediction-rate
## [1] 0.4801587
mean(glm.pred2 != test$direction) # error-rate
## [1] 0.5198413
modeling with only 2 most significant predictors.
glm.fit3 = glm(direction ~ lag1+lag2, data=train, family=binomial)
summary(glm.fit3)
##
## Call:
## glm(formula = direction ~ lag1 + lag2, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.345 -1.188 1.074 1.164 1.326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03222 0.06338 0.508 0.611
## lag1 -0.05562 0.05171 -1.076 0.282
## lag2 -0.04449 0.05166 -0.861 0.389
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1383.3 on 997 degrees of freedom
## Residual deviance: 1381.4 on 995 degrees of freedom
## AIC: 1387.4
##
## Number of Fisher Scoring iterations: 3
glm.probs3 = predict(glm.fit3, test, type="response")
glm.pred3 = ifelse(glm.probs3 > 0.5, "Up","Down")
table(glm.pred3, test$direction)
##
## glm.pred3 Down Up
## Down 35 35
## Up 76 106
mean(glm.pred3 == test$direction)
## [1] 0.5595238
# prediction
predict(glm.fit3, newdata = data.frame(lag1 = c(1.2,1.5), lag2 = c(1.1,-0.8)), type="response")
## 1 2
## 0.4791462 0.4960939
Example - Caravan Insurance Data
library(ISLR)
dim(Caravan)
## [1] 5822 86
caravan = Caravan
names(caravan) = tolower(names(caravan))
head(caravan)
## mostype maanthui mgemomv mgemleef moshoofd mgodrk mgodpr mgodov mgodge
## 1 33 1 3 2 8 0 5 1 3
## 2 37 1 2 2 8 1 4 1 4
## 3 37 1 2 2 8 0 4 2 4
## 4 9 1 3 3 3 2 3 2 4
## 5 40 1 4 2 10 1 4 1 4
## 6 23 1 2 1 5 0 5 0 5
## mrelge mrelsa mrelov mfalleen mfgekind mfwekind moplhoog moplmidd
## 1 7 0 2 1 2 6 1 2
## 2 6 2 2 0 4 5 0 5
## 3 3 2 4 4 4 2 0 5
## 4 5 2 2 2 3 4 3 4
## 5 7 1 2 2 4 4 5 4
## 6 0 6 3 3 5 2 0 5
## mopllaag mberhoog mberzelf mberboer mbermidd mberarbg mberarbo mska
## 1 7 1 0 1 2 5 2 1
## 2 4 0 0 0 5 0 4 0
## 3 4 0 0 0 7 0 2 0
## 4 2 4 0 0 3 1 2 3
## 5 0 0 5 4 0 0 0 9
## 6 4 2 0 0 4 2 2 2
## mskb1 mskb2 mskc mskd mhhuur mhkoop maut1 maut2 maut0 mzfonds mzpart
## 1 1 2 6 1 1 8 8 0 1 8 1
## 2 2 3 5 0 2 7 7 1 2 6 3
## 3 5 0 4 0 7 2 7 0 2 9 0
## 4 2 1 4 0 5 4 9 0 0 7 2
## 5 0 0 0 0 4 5 6 2 1 5 4
## 6 2 2 4 2 9 0 5 3 3 9 0
## minkm30 mink3045 mink4575 mink7512 mink123m minkgem mkoopkla pwapart
## 1 0 4 5 0 0 4 3 0
## 2 2 0 5 2 0 5 4 2
## 3 4 5 0 0 0 3 4 2
## 4 1 5 3 0 0 4 4 0
## 5 0 0 9 0 0 6 3 0
## 6 5 2 3 0 0 3 3 0
## pwabedr pwaland ppersaut pbesaut pmotsco pvraaut paanhang ptractor
## 1 0 0 6 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 6 0 0 0 0 0
## 4 0 0 6 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 6 0 0 0 0 0
## pwerkt pbrom pleven ppersong pgezong pwaoreg pbrand pzeilpl pplezier
## 1 0 0 0 0 0 0 5 0 0
## 2 0 0 0 0 0 0 2 0 0
## 3 0 0 0 0 0 0 2 0 0
## 4 0 0 0 0 0 0 2 0 0
## 5 0 0 0 0 0 0 6 0 0
## 6 0 0 0 0 0 0 0 0 0
## pfiets pinboed pbystand awapart awabedr awaland apersaut abesaut amotsco
## 1 0 0 0 0 0 0 1 0 0
## 2 0 0 0 2 0 0 0 0 0
## 3 0 0 0 1 0 0 1 0 0
## 4 0 0 0 0 0 0 1 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0
## avraaut aaanhang atractor awerkt abrom aleven apersong agezong awaoreg
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## abrand azeilpl aplezier afiets ainboed abystand purchase
## 1 1 0 0 0 0 0 No
## 2 1 0 0 0 0 0 No
## 3 1 0 0 0 0 0 No
## 4 1 0 0 0 0 0 No
## 5 1 0 0 0 0 0 No
## 6 0 0 0 0 0 0 No
names(caravan)
## [1] "mostype" "maanthui" "mgemomv" "mgemleef" "moshoofd" "mgodrk"
## [7] "mgodpr" "mgodov" "mgodge" "mrelge" "mrelsa" "mrelov"
## [13] "mfalleen" "mfgekind" "mfwekind" "moplhoog" "moplmidd" "mopllaag"
## [19] "mberhoog" "mberzelf" "mberboer" "mbermidd" "mberarbg" "mberarbo"
## [25] "mska" "mskb1" "mskb2" "mskc" "mskd" "mhhuur"
## [31] "mhkoop" "maut1" "maut2" "maut0" "mzfonds" "mzpart"
## [37] "minkm30" "mink3045" "mink4575" "mink7512" "mink123m" "minkgem"
## [43] "mkoopkla" "pwapart" "pwabedr" "pwaland" "ppersaut" "pbesaut"
## [49] "pmotsco" "pvraaut" "paanhang" "ptractor" "pwerkt" "pbrom"
## [55] "pleven" "ppersong" "pgezong" "pwaoreg" "pbrand" "pzeilpl"
## [61] "pplezier" "pfiets" "pinboed" "pbystand" "awapart" "awabedr"
## [67] "awaland" "apersaut" "abesaut" "amotsco" "avraaut" "aaanhang"
## [73] "atractor" "awerkt" "abrom" "aleven" "apersong" "agezong"
## [79] "awaoreg" "abrand" "azeilpl" "aplezier" "afiets" "ainboed"
## [85] "abystand" "purchase"
attach(caravan)
summary(purchase)
## No Yes
## 5474 348
random_guess_success_rate = 348/(5474+348)
round(random_guess_success_rate*100,2)
## [1] 5.98
so for insurance company a goal is, can we predict who are likely buyers of insurance policy so we can target only those people and increase our sales efficiently. let’s call it a “success rate”.
based on current buyer rate of 5.98%, a random guess of general population will result in success rate of roughly 6%. can we do better by classification of general population into buyer=yes or no if we know predictor variables about them.
index = 1:1000
train.x = caravan[-index,]
test.x = caravan[index,]
dim(caravan)
## [1] 5822 86
dim(train.x)
## [1] 4822 86
dim(test.x)
## [1] 1000 86
train.y = purchase[-index]
test.y = purchase[index]
length(purchase)
## [1] 5822
length(train.y)
## [1] 4822
length(test.y)
## [1] 1000
glm.fit.car = glm(purchase ~ ., data=train.x, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
posterior probs > 0.5 (are considered as potential buyer)
glm.probs.car = predict(glm.fit.car, test.x, type="response")
glm.pred.car = ifelse(glm.probs.car > 0.5,"Yes","No")
df_car = as.data.frame(table(glm.pred.car, test.y))
df_car
## glm.pred.car test.y Freq
## 1 No No 934
## 2 Yes No 7
## 3 No Yes 59
## 4 Yes Yes 0
success_rate = df_car$Freq[4]/(df_car$Freq[4]+df_car$Freq[2])
round(success_rate*100,2)
## [1] 0
posterior probs > 0.25 (are considered as potential buyer)
glm.probs.car = predict(glm.fit.car, test.x, type="response")
glm.pred.car = ifelse(glm.probs.car > 0.25,"Yes","No")
df_car = as.data.frame(table(glm.pred.car, test.y))
df_car
## glm.pred.car test.y Freq
## 1 No No 919
## 2 Yes No 22
## 3 No Yes 48
## 4 Yes Yes 11
success_rate = df_car$Freq[4]/(df_car$Freq[4]+df_car$Freq[2])
round(success_rate*100,2)
## [1] 33.33