suppressMessages(library(ISLR))
## Warning: package 'ISLR' was built under R version 3.3.1
head(Smarket)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up
smarket = Smarket
names(smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
names(smarket) = tolower(names(smarket))

summary(smarket)
##       year           lag1                lag2          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500  
##  Median :2003   Median : 0.039000   Median : 0.039000  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000  
##       lag3                lag4                lag5         
##  Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200  
##  1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000  
##  Median : 0.038500   Median : 0.038500   Median : 0.03850  
##  Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561  
##  3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700  
##  Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300  
##      volume           today           direction 
##  Min.   :0.3561   Min.   :-4.922000   Down:602  
##  1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648  
##  Median :1.4229   Median : 0.038500             
##  Mean   :1.4783   Mean   : 0.003138             
##  3rd Qu.:1.6417   3rd Qu.: 0.596750             
##  Max.   :3.1525   Max.   : 5.733000
cor(smarket[,-9])
##              year         lag1         lag2         lag3         lag4
## year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                lag5      volume        today
## year    0.029787995  0.53900647  0.030095229
## lag1   -0.005674606  0.04090991 -0.026155045
## lag2   -0.003557949 -0.04338321 -0.010250033
## lag3   -0.018808338 -0.04182369 -0.002447647
## lag4   -0.027083641 -0.04841425 -0.006899527
## lag5    1.000000000 -0.02200231 -0.034860083
## volume -0.022002315  1.00000000  0.014591823
## today  -0.034860083  0.01459182  1.000000000
with(smarket, plot(volume))

with(smarket, plot(volume ~ year))

glm.fit <- glm(direction ~ lag1+lag2+lag3+lag4+lag5+volume, data=smarket, family=binomial)

glm.fit
## 
## Call:  glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 + 
##     volume, family = binomial, data = smarket)
## 
## Coefficients:
## (Intercept)         lag1         lag2         lag3         lag4  
##   -0.126000    -0.073074    -0.042301     0.011085     0.009359  
##        lag5       volume  
##    0.010313     0.135441  
## 
## Degrees of Freedom: 1249 Total (i.e. Null);  1243 Residual
## Null Deviance:       1731 
## Residual Deviance: 1728  AIC: 1742
summary(glm.fit)
## 
## Call:
## glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 + 
##     volume, family = binomial, data = smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## lag1        -0.073074   0.050167  -1.457    0.145
## lag2        -0.042301   0.050086  -0.845    0.398
## lag3         0.011085   0.049939   0.222    0.824
## lag4         0.009359   0.049974   0.187    0.851
## lag5         0.010313   0.049511   0.208    0.835
## volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
coef(glm.fit)
##  (Intercept)         lag1         lag2         lag3         lag4 
## -0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938 
##         lag5       volume 
##  0.010313068  0.135440659
summary(glm.fit)$coef[,4] #-p-value column only
## (Intercept)        lag1        lag2        lag3        lag4        lag5 
##   0.6006983   0.1452272   0.3983491   0.8243333   0.8514445   0.8349974 
##      volume 
##   0.3924004
contrasts(smarket$direction) #dummy variables created by R for "direction"; all coeffcients in model are for baseline variable which is Down:0
##      Up
## Down  0
## Up    1
glm.probs = predict(glm.fit, type="response") # predict probabilities of P(Y=1|X). here as per contrasts up=1 so these probs are for direction UP.
glm.probs[1:10] # first 10 just ot check
##         1         2         3         4         5         6         7 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 
##         8         9        10 
## 0.5092292 0.5176135 0.4888378
glm.pred = rep("Down",1250) # create a vector of all DOWNs 1250 times.
glm.pred[glm.probs > 0.5] = "Up"
plot(smarket$lag1, glm.fit$fitted.values)

plot(smarket$lag2, glm.fit$fitted.values)

plot(smarket$lag3, glm.fit$fitted.values)

plot(smarket$lag4, glm.fit$fitted.values)

plot(smarket$lag5, glm.fit$fitted.values)

plot(smarket$volume, glm.fit$fitted.values)

table(glm.pred, smarket$direction) #confusion-matrix of model output vs. actual data from training-set
##         
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
mean(glm.pred == smarket$direction) # correct prediction-rate of the model on training-set
## [1] 0.5216
# logit (log-odds) ratios (see if includes 0, means, not significant)
confint(glm.fit)
## Waiting for profiling to be done...
##                   2.5 %     97.5 %
## (Intercept) -0.59907825 0.34544102
## lag1        -0.17191967 0.02499864
## lag2        -0.14082062 0.05578504
## lag3        -0.08689851 0.10912653
## lag4        -0.08870645 0.10745613
## lag5        -0.08681511 0.10753778
## volume      -0.17448194 0.44696774
# all CIs include 0, meaning, all predcitors are not significant. (we can get the same conclusion by looking at individual p-values from summary(glm.fit) output).

# exponential ratios (see if includes 1, means, not significant)
exp(confint(glm.fit))
## Waiting for profiling to be done...
##                 2.5 %   97.5 %
## (Intercept) 0.5493177 1.412613
## lag1        0.8420468 1.025314
## lag2        0.8686451 1.057370
## lag3        0.9167701 1.115303
## lag4        0.9151142 1.113442
## lag5        0.9168466 1.113533
## volume      0.8398920 1.563564
# all CIs include "1" so all predictors are not significant.

# log-odds ratio and odds-ratio are not probabilities , they are function of prebabilities.

Instead of using entire set for both fitting the model and then prediction, let’s split the dataset in training and test sets.

test = smarket[smarket$year == 2005,]
train = smarket[smarket$year != 2005,]
glm.fit2 = glm(direction ~ lag1+lag2+lag3+lag4+lag5+volume, data=train, family=binomial)

summary(glm.fit2)
## 
## Call:
## glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 + 
##     volume, family = binomial, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.302  -1.190   1.079   1.160   1.350  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.191213   0.333690   0.573    0.567
## lag1        -0.054178   0.051785  -1.046    0.295
## lag2        -0.045805   0.051797  -0.884    0.377
## lag3         0.007200   0.051644   0.139    0.889
## lag4         0.006441   0.051706   0.125    0.901
## lag5        -0.004223   0.051138  -0.083    0.934
## volume      -0.116257   0.239618  -0.485    0.628
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.1  on 991  degrees of freedom
## AIC: 1395.1
## 
## Number of Fisher Scoring iterations: 3
glm.probs2 = predict(glm.fit2, test, type="response")

# converting prediction probabilities into categorical variable of up or down.
glm.pred2 = rep("Down",252)
glm.pred2[glm.probs2 > 0.5] = "Up"

# glm.pred2 = ifelse(glm.probs2 > 0.5, "Up","Down") ....another way of doing it
table(glm.pred2, test$direction)
##          
## glm.pred2 Down Up
##      Down   77 97
##      Up     34 44
mean(glm.pred2 == test$direction) # correct prediction-rate
## [1] 0.4801587
mean(glm.pred2 != test$direction) # error-rate
## [1] 0.5198413

modeling with only 2 most significant predictors.

glm.fit3 = glm(direction ~ lag1+lag2, data=train, family=binomial)
summary(glm.fit3)
## 
## Call:
## glm(formula = direction ~ lag1 + lag2, family = binomial, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.345  -1.188   1.074   1.164   1.326  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.03222    0.06338   0.508    0.611
## lag1        -0.05562    0.05171  -1.076    0.282
## lag2        -0.04449    0.05166  -0.861    0.389
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 995  degrees of freedom
## AIC: 1387.4
## 
## Number of Fisher Scoring iterations: 3
glm.probs3 = predict(glm.fit3, test, type="response")
glm.pred3 = ifelse(glm.probs3 > 0.5, "Up","Down")

table(glm.pred3, test$direction)
##          
## glm.pred3 Down  Up
##      Down   35  35
##      Up     76 106
mean(glm.pred3 == test$direction)
## [1] 0.5595238
# prediction
predict(glm.fit3, newdata = data.frame(lag1 = c(1.2,1.5), lag2 = c(1.1,-0.8)), type="response")
##         1         2 
## 0.4791462 0.4960939

Example - Caravan Insurance Data

library(ISLR)
dim(Caravan)
## [1] 5822   86
caravan = Caravan
names(caravan) = tolower(names(caravan))

head(caravan)
##   mostype maanthui mgemomv mgemleef moshoofd mgodrk mgodpr mgodov mgodge
## 1      33        1       3        2        8      0      5      1      3
## 2      37        1       2        2        8      1      4      1      4
## 3      37        1       2        2        8      0      4      2      4
## 4       9        1       3        3        3      2      3      2      4
## 5      40        1       4        2       10      1      4      1      4
## 6      23        1       2        1        5      0      5      0      5
##   mrelge mrelsa mrelov mfalleen mfgekind mfwekind moplhoog moplmidd
## 1      7      0      2        1        2        6        1        2
## 2      6      2      2        0        4        5        0        5
## 3      3      2      4        4        4        2        0        5
## 4      5      2      2        2        3        4        3        4
## 5      7      1      2        2        4        4        5        4
## 6      0      6      3        3        5        2        0        5
##   mopllaag mberhoog mberzelf mberboer mbermidd mberarbg mberarbo mska
## 1        7        1        0        1        2        5        2    1
## 2        4        0        0        0        5        0        4    0
## 3        4        0        0        0        7        0        2    0
## 4        2        4        0        0        3        1        2    3
## 5        0        0        5        4        0        0        0    9
## 6        4        2        0        0        4        2        2    2
##   mskb1 mskb2 mskc mskd mhhuur mhkoop maut1 maut2 maut0 mzfonds mzpart
## 1     1     2    6    1      1      8     8     0     1       8      1
## 2     2     3    5    0      2      7     7     1     2       6      3
## 3     5     0    4    0      7      2     7     0     2       9      0
## 4     2     1    4    0      5      4     9     0     0       7      2
## 5     0     0    0    0      4      5     6     2     1       5      4
## 6     2     2    4    2      9      0     5     3     3       9      0
##   minkm30 mink3045 mink4575 mink7512 mink123m minkgem mkoopkla pwapart
## 1       0        4        5        0        0       4        3       0
## 2       2        0        5        2        0       5        4       2
## 3       4        5        0        0        0       3        4       2
## 4       1        5        3        0        0       4        4       0
## 5       0        0        9        0        0       6        3       0
## 6       5        2        3        0        0       3        3       0
##   pwabedr pwaland ppersaut pbesaut pmotsco pvraaut paanhang ptractor
## 1       0       0        6       0       0       0        0        0
## 2       0       0        0       0       0       0        0        0
## 3       0       0        6       0       0       0        0        0
## 4       0       0        6       0       0       0        0        0
## 5       0       0        0       0       0       0        0        0
## 6       0       0        6       0       0       0        0        0
##   pwerkt pbrom pleven ppersong pgezong pwaoreg pbrand pzeilpl pplezier
## 1      0     0      0        0       0       0      5       0        0
## 2      0     0      0        0       0       0      2       0        0
## 3      0     0      0        0       0       0      2       0        0
## 4      0     0      0        0       0       0      2       0        0
## 5      0     0      0        0       0       0      6       0        0
## 6      0     0      0        0       0       0      0       0        0
##   pfiets pinboed pbystand awapart awabedr awaland apersaut abesaut amotsco
## 1      0       0        0       0       0       0        1       0       0
## 2      0       0        0       2       0       0        0       0       0
## 3      0       0        0       1       0       0        1       0       0
## 4      0       0        0       0       0       0        1       0       0
## 5      0       0        0       0       0       0        0       0       0
## 6      0       0        0       0       0       0        1       0       0
##   avraaut aaanhang atractor awerkt abrom aleven apersong agezong awaoreg
## 1       0        0        0      0     0      0        0       0       0
## 2       0        0        0      0     0      0        0       0       0
## 3       0        0        0      0     0      0        0       0       0
## 4       0        0        0      0     0      0        0       0       0
## 5       0        0        0      0     0      0        0       0       0
## 6       0        0        0      0     0      0        0       0       0
##   abrand azeilpl aplezier afiets ainboed abystand purchase
## 1      1       0        0      0       0        0       No
## 2      1       0        0      0       0        0       No
## 3      1       0        0      0       0        0       No
## 4      1       0        0      0       0        0       No
## 5      1       0        0      0       0        0       No
## 6      0       0        0      0       0        0       No
names(caravan)
##  [1] "mostype"  "maanthui" "mgemomv"  "mgemleef" "moshoofd" "mgodrk"  
##  [7] "mgodpr"   "mgodov"   "mgodge"   "mrelge"   "mrelsa"   "mrelov"  
## [13] "mfalleen" "mfgekind" "mfwekind" "moplhoog" "moplmidd" "mopllaag"
## [19] "mberhoog" "mberzelf" "mberboer" "mbermidd" "mberarbg" "mberarbo"
## [25] "mska"     "mskb1"    "mskb2"    "mskc"     "mskd"     "mhhuur"  
## [31] "mhkoop"   "maut1"    "maut2"    "maut0"    "mzfonds"  "mzpart"  
## [37] "minkm30"  "mink3045" "mink4575" "mink7512" "mink123m" "minkgem" 
## [43] "mkoopkla" "pwapart"  "pwabedr"  "pwaland"  "ppersaut" "pbesaut" 
## [49] "pmotsco"  "pvraaut"  "paanhang" "ptractor" "pwerkt"   "pbrom"   
## [55] "pleven"   "ppersong" "pgezong"  "pwaoreg"  "pbrand"   "pzeilpl" 
## [61] "pplezier" "pfiets"   "pinboed"  "pbystand" "awapart"  "awabedr" 
## [67] "awaland"  "apersaut" "abesaut"  "amotsco"  "avraaut"  "aaanhang"
## [73] "atractor" "awerkt"   "abrom"    "aleven"   "apersong" "agezong" 
## [79] "awaoreg"  "abrand"   "azeilpl"  "aplezier" "afiets"   "ainboed" 
## [85] "abystand" "purchase"
attach(caravan)
summary(purchase)
##   No  Yes 
## 5474  348
random_guess_success_rate = 348/(5474+348)
round(random_guess_success_rate*100,2)
## [1] 5.98

so for insurance company a goal is, can we predict who are likely buyers of insurance policy so we can target only those people and increase our sales efficiently. let’s call it a “success rate”.

based on current buyer rate of 5.98%, a random guess of general population will result in success rate of roughly 6%. can we do better by classification of general population into buyer=yes or no if we know predictor variables about them.

index = 1:1000

train.x = caravan[-index,]
test.x = caravan[index,]
dim(caravan)
## [1] 5822   86
dim(train.x)
## [1] 4822   86
dim(test.x)
## [1] 1000   86
train.y = purchase[-index]
test.y = purchase[index]
length(purchase)
## [1] 5822
length(train.y)
## [1] 4822
length(test.y)
## [1] 1000
glm.fit.car = glm(purchase ~ ., data=train.x, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

posterior probs > 0.5 (are considered as potential buyer)

glm.probs.car = predict(glm.fit.car, test.x, type="response")
glm.pred.car = ifelse(glm.probs.car > 0.5,"Yes","No")
df_car = as.data.frame(table(glm.pred.car, test.y))
df_car
##   glm.pred.car test.y Freq
## 1           No     No  934
## 2          Yes     No    7
## 3           No    Yes   59
## 4          Yes    Yes    0
success_rate = df_car$Freq[4]/(df_car$Freq[4]+df_car$Freq[2])
round(success_rate*100,2)
## [1] 0

posterior probs > 0.25 (are considered as potential buyer)

glm.probs.car = predict(glm.fit.car, test.x, type="response")
glm.pred.car = ifelse(glm.probs.car > 0.25,"Yes","No")
df_car = as.data.frame(table(glm.pred.car, test.y))
df_car
##   glm.pred.car test.y Freq
## 1           No     No  919
## 2          Yes     No   22
## 3           No    Yes   48
## 4          Yes    Yes   11
success_rate = df_car$Freq[4]/(df_car$Freq[4]+df_car$Freq[2])
round(success_rate*100,2)
## [1] 33.33