Logistic + Hold-out Method

getwd()

## [1] "/Users/liuxukun/Google Driver/2018Spring/CIS3920/Assignments/HW4"

setwd("/Users/liuxukun/Google Driver/2018Spring/CIS3920/Assignments/HW4")
getwd()

## [1] "/Users/liuxukun/Google Driver/2018Spring/CIS3920/Assignments/HW4"

data<-read.csv("telemarketing.csv",header=T)
head(data)

##   age balance housing  contact duration campaign  y
## 1  30    1787      no cellular       79        1 no
## 2  33    4789     yes cellular      220        1 no
## 3  35    1350     yes cellular      185        1 no
## 4  30    1476     yes  unknown      199        4 no
## 5  59       0     yes  unknown      226        1 no
## 6  35     747      no cellular      141        2 no

dim(data)

## [1] 4521    7

str(data)

## 'data.frame':    4521 obs. of  7 variables:
##  $ age     : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ balance : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ duration: int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign: int  1 1 1 4 1 2 1 2 2 1 ...
##  $ y       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

set.seed(1)
train.index <- sample(1:nrow(data),size = nrow(data)*0.8, replace = F, prob = NULL)
train <- data[train.index,]
test <- data[-train.index,]

data.frame(Full.size=nrow(data),Train.size=nrow(train),Test.size=nrow(test))

##   Full.size Train.size Test.size
## 1      4521       3616       905

model.logistic.full = glm(y~.,data = train, family = binomial)
summary(model.logistic.full)

## 
## Call:
## glm(formula = y ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2663  -0.4558  -0.3288  -0.1924   3.0488  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -2.876e+00  2.645e-01 -10.875  < 2e-16 ***
## age               8.812e-03  5.374e-03   1.640 0.101054    
## balance           1.215e-05  1.698e-05   0.715 0.474370    
## housingyes       -6.913e-01  1.255e-01  -5.509 3.61e-08 ***
## contacttelephone -1.362e-01  2.271e-01  -0.600 0.548545    
## contactunknown   -1.341e+00  1.816e-01  -7.382 1.56e-13 ***
## duration          3.863e-03  2.075e-04  18.617  < 2e-16 ***
## campaign         -1.139e-01  3.002e-02  -3.794 0.000148 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2556.8  on 3615  degrees of freedom
## Residual deviance: 2001.8  on 3608  degrees of freedom
## AIC: 2017.8
## 
## Number of Fisher Scoring iterations: 6

exp(coef(model.logistic.full))

##      (Intercept)              age          balance       housingyes 
##       0.05633538       1.00885059       1.00001215       0.50094780 
## contacttelephone   contactunknown         duration         campaign 
##       0.87265335       0.26164929       1.00387022       0.89236934

model.logistic.reduced = glm(y~duration+campaign+contact,data = train, family = binomial)
summary(model.logistic.reduced)

## 
## Call:
## glm(formula = y ~ duration + campaign + contact, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0897  -0.4503  -0.3571  -0.1976   2.9243  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -2.7704161  0.1171233 -23.654  < 2e-16 ***
## duration          0.0037553  0.0002045  18.367  < 2e-16 ***
## campaign         -0.1133183  0.0298458  -3.797 0.000147 ***
## contacttelephone  0.0436066  0.2161069   0.202 0.840087    
## contactunknown   -1.4907070  0.1787036  -8.342  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2556.8  on 3615  degrees of freedom
## Residual deviance: 2042.3  on 3611  degrees of freedom
## AIC: 2052.3
## 
## Number of Fisher Scoring iterations: 6

exp(coef(model.logistic.reduced))

##      (Intercept)         duration         campaign contacttelephone 
##       0.06263594       1.00376233       0.89286638       1.04457137 
##   contactunknown 
##       0.22521338

data.frame(model.logistic.full=AIC(model.logistic.full),model.logistic.reduced=AIC(model.logistic.reduced),row.names = "AIC")

##     model.logistic.full model.logistic.reduced
## AIC            2017.771               2052.291

pred.prob <- predict(model.logistic.full,test,type = "response")
head(pred.prob)

##          9         13         17         18         28         32 
## 0.01043554 0.18145637 0.12135236 0.05279831 0.11224389 0.06723638

pred.class <- pred.prob
pred.class[pred.prob>0.5]="Yes"
pred.class[!pred.prob>0.5]="No"

head(pred.class)

##    9   13   17   18   28   32 
## "No" "No" "No" "No" "No" "No"

c.matrix = table(actual=test$y,pred.class)
c.matrix

##       pred.class
## actual  No Yes
##    no  780  14
##    yes  87  24

acc = (c.matrix[1,1]+c.matrix[2,2])/(c.matrix[1]+c.matrix[2]+c.matrix[3]+c.matrix[4])
acc

## [1] 0.8883978

sens.yes = c.matrix[4]/(c.matrix[2]+c.matrix[4])
prec.yes = c.matrix[4]/(c.matrix[3]+c.matrix[4])

data.frame(acc,sens.yes,prec.yes)

##         acc  sens.yes  prec.yes
## 1 0.8883978 0.2162162 0.6315789

pred.class.80 <- pred.prob
pred.class.80[pred.prob>0.8]="Yes"
pred.class.80[!pred.prob>0.8]="No"

head(pred.class.80)

##    9   13   17   18   28   32 
## "No" "No" "No" "No" "No" "No"

c.matrix.80 = table(actual=test$y,pred.class.80)
c.matrix.80

##       pred.class.80
## actual  No Yes
##    no  789   5
##    yes 103   8

acc.80 = (c.matrix.80[1,1]+c.matrix.80[2,2])/(c.matrix.80[1]+c.matrix.80[2]+c.matrix.80[3]+c.matrix.80[4])
acc.80

## [1] 0.880663

sens.80.yes = c.matrix.80[4]/(c.matrix.80[2]+c.matrix.80[4])
prec.80.yes = c.matrix.80[4]/(c.matrix.80[3]+c.matrix.80[4])

data.frame(acc.80,sens.80.yes,prec.80.yes)

##     acc.80 sens.80.yes prec.80.yes
## 1 0.880663  0.07207207   0.6153846

KNN + K-fold

library(class)
mydata <- data
str(mydata)

## 'data.frame':    4521 obs. of  7 variables:
##  $ age     : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ balance : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ duration: int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign: int  1 1 1 4 1 2 1 2 2 1 ...
##  $ y       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

class(mydata$contact)

## [1] "factor"

levels(mydata$contact)

## [1] "cellular"  "telephone" "unknown"

normalize = function(x)
{
  return((x-min(x))/(max(x)-min(x)))
}

norm.age = normalize(mydata$age)
norm.balance = normalize(mydata$balance)
norm.duration = normalize(mydata$duration)
norm.campaign = normalize(mydata$campaign)


norm.housing = as.character(mydata$housing)
norm.housing[norm.housing == "yes"] = 1
norm.housing[norm.housing == "no"] = 0
norm.housing = as.integer(norm.housing)

norm.cellular = as.character(mydata$contact)
norm.cellular[norm.cellular == "cellular"] = 1
norm.cellular[norm.cellular == "unknown"]=0
norm.cellular[norm.cellular == "telephone"]=0
norm.cellular <- as.integer(norm.cellular)

norm.telephone = as.character(mydata$contact)
norm.telephone[norm.telephone == "cellular"] = 0
norm.telephone[norm.telephone == "unknown"] = 0
norm.telephone[norm.telephone == "telephone"] = 1
norm.telephone <- as.integer(norm.telephone)


norm.mydata = cbind(mydata,
                    norm.age,norm.balance,norm.campaign,
                    norm.cellular,norm.telephone,norm.duration,norm.housing)

str(norm.mydata)

## 'data.frame':    4521 obs. of  14 variables:
##  $ age           : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ balance       : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ contact       : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ duration      : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign      : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ norm.age      : num  0.162 0.206 0.235 0.162 0.588 ...
##  $ norm.balance  : num  0.0685 0.1088 0.0626 0.0643 0.0445 ...
##  $ norm.campaign : num  0 0 0 0.0612 0 ...
##  $ norm.cellular : int  1 1 1 0 0 1 1 1 0 1 ...
##  $ norm.telephone: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ norm.duration : num  0.0248 0.0715 0.0599 0.0645 0.0735 ...
##  $ norm.housing  : int  0 1 1 1 1 0 1 1 1 1 ...

summary(norm.mydata)

##       age           balance      housing         contact    
##  Min.   :19.00   Min.   :-3313   no :1962   cellular :2896  
##  1st Qu.:33.00   1st Qu.:   69   yes:2559   telephone: 301  
##  Median :39.00   Median :  444              unknown  :1324  
##  Mean   :41.17   Mean   : 1423                              
##  3rd Qu.:49.00   3rd Qu.: 1480                              
##  Max.   :87.00   Max.   :71188                              
##     duration       campaign        y           norm.age     
##  Min.   :   4   Min.   : 1.000   no :4000   Min.   :0.0000  
##  1st Qu.: 104   1st Qu.: 1.000   yes: 521   1st Qu.:0.2059  
##  Median : 185   Median : 2.000              Median :0.2941  
##  Mean   : 264   Mean   : 2.794              Mean   :0.3260  
##  3rd Qu.: 329   3rd Qu.: 3.000              3rd Qu.:0.4412  
##  Max.   :3025   Max.   :50.000              Max.   :1.0000  
##   norm.balance     norm.campaign     norm.cellular    norm.telephone   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04540   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.05043   Median :0.02041   Median :1.0000   Median :0.00000  
##  Mean   :0.06356   Mean   :0.03660   Mean   :0.6406   Mean   :0.06658  
##  3rd Qu.:0.06433   3rd Qu.:0.04082   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##  norm.duration      norm.housing  
##  Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.03310   1st Qu.:0.000  
##  Median :0.05991   Median :1.000  
##  Mean   :0.08605   Mean   :0.566  
##  3rd Qu.:0.10758   3rd Qu.:1.000  
##  Max.   :1.00000   Max.   :1.000

set.seed(1)

kfold=5
fold = sample(1:kfold,nrow(norm.mydata),replace=TRUE)
fold[1:10]

##  [1] 2 2 3 5 2 5 5 4 4 1

kfold.acc.result = 1:kfold
kfold.sen.result = 1:kfold
kfold.prec.result = 1:kfold
K=5
for (i in 1:kfold)
{
  test = norm.mydata[fold==i,] 
  train = norm.mydata[fold!=i,]
  test.x = test[,8:14]
  train.x = train[,8:14]
  train.cl = train[,7]
  KNN.test.pred.class = knn(train = train.x, test = test.x, cl = train.cl, k = K)
  KNN.c.matrix = table(test[,7],KNN.test.pred.class)
  
  #print(KNN.c.matrix)
  
  KNN.acc = mean(KNN.test.pred.class==test[,7])
  KNN.sens.yes = KNN.c.matrix[4]/(KNN.c.matrix[2]+KNN.c.matrix[4])
  KNN.prec.yes = KNN.c.matrix[4]/(KNN.c.matrix[3]+KNN.c.matrix[4])
  
  kfold.acc.result[i] = KNN.acc
  kfold.sen.result[i] = KNN.sens.yes
  kfold.prec.result[i] = KNN.prec.yes
}

mean(kfold.acc.result)

## [1] 0.8776359

mean(kfold.sen.result)

## [1] 0.2061236

mean(kfold.prec.result)

## [1] 0.4425273

KNN5=c(mean(kfold.acc.result),mean(kfold.sen.result),mean(kfold.prec.result))
KNN5

## [1] 0.8776359 0.2061236 0.4425273

kfold.acc.result = 1:kfold
kfold.sen.result = 1:kfold
kfold.prec.result = 1:kfold
K=3
for (i in 1:kfold)
{
  test = norm.mydata[fold==i,] 
  train = norm.mydata[fold!=i,]
  test.x = test[,8:14]
  train.x = train[,8:14]
  train.cl = train[,7]
  KNN.test.pred.class = knn(train = train.x, test = test.x, cl = train.cl, k = K)
  KNN.c.matrix = table(test[,7],KNN.test.pred.class)
  
  #print(KNN.c.matrix)
  
  KNN.acc = mean(KNN.test.pred.class==test[,7])
  KNN.sens.yes = KNN.c.matrix[4]/(KNN.c.matrix[2]+KNN.c.matrix[4])
  KNN.prec.yes = KNN.c.matrix[4]/(KNN.c.matrix[3]+KNN.c.matrix[4])
  
  kfold.acc.result[i] = KNN.acc
  kfold.sen.result[i] = KNN.sens.yes
  kfold.prec.result[i] = KNN.prec.yes
}

mean(kfold.acc.result)

## [1] 0.8714268

mean(kfold.sen.result)

## [1] 0.2373385

mean(kfold.prec.result)

## [1] 0.4047073

KNN3=c(mean(kfold.acc.result),mean(kfold.sen.result),mean(kfold.prec.result))
KNN3

## [1] 0.8714268 0.2373385 0.4047073

title = c("acc","sen","prec")
KNN.compare = data.frame(title,KNN3,KNN5)
KNN.compare

##   title      KNN3      KNN5
## 1   acc 0.8714268 0.8776359
## 2   sen 0.2373385 0.2061236
## 3  prec 0.4047073 0.4425273

Data Mining for Business Analytics - Notes + HW4 Solution

Xukun LIU

4/22/2018

Logistic + Hold-out Method

KNN + K-fold