Logistic + Hold-out Method
getwd()
## [1] "/Users/liuxukun/Google Driver/2018Spring/CIS3920/Assignments/HW4"
setwd("/Users/liuxukun/Google Driver/2018Spring/CIS3920/Assignments/HW4")
getwd()
## [1] "/Users/liuxukun/Google Driver/2018Spring/CIS3920/Assignments/HW4"
data<-read.csv("telemarketing.csv",header=T)
head(data)
## age balance housing contact duration campaign y
## 1 30 1787 no cellular 79 1 no
## 2 33 4789 yes cellular 220 1 no
## 3 35 1350 yes cellular 185 1 no
## 4 30 1476 yes unknown 199 4 no
## 5 59 0 yes unknown 226 1 no
## 6 35 747 no cellular 141 2 no
dim(data)
## [1] 4521 7
str(data)
## 'data.frame': 4521 obs. of 7 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
## $ duration: int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign: int 1 1 1 4 1 2 1 2 2 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
set.seed(1)
train.index <- sample(1:nrow(data),size = nrow(data)*0.8, replace = F, prob = NULL)
train <- data[train.index,]
test <- data[-train.index,]
data.frame(Full.size=nrow(data),Train.size=nrow(train),Test.size=nrow(test))
## Full.size Train.size Test.size
## 1 4521 3616 905
model.logistic.full = glm(y~.,data = train, family = binomial)
summary(model.logistic.full)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2663 -0.4558 -0.3288 -0.1924 3.0488
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.876e+00 2.645e-01 -10.875 < 2e-16 ***
## age 8.812e-03 5.374e-03 1.640 0.101054
## balance 1.215e-05 1.698e-05 0.715 0.474370
## housingyes -6.913e-01 1.255e-01 -5.509 3.61e-08 ***
## contacttelephone -1.362e-01 2.271e-01 -0.600 0.548545
## contactunknown -1.341e+00 1.816e-01 -7.382 1.56e-13 ***
## duration 3.863e-03 2.075e-04 18.617 < 2e-16 ***
## campaign -1.139e-01 3.002e-02 -3.794 0.000148 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2556.8 on 3615 degrees of freedom
## Residual deviance: 2001.8 on 3608 degrees of freedom
## AIC: 2017.8
##
## Number of Fisher Scoring iterations: 6
exp(coef(model.logistic.full))
## (Intercept) age balance housingyes
## 0.05633538 1.00885059 1.00001215 0.50094780
## contacttelephone contactunknown duration campaign
## 0.87265335 0.26164929 1.00387022 0.89236934
model.logistic.reduced = glm(y~duration+campaign+contact,data = train, family = binomial)
summary(model.logistic.reduced)
##
## Call:
## glm(formula = y ~ duration + campaign + contact, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0897 -0.4503 -0.3571 -0.1976 2.9243
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7704161 0.1171233 -23.654 < 2e-16 ***
## duration 0.0037553 0.0002045 18.367 < 2e-16 ***
## campaign -0.1133183 0.0298458 -3.797 0.000147 ***
## contacttelephone 0.0436066 0.2161069 0.202 0.840087
## contactunknown -1.4907070 0.1787036 -8.342 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2556.8 on 3615 degrees of freedom
## Residual deviance: 2042.3 on 3611 degrees of freedom
## AIC: 2052.3
##
## Number of Fisher Scoring iterations: 6
exp(coef(model.logistic.reduced))
## (Intercept) duration campaign contacttelephone
## 0.06263594 1.00376233 0.89286638 1.04457137
## contactunknown
## 0.22521338
data.frame(model.logistic.full=AIC(model.logistic.full),model.logistic.reduced=AIC(model.logistic.reduced),row.names = "AIC")
## model.logistic.full model.logistic.reduced
## AIC 2017.771 2052.291
pred.prob <- predict(model.logistic.full,test,type = "response")
head(pred.prob)
## 9 13 17 18 28 32
## 0.01043554 0.18145637 0.12135236 0.05279831 0.11224389 0.06723638
pred.class <- pred.prob
pred.class[pred.prob>0.5]="Yes"
pred.class[!pred.prob>0.5]="No"
head(pred.class)
## 9 13 17 18 28 32
## "No" "No" "No" "No" "No" "No"
c.matrix = table(actual=test$y,pred.class)
c.matrix
## pred.class
## actual No Yes
## no 780 14
## yes 87 24
acc = (c.matrix[1,1]+c.matrix[2,2])/(c.matrix[1]+c.matrix[2]+c.matrix[3]+c.matrix[4])
acc
## [1] 0.8883978
sens.yes = c.matrix[4]/(c.matrix[2]+c.matrix[4])
prec.yes = c.matrix[4]/(c.matrix[3]+c.matrix[4])
data.frame(acc,sens.yes,prec.yes)
## acc sens.yes prec.yes
## 1 0.8883978 0.2162162 0.6315789
pred.class.80 <- pred.prob
pred.class.80[pred.prob>0.8]="Yes"
pred.class.80[!pred.prob>0.8]="No"
head(pred.class.80)
## 9 13 17 18 28 32
## "No" "No" "No" "No" "No" "No"
c.matrix.80 = table(actual=test$y,pred.class.80)
c.matrix.80
## pred.class.80
## actual No Yes
## no 789 5
## yes 103 8
acc.80 = (c.matrix.80[1,1]+c.matrix.80[2,2])/(c.matrix.80[1]+c.matrix.80[2]+c.matrix.80[3]+c.matrix.80[4])
acc.80
## [1] 0.880663
sens.80.yes = c.matrix.80[4]/(c.matrix.80[2]+c.matrix.80[4])
prec.80.yes = c.matrix.80[4]/(c.matrix.80[3]+c.matrix.80[4])
data.frame(acc.80,sens.80.yes,prec.80.yes)
## acc.80 sens.80.yes prec.80.yes
## 1 0.880663 0.07207207 0.6153846
KNN + K-fold
library(class)
mydata <- data
str(mydata)
## 'data.frame': 4521 obs. of 7 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
## $ duration: int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign: int 1 1 1 4 1 2 1 2 2 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
class(mydata$contact)
## [1] "factor"
levels(mydata$contact)
## [1] "cellular" "telephone" "unknown"
normalize = function(x)
{
return((x-min(x))/(max(x)-min(x)))
}
norm.age = normalize(mydata$age)
norm.balance = normalize(mydata$balance)
norm.duration = normalize(mydata$duration)
norm.campaign = normalize(mydata$campaign)
norm.housing = as.character(mydata$housing)
norm.housing[norm.housing == "yes"] = 1
norm.housing[norm.housing == "no"] = 0
norm.housing = as.integer(norm.housing)
norm.cellular = as.character(mydata$contact)
norm.cellular[norm.cellular == "cellular"] = 1
norm.cellular[norm.cellular == "unknown"]=0
norm.cellular[norm.cellular == "telephone"]=0
norm.cellular <- as.integer(norm.cellular)
norm.telephone = as.character(mydata$contact)
norm.telephone[norm.telephone == "cellular"] = 0
norm.telephone[norm.telephone == "unknown"] = 0
norm.telephone[norm.telephone == "telephone"] = 1
norm.telephone <- as.integer(norm.telephone)
norm.mydata = cbind(mydata,
norm.age,norm.balance,norm.campaign,
norm.cellular,norm.telephone,norm.duration,norm.housing)
str(norm.mydata)
## 'data.frame': 4521 obs. of 14 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ norm.age : num 0.162 0.206 0.235 0.162 0.588 ...
## $ norm.balance : num 0.0685 0.1088 0.0626 0.0643 0.0445 ...
## $ norm.campaign : num 0 0 0 0.0612 0 ...
## $ norm.cellular : int 1 1 1 0 0 1 1 1 0 1 ...
## $ norm.telephone: int 0 0 0 0 0 0 0 0 0 0 ...
## $ norm.duration : num 0.0248 0.0715 0.0599 0.0645 0.0735 ...
## $ norm.housing : int 0 1 1 1 1 0 1 1 1 1 ...
summary(norm.mydata)
## age balance housing contact
## Min. :19.00 Min. :-3313 no :1962 cellular :2896
## 1st Qu.:33.00 1st Qu.: 69 yes:2559 telephone: 301
## Median :39.00 Median : 444 unknown :1324
## Mean :41.17 Mean : 1423
## 3rd Qu.:49.00 3rd Qu.: 1480
## Max. :87.00 Max. :71188
## duration campaign y norm.age
## Min. : 4 Min. : 1.000 no :4000 Min. :0.0000
## 1st Qu.: 104 1st Qu.: 1.000 yes: 521 1st Qu.:0.2059
## Median : 185 Median : 2.000 Median :0.2941
## Mean : 264 Mean : 2.794 Mean :0.3260
## 3rd Qu.: 329 3rd Qu.: 3.000 3rd Qu.:0.4412
## Max. :3025 Max. :50.000 Max. :1.0000
## norm.balance norm.campaign norm.cellular norm.telephone
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04540 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.05043 Median :0.02041 Median :1.0000 Median :0.00000
## Mean :0.06356 Mean :0.03660 Mean :0.6406 Mean :0.06658
## 3rd Qu.:0.06433 3rd Qu.:0.04082 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## norm.duration norm.housing
## Min. :0.00000 Min. :0.000
## 1st Qu.:0.03310 1st Qu.:0.000
## Median :0.05991 Median :1.000
## Mean :0.08605 Mean :0.566
## 3rd Qu.:0.10758 3rd Qu.:1.000
## Max. :1.00000 Max. :1.000
set.seed(1)
kfold=5
fold = sample(1:kfold,nrow(norm.mydata),replace=TRUE)
fold[1:10]
## [1] 2 2 3 5 2 5 5 4 4 1
kfold.acc.result = 1:kfold
kfold.sen.result = 1:kfold
kfold.prec.result = 1:kfold
K=5
for (i in 1:kfold)
{
test = norm.mydata[fold==i,]
train = norm.mydata[fold!=i,]
test.x = test[,8:14]
train.x = train[,8:14]
train.cl = train[,7]
KNN.test.pred.class = knn(train = train.x, test = test.x, cl = train.cl, k = K)
KNN.c.matrix = table(test[,7],KNN.test.pred.class)
#print(KNN.c.matrix)
KNN.acc = mean(KNN.test.pred.class==test[,7])
KNN.sens.yes = KNN.c.matrix[4]/(KNN.c.matrix[2]+KNN.c.matrix[4])
KNN.prec.yes = KNN.c.matrix[4]/(KNN.c.matrix[3]+KNN.c.matrix[4])
kfold.acc.result[i] = KNN.acc
kfold.sen.result[i] = KNN.sens.yes
kfold.prec.result[i] = KNN.prec.yes
}
mean(kfold.acc.result)
## [1] 0.8776359
mean(kfold.sen.result)
## [1] 0.2061236
mean(kfold.prec.result)
## [1] 0.4425273
KNN5=c(mean(kfold.acc.result),mean(kfold.sen.result),mean(kfold.prec.result))
KNN5
## [1] 0.8776359 0.2061236 0.4425273
kfold.acc.result = 1:kfold
kfold.sen.result = 1:kfold
kfold.prec.result = 1:kfold
K=3
for (i in 1:kfold)
{
test = norm.mydata[fold==i,]
train = norm.mydata[fold!=i,]
test.x = test[,8:14]
train.x = train[,8:14]
train.cl = train[,7]
KNN.test.pred.class = knn(train = train.x, test = test.x, cl = train.cl, k = K)
KNN.c.matrix = table(test[,7],KNN.test.pred.class)
#print(KNN.c.matrix)
KNN.acc = mean(KNN.test.pred.class==test[,7])
KNN.sens.yes = KNN.c.matrix[4]/(KNN.c.matrix[2]+KNN.c.matrix[4])
KNN.prec.yes = KNN.c.matrix[4]/(KNN.c.matrix[3]+KNN.c.matrix[4])
kfold.acc.result[i] = KNN.acc
kfold.sen.result[i] = KNN.sens.yes
kfold.prec.result[i] = KNN.prec.yes
}
mean(kfold.acc.result)
## [1] 0.8714268
mean(kfold.sen.result)
## [1] 0.2373385
mean(kfold.prec.result)
## [1] 0.4047073
KNN3=c(mean(kfold.acc.result),mean(kfold.sen.result),mean(kfold.prec.result))
KNN3
## [1] 0.8714268 0.2373385 0.4047073
title = c("acc","sen","prec")
KNN.compare = data.frame(title,KNN3,KNN5)
KNN.compare
## title KNN3 KNN5
## 1 acc 0.8714268 0.8776359
## 2 sen 0.2373385 0.2061236
## 3 prec 0.4047073 0.4425273