book <- read.csv("CharlesBookClub.csv")
## Add a new column, R_F_M, combining the recency, frequency and monetary codes for the data-mining steps that follow
book$Rcode <- as.character(book$Rcode)
book$Fcode <- as.character(book$Fcode)
book$Mcode <- as.character(book$Mcode)
book$R_F_M <- with(book, paste0(Rcode, "_", Fcode, "_", Mcode))
names(book)
## [1] "Seq." "ID." "Gender"
## [4] "M" "R" "F"
## [7] "FirstPurch" "ChildBks" "YouthBks"
## [10] "CookBks" "DoItYBks" "RefBks"
## [13] "ArtBks" "GeogBks" "ItalCook"
## [16] "ItalAtlas" "ItalArt" "Florence"
## [19] "Related.Purchase" "Mcode" "Rcode"
## [22] "Fcode" "Yes_Florence" "No_Florence"
## [25] "R_F_M"
head(book)
## Seq. ID. Gender M R F FirstPurch ChildBks YouthBks CookBks DoItYBks
## 1 1 25 1 297 14 2 22 0 1 1 0
## 2 2 29 0 128 8 2 10 0 0 0 0
## 3 3 46 1 138 22 7 56 2 1 2 0
## 4 4 47 1 228 2 1 2 0 0 0 0
## 5 5 51 1 257 10 1 10 0 0 0 0
## 6 6 60 1 145 6 2 12 0 0 0 0
## RefBks ArtBks GeogBks ItalCook ItalAtlas ItalArt Florence
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 1 0 1 1 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## Related.Purchase Mcode Rcode Fcode Yes_Florence No_Florence R_F_M
## 1 0 5 4 2 0 1 4_2_5
## 2 0 4 3 2 0 1 3_2_4
## 3 2 4 4 3 0 1 4_3_4
## 4 0 5 1 1 0 1 1_1_5
## 5 0 5 3 1 0 1 3_1_5
## 6 0 4 2 2 0 1 2_2_4
set.seed(90)
train.index <- sample(1:nrow(book), 0.6*nrow(book))
train <- book[train.index, ]
valid <- book[-train.index, ]
## Response rate for training data
all_com <- round(table(train$Yes_Florence)/nrow(train), 4)
all_com
##
## 0 1
## 0.9117 0.0883
## Count each combination's response
z <- table(train$R_F_M, train$Florence)
## Response rate for each RFM combination in the training data
each_com <- round(prop.table(z, 1), 4)
each_com
##
## 0 1
## 1_1_1 1.0000 0.0000
## 1_1_2 1.0000 0.0000
## 1_1_3 1.0000 0.0000
## 1_1_4 0.7778 0.2222
## 1_1_5 0.8889 0.1111
## 1_2_2 0.5000 0.5000
## 1_2_3 1.0000 0.0000
## 1_2_4 1.0000 0.0000
## 1_2_5 1.0000 0.0000
## 1_3_2 0.0000 1.0000
## 1_3_3 1.0000 0.0000
## 1_3_4 0.9565 0.0435
## 1_3_5 0.8136 0.1864
## 2_1_1 1.0000 0.0000
## 2_1_2 0.6000 0.4000
## 2_1_3 1.0000 0.0000
## 2_1_4 0.9697 0.0303
## 2_1_5 0.8649 0.1351
## 2_2_2 0.7500 0.2500
## 2_2_3 0.8696 0.1304
## 2_2_4 0.9487 0.0513
## 2_2_5 0.8723 0.1277
## 2_3_3 0.8000 0.2000
## 2_3_4 0.8605 0.1395
## 2_3_5 0.8222 0.1778
## 3_1_1 1.0000 0.0000
## 3_1_2 0.9091 0.0909
## 3_1_3 0.9355 0.0645
## 3_1_4 0.9125 0.0875
## 3_1_5 0.9712 0.0288
## 3_2_2 0.9286 0.0714
## 3_2_3 0.8667 0.1333
## 3_2_4 0.9143 0.0857
## 3_2_5 0.9604 0.0396
## 3_3_2 1.0000 0.0000
## 3_3_3 0.9444 0.0556
## 3_3_4 0.9242 0.0758
## 3_3_5 0.8479 0.1521
## 4_1_1 1.0000 0.0000
## 4_1_2 0.8889 0.1111
## 4_1_3 0.8630 0.1370
## 4_1_4 0.9196 0.0804
## 4_1_5 0.9365 0.0635
## 4_2_2 1.0000 0.0000
## 4_2_3 0.9492 0.0508
## 4_2_4 0.9412 0.0588
## 4_2_5 0.9362 0.0638
## 4_3_2 1.0000 0.0000
## 4_3_3 0.9412 0.0588
## 4_3_4 0.9099 0.0901
## 4_3_5 0.9336 0.0664
## Which combinations have response rates in the training data that are above the overall response rate?
## Sort RFM combinations by training response rate
sort(each_com[, '1'], decreasing = TRUE)
## 1_3_2 1_2_2 2_1_2 2_2_2 1_1_4 2_3_3 1_3_5 2_3_5 3_3_5 2_3_4
## 1.0000 0.5000 0.4000 0.2500 0.2222 0.2000 0.1864 0.1778 0.1521 0.1395
## 4_1_3 2_1_5 3_2_3 2_2_3 2_2_5 1_1_5 4_1_2 3_1_2 4_3_4 3_1_4
## 0.1370 0.1351 0.1333 0.1304 0.1277 0.1111 0.1111 0.0909 0.0901 0.0875
## 3_2_4 4_1_4 3_3_4 3_2_2 4_3_5 3_1_3 4_2_5 4_1_5 4_2_4 4_3_3
## 0.0857 0.0804 0.0758 0.0714 0.0664 0.0645 0.0638 0.0635 0.0588 0.0588
## 3_3_3 2_2_4 4_2_3 1_3_4 3_2_5 2_1_4 3_1_5 1_1_1 1_1_2 1_1_3
## 0.0556 0.0513 0.0508 0.0435 0.0396 0.0303 0.0288 0.0000 0.0000 0.0000
## 1_2_3 1_2_4 1_2_5 1_3_3 2_1_1 2_1_3 3_1_1 3_3_2 4_1_1 4_2_2
## 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 4_3_2
## 0.0000
The combination 1_3_2 has the highest training response rate (1.0000), far above the overall rate of 0.0883.
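To answer this programmatically rather than by eye, the sorted rates can be filtered against the overall training rate (a small sketch using the objects defined above):
## Combinations whose training response rate exceeds the overall rate of 0.0883
above_overall <- sort(each_com[each_com[, '1'] > all_com['1'], '1'], decreasing = TRUE)
above_overall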
## Training combinations whose response rate is above the mean rate across combinations
u <- each_com[each_com[ , '1'] > mean(each_com[ ,'1']), '1']
u
## 1_1_4 1_1_5 1_2_2 1_3_2 1_3_5 2_1_2 2_1_5 2_2_2 2_2_3 2_2_5
## 0.2222 0.1111 0.5000 1.0000 0.1864 0.4000 0.1351 0.2500 0.1304 0.1277
## 2_3_3 2_3_4 2_3_5 3_2_3 3_3_5 4_1_2 4_1_3
## 0.2000 0.1395 0.1778 0.1333 0.1521 0.1111 0.1370
## Response rate for each RFM combination in the validation data
z1 <- table(valid$R_F_M, valid$Florence)
each_com1 <- round(prop.table(z1, 1), 4)
each_com1[ , '1']
## 1_1_1 1_1_2 1_1_3 1_1_4 1_1_5 1_2_2 1_2_3 1_2_4 1_2_5 1_3_3
## 0.0000 0.2500 0.0000 0.1429 0.0769 0.0000 0.0000 0.1000 0.1579 1.0000
## 1_3_4 1_3_5 2_1_1 2_1_2 2_1_3 2_1_4 2_1_5 2_2_2 2_2_3 2_2_4
## 0.1429 0.1200 0.5000 0.1111 0.2143 0.0526 0.0833 0.0000 0.0714 0.1739
## 2_2_5 2_3_3 2_3_4 2_3_5 3_1_1 3_1_2 3_1_3 3_1_4 3_1_5 3_2_2
## 0.0667 0.3333 0.1250 0.1071 0.2857 0.0455 0.0000 0.0364 0.0506 0.0000
## 3_2_3 3_2_4 3_2_5 3_3_3 3_3_4 3_3_5 4_1_1 4_1_2 4_1_3 4_1_4
## 0.0000 0.0800 0.0411 0.0000 0.0893 0.1357 0.0000 0.0000 0.0556 0.0758
## 4_1_5 4_2_2 4_2_3 4_2_4 4_2_5 4_3_3 4_3_4 4_3_5
## 0.0625 0.0833 0.0303 0.0260 0.0816 0.0714 0.0649 0.0896
## Above-average training combinations that also appear in the validation data (selected by row position in each_com1)
u1 <- each_com1[c(4, 5, 6, 12, 14, 17, 18, 19, 21, 22, 23, 24, 31, 36, 38, 39), '1']; u1
## 1_1_4 1_1_5 1_2_2 1_3_5 2_1_2 2_1_5 2_2_2 2_2_3 2_2_5 2_3_3
## 0.1429 0.0769 0.0000 0.1200 0.1111 0.0833 0.0000 0.0714 0.0667 0.3333
## 2_3_4 2_3_5 3_2_3 3_3_5 4_1_2 4_1_3
## 0.1250 0.1071 0.0000 0.1357 0.0000 0.0556
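Selecting rows by hard-coded position is brittle; matching by combination name gives the same subset and survives changes in the table. A sketch using the objects defined above:
## Match the above-average training combinations by name instead of position
common <- intersect(names(u), rownames(each_com1))
u1_alt <- each_com1[common, '1']; u1_alt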
Unfortunately, the combination 1_3_2, which had the highest response rate in the training data, does not appear in the validation data at all.
## RFM combinations whose validation response rate exceeds twice the overall response rate
each_com1[each_com1[ , '1'] > (2*all_com[2]), '1']
## 1_1_2 1_3_3 2_1_1 2_1_3 2_3_3 3_1_1
## 0.2500 1.0000 0.5000 0.2143 0.3333 0.2857
## RFM combinations that exceed the overall response rate but do not exceed twice that rate
each_com1[each_com1[ , '1'] < (2*all_com[2]) & each_com1[ , '1'] > all_com[2], '1']
## 1_1_4 1_2_4 1_2_5 1_3_4 1_3_5 2_1_2 2_2_4 2_3_4 2_3_5 3_3_4
## 0.1429 0.1000 0.1579 0.1429 0.1200 0.1111 0.1739 0.1250 0.1071 0.0893
## 3_3_5 4_3_5
## 0.1357 0.0896
## Segment the validation data (use %in%, not ==, to test membership in a set)
seg1.combos <- c("1_1_2", "1_3_3", "2_1_1", "2_1_3", "2_3_3", "3_1_1")
seg2.combos <- c("1_1_4", "1_2_4", "1_2_5", "1_3_4", "1_3_5", "2_1_2",
                 "2_2_4", "2_3_4", "2_3_5", "3_3_4", "3_3_5", "4_3_5")
valid$seg <- ifelse(valid$R_F_M %in% seg1.combos, "seg1",
                    ifelse(valid$R_F_M %in% seg2.combos, "seg2", "seg3"))
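With the segments assigned, a quick sanity check (my addition, not part of the original analysis) is the response rate within each segment:
## Validation response rate by segment; seg1 should be highest
round(prop.table(table(valid$seg, valid$Florence), 1), 4)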
Plot Cumulative Buyers Against the Number of Customers
## Order validation customers by segment (seg1 first), then accumulate buyers,
## so the chart shows how quickly the segmentation finds the actual buyers
valid_ord <- valid[order(valid$seg), ]
cumfreq <- c(0, cumsum(valid_ord$Yes_Florence))
plot(0:nrow(valid_ord), cumfreq, type = "l",
     main = "Cumulative Chart",
     xlab = "# of Customers in Validation",
     ylab = "Cumulative number of buyers")
## Reference: buyers accumulated at the overall response rate
abline(0, sum(valid$Yes_Florence) / nrow(valid), col = "grey", lty = 2)
Data Prep
## Slice the dataset: keep Mcode, Rcode and Fcode (columns 20-22)
train.df <- train[ , 20:22]
valid.df <- valid[ , 20:22]
## Create dummy variables for training data
library(caret)
dmy <- dummyVars(" ~ .", data = train.df)
train.df1 <- data.frame(predict(dmy, newdata = train.df))
train.df1 <- cbind(train.df1, "FirstPurch" = train$FirstPurch,
"Related.Purchase" = train$Related.Purchase,
"Yes_Florence" = train$Yes_Florence,
"No_Florence" = train$No_Florence)
## Create dummy variables for the validation data, reusing the training
## encoder so both sets get identical dummy columns
valid.df1 <- data.frame(predict(dmy, newdata = valid.df))
valid.df1 <- cbind(valid.df1, "FirstPurch" = valid$FirstPurch,
"Related.Purchase" = valid$Related.Purchase,
"Yes_Florence" = valid$Yes_Florence,
"No_Florence" = valid$No_Florence)
names(valid.df1)
## [1] "Mcode1" "Mcode2" "Mcode3"
## [4] "Mcode4" "Mcode5" "Rcode1"
## [7] "Rcode2" "Rcode3" "Rcode4"
## [10] "Fcode1" "Fcode2" "Fcode3"
## [13] "FirstPurch" "Related.Purchase" "Yes_Florence"
## [16] "No_Florence"
## Center and scale the numeric predictors (FirstPurch, Related.Purchase)
library(caret)
train.norm.df <- train.df1
valid.norm.df <- valid.df1
norm.values <- preProcess(train.df1[, c(13,14)], method = c("center","scale"))
train.norm.df[, c(13, 14)] <- predict(norm.values, train.df1[, c(13, 14)])
valid.norm.df[, c(13, 14)] <- predict(norm.values, valid.df1[, c(13, 14)])
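A quick optional check of the preprocessing: the scaled training columns should now have mean roughly 0 and standard deviation roughly 1.
## Verify centering and scaling of FirstPurch and Related.Purchase
round(colMeans(train.norm.df[, c(13, 14)]), 3)
round(apply(train.norm.df[, c(13, 14)], 2, sd), 3)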
Find the Best k
## Search k = 1 to 20 for the best validation accuracy
library(caret)
library(FNN)
accuracy.df <- data.frame(k = seq(1, 20, 1), accuracy = rep(0, 20))
for (i in 1:20) {
  knn.pred <- knn(train.norm.df[, c(13, 14)], valid.norm.df[, c(13, 14)],
                  cl = as.factor(train.norm.df[, 15]), k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, as.factor(valid.norm.df[, 15]))$overall[1]
}
accuracy.df
## k accuracy
## 1 1 0.850000
## 2 2 0.915625
## 3 3 0.874375
## 4 4 0.918750
## 5 5 0.915625
## 6 6 0.917500
## 7 7 0.916875
## 8 8 0.916250
## 9 9 0.916250
## 10 10 0.919375
## 11 11 0.915625
## 12 12 0.918750
## 13 13 0.919375
## 14 14 0.921250
## 15 15 0.918750
## 16 16 0.921250
## 17 17 0.920000
## 18 18 0.921250
## 19 19 0.921250
## 20 20 0.921250
Plot the Best K
## Plot classification accuracy against k
plot(accuracy.df, type = "b", col = "dodgerblue", cex = 1, pch = 20,
xlab = "k, number of neighbors", ylab = "classification accuracy",
main = "Accuracy and K number of Neighbors")
## add lines to indicate k with best accuracy
abline(v = which(accuracy.df$accuracy == max(accuracy.df$accuracy)), col = "darkorange", lwd = 1.5)
## add line for max accuracy seen
abline(h = max(accuracy.df$accuracy), col = "grey", lty = 2)
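Rather than reading the best k off the chart, it can be extracted directly (a small sketch):
## Smallest k achieving the maximum validation accuracy
min(accuracy.df$k[accuracy.df$accuracy == max(accuracy.df$accuracy)])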
k = 14
library(FNN)
knn.pred1 <- knn(train.norm.df[, c(13, 14)], valid.norm.df[, c(13, 14)],
                 cl = as.factor(train.norm.df[, 15]), k = 14)
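To evaluate the k = 14 model rather than only produce predictions, a confusion matrix on the validation labels is a natural next step (a sketch):
## Confusion matrix for the k = 14 model on the validation set
confusionMatrix(knn.pred1, as.factor(valid.norm.df[, 15]))$table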
## Build the full predictor set: drop Seq., ID., M, R, F, Yes_Florence, No_Florence and R_F_M
full_p <- book[ , -c(1, 2, 4:6, 23:25)]
full_p$Florence <- as.factor(full_p$Florence)
full_p$Mcode <- as.factor(full_p$Mcode)
full_p$Rcode <- as.factor(full_p$Rcode)
full_p$Fcode <- as.factor(full_p$Fcode)
head(full_p)
## Gender FirstPurch ChildBks YouthBks CookBks DoItYBks RefBks ArtBks
## 1 1 22 0 1 1 0 0 0
## 2 0 10 0 0 0 0 0 0
## 3 1 56 2 1 2 0 1 0
## 4 1 2 0 0 0 0 0 0
## 5 1 10 0 0 0 0 0 0
## 6 1 12 0 0 0 0 0 0
## GeogBks ItalCook ItalAtlas ItalArt Florence Related.Purchase Mcode Rcode
## 1 0 0 0 0 0 0 5 4
## 2 0 0 0 0 0 0 4 3
## 3 1 1 0 0 0 2 4 4
## 4 0 0 0 0 0 0 5 1
## 5 0 0 0 0 0 0 5 3
## 6 0 0 0 0 0 0 4 2
## Fcode
## 1 2
## 2 2
## 3 3
## 4 1
## 5 1
## 6 2
Partition the data
set.seed(2)
train.index1 <- sample(1:nrow(full_p), 1800)  ## 1800 of the 4000 records for training
train1 <- full_p[train.index1, ]
valid1 <- full_p[-train.index1, ]
Logistic model
## full predictors
model1 <- glm(Florence ~ ., data = train1, family = binomial)
summary(model1)
##
## Call:
## glm(formula = Florence ~ ., family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1724 -0.4519 -0.3562 -0.2882 2.7468
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.267202 1.088881 -2.082 0.03733 *
## Gender -0.516859 0.177136 -2.918 0.00352 **
## FirstPurch 0.012012 0.010351 1.160 0.24587
## ChildBks -0.267054 0.119304 -2.238 0.02519 *
## YouthBks 0.161822 0.149912 1.079 0.28039
## CookBks -0.197110 0.111752 -1.764 0.07776 .
## DoItYBks -0.135837 0.147927 -0.918 0.35848
## RefBks -0.098318 0.166068 -0.592 0.55383
## ArtBks 0.445674 0.109848 4.057 4.97e-05 ***
## GeogBks 0.304471 0.114019 2.670 0.00758 **
## ItalCook 0.037408 0.231742 0.161 0.87176
## ItalAtlas -1.182218 0.737202 -1.604 0.10879
## ItalArt 0.524470 0.318070 1.649 0.09917 .
## Related.Purchase NA NA NA NA
## Mcode2 0.162851 1.127234 0.144 0.88513
## Mcode3 0.342766 1.071471 0.320 0.74904
## Mcode4 0.020087 1.060834 0.019 0.98489
## Mcode5 0.333982 1.056707 0.316 0.75196
## Rcode2 -0.001382 0.346826 -0.004 0.99682
## Rcode3 -0.379119 0.328162 -1.155 0.24798
## Rcode4 -0.609564 0.349083 -1.746 0.08078 .
## Fcode2 -0.419573 0.253630 -1.654 0.09807 .
## Fcode3 0.393632 0.316907 1.242 0.21420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.83 on 1799 degrees of freedom
## Residual deviance: 986.21 on 1778 degrees of freedom
## AIC: 1030.2
##
## Number of Fisher Scoring iterations: 6
## The predictors I judge most promising: ArtBks, ItalArt, ItalAtlas and ItalCook
model2 <- glm(Florence ~ ArtBks + ItalArt + ItalAtlas + ItalCook, data = train1, family = binomial)
summary(model2)
##
## Call:
## glm(formula = Florence ~ ArtBks + ItalArt + ItalAtlas + ItalCook,
## family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1224 -0.3831 -0.3812 -0.3812 2.6680
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.58539 0.10321 -25.050 < 2e-16 ***
## ArtBks 0.53283 0.10301 5.173 2.31e-07 ***
## ItalArt 0.68941 0.28778 2.396 0.0166 *
## ItalAtlas -0.94477 0.72712 -1.299 0.1938
## ItalCook 0.01012 0.21882 0.046 0.9631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.8 on 1799 degrees of freedom
## Residual deviance: 1033.9 on 1795 degrees of freedom
## AIC: 1043.9
##
## Number of Fisher Scoring iterations: 5
## RFM
model3 <- glm(Florence ~ Rcode + Fcode+ Mcode, data = train1, family = binomial)
summary(model3)
##
## Call:
## glm(formula = Florence ~ Rcode + Fcode + Mcode, family = binomial,
## data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6118 -0.4802 -0.3973 -0.3237 2.5907
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.44752 1.06812 -2.291 0.0219 *
## Rcode2 -0.02117 0.33409 -0.063 0.9495
## Rcode3 -0.27086 0.30609 -0.885 0.3762
## Rcode4 -0.52111 0.30044 -1.735 0.0828 .
## Fcode2 -0.40510 0.24849 -1.630 0.1030
## Fcode3 0.41584 0.21145 1.967 0.0492 *
## Mcode2 0.33725 1.11901 0.301 0.7631
## Mcode3 0.37698 1.06453 0.354 0.7232
## Mcode4 0.05348 1.05370 0.051 0.9595
## Mcode5 0.45084 1.04832 0.430 0.6672
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.8 on 1799 degrees of freedom
## Residual deviance: 1038.9 on 1790 degrees of freedom
## AIC: 1058.9
##
## Number of Fisher Scoring iterations: 5
Check Performance on the Validation Set
val1 <- predict(model1, valid1, type="response")
val2 <- predict(model2, valid1, type="response")
val3 <- predict(model3, valid1, type="response")
mydf <- cbind(valid1, val1, val2, val3)
Lift Chart
library(ROCR)
log_scores1 <- prediction(predictions = mydf$val1, labels = mydf$Florence)
log_scores2 <- prediction(predictions = mydf$val2, labels = mydf$Florence)
log_scores3 <- prediction(predictions = mydf$val3, labels = mydf$Florence)
par(mfrow = c(1, 3))
lift.obj1 <- performance(log_scores1, measure = "lift", x.measure = "rpp")
plot(lift.obj1,
     main = "Lift Chart: Model 1 (all predictors)",
     xlab = "% Population",
     ylab = "Lift",
     col = "blue")
abline(1, 0, col = "grey")
lift.obj2 <- performance(log_scores2, measure = "lift", x.measure = "rpp")
plot(lift.obj2,
     main = "Lift Chart: Model 2 (art/Italy books)",
     xlab = "% Population",
     ylab = "Lift",
     col = "red")
abline(1, 0, col = "grey")
lift.obj3 <- performance(log_scores3, measure = "lift", x.measure = "rpp")
plot(lift.obj3,
     main = "Lift Chart: Model 3 (RFM codes)",
     xlab = "% Population",
     ylab = "Lift",
     col = "green")
abline(1, 0, col = "grey")
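The three charts can also be summarized in one number each; ROCR's "auc" measure gives the area under the ROC curve per model (an optional comparison, my addition):
## Reset the plotting layout and compare models by AUC
par(mfrow = c(1, 1))
sapply(list(log_scores1, log_scores2, log_scores3),
       function(p) performance(p, measure = "auc")@y.values[[1]])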
Confusion Matrix
## Cutoff criterion for the campaign: a 30% likelihood of a purchase
library(caret)
mydf$response1 <- as.factor(ifelse(mydf$val1 > 0.3, 1, 0))
mydf$response2 <- as.factor(ifelse(mydf$val2 > 0.3, 1, 0))
mydf$response3 <- as.factor(ifelse(mydf$val3 > 0.3, 1, 0))
z1 <- confusionMatrix(mydf$response1, mydf$Florence)
z2 <- confusionMatrix(mydf$response2, mydf$Florence)
z3 <- confusionMatrix(mydf$response3, mydf$Florence)
z1$table; z2$table; z3$table
## Reference
## Prediction 0 1
## 0 1988 169
## 1 31 12
## Reference
## Prediction 0 1
## 0 2005 178
## 1 14 3
## Reference
## Prediction 0 1
## 0 2019 181
## 1 0 0
cat("The numbers of potential buyers in each model is", "181(169, 12), ", "181(178, 3), ", "181(181, 0)")
## The numbers of potential buyers in each model is 181(169, 12), 181(178, 3), 181(181, 0)
## Data Prep
gc <- read.csv("GermanCredit.csv")
names(gc)
## [1] "OBS." "CHK_ACCT" "DURATION"
## [4] "HISTORY" "NEW_CAR" "USED_CAR"
## [7] "FURNITURE" "RADIO.TV" "EDUCATION"
## [10] "RETRAINING" "AMOUNT" "SAV_ACCT"
## [13] "EMPLOYMENT" "INSTALL_RATE" "MALE_DIV"
## [16] "MALE_SINGLE" "MALE_MAR_or_WID" "CO.APPLICANT"
## [19] "GUARANTOR" "PRESENT_RESIDENT" "REAL_ESTATE"
## [22] "PROP_UNKN_NONE" "AGE" "OTHER_INSTALL"
## [25] "RENT" "OWN_RES" "NUM_CREDITS"
## [28] "JOB" "NUM_DEPENDENTS" "TELEPHONE"
## [31] "FOREIGN" "RESPONSE"
In my opinion, the predictors that matter most for a credit decision are those related to the applicant's ability to repay debt: CHK_ACCT, DURATION, HISTORY, AMOUNT, SAV_ACCT and REAL_ESTATE.
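Before modeling, this intuition can be eyeballed by comparing the means of these candidate predictors across the two response classes (a quick sketch; the choice of summary is my own):
## Compare candidate predictors across bad (0) and good (1) credit records
aggregate(cbind(CHK_ACCT, DURATION, HISTORY, AMOUNT, SAV_ACCT, REAL_ESTATE) ~ RESPONSE,
          data = gc, FUN = mean)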
Partitioning the Data (60/40)
gc$RESPONSE <- as.factor(gc$RESPONSE)
set.seed(99)
train.index2 <- sample(1:nrow(gc), 0.6*nrow(gc))
train2 <- gc[train.index2, ]
valid2 <- gc[-train.index2, ]
Logistic model
log_reg <- glm(RESPONSE ~ CHK_ACCT + DURATION + HISTORY + AMOUNT + SAV_ACCT + REAL_ESTATE, family = binomial, data = train2)
summary(log_reg)
##
## Call:
## glm(formula = RESPONSE ~ CHK_ACCT + DURATION + HISTORY + AMOUNT +
## SAV_ACCT + REAL_ESTATE, family = binomial, data = train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3213 -0.9427 0.4763 0.7910 1.7730
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.713e-01 3.440e-01 -1.370 0.170755
## CHK_ACCT 5.666e-01 8.605e-02 6.585 4.56e-11 ***
## DURATION -3.215e-02 1.075e-02 -2.989 0.002795 **
## HISTORY 3.251e-01 9.742e-02 3.337 0.000846 ***
## AMOUNT 3.377e-06 4.423e-05 0.076 0.939142
## SAV_ACCT 2.670e-01 7.257e-02 3.679 0.000234 ***
## REAL_ESTATE 5.089e-01 2.360e-01 2.156 0.031076 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 736.39 on 599 degrees of freedom
## Residual deviance: 613.56 on 593 degrees of freedom
## AIC: 627.56
##
## Number of Fisher Scoring iterations: 4
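Since logistic coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to read (an optional step):
## Odds ratios for the fitted logistic model
round(exp(coef(log_reg)), 3)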
CART
library(rpart)
library(rpart.plot)
set.seed(33)
cart.model <- rpart(RESPONSE ~ CHK_ACCT + DURATION + HISTORY + AMOUNT + SAV_ACCT + REAL_ESTATE, data = train2)
prp(cart.model,
faclen = 0,
fallen.leaves = TRUE,
shadow.col = "green",
extra = 2)
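As an optional check on the tree's size, rpart's built-in complexity-parameter table shows the cross-validated error at each split:
## Cross-validated error versus tree complexity
printcp(cart.model)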
Neural Networks
library(neuralnet) # for neuralnet(), nn model
library(nnet) # for class.ind()
library(caret) # for train(), tune parameters
## Dummify the RESPONSE
head(class.ind(train2$RESPONSE))
## 0 1
## [1,] 0 1
## [2,] 1 0
## [3,] 0 1
## [4,] 0 1
## [5,] 0 1
## [6,] 0 1
n_train <- cbind(train2, class.ind(train2$RESPONSE))
names(n_train)[33] <- "RESPONSE_NO"
names(n_train)[34] <- "RESPONSE_YES"
## Build Formula
formula.bpn <- RESPONSE_NO + RESPONSE_YES ~ CHK_ACCT + DURATION + HISTORY + AMOUNT + SAV_ACCT + REAL_ESTATE
## Training model
bpn <- neuralnet(formula = formula.bpn,
data = n_train,
hidden = c(2),
learningrate = 0.01,
threshold = 0.01,
stepmax = 5e5)
plot(bpn)
Check Performance on the Validation Set
## Logistic
val_1 <- predict(log_reg, valid2, type ="response")
## CART
val_2 <- predict(cart.model, valid2, type ="class")
valid2 <- cbind(valid2, val_1, val_2)
valid2$response1 <- as.factor(ifelse(valid2$val_1 > 0.5, 1, 0))
valid2$response2 <- as.factor(ifelse(valid2$val_2 == "0", 0, 1))
Confusion Matrix
library(caret)
## Logistic Confusion Matrix
z_con <- confusionMatrix(valid2$response1, valid2$RESPONSE)
z_con$table
## Reference
## Prediction 0 1
## 0 57 34
## 1 61 248
## CART Confusion Matrix
z1_con <- confusionMatrix(valid2$response2, valid2$RESPONSE)
z1_con$table
## Reference
## Prediction 0 1
## 0 57 36
## 1 61 246
## Neural network confusion matrix
temp_test <- subset(valid2, select = c("CHK_ACCT", "DURATION", "HISTORY", "AMOUNT", "SAV_ACCT", "REAL_ESTATE"))
nn.results <- compute(bpn, temp_test)
results <- data.frame("actual" = as.integer(valid2$RESPONSE), "prediction" = nn.results$net.result)
## Recode actual from factor codes (1, 2) back to class labels (0, 1)
results$actual[results$actual == 1] <- 0
results$actual[results$actual == 2] <- 1
roundedresults <- sapply(results, round, digits = 0)
roundedresultsdf <- data.frame(roundedresults)
with(roundedresultsdf, table(actual, prediction.2))
## prediction.2
## actual 1
## 0 118
## 1 282
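The table shows the network predicting class 1 for every validation case; its raw accuracy follows directly (a one-line sketch):
## Overall NN accuracy on the validation set
mean(roundedresultsdf$actual == roundedresultsdf$prediction.2)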
Net Profit for Each Algorithm
## Net profit per model: +100 for each good credit approved, -500 for each bad credit approved
log_net <- (248 * 100) + (61 * -500)
cart_net <- (246 * 100) + (61 * -500)
## The NN approves everyone: 282 good and 118 bad (from the table above)
nn_net <- (282 * 100) + (118 * -500)
cat("Net Profit for Logistic Model:", log_net, "\n", "Net Profit for CART:",
    cart_net, "\n", "Net Profit for NN:", nn_net)
## Net Profit for Logistic Model: -5700
## Net Profit for CART: -5900
## Net Profit for NN: -30800
Multiplying each confusion matrix by the profit table, every model loses money; none of the three classifies the target customers well.