book <- read.csv("CharlesBookClub.csv")
## Add a new column, R_F_M, combining the recency, frequency and monetary codes for the data-mining steps that follow
book$Rcode <- as.character(book$Rcode)
book$Fcode <- as.character(book$Fcode)
book$Mcode <- as.character(book$Mcode)
book$R_F_M <- with(book, paste0(Rcode, "_", Fcode, "_", Mcode))
names(book)
## [1] "Seq." "ID." "Gender"
## [4] "M" "R" "F"
## [7] "FirstPurch" "ChildBks" "YouthBks"
## [10] "CookBks" "DoItYBks" "RefBks"
## [13] "ArtBks" "GeogBks" "ItalCook"
## [16] "ItalAtlas" "ItalArt" "Florence"
## [19] "Related.Purchase" "Mcode" "Rcode"
## [22] "Fcode" "Yes_Florence" "No_Florence"
## [25] "R_F_M"
head(book)
## Seq. ID. Gender M R F FirstPurch ChildBks YouthBks CookBks DoItYBks
## 1 1 25 1 297 14 2 22 0 1 1 0
## 2 2 29 0 128 8 2 10 0 0 0 0
## 3 3 46 1 138 22 7 56 2 1 2 0
## 4 4 47 1 228 2 1 2 0 0 0 0
## 5 5 51 1 257 10 1 10 0 0 0 0
## 6 6 60 1 145 6 2 12 0 0 0 0
## RefBks ArtBks GeogBks ItalCook ItalAtlas ItalArt Florence
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 1 0 1 1 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## Related.Purchase Mcode Rcode Fcode Yes_Florence No_Florence R_F_M
## 1 0 5 4 2 0 1 4_2_5
## 2 0 4 3 2 0 1 3_2_4
## 3 2 4 4 3 0 1 4_3_4
## 4 0 5 1 1 0 1 1_1_5
## 5 0 5 3 1 0 1 3_1_5
## 6 0 4 2 2 0 1 2_2_4
set.seed(90)
train.index <- sample(1:nrow(book), 0.6*nrow(book))
train <- book[train.index, ]
valid <- book[-train.index, ]
## Response rate for training data
all_com <- round(table(train$Yes_Florence)/nrow(train), 4)
all_com
##
## 0 1
## 0.9117 0.0883
## Count each combination's response
z <- table(train$R_F_M, train$Florence)
## Response rate for each RFM combination in the training data
each_com <- round(prop.table(z, 1), 4)
each_com
##
## 0 1
## 1_1_1 1.0000 0.0000
## 1_1_2 1.0000 0.0000
## 1_1_3 1.0000 0.0000
## 1_1_4 0.7778 0.2222
## 1_1_5 0.8889 0.1111
## 1_2_2 0.5000 0.5000
## 1_2_3 1.0000 0.0000
## 1_2_4 1.0000 0.0000
## 1_2_5 1.0000 0.0000
## 1_3_2 0.0000 1.0000
## 1_3_3 1.0000 0.0000
## 1_3_4 0.9565 0.0435
## 1_3_5 0.8136 0.1864
## 2_1_1 1.0000 0.0000
## 2_1_2 0.6000 0.4000
## 2_1_3 1.0000 0.0000
## 2_1_4 0.9697 0.0303
## 2_1_5 0.8649 0.1351
## 2_2_2 0.7500 0.2500
## 2_2_3 0.8696 0.1304
## 2_2_4 0.9487 0.0513
## 2_2_5 0.8723 0.1277
## 2_3_3 0.8000 0.2000
## 2_3_4 0.8605 0.1395
## 2_3_5 0.8222 0.1778
## 3_1_1 1.0000 0.0000
## 3_1_2 0.9091 0.0909
## 3_1_3 0.9355 0.0645
## 3_1_4 0.9125 0.0875
## 3_1_5 0.9712 0.0288
## 3_2_2 0.9286 0.0714
## 3_2_3 0.8667 0.1333
## 3_2_4 0.9143 0.0857
## 3_2_5 0.9604 0.0396
## 3_3_2 1.0000 0.0000
## 3_3_3 0.9444 0.0556
## 3_3_4 0.9242 0.0758
## 3_3_5 0.8479 0.1521
## 4_1_1 1.0000 0.0000
## 4_1_2 0.8889 0.1111
## 4_1_3 0.8630 0.1370
## 4_1_4 0.9196 0.0804
## 4_1_5 0.9365 0.0635
## 4_2_2 1.0000 0.0000
## 4_2_3 0.9492 0.0508
## 4_2_4 0.9412 0.0588
## 4_2_5 0.9362 0.0638
## 4_3_2 1.0000 0.0000
## 4_3_3 0.9412 0.0588
## 4_3_4 0.9099 0.0901
## 4_3_5 0.9336 0.0664
## Which combinations have response rates in the training data that are above the overall response rate?
## Sort RFM combinations by training response rate
sort(each_com[, '1'], decreasing = TRUE)
## 1_3_2 1_2_2 2_1_2 2_2_2 1_1_4 2_3_3 1_3_5 2_3_5 3_3_5 2_3_4
## 1.0000 0.5000 0.4000 0.2500 0.2222 0.2000 0.1864 0.1778 0.1521 0.1395
## 4_1_3 2_1_5 3_2_3 2_2_3 2_2_5 1_1_5 4_1_2 3_1_2 4_3_4 3_1_4
## 0.1370 0.1351 0.1333 0.1304 0.1277 0.1111 0.1111 0.0909 0.0901 0.0875
## 3_2_4 4_1_4 3_3_4 3_2_2 4_3_5 3_1_3 4_2_5 4_1_5 4_2_4 4_3_3
## 0.0857 0.0804 0.0758 0.0714 0.0664 0.0645 0.0638 0.0635 0.0588 0.0588
## 3_3_3 2_2_4 4_2_3 1_3_4 3_2_5 2_1_4 3_1_5 1_1_1 1_1_2 1_1_3
## 0.0556 0.0513 0.0508 0.0435 0.0396 0.0303 0.0288 0.0000 0.0000 0.0000
## 1_2_3 1_2_4 1_2_5 1_3_3 2_1_1 2_1_3 3_1_1 3_3_2 4_1_1 4_2_2
## 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 4_3_2
## 0.0000
The combination 1_3_2 has the highest training response rate (1.0000), far above the overall rate of 0.0883.
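To answer this programmatically rather than by eye, the sorted rates can be filtered against the overall training rate (a small sketch using the objects defined above):
## Combinations whose training response rate exceeds the overall rate of 0.0883
above_overall <- sort(each_com[each_com[, '1'] > all_com['1'], '1'], decreasing = TRUE)
above_overall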
## Training combinations whose response rate is above the mean rate across combinations
u <- each_com[each_com[ , '1'] > mean(each_com[ ,'1']), '1']
u
## 1_1_4 1_1_5 1_2_2 1_3_2 1_3_5 2_1_2 2_1_5 2_2_2 2_2_3 2_2_5
## 0.2222 0.1111 0.5000 1.0000 0.1864 0.4000 0.1351 0.2500 0.1304 0.1277
## 2_3_3 2_3_4 2_3_5 3_2_3 3_3_5 4_1_2 4_1_3
## 0.2000 0.1395 0.1778 0.1333 0.1521 0.1111 0.1370
## Response rate for each RFM combination in the validation data
z1 <- table(valid$R_F_M, valid$Florence)
each_com1 <- round(prop.table(z1, 1), 4)
each_com1[ , '1']
## 1_1_1 1_1_2 1_1_3 1_1_4 1_1_5 1_2_2 1_2_3 1_2_4 1_2_5 1_3_3
## 0.0000 0.2500 0.0000 0.1429 0.0769 0.0000 0.0000 0.1000 0.1579 1.0000
## 1_3_4 1_3_5 2_1_1 2_1_2 2_1_3 2_1_4 2_1_5 2_2_2 2_2_3 2_2_4
## 0.1429 0.1200 0.5000 0.1111 0.2143 0.0526 0.0833 0.0000 0.0714 0.1739
## 2_2_5 2_3_3 2_3_4 2_3_5 3_1_1 3_1_2 3_1_3 3_1_4 3_1_5 3_2_2
## 0.0667 0.3333 0.1250 0.1071 0.2857 0.0455 0.0000 0.0364 0.0506 0.0000
## 3_2_3 3_2_4 3_2_5 3_3_3 3_3_4 3_3_5 4_1_1 4_1_2 4_1_3 4_1_4
## 0.0000 0.0800 0.0411 0.0000 0.0893 0.1357 0.0000 0.0000 0.0556 0.0758
## 4_1_5 4_2_2 4_2_3 4_2_4 4_2_5 4_3_3 4_3_4 4_3_5
## 0.0625 0.0833 0.0303 0.0260 0.0816 0.0714 0.0649 0.0896
## Above-average training combinations that also appear in the validation data (selected by row position in each_com1)
u1 <- each_com1[c(4, 5, 6, 12, 14, 17, 18, 19, 21, 22, 23, 24, 31, 36, 38, 39), '1']; u1
## 1_1_4 1_1_5 1_2_2 1_3_5 2_1_2 2_1_5 2_2_2 2_2_3 2_2_5 2_3_3
## 0.1429 0.0769 0.0000 0.1200 0.1111 0.0833 0.0000 0.0714 0.0667 0.3333
## 2_3_4 2_3_5 3_2_3 3_3_5 4_1_2 4_1_3
## 0.1250 0.1071 0.0000 0.1357 0.0000 0.0556
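Selecting rows by hard-coded position is brittle; matching by combination name gives the same subset and survives changes in the table. A sketch using the objects defined above:
## Match the above-average training combinations by name instead of position
common <- intersect(names(u), rownames(each_com1))
u1_alt <- each_com1[common, '1']; u1_alt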
Unfortunately, the combination 1_3_2, which had the highest response rate in the training data, does not appear in the validation data at all.
## RFM combinations whose validation response rate exceeds twice the overall response rate
each_com1[each_com1[ , '1'] > (2*all_com[2]), '1']
## 1_1_2 1_3_3 2_1_1 2_1_3 2_3_3 3_1_1
## 0.2500 1.0000 0.5000 0.2143 0.3333 0.2857
## RFM combinations that exceed the overall response rate but do not exceed twice that rate
each_com1[each_com1[ , '1'] < (2*all_com[2]) & each_com1[ , '1'] > all_com[2], '1']
## 1_1_4 1_2_4 1_2_5 1_3_4 1_3_5 2_1_2 2_2_4 2_3_4 2_3_5 3_3_4
## 0.1429 0.1000 0.1579 0.1429 0.1200 0.1111 0.1739 0.1250 0.1071 0.0893
## 3_3_5 4_3_5
## 0.1357 0.0896
## Segment the validation data (use %in%, not ==, to test membership in a set)
seg1.combos <- c("1_1_2", "1_3_3", "2_1_1", "2_1_3", "2_3_3", "3_1_1")
seg2.combos <- c("1_1_4", "1_2_4", "1_2_5", "1_3_4", "1_3_5", "2_1_2",
                 "2_2_4", "2_3_4", "2_3_5", "3_3_4", "3_3_5", "4_3_5")
valid$seg <- ifelse(valid$R_F_M %in% seg1.combos, "seg1",
                    ifelse(valid$R_F_M %in% seg2.combos, "seg2", "seg3"))
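With the segments assigned, a quick sanity check (my addition, not part of the original analysis) is the response rate within each segment:
## Validation response rate by segment; seg1 should be highest
round(prop.table(table(valid$seg, valid$Florence), 1), 4)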
Plot Cumulative Buyers Against the Number of Customers
## Order validation customers by segment (seg1 first), then accumulate buyers,
## so the chart shows how quickly the segmentation finds the actual buyers
valid_ord <- valid[order(valid$seg), ]
cumfreq <- c(0, cumsum(valid_ord$Yes_Florence))
plot(0:nrow(valid_ord), cumfreq, type = "l",
     main = "Cumulative Chart",
     xlab = "# of Customers in Validation",
     ylab = "Cumulative number of buyers")
## Reference: buyers accumulated at the overall response rate
abline(0, sum(valid$Yes_Florence) / nrow(valid), col = "grey", lty = 2)
Data Prep
## Slice the dataset: keep Mcode, Rcode and Fcode (columns 20-22)
train.df <- train[ , 20:22]
valid.df <- valid[ , 20:22]
## Create dummy variables for training data
library(caret)
dmy <- dummyVars(" ~ .", data = train.df)
train.df1 <- data.frame(predict(dmy, newdata = train.df))
train.df1 <- cbind(train.df1, "FirstPurch" = train$FirstPurch,
"Related.Purchase" = train$Related.Purchase,
"Yes_Florence" = train$Yes_Florence,
"No_Florence" = train$No_Florence)
## Create dummy variables for the validation data, reusing the training
## encoder so both sets get identical dummy columns
valid.df1 <- data.frame(predict(dmy, newdata = valid.df))
valid.df1 <- cbind(valid.df1, "FirstPurch" = valid$FirstPurch,
"Related.Purchase" = valid$Related.Purchase,
"Yes_Florence" = valid$Yes_Florence,
"No_Florence" = valid$No_Florence)
names(valid.df1)
## [1] "Mcode1" "Mcode2" "Mcode3"
## [4] "Mcode4" "Mcode5" "Rcode1"
## [7] "Rcode2" "Rcode3" "Rcode4"
## [10] "Fcode1" "Fcode2" "Fcode3"
## [13] "FirstPurch" "Related.Purchase" "Yes_Florence"
## [16] "No_Florence"
## Center and scale the numeric predictors (FirstPurch, Related.Purchase)
library(caret)
train.norm.df <- train.df1
valid.norm.df <- valid.df1
norm.values <- preProcess(train.df1[, c(13,14)], method = c("center","scale"))
train.norm.df[, c(13, 14)] <- predict(norm.values, train.df1[, c(13, 14)])
valid.norm.df[, c(13, 14)] <- predict(norm.values, valid.df1[, c(13, 14)])
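A quick optional check of the preprocessing: the scaled training columns should now have mean roughly 0 and standard deviation roughly 1.
## Verify centering and scaling of FirstPurch and Related.Purchase
round(colMeans(train.norm.df[, c(13, 14)]), 3)
round(apply(train.norm.df[, c(13, 14)], 2, sd), 3)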
Find the Best k
## Search k = 1 to 20 for the best validation accuracy
library(caret)
library(FNN)
accuracy.df <- data.frame(k = seq(1, 20, 1), accuracy = rep(0, 20))
for (i in 1:20) {
  knn.pred <- knn(train.norm.df[, c(13, 14)], valid.norm.df[, c(13, 14)],
                  cl = as.factor(train.norm.df[, 15]), k = i)
  accuracy.df[i, 2] <- confusionMatrix(knn.pred, as.factor(valid.norm.df[, 15]))$overall[1]
}
accuracy.df
## k accuracy
## 1 1 0.850000
## 2 2 0.915625
## 3 3 0.874375
## 4 4 0.918750
## 5 5 0.915625
## 6 6 0.917500
## 7 7 0.916875
## 8 8 0.916250
## 9 9 0.916250
## 10 10 0.919375
## 11 11 0.915625
## 12 12 0.918750
## 13 13 0.919375
## 14 14 0.921250
## 15 15 0.918750
## 16 16 0.921250
## 17 17 0.920000
## 18 18 0.921250
## 19 19 0.921250
## 20 20 0.921250
Plot the Best K
## Plot classification accuracy against k
plot(accuracy.df, type = "b", col = "dodgerblue", cex = 1, pch = 20,
xlab = "k, number of neighbors", ylab = "classification accuracy",
main = "Accuracy and K number of Neighbors")
## add lines to indicate k with best accuracy
abline(v = which(accuracy.df$accuracy == max(accuracy.df$accuracy)), col = "darkorange", lwd = 1.5)
## add line for max accuracy seen
abline(h = max(accuracy.df$accuracy), col = "grey", lty = 2)
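Rather than reading the best k off the chart, it can be extracted directly (a small sketch):
## Smallest k achieving the maximum validation accuracy
min(accuracy.df$k[accuracy.df$accuracy == max(accuracy.df$accuracy)])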
k = 14
library(FNN)
knn.pred1 <- knn(train.norm.df[, c(13, 14)], valid.norm.df[, c(13, 14)],
                 cl = as.factor(train.norm.df[, 15]), k = 14)
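To evaluate the k = 14 model rather than only produce predictions, a confusion matrix on the validation labels is a natural next step (a sketch):
## Confusion matrix for the k = 14 model on the validation set
confusionMatrix(knn.pred1, as.factor(valid.norm.df[, 15]))$table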
## Build the full predictor set: drop Seq., ID., M, R, F, Yes_Florence, No_Florence and R_F_M
full_p <- book[ , -c(1, 2, 4:6, 23:25)]
full_p$Florence <- as.factor(full_p$Florence)
full_p$Mcode <- as.factor(full_p$Mcode)
full_p$Rcode <- as.factor(full_p$Rcode)
full_p$Fcode <- as.factor(full_p$Fcode)
head(full_p)
## Gender FirstPurch ChildBks YouthBks CookBks DoItYBks RefBks ArtBks
## 1 1 22 0 1 1 0 0 0
## 2 0 10 0 0 0 0 0 0
## 3 1 56 2 1 2 0 1 0
## 4 1 2 0 0 0 0 0 0
## 5 1 10 0 0 0 0 0 0
## 6 1 12 0 0 0 0 0 0
## GeogBks ItalCook ItalAtlas ItalArt Florence Related.Purchase Mcode Rcode
## 1 0 0 0 0 0 0 5 4
## 2 0 0 0 0 0 0 4 3
## 3 1 1 0 0 0 2 4 4
## 4 0 0 0 0 0 0 5 1
## 5 0 0 0 0 0 0 5 3
## 6 0 0 0 0 0 0 4 2
## Fcode
## 1 2
## 2 2
## 3 3
## 4 1
## 5 1
## 6 2
Partition the data
set.seed(2)
train.index1 <- sample(1:nrow(full_p), 1800)  ## 1800 of the 4000 records for training
train1 <- full_p[train.index1, ]
valid1 <- full_p[-train.index1, ]
Logistic model
## full predictors
model1 <- glm(Florence ~ ., data = train1, family = binomial)
summary(model1)
##
## Call:
## glm(formula = Florence ~ ., family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1724 -0.4519 -0.3562 -0.2882 2.7468
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.267202 1.088881 -2.082 0.03733 *
## Gender -0.516859 0.177136 -2.918 0.00352 **
## FirstPurch 0.012012 0.010351 1.160 0.24587
## ChildBks -0.267054 0.119304 -2.238 0.02519 *
## YouthBks 0.161822 0.149912 1.079 0.28039
## CookBks -0.197110 0.111752 -1.764 0.07776 .
## DoItYBks -0.135837 0.147927 -0.918 0.35848
## RefBks -0.098318 0.166068 -0.592 0.55383
## ArtBks 0.445674 0.109848 4.057 4.97e-05 ***
## GeogBks 0.304471 0.114019 2.670 0.00758 **
## ItalCook 0.037408 0.231742 0.161 0.87176
## ItalAtlas -1.182218 0.737202 -1.604 0.10879
## ItalArt 0.524470 0.318070 1.649 0.09917 .
## Related.Purchase NA NA NA NA
## Mcode2 0.162851 1.127234 0.144 0.88513
## Mcode3 0.342766 1.071471 0.320 0.74904
## Mcode4 0.020087 1.060834 0.019 0.98489
## Mcode5 0.333982 1.056707 0.316 0.75196
## Rcode2 -0.001382 0.346826 -0.004 0.99682
## Rcode3 -0.379119 0.328162 -1.155 0.24798
## Rcode4 -0.609564 0.349083 -1.746 0.08078 .
## Fcode2 -0.419573 0.253630 -1.654 0.09807 .
## Fcode3 0.393632 0.316907 1.242 0.21420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.83 on 1799 degrees of freedom
## Residual deviance: 986.21 on 1778 degrees of freedom
## AIC: 1030.2
##
## Number of Fisher Scoring iterations: 6
## The predictors I judge most promising: ArtBks, ItalArt, ItalAtlas and ItalCook
model2 <- glm(Florence ~ ArtBks + ItalArt + ItalAtlas + ItalCook, data = train1, family = binomial)
summary(model2)
##
## Call:
## glm(formula = Florence ~ ArtBks + ItalArt + ItalAtlas + ItalCook,
## family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1224 -0.3831 -0.3812 -0.3812 2.6680
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.58539 0.10321 -25.050 < 2e-16 ***
## ArtBks 0.53283 0.10301 5.173 2.31e-07 ***
## ItalArt 0.68941 0.28778 2.396 0.0166 *
## ItalAtlas -0.94477 0.72712 -1.299 0.1938
## ItalCook 0.01012 0.21882 0.046 0.9631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.8 on 1799 degrees of freedom
## Residual deviance: 1033.9 on 1795 degrees of freedom
## AIC: 1043.9
##
## Number of Fisher Scoring iterations: 5
## RFM
model3 <- glm(Florence ~ Rcode + Fcode+ Mcode, data = train1, family = binomial)
summary(model3)
##
## Call:
## glm(formula = Florence ~ Rcode + Fcode + Mcode, family = binomial,
## data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6118 -0.4802 -0.3973 -0.3237 2.5907
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.44752 1.06812 -2.291 0.0219 *
## Rcode2 -0.02117 0.33409 -0.063 0.9495
## Rcode3 -0.27086 0.30609 -0.885 0.3762
## Rcode4 -0.52111 0.30044 -1.735 0.0828 .
## Fcode2 -0.40510 0.24849 -1.630 0.1030
## Fcode3 0.41584 0.21145 1.967 0.0492 *
## Mcode2 0.33725 1.11901 0.301 0.7631
## Mcode3 0.37698 1.06453 0.354 0.7232
## Mcode4 0.05348 1.05370 0.051 0.9595
## Mcode5 0.45084 1.04832 0.430 0.6672
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.8 on 1799 degrees of freedom
## Residual deviance: 1038.9 on 1790 degrees of freedom
## AIC: 1058.9
##
## Number of Fisher Scoring iterations: 5
Check Performance on the Validation Set
val1 <- predict(model1, valid1, type="response")
val2 <- predict(model2, valid1, type="response")
val3 <- predict(model3, valid1, type="response")
mydf <- cbind(valid1, val1, val2, val3)
Lift Chart
library(ROCR)
log_scores1 <- prediction(predictions = mydf$val1, labels = mydf$Florence)
log_scores2 <- prediction(predictions = mydf$val2, labels = mydf$Florence)
log_scores3 <- prediction(predictions = mydf$val3, labels = mydf$Florence)
par(mfrow = c(1, 3))
lift.obj1 <- performance(log_scores1, measure = "lift", x.measure = "rpp")
plot(lift.obj1,
     main = "Lift Chart: Model 1 (all predictors)",
     xlab = "% Population",
     ylab = "Lift",
     col = "blue")
abline(1, 0, col = "grey")
lift.obj2 <- performance(log_scores2, measure = "lift", x.measure = "rpp")
plot(lift.obj2,
     main = "Lift Chart: Model 2 (art/Italy books)",
     xlab = "% Population",
     ylab = "Lift",
     col = "red")
abline(1, 0, col = "grey")
lift.obj3 <- performance(log_scores3, measure = "lift", x.measure = "rpp")
plot(lift.obj3,
     main = "Lift Chart: Model 3 (RFM codes)",
     xlab = "% Population",
     ylab = "Lift",
     col = "green")
abline(1, 0, col = "grey")
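The three charts can also be summarized in one number each; ROCR's "auc" measure gives the area under the ROC curve per model (an optional comparison, my addition):
## Reset the plotting layout and compare models by AUC
par(mfrow = c(1, 1))
sapply(list(log_scores1, log_scores2, log_scores3),
       function(p) performance(p, measure = "auc")@y.values[[1]])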
Confusion Matrix
## Cutoff criterion for the campaign: a 30% likelihood of a purchase
library(caret)
mydf$response1 <- as.factor(ifelse(mydf$val1 > 0.3, 1, 0))
mydf$response2 <- as.factor(ifelse(mydf$val2 > 0.3, 1, 0))
mydf$response3 <- as.factor(ifelse(mydf$val3 > 0.3, 1, 0))
z1 <- confusionMatrix(mydf$response1, mydf$Florence)
z2 <- confusionMatrix(mydf$response2, mydf$Florence)
z3 <- confusionMatrix(mydf$response3, mydf$Florence)
z1$table; z2$table; z3$table
## Reference
## Prediction 0 1
## 0 1988 169
## 1 31 12
## Reference
## Prediction 0 1
## 0 2005 178
## 1 14 3
## Reference
## Prediction 0 1
## 0 2019 181
## 1 0 0
cat("The numbers of potential buyers in each model is", "181(169, 12), ", "181(178, 3), ", "181(181, 0)")
## The numbers of potential buyers in each model is 181(169, 12), 181(178, 3), 181(181, 0)
## Data Prep
gc <- read.csv("GermanCredit.csv")
names(gc)
## [1] "OBS." "CHK_ACCT" "DURATION"
## [4] "HISTORY" "NEW_CAR" "USED_CAR"
## [7] "FURNITURE" "RADIO.TV" "EDUCATION"
## [10] "RETRAINING" "AMOUNT" "SAV_ACCT"
## [13] "EMPLOYMENT" "INSTALL_RATE" "MALE_DIV"
## [16] "MALE_SINGLE" "MALE_MAR_or_WID" "CO.APPLICANT"
## [19] "GUARANTOR" "PRESENT_RESIDENT" "REAL_ESTATE"
## [22] "PROP_UNKN_NONE" "AGE" "OTHER_INSTALL"
## [25] "RENT" "OWN_RES" "NUM_CREDITS"
## [28] "JOB" "NUM_DEPENDENTS" "TELEPHONE"
## [31] "FOREIGN" "RESPONSE"
In my opinion, the predictors that matter most for a credit decision are those related to the applicant's ability to repay debt: CHK_ACCT, DURATION, HISTORY, AMOUNT, SAV_ACCT and REAL_ESTATE.
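Before modeling, this intuition can be eyeballed by comparing the means of these candidate predictors across the two response classes (a quick sketch; the choice of summary is my own):
## Compare candidate predictors across bad (0) and good (1) credit records
aggregate(cbind(CHK_ACCT, DURATION, HISTORY, AMOUNT, SAV_ACCT, REAL_ESTATE) ~ RESPONSE,
          data = gc, FUN = mean)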
Partitioning the Data (60/40)
gc$RESPONSE <- as.factor(gc$RESPONSE)
set.seed(99)
train.index2 <- sample(1:nrow(gc), 0.6*nrow(gc))
train2 <- gc[train.index2, ]
valid2 <- gc[-train.index2, ]
Logistic model
log_reg <- glm(RESPONSE ~ CHK_ACCT + DURATION + HISTORY + AMOUNT + SAV_ACCT + REAL_ESTATE, family = binomial, data = train2)
summary(log_reg)
##
## Call:
## glm(formula = RESPONSE ~ CHK_ACCT + DURATION + HISTORY + AMOUNT +
## SAV_ACCT + REAL_ESTATE, family = binomial, data = train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3213 -0.9427 0.4763 0.7910 1.7730
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.713e-01 3.440e-01 -1.370 0.170755
## CHK_ACCT 5.666e-01 8.605e-02 6.585 4.56e-11 ***
## DURATION -3.215e-02 1.075e-02 -2.989 0.002795 **
## HISTORY 3.251e-01 9.742e-02 3.337 0.000846 ***
## AMOUNT 3.377e-06 4.423e-05 0.076 0.939142
## SAV_ACCT 2.670e-01 7.257e-02 3.679 0.000234 ***
## REAL_ESTATE 5.089e-01 2.360e-01 2.156 0.031076 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 736.39 on 599 degrees of freedom
## Residual deviance: 613.56 on 593 degrees of freedom
## AIC: 627.56
##
## Number of Fisher Scoring iterations: 4
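Since logistic coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to read (an optional step):
## Odds ratios for the fitted logistic model
round(exp(coef(log_reg)), 3)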
CART
library(rpart)
library(rpart.plot)
set.seed(33)
cart.model <- rpart(RESPONSE ~ CHK_ACCT + DURATION + HISTORY + AMOUNT + SAV_ACCT + REAL_ESTATE, data = train2)
prp(cart.model,
faclen = 0,
fallen.leaves = TRUE,
shadow.col = "green",
extra = 2)
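As an optional check on the tree's size, rpart's built-in complexity-parameter table shows the cross-validated error at each split:
## Cross-validated error versus tree complexity
printcp(cart.model)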
Neural Networks
library(neuralnet) # for neuralnet(), nn model
library(nnet) # for class.ind()
library(caret) # for train(), tune parameters
## Dummify the RESPONSE
head(class.ind(train2$RESPONSE))
## 0 1
## [1,] 0 1
## [2,] 1 0
## [3,] 0 1
## [4,] 0 1
## [5,] 0 1
## [6,] 0 1
n_train <- cbind(train2, class.ind(train2$RESPONSE))
names(n_train)[33] <- "RESPONSE_NO"
names(n_train)[34] <- "RESPONSE_YES"
## Build Formula
formula.bpn <- RESPONSE_NO + RESPONSE_YES ~ CHK_ACCT + DURATION + HISTORY + AMOUNT + SAV_ACCT + REAL_ESTATE
## Training model
bpn <- neuralnet(formula = formula.bpn,
data = n_train,
hidden = c(2),
learningrate = 0.01,
threshold = 0.01,
stepmax = 5e5)
plot(bpn)
Check Performance on the Validation Set
## Logistic
val_1 <- predict(log_reg, valid2, type ="response")
## CART
val_2 <- predict(cart.model, valid2, type ="class")
valid2 <- cbind(valid2, val_1, val_2)
valid2$response1 <- as.factor(ifelse(valid2$val_1 > 0.5, 1, 0))
valid2$response2 <- as.factor(ifelse(valid2$val_2 == "0", 0, 1))
Confusion Matrix
library(caret)
## Logistic Confusion Matrix
z_con <- confusionMatrix(valid2$response1, valid2$RESPONSE)
z_con$table
## Reference
## Prediction 0 1
## 0 57 34
## 1 61 248
## CART Confusion Matrix
z1_con <- confusionMatrix(valid2$response2, valid2$RESPONSE)
z1_con$table
## Reference
## Prediction 0 1
## 0 57 36
## 1 61 246
## Neural network confusion matrix
temp_test <- subset(valid2, select = c("CHK_ACCT", "DURATION", "HISTORY", "AMOUNT", "SAV_ACCT", "REAL_ESTATE"))
nn.results <- compute(bpn, temp_test)
results <- data.frame("actual" = as.integer(valid2$RESPONSE), "prediction" = nn.results$net.result)
## Recode actual from factor codes (1, 2) back to class labels (0, 1)
results$actual[results$actual == 1] <- 0
results$actual[results$actual == 2] <- 1
roundedresults <- sapply(results, round, digits = 0)
roundedresultsdf <- data.frame(roundedresults)
with(roundedresultsdf, table(actual, prediction.2))
## prediction.2
## actual 1
## 0 118
## 1 282
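The table shows the network predicting class 1 for every validation case; its raw accuracy follows directly (a one-line sketch):
## Overall NN accuracy on the validation set
mean(roundedresultsdf$actual == roundedresultsdf$prediction.2)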
Net Profit for Each Algorithm
## Net profit per model: +100 for each good credit approved, -500 for each bad credit approved
log_net <- (248 * 100) + (61 * -500)
cart_net <- (246 * 100) + (61 * -500)
## The NN approves everyone: 282 good and 118 bad (from the table above)
nn_net <- (282 * 100) + (118 * -500)
cat("Net Profit for Logistic Model:", log_net, "\n", "Net Profit for CART:",
    cart_net, "\n", "Net Profit for NN:", nn_net)
## Net Profit for Logistic Model: -5700
## Net Profit for CART: -5900
## Net Profit for NN: -30800
Multiplying each confusion matrix by the profit table, every model loses money; none of the three classifies the target customers well.