options(width=100)
# Starter logistic regression analysis script for Rhapsody Churn Modeling
# Adapted from a Churn Exercise script file
library(ggplot2) # for improved plots -- qplot and scatterplot matrix
library(GGally) # for scatterplot matrix
library(RcmdrMisc) # for numSummary, stepwise
library(ROCR) # for producing ROC and other prediction performance plots
library(caret) # for confusion matrix analysis
library(plyr) # for mapvalues recoding function
library(dplyr) # select function for subsetting data frames
# Suggested usage:
# Copy the required data file, Rhapsody2016CPTOrderSummary-010.csv into a
# new project folder <folder>
# Change the working directory (setwd below) to point to the project <folder>
# Run the entire script as a block
# Set up working directory for file locations
# and load required libraries
ChurnDir <- "D:/MBA/SecondTerm/Data/exam"
ChurnData <- "Rhapsody2017CPTOrderSummary-010.csv"
ChurnPrepped <- "ChurnFullPrepped-010.RData"
ChurnTrainAnalysis <- "ChurnTrainAnalysis-010.csv"
ChurnTestAnalysis <- "ChurnTestAnalysis-010.csv"
# DateFormat <- "%m/%d/%Y"
DateFormat <- "%Y-%m-%d"
########### BEGIN DATA PREPARATION
setwd(ChurnDir)
# DATA ENTRY AND SUMMARIZATION
# Read full data set, list -- names, structure, head,
ChurnFull <- read.csv(ChurnData)
AllRows <- nrow(ChurnFull)
str(ChurnFull, list.len = 999)
## 'data.frame': 5156 obs. of 109 variables:
## $ UserPartyId : num 2273929 2637516 3268169 5027183 5580324 ...
## $ UserOrigSignupDt : Factor w/ 1983 levels "2002-06-07","2002-08-06",..: 51 599 460 320 1358 750 58 1749 678 74 ...
## $ SubscriberServiceId : num 1.76e+08 1.33e+08 3.11e+08 8.36e+07 3.07e+08 ...
## $ FirstBillDt : Factor w/ 1977 levels "2002-08-06","2002-10-02",..: 49 603 465 311 1359 749 58 1721 680 71 ...
## $ LastPlayDt : Factor w/ 602 levels "","2005-01-17",..: 598 556 602 602 1 574 574 175 602 602 ...
## $ SignUpDt : Factor w/ 1580 levels "2005-05-05","2005-05-09",..: 394 266 987 91 948 353 1303 1346 310 1034 ...
## $ StopRequestDt : Factor w/ 87 levels "2010-08-05","2011-01-19",..: 87 87 87 87 87 87 87 87 87 87 ...
## $ CancelDt : Factor w/ 76 levels "2013-11-02","2013-11-04",..: 76 69 76 76 76 76 76 76 76 76 ...
## $ CancelType : Factor w/ 4 levels "Current Sub",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ CancelReason : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 4 1 1 1 1 1 1 1 1 ...
## $ ChannelGroupName : Factor w/ 15 levels "Cable Partners",..: 13 8 2 2 8 6 14 1 6 10 ...
## $ ChannelName : Factor w/ 75 levels "Affiliate","Android Mobile",..: 7 56 57 57 56 2 67 9 22 25 ...
## $ CobrandCode : num 40132 40134 40134 40134 40134 ...
## $ CobrandName : Factor w/ 23 levels "ATT","AudioGalaxy",..: 3 18 18 18 18 18 11 4 18 10 ...
## $ OriginCode : Factor w/ 210 levels "3x5_rhapcom",..: 23 154 30 144 70 70 69 31 166 67 ...
## $ pcode : Factor w/ 160 levels ".","30sec","30sec_hp",..: 1 108 136 136 127 52 1 23 63 1 ...
## $ cpath : Factor w/ 122 levels ".","0","12_19Bpage",..: 1 1 81 81 61 14 1 61 14 1 ...
## $ rsrc : Factor w/ 569 levels ".","0","0508",..: 1 1 262 250 411 70 1 541 371 1 ...
## $ opage : Factor w/ 155 levels "","-","404__404",..: 1 1 68 68 1 48 1 1 1 1 ...
## $ ServiceTier : Factor w/ 4 levels "PREM","RHAP",..: 1 4 4 4 1 4 1 4 4 4 ...
## $ MonthsPerBill : num 1 1 1 1 1 1 3 1 1 1 ...
## $ BillingZipCode : num 75098 52402 60542 80227 79424 ...
## $ BillingState : Factor w/ 56 levels "","7th fl","AE",..: 48 16 18 9 48 48 24 12 8 47 ...
## $ EcommOrderId : num 3.68e+08 3.67e+08 3.68e+08 3.67e+08 3.68e+08 ...
## $ OrderDate : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 25 1 11 7 13 24 14 16 27 3 ...
## $ PaymentTypeName : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTypeCode : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentSubTypeCode : Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 6 6 6 6 6 2 2 ...
## $ CPTCardBrand : Factor w/ 3 levels "MC","None","VI": 3 3 3 3 3 3 3 3 2 2 ...
## $ CPTCardType : Factor w/ 6 levels "0 - None","1 - Credit",..: 2 2 4 2 2 4 2 2 1 1 ...
## $ CPTBIN : num 410413 414720 443045 438857 441712 ...
## $ CPTTransactionDivision : num 193099 193099 193099 193099 193099 ...
## $ PaymentTxTypeCode : Factor w/ 1 level "BC": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTxStatus : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PmtStatType : Factor w/ 1 level "COMPLETE": 1 1 1 1 1 1 1 1 1 1 ...
## $ PT_status : Factor w/ 18 levels "00","04","05",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AVS_status : Factor w/ 12 levels "3","6","7","9",..: 7 7 7 7 7 6 7 7 5 5 ...
## $ SEC_status : Factor w/ 2 levels "S","UNK": 2 2 2 2 2 2 2 2 2 2 ...
## $ RiskCode : Factor w/ 1 level "UNK": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentProcessorCode : Factor w/ 4 levels "AMZ","PPL","PT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentProcessorDesc : Factor w/ 4 levels "","Amazon FPS",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentDivisionCode : Factor w/ 4 levels "193099","203217",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentDivisionDesc : Factor w/ 5 levels "","Amazon FPS non-recurring default",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ PaymentDivisionUseType : Factor w/ 3 levels "","N","R": 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentDivisionRiskType: Factor w/ 2 levels "","NORMALRISK": 2 2 2 2 2 2 2 2 2 2 ...
## $ Country : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
## $ Currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Amount : num 10.8 15 15 15 10.8 ...
## $ OrderStatus : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Churn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PayAttempts : num 1 8 1 1 1 1 1 1 1 4 ...
## $ OrdSucces : num 1 1 1 1 1 1 1 1 1 1 ...
## $ OrdSftDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PaySuccess : num 1 1 1 1 1 1 1 1 1 1 ...
## $ PaySftDec : num 0 1 0 0 0 0 0 0 0 1 ...
## $ PayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat05 : num 0 1 0 0 0 0 0 0 0 1 ...
## $ PTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubOrderCount : num 29 24 14 26 19 27 2 5 24 16 ...
## $ SubPayAttempts : num 29 26 15 26 19 27 2 5 24 16 ...
## $ SubAmount : num 313 360 210 390 205 ...
## $ SubFirstOrderDate : Factor w/ 911 levels "2011-04-21","2011-04-22",..: 5 4 328 48 275 33 626 717 37 375 ...
## $ SubLastOrderDate : Factor w/ 139 levels "2012-10-29","2012-11-03",..: 104 80 90 86 92 103 16 95 76 82 ...
## $ SubChurns : num 0 0 1 0 0 1 0 0 0 1 ...
## $ SubOrdSucces : num 29 24 13 26 19 26 2 5 24 15 ...
## $ SubOrdSftDec : num 0 0 1 0 0 1 0 0 0 1 ...
## $ SubOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPaySuccess : num 29 24 13 26 19 26 2 5 24 15 ...
## $ SubPaySftDec : num 0 2 2 0 0 1 0 0 0 1 ...
## $ SubPayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat05 : num 0 2 0 0 0 1 0 0 0 1 ...
## $ SubPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat56 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ SubPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat12 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ SubPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubCount : num 1 1 2 1 1 1 2 3 1 3 ...
## $ UserOrderCount : num 29 24 23 26 19 27 7 25 24 22 ...
## $ UserPayAttempts : num 29 26 25 26 19 27 10 39 24 23 ...
## $ UserAmount : num 313 360 300 390 205 ...
## $ UserFirstOrderDate : Factor w/ 888 levels "2011-04-21","2011-04-22",..: 5 4 30 48 277 33 18 16 37 12 ...
## $ UserLastOrderDate : Factor w/ 191 levels "2011-07-12","2011-10-28",..: 156 132 142 138 144 155 50 147 128 134 ...
## $ UserChurns : num 0 0 1 0 0 1 0 1 0 1 ...
## $ UserOrdSucces : num 29 24 22 26 19 26 7 24 24 21 ...
## $ UserOrdSftDec : num 0 0 1 0 0 1 0 1 0 1 ...
## $ UserOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPaySuccess : num 29 24 22 26 19 26 7 24 24 21 ...
## $ UserPaySftDec : num 0 2 2 0 0 1 1 2 0 1 ...
## $ UserPayHrdDec : num 0 0 1 0 0 0 0 0 0 1 ...
## $ UserPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat05 : num 0 2 0 0 0 1 0 2 0 1 ...
## $ UserPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat56 : num 0 0 1 0 0 0 1 0 0 0 ...
## $ UserPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat12 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ UserPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat41 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ UserPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
head(ChurnFull, n = 3)
## UserPartyId UserOrigSignupDt SubscriberServiceId FirstBillDt LastPlayDt SignUpDt StopRequestDt
## 1 2273929 2004-05-31 176406097 2004-06-15 2014-01-18 2010-04-24 2999-01-01
## 2 2637516 2009-03-08 132608865 2009-03-23 2013-12-07 2009-03-08 2999-01-01
## 3 3268169 2008-05-13 311201787 2008-05-28 2014-01-22 2012-03-08 2999-01-01
## CancelDt CancelType CancelReason ChannelGroupName ChannelName CobrandCode
## 1 2999-01-01 Current Sub Retail Partners BBDMS 40132
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE MP3 Player Partners Sandisk 40134
## 3 2999-01-01 Current Sub Direct Marketing Search 40134
## CobrandName OriginCode pcode cpath rsrc opage ServiceTier
## 1 Best Buy bestbuy . . . PREM
## 2 RealRhapsody sandiskfuze rn . . RTG
## 3 RealRhapsody cj srchrv ppcse gg_ru_rhp_14 offer_rhap_rhapDMMMainPage RTG
## MonthsPerBill BillingZipCode BillingState EcommOrderId OrderDate PaymentTypeName PaymentTypeCode
## 1 1 75098 TX 368004554 2013-11-25 Credit CC
## 2 1 52402 IA 367205302 2013-11-01 Credit CC
## 3 1 60542 IL 367578873 2013-11-11 Credit CC
## PaymentSubTypeCode CPTCardBrand CPTCardType CPTBIN CPTTransactionDivision PaymentTxTypeCode
## 1 VI VI 1 - Credit 410413 193099 BC
## 2 VI VI 1 - Credit 414720 193099 BC
## 3 VI VI 3 - Check 443045 193099 BC
## PaymentTxStatus PmtStatType PT_status AVS_status SEC_status RiskCode PaymentProcessorCode
## 1 SUCCESS COMPLETE 00 C UNK UNK PT
## 2 SUCCESS COMPLETE 00 C UNK UNK PT
## 3 SUCCESS COMPLETE 00 C UNK UNK PT
## PaymentProcessorDesc PaymentDivisionCode PaymentDivisionDesc PaymentDivisionUseType
## 1 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## 2 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## 3 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## PaymentDivisionRiskType Country Currency Amount OrderStatus Churn PayAttempts OrdSucces OrdSftDec
## 1 NORMALRISK US USD 10.81 SUCCESS 0 1 1 0
## 2 NORMALRISK US USD 14.99 SUCCESS 0 8 1 0
## 3 NORMALRISK US USD 14.99 SUCCESS 0 1 1 0
## OrdHrdDec PaySuccess PaySftDec PayHrdDec PTStat89 PTStat05 PTStat14 PTStat56 PTStat52 PTStat12
## 1 0 1 0 0 0 0 0 0 0 0
## 2 0 1 1 0 0 1 0 0 0 0
## 3 0 1 0 0 0 0 0 0 0 0
## PTStat33 PTStat41 PTStat04 SubOrderCount SubPayAttempts SubAmount SubFirstOrderDate
## 1 0 0 0 29 29 313.49 2011-04-25
## 2 0 0 0 24 26 359.76 2011-04-24
## 3 0 0 0 14 15 209.86 2012-04-09
## SubLastOrderDate SubChurns SubOrdSucces SubOrdSftDec SubOrdHrdDec SubPaySuccess SubPaySftDec
## 1 2013-10-25 0 29 0 0 29 0
## 2 2013-10-01 0 24 0 0 24 2
## 3 2013-10-11 1 13 1 0 13 2
## SubPayHrdDec SubPTStat89 SubPTStat05 SubPTStat14 SubPTStat56 SubPTStat52 SubPTStat12 SubPTStat33
## 1 0 0 0 0 0 0 0 0
## 2 0 0 2 0 0 0 0 0
## 3 0 0 0 0 1 0 1 0
## SubPTStat41 SubPTStat04 SubCount UserOrderCount UserPayAttempts UserAmount UserFirstOrderDate
## 1 0 0 1 29 29 313.49 2011-04-25
## 2 0 0 1 24 26 359.76 2011-04-24
## 3 0 0 2 23 25 299.77 2011-05-20
## UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess
## 1 2013-10-25 0 29 0 0 29
## 2 2013-10-01 0 24 0 0 24
## 3 2013-10-11 1 22 1 0 22
## UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52
## 1 0 0 0 0 0 0 0
## 2 2 0 0 2 0 0 0
## 3 2 1 0 0 0 1 0
## UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## 1 0 0 0 0
## 2 0 0 0 0
## 3 1 0 1 0
# Data Understanding
# A lot of variables!
# Perhaps use pivot tables and/or Tableau to build on earlier work
# Clean up card brand / card type data, add to database
# Look at distribution of card brand and type in CPT data
table(ChurnFull$CPTCardBrand, ChurnFull$CPTCardType)
##
## 0 - None 1 - Credit 2 - Debit 3 - Check 5 - Pre-Paid 7 - European Deferred Debit Card
## MC 0 583 605 0 52 0
## None 830 0 0 0 0 0
## VI 0 859 0 2095 131 1
# Set CardBrand
# Set CardBrand defailt to PaymentSubType value from original Order Summary
# Replace PaymentSubType value with CPTCardBrand if present (not None)
ChurnFull$CardBrand <- ChurnFull$PaymentSubTypeCode
ChurnFull$CardBrand[ChurnFull$CPTCardBrand != "None"] <-
ChurnFull$CPTCardBrand[ChurnFull$CPTCardBrand != "None"]
# Clean up CardType
# Remove leading numbers from codes
# Map Check to Debit
# Fill in CardType (as Credit) for credit-only CardBrands -- AX, DI, PAYPAL
ChurnFull$CardType <- mapvalues(ChurnFull$CPTCardType,
c("0 - None", "1 - Credit", "2 - Debit", "3 - Check",
"5 - Pre-Paid", "7 - European Deferred Debit Card"),
c("None", "Credit", "Debit", "Debit", "Prepaid", "Debit"))
ChurnFull$CardType[ChurnFull$CardBrand == "AX"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "DI"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "PAYPAL"] <- "Credit"
# Check newly created CardBrand and CardType
table(ChurnFull$CardBrand, ChurnFull$CardType)
##
## None Credit Debit Prepaid
## AMZ_FPS 5 0 0 0
## AX 0 503 0 0
## DI 0 138 0 0
## MC 5 583 605 52
## PAYPAL 0 162 0 0
## VI 17 859 2096 131
# Set reference levels for CardBrand and CardType
# Sets VISA and Credit as the omitted factor levels in models
# CardBrand and CardType estimates are then interpretted relative to
# these reference levels
ChurnFull$CardBrand <- relevel(ChurnFull$CardBrand, ref = "VI")
ChurnFull$CardType <- relevel(ChurnFull$CardType, ref = "Credit")
# Add User history metrics
# Convert date strings from factor to date type
# Tenure, OrderDensity, Payment Friction, ChurnDensity
ChurnFull$UserOrigSignupDate <-
as.Date(ChurnFull$UserOrigSignupDt, format = DateFormat)
ChurnFull$FirstBillDate <-
as.Date(ChurnFull$FirstBillDt, format = DateFormat)
ChurnFull$LastPlayDate <-
as.Date(ChurnFull$LastPlayDt, format = DateFormat)
ChurnFull$SignUpDate <-
as.Date(ChurnFull$SignUpDt, format = DateFormat)
ChurnFull$StopRequestDate <-
as.Date(ChurnFull$StopRequestDt, format = DateFormat)
ChurnFull$CancelDate <-
as.Date(ChurnFull$CancelDt, format = DateFormat)
ChurnFull$OrdDate <-
as.Date(ChurnFull$OrderDate, format = DateFormat)
ChurnFull$UserFirstOrdDate <-
as.Date(ChurnFull$UserFirstOrderDate, format = DateFormat)
ChurnFull$UserLastOrdDate <-
as.Date(ChurnFull$UserLastOrderDate, format = DateFormat)
# Calculated date features
ChurnFull$UserTenure <-
as.numeric(round((ChurnFull$OrdDate - ChurnFull$UserOrigSignupDate)/365.25*12 + 1))
ChurnFull$UserWindow <-
as.numeric(round((ChurnFull$UserLastOrdDate -
ChurnFull$UserFirstOrdDate)/365.25*12 + 1))
ChurnFull$USuccessDensity <-
(ChurnFull$UserPaySuccess * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$USDDensity <-
(ChurnFull$UserPaySftDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UHDDensity <-
(ChurnFull$UserPayHrdDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UChurnDensity <-
(ChurnFull$UserChurns * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
# Randomly split observations 80/20 into training and test samples
# Generate sample of indices
set.seed(999999999)
TrainSize <- round(0.8 * AllRows)
TrainIdx <- sample(AllRows, TrainSize)
# Label observations as Train or Test
# Add Train/Test labels to data base
TrainTest <- rep("Test", AllRows)
TrainTest[TrainIdx] <- "Train"
ChurnFull$TrainTest <- TrainTest
# Save prepped data to .Rdata file for subsequent analysis
save("ChurnFull", file = ChurnPrepped)
########### END OF DATA PREPARATION
########### BEGIN ANALYSIS
# Select (using dplyr) analysis subset of candidate model variables/features
ChurnAnalysis <- select(ChurnFull,
TrainTest, Churn, UserOrigSignupDate,
FirstBillDate, LastPlayDate,
SignUpDate, StopRequestDate, CancelDate, CancelType,
CancelReason, ServiceTier:CPTCardType,
CardBrand, CardType, Amount,
SubCount:UserPTStat04,
UserTenure, UserWindow,
USuccessDensity, USDDensity, UHDDensity, UChurnDensity
)
str(ChurnAnalysis, list.len = 999)
## 'data.frame': 5156 obs. of 52 variables:
## $ TrainTest : chr "Train" "Train" "Test" "Test" ...
## $ Churn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserOrigSignupDate: Date, format: "2004-05-31" "2009-03-08" "2008-05-13" ...
## $ FirstBillDate : Date, format: "2004-06-15" "2009-03-23" "2008-05-28" ...
## $ LastPlayDate : Date, format: "2014-01-18" "2013-12-07" "2014-01-22" ...
## $ SignUpDate : Date, format: "2010-04-24" "2009-03-08" "2012-03-08" ...
## $ StopRequestDate : Date, format: "2999-01-01" "2999-01-01" "2999-01-01" ...
## $ CancelDate : Date, format: "2999-01-01" "2014-01-16" "2999-01-01" ...
## $ CancelType : Factor w/ 4 levels "Current Sub",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ CancelReason : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 4 1 1 1 1 1 1 1 1 ...
## $ ServiceTier : Factor w/ 4 levels "PREM","RHAP",..: 1 4 4 4 1 4 1 4 4 4 ...
## $ MonthsPerBill : num 1 1 1 1 1 1 3 1 1 1 ...
## $ BillingZipCode : num 75098 52402 60542 80227 79424 ...
## $ BillingState : Factor w/ 56 levels "","7th fl","AE",..: 48 16 18 9 48 48 24 12 8 47 ...
## $ EcommOrderId : num 3.68e+08 3.67e+08 3.68e+08 3.67e+08 3.68e+08 ...
## $ OrderDate : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 25 1 11 7 13 24 14 16 27 3 ...
## $ PaymentTypeName : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTypeCode : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentSubTypeCode: Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 6 6 6 6 6 2 2 ...
## $ CPTCardBrand : Factor w/ 3 levels "MC","None","VI": 3 3 3 3 3 3 3 3 2 2 ...
## $ CPTCardType : Factor w/ 6 levels "0 - None","1 - Credit",..: 2 2 4 2 2 4 2 2 1 1 ...
## $ CardBrand : Factor w/ 6 levels "VI","AMZ_FPS",..: 1 1 1 1 1 1 1 1 3 3 ...
## $ CardType : Factor w/ 4 levels "Credit","None",..: 1 1 3 1 1 3 1 1 1 1 ...
## $ Amount : num 10.8 15 15 15 10.8 ...
## $ SubCount : num 1 1 2 1 1 1 2 3 1 3 ...
## $ UserOrderCount : num 29 24 23 26 19 27 7 25 24 22 ...
## $ UserPayAttempts : num 29 26 25 26 19 27 10 39 24 23 ...
## $ UserAmount : num 313 360 300 390 205 ...
## $ UserFirstOrderDate: Factor w/ 888 levels "2011-04-21","2011-04-22",..: 5 4 30 48 277 33 18 16 37 12 ...
## $ UserLastOrderDate : Factor w/ 191 levels "2011-07-12","2011-10-28",..: 156 132 142 138 144 155 50 147 128 134 ...
## $ UserChurns : num 0 0 1 0 0 1 0 1 0 1 ...
## $ UserOrdSucces : num 29 24 22 26 19 26 7 24 24 21 ...
## $ UserOrdSftDec : num 0 0 1 0 0 1 0 1 0 1 ...
## $ UserOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPaySuccess : num 29 24 22 26 19 26 7 24 24 21 ...
## $ UserPaySftDec : num 0 2 2 0 0 1 1 2 0 1 ...
## $ UserPayHrdDec : num 0 0 1 0 0 0 0 0 0 1 ...
## $ UserPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat05 : num 0 2 0 0 0 1 0 2 0 1 ...
## $ UserPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat56 : num 0 0 1 0 0 0 1 0 0 0 ...
## $ UserPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat12 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ UserPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat41 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ UserPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserTenure : num 115 57 67 78 22 47 113 9 51 110 ...
## $ UserWindow : num 31 30 30 29 21 30 25 30 29 30 ...
## $ USuccessDensity : num 0.935 0.8 0.733 0.897 0.905 ...
## $ USDDensity : num 0 0.0667 0.0667 0 0 ...
## $ UHDDensity : num 0 0 0.0333 0 0 ...
## $ UChurnDensity : num 0 0 0.0333 0 0 ...
head(ChurnAnalysis, n = 3)
## TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1 Train 0 2004-05-31 2004-06-15 2014-01-18 2010-04-24 2999-01-01
## 2 Train 0 2009-03-08 2009-03-23 2013-12-07 2009-03-08 2999-01-01
## 3 Test 0 2008-05-13 2008-05-28 2014-01-22 2012-03-08 2999-01-01
## CancelDate CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode
## 1 2999-01-01 Current Sub PREM 1 75098
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE RTG 1 52402
## 3 2999-01-01 Current Sub RTG 1 60542
## BillingState EcommOrderId OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode
## 1 TX 368004554 2013-11-25 Credit CC VI
## 2 IA 367205302 2013-11-01 Credit CC VI
## 3 IL 367578873 2013-11-11 Credit CC VI
## CPTCardBrand CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts
## 1 VI 1 - Credit VI Credit 10.81 1 29 29
## 2 VI 1 - Credit VI Credit 14.99 1 24 26
## 3 VI 3 - Check VI Debit 14.99 2 23 25
## UserAmount UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec
## 1 313.49 2011-04-25 2013-10-25 0 29 0
## 2 359.76 2011-04-24 2013-10-01 0 24 0
## 3 299.77 2011-05-20 2013-10-11 1 22 1
## UserOrdHrdDec UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14
## 1 0 29 0 0 0 0 0
## 2 0 24 2 0 0 2 0
## 3 0 22 2 1 0 0 0
## UserPTStat56 UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure
## 1 0 0 0 0 0 0 115
## 2 0 0 0 0 0 0 57
## 3 1 0 1 0 1 0 67
## UserWindow USuccessDensity USDDensity UHDDensity UChurnDensity
## 1 31 0.9354839 0.00000000 0.00000000 0.00000000
## 2 30 0.8000000 0.06666667 0.00000000 0.00000000
## 3 30 0.7333333 0.06666667 0.03333333 0.03333333
# Check data summaries
summary(ChurnAnalysis)
## TrainTest Churn UserOrigSignupDate FirstBillDate
## Length:5156 Min. :0.00000 Min. :2002-06-07 Min. :2002-08-06
## Class :character 1st Qu.:0.00000 1st Qu.:2010-07-28 1st Qu.:2010-08-11
## Mode :character Median :0.00000 Median :2011-12-01 Median :2011-12-02
## Mean :0.04325 Mean :2011-06-03 Mean :2018-09-18
## 3rd Qu.:0.00000 3rd Qu.:2013-01-28 3rd Qu.:2013-02-06
## Max. :1.00000 Max. :2013-11-12 Max. :2999-01-01
##
## LastPlayDate SignUpDate StopRequestDate CancelDate
## Min. :2005-01-17 Min. :2005-05-05 Min. :2010-08-05 Min. :2013-11-02
## 1st Qu.:2013-11-23 1st Qu.:2011-05-25 1st Qu.:2999-01-01 1st Qu.:2999-01-01
## Median :2014-01-15 Median :2012-05-26 Median :2999-01-01 Median :2999-01-01
## Mean :2013-10-24 Mean :2012-02-04 Mean :2957-12-03 Mean :2902-09-18
## 3rd Qu.:2014-01-21 3rd Qu.:2013-03-14 3rd Qu.:2999-01-01 3rd Qu.:2999-01-01
## Max. :2014-01-22 Max. :2013-11-12 Max. :2999-01-01 Max. :2999-01-01
## NA's :426
## CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode
## Current Sub:4652 :4652 PREM:3318 Min. : 1.00 Min. : 664
## Involuntary: 259 MIGRATION_PAID_TO_PAID : 75 RHAP: 95 1st Qu.: 1.00 1st Qu.:27525
## Migration : 78 NON_BILLABLE_HARD_DECLINE: 47 RR : 12 Median : 1.00 Median :49051
## Voluntary : 167 NON_BILLABLE_SOFT_DECLINE: 212 RTG :1731 Mean : 1.07 Mean :52297
## NON_TIER_TRANSITION : 3 3rd Qu.: 1.00 3rd Qu.:79101
## NONE : 167 Max. :12.00 Max. :99999
##
## BillingState EcommOrderId OrderDate PaymentTypeName PaymentTypeCode
## CA : 544 Min. :367201731 2013-11-01: 467 Credit:4989 CC:4989
## TX : 467 1st Qu.:367420226 2013-11-02: 208 Flexib: 167 FP: 167
## : 342 Median :367665204 2013-11-03: 200
## FL : 298 Mean :367662576 2013-11-10: 185
## NY : 288 3rd Qu.:367909473 2013-11-14: 183
## PA : 213 Max. :368164542 2013-11-27: 181
## (Other):3004 (Other) :3732
## PaymentSubTypeCode CPTCardBrand CPTCardType CardBrand
## AMZ_FPS: 5 MC :1240 0 - None : 830 VI :3103
## AX : 503 None: 830 1 - Credit :1442 AMZ_FPS: 5
## DI : 138 VI :3086 2 - Debit : 605 AX : 503
## MC :1245 3 - Check :2095 DI : 138
## PAYPAL : 162 5 - Pre-Paid : 183 MC :1245
## VI :3103 7 - European Deferred Debit Card: 1 PAYPAL : 162
##
## CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## Credit :2245 Min. : 0.00 Min. :0.00 Min. : 0.00 Min. : 0.00 Min. : 0.0
## None : 27 1st Qu.: 9.99 1st Qu.:1.00 1st Qu.: 9.00 1st Qu.: 11.00 1st Qu.: 99.9
## Debit :2701 Median : 9.99 Median :1.00 Median :20.00 Median : 22.00 Median :229.8
## Prepaid: 183 Mean : 12.15 Mean :1.37 Mean :17.36 Mean : 20.08 Mean :211.6
## 3rd Qu.: 14.99 3rd Qu.:2.00 3rd Qu.:25.00 3rd Qu.: 27.00 3rd Qu.:307.4
## Max. :159.88 Max. :8.00 Max. :31.00 Max. :119.00 Max. :514.2
##
## UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec
## 2011-05-01: 197 2013-10-01: 425 Min. :0.0000 Min. : 0.00 Min. :0.0000
## 2011-05-17: 78 2013-10-02: 160 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.:0.0000
## 2011-05-03: 73 2013-10-03: 158 Median :0.0000 Median :20.00 Median :0.0000
## 2011-04-26: 71 2013-10-10: 151 Mean :0.2289 Mean :17.13 Mean :0.1903
## 2011-05-05: 69 2013-10-27: 150 3rd Qu.:0.0000 3rd Qu.:25.00 3rd Qu.:0.0000
## 2011-05-04: 68 2013-10-06: 149 Max. :8.0000 Max. :31.00 Max. :8.0000
## (Other) :4600 (Other) :3963
## UserOrdHrdDec UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89
## Min. :0.0000 Min. : 0.00 Min. : 0.0000 Min. :0.0000 Min. : 0.0000
## 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.0000
## Median :0.0000 Median :20.00 Median : 0.0000 Median :0.0000 Median : 0.0000
## Mean :0.0386 Mean :17.13 Mean : 0.7335 Mean :0.1098 Mean : 0.3765
## 3rd Qu.:0.0000 3rd Qu.:25.00 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.: 0.0000
## Max. :3.0000 Max. :31.00 Max. :16.0000 Max. :9.0000 Max. :16.0000
##
## UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52 UserPTStat12
## Min. : 0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 0.0000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean : 0.2411 Mean :0.06633 Mean :0.02948 Mean :0.01687 Mean :0.01552
## 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :10.0000 Max. :3.00000 Max. :4.00000 Max. :4.00000 Max. :2.00000
##
## UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. : 1.00 Min. : 1.00
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 10.00 1st Qu.:12.00
## Median :0.00000 Median :0.00000 Median :0.00000 Median : 24.00 Median :23.00
## Mean :0.01901 Mean :0.06594 Mean :0.02036 Mean : 30.41 Mean :20.76
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 41.00 3rd Qu.:30.00
## Max. :2.00000 Max. :9.00000 Max. :2.00000 Max. :138.00 Max. :31.00
##
## USuccessDensity USDDensity UHDDensity UChurnDensity
## Min. : 0.0000 Min. : 0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.: 0.7931 1st Qu.: 0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Median : 0.8621 Median : 0.00000 Median :0.000000 Median :0.00000
## Mean : 0.8435 Mean : 0.05069 Mean :0.005472 Mean :0.01286
## 3rd Qu.: 0.9091 3rd Qu.: 0.03704 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :12.0000 Max. :12.00000 Max. :0.666667 Max. :1.00000
##
# Split analysis database into Train and Test
ChurnTrain <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Train",]
ChurnTest <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Test",]
TrainRows <- nrow(ChurnTrain)
TestRows <- nrow(ChurnTest)
TrainRows
## [1] 4125
TestRows
## [1] 1031
# ANALYSIS AND MODELING OF TRAINING DATA
# Estimate card brand and card type effects
BrandModel <- glm(Churn ~ CardBrand,
family = binomial(logit), data = ChurnTrain)
summary(BrandModel)
##
## Call:
## glm(formula = Churn ~ CardBrand, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3317 -0.3317 -0.3317 -0.3022 3.2636
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.87254 0.08947 -32.107 < 2e-16 ***
## CardBrandAMZ_FPS -11.69353 441.37170 -0.026 0.978864
## CardBrandAX -2.44803 0.71446 -3.426 0.000612 ***
## CardBrandDI -1.75244 1.00886 -1.737 0.082381 .
## CardBrandMC -0.19085 0.17682 -1.079 0.280420
## CardBrandPAYPAL -0.37006 0.46456 -0.797 0.425700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1510.2 on 4124 degrees of freedom
## Residual deviance: 1476.6 on 4119 degrees of freedom
## AIC: 1488.6
##
## Number of Fisher Scoring iterations: 13
BrandTypeModel <- glm(Churn ~ CardBrand + CardType,
family = binomial(logit), data = ChurnTrain)
summary(BrandTypeModel)
##
## Call:
## glm(formula = Churn ~ CardBrand + CardType, family = binomial(logit),
## data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9855 -0.3140 -0.3040 -0.1633 3.2636
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.24455 0.26273 -16.155 < 2e-16 ***
## CardBrandAMZ_FPS -14.09630 441.37192 -0.032 0.9745
## CardBrandAX -1.07602 0.75596 -1.423 0.1546
## CardBrandDI -0.38043 1.03867 -0.366 0.7142
## CardBrandMC -0.06651 0.18958 -0.351 0.7257
## CardBrandPAYPAL 1.00195 0.52616 1.904 0.0569 .
## CardTypeNone 3.77478 0.51602 7.315 2.57e-13 ***
## CardTypeDebit 1.25944 0.27363 4.603 4.17e-06 ***
## CardTypePrepaid 3.69671 0.30729 12.030 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1510.2 on 4124 degrees of freedom
## Residual deviance: 1281.3 on 4116 degrees of freedom
## AIC: 1299.3
##
## Number of Fisher Scoring iterations: 13
# Stepwise feature selection
BrandTypeRedModel <- step(BrandTypeModel, k = log(TrainRows)) # based on BIC
## Start: AIC=1356.25
## Churn ~ CardBrand + CardType
##
## Df Deviance AIC
## - CardBrand 5 1292.0 1325.3
## <none> 1281.3 1356.2
## - CardType 3 1476.6 1526.6
##
## Step: AIC=1325.27
## Churn ~ CardType
##
## Df Deviance AIC
## <none> 1292.0 1325.3
## - CardType 3 1510.2 1518.5
summary(BrandTypeRedModel)
##
## Call:
## glm(formula = Churn ~ CardType, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9476 -0.3118 -0.3118 -0.1634 2.9402
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.3091 0.2055 -20.970 < 2e-16 ***
## CardTypeNone 3.5553 0.4754 7.478 7.55e-14 ***
## CardTypeDebit 1.3095 0.2292 5.714 1.10e-08 ***
## CardTypePrepaid 3.7411 0.2701 13.852 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1510.2 on 4124 degrees of freedom
## Residual deviance: 1292.0 on 4121 degrees of freedom
## AIC: 1300
##
## Number of Fisher Scoring iterations: 7
# Estimate card brand, card type and User effects
OrderUserModel <- glm(Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
USDDensity + UHDDensity + UChurnDensity,
family = binomial(logit), data = ChurnTrain)
summary(OrderUserModel)
##
## Call:
## glm(formula = Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity + UHDDensity + UChurnDensity, family = binomial(logit),
## data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4329 -0.3066 -0.2119 -0.1282 5.5596
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.668675 0.339682 -7.856 3.95e-15 ***
## CardBrandAMZ_FPS -13.810996 440.422605 -0.031 0.974984
## CardBrandAX -1.057242 0.758607 -1.394 0.163420
## CardBrandDI -0.281372 1.041815 -0.270 0.787100
## CardBrandMC -0.061239 0.192241 -0.319 0.750065
## CardBrandPAYPAL 0.359743 0.536807 0.670 0.502759
## CardTypeNone 3.304463 0.553339 5.972 2.35e-09 ***
## CardTypeDebit 0.956513 0.277875 3.442 0.000577 ***
## CardTypePrepaid 2.691278 0.330703 8.138 4.02e-16 ***
## UserTenure -0.026382 0.005788 -4.558 5.17e-06 ***
## USuccessDensity -1.036930 0.280873 -3.692 0.000223 ***
## USDDensity 1.128422 0.333060 3.388 0.000704 ***
## UHDDensity 1.967121 1.985718 0.991 0.321864
## UChurnDensity -0.526417 0.885568 -0.594 0.552218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1510.2 on 4124 degrees of freedom
## Residual deviance: 1218.7 on 4111 degrees of freedom
## AIC: 1246.7
##
## Number of Fisher Scoring iterations: 13
# Stepwise feature selection
OrderUserRedModel <- step(OrderUserModel, k = log(TrainRows)) # based on BIC
## Start: AIC=1335.23
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity + UHDDensity + UChurnDensity
##
## Df Deviance AIC
## - CardBrand 5 1225.0 1299.9
## - UChurnDensity 1 1219.0 1327.3
## - UHDDensity 1 1219.6 1327.8
## - USDDensity 1 1225.3 1333.5
## <none> 1218.7 1335.2
## - USuccessDensity 1 1232.0 1340.3
## - UserTenure 1 1246.2 1354.4
## - CardType 3 1312.7 1404.3
##
## Step: AIC=1299.92
## Churn ~ CardType + UserTenure + USuccessDensity + USDDensity +
## UHDDensity + UChurnDensity
##
## Df Deviance AIC
## - UChurnDensity 1 1225.4 1292.0
## - UHDDensity 1 1226.0 1292.6
## - USDDensity 1 1232.1 1298.7
## <none> 1225.0 1299.9
## - USuccessDensity 1 1239.4 1306.0
## - UserTenure 1 1253.5 1320.1
## - CardType 3 1335.8 1385.7
##
## Step: AIC=1292.02
## Churn ~ CardType + UserTenure + USuccessDensity + USDDensity +
## UHDDensity
##
## Df Deviance AIC
## - UHDDensity 1 1226.3 1284.6
## - USDDensity 1 1232.8 1291.1
## <none> 1225.4 1292.0
## - USuccessDensity 1 1240.7 1299.0
## - UserTenure 1 1254.5 1312.8
## - CardType 3 1336.9 1378.5
##
## Step: AIC=1284.56
## Churn ~ CardType + UserTenure + USuccessDensity + USDDensity
##
## Df Deviance AIC
## - USDDensity 1 1233.4 1283.3
## <none> 1226.3 1284.6
## - USuccessDensity 1 1241.2 1291.2
## - UserTenure 1 1255.7 1305.7
## - CardType 3 1340.1 1373.4
##
## Step: AIC=1283.31
## Churn ~ CardType + UserTenure + USuccessDensity
##
## Df Deviance AIC
## <none> 1233.4 1283.3
## - USuccessDensity 1 1242.9 1284.5
## - UserTenure 1 1268.0 1309.7
## - CardType 3 1368.5 1393.5
summary(OrderUserRedModel)
##
## Call:
## glm(formula = Churn ~ CardType + UserTenure + USuccessDensity,
## family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2880 -0.3153 -0.2106 -0.1423 5.1376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.838664 0.284368 -9.982 < 2e-16 ***
## CardTypeNone 3.153756 0.502397 6.277 3.44e-10 ***
## CardTypeDebit 1.091985 0.231196 4.723 2.32e-06 ***
## CardTypePrepaid 3.017505 0.282856 10.668 < 2e-16 ***
## UserTenure -0.029452 0.005871 -5.017 5.26e-07 ***
## USuccessDensity -0.831317 0.266827 -3.116 0.00184 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1510.2 on 4124 degrees of freedom
## Residual deviance: 1233.4 on 4119 degrees of freedom
## AIC: 1245.4
##
## Number of Fisher Scoring iterations: 7
# Note Deviance measures and compute Pseudo (Deviance) R2
# (Null Deviance - Residual Deviance)/(Null Deviance)
# Exponentiate and interpret logistic coefficients
# Multiplicative change in odds per unit change in predictor
LogisticCoef <- coef(summary(OrderUserRedModel))
ExpCoef <- (exp(LogisticCoef[,"Estimate"]))
NeatCoef <- round(cbind(LogisticCoef, ExpCoef), digits = 6)
NeatCoef
## Estimate Std. Error z value Pr(>|z|) ExpCoef
## (Intercept) -2.838664 0.284368 -9.982359 0.000000 0.058504
## CardTypeNone 3.153756 0.502397 6.277419 0.000000 23.423881
## CardTypeDebit 1.091985 0.231196 4.723207 0.000002 2.980183
## CardTypePrepaid 3.017505 0.282856 10.667996 0.000000 20.440221
## UserTenure -0.029452 0.005871 -5.016721 0.000001 0.970977
## USuccessDensity -0.831317 0.266827 -3.115567 0.001836 0.435475
# Compute and examine training-sample predictions
# for reduced logistic --
# Logistic Prob, Classification
LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTrain, type = "response")
# Classification analysis, training sample
Threshold <- 0.5
LogisticClass <- rep(0, TrainRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTrain$Churn)
Confusion
##
## LogisticClass 0 1
## 0 3927 157
## 1 13 28
mean(LogisticClass == ChurnTrain$Churn)
## [1] 0.9587879
# Lift Chart
# What fraction of churn is captured in the top x fraction of churn scores?
ScoreFrac <- (1:TrainRows - 1)/(TrainRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTrain$Churn[ranks])/sum(ChurnTrain$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
main = "Churn Lift Chart -- Train",
xlab = "Fraction of Top Churn Scores",
ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
# Using performance function from ROCR package
plotpred <- prediction(LogisticProb, ChurnTrain$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Train", colorize = TRUE)

# ROC replotted using qplot (ggplot2)
# Extract plot variables fpr, tpr, threshold/cutoff
# from plotroc object created via performace function above
fpr <- plotroc@x.values[[1]] # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]] # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]] # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom = "line", color = Cutoff,
main = "ROC Chart -- Train",
xlab = "False Positive Rate",
ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package
confusionMatrix(LogisticClass,ChurnTrain$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3927 157
## 1 13 28
##
## Accuracy : 0.9588
## 95% CI : (0.9523, 0.9646)
## No Information Rate : 0.9552
## P-Value [Acc > NIR] : 0.1371
##
## Kappa : 0.2353
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.151351
## Specificity : 0.996701
## Pos Pred Value : 0.682927
## Neg Pred Value : 0.961557
## Prevalence : 0.044848
## Detection Rate : 0.006788
## Detection Prevalence : 0.009939
## Balanced Accuracy : 0.574026
##
## 'Positive' Class : 1
##
# Add predictions to data set, save to .csv
ChurnTrain$LogisticProb <- LogisticProb
ChurnTrain$LogisticClass <- LogisticClass
head(ChurnTrain, n = 3)
## TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1 Train 0 2004-05-31 2004-06-15 2014-01-18 2010-04-24 2999-01-01
## 2 Train 0 2009-03-08 2009-03-23 2013-12-07 2009-03-08 2999-01-01
## 6 Train 0 2010-01-14 2010-01-22 2013-12-25 2010-01-22 2999-01-01
## CancelDate CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode
## 1 2999-01-01 Current Sub PREM 1 75098
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE RTG 1 52402
## 6 2999-01-01 Current Sub RTG 1 78757
## BillingState EcommOrderId OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode
## 1 TX 368004554 2013-11-25 Credit CC VI
## 2 IA 367205302 2013-11-01 Credit CC VI
## 6 TX 367973905 2013-11-24 Credit CC VI
## CPTCardBrand CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts
## 1 VI 1 - Credit VI Credit 10.81 1 29 29
## 2 VI 1 - Credit VI Credit 14.99 1 24 26
## 6 VI 3 - Check VI Debit 16.23 1 27 27
## UserAmount UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec
## 1 313.49 2011-04-25 2013-10-25 0 29 0
## 2 359.76 2011-04-24 2013-10-01 0 24 0
## 6 438.21 2011-05-23 2013-10-24 1 26 1
## UserOrdHrdDec UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14
## 1 0 29 0 0 0 0 0
## 2 0 24 2 0 0 2 0
## 6 0 26 1 0 0 1 0
## UserPTStat56 UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure
## 1 0 0 0 0 0 0 115
## 2 0 0 0 0 0 0 57
## 6 0 0 0 0 0 0 47
## UserWindow USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 1 31 0.9354839 0.00000000 0 0.00000000 0.0009079962 0
## 2 30 0.8000000 0.06666667 0 0.00000000 0.0055826212 0
## 6 30 0.8666667 0.03333333 0 0.03333333 0.0208075542 0
write.csv(ChurnTrain, file = ChurnTrainAnalysis)
# MODEL EVALUATION USING TEST DATA
# Generate test set predictions --
# Logistic Prob, Classification
LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTest, type = "response")
# Classification analysis, Testing sample
Threshold <- 0.5
LogisticClass <- rep(0, TestRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTest$Churn)
Confusion
##
## LogisticClass 0 1
## 0 989 34
## 1 4 4
mean(LogisticClass == ChurnTest$Churn)
## [1] 0.9631426
# Lift Chart
# What fraction of churn is captured in the top x fraction of churn scores?
ScoreFrac <- (1:TestRows - 1)/(TestRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTest$Churn[ranks])/sum(ChurnTest$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
main = "Churn Lift Chart -- Test",
xlab = "Fraction of Top Churn Scores",
ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
# Using performance function from ROCR package
plotpred <- prediction(LogisticProb, ChurnTest$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Test", colorize = TRUE)

# ROC replotted using qplot (ggplot2)
# Extract plot variables fpr, tpr, threshold/cutoff
# from plotroc object created via performace function above
fpr <- plotroc@x.values[[1]] # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]] # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]] # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom="line", color= Cutoff,
main = "ROC Chart -- Test",
xlab = "False Positive Rate",
ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package
confusionMatrix(LogisticClass,ChurnTest$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 989 34
## 1 4 4
##
## Accuracy : 0.9631
## 95% CI : (0.9498, 0.9738)
## No Information Rate : 0.9631
## P-Value [Acc > NIR] : 0.543
##
## Kappa : 0.1632
## Mcnemar's Test P-Value : 2.546e-06
##
## Sensitivity : 0.105263
## Specificity : 0.995972
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.966764
## Prevalence : 0.036857
## Detection Rate : 0.003880
## Detection Prevalence : 0.007759
## Balanced Accuracy : 0.550617
##
## 'Positive' Class : 1
##
# Add predictions to data set, save to .csv
ChurnTest$LogisticProb <- LogisticProb
ChurnTest$LogisticClass <- LogisticClass
head(ChurnTest, n = 3)
## TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 3 Test 0 2008-05-13 2008-05-28 2014-01-22 2012-03-08 2999-01-01
## 4 Test 0 2007-06-19 2007-07-04 2014-01-22 2007-06-19 2999-01-01
## 5 Test 0 2012-01-29 2012-02-13 <NA> 2012-01-29 2999-01-01
## CancelDate CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 3 2999-01-01 Current Sub RTG 1 60542 IL
## 4 2999-01-01 Current Sub RTG 1 80227 CO
## 5 2999-01-01 Current Sub PREM 1 79424 TX
## EcommOrderId OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 3 367578873 2013-11-11 Credit CC VI VI
## 4 367439669 2013-11-07 Credit CC VI VI
## 5 367640723 2013-11-13 Credit CC VI VI
## CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 3 3 - Check VI Debit 14.99 2 23 25 299.77
## 4 1 - Credit VI Credit 14.99 1 26 26 389.74
## 5 1 - Credit VI Credit 10.81 1 19 19 205.39
## UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 3 2011-05-20 2013-10-11 1 22 1 0
## 4 2011-06-07 2013-10-07 0 26 0 0
## 5 2012-02-13 2013-10-13 0 19 0 0
## UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 3 22 2 1 0 0 0 1
## 4 26 0 0 0 0 0 0
## 5 19 0 0 0 0 0 0
## UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 3 0 1 0 1 0 67 30
## 4 0 0 0 0 0 78 29
## 5 0 0 0 0 0 22 21
## USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 3 0.7333333 0.06666667 0.03333333 0.03333333 0.013001347 0
## 4 0.8965517 0.00000000 0.00000000 0.00000000 0.002783491 0
## 5 0.9047619 0.00000000 0.00000000 0.00000000 0.014220353 0
write.table(ChurnTest, file = ChurnTestAnalysis)
########### END OF ANALYSIS