options(width=100)
# Starter logistic regression analysis script for Rhapsody Churn Modeling
# Adapted from a Churn Exercise script file
library(ggplot2) # for improved plots -- qplot and scatterplot matrix
library(GGally) # for scatterplot matrix
library(RcmdrMisc) # for numSummary, stepwise
library(ROCR) # for producing ROC and other prediction performance plots
library(caret) # for confusion matrix analysis
library(plyr) # for mapvalues recoding function
library(dplyr) # select function for subsetting data frames
# Suggested usage:
# Copy the required data file, Rhapsody2016CPTOrderSummary-010.csv into a
# new project folder <folder>
# Change the working directory (setwd below) to point to the project <folder>
# Run the entire script as a block
# Set up working directory for file locations
# and load required libraries
ChurnDir <- "D:/MBA/SecondTerm/Data/exam"
ChurnData <- "Rhapsody2017CPTOrderSummary.csv"
ChurnPrepped <- "ChurnFullPrepped-100.RData"
ChurnTrainAnalysis <- "ChurnTrainAnalysis-100.csv"
ChurnTestAnalysis <- "ChurnTestAnalysis-100.csv"
# DateFormat <- "%m/%d/%Y"
DateFormat <- "%Y-%m-%d"
########### BEGIN DATA PREPARATION
setwd(ChurnDir)
# DATA ENTRY AND SUMMARIZATION
# Read full data set, list -- names, structure, head,
ChurnFull <- read.csv(ChurnData)
AllRows <- nrow(ChurnFull)
str(ChurnFull, list.len = 999)
## 'data.frame': 529430 obs. of 109 variables:
## $ UserPartyId : num 120392 141193 146503 149471 150298 ...
## $ UserOrigSignupDt : Factor w/ 4161 levels "2001-12-03","2002-01-25",..: 2507 504 2894 3513 762 2375 361 2176 1139 2841 ...
## $ SubscriberServiceId : num 1.41e+08 8.80e+07 1.77e+08 3.02e+08 4.00e+08 ...
## $ FirstBillDt : Factor w/ 3890 levels "2001-12-03","2002-01-25",..: 2212 202 2584 3203 460 2065 97 1896 844 2531 ...
## $ LastPlayDt : Factor w/ 2234 levels "","2003-05-09",..: 2205 2205 1976 2230 1 2232 2234 2234 1974 2187 ...
## $ SignUpDt : Factor w/ 3200 levels "2002-12-22","2003-01-04",..: 1546 1018 1933 2552 2908 2296 2908 1245 2525 2480 ...
## $ StopRequestDt : Factor w/ 924 levels "2007-11-26","2007-12-19",..: 924 924 908 924 924 924 924 924 924 924 ...
## $ CancelDt : Factor w/ 87 levels "2007-12-16","2008-01-03",..: 87 87 87 87 87 87 87 87 87 87 ...
## $ CancelType : Factor w/ 4 levels "Current Sub",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CancelReason : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ChannelGroupName : Factor w/ 15 levels "Cable Partners",..: 14 14 5 12 3 14 1 8 14 2 ...
## $ ChannelName : Factor w/ 117 levels "Affiliate","Android Mobile",..: 106 106 84 70 30 106 69 91 106 92 ...
## $ CobrandCode : num 40134 1 40134 40134 40135 ...
## $ CobrandName : Factor w/ 36 levels "","ATT","AudioGalaxy",..: 28 19 28 28 15 28 26 28 19 28 ...
## $ OriginCode : Factor w/ 493 levels "","12_19Bpage",..: 191 187 327 341 161 332 280 191 187 191 ...
## $ pcode : Factor w/ 445 levels ".","0","12_19Bpage",..: 1 1 1 305 1 1 1 345 1 385 ...
## $ cpath : Factor w/ 351 levels ".","<musicnew>",..: 1 1 223 272 1 1 1 176 1 251 ...
## $ rsrc : Factor w/ 5161 levels "-","$10_300x250",..: 4 4 1604 4755 4 4 4 4958 4 2171 ...
## $ opage : Factor w/ 455 levels "","-","0","404__404",..: 1 1 262 441 1 1 1 1 1 31 ...
## $ ServiceTier : Factor w/ 6 levels "FREE","PREM",..: 5 5 2 2 2 2 2 5 2 5 ...
## $ MonthsPerBill : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BillingZipCode : num 44333 94127 18015 19426 96737 ...
## $ BillingState : Factor w/ 171 levels "","- AICH","- MIYA",..: 118 30 126 126 65 103 117 117 126 53 ...
## $ EcommOrderId : num 3.68e+08 3.67e+08 3.67e+08 3.68e+08 3.67e+08 ...
## $ OrderDate : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 16 1 2 11 8 1 26 14 15 1 ...
## $ PaymentTypeName : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTypeCode : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentSubTypeCode : Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 2 4 6 3 2 6 6 ...
## $ CPTCardBrand : Factor w/ 3 levels "MC","None","VI": 3 3 3 2 1 3 2 2 3 3 ...
## $ CPTCardType : Factor w/ 7 levels "0 - None","1 - Credit",..: 2 2 4 1 2 2 1 1 4 4 ...
## $ CPTBIN : num 408161 414729 482857 NA 512107 ...
## $ CPTTransactionDivision : num 193099 193099 193099 NA 193099 ...
## $ PaymentTxTypeCode : Factor w/ 1 level "BC": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTxStatus : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PmtStatType : Factor w/ 1 level "COMPLETE": 1 1 1 1 1 1 1 1 1 1 ...
## $ PT_status : Factor w/ 33 levels "00","01","04",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AVS_status : Factor w/ 16 levels "2","3","6","7",..: 8 10 10 6 2 8 8 6 7 8 ...
## $ SEC_status : Factor w/ 4 levels "M","P","S","UNK": 4 4 4 4 4 4 3 4 4 4 ...
## $ RiskCode : Factor w/ 1 level "UNK": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentProcessorCode : Factor w/ 4 levels "AMZ","PPL","PT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentProcessorDesc : Factor w/ 4 levels "","Amazon FPS",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentDivisionCode : Factor w/ 4 levels "193099","203217",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentDivisionDesc : Factor w/ 5 levels "","Amazon FPS non-recurring default",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ PaymentDivisionUseType : Factor w/ 3 levels "","N","R": 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentDivisionRiskType: Factor w/ 2 levels "","NORMALRISK": 2 2 2 2 2 2 2 2 2 2 ...
## $ Country : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
## $ Currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Amount : num 14.99 14.99 9.99 9.99 10.35 ...
## $ OrderStatus : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Churn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OrdSucces : num 1 1 1 1 1 1 1 1 1 1 ...
## $ OrdSftDec : num 1 1 1 1 1 1 1 1 1 1 ...
## $ OrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PayAttempts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PaySuccess : num 1 1 1 1 1 1 1 1 1 1 ...
## $ PaySftDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat05 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubOrderCount : num 26 25 25 16 8 25 8 27 17 19 ...
## $ SubPayAttempts : num 34 25 25 16 8 25 8 27 17 19 ...
## $ SubAmount : num 389.7 374.8 249.8 159.8 82.8 ...
## $ SubFirstOrderDate : Factor w/ 954 levels "2011-04-21","2011-04-22",..: 6 11 43 326 660 72 678 24 301 256 ...
## $ SubLastOrderDate : Factor w/ 408 levels "2011-10-23","2011-10-29",..: 364 319 320 359 356 349 344 362 363 349 ...
## $ SubChurns : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubOrdSucces : num 26 25 25 16 8 25 8 27 17 19 ...
## $ SubOrdSftDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPaySuccess : num 26 25 25 16 8 25 8 27 17 19 ...
## $ SubPaySftDec : num 2 0 0 0 0 0 0 0 0 0 ...
## $ SubPayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat05 : num 2 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubCount : num 1 1 1 1 2 2 2 1 2 2 ...
## $ UserOrderCount : num 26 25 25 16 26 26 28 27 18 25 ...
## $ UserPayAttempts : num 34 25 25 16 26 26 28 27 18 25 ...
## $ UserAmount : num 390 375 250 160 269 ...
## $ UserFirstOrderDate : Factor w/ 955 levels "2011-04-21","2011-04-22",..: 6 11 43 326 18 11 6 24 129 24 ...
## $ UserLastOrderDate : Factor w/ 878 levels "2011-04-22","2011-04-23",..: 833 788 789 828 825 818 813 831 832 818 ...
## $ UserChurns : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserOrdSucces : num 26 25 25 16 26 26 28 27 18 25 ...
## $ UserOrdSftDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPaySuccess : num 26 25 25 16 26 26 28 27 18 25 ...
## $ UserPaySftDec : num 2 0 0 0 0 0 0 0 0 0 ...
## $ UserPayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat05 : num 2 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
head(ChurnFull, n = 3)
## UserPartyId UserOrigSignupDt SubscriberServiceId FirstBillDt LastPlayDt SignUpDt StopRequestDt
## 1 120392 2009-05-10 141005905 2009-05-25 2013-12-24 2009-05-10 2999-01-01
## 2 141193 2003-11-15 87975825 2003-11-23 2013-12-24 2007-11-29 2999-01-01
## 3 146503 2010-06-01 176614654 2010-06-01 2013-05-09 2010-06-01 2014-01-07
## CancelDt CancelType CancelReason ChannelGroupName ChannelName CobrandCode
## 1 2999-01-01 Current Sub Unmapped Unknown 40134
## 2 2999-01-01 Current Sub Unmapped Unknown 1
## 3 2999-01-01 Current Sub Miscellaneous RealPlayer Feature Trigger SP 40134
## CobrandName OriginCode pcode cpath rsrc opage ServiceTier MonthsPerBill
## 1 RealRhapsody LogInViaRN . . . RTG 1
## 2 Listen.com listen . . . RTG 1
## 3 RealRhapsody rcom . myaccount edit rc_rp_trigger_dvdplayback PREM 1
## BillingZipCode BillingState EcommOrderId OrderDate PaymentTypeName PaymentTypeCode
## 1 44333 OH 367726684 2013-11-16 Credit CC
## 2 94127 CA 367201564 2013-11-01 Credit CC
## 3 18015 PA 367275211 2013-11-02 Credit CC
## PaymentSubTypeCode CPTCardBrand CPTCardType CPTBIN CPTTransactionDivision PaymentTxTypeCode
## 1 VI VI 1 - Credit 408161 193099 BC
## 2 VI VI 1 - Credit 414729 193099 BC
## 3 VI VI 3 - Check 482857 193099 BC
## PaymentTxStatus PmtStatType PT_status AVS_status SEC_status RiskCode PaymentProcessorCode
## 1 SUCCESS COMPLETE 00 C UNK UNK PT
## 2 SUCCESS COMPLETE 00 G UNK UNK PT
## 3 SUCCESS COMPLETE 00 G UNK UNK PT
## PaymentProcessorDesc PaymentDivisionCode PaymentDivisionDesc PaymentDivisionUseType
## 1 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## 2 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## 3 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## PaymentDivisionRiskType Country Currency Amount OrderStatus Churn OrdSucces OrdSftDec OrdHrdDec
## 1 NORMALRISK US USD 14.99 SUCCESS 0 1 1 0
## 2 NORMALRISK US USD 14.99 SUCCESS 0 1 1 0
## 3 NORMALRISK US USD 9.99 SUCCESS 0 1 1 0
## PayAttempts PaySuccess PaySftDec PayHrdDec PTStat89 PTStat05 PTStat14 PTStat56 PTStat52 PTStat12
## 1 0 1 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0
## 3 0 1 0 0 0 0 0 0 0 0
## PTStat33 PTStat41 PTStat04 SubOrderCount SubPayAttempts SubAmount SubFirstOrderDate
## 1 0 0 0 26 34 389.74 2011-04-26
## 2 0 0 0 25 25 374.75 2011-05-01
## 3 0 0 0 25 25 249.75 2011-06-02
## SubLastOrderDate SubChurns SubOrdSucces SubOrdSftDec SubOrdHrdDec SubPaySuccess SubPaySftDec
## 1 2013-10-16 0 26 0 0 26 2
## 2 2013-09-01 0 25 0 0 25 0
## 3 2013-09-02 0 25 0 0 25 0
## SubPayHrdDec SubPTStat89 SubPTStat05 SubPTStat14 SubPTStat56 SubPTStat52 SubPTStat12 SubPTStat33
## 1 0 0 2 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## SubPTStat41 SubPTStat04 SubCount UserOrderCount UserPayAttempts UserAmount UserFirstOrderDate
## 1 0 0 1 26 34 389.74 2011-04-26
## 2 0 0 1 25 25 374.75 2011-05-01
## 3 0 0 1 25 25 249.75 2011-06-02
## UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess
## 1 2013-10-16 0 26 0 0 26
## 2 2013-09-01 0 25 0 0 25
## 3 2013-09-02 0 25 0 0 25
## UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52
## 1 2 0 0 2 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
# Data Understanding
# A lot of variables!
# Perhaps use pivot tables and/or Tableau to build on earlier work
# Clean up card brand / card type data, add to database
# Look at distribution of card brand and type in CPT data
table(ChurnFull$CPTCardBrand, ChurnFull$CPTCardType)
##
## 0 - None 1 - Credit 2 - Debit 3 - Check 5 - Pre-Paid 6 - Charge Card
## MC 0 56950 60926 0 6504 0
## None 86749 0 0 0 0 0
## VI 0 89563 0 214453 14255 1
##
## 7 - European Deferred Debit Card
## MC 0
## None 0
## VI 29
# Set CardBrand
# Set CardBrand defailt to PaymentSubType value from original Order Summary
# Replace PaymentSubType value with CPTCardBrand if present (not None)
ChurnFull$CardBrand <- ChurnFull$PaymentSubTypeCode
ChurnFull$CardBrand[ChurnFull$CPTCardBrand != "None"] <-
ChurnFull$CPTCardBrand[ChurnFull$CPTCardBrand != "None"]
# Clean up CardType
# Remove leading numbers from codes
# Map Check to Debit
# Fill in CardType (as Credit) for credit-only CardBrands -- AX, DI, PAYPAL
ChurnFull$CardType <- mapvalues(ChurnFull$CPTCardType,
c("0 - None", "1 - Credit", "2 - Debit", "3 - Check",
"5 - Pre-Paid", "7 - European Deferred Debit Card"),
c("None", "Credit", "Debit", "Debit", "Prepaid", "Debit"))
ChurnFull$CardType[ChurnFull$CardBrand == "AX"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "DI"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "PAYPAL"] <- "Credit"
# Check newly created CardBrand and CardType
table(ChurnFull$CardBrand, ChurnFull$CardType)
##
## None Credit Debit Prepaid 6 - Charge Card
## AMZ_FPS 832 0 0 0 0
## AX 0 53352 0 0 0
## DI 0 14070 0 0 0
## MC 718 56950 60926 6504 0
## PAYPAL 0 16121 0 0 0
## VI 1656 89563 214482 14255 1
# Set reference levels for CardBrand and CardType
# Sets VISA and Credit as the omitted factor levels in models
# CardBrand and CardType estimates are then interpretted relative to
# these reference levels
ChurnFull$CardBrand <- relevel(ChurnFull$CardBrand, ref = "VI")
ChurnFull$CardType <- relevel(ChurnFull$CardType, ref = "Credit")
# Add User history metrics
# Convert date strings from factor to date type
# Tenure, OrderDensity, Payment Friction, ChurnDensity
ChurnFull$UserOrigSignupDate <-
as.Date(ChurnFull$UserOrigSignupDt, format = DateFormat)
ChurnFull$FirstBillDate <-
as.Date(ChurnFull$FirstBillDt, format = DateFormat)
ChurnFull$LastPlayDate <-
as.Date(ChurnFull$LastPlayDt, format = DateFormat)
ChurnFull$SignUpDate <-
as.Date(ChurnFull$SignUpDt, format = DateFormat)
ChurnFull$StopRequestDate <-
as.Date(ChurnFull$StopRequestDt, format = DateFormat)
ChurnFull$CancelDate <-
as.Date(ChurnFull$CancelDt, format = DateFormat)
ChurnFull$OrdDate <-
as.Date(ChurnFull$OrderDate, format = DateFormat)
ChurnFull$UserFirstOrdDate <-
as.Date(ChurnFull$UserFirstOrderDate, format = DateFormat)
ChurnFull$UserLastOrdDate <-
as.Date(ChurnFull$UserLastOrderDate, format = DateFormat)
# Calculated date features
ChurnFull$UserTenure <-
as.numeric(round((ChurnFull$OrdDate - ChurnFull$UserOrigSignupDate)/365.25*12 + 1))
ChurnFull$UserWindow <-
as.numeric(round((ChurnFull$UserLastOrdDate -
ChurnFull$UserFirstOrdDate)/365.25*12 + 1))
ChurnFull$USuccessDensity <-
(ChurnFull$UserPaySuccess * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$USDDensity <-
(ChurnFull$UserPaySftDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UHDDensity <-
(ChurnFull$UserPayHrdDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UChurnDensity <-
(ChurnFull$UserChurns * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
# Randomly split observations 80/20 into training and test samples
# Generate sample of indices
set.seed(999999999)
TrainSize <- round(0.8 * AllRows)
TrainIdx <- sample(AllRows, TrainSize)
# Label observations as Train or Test
# Add Train/Test labels to data base
TrainTest <- rep("Test", AllRows)
TrainTest[TrainIdx] <- "Train"
ChurnFull$TrainTest <- TrainTest
# Save prepped data to .Rdata file for subsequent analysis
save("ChurnFull", file = ChurnPrepped)
########### END OF DATA PREPARATION
########### BEGIN ANALYSIS
# Select (using dplyr) analysis subset of candidate model variables/features
ChurnAnalysis <- select(ChurnFull,
TrainTest, Churn, UserOrigSignupDate,
FirstBillDate, LastPlayDate,
SignUpDate, StopRequestDate, CancelDate, CancelType,
CancelReason, ServiceTier:CPTCardType,
CardBrand, CardType, Amount,
SubCount:UserPTStat04,
UserTenure, UserWindow,
USuccessDensity, USDDensity, UHDDensity, UChurnDensity
)
str(ChurnAnalysis, list.len = 999)
## 'data.frame': 529430 obs. of 52 variables:
## $ TrainTest : chr "Test" "Train" "Train" "Test" ...
## $ Churn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserOrigSignupDate: Date, format: "2009-05-10" "2003-11-15" "2010-06-01" ...
## $ FirstBillDate : Date, format: "2009-05-25" "2003-11-23" "2010-06-01" ...
## $ LastPlayDate : Date, format: "2013-12-24" "2013-12-24" "2013-05-09" ...
## $ SignUpDate : Date, format: "2009-05-10" "2007-11-29" "2010-06-01" ...
## $ StopRequestDate : Date, format: "2999-01-01" "2999-01-01" "2014-01-07" ...
## $ CancelDate : Date, format: "2999-01-01" "2999-01-01" "2999-01-01" ...
## $ CancelType : Factor w/ 4 levels "Current Sub",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CancelReason : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ServiceTier : Factor w/ 6 levels "FREE","PREM",..: 5 5 2 2 2 2 2 5 2 5 ...
## $ MonthsPerBill : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BillingZipCode : num 44333 94127 18015 19426 96737 ...
## $ BillingState : Factor w/ 171 levels "","- AICH","- MIYA",..: 118 30 126 126 65 103 117 117 126 53 ...
## $ EcommOrderId : num 3.68e+08 3.67e+08 3.67e+08 3.68e+08 3.67e+08 ...
## $ OrderDate : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 16 1 2 11 8 1 26 14 15 1 ...
## $ PaymentTypeName : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTypeCode : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentSubTypeCode: Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 2 4 6 3 2 6 6 ...
## $ CPTCardBrand : Factor w/ 3 levels "MC","None","VI": 3 3 3 2 1 3 2 2 3 3 ...
## $ CPTCardType : Factor w/ 7 levels "0 - None","1 - Credit",..: 2 2 4 1 2 2 1 1 4 4 ...
## $ CardBrand : Factor w/ 6 levels "VI","AMZ_FPS",..: 1 1 1 3 5 1 4 3 1 1 ...
## $ CardType : Factor w/ 5 levels "Credit","None",..: 1 1 3 1 1 1 1 1 3 3 ...
## $ Amount : num 14.99 14.99 9.99 9.99 10.35 ...
## $ SubCount : num 1 1 1 1 2 2 2 1 2 2 ...
## $ UserOrderCount : num 26 25 25 16 26 26 28 27 18 25 ...
## $ UserPayAttempts : num 34 25 25 16 26 26 28 27 18 25 ...
## $ UserAmount : num 390 375 250 160 269 ...
## $ UserFirstOrderDate: Factor w/ 955 levels "2011-04-21","2011-04-22",..: 6 11 43 326 18 11 6 24 129 24 ...
## $ UserLastOrderDate : Factor w/ 878 levels "2011-04-22","2011-04-23",..: 833 788 789 828 825 818 813 831 832 818 ...
## $ UserChurns : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserOrdSucces : num 26 25 25 16 26 26 28 27 18 25 ...
## $ UserOrdSftDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPaySuccess : num 26 25 25 16 26 26 28 27 18 25 ...
## $ UserPaySftDec : num 2 0 0 0 0 0 0 0 0 0 ...
## $ UserPayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat05 : num 2 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserTenure : num 55 121 42 22 112 59 126 66 100 44 ...
## $ UserWindow : num 31 29 28 20 30 30 30 30 27 30 ...
## $ USuccessDensity : num 0.839 0.862 0.893 0.8 0.867 ...
## $ USDDensity : num 0.0645 0 0 0 0 ...
## $ UHDDensity : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UChurnDensity : num 0 0 0 0 0 0 0 0 0 0 ...
head(ChurnAnalysis, n = 3)
## TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1 Test 0 2009-05-10 2009-05-25 2013-12-24 2009-05-10 2999-01-01
## 2 Train 0 2003-11-15 2003-11-23 2013-12-24 2007-11-29 2999-01-01
## 3 Train 0 2010-06-01 2010-06-01 2013-05-09 2010-06-01 2014-01-07
## CancelDate CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 1 2999-01-01 Current Sub RTG 1 44333 OH
## 2 2999-01-01 Current Sub RTG 1 94127 CA
## 3 2999-01-01 Current Sub PREM 1 18015 PA
## EcommOrderId OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 1 367726684 2013-11-16 Credit CC VI VI
## 2 367201564 2013-11-01 Credit CC VI VI
## 3 367275211 2013-11-02 Credit CC VI VI
## CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 1 1 - Credit VI Credit 14.99 1 26 34 389.74
## 2 1 - Credit VI Credit 14.99 1 25 25 374.75
## 3 3 - Check VI Debit 9.99 1 25 25 249.75
## UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 1 2011-04-26 2013-10-16 0 26 0 0
## 2 2011-05-01 2013-09-01 0 25 0 0
## 3 2011-06-02 2013-09-02 0 25 0 0
## UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 1 26 2 0 0 2 0 0
## 2 25 0 0 0 0 0 0
## 3 25 0 0 0 0 0 0
## UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 1 0 0 0 0 0 55 31
## 2 0 0 0 0 0 121 29
## 3 0 0 0 0 0 42 28
## USuccessDensity USDDensity UHDDensity UChurnDensity
## 1 0.8387097 0.06451613 0 0
## 2 0.8620690 0.00000000 0 0
## 3 0.8928571 0.00000000 0 0
# Check data summaries
summary(ChurnAnalysis)
## TrainTest Churn UserOrigSignupDate FirstBillDate
## Length:529430 Min. :0.00000 Min. :2001-12-03 Min. :2001-12-03
## Class :character 1st Qu.:0.00000 1st Qu.:2010-08-20 1st Qu.:2010-09-04
## Mode :character Median :0.00000 Median :2011-12-01 Median :2011-12-05
## Mean :0.04311 Mean :2011-06-11 Mean :2017-12-08
## 3rd Qu.:0.00000 3rd Qu.:2013-01-23 3rd Qu.:2013-02-03
## Max. :1.00000 Max. :2013-11-22 Max. :2999-01-01
##
## LastPlayDate SignUpDate StopRequestDate CancelDate
## Min. :2003-05-09 Min. :2002-12-22 Min. :2007-11-26 Min. :2007-12-16
## 1st Qu.:2013-11-20 1st Qu.:2011-06-17 1st Qu.:2999-01-01 1st Qu.:2999-01-01
## Median :2014-01-15 Median :2012-06-11 Median :2999-01-01 Median :2999-01-01
## Mean :2013-10-23 Mean :2012-02-15 Mean :2955-07-23 Mean :2903-07-19
## 3rd Qu.:2014-01-21 3rd Qu.:2013-03-11 3rd Qu.:2999-01-01 3rd Qu.:2999-01-01
## Max. :2014-01-22 Max. :2013-11-22 Max. :2999-01-01 Max. :2999-01-01
## NA's :45646
## CancelType CancelReason ServiceTier MonthsPerBill
## Current Sub:478125 :478125 FREE: 178 Min. : 1.000
## Involuntary: 24964 MIGRATION_PAID_TO_PAID : 7569 PREM:337522 1st Qu.: 1.000
## Migration : 8092 NON_BILLABLE_HARD_DECLINE: 5025 RHAP: 11311 Median : 1.000
## Voluntary : 18249 NON_BILLABLE_SOFT_DECLINE: 19934 RR : 1275 Mean : 1.061
## NON_TIER_TRANSITION : 523 RTG :178973 3rd Qu.: 1.000
## NONE : 18254 RU : 171 Max. :12.000
##
## BillingZipCode BillingState EcommOrderId OrderDate PaymentTypeName
## Min. : 0 CA : 57425 Min. :367201517 2013-11-01: 44960 Credit:512466
## 1st Qu.:27521 TX : 47599 1st Qu.:367423714 2013-11-03: 20075 Flexib: 16964
## Median :48855 : 32196 Median :367665426 2013-11-09: 18947
## Mean :52015 NY : 30870 Mean :367665833 2013-11-05: 18560
## 3rd Qu.:78741 FL : 29788 3rd Qu.:367898711 2013-11-07: 18476
## Max. :99999 (Other):331551 Max. :368164705 2013-11-02: 18459
## NA's : 1 (Other) :389953
## PaymentTypeCode PaymentSubTypeCode CPTCardBrand CPTCardType
## CC:512466 AMZ_FPS: 832 MC :124380 0 - None : 86749
## FP: 16964 AX : 53377 None: 86749 1 - Credit :146513
## DI : 14076 VI :318301 2 - Debit : 60926
## MC :124774 3 - Check :214453
## PAYPAL : 16170 5 - Pre-Paid : 20759
## VI :320201 6 - Charge Card : 1
## 7 - European Deferred Debit Card: 29
## CardBrand CardType Amount SubCount UserOrderCount
## VI :319957 Credit :230056 Min. : 0.00 Min. : 0.000 Min. : 0.00
## AMZ_FPS: 832 None : 3206 1st Qu.: 9.99 1st Qu.: 1.000 1st Qu.: 9.00
## AX : 53352 Debit :275408 Median : 9.99 Median : 1.000 Median :20.00
## DI : 14070 Prepaid : 20759 Mean : 12.09 Mean : 1.371 Mean :17.28
## MC :125098 6 - Charge Card: 1 3rd Qu.: 14.99 3rd Qu.: 2.000 3rd Qu.:25.00
## PAYPAL : 16121 Max. :178.99 Max. :18.000 Max. :46.00
##
## UserPayAttempts UserAmount UserFirstOrderDate UserLastOrderDate UserChurns
## Min. : 0.00 Min. : 0.0 2011-05-01: 18901 2013-10-01: 39759 Min. : 0.0000
## 1st Qu.: 11.00 1st Qu.: 99.9 2011-05-03: 6731 2013-10-03: 16172 1st Qu.: 0.0000
## Median : 22.00 Median :227.2 2011-04-26: 6645 2013-10-09: 15224 Median : 0.0000
## Mean : 20.04 Mean :210.6 2011-05-10: 6423 2013-10-05: 14810 Mean : 0.2369
## 3rd Qu.: 27.00 3rd Qu.:303.8 2011-05-16: 6350 2013-10-07: 14730 3rd Qu.: 0.0000
## Max. :158.00 Max. :681.7 2011-05-09: 6345 2013-10-02: 14648 Max. :17.0000
## (Other) :478035 (Other) :414087
## UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess UserPaySftDec
## Min. : 0.00 Min. : 0.000 Min. :0.00000 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 9.00 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.: 9.00 1st Qu.: 0.0000
## Median :19.00 Median : 0.000 Median :0.00000 Median :19.00 Median : 0.0000
## Mean :17.05 Mean : 0.198 Mean :0.03889 Mean :17.05 Mean : 0.7418
## 3rd Qu.:25.00 3rd Qu.: 0.000 3rd Qu.:0.00000 3rd Qu.:25.00 3rd Qu.: 1.0000
## Max. :46.00 Max. :17.000 Max. :8.00000 Max. :46.00 Max. :21.0000
##
## UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000 Min. : 0.00000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.00000 Median :0.0000
## Mean : 0.1009 Mean : 0.3716 Mean : 0.2529 Mean : 0.06399 Mean :0.0308
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.:0.0000
## Max. :15.0000 Max. :21.0000 Max. :17.0000 Max. :11.00000 Max. :9.0000
##
## UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## Min. : 0.00000 Min. :0.00000 Min. :0.00000 Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 0.000 1st Qu.:0.00000
## Median : 0.00000 Median :0.00000 Median :0.00000 Median : 0.000 Median :0.00000
## Mean : 0.02035 Mean :0.01699 Mean :0.01676 Mean : 0.056 Mean :0.02092
## 3rd Qu.: 0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :11.00000 Max. :9.00000 Max. :9.00000 Max. :15.000 Max. :6.00000
##
## UserTenure UserWindow USuccessDensity USDDensity UHDDensity
## Min. : 1.00 Min. : 1.0 Min. : 0.0000 Min. : 0.00000 Min. : 0.00000
## 1st Qu.: 11.00 1st Qu.:12.0 1st Qu.: 0.7917 1st Qu.: 0.00000 1st Qu.: 0.00000
## Median : 24.00 Median :23.0 Median : 0.8621 Median : 0.00000 Median : 0.00000
## Mean : 30.14 Mean :20.7 Mean : 0.8345 Mean : 0.04885 Mean : 0.00546
## 3rd Qu.: 40.00 3rd Qu.:30.0 3rd Qu.: 0.9091 3rd Qu.: 0.03846 3rd Qu.: 0.00000
## Max. :144.00 Max. :31.0 Max. :12.0000 Max. :12.00000 Max. :12.00000
##
## UChurnDensity
## Min. : 0.00000
## 1st Qu.: 0.00000
## Median : 0.00000
## Mean : 0.01482
## 3rd Qu.: 0.00000
## Max. :12.00000
##
# Split analysis database into Train and Test
ChurnTrain <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Train",]
ChurnTest <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Test",]
TrainRows <- nrow(ChurnTrain)
TestRows <- nrow(ChurnTest)
TrainRows
## [1] 423544
TestRows
## [1] 105886
# ANALYSIS AND MODELING OF TRAINING DATA
# Estimate card brand and card type effects
BrandModel <- glm(Churn ~ CardBrand,
family = binomial(logit), data = ChurnTrain)
summary(BrandModel)
##
## Call:
## glm(formula = Churn ~ CardBrand, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5557 -0.3121 -0.3121 -0.3121 3.0174
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.997272 0.009287 -322.722 <2e-16 ***
## CardBrandAMZ_FPS 1.207306 0.112402 10.741 <2e-16 ***
## CardBrandAX -1.544367 0.048292 -31.980 <2e-16 ***
## CardBrandDI -1.221875 0.079445 -15.380 <2e-16 ***
## CardBrandMC 0.006294 0.017484 0.360 0.7189
## CardBrandPAYPAL -0.077641 0.043800 -1.773 0.0763 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 150383 on 423543 degrees of freedom
## Residual deviance: 148255 on 423538 degrees of freedom
## AIC: 148267
##
## Number of Fisher Scoring iterations: 7
BrandTypeModel <- glm(Churn ~ CardBrand + CardType,
family = binomial(logit), data = ChurnTrain)
summary(BrandTypeModel)
##
## Call:
## glm(formula = Churn ~ CardBrand + CardType, family = binomial(logit),
## data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9424 -0.2933 -0.2933 -0.1930 3.0174
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.97403 0.02270 -175.088 < 2e-16 ***
## CardBrandAMZ_FPS -1.14663 0.12195 -9.402 < 2e-16 ***
## CardBrandAX -0.56761 0.05255 -10.802 < 2e-16 ***
## CardBrandDI -0.24512 0.08210 -2.986 0.00283 **
## CardBrandMC 0.06181 0.01843 3.353 0.00080 ***
## CardBrandPAYPAL 0.89912 0.04845 18.558 < 2e-16 ***
## CardTypeNone 3.33070 0.05249 63.452 < 2e-16 ***
## CardTypeDebit 0.84910 0.02411 35.220 < 2e-16 ***
## CardTypePrepaid 3.01709 0.02757 109.444 < 2e-16 ***
## CardType6 - Charge Card -4.59174 43.95396 -0.104 0.91680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 150383 on 423543 degrees of freedom
## Residual deviance: 133772 on 423534 degrees of freedom
## AIC: 133792
##
## Number of Fisher Scoring iterations: 7
# Stepwise feature selection
BrandTypeRedModel <- step(BrandTypeModel, k = log(TrainRows)) # based on BIC
## Start: AIC=133901.8
## Churn ~ CardBrand + CardType
##
## Df Deviance AIC
## <none> 133772 133902
## - CardBrand 5 134418 134483
## - CardType 4 148255 148333
summary(BrandTypeRedModel)
##
## Call:
## glm(formula = Churn ~ CardBrand + CardType, family = binomial(logit),
## data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9424 -0.2933 -0.2933 -0.1930 3.0174
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.97403 0.02270 -175.088 < 2e-16 ***
## CardBrandAMZ_FPS -1.14663 0.12195 -9.402 < 2e-16 ***
## CardBrandAX -0.56761 0.05255 -10.802 < 2e-16 ***
## CardBrandDI -0.24512 0.08210 -2.986 0.00283 **
## CardBrandMC 0.06181 0.01843 3.353 0.00080 ***
## CardBrandPAYPAL 0.89912 0.04845 18.558 < 2e-16 ***
## CardTypeNone 3.33070 0.05249 63.452 < 2e-16 ***
## CardTypeDebit 0.84910 0.02411 35.220 < 2e-16 ***
## CardTypePrepaid 3.01709 0.02757 109.444 < 2e-16 ***
## CardType6 - Charge Card -4.59174 43.95396 -0.104 0.91680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 150383 on 423543 degrees of freedom
## Residual deviance: 133772 on 423534 degrees of freedom
## AIC: 133792
##
## Number of Fisher Scoring iterations: 7
# Estimate card brand, card type and User effects
OrderUserModel <- glm(Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
USDDensity + UHDDensity + UChurnDensity,
family = binomial(logit), data = ChurnTrain)
summary(OrderUserModel)
##
## Call:
## glm(formula = Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity + UHDDensity + UChurnDensity, family = binomial(logit),
## data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8314 -0.3114 -0.2390 -0.1630 3.8062
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.122409 0.030495 -102.389 < 2e-16 ***
## CardBrandAMZ_FPS -1.565125 0.123590 -12.664 < 2e-16 ***
## CardBrandAX -0.505114 0.052825 -9.562 < 2e-16 ***
## CardBrandDI -0.182046 0.082432 -2.208 0.02721 *
## CardBrandMC 0.040225 0.018555 2.168 0.03017 *
## CardBrandPAYPAL 0.477622 0.049033 9.741 < 2e-16 ***
## CardTypeNone 3.221118 0.055524 58.013 < 2e-16 ***
## CardTypeDebit 0.634891 0.024420 25.999 < 2e-16 ***
## CardTypePrepaid 2.383459 0.029777 80.044 < 2e-16 ***
## CardType6 - Charge Card -5.099232 43.953958 -0.116 0.90764
## UserTenure -0.025705 0.000544 -47.251 < 2e-16 ***
## USuccessDensity -0.138484 0.023591 -5.870 4.35e-09 ***
## USDDensity 0.449203 0.027445 16.367 < 2e-16 ***
## UHDDensity 0.242389 0.090162 2.688 0.00718 **
## UChurnDensity -0.094917 0.041895 -2.266 0.02348 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 150383 on 423543 degrees of freedom
## Residual deviance: 130024 on 423529 degrees of freedom
## AIC: 130054
##
## Number of Fisher Scoring iterations: 7
# Stepwise feature selection
OrderUserRedModel <- step(OrderUserModel, k = log(TrainRows)) # based on BIC
## Start: AIC=130217.9
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity + UHDDensity + UChurnDensity
##
## Df Deviance AIC
## - UHDDensity 1 130028 130210
## - UChurnDensity 1 130029 130210
## <none> 130024 130218
## - USuccessDensity 1 130066 130248
## - USDDensity 1 130314 130496
## - CardBrand 5 130473 130603
## - UserTenure 1 132919 133100
## - CardType 4 138728 138870
##
## Step: AIC=130209.9
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity + UChurnDensity
##
## Df Deviance AIC
## - UChurnDensity 1 130032 130201
## <none> 130028 130210
## - USuccessDensity 1 130068 130237
## - USDDensity 1 130315 130483
## - CardBrand 5 130479 130596
## - UserTenure 1 132941 133110
## - CardType 4 138814 138943
##
## Step: AIC=130200.5
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity
##
## Df Deviance AIC
## <none> 130032 130201
## - USuccessDensity 1 130070 130226
## - CardBrand 5 130485 130588
## - USDDensity 1 130452 130607
## - UserTenure 1 133008 133164
## - CardType 4 138913 139030
summary(OrderUserRedModel)
##
## Call:
## glm(formula = Churn ~ CardBrand + CardType + UserTenure + USuccessDensity +
## USDDensity, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0978 -0.3115 -0.2394 -0.1629 3.7900
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1417414 0.0290210 -108.258 < 2e-16 ***
## CardBrandAMZ_FPS -1.5691952 0.1235544 -12.700 < 2e-16 ***
## CardBrandAX -0.5068364 0.0528550 -9.589 < 2e-16 ***
## CardBrandDI -0.1822290 0.0824601 -2.210 0.0271 *
## CardBrandMC 0.0401433 0.0185562 2.163 0.0305 *
## CardBrandPAYPAL 0.4789792 0.0490312 9.769 < 2e-16 ***
## CardTypeNone 3.2268086 0.0554548 58.188 < 2e-16 ***
## CardTypeDebit 0.6365736 0.0244135 26.075 < 2e-16 ***
## CardTypePrepaid 2.3946461 0.0294876 81.209 < 2e-16 ***
## CardType6 - Charge Card -5.1082314 43.9539580 -0.116 0.9075
## UserTenure -0.0258568 0.0005406 -47.826 < 2e-16 ***
## USuccessDensity -0.1089399 0.0190431 -5.721 1.06e-08 ***
## USDDensity 0.4070114 0.0200980 20.251 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 150383 on 423543 degrees of freedom
## Residual deviance: 130032 on 423531 degrees of freedom
## AIC: 130058
##
## Number of Fisher Scoring iterations: 7
# Note Deviance measures and compute Pseudo (Deviance) R2
# (Null Deviance - Residual Deviance)/(Null Deviance)
# Exponentiate and interpret logistic coefficients
# Multiplicative change in odds per unit change in predictor
LogisticCoef <- coef(summary(OrderUserRedModel))
ExpCoef <- (exp(LogisticCoef[,"Estimate"]))
NeatCoef <- round(cbind(LogisticCoef, ExpCoef), digits = 6)
NeatCoef
## Estimate Std. Error z value Pr(>|z|) ExpCoef
## (Intercept) -3.141741 0.029021 -108.257654 0.000000 0.043207
## CardBrandAMZ_FPS -1.569195 0.123554 -12.700436 0.000000 0.208213
## CardBrandAX -0.506836 0.052855 -9.589186 0.000000 0.602398
## CardBrandDI -0.182229 0.082460 -2.209906 0.027112 0.833410
## CardBrandMC 0.040143 0.018556 2.163333 0.030516 1.040960
## CardBrandPAYPAL 0.478979 0.049031 9.768858 0.000000 1.614425
## CardTypeNone 3.226809 0.055455 58.188046 0.000000 25.199107
## CardTypeDebit 0.636574 0.024414 26.074624 0.000000 1.889994
## CardTypePrepaid 2.394646 0.029488 81.208676 0.000000 10.964317
## CardType6 - Charge Card -5.108231 43.953958 -0.116218 0.907480 0.006047
## UserTenure -0.025857 0.000541 -47.826368 0.000000 0.974475
## USuccessDensity -0.108940 0.019043 -5.720702 0.000000 0.896784
## USDDensity 0.407011 0.020098 20.251351 0.000000 1.502321
# Compute and examine training-sample predictions
# for reduced logistic --
# Logistic Prob, Classification
LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTrain, type = "response")
# Classification analysis, training sample
Threshold <- 0.5
LogisticClass <- rep(0, TrainRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTrain$Churn)
Confusion
##
## LogisticClass 0 1
## 0 404863 18175
## 1 446 60
mean(LogisticClass == ChurnTrain$Churn)
## [1] 0.9560353
# Lift Chart
# What fraction of churn is captured in the top x fraction of churn scores?
ScoreFrac <- (1:TrainRows - 1)/(TrainRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTrain$Churn[ranks])/sum(ChurnTrain$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
main = "Churn Lift Chart -- Train",
xlab = "Fraction of Top Churn Scores",
ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
# Using performance function from ROCR package
plotpred <- prediction(LogisticProb, ChurnTrain$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Train", colorize = TRUE)

# ROC replotted using qplot (ggplot2)
# Extract plot variables fpr, tpr, threshold/cutoff
# from plotroc object created via performace function above
fpr <- plotroc@x.values[[1]] # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]] # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]] # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom = "line", color = Cutoff,
main = "ROC Chart -- Train",
xlab = "False Positive Rate",
ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package
confusionMatrix(LogisticClass,ChurnTrain$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 404863 18175
## 1 446 60
##
## Accuracy : 0.956
## 95% CI : (0.9554, 0.9567)
## No Information Rate : 0.9569
## P-Value [Acc > NIR] : 0.9982
##
## Kappa : 0.0041
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0032904
## Specificity : 0.9988996
## Pos Pred Value : 0.1185771
## Neg Pred Value : 0.9570370
## Prevalence : 0.0430534
## Detection Rate : 0.0001417
## Detection Prevalence : 0.0011947
## Balanced Accuracy : 0.5010950
##
## 'Positive' Class : 1
##
# Add predictions to data set, save to .csv
ChurnTrain$LogisticProb <- LogisticProb
ChurnTrain$LogisticClass <- LogisticClass
head(ChurnTrain, n = 3)
## TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 2 Train 0 2003-11-15 2003-11-23 2013-12-24 2007-11-29 2999-01-01
## 3 Train 0 2010-06-01 2010-06-01 2013-05-09 2010-06-01 2014-01-07
## 5 Train 0 2004-07-30 2004-08-07 <NA> 2013-01-31 2999-01-01
## CancelDate CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 2 2999-01-01 Current Sub RTG 1 94127 CA
## 3 2999-01-01 Current Sub PREM 1 18015 PA
## 5 2999-01-01 Current Sub PREM 1 96737 HI
## EcommOrderId OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 2 367201564 2013-11-01 Credit CC VI VI
## 3 367275211 2013-11-02 Credit CC VI VI
## 5 367488848 2013-11-08 Credit CC MC MC
## CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 2 1 - Credit VI Credit 14.99 1 25 25 374.75
## 3 3 - Check VI Debit 9.99 1 25 25 249.75
## 5 1 - Credit MC Credit 10.35 2 26 26 269.10
## UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 2 2011-05-01 2013-09-01 0 25 0 0
## 3 2011-06-02 2013-09-02 0 25 0 0
## 5 2011-05-08 2013-10-08 0 26 0 0
## UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 2 25 0 0 0 0 0 0
## 3 25 0 0 0 0 0 0
## 5 26 0 0 0 0 0 0
## UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 2 0 0 0 0 0 121 29
## 3 0 0 0 0 0 42 28
## 5 0 0 0 0 0 112 30
## USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 2 0.8620690 0 0 0 0.001718937 0
## 3 0.8928571 0 0 0 0.024401144 0
## 5 0.8666667 0 0 0 0.002255839 0
write.csv(ChurnTrain, file = ChurnTrainAnalysis)
# MODEL EVALUATION USING TEST DATA
# Generate test set predictions --
# Logistic Prob, Classification
LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTest, type = "response")
# Classification analysis, Testing sample
Threshold <- 0.5
LogisticClass <- rep(0, TestRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTest$Churn)
Confusion
##
## LogisticClass 0 1
## 0 101181 4580
## 1 115 10
mean(LogisticClass == ChurnTest$Churn)
## [1] 0.9556599
# Lift Chart
# What fraction of churn is captured in the top x fraction of churn scores?
ScoreFrac <- (1:TestRows - 1)/(TestRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTest$Churn[ranks])/sum(ChurnTest$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
main = "Churn Lift Chart -- Test",
xlab = "Fraction of Top Churn Scores",
ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
# Using performance function from ROCR package
plotpred <- prediction(LogisticProb, ChurnTest$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Test", colorize = TRUE)

# ROC replotted using qplot (ggplot2)
# Extract plot variables fpr, tpr, threshold/cutoff
# from plotroc object created via performace function above
fpr <- plotroc@x.values[[1]] # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]] # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]] # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom="line", color= Cutoff,
main = "ROC Chart -- Test",
xlab = "False Positive Rate",
ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package
confusionMatrix(LogisticClass,ChurnTest$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 101181 4580
## 1 115 10
##
## Accuracy : 0.9557
## 95% CI : (0.9544, 0.9569)
## No Information Rate : 0.9567
## P-Value [Acc > NIR] : 0.9439
##
## Kappa : 0.0019
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 2.179e-03
## Specificity : 9.989e-01
## Pos Pred Value : 8.000e-02
## Neg Pred Value : 9.567e-01
## Prevalence : 4.335e-02
## Detection Rate : 9.444e-05
## Detection Prevalence : 1.181e-03
## Balanced Accuracy : 5.005e-01
##
## 'Positive' Class : 1
##
# Add predictions to data set, save to .csv
ChurnTest$LogisticProb <- LogisticProb
ChurnTest$LogisticClass <- LogisticClass
head(ChurnTest, n = 3)
## TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1 Test 0 2009-05-10 2009-05-25 2013-12-24 2009-05-10 2999-01-01
## 4 Test 0 2012-02-10 2012-02-10 2014-01-18 2012-02-10 2999-01-01
## 15 Test 0 2003-09-26 2003-11-04 <NA> 2013-01-31 2999-01-01
## CancelDate CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 1 2999-01-01 Current Sub RTG 1 44333 OH
## 4 2999-01-01 Current Sub PREM 1 19426 PA
## 15 2999-01-01 Current Sub PREM 1 26241 WV
## EcommOrderId OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 1 367726684 2013-11-16 Credit CC VI VI
## 4 367578619 2013-11-11 Credit CC AX None
## 15 367455971 2013-11-07 Credit CC MC MC
## CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 1 1 - Credit VI Credit 14.99 1 26 34 389.74
## 4 0 - None AX Credit 9.99 1 16 16 159.84
## 15 1 - Credit MC Credit 9.99 2 29 32 349.71
## UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 1 2011-04-26 2013-10-16 0 26 0 0
## 4 2012-03-11 2013-10-11 0 16 0 0
## 15 2011-05-01 2013-10-07 0 29 0 0
## UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 1 26 2 0 0 2 0 0
## 4 16 0 0 0 0 0 0
## 15 29 1 0 1 0 0 0
## UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 1 0 0 0 0 0 55 31
## 4 0 0 0 0 0 22 20
## 15 0 0 0 0 0 122 30
## USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 1 0.8387097 0.06451613 0 0 0.009670326 0
## 4 0.8000000 0.00000000 0 0 0.013326569 0
## 15 0.9666667 0.03333333 0 0 0.001747413 0
write.table(ChurnTest, file = ChurnTestAnalysis)
########### END OF ANALYSIS