options(width=100)
# Starter data preparation script for Rhapsody Churn Modeling
library(ggplot2) # for improved plots -- qplot and scatterplot matrix
library(GGally) # for scatterplot matrix
library(RcmdrMisc) # for numSummary, stepwise
library(ROCR) # for producing ROC and other prediction performance plots
library(caret) # for confusion matrix analysis
library(plyr) # for mapvalues recoding function
library(dplyr) # select function for subsetting data frames
# Suggested usage:
# Copy the required data file, Rhapsody2016CPTOrderSummary.csv into a
# new project folder <folder>
# Change the working directory (setwd below) to point to the project <folder>
# Run the entire script as a block
# Set up working directory for file locations
# and load required libraries
ChurnDir <- "D:/MBA/SecondTerm/Data/exam"
ChurnData <- "Rhapsody2017CPTOrderSummary-010.csv"
ChurnPrepped <- "ChurnFullPrepped-010.RData"
# DateFormat <- "%m/%d/%Y"
DateFormat <- "%Y-%m-%d"
########### BEGIN DATA PREPARATION
setwd(ChurnDir)
# DATA ENTRY AND SUMMARIZATION
# Read full data set, list -- names, structure, head,
ChurnFull <- read.csv(ChurnData)
AllRows <- nrow(ChurnFull)
str(ChurnFull, list.len = 999)
## 'data.frame': 5156 obs. of 109 variables:
## $ UserPartyId : num 2273929 2637516 3268169 5027183 5580324 ...
## $ UserOrigSignupDt : Factor w/ 1983 levels "2002-06-07","2002-08-06",..: 51 599 460 320 1358 750 58 1749 678 74 ...
## $ SubscriberServiceId : num 1.76e+08 1.33e+08 3.11e+08 8.36e+07 3.07e+08 ...
## $ FirstBillDt : Factor w/ 1977 levels "2002-08-06","2002-10-02",..: 49 603 465 311 1359 749 58 1721 680 71 ...
## $ LastPlayDt : Factor w/ 602 levels "","2005-01-17",..: 598 556 602 602 1 574 574 175 602 602 ...
## $ SignUpDt : Factor w/ 1580 levels "2005-05-05","2005-05-09",..: 394 266 987 91 948 353 1303 1346 310 1034 ...
## $ StopRequestDt : Factor w/ 87 levels "2010-08-05","2011-01-19",..: 87 87 87 87 87 87 87 87 87 87 ...
## $ CancelDt : Factor w/ 76 levels "2013-11-02","2013-11-04",..: 76 69 76 76 76 76 76 76 76 76 ...
## $ CancelType : Factor w/ 4 levels "Current Sub",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ CancelReason : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 4 1 1 1 1 1 1 1 1 ...
## $ ChannelGroupName : Factor w/ 15 levels "Cable Partners",..: 13 8 2 2 8 6 14 1 6 10 ...
## $ ChannelName : Factor w/ 75 levels "Affiliate","Android Mobile",..: 7 56 57 57 56 2 67 9 22 25 ...
## $ CobrandCode : num 40132 40134 40134 40134 40134 ...
## $ CobrandName : Factor w/ 23 levels "ATT","AudioGalaxy",..: 3 18 18 18 18 18 11 4 18 10 ...
## $ OriginCode : Factor w/ 210 levels "3x5_rhapcom",..: 23 154 30 144 70 70 69 31 166 67 ...
## $ pcode : Factor w/ 160 levels ".","30sec","30sec_hp",..: 1 108 136 136 127 52 1 23 63 1 ...
## $ cpath : Factor w/ 122 levels ".","0","12_19Bpage",..: 1 1 81 81 61 14 1 61 14 1 ...
## $ rsrc : Factor w/ 569 levels ".","0","0508",..: 1 1 262 250 411 70 1 541 371 1 ...
## $ opage : Factor w/ 155 levels "","-","404__404",..: 1 1 68 68 1 48 1 1 1 1 ...
## $ ServiceTier : Factor w/ 4 levels "PREM","RHAP",..: 1 4 4 4 1 4 1 4 4 4 ...
## $ MonthsPerBill : num 1 1 1 1 1 1 3 1 1 1 ...
## $ BillingZipCode : num 75098 52402 60542 80227 79424 ...
## $ BillingState : Factor w/ 56 levels "","7th fl","AE",..: 48 16 18 9 48 48 24 12 8 47 ...
## $ EcommOrderId : num 3.68e+08 3.67e+08 3.68e+08 3.67e+08 3.68e+08 ...
## $ OrderDate : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 25 1 11 7 13 24 14 16 27 3 ...
## $ PaymentTypeName : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTypeCode : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentSubTypeCode : Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 6 6 6 6 6 2 2 ...
## $ CPTCardBrand : Factor w/ 3 levels "MC","None","VI": 3 3 3 3 3 3 3 3 2 2 ...
## $ CPTCardType : Factor w/ 6 levels "0 - None","1 - Credit",..: 2 2 4 2 2 4 2 2 1 1 ...
## $ CPTBIN : num 410413 414720 443045 438857 441712 ...
## $ CPTTransactionDivision : num 193099 193099 193099 193099 193099 ...
## $ PaymentTxTypeCode : Factor w/ 1 level "BC": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentTxStatus : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PmtStatType : Factor w/ 1 level "COMPLETE": 1 1 1 1 1 1 1 1 1 1 ...
## $ PT_status : Factor w/ 18 levels "00","04","05",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AVS_status : Factor w/ 12 levels "3","6","7","9",..: 7 7 7 7 7 6 7 7 5 5 ...
## $ SEC_status : Factor w/ 2 levels "S","UNK": 2 2 2 2 2 2 2 2 2 2 ...
## $ RiskCode : Factor w/ 1 level "UNK": 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentProcessorCode : Factor w/ 4 levels "AMZ","PPL","PT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentProcessorDesc : Factor w/ 4 levels "","Amazon FPS",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentDivisionCode : Factor w/ 4 levels "193099","203217",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PaymentDivisionDesc : Factor w/ 5 levels "","Amazon FPS non-recurring default",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ PaymentDivisionUseType : Factor w/ 3 levels "","N","R": 3 3 3 3 3 3 3 3 3 3 ...
## $ PaymentDivisionRiskType: Factor w/ 2 levels "","NORMALRISK": 2 2 2 2 2 2 2 2 2 2 ...
## $ Country : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
## $ Currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Amount : num 10.8 15 15 15 10.8 ...
## $ OrderStatus : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Churn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PayAttempts : num 1 8 1 1 1 1 1 1 1 4 ...
## $ OrdSucces : num 1 1 1 1 1 1 1 1 1 1 ...
## $ OrdSftDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PaySuccess : num 1 1 1 1 1 1 1 1 1 1 ...
## $ PaySftDec : num 0 1 0 0 0 0 0 0 0 1 ...
## $ PayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat05 : num 0 1 0 0 0 0 0 0 0 1 ...
## $ PTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat56 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubOrderCount : num 29 24 14 26 19 27 2 5 24 16 ...
## $ SubPayAttempts : num 29 26 15 26 19 27 2 5 24 16 ...
## $ SubAmount : num 313 360 210 390 205 ...
## $ SubFirstOrderDate : Factor w/ 911 levels "2011-04-21","2011-04-22",..: 5 4 328 48 275 33 626 717 37 375 ...
## $ SubLastOrderDate : Factor w/ 139 levels "2012-10-29","2012-11-03",..: 104 80 90 86 92 103 16 95 76 82 ...
## $ SubChurns : num 0 0 1 0 0 1 0 0 0 1 ...
## $ SubOrdSucces : num 29 24 13 26 19 26 2 5 24 15 ...
## $ SubOrdSftDec : num 0 0 1 0 0 1 0 0 0 1 ...
## $ SubOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPaySuccess : num 29 24 13 26 19 26 2 5 24 15 ...
## $ SubPaySftDec : num 0 2 2 0 0 1 0 0 0 1 ...
## $ SubPayHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat05 : num 0 2 0 0 0 1 0 0 0 1 ...
## $ SubPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat56 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ SubPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat12 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ SubPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat41 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SubCount : num 1 1 2 1 1 1 2 3 1 3 ...
## $ UserOrderCount : num 29 24 23 26 19 27 7 25 24 22 ...
## $ UserPayAttempts : num 29 26 25 26 19 27 10 39 24 23 ...
## $ UserAmount : num 313 360 300 390 205 ...
## $ UserFirstOrderDate : Factor w/ 888 levels "2011-04-21","2011-04-22",..: 5 4 30 48 277 33 18 16 37 12 ...
## $ UserLastOrderDate : Factor w/ 191 levels "2011-07-12","2011-10-28",..: 156 132 142 138 144 155 50 147 128 134 ...
## $ UserChurns : num 0 0 1 0 0 1 0 1 0 1 ...
## $ UserOrdSucces : num 29 24 22 26 19 26 7 24 24 21 ...
## $ UserOrdSftDec : num 0 0 1 0 0 1 0 1 0 1 ...
## $ UserOrdHrdDec : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPaySuccess : num 29 24 22 26 19 26 7 24 24 21 ...
## $ UserPaySftDec : num 0 2 2 0 0 1 1 2 0 1 ...
## $ UserPayHrdDec : num 0 0 1 0 0 0 0 0 0 1 ...
## $ UserPTStat89 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat05 : num 0 2 0 0 0 1 0 2 0 1 ...
## $ UserPTStat14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat56 : num 0 0 1 0 0 0 1 0 0 0 ...
## $ UserPTStat52 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat12 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ UserPTStat33 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UserPTStat41 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ UserPTStat04 : num 0 0 0 0 0 0 0 0 0 0 ...
head(ChurnFull, n = 3)
## UserPartyId UserOrigSignupDt SubscriberServiceId FirstBillDt LastPlayDt SignUpDt StopRequestDt
## 1 2273929 2004-05-31 176406097 2004-06-15 2014-01-18 2010-04-24 2999-01-01
## 2 2637516 2009-03-08 132608865 2009-03-23 2013-12-07 2009-03-08 2999-01-01
## 3 3268169 2008-05-13 311201787 2008-05-28 2014-01-22 2012-03-08 2999-01-01
## CancelDt CancelType CancelReason ChannelGroupName ChannelName CobrandCode
## 1 2999-01-01 Current Sub Retail Partners BBDMS 40132
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE MP3 Player Partners Sandisk 40134
## 3 2999-01-01 Current Sub Direct Marketing Search 40134
## CobrandName OriginCode pcode cpath rsrc opage ServiceTier
## 1 Best Buy bestbuy . . . PREM
## 2 RealRhapsody sandiskfuze rn . . RTG
## 3 RealRhapsody cj srchrv ppcse gg_ru_rhp_14 offer_rhap_rhapDMMMainPage RTG
## MonthsPerBill BillingZipCode BillingState EcommOrderId OrderDate PaymentTypeName PaymentTypeCode
## 1 1 75098 TX 368004554 2013-11-25 Credit CC
## 2 1 52402 IA 367205302 2013-11-01 Credit CC
## 3 1 60542 IL 367578873 2013-11-11 Credit CC
## PaymentSubTypeCode CPTCardBrand CPTCardType CPTBIN CPTTransactionDivision PaymentTxTypeCode
## 1 VI VI 1 - Credit 410413 193099 BC
## 2 VI VI 1 - Credit 414720 193099 BC
## 3 VI VI 3 - Check 443045 193099 BC
## PaymentTxStatus PmtStatType PT_status AVS_status SEC_status RiskCode PaymentProcessorCode
## 1 SUCCESS COMPLETE 00 C UNK UNK PT
## 2 SUCCESS COMPLETE 00 C UNK UNK PT
## 3 SUCCESS COMPLETE 00 C UNK UNK PT
## PaymentProcessorDesc PaymentDivisionCode PaymentDivisionDesc PaymentDivisionUseType
## 1 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## 2 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## 3 PaymentTech 193099 Paymentech Recurring for Rhapsody R
## PaymentDivisionRiskType Country Currency Amount OrderStatus Churn PayAttempts OrdSucces OrdSftDec
## 1 NORMALRISK US USD 10.81 SUCCESS 0 1 1 0
## 2 NORMALRISK US USD 14.99 SUCCESS 0 8 1 0
## 3 NORMALRISK US USD 14.99 SUCCESS 0 1 1 0
## OrdHrdDec PaySuccess PaySftDec PayHrdDec PTStat89 PTStat05 PTStat14 PTStat56 PTStat52 PTStat12
## 1 0 1 0 0 0 0 0 0 0 0
## 2 0 1 1 0 0 1 0 0 0 0
## 3 0 1 0 0 0 0 0 0 0 0
## PTStat33 PTStat41 PTStat04 SubOrderCount SubPayAttempts SubAmount SubFirstOrderDate
## 1 0 0 0 29 29 313.49 2011-04-25
## 2 0 0 0 24 26 359.76 2011-04-24
## 3 0 0 0 14 15 209.86 2012-04-09
## SubLastOrderDate SubChurns SubOrdSucces SubOrdSftDec SubOrdHrdDec SubPaySuccess SubPaySftDec
## 1 2013-10-25 0 29 0 0 29 0
## 2 2013-10-01 0 24 0 0 24 2
## 3 2013-10-11 1 13 1 0 13 2
## SubPayHrdDec SubPTStat89 SubPTStat05 SubPTStat14 SubPTStat56 SubPTStat52 SubPTStat12 SubPTStat33
## 1 0 0 0 0 0 0 0 0
## 2 0 0 2 0 0 0 0 0
## 3 0 0 0 0 1 0 1 0
## SubPTStat41 SubPTStat04 SubCount UserOrderCount UserPayAttempts UserAmount UserFirstOrderDate
## 1 0 0 1 29 29 313.49 2011-04-25
## 2 0 0 1 24 26 359.76 2011-04-24
## 3 0 0 2 23 25 299.77 2011-05-20
## UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess
## 1 2013-10-25 0 29 0 0 29
## 2 2013-10-01 0 24 0 0 24
## 3 2013-10-11 1 22 1 0 22
## UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52
## 1 0 0 0 0 0 0 0
## 2 2 0 0 2 0 0 0
## 3 2 1 0 0 0 1 0
## UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## 1 0 0 0 0
## 2 0 0 0 0
## 3 1 0 1 0
# Data Understanding
# A lot of variables!
# Perhaps use pivot tables and/or Tableau to build on earlier work
# Clean up card brand / card type data, add to database
# Look at distribution of card brand and type in CPT data
table(ChurnFull$CPTCardBrand, ChurnFull$CPTCardType)
##
## 0 - None 1 - Credit 2 - Debit 3 - Check 5 - Pre-Paid 7 - European Deferred Debit Card
## MC 0 583 605 0 52 0
## None 830 0 0 0 0 0
## VI 0 859 0 2095 131 1
# Set CardBrand
# Set CardBrand defailt to PaymentSubType value from original Order Summary
# Replace PaymentSubType value with CPTCardBrand if present (not None)
ChurnFull$CardBrand <- ChurnFull$PaymentSubTypeCode
ChurnFull$CardBrand[ChurnFull$CPTCardBrand != "None"] <-
ChurnFull$CPTCardBrand[ChurnFull$CPTCardBrand != "None"]
# Clean up CardType
# Remove leading numbers from codes
# Map Check to Debit
# Fill in CardType (as Credit) for credit-only CardBrands -- AX, DI, PAYPAL
ChurnFull$CardType <- mapvalues(ChurnFull$CPTCardType,
c("0 - None", "1 - Credit", "2 - Debit", "3 - Check",
"5 - Pre-Paid", "7 - European Deferred Debit Card"),
c("None", "Credit", "Debit", "Debit", "Prepaid", "Debit"))
ChurnFull$CardType[ChurnFull$CardBrand == "AX"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "DI"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "PAYPAL"] <- "Credit"
# Check newly created CardBrand and CardType
table(ChurnFull$CardBrand, ChurnFull$CardType)
##
## None Credit Debit Prepaid
## AMZ_FPS 5 0 0 0
## AX 0 503 0 0
## DI 0 138 0 0
## MC 5 583 605 52
## PAYPAL 0 162 0 0
## VI 17 859 2096 131
# Set reference levels for CardBrand and CardType
# Sets VISA and Credit as the omitted factor levels in models
# CardBrand and CardType estimates are then interpretted relative to
# these reference levels
ChurnFull$CardBrand <- relevel(ChurnFull$CardBrand, ref = "VI")
ChurnFull$CardType <- relevel(ChurnFull$CardType, ref = "Credit")
# Add User history metrics
# Convert date strings from factor to date type
# Tenure, OrderDensity, Payment Friction, ChurnDensity
ChurnFull$UserOrigSignupDate <-
as.Date(ChurnFull$UserOrigSignupDt, format = DateFormat)
ChurnFull$FirstBillDate <-
as.Date(ChurnFull$FirstBillDt, format = DateFormat)
ChurnFull$LastPlayDate <-
as.Date(ChurnFull$LastPlayDt, format = DateFormat)
ChurnFull$SignUpDate <-
as.Date(ChurnFull$SignUpDt, format = DateFormat)
ChurnFull$StopRequestDate <-
as.Date(ChurnFull$StopRequestDt, format = DateFormat)
ChurnFull$CancelDate <-
as.Date(ChurnFull$CancelDt, format = DateFormat)
ChurnFull$OrdDate <-
as.Date(ChurnFull$OrderDate, format = DateFormat)
ChurnFull$UserFirstOrdDate <-
as.Date(ChurnFull$UserFirstOrderDate, format = DateFormat)
ChurnFull$UserLastOrdDate <-
as.Date(ChurnFull$UserLastOrderDate, format = DateFormat)
# Calculated date features
ChurnFull$UserTenure <-
as.numeric(round((ChurnFull$OrdDate - ChurnFull$UserOrigSignupDate)/365.25*12 + 1))
ChurnFull$UserWindow <-
as.numeric(round((ChurnFull$UserLastOrdDate -
ChurnFull$UserFirstOrdDate)/365.25*12 + 1))
ChurnFull$USuccessDensity <-
(ChurnFull$UserPaySuccess * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$USDDensity <-
(ChurnFull$UserPaySftDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UHDDensity <-
(ChurnFull$UserPayHrdDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UChurnDensity <-
(ChurnFull$UserChurns * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
# Randomly split observations 80/20 into training and test samples
# Generate sample of indices
set.seed(999999999)
TrainSize <- round(0.8 * AllRows)
TrainIdx <- sample(AllRows, TrainSize)
# Label observations as Train or Test
# Add Train/Test labels to data base
TrainTest <- rep("Test", AllRows)
TrainTest[TrainIdx] <- "Train"
ChurnFull$TrainTest <- TrainTest
# Save prepped data to .Rdata file for subsequent analysis
save("ChurnFull", file = ChurnPrepped)
########### END OF DATA PREPARATION