options(width=100)

# Starter data preparation script for Rhapsody Churn Modeling

library(ggplot2) # for improved plots -- qplot and scatterplot matrix
library(GGally) # for scatterplot matrix
library(RcmdrMisc) # for numSummary, stepwise
library(ROCR) # for producing ROC and other prediction performance plots
library(caret) # for confusion matrix analysis
library(plyr) # for mapvalues recoding function
library(dplyr) # select function for subsetting data frames

# Suggested usage:
#   Copy the required data file, Rhapsody2016CPTOrderSummary.csv into a 
#     new project folder <folder>
#   Change the working directory (setwd below) to point to the project <folder>
#   Run the entire script as a block

# Set up working directory for file locations
#   and load required libraries

ChurnDir <- "D:/MBA/SecondTerm/Data/exam"
ChurnData <- "Rhapsody2017CPTOrderSummary-010.csv"
ChurnPrepped <- "ChurnFullPrepped-010.RData"

# DateFormat <- "%m/%d/%Y"
DateFormat <- "%Y-%m-%d"

########### BEGIN DATA PREPARATION

setwd(ChurnDir)

# DATA ENTRY AND SUMMARIZATION

# Read full data set, list -- names, structure, head,

ChurnFull <- read.csv(ChurnData)
AllRows <- nrow(ChurnFull)
str(ChurnFull, list.len = 999)
## 'data.frame':    5156 obs. of  109 variables:
##  $ UserPartyId            : num  2273929 2637516 3268169 5027183 5580324 ...
##  $ UserOrigSignupDt       : Factor w/ 1983 levels "2002-06-07","2002-08-06",..: 51 599 460 320 1358 750 58 1749 678 74 ...
##  $ SubscriberServiceId    : num  1.76e+08 1.33e+08 3.11e+08 8.36e+07 3.07e+08 ...
##  $ FirstBillDt            : Factor w/ 1977 levels "2002-08-06","2002-10-02",..: 49 603 465 311 1359 749 58 1721 680 71 ...
##  $ LastPlayDt             : Factor w/ 602 levels "","2005-01-17",..: 598 556 602 602 1 574 574 175 602 602 ...
##  $ SignUpDt               : Factor w/ 1580 levels "2005-05-05","2005-05-09",..: 394 266 987 91 948 353 1303 1346 310 1034 ...
##  $ StopRequestDt          : Factor w/ 87 levels "2010-08-05","2011-01-19",..: 87 87 87 87 87 87 87 87 87 87 ...
##  $ CancelDt               : Factor w/ 76 levels "2013-11-02","2013-11-04",..: 76 69 76 76 76 76 76 76 76 76 ...
##  $ CancelType             : Factor w/ 4 levels "Current Sub",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ CancelReason           : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 4 1 1 1 1 1 1 1 1 ...
##  $ ChannelGroupName       : Factor w/ 15 levels "Cable Partners",..: 13 8 2 2 8 6 14 1 6 10 ...
##  $ ChannelName            : Factor w/ 75 levels "Affiliate","Android Mobile",..: 7 56 57 57 56 2 67 9 22 25 ...
##  $ CobrandCode            : num  40132 40134 40134 40134 40134 ...
##  $ CobrandName            : Factor w/ 23 levels "ATT","AudioGalaxy",..: 3 18 18 18 18 18 11 4 18 10 ...
##  $ OriginCode             : Factor w/ 210 levels "3x5_rhapcom",..: 23 154 30 144 70 70 69 31 166 67 ...
##  $ pcode                  : Factor w/ 160 levels ".","30sec","30sec_hp",..: 1 108 136 136 127 52 1 23 63 1 ...
##  $ cpath                  : Factor w/ 122 levels ".","0","12_19Bpage",..: 1 1 81 81 61 14 1 61 14 1 ...
##  $ rsrc                   : Factor w/ 569 levels ".","0","0508",..: 1 1 262 250 411 70 1 541 371 1 ...
##  $ opage                  : Factor w/ 155 levels "","-","404__404",..: 1 1 68 68 1 48 1 1 1 1 ...
##  $ ServiceTier            : Factor w/ 4 levels "PREM","RHAP",..: 1 4 4 4 1 4 1 4 4 4 ...
##  $ MonthsPerBill          : num  1 1 1 1 1 1 3 1 1 1 ...
##  $ BillingZipCode         : num  75098 52402 60542 80227 79424 ...
##  $ BillingState           : Factor w/ 56 levels "","7th fl","AE",..: 48 16 18 9 48 48 24 12 8 47 ...
##  $ EcommOrderId           : num  3.68e+08 3.67e+08 3.68e+08 3.67e+08 3.68e+08 ...
##  $ OrderDate              : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 25 1 11 7 13 24 14 16 27 3 ...
##  $ PaymentTypeName        : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTypeCode        : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentSubTypeCode     : Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 6 6 6 6 6 2 2 ...
##  $ CPTCardBrand           : Factor w/ 3 levels "MC","None","VI": 3 3 3 3 3 3 3 3 2 2 ...
##  $ CPTCardType            : Factor w/ 6 levels "0 - None","1 - Credit",..: 2 2 4 2 2 4 2 2 1 1 ...
##  $ CPTBIN                 : num  410413 414720 443045 438857 441712 ...
##  $ CPTTransactionDivision : num  193099 193099 193099 193099 193099 ...
##  $ PaymentTxTypeCode      : Factor w/ 1 level "BC": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTxStatus        : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PmtStatType            : Factor w/ 1 level "COMPLETE": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PT_status              : Factor w/ 18 levels "00","04","05",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AVS_status             : Factor w/ 12 levels "3","6","7","9",..: 7 7 7 7 7 6 7 7 5 5 ...
##  $ SEC_status             : Factor w/ 2 levels "S","UNK": 2 2 2 2 2 2 2 2 2 2 ...
##  $ RiskCode               : Factor w/ 1 level "UNK": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentProcessorCode   : Factor w/ 4 levels "AMZ","PPL","PT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentProcessorDesc   : Factor w/ 4 levels "","Amazon FPS",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentDivisionCode    : Factor w/ 4 levels "193099","203217",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentDivisionDesc    : Factor w/ 5 levels "","Amazon FPS non-recurring default",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ PaymentDivisionUseType : Factor w/ 3 levels "","N","R": 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentDivisionRiskType: Factor w/ 2 levels "","NORMALRISK": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Country                : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Currency               : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Amount                 : num  10.8 15 15 15 10.8 ...
##  $ OrderStatus            : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Churn                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PayAttempts            : num  1 8 1 1 1 1 1 1 1 4 ...
##  $ OrdSucces              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ OrdSftDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OrdHrdDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PaySuccess             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ PaySftDec              : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ PayHrdDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat89               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat05               : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ PTStat14               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat56               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat52               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat12               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat33               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat41               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat04               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubOrderCount          : num  29 24 14 26 19 27 2 5 24 16 ...
##  $ SubPayAttempts         : num  29 26 15 26 19 27 2 5 24 16 ...
##  $ SubAmount              : num  313 360 210 390 205 ...
##  $ SubFirstOrderDate      : Factor w/ 911 levels "2011-04-21","2011-04-22",..: 5 4 328 48 275 33 626 717 37 375 ...
##  $ SubLastOrderDate       : Factor w/ 139 levels "2012-10-29","2012-11-03",..: 104 80 90 86 92 103 16 95 76 82 ...
##  $ SubChurns              : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ SubOrdSucces           : num  29 24 13 26 19 26 2 5 24 15 ...
##  $ SubOrdSftDec           : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ SubOrdHrdDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPaySuccess          : num  29 24 13 26 19 26 2 5 24 15 ...
##  $ SubPaySftDec           : num  0 2 2 0 0 1 0 0 0 1 ...
##  $ SubPayHrdDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat89            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat05            : num  0 2 0 0 0 1 0 0 0 1 ...
##  $ SubPTStat14            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat56            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ SubPTStat52            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat12            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ SubPTStat33            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat41            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat04            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubCount               : num  1 1 2 1 1 1 2 3 1 3 ...
##  $ UserOrderCount         : num  29 24 23 26 19 27 7 25 24 22 ...
##  $ UserPayAttempts        : num  29 26 25 26 19 27 10 39 24 23 ...
##  $ UserAmount             : num  313 360 300 390 205 ...
##  $ UserFirstOrderDate     : Factor w/ 888 levels "2011-04-21","2011-04-22",..: 5 4 30 48 277 33 18 16 37 12 ...
##  $ UserLastOrderDate      : Factor w/ 191 levels "2011-07-12","2011-10-28",..: 156 132 142 138 144 155 50 147 128 134 ...
##  $ UserChurns             : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ UserOrdSucces          : num  29 24 22 26 19 26 7 24 24 21 ...
##  $ UserOrdSftDec          : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ UserOrdHrdDec          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPaySuccess         : num  29 24 22 26 19 26 7 24 24 21 ...
##  $ UserPaySftDec          : num  0 2 2 0 0 1 1 2 0 1 ...
##  $ UserPayHrdDec          : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ UserPTStat89           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat05           : num  0 2 0 0 0 1 0 2 0 1 ...
##  $ UserPTStat14           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat56           : num  0 0 1 0 0 0 1 0 0 0 ...
##  $ UserPTStat52           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat12           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ UserPTStat33           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat41           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ UserPTStat04           : num  0 0 0 0 0 0 0 0 0 0 ...
head(ChurnFull, n = 3)
##   UserPartyId UserOrigSignupDt SubscriberServiceId FirstBillDt LastPlayDt   SignUpDt StopRequestDt
## 1     2273929       2004-05-31           176406097  2004-06-15 2014-01-18 2010-04-24    2999-01-01
## 2     2637516       2009-03-08           132608865  2009-03-23 2013-12-07 2009-03-08    2999-01-01
## 3     3268169       2008-05-13           311201787  2008-05-28 2014-01-22 2012-03-08    2999-01-01
##     CancelDt  CancelType              CancelReason    ChannelGroupName ChannelName CobrandCode
## 1 2999-01-01 Current Sub                               Retail Partners       BBDMS       40132
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE MP3 Player Partners     Sandisk       40134
## 3 2999-01-01 Current Sub                              Direct Marketing      Search       40134
##    CobrandName  OriginCode  pcode cpath         rsrc                      opage ServiceTier
## 1     Best Buy     bestbuy      .     .            .                                   PREM
## 2 RealRhapsody sandiskfuze     rn     .            .                                    RTG
## 3 RealRhapsody          cj srchrv ppcse gg_ru_rhp_14 offer_rhap_rhapDMMMainPage         RTG
##   MonthsPerBill BillingZipCode BillingState EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode
## 1             1          75098           TX    368004554 2013-11-25          Credit              CC
## 2             1          52402           IA    367205302 2013-11-01          Credit              CC
## 3             1          60542           IL    367578873 2013-11-11          Credit              CC
##   PaymentSubTypeCode CPTCardBrand CPTCardType CPTBIN CPTTransactionDivision PaymentTxTypeCode
## 1                 VI           VI  1 - Credit 410413                 193099                BC
## 2                 VI           VI  1 - Credit 414720                 193099                BC
## 3                 VI           VI   3 - Check 443045                 193099                BC
##   PaymentTxStatus PmtStatType PT_status AVS_status SEC_status RiskCode PaymentProcessorCode
## 1         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
## 2         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
## 3         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
##   PaymentProcessorDesc PaymentDivisionCode               PaymentDivisionDesc PaymentDivisionUseType
## 1          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
## 2          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
## 3          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
##   PaymentDivisionRiskType Country Currency Amount OrderStatus Churn PayAttempts OrdSucces OrdSftDec
## 1              NORMALRISK      US      USD  10.81     SUCCESS     0           1         1         0
## 2              NORMALRISK      US      USD  14.99     SUCCESS     0           8         1         0
## 3              NORMALRISK      US      USD  14.99     SUCCESS     0           1         1         0
##   OrdHrdDec PaySuccess PaySftDec PayHrdDec PTStat89 PTStat05 PTStat14 PTStat56 PTStat52 PTStat12
## 1         0          1         0         0        0        0        0        0        0        0
## 2         0          1         1         0        0        1        0        0        0        0
## 3         0          1         0         0        0        0        0        0        0        0
##   PTStat33 PTStat41 PTStat04 SubOrderCount SubPayAttempts SubAmount SubFirstOrderDate
## 1        0        0        0            29             29    313.49        2011-04-25
## 2        0        0        0            24             26    359.76        2011-04-24
## 3        0        0        0            14             15    209.86        2012-04-09
##   SubLastOrderDate SubChurns SubOrdSucces SubOrdSftDec SubOrdHrdDec SubPaySuccess SubPaySftDec
## 1       2013-10-25         0           29            0            0            29            0
## 2       2013-10-01         0           24            0            0            24            2
## 3       2013-10-11         1           13            1            0            13            2
##   SubPayHrdDec SubPTStat89 SubPTStat05 SubPTStat14 SubPTStat56 SubPTStat52 SubPTStat12 SubPTStat33
## 1            0           0           0           0           0           0           0           0
## 2            0           0           2           0           0           0           0           0
## 3            0           0           0           0           1           0           1           0
##   SubPTStat41 SubPTStat04 SubCount UserOrderCount UserPayAttempts UserAmount UserFirstOrderDate
## 1           0           0        1             29              29     313.49         2011-04-25
## 2           0           0        1             24              26     359.76         2011-04-24
## 3           0           0        2             23              25     299.77         2011-05-20
##   UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess
## 1        2013-10-25          0            29             0             0             29
## 2        2013-10-01          0            24             0             0             24
## 3        2013-10-11          1            22             1             0             22
##   UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52
## 1             0             0            0            0            0            0            0
## 2             2             0            0            2            0            0            0
## 3             2             1            0            0            0            1            0
##   UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## 1            0            0            0            0
## 2            0            0            0            0
## 3            1            0            1            0
# Data Understanding
#   A lot of variables!
#   Perhaps use pivot tables and/or Tableau to build on earlier work 

# Clean up card brand / card type data, add to database
#   Look at distribution of card brand and type in CPT data

table(ChurnFull$CPTCardBrand, ChurnFull$CPTCardType)
##       
##        0 - None 1 - Credit 2 - Debit 3 - Check 5 - Pre-Paid 7 - European Deferred Debit Card
##   MC          0        583       605         0           52                                0
##   None      830          0         0         0            0                                0
##   VI          0        859         0      2095          131                                1
#   Set CardBrand
#     Set CardBrand defailt to PaymentSubType value from original Order Summary
#     Replace PaymentSubType value with CPTCardBrand if present (not None)

ChurnFull$CardBrand <- ChurnFull$PaymentSubTypeCode
ChurnFull$CardBrand[ChurnFull$CPTCardBrand != "None"] <- 
  ChurnFull$CPTCardBrand[ChurnFull$CPTCardBrand != "None"]

#   Clean up CardType
#     Remove leading numbers from codes
#     Map Check to Debit
#     Fill in CardType (as Credit) for credit-only CardBrands -- AX, DI, PAYPAL

ChurnFull$CardType <- mapvalues(ChurnFull$CPTCardType, 
                      c("0 - None", "1 - Credit", "2 - Debit", "3 - Check", 
                        "5 - Pre-Paid", "7 - European Deferred Debit Card"), 
                      c("None", "Credit", "Debit", "Debit", "Prepaid", "Debit"))
ChurnFull$CardType[ChurnFull$CardBrand == "AX"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "DI"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "PAYPAL"] <- "Credit"

#   Check newly created CardBrand and CardType

table(ChurnFull$CardBrand, ChurnFull$CardType)
##          
##           None Credit Debit Prepaid
##   AMZ_FPS    5      0     0       0
##   AX         0    503     0       0
##   DI         0    138     0       0
##   MC         5    583   605      52
##   PAYPAL     0    162     0       0
##   VI        17    859  2096     131
# Set reference levels for CardBrand and CardType
#   Sets VISA and Credit as the omitted factor levels in models
#   CardBrand and CardType estimates are then interpretted relative to 
#     these reference levels

ChurnFull$CardBrand <- relevel(ChurnFull$CardBrand, ref = "VI")
ChurnFull$CardType <- relevel(ChurnFull$CardType, ref = "Credit")

# Add User history metrics
#   Convert date strings from factor to date type
#   Tenure, OrderDensity, Payment Friction, ChurnDensity

ChurnFull$UserOrigSignupDate <- 
  as.Date(ChurnFull$UserOrigSignupDt, format = DateFormat)
ChurnFull$FirstBillDate <- 
  as.Date(ChurnFull$FirstBillDt, format = DateFormat)
ChurnFull$LastPlayDate <- 
  as.Date(ChurnFull$LastPlayDt, format = DateFormat)
ChurnFull$SignUpDate <- 
  as.Date(ChurnFull$SignUpDt, format = DateFormat)
ChurnFull$StopRequestDate <- 
  as.Date(ChurnFull$StopRequestDt, format = DateFormat)
ChurnFull$CancelDate <- 
  as.Date(ChurnFull$CancelDt, format = DateFormat)
ChurnFull$OrdDate <- 
  as.Date(ChurnFull$OrderDate, format = DateFormat)
ChurnFull$UserFirstOrdDate <- 
  as.Date(ChurnFull$UserFirstOrderDate, format = DateFormat)
ChurnFull$UserLastOrdDate <- 
  as.Date(ChurnFull$UserLastOrderDate, format = DateFormat)

#   Calculated date features

ChurnFull$UserTenure <- 
  as.numeric(round((ChurnFull$OrdDate - ChurnFull$UserOrigSignupDate)/365.25*12 + 1))
ChurnFull$UserWindow <- 
  as.numeric(round((ChurnFull$UserLastOrdDate - 
                      ChurnFull$UserFirstOrdDate)/365.25*12 + 1))
ChurnFull$USuccessDensity <- 
  (ChurnFull$UserPaySuccess * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$USDDensity <- 
  (ChurnFull$UserPaySftDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UHDDensity <- 
  (ChurnFull$UserPayHrdDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UChurnDensity <- 
  (ChurnFull$UserChurns * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow

# Randomly split observations 80/20 into training and test samples
#   Generate sample of indices

set.seed(999999999)
TrainSize <- round(0.8 * AllRows)
TrainIdx <- sample(AllRows, TrainSize)

#   Label observations as Train or Test
#   Add Train/Test labels to data base

TrainTest <- rep("Test", AllRows)
TrainTest[TrainIdx] <- "Train"
ChurnFull$TrainTest <- TrainTest

# Save prepped data to .Rdata file for subsequent analysis

save("ChurnFull", file = ChurnPrepped)

########### END OF DATA PREPARATION