options(width=100)

# Starter logistic regression analysis script for Rhapsody Churn Modeling
# Adapted from a Churn Exercise script file

library(ggplot2) # for improved plots -- qplot and scatterplot matrix
library(GGally) # for scatterplot matrix
library(RcmdrMisc) # for numSummary, stepwise
library(ROCR) # for producing ROC and other prediction performance plots
library(caret) # for confusion matrix analysis
library(plyr) # for mapvalues recoding function
library(dplyr) # select function for subsetting data frames

# Suggested usage:
#   Copy the required data file, Rhapsody2016CPTOrderSummary-010.csv into a 
#     new project folder <folder>
#   Change the working directory (setwd below) to point to the project <folder>
#   Run the entire script as a block

# Set up working directory for file locations
#   and load required libraries

ChurnDir <- "D:/MBA/SecondTerm/Data/exam"
ChurnData <- "Rhapsody2017CPTOrderSummary-010.csv"
ChurnPrepped <- "ChurnFullPrepped-010.RData"
ChurnTrainAnalysis <-  "ChurnTrainAnalysis-010.csv"
ChurnTestAnalysis <- "ChurnTestAnalysis-010.csv"

# DateFormat <- "%m/%d/%Y"
DateFormat <- "%Y-%m-%d"

########### BEGIN DATA PREPARATION

setwd(ChurnDir)

# DATA ENTRY AND SUMMARIZATION

# Read full data set, list -- names, structure, head,

ChurnFull <- read.csv(ChurnData)
AllRows <- nrow(ChurnFull)
str(ChurnFull, list.len = 999)
## 'data.frame':    5156 obs. of  109 variables:
##  $ UserPartyId            : num  2273929 2637516 3268169 5027183 5580324 ...
##  $ UserOrigSignupDt       : Factor w/ 1983 levels "2002-06-07","2002-08-06",..: 51 599 460 320 1358 750 58 1749 678 74 ...
##  $ SubscriberServiceId    : num  1.76e+08 1.33e+08 3.11e+08 8.36e+07 3.07e+08 ...
##  $ FirstBillDt            : Factor w/ 1977 levels "2002-08-06","2002-10-02",..: 49 603 465 311 1359 749 58 1721 680 71 ...
##  $ LastPlayDt             : Factor w/ 602 levels "","2005-01-17",..: 598 556 602 602 1 574 574 175 602 602 ...
##  $ SignUpDt               : Factor w/ 1580 levels "2005-05-05","2005-05-09",..: 394 266 987 91 948 353 1303 1346 310 1034 ...
##  $ StopRequestDt          : Factor w/ 87 levels "2010-08-05","2011-01-19",..: 87 87 87 87 87 87 87 87 87 87 ...
##  $ CancelDt               : Factor w/ 76 levels "2013-11-02","2013-11-04",..: 76 69 76 76 76 76 76 76 76 76 ...
##  $ CancelType             : Factor w/ 4 levels "Current Sub",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ CancelReason           : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 4 1 1 1 1 1 1 1 1 ...
##  $ ChannelGroupName       : Factor w/ 15 levels "Cable Partners",..: 13 8 2 2 8 6 14 1 6 10 ...
##  $ ChannelName            : Factor w/ 75 levels "Affiliate","Android Mobile",..: 7 56 57 57 56 2 67 9 22 25 ...
##  $ CobrandCode            : num  40132 40134 40134 40134 40134 ...
##  $ CobrandName            : Factor w/ 23 levels "ATT","AudioGalaxy",..: 3 18 18 18 18 18 11 4 18 10 ...
##  $ OriginCode             : Factor w/ 210 levels "3x5_rhapcom",..: 23 154 30 144 70 70 69 31 166 67 ...
##  $ pcode                  : Factor w/ 160 levels ".","30sec","30sec_hp",..: 1 108 136 136 127 52 1 23 63 1 ...
##  $ cpath                  : Factor w/ 122 levels ".","0","12_19Bpage",..: 1 1 81 81 61 14 1 61 14 1 ...
##  $ rsrc                   : Factor w/ 569 levels ".","0","0508",..: 1 1 262 250 411 70 1 541 371 1 ...
##  $ opage                  : Factor w/ 155 levels "","-","404__404",..: 1 1 68 68 1 48 1 1 1 1 ...
##  $ ServiceTier            : Factor w/ 4 levels "PREM","RHAP",..: 1 4 4 4 1 4 1 4 4 4 ...
##  $ MonthsPerBill          : num  1 1 1 1 1 1 3 1 1 1 ...
##  $ BillingZipCode         : num  75098 52402 60542 80227 79424 ...
##  $ BillingState           : Factor w/ 56 levels "","7th fl","AE",..: 48 16 18 9 48 48 24 12 8 47 ...
##  $ EcommOrderId           : num  3.68e+08 3.67e+08 3.68e+08 3.67e+08 3.68e+08 ...
##  $ OrderDate              : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 25 1 11 7 13 24 14 16 27 3 ...
##  $ PaymentTypeName        : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTypeCode        : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentSubTypeCode     : Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 6 6 6 6 6 2 2 ...
##  $ CPTCardBrand           : Factor w/ 3 levels "MC","None","VI": 3 3 3 3 3 3 3 3 2 2 ...
##  $ CPTCardType            : Factor w/ 6 levels "0 - None","1 - Credit",..: 2 2 4 2 2 4 2 2 1 1 ...
##  $ CPTBIN                 : num  410413 414720 443045 438857 441712 ...
##  $ CPTTransactionDivision : num  193099 193099 193099 193099 193099 ...
##  $ PaymentTxTypeCode      : Factor w/ 1 level "BC": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTxStatus        : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PmtStatType            : Factor w/ 1 level "COMPLETE": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PT_status              : Factor w/ 18 levels "00","04","05",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AVS_status             : Factor w/ 12 levels "3","6","7","9",..: 7 7 7 7 7 6 7 7 5 5 ...
##  $ SEC_status             : Factor w/ 2 levels "S","UNK": 2 2 2 2 2 2 2 2 2 2 ...
##  $ RiskCode               : Factor w/ 1 level "UNK": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentProcessorCode   : Factor w/ 4 levels "AMZ","PPL","PT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentProcessorDesc   : Factor w/ 4 levels "","Amazon FPS",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentDivisionCode    : Factor w/ 4 levels "193099","203217",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentDivisionDesc    : Factor w/ 5 levels "","Amazon FPS non-recurring default",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ PaymentDivisionUseType : Factor w/ 3 levels "","N","R": 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentDivisionRiskType: Factor w/ 2 levels "","NORMALRISK": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Country                : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Currency               : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Amount                 : num  10.8 15 15 15 10.8 ...
##  $ OrderStatus            : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Churn                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PayAttempts            : num  1 8 1 1 1 1 1 1 1 4 ...
##  $ OrdSucces              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ OrdSftDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OrdHrdDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PaySuccess             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ PaySftDec              : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ PayHrdDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat89               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat05               : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ PTStat14               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat56               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat52               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat12               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat33               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat41               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat04               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubOrderCount          : num  29 24 14 26 19 27 2 5 24 16 ...
##  $ SubPayAttempts         : num  29 26 15 26 19 27 2 5 24 16 ...
##  $ SubAmount              : num  313 360 210 390 205 ...
##  $ SubFirstOrderDate      : Factor w/ 911 levels "2011-04-21","2011-04-22",..: 5 4 328 48 275 33 626 717 37 375 ...
##  $ SubLastOrderDate       : Factor w/ 139 levels "2012-10-29","2012-11-03",..: 104 80 90 86 92 103 16 95 76 82 ...
##  $ SubChurns              : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ SubOrdSucces           : num  29 24 13 26 19 26 2 5 24 15 ...
##  $ SubOrdSftDec           : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ SubOrdHrdDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPaySuccess          : num  29 24 13 26 19 26 2 5 24 15 ...
##  $ SubPaySftDec           : num  0 2 2 0 0 1 0 0 0 1 ...
##  $ SubPayHrdDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat89            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat05            : num  0 2 0 0 0 1 0 0 0 1 ...
##  $ SubPTStat14            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat56            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ SubPTStat52            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat12            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ SubPTStat33            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat41            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat04            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubCount               : num  1 1 2 1 1 1 2 3 1 3 ...
##  $ UserOrderCount         : num  29 24 23 26 19 27 7 25 24 22 ...
##  $ UserPayAttempts        : num  29 26 25 26 19 27 10 39 24 23 ...
##  $ UserAmount             : num  313 360 300 390 205 ...
##  $ UserFirstOrderDate     : Factor w/ 888 levels "2011-04-21","2011-04-22",..: 5 4 30 48 277 33 18 16 37 12 ...
##  $ UserLastOrderDate      : Factor w/ 191 levels "2011-07-12","2011-10-28",..: 156 132 142 138 144 155 50 147 128 134 ...
##  $ UserChurns             : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ UserOrdSucces          : num  29 24 22 26 19 26 7 24 24 21 ...
##  $ UserOrdSftDec          : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ UserOrdHrdDec          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPaySuccess         : num  29 24 22 26 19 26 7 24 24 21 ...
##  $ UserPaySftDec          : num  0 2 2 0 0 1 1 2 0 1 ...
##  $ UserPayHrdDec          : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ UserPTStat89           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat05           : num  0 2 0 0 0 1 0 2 0 1 ...
##  $ UserPTStat14           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat56           : num  0 0 1 0 0 0 1 0 0 0 ...
##  $ UserPTStat52           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat12           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ UserPTStat33           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat41           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ UserPTStat04           : num  0 0 0 0 0 0 0 0 0 0 ...
head(ChurnFull, n = 3)
##   UserPartyId UserOrigSignupDt SubscriberServiceId FirstBillDt LastPlayDt   SignUpDt StopRequestDt
## 1     2273929       2004-05-31           176406097  2004-06-15 2014-01-18 2010-04-24    2999-01-01
## 2     2637516       2009-03-08           132608865  2009-03-23 2013-12-07 2009-03-08    2999-01-01
## 3     3268169       2008-05-13           311201787  2008-05-28 2014-01-22 2012-03-08    2999-01-01
##     CancelDt  CancelType              CancelReason    ChannelGroupName ChannelName CobrandCode
## 1 2999-01-01 Current Sub                               Retail Partners       BBDMS       40132
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE MP3 Player Partners     Sandisk       40134
## 3 2999-01-01 Current Sub                              Direct Marketing      Search       40134
##    CobrandName  OriginCode  pcode cpath         rsrc                      opage ServiceTier
## 1     Best Buy     bestbuy      .     .            .                                   PREM
## 2 RealRhapsody sandiskfuze     rn     .            .                                    RTG
## 3 RealRhapsody          cj srchrv ppcse gg_ru_rhp_14 offer_rhap_rhapDMMMainPage         RTG
##   MonthsPerBill BillingZipCode BillingState EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode
## 1             1          75098           TX    368004554 2013-11-25          Credit              CC
## 2             1          52402           IA    367205302 2013-11-01          Credit              CC
## 3             1          60542           IL    367578873 2013-11-11          Credit              CC
##   PaymentSubTypeCode CPTCardBrand CPTCardType CPTBIN CPTTransactionDivision PaymentTxTypeCode
## 1                 VI           VI  1 - Credit 410413                 193099                BC
## 2                 VI           VI  1 - Credit 414720                 193099                BC
## 3                 VI           VI   3 - Check 443045                 193099                BC
##   PaymentTxStatus PmtStatType PT_status AVS_status SEC_status RiskCode PaymentProcessorCode
## 1         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
## 2         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
## 3         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
##   PaymentProcessorDesc PaymentDivisionCode               PaymentDivisionDesc PaymentDivisionUseType
## 1          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
## 2          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
## 3          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
##   PaymentDivisionRiskType Country Currency Amount OrderStatus Churn PayAttempts OrdSucces OrdSftDec
## 1              NORMALRISK      US      USD  10.81     SUCCESS     0           1         1         0
## 2              NORMALRISK      US      USD  14.99     SUCCESS     0           8         1         0
## 3              NORMALRISK      US      USD  14.99     SUCCESS     0           1         1         0
##   OrdHrdDec PaySuccess PaySftDec PayHrdDec PTStat89 PTStat05 PTStat14 PTStat56 PTStat52 PTStat12
## 1         0          1         0         0        0        0        0        0        0        0
## 2         0          1         1         0        0        1        0        0        0        0
## 3         0          1         0         0        0        0        0        0        0        0
##   PTStat33 PTStat41 PTStat04 SubOrderCount SubPayAttempts SubAmount SubFirstOrderDate
## 1        0        0        0            29             29    313.49        2011-04-25
## 2        0        0        0            24             26    359.76        2011-04-24
## 3        0        0        0            14             15    209.86        2012-04-09
##   SubLastOrderDate SubChurns SubOrdSucces SubOrdSftDec SubOrdHrdDec SubPaySuccess SubPaySftDec
## 1       2013-10-25         0           29            0            0            29            0
## 2       2013-10-01         0           24            0            0            24            2
## 3       2013-10-11         1           13            1            0            13            2
##   SubPayHrdDec SubPTStat89 SubPTStat05 SubPTStat14 SubPTStat56 SubPTStat52 SubPTStat12 SubPTStat33
## 1            0           0           0           0           0           0           0           0
## 2            0           0           2           0           0           0           0           0
## 3            0           0           0           0           1           0           1           0
##   SubPTStat41 SubPTStat04 SubCount UserOrderCount UserPayAttempts UserAmount UserFirstOrderDate
## 1           0           0        1             29              29     313.49         2011-04-25
## 2           0           0        1             24              26     359.76         2011-04-24
## 3           0           0        2             23              25     299.77         2011-05-20
##   UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess
## 1        2013-10-25          0            29             0             0             29
## 2        2013-10-01          0            24             0             0             24
## 3        2013-10-11          1            22             1             0             22
##   UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52
## 1             0             0            0            0            0            0            0
## 2             2             0            0            2            0            0            0
## 3             2             1            0            0            0            1            0
##   UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## 1            0            0            0            0
## 2            0            0            0            0
## 3            1            0            1            0
# Data Understanding
#   A lot of variables!
#   Perhaps use pivot tables and/or Tableau to build on earlier work 

# Clean up card brand / card type data, add to database
#   Look at distribution of card brand and type in CPT data

table(ChurnFull$CPTCardBrand, ChurnFull$CPTCardType)
##       
##        0 - None 1 - Credit 2 - Debit 3 - Check 5 - Pre-Paid 7 - European Deferred Debit Card
##   MC          0        583       605         0           52                                0
##   None      830          0         0         0            0                                0
##   VI          0        859         0      2095          131                                1
#   Set CardBrand
#     Set CardBrand defailt to PaymentSubType value from original Order Summary
#     Replace PaymentSubType value with CPTCardBrand if present (not None)

ChurnFull$CardBrand <- ChurnFull$PaymentSubTypeCode
ChurnFull$CardBrand[ChurnFull$CPTCardBrand != "None"] <- 
  ChurnFull$CPTCardBrand[ChurnFull$CPTCardBrand != "None"]

#   Clean up CardType
#     Remove leading numbers from codes
#     Map Check to Debit
#     Fill in CardType (as Credit) for credit-only CardBrands -- AX, DI, PAYPAL

ChurnFull$CardType <- mapvalues(ChurnFull$CPTCardType, 
                      c("0 - None", "1 - Credit", "2 - Debit", "3 - Check", 
                        "5 - Pre-Paid", "7 - European Deferred Debit Card"), 
                      c("None", "Credit", "Debit", "Debit", "Prepaid", "Debit"))
ChurnFull$CardType[ChurnFull$CardBrand == "AX"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "DI"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "PAYPAL"] <- "Credit"

#   Check newly created CardBrand and CardType

table(ChurnFull$CardBrand, ChurnFull$CardType)
##          
##           None Credit Debit Prepaid
##   AMZ_FPS    5      0     0       0
##   AX         0    503     0       0
##   DI         0    138     0       0
##   MC         5    583   605      52
##   PAYPAL     0    162     0       0
##   VI        17    859  2096     131
# Set reference levels for CardBrand and CardType
#   Sets VISA and Credit as the omitted factor levels in models
#   CardBrand and CardType estimates are then interpretted relative to 
#     these reference levels

ChurnFull$CardBrand <- relevel(ChurnFull$CardBrand, ref = "VI")
ChurnFull$CardType <- relevel(ChurnFull$CardType, ref = "Credit")

# Add User history metrics
#   Convert date strings from factor to date type
#   Tenure, OrderDensity, Payment Friction, ChurnDensity

ChurnFull$UserOrigSignupDate <- 
  as.Date(ChurnFull$UserOrigSignupDt, format = DateFormat)
ChurnFull$FirstBillDate <- 
  as.Date(ChurnFull$FirstBillDt, format = DateFormat)
ChurnFull$LastPlayDate <- 
  as.Date(ChurnFull$LastPlayDt, format = DateFormat)
ChurnFull$SignUpDate <- 
  as.Date(ChurnFull$SignUpDt, format = DateFormat)
ChurnFull$StopRequestDate <- 
  as.Date(ChurnFull$StopRequestDt, format = DateFormat)
ChurnFull$CancelDate <- 
  as.Date(ChurnFull$CancelDt, format = DateFormat)
ChurnFull$OrdDate <- 
  as.Date(ChurnFull$OrderDate, format = DateFormat)
ChurnFull$UserFirstOrdDate <- 
  as.Date(ChurnFull$UserFirstOrderDate, format = DateFormat)
ChurnFull$UserLastOrdDate <- 
  as.Date(ChurnFull$UserLastOrderDate, format = DateFormat)

#   Calculated date features

ChurnFull$UserTenure <- 
  as.numeric(round((ChurnFull$OrdDate - ChurnFull$UserOrigSignupDate)/365.25*12 + 1))
ChurnFull$UserWindow <- 
  as.numeric(round((ChurnFull$UserLastOrdDate - 
                      ChurnFull$UserFirstOrdDate)/365.25*12 + 1))
ChurnFull$USuccessDensity <- 
  (ChurnFull$UserPaySuccess * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$USDDensity <- 
  (ChurnFull$UserPaySftDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UHDDensity <- 
  (ChurnFull$UserPayHrdDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UChurnDensity <- 
  (ChurnFull$UserChurns * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow

# Randomly split observations 80/20 into training and test samples
#   Generate sample of indices

set.seed(999999999)
TrainSize <- round(0.8 * AllRows)
TrainIdx <- sample(AllRows, TrainSize)

#   Label observations as Train or Test
#   Add Train/Test labels to data base

TrainTest <- rep("Test", AllRows)
TrainTest[TrainIdx] <- "Train"
ChurnFull$TrainTest <- TrainTest

# Save prepped data to .Rdata file for subsequent analysis

save("ChurnFull", file = ChurnPrepped)

########### END OF DATA PREPARATION

########### BEGIN ANALYSIS

# Select (using dplyr) analysis subset of candidate model variables/features

ChurnAnalysis <- select(ChurnFull, 
                        TrainTest, Churn, UserOrigSignupDate, 
                        FirstBillDate, LastPlayDate, 
                        SignUpDate, StopRequestDate, CancelDate, CancelType,
                        CancelReason, ServiceTier:CPTCardType, 
                        CardBrand, CardType, Amount,
                        SubCount:UserPTStat04,
                        UserTenure, UserWindow, 
                        USuccessDensity, USDDensity, UHDDensity, UChurnDensity
                        )
str(ChurnAnalysis, list.len = 999)
## 'data.frame':    5156 obs. of  52 variables:
##  $ TrainTest         : chr  "Train" "Train" "Test" "Test" ...
##  $ Churn             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserOrigSignupDate: Date, format: "2004-05-31" "2009-03-08" "2008-05-13" ...
##  $ FirstBillDate     : Date, format: "2004-06-15" "2009-03-23" "2008-05-28" ...
##  $ LastPlayDate      : Date, format: "2014-01-18" "2013-12-07" "2014-01-22" ...
##  $ SignUpDate        : Date, format: "2010-04-24" "2009-03-08" "2012-03-08" ...
##  $ StopRequestDate   : Date, format: "2999-01-01" "2999-01-01" "2999-01-01" ...
##  $ CancelDate        : Date, format: "2999-01-01" "2014-01-16" "2999-01-01" ...
##  $ CancelType        : Factor w/ 4 levels "Current Sub",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ CancelReason      : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 4 1 1 1 1 1 1 1 1 ...
##  $ ServiceTier       : Factor w/ 4 levels "PREM","RHAP",..: 1 4 4 4 1 4 1 4 4 4 ...
##  $ MonthsPerBill     : num  1 1 1 1 1 1 3 1 1 1 ...
##  $ BillingZipCode    : num  75098 52402 60542 80227 79424 ...
##  $ BillingState      : Factor w/ 56 levels "","7th fl","AE",..: 48 16 18 9 48 48 24 12 8 47 ...
##  $ EcommOrderId      : num  3.68e+08 3.67e+08 3.68e+08 3.67e+08 3.68e+08 ...
##  $ OrderDate         : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 25 1 11 7 13 24 14 16 27 3 ...
##  $ PaymentTypeName   : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTypeCode   : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentSubTypeCode: Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 6 6 6 6 6 2 2 ...
##  $ CPTCardBrand      : Factor w/ 3 levels "MC","None","VI": 3 3 3 3 3 3 3 3 2 2 ...
##  $ CPTCardType       : Factor w/ 6 levels "0 - None","1 - Credit",..: 2 2 4 2 2 4 2 2 1 1 ...
##  $ CardBrand         : Factor w/ 6 levels "VI","AMZ_FPS",..: 1 1 1 1 1 1 1 1 3 3 ...
##  $ CardType          : Factor w/ 4 levels "Credit","None",..: 1 1 3 1 1 3 1 1 1 1 ...
##  $ Amount            : num  10.8 15 15 15 10.8 ...
##  $ SubCount          : num  1 1 2 1 1 1 2 3 1 3 ...
##  $ UserOrderCount    : num  29 24 23 26 19 27 7 25 24 22 ...
##  $ UserPayAttempts   : num  29 26 25 26 19 27 10 39 24 23 ...
##  $ UserAmount        : num  313 360 300 390 205 ...
##  $ UserFirstOrderDate: Factor w/ 888 levels "2011-04-21","2011-04-22",..: 5 4 30 48 277 33 18 16 37 12 ...
##  $ UserLastOrderDate : Factor w/ 191 levels "2011-07-12","2011-10-28",..: 156 132 142 138 144 155 50 147 128 134 ...
##  $ UserChurns        : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ UserOrdSucces     : num  29 24 22 26 19 26 7 24 24 21 ...
##  $ UserOrdSftDec     : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ UserOrdHrdDec     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPaySuccess    : num  29 24 22 26 19 26 7 24 24 21 ...
##  $ UserPaySftDec     : num  0 2 2 0 0 1 1 2 0 1 ...
##  $ UserPayHrdDec     : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ UserPTStat89      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat05      : num  0 2 0 0 0 1 0 2 0 1 ...
##  $ UserPTStat14      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat56      : num  0 0 1 0 0 0 1 0 0 0 ...
##  $ UserPTStat52      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat12      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ UserPTStat33      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat41      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ UserPTStat04      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserTenure        : num  115 57 67 78 22 47 113 9 51 110 ...
##  $ UserWindow        : num  31 30 30 29 21 30 25 30 29 30 ...
##  $ USuccessDensity   : num  0.935 0.8 0.733 0.897 0.905 ...
##  $ USDDensity        : num  0 0.0667 0.0667 0 0 ...
##  $ UHDDensity        : num  0 0 0.0333 0 0 ...
##  $ UChurnDensity     : num  0 0 0.0333 0 0 ...
head(ChurnAnalysis, n = 3)
##   TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1     Train     0         2004-05-31    2004-06-15   2014-01-18 2010-04-24      2999-01-01
## 2     Train     0         2009-03-08    2009-03-23   2013-12-07 2009-03-08      2999-01-01
## 3      Test     0         2008-05-13    2008-05-28   2014-01-22 2012-03-08      2999-01-01
##   CancelDate  CancelType              CancelReason ServiceTier MonthsPerBill BillingZipCode
## 1 2999-01-01 Current Sub                                  PREM             1          75098
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE         RTG             1          52402
## 3 2999-01-01 Current Sub                                   RTG             1          60542
##   BillingState EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode
## 1           TX    368004554 2013-11-25          Credit              CC                 VI
## 2           IA    367205302 2013-11-01          Credit              CC                 VI
## 3           IL    367578873 2013-11-11          Credit              CC                 VI
##   CPTCardBrand CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts
## 1           VI  1 - Credit        VI   Credit  10.81        1             29              29
## 2           VI  1 - Credit        VI   Credit  14.99        1             24              26
## 3           VI   3 - Check        VI    Debit  14.99        2             23              25
##   UserAmount UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec
## 1     313.49         2011-04-25        2013-10-25          0            29             0
## 2     359.76         2011-04-24        2013-10-01          0            24             0
## 3     299.77         2011-05-20        2013-10-11          1            22             1
##   UserOrdHrdDec UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14
## 1             0             29             0             0            0            0            0
## 2             0             24             2             0            0            2            0
## 3             0             22             2             1            0            0            0
##   UserPTStat56 UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure
## 1            0            0            0            0            0            0        115
## 2            0            0            0            0            0            0         57
## 3            1            0            1            0            1            0         67
##   UserWindow USuccessDensity USDDensity UHDDensity UChurnDensity
## 1         31       0.9354839 0.00000000 0.00000000    0.00000000
## 2         30       0.8000000 0.06666667 0.00000000    0.00000000
## 3         30       0.7333333 0.06666667 0.03333333    0.03333333
#   Check data summaries

summary(ChurnAnalysis)
##   TrainTest             Churn         UserOrigSignupDate   FirstBillDate       
##  Length:5156        Min.   :0.00000   Min.   :2002-06-07   Min.   :2002-08-06  
##  Class :character   1st Qu.:0.00000   1st Qu.:2010-07-28   1st Qu.:2010-08-11  
##  Mode  :character   Median :0.00000   Median :2011-12-01   Median :2011-12-02  
##                     Mean   :0.04325   Mean   :2011-06-03   Mean   :2018-09-18  
##                     3rd Qu.:0.00000   3rd Qu.:2013-01-28   3rd Qu.:2013-02-06  
##                     Max.   :1.00000   Max.   :2013-11-12   Max.   :2999-01-01  
##                                                                                
##   LastPlayDate          SignUpDate         StopRequestDate        CancelDate        
##  Min.   :2005-01-17   Min.   :2005-05-05   Min.   :2010-08-05   Min.   :2013-11-02  
##  1st Qu.:2013-11-23   1st Qu.:2011-05-25   1st Qu.:2999-01-01   1st Qu.:2999-01-01  
##  Median :2014-01-15   Median :2012-05-26   Median :2999-01-01   Median :2999-01-01  
##  Mean   :2013-10-24   Mean   :2012-02-04   Mean   :2957-12-03   Mean   :2902-09-18  
##  3rd Qu.:2014-01-21   3rd Qu.:2013-03-14   3rd Qu.:2999-01-01   3rd Qu.:2999-01-01  
##  Max.   :2014-01-22   Max.   :2013-11-12   Max.   :2999-01-01   Max.   :2999-01-01  
##  NA's   :426                                                                        
##        CancelType                      CancelReason  ServiceTier MonthsPerBill   BillingZipCode 
##  Current Sub:4652                            :4652   PREM:3318   Min.   : 1.00   Min.   :  664  
##  Involuntary: 259   MIGRATION_PAID_TO_PAID   :  75   RHAP:  95   1st Qu.: 1.00   1st Qu.:27525  
##  Migration  :  78   NON_BILLABLE_HARD_DECLINE:  47   RR  :  12   Median : 1.00   Median :49051  
##  Voluntary  : 167   NON_BILLABLE_SOFT_DECLINE: 212   RTG :1731   Mean   : 1.07   Mean   :52297  
##                     NON_TIER_TRANSITION      :   3               3rd Qu.: 1.00   3rd Qu.:79101  
##                     NONE                     : 167               Max.   :12.00   Max.   :99999  
##                                                                                                 
##   BillingState   EcommOrderId            OrderDate    PaymentTypeName PaymentTypeCode
##  CA     : 544   Min.   :367201731   2013-11-01: 467   Credit:4989     CC:4989        
##  TX     : 467   1st Qu.:367420226   2013-11-02: 208   Flexib: 167     FP: 167        
##         : 342   Median :367665204   2013-11-03: 200                                  
##  FL     : 298   Mean   :367662576   2013-11-10: 185                                  
##  NY     : 288   3rd Qu.:367909473   2013-11-14: 183                                  
##  PA     : 213   Max.   :368164542   2013-11-27: 181                                  
##  (Other):3004                       (Other)   :3732                                  
##  PaymentSubTypeCode CPTCardBrand                           CPTCardType     CardBrand   
##  AMZ_FPS:   5       MC  :1240    0 - None                        : 830   VI     :3103  
##  AX     : 503       None: 830    1 - Credit                      :1442   AMZ_FPS:   5  
##  DI     : 138       VI  :3086    2 - Debit                       : 605   AX     : 503  
##  MC     :1245                    3 - Check                       :2095   DI     : 138  
##  PAYPAL : 162                    5 - Pre-Paid                    : 183   MC     :1245  
##  VI     :3103                    7 - European Deferred Debit Card:   1   PAYPAL : 162  
##                                                                                        
##     CardType        Amount          SubCount    UserOrderCount  UserPayAttempts    UserAmount   
##  Credit :2245   Min.   :  0.00   Min.   :0.00   Min.   : 0.00   Min.   :  0.00   Min.   :  0.0  
##  None   :  27   1st Qu.:  9.99   1st Qu.:1.00   1st Qu.: 9.00   1st Qu.: 11.00   1st Qu.: 99.9  
##  Debit  :2701   Median :  9.99   Median :1.00   Median :20.00   Median : 22.00   Median :229.8  
##  Prepaid: 183   Mean   : 12.15   Mean   :1.37   Mean   :17.36   Mean   : 20.08   Mean   :211.6  
##                 3rd Qu.: 14.99   3rd Qu.:2.00   3rd Qu.:25.00   3rd Qu.: 27.00   3rd Qu.:307.4  
##                 Max.   :159.88   Max.   :8.00   Max.   :31.00   Max.   :119.00   Max.   :514.2  
##                                                                                                 
##   UserFirstOrderDate  UserLastOrderDate   UserChurns     UserOrdSucces   UserOrdSftDec   
##  2011-05-01: 197     2013-10-01: 425    Min.   :0.0000   Min.   : 0.00   Min.   :0.0000  
##  2011-05-17:  78     2013-10-02: 160    1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.:0.0000  
##  2011-05-03:  73     2013-10-03: 158    Median :0.0000   Median :20.00   Median :0.0000  
##  2011-04-26:  71     2013-10-10: 151    Mean   :0.2289   Mean   :17.13   Mean   :0.1903  
##  2011-05-05:  69     2013-10-27: 150    3rd Qu.:0.0000   3rd Qu.:25.00   3rd Qu.:0.0000  
##  2011-05-04:  68     2013-10-06: 149    Max.   :8.0000   Max.   :31.00   Max.   :8.0000  
##  (Other)   :4600     (Other)   :3963                                                     
##  UserOrdHrdDec    UserPaySuccess  UserPaySftDec     UserPayHrdDec     UserPTStat89    
##  Min.   :0.0000   Min.   : 0.00   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.0000  
##  1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.0000  
##  Median :0.0000   Median :20.00   Median : 0.0000   Median :0.0000   Median : 0.0000  
##  Mean   :0.0386   Mean   :17.13   Mean   : 0.7335   Mean   :0.1098   Mean   : 0.3765  
##  3rd Qu.:0.0000   3rd Qu.:25.00   3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.: 0.0000  
##  Max.   :3.0000   Max.   :31.00   Max.   :16.0000   Max.   :9.0000   Max.   :16.0000  
##                                                                                       
##   UserPTStat05      UserPTStat14      UserPTStat56      UserPTStat52      UserPTStat12    
##  Min.   : 0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median : 0.0000   Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   : 0.2411   Mean   :0.06633   Mean   :0.02948   Mean   :0.01687   Mean   :0.01552  
##  3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :10.0000   Max.   :3.00000   Max.   :4.00000   Max.   :4.00000   Max.   :2.00000  
##                                                                                           
##   UserPTStat33      UserPTStat41      UserPTStat04       UserTenure       UserWindow   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :  1.00   Min.   : 1.00  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.: 10.00   1st Qu.:12.00  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median : 24.00   Median :23.00  
##  Mean   :0.01901   Mean   :0.06594   Mean   :0.02036   Mean   : 30.41   Mean   :20.76  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.: 41.00   3rd Qu.:30.00  
##  Max.   :2.00000   Max.   :9.00000   Max.   :2.00000   Max.   :138.00   Max.   :31.00  
##                                                                                        
##  USuccessDensity     USDDensity         UHDDensity       UChurnDensity    
##  Min.   : 0.0000   Min.   : 0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.: 0.7931   1st Qu.: 0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median : 0.8621   Median : 0.00000   Median :0.000000   Median :0.00000  
##  Mean   : 0.8435   Mean   : 0.05069   Mean   :0.005472   Mean   :0.01286  
##  3rd Qu.: 0.9091   3rd Qu.: 0.03704   3rd Qu.:0.000000   3rd Qu.:0.00000  
##  Max.   :12.0000   Max.   :12.00000   Max.   :0.666667   Max.   :1.00000  
## 
#   Split analysis database into Train and Test

ChurnTrain <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Train",]
ChurnTest <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Test",]
TrainRows <- nrow(ChurnTrain)
TestRows <- nrow(ChurnTest)
TrainRows
## [1] 4125
TestRows
## [1] 1031
# ANALYSIS AND MODELING OF TRAINING DATA

# Estimate card brand and card type effects

BrandModel <- glm(Churn ~ CardBrand,  
                  family = binomial(logit), data = ChurnTrain)
summary(BrandModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3317  -0.3317  -0.3317  -0.3022   3.2636  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.87254    0.08947 -32.107  < 2e-16 ***
## CardBrandAMZ_FPS -11.69353  441.37170  -0.026 0.978864    
## CardBrandAX       -2.44803    0.71446  -3.426 0.000612 ***
## CardBrandDI       -1.75244    1.00886  -1.737 0.082381 .  
## CardBrandMC       -0.19085    0.17682  -1.079 0.280420    
## CardBrandPAYPAL   -0.37006    0.46456  -0.797 0.425700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1510.2  on 4124  degrees of freedom
## Residual deviance: 1476.6  on 4119  degrees of freedom
## AIC: 1488.6
## 
## Number of Fisher Scoring iterations: 13
BrandTypeModel <- glm(Churn ~ CardBrand + CardType, 
                      family = binomial(logit), data = ChurnTrain)
summary(BrandTypeModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand + CardType, family = binomial(logit), 
##     data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9855  -0.3140  -0.3040  -0.1633   3.2636  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -4.24455    0.26273 -16.155  < 2e-16 ***
## CardBrandAMZ_FPS -14.09630  441.37192  -0.032   0.9745    
## CardBrandAX       -1.07602    0.75596  -1.423   0.1546    
## CardBrandDI       -0.38043    1.03867  -0.366   0.7142    
## CardBrandMC       -0.06651    0.18958  -0.351   0.7257    
## CardBrandPAYPAL    1.00195    0.52616   1.904   0.0569 .  
## CardTypeNone       3.77478    0.51602   7.315 2.57e-13 ***
## CardTypeDebit      1.25944    0.27363   4.603 4.17e-06 ***
## CardTypePrepaid    3.69671    0.30729  12.030  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1510.2  on 4124  degrees of freedom
## Residual deviance: 1281.3  on 4116  degrees of freedom
## AIC: 1299.3
## 
## Number of Fisher Scoring iterations: 13
# Stepwise feature selection

BrandTypeRedModel <- step(BrandTypeModel, k = log(TrainRows)) # based on BIC
## Start:  AIC=1356.25
## Churn ~ CardBrand + CardType
## 
##             Df Deviance    AIC
## - CardBrand  5   1292.0 1325.3
## <none>           1281.3 1356.2
## - CardType   3   1476.6 1526.6
## 
## Step:  AIC=1325.27
## Churn ~ CardType
## 
##            Df Deviance    AIC
## <none>          1292.0 1325.3
## - CardType  3   1510.2 1518.5
summary(BrandTypeRedModel)
## 
## Call:
## glm(formula = Churn ~ CardType, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9476  -0.3118  -0.3118  -0.1634   2.9402  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.3091     0.2055 -20.970  < 2e-16 ***
## CardTypeNone      3.5553     0.4754   7.478 7.55e-14 ***
## CardTypeDebit     1.3095     0.2292   5.714 1.10e-08 ***
## CardTypePrepaid   3.7411     0.2701  13.852  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1510.2  on 4124  degrees of freedom
## Residual deviance: 1292.0  on 4121  degrees of freedom
## AIC: 1300
## 
## Number of Fisher Scoring iterations: 7
# Estimate card brand, card type and User effects

OrderUserModel <- glm(Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
                      USDDensity + UHDDensity + UChurnDensity, 
                      family = binomial(logit), data = ChurnTrain)
summary(OrderUserModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity + UHDDensity + UChurnDensity, family = binomial(logit), 
##     data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4329  -0.3066  -0.2119  -0.1282   5.5596  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.668675   0.339682  -7.856 3.95e-15 ***
## CardBrandAMZ_FPS -13.810996 440.422605  -0.031 0.974984    
## CardBrandAX       -1.057242   0.758607  -1.394 0.163420    
## CardBrandDI       -0.281372   1.041815  -0.270 0.787100    
## CardBrandMC       -0.061239   0.192241  -0.319 0.750065    
## CardBrandPAYPAL    0.359743   0.536807   0.670 0.502759    
## CardTypeNone       3.304463   0.553339   5.972 2.35e-09 ***
## CardTypeDebit      0.956513   0.277875   3.442 0.000577 ***
## CardTypePrepaid    2.691278   0.330703   8.138 4.02e-16 ***
## UserTenure        -0.026382   0.005788  -4.558 5.17e-06 ***
## USuccessDensity   -1.036930   0.280873  -3.692 0.000223 ***
## USDDensity         1.128422   0.333060   3.388 0.000704 ***
## UHDDensity         1.967121   1.985718   0.991 0.321864    
## UChurnDensity     -0.526417   0.885568  -0.594 0.552218    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1510.2  on 4124  degrees of freedom
## Residual deviance: 1218.7  on 4111  degrees of freedom
## AIC: 1246.7
## 
## Number of Fisher Scoring iterations: 13
# Stepwise feature selection

OrderUserRedModel <- step(OrderUserModel, k = log(TrainRows)) # based on BIC
## Start:  AIC=1335.23
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity + UHDDensity + UChurnDensity
## 
##                   Df Deviance    AIC
## - CardBrand        5   1225.0 1299.9
## - UChurnDensity    1   1219.0 1327.3
## - UHDDensity       1   1219.6 1327.8
## - USDDensity       1   1225.3 1333.5
## <none>                 1218.7 1335.2
## - USuccessDensity  1   1232.0 1340.3
## - UserTenure       1   1246.2 1354.4
## - CardType         3   1312.7 1404.3
## 
## Step:  AIC=1299.92
## Churn ~ CardType + UserTenure + USuccessDensity + USDDensity + 
##     UHDDensity + UChurnDensity
## 
##                   Df Deviance    AIC
## - UChurnDensity    1   1225.4 1292.0
## - UHDDensity       1   1226.0 1292.6
## - USDDensity       1   1232.1 1298.7
## <none>                 1225.0 1299.9
## - USuccessDensity  1   1239.4 1306.0
## - UserTenure       1   1253.5 1320.1
## - CardType         3   1335.8 1385.7
## 
## Step:  AIC=1292.02
## Churn ~ CardType + UserTenure + USuccessDensity + USDDensity + 
##     UHDDensity
## 
##                   Df Deviance    AIC
## - UHDDensity       1   1226.3 1284.6
## - USDDensity       1   1232.8 1291.1
## <none>                 1225.4 1292.0
## - USuccessDensity  1   1240.7 1299.0
## - UserTenure       1   1254.5 1312.8
## - CardType         3   1336.9 1378.5
## 
## Step:  AIC=1284.56
## Churn ~ CardType + UserTenure + USuccessDensity + USDDensity
## 
##                   Df Deviance    AIC
## - USDDensity       1   1233.4 1283.3
## <none>                 1226.3 1284.6
## - USuccessDensity  1   1241.2 1291.2
## - UserTenure       1   1255.7 1305.7
## - CardType         3   1340.1 1373.4
## 
## Step:  AIC=1283.31
## Churn ~ CardType + UserTenure + USuccessDensity
## 
##                   Df Deviance    AIC
## <none>                 1233.4 1283.3
## - USuccessDensity  1   1242.9 1284.5
## - UserTenure       1   1268.0 1309.7
## - CardType         3   1368.5 1393.5
summary(OrderUserRedModel)
## 
## Call:
## glm(formula = Churn ~ CardType + UserTenure + USuccessDensity, 
##     family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2880  -0.3153  -0.2106  -0.1423   5.1376  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.838664   0.284368  -9.982  < 2e-16 ***
## CardTypeNone     3.153756   0.502397   6.277 3.44e-10 ***
## CardTypeDebit    1.091985   0.231196   4.723 2.32e-06 ***
## CardTypePrepaid  3.017505   0.282856  10.668  < 2e-16 ***
## UserTenure      -0.029452   0.005871  -5.017 5.26e-07 ***
## USuccessDensity -0.831317   0.266827  -3.116  0.00184 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1510.2  on 4124  degrees of freedom
## Residual deviance: 1233.4  on 4119  degrees of freedom
## AIC: 1245.4
## 
## Number of Fisher Scoring iterations: 7
#    Note Deviance measures and compute Pseudo (Deviance) R2
#      (Null Deviance - Residual Deviance)/(Null Deviance)

# Exponentiate and interpret logistic coefficients
#   Multiplicative change in odds per unit change in predictor

LogisticCoef <- coef(summary(OrderUserRedModel))
ExpCoef <- (exp(LogisticCoef[,"Estimate"]))
NeatCoef <- round(cbind(LogisticCoef, ExpCoef), digits = 6)
NeatCoef
##                  Estimate Std. Error   z value Pr(>|z|)   ExpCoef
## (Intercept)     -2.838664   0.284368 -9.982359 0.000000  0.058504
## CardTypeNone     3.153756   0.502397  6.277419 0.000000 23.423881
## CardTypeDebit    1.091985   0.231196  4.723207 0.000002  2.980183
## CardTypePrepaid  3.017505   0.282856 10.667996 0.000000 20.440221
## UserTenure      -0.029452   0.005871 -5.016721 0.000001  0.970977
## USuccessDensity -0.831317   0.266827 -3.115567 0.001836  0.435475
# Compute and examine training-sample predictions 
#   for reduced logistic --
#   Logistic Prob, Classification

LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTrain, type = "response")

#   Classification analysis, training sample

Threshold <- 0.5
LogisticClass <- rep(0, TrainRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTrain$Churn)
Confusion
##              
## LogisticClass    0    1
##             0 3927  157
##             1   13   28
mean(LogisticClass == ChurnTrain$Churn)
## [1] 0.9587879
# Lift Chart
#   What fraction of churn is captured in the top x fraction of churn scores?

ScoreFrac <- (1:TrainRows - 1)/(TrainRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTrain$Churn[ranks])/sum(ChurnTrain$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
     main = "Churn Lift Chart -- Train",
     xlab = "Fraction of Top Churn Scores",
     ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
#   Using performance function from ROCR package

plotpred <- prediction(LogisticProb, ChurnTrain$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Train", colorize = TRUE)

#   ROC replotted using qplot (ggplot2)
#     Extract plot variables fpr, tpr, threshold/cutoff
#       from plotroc object created via performace function above

fpr <- plotroc@x.values[[1]]  # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]]  # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]]  # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom = "line", color = Cutoff, 
      main = "ROC Chart -- Train",
      xlab = "False Positive Rate",
      ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package

confusionMatrix(LogisticClass,ChurnTrain$Churn, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3927  157
##          1   13   28
##                                           
##                Accuracy : 0.9588          
##                  95% CI : (0.9523, 0.9646)
##     No Information Rate : 0.9552          
##     P-Value [Acc > NIR] : 0.1371          
##                                           
##                   Kappa : 0.2353          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.151351        
##             Specificity : 0.996701        
##          Pos Pred Value : 0.682927        
##          Neg Pred Value : 0.961557        
##              Prevalence : 0.044848        
##          Detection Rate : 0.006788        
##    Detection Prevalence : 0.009939        
##       Balanced Accuracy : 0.574026        
##                                           
##        'Positive' Class : 1               
## 
# Add predictions to data set, save to .csv

ChurnTrain$LogisticProb <- LogisticProb
ChurnTrain$LogisticClass <- LogisticClass
head(ChurnTrain, n = 3)
##   TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1     Train     0         2004-05-31    2004-06-15   2014-01-18 2010-04-24      2999-01-01
## 2     Train     0         2009-03-08    2009-03-23   2013-12-07 2009-03-08      2999-01-01
## 6     Train     0         2010-01-14    2010-01-22   2013-12-25 2010-01-22      2999-01-01
##   CancelDate  CancelType              CancelReason ServiceTier MonthsPerBill BillingZipCode
## 1 2999-01-01 Current Sub                                  PREM             1          75098
## 2 2014-01-16 Involuntary NON_BILLABLE_SOFT_DECLINE         RTG             1          52402
## 6 2999-01-01 Current Sub                                   RTG             1          78757
##   BillingState EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode
## 1           TX    368004554 2013-11-25          Credit              CC                 VI
## 2           IA    367205302 2013-11-01          Credit              CC                 VI
## 6           TX    367973905 2013-11-24          Credit              CC                 VI
##   CPTCardBrand CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts
## 1           VI  1 - Credit        VI   Credit  10.81        1             29              29
## 2           VI  1 - Credit        VI   Credit  14.99        1             24              26
## 6           VI   3 - Check        VI    Debit  16.23        1             27              27
##   UserAmount UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec
## 1     313.49         2011-04-25        2013-10-25          0            29             0
## 2     359.76         2011-04-24        2013-10-01          0            24             0
## 6     438.21         2011-05-23        2013-10-24          1            26             1
##   UserOrdHrdDec UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14
## 1             0             29             0             0            0            0            0
## 2             0             24             2             0            0            2            0
## 6             0             26             1             0            0            1            0
##   UserPTStat56 UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure
## 1            0            0            0            0            0            0        115
## 2            0            0            0            0            0            0         57
## 6            0            0            0            0            0            0         47
##   UserWindow USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 1         31       0.9354839 0.00000000          0    0.00000000 0.0009079962             0
## 2         30       0.8000000 0.06666667          0    0.00000000 0.0055826212             0
## 6         30       0.8666667 0.03333333          0    0.03333333 0.0208075542             0
write.csv(ChurnTrain, file = ChurnTrainAnalysis)

# MODEL EVALUATION USING TEST DATA

# Generate test set predictions -- 
#   Logistic Prob, Classification

LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTest, type = "response")

#   Classification analysis, Testing sample

Threshold <- 0.5
LogisticClass <- rep(0, TestRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTest$Churn)
Confusion
##              
## LogisticClass   0   1
##             0 989  34
##             1   4   4
mean(LogisticClass == ChurnTest$Churn)
## [1] 0.9631426
# Lift Chart
#   What fraction of churn is captured in the top x fraction of churn scores?

ScoreFrac <- (1:TestRows - 1)/(TestRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTest$Churn[ranks])/sum(ChurnTest$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
     main = "Churn Lift Chart -- Test",
     xlab = "Fraction of Top Churn Scores",
     ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
#   Using performance function from ROCR package

plotpred <- prediction(LogisticProb, ChurnTest$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Test", colorize = TRUE)

#   ROC replotted using qplot (ggplot2)
#     Extract plot variables fpr, tpr, threshold/cutoff
#       from plotroc object created via performace function above

fpr <- plotroc@x.values[[1]]  # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]]  # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]]  # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom="line", color= Cutoff, 
      main = "ROC Chart -- Test",
      xlab = "False Positive Rate",
      ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package

confusionMatrix(LogisticClass,ChurnTest$Churn, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 989  34
##          1   4   4
##                                           
##                Accuracy : 0.9631          
##                  95% CI : (0.9498, 0.9738)
##     No Information Rate : 0.9631          
##     P-Value [Acc > NIR] : 0.543           
##                                           
##                   Kappa : 0.1632          
##  Mcnemar's Test P-Value : 2.546e-06       
##                                           
##             Sensitivity : 0.105263        
##             Specificity : 0.995972        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.966764        
##              Prevalence : 0.036857        
##          Detection Rate : 0.003880        
##    Detection Prevalence : 0.007759        
##       Balanced Accuracy : 0.550617        
##                                           
##        'Positive' Class : 1               
## 
# Add predictions to data set, save to .csv

ChurnTest$LogisticProb <- LogisticProb
ChurnTest$LogisticClass <- LogisticClass
head(ChurnTest, n = 3)
##   TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 3      Test     0         2008-05-13    2008-05-28   2014-01-22 2012-03-08      2999-01-01
## 4      Test     0         2007-06-19    2007-07-04   2014-01-22 2007-06-19      2999-01-01
## 5      Test     0         2012-01-29    2012-02-13         <NA> 2012-01-29      2999-01-01
##   CancelDate  CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 3 2999-01-01 Current Sub                      RTG             1          60542           IL
## 4 2999-01-01 Current Sub                      RTG             1          80227           CO
## 5 2999-01-01 Current Sub                     PREM             1          79424           TX
##   EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 3    367578873 2013-11-11          Credit              CC                 VI           VI
## 4    367439669 2013-11-07          Credit              CC                 VI           VI
## 5    367640723 2013-11-13          Credit              CC                 VI           VI
##   CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 3   3 - Check        VI    Debit  14.99        2             23              25     299.77
## 4  1 - Credit        VI   Credit  14.99        1             26              26     389.74
## 5  1 - Credit        VI   Credit  10.81        1             19              19     205.39
##   UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 3         2011-05-20        2013-10-11          1            22             1             0
## 4         2011-06-07        2013-10-07          0            26             0             0
## 5         2012-02-13        2013-10-13          0            19             0             0
##   UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 3             22             2             1            0            0            0            1
## 4             26             0             0            0            0            0            0
## 5             19             0             0            0            0            0            0
##   UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 3            0            1            0            1            0         67         30
## 4            0            0            0            0            0         78         29
## 5            0            0            0            0            0         22         21
##   USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 3       0.7333333 0.06666667 0.03333333    0.03333333  0.013001347             0
## 4       0.8965517 0.00000000 0.00000000    0.00000000  0.002783491             0
## 5       0.9047619 0.00000000 0.00000000    0.00000000  0.014220353             0
write.table(ChurnTest, file = ChurnTestAnalysis)

########### END OF ANALYSIS