options(width=100)

# Starter logistic regression analysis script for Rhapsody Churn Modeling
# Adapted from a Churn Exercise script file

library(ggplot2) # for improved plots -- qplot and scatterplot matrix
library(GGally) # for scatterplot matrix
library(RcmdrMisc) # for numSummary, stepwise
library(ROCR) # for producing ROC and other prediction performance plots
library(caret) # for confusion matrix analysis
library(plyr) # for mapvalues recoding function
library(dplyr) # select function for subsetting data frames

# Suggested usage:
#   Copy the required data file, Rhapsody2016CPTOrderSummary-010.csv into a 
#     new project folder <folder>
#   Change the working directory (setwd below) to point to the project <folder>
#   Run the entire script as a block

# Set up working directory for file locations
#   and load required libraries

ChurnDir <- "D:/MBA/SecondTerm/Data/exam"
ChurnData <- "Rhapsody2017CPTOrderSummary.csv"
ChurnPrepped <- "ChurnFullPrepped-100.RData"
ChurnTrainAnalysis <-  "ChurnTrainAnalysis-100.csv"
ChurnTestAnalysis <- "ChurnTestAnalysis-100.csv"

# DateFormat <- "%m/%d/%Y"
DateFormat <- "%Y-%m-%d"

########### BEGIN DATA PREPARATION

setwd(ChurnDir)

# DATA ENTRY AND SUMMARIZATION

# Read full data set, list -- names, structure, head,

ChurnFull <- read.csv(ChurnData)
AllRows <- nrow(ChurnFull)
str(ChurnFull, list.len = 999)
## 'data.frame':    529430 obs. of  109 variables:
##  $ UserPartyId            : num  120392 141193 146503 149471 150298 ...
##  $ UserOrigSignupDt       : Factor w/ 4161 levels "2001-12-03","2002-01-25",..: 2507 504 2894 3513 762 2375 361 2176 1139 2841 ...
##  $ SubscriberServiceId    : num  1.41e+08 8.80e+07 1.77e+08 3.02e+08 4.00e+08 ...
##  $ FirstBillDt            : Factor w/ 3890 levels "2001-12-03","2002-01-25",..: 2212 202 2584 3203 460 2065 97 1896 844 2531 ...
##  $ LastPlayDt             : Factor w/ 2234 levels "","2003-05-09",..: 2205 2205 1976 2230 1 2232 2234 2234 1974 2187 ...
##  $ SignUpDt               : Factor w/ 3200 levels "2002-12-22","2003-01-04",..: 1546 1018 1933 2552 2908 2296 2908 1245 2525 2480 ...
##  $ StopRequestDt          : Factor w/ 924 levels "2007-11-26","2007-12-19",..: 924 924 908 924 924 924 924 924 924 924 ...
##  $ CancelDt               : Factor w/ 87 levels "2007-12-16","2008-01-03",..: 87 87 87 87 87 87 87 87 87 87 ...
##  $ CancelType             : Factor w/ 4 levels "Current Sub",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ CancelReason           : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ChannelGroupName       : Factor w/ 15 levels "Cable Partners",..: 14 14 5 12 3 14 1 8 14 2 ...
##  $ ChannelName            : Factor w/ 117 levels "Affiliate","Android Mobile",..: 106 106 84 70 30 106 69 91 106 92 ...
##  $ CobrandCode            : num  40134 1 40134 40134 40135 ...
##  $ CobrandName            : Factor w/ 36 levels "","ATT","AudioGalaxy",..: 28 19 28 28 15 28 26 28 19 28 ...
##  $ OriginCode             : Factor w/ 493 levels "","12_19Bpage",..: 191 187 327 341 161 332 280 191 187 191 ...
##  $ pcode                  : Factor w/ 445 levels ".","0","12_19Bpage",..: 1 1 1 305 1 1 1 345 1 385 ...
##  $ cpath                  : Factor w/ 351 levels ".","<musicnew>",..: 1 1 223 272 1 1 1 176 1 251 ...
##  $ rsrc                   : Factor w/ 5161 levels "-","$10_300x250",..: 4 4 1604 4755 4 4 4 4958 4 2171 ...
##  $ opage                  : Factor w/ 455 levels "","-","0","404__404",..: 1 1 262 441 1 1 1 1 1 31 ...
##  $ ServiceTier            : Factor w/ 6 levels "FREE","PREM",..: 5 5 2 2 2 2 2 5 2 5 ...
##  $ MonthsPerBill          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BillingZipCode         : num  44333 94127 18015 19426 96737 ...
##  $ BillingState           : Factor w/ 171 levels "","- AICH","- MIYA",..: 118 30 126 126 65 103 117 117 126 53 ...
##  $ EcommOrderId           : num  3.68e+08 3.67e+08 3.67e+08 3.68e+08 3.67e+08 ...
##  $ OrderDate              : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 16 1 2 11 8 1 26 14 15 1 ...
##  $ PaymentTypeName        : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTypeCode        : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentSubTypeCode     : Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 2 4 6 3 2 6 6 ...
##  $ CPTCardBrand           : Factor w/ 3 levels "MC","None","VI": 3 3 3 2 1 3 2 2 3 3 ...
##  $ CPTCardType            : Factor w/ 7 levels "0 - None","1 - Credit",..: 2 2 4 1 2 2 1 1 4 4 ...
##  $ CPTBIN                 : num  408161 414729 482857 NA 512107 ...
##  $ CPTTransactionDivision : num  193099 193099 193099 NA 193099 ...
##  $ PaymentTxTypeCode      : Factor w/ 1 level "BC": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTxStatus        : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PmtStatType            : Factor w/ 1 level "COMPLETE": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PT_status              : Factor w/ 33 levels "00","01","04",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AVS_status             : Factor w/ 16 levels "2","3","6","7",..: 8 10 10 6 2 8 8 6 7 8 ...
##  $ SEC_status             : Factor w/ 4 levels "M","P","S","UNK": 4 4 4 4 4 4 3 4 4 4 ...
##  $ RiskCode               : Factor w/ 1 level "UNK": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentProcessorCode   : Factor w/ 4 levels "AMZ","PPL","PT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentProcessorDesc   : Factor w/ 4 levels "","Amazon FPS",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentDivisionCode    : Factor w/ 4 levels "193099","203217",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentDivisionDesc    : Factor w/ 5 levels "","Amazon FPS non-recurring default",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ PaymentDivisionUseType : Factor w/ 3 levels "","N","R": 3 3 3 3 3 3 3 3 3 3 ...
##  $ PaymentDivisionRiskType: Factor w/ 2 levels "","NORMALRISK": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Country                : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Currency               : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Amount                 : num  14.99 14.99 9.99 9.99 10.35 ...
##  $ OrderStatus            : Factor w/ 3 levels "HRDDECLINE","SFTDECLINE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Churn                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OrdSucces              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ OrdSftDec              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ OrdHrdDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PayAttempts            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PaySuccess             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ PaySftDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PayHrdDec              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat89               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat05               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat14               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat56               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat52               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat12               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat33               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat41               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTStat04               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubOrderCount          : num  26 25 25 16 8 25 8 27 17 19 ...
##  $ SubPayAttempts         : num  34 25 25 16 8 25 8 27 17 19 ...
##  $ SubAmount              : num  389.7 374.8 249.8 159.8 82.8 ...
##  $ SubFirstOrderDate      : Factor w/ 954 levels "2011-04-21","2011-04-22",..: 6 11 43 326 660 72 678 24 301 256 ...
##  $ SubLastOrderDate       : Factor w/ 408 levels "2011-10-23","2011-10-29",..: 364 319 320 359 356 349 344 362 363 349 ...
##  $ SubChurns              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubOrdSucces           : num  26 25 25 16 8 25 8 27 17 19 ...
##  $ SubOrdSftDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubOrdHrdDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPaySuccess          : num  26 25 25 16 8 25 8 27 17 19 ...
##  $ SubPaySftDec           : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ SubPayHrdDec           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat89            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat05            : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat14            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat56            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat52            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat12            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat33            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat41            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubPTStat04            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SubCount               : num  1 1 1 1 2 2 2 1 2 2 ...
##  $ UserOrderCount         : num  26 25 25 16 26 26 28 27 18 25 ...
##  $ UserPayAttempts        : num  34 25 25 16 26 26 28 27 18 25 ...
##  $ UserAmount             : num  390 375 250 160 269 ...
##  $ UserFirstOrderDate     : Factor w/ 955 levels "2011-04-21","2011-04-22",..: 6 11 43 326 18 11 6 24 129 24 ...
##  $ UserLastOrderDate      : Factor w/ 878 levels "2011-04-22","2011-04-23",..: 833 788 789 828 825 818 813 831 832 818 ...
##  $ UserChurns             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserOrdSucces          : num  26 25 25 16 26 26 28 27 18 25 ...
##  $ UserOrdSftDec          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserOrdHrdDec          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPaySuccess         : num  26 25 25 16 26 26 28 27 18 25 ...
##  $ UserPaySftDec          : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ UserPayHrdDec          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat89           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat05           : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat14           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat56           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat52           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat12           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat33           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat41           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat04           : num  0 0 0 0 0 0 0 0 0 0 ...
head(ChurnFull, n = 3)
##   UserPartyId UserOrigSignupDt SubscriberServiceId FirstBillDt LastPlayDt   SignUpDt StopRequestDt
## 1      120392       2009-05-10           141005905  2009-05-25 2013-12-24 2009-05-10    2999-01-01
## 2      141193       2003-11-15            87975825  2003-11-23 2013-12-24 2007-11-29    2999-01-01
## 3      146503       2010-06-01           176614654  2010-06-01 2013-05-09 2010-06-01    2014-01-07
##     CancelDt  CancelType CancelReason ChannelGroupName                   ChannelName CobrandCode
## 1 2999-01-01 Current Sub                      Unmapped                       Unknown       40134
## 2 2999-01-01 Current Sub                      Unmapped                       Unknown           1
## 3 2999-01-01 Current Sub                 Miscellaneous RealPlayer Feature Trigger SP       40134
##    CobrandName OriginCode pcode     cpath rsrc                     opage ServiceTier MonthsPerBill
## 1 RealRhapsody LogInViaRN     .         .    .                                   RTG             1
## 2   Listen.com     listen     .         .    .                                   RTG             1
## 3 RealRhapsody       rcom     . myaccount edit rc_rp_trigger_dvdplayback        PREM             1
##   BillingZipCode BillingState EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode
## 1          44333           OH    367726684 2013-11-16          Credit              CC
## 2          94127           CA    367201564 2013-11-01          Credit              CC
## 3          18015           PA    367275211 2013-11-02          Credit              CC
##   PaymentSubTypeCode CPTCardBrand CPTCardType CPTBIN CPTTransactionDivision PaymentTxTypeCode
## 1                 VI           VI  1 - Credit 408161                 193099                BC
## 2                 VI           VI  1 - Credit 414729                 193099                BC
## 3                 VI           VI   3 - Check 482857                 193099                BC
##   PaymentTxStatus PmtStatType PT_status AVS_status SEC_status RiskCode PaymentProcessorCode
## 1         SUCCESS    COMPLETE        00          C        UNK      UNK                   PT
## 2         SUCCESS    COMPLETE        00          G        UNK      UNK                   PT
## 3         SUCCESS    COMPLETE        00          G        UNK      UNK                   PT
##   PaymentProcessorDesc PaymentDivisionCode               PaymentDivisionDesc PaymentDivisionUseType
## 1          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
## 2          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
## 3          PaymentTech              193099 Paymentech Recurring for Rhapsody                      R
##   PaymentDivisionRiskType Country Currency Amount OrderStatus Churn OrdSucces OrdSftDec OrdHrdDec
## 1              NORMALRISK      US      USD  14.99     SUCCESS     0         1         1         0
## 2              NORMALRISK      US      USD  14.99     SUCCESS     0         1         1         0
## 3              NORMALRISK      US      USD   9.99     SUCCESS     0         1         1         0
##   PayAttempts PaySuccess PaySftDec PayHrdDec PTStat89 PTStat05 PTStat14 PTStat56 PTStat52 PTStat12
## 1           0          1         0         0        0        0        0        0        0        0
## 2           0          1         0         0        0        0        0        0        0        0
## 3           0          1         0         0        0        0        0        0        0        0
##   PTStat33 PTStat41 PTStat04 SubOrderCount SubPayAttempts SubAmount SubFirstOrderDate
## 1        0        0        0            26             34    389.74        2011-04-26
## 2        0        0        0            25             25    374.75        2011-05-01
## 3        0        0        0            25             25    249.75        2011-06-02
##   SubLastOrderDate SubChurns SubOrdSucces SubOrdSftDec SubOrdHrdDec SubPaySuccess SubPaySftDec
## 1       2013-10-16         0           26            0            0            26            2
## 2       2013-09-01         0           25            0            0            25            0
## 3       2013-09-02         0           25            0            0            25            0
##   SubPayHrdDec SubPTStat89 SubPTStat05 SubPTStat14 SubPTStat56 SubPTStat52 SubPTStat12 SubPTStat33
## 1            0           0           2           0           0           0           0           0
## 2            0           0           0           0           0           0           0           0
## 3            0           0           0           0           0           0           0           0
##   SubPTStat41 SubPTStat04 SubCount UserOrderCount UserPayAttempts UserAmount UserFirstOrderDate
## 1           0           0        1             26              34     389.74         2011-04-26
## 2           0           0        1             25              25     374.75         2011-05-01
## 3           0           0        1             25              25     249.75         2011-06-02
##   UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec UserPaySuccess
## 1        2013-10-16          0            26             0             0             26
## 2        2013-09-01          0            25             0             0             25
## 3        2013-09-02          0            25             0             0             25
##   UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56 UserPTStat52
## 1             2             0            0            2            0            0            0
## 2             0             0            0            0            0            0            0
## 3             0             0            0            0            0            0            0
##   UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04
## 1            0            0            0            0
## 2            0            0            0            0
## 3            0            0            0            0
# Data Understanding
#   A lot of variables!
#   Perhaps use pivot tables and/or Tableau to build on earlier work 

# Clean up card brand / card type data, add to database
#   Look at distribution of card brand and type in CPT data

table(ChurnFull$CPTCardBrand, ChurnFull$CPTCardType)
##       
##        0 - None 1 - Credit 2 - Debit 3 - Check 5 - Pre-Paid 6 - Charge Card
##   MC          0      56950     60926         0         6504               0
##   None    86749          0         0         0            0               0
##   VI          0      89563         0    214453        14255               1
##       
##        7 - European Deferred Debit Card
##   MC                                  0
##   None                                0
##   VI                                 29
#   Set CardBrand
#     Set CardBrand defailt to PaymentSubType value from original Order Summary
#     Replace PaymentSubType value with CPTCardBrand if present (not None)

ChurnFull$CardBrand <- ChurnFull$PaymentSubTypeCode
ChurnFull$CardBrand[ChurnFull$CPTCardBrand != "None"] <- 
  ChurnFull$CPTCardBrand[ChurnFull$CPTCardBrand != "None"]

#   Clean up CardType
#     Remove leading numbers from codes
#     Map Check to Debit
#     Fill in CardType (as Credit) for credit-only CardBrands -- AX, DI, PAYPAL

ChurnFull$CardType <- mapvalues(ChurnFull$CPTCardType, 
                      c("0 - None", "1 - Credit", "2 - Debit", "3 - Check", 
                        "5 - Pre-Paid", "7 - European Deferred Debit Card"), 
                      c("None", "Credit", "Debit", "Debit", "Prepaid", "Debit"))
ChurnFull$CardType[ChurnFull$CardBrand == "AX"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "DI"] <- "Credit"
ChurnFull$CardType[ChurnFull$CardBrand == "PAYPAL"] <- "Credit"

#   Check newly created CardBrand and CardType

table(ChurnFull$CardBrand, ChurnFull$CardType)
##          
##             None Credit  Debit Prepaid 6 - Charge Card
##   AMZ_FPS    832      0      0       0               0
##   AX           0  53352      0       0               0
##   DI           0  14070      0       0               0
##   MC         718  56950  60926    6504               0
##   PAYPAL       0  16121      0       0               0
##   VI        1656  89563 214482   14255               1
# Set reference levels for CardBrand and CardType
#   Sets VISA and Credit as the omitted factor levels in models
#   CardBrand and CardType estimates are then interpretted relative to 
#     these reference levels

ChurnFull$CardBrand <- relevel(ChurnFull$CardBrand, ref = "VI")
ChurnFull$CardType <- relevel(ChurnFull$CardType, ref = "Credit")

# Add User history metrics
#   Convert date strings from factor to date type
#   Tenure, OrderDensity, Payment Friction, ChurnDensity

ChurnFull$UserOrigSignupDate <- 
  as.Date(ChurnFull$UserOrigSignupDt, format = DateFormat)
ChurnFull$FirstBillDate <- 
  as.Date(ChurnFull$FirstBillDt, format = DateFormat)
ChurnFull$LastPlayDate <- 
  as.Date(ChurnFull$LastPlayDt, format = DateFormat)
ChurnFull$SignUpDate <- 
  as.Date(ChurnFull$SignUpDt, format = DateFormat)
ChurnFull$StopRequestDate <- 
  as.Date(ChurnFull$StopRequestDt, format = DateFormat)
ChurnFull$CancelDate <- 
  as.Date(ChurnFull$CancelDt, format = DateFormat)
ChurnFull$OrdDate <- 
  as.Date(ChurnFull$OrderDate, format = DateFormat)
ChurnFull$UserFirstOrdDate <- 
  as.Date(ChurnFull$UserFirstOrderDate, format = DateFormat)
ChurnFull$UserLastOrdDate <- 
  as.Date(ChurnFull$UserLastOrderDate, format = DateFormat)

#   Calculated date features

ChurnFull$UserTenure <- 
  as.numeric(round((ChurnFull$OrdDate - ChurnFull$UserOrigSignupDate)/365.25*12 + 1))
ChurnFull$UserWindow <- 
  as.numeric(round((ChurnFull$UserLastOrdDate - 
                      ChurnFull$UserFirstOrdDate)/365.25*12 + 1))
ChurnFull$USuccessDensity <- 
  (ChurnFull$UserPaySuccess * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$USDDensity <- 
  (ChurnFull$UserPaySftDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UHDDensity <- 
  (ChurnFull$UserPayHrdDec * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow
ChurnFull$UChurnDensity <- 
  (ChurnFull$UserChurns * ChurnFull$MonthsPerBill)/ChurnFull$UserWindow

# Randomly split observations 80/20 into training and test samples
#   Generate sample of indices

set.seed(999999999)
TrainSize <- round(0.8 * AllRows)
TrainIdx <- sample(AllRows, TrainSize)

#   Label observations as Train or Test
#   Add Train/Test labels to data base

TrainTest <- rep("Test", AllRows)
TrainTest[TrainIdx] <- "Train"
ChurnFull$TrainTest <- TrainTest

# Save prepped data to .Rdata file for subsequent analysis

save("ChurnFull", file = ChurnPrepped)

########### END OF DATA PREPARATION

########### BEGIN ANALYSIS

# Select (using dplyr) analysis subset of candidate model variables/features

ChurnAnalysis <- select(ChurnFull, 
                        TrainTest, Churn, UserOrigSignupDate, 
                        FirstBillDate, LastPlayDate, 
                        SignUpDate, StopRequestDate, CancelDate, CancelType,
                        CancelReason, ServiceTier:CPTCardType, 
                        CardBrand, CardType, Amount,
                        SubCount:UserPTStat04,
                        UserTenure, UserWindow, 
                        USuccessDensity, USDDensity, UHDDensity, UChurnDensity
                        )
str(ChurnAnalysis, list.len = 999)
## 'data.frame':    529430 obs. of  52 variables:
##  $ TrainTest         : chr  "Test" "Train" "Train" "Test" ...
##  $ Churn             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserOrigSignupDate: Date, format: "2009-05-10" "2003-11-15" "2010-06-01" ...
##  $ FirstBillDate     : Date, format: "2009-05-25" "2003-11-23" "2010-06-01" ...
##  $ LastPlayDate      : Date, format: "2013-12-24" "2013-12-24" "2013-05-09" ...
##  $ SignUpDate        : Date, format: "2009-05-10" "2007-11-29" "2010-06-01" ...
##  $ StopRequestDate   : Date, format: "2999-01-01" "2999-01-01" "2014-01-07" ...
##  $ CancelDate        : Date, format: "2999-01-01" "2999-01-01" "2999-01-01" ...
##  $ CancelType        : Factor w/ 4 levels "Current Sub",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ CancelReason      : Factor w/ 6 levels "","MIGRATION_PAID_TO_PAID",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ServiceTier       : Factor w/ 6 levels "FREE","PREM",..: 5 5 2 2 2 2 2 5 2 5 ...
##  $ MonthsPerBill     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BillingZipCode    : num  44333 94127 18015 19426 96737 ...
##  $ BillingState      : Factor w/ 171 levels "","- AICH","- MIYA",..: 118 30 126 126 65 103 117 117 126 53 ...
##  $ EcommOrderId      : num  3.68e+08 3.67e+08 3.67e+08 3.68e+08 3.67e+08 ...
##  $ OrderDate         : Factor w/ 30 levels "2013-11-01","2013-11-02",..: 16 1 2 11 8 1 26 14 15 1 ...
##  $ PaymentTypeName   : Factor w/ 2 levels "Credit","Flexib": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentTypeCode   : Factor w/ 2 levels "CC","FP": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PaymentSubTypeCode: Factor w/ 6 levels "AMZ_FPS","AX",..: 6 6 6 2 4 6 3 2 6 6 ...
##  $ CPTCardBrand      : Factor w/ 3 levels "MC","None","VI": 3 3 3 2 1 3 2 2 3 3 ...
##  $ CPTCardType       : Factor w/ 7 levels "0 - None","1 - Credit",..: 2 2 4 1 2 2 1 1 4 4 ...
##  $ CardBrand         : Factor w/ 6 levels "VI","AMZ_FPS",..: 1 1 1 3 5 1 4 3 1 1 ...
##  $ CardType          : Factor w/ 5 levels "Credit","None",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ Amount            : num  14.99 14.99 9.99 9.99 10.35 ...
##  $ SubCount          : num  1 1 1 1 2 2 2 1 2 2 ...
##  $ UserOrderCount    : num  26 25 25 16 26 26 28 27 18 25 ...
##  $ UserPayAttempts   : num  34 25 25 16 26 26 28 27 18 25 ...
##  $ UserAmount        : num  390 375 250 160 269 ...
##  $ UserFirstOrderDate: Factor w/ 955 levels "2011-04-21","2011-04-22",..: 6 11 43 326 18 11 6 24 129 24 ...
##  $ UserLastOrderDate : Factor w/ 878 levels "2011-04-22","2011-04-23",..: 833 788 789 828 825 818 813 831 832 818 ...
##  $ UserChurns        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserOrdSucces     : num  26 25 25 16 26 26 28 27 18 25 ...
##  $ UserOrdSftDec     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserOrdHrdDec     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPaySuccess    : num  26 25 25 16 26 26 28 27 18 25 ...
##  $ UserPaySftDec     : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ UserPayHrdDec     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat89      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat05      : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat14      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat56      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat52      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat12      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat33      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat41      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserPTStat04      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UserTenure        : num  55 121 42 22 112 59 126 66 100 44 ...
##  $ UserWindow        : num  31 29 28 20 30 30 30 30 27 30 ...
##  $ USuccessDensity   : num  0.839 0.862 0.893 0.8 0.867 ...
##  $ USDDensity        : num  0.0645 0 0 0 0 ...
##  $ UHDDensity        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UChurnDensity     : num  0 0 0 0 0 0 0 0 0 0 ...
head(ChurnAnalysis, n = 3)
##   TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1      Test     0         2009-05-10    2009-05-25   2013-12-24 2009-05-10      2999-01-01
## 2     Train     0         2003-11-15    2003-11-23   2013-12-24 2007-11-29      2999-01-01
## 3     Train     0         2010-06-01    2010-06-01   2013-05-09 2010-06-01      2014-01-07
##   CancelDate  CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 1 2999-01-01 Current Sub                      RTG             1          44333           OH
## 2 2999-01-01 Current Sub                      RTG             1          94127           CA
## 3 2999-01-01 Current Sub                     PREM             1          18015           PA
##   EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 1    367726684 2013-11-16          Credit              CC                 VI           VI
## 2    367201564 2013-11-01          Credit              CC                 VI           VI
## 3    367275211 2013-11-02          Credit              CC                 VI           VI
##   CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 1  1 - Credit        VI   Credit  14.99        1             26              34     389.74
## 2  1 - Credit        VI   Credit  14.99        1             25              25     374.75
## 3   3 - Check        VI    Debit   9.99        1             25              25     249.75
##   UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 1         2011-04-26        2013-10-16          0            26             0             0
## 2         2011-05-01        2013-09-01          0            25             0             0
## 3         2011-06-02        2013-09-02          0            25             0             0
##   UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 1             26             2             0            0            2            0            0
## 2             25             0             0            0            0            0            0
## 3             25             0             0            0            0            0            0
##   UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 1            0            0            0            0            0         55         31
## 2            0            0            0            0            0        121         29
## 3            0            0            0            0            0         42         28
##   USuccessDensity USDDensity UHDDensity UChurnDensity
## 1       0.8387097 0.06451613          0             0
## 2       0.8620690 0.00000000          0             0
## 3       0.8928571 0.00000000          0             0
#   Check data summaries

summary(ChurnAnalysis)
##   TrainTest             Churn         UserOrigSignupDate   FirstBillDate       
##  Length:529430      Min.   :0.00000   Min.   :2001-12-03   Min.   :2001-12-03  
##  Class :character   1st Qu.:0.00000   1st Qu.:2010-08-20   1st Qu.:2010-09-04  
##  Mode  :character   Median :0.00000   Median :2011-12-01   Median :2011-12-05  
##                     Mean   :0.04311   Mean   :2011-06-11   Mean   :2017-12-08  
##                     3rd Qu.:0.00000   3rd Qu.:2013-01-23   3rd Qu.:2013-02-03  
##                     Max.   :1.00000   Max.   :2013-11-22   Max.   :2999-01-01  
##                                                                                
##   LastPlayDate          SignUpDate         StopRequestDate        CancelDate        
##  Min.   :2003-05-09   Min.   :2002-12-22   Min.   :2007-11-26   Min.   :2007-12-16  
##  1st Qu.:2013-11-20   1st Qu.:2011-06-17   1st Qu.:2999-01-01   1st Qu.:2999-01-01  
##  Median :2014-01-15   Median :2012-06-11   Median :2999-01-01   Median :2999-01-01  
##  Mean   :2013-10-23   Mean   :2012-02-15   Mean   :2955-07-23   Mean   :2903-07-19  
##  3rd Qu.:2014-01-21   3rd Qu.:2013-03-11   3rd Qu.:2999-01-01   3rd Qu.:2999-01-01  
##  Max.   :2014-01-22   Max.   :2013-11-22   Max.   :2999-01-01   Max.   :2999-01-01  
##  NA's   :45646                                                                      
##        CancelType                        CancelReason    ServiceTier   MonthsPerBill   
##  Current Sub:478125                            :478125   FREE:   178   Min.   : 1.000  
##  Involuntary: 24964   MIGRATION_PAID_TO_PAID   :  7569   PREM:337522   1st Qu.: 1.000  
##  Migration  :  8092   NON_BILLABLE_HARD_DECLINE:  5025   RHAP: 11311   Median : 1.000  
##  Voluntary  : 18249   NON_BILLABLE_SOFT_DECLINE: 19934   RR  :  1275   Mean   : 1.061  
##                       NON_TIER_TRANSITION      :   523   RTG :178973   3rd Qu.: 1.000  
##                       NONE                     : 18254   RU  :   171   Max.   :12.000  
##                                                                                        
##  BillingZipCode   BillingState     EcommOrderId            OrderDate      PaymentTypeName
##  Min.   :    0   CA     : 57425   Min.   :367201517   2013-11-01: 44960   Credit:512466  
##  1st Qu.:27521   TX     : 47599   1st Qu.:367423714   2013-11-03: 20075   Flexib: 16964  
##  Median :48855          : 32196   Median :367665426   2013-11-09: 18947                  
##  Mean   :52015   NY     : 30870   Mean   :367665833   2013-11-05: 18560                  
##  3rd Qu.:78741   FL     : 29788   3rd Qu.:367898711   2013-11-07: 18476                  
##  Max.   :99999   (Other):331551   Max.   :368164705   2013-11-02: 18459                  
##                  NA's   :     1                       (Other)   :389953                  
##  PaymentTypeCode PaymentSubTypeCode CPTCardBrand                            CPTCardType    
##  CC:512466       AMZ_FPS:   832     MC  :124380   0 - None                        : 86749  
##  FP: 16964       AX     : 53377     None: 86749   1 - Credit                      :146513  
##                  DI     : 14076     VI  :318301   2 - Debit                       : 60926  
##                  MC     :124774                   3 - Check                       :214453  
##                  PAYPAL : 16170                   5 - Pre-Paid                    : 20759  
##                  VI     :320201                   6 - Charge Card                 :     1  
##                                                   7 - European Deferred Debit Card:    29  
##    CardBrand                 CardType          Amount          SubCount      UserOrderCount 
##  VI     :319957   Credit         :230056   Min.   :  0.00   Min.   : 0.000   Min.   : 0.00  
##  AMZ_FPS:   832   None           :  3206   1st Qu.:  9.99   1st Qu.: 1.000   1st Qu.: 9.00  
##  AX     : 53352   Debit          :275408   Median :  9.99   Median : 1.000   Median :20.00  
##  DI     : 14070   Prepaid        : 20759   Mean   : 12.09   Mean   : 1.371   Mean   :17.28  
##  MC     :125098   6 - Charge Card:     1   3rd Qu.: 14.99   3rd Qu.: 2.000   3rd Qu.:25.00  
##  PAYPAL : 16121                            Max.   :178.99   Max.   :18.000   Max.   :46.00  
##                                                                                             
##  UserPayAttempts    UserAmount     UserFirstOrderDate  UserLastOrderDate    UserChurns     
##  Min.   :  0.00   Min.   :  0.0   2011-05-01: 18901   2013-10-01: 39759   Min.   : 0.0000  
##  1st Qu.: 11.00   1st Qu.: 99.9   2011-05-03:  6731   2013-10-03: 16172   1st Qu.: 0.0000  
##  Median : 22.00   Median :227.2   2011-04-26:  6645   2013-10-09: 15224   Median : 0.0000  
##  Mean   : 20.04   Mean   :210.6   2011-05-10:  6423   2013-10-05: 14810   Mean   : 0.2369  
##  3rd Qu.: 27.00   3rd Qu.:303.8   2011-05-16:  6350   2013-10-07: 14730   3rd Qu.: 0.0000  
##  Max.   :158.00   Max.   :681.7   2011-05-09:  6345   2013-10-02: 14648   Max.   :17.0000  
##                                   (Other)   :478035   (Other)   :414087                    
##  UserOrdSucces   UserOrdSftDec    UserOrdHrdDec     UserPaySuccess  UserPaySftDec    
##  Min.   : 0.00   Min.   : 0.000   Min.   :0.00000   Min.   : 0.00   Min.   : 0.0000  
##  1st Qu.: 9.00   1st Qu.: 0.000   1st Qu.:0.00000   1st Qu.: 9.00   1st Qu.: 0.0000  
##  Median :19.00   Median : 0.000   Median :0.00000   Median :19.00   Median : 0.0000  
##  Mean   :17.05   Mean   : 0.198   Mean   :0.03889   Mean   :17.05   Mean   : 0.7418  
##  3rd Qu.:25.00   3rd Qu.: 0.000   3rd Qu.:0.00000   3rd Qu.:25.00   3rd Qu.: 1.0000  
##  Max.   :46.00   Max.   :17.000   Max.   :8.00000   Max.   :46.00   Max.   :21.0000  
##                                                                                      
##  UserPayHrdDec      UserPTStat89      UserPTStat05      UserPTStat14       UserPTStat56   
##  Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.00000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.:0.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.00000   Median :0.0000  
##  Mean   : 0.1009   Mean   : 0.3716   Mean   : 0.2529   Mean   : 0.06399   Mean   :0.0308  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.:0.0000  
##  Max.   :15.0000   Max.   :21.0000   Max.   :17.0000   Max.   :11.00000   Max.   :9.0000  
##                                                                                           
##   UserPTStat52       UserPTStat12      UserPTStat33      UserPTStat41     UserPTStat04    
##  Min.   : 0.00000   Min.   :0.00000   Min.   :0.00000   Min.   : 0.000   Min.   :0.00000  
##  1st Qu.: 0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.: 0.000   1st Qu.:0.00000  
##  Median : 0.00000   Median :0.00000   Median :0.00000   Median : 0.000   Median :0.00000  
##  Mean   : 0.02035   Mean   :0.01699   Mean   :0.01676   Mean   : 0.056   Mean   :0.02092  
##  3rd Qu.: 0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.: 0.000   3rd Qu.:0.00000  
##  Max.   :11.00000   Max.   :9.00000   Max.   :9.00000   Max.   :15.000   Max.   :6.00000  
##                                                                                           
##    UserTenure       UserWindow   USuccessDensity     USDDensity         UHDDensity      
##  Min.   :  1.00   Min.   : 1.0   Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.00000  
##  1st Qu.: 11.00   1st Qu.:12.0   1st Qu.: 0.7917   1st Qu.: 0.00000   1st Qu.: 0.00000  
##  Median : 24.00   Median :23.0   Median : 0.8621   Median : 0.00000   Median : 0.00000  
##  Mean   : 30.14   Mean   :20.7   Mean   : 0.8345   Mean   : 0.04885   Mean   : 0.00546  
##  3rd Qu.: 40.00   3rd Qu.:30.0   3rd Qu.: 0.9091   3rd Qu.: 0.03846   3rd Qu.: 0.00000  
##  Max.   :144.00   Max.   :31.0   Max.   :12.0000   Max.   :12.00000   Max.   :12.00000  
##                                                                                         
##  UChurnDensity     
##  Min.   : 0.00000  
##  1st Qu.: 0.00000  
##  Median : 0.00000  
##  Mean   : 0.01482  
##  3rd Qu.: 0.00000  
##  Max.   :12.00000  
## 
#   Split analysis database into Train and Test

ChurnTrain <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Train",]
ChurnTest <- ChurnAnalysis[ChurnAnalysis$TrainTest == "Test",]
TrainRows <- nrow(ChurnTrain)
TestRows <- nrow(ChurnTest)
TrainRows
## [1] 423544
TestRows
## [1] 105886
# ANALYSIS AND MODELING OF TRAINING DATA

# Estimate card brand and card type effects

BrandModel <- glm(Churn ~ CardBrand,  
                  family = binomial(logit), data = ChurnTrain)
summary(BrandModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5557  -0.3121  -0.3121  -0.3121   3.0174  
## 
## Coefficients:
##                   Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)      -2.997272   0.009287 -322.722   <2e-16 ***
## CardBrandAMZ_FPS  1.207306   0.112402   10.741   <2e-16 ***
## CardBrandAX      -1.544367   0.048292  -31.980   <2e-16 ***
## CardBrandDI      -1.221875   0.079445  -15.380   <2e-16 ***
## CardBrandMC       0.006294   0.017484    0.360   0.7189    
## CardBrandPAYPAL  -0.077641   0.043800   -1.773   0.0763 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 150383  on 423543  degrees of freedom
## Residual deviance: 148255  on 423538  degrees of freedom
## AIC: 148267
## 
## Number of Fisher Scoring iterations: 7
BrandTypeModel <- glm(Churn ~ CardBrand + CardType, 
                      family = binomial(logit), data = ChurnTrain)
summary(BrandTypeModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand + CardType, family = binomial(logit), 
##     data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9424  -0.2933  -0.2933  -0.1930   3.0174  
## 
## Coefficients:
##                         Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)             -3.97403    0.02270 -175.088  < 2e-16 ***
## CardBrandAMZ_FPS        -1.14663    0.12195   -9.402  < 2e-16 ***
## CardBrandAX             -0.56761    0.05255  -10.802  < 2e-16 ***
## CardBrandDI             -0.24512    0.08210   -2.986  0.00283 ** 
## CardBrandMC              0.06181    0.01843    3.353  0.00080 ***
## CardBrandPAYPAL          0.89912    0.04845   18.558  < 2e-16 ***
## CardTypeNone             3.33070    0.05249   63.452  < 2e-16 ***
## CardTypeDebit            0.84910    0.02411   35.220  < 2e-16 ***
## CardTypePrepaid          3.01709    0.02757  109.444  < 2e-16 ***
## CardType6 - Charge Card -4.59174   43.95396   -0.104  0.91680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 150383  on 423543  degrees of freedom
## Residual deviance: 133772  on 423534  degrees of freedom
## AIC: 133792
## 
## Number of Fisher Scoring iterations: 7
# Stepwise feature selection

BrandTypeRedModel <- step(BrandTypeModel, k = log(TrainRows)) # based on BIC
## Start:  AIC=133901.8
## Churn ~ CardBrand + CardType
## 
##             Df Deviance    AIC
## <none>           133772 133902
## - CardBrand  5   134418 134483
## - CardType   4   148255 148333
summary(BrandTypeRedModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand + CardType, family = binomial(logit), 
##     data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9424  -0.2933  -0.2933  -0.1930   3.0174  
## 
## Coefficients:
##                         Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)             -3.97403    0.02270 -175.088  < 2e-16 ***
## CardBrandAMZ_FPS        -1.14663    0.12195   -9.402  < 2e-16 ***
## CardBrandAX             -0.56761    0.05255  -10.802  < 2e-16 ***
## CardBrandDI             -0.24512    0.08210   -2.986  0.00283 ** 
## CardBrandMC              0.06181    0.01843    3.353  0.00080 ***
## CardBrandPAYPAL          0.89912    0.04845   18.558  < 2e-16 ***
## CardTypeNone             3.33070    0.05249   63.452  < 2e-16 ***
## CardTypeDebit            0.84910    0.02411   35.220  < 2e-16 ***
## CardTypePrepaid          3.01709    0.02757  109.444  < 2e-16 ***
## CardType6 - Charge Card -4.59174   43.95396   -0.104  0.91680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 150383  on 423543  degrees of freedom
## Residual deviance: 133772  on 423534  degrees of freedom
## AIC: 133792
## 
## Number of Fisher Scoring iterations: 7
# Estimate card brand, card type and User effects

OrderUserModel <- glm(Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
                      USDDensity + UHDDensity + UChurnDensity, 
                      family = binomial(logit), data = ChurnTrain)
summary(OrderUserModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity + UHDDensity + UChurnDensity, family = binomial(logit), 
##     data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8314  -0.3114  -0.2390  -0.1630   3.8062  
## 
## Coefficients:
##                          Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)             -3.122409   0.030495 -102.389  < 2e-16 ***
## CardBrandAMZ_FPS        -1.565125   0.123590  -12.664  < 2e-16 ***
## CardBrandAX             -0.505114   0.052825   -9.562  < 2e-16 ***
## CardBrandDI             -0.182046   0.082432   -2.208  0.02721 *  
## CardBrandMC              0.040225   0.018555    2.168  0.03017 *  
## CardBrandPAYPAL          0.477622   0.049033    9.741  < 2e-16 ***
## CardTypeNone             3.221118   0.055524   58.013  < 2e-16 ***
## CardTypeDebit            0.634891   0.024420   25.999  < 2e-16 ***
## CardTypePrepaid          2.383459   0.029777   80.044  < 2e-16 ***
## CardType6 - Charge Card -5.099232  43.953958   -0.116  0.90764    
## UserTenure              -0.025705   0.000544  -47.251  < 2e-16 ***
## USuccessDensity         -0.138484   0.023591   -5.870 4.35e-09 ***
## USDDensity               0.449203   0.027445   16.367  < 2e-16 ***
## UHDDensity               0.242389   0.090162    2.688  0.00718 ** 
## UChurnDensity           -0.094917   0.041895   -2.266  0.02348 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 150383  on 423543  degrees of freedom
## Residual deviance: 130024  on 423529  degrees of freedom
## AIC: 130054
## 
## Number of Fisher Scoring iterations: 7
# Stepwise feature selection

OrderUserRedModel <- step(OrderUserModel, k = log(TrainRows)) # based on BIC
## Start:  AIC=130217.9
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity + UHDDensity + UChurnDensity
## 
##                   Df Deviance    AIC
## - UHDDensity       1   130028 130210
## - UChurnDensity    1   130029 130210
## <none>                 130024 130218
## - USuccessDensity  1   130066 130248
## - USDDensity       1   130314 130496
## - CardBrand        5   130473 130603
## - UserTenure       1   132919 133100
## - CardType         4   138728 138870
## 
## Step:  AIC=130209.9
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity + UChurnDensity
## 
##                   Df Deviance    AIC
## - UChurnDensity    1   130032 130201
## <none>                 130028 130210
## - USuccessDensity  1   130068 130237
## - USDDensity       1   130315 130483
## - CardBrand        5   130479 130596
## - UserTenure       1   132941 133110
## - CardType         4   138814 138943
## 
## Step:  AIC=130200.5
## Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity
## 
##                   Df Deviance    AIC
## <none>                 130032 130201
## - USuccessDensity  1   130070 130226
## - CardBrand        5   130485 130588
## - USDDensity       1   130452 130607
## - UserTenure       1   133008 133164
## - CardType         4   138913 139030
summary(OrderUserRedModel)
## 
## Call:
## glm(formula = Churn ~ CardBrand + CardType + UserTenure + USuccessDensity + 
##     USDDensity, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0978  -0.3115  -0.2394  -0.1629   3.7900  
## 
## Coefficients:
##                           Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)             -3.1417414  0.0290210 -108.258  < 2e-16 ***
## CardBrandAMZ_FPS        -1.5691952  0.1235544  -12.700  < 2e-16 ***
## CardBrandAX             -0.5068364  0.0528550   -9.589  < 2e-16 ***
## CardBrandDI             -0.1822290  0.0824601   -2.210   0.0271 *  
## CardBrandMC              0.0401433  0.0185562    2.163   0.0305 *  
## CardBrandPAYPAL          0.4789792  0.0490312    9.769  < 2e-16 ***
## CardTypeNone             3.2268086  0.0554548   58.188  < 2e-16 ***
## CardTypeDebit            0.6365736  0.0244135   26.075  < 2e-16 ***
## CardTypePrepaid          2.3946461  0.0294876   81.209  < 2e-16 ***
## CardType6 - Charge Card -5.1082314 43.9539580   -0.116   0.9075    
## UserTenure              -0.0258568  0.0005406  -47.826  < 2e-16 ***
## USuccessDensity         -0.1089399  0.0190431   -5.721 1.06e-08 ***
## USDDensity               0.4070114  0.0200980   20.251  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 150383  on 423543  degrees of freedom
## Residual deviance: 130032  on 423531  degrees of freedom
## AIC: 130058
## 
## Number of Fisher Scoring iterations: 7
#    Note Deviance measures and compute Pseudo (Deviance) R2
#      (Null Deviance - Residual Deviance)/(Null Deviance)

# Exponentiate and interpret logistic coefficients
#   Multiplicative change in odds per unit change in predictor

LogisticCoef <- coef(summary(OrderUserRedModel))
ExpCoef <- (exp(LogisticCoef[,"Estimate"]))
NeatCoef <- round(cbind(LogisticCoef, ExpCoef), digits = 6)
NeatCoef
##                          Estimate Std. Error     z value Pr(>|z|)   ExpCoef
## (Intercept)             -3.141741   0.029021 -108.257654 0.000000  0.043207
## CardBrandAMZ_FPS        -1.569195   0.123554  -12.700436 0.000000  0.208213
## CardBrandAX             -0.506836   0.052855   -9.589186 0.000000  0.602398
## CardBrandDI             -0.182229   0.082460   -2.209906 0.027112  0.833410
## CardBrandMC              0.040143   0.018556    2.163333 0.030516  1.040960
## CardBrandPAYPAL          0.478979   0.049031    9.768858 0.000000  1.614425
## CardTypeNone             3.226809   0.055455   58.188046 0.000000 25.199107
## CardTypeDebit            0.636574   0.024414   26.074624 0.000000  1.889994
## CardTypePrepaid          2.394646   0.029488   81.208676 0.000000 10.964317
## CardType6 - Charge Card -5.108231  43.953958   -0.116218 0.907480  0.006047
## UserTenure              -0.025857   0.000541  -47.826368 0.000000  0.974475
## USuccessDensity         -0.108940   0.019043   -5.720702 0.000000  0.896784
## USDDensity               0.407011   0.020098   20.251351 0.000000  1.502321
# Compute and examine training-sample predictions 
#   for reduced logistic --
#   Logistic Prob, Classification

LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTrain, type = "response")

#   Classification analysis, training sample

Threshold <- 0.5
LogisticClass <- rep(0, TrainRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTrain$Churn)
Confusion
##              
## LogisticClass      0      1
##             0 404863  18175
##             1    446     60
mean(LogisticClass == ChurnTrain$Churn)
## [1] 0.9560353
# Lift Chart
#   What fraction of churn is captured in the top x fraction of churn scores?

ScoreFrac <- (1:TrainRows - 1)/(TrainRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTrain$Churn[ranks])/sum(ChurnTrain$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
     main = "Churn Lift Chart -- Train",
     xlab = "Fraction of Top Churn Scores",
     ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
#   Using performance function from ROCR package

plotpred <- prediction(LogisticProb, ChurnTrain$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Train", colorize = TRUE)

#   ROC replotted using qplot (ggplot2)
#     Extract plot variables fpr, tpr, threshold/cutoff
#       from plotroc object created via performace function above

fpr <- plotroc@x.values[[1]]  # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]]  # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]]  # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom = "line", color = Cutoff, 
      main = "ROC Chart -- Train",
      xlab = "False Positive Rate",
      ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package

confusionMatrix(LogisticClass,ChurnTrain$Churn, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 404863  18175
##          1    446     60
##                                           
##                Accuracy : 0.956           
##                  95% CI : (0.9554, 0.9567)
##     No Information Rate : 0.9569          
##     P-Value [Acc > NIR] : 0.9982          
##                                           
##                   Kappa : 0.0041          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0032904       
##             Specificity : 0.9988996       
##          Pos Pred Value : 0.1185771       
##          Neg Pred Value : 0.9570370       
##              Prevalence : 0.0430534       
##          Detection Rate : 0.0001417       
##    Detection Prevalence : 0.0011947       
##       Balanced Accuracy : 0.5010950       
##                                           
##        'Positive' Class : 1               
## 
# Add predictions to data set, save to .csv

ChurnTrain$LogisticProb <- LogisticProb
ChurnTrain$LogisticClass <- LogisticClass
head(ChurnTrain, n = 3)
##   TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 2     Train     0         2003-11-15    2003-11-23   2013-12-24 2007-11-29      2999-01-01
## 3     Train     0         2010-06-01    2010-06-01   2013-05-09 2010-06-01      2014-01-07
## 5     Train     0         2004-07-30    2004-08-07         <NA> 2013-01-31      2999-01-01
##   CancelDate  CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 2 2999-01-01 Current Sub                      RTG             1          94127           CA
## 3 2999-01-01 Current Sub                     PREM             1          18015           PA
## 5 2999-01-01 Current Sub                     PREM             1          96737           HI
##   EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 2    367201564 2013-11-01          Credit              CC                 VI           VI
## 3    367275211 2013-11-02          Credit              CC                 VI           VI
## 5    367488848 2013-11-08          Credit              CC                 MC           MC
##   CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 2  1 - Credit        VI   Credit  14.99        1             25              25     374.75
## 3   3 - Check        VI    Debit   9.99        1             25              25     249.75
## 5  1 - Credit        MC   Credit  10.35        2             26              26     269.10
##   UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 2         2011-05-01        2013-09-01          0            25             0             0
## 3         2011-06-02        2013-09-02          0            25             0             0
## 5         2011-05-08        2013-10-08          0            26             0             0
##   UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 2             25             0             0            0            0            0            0
## 3             25             0             0            0            0            0            0
## 5             26             0             0            0            0            0            0
##   UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 2            0            0            0            0            0        121         29
## 3            0            0            0            0            0         42         28
## 5            0            0            0            0            0        112         30
##   USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 2       0.8620690          0          0             0  0.001718937             0
## 3       0.8928571          0          0             0  0.024401144             0
## 5       0.8666667          0          0             0  0.002255839             0
write.csv(ChurnTrain, file = ChurnTrainAnalysis)

# MODEL EVALUATION USING TEST DATA

# Generate test set predictions -- 
#   Logistic Prob, Classification

LogisticProb <- predict(OrderUserRedModel, newdata = ChurnTest, type = "response")

#   Classification analysis, Testing sample

Threshold <- 0.5
LogisticClass <- rep(0, TestRows)
LogisticClass[LogisticProb > Threshold] <- 1
Confusion <- table(LogisticClass, ChurnTest$Churn)
Confusion
##              
## LogisticClass      0      1
##             0 101181   4580
##             1    115     10
mean(LogisticClass == ChurnTest$Churn)
## [1] 0.9556599
# Lift Chart
#   What fraction of churn is captured in the top x fraction of churn scores?

ScoreFrac <- (1:TestRows - 1)/(TestRows - 1)
ranks <- order(LogisticProb, decreasing=TRUE)
ChurnFrac <- cumsum(ChurnTest$Churn[ranks])/sum(ChurnTest$Churn)
qplot(x = ScoreFrac, y = ChurnFrac, geom = "line",
     main = "Churn Lift Chart -- Test",
     xlab = "Fraction of Top Churn Scores",
     ylab = "Fraction of Churn")

# ROC chart -- True positives versus false positives (for changing cutoffs)
#   Using performance function from ROCR package

plotpred <- prediction(LogisticProb, ChurnTest$Churn)
plotroc <- performance(plotpred, measure = "tpr", x.measure = "fpr")
plot(plotroc, main = "ROC Curve -- Test", colorize = TRUE)

#   ROC replotted using qplot (ggplot2)
#     Extract plot variables fpr, tpr, threshold/cutoff
#       from plotroc object created via performace function above

fpr <- plotroc@x.values[[1]]  # Extract "fpr" for qplot(ggplot2)
tpr <- plotroc@y.values[[1]]  # Extract "tpr" for qplot(ggplot2)
Cutoff <- plotroc@alpha.values[[1]]  # Extract cutoffs for color scale
qplot(x = fpr, y = tpr, geom="line", color= Cutoff, 
      main = "ROC Chart -- Test",
      xlab = "False Positive Rate",
      ylab = "True Positive Rate")

# Detailed confusion matrix metrics using caret package

confusionMatrix(LogisticClass,ChurnTest$Churn, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 101181   4580
##          1    115     10
##                                           
##                Accuracy : 0.9557          
##                  95% CI : (0.9544, 0.9569)
##     No Information Rate : 0.9567          
##     P-Value [Acc > NIR] : 0.9439          
##                                           
##                   Kappa : 0.0019          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 2.179e-03       
##             Specificity : 9.989e-01       
##          Pos Pred Value : 8.000e-02       
##          Neg Pred Value : 9.567e-01       
##              Prevalence : 4.335e-02       
##          Detection Rate : 9.444e-05       
##    Detection Prevalence : 1.181e-03       
##       Balanced Accuracy : 5.005e-01       
##                                           
##        'Positive' Class : 1               
## 
# Add predictions to data set, save to .csv

ChurnTest$LogisticProb <- LogisticProb
ChurnTest$LogisticClass <- LogisticClass
head(ChurnTest, n = 3)
##    TrainTest Churn UserOrigSignupDate FirstBillDate LastPlayDate SignUpDate StopRequestDate
## 1       Test     0         2009-05-10    2009-05-25   2013-12-24 2009-05-10      2999-01-01
## 4       Test     0         2012-02-10    2012-02-10   2014-01-18 2012-02-10      2999-01-01
## 15      Test     0         2003-09-26    2003-11-04         <NA> 2013-01-31      2999-01-01
##    CancelDate  CancelType CancelReason ServiceTier MonthsPerBill BillingZipCode BillingState
## 1  2999-01-01 Current Sub                      RTG             1          44333           OH
## 4  2999-01-01 Current Sub                     PREM             1          19426           PA
## 15 2999-01-01 Current Sub                     PREM             1          26241           WV
##    EcommOrderId  OrderDate PaymentTypeName PaymentTypeCode PaymentSubTypeCode CPTCardBrand
## 1     367726684 2013-11-16          Credit              CC                 VI           VI
## 4     367578619 2013-11-11          Credit              CC                 AX         None
## 15    367455971 2013-11-07          Credit              CC                 MC           MC
##    CPTCardType CardBrand CardType Amount SubCount UserOrderCount UserPayAttempts UserAmount
## 1   1 - Credit        VI   Credit  14.99        1             26              34     389.74
## 4     0 - None        AX   Credit   9.99        1             16              16     159.84
## 15  1 - Credit        MC   Credit   9.99        2             29              32     349.71
##    UserFirstOrderDate UserLastOrderDate UserChurns UserOrdSucces UserOrdSftDec UserOrdHrdDec
## 1          2011-04-26        2013-10-16          0            26             0             0
## 4          2012-03-11        2013-10-11          0            16             0             0
## 15         2011-05-01        2013-10-07          0            29             0             0
##    UserPaySuccess UserPaySftDec UserPayHrdDec UserPTStat89 UserPTStat05 UserPTStat14 UserPTStat56
## 1              26             2             0            0            2            0            0
## 4              16             0             0            0            0            0            0
## 15             29             1             0            1            0            0            0
##    UserPTStat52 UserPTStat12 UserPTStat33 UserPTStat41 UserPTStat04 UserTenure UserWindow
## 1             0            0            0            0            0         55         31
## 4             0            0            0            0            0         22         20
## 15            0            0            0            0            0        122         30
##    USuccessDensity USDDensity UHDDensity UChurnDensity LogisticProb LogisticClass
## 1        0.8387097 0.06451613          0             0  0.009670326             0
## 4        0.8000000 0.00000000          0             0  0.013326569             0
## 15       0.9666667 0.03333333          0             0  0.001747413             0
write.table(ChurnTest, file = ChurnTestAnalysis)

########### END OF ANALYSIS