setwd("~/Documents/BOPS Project")
# Read the BOPS transaction lists for fiscal years 2012 and 2013
data12 <- read.csv("BOPS-FY12.csv")
data13 <- read.csv("BOPS-FY13.csv")
# Read the full transaction file
data_trans <- read.table("transaction.txt", header = TRUE, sep = ",")
# Stack the two BOPS files and flag transactions that appear in them
allbops <- rbind(data12, data13)
data_trans$bops <- ifelse(data_trans$transaction_id %in% allbops$transaction_id, 1, 0)
summary(data_trans)
## customer_id purchase_date transaction_id
## Min. :1.003e+05 13DEC2012:00:00:00: 12989 Min. : 7198
## 1st Qu.:2.569e+07 12DEC2012:00:00:00: 12286 1st Qu.:2649367
## Median :3.015e+07 18DEC2012:00:00:00: 12234 Median :3355444
## Mean :2.944e+10 06DEC2012:00:00:00: 11614 Mean :3338930
## 3rd Qu.:3.499e+07 11DEC2012:00:00:00: 10228 3rd Qu.:4030256
## Max. :9.197e+11 22DEC2011:00:00:00: 9896 Max. :4702552
## (Other) :1602255
## store_number net_purchase_amount sku return
## Min. : 2.0 Min. : 0.00 Min. :10033405 Min. :0.000
## 1st Qu.: 2.0 1st Qu.: 45.13 1st Qu.:17859869 1st Qu.:0.000
## Median : 2.0 Median : 90.00 Median :18126698 Median :0.000
## Mean : 229.5 Mean : 170.33 Mean :18300993 Mean :0.101
## 3rd Qu.: 2.0 3rd Qu.: 189.00 3rd Qu.:18584417 3rd Qu.:0.000
## Max. :5998.0 Max. :39422.00 Max. :80006100 Max. :1.000
##
## return_date return_store time_to_return
## :1502656 Min. : 2.0 Min. : 0.0
## 02JUN2011:00:00:00: 921 1st Qu.: 2.0 1st Qu.: 8.0
## 21JUL2011:00:00:00: 803 Median : 2.0 Median : 17.0
## 25APR2012:00:00:00: 802 Mean : 756.7 Mean : 27.8
## 10JAN2013:00:00:00: 781 3rd Qu.:1479.0 3rd Qu.: 33.0
## 08JAN2013:00:00:00: 744 Max. :5998.0 Max. :1679.0
## (Other) : 164795 NA's :1502656 NA's :1502656
## gender age_band est_income_code ethnic_code
## :210507 Min. : 0.00 Min. :1.00 N :615387
## F:693941 1st Qu.: 0.00 1st Qu.:4.00 S :204830
## M:719292 Median : 5.00 Median :6.00 H :187647
## U: 47762 Mean : 4.95 Mean :5.42 Z :161313
## 3rd Qu.: 8.00 3rd Qu.:7.00 G :120011
## Max. :13.00 Max. :9.00 U : 71027
## NA's :72038 NA's :68664 (Other):311287
## homeowner_code length_of_residence child year
## : 68664 Min. : 0.00 : 68664 Min. :2010
## O:1064498 1st Qu.: 2.00 N:977479 1st Qu.:2011
## R: 538340 Median : 6.00 Y:625359 Median :2012
## Mean : 7.14 Mean :2012
## 3rd Qu.:13.00 3rd Qu.:2012
## Max. :15.00 Max. :2013
## NA's :68664
## month month_index summary bops
## DEC :425369 Min. :13.00 Min. : 1.00 Min. :0.0000
## NOV :187468 1st Qu.:22.00 1st Qu.: 5.00 1st Qu.:0.0000
## FEB :179716 Median :31.00 Median : 9.00 Median :0.0000
## MAY :143596 Mean :31.47 Mean :10.07 Mean :0.1664
## JAN :127777 3rd Qu.:41.00 3rd Qu.:12.00 3rd Qu.:0.0000
## APR :102339 Max. :48.00 Max. :21.00 Max. :1.0000
## (Other):505237 NA's :4
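As a quick sanity check on the matching step (not part of the original code), the new bops flag can be tabulated directly; its mean should agree with the 0.1664 reported in the summary above.
# Illustrative check of the BOPS flag created above
table(data_trans$bops)   # counts of non-BOPS vs BOPS transactions
mean(data_trans$bops)    # share of BOPS transactions, roughly 0.166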
##Subsetting the data to only the BOPS implementation period
# Implementation period: month_index >= 25 for stores 2 and 6, month_index >= 37 for store 5998
data_df1 <- subset(data_trans, (month_index >= 25) & (store_number == 2 | store_number == 6))
data_df2 <- subset(data_trans, (month_index >= 37) & (store_number == 5998))
df_m4 <- rbind(data_df1, data_df2)
##rename net_purchase_amount as price
colnames(df_m4)[colnames(df_m4) == "net_purchase_amount"] <- "price"
Creating Dummies
attach(df_m4)
#Create dummy for return
df_m4$return=ifelse(is.na(df_m4$time_to_return),0,1)
#create dummy variables for ethnic code
df_m4$white <- ifelse(df_m4$ethnic_code == "N"| df_m4$ethnic_code == "Z", 1, 0)
#create dummy variables for store number
df_m4$store2 <- ifelse(store_number == "2", 1, 0)
df_m4$store6 <- ifelse(store_number == "6", 1, 0)
df_m4$store5998 <- ifelse(store_number == "5998", 1, 0)
#create dummy variables for homeowner
df_m4$homeowner <- ifelse(homeowner_code == "O",1,ifelse(homeowner_code == "R",0,NA))
#create dummy variables for male
df_m4$male <- ifelse(gender == "M",1,ifelse(gender == "F",0,NA))
#create dummy variable for child
df_m4$kid <- ifelse(child == "Y",1,ifelse(child == "N",0,NA))
#create dummy variables for product category
df_m4$bridal <- ifelse(summary == "1", 1, 0)
df_m4$goldwedbands <- ifelse(summary == "2", 1, 0)
df_m4$solitaires <- ifelse(summary == "3", 1, 0)
df_m4$dfashion <- ifelse(summary == "4", 1, 0)
df_m4$semiprecious <- ifelse(summary == "5", 1, 0)
df_m4$mens <- ifelse(summary == "6", 1, 0)
df_m4$goldearrings <- ifelse(summary == "7", 1, 0)
df_m4$specialevent <- ifelse(summary == "8", 1, 0)
df_m4$beads <- ifelse(summary == "9", 1, 0)
df_m4$piercings <- ifelse(summary == "10", 1, 0)
df_m4$dsoljewelry <- ifelse(summary == "11", 1, 0)
df_m4$gold <- ifelse(summary == "12", 1, 0)
df_m4$watch <- ifelse(summary == "13", 1, 0)
df_m4$preowned <- ifelse(summary == "14", 1, 0)
df_m4$specialized <- ifelse(summary == "15", 1, 0)
df_m4$events <- ifelse(summary == "17", 1, 0)
df_m4$diawedband <- ifelse(summary == "20", 1, 0)
df_m4$silver <- ifelse(summary == "21", 1, 0)
#create dummy variables for month
df_m4$JAN <- ifelse(month == "JAN", 1, 0)
df_m4$FEB <- ifelse(month == "FEB", 1, 0)
df_m4$MAR <- ifelse(month == "MAR", 1, 0)
df_m4$APR <- ifelse(month == "APR", 1, 0)
df_m4$MAY <- ifelse(month == "MAY", 1, 0)
df_m4$JUN <- ifelse(month == "JUN", 1, 0)
df_m4$JUL <- ifelse(month == "JUL", 1, 0)
df_m4$AUG <- ifelse(month == "AUG", 1, 0)
df_m4$SEP <- ifelse(month == "SEP", 1, 0)
df_m4$OCT <- ifelse(month == "OCT", 1, 0)
df_m4$NOV <- ifelse(month == "NOV", 1, 0)
df_m4$DEC <- ifelse(month == "DEC", 1, 0)
##Create fiscal year variables and a within-year month index
df_m4$month_index2 <- df_m4$month_index %% 12
df_m4$y_2011 <- ifelse(df_m4$year == "2011", 1, 0)
df_m4$y_2012 <- ifelse(df_m4$year == "2012", 1, 0)
df_m4$y_2013 <- ifelse(df_m4$year == "2013", 1, 0)
# Fiscal year: month_index 13-24 -> FY11, 25-36 -> FY12, 37-48 -> FY13
df_m4$fiscalyr <- ifelse(df_m4$month_index >= 13 & df_m4$month_index <= 24, 11,
                         ifelse(df_m4$month_index >= 25 & df_m4$month_index <= 36, 12, 13))
write.csv(df_m4, "df_m4.csv")
df_m4 = read.csv("df_m4.csv")
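As a design note (not part of the original code), the hand-rolled 0/1 columns above could also be generated automatically: factor() plus model.matrix() expands each level into its own indicator, which is what the factor(...) terms in the glm formulas below do implicitly. A minimal sketch, assuming df_m4 as built above:
# Illustrative alternative to the manual month dummies
month_dummies <- model.matrix(~ factor(month) - 1, data = df_m4)
head(month_dummies)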
##Data Aggregation
To investigate the impact of BOPS on consumer return behavior, we use transaction-level data. We do not aggregate the data to the customer level because a return is a behavior we observe at the level of a single transaction, in contrast to purchases.
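For contrast, a customer-level aggregation would collapse the per-transaction return indicator into a per-customer return rate, losing the outcome we want to model. A minimal sketch of what that aggregation would look like (illustrative, not used in the analysis):
# Customer-level aggregation (not used): mean return and BOPS rates per customer
cust_level <- aggregate(cbind(return, bops) ~ customer_id, data = df_m4, FUN = mean)
head(cust_level)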
#install.packages("VIF")
library(VIF)
#install.packages("usdm")
library(usdm)
## Loading required package: sp
## Loading required package: raster
##
## Attaching package: 'usdm'
## The following object is masked from 'package:VIF':
##
## vif
df=data.frame(df_m4$bops,df_m4$kid,df_m4$price,df_m4$age_band, df_m4$est_income_code, df_m4$white, df_m4$male, df_m4$JAN, df_m4$FEB, df_m4$MAR, df_m4$APR, df_m4$MAY, df_m4$JUN, df_m4$JUL, df_m4$AUG, df_m4$SEP, df_m4$OCT, df_m4$NOV, df_m4$store5998, df_m4$store6, df_m4$y_2011,df_m4$y_2013)
cor(df) # all pairwise correlations that can be computed are below 0.8; columns with missing values return NA here (cor(df, use = "pairwise.complete.obs") would fill those in)
## df_m4.bops df_m4.kid df_m4.price df_m4.age_band
## df_m4.bops 1.000000000 NA -0.0101172279 NA
## df_m4.kid NA 1 NA NA
## df_m4.price -0.010117228 NA 1.0000000000 NA
## df_m4.age_band NA NA NA 1
## df_m4.est_income_code NA NA NA NA
## df_m4.white -0.028404731 NA -0.0038756811 NA
## df_m4.male NA NA NA NA
## df_m4.JAN -0.007311232 NA -0.0067796287 NA
## df_m4.FEB 0.009480953 NA -0.0037767351 NA
## df_m4.MAR 0.012546736 NA 0.0323048257 NA
## df_m4.APR 0.001051713 NA -0.0009001587 NA
## df_m4.MAY 0.009398969 NA -0.0070794211 NA
## df_m4.JUN 0.011534453 NA 0.0255734488 NA
## df_m4.JUL 0.015268638 NA 0.0270406534 NA
## df_m4.AUG -0.051095312 NA 0.0120217150 NA
## df_m4.SEP -0.018947926 NA 0.0131514042 NA
## df_m4.OCT -0.013401491 NA -0.0074665929 NA
## df_m4.NOV 0.006988895 NA 0.0036523372 NA
## df_m4.store5998 0.072886923 NA 0.0107170312 NA
## df_m4.store6 -0.060354787 NA -0.0189206033 NA
## df_m4.y_2011 -0.112449810 NA -0.0011609810 NA
## df_m4.y_2013 0.030826311 NA 0.0052551304 NA
## df_m4.est_income_code df_m4.white df_m4.male
## df_m4.bops NA -0.0284047306 NA
## df_m4.kid NA NA NA
## df_m4.price NA -0.0038756811 NA
## df_m4.age_band NA NA NA
## df_m4.est_income_code 1 NA NA
## df_m4.white NA 1.0000000000 NA
## df_m4.male NA NA 1
## df_m4.JAN NA -0.0049391699 NA
## df_m4.FEB NA 0.0073368986 NA
## df_m4.MAR NA 0.0033286286 NA
## df_m4.APR NA -0.0033764480 NA
## df_m4.MAY NA -0.0046344057 NA
## df_m4.JUN NA -0.0037389399 NA
## df_m4.JUL NA -0.0046412131 NA
## df_m4.AUG NA -0.0001984816 NA
## df_m4.SEP NA -0.0036863832 NA
## df_m4.OCT NA -0.0035437577 NA
## df_m4.NOV NA 0.0034090014 NA
## df_m4.store5998 NA -0.1677960962 NA
## df_m4.store6 NA 0.0152509963 NA
## df_m4.y_2011 NA 0.0156282682 NA
## df_m4.y_2013 NA -0.0171879084 NA
## df_m4.JAN df_m4.FEB df_m4.MAR df_m4.APR
## df_m4.bops -0.007311232 0.009480953 0.012546736 0.0010517127
## df_m4.kid NA NA NA NA
## df_m4.price -0.006779629 -0.003776735 0.032304826 -0.0009001587
## df_m4.age_band NA NA NA NA
## df_m4.est_income_code NA NA NA NA
## df_m4.white -0.004939170 0.007336899 0.003328629 -0.0033764480
## df_m4.male NA NA NA NA
## df_m4.JAN 1.000000000 -0.105232142 -0.068525534 -0.0770189105
## df_m4.FEB -0.105232142 1.000000000 -0.081548330 -0.0916558132
## df_m4.MAR -0.068525534 -0.081548330 1.000000000 -0.0596848395
## df_m4.APR -0.077018911 -0.091655813 -0.059684839 1.0000000000
## df_m4.MAY -0.093225239 -0.110942040 -0.072243731 -0.0811979580
## df_m4.JUN -0.062634251 -0.074537450 -0.048537628 -0.0545536093
## df_m4.JUL -0.064605257 -0.076883032 -0.050065034 -0.0562703301
## df_m4.AUG -0.065238866 -0.077637054 -0.050556042 -0.0568221950
## df_m4.SEP -0.062997766 -0.074970049 -0.048819330 -0.0548702267
## df_m4.OCT -0.073090492 -0.086980826 -0.056640561 -0.0636608579
## df_m4.NOV -0.106762970 -0.127052523 -0.082734626 -0.0929891442
## df_m4.store5998 0.012676869 -0.006808460 -0.006366504 0.0051799761
## df_m4.store6 0.009939449 -0.005839145 -0.005919546 -0.0018472821
## df_m4.y_2011 -0.160924177 -0.191506686 -0.124706174 -0.1401628436
## df_m4.y_2013 0.220260047 0.219152450 0.151079037 0.2185266535
## df_m4.MAY df_m4.JUN df_m4.JUL
## df_m4.bops 0.0093989693 0.011534453 0.015268638
## df_m4.kid NA NA NA
## df_m4.price -0.0070794211 0.025573449 0.027040653
## df_m4.age_band NA NA NA
## df_m4.est_income_code NA NA NA
## df_m4.white -0.0046344057 -0.003738940 -0.004641213
## df_m4.male NA NA NA
## df_m4.JAN -0.0932252389 -0.062634251 -0.064605257
## df_m4.FEB -0.1109420404 -0.074537450 -0.076883032
## df_m4.MAR -0.0722437305 -0.048537628 -0.050065034
## df_m4.APR -0.0811979580 -0.054553609 -0.056270330
## df_m4.MAY 1.0000000000 -0.066032786 -0.068110740
## df_m4.JUN -0.0660327863 1.000000000 -0.045760839
## df_m4.JUL -0.0681107397 -0.045760839 1.000000000
## df_m4.AUG -0.0687787281 -0.046209633 -0.047663782
## df_m4.SEP -0.0664160264 -0.044622230 -0.046026425
## df_m4.OCT -0.0770563832 -0.051771053 -0.053400211
## df_m4.NOV -0.1125559310 -0.075621757 -0.078001461
## df_m4.store5998 -0.0008261705 0.006199132 0.003397435
## df_m4.store6 0.0031296349 -0.001676022 -0.003427566
## df_m4.y_2011 -0.1696559258 -0.113984924 -0.117571860
## df_m4.y_2013 0.2005907218 0.118589887 0.126088923
## df_m4.AUG df_m4.SEP df_m4.OCT df_m4.NOV
## df_m4.bops -0.0510953121 -0.018947926 -0.013401491 0.006988895
## df_m4.kid NA NA NA NA
## df_m4.price 0.0120217150 0.013151404 -0.007466593 0.003652337
## df_m4.age_band NA NA NA NA
## df_m4.est_income_code NA NA NA NA
## df_m4.white -0.0001984816 -0.003686383 -0.003543758 0.003409001
## df_m4.male NA NA NA NA
## df_m4.JAN -0.0652388660 -0.062997766 -0.073090492 -0.106762970
## df_m4.FEB -0.0776370541 -0.074970049 -0.086980826 -0.127052523
## df_m4.MAR -0.0505560416 -0.048819330 -0.056640561 -0.082734626
## df_m4.APR -0.0568221950 -0.054870227 -0.063660858 -0.092989144
## df_m4.MAY -0.0687787281 -0.066416026 -0.077056383 -0.112555931
## df_m4.JUN -0.0462096333 -0.044622230 -0.051771053 -0.075621757
## df_m4.JUL -0.0476637816 -0.046026425 -0.053400211 -0.078001461
## df_m4.AUG 1.0000000000 -0.046477824 -0.053923928 -0.078766452
## df_m4.SEP -0.0464778240 1.000000000 -0.052071522 -0.076060650
## df_m4.OCT -0.0539239279 -0.052071522 1.000000000 -0.088246149
## df_m4.NOV -0.0787664520 -0.076060650 -0.088246149 1.000000000
## df_m4.store5998 -0.0114636277 -0.007341724 -0.001463996 0.012666226
## df_m4.store6 0.0137257731 0.009818707 0.002728975 0.005498699
## df_m4.y_2011 0.1261982207 0.144975576 0.130982368 0.140713606
## df_m4.y_2013 -0.1368321975 -0.132131708 -0.153300221 -0.223924981
## df_m4.store5998 df_m4.store6 df_m4.y_2011
## df_m4.bops 0.0728869233 -0.060354787 -0.112449810
## df_m4.kid NA NA NA
## df_m4.price 0.0107170312 -0.018920603 -0.001160981
## df_m4.age_band NA NA NA
## df_m4.est_income_code NA NA NA
## df_m4.white -0.1677960962 0.015250996 0.015628268
## df_m4.male NA NA NA
## df_m4.JAN 0.0126768690 0.009939449 -0.160924177
## df_m4.FEB -0.0068084604 -0.005839145 -0.191506686
## df_m4.MAR -0.0063665036 -0.005919546 -0.124706174
## df_m4.APR 0.0051799761 -0.001847282 -0.140162844
## df_m4.MAY -0.0008261705 0.003129635 -0.169655926
## df_m4.JUN 0.0061991320 -0.001676022 -0.113984924
## df_m4.JUL 0.0033974348 -0.003427566 -0.117571860
## df_m4.AUG -0.0114636277 0.013725773 0.126198221
## df_m4.SEP -0.0073417242 0.009818707 0.144975576
## df_m4.OCT -0.0014639957 0.002728975 0.130982368
## df_m4.NOV 0.0126662257 0.005498699 0.140713606
## df_m4.store5998 1.0000000000 -0.047190310 -0.097499559
## df_m4.store6 -0.0471903101 1.000000000 0.017788860
## df_m4.y_2011 -0.0974995592 0.017788860 1.000000000
## df_m4.y_2013 0.0878465092 -0.025130106 -0.337522862
## df_m4.y_2013
## df_m4.bops 0.03082631
## df_m4.kid NA
## df_m4.price 0.00525513
## df_m4.age_band NA
## df_m4.est_income_code NA
## df_m4.white -0.01718791
## df_m4.male NA
## df_m4.JAN 0.22026005
## df_m4.FEB 0.21915245
## df_m4.MAR 0.15107904
## df_m4.APR 0.21852665
## df_m4.MAY 0.20059072
## df_m4.JUN 0.11858989
## df_m4.JUL 0.12608892
## df_m4.AUG -0.13683220
## df_m4.SEP -0.13213171
## df_m4.OCT -0.15330022
## df_m4.NOV -0.22392498
## df_m4.store5998 0.08784651
## df_m4.store6 -0.02513011
## df_m4.y_2011 -0.33752286
## df_m4.y_2013 1.00000000
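Rather than scanning the matrix by eye, a small helper (illustrative, not in the original analysis) can flag any pair whose absolute correlation exceeds the 0.8 threshold used here:
# Flag pairs with |correlation| > 0.8 (illustrative helper)
cm <- cor(df, use = "pairwise.complete.obs")            # fill in the NA cells
high <- which(abs(cm) > 0.8 & upper.tri(cm), arr.ind = TRUE)
data.frame(var1 = rownames(cm)[high[, 1]],
           var2 = colnames(cm)[high[, 2]],
           corr = cm[high])
# expected to return zero rows, consistent with the note above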
vif(df) # VIF scores are all in the range (1, 1.66), well below 3, so multicollinearity is not a concern
## Variables VIF
## 1 df_m4.bops 1.032586
## 2 df_m4.kid 1.022891
## 3 df_m4.price 1.025776
## 4 df_m4.age_band 1.047151
## 5 df_m4.est_income_code 1.049683
## 6 df_m4.white 1.006558
## 7 df_m4.male 1.041855
## 8 df_m4.JAN 1.524594
## 9 df_m4.FEB 1.664023
## 10 df_m4.MAR 1.373884
## 11 df_m4.APR 1.476160
## 12 df_m4.MAY 1.613740
## 13 df_m4.JUN 1.264681
## 14 df_m4.JUL 1.251023
## 15 df_m4.AUG 1.141380
## 16 df_m4.SEP 1.112447
## 17 df_m4.OCT 1.156644
## 18 df_m4.NOV 1.269882
## 19 df_m4.store5998 1.005065
## 20 df_m4.store6 1.009599
## 21 df_m4.y_2011 1.441881
## 22 df_m4.y_2013 1.716031
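For intuition, each VIF equals 1/(1 - R^2) from regressing that variable on all the others. A minimal base-R sketch of that computation (illustrative; values may differ slightly from the usdm output because of how missing rows are handled):
# Manual VIF: regress each column on the others and compute 1/(1 - R^2)
df_cc <- na.omit(df)
manual_vif <- sapply(names(df_cc), function(v) {
  r2 <- summary(lm(reformulate(setdiff(names(df_cc), v), response = v),
                   data = df_cc))$r.squared
  1 / (1 - r2)
})
round(manual_vif, 3)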
#install.packages("sampleSelection")
#library(sampleSelection)
CB <- function(x) {
ifelse(x > -500,
-exp(dnorm(x, log = TRUE)
- pnorm(x, log.p = TRUE))*x
-exp(2*(dnorm(x, log = TRUE) - pnorm(x, log.p = TRUE))),
-1)
}
lambda <- function(x) {
as.vector(ifelse(x > -30, dnorm(x)/pnorm(x), -x))
# can we prove it?
}
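# lambda() above is the inverse Mills ratio dnorm(x)/pnorm(x); the "can we prove it?"
# question refers to the switch to -x for very negative x, where the ratio is
# numerically fragile but asymptotically equal to -x. A quick illustrative check:
x_check <- c(-5, -10, -20, -30)
cbind(x = x_check, ratio = dnorm(x_check)/pnorm(x_check), approx = -x_check)
# the approximation error shrinks as x becomes more negative (relative error ~ 1/x^2)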
tobitTfit <- function(YS, XS, YO, XO, start,
weights=NULL, print.level=0,
maxMethod="Newton-Raphson",
index=NULL,
binaryOutcome=FALSE,
...) {
### Tobit treatment models:
### The latent variable is:
### YS* = XS'g + u
### The observables are:
### / 1 if YS* > 0
### YS = \ 0 if YS* <= 0
### YO = X'b + YS bT + v
### u, v are correlated
###
### Arguments:
###
### YS binary or logical vector, 0 (FALSE) and 1 (TRUE)
### XS -"- selection, should include
### exclusion restriction
### YO numeric vector, outcomes
### XO explanatory variables for outcomes, should include YS
### index individual parameter indices in the parameter vector.
### Should always be supplied but can generate here for
### testing purposes
### ... additional parameters for maxLik
###
loglik <- function( beta) {
betaS <- beta[iBetaS]
betaO <- beta[iBetaO]
sigma <- beta[iSigma]
if(sigma <= 0) return(NA)
rho <- beta[iRho]
if( ( rho < -1) || ( rho > 1)) return(NA)
# check the range
XS0.betaS <- XS0%*%betaS
# denoted by 'z' in the vignette
XS1.betaS <- XS1%*%betaS
v0 <- YO0 - XO0%*%betaO
v1 <- YO1 - XO1%*%betaO
sqrt1r2 <- sqrt( 1 - rho^2)
B0 <- (-XS0.betaS - rho/sigma*v0)/sqrt1r2
B1 <- (XS1.betaS + rho/sigma*v1)/sqrt1r2
loglik <- numeric(nObs)
loglik[i0] <- -1/2*log( 2*pi) - log( sigma) -
0.5*( v0/sigma)^2 + pnorm( B0, log.p=TRUE)
loglik[i1] <- -1/2*log( 2*pi) -log( sigma) -
0.5*( v1/sigma)^2 + pnorm( B1, log.p=TRUE)
#sum(loglik)
loglik
}
gradlik <- function(beta) {
## gradient is nObs x nParam matrix
betaS <- beta[iBetaS]
betaO <- beta[iBetaO]
sigma <- beta[iSigma]
if(sigma <= 0) return(NA)
rho <- beta[iRho]
if( ( rho < -1) || ( rho > 1)) return(NA)
# check the range
XS0.betaS <- XS0%*%betaS
# denoted by 'z' in the vignette
XS1.betaS <- XS1%*%betaS
v0 <- drop(YO0 - XO0%*%betaO)
v1 <- drop(YO1 - XO1%*%betaO)
sqrt1r2 <- sqrt( 1 - rho^2)
B0 <- (-XS0.betaS - rho/sigma*v0)/sqrt1r2
B1 <- (XS1.betaS + rho/sigma*v1)/sqrt1r2
lambda0 <- drop(lambda(B0))
lambda1 <- drop(lambda(B1))
## now the gradient itself
gradient <- matrix(0, nObs, nParam)
gradient[i0, iBetaS] <- -lambda0*XS0/sqrt1r2
gradient[i1, iBetaS] <- lambda1*XS1/sqrt1r2
gradient[i0,iBetaO] <- (lambda0*rho/sigma/sqrt1r2
+ v0/sigma^2)*XO0
gradient[i1,iBetaO] <- (-lambda1*rho/sigma/sqrt1r2
+ v1/sigma^2)*XO1
gradient[i0,iSigma] <- (-1/sigma + v0^2/sigma^3
+ lambda0*rho/sigma^2*v0/sqrt1r2)
gradient[i1,iSigma] <- (-1/sigma + v1^2/sigma^3
- lambda1*rho/sigma^2*v1/sqrt1r2)
gradient[i0,iRho] <- -lambda0*(v0/sigma + rho*XS0.betaS)/
sqrt1r2^3
gradient[i1,iRho] <- lambda1*(v1/sigma + rho*XS1.betaS)/
sqrt1r2^3
# colSums(gradient)
gradient
}
hesslik <- function(beta) {
# This is a hack in order to avoid numeric problems
## gradient is nObs x nParam matrix
betaS <- beta[iBetaS]
betaO <- beta[iBetaO]
sigma <- beta[iSigma]
if(sigma <= 0) return(NA)
rho <- beta[iRho]
if( ( rho < -1) || ( rho > 1)) return(NA)
# check the range
XS0.betaS <- XS0%*%betaS
# denoted by 'z' in the vignette
XS1.betaS <- XS1%*%betaS
v0 <- drop(YO0 - XO0%*%betaO)
v1 <- drop(YO1 - XO1%*%betaO)
sqrt1r2 <- sqrt( 1 - rho^2)
B0 <- (-XS0.betaS - rho/sigma*v0)/sqrt1r2
B1 <- (XS1.betaS + rho/sigma*v1)/sqrt1r2
lambda0 <- drop(lambda(B0))
lambda1 <- drop(lambda(B1))
CB0 <- drop(CB(B0))
CB1 <- drop(CB(B1))
hess <- array(0, c( nParam, nParam))
hess[,] <- NA
hess[iBetaS,iBetaS] <-
t( XS0) %*% ( XS0 * CB0)/sqrt1r2^2 +
t( XS1) %*% ( XS1 * CB1)/sqrt1r2^2
hess[iBetaS,iBetaO] <-
- t( XS0) %*% ( XO0 * CB0)*rho/sqrt1r2^2/sigma -
t( XS1) %*% ( XO1 * CB1)*rho/sqrt1r2^2/sigma
hess[iBetaO,iBetaS] <- t(hess[iBetaS,iBetaO])
hess[iBetaS,iSigma] <-
-rho/sigma^2/sqrt1r2^2*t( XS0) %*% ( CB0*v0) -
rho/sigma^2/sqrt1r2^2*t( XS1) %*% ( CB1*v1)
hess[iSigma,iBetaS] <- t(hess[iBetaS,iSigma])
hess[iBetaS,iRho] <-
(t(XS0) %*% (CB0*(v0/sigma + rho*XS0.betaS)/sqrt1r2^4
- lambda0*rho/sqrt1r2^3)
+t(XS1) %*% (CB1*(v1/sigma + rho*XS1.betaS)/sqrt1r2^4
+ lambda1*rho/sqrt1r2^3)
)
hess[iRho,iBetaS] <- t(hess[iBetaS,iRho])
##
hess[iBetaO,iBetaO] <-
t( XO0) %*% (XO0*((rho/sqrt1r2)^2*CB0 - 1))/sigma^2 +
t( XO1) %*% (XO1*( (rho/sqrt1r2)^2 * CB1 - 1))/sigma^2
hess[iBetaO,iSigma] <-
(t( XO0) %*% (CB0*rho^2/sigma^3*v0/sqrt1r2^2
- rho/sigma^2*lambda0/sqrt1r2
- 2*v0/sigma^3)
+ t( XO1) %*% (CB1*rho^2/sigma^3*v1/sqrt1r2^2
+ rho/sigma^2*lambda1/sqrt1r2
- 2*v1/sigma^3)
)
hess[iSigma,iBetaO] <- t(hess[iBetaO,iSigma])
hess[iBetaO,iRho] <-
(t(XO0) %*% (-CB0*(v0/sigma + rho*XS0.betaS)/sqrt1r2^4*rho
+ lambda0/sqrt1r2^3)/sigma
+ t(XO1) %*% (-CB1*(v1/sigma + rho*XS1.betaS)/sqrt1r2^4*rho
- lambda1/sqrt1r2^3)/sigma
)
hess[iRho,iBetaO] <- t(hess[iBetaO,iRho])
##
hess[iSigma,iSigma] <-
(sum(1/sigma^2
-3*v0*v0/sigma^4
+ v0*v0/sigma^4*rho^2/sqrt1r2^2*CB0
-2*lambda0*v0/sqrt1r2*rho/sigma^3)
+ sum(1/sigma^2
-3*v1*v1/sigma^4
+rho^2/sigma^4*v1*v1/sqrt1r2^2*CB1
+2*lambda1*v1/sqrt1r2*rho/sigma^3)
)
hess[iSigma,iRho] <-
(sum((-CB0*rho*(v0/sigma + rho*XS0.betaS)/sqrt1r2 + lambda0)
*v0/sigma^2)/sqrt1r2^3
- sum(
(CB1*rho*(v1/sigma + rho*XS1.betaS)/sqrt1r2 + lambda1)
*v1/sigma^2)/sqrt1r2^3
)
hess[iRho,iSigma] <- t(hess[iSigma,iRho])
hess[iRho,iRho] <-
(sum(CB0*( (v0/sigma + rho*XS0.betaS)/sqrt1r2^3)^2
-lambda0*(XS0.betaS*(1 + 2*rho^2) + 3*rho*v0/sigma)/
sqrt1r2^5
)
+ sum(CB1*( (v1/sigma + rho*XS1.betaS)/sqrt1r2^3)^2
+lambda1*( XS1.betaS*( 1 + 2*rho^2) + 3*rho*v1/sigma) /
sqrt1r2^5
)
)
## l.s2x3 is zero
hess
}
## ---------------
NXS <- ncol( XS)
if(is.null(colnames(XS)))
colnames(XS) <- rep("XS", NXS)
NXO <- ncol( XO)
if(is.null(colnames(XO)))
colnames(XO) <- rep("XO", NXO)
nObs <- length( YS)
i0 <- YS==0
i1 <- YS==1
NO1 <- length( YS[i0])
NO2 <- length( YS[i1])
if(!is.null(weights)) {
warning("Argument 'weight' is ignored by tobitTfit")
}
## indices in for the parameter vector
if(is.null(index)) {
iBetaS <- 1:NXS
iBetaO <- max(iBetaS) + seq(length=NXO)
if(!binaryOutcome) {
iSigma <- max(iBetaO) + 1
iRho <- max(iSigma) + 1
}
else
iRho <- max(iBetaO) + 1
nParam <- iRho
}
else {
iBetaS <- index$betaS
iBetaO <- index$betaO
iSigma <- index$errTerms["sigma"]
iRho <- index$errTerms["rho"]
nParam <- index$nParam
}
## split the data by selection
XS0 <- XS[i0,,drop=FALSE]
XS1 <- XS[i1,,drop=FALSE]
YO0 <- YO[i0]
YO1 <- YO[i1]
XO0 <- XO[i0,,drop=FALSE]
XO1 <- XO[i1,,drop=FALSE]
##
if(print.level > 0) {
cat( "Non-participants: ", NO1,
"; participants: ", NO2, "\n", sep="")
cat( "Initial values:\n")
cat("selection equation betaS:\n")
print(start[iBetaS])
cat("Outcome equation betaO\n")
print(start[iBetaO])
cat("Variance sigma\n")
print(start[iSigma])
cat("Correlation rho\n")
print(start[iRho])
}
result <- maxLik(loglik,
grad=gradlik,
hess=hesslik,
start=start,
print.level=print.level,
method=maxMethod,
...)
## compareDerivatives(#loglik,
## gradlik,
## hesslik,
## t0=start)
result$tobitType <- "treatment"
result$method <- "ml"
class( result ) <- c( "selection", class( result ) )
return( result )
}
treatReg <- function(selection, outcome,
data=sys.frame(sys.parent()),
weights = NULL,
subset,
method="ml",
start=NULL,
ys=FALSE, xs=FALSE,
yo=FALSE, xo=FALSE,
mfs=FALSE, mfo=FALSE,
print.level=0,
...) {
## Heckman-style treatment effect models
## selection: formula
## LHS: must be convertable to two-level factor (e.g. 0-1, 1-2, "A"-"B")
## RHS: ordinary formula as in lm()
## outcome: formula
## should include selection outcome
## ys, xs, yo, xo, mfs, mfo: whether to return the response, model matrix or
## the model frame of outcome and selection equation(s)
## First the consistency checks
## ... additional arguments for tobit2fit and tobit5fit
type <- 0
if(!inherits( selection, "formula" )) {
stop( "argument 'selection' must be a formula" )
}
if( length( selection ) != 3 ) {
stop( "argument 'selection' must be a 2-sided formula" )
}
if(inherits(outcome, "formula")) {
if( length( outcome ) != 3 ) {
stop( "argument 'outcome' must be a 2-sided formula" )
}
}
else
stop("argument 'outcome' must be a formula" )
if(!missing(data)) {
if(!inherits(data, "environment") & !inherits(data, "data.frame") & !inherits(data, "list")) {
stop("'data' must be either environment, data.frame, or list (currently a ", class(data), ")")
}
}
##
if(print.level > 0)
cat("Treatment effect model", type, "model\n")
probitEndogenous <- model.frame( selection, data = data)[ , 1 ]
probitLevels <- levels( as.factor( probitEndogenous ) )
if( length( probitLevels ) != 2 ) {
stop( "the left hand side of 'selection' has to contain",
" exactly two levels (e.g. FALSE and TRUE)" )
}
if( !is.null( weights )) {
warning( "argument 'weights' is ignored" )
weights <- NULL
}
## now check whether two-step method was requested
cl <- match.call()
if(method == "2step") {
twoStep <- heckitTfit(selection, outcome, data=data,
# weights = weights,
print.level = print.level, ... )
twoStep$call <- cl
class(twoStep) <- c("selection", class(twoStep))
return(twoStep)
}
## Now extract model frames etc
## YS (selection equation)
mf <- match.call(expand.dots = FALSE)
m <- match(c("selection", "data", "subset"), names(mf), 0)
mfS <- mf[c(1, m)]
mfS$drop.unused.levels <- TRUE
mfS$na.action <- na.pass
mfS[[1]] <- as.name("model.frame")
names(mfS)[2] <- "formula"
# model.frame requires the parameter to
# be 'formula'
mfS <- eval(mfS, parent.frame())
mtS <- attr(mfS, "terms")
XS <- model.matrix(mtS, mfS)
YS <- model.response(mfS)
YSLevels <- levels( as.factor( YS ) )
if( length( YSLevels ) != 2 ) {
stop( "the left hand side of the 'selection' formula has to contain",
" exactly two levels (e.g. FALSE and TRUE)" )
}
YS <- as.integer(YS == YSLevels[ 2 ])
# selection will be kept as integer internally
## check for NA-s. Because we have to find NA-s in several frames, we cannot use the standard na.
## functions here. Find bad rows and remove them later.
## We check XS and YS separately, because mfS may be a data frame with complex structure (e.g.
## including matrices)
badRow <- !complete.cases(YS, XS)
badRow <- badRow | is.infinite(YS)
badRow <- badRow | apply(XS, 1, function(v) any(is.infinite(v)))
## YO (outcome equation)
## Here we should include a possibility for the user to
## specify the model. Currently just a guess.
binaryOutcome <- FALSE
oArg <- match("outcome", names(mf), 0)
# find the outcome argument
m <- match(c("outcome", "data", "subset",
"offset"), names(mf), 0)
## replace the outcome list by the first equation and evaluate it
mfO <- mf[c(1, m)]
mfO$drop.unused.levels <- TRUE
mfO$na.action <- na.pass
mfO[[1]] <- as.name("model.frame")
# eval it as model frame
names(mfO)[2] <- "formula"
mfO <- eval(mfO, parent.frame())
# Note: if unobserved variables are
# marked as NA, eval returns a
# subframe of visible variables only.
# We have to check it later
mtO <- attr(mfO, "terms")
XO <- model.matrix(mtO, mfO)
YO <- model.response(mfO)
if(is.logical(YO) |
(is.factor(YO) & length(levels(YO)) == 2)) {
binaryOutcome <- TRUE
}
## Now figure out if selection outcome is in fact used as
## explanatory variable for the outcome
selectionVariable <- as.character(selection[[2]])
# name of the selection outcome
##
badRow <- badRow | !complete.cases(YO, XO)
badRow <- badRow | is.infinite(YO)
badRow <- badRow | apply(XO, 1, function(v) any(is.infinite(v)))
# outcome cases that contain NA, Inf, NaN
if( !is.null( weights ) ) {
if( length( weights ) != length( badRow ) ) {
stop( "number of weights (", length( weights ), ") is not equal",
" to the number of observations (", length( badRow ), ")" )
}
badRow <- badRow | is.na( weights )
badRow <- badRow | is.infinite( weights )
}
if(print.level > 0) {
cat(sum(badRow), "invalid observations\n")
}
if( method == "model.frame" ) {
mf <- mfS
mf <- cbind( mf, mfO[ , ! names( mfO ) %in% names( mf ), drop = FALSE ] )
return( mf[ !badRow, ] )
}
XS <- XS[!badRow,, drop=FALSE]
YS <- YS[!badRow]
XO <- XO[!badRow,, drop=FALSE]
YO <- YO[!badRow]
weightsNoNA <- weights[ !badRow ]
NXS <- ncol(XS)
NXO <- ncol(XO)
## parameter indices in the parameter vector
iBetaS <- seq(length=ncol(XS))
iBetaO <- max(iBetaS) + seq(length=NXO)
if(!binaryOutcome) {
iSigma <- max(iBetaO) + 1
iRho <- max(iSigma) + 1
}
else
iRho <- max(iBetaO) + 1
nParam <- iRho
if(binaryOutcome) {
iErrTerms <- c(rho=iRho)
}
else {
iErrTerms <- c(sigma=iSigma, rho=iRho )
}
index <- list(betaS=iBetaS,
betaO=iBetaO,
errTerms=iErrTerms,
outcome = iBetaO,
nParam=iRho)
##
twoStep <- NULL
if(is.null(start)) {
# start values by Heckman 2-step method
start <- numeric(nParam)
twoStep <- heckitTfit(selection, outcome, data=data,
print.level = print.level,
# weights = weights
)
coefs <- coef(twoStep, part="full")
start[iBetaS] <- coefs[twoStep$param$index$betaS]
if(!binaryOutcome) {
start[iBetaO] <- coefs[twoStep$param$index$betaO]
start[iSigma] <- coefs[twoStep$param$index$sigma]
}
else
start[iBetaO] <- coefs[twoStep$param$index$betaO]/coefs[twoStep$param$index$sigma]
start[iRho] <- coefs[twoStep$param$index$rho]
if(start[iRho] > 0.99)
start[iRho] <- 0.99
else if(start[iRho] < -0.99)
start[iRho] <- -0.99
}
if(is.null(names(start))) {
if(!binaryOutcome) {
names(start) <- c(colnames(XS), colnames(XO), "sigma",
"rho")
}
else
names(start) <- c(colnames(XS), colnames(XO),
"rho")
} # add names to start values if not present
if(!binaryOutcome) {
estimation <- tobitTfit(YS, XS, YO, XO, start,
# weights = weightsNoNA,
print.level=print.level,
index=index,
binaryOutcome=binaryOutcome,
...)
}
else {
## estimation <- tobitTBfit(YS, XS, YO, XO, start, weights = weightsNoNA,
## print.level=print.level, ...)
## iErrTerms <- c(rho=iRho)
stop("Binary outcome models are not implemented")
}
param <- list(index=index,
NXS=ncol(XS), NXO=ncol(XO),
N0=sum(YS==0), N1=sum(YS==1),
nObs=length(YS), nParam=length(start),
df=length(YS) - length(start),
levels=YSLevels,
# levels[1]: selection 1; levels[2]:
# selection 2
selectionVariableName=selectionVariable
# which explanatory variable is selection outcome
)
result <- c(estimation,
twoStep=list(twoStep),
start=list(start),
param=list(param),
call=cl,
termsS=mtS,
termsO=mtO,
ys=switch(as.character(ys), "TRUE"=list(YS), "FALSE"=NULL),
xs=switch(as.character(xs), "TRUE"=list(XS), "FALSE"=NULL),
yo=switch(as.character(yo), "TRUE"=list(YO), "FALSE"=NULL),
xo=switch(as.character(xo), "TRUE"=list(XO), "FALSE"=NULL),
mfs=switch(as.character(mfs), "TRUE"=list(mfS[!badRow,]), "FALSE"=NULL),
mfo=switch(as.character(mfs),
"TRUE"=list(mfO[!badRow,]), "FALSE"=NULL)
)
result$binaryOutcome <- binaryOutcome
class( result ) <- class( estimation )
return(result)
}
In this return model, BOPS is a self-selected decision by the customer, so there is a potential selection bias: we suspect that BOPS is endogenous. We therefore use this treatment-effect model to address the problem. However, the model yields an insignificant rho of -0.00137, so we conclude that endogeneity is negligible in this case.
#model1<-treatReg(bops~kid+price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number)+factor(year), return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number)+factor(fiscalyr), data=df_m4, method="ML")
#summary(model1)
#df_m4$predict_return=predict(model1, newdata=df_m4,type = "response")
#ttest=t.test(df_m4$predict_return~df_m4$bops)
#ATE=ttest$estimate[2]-ttest$estimate[1]
#names(ATE)<-"ATE"
#ATE ## result from Stata: ATE = 0.011; my ATE = 0.0203
probit_treatment<- glm(bops~kid+price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number)+factor(year),data=df_m4,family=binomial(link="probit"))
df_m4$predict_bops=predict(probit_treatment, newdata=df_m4,type = "response")
df_m4$predicted_bops=ifelse(df_m4$predict_bops >= 0.5,1,0)
mean(df_m4$predicted_bops != df_m4$bops,na.rm=TRUE)
## [1] 0.2301414
misClasificError <- mean(df_m4$predicted_bops != df_m4$bops,na.rm=TRUE) # misclassification rate (share of wrong predictions)
print(paste('Accuracy',1-misClasificError)) # accuracy is about 77%
## [1] "Accuracy 0.769858571226574"
Interpretation of the variable of interest, BOPS: assume we sell 1,000 units. Since the return rate when BOPS = 0 is 9.6%, customers would return 96 units. If these customers used BOPS, the return rate would rise to 0.096 + 0.02 = 0.116, so we would see 116 returned units under the BOPS strategy. This corresponds to a 20.8% increase ((116 - 96)/96) in the total number of returns.
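The back-of-the-envelope figures above can be reproduced directly; a small sketch using the 9.6% baseline and the 0.02 BOPS effect quoted in the text:
# Reproduce the interpretation arithmetic
units     <- 1000
base_rate <- 0.096                                  # return rate when bops = 0
ate       <- 0.02                                   # estimated BOPS effect on return probability
returns_no_bops   <- units * base_rate              # 96 units
returns_with_bops <- units * (base_rate + ate)      # 116 units
(returns_with_bops - returns_no_bops) / returns_no_bops  # ~0.208, a 20.8% increase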
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
probit1<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number) + factor(fiscalyr), data=df_m4,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#install.packages("mfx")
dwtest(probit1)
##
## Durbin-Watson test
##
## data: probit1
## DW = 1.7909, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
library(mfx)
## Loading required package: sandwich
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:raster':
##
## area, select
## Loading required package: betareg
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number) + factor(fiscalyr), data=df_m4,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(store_number) +
## factor(fiscalyr), data = df_m4, robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops 1.3372e-02 7.4501e-04 17.9489 < 2.2e-16 ***
## price 6.3374e-05 1.7961e-06 35.2854 < 2.2e-16 ***
## age_band -1.0423e-03 8.0301e-05 -12.9804 < 2.2e-16 ***
## est_income_code 1.2424e-03 1.3689e-04 9.0760 < 2.2e-16 ***
## white -3.3155e-03 6.0078e-04 -5.5187 3.415e-08 ***
## male -2.2401e-02 6.2346e-04 -35.9308 < 2.2e-16 ***
## factor(month_index2)1 2.0660e-03 2.0113e-03 1.0272 0.3043217
## factor(month_index2)2 6.9244e-04 2.0254e-03 0.3419 0.7324485
## factor(month_index2)3 -6.4963e-03 1.8288e-03 -3.5522 0.0003821 ***
## factor(month_index2)4 -5.1723e-03 1.6431e-03 -3.1479 0.0016447 **
## factor(month_index2)5 -1.7662e-02 1.4772e-03 -11.9565 < 2.2e-16 ***
## factor(month_index2)6 1.0119e-02 1.8770e-03 5.3912 7.000e-08 ***
## factor(month_index2)7 -9.0031e-03 1.6195e-03 -5.5591 2.712e-08 ***
## factor(month_index2)8 1.1212e-02 2.0685e-03 5.4202 5.952e-08 ***
## factor(month_index2)9 -1.1690e-02 1.7530e-03 -6.6688 2.579e-11 ***
## factor(month_index2)10 -1.7107e-02 1.5908e-03 -10.7536 < 2.2e-16 ***
## factor(month_index2)11 2.9297e-03 2.0707e-03 1.4149 0.1571090
## factor(store_number)6 -9.9732e-03 1.1473e-03 -8.6930 < 2.2e-16 ***
## factor(store_number)5998 -6.0697e-02 1.3326e-02 -4.5546 5.248e-06 ***
## factor(fiscalyr)13 -1.2039e-02 6.1194e-04 -19.6733 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"
## [15] "factor(store_number)6" "factor(store_number)5998"
## [17] "factor(fiscalyr)13"
aggregate(df_m4$return~df_m4$bops, FUN=mean)
## df_m4$bops df_m4$return
## 1 0 0.09679147
## 2 1 0.10945370
#install.packages("lmtest")
library(lmtest)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:usdm':
##
## vif
## The following object is masked from 'package:VIF':
##
## vif
probit1<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number)+factor(fiscalyr), data=df_m4, family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
gqtest(probit1) # Goldfeld-Quandt test
##
## Goldfeld-Quandt test
##
## data: probit1
## GQ = 0.88268, df1 = 491500, df2 = 491500, p-value = 1
## alternative hypothesis: variance increases from segment 1 to 2
bptest(probit1)# Breusch-Pagan test
##
## studentized Breusch-Pagan test
##
## data: probit1
## BP = 18257, df = 20, p-value < 2.2e-16
#The Breusch-Pagan test indicates heteroskedasticity, but using conventional standard errors is acceptable here; obtaining robust standard errors for treatReg is beyond the scope of this course.
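For the plain probit, though, a heteroskedasticity-consistent covariance is straightforward; a sketch assuming the sandwich package (loaded above as a dependency of mfx):
# Heteroskedasticity-robust (sandwich) standard errors for probit1 (illustrative)
library(sandwich)
coeftest(probit1, vcov. = vcovHC(probit1, type = "HC0"))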
The Durbin-Watson test shows positive serial correlation, though within an acceptable range, and the Lagrange Multiplier test also indicates serial correlation. To address this, we obtain Newey-West standard errors.
#install.packages("DataCombine")
library(DataCombine)
##
## Attaching package: 'DataCombine'
## The following object is masked from 'package:raster':
##
## shift
dwtest(probit1) # Durbin-Watson test: DW = 1.79 < 2 and significant, indicating positive serial correlation, but within an acceptable range
##
## Durbin-Watson test
##
## data: probit1
## DW = 1.7909, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
#library(FinTS)
#arch.test(df_m4$return) #There is serial correlation
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:usdm':
##
## Variogram
## The following object is masked from 'package:raster':
##
## getData
coeftest(probit1,vcov.=NeweyWest)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2185e+00 1.2647e-02 -96.3459 < 2.2e-16 ***
## bops 7.5381e-02 4.8620e-03 15.5042 < 2.2e-16 ***
## price 3.6676e-04 1.0956e-05 33.4764 < 2.2e-16 ***
## age_band -6.0322e-03 5.4636e-04 -11.0407 < 2.2e-16 ***
## est_income_code 7.1900e-03 9.4617e-04 7.5991 2.982e-14 ***
## white -1.9191e-02 4.1188e-03 -4.6593 3.172e-06 ***
## male -1.2930e-01 4.3471e-03 -29.7451 < 2.2e-16 ***
## factor(month_index2)1 1.1873e-02 1.5589e-02 0.7617 0.446252
## factor(month_index2)2 3.9978e-03 1.4947e-02 0.2675 0.789112
## factor(month_index2)3 -3.8435e-02 1.4800e-02 -2.5970 0.009404 **
## factor(month_index2)4 -3.0390e-02 1.2315e-02 -2.4678 0.013595 *
## factor(month_index2)5 -1.0553e-01 1.2173e-02 -8.6693 < 2.2e-16 ***
## factor(month_index2)6 5.6795e-02 1.4090e-02 4.0310 5.555e-05 ***
## factor(month_index2)7 -5.3534e-02 1.2964e-02 -4.1293 3.639e-05 ***
## factor(month_index2)8 6.2580e-02 1.3812e-02 4.5309 5.872e-06 ***
## factor(month_index2)9 -7.0438e-02 1.4495e-02 -4.8595 1.177e-06 ***
## factor(month_index2)10 -1.0474e-01 1.3433e-02 -7.7973 6.324e-15 ***
## factor(month_index2)11 1.6787e-02 1.4519e-02 1.1562 0.247596
## factor(store_number)6 -5.9687e-02 9.9337e-03 -6.0085 1.872e-09 ***
## factor(store_number)5998 -4.9038e-01 9.5280e-02 -5.1467 2.651e-07 ***
## factor(fiscalyr)13 -6.9387e-02 5.2599e-03 -13.1916 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df_2012<-subset(df_m4,df_m4$fiscalyr==12)
df_2012_26<-subset(df_2012,df_2012$store2==1|df_2012$store6==1)
df_2013<-subset(df_m4,df_m4$fiscalyr==13)
df_2013_26<-subset(df_2013,df_2013$store2==1|df_2013$store6==1)
df_store5998<-subset(df_m4,df_m4$store5998==1)
df_store26<-subset(df_m4,df_m4$store2==1|df_m4$store6==1)
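Before running the subgroup probits, the raw return rate by BOPS status within each subset can be compared (illustrative, mirroring the aggregate() check used earlier):
# Raw return rates by BOPS status within each subgroup (illustrative)
subsets <- list(fy2012 = df_2012, fy2013 = df_2013,
                store5998 = df_store5998, stores26 = df_store26)
sapply(subsets, function(d) tapply(d$return, d$bops, mean))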
##Year 2012
probit_2012<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number) + factor(year), data=df_2012,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number) + factor(year), data=df_2012,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(store_number) +
## factor(year), data = df_2012, robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops 1.3872e-02 1.2283e-03 11.2935 < 2.2e-16 ***
## price 6.4166e-05 2.7926e-06 22.9770 < 2.2e-16 ***
## age_band -7.5040e-04 1.2227e-04 -6.1372 8.398e-10 ***
## est_income_code 9.7563e-04 2.0897e-04 4.6688 3.029e-06 ***
## white -3.3599e-03 9.1607e-04 -3.6677 0.0002448 ***
## male -2.3587e-02 9.5222e-04 -24.7700 < 2.2e-16 ***
## factor(month_index2)1 1.1952e-02 3.1151e-03 3.8368 0.0001246 ***
## factor(month_index2)2 4.0934e-05 2.9157e-03 0.0140 0.9887985
## factor(month_index2)3 1.0864e-03 2.8289e-03 0.3840 0.7009554
## factor(month_index2)4 -1.3166e-04 2.5201e-03 -0.0522 0.9583333
## factor(month_index2)5 -1.0571e-02 2.2174e-03 -4.7672 1.868e-06 ***
## factor(month_index2)6 2.7503e-02 3.0273e-03 9.0851 < 2.2e-16 ***
## factor(month_index2)7 -6.7781e-03 2.4198e-03 -2.8010 0.0050937 **
## factor(month_index2)8 1.1404e-02 3.0821e-03 3.7000 0.0002156 ***
## factor(month_index2)9 3.7700e-03 2.9781e-03 1.2659 0.2055462
## factor(month_index2)10 -1.3933e-02 2.4227e-03 -5.7509 8.874e-09 ***
## factor(month_index2)11 3.7893e-03 3.0348e-03 1.2486 0.2118047
## factor(store_number)6 -1.0142e-02 1.6732e-03 -6.0615 1.349e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"
## [15] "factor(store_number)6"
##Year 2013
probit_2013<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number) + factor(year), data=df_2013,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(store_number) + factor(year), data=df_2013,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(store_number) +
## factor(year), data = df_2013, robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops 1.3049e-02 9.3290e-04 13.9880 < 2.2e-16 ***
## price 6.2362e-05 2.2564e-06 27.6383 < 2.2e-16 ***
## age_band -1.2702e-03 1.0619e-04 -11.9613 < 2.2e-16 ***
## est_income_code 1.4520e-03 1.8073e-04 8.0343 9.414e-16 ***
## white -3.2947e-03 7.9375e-04 -4.1508 3.313e-05 ***
## male -2.1391e-02 8.2165e-04 -26.0347 < 2.2e-16 ***
## factor(month_index2)1 -7.3359e-03 2.5818e-03 -2.8414 0.0044914 **
## factor(month_index2)2 1.4487e-03 2.8337e-03 0.5112 0.6091852
## factor(month_index2)3 -1.3771e-02 2.3531e-03 -5.8522 4.850e-09 ***
## factor(month_index2)4 -1.0616e-02 2.1478e-03 -4.9426 7.709e-07 ***
## factor(month_index2)5 -2.4601e-02 1.9693e-03 -12.4922 < 2.2e-16 ***
## factor(month_index2)6 -3.7112e-03 2.3394e-03 -1.5864 0.1126425
## factor(month_index2)7 -1.2007e-02 2.1624e-03 -5.5527 2.813e-08 ***
## factor(month_index2)8 9.2195e-03 2.7650e-03 3.3343 0.0008551 ***
## factor(month_index2)9 -2.2490e-02 2.1240e-03 -10.5886 < 2.2e-16 ***
## factor(month_index2)10 -2.0812e-02 2.0840e-03 -9.9866 < 2.2e-16 ***
## factor(month_index2)11 1.5102e-03 2.8240e-03 0.5348 0.5928025
## factor(store_number)6 -1.0244e-02 1.5818e-03 -6.4761 9.411e-11 ***
## factor(store_number)5998 -5.8200e-02 1.2424e-02 -4.6847 2.804e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"
## [15] "factor(store_number)6" "factor(store_number)5998"
##df_store5998
probit_store5998<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2)+ factor(year), data=df_store5998,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_store5998,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(year), data = df_store5998,
## robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops -1.1625e-06 2.5901e-06 -0.4488 0.65357
## price -2.4356e-10 6.8997e-10 -0.3530 0.72409
## age_band 5.3592e-09 1.7979e-08 0.2981 0.76565
## est_income_code 3.5399e-08 1.3341e-07 0.2653 0.79075
## white -1.7441e-05 2.9136e-05 -0.5986 0.54942
## male -3.8016e-06 7.1690e-06 -0.5303 0.59591
## factor(month_index2)1 1.2158e-03 3.3819e-03 0.3595 0.71921
## factor(month_index2)2 -2.4019e-08 1.1962e-07 -0.2008 0.84087
## factor(month_index2)3 1.3437e-07 6.3174e-07 0.2127 0.83156
## factor(month_index2)4 3.7382e-01 3.4793e-01 1.0744 0.28263
## factor(month_index2)5 2.1639e-01 1.1545e-01 1.8744 0.06088 .
## factor(month_index2)6 7.9309e-08 3.6404e-07 0.2179 0.82754
## factor(month_index2)7 8.2616e-01 1.4528e-01 5.6865 1.297e-08 ***
## factor(month_index2)8 2.1235e-08 1.8537e-07 0.1146 0.90880
## factor(month_index2)9 -2.4128e-08 1.0217e-07 -0.2362 0.81331
## factor(month_index2)10 2.3553e-07 9.2381e-07 0.2550 0.79875
## factor(month_index2)11 9.6010e-01 1.3917e-01 6.8990 5.236e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"
##df_store 2 & 6
probit_store26<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_store26,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_store26,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(year), data = df_store26,
## robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops 1.3862e-02 7.4809e-04 18.5301 < 2.2e-16 ***
## price 6.3498e-05 1.7995e-06 35.2874 < 2.2e-16 ***
## age_band -1.0625e-03 8.0297e-05 -13.2324 < 2.2e-16 ***
## est_income_code 1.2689e-03 1.3690e-04 9.2688 < 2.2e-16 ***
## white -3.3229e-03 6.0096e-04 -5.5294 3.214e-08 ***
## male -2.2263e-02 6.2351e-04 -35.7054 < 2.2e-16 ***
## factor(month_index2)1 -9.6305e-03 1.9229e-03 -5.0083 5.491e-07 ***
## factor(month_index2)2 -1.0912e-02 1.9484e-03 -5.6002 2.141e-08 ***
## factor(month_index2)3 -1.7429e-02 1.7540e-03 -9.9365 < 2.2e-16 ***
## factor(month_index2)4 -1.6331e-02 1.6036e-03 -10.1842 < 2.2e-16 ***
## factor(month_index2)5 -2.8546e-02 1.5014e-03 -19.0131 < 2.2e-16 ***
## factor(month_index2)6 9.9123e-03 1.8778e-03 5.2788 1.300e-07 ***
## factor(month_index2)7 -9.0883e-03 1.6194e-03 -5.6122 1.997e-08 ***
## factor(month_index2)8 1.1161e-02 2.0687e-03 5.3953 6.841e-08 ***
## factor(month_index2)9 -1.1874e-02 1.7549e-03 -6.7664 1.320e-11 ***
## factor(month_index2)10 -1.7215e-02 1.5906e-03 -10.8227 < 2.2e-16 ***
## factor(month_index2)11 2.8568e-03 2.0700e-03 1.3801 0.1676
## factor(year)2012 -1.3030e-02 8.4545e-04 -15.4117 < 2.2e-16 ***
## factor(year)2013 -2.2710e-02 1.1140e-03 -20.3858 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"
## [15] "factor(year)2012" "factor(year)2013"
##df_store 2 & 6, 2012
probit_store2012_26<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_2012_26,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_2012_26,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(year), data = df_2012_26,
## robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops 1.4234e-02 1.2289e-03 11.5823 < 2.2e-16 ***
## price 6.4331e-05 2.7967e-06 23.0027 < 2.2e-16 ***
## age_band -7.8120e-04 1.2224e-04 -6.3906 1.652e-10 ***
## est_income_code 1.0085e-03 2.0894e-04 4.8267 1.388e-06 ***
## white -3.4004e-03 9.1619e-04 -3.7115 0.0002061 ***
## male -2.3445e-02 9.5241e-04 -24.6167 < 2.2e-16 ***
## factor(month_index2)1 1.1900e-02 3.1146e-03 3.8206 0.0001332 ***
## factor(month_index2)2 -5.5985e-06 2.9152e-03 -0.0019 0.9984677
## factor(month_index2)3 1.0841e-03 2.8292e-03 0.3832 0.7015695
## factor(month_index2)4 -1.8623e-04 2.5196e-03 -0.0739 0.9410792
## factor(month_index2)5 -1.0557e-02 2.2178e-03 -4.7601 1.935e-06 ***
## factor(month_index2)6 2.7290e-02 3.0241e-03 9.0243 < 2.2e-16 ***
## factor(month_index2)7 -6.8369e-03 2.4193e-03 -2.8259 0.0047142 **
## factor(month_index2)8 1.1391e-02 3.0822e-03 3.6958 0.0002192 ***
## factor(month_index2)9 3.6257e-03 2.9758e-03 1.2184 0.2230613
## factor(month_index2)10 -1.3999e-02 2.4220e-03 -5.7800 7.471e-09 ***
## factor(month_index2)11 3.7291e-03 3.0341e-03 1.2290 0.2190543
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"
##df_store 2 & 6, 2013
probit_store2013_26<-glm(return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_2013_26,family=binomial(link="probit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probitmfx(formula=return ~ bops + price+age_band + est_income_code + white + male + factor(month_index2) + factor(year), data=df_2013_26,robust=TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Call:
## probitmfx(formula = return ~ bops + price + age_band + est_income_code +
## white + male + factor(month_index2) + factor(year), data = df_2013_26,
## robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## bops 1.2801e-02 9.6333e-04 13.2885 < 2.2e-16 ***
## price 6.2362e-05 2.1606e-06 28.8625 < 2.2e-16 ***
## age_band -1.2724e-03 1.1006e-04 -11.5605 < 2.2e-16 ***
## est_income_code 1.5169e-03 1.8760e-04 8.0859 6.173e-16 ***
## white -3.2249e-03 8.2383e-04 -3.9145 9.060e-05 ***
## male -2.0709e-02 8.5026e-04 -24.3556 < 2.2e-16 ***
## factor(month_index2)1 -7.6298e-03 2.6073e-03 -2.9263 0.003431 **
## factor(month_index2)2 1.3043e-03 2.8610e-03 0.4559 0.648468
## factor(month_index2)3 -1.3981e-02 2.3814e-03 -5.8707 4.339e-09 ***
## factor(month_index2)4 -1.0871e-02 2.1731e-03 -5.0027 5.653e-07 ***
## factor(month_index2)5 -2.5045e-02 2.0044e-03 -12.4946 < 2.2e-16 ***
## factor(month_index2)6 -3.8802e-03 2.3639e-03 -1.6414 0.100713
## factor(month_index2)7 -1.2164e-02 2.1896e-03 -5.5555 2.768e-08 ***
## factor(month_index2)8 9.4851e-03 2.7960e-03 3.3923 0.000693 ***
## factor(month_index2)9 -1.3337e-02 2.7825e-03 -4.7930 1.643e-06 ***
## factor(month_index2)10 -1.7054e-02 2.2318e-03 -7.6414 2.149e-14 ***
## factor(month_index2)11 1.5823e-03 2.8557e-03 0.5541 0.579534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "bops" "white"
## [3] "male" "factor(month_index2)1"
## [5] "factor(month_index2)2" "factor(month_index2)3"
## [7] "factor(month_index2)4" "factor(month_index2)5"
## [9] "factor(month_index2)6" "factor(month_index2)7"
## [11] "factor(month_index2)8" "factor(month_index2)9"
## [13] "factor(month_index2)10" "factor(month_index2)11"