Regression

## Read data: let's do this coding line by line together
# NOTE: the path is machine-specific; point it at your own copy of the CSV.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
siva <- read.csv(
  file = "C:/Users/ehk994/Desktop/Teaching/Research for Marketing Communications/5 - Multiple regression/Class exercise/siva_cust_survey_transaction.csv",
  header = TRUE
)

## If you import the data using 'Import Dataset' in the 'Environment/History' pane, use the line below to copy the imported object to the name used in this script
# siva <- siva_cust_survey_transaction (To run this line, remove the leading # sign. It is commented out here because the code could not be published otherwise.)

#Recommended X variables: Staff_Courtesy / Speed_of_Service / Veh_Equip_Condition / Trans_Billing_as_Expected / Value_for_the_Money / Total_charge_USD / Survey_checkout_diff / rent_loc_type / Purpose_of_Rental / booking_channel_dummy
#Y variable: Recom_mend_Siva

## Subset for regression
# Keep the outcome (Recom_mend_Siva) plus the candidate predictors and the
# grouping variables. Columns are selected by name instead of by position
# (previously 10, 9, 11-15, 28-30, 23), so a reordered or extended CSV raises
# an "undefined columns" error rather than silently picking the wrong columns.
siva_sub <- siva[c(
  "Recom_mend_Siva",
  "Purpose_of_Rental",
  "Staff_Courtesy",
  "Speed_of_Service",
  "Veh_Equip_Condition",
  "Trans_Billing_as_Expected",
  "Value_for_the_Money",
  "Total_charge_USD",
  "Survey_checkout_diff",
  "booking_channel_dummy",
  "rent_loc_type"
)]
names(siva_sub)

##  [1] "Recom_mend_Siva"           "Purpose_of_Rental"        
##  [3] "Staff_Courtesy"            "Speed_of_Service"         
##  [5] "Veh_Equip_Condition"       "Trans_Billing_as_Expected"
##  [7] "Value_for_the_Money"       "Total_charge_USD"         
##  [9] "Survey_checkout_diff"      "booking_channel_dummy"    
## [11] "rent_loc_type"

## Store the booking-channel dummy as a factor (categorical variable)
siva_sub[["booking_channel_dummy"]] <- as.factor(
  siva_sub[["booking_channel_dummy"]]
)

##### 1a) Model ################################################################################## 

# Full-sample linear model: the recommendation score regressed on the five
# survey ratings, the total charge, and Survey_checkout_diff.
sivaLM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = siva_sub
)

summary(sivaLM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = siva_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2512 -0.5215  0.1394  0.6044  9.1607 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -4.339e-01  3.082e-02 -14.080  < 2e-16 ***
## Staff_Courtesy             2.486e-01  4.677e-03  53.161  < 2e-16 ***
## Speed_of_Service           1.951e-01  3.446e-03  56.608  < 2e-16 ***
## Veh_Equip_Condition        1.798e-01  2.937e-03  61.222  < 2e-16 ***
## Trans_Billing_as_Expected  1.565e-01  3.331e-03  46.994  < 2e-16 ***
## Value_for_the_Money        2.690e-01  3.639e-03  73.915  < 2e-16 ***
## Total_charge_USD           1.191e-04  2.165e-05   5.505 3.72e-08 ***
## Survey_checkout_diff      -1.654e-03  1.494e-03  -1.107    0.268    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.28 on 52338 degrees of freedom
##   (1469 observations deleted due to missingness)
## Multiple R-squared:  0.625,  Adjusted R-squared:  0.6249 
## F-statistic: 1.246e+04 on 7 and 52338 DF,  p-value: < 2.2e-16

## We see that all variables are significant, except for the time since checkout. We also see that we have a very strong R^2.

## Interpretation of beta coefficients
# 1.    All non-recommendation survey questions are positively associated with the recommendation score. This makes intuitive sense, as one would expect for example that better staff courtesy would, all else being equal, be associated with a better overall recommendation. They are all highly significant.

# 2.    Total charge is also positively associated with the recommendation score. A unit increase in total charge is associated with a 0.00012 increase in the overall recommendation score. This would suggest that all else being equal, more expensive rentals are positively associated with better recommendations.

# 3.    A one-unit increase in the Value_for_the_Money rating is associated, all else being equal, with a 0.27 increase in the overall recommendation score.

## 1a) VIF
## VIF: To test for multicollinearity, obtain the VIF of the predictors 
library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# Variance inflation factor of each predictor in the full-sample model
car::vif(mod = sivaLM)

##            Staff_Courtesy          Speed_of_Service 
##                  1.962906                  1.829614 
##       Veh_Equip_Condition Trans_Billing_as_Expected 
##                  1.458116                  1.623979 
##       Value_for_the_Money          Total_charge_USD 
##                  1.917934                  1.009810 
##      Survey_checkout_diff 
##                  1.004727

# VIF greater than 5 is worrying. We see that all predictors have VIF values less than 2. Therefore, there does not appear to be a serious multicollinearity issue.

## Subset data with conditions
# Split by rental location type: airport ("AP") vs off-airport ("OFF AP").
# `%in%` gives FALSE (never NA) for missing values, so rows with a missing
# location type end up in neither subset.
sivaAir <- siva_sub[siva_sub$rent_loc_type %in% "AP", ]
sivaAirOff <- siva_sub[siva_sub$rent_loc_type %in% "OFF AP", ]

# Split by purpose of rental: business ("Bus.") vs leisure/personal
# ("Leis. / Pers."). Rows with a missing purpose end up in neither subset.
sivaBus <- siva_sub[siva_sub$Purpose_of_Rental %in% "Bus.", ]
sivaOthers <- siva_sub[siva_sub$Purpose_of_Rental %in% "Leis. / Pers.", ]

# Split by booking channel: 1 = booked through Siva.com, 0 = booked elsewhere.
# Rows with a missing channel end up in neither subset.
siva1 <- siva_sub[siva_sub$booking_channel_dummy %in% "1", ]
siva0 <- siva_sub[siva_sub$booking_channel_dummy %in% "0", ]

##### 2b) Models ############################################################################### 

## Regression model for airport locations, fitted on the `sivaAir` subset
sivaAirLM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = sivaAir
)

summary(sivaAirLM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = sivaAir)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0218 -0.5667  0.1341  0.6600  9.1005 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -3.745e-01  4.323e-02  -8.664  < 2e-16 ***
## Staff_Courtesy             2.390e-01  6.414e-03  37.255  < 2e-16 ***
## Speed_of_Service           1.983e-01  4.759e-03  41.679  < 2e-16 ***
## Veh_Equip_Condition        1.810e-01  4.134e-03  43.777  < 2e-16 ***
## Trans_Billing_as_Expected  1.562e-01  4.675e-03  33.409  < 2e-16 ***
## Value_for_the_Money        2.685e-01  5.094e-03  52.708  < 2e-16 ***
## Total_charge_USD           1.347e-04  3.333e-05   4.043  5.3e-05 ***
## Survey_checkout_diff      -1.481e-03  2.238e-03  -0.661    0.508    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.318 on 26264 degrees of freedom
##   (928 observations deleted due to missingness)
## Multiple R-squared:  0.6198, Adjusted R-squared:  0.6197 
## F-statistic:  6115 on 7 and 26264 DF,  p-value: < 2.2e-16

## Regression model for off-airport locations, fitted on the `sivaAirOff` subset
sivaAirOffLM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = sivaAirOff
)

summary(sivaAirOffLM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = sivaAirOff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2066 -0.4730  0.1440  0.5419  8.5436 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -5.323e-01  4.561e-02 -11.670  < 2e-16 ***
## Staff_Courtesy             2.612e-01  7.095e-03  36.814  < 2e-16 ***
## Speed_of_Service           1.934e-01  5.153e-03  37.537  < 2e-16 ***
## Veh_Equip_Condition        1.787e-01  4.312e-03  41.438  < 2e-16 ***
## Trans_Billing_as_Expected  1.571e-01  4.916e-03  31.956  < 2e-16 ***
## Value_for_the_Money        2.689e-01  5.461e-03  49.236  < 2e-16 ***
## Total_charge_USD           9.915e-05  2.868e-05   3.458 0.000546 ***
## Survey_checkout_diff      -4.580e-04  2.064e-03  -0.222 0.824378    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.24 on 24500 degrees of freedom
##   (538 observations deleted due to missingness)
## Multiple R-squared:  0.6313, Adjusted R-squared:  0.6312 
## F-statistic:  5993 on 7 and 24500 DF,  p-value: < 2.2e-16

## Interpretation of the two models, 'sivaAirLM' and 'sivaAirOffLM'
# The results appear substantively the same, both in terms of the significance of predictors and in terms of the level of the coefficients.

## Regression model for business rentals, fitted on the `sivaBus` subset
sivaBusLM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = sivaBus
)

summary(sivaBusLM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = sivaBus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0236 -0.5480  0.1341  0.6251  8.3997 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -4.972e-01  4.300e-02 -11.563   <2e-16 ***
## Staff_Courtesy             2.433e-01  6.477e-03  37.558   <2e-16 ***
## Speed_of_Service           2.056e-01  4.715e-03  43.603   <2e-16 ***
## Veh_Equip_Condition        1.805e-01  4.020e-03  44.900   <2e-16 ***
## Trans_Billing_as_Expected  1.438e-01  4.714e-03  30.510   <2e-16 ***
## Value_for_the_Money        2.830e-01  5.018e-03  56.395   <2e-16 ***
## Total_charge_USD           7.688e-05  3.434e-05   2.239   0.0252 *  
## Survey_checkout_diff      -2.064e-03  2.167e-03  -0.952   0.3409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.306 on 28157 degrees of freedom
##   (743 observations deleted due to missingness)
## Multiple R-squared:  0.6253, Adjusted R-squared:  0.6252 
## F-statistic:  6712 on 7 and 28157 DF,  p-value: < 2.2e-16

## Regression model for leisure rentals, fitted on the `sivaOthers` subset
sivaOthersLM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = sivaOthers
)

summary(sivaOthersLM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = sivaOthers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2650 -0.4930  0.1384  0.5772  9.0792 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -3.321e-01  4.471e-02  -7.427 1.15e-13 ***
## Staff_Courtesy             2.576e-01  6.787e-03  37.952  < 2e-16 ***
## Speed_of_Service           1.811e-01  5.074e-03  35.696  < 2e-16 ***
## Veh_Equip_Condition        1.790e-01  4.321e-03  41.435  < 2e-16 ***
## Trans_Billing_as_Expected  1.728e-01  4.725e-03  36.582  < 2e-16 ***
## Value_for_the_Money        2.478e-01  5.330e-03  46.500  < 2e-16 ***
## Total_charge_USD           1.225e-04  2.794e-05   4.383 1.18e-05 ***
## Survey_checkout_diff      -1.363e-03  2.060e-03  -0.661    0.508    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.244 on 23858 degrees of freedom
##   (715 observations deleted due to missingness)
## Multiple R-squared:  0.6244, Adjusted R-squared:  0.6243 
## F-statistic:  5665 on 7 and 23858 DF,  p-value: < 2.2e-16

## Interpretation of the two models, 'sivaBusLM' and 'sivaOthersLM'
# Once again, these results seem substantively the same in terms of significance and level of the coefficients.

## Regression model for booking channel = 1 (booked through Siva.com),
## fitted on the `siva1` subset
siva1LM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = siva1
)

summary(siva1LM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = siva1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0180 -0.5286  0.1399  0.5939  8.9817 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -2.204e-01  4.452e-02  -4.949 7.49e-07 ***
## Staff_Courtesy             2.693e-01  6.655e-03  40.467  < 2e-16 ***
## Speed_of_Service           1.817e-01  4.928e-03  36.860  < 2e-16 ***
## Veh_Equip_Condition        1.817e-01  4.209e-03  43.161  < 2e-16 ***
## Trans_Billing_as_Expected  1.578e-01  4.744e-03  33.266  < 2e-16 ***
## Value_for_the_Money        2.352e-01  5.214e-03  45.116  < 2e-16 ***
## Total_charge_USD           1.071e-04  3.045e-05   3.517 0.000437 ***
## Survey_checkout_diff      -1.660e-03  2.203e-03  -0.754 0.451149    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.255 on 24307 degrees of freedom
##   (487 observations deleted due to missingness)
## Multiple R-squared:  0.6145, Adjusted R-squared:  0.6144 
## F-statistic:  5535 on 7 and 24307 DF,  p-value: < 2.2e-16

## Regression model for booking channel = 0 (not booked through Siva.com),
## fitted on the `siva0` subset
siva0LM <- lm(
  formula = Recom_mend_Siva ~
    Staff_Courtesy +
    Speed_of_Service +
    Veh_Equip_Condition +
    Trans_Billing_as_Expected +
    Value_for_the_Money +
    Total_charge_USD +
    Survey_checkout_diff,
  data = siva0
)

summary(siva0LM)

## 
## Call:
## lm(formula = Recom_mend_Siva ~ Staff_Courtesy + Speed_of_Service + 
##     Veh_Equip_Condition + Trans_Billing_as_Expected + Value_for_the_Money + 
##     Total_charge_USD + Survey_checkout_diff, data = siva0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2384 -0.5294  0.1383  0.6101  8.7183 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.5971459  0.0425558 -14.032  < 2e-16 ***
## Staff_Courtesy             0.2297475  0.0065398  35.131  < 2e-16 ***
## Speed_of_Service           0.2076416  0.0047969  43.287  < 2e-16 ***
## Veh_Equip_Condition        0.1780032  0.0040808  43.620  < 2e-16 ***
## Trans_Billing_as_Expected  0.1543541  0.0046540  33.166  < 2e-16 ***
## Value_for_the_Money        0.2968875  0.0050626  58.643  < 2e-16 ***
## Total_charge_USD           0.0001144  0.0000306   3.739 0.000185 ***
## Survey_checkout_diff      -0.0009758  0.0020294  -0.481 0.630627    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.296 on 28023 degrees of freedom
##   (982 observations deleted due to missingness)
## Multiple R-squared:  0.6352, Adjusted R-squared:  0.6351 
## F-statistic:  6972 on 7 and 28023 DF,  p-value: < 2.2e-16

## Interpretation of the two models, 'siva1LM' and 'siva0LM'
# These results again seem substantively the same in terms of significance and level of the coefficients.

Regression_Siva

Emily Ko

11/4/2019