Modelling

Author

AS

Setup

# Start from an empty workspace: delete every object in the current
# environment. Attached packages and options are left as they are.
rm(list = ls())

Import

# Read the three CSV inputs (paths are relative to the working directory).
# The first row of each file is used as the column names.
train_data <- read.csv(file = "train_data.csv", header = TRUE)
test_data <- read.csv(file = "test.csv", header = TRUE)
sample_submission <- read.csv(file = "sample_submission.csv", header = TRUE)
library(visdat)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
# Visual overview of the training data frame (visdat).
vis_dat(x = train_data)

# Plain-text table of summary statistics for the training data.
stargazer(train_data, type = "text")

====================================================================
Statistic                                N   Mean   St. Dev. Min Max
--------------------------------------------------------------------
Id                                      423 298.695 172.546   1  602
Age                                     423 30.631   10.188   3  67 
Customer_Reviews_Importance             423  2.525   1.196    1   5 
Personalized_Recommendation_Frequency.1 423  2.674   1.034    1   5 
Rating_Accuracy                         423  2.643   0.899    1   5 
Shopping_Satisfaction                   423  2.454   1.015    1   5 
--------------------------------------------------------------------

Model (built on train data)

# Show the column names, types and first values of the test data.
str(object = test_data)
'data.frame':   179 obs. of  23 variables:
 $ Id                                     : int  5 6 10 12 14 15 16 19 20 24 ...
 $ Age                                    : int  32 33 26 32 54 43 25 25 32 31 ...
 $ Gender                                 : chr  "Female" "Female" "Male" "Male" ...
 $ Purchase_Frequency                     : chr  "Once a month" "Multiple times a week" "Less than once a month" "Less than once a month" ...
 $ Purchase_Categories                    : chr  "Clothing and Fashion;Home and Kitchen;others" "Groceries and Gourmet Food;Beauty and Personal Care;Clothing and Fashion;Home and Kitchen;others" "Groceries and Gourmet Food;Beauty and Personal Care;Clothing and Fashion;Home and Kitchen;others" "others" ...
 $ Personalized_Recommendation_Frequency  : chr  "No" "No" "Yes" "Sometimes" ...
 $ Browsing_Frequency                     : chr  "Few times a week" "Few times a month" "Few times a week" "Rarely" ...
 $ Product_Search_Method                  : chr  "others" "categories" "Keyword" "categories" ...
 $ Search_Result_Exploration              : chr  "Multiple pages" "Multiple pages" "Multiple pages" "First page" ...
 $ Customer_Reviews_Importance            : int  1 1 1 1 3 2 5 4 1 3 ...
 $ Add_to_Cart_Browsing                   : chr  "Yes" "Yes" "Yes" "No" ...
 $ Cart_Completion_Frequency              : chr  "Sometimes" "Always" "Often" "Always" ...
 $ Cart_Abandonment_Factors               : chr  "Found a better price elsewhere" "Changed my mind or no longer need the item" "Found a better price elsewhere" "Changed my mind or no longer need the item" ...
 $ Saveforlater_Frequency                 : chr  "Rarely" "Often" "Sometimes" "Never" ...
 $ Review_Left                            : chr  "Yes" "Yes" "No" "No" ...
 $ Review_Reliability                     : chr  "Occasionally" "Moderately" "Heavily" "Moderately" ...
 $ Review_Helpfulness                     : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ Personalized_Recommendation_Frequency.1: int  5 2 1 2 3 1 4 3 1 4 ...
 $ Recommendation_Helpfulness             : chr  "Yes" "No" "Yes" "Sometimes" ...
 $ Rating_Accuracy                        : int  1 3 2 2 2 2 3 3 1 2 ...
 $ Service_Appreciation                   : chr  "Competitive prices" "Wide product selection" "Wide product selection" "Wide product selection" ...
 $ Improvement_Areas                      : chr  "Shipping speed and reliability" "Reducing packaging waste" "Customer service responsiveness" "Customer service responsiveness" ...
 $ Shopping_Satisfaction                  : logi  NA NA NA NA NA NA ...
## Bivariate Linear Regression
# Baseline model: shopping satisfaction explained by browsing frequency only.
reg0 <- lm(
  formula = Shopping_Satisfaction ~ Browsing_Frequency,
  data = train_data
)
summary(reg0)

Call:
lm(formula = Shopping_Satisfaction ~ Browsing_Frequency, data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.6667 -0.5442  0.0000  0.5465  2.5465 

Coefficients:
                                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)                             2.54422    0.08262  30.795  < 2e-16 ***
Browsing_FrequencyFew times a week     -0.09073    0.11251  -0.806 0.420483    
Browsing_FrequencyMultiple times a day -0.54422    0.16049  -3.391 0.000763 ***
Browsing_FrequencyRarely                0.12245    0.16279   0.752 0.452357    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.002 on 419 degrees of freedom
Multiple R-squared:  0.03318,   Adjusted R-squared:  0.02626 
F-statistic: 4.793 on 3 and 419 DF,  p-value: 0.002703
## Multivariate Regression
# Multivariate model of shopping satisfaction, fitted on the training data.
# The model is fitted once. A previous fit of a smaller specification (without
# Product_Search_Method and Add_to_Cart_Browsing) was assigned to the same name
# and overwritten immediately without being inspected, so it is dropped here.
reg1 <- lm(
  formula = Shopping_Satisfaction ~
    Browsing_Frequency +
    Age +
    Gender +
    Purchase_Frequency +
    Product_Search_Method +
    Add_to_Cart_Browsing +
    Recommendation_Helpfulness +
    Review_Reliability +
    Cart_Completion_Frequency,
  data = train_data
)

summary(reg1)

Call:
lm(formula = Shopping_Satisfaction ~ Browsing_Frequency + Age + 
    Gender + Purchase_Frequency + Product_Search_Method + Add_to_Cart_Browsing + 
    Recommendation_Helpfulness + Review_Reliability + Cart_Completion_Frequency, 
    data = train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.02320 -0.61939 -0.03303  0.56271  2.99783 

Coefficients:
                                          Estimate Std. Error t value Pr(>|t|)
(Intercept)                               2.041153   1.009748   2.021 0.043907
Browsing_FrequencyFew times a week       -0.071466   0.112063  -0.638 0.524017
Browsing_FrequencyMultiple times a day   -0.355274   0.162534  -2.186 0.029414
Browsing_FrequencyRarely                  0.037397   0.177734   0.210 0.833455
Age                                      -0.003549   0.004793  -0.741 0.459434
GenderMale                               -0.002191   0.121208  -0.018 0.985587
GenderOthers                              0.137852   0.293783   0.469 0.639165
GenderPrefer not to say                   0.025267   0.137385   0.184 0.854173
Purchase_FrequencyLess than once a month -0.095617   0.158227  -0.604 0.545990
Purchase_FrequencyMultiple times a week   0.126529   0.183428   0.690 0.490725
Purchase_FrequencyOnce a month            0.023401   0.136326   0.172 0.863797
Purchase_FrequencyOnce a week             0.020389   0.134636   0.151 0.879707
Product_Search_Methodcategories           0.150031   0.971081   0.154 0.877295
Product_Search_MethodFilter              -0.082809   0.972615  -0.085 0.932193
Product_Search_MethodKeyword             -0.064639   0.969996  -0.067 0.946903
Product_Search_Methodothers               0.509811   0.992407   0.514 0.607741
Add_to_Cart_BrowsingNo                   -0.121984   0.128483  -0.949 0.342991
Add_to_Cart_BrowsingYes                  -0.094787   0.125109  -0.758 0.449119
Recommendation_HelpfulnessSometimes       0.163799   0.118709   1.380 0.168416
Recommendation_HelpfulnessYes            -0.120485   0.135228  -0.891 0.373482
Review_ReliabilityModerately              0.049780   0.128347   0.388 0.698335
Review_ReliabilityNever                   0.957557   0.268118   3.571 0.000399
Review_ReliabilityOccasionally            0.330716   0.144567   2.288 0.022687
Review_ReliabilityRarely                  0.379830   0.224115   1.695 0.090902
Cart_Completion_FrequencyNever            0.873841   0.379661   2.302 0.021875
Cart_Completion_FrequencyOften            0.297651   0.209919   1.418 0.156998
Cart_Completion_FrequencyRarely           0.425536   0.236933   1.796 0.073256
Cart_Completion_FrequencySometimes        0.431147   0.204546   2.108 0.035676
                                            
(Intercept)                              *  
Browsing_FrequencyFew times a week          
Browsing_FrequencyMultiple times a day   *  
Browsing_FrequencyRarely                    
Age                                         
GenderMale                                  
GenderOthers                                
GenderPrefer not to say                     
Purchase_FrequencyLess than once a month    
Purchase_FrequencyMultiple times a week     
Purchase_FrequencyOnce a month              
Purchase_FrequencyOnce a week               
Product_Search_Methodcategories             
Product_Search_MethodFilter                 
Product_Search_MethodKeyword                
Product_Search_Methodothers                 
Add_to_Cart_BrowsingNo                      
Add_to_Cart_BrowsingYes                     
Recommendation_HelpfulnessSometimes         
Recommendation_HelpfulnessYes               
Review_ReliabilityModerately                
Review_ReliabilityNever                  ***
Review_ReliabilityOccasionally           *  
Review_ReliabilityRarely                 .  
Cart_Completion_FrequencyNever           *  
Cart_Completion_FrequencyOften              
Cart_Completion_FrequencyRarely          .  
Cart_Completion_FrequencySometimes       *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9435 on 395 degrees of freedom
Multiple R-squared:  0.1914,    Adjusted R-squared:  0.1362 
F-statistic: 3.464 on 27 and 395 DF,  p-value: 3.723e-08
# Side-by-side plain-text comparison of the bivariate and multivariate models.
stargazer(reg0, reg1, type = "text")

=======================================================================================
                                                      Dependent variable:              
                                         ----------------------------------------------
                                                     Shopping_Satisfaction             
                                                  (1)                     (2)          
---------------------------------------------------------------------------------------
Browsing_FrequencyFew times a week               -0.091                 -0.071         
                                                (0.113)                 (0.112)        
                                                                                       
Browsing_FrequencyMultiple times a day         -0.544***               -0.355**        
                                                (0.160)                 (0.163)        
                                                                                       
Browsing_FrequencyRarely                         0.122                   0.037         
                                                (0.163)                 (0.178)        
                                                                                       
Age                                                                     -0.004         
                                                                        (0.005)        
                                                                                       
GenderMale                                                              -0.002         
                                                                        (0.121)        
                                                                                       
GenderOthers                                                             0.138         
                                                                        (0.294)        
                                                                                       
GenderPrefer not to say                                                  0.025         
                                                                        (0.137)        
                                                                                       
Purchase_FrequencyLess than once a month                                -0.096         
                                                                        (0.158)        
                                                                                       
Purchase_FrequencyMultiple times a week                                  0.127         
                                                                        (0.183)        
                                                                                       
Purchase_FrequencyOnce a month                                           0.023         
                                                                        (0.136)        
                                                                                       
Purchase_FrequencyOnce a week                                            0.020         
                                                                        (0.135)        
                                                                                       
Product_Search_Methodcategories                                          0.150         
                                                                        (0.971)        
                                                                                       
Product_Search_MethodFilter                                             -0.083         
                                                                        (0.973)        
                                                                                       
Product_Search_MethodKeyword                                            -0.065         
                                                                        (0.970)        
                                                                                       
Product_Search_Methodothers                                              0.510         
                                                                        (0.992)        
                                                                                       
Add_to_Cart_BrowsingNo                                                  -0.122         
                                                                        (0.128)        
                                                                                       
Add_to_Cart_BrowsingYes                                                 -0.095         
                                                                        (0.125)        
                                                                                       
Recommendation_HelpfulnessSometimes                                      0.164         
                                                                        (0.119)        
                                                                                       
Recommendation_HelpfulnessYes                                           -0.120         
                                                                        (0.135)        
                                                                                       
Review_ReliabilityModerately                                             0.050         
                                                                        (0.128)        
                                                                                       
Review_ReliabilityNever                                                0.958***        
                                                                        (0.268)        
                                                                                       
Review_ReliabilityOccasionally                                          0.331**        
                                                                        (0.145)        
                                                                                       
Review_ReliabilityRarely                                                0.380*         
                                                                        (0.224)        
                                                                                       
Cart_Completion_FrequencyNever                                          0.874**        
                                                                        (0.380)        
                                                                                       
Cart_Completion_FrequencyOften                                           0.298         
                                                                        (0.210)        
                                                                                       
Cart_Completion_FrequencyRarely                                         0.426*         
                                                                        (0.237)        
                                                                                       
Cart_Completion_FrequencySometimes                                      0.431**        
                                                                        (0.205)        
                                                                                       
Constant                                        2.544***                2.041**        
                                                (0.083)                 (1.010)        
                                                                                       
---------------------------------------------------------------------------------------
Observations                                      423                     423          
R2                                               0.033                   0.191         
Adjusted R2                                      0.026                   0.136         
Residual Std. Error                         1.002 (df = 419)       0.943 (df = 395)    
F Statistic                              4.793*** (df = 3; 419) 3.464*** (df = 27; 395)
=======================================================================================
Note:                                                       *p<0.1; **p<0.05; ***p<0.01

Predict (on test data)

# Score the unseen test data with the multivariate model
# (see ?predict.lm for the available options).

# predict() needs every predictor used in reg1; fail early with a clear
# message if any of those columns is absent from test_data.
predictor_names <- all.vars(delete.response(terms(reg1)))
missing_predictors <- setdiff(predictor_names, names(test_data))
if (length(missing_predictors) > 0) {
  stop(
    "test_data is missing predictors used by reg1: ",
    paste(missing_predictors, collapse = ", "),
    call. = FALSE
  )
}

predictions <- predict(object = reg1, newdata = test_data)
summary(predictions)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.437   2.131   2.421   2.462   2.706   4.211 
# Convert the continuous predictions to whole-number ratings.
rounded_predictions <- round(x = predictions, digits = 0)

# A linear model can extrapolate beyond the rating scale, so keep the rounded
# values inside the range of ratings observed in the training data.
rating_range <- range(train_data$Shopping_Satisfaction, na.rm = TRUE)
rounded_predictions <- pmin(
  pmax(rounded_predictions, rating_range[1]),
  rating_range[2]
)

summary(rounded_predictions)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   2.000   2.447   3.000   4.000 
# First six raw predictions next to their rounded values.
head(x = cbind(predictions, rounded_predictions), n = 6L)
  predictions rounded_predictions
1    2.935919                   3
2    2.155587                   2
3    1.797343                   2
4    2.108799                   2
5    2.046370                   2
6    1.609032                   2
# Last six raw predictions next to their rounded values.
tail(x = cbind(predictions, rounded_predictions), n = 6L)
    predictions rounded_predictions
174    2.703205                   3
175    1.795930                   2
176    2.683700                   3
177    3.142169                   3
178    2.661060                   3
179    2.230416                   2
# The predictions are attached by position, so the submission template must
# have exactly one row per prediction; otherwise a shorter vector whose length
# divides the row count would be recycled silently.
# NOTE(review): this also assumes sample_submission rows are in the same order
# as test_data - confirm (e.g. by comparing Id columns) before submitting.
stopifnot(length(rounded_predictions) == nrow(sample_submission))
sample_submission$Shopping_Satisfaction <- rounded_predictions

# Save the completed template without the row-name column.
write.csv(
  x = sample_submission,
  file = "my_submission.csv",
  row.names = FALSE
)
  • Building different multivariate regressions

    • Including every available variable does not necessarily give a better model.
  • Converting the predictions into rounded predictions