# Remove every object in the current environment so the script starts clean.
remove(list=ls())
Modelling
Setup
Import
# Load the three CSV inputs: the training data, the test data to predict on,
# and the sample submission file that is filled in later.
# header = TRUE: the first row of each file holds the column names.
train_data <- read.csv("train_data.csv",
                       header = TRUE)
test_data <- read.csv("test.csv",
                      header = TRUE)
sample_submission <- read.csv("sample_submission.csv",
                              header = TRUE)
library(visdat)
library(stargazer)
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Plot an overview of the training data (visdat package).
vis_dat(train_data)
# Print a plain-text summary-statistics table of the training data.
stargazer(train_data, type = "text")
====================================================================
Statistic N Mean St. Dev. Min Max
--------------------------------------------------------------------
Id 423 298.695 172.546 1 602
Age 423 30.631 10.188 3 67
Customer_Reviews_Importance 423 2.525 1.196 1 5
Personalized_Recommendation_Frequency.1 423 2.674 1.034 1 5
Rating_Accuracy 423 2.643 0.899 1 5
Shopping_Satisfaction 423 2.454 1.015 1 5
--------------------------------------------------------------------
Model (built on train data)
str(test_data)
'data.frame': 179 obs. of 23 variables:
$ Id : int 5 6 10 12 14 15 16 19 20 24 ...
$ Age : int 32 33 26 32 54 43 25 25 32 31 ...
$ Gender : chr "Female" "Female" "Male" "Male" ...
$ Purchase_Frequency : chr "Once a month" "Multiple times a week" "Less than once a month" "Less than once a month" ...
$ Purchase_Categories : chr "Clothing and Fashion;Home and Kitchen;others" "Groceries and Gourmet Food;Beauty and Personal Care;Clothing and Fashion;Home and Kitchen;others" "Groceries and Gourmet Food;Beauty and Personal Care;Clothing and Fashion;Home and Kitchen;others" "others" ...
$ Personalized_Recommendation_Frequency : chr "No" "No" "Yes" "Sometimes" ...
$ Browsing_Frequency : chr "Few times a week" "Few times a month" "Few times a week" "Rarely" ...
$ Product_Search_Method : chr "others" "categories" "Keyword" "categories" ...
$ Search_Result_Exploration : chr "Multiple pages" "Multiple pages" "Multiple pages" "First page" ...
$ Customer_Reviews_Importance : int 1 1 1 1 3 2 5 4 1 3 ...
$ Add_to_Cart_Browsing : chr "Yes" "Yes" "Yes" "No" ...
$ Cart_Completion_Frequency : chr "Sometimes" "Always" "Often" "Always" ...
$ Cart_Abandonment_Factors : chr "Found a better price elsewhere" "Changed my mind or no longer need the item" "Found a better price elsewhere" "Changed my mind or no longer need the item" ...
$ Saveforlater_Frequency : chr "Rarely" "Often" "Sometimes" "Never" ...
$ Review_Left : chr "Yes" "Yes" "No" "No" ...
$ Review_Reliability : chr "Occasionally" "Moderately" "Heavily" "Moderately" ...
$ Review_Helpfulness : chr "Yes" "Yes" "Yes" "Yes" ...
$ Personalized_Recommendation_Frequency.1: int 5 2 1 2 3 1 4 3 1 4 ...
$ Recommendation_Helpfulness : chr "Yes" "No" "Yes" "Sometimes" ...
$ Rating_Accuracy : int 1 3 2 2 2 2 3 3 1 2 ...
$ Service_Appreciation : chr "Competitive prices" "Wide product selection" "Wide product selection" "Wide product selection" ...
$ Improvement_Areas : chr "Shipping speed and reliability" "Reducing packaging waste" "Customer service responsiveness" "Customer service responsiveness" ...
$ Shopping_Satisfaction : logi NA NA NA NA NA NA ...
## Bivariate Linear Regression
# Baseline model: regress Shopping_Satisfaction on Browsing_Frequency alone,
# fitted on the training data.
reg0 <- lm(data = train_data,
           formula = Shopping_Satisfaction ~ Browsing_Frequency)
# Show coefficients, standard errors and fit statistics.
summary(reg0)
Call:
lm(formula = Shopping_Satisfaction ~ Browsing_Frequency, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-1.6667 -0.5442 0.0000 0.5465 2.5465
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.54422 0.08262 30.795 < 2e-16 ***
Browsing_FrequencyFew times a week -0.09073 0.11251 -0.806 0.420483
Browsing_FrequencyMultiple times a day -0.54422 0.16049 -3.391 0.000763 ***
Browsing_FrequencyRarely 0.12245 0.16279 0.752 0.452357
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.002 on 419 degrees of freedom
Multiple R-squared: 0.03318, Adjusted R-squared: 0.02626
F-statistic: 4.793 on 3 and 419 DF, p-value: 0.002703
## Multivariate Regression
# First multivariate specification, fitted on the training data.
reg1 <- lm(data = train_data,
           formula = Shopping_Satisfaction ~ Browsing_Frequency + Age +
             Gender + Purchase_Frequency + Recommendation_Helpfulness +
             Review_Reliability + Cart_Completion_Frequency)
# Extended specification: additionally includes Product_Search_Method and
# Add_to_Cart_Browsing. This assignment replaces the reg1 fitted above, so
# everything downstream uses this model.
reg1 <- lm(data = train_data,
           formula = Shopping_Satisfaction ~ Browsing_Frequency + Age +
             Gender + Purchase_Frequency + Product_Search_Method +
             Add_to_Cart_Browsing + Recommendation_Helpfulness +
             Review_Reliability + Cart_Completion_Frequency)
# Show coefficients, standard errors and fit statistics.
summary(reg1)
Call:
lm(formula = Shopping_Satisfaction ~ Browsing_Frequency + Age +
Gender + Purchase_Frequency + Product_Search_Method + Add_to_Cart_Browsing +
Recommendation_Helpfulness + Review_Reliability + Cart_Completion_Frequency,
data = train_data)
Residuals:
Min 1Q Median 3Q Max
-2.02320 -0.61939 -0.03303 0.56271 2.99783
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.041153 1.009748 2.021 0.043907
Browsing_FrequencyFew times a week -0.071466 0.112063 -0.638 0.524017
Browsing_FrequencyMultiple times a day -0.355274 0.162534 -2.186 0.029414
Browsing_FrequencyRarely 0.037397 0.177734 0.210 0.833455
Age -0.003549 0.004793 -0.741 0.459434
GenderMale -0.002191 0.121208 -0.018 0.985587
GenderOthers 0.137852 0.293783 0.469 0.639165
GenderPrefer not to say 0.025267 0.137385 0.184 0.854173
Purchase_FrequencyLess than once a month -0.095617 0.158227 -0.604 0.545990
Purchase_FrequencyMultiple times a week 0.126529 0.183428 0.690 0.490725
Purchase_FrequencyOnce a month 0.023401 0.136326 0.172 0.863797
Purchase_FrequencyOnce a week 0.020389 0.134636 0.151 0.879707
Product_Search_Methodcategories 0.150031 0.971081 0.154 0.877295
Product_Search_MethodFilter -0.082809 0.972615 -0.085 0.932193
Product_Search_MethodKeyword -0.064639 0.969996 -0.067 0.946903
Product_Search_Methodothers 0.509811 0.992407 0.514 0.607741
Add_to_Cart_BrowsingNo -0.121984 0.128483 -0.949 0.342991
Add_to_Cart_BrowsingYes -0.094787 0.125109 -0.758 0.449119
Recommendation_HelpfulnessSometimes 0.163799 0.118709 1.380 0.168416
Recommendation_HelpfulnessYes -0.120485 0.135228 -0.891 0.373482
Review_ReliabilityModerately 0.049780 0.128347 0.388 0.698335
Review_ReliabilityNever 0.957557 0.268118 3.571 0.000399
Review_ReliabilityOccasionally 0.330716 0.144567 2.288 0.022687
Review_ReliabilityRarely 0.379830 0.224115 1.695 0.090902
Cart_Completion_FrequencyNever 0.873841 0.379661 2.302 0.021875
Cart_Completion_FrequencyOften 0.297651 0.209919 1.418 0.156998
Cart_Completion_FrequencyRarely 0.425536 0.236933 1.796 0.073256
Cart_Completion_FrequencySometimes 0.431147 0.204546 2.108 0.035676
(Intercept) *
Browsing_FrequencyFew times a week
Browsing_FrequencyMultiple times a day *
Browsing_FrequencyRarely
Age
GenderMale
GenderOthers
GenderPrefer not to say
Purchase_FrequencyLess than once a month
Purchase_FrequencyMultiple times a week
Purchase_FrequencyOnce a month
Purchase_FrequencyOnce a week
Product_Search_Methodcategories
Product_Search_MethodFilter
Product_Search_MethodKeyword
Product_Search_Methodothers
Add_to_Cart_BrowsingNo
Add_to_Cart_BrowsingYes
Recommendation_HelpfulnessSometimes
Recommendation_HelpfulnessYes
Review_ReliabilityModerately
Review_ReliabilityNever ***
Review_ReliabilityOccasionally *
Review_ReliabilityRarely .
Cart_Completion_FrequencyNever *
Cart_Completion_FrequencyOften
Cart_Completion_FrequencyRarely .
Cart_Completion_FrequencySometimes *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9435 on 395 degrees of freedom
Multiple R-squared: 0.1914, Adjusted R-squared: 0.1362
F-statistic: 3.464 on 27 and 395 DF, p-value: 3.723e-08
stargazer(reg0, reg1, type="text")
=======================================================================================
Dependent variable:
----------------------------------------------
Shopping_Satisfaction
(1) (2)
---------------------------------------------------------------------------------------
Browsing_FrequencyFew times a week -0.091 -0.071
(0.113) (0.112)
Browsing_FrequencyMultiple times a day -0.544*** -0.355**
(0.160) (0.163)
Browsing_FrequencyRarely 0.122 0.037
(0.163) (0.178)
Age -0.004
(0.005)
GenderMale -0.002
(0.121)
GenderOthers 0.138
(0.294)
GenderPrefer not to say 0.025
(0.137)
Purchase_FrequencyLess than once a month -0.096
(0.158)
Purchase_FrequencyMultiple times a week 0.127
(0.183)
Purchase_FrequencyOnce a month 0.023
(0.136)
Purchase_FrequencyOnce a week 0.020
(0.135)
Product_Search_Methodcategories 0.150
(0.971)
Product_Search_MethodFilter -0.083
(0.973)
Product_Search_MethodKeyword -0.065
(0.970)
Product_Search_Methodothers 0.510
(0.992)
Add_to_Cart_BrowsingNo -0.122
(0.128)
Add_to_Cart_BrowsingYes -0.095
(0.125)
Recommendation_HelpfulnessSometimes 0.164
(0.119)
Recommendation_HelpfulnessYes -0.120
(0.135)
Review_ReliabilityModerately 0.050
(0.128)
Review_ReliabilityNever 0.958***
(0.268)
Review_ReliabilityOccasionally 0.331**
(0.145)
Review_ReliabilityRarely 0.380*
(0.224)
Cart_Completion_FrequencyNever 0.874**
(0.380)
Cart_Completion_FrequencyOften 0.298
(0.210)
Cart_Completion_FrequencyRarely 0.426*
(0.237)
Cart_Completion_FrequencySometimes 0.431**
(0.205)
Constant 2.544*** 2.041**
(0.083) (1.010)
---------------------------------------------------------------------------------------
Observations 423 423
R2 0.033 0.191
Adjusted R2 0.026 0.136
Residual Std. Error 1.002 (df = 419) 0.943 (df = 395)
F Statistic 4.793*** (df = 3; 419) 3.464*** (df = 27; 395)
=======================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Predict (on test data)
?predict
# make sure the new data has all the variables used in reg1
# Predict Shopping_Satisfaction for the test data using the fitted reg1 model.
predictions <- predict(object = reg1,
                       newdata = test_data # unseen data
)
# Inspect the distribution of the raw (continuous) predictions.
summary(predictions)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.437 2.131 2.421 2.462 2.706 4.211
# Round the continuous predictions to whole numbers (digits = 0).
rounded_predictions <- round(x = predictions,
                             digits = 0)
# Inspect the distribution of the rounded predictions.
summary(rounded_predictions)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 2.000 2.447 3.000 4.000
head(cbind(predictions, rounded_predictions))
predictions rounded_predictions
1 2.935919 3
2 2.155587 2
3 1.797343 2
4 2.108799 2
5 2.046370 2
6 1.609032 2
tail(cbind(predictions, rounded_predictions))
predictions rounded_predictions
174 2.703205 3
175 1.795930 2
176 2.683700 3
177 3.142169 3
178 2.661060 3
179 2.230416 2
# Store the rounded predictions in the Shopping_Satisfaction column of the
# submission data frame.
# NOTE(review): assumes sample_submission rows are in the same order as
# test_data -- confirm before submitting.
sample_submission$Shopping_Satisfaction <- rounded_predictions
# Write the completed submission to disk; row.names = FALSE keeps R's row
# index out of the CSV.
write.csv(sample_submission,
          file = "my_submission.csv",
          row.names = FALSE)
Building different multivariate regressions
- Including every available variable does not necessarily give a better model.
Converting the predictions into rounded predictions