The dataset was obtained from Kaggle: https://www.kaggle.com/datasets/vishakhdapat/waiter-tip-prediction

Research question

How do the total bill amount, the number of people in the dining party, and sex affect the amount left as a tip?

#reading the data; total_bill and tip are converted to numeric below
mydata <- read.table("~/IMB/Mutivariat analysis/tips.csv", header = TRUE, sep = ",", dec = ",")

head(mydata)
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01  3.5   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
mydata$total_bill <- as.numeric(mydata$total_bill)
mydata$tip <- as.numeric(mydata$tip)
#To simplify the analysis, I exclude the variables I will not use.
#The most important explanatory variables (total_bill, sex, size), which explain
#the dependent variable (tip), remain in the model. (Denis explained by email that
#two explanatory variables are enough; I will nevertheless use three: two numerical
#and one categorical.)
mydata2 <- mydata[,c(-4,-5,-6)]
head(mydata2)
##   total_bill  tip    sex size
## 1      16.99 1.01 Female    2
## 2      10.34 1.66   Male    3
## 3      21.01 3.50   Male    3
## 4      23.68 3.31   Male    2
## 5      24.59 3.61 Female    4
## 6      25.29 4.71   Male    4
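
The same subset could equivalently be selected by column name, which is slightly more robust if the column order ever changes (an alternative sketch, not part of the original workflow):

mydata2 <- mydata[, c("total_bill", "tip", "sex", "size")] #keep only the variables used in the analysis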

Explanation of dataset

The dataset contains 244 observations on restaurant bills. For each bill it records the total amount of the bill (total_bill), the tip left by the customer (tip), the sex of the bill payer (sex), whether there were smokers in the party (smoker), the day and time of the meal (day, time), and the number of people in the party (size).

Data manipulation

#Create a new factor variable so that R treats sex as a categorical variable
mydata2$sexF <- factor(mydata2$sex,
                       levels = c("Male", "Female"),
                       labels = c("M", "F"))

head(mydata2)
##   total_bill  tip    sex size sexF
## 1      16.99 1.01 Female    2    F
## 2      10.34 1.66   Male    3    M
## 3      21.01 3.50   Male    3    M
## 4      23.68 3.31   Male    2    M
## 5      24.59 3.61 Female    4    F
## 6      25.29 4.71   Male    4    M
summary(mydata2[,c(-3)])
##    total_bill         tip              size      sexF   
##  Min.   : 3.07   Min.   : 1.000   Min.   :1.00   M:157  
##  1st Qu.:13.35   1st Qu.: 2.000   1st Qu.:2.00   F: 87  
##  Median :17.80   Median : 2.900   Median :2.00          
##  Mean   :19.79   Mean   : 2.998   Mean   :2.57          
##  3rd Qu.:24.13   3rd Qu.: 3.562   3rd Qu.:3.00          
##  Max.   :50.81   Max.   :10.000   Max.   :6.00
library(psych)
describe(mydata2[,c(-3,-5)])
##            vars   n  mean   sd median trimmed  mad  min   max range skew
## total_bill    1 244 19.79 8.90   17.8   18.73 7.46 3.07 50.81 47.74 1.12
## tip           2 244  3.00 1.38    2.9    2.84 1.33 1.00 10.00  9.00 1.45
## size          3 244  2.57 0.95    2.0    2.42 0.00 1.00  6.00  5.00 1.43
##            kurtosis   se
## total_bill     1.14 0.57
## tip            3.50 0.09
## size           1.63 0.06

Explanations of parameters

The describe() output reports, for each numerical variable: the number of observations (n), the mean and standard deviation (sd), the median, the trimmed mean, the median absolute deviation (mad), the minimum, maximum and range, the skewness and kurtosis as measures of the shape of the distribution, and the standard error of the mean (se). Both total_bill and tip are right-skewed (skewness above 1), which is typical for monetary variables.

Defining multiple regression model and brief explanation of the variables

We estimate the linear regression model: tip = b0 + b1*total_bill + b2*size + b3*sexF, where sexF is a dummy variable equal to 1 for female (F) and 0 for male (M, the reference category).

Explanatory variables:

- total_bill: the total amount of the bill (numerical),
- size: the number of people in the dining party (numerical),
- sexF: the sex of the bill payer, M or F (categorical).
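
To confirm how R codes the dummy variable for sex in the model, the factor contrasts can be inspected (a quick check; M is the reference category here):

contrasts(mydata2$sexF) #shows the 0/1 coding: M = 0, F = 1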

Regression analysis

General assumptions of the classical linear regression model:

Requirements are:

- the relationship between the dependent variable and the explanatory variables is linear,
- there is no strong multicollinearity among the explanatory variables,
- the errors are normally distributed,
- the errors have constant variance (homoskedasticity),
- there are no overly influential units (outliers with high leverage).

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplotMatrix(mydata2[ , c( -3, -5)], 
                  smooth = FALSE)

Based on the scatterplots in the second row (tip plotted against each explanatory variable), I can conclude that the relationship between the dependent and the explanatory variables is approximately linear (I do not see any non-linear shapes). An additional check with component-plus-residual plots is sketched below, after fit1 is estimated.

fit1 <- lm(tip ~ total_bill + size + sexF, #Dependent and explanatory variables
           data = mydata2) #Name of the data frame
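
As an optional complement to the scatterplot matrix, component-plus-residual plots from the car package can be used to judge the linearity of the numeric predictors in fit1 (a sketch; the plots are restricted to the numeric terms):

crPlots(fit1, terms = ~ total_bill + size) #component + residual plots for the numeric predictors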

Checking multicollinearity

vif(fit1) #Multicollinearity (variance inflation factors)
## total_bill       size       sexF 
##   1.579160   1.557587   1.021440
mean(vif(fit1))
## [1] 1.386062
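
All VIF values are well below the usual cutoff of 5, so multicollinearity is not problematic. For the two numeric predictors this can also be seen from their simple correlation (a quick extra check):

cor(mydata2$total_bill, mydata2$size) #correlation between the two numeric explanatory variables
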
mydata2$StdResid <- round(rstandard(fit1), 3) #Standardized residuals
mydata2$CooksD <- round(cooks.distance(fit1), 3) #Cook's distances

hist(mydata2$StdResid, 
     xlab = "Standardized residuals", 
     ylab = "Frequency", 
     main = "Histogram of standardized residuals")

- Based on the histogram, we can see a few outliers in the sample (standardized residuals beyond approximately ±3).
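
To quantify this, the potential outliers can be counted and inspected directly (a small sketch using the usual |standardized residual| > 3 rule of thumb):

sum(abs(mydata2$StdResid) > 3) #number of observations with |standardized residual| > 3
mydata2[abs(mydata2$StdResid) > 3, ] #inspect these observations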

Although the sample is large enough, we will still check the normality of the residuals with the Shapiro-Wilk test, for educational purposes.

shapiro.test(mydata2$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata2$StdResid
## W = 0.96455, p-value = 9.577e-06

Based on the sample data we can reject the null hypothesis (p < 0.001): the errors are not normally distributed. However, since the sample size is large enough, this is not a serious problem, because the sampling distribution of the coefficient estimates is still approximately normal.
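
As a complementary visual check (not part of the original analysis), a normal Q-Q plot of the standardized residuals can be drawn:

qqnorm(mydata2$StdResid, main = "Normal Q-Q plot of standardized residuals")
qqline(mydata2$StdResid) #reference line; points close to the line suggest approximate normality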

hist(mydata2$CooksD, 
     xlab = "Cook's distance", 
     ylab = "Frequency", 
     main = "Histogram of Cook's distances")

- All Cook's distance values are below 1, but there is a clear gap between the bulk of the units and a few with much larger values, meaning that some highly influential units should be removed.
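
Besides the absolute cutoff of 1, a common rule of thumb flags observations with Cook's distance above 4/n; as a sketch:

cutoff <- 4 / nrow(mydata2) #rule-of-thumb threshold for Cook's distance
which(mydata2$CooksD > cutoff) #row numbers of potentially influential observations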

Creating an ID variable to make it easier to exclude units later

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mydata2 <- mydata2 %>%
  mutate(ID = row_number())

head(mydata2,3)
##   total_bill  tip    sex size sexF StdResid CooksD ID
## 1      16.99 1.01 Female    2    F   -1.621  0.008  1
## 2      10.34 1.66   Male    3    M   -0.531  0.001  2
## 3      21.01 3.50   Male    3    M    0.311  0.000  3
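
The same ID variable could also be created in base R, without dplyr (an equivalent alternative):

mydata2$ID <- seq_len(nrow(mydata2)) #row number used as ID
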
head(mydata2[order(mydata2$StdResid),], 3) #Three units with lowest value of stand. residuals
##     total_bill  tip    sex size sexF StdResid CooksD  ID
## 238      32.83 1.17   Male    2    M   -2.918  0.061 238
## 103      44.30 2.50 Female    3    F   -2.917  0.129 103
## 188      30.46 2.00   Male    5    M   -2.452  0.051 188
head(mydata2[order(-mydata2$StdResid),], 5) #five units with highest value of stand. residuals
##     total_bill   tip    sex size sexF StdResid CooksD  ID
## 171      50.81 10.00   Male    3    M    4.134  0.328 171
## 173       7.25  5.15   Male    2    M    3.411  0.049 173
## 213      48.33  9.00   Male    4    M    3.112  0.122 213
## 184      23.17  6.50   Male    4    M    2.901  0.037 184
## 215      28.17  6.50 Female    3    F    2.605  0.029 215
head(mydata2[order(-mydata2$CooksD),], 6) #Six units with highest value of Cook's distance
##     total_bill   tip    sex size sexF StdResid CooksD  ID
## 171      50.81 10.00   Male    3    M    4.134  0.328 171
## 103      44.30  2.50 Female    3    F   -2.917  0.129 103
## 213      48.33  9.00   Male    4    M    3.112  0.122 213
## 238      32.83  1.17   Male    2    M   -2.918  0.061 238
## 188      30.46  2.00   Male    5    M   -2.452  0.051 188
## 183      45.35  3.50   Male    3    M   -1.966  0.050 183

The most influential unit is ID 171, which is also an outlier; it will be removed, together with units ID 173 and ID 213, which also have large standardized residuals and relatively high Cook's distances. Removing these units closes the gap.

library(dplyr)
mydata2 <- mydata2 %>%
  filter(!ID %in% c(171, 173, 213))
fit1 <- lm(tip ~ total_bill + size + sexF, 
           data = mydata2) 
mydata2$StdResid <- round(rstandard(fit1), 3) 
mydata2$CooksD <- round(cooks.distance(fit1), 3) 

hist(mydata2$StdResid, 
     xlab = "Standardized residuals", 
     ylab = "Frequency", 
     main = "Histogram of standardized residuals")

- It seems there is one more outlier, ID 184; we will remove it as well.

library(dplyr)
mydata2 <- mydata2 %>%
  filter(!ID %in% c(184))
fit1 <- lm(tip ~ total_bill + size + sexF, 
           data = mydata2) 
mydata2$StdResid <- round(rstandard(fit1), 3) #Re-compute standardized residuals from the refitted model
mydata2$StdFitted <- scale(fit1$fitted.values) #Standardized fitted values

library(car)
scatterplot(y = mydata2$StdResid, x = mydata2$StdFitted,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE)

library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(fit1)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##              Data               
##  -------------------------------
##  Response : tip 
##  Variables: fitted values of tip 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    45.27042 
##  Prob > Chi2   =    1.716215e-11

Based on the sample data we can reject the null hypothesis (p < 0.001): homoskedasticity is violated. We therefore use heteroskedasticity-robust standard errors.

library(estimatr)
fit2 <- lm_robust(tip ~ total_bill + size + sexF,  
                  se_type = "HC1",
                  data = mydata2)


summary(fit2)
## 
## Call:
## lm_robust(formula = tip ~ total_bill + size + sexF, data = mydata2, 
##     se_type = "HC1")
## 
## Standard error type:  HC1 
## 
## Coefficients:
##             Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
## (Intercept)  0.73949    0.18574  3.9812 9.132e-05  0.37356   1.1054 236
## total_bill   0.08096    0.01184  6.8359 6.901e-11  0.05763   0.1043 236
## size         0.22022    0.09638  2.2849 2.321e-02  0.03034   0.4101 236
## sexFF        0.09034    0.11756  0.7684 4.430e-01 -0.14127   0.3220 236
## 
## Multiple R-squared:  0.452 , Adjusted R-squared:  0.445 
## F-statistic: 43.75 on 3 and 236 DF,  p-value: < 2.2e-16
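
The same HC1 robust standard errors can also be obtained from the original lm fit with the sandwich and lmtest packages (a sketch, assuming both packages are installed; the results should closely match lm_robust):

library(sandwich)
library(lmtest)
coeftest(fit1, vcov. = vcovHC(fit1, type = "HC1")) #coefficient tests with HC1 robust standard errors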

Explanation of the coefficients

- b1 (total_bill) = 0.081: holding size and sex constant, each additional dollar on the bill increases the expected tip by about 0.08 dollars.
- b2 (size) = 0.220: holding the other variables constant, each additional person in the party increases the expected tip by about 0.22 dollars.
- b3 (sexF: F) = 0.090: holding the other variables constant, female payers leave on average about 0.09 dollars more in tips than male payers, but this difference is not statistically significant (p = 0.443).
- b0 (intercept) = 0.739: the expected tip when all explanatory variables equal zero; it has no meaningful practical interpretation here.

Multiple correlation coefficient

The multiple correlation coefficient is the square root of R-squared and measures the strength of the linear association between the dependent variable and the set of explanatory variables taken together.

sqrt(summary(fit2)$r.squared)
## [1] 0.6723104
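
The multiple correlation coefficient also equals the correlation between the observed and fitted values of tip; as a quick check using fit1 (which has the same point estimates as fit2):

cor(mydata2$tip, fitted(fit1)) #should reproduce the value above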

Test of significance of regression