Hi all! In this section of predictive analysis, we will first revisit linear regression.

Download lanes.csv here

Supermarket waiting time 1

d <- read.csv("lanes.csv")
str(d)
## 'data.frame':    12 obs. of  5 variables:
##  $ Date        : Factor w/ 12 levels "14 Dec","15 Dec",..: 5 6 8 10 11 12 1 2 3 4 ...
##  $ Weather     : Factor w/ 4 levels "Cloudy","Rain",..: 4 1 4 2 4 4 1 1 1 2 ...
##  $ Lanes       : int  5 12 11 7 12 8 6 10 8 6 ...
##  $ Avg.Wait    : num  12.2 4.2 4.4 6.75 3.8 5.75 10.4 6.5 6.25 9.2 ...
##  $ Extra.Salary: int  213 451 444 256 498 301 229 364 291 253 ...
# q1
# y = Avg.Wait x = Lanes

# q2
plot(d$Lanes, d$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")

# q3
# (4, 1.1) is outlier

# q4
# It was snowing that day and does not give an accurate prediction on non-snowing days.

# q5
d0 <- d[d$Weather != "Snow",]
fit0 <- lm(Avg.Wait ~ Lanes, data = d0)
plot(d0$Lanes, d0$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")
abline(fit0, col = "red")

fit0
## 
## Call:
## lm(formula = Avg.Wait ~ Lanes, data = d0)
## 
## Coefficients:
## (Intercept)        Lanes  
##     14.9621      -0.9627
# y = 14.9621 - 0.9627x

# q6
# x increase y decrease

# q7
predict(fit0, data.frame(Lanes = c(6,11)))
##        1        2 
## 9.185778 4.372156
# (6,9.18), (11, 4.37)

# q8
summary(fit0)
## 
## Call:
## lm(formula = Avg.Wait ~ Lanes, data = d0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66033 -1.24169  0.02784  0.97784  2.05150 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.9621     1.5036   9.951 3.73e-06 ***
## Lanes        -0.9627     0.1714  -5.618 0.000327 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.335 on 9 degrees of freedom
## Multiple R-squared:  0.7781, Adjusted R-squared:  0.7535 
## F-statistic: 31.57 on 1 and 9 DF,  p-value: 0.0003265
# q9
cor(d0$Lanes, d0$Avg.Wait)
## [1] -0.8821232
# q10
summary(fit0)$r.squared; summary(fit0)$adj.r.squared
## [1] 0.7781413
## [1] 0.7534904
# q11 & 12
predict(fit0, data.frame(Lanes = c(9,7)))
##        1        2 
## 6.297605 8.223054
# q13
d0[d0$Lanes == 7,]$Avg.Wait - predict(fit0, data.frame(Lanes = 7))
##         1 
## -1.473054
# q14 & q 15
plot(d0$Lanes, d0$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")
abline(fit0, col = "red")
abline(h = c(4.5,10), col = "blue")

# 11 lanes & 6 lanes

# q16 & 17
predict(fit0, data.frame(Lanes = c(1,20)))
##         1         2 
## 13.999401 -4.292365
# q18
# out of range
range(d0$Lanes)
## [1]  5 12
# q19
fit1 <- lm(Avg.Wait ~ Weather, data = d0)
summary(fit1)
## 
## Call:
## lm(formula = Avg.Wait ~ Weather, data = d0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6375 -1.5875 -0.6000  0.4437  5.8500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   6.8375     1.4637   4.671   0.0016 **
## WeatherRain   1.1375     2.5352   0.449   0.6656   
## WeatherSun   -0.4875     1.9638  -0.248   0.8102   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.927 on 8 degrees of freedom
## Multiple R-squared:  0.05217,    Adjusted R-squared:  -0.1848 
## F-statistic: 0.2202 on 2 and 8 DF,  p-value: 0.8071
plot(d0$Weather, d0$Avg.Wait, main = "Avg.Wait Time(min) against Weather")

# however not much significance

Supermarket waiting time 2

# q20
# y = Lanes, x = Extra.Salary

# q21
plot(d0$Extra.Salary, d0$Lanes, pch = 19, main = "Lanes against Extra Salary")

# q22
abline(lm(Lanes ~ Extra.Salary, data = d0), col = "red")

fit2 <- lm(Lanes ~ Extra.Salary, data = d0)
fit2
## 
## Call:
## lm(formula = Lanes ~ Extra.Salary, data = d0)
## 
## Coefficients:
##  (Intercept)  Extra.Salary  
##      0.28831       0.02481
# y = 14.51 - 0.02x

# q23
# Lanes decrease Extra.Salary increase

# q24
# (0, 14.51) (1, 14.49)

# q25
summary(fit2)
## 
## Call:
## lm(formula = Lanes ~ Extra.Salary, data = d0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64594 -0.43617  0.02916  0.42492  0.67920 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.288313   0.570566   0.505    0.625    
## Extra.Salary 0.024815   0.001669  14.871 1.22e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5137 on 9 degrees of freedom
## Multiple R-squared:  0.9609, Adjusted R-squared:  0.9565 
## F-statistic: 221.1 on 1 and 9 DF,  p-value: 1.216e-07
# q26
cor(d0$Extra.Salary, d0$Lanes)
## [1] 0.9802519
# q27
# let's try manual calculation of r²

m <- mean(d0$Lanes)
sst <- sum((d0$Lanes - m)^2)
sse <- sum((d0$Lanes - fit2$fitted.values)^2)
r2 <- 1-sse/sst
r2
## [1] 0.9608939
n <- 11
adjr2 <- 1-sse/(n-2)/sst*(n-1)
adjr2
## [1] 0.9565488
summary(fit2)$r.square; summary(fit2)$adj.r.square
## [1] 0.9608939
## [1] 0.9565488
# q28 & 29
plot(d0$Extra.Salary, d0$Lanes, pch = 19, main = "Lanes against Extra Salary")
abline(lm(Lanes ~ Extra.Salary, data = d0), col = "red")
abline(h = c(9,7), col = "blue")

# $350 & $270

# q30
270-256
## [1] 14
# q31
floor(predict(fit2, data.frame(Extra.Salary = 500)))
##  1 
## 12
# q32
floor(predict(fit2, data.frame(Extra.Salary = 250)))
## 1 
## 6
# q33
fit2i <- lm(Extra.Salary ~ Lanes, data = d0)
predict(fit2i, data.frame(Lanes = 0))
##       1 
## 1.70509
# q34
predict(fit2i, data.frame(Lanes = 50))
##        1 
## 1937.858
# q35
# out of range
range(d0$Lanes)
## [1]  5 12
# q36
# more lanes reduce avg.wait time but increases extra.salary

# q37
fit3 <- lm(Lanes ~ Avg.Wait + Extra.Salary, data = d0)
summary(fit3)
## 
## Call:
## lm(formula = Lanes ~ Avg.Wait + Extra.Salary, data = d0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48834 -0.36275 -0.05215  0.24075  0.76323 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.761510   1.596744   1.729    0.122    
## Avg.Wait     -0.170401   0.103932  -1.640    0.140    
## Extra.Salary  0.020832   0.002871   7.255 8.76e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4714 on 8 degrees of freedom
## Multiple R-squared:  0.9707, Adjusted R-squared:  0.9634 
## F-statistic: 132.7 on 2 and 8 DF,  p-value: 7.341e-07
summary(fit0)
## 
## Call:
## lm(formula = Avg.Wait ~ Lanes, data = d0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66033 -1.24169  0.02784  0.97784  2.05150 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.9621     1.5036   9.951 3.73e-06 ***
## Lanes        -0.9627     0.1714  -5.618 0.000327 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.335 on 9 degrees of freedom
## Multiple R-squared:  0.7781, Adjusted R-squared:  0.7535 
## F-statistic: 31.57 on 1 and 9 DF,  p-value: 0.0003265
summary(fit2)
## 
## Call:
## lm(formula = Lanes ~ Extra.Salary, data = d0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64594 -0.43617  0.02916  0.42492  0.67920 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.288313   0.570566   0.505    0.625    
## Extra.Salary 0.024815   0.001669  14.871 1.22e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5137 on 9 degrees of freedom
## Multiple R-squared:  0.9609, Adjusted R-squared:  0.9565 
## F-statistic: 221.1 on 1 and 9 DF,  p-value: 1.216e-07
# no information on sales, insufficent data to comment

## additional
fit4 <- lm(Avg.Wait ~ Lanes + I(Lanes^2), data = d0)
newdat <- data.frame(Lanes = seq(min(d0$Lanes), max(d0$Lanes), length.out = 100))
newdat$Avg.Wait <- predict(fit4, newdata = newdat)
plot(d0$Lanes, d0$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")
with(newdat, lines(Lanes, Avg.Wait, col = "blue"))

Return to contents page