Hi all! In this section of predictive analysis, we will first revisit linear regression.
Supermarket waiting time 1
d <- read.csv("lanes.csv")
str(d)
## 'data.frame': 12 obs. of 5 variables:
## $ Date : Factor w/ 12 levels "14 Dec","15 Dec",..: 5 6 8 10 11 12 1 2 3 4 ...
## $ Weather : Factor w/ 4 levels "Cloudy","Rain",..: 4 1 4 2 4 4 1 1 1 2 ...
## $ Lanes : int 5 12 11 7 12 8 6 10 8 6 ...
## $ Avg.Wait : num 12.2 4.2 4.4 6.75 3.8 5.75 10.4 6.5 6.25 9.2 ...
## $ Extra.Salary: int 213 451 444 256 498 301 229 364 291 253 ...
# q1
# y = Avg.Wait x = Lanes
# q2
plot(d$Lanes, d$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")

# q3
# (4, 1.1) is outlier
# q4
# It was snowing that day and does not give an accurate prediction on non-snowing days.
# q5
d0 <- d[d$Weather != "Snow",]
fit0 <- lm(Avg.Wait ~ Lanes, data = d0)
plot(d0$Lanes, d0$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")
abline(fit0, col = "red")

fit0
##
## Call:
## lm(formula = Avg.Wait ~ Lanes, data = d0)
##
## Coefficients:
## (Intercept) Lanes
## 14.9621 -0.9627
# y = 14.9621 - 0.9627x
# q6
# x increase y decrease
# q7
predict(fit0, data.frame(Lanes = c(6,11)))
## 1 2
## 9.185778 4.372156
# (6,9.18), (11, 4.37)
# q8
summary(fit0)
##
## Call:
## lm(formula = Avg.Wait ~ Lanes, data = d0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66033 -1.24169 0.02784 0.97784 2.05150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.9621 1.5036 9.951 3.73e-06 ***
## Lanes -0.9627 0.1714 -5.618 0.000327 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.335 on 9 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.7535
## F-statistic: 31.57 on 1 and 9 DF, p-value: 0.0003265
# q9
cor(d0$Lanes, d0$Avg.Wait)
## [1] -0.8821232
# q10
summary(fit0)$r.squared; summary(fit0)$adj.r.squared
## [1] 0.7781413
## [1] 0.7534904
# q11 & 12
predict(fit0, data.frame(Lanes = c(9,7)))
## 1 2
## 6.297605 8.223054
# q13
d0[d0$Lanes == 7,]$Avg.Wait - predict(fit0, data.frame(Lanes = 7))
## 1
## -1.473054
# q14 & q 15
plot(d0$Lanes, d0$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")
abline(fit0, col = "red")
abline(h = c(4.5,10), col = "blue")

# 11 lanes & 6 lanes
# q16 & 17
predict(fit0, data.frame(Lanes = c(1,20)))
## 1 2
## 13.999401 -4.292365
# q18
# out of range
range(d0$Lanes)
## [1] 5 12
# q19
fit1 <- lm(Avg.Wait ~ Weather, data = d0)
summary(fit1)
##
## Call:
## lm(formula = Avg.Wait ~ Weather, data = d0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6375 -1.5875 -0.6000 0.4437 5.8500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8375 1.4637 4.671 0.0016 **
## WeatherRain 1.1375 2.5352 0.449 0.6656
## WeatherSun -0.4875 1.9638 -0.248 0.8102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.927 on 8 degrees of freedom
## Multiple R-squared: 0.05217, Adjusted R-squared: -0.1848
## F-statistic: 0.2202 on 2 and 8 DF, p-value: 0.8071
plot(d0$Weather, d0$Avg.Wait, main = "Avg.Wait Time(min) against Weather")

# however not much significance
Supermarket waiting time 2
# q20
# y = Lanes, x = Extra.Salary
# q21
plot(d0$Extra.Salary, d0$Lanes, pch = 19, main = "Lanes against Extra Salary")
# q22
abline(lm(Lanes ~ Extra.Salary, data = d0), col = "red")

fit2 <- lm(Lanes ~ Extra.Salary, data = d0)
fit2
##
## Call:
## lm(formula = Lanes ~ Extra.Salary, data = d0)
##
## Coefficients:
## (Intercept) Extra.Salary
## 0.28831 0.02481
# y = 14.51 - 0.02x
# q23
# Lanes decrease Extra.Salary increase
# q24
# (0, 14.51) (1, 14.49)
# q25
summary(fit2)
##
## Call:
## lm(formula = Lanes ~ Extra.Salary, data = d0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64594 -0.43617 0.02916 0.42492 0.67920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.288313 0.570566 0.505 0.625
## Extra.Salary 0.024815 0.001669 14.871 1.22e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5137 on 9 degrees of freedom
## Multiple R-squared: 0.9609, Adjusted R-squared: 0.9565
## F-statistic: 221.1 on 1 and 9 DF, p-value: 1.216e-07
# q26
cor(d0$Extra.Salary, d0$Lanes)
## [1] 0.9802519
# q27
# let's try manual calculation of r²
m <- mean(d0$Lanes)
sst <- sum((d0$Lanes - m)^2)
sse <- sum((d0$Lanes - fit2$fitted.values)^2)
r2 <- 1-sse/sst
r2
## [1] 0.9608939
n <- 11
adjr2 <- 1-sse/(n-2)/sst*(n-1)
adjr2
## [1] 0.9565488
summary(fit2)$r.square; summary(fit2)$adj.r.square
## [1] 0.9608939
## [1] 0.9565488
# q28 & 29
plot(d0$Extra.Salary, d0$Lanes, pch = 19, main = "Lanes against Extra Salary")
abline(lm(Lanes ~ Extra.Salary, data = d0), col = "red")
abline(h = c(9,7), col = "blue")

# $350 & $270
# q30
270-256
## [1] 14
# q31
floor(predict(fit2, data.frame(Extra.Salary = 500)))
## 1
## 12
# q32
floor(predict(fit2, data.frame(Extra.Salary = 250)))
## 1
## 6
# q33
fit2i <- lm(Extra.Salary ~ Lanes, data = d0)
predict(fit2i, data.frame(Lanes = 0))
## 1
## 1.70509
# q34
predict(fit2i, data.frame(Lanes = 50))
## 1
## 1937.858
# q35
# out of range
range(d0$Lanes)
## [1] 5 12
# q36
# more lanes reduce avg.wait time but increases extra.salary
# q37
fit3 <- lm(Lanes ~ Avg.Wait + Extra.Salary, data = d0)
summary(fit3)
##
## Call:
## lm(formula = Lanes ~ Avg.Wait + Extra.Salary, data = d0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48834 -0.36275 -0.05215 0.24075 0.76323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.761510 1.596744 1.729 0.122
## Avg.Wait -0.170401 0.103932 -1.640 0.140
## Extra.Salary 0.020832 0.002871 7.255 8.76e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4714 on 8 degrees of freedom
## Multiple R-squared: 0.9707, Adjusted R-squared: 0.9634
## F-statistic: 132.7 on 2 and 8 DF, p-value: 7.341e-07
summary(fit0)
##
## Call:
## lm(formula = Avg.Wait ~ Lanes, data = d0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66033 -1.24169 0.02784 0.97784 2.05150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.9621 1.5036 9.951 3.73e-06 ***
## Lanes -0.9627 0.1714 -5.618 0.000327 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.335 on 9 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.7535
## F-statistic: 31.57 on 1 and 9 DF, p-value: 0.0003265
summary(fit2)
##
## Call:
## lm(formula = Lanes ~ Extra.Salary, data = d0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64594 -0.43617 0.02916 0.42492 0.67920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.288313 0.570566 0.505 0.625
## Extra.Salary 0.024815 0.001669 14.871 1.22e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5137 on 9 degrees of freedom
## Multiple R-squared: 0.9609, Adjusted R-squared: 0.9565
## F-statistic: 221.1 on 1 and 9 DF, p-value: 1.216e-07
# no information on sales, insufficent data to comment
## additional
fit4 <- lm(Avg.Wait ~ Lanes + I(Lanes^2), data = d0)
newdat <- data.frame(Lanes = seq(min(d0$Lanes), max(d0$Lanes), length.out = 100))
newdat$Avg.Wait <- predict(fit4, newdata = newdat)
plot(d0$Lanes, d0$Avg.Wait, pch = 19, main = "Avg.Wait Time(min) against Lanes")
with(newdat, lines(Lanes, Avg.Wait, col = "blue"))
