Hi all! Here are Neumann’s Workshop 7 codes.
Download CocaCola.csv here
OmniFoods
Question 1
ISPG <- read.csv("ISPG.csv")
summary(ISPG)
## Bars Price Promotion ShelfLocation Dispensers
## Min. : 675 Min. :59.00 Min. :200.0 EndAisle:15 No :17
## 1st Qu.:2125 1st Qu.:59.00 1st Qu.:200.0 Normal :19 Yes:17
## Median :3430 Median :79.00 Median :400.0
## Mean :3099 Mean :77.82 Mean :388.2
## 3rd Qu.:3969 3rd Qu.:99.00 3rd Qu.:600.0
## Max. :5120 Max. :99.00 Max. :600.0
# Output shows ShelfLOcation and Dispensers are considered as categorical variable by R.
plot(ISPG)
levels(ISPG$ShelfLocation)
## [1] "EndAisle" "Normal"
levels(ISPG$Dispensers)
## [1] "No" "Yes"
# Shows the order of the levels in categorical X. By default, the first level by alphabetical order, is the reference level for dummy coding = 0.
ISPG_Fit <- lm(Bars ~ ., ISPG)
summary(ISPG_Fit)
##
## Call:
## lm(formula = Bars ~ ., data = ISPG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1280.33 -251.63 -44.83 277.04 865.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6248.2478 529.0322 11.811 1.33e-12 ***
## Price -53.0447 5.2357 -10.131 4.91e-11 ***
## Promotion 3.5650 0.5652 6.307 6.88e-07 ***
## ShelfLocationNormal -815.3759 169.3097 -4.816 4.23e-05 ***
## DispensersYes 100.3259 180.4650 0.556 0.583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 487.2 on 29 degrees of freedom
## Multiple R-squared: 0.8679, Adjusted R-squared: 0.8496
## F-statistic: 47.62 on 4 and 29 DF, p-value: 2.441e-12
# Dispensers is not sig. To remove and rerun.
ISPG_Fit2 <- lm(Bars ~ . - Dispensers, ISPG)
summary(ISPG_Fit2)
##
## Call:
## lm(formula = Bars ~ . - Dispensers, data = ISPG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1348.8 -261.1 -58.4 285.6 883.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6357.3609 485.5861 13.092 6.14e-14 ***
## Price -53.1560 5.1713 -10.279 2.40e-11 ***
## Promotion 3.4476 0.5182 6.653 2.28e-07 ***
## ShelfLocationNormal -823.8005 166.6770 -4.942 2.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 481.5 on 30 degrees of freedom
## Multiple R-squared: 0.8665, Adjusted R-squared: 0.8531
## F-statistic: 64.89 on 3 and 30 DF, p-value: 3.207e-13
library(car)

vif(ISPG_Fit2)
## Price Promotion ShelfLocation
## 1.009460 1.013686 1.004268
# VIF for all X less than 5. No multicollinearity problem.
plot(ISPG_Fit2)




# Residuals plot shows no serious violation of assumptions.
# No influential outliers detected.
#Q2
ISPG_Fit3 <- lm(Bars ~ ShelfLocation + Dispensers, ISPG)
summary(ISPG_Fit3)
##
## Call:
## lm(formula = Bars ~ ShelfLocation + Dispensers, data = ISPG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2176.0 -932.0 204.9 947.6 2164.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3781.7 378.2 9.999 3.21e-11 ***
## ShelfLocationNormal -930.7 413.3 -2.252 0.0316 *
## DispensersYes -325.9 410.4 -0.794 0.4333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1195 on 31 degrees of freedom
## Multiple R-squared: 0.1509, Adjusted R-squared: 0.09612
## F-statistic: 2.755 on 2 and 31 DF, p-value: 0.07923
# Fitting both SHelf Location and Dispensers, Dispensers is still insignificant.
Coca Cola
Question 2
coca <- read.csv("CocaCola.csv")
coca$codedYear <- coca$Year - 1995
plot(coca$codedYear, coca$Revenues)
# simple linear regression
cocaFit1 <- lm(Revenues ~ codedYear, coca)
summary(cocaFit1)
##
## Call:
## lm(formula = Revenues ~ codedYear, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8067 -2.0092 0.0533 1.7508 4.0033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.0017 1.0641 15.038 1.34e-09 ***
## codedYear 0.9150 0.1294 7.074 8.37e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.165 on 13 degrees of freedom
## Multiple R-squared: 0.7938, Adjusted R-squared: 0.7779
## F-statistic: 50.04 on 1 and 13 DF, p-value: 8.372e-06
abline(cocaFit1)
lines(coca$codedYear, predict(cocaFit1), col = "blue")

plot(cocaFit1)




# polynomial regression
cocaFit2 <- lm(Revenues ~ codedYear + I(codedYear^2), coca)
summary(cocaFit2)
##
## Call:
## lm(formula = Revenues ~ codedYear + I(codedYear^2), data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6953 -0.9027 -0.1802 0.7729 2.2397
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.08794 0.83954 22.736 3.11e-11 ***
## codedYear -0.50943 0.27835 -1.830 0.092154 .
## I(codedYear^2) 0.10175 0.01917 5.306 0.000186 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.232 on 12 degrees of freedom
## Multiple R-squared: 0.9384, Adjusted R-squared: 0.9281
## F-statistic: 91.36 on 2 and 12 DF, p-value: 5.478e-08
plot(coca$codedYear, coca$Revenues)
lines(coca$codedYear, predict(cocaFit2), col = "red")

plot(cocaFit2)




predict(cocaFit1, data.frame(codedYear = 15))
## 1
## 29.72667
predict(cocaFit1, data.frame(codedYear = 15),
interval = "predict")
## fit lwr upr
## 1 29.72667 24.40482 35.04851
predict(cocaFit2, data.frame(codedYear = 15))
## 1
## 34.33912
predict(cocaFit2, data.frame(codedYear = 15),
interval = "predict")
## fit lwr upr
## 1 34.33912 30.74569 37.93255
Hotel
Question 3
hotel <- read.csv("Hotel.csv")
summary(hotel)
## Sat TimeDiff Prev
## Min. :0.0000 Min. :0.80 Min. :0.0
## 1st Qu.:0.0000 1st Qu.:2.55 1st Qu.:0.0
## Median :1.0000 Median :3.80 Median :1.0
## Mean :0.6667 Mean :3.67 Mean :0.6
## 3rd Qu.:1.0000 3rd Qu.:4.50 3rd Qu.:1.0
## Max. :1.0000 Max. :6.10 Max. :1.0
# Output shows that Sat and Prev are treated as continuous.
hotel$Sat <- factor(hotel$Sat, levels = c(0, 1), labels = c("unfavorable", "favorable"))
hotel$Prev <- factor(hotel$Prev, levels = c(0, 1),
labels = c("No", "Yes"))
summary(hotel)
## Sat TimeDiff Prev
## unfavorable:10 Min. :0.80 No :12
## favorable :20 1st Qu.:2.55 Yes:18
## Median :3.80
## Mean :3.67
## 3rd Qu.:4.50
## Max. :6.10
# After using factor command, Sat and Prev are treated as categorical.
hotelFit <- glm(Sat ~ TimeDiff + Prev, family = binomial(), data = hotel)
summary(hotelFit)
##
## Call:
## glm(formula = Sat ~ TimeDiff + Prev, family = binomial(), data = hotel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8800 -0.1371 0.2064 0.4374 2.0820
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.0521 3.2501 2.477 0.0132 *
## TimeDiff -2.2440 0.9014 -2.489 0.0128 *
## PrevYes 2.5037 1.5944 1.570 0.1163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 38.191 on 29 degrees of freedom
## Residual deviance: 18.282 on 27 degrees of freedom
## AIC: 24.282
##
## Number of Fisher Scoring iterations: 6
# Only TimeDiff is sig.
hotelFit2 <- glm(Sat ~ TimeDiff, family = binomial(), data = hotel)
summary(hotelFit2)
##
## Call:
## glm(formula = Sat ~ TimeDiff, family = binomial(), data = hotel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2240 -0.3641 0.1695 0.6345 1.1912
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.3353 3.1504 2.646 0.00815 **
## TimeDiff -1.8595 0.7302 -2.546 0.01088 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 38.191 on 29 degrees of freedom
## Residual deviance: 21.418 on 28 degrees of freedom
## AIC: 25.418
##
## Number of Fisher Scoring iterations: 6
predict(hotelFit2, data.frame(TimeDiff = 3), type = "response")
## 1
## 0.9403002
prob <- predict(hotelFit2, type = 'response')
Table <- data.frame(hotel$TimeDiff, prob)
plot(Table$hotel.TimeDiff, prob)
abline(h=0.5, col="red")

# Adds a horizontal line at y-axis [prob] = 0.5
# When time diff = 4.5 mins or more, then prob of satisfied is 0.5 or less.