Hi all! Here are Neumann’s Workshop 7 codes.

Download ISPG.csv here

Download CocaCola.csv here

Download Hotel.csv here

OmniFoods

Question 1

ISPG <- read.csv("ISPG.csv")
summary(ISPG)

##       Bars          Price         Promotion      ShelfLocation Dispensers
##  Min.   : 675   Min.   :59.00   Min.   :200.0   EndAisle:15    No :17    
##  1st Qu.:2125   1st Qu.:59.00   1st Qu.:200.0   Normal  :19    Yes:17    
##  Median :3430   Median :79.00   Median :400.0                            
##  Mean   :3099   Mean   :77.82   Mean   :388.2                            
##  3rd Qu.:3969   3rd Qu.:99.00   3rd Qu.:600.0                            
##  Max.   :5120   Max.   :99.00   Max.   :600.0

# Output shows ShelfLOcation and Dispensers are considered as categorical variable by R.

plot(ISPG)

levels(ISPG$ShelfLocation)

## [1] "EndAisle" "Normal"

levels(ISPG$Dispensers)

## [1] "No"  "Yes"

# Shows the order of the levels in categorical X. By default, the first level by alphabetical order, is the reference level for dummy coding = 0.

ISPG_Fit <- lm(Bars ~ ., ISPG)
summary(ISPG_Fit)

## 
## Call:
## lm(formula = Bars ~ ., data = ISPG)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1280.33  -251.63   -44.83   277.04   865.29 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6248.2478   529.0322  11.811 1.33e-12 ***
## Price                -53.0447     5.2357 -10.131 4.91e-11 ***
## Promotion              3.5650     0.5652   6.307 6.88e-07 ***
## ShelfLocationNormal -815.3759   169.3097  -4.816 4.23e-05 ***
## DispensersYes        100.3259   180.4650   0.556    0.583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 487.2 on 29 degrees of freedom
## Multiple R-squared:  0.8679, Adjusted R-squared:  0.8496 
## F-statistic: 47.62 on 4 and 29 DF,  p-value: 2.441e-12

# Dispensers is not sig. To remove and rerun.

ISPG_Fit2 <- lm(Bars ~ . - Dispensers, ISPG)
summary(ISPG_Fit2)

## 
## Call:
## lm(formula = Bars ~ . - Dispensers, data = ISPG)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1348.8  -261.1   -58.4   285.6   883.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6357.3609   485.5861  13.092 6.14e-14 ***
## Price                -53.1560     5.1713 -10.279 2.40e-11 ***
## Promotion              3.4476     0.5182   6.653 2.28e-07 ***
## ShelfLocationNormal -823.8005   166.6770  -4.942 2.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 481.5 on 30 degrees of freedom
## Multiple R-squared:  0.8665, Adjusted R-squared:  0.8531 
## F-statistic: 64.89 on 3 and 30 DF,  p-value: 3.207e-13

library(car)

vif(ISPG_Fit2)

##         Price     Promotion ShelfLocation 
##      1.009460      1.013686      1.004268

# VIF for all X less than 5. No multicollinearity problem.

plot(ISPG_Fit2)

# Residuals plot shows no serious violation of assumptions.
# No influential outliers detected.

#Q2
ISPG_Fit3 <- lm(Bars ~ ShelfLocation + Dispensers, ISPG)
summary(ISPG_Fit3)

## 
## Call:
## lm(formula = Bars ~ ShelfLocation + Dispensers, data = ISPG)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2176.0  -932.0   204.9   947.6  2164.0 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3781.7      378.2   9.999 3.21e-11 ***
## ShelfLocationNormal   -930.7      413.3  -2.252   0.0316 *  
## DispensersYes         -325.9      410.4  -0.794   0.4333    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1195 on 31 degrees of freedom
## Multiple R-squared:  0.1509, Adjusted R-squared:  0.09612 
## F-statistic: 2.755 on 2 and 31 DF,  p-value: 0.07923

# Fitting both SHelf Location and Dispensers, Dispensers is still insignificant.

Coca Cola

Question 2

coca <- read.csv("CocaCola.csv")

coca$codedYear <- coca$Year - 1995

plot(coca$codedYear, coca$Revenues)

# simple linear regression
cocaFit1 <- lm(Revenues ~ codedYear, coca)
summary(cocaFit1)

## 
## Call:
## lm(formula = Revenues ~ codedYear, data = coca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8067 -2.0092  0.0533  1.7508  4.0033 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  16.0017     1.0641  15.038 1.34e-09 ***
## codedYear     0.9150     0.1294   7.074 8.37e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.165 on 13 degrees of freedom
## Multiple R-squared:  0.7938, Adjusted R-squared:  0.7779 
## F-statistic: 50.04 on 1 and 13 DF,  p-value: 8.372e-06

abline(cocaFit1)
lines(coca$codedYear, predict(cocaFit1), col = "blue")

plot(cocaFit1)

# polynomial regression
cocaFit2 <- lm(Revenues ~ codedYear + I(codedYear^2), coca)
summary(cocaFit2)

## 
## Call:
## lm(formula = Revenues ~ codedYear + I(codedYear^2), data = coca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6953 -0.9027 -0.1802  0.7729  2.2397 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    19.08794    0.83954  22.736 3.11e-11 ***
## codedYear      -0.50943    0.27835  -1.830 0.092154 .  
## I(codedYear^2)  0.10175    0.01917   5.306 0.000186 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.232 on 12 degrees of freedom
## Multiple R-squared:  0.9384, Adjusted R-squared:  0.9281 
## F-statistic: 91.36 on 2 and 12 DF,  p-value: 5.478e-08

plot(coca$codedYear, coca$Revenues)
lines(coca$codedYear, predict(cocaFit2), col = "red")

plot(cocaFit2)

predict(cocaFit1, data.frame(codedYear = 15))

##        1 
## 29.72667

predict(cocaFit1, data.frame(codedYear = 15),
        interval = "predict")

##        fit      lwr      upr
## 1 29.72667 24.40482 35.04851

predict(cocaFit2, data.frame(codedYear = 15))

##        1 
## 34.33912

predict(cocaFit2, data.frame(codedYear = 15),
        interval = "predict")

##        fit      lwr      upr
## 1 34.33912 30.74569 37.93255

Hotel

Question 3

hotel <- read.csv("Hotel.csv")
summary(hotel)

##       Sat            TimeDiff         Prev    
##  Min.   :0.0000   Min.   :0.80   Min.   :0.0  
##  1st Qu.:0.0000   1st Qu.:2.55   1st Qu.:0.0  
##  Median :1.0000   Median :3.80   Median :1.0  
##  Mean   :0.6667   Mean   :3.67   Mean   :0.6  
##  3rd Qu.:1.0000   3rd Qu.:4.50   3rd Qu.:1.0  
##  Max.   :1.0000   Max.   :6.10   Max.   :1.0

# Output shows that Sat and Prev are treated as continuous.

hotel$Sat <- factor(hotel$Sat, levels = c(0, 1), labels = c("unfavorable", "favorable"))

hotel$Prev <- factor(hotel$Prev, levels = c(0, 1),
                     labels = c("No", "Yes"))

summary(hotel)

##           Sat        TimeDiff     Prev   
##  unfavorable:10   Min.   :0.80   No :12  
##  favorable  :20   1st Qu.:2.55   Yes:18  
##                   Median :3.80           
##                   Mean   :3.67           
##                   3rd Qu.:4.50           
##                   Max.   :6.10

# After using factor command, Sat and Prev are treated as categorical.

hotelFit <- glm(Sat ~ TimeDiff + Prev, family = binomial(), data = hotel)
summary(hotelFit)

## 
## Call:
## glm(formula = Sat ~ TimeDiff + Prev, family = binomial(), data = hotel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8800  -0.1371   0.2064   0.4374   2.0820  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   8.0521     3.2501   2.477   0.0132 *
## TimeDiff     -2.2440     0.9014  -2.489   0.0128 *
## PrevYes       2.5037     1.5944   1.570   0.1163  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.191  on 29  degrees of freedom
## Residual deviance: 18.282  on 27  degrees of freedom
## AIC: 24.282
## 
## Number of Fisher Scoring iterations: 6

# Only TimeDiff is sig.


hotelFit2 <- glm(Sat ~ TimeDiff, family = binomial(), data = hotel)
summary(hotelFit2)

## 
## Call:
## glm(formula = Sat ~ TimeDiff, family = binomial(), data = hotel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2240  -0.3641   0.1695   0.6345   1.1912  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   8.3353     3.1504   2.646  0.00815 **
## TimeDiff     -1.8595     0.7302  -2.546  0.01088 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.191  on 29  degrees of freedom
## Residual deviance: 21.418  on 28  degrees of freedom
## AIC: 25.418
## 
## Number of Fisher Scoring iterations: 6

predict(hotelFit2, data.frame(TimeDiff = 3), type = "response")

##         1 
## 0.9403002

prob <- predict(hotelFit2, type = 'response')

Table <- data.frame(hotel$TimeDiff, prob)
plot(Table$hotel.TimeDiff, prob)

abline(h=0.5, col="red")

# Adds a horizontal line at y-axis [prob] = 0.5
# When time diff = 4.5 mins or more, then prob of satisfied is 0.5 or less.

Workshop 7: Regression Modelling Using R

Chng Yan Hao

3 to 7 April 2017

Hi all! Here are Neumann’s Workshop 7 codes.

Download ISPG.csv here

Download CocaCola.csv here

Download Hotel.csv here

OmniFoods

Question 1

Coca Cola

Question 2

Hotel

Question 3

Return to contents page