This shows the outputs from Chapter 5 using R. The textbook is “A Modern Approach to Regression with R” by Simon J. Sheather (2008). The R code provided with the book has been updated.
profsalary <- read_excel("MARData.xlsx", sheet = "profsalary")
attach(profsalary)
head(profsalary, 10)
## # A tibble: 10 x 3
## Case Salary Experience
## <dbl> <dbl> <dbl>
## 1 1 71 26
## 2 2 69 19
## 3 3 73 22
## 4 4 69 17
## 5 5 65 13
## 6 6 75 25
## 7 7 66 35
## 8 8 66 16
## 9 9 67 16
## 10 10 69 16
#Figure 5.1 on page 126
plot(Experience,Salary,xlab = "Years of Experience")
#Figure 5.2 on page 127
m1 <- lm(Salary~Experience)
leverage1 <- hatvalues(m1)
StanRes1 <- rstandard(m1)
ExperienceNew <- seq(0,37,len = 37)
plot(Experience,StanRes1,xlab = "Years of Experience", ylab = "Standardized Residuals")
#Figure 5.3 on page 127
m2 <- lm(Salary~Experience + I(Experience^2))
plot(Experience,Salary,xlab = "Years of Experience")
ExperienceNew <- seq(0,37,len = 37)
lines(ExperienceNew,predict(m2,newdata = data.frame(Experience = ExperienceNew)))
#Figure 5.4 on page 128
StanRes2 <- rstandard(m2)
plot(Experience,StanRes2,xlab = "Years of Experience", ylab = "Standardized Residuals")
#Figure 5.5 on page 128
leverage2 <- hatvalues(m2)
plot(Experience,leverage2,xlab = "Years of Experience",ylab = "Leverage")
abline(h = 6/max(Case),lty = 2)
#Figure 5.6 on page 129
par(mfrow = c(2,2))
plot(m2)
#Regression output on pages 129 & 130
summary(m2)
##
## Call:
## lm(formula = Salary ~ Experience + I(Experience^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5786 -2.3573 0.0957 2.0171 5.5176
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.720498 0.828724 41.90 <2e-16 ***
## Experience 2.872275 0.095697 30.01 <2e-16 ***
## I(Experience^2) -0.053316 0.002477 -21.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.817 on 140 degrees of freedom
## Multiple R-squared: 0.9247, Adjusted R-squared: 0.9236
## F-statistic: 859.3 on 2 and 140 DF, p-value: < 2.2e-16
predict(m2,newdata = data.frame(Experience = c(10)),interval = "prediction",level = 0.95)
## fit lwr upr
## 1 58.11164 52.50481 63.71847
detach(profsalary)
nyc <- read_excel("MARData.xlsx", sheet = "nyc")
attach(nyc)
head(nyc, 10)
## # A tibble: 10 x 7
## Case Restaurant Price Food Decor Service East
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Daniella Ristorante 43 22 18 20 0
## 2 2 Tello's Ristorante 32 20 19 19 0
## 3 3 Biricchino 34 21 13 18 0
## 4 4 Bottino 41 20 20 17 0
## 5 5 Da Umberto 54 24 19 21 0
## 6 6 Le Madri 52 22 22 21 0
## 7 7 Le Zie 34 22 16 21 0
## 8 8 Pasticcio 34 20 18 21 1
## 9 9 Belluno 39 22 19 22 1
## 10 10 Cinque Terre 44 21 17 19 1
#Regression output on pages 138 & 139
m1 <- lm(Price~Food+Decor+Service+East)
summary(m1)
##
## Call:
## lm(formula = Price ~ Food + Decor + Service + East)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0465 -3.8837 0.0373 3.3942 17.7491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.023800 4.708359 -5.102 9.24e-07 ***
## Food 1.538120 0.368951 4.169 4.96e-05 ***
## Decor 1.910087 0.217005 8.802 1.87e-15 ***
## Service -0.002727 0.396232 -0.007 0.9945
## East 2.068050 0.946739 2.184 0.0304 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.738 on 163 degrees of freedom
## Multiple R-squared: 0.6279, Adjusted R-squared: 0.6187
## F-statistic: 68.76 on 4 and 163 DF, p-value: < 2.2e-16
#Regression output on page 139
m2 <- lm(Price~Food+Decor+East)
summary(m2)
##
## Call:
## lm(formula = Price ~ Food + Decor + East)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0451 -3.8809 0.0389 3.3918 17.7557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.0269 4.6727 -5.142 7.67e-07 ***
## Food 1.5363 0.2632 5.838 2.76e-08 ***
## Decor 1.9094 0.1900 10.049 < 2e-16 ***
## East 2.0670 0.9318 2.218 0.0279 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.72 on 164 degrees of freedom
## Multiple R-squared: 0.6279, Adjusted R-squared: 0.6211
## F-statistic: 92.24 on 3 and 164 DF, p-value: < 2.2e-16
#An alternative way to obtain m2 is to use the update command
m2 <- update(m1,~.-Service)
summary(m2)
##
## Call:
## lm(formula = Price ~ Food + Decor + East)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0451 -3.8809 0.0389 3.3918 17.7557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.0269 4.6727 -5.142 7.67e-07 ***
## Food 1.5363 0.2632 5.838 2.76e-08 ***
## Decor 1.9094 0.1900 10.049 < 2e-16 ***
## East 2.0670 0.9318 2.218 0.0279 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.72 on 164 degrees of freedom
## Multiple R-squared: 0.6279, Adjusted R-squared: 0.6211
## F-statistic: 92.24 on 3 and 164 DF, p-value: < 2.2e-16
detach(nyc)
travel <- read_excel("MARData.xlsx", sheet = "travel")
attach(travel)
head(travel, 10)
## # A tibble: 10 x 4
## Amount Age Segment C
## <dbl> <dbl> <chr> <dbl>
## 1 997 44 A 0
## 2 997 43 A 0
## 3 951 41 A 0
## 4 649 59 A 0
## 5 1265 25 A 0
## 6 1059 38 A 0
## 7 837 46 A 0
## 8 924 42 A 0
## 9 852 48 A 0
## 10 963 39 A 0
#Regression output on page 141
mfull <- lm(Amount~Age+C+C:Age)
summary(mfull)
##
## Call:
## lm(formula = Amount ~ Age + C + C:Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.298 -30.541 -0.034 31.108 130.743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1814.5445 8.6011 211.0 <2e-16 ***
## Age -20.3175 0.1878 -108.2 <2e-16 ***
## C -1821.2337 12.5736 -144.8 <2e-16 ***
## Age:C 40.4461 0.2724 148.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.63 on 921 degrees of freedom
## Multiple R-squared: 0.9601, Adjusted R-squared: 0.9599
## F-statistic: 7379 on 3 and 921 DF, p-value: < 2.2e-16
#Figure 5.7 on page 142
par(mfrow = c(1,1))
plot(Age[C == 0],Amount[C == 0],pch = c("A"),col = c("black"),ylab = "Amount Spent",xlab = "Age")
points(Age[C == 1],Amount[C == 1],pch = c("C"),col = c("red"))
#Regression output on page 143
mreduced <- lm(Amount~Age)
summary(mreduced)
##
## Call:
## lm(formula = Amount ~ Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -545.06 -199.03 6.34 198.74 497.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 957.9103 31.3056 30.599 <2e-16 ***
## Age -1.1140 0.6784 -1.642 0.101
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 237.7 on 923 degrees of freedom
## Multiple R-squared: 0.002913, Adjusted R-squared: 0.001833
## F-statistic: 2.697 on 1 and 923 DF, p-value: 0.1009
#Regression output on page 144
anova(mreduced,mfull)
## Analysis of Variance Table
##
## Model 1: Amount ~ Age
## Model 2: Amount ~ Age + C + C:Age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 923 52158945
## 2 921 2089377 2 50069568 11035 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
detach(travel)
nyc <- read_excel("MARData.xlsx", sheet = "nyc")
attach(nyc)
head(nyc, 10)
## # A tibble: 10 x 7
## Case Restaurant Price Food Decor Service East
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Daniella Ristorante 43 22 18 20 0
## 2 2 Tello's Ristorante 32 20 19 19 0
## 3 3 Biricchino 34 21 13 18 0
## 4 4 Bottino 41 20 20 17 0
## 5 5 Da Umberto 54 24 19 21 0
## 6 6 Le Madri 52 22 22 21 0
## 7 7 Le Zie 34 22 16 21 0
## 8 8 Pasticcio 34 20 18 21 1
## 9 9 Belluno 39 22 19 22 1
## 10 10 Cinque Terre 44 21 17 19 1
#Regression output on page 145
mfull <- lm(Price~Food+Decor+Service+East+Food:East+Decor:East+Service:East)
summary(mfull)
##
## Call:
## lm(formula = Price ~ Food + Decor + Service + East + Food:East +
## Decor:East + Service:East)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5099 -3.7996 -0.1413 3.6522 17.1656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.9949 8.4672 -3.188 0.00172 **
## Food 1.0068 0.5704 1.765 0.07946 .
## Decor 1.8881 0.2984 6.327 2.4e-09 ***
## Service 0.7438 0.6443 1.155 0.25001
## East 6.1253 10.2499 0.598 0.55095
## Food:East 1.2077 0.7743 1.560 0.12079
## Decor:East -0.2500 0.4570 -0.547 0.58510
## Service:East -1.2719 0.8171 -1.557 0.12151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.713 on 160 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.622
## F-statistic: 40.27 on 7 and 160 DF, p-value: < 2.2e-16
#Regression output on page 146
mreduced <- lm(Price~Food+Decor+East)
summary(mreduced)
##
## Call:
## lm(formula = Price ~ Food + Decor + East)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0451 -3.8809 0.0389 3.3918 17.7557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.0269 4.6727 -5.142 7.67e-07 ***
## Food 1.5363 0.2632 5.838 2.76e-08 ***
## Decor 1.9094 0.1900 10.049 < 2e-16 ***
## East 2.0670 0.9318 2.218 0.0279 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.72 on 164 degrees of freedom
## Multiple R-squared: 0.6279, Adjusted R-squared: 0.6211
## F-statistic: 92.24 on 3 and 164 DF, p-value: < 2.2e-16
#Regression output on page 146
anova(mreduced,mfull)
## Analysis of Variance Table
##
## Model 1: Price ~ Food + Decor + East
## Model 2: Price ~ Food + Decor + Service + East + Food:East + Decor:East +
## Service:East
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 164 5366.5
## 2 160 5222.2 4 144.36 1.1057 0.3558
detach(nyc)
#################EXERCISES
#Ex 5.4.3
latour <- read_excel("MARData.xlsx", sheet = "Latour")
attach(latour)
head(latour, 10)
## # A tibble: 10 x 4
## Vintage Quality EndofHarvest Rain
## <dbl> <dbl> <dbl> <dbl>
## 1 1961 5 28 0
## 2 1962 4 50 0
## 3 1963 1 53 1
## 4 1964 3 38 0
## 5 1965 1 46 1
## 6 1966 4 40 0
## 7 1967 3 35 1
## 8 1968 2 38 1
## 9 1969 2 45 1
## 10 1970 4 47 0
#Regression output on page 148
mfull <- lm(Quality ~ EndofHarvest + Rain + Rain:EndofHarvest)
summary(mfull)
##
## Call:
## lm(formula = Quality ~ EndofHarvest + Rain + Rain:EndofHarvest)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6833 -0.5703 0.1265 0.4385 1.6354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.16122 0.68917 7.489 3.95e-09 ***
## EndofHarvest -0.03145 0.01760 -1.787 0.0816 .
## Rain 1.78670 1.31740 1.356 0.1826
## EndofHarvest:Rain -0.08314 0.03160 -2.631 0.0120 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7578 on 40 degrees of freedom
## Multiple R-squared: 0.6848, Adjusted R-squared: 0.6612
## F-statistic: 28.97 on 3 and 40 DF, p-value: 4.017e-10
#Figure 5.8 on page 149
y = Rain
par(mfrow = c(1,1))
plot(EndofHarvest,Quality,pch = y+1,col = y+1,xlab = "End of Harvest (in days since August 31)")
abline(lsfit(EndofHarvest[y == 0],Quality[y == 0]),lty = 1,col = 1)
abline(lsfit(EndofHarvest[y == 1],Quality[y == 1]),lty = 2,col = 2)
legend(23, 2.5,legend = c("No","Yes"),pch = 1:2,col = 1:2,title = "Rain at Harvest?")
#Regression output on page 149
mreduced <- lm(Quality ~ EndofHarvest + Rain)
summary(mreduced)
##
## Call:
## lm(formula = Quality ~ EndofHarvest + Rain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4563 -0.7366 0.1430 0.6413 1.7652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.14633 0.61896 9.930 1.8e-12 ***
## EndofHarvest -0.05723 0.01564 -3.660 0.000713 ***
## Rain -1.62219 0.25478 -6.367 1.3e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8107 on 41 degrees of freedom
## Multiple R-squared: 0.6303, Adjusted R-squared: 0.6123
## F-statistic: 34.95 on 2 and 41 DF, p-value: 1.383e-09
anova(mreduced,mfull)
## Analysis of Variance Table
##
## Model 1: Quality ~ EndofHarvest + Rain
## Model 2: Quality ~ EndofHarvest + Rain + Rain:EndofHarvest
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 41 26.945
## 2 40 22.971 1 3.9749 6.9218 0.01203 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
detach(latour)