In collaboration with Bryce O’Connor

Part 1

Part 1 is in Bryce’s Rpub

#Part 2

A

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
data(Carseats)

str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

The Urban and US varibles are both categorical with two levels (yes or no) Sales and Price are numeric

B

Carsales<- lm(Sales~Price + Urban + US, data=Carseats)

C

The coefficients in the model shift the model up and down.

summary(Carsales)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
anova(Carsales)
## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## Price       1  630.03  630.03 103.0603 < 2.2e-16 ***
## Urban       1    0.10    0.10   0.0158    0.9001    
## US          1  131.31  131.31  21.4802  4.86e-06 ***
## Residuals 396 2420.83    6.11                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(Carsales, aes(x=Price, y=Sales, color = US, pch = Urban))+
  geom_point()+
  geom_abline(intercept = Carsales$coefficients[1], slope=Carsales$coefficients[2],
              color="red", lwd=1)+
  geom_abline(intercept = Carsales$coefficients[1]+Carsales$coefficients[2], slope=Carsales$coefficients[2],
              color="green", lwd=1)+
  geom_abline(intercept = Carsales$coefficients[1]+Carsales$coefficients[3], slope=Carsales$coefficients[2],
              color="blue", lwd=1)+
  geom_abline(intercept = Carsales$coefficients[1]+Carsales$coefficients[4], slope = Carsales$coefficients[2],
              color = "purple", lwd = 1)

The US coefficent appears to shift the model up.

D

#US=1 and Urban=1
#Y = 13.043469 - 0.054459*X1 - 0.021916*X2 + 1.200573*x3 + e
#US=1 and Urban=0
#Y = 13.043469 - 0.054459*X1 - 0.021916*X2 + e
#US=0 and Urban=1
#Y = 13.043469 - 0.054459*X1+ 1.200573*x3 + e
#US=0 and Urban=0
#Y = 13.043469 - 0.054459*X1 + e

E

We reject the null for the coefficents Price and US.

F

carSales2<-lm(Sales~Price+US, data=Carseats)
summary(carSales2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
anova(carSales2)
## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## Price       1  630.03  630.03 103.319 < 2.2e-16 ***
## US          1  131.37  131.37  21.543 4.707e-06 ***
## Residuals 397 2420.87    6.10                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

G

carseatlin2  = lm(Sales~Price + US, Carseats)

Carseats$part<-rep(0, 400)
ceiling(400*.3)
## [1] 120
# model 1
set.seed(1)
test<-sample(1:400, 120, replace=FALSE)
Carseats$part[test]<-1

traindf = Carseats%>%
  filter(part==0)
testdf = Carseats%>%
  filter(part==1)

trainLM1 = lm(Sales~Price, data=Carseats)
anova(trainLM1)
## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## Price       1  630.03  630.03  98.248 < 2.2e-16 ***
## Residuals 398 2552.24    6.41                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
testFit1<-predict(trainLM1, testdf)
testMSE1<-mean((testdf$Sales-testFit1)^2)
testMSE1 #6.749432
## [1] 6.253838
# model 2
test<-sample(1:400, 120, replace=FALSE)
Carseats$part[test]<-1

traindf = Carseats%>%
  filter(part==0)
testdf = Carseats%>%
  filter(part==1)

trainLM2 = lm(Sales~Price + US, data=Carseats)
anova(trainLM2)
## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## Price       1  630.03  630.03 103.319 < 2.2e-16 ***
## US          1  131.37  131.37  21.543 4.707e-06 ***
## Residuals 397 2420.87    6.10                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
testFit2<-predict(trainLM2, testdf)
testMSE2<-mean((testdf$Sales-testFit2)^2)
testMSE2 #6.345222
## [1] 6.345222

H

confint(Carsales)
##                   2.5 %      97.5 %
## (Intercept) 11.76359670 14.32334118
## Price       -0.06476419 -0.04415351
## UrbanYes    -0.55597316  0.51214085
## USYes        0.69130419  1.70984121