In collaboration with Bryce O’Connor
Part 1 is in Bryce’s Rpub
#Part 2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
data(Carseats)
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
The Urban and US varibles are both categorical with two levels (yes or no) Sales and Price are numeric
Carsales<- lm(Sales~Price + Urban + US, data=Carseats)
The coefficients in the model shift the model up and down.
summary(Carsales)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
anova(Carsales)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.0603 < 2.2e-16 ***
## Urban 1 0.10 0.10 0.0158 0.9001
## US 1 131.31 131.31 21.4802 4.86e-06 ***
## Residuals 396 2420.83 6.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(Carsales, aes(x=Price, y=Sales, color = US, pch = Urban))+
geom_point()+
geom_abline(intercept = Carsales$coefficients[1], slope=Carsales$coefficients[2],
color="red", lwd=1)+
geom_abline(intercept = Carsales$coefficients[1]+Carsales$coefficients[2], slope=Carsales$coefficients[2],
color="green", lwd=1)+
geom_abline(intercept = Carsales$coefficients[1]+Carsales$coefficients[3], slope=Carsales$coefficients[2],
color="blue", lwd=1)+
geom_abline(intercept = Carsales$coefficients[1]+Carsales$coefficients[4], slope = Carsales$coefficients[2],
color = "purple", lwd = 1)
The US coefficent appears to shift the model up.
#US=1 and Urban=1
#Y = 13.043469 - 0.054459*X1 - 0.021916*X2 + 1.200573*x3 + e
#US=1 and Urban=0
#Y = 13.043469 - 0.054459*X1 - 0.021916*X2 + e
#US=0 and Urban=1
#Y = 13.043469 - 0.054459*X1+ 1.200573*x3 + e
#US=0 and Urban=0
#Y = 13.043469 - 0.054459*X1 + e
We reject the null for the coefficents Price and US.
carSales2<-lm(Sales~Price+US, data=Carseats)
summary(carSales2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
anova(carSales2)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.319 < 2.2e-16 ***
## US 1 131.37 131.37 21.543 4.707e-06 ***
## Residuals 397 2420.87 6.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
carseatlin2 = lm(Sales~Price + US, Carseats)
Carseats$part<-rep(0, 400)
ceiling(400*.3)
## [1] 120
# model 1
set.seed(1)
test<-sample(1:400, 120, replace=FALSE)
Carseats$part[test]<-1
traindf = Carseats%>%
filter(part==0)
testdf = Carseats%>%
filter(part==1)
trainLM1 = lm(Sales~Price, data=Carseats)
anova(trainLM1)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 98.248 < 2.2e-16 ***
## Residuals 398 2552.24 6.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
testFit1<-predict(trainLM1, testdf)
testMSE1<-mean((testdf$Sales-testFit1)^2)
testMSE1 #6.749432
## [1] 6.253838
# model 2
test<-sample(1:400, 120, replace=FALSE)
Carseats$part[test]<-1
traindf = Carseats%>%
filter(part==0)
testdf = Carseats%>%
filter(part==1)
trainLM2 = lm(Sales~Price + US, data=Carseats)
anova(trainLM2)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.319 < 2.2e-16 ***
## US 1 131.37 131.37 21.543 4.707e-06 ***
## Residuals 397 2420.87 6.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
testFit2<-predict(trainLM2, testdf)
testMSE2<-mean((testdf$Sales-testFit2)^2)
testMSE2 #6.345222
## [1] 6.345222
confint(Carsales)
## 2.5 % 97.5 %
## (Intercept) 11.76359670 14.32334118
## Price -0.06476419 -0.04415351
## UrbanYes -0.55597316 0.51214085
## USYes 0.69130419 1.70984121