Homework #7
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
# Test statistic: ratio of two values entered by hand
# (presumably the slope estimate over its standard error -- confirm against the problem statement).
testStat <- 3.9324 / 0.4155
print(testStat)
## [1] 9.46426
# Sample size; the t distribution below uses n - 2 degrees of freedom.
n <- 50
# Two-sided p-value: twice the upper-tail area beyond the observed statistic.
2 * pt(testStat, df = n - 2, lower.tail = FALSE)
## [1] 1.488495e-12
# Sign-safe version: twice the lower-tail area below -|t|, which gives the
# same answer whether the statistic is positive or negative.
2 * pt(-abs(testStat), df = n - 2)
## [1] 1.488495e-12
# Mean square for the regression on speed, MS(Reg): sum of squares over its df.
MS_Reg <- 21186 / 1
MS_Reg
## [1] 21186
# Residual mean square, MS(Res): residual sum of squares over the residual df.
MS_Res <- 11354 / 48
MS_Res
## [1] 236.5417
# F statistic: ratio of the regression mean square to the residual mean square.
# It is stored under its own name because `F` is R's built-in shorthand for
# FALSE: pf(F, ...) evaluates pf(0, ...) and always returns a p-value of 1.
F_stat <- MS_Reg / MS_Res
F_stat
## [1] 89.56562
# Upper-tail probability of the F distribution with 1 and 48 degrees of freedom.
# With a single numerator df this is essentially the two-sided t test of the
# slope (F is about t^2), so the p-value is on the order of 1.5e-12.
pf(F_stat, df1 = 1, df2 = 48, lower.tail = FALSE)
# R squared: regression sum of squares over the total sum of squares.
21186 / (21186 + 11354)
## [1] 0.6510756
#A
# Print the structure of the Carseats data frame: its dimensions, each
# column's type, and the first few values of every column.
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
#Sales and Price are numeric variables, while Urban and US are categorical.
#Urban and US have two levels, "No" and "Yes"
#B
# Fit a linear model of Sales on Price and the two factors Urban and US,
# then print the coefficient table and overall fit statistics.
Model1 <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(Model1)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
#C
#For every one-unit increase in Price, predicted Sales decreases by about 0.054, holding Urban and US fixed.
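#The categorical variables enter the model as 0/1 indicators (UrbanYes, USYes), which gives four cases:
#Urban and in the US: UrbanYes = 1, USYes = 1
#Urban and not in the US: UrbanYes = 1, USYes = 0
#Not Urban and in the US: UrbanYes = 0, USYes = 1
#Not Urban and not in the US: UrbanYes = 0, USYes = 0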
#D
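\[\hat{y} = 13.04 - 0.054x - 0.02 + 1.20 \quad \text{(Urban, in the US)}\]
\[\hat{y} = 13.04 - 0.054x - 0.02 \quad \text{(Urban, not in the US)}\]
\[\hat{y} = 13.04 - 0.054x + 1.20 \quad \text{(not Urban, in the US)}\]
\[\hat{y} = 13.04 - 0.054x \quad \text{(not Urban, not in the US)}\]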
#E
#We fail to reject the null hypothesis for the Urban variable because its p-value (0.936) is much
#greater than .05. We reject the null hypothesis for Price and US, whose p-values are far below .05.
#F
# Refit the model without Urban, keeping Price and US as predictors,
# then print the coefficient table and overall fit statistics.
Model2 <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(Model2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
#G
# Analysis of variance table for the three-predictor model (Price, Urban, US).
anova(Model1)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.0603 < 2.2e-16 ***
## Urban 1 0.10 0.10 0.0158 0.9001
## US 1 131.31 131.31 21.4802 4.86e-06 ***
## Residuals 396 2420.83 6.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of variance table for the reduced model (Price and US only).
anova(Model2)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.319 < 2.2e-16 ***
## US 1 131.37 131.37 21.543 4.707e-06 ***
## Residuals 397 2420.87 6.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
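#Both models have the same multiple R-squared (0.2393), but Model 2 has a slightly lower residual
#mean square (6.10 vs. 6.11) and a higher adjusted R-squared (0.2354 vs. 0.2335). I conclude that
#Model 2 fits our data slightly better, and it does so with one fewer predictor.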
#H
# 95% confidence intervals for each coefficient of the reduced model.
confint(object = Model2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
#We are 95% confident that the true intercept lies between 11.79 and 14.27
#We are 95% confident that the true Price coefficient lies between -.065 and -.044
#We are 95% confident that the true USYes coefficient lies between .691 and 1.708
```