Homework #7

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
# Test statistic for the slope: the estimate divided by its standard error.
testStat <- 3.9324 / 0.4155
testStat
## [1] 9.46426
n <- 50
# Two-sided p-value from the t distribution with n - 2 degrees of freedom.

2 * pt(testStat, df = n - 2, lower.tail = FALSE)
## [1] 1.488495e-12
# Sign-safe version: double the lower tail evaluated at -|t|.
2 * pt(-abs(testStat), df = n - 2)
## [1] 1.488495e-12
# ANOVA quantities for the simple linear regression.
# Sums of squares and degrees of freedom are entered once and reused below.
SS_Reg <- 21186
SS_Res <- 11354
df_Reg <- 1
df_Res <- 48

#mean sqr for speed (MS(REG))
MS_Reg <- SS_Reg / df_Reg
MS_Reg
## [1] 21186
#mean sqr for res (MS(Res)) bottom is df
MS_Res <- SS_Res / df_Res
MS_Res
## [1] 236.5417
#F stat: ratio of the regression mean square to the residual mean square.
# Stored as F_stat rather than F, because F is R's built-in alias for FALSE;
# calling pf(F, ...) without assigning it evaluates pf(0, ...) and returns 1.
F_stat <- MS_Reg / MS_Res
F_stat
## [1] 89.56562
# Upper-tail probability of the F distribution with (1, 48) degrees of freedom.
# The result is on the order of 1e-12, in line with the two-sided t-test
# p-value, since F equals t^2 (up to rounding) when there is one predictor.
p_value_F <- pf(F_stat, df1 = df_Reg, df2 = df_Res, lower.tail = FALSE)
p_value_F
#R Squared: share of the total sum of squares explained by the regression.

SS_Reg / (SS_Reg + SS_Res)
## [1] 0.6510756
#A
# Print the structure of the Carseats data frame: each column's name, type,
# and first few values.
str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
#Sales and Price are numeric variables, while Urban and US are categorical (factors).
#Urban and US each have two levels, "No" and "Yes".


#B
# Fit a linear model of Sales on Price, Urban and US using the Carseats data,
# then print the coefficient table and overall fit statistics.
Model1<-lm(Sales~ Price + Urban + US, data = Carseats)
summary(Model1)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
#C
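#For every 1-unit increase in Price, predicted Sales decrease by about 0.054 units,
#holding Urban and US fixed.
#UrbanYes (-0.022) and USYes (+1.20) shift the intercept for stores that are
#urban or in the US, respectively, relative to the "No" baseline.
#For the categorical variables, we can look at the four (Urban, US) combinations,
#coded 1 = Yes and 0 = No:
#Urban and in the US.           11
#Urban and not in the US        10
#Not Urban and in the US        01
#Not Urban and not in the US    00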

#D

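\[\hat{y} = 13.04 - 0.054x - 0.022 + 1.20 \quad \text{(Urban, in the US)}\]

\[\hat{y} = 13.04 - 0.054x - 0.022 \quad \text{(Urban, not in the US)}\]
\[\hat{y} = 13.04 - 0.054x + 1.20 \quad \text{(Not Urban, in the US)}\]
\[\hat{y} = 13.04 - 0.054x \quad \text{(Not Urban, not in the US)}\]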

#E
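#We fail to reject the null hypothesis for the Urban variable because its p-value
#(0.936) is far greater than .05. For Price and US the p-values are well below .05,
#so we reject the null hypothesis that their coefficients are zero.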


#F
# Refit the model without Urban, keeping Price and US as the predictors,
# then print the coefficient table and overall fit statistics.
Model2<-lm(Sales~ Price + US, data = Carseats)
summary(Model2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
#G
# Print the ANOVA table for each model separately; the "Residuals" row gives
# the residual sum of squares and residual mean square used to compare fits.
anova(Model1)
## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## Price       1  630.03  630.03 103.0603 < 2.2e-16 ***
## Urban       1    0.10    0.10   0.0158    0.9001    
## US          1  131.31  131.31  21.4802  4.86e-06 ***
## Residuals 396 2420.83    6.11                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(Model2)
## Analysis of Variance Table
## 
## Response: Sales
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## Price       1  630.03  630.03 103.319 < 2.2e-16 ***
## US          1  131.37  131.37  21.543 4.707e-06 ***
## Residuals 397 2420.87    6.10                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
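#Model 2 has the slightly lower residual mean square (6.10 vs 6.11) and the higher
#adjusted R-squared (0.2354 vs 0.2335), while using one fewer predictor.
#I conclude that Model 2 fits our data slightly better.
#(The two nested models can also be compared directly with anova(Model2, Model1).)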



#H
# 95% confidence intervals for the intercept and each coefficient of Model2.
confint(Model2,level = .95)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
#We are 95% confident that the true intercept lies between 11.79 and 14.27

#We are 95% confident that the true coefficient for Price lies between -.065 and -.044

#We are 95% confident that the true coefficient for USYes lies between .691 and 1.708

```