#In-class exercise reviewing Simple Linear Regression:
ads <- read.csv("Advertising.csv", header=TRUE)
attach(ads)
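#attach(ads) puts the columns of ads on the search path, so below we can write TV and sales instead of ads$TV and ads$sales. (An attach-free alternative would be lm(sales~TV, data=ads).)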
names(ads)
## [1] "X" "TV" "radio" "newspaper" "sales"
head(ads)
## X TV radio newspaper sales
## 1 1 230.1 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 151.5 41.3 58.5 18.5
## 5 5 180.8 10.8 58.4 12.9
## 6 6 8.7 48.9 75.0 7.2
#3. Describe the relationship between TV and sales.
cor(TV,sales)
## [1] 0.7822244
#The correlation between TV and sales is strong and positive, with a correlation coefficient of 0.78.
plot(TV,sales)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
mod <- lm(sales~TV)
mod
##
## Call:
## lm(formula = sales ~ TV)
##
## Coefficients:
## (Intercept) TV
## 7.03259 0.04754
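#Sanity check (not part of the exercise): in simple linear regression the least-squares slope equals cor(x,y)*sd(y)/sd(x), and the fitted line passes through the point of means.
b1 <- cor(TV,sales)*sd(sales)/sd(TV) #should reproduce the TV coefficient, ~0.04754
b0 <- mean(sales)-b1*mean(TV) #should reproduce the intercept, ~7.03259
c(b0,b1)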
ggplot(ads,aes(x=TV,y=sales))+
geom_point()+
geom_abline(slope=mod$coefficients[2],intercept = mod$coefficients[1],
color="coral",lty=2,lwd=1)+
theme_bw()
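#An alternative sketch of the same plot: geom_smooth(method="lm") fits and draws the least-squares line itself (with a standard-error ribbon), so the coefficients don't need to be extracted by hand.
ggplot(ads,aes(x=TV,y=sales))+
geom_point()+
geom_smooth(method="lm",color="coral",linetype=2)+
theme_bw()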
#4. Create a linear model for sales as a function of TV
mod <- lm(sales~TV)
mod
##
## Call:
## lm(formula = sales ~ TV)
##
## Coefficients:
## (Intercept) TV
## 7.03259 0.04754
#5. For every one-unit increase in TV advertising, the model predicts an increase of about 0.048 in sales (the slope coefficient).
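#For example, a 10-unit increase in the TV budget changes the predicted sales by 10 times the slope:
10*coef(mod)["TV"] #~0.475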
#6. confidence interval for slope coefficient
confint(mod)
## 2.5 % 97.5 %
## (Intercept) 6.12971927 7.93546783
## TV 0.04223072 0.05284256
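#The same interval by hand: estimate +/- t quantile * standard error, using the 198 residual degrees of freedom (the estimate and SE appear in summary(mod) below).
se <- summary(mod)$coefficients["TV","Std. Error"]
coef(mod)["TV"]+c(-1,1)*qt(0.975,df=198)*se #matches the TV row of confint(mod)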
#7. Perform a hypothesis test for the slope coefficient.
#This is essentially a t-test on the slope: the null hypothesis is B1 = 0, and rejecting it shows that X does have a linear relationship with Y.
summary(mod)
##
## Call:
## lm(formula = sales ~ TV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
#summary() gives the t value and its associated p-value.
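#As a check, the t value is just the estimate divided by its standard error:
0.047537/0.002691 #~17.67, matching the TV row above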
#One-way ANOVA (because we have only one predictor). Partitioning variability: variability explained by the model and variability left to error.
anova(mod)
## Analysis of Variance Table
##
## Response: sales
## Df Sum Sq Mean Sq F value Pr(>F)
## TV 1 3314.6 3314.6 312.14 < 2.2e-16 ***
## Residuals 198 2102.5 10.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Look at the second row: Residuals. The output gives the F statistic (still testing the null B1 = 0). Simple regression is the only case where the F-test and the t-test agree (F = t^2); with more x variables this is no longer the case.
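#Quick checks on the claims above, using the printed values:
17.67^2 #~312.2, the F statistic (F = t^2)
3314.6/(3314.6+2102.5) #~0.6119, the Multiple R-squared from summary(mod)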
#8. Predict the response for a new observation of TV at 100.
newdata <- data.frame(TV=c(100))
predict(mod,newdata,interval="predict")
## fit lwr upr
## 1 11.78626 5.339251 18.23326
predict(mod,newdata,interval="confidence")
## fit lwr upr
## 1 11.78626 11.26782 12.3047
#The prediction interval is much wider than the confidence interval because its standard error also includes the variability of an individual response around the mean.
#A prediction interval predicts a single new response (much more variable), while a confidence interval estimates the mean response at that same point.
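#Either way, the point estimate is the same: the fitted line evaluated at TV = 100.
sum(coef(mod)*c(1,100)) #7.03259 + 0.04754*100, i.e. ~11.786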