#1. Import the data from the textbook website:

# Download the Advertising data set; the first row of the CSV holds the
# column names.
ads <- read.csv(
  file = "http://faculty.marshall.usc.edu/gareth-james/ISL/Advertising.csv",
  header = TRUE
)
# Put the columns of `ads` on the search path so they can be referenced by
# bare name (e.g. TV, sales).
attach(ads)

#2. Learn about the data set: what kinds of variables are there?

# Column names of the data frame.
colnames(ads)
## [1] "X"         "TV"        "radio"     "newspaper" "sales"
# First six rows, to see what kind of values each column holds.
head(ads, n = 6L)
##   X    TV radio newspaper sales
## 1 1 230.1  37.8      69.2  22.1
## 2 2  44.5  39.3      45.1  10.4
## 3 3  17.2  45.9      69.3   9.3
## 4 4 151.5  41.3      58.5  18.5
## 5 5 180.8  10.8      58.4  12.9
## 6 6   8.7  48.9      75.0   7.2

#3. Describe the relationship between sales and TV using cor() and plot()

# Correlation between TV and sales (Pearson, the default method of cor()).
# The columns are taken from `ads` explicitly, so a workspace object that
# happens to be called TV or sales cannot mask them.
cor(ads$TV, ads$sales)
## [1] 0.7822244
# Scatter plot of sales against TV; the formula interface labels the axes
# with the column names.
plot(sales ~ TV, data = ads)

#4. Create a linear model for sales as a function of TV

library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Simple linear regression of sales on TV. The variables are looked up in
# `ads` through `data =` rather than on the search path, so the fit does not
# depend on attach(); the printed Call therefore includes `data = ads`.
mod <- lm(sales ~ TV, data = ads)
mod
## 
## Call:
## lm(formula = sales ~ TV, data = ads)
## 
## Coefficients:
## (Intercept)           TV  
##     7.03259      0.04754
# Scatter plot of the data with the fitted regression line overlaid as a
# dashed blue line. The coefficients are extracted by name, so the result
# does not depend on their position in the coefficient vector.
TvSales <- ggplot(ads, aes(x = TV, y = sales)) +
  geom_point() +
  geom_abline(
    slope = coef(mod)[["TV"]],
    intercept = coef(mod)[["(Intercept)"]],
    color = "blue", lty = 2, lwd = 1
  ) +
  theme_bw()
TvSales

#5. Interpret the slope coefficient in the context of the problem

# Rough guess for the slope: 5/100 --> 0.05
print(mod)
## 
## Call:
## lm(formula = sales ~ TV)
## 
## Coefficients:
## (Intercept)           TV  
##     7.03259      0.04754
# Fitted slope, 0.04754: for each additional unit of money spent on TV
# advertising, we would expect sales to increase by about 0.04754.
coef(mod)[2]
##         TV 
## 0.04753664

#6. Create a confidence interval for the slope coefficient

# 95% confidence intervals (the default level, spelled out here) for the
# intercept and for the TV slope.
confint(mod, level = 0.95)
##                  2.5 %     97.5 %
## (Intercept) 6.12971927 7.93546783
## TV          0.04223072 0.05284256

#7. Perform a hypothesis test for the slope coefficient. Be sure to indicate which test you used.

# summary() reports a t-test for each coefficient. The test statistic for the
# slope is the t value in the TV row (not the intercept row), and its p-value
# is in the Pr(>|t|) column of the same row.
print(summary(mod))
## 
## Call:
## lm(formula = sales ~ TV)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3860 -1.9545 -0.1913  2.0671  7.2124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.032594   0.457843   15.36   <2e-16 ***
## TV          0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16
# Could also use anova(): with a single predictor its F-test is equivalent to
# the slope t-test (the F-statistic above, 312.1, is the squared t value
# 17.67^2 up to rounding).

#8. Predict the response for a new observation with TV = 100. Include both the prediction interval and the confidence interval. Why are these intervals different?

# New observation at which to predict: a single row with TV = 100.
newdata <- data.frame(TV = 100)

# Prediction interval for one new observation at TV = 100. The interval type
# is spelled out in full ("prediction") instead of relying on partial
# matching of the abbreviated value "predict".
predict(mod, newdata = newdata, interval = "prediction")
##        fit      lwr      upr
## 1 11.78626 5.339251 18.23326
# Confidence interval for the mean response at TV = 100.
predict(mod, newdata = newdata, interval = "confidence")
##        fit      lwr     upr
## 1 11.78626 11.26782 12.3047
# Both intervals are centred on the same fitted value. They differ because
# the prediction interval also accounts for the extra variability of a single
# new observation around the regression line, whereas the confidence interval
# only reflects the uncertainty in the estimated mean response, so the
# prediction interval is wider.

# Remove `ads` from the search path again.
detach(ads)