ads<-read.csv("Advertising.csv", header=TRUE)
attach(ads)
names(ads)
## [1] "X" "TV" "radio" "newspaper" "sales"
head(ads)
## X TV radio newspaper sales
## 1 1 230.1 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 151.5 41.3 58.5 18.5
## 5 5 180.8 10.8 58.4 12.9
## 6 6 8.7 48.9 75.0 7.2
cor(TV, sales)
## [1] 0.7822244
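#a quick check (a sketch): in simple linear regression, R-squared is the square of this
#correlation, so squaring it should reproduce the Multiple R-squared reported by
#summary(mod1) further below (about 0.6119)
cor(TV, sales)^2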
plot(TV, sales, main= "Correlation of Sales by TV")
#use lm(response ~ explanatory): lm() takes a model formula, with the response on the left of ~ and the explanatory variable on the right
mod1<-lm(sales~TV)
mod1
##
## Call:
## lm(formula = sales ~ TV)
##
## Coefficients:
## (Intercept) TV
## 7.03259 0.04754
#the slope coefficient of approximately 0.05 tells us that each one-unit increase in TV advertising is associated with about 0.05 additional units of sales, on average
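#a small sanity check of that interpretation (a sketch; TV = 100 and 101 are arbitrary
#illustrative values): the predicted sales at the two budgets differ by exactly the slope
diff(predict(mod1, data.frame(TV = c(100, 101))))  #roughly 0.0475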
confint(mod1)
## 2.5 % 97.5 %
## (Intercept) 6.12971927 7.93546783
## TV 0.04223072 0.05284256
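#the slope interval can also be reproduced by hand (a sketch using the estimate and
#standard error printed by summary(mod1) below): estimate +/- t quantile * standard error
0.047537 + c(-1, 1) * qt(0.975, df = 198) * 0.002691  #roughly (0.0422, 0.0528)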
#using an F-test for a simple linear regression
#with simple linear regression the t-test and F-test for the slope are identical
summary(mod1)
##
## Call:
## lm(formula = sales ~ TV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
#summary() reports the same F-statistic as the ANOVA table, and also gives the t-value
#this is the t-test for the slope
anova(mod1)
## Analysis of Variance Table
##
## Response: sales
## Df Sum Sq Mean Sq F value Pr(>F)
## TV 1 3314.6 3314.6 312.14 < 2.2e-16 ***
## Residuals 198 2102.5 10.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#anova() gives the F-statistic
#the anova and the summary give the same information because the F-test and t-test are equivalent here: both test the same hypothesis (slope = 0) and give the same p-value
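#we can check the equivalence directly (a sketch): the squared t-statistic for the TV
#slope equals the ANOVA F-statistic
t_stat <- summary(mod1)$coefficients["TV", "t value"]
t_stat^2                  #roughly 312.1
anova(mod1)$"F value"[1]  #same value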
library(ggplot2)

ggplot(ads, aes(x = TV, y = sales)) +
  geom_point() +
  geom_abline(slope = mod1$coefficients[2], intercept = mod1$coefficients[1],
              color = "blue", lty = 2, lwd = 1) +
  theme_dark()
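
#an alternative way to draw the fitted line (a sketch): geom_smooth() with method = "lm"
#adds the same regression line plus a shaded 95% confidence band around it
ggplot(ads, aes(x = TV, y = sales)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  theme_dark()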

#the p-value (< 2.2e-16) is much less than .05, so we reject the null hypothesis that the slope is zero
#to make a prediction we must pass the new predictor values to predict() in a data frame whose column name matches the predictor (the values can be supplied with c(), as below)
newdata<-data.frame(TV= c(100))
predict(mod1, newdata, interval = "prediction")
## fit lwr upr
## 1 11.78626 5.339251 18.23326
#prediction interval: fit = 11.79, lwr = 5.34, upr = 18.23
predict(mod1, newdata, interval = "confidence")
## fit lwr upr
## 1 11.78626 11.26782 12.3047
#confidence interval: fit = 11.79, lwr = 11.27, upr = 12.30
#the confidence interval is for the mean response at TV = 100; averaging over many observations gives it a smaller standard error than a single observation
#predicting one new value allows for more possible error, because we cannot rely on errors above and below the line canceling out the way they do when estimating a mean (the center of the data)
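#comparing the interval widths at TV = 100 makes this concrete (a sketch reusing the
#predict() calls above): the prediction interval is far wider than the confidence interval
pi100 <- predict(mod1, newdata, interval = "prediction")
ci100 <- predict(mod1, newdata, interval = "confidence")
pi100[, "upr"] - pi100[, "lwr"]  #roughly 12.9
ci100[, "upr"] - ci100[, "lwr"]  #roughly 1.0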