# Load the ISL Advertising data set (TV, radio, newspaper budgets and sales).
advertising <- read_csv(
  file = "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## TV = col_double(),
## radio = col_double(),
## newspaper = col_double(),
## sales = col_double()
## )
# Preview the first six rows of the data.
head(advertising, n = 6L)
## # A tibble: 6 x 5
## X1 TV radio newspaper sales
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 230.1 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 151.5 41.3 58.5 18.5
## 5 5 180.8 10.8 58.4 12.9
## 6 6 8.7 48.9 75.0 7.2
In this problem we will examine whether there is a linear relationship between TV advertising budget and sales.
# Scatter plot of sales against TV budget with a loess smoother overlaid.
scatter.smooth(
  x = advertising$TV,
  y = advertising$sales,
  main = "TV Budget ~ Sales"
)

# From the scatter plot we can see a roughly linear relationship between TV budget and sales.
# Next we will check both variables for outliers.
# Build the box-plot subtitle for a numeric vector: the outlying values found
# by boxplot.stats(), collapsed into ONE string (a vector-valued `sub` would
# otherwise be passed when there are several outliers), or "none".
outlier_label <- function(x) {
  out <- boxplot.stats(x)$out
  paste(
    "Outlier values:",
    if (length(out) > 0) paste(out, collapse = ", ") else "none"
  )
}

# Draw the two box plots side by side, then restore the previous layout so
# that later plots are not squeezed into half of the device.
old_par <- par(mfrow = c(1, 2)) # divide graph area in 2 columns
boxplot(advertising$TV, main = "TV",
        sub = outlier_label(advertising$TV)) # box plot for 'TV'
boxplot(advertising$sales, main = "Sales",
        sub = outlier_label(advertising$sales)) # box plot for 'sales'
par(old_par)

# No outliers are visible in either variable.
# Next we will try to build a linear model.
# Fit a simple linear regression of sales on TV budget.
# Bare column names plus `data =` (instead of `advertising$...` inside the
# formula) give clean coefficient names and allow predict(linearmodel,
# newdata = ...) to work on new observations.
linearmodel <- lm(sales ~ TV, data = advertising)
print(linearmodel)
##
## Call:
## lm(formula = sales ~ TV, data = advertising)
##
## Coefficients:
## (Intercept) TV
## 7.03259 0.04754
# From this we can say the following
# Sales = 7.03 + 0.048 * TV Budget
# Full summary: coefficient standard errors, t-tests, R-squared and F-test.
summary(linearmodel)
##
## Call:
## lm(formula = sales ~ TV, data = advertising)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
# Let us see what the residuals look like.
# Residuals against fitted values, with a horizontal reference line at zero.
plot(
  x = fitted(linearmodel),
  y = resid(linearmodel)
)
abline(h = 0)
# Average residual of the fitted model.
mean(residuals(linearmodel))
## [1] -6.464447e-17
# The mean of the residuals is essentially zero (up to floating-point error), as expected for a least-squares fit with an intercept.
# Histogram of the residuals to eyeball their distribution.
hist(x = linearmodel$residuals)

# Normal Q-Q plot of the residuals with its reference line.
qqnorm(y = residuals(linearmodel))
qqline(y = residuals(linearmodel))
# Conclusion.
# In the Q-Q plot the central points lie close to the reference line, i.e. the residuals are approximately normally distributed there,
# but towards the ends the points drift away from the line, so the residuals are not normally distributed in the tails. In particular, when the TV budget is close to 0, sales depart from the linear trend.
# We can say that there is some linear relationship between TV budget and sales.
