# Read the Advertising data set straight from its URL into a tibble.
# The CSV's first column has no header, so it is auto-named 'X1'
# (see the warning echoed below).
advertising <- read_csv(
  file = "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   TV = col_double(),
##   radio = col_double(),
##   newspaper = col_double(),
##   sales = col_double()
## )
# Preview the first six rows to confirm the columns were parsed as expected.
head(x = advertising, n = 6L)
## # A tibble: 6 x 5
##      X1    TV radio newspaper sales
##   <int> <dbl> <dbl>     <dbl> <dbl>
## 1     1 230.1  37.8      69.2  22.1
## 2     2  44.5  39.3      45.1  10.4
## 3     3  17.2  45.9      69.3   9.3
## 4     4 151.5  41.3      58.5  18.5
## 5     5 180.8  10.8      58.4  12.9
## 6     6   8.7  48.9      75.0   7.2

In this problem we will see whether there is a linear relation between TV and Sales.

# Scatter plot of sales (y axis) against the TV budget (x axis) with a
# loess smooth curve, to judge by eye whether a straight-line fit is plausible.
# The title is written in formula order (response ~ predictor) so that it
# agrees with the axes, and the axes get readable labels instead of the raw
# `advertising$...` expressions.
scatter.smooth(
  x = advertising$TV,
  y = advertising$sales,
  main = "Sales ~ TV Budget",
  xlab = "TV Budget",
  ylab = "Sales"
)

# From the scatter plot we could see some linear relationship between TV Budget and Sales.


# Next we will check both variables (TV and sales) for outliers.

# Box plots of TV and sales side by side to look for outliers.
# boxplot.stats()$out returns the outlying *values* (not row numbers) and may
# hold several of them, so they are collapsed into a single string; without
# `collapse`, paste() would return one subtitle per outlier.
old_par <- par(mfrow = c(1, 2))  # divide graph area in 2 columns, remember old layout
tv_outliers <- boxplot.stats(advertising$TV)$out
sales_outliers <- boxplot.stats(advertising$sales)$out
boxplot(
  advertising$TV,
  main = "TV",
  sub = paste("Outlier values:", paste(tv_outliers, collapse = ", "))
)  # box plot for 'TV'
boxplot(
  advertising$sales,
  main = "Sales",
  sub = paste("Outlier values:", paste(sales_outliers, collapse = ", "))
)  # box plot for 'sales'
par(old_par)  # restore the layout so later plots are not drawn in a 1 x 2 grid

# We do not see any outliers.

# Next we will try to build linear model.

# Fit a simple linear regression of sales on the TV budget.
# The formula uses bare column names together with `data =` rather than
# `advertising$...`, so the coefficient is labelled `TV` and the model can be
# used with predict(newdata = ...). The fitted numbers are unchanged.
linearmodel <- lm(sales ~ TV, data = advertising)
print(linearmodel)
## 
## Call:
## lm(formula = sales ~ TV, data = advertising)
## 
## Coefficients:
## (Intercept)           TV  
##     7.03259      0.04754
# From this we can say the following

# Sales = 7.03 + 0.0475 * TV Budget

# Full summary: coefficient standard errors, t-tests, R-squared and F-test.
summary(linearmodel)
## 
## Call:
## lm(formula = sales ~ TV, data = advertising)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3860 -1.9545 -0.1913  2.0671  7.2124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.032594   0.457843   15.36   <2e-16 ***
## TV          0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16
# Let us see what the residuals look like.

# Residuals against fitted values, with a horizontal reference line at zero;
# look for curvature or a changing spread around that line.
plot(x = fitted(linearmodel), y = resid(linearmodel))
abline(h = 0)

# Average of the model residuals.
mean(x = linearmodel$residuals)
## [1] -6.464447e-17
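# The mean of the residuals is essentially zero (about -6.5e-17, i.e. floating-point noise), as expected for a least-squares fit with an intercept.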

# Distribution of the residuals: a histogram first, then a normal Q-Q plot
# with its reference line.
hist(x = linearmodel$residuals)

qqnorm(y = linearmodel$residuals)
qqline(y = linearmodel$residuals)
# Conclusion.
# From the Q-Q plot, the points in the centre lie close to the reference line, i.e. the residuals are approximately normally distributed there,
# but towards the ends (the tails) the points drift away from the line, so the residuals are not normally distributed in the tails. In particular, when the TV budget is close to 0, sales deviate from the linear fit.
# Overall, we can say that there is some linear relationship between TV budget and sales.