ACTEX Study Manual for Exam PA: Section 3.3 (Case Study 1: Fitting a Linear Model in R)

# Read the advertising data from the working directory.
add <- read.csv(file = "Advertising.csv")

# Drop the column named X (presumably the row-index column of the CSV -- confirm).
add[["X"]] <- NULL

# Show the structure of the remaining columns.
str(add)

## 'data.frame':    200 obs. of  4 variables:
##  $ TV       : num  230.1 44.5 17.2 151.5 180.8 ...
##  $ Radio    : num  37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
##  $ Newspaper: num  69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
##  $ Sales    : num  22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...

library(ggplot2)
library(gridExtra)

# One histogram per column of `add`, each using geom_histogram()'s defaults.
p1 <- ggplot(data = add, mapping = aes(x = Sales)) + geom_histogram()
p2 <- ggplot(data = add, mapping = aes(x = TV)) + geom_histogram()
p3 <- ggplot(data = add, mapping = aes(x = Radio)) + geom_histogram()
p4 <- ggplot(data = add, mapping = aes(x = Newspaper)) + geom_histogram()

# Lay the four histograms out in a two-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)

# install.packages('tidyverse')
library(tidyverse)

# Scatter plots of Sales against each of the other three columns, each with
# geom_smooth()'s default smoother drawn on top of the points.
p21 <- ggplot(data = add, mapping = aes(x = TV, y = Sales)) +
  geom_point() +
  geom_smooth()

p22 <- ggplot(data = add, mapping = aes(x = Radio, y = Sales)) +
  geom_point() +
  geom_smooth()

p23 <- ggplot(data = add, mapping = aes(x = Newspaper, y = Sales)) +
  geom_point() +
  geom_smooth()

# Show the three plots in a two-column grid.
grid.arrange(p21, p22, p23, ncol = 2)

# Linear model of Sales on every other column of `add`.
lm.fitted <- lm(formula = Sales ~ ., data = add)

# Coefficient table and goodness-of-fit statistics.
summary(lm.fitted)

## 
## Call:
## lm(formula = Sales ~ ., data = add)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8277 -0.8908  0.2418  1.1893  2.8292 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.938889   0.311908   9.422   <2e-16 ***
## TV           0.045765   0.001395  32.809   <2e-16 ***
## Radio        0.188530   0.008611  21.893   <2e-16 ***
## Newspaper   -0.001037   0.005871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

# 99% confidence intervals for the coefficients of lm.fitted.
confint(object = lm.fitted, level = 0.99)

##                   0.5 %     99.5 %
## (Intercept)  2.12757072 3.75020802
## TV           0.04213632 0.04939297
## Radio        0.16613095 0.21092909
## Newspaper   -0.01630884 0.01423386

# Prediction intervals for the observations lm.fitted was trained on.
# predict() returns a matrix whose columns are already named fit, lwr and upr
# (the columns used below), so as.data.frame() needs no further arguments.
# It has no `colnames` parameter; the one previously passed was silently
# absorbed by `...` and had no effect.
fitted <- as.data.frame(predict(lm.fitted, interval = "prediction"))

# Width of each prediction interval.
fitted$diff <- fitted$upr - fitted$lwr

# Row index of the observation with the narrowest interval.
which.min(fitted$diff)

## [1] 46

# TV value of that observation.
add$TV[which.min(fitted$diff)]

## [1] 175.1

# Same question answered a second way: bind the prediction-interval columns
# to the original data, then look the answer up in the combined data frame.
pred.int <- predict(lm.fitted, interval = "prediction")
as_pred <- cbind(add, pred.int)

# TV value of the row whose interval (upr - lwr) is narrowest.
with(as_pred, TV[which.min(upr - lwr)])

## [1] 175.1

# Sales against TV: observed points in green, a linear-model smoother, and
# the lower/upper prediction bounds from as_pred as dashed red lines.
ggplot(data = as_pred, mapping = aes(x = TV, y = Sales)) +
  geom_point(color = "green") +
  geom_smooth(method = "lm") +
  geom_line(mapping = aes(y = lwr), color = "red", linetype = "dashed") +
  geom_line(mapping = aes(y = upr), color = "red", linetype = "dashed")

# Pairwise correlation matrix of all columns in `add`.
cor(x = add)

##                   TV      Radio  Newspaper     Sales
## TV        1.00000000 0.05480866 0.05664787 0.7822244
## Radio     0.05480866 1.00000000 0.35410375 0.5762226
## Newspaper 0.05664787 0.35410375 1.00000000 0.2282990
## Sales     0.78222442 0.57622257 0.22829903 1.0000000

# Partial correlation between Sales and Newspaper given Radio and TV:
# regress each of the two on Radio + TV, then correlate the residuals.
lm1 <- lm(formula = Sales ~ Radio + TV, data = add)
lm2 <- lm(formula = Newspaper ~ Radio + TV, data = add)

res1 <- lm1[["residuals"]]
res2 <- lm2[["residuals"]]

partial_cor <- cor(x = res1, y = res2)
partial_cor

## [1] -0.01262147

# Model with TV, Radio and their interaction (TV * Radio expands to
# TV + Radio + TV:Radio, as the coefficient table shows).
interaction.fitted <- lm(formula = Sales ~ TV * Radio, data = add)

# Coefficient table and goodness-of-fit statistics.
summary(interaction.fitted)

## 
## Call:
## lm(formula = Sales ~ TV * Radio, data = add)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3366 -0.4028  0.1831  0.5948  1.5246 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.750e+00  2.479e-01  27.233   <2e-16 ***
## TV          1.910e-02  1.504e-03  12.699   <2e-16 ***
## Radio       2.886e-02  8.905e-03   3.241   0.0014 ** 
## TV:Radio    1.086e-03  5.242e-05  20.727   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9435 on 196 degrees of freedom
## Multiple R-squared:  0.9678, Adjusted R-squared:  0.9673 
## F-statistic:  1963 on 3 and 196 DF,  p-value: < 2.2e-16

# Three hypothetical (TV, Radio) combinations to score with the
# interaction model.
df <- data.frame(
  TV = c(0, 50, 30),
  Radio = c(100, 50, 70)
)

# Predicted Sales for each row of df.
prediction <- predict(object = interaction.fitted, newdata = df)
prediction

##         1         2         3 
##  9.636254 11.864528 11.625115

# install.packages('caret')
library(caret)

# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# Split on Sales with p = 0.7; the selected rows form the training set and
# the remaining rows the test set. list = FALSE (spelled out rather than the
# reassignable alias F) makes createDataPartition() return row indices that
# can be used directly for subsetting.
partition <- createDataPartition(add$Sales, p = 0.7, list = FALSE)
trains <- add[partition, ]
tests <- add[-partition, ]

# Candidate models, all fitted on the training set only:
#   1: all three predictors
#   2: TV and Radio
#   3: TV, Radio and their interaction
#   4: model 3 plus a squared TV term
#   5: model 4 plus a squared Radio term
model.1.tr <- lm(Sales ~ TV + Radio + Newspaper, data = trains)
model.2.tr <- lm(Sales ~ TV + Radio, data = trains)
model.3.tr <- lm(Sales ~ TV * Radio, data = trains)
model.4.tr <- lm(Sales ~ TV * Radio + I(TV^2), data = trains)
model.5.tr <- lm(Sales ~ TV * Radio + I(TV^2) + I(Radio^2), data = trains)

# Training RMSE of each candidate model: predict() without newdata returns
# the fitted values for the rows the model was trained on.
r1 <- RMSE(predict(model.1.tr), trains$Sales)
r2 <- RMSE(predict(model.2.tr), trains$Sales)
r3 <- RMSE(predict(model.3.tr), trains$Sales)
r4 <- RMSE(predict(model.4.tr), trains$Sales)
r5 <- RMSE(predict(model.5.tr), trains$Sales)

# Collect all five training RMSEs. r5 was previously computed but left out of
# this vector, which made it inconsistent with the five-element test vector.
rmse <- c(r1, r2, r3, r4, r5)
rmse

# NOTE: the output recorded below was captured before r5 was added to `rmse`,
# so it shows only the first four values; re-run to obtain the fifth.
## [1] 1.6692022 1.6728241 0.9700702 0.6355037

# Test RMSE of each candidate model: score the held-out rows in `tests` and
# compare the predictions with the observed Sales.
rt1 <- RMSE(pred = predict(model.1.tr, newdata = tests), obs = tests$Sales)
rt2 <- RMSE(pred = predict(model.2.tr, newdata = tests), obs = tests$Sales)
rt3 <- RMSE(pred = predict(model.3.tr, newdata = tests), obs = tests$Sales)
rt4 <- RMSE(pred = predict(model.4.tr, newdata = tests), obs = tests$Sales)
rt5 <- RMSE(pred = predict(model.5.tr, newdata = tests), obs = tests$Sales)

# Collect the five test RMSEs in model order and print them.
rmse_t <- c(rt1, rt2, rt3, rt4, rt5)
rmse_t

## [1] 1.6894002 1.6619253 0.8415654 0.5716940 0.5729795

ACTEX Study Manual for Exam PA: Section 3.3 (Case Study 1: Fitting a Linear Model in R)

yangj22