# Load the advertising data and drop the `X` column before inspecting
# the structure of what remains.
add <- read.csv(file = "Advertising.csv")
add[["X"]] <- NULL
str(object = add)
## 'data.frame': 200 obs. of 4 variables:
## $ TV : num 230.1 44.5 17.2 151.5 180.8 ...
## $ Radio : num 37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
## $ Newspaper: num 69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
## $ Sales : num 22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...
library(ggplot2)
library(gridExtra)
# One histogram per variable (Sales, TV, Radio, Newspaper), arranged
# on a two-column grid.
p1 <- ggplot(data = add, mapping = aes(x = Sales)) + geom_histogram()
p2 <- ggplot(data = add, mapping = aes(x = TV)) + geom_histogram()
p3 <- ggplot(data = add, mapping = aes(x = Radio)) + geom_histogram()
p4 <- ggplot(data = add, mapping = aes(x = Newspaper)) + geom_histogram()
grid.arrange(p1, p2, p3, p4, ncol = 2)

# install.packages('tidyverse')
library(tidyverse)
# Scatter plot of Sales against each advertising channel, each with a
# smoother layer on top, arranged on a two-column grid.
p21 <- ggplot(add, aes(x = TV, y = Sales)) +
  geom_point() +
  geom_smooth()
p22 <- ggplot(add, aes(x = Radio, y = Sales)) +
  geom_point() +
  geom_smooth()
p23 <- ggplot(add, aes(x = Newspaper, y = Sales)) +
  geom_point() +
  geom_smooth()
grid.arrange(p21, p22, p23, ncol = 2)

# Multiple linear regression of Sales on every other column of `add`.
lm.fitted <- lm(formula = Sales ~ ., data = add)
summary(object = lm.fitted)
##
## Call:
## lm(formula = Sales ~ ., data = add)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# 99% confidence intervals for the coefficients of the full model.
confint(object = lm.fitted, level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 2.12757072 3.75020802
## TV 0.04213632 0.04939297
## Radio 0.16613095 0.21092909
## Newspaper -0.01630884 0.01423386
# Prediction intervals for every observation used to fit `lm.fitted`.
# predict() returns a matrix whose columns are already named fit, lwr
# and upr, so as.data.frame() needs no extra naming argument (the former
# `colnames =` argument is not a parameter of as.data.frame() and was
# silently ignored).
# NOTE: the variable `fitted` shares its name with stats::fitted(); calls
# to the function still resolve, but the name is kept only for
# compatibility with the rest of the script.
fitted <- as.data.frame(predict(lm.fitted, interval = "prediction"))
# Width of each prediction interval.
fitted$diff <- fitted$upr - fitted$lwr
# Index of the observation with the narrowest prediction interval.
which.min(fitted$diff)
## [1] 46
# TV value of that observation.
add$TV[which.min(fitted$diff)]
## [1] 175.1
# Same lookup as above, done on a single data frame: bind the prediction
# interval columns (fit, lwr, upr) to the original data and read the TV
# value of the row with the narrowest interval.
pred.int <- predict(object = lm.fitted, interval = "prediction")
as_pred <- cbind(add, pred.int)
as_pred[which.min(as_pred$upr - as_pred$lwr), "TV"]
## [1] 175.1
# Sales against TV with the prediction bounds overlaid as dashed red lines.
# Note: geom_smooth(method = "lm") fits Sales on TV alone inside the plot,
# whereas lwr/upr were computed from `lm.fitted`, which uses all predictors.
ggplot(data = as_pred, mapping = aes(x = TV, y = Sales)) +
  geom_point(color = "green") +
  geom_smooth(method = "lm") +
  geom_line(mapping = aes(y = lwr), color = "red", linetype = "dashed") +
  geom_line(mapping = aes(y = upr), color = "red", linetype = "dashed")

# Pairwise correlation matrix of all columns in `add`.
cor(x = add)
## TV Radio Newspaper Sales
## TV 1.00000000 0.05480866 0.05664787 0.7822244
## Radio 0.05480866 1.00000000 0.35410375 0.5762226
## Newspaper 0.05664787 0.35410375 1.00000000 0.2282990
## Sales 0.78222442 0.57622257 0.22829903 1.0000000
# Partial correlation between Sales and Newspaper given Radio and TV:
# regress each of the two on Radio + TV, then correlate the residuals.
lm1 <- lm(formula = Sales ~ Radio + TV, data = add)
lm2 <- lm(formula = Newspaper ~ Radio + TV, data = add)
res1 <- lm1[["residuals"]]
res2 <- lm2[["residuals"]]
partial_cor <- cor(x = res1, y = res2)
partial_cor
## [1] -0.01262147
# Model with TV and Radio main effects plus their interaction term.
interaction.fitted <- lm(formula = Sales ~ TV * Radio, data = add)
summary(object = interaction.fitted)
##
## Call:
## lm(formula = Sales ~ TV * Radio, data = add)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3366 -0.4028 0.1831 0.5948 1.5246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.750e+00 2.479e-01 27.233 <2e-16 ***
## TV 1.910e-02 1.504e-03 12.699 <2e-16 ***
## Radio 2.886e-02 8.905e-03 3.241 0.0014 **
## TV:Radio 1.086e-03 5.242e-05 20.727 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9435 on 196 degrees of freedom
## Multiple R-squared: 0.9678, Adjusted R-squared: 0.9673
## F-statistic: 1963 on 3 and 196 DF, p-value: < 2.2e-16
# Predict Sales from the interaction model for three (TV, Radio) pairs:
# (0, 100), (50, 50) and (30, 70).
df <- data.frame(
  TV = c(0, 50, 30),
  Radio = c(100, 50, 70)
)
prediction <- predict(object = interaction.fitted, newdata = df)
prediction
## 1 2 3
## 9.636254 11.864528 11.625115
# install.packages('caret')
library(caret)
# Reproducible split of `add` on Sales with p = 0.7: the rows selected by
# createDataPartition() form the training set, the remaining rows the
# test set.
set.seed(123)
partition <- createDataPartition(add$Sales, p = 0.7, list = FALSE)
trains <- add[partition, ]
tests <- add[-partition, ]

# Candidate models of increasing complexity, all fitted on the training set.
model.1.tr <- lm(Sales ~ TV + Radio + Newspaper, data = trains)
model.2.tr <- lm(Sales ~ TV + Radio, data = trains)
model.3.tr <- lm(Sales ~ TV * Radio, data = trains)
model.4.tr <- lm(Sales ~ TV * Radio + I(TV^2), data = trains)
model.5.tr <- lm(Sales ~ TV * Radio + I(TV^2) + I(Radio^2), data = trains)

# Training RMSE: predict() without newdata returns the fitted values for
# the training rows.
r1 <- RMSE(predict(model.1.tr), trains$Sales)
r2 <- RMSE(predict(model.2.tr), trains$Sales)
r3 <- RMSE(predict(model.3.tr), trains$Sales)
r4 <- RMSE(predict(model.4.tr), trains$Sales)
r5 <- RMSE(predict(model.5.tr), trains$Sales)

# All five models are reported, matching the test-set vector `rmse_t`
# (r5 was previously computed but left out of this vector).
rmse <- c(r1, r2, r3, r4, r5)
rmse
# NOTE: the recorded output below predates the inclusion of r5 and shows
# only r1-r4; re-run the script to refresh it.
## [1] 1.6692022 1.6728241 0.9700702 0.6355037
# Test-set RMSE of each of the five candidate models, evaluated on `tests`.
rt1 <- RMSE(
  pred = predict(model.1.tr, newdata = tests),
  obs = tests$Sales
)
rt2 <- RMSE(
  pred = predict(model.2.tr, newdata = tests),
  obs = tests$Sales
)
rt3 <- RMSE(
  pred = predict(model.3.tr, newdata = tests),
  obs = tests$Sales
)
rt4 <- RMSE(
  pred = predict(model.4.tr, newdata = tests),
  obs = tests$Sales
)
rt5 <- RMSE(
  pred = predict(model.5.tr, newdata = tests),
  obs = tests$Sales
)
rmse_t <- c(rt1, rt2, rt3, rt4, rt5)
rmse_t
## [1] 1.6894002 1.6619253 0.8415654 0.5716940 0.5729795