library(tidyverse)
library(datarium)
library(caret)
library(psych)
library(gridExtra)
library(Metrics)
data <- marketing
head(data)
str(data)
'data.frame': 200 obs. of 4 variables:
$ youtube : num 276.1 53.4 20.6 181.8 217 ...
$ facebook : num 45.4 47.2 55.1 49.6 13 ...
$ newspaper: num 83 54.1 83.2 70.2 70.1 ...
$ sales : num 26.5 12.5 11.2 22.2 15.5 ...
ggplot(data, aes(x=youtube, y=sales)) + geom_point()
ggplot(data, aes(x=facebook, y=sales)) + geom_point()
ggplot(data, aes(x=newspaper, y=sales)) + geom_point()
pairs.panels(data, method = "pearson", hist.col = "#00AFBB")
set.seed(3456)
trainIndex <- createDataPartition(data$sales, p = 0.8,
list = FALSE,
times = 1)
head(trainIndex)
Resample1
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 5
[6,] 6
train <- data[ trainIndex,]
test <- data[-trainIndex,]
linreg1 <- lm(sales~youtube,train)
summary(linreg1)
Call:
lm(formula = sales ~ youtube, data = train)
Residuals:
Min 1Q Median 3Q Max
-9.7375 -2.4757 0.0065 2.6190 8.8483
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.583693 0.633160 13.56 <2e-16 ***
youtube 0.046120 0.003054 15.10 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.025 on 160 degrees of freedom
Multiple R-squared: 0.5878, Adjusted R-squared: 0.5852
F-statistic: 228.1 on 1 and 160 DF, p-value: < 2.2e-16
linreg2 <- lm(sales~facebook,train)
summary(linreg2)
Call:
lm(formula = sales ~ facebook, data = train)
Residuals:
Min 1Q Median 3Q Max
-18.636 -2.568 1.027 3.461 9.713
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.48285 0.77301 14.86 < 2e-16 ***
facebook 0.19094 0.02317 8.24 5.84e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.252 on 160 degrees of freedom
Multiple R-squared: 0.2979, Adjusted R-squared: 0.2935
F-statistic: 67.9 on 1 and 160 DF, p-value: 5.844e-14
linreg3 <- lm(sales~newspaper,train)
summary(linreg3)
Call:
lm(formula = sales ~ newspaper, data = train)
Residuals:
Min 1Q Median 3Q Max
-13.6974 -4.1096 -0.9246 4.1947 15.0749
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.11819 0.82283 18.373 < 2e-16 ***
newspaper 0.04782 0.01821 2.626 0.00948 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.137 on 160 degrees of freedom
Multiple R-squared: 0.04131, Adjusted R-squared: 0.03532
F-statistic: 6.895 on 1 and 160 DF, p-value: 0.009484
p1 <- ggplot(data, aes(x=youtube, y=sales)) +
geom_point() +
geom_smooth(method='lm',formula='y ~ x')
p2 <- ggplot(data, aes(x=facebook, y=sales)) +
geom_point() +
geom_smooth(method='lm',formula='y ~ x')
p3 <- ggplot(data, aes(x=newspaper, y=sales)) +
geom_point() +
geom_smooth(method='lm',formula='y ~ x')
grid.arrange(p1,p2,p3,nrow=2)
multireg1 <- lm(sales~.,train)
summary(multireg1)
Call:
lm(formula = sales ~ ., data = train)
Residuals:
Min 1Q Median 3Q Max
-10.5411 -1.0937 0.4033 1.4690 3.7120
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.210320 0.441087 7.278 1.49e-11 ***
youtube 0.046288 0.001586 29.192 < 2e-16 ***
facebook 0.194756 0.010061 19.358 < 2e-16 ***
newspaper -0.004111 0.006766 -0.608 0.544
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.09 on 158 degrees of freedom
Multiple R-squared: 0.8902, Adjusted R-squared: 0.8881
F-statistic: 427.1 on 3 and 158 DF, p-value: < 2.2e-16
multireg2 <- lm(sales~youtube+facebook,train)
summary(multireg2)
Call:
lm(formula = sales ~ youtube + facebook, data = train)
Residuals:
Min 1Q Median 3Q Max
-10.3862 -1.0921 0.3928 1.4976 3.7229
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.128690 0.419291 7.462 5.25e-12 ***
youtube 0.046289 0.001583 29.250 < 2e-16 ***
facebook 0.192311 0.009202 20.898 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.086 on 159 degrees of freedom
Multiple R-squared: 0.89, Adjusted R-squared: 0.8886
F-statistic: 643.1 on 2 and 159 DF, p-value: < 2.2e-16
x_test <- test[,-4]
y_test <- as.numeric(as.matrix(test[,4]))
predict.linreg1 <- predict(linreg1,x_test)
predict.linreg2 <- predict(linreg2,x_test)
predict.linreg3 <- predict(linreg3,x_test)
predict.multireg1 <- predict(multireg1,x_test)
predict.multireg2 <- predict(multireg2,x_test)
data_pred <- data.frame(actual = y_test,
linreg1 = predict.linreg1,
linreg2 = predict.linreg2,
linreg3 = predict.linreg3,
multireg1 = predict.multireg1,
multireg2 = predict.multireg2)
head(data_pred)
\[ \mathrm{RMSE}=\sqrt{\frac{\sum_{i=1}^n\left(y_i-\hat{y}_i\right)^2}{n}} \]
# Root Mean Square Error
print(apply(X = data_pred[,-1], MARGIN = 2, FUN = rmse, actual = data_pred$actual) %>% sort())
multireg2 multireg1 linreg1 linreg2 linreg3
1.757540 1.778099 3.412434 4.603717 6.013120