# Required libraries
# Packages need to be installed only once:
# install.packages("datarium")   # built-in data sets
# install.packages("caTools")    # sample splitting
# install.packages("ggplot2")    # visualization
# install.packages("GGally")     # pairwise plots
# install.packages("gganimate")  # animation
# Loading the packages in R
library(datarium)
library(caTools)
library(ggplot2)
library(GGally)
library(gganimate)
# Bring the `marketing` data set shipped with the datarium package into the
# workspace, then show its first six rows.
data(list = "marketing", package = "datarium")
head(x = marketing, n = 6L)
## youtube facebook newspaper sales
## 1 276.12 45.36 83.04 26.52
## 2 53.40 47.16 54.12 12.48
## 3 20.64 55.08 83.16 11.16
## 4 181.80 49.56 70.20 22.20
## 5 216.96 12.96 70.08 15.48
## 6 10.44 58.68 90.00 8.64
# Per-column summary statistics of the marketing data
summary(object = marketing)
## youtube facebook newspaper sales
## Min. : 0.84 Min. : 0.00 Min. : 0.36 Min. : 1.92
## 1st Qu.: 89.25 1st Qu.:11.97 1st Qu.: 15.30 1st Qu.:12.45
## Median :179.70 Median :27.48 Median : 30.90 Median :15.48
## Mean :176.45 Mean :27.92 Mean : 36.66 Mean :16.83
## 3rd Qu.:262.59 3rd Qu.:43.83 3rd Qu.: 54.12 3rd Qu.:20.88
## Max. :355.68 Max. :59.52 Max. :136.80 Max. :32.40
# Pairwise Pearson correlations between all columns of the data
cor(x = marketing, method = "pearson")
## youtube facebook newspaper sales
## youtube 1.00000000 0.05480866 0.05664787 0.7822244
## facebook 0.05480866 1.00000000 0.35410375 0.5762226
## newspaper 0.05664787 0.35410375 1.00000000 0.2282990
## sales 0.78222442 0.57622257 0.22829903 1.0000000
# Matrix of pairwise plots for every combination of columns
ggpairs(data = marketing)
# Looking at the correlation coefficients, youtube (0.78) and facebook (0.58)
# are much more strongly correlated with sales than newspaper (0.23).
# Partition the data: 75% of the rows become the training set used to build
# the model, the remaining rows become the test set.
splitRatio <- 0.75
# Fix the random seed so the same partition is reproduced on every run
set.seed(101)
# sample.split() yields a logical mask over the rows; TRUE marks a training row
sample <- sample.split(marketing$youtube, SplitRatio = splitRatio)
train <- marketing[sample, ]
test <- marketing[!sample, ]
# Dimensions (rows, columns) of each partition
train_size <- dim(train)
test_size <- dim(test)
# Creating the model
# Fit the multiple linear regression on the TRAINING rows only, so that the
# held-out `test` rows stay unseen and the predictions made from this model
# later in the script are a genuine out-of-sample check.
# NOTE: the summary output recorded below came from a fit on the complete
# `marketing` data (196 residual degrees of freedom); the figures produced by
# this training-only fit will differ slightly.
Model <- lm(sales ~ youtube + facebook + newspaper, data = train)
summary(Model)
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5932 -1.0690 0.2902 1.4272 3.3951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.526667 0.374290 9.422 <2e-16 ***
## youtube 0.045765 0.001395 32.809 <2e-16 ***
## facebook 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# The p-value of the F-statistic is < 2.2e-16. Such a small value means that at least one of the predictor variables (youtube, facebook or newspaper) is significantly related to the outcome variable (sales).
# The coefficients table shows that changes in the youtube and facebook advertising budgets are significantly associated with changes in sales, while changes in the newspaper budget are not.
# Predicting
# Score the held-out test rows with the fitted model, then animate predicted
# versus actual sales, revealing one test observation at a time.
data_size <- dim(marketing)
pred <- predict(Model, newdata = test)
# Size the x axis from the test set itself. Deriving it from
# data_size[1] * (1 - splitRatio) is fragile: floating-point error (a ratio
# of 0.8 gives 39.99..., which seq() truncates to 39) can make it disagree
# with nrow(test), and data.frame() then fails on unequal column lengths.
numx <- nrow(test)
x_axis <- seq_len(numx)
df <- data.frame(x_axis = x_axis, pred = pred, test.sales = test$sales)
# Plotting the predicted values against the actual values.
# Every aesthetic refers to a column of `df` rather than to an outside vector,
# so the layers and the reveal transition all work on the same data.
g <- ggplot(df, aes(x = x_axis)) +
  geom_line(aes(y = pred, colour = "Predicted")) +
  geom_point(aes(y = pred, colour = "Predicted")) +
  geom_line(aes(y = test.sales, colour = "Actual")) +
  geom_point(aes(y = test.sales, colour = "Actual")) +
  scale_colour_manual("", values = c(Predicted = "red", Actual = "blue")) +
  transition_reveal(x_axis)
# Render at 3 frames per second and write the animation to linear.gif
# (gifski_renderer() requires the gifski package to be installed)
animate(g, fps = 3, renderer = gifski_renderer("linear.gif"))
# Newspaper turned out to be insignificant, so refit the regression without it
up_model <- lm(formula = sales ~ youtube + facebook, data = marketing)
# Coefficient table and fit statistics of the reduced model
summary(object = up_model)
##
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5572 -1.0502 0.2906 1.4049 3.3994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50532 0.35339 9.919 <2e-16 ***
## youtube 0.04575 0.00139 32.909 <2e-16 ***
## facebook 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.018 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16