if (!requireNamespace("datarium", quietly = TRUE)) {
install.packages("datarium")
}
library(datarium)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Bring the `marketing` data set into the workspace and preview its first rows
data("marketing")
head(marketing, n = 6L)
## youtube facebook newspaper sales
## 1 276.12 45.36 83.04 26.52
## 2 53.40 47.16 54.12 12.48
## 3 20.64 55.08 83.16 11.16
## 4 181.80 49.56 70.20 22.20
## 5 216.96 12.96 70.08 15.48
## 6 10.44 58.68 90.00 8.64
# Print the same data as a tibble (compact preview with column types)
as_tibble(marketing)
## # A tibble: 200 × 4
## youtube facebook newspaper sales
## <dbl> <dbl> <dbl> <dbl>
## 1 276. 45.4 83.0 26.5
## 2 53.4 47.2 54.1 12.5
## 3 20.6 55.1 83.2 11.2
## 4 182. 49.6 70.2 22.2
## 5 217. 13.0 70.1 15.5
## 6 10.4 58.7 90 8.64
## 7 69 39.4 28.2 14.2
## 8 144. 23.5 13.9 15.8
## 9 10.3 2.52 1.2 5.76
## 10 240. 3.12 25.4 12.7
## # ℹ 190 more rows
This document presents an analysis of marketing data.
Exploratory data analysis (EDA) and, later, more rigorous methods such as linear regression will be employed for the analysis.
The accuracy of predictions will be measured using the following metrics: mean absolute error (MAE) and mean absolute percentage error (MAPE).
In this section we will inspect the data as follows:
- Check for bivariate correlation / multicollinearity
- Investigate linearity between sales and the explanatory variables through scatterplots
- Check for outliers in the explanatory variables through the use of boxplots, etc.
In this section we examine the pairwise relationships between all the variables using their covariance matrix.
library(DT)

# Covariance matrix of all variables, rendered as an interactive table.
# NOTE(review): cov() reports covariances, not correlations; switch to cor()
# (and rescale the cuts below) if scale-free coefficients are wanted here.
#
# The text colour of a cell depends on the interval its value falls in:
#   value <= 100 -> green, <= 200 -> violet, <= 20000 -> red, above -> darkred.
# The same rule is applied to the three advertising columns, so it is defined
# once and passed to a single formatStyle() call.
covariance_cuts <- c(100, 200, 20000)
covariance_colours <- c("green", "violet", "red", "darkred")

cov(marketing) %>%
  datatable() %>%
  formatStyle(
    c("facebook", "newspaper", "youtube"),
    color = styleInterval(covariance_cuts, covariance_colours)
  )
library(ggplot2)

# One boxplot per advertising channel, used to look for outliers.
# Each box has a single constant colour, so the fill is set directly on the
# geom instead of being mapped inside aes() and translated back through a
# manual fill scale. No legend is produced because nothing is mapped to fill.
# NOTE(review): the x axis holds only the dummy category "" but is labelled
# "sales"; the label is kept as-is, confirm it is intended.

# Plot for Facebook
plot_facebook <- ggplot(data = marketing, aes(x = "", y = facebook)) +
  geom_boxplot(fill = "yellow") +
  labs(x = "sales", y = "Facebook")

# Plot for Youtube
plot_youtube <- ggplot(data = marketing, aes(x = "", y = youtube)) +
  geom_boxplot(fill = "green") +
  labs(x = "sales", y = "Youtube")

# Plot for Newspaper
plot_newspaper <- ggplot(data = marketing, aes(x = "", y = newspaper)) +
  geom_boxplot(fill = "cyan") +
  labs(x = "sales", y = "Newspaper")
# Combine plots using grid.arrange or cowplot
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Show the three boxplots next to each other in a single row
grid.arrange(
  grobs = list(plot_facebook, plot_youtube, plot_newspaper),
  nrow = 1
)
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's values.
# Linear model of sales on every other column of the full data set, followed
# by its coefficient table and fit statistics
model <- lm(formula = sales ~ ., data = marketing)
summary(model)
##
## Call:
## lm(formula = sales ~ ., data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5932 -1.0690 0.2902 1.4272 3.3951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.526667 0.374290 9.422 <2e-16 ***
## youtube 0.045765 0.001395 32.809 <2e-16 ***
## facebook 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# Fix the RNG seed so the random train/test split, and every number derived
# from it further down, is the same on each run.
set.seed(123)

# Row indices of the training set; p = 0.8 asks for about 80% of the rows.
# list = FALSE returns the indices as a matrix rather than a list.
index <- createDataPartition(marketing$sales, p = 0.8, list = FALSE)

# Training set: all columns, only the rows selected in `index`
train_data <- marketing[index, ]
# Test set: all columns, every row NOT selected in `index`
test_data <- marketing[-index, ]

# Refit the same linear model on the training rows only
train_model <- lm(sales ~ ., data = train_data)
summary(train_model)
##
## Call:
## lm(formula = sales ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8224 -1.0227 0.3386 1.4442 3.1212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.913680 0.426719 9.172 2.41e-16 ***
## youtube 0.045029 0.001632 27.584 < 2e-16 ***
## facebook 0.185754 0.009496 19.561 < 2e-16 ***
## newspaper -0.003462 0.006989 -0.495 0.621
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.062 on 158 degrees of freedom
## Multiple R-squared: 0.8928, Adjusted R-squared: 0.8907
## F-statistic: 438.5 on 3 and 158 DF, p-value: < 2.2e-16
# Predict sales for the held-out rows with the model fitted on the training set
test_data[["predicted_sales"]] <- predict(train_model, newdata = test_data)

# Per-row prediction error, computed as predicted minus observed sales
test_data[["difference_observed_predicted"]] <-
  test_data[["predicted_sales"]] - test_data[["sales"]]

# Render the test set, including the two new columns, as an interactive table
datatable(test_data)
# Mean absolute error.
#
# x: numeric vector of errors (e.g. predicted minus observed values).
# Returns the sum of the absolute values of `x` divided by its length.
MAE <- function(x) {
  total_abs_error <- sum(abs(x))
  total_abs_error / length(x)
}
# Mean absolute error on the test set, displayed as a one-cell tibble
test_mae <- MAE(test_data$difference_observed_predicted)
test_mae %>% tibble()
## # A tibble: 1 × 1
## .
## <dbl>
## 1 1.37
# Mean absolute percentage error.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predicted values.
# Returns the mean of |actual - predicted| / |actual|, expressed in percent.
MAPE <- function(actual, predicted) {
  relative_error <- (actual - predicted) / actual
  mean(abs(relative_error)) * 100
}
# Mean absolute percentage error on the test set, displayed as a one-cell tibble
test_mape <- MAPE(test_data$sales, test_data$predicted_sales)
test_mape %>% tibble()
## # A tibble: 1 × 1
## .
## <dbl>
## 1 12.1