Install and load the datarium package

if (!requireNamespace("datarium", quietly = TRUE)) {
  install.packages("datarium")
}
library(datarium)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

# Bring the `marketing` data set into the session and preview its first rows
data("marketing")
head(marketing, n = 6L)

##   youtube facebook newspaper sales
## 1  276.12    45.36     83.04 26.52
## 2   53.40    47.16     54.12 12.48
## 3   20.64    55.08     83.16 11.16
## 4  181.80    49.56     70.20 22.20
## 5  216.96    12.96     70.08 15.48
## 6   10.44    58.68     90.00  8.64

# Show the same data as a tibble (dimensions and column types in the header)
as_tibble(marketing)

## # A tibble: 200 × 4
##    youtube facebook newspaper sales
##      <dbl>    <dbl>     <dbl> <dbl>
##  1   276.     45.4       83.0 26.5 
##  2    53.4    47.2       54.1 12.5 
##  3    20.6    55.1       83.2 11.2 
##  4   182.     49.6       70.2 22.2 
##  5   217.     13.0       70.1 15.5 
##  6    10.4    58.7       90    8.64
##  7    69      39.4       28.2 14.2 
##  8   144.     23.5       13.9 15.8 
##  9    10.3     2.52       1.2  5.76
## 10   240.      3.12      25.4 12.7 
## # ℹ 190 more rows

Introduction

This document presents an analysis of marketing data.

Methods

Exploratory data analysis (EDA) will be carried out first, followed by more rigorous methods such as linear regression.

The accuracy of predictions will be measured using the following metrics:

Mean Absolute Percentage Error (MAPE)
Mean Absolute Error (MAE)

EDA

In this section we will inspect the data for:

- Bivariate correlation / multicollinearity
- Linearity between sales and the explanatory variables, using scatterplots
- Outliers in the explanatory variables, using boxplots etc.

Multicollinearity

Green for values less than 100
Violet for values between 100 and 200
Red for values between 200 and 20000
Dark Red for values above 20000
Cyan for any other values outside the specified intervals, e.g. negatives

In this section we will compute the pairwise covariance between all the variables

library(DT)


# Colour rule shared by every advertising column of the covariance table.
# styleInterval() assigns values[i] to cells with value <= cuts[i] and the
# last value to everything above the final cut, so these cuts give:
#   value <= 0             -> cyan    (zero or negative covariance)
#   0   < value <= 100     -> green
#   100 < value <= 200     -> violet
#   200 < value <= 20000   -> red
#   value > 20000          -> darkred
# The top cut is finite (not Inf) so that every colour is reachable and
# negative values are not coloured green.
covariance_colours <- styleInterval(
  c(0, 100, 200, 20000),
  c("cyan", "green", "violet", "red", "darkred")
)

# Covariance matrix of all variables, rendered as an interactive table with
# the three advertising columns colour-coded by magnitude.
cov(marketing) %>%
  datatable() %>%
  formatStyle(
    c("youtube", "facebook", "newspaper"),
    color = covariance_colours
  )

library(ggplot2)

# Draw one boxplot of advertising spend for a single channel.
#
# data         data frame holding the spend columns
# column       name (string) of the column plotted on the y-axis
# label        y-axis title
# fill_colour  colour used to fill the box
#
# The fill is set as a constant on the geom instead of being mapped and then
# recoloured through scale_fill_manual(); a manual scale attached to the wrong
# aesthetic leaves the box in the default colour and raises
# "No shared levels found between `names(values)` of the manual scale and the
# data's values".
# NOTE(review): the x-axis title "sales" is kept from the original plots even
# though the x-axis carries no sales data -- confirm the intended label.
spend_boxplot <- function(data, column, label, fill_colour) {
  ggplot(data = data, aes(x = "", y = .data[[column]])) +
    geom_boxplot(fill = fill_colour, show.legend = FALSE) +
    labs(x = "sales", y = label)
}

# Plot for Facebook
plot_facebook <- spend_boxplot(marketing, "facebook", "Facebook", "yellow")

# Plot for Youtube
plot_youtube <- spend_boxplot(marketing, "youtube", "Youtube", "green")

# Plot for Newspaper
plot_newspaper <- spend_boxplot(marketing, "newspaper", "Newspaper", "cyan")


# Combine plots using grid.arrange or cowplot
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Place the three boxplots side by side in a single row
grid.arrange(plot_facebook, plot_youtube, plot_newspaper, nrow = 1)

Conduct supervised learning

# Fit a linear regression of sales on every other column of the full data set
model <- lm(sales ~ ., data = marketing)
summary(model)

## 
## Call:
## lm(formula = sales ~ ., data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5932  -1.0690   0.2902   1.4272   3.3951 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.526667   0.374290   9.422   <2e-16 ***
## youtube      0.045765   0.001395  32.809   <2e-16 ***
## facebook     0.188530   0.008611  21.893   <2e-16 ***
## newspaper   -0.001037   0.005871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

Splitting data into Train and Testing using caret

# Fix the RNG seed so the random partition (and every result derived from it)
# is identical on each run.
set.seed(123)

# Row indices of the training set: 80% of the rows, sampled on `sales`
index <- createDataPartition(marketing$sales, p = 0.8, list = FALSE)

# Training set: all columns, only the rows selected by `index`
train_data <- marketing[index, ]
# Test set: all columns, only the rows NOT selected by `index`
test_data <- marketing[-index, ]

Training the model

# Fit the same linear regression, this time on the training rows only
train_model <- lm(sales ~ ., data = train_data)
summary(train_model)

## 
## Call:
## lm(formula = sales ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8224  -1.0227   0.3386   1.4442   3.1212 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.913680   0.426719   9.172 2.41e-16 ***
## youtube      0.045029   0.001632  27.584  < 2e-16 ***
## facebook     0.185754   0.009496  19.561  < 2e-16 ***
## newspaper   -0.003462   0.006989  -0.495    0.621    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.062 on 158 degrees of freedom
## Multiple R-squared:  0.8928, Adjusted R-squared:  0.8907 
## F-statistic: 438.5 on 3 and 158 DF,  p-value: < 2.2e-16

Conduct prediction

# Score the held-out rows with the model fitted on the training set
test_data[["predicted_sales"]] <- predict(train_model, newdata = test_data)

# Prediction error per row, computed as predicted minus observed sales
test_data[["difference_observed_predicted"]] <-
  with(test_data, predicted_sales - sales)

# Show the test set, including the two new columns, as an interactive table
datatable(test_data)

# Mean absolute error.
#
# x  numeric vector of prediction errors (differences between predicted and
#    observed values)
#
# Returns the sum of the absolute errors divided by the number of errors.
MAE <- function(x) {
  total_abs_error <- sum(abs(x))
  total_abs_error / length(x)
}
# Mean absolute error of the test-set predictions, shown as a one-cell tibble
test_data$difference_observed_predicted %>%
  MAE() %>%
  tibble()

## # A tibble: 1 × 1
##       .
##   <dbl>
## 1  1.37

# Mean absolute percentage error.
#
# actual     numeric vector of observed values (used as the denominator)
# predicted  numeric vector of predicted values
#
# Returns the mean of |actual - predicted| / |actual|, expressed in percent.
MAPE <- function(actual, predicted) {
  relative_error <- (actual - predicted) / actual
  100 * mean(abs(relative_error))
}

# MAPE of the test-set predictions (in percent), shown as a one-cell tibble
with(test_data, MAPE(sales, predicted_sales)) %>%
  tibble()

## # A tibble: 1 × 1
##       .
##   <dbl>
## 1  12.1

Marketing data analysis

Ngala H.

2024-03-06