I found this dataset on Kaggle under the title “Dummy Data HSS”. It is intended for predicting Sales from the TV, Radio, and Social Media advertising budgets together with the type of Influencer used in the promotion.
# Import the advertising data: a comma-separated file with a header row and
# "." as the decimal mark.
mydata <- read.table(
  file = "./Dummy Data HSS.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)
I imported the data into RStudio using the read.table() function. The data was sourced from Indonesia; the file uses the decimal point rather than the decimal comma as its decimal separator, which is why I set dec = ".".
# Preview the first six rows of the imported data.
head(x = mydata, n = 6L)
## TV Radio Social.Media Influencer Sales
## 1 16 6.566231 2.907983 Mega 54.73276
## 2 13 9.237765 2.409567 Mega 46.67790
## 3 41 15.886446 2.913410 Mega 150.17783
## 4 83 30.020028 6.922304 Mega 298.24634
## 5 15 8.437408 1.405998 Micro 56.59418
## 6 29 9.614382 1.027163 Mega 105.88915
This dataset has 4572 observations with 5 variables.
The data is sourced from Monash University, Indonesia. It was uploaded to Kaggle by Assistant Professor Harriman Samuel Saragih.
# Work on a copy so that the raw import in `mydata` stays untouched.
mydata2 <- mydata
# Add a factor version of "Influencer" to categorise promotional programs.
# The labels default to the levels, so they are not spelled out a second time.
mydata2[["InfluencerF"]] <- factor(
  mydata2[["Influencer"]],
  levels = c("Mega", "Macro", "Nano", "Micro")
)
#Below I shall rename several variables for the sake of clarity
library(dplyr)
##
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Give the three budget columns self-explanatory names in a single rename()
# call (new_name = old_name). dplyr is already attached by the library(dplyr)
# call above, so it does not need to be loaded again for every column.
mydata2 <- mydata2 %>%
  rename(
    TV_Budget = TV,
    Radio_Budget = Radio,
    SocialMedia_Budget = Social.Media
  )
#Here I shall remove the observations with missing values
library(rstatix)
##
## 载入程序包:'rstatix'
## The following object is masked from 'package:stats':
##
## filter
# Keep only the rows that have no NA in any column.
mydata2 <- drop_na(mydata2)
# Data frame restricted to promotional programs with Mega influencers,
# followed by a per-column summary of that subset.
mydata2_Mega <- subset(mydata2, InfluencerF == "Mega")
summary(object = mydata2_Mega)
## TV_Budget Radio_Budget SocialMedia_Budget Influencer
## Min. : 10.00 Min. : 0.08417 Min. : 0.000031 Length:1152
## 1st Qu.: 32.00 1st Qu.:10.38416 1st Qu.: 1.594543 Class :character
## Median : 52.00 Median :17.59556 Median : 3.164299 Mode :character
## Mean : 53.49 Mean :18.04841 Mean : 3.370672
## 3rd Qu.: 75.00 3rd Qu.:25.51413 3rd Qu.: 4.826683
## Max. :100.00 Max. :48.87116 Max. :13.083957
## Sales InfluencerF
## Min. : 31.4 Mega :1152
## 1st Qu.:113.3 Macro: 0
## Median :184.1 Nano : 0
## Mean :190.4 Micro: 0
## 3rd Qu.:265.8
## Max. :364.1
# Data frame restricted to promotional programs with Micro influencers,
# followed by a per-column summary of that subset.
mydata2_Micro <- subset(mydata2, InfluencerF == "Micro")
summary(object = mydata2_Micro)
## TV_Budget Radio_Budget SocialMedia_Budget Influencer
## Min. : 10.00 Min. : 0.01449 Min. : 0.003177 Length:1148
## 1st Qu.: 31.00 1st Qu.:10.49782 1st Qu.: 1.478449 Class :character
## Median : 53.00 Median :17.94893 Median : 2.939348 Mode :character
## Mean : 53.84 Mean :18.19743 Mean : 3.270513
## 3rd Qu.: 77.00 3rd Qu.:25.48128 3rd Qu.: 4.759552
## Max. :100.00 Max. :47.11629 Max. :12.108017
## Sales InfluencerF
## Min. : 33.72 Mega : 0
## 1st Qu.:110.26 Macro: 0
## Median :187.84 Nano : 0
## Mean :191.58 Micro:1148
## 3rd Qu.:273.77
## Max. :362.04
# Per-column summary of the full cleaned data set (all influencer categories).
summary(object = mydata2)
## TV_Budget Radio_Budget SocialMedia_Budget Influencer
## Min. : 10.00 Min. : 0.00068 Min. : 0.000031 Length:4546
## 1st Qu.: 32.00 1st Qu.:10.55536 1st Qu.: 1.530822 Class :character
## Median : 53.00 Median :17.85951 Median : 3.055565 Mode :character
## Mean : 54.06 Mean :18.15753 Mean : 3.323473
## 3rd Qu.: 77.00 3rd Qu.:25.64060 3rd Qu.: 4.804919
## Max. :100.00 Max. :48.87116 Max. :13.981662
## Sales InfluencerF
## Min. : 31.2 Mega :1152
## 1st Qu.:112.4 Macro:1112
## Median :189.0 Nano :1134
## Mean :192.4 Micro:1148
## 3rd Qu.:272.3
## Max. :364.1
# Descriptive statistics of "Sales", computed separately for each level of
# "InfluencerF".
library(psych)
describeBy(x = mydata2[["Sales"]], group = mydata2[["InfluencerF"]])
##
## Descriptive statistics by group
## group: Mega
## vars n mean sd median trimmed mad min max range skew
## X1 1 1152 190.41 92.31 184.07 189.14 114.07 31.4 364.08 332.68 0.1
## kurtosis se
## X1 -1.14 2.72
## ------------------------------------------------------------
## group: Macro
## vars n mean sd median trimmed mad min max range skew
## X1 1 1112 196.07 92.39 194.46 195.76 120.25 32.57 358.42 325.85 0.03
## kurtosis se
## X1 -1.2 2.77
## ------------------------------------------------------------
## group: Nano
## vars n mean sd median trimmed mad min max range skew
## X1 1 1134 191.71 93.27 189.66 190.7 118.43 31.2 357.12 325.93 0.07
## kurtosis se
## X1 -1.21 2.77
## ------------------------------------------------------------
## group: Micro
## vars n mean sd median trimmed mad min max range skew
## X1 1 1148 191.58 94.11 187.84 190.61 121.61 33.72 362.04 328.32 0.08
## kurtosis se
## X1 -1.2 2.78
Promotional programs with Mega Influencers:
Promotional programs with Micro Influencers:
library(ggplot2)
##
## 载入程序包:'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Box plots of Sales (x-axis), one box per influencer category (y-axis).
ggplot(data = mydata2, mapping = aes(x = Sales, y = InfluencerF)) +
  geom_boxplot()
# Bar chart of the number of observations in each influencer category.
# ggplot2 is already attached by the earlier library(ggplot2) call, so it is
# not loaded again here.
ggplot(data = mydata2, mapping = aes(x = InfluencerF)) +
  geom_bar(colour = "black", fill = "skyblue")
# Scatter plot of Sales against the TV budget.
# ggplot2 is already attached by the earlier library(ggplot2) call, so it is
# not loaded again here.
ggplot(data = mydata2, mapping = aes(x = TV_Budget, y = Sales)) +
  geom_point(colour = "forestgreen")