I am adding in descriptions so it will be easier to read the output of the model
#cleaning variables
#define weekday
fbook_clean =
fbook %>%
mutate(
#Category
dsc_category =
case_when(
Category == 1 ~ "Action",
Category == 2 ~ "Product",
Category == 3 ~ "Inspiration",
TRUE ~ as.character(Category)
),
dsc_month =
case_when(
Post.Month == 1 ~ "January",
Post.Month == 2 ~ "February",
Post.Month == 3 ~ "March",
Post.Month == 4 ~ "April",
Post.Month == 5 ~ "May",
Post.Month == 6 ~ "June",
Post.Month == 7 ~ "July",
Post.Month == 8 ~ "August",
Post.Month == 9 ~ "September",
Post.Month == 10 ~ "October",
Post.Month == 11 ~ "November",
Post.Month == 12 ~ "December",
TRUE ~ as.character(Post.Month)
),
dsc_weekday =
case_when(
Post.Weekday == 1 ~ "Sunday",
Post.Weekday == 2 ~ "Monday",
Post.Weekday == 3 ~ "Tuesday",
Post.Weekday == 4 ~ "Wednesday",
Post.Weekday == 5 ~ "Thursday",
Post.Weekday == 6 ~ "Friday",
Post.Weekday == 7 ~ "Saturday",
TRUE ~ as.character(Post.Weekday)
)
)
# going to check to make sure the descriptions I created are correct
fbook_clean %>%
dplyr::select(Category, dsc_category) %>%
distinct() %>%
arrange(Category)
## Category dsc_category
## 1 1 Action
## 2 2 Product
## 3 3 Inspiration
fbook_clean %>%
dplyr::select(Post.Month, dsc_month) %>%
distinct() %>% #only the unique pairs selected
arrange(Post.Month)
## Post.Month dsc_month
## 1 1 January
## 2 2 February
## 3 3 March
## 4 4 April
## 5 5 May
## 6 6 June
## 7 7 July
## 8 8 August
## 9 9 September
## 10 10 October
## 11 11 November
## 12 12 December
fbook_clean %>%
dplyr::select(Post.Weekday, dsc_weekday) %>%
distinct() %>%
arrange(Post.Weekday)
## Post.Weekday dsc_weekday
## 1 1 Sunday
## 2 2 Monday
## 3 3 Tuesday
## 4 4 Wednesday
## 5 5 Thursday
## 6 6 Friday
## 7 7 Saturday
Change Type to factor for linear regression
head(fbook)
## Page.total.likes Type Category Post.Month Post.Weekday Post.Hour Paid
## 1 139441 Photo 2 12 4 3 0
## 2 139441 Status 2 12 3 10 0
## 3 139441 Photo 3 12 3 3 0
## 4 139441 Photo 2 12 2 10 1
## 5 139441 Photo 2 12 2 3 0
## 6 139441 Status 2 12 1 9 0
## Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1 2752 5091
## 2 10460 19057
## 3 2413 4373
## 4 50128 87991
## 5 7244 13594
## 6 10472 20849
## Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1 178 109 159
## 2 1457 1361 1674
## 3 177 113 154
## 4 2211 790 1119
## 5 671 410 580
## 6 1191 1073 1389
## Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1 3078
## 2 11710
## 3 2812
## 4 61027
## 5 6228
## 6 16034
## Lifetime.Post.reach.by.people.who.like.your.Page
## 1 1640
## 2 6112
## 3 1503
## 4 32048
## 5 3200
## 6 7852
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1 119 4
## 2 1108 5
## 3 132 0
## 4 1386 58
## 5 396 19
## 6 1016 1
## like share Total.Interactions
## 1 79 17 100
## 2 130 29 164
## 3 66 14 80
## 4 1572 147 1777
## 5 325 49 393
## 6 152 33 186
#using str(fbook_clean) in the terminal to see what needs to be changed and its the features I just made above
fbook_clean$Type = as.factor(fbook_clean$Type)
fbook_clean$dsc_weekday = factor(fbook_clean$dsc_weekday, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
fbook_clean$dsc_month = factor(fbook_clean$dsc_month,
levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))
fbook_clean$dsc_category = as.factor(fbook_clean$dsc_category)
# making post hour and paid factors too
fbook_clean$Post.Hour = as.factor(fbook_clean$Post.Hour)
fbook_clean$Paid = as.factor(fbook_clean$Paid)
str(fbook_clean) #success!
## 'data.frame': 490 obs. of 22 variables:
## $ Page.total.likes : int 139441 139441 139441 139441 139441 139441 139441 139441 139441 139441 ...
## $ Type : Factor w/ 4 levels "Link","Photo",..: 2 3 2 2 2 3 2 2 3 2 ...
## $ Category : int 2 2 3 2 2 2 3 3 2 3 ...
## $ Post.Month : int 12 12 12 12 12 12 12 12 12 12 ...
## $ Post.Weekday : int 4 3 3 2 2 1 1 7 7 6 ...
## $ Post.Hour : Factor w/ 22 levels "1","2","3","4",..: 3 10 3 10 3 9 3 9 3 10 ...
## $ Paid : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 2 1 1 ...
## $ Lifetime.Post.Total.Reach : int 2752 10460 2413 50128 7244 10472 11692 13720 11844 4694 ...
## $ Lifetime.Post.Total.Impressions : int 5091 19057 4373 87991 13594 20849 19479 24137 22538 8668 ...
## $ Lifetime.Engaged.Users : int 178 1457 177 2211 671 1191 481 537 1530 280 ...
## $ Lifetime.Post.Consumers : int 109 1361 113 790 410 1073 265 232 1407 183 ...
## $ Lifetime.Post.Consumptions : int 159 1674 154 1119 580 1389 364 305 1692 250 ...
## $ Lifetime.Post.Impressions.by.people.who.have.liked.your.Page : int 3078 11710 2812 61027 6228 16034 15432 19728 15220 4309 ...
## $ Lifetime.Post.reach.by.people.who.like.your.Page : int 1640 6112 1503 32048 3200 7852 9328 11056 7912 2324 ...
## $ Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post: int 119 1108 132 1386 396 1016 379 422 1250 199 ...
## $ comment : int 4 5 0 58 19 1 3 0 0 3 ...
## $ like : int 79 130 66 1572 325 152 249 325 161 113 ...
## $ share : int 17 29 14 147 49 33 27 14 31 26 ...
## $ Total.Interactions : int 100 164 80 1777 393 186 279 339 192 142 ...
## $ dsc_category : Factor w/ 3 levels "Action","Inspiration",..: 3 3 2 3 3 3 2 2 3 2 ...
## $ dsc_month : Factor w/ 12 levels "January","February",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ dsc_weekday : Factor w/ 7 levels "Sunday","Monday",..: 4 3 3 2 2 1 1 7 7 6 ...
Checking for missing values in the 7 input features
sum(is.na(fbook_clean))
## [1] 6
sapply(fbook_clean, function(x) sum(is.na(x)))
## Page.total.likes
## 0
## Type
## 0
## Category
## 0
## Post.Month
## 0
## Post.Weekday
## 0
## Post.Hour
## 0
## Paid
## 1
## Lifetime.Post.Total.Reach
## 0
## Lifetime.Post.Total.Impressions
## 0
## Lifetime.Engaged.Users
## 0
## Lifetime.Post.Consumers
## 0
## Lifetime.Post.Consumptions
## 0
## Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 0
## Lifetime.Post.reach.by.people.who.like.your.Page
## 0
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post
## 0
## comment
## 0
## like
## 1
## share
## 4
## Total.Interactions
## 0
## dsc_category
## 0
## dsc_month
## 0
## dsc_weekday
## 0
fbook_clean %>%
filter(is.na(share)) #assume this is a NA b/c its 0, no share so plan to change this to 0....
## Page.total.likes Type Category Post.Month Post.Weekday Post.Hour Paid
## 1 136736 Photo 1 10 6 8 0
## 2 136393 Photo 1 10 7 9 0
## 3 136393 Photo 1 10 7 6 0
## 4 135428 Photo 1 9 5 10 0
## Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1 1261 2158
## 2 584 1029
## 3 677 1285
## 4 1060 2004
## Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1 37 37 49
## 2 273 271 308
## 3 251 246 297
## 4 266 251 337
## Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1 1911
## 2 943
## 3 1210
## 4 1705
## Lifetime.Post.reach.by.people.who.like.your.Page
## 1 1077
## 2 511
## 3 615
## 4 870
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1 33 0
## 2 232 0
## 3 211 0
## 4 204 0
## like share Total.Interactions dsc_category dsc_month dsc_weekday
## 1 NA NA 0 Action October Friday
## 2 2 NA 2 Action October Saturday
## 3 7 NA 7 Action October Saturday
## 4 18 NA 18 Action September Thursday
fbook_clean %>%
filter(is.na(like)) #assume this is a NA b/c its 0, no like so plan to change this to 0....
## Page.total.likes Type Category Post.Month Post.Weekday Post.Hour Paid
## 1 136736 Photo 1 10 6 8 0
## Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1 1261 2158
## Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1 37 37 49
## Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1 1911
## Lifetime.Post.reach.by.people.who.like.your.Page
## 1 1077
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1 33 0
## like share Total.Interactions dsc_category dsc_month dsc_weekday
## 1 NA NA 0 Action October Friday
fbook_clean %>%
filter(is.na(Paid)) #we can drop this one... but need to figure out how,, maybe w/ index
## Page.total.likes Type Category Post.Month Post.Weekday Post.Hour Paid
## 1 81370 Photo 2 1 4 4 <NA>
## Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1 4188 7292
## Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1 564 524 743
## Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1 3861
## Lifetime.Post.reach.by.people.who.like.your.Page
## 1 2200
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1 316 0
## like share Total.Interactions dsc_category dsc_month dsc_weekday
## 1 91 28 119 Product January Wednesday
Here I’m going to create a new dataset, fbook_clean2, with 0’s replacing the NAs in share and like, and removing the record that has NA in Paid
fbook_clean2 =
fbook_clean %>%
filter(!is.na(Paid))
fbook_clean2[is.na(fbook_clean2)] = 0
# this is checking that there are no more nulls
sum(is.na(fbook_clean2))
## [1] 0
To start I’m going to create a dataset with only the input variables and the output variable of interest, Total Reach
fbook_clean2_reach =
fbook_clean2 %>%
dplyr::select(Lifetime.Post.Total.Reach, Page.total.likes, Type, dsc_category, dsc_month, dsc_weekday, Post.Hour, Paid)
Since there’s only one continuous variable (Page.total.likes) in the input variables, I am only going to look for outliers in this variable.
Starting by looking visually
fbook_clean2_reach %>%
ggplot(aes(Page.total.likes)) +
geom_boxplot() +
theme_classic()
There doesn’t appear to be any outliers in Page.total.likes based on the boxplot above.
Using a scatterplot to examine the relationship between total reach and page total likes
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = Page.total.likes)) +
geom_point() +
theme_classic()
Interesting to see that in general, posts don’t have a high reach depending on how many total likes a page has.
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = Type, fill = Type)) +
geom_boxplot() +
theme_classic()
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = dsc_category, fill = dsc_category)) +
geom_boxplot() +
theme_classic()
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = dsc_month, fill = dsc_month)) +
geom_boxplot() +
theme_classic()
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = dsc_weekday, fill = dsc_weekday)) +
geom_boxplot() +
theme_classic()
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = Post.Hour, fill= Post.Hour)) +
geom_boxplot() +
theme_classic()
fbook_clean2_reach %>%
ggplot(aes(y = Lifetime.Post.Total.Reach, x = Paid, fill = Paid)) +
geom_boxplot() +
theme_classic()
Type of post shows that “Photo” has the larger spread when it comes to reach and “Video” is likely to have a high reach. The category that the post is classified as does not seem to largely influence its reach but generally speaking Action does better than Product. The time features like month, day, and hour may not have high importance in the reach due to the likely hood of a post being created during waking hours and various shopping season, basically Life.time.post.total.reach covers so much time, the orginal posting time shouldnt influence the post’s reach down the road. Last graph shows whether the post has been paid for or not, and shows that paid post do in fact have a larger reach.
After exploring total reach, it appears that total reach has an interation with whether its a paid post or not. Besides this, there was not a whole lot to futher explore therefore I am going to look at Engaged Users below.
To start, I’m going to create a dataset with only the input variables and the output variable of interest, Total Reach
fbook_clean2_engaged=
fbook_clean2 %>%
dplyr::select(Lifetime.Engaged.Users, Page.total.likes, Type, dsc_category, dsc_month, dsc_weekday, Post.Hour, Paid)
Since there’s only one continuous variable (Page.total.likes) in the input variables, I am only going to look for outliers in this variable.
Starting by looking visually
fbook_clean2_engaged %>%
ggplot(aes(Lifetime.Engaged.Users)) +
geom_boxplot() +
theme_classic()
The above boxplot appears to be right skewed
Using a scatterplot to examine the relationship between engaged users and page total likes
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = Page.total.likes)) +
geom_point() +
theme_classic()
Interesting to see that in general, posts don’t have a high reach depending on how many total likes a page has.
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = Type, fill = Type)) +
geom_boxplot() +
theme_classic()
Engaged Users and Type of post show that Status has the greatest engaged users while link is the least.
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = dsc_category, fill = dsc_category)) +
geom_boxplot() +
theme_classic()
Engeged Users and Category of post show that product does well. Interesting because product and lifetime.post.reach did not show a strong interation suggesting that reach may not be a great metirc for investment if the goal is to increase engagement.
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = dsc_month, fill = dsc_month)) +
geom_boxplot() + theme_classic()
Engagment by month show February as quite the standout month with overall high engagment.
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = dsc_weekday, fill = dsc_weekday)) +
geom_boxplot() +
theme_classic()
Tuesday and Thursday is the best day to post if engaged users is your goal however the lifetime engament seem to be heavily dependent on the day of the week. This could be due to more people working and living non tradional schedules even at the time of this data collection.
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = Post.Hour, fill = Post.Hour)) +
geom_boxplot() +
theme_classic()
Post with good engagment is seen during the early AM, mid day, and late afternoon. Late night has very little spread in the data.
fbook_clean2_engaged %>%
ggplot(aes(y = Lifetime.Engaged.Users, x = Paid, fill = Paid)) +
geom_boxplot() +
theme_classic()
Paying to promote a post does not seem to influence whether people will engage with the post or not. Interesting finding due to previous exploration that showed a paid post having greater reach. This suggest that if the goal is to reach people, pay to promote the post while on the other hand, if the goal of the social media content is to have engagment (think influencer) then the content quality should be the main focus. Another intersting finding previously was that the category action did best with reach and product was the least. if the post’s goal is to show a product and get reach, the post should be a picture of something pretty rad and action packed with the product possibly as a subject or be in use by the subject.
## install packages if necessary
# install.packages("cluster")
# install.packages("factoextra")
# install.packages("purrr")
# install.packages("NbClust")
library(cluster)
## Warning: package 'cluster' was built under R version 4.0.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(purrr)
## Warning: package 'purrr' was built under R version 4.0.3
library(NbClust)
## Warning: package 'NbClust' was built under R version 4.0.3
fbook_clean2_output_var =
fbook_clean2 %>%
dplyr::select(
Lifetime.Post.Total.Reach,
Lifetime.Post.Total.Impressions,
Lifetime.Engaged.Users,
Lifetime.Post.Consumers,
Lifetime.Post.Consumptions,
Lifetime.Post.Impressions.by.people.who.have.liked.your.Page,
Lifetime.Post.reach.by.people.who.like.your.Page,
Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post,
comment, like, share, Total.Interactions)
set.seed(123)
fviz_nbclust(fbook_clean2_output_var, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
Going with 4 clusters
km_results =
kmeans(fbook_clean2_output_var, 4)
print(km_results)
## K-means clustering with 4 clusters of sizes 80, 386, 21, 2
##
## Cluster means:
## Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1 28044.550 47547.562
## 2 5289.347 8864.459
## 3 60027.429 135393.952
## 4 52576.000 888037.000
## Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1 1690.688 1473.500 2319.200
## 2 630.987 564.544 1069.142
## 3 1788.000 1330.810 2454.667
## 4 3098.500 2533.000 4595.500
## Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1 26516.250
## 2 6320.744
## 3 67814.524
## 4 878222.000
## Lifetime.Post.reach.by.people.who.like.your.Page
## 1 13718.000
## 2 3508.518
## 3 23778.286
## 4 47928.000
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1 1191.5250 13.525000
## 2 435.6684 4.310881
## 3 1109.6667 16.047619
## 4 2471.5000 76.500000
## like share Total.Interactions
## 1 314.5125 40.60000 368.6375
## 2 105.9301 20.16580 130.4067
## 3 570.9048 50.52381 637.4762
## 4 886.5000 98.00000 1061.0000
##
## Clustering vector:
## [1] 2 2 2 3 2 2 2 2 2 2 1 2 2 2 1 2 2 3 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2
## [38] 2 1 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2 3 1 1 2 2 2 1 2 2 3 2 1 1
## [75] 1 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 1 2 2
## [112] 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 1 1 3 2 1 2 2
## [149] 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 1 2 1 2 2
## [186] 2 2 2 2 2 1 2 2 1 2 2 2 2 1 2 2 2 3 2 2 3 2 2 2 1 2 2 2 2 1 2 1 1 2 2 2 2
## [223] 1 2 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 2 2 2 2 2 1 2 1 1 2 1 1 2 2 2
## [260] 1 2 2 1 2 2 2 2 2 1 3 3 2 2 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 3 2 1 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 3 2 2 2 2 1 2 1 2 2 2 2 2 2
## [334] 2 2 1 2 2 2 2 2 2 1 2 2 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 1 2
## [371] 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 3 2 2 2 1 1 3 2 2 2 2 1 2 2 2 2
## [408] 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 1 2
## [445] 2 2 2 2 2 1 2 2 4 2 2 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2
## [482] 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 53391958944 30457556711 89059189719 204264499799
## (between_SS / total_SS = 90.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(km_results, data = fbook_clean2_output_var,
geom = "point",
ellipse.type = "convex",
ggtheme = theme_bw()
)
there is a lot of overlap in this cluster.