Loading the data
library(tidyverse)
# Load the raw post data, keep only the columns used in this analysis, and
# shorten the `number*` column names while selecting them.
posts <- read_csv("ilp_dataset_1.csv") %>%
  select(
    alias,
    posts = numberPosts,
    followers = numberFollowers,
    date,
    url,
    description,
    tags,
    multipleImage,
    likes = numberLikes
  ) %>%
  # Account aliases and post URLs are identifiers, so store them as factors.
  mutate(
    alias = as.factor(alias),
    url = as.factor(url)
  )

# Overview of every column's type and range.
summary(posts)
## alias posts followers
## _foodstories_ : 17 Min. : 15 Min. : 124965
## _hollyt : 17 1st Qu.: 787 1st Qu.: 198039
## _ingo_1 : 17 Median : 1481 Median : 393843
## _mariannejacobsen_: 17 Mean : 2316 Mean : 997829
## _picolo : 17 3rd Qu.: 2932 3rd Qu.: 935310
## _tinamaria : 17 Max. :27671 Max. :22130730
## (Other) :16437
## date
## Min. :1970-01-01 00:00:00.00
## 1st Qu.:2017-04-10 05:00:00.00
## Median :2017-04-21 05:00:00.00
## Mean :2017-04-04 08:50:26.45
## 3rd Qu.:2017-04-27 05:00:00.00
## Max. :2017-05-01 05:00:00.00
##
## url
## https://www.instagram.com/p/_umyXcP4B7/?taken-by=aaronsanimals: 1
## https://www.instagram.com/p/-CV8BmlCbJ/?taken-by=ibrahimgi : 1
## https://www.instagram.com/p/-CWE8YFCbd/?taken-by=ibrahimgi : 1
## https://www.instagram.com/p/-CWZyAFCcF/?taken-by=ibrahimgi : 1
## https://www.instagram.com/p/-D6VgNFCZQ/?taken-by=ibrahimgi : 1
## https://www.instagram.com/p/-D6YO8FCZX/?taken-by=ibrahimgi : 1
## (Other) :16533
## description tags multipleImage likes
## Length:16539 Length:16539 Mode :logical Min. : 0
## Class :character Class :character FALSE:16057 1st Qu.: 3232
## Mode :character Mode :character TRUE :482 Median : 7351
## Mean : 24414
## 3rd Qu.: 18357
## Max. :1115123
##
Distribution of stats
# Reshape to long form: each original row becomes three rows, one per count
# column, with the column name in `stat_name` and its value in `stat_value`.
posts_long <- pivot_longer(
  posts,
  cols = c(followers, likes, posts),
  names_to = "stat_name",
  values_to = "stat_value"
)
# Histogram of each statistic on a log10 x axis, one stacked panel per
# statistic.
posts_long %>%
  # log10(0) is -Inf, so zero counts cannot be drawn on a log axis. Drop them
  # explicitly instead of relying on ggplot2 to discard them with a warning.
  filter(stat_value > 0) %>%
  ggplot(aes(x = stat_value)) +
  # 30 bins is the ggplot2 default; stating it silences the "pick better
  # value" message without changing the binning.
  geom_histogram(bins = 30) +
  scale_x_log10() +
  # facet_grid() shares the x axis among panels in the same column, so
  # scales = "free_x" has no effect when every panel is stacked in a single
  # column. facet_wrap() gives each panel its own x range.
  facet_wrap(~stat_name, ncol = 1, scales = "free_x")

Stats over time
library(lubridate)
# Derive calendar features (month, weekday, hour) from the post timestamp for
# the time-based plots below.
# NOTE(review): hour() reads the hour in whatever time zone `date` was parsed
# with -- confirm that this matches the intended local time.
posts_long <- mutate(
  posts_long,
  month = lubridate::month(date, label = TRUE),
  weekday = lubridate::wday(date, label = TRUE),
  # Stored as a factor so it can be used as a discrete fill / facet variable.
  hour = factor(lubridate::hour(date))
)
By hour of the day
# Distribution of likes per post on a log10 x axis, one panel (and fill
# colour) per hour of the day.
ggplot(
  data = dplyr::filter(posts_long, stat_name == "likes"),
  mapping = aes(x = stat_value, fill = hour)
) +
  scale_x_log10() +
  geom_histogram() +
  facet_grid(rows = vars(hour)) +
  labs(title = "Engagement by hour of the day", x = "No. of likes")

By day of the week
# Distribution of likes per post on a log10 x axis, one panel (and fill
# colour) per day of the week.
ggplot(
  data = dplyr::filter(posts_long, stat_name == "likes"),
  mapping = aes(x = stat_value, fill = weekday)
) +
  scale_x_log10() +
  geom_histogram() +
  facet_grid(rows = vars(weekday)) +
  labs(title = "Engagement by day of the week", x = "No. of likes")

Engagement types
library(scatterPlotMatrix)
# Interactive scatter plot matrix of followers against likes.
posts %>%
  filter(!is.na(likes)) %>%
  # De-duplicate with `url` included so that two different posts that happen
  # to share the same counts are not collapsed into a single row.
  distinct(url, followers, likes) %>%
  # Only the numeric columns are passed on to the plot.
  select(followers, likes) %>%
  scatterPlotMatrix(controlWidgets = TRUE)
Engagement by user (alias)
# Total likes per account: de-duplicate rows, drop posts without a like
# count, then sum likes within each alias.
by_user <- posts %>%
  distinct() %>%
  filter(!is.na(likes)) %>%
  group_by(alias) %>%
  summarise(total_likes = sum(likes)) %>%
  ungroup()
# How total likes are distributed across accounts (log10 x axis).
ggplot(by_user, aes(x = total_likes)) +
  scale_x_log10() +
  geom_histogram() +
  labs(x = "total likes per user", y = "number of users")

Likes vs followers
# Follower count against total likes, one point per account, on log-log axes.
posts %>%
  distinct() %>%
  # Drop posts without a like count, matching how `by_user` is built; a single
  # NA would otherwise turn that account's whole total into NA and silently
  # remove the account from the plot.
  filter(!is.na(likes)) %>%
  group_by(alias) %>%
  summarise(
    # Averaged so each account yields one follower value even if the recorded
    # count differs between its rows.
    followers = mean(followers, na.rm = TRUE),
    total_likes = sum(likes)
  ) %>%
  ungroup() %>%
  ggplot(aes(x = followers, y = total_likes)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "followers", y = "total likes per user")
