Loading, setting up

library(tidyverse) # analysis and processing
library(DBI) # database interface
library(scales) # plots
library(quanteda) # text analysis

cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))

Total number of posts

total_unique_posts <- tbl(cn, "posts") %>%
  tally()

URL is a unique identifier

tbl(cn, "posts") %>% 
  count(url) %>%
  tally()
## # Source:   lazy query [?? x 1]
## # Database: sqlite 3.33.0
## #   [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
##          n
##      <int>
## 1 17880239

Types of post

tbl(cn, "posts") %>% 
  count(type) %>% 
  arrange(desc(n)) %>% 
  knitr::kable()
type n
Photo 9339271
Link 3708186
Status 3495985
Native Video 885847
YouTube 315467
Live Video Complete 156938
Video 88188
Live Video Scheduled 4785
Vine 304
Album 115
Live Video 77

Accounts

Total number of accounts

tbl(cn, "posts") %>% 
  count(facebook_id) %>% 
  tally()
## # Source:   lazy query [?? x 1]
## # Database: sqlite 3.33.0
## #   [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
##       n
##   <int>
## 1 14187

Distribution of the number of posts by accounts

tbl(cn, "posts") %>% count(facebook_id) %>% 
  ggplot(aes(x = n)) + 
  geom_histogram(bins = 1000)

Time series

by_day <- tbl(cn, "posts") %>% 
  count(created_rounded_to_day) %>% 
  collect() %>% 
  mutate(day = as.POSIXct(created_rounded_to_day, origin = "1970-01-01")) %>% 
  filter(day >= lubridate::ymd("2005-01-01"))

Month of year - within the year

by_day %>% 
  mutate(yday = lubridate::yday(day),
         month = lubridate::month(day),
         year = lubridate::year(day)) %>% 
  filter(year >= 2010) %>% 
  mutate(year = as.factor(year)) %>% 
  group_by(year, month) %>% 
  summarize(sum_n = sum(n)) %>% 
  ggplot(aes(x = month, y = sum_n, color = year, group = year)) +
  geom_point() +
  geom_line() +
  hrbrthemes::theme_ipsum() +
  scale_color_brewer(type = "div") +
  scale_y_continuous(label = comma)

Month of year - between years

by_day %>% 
  ggplot(aes(x = day, y = n)) + 
  geom_point(alpha = .25) +
  geom_line() +
  theme_bw() +
  scale_y_continuous(label = comma) +
  geom_smooth(se = FALSE, color = "blue")

Day of week

tbl(cn, "posts") %>%  
  count(day_of_week) %>% 
  collect() %>% 
  mutate(day_of_week_string = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) %>% 
  mutate(day_of_week_string = fct_inorder(day_of_week_string)) %>% 
  ggplot(aes(x = day_of_week_string, y = n)) +
  geom_col() +
  theme_bw() + 
  scale_y_continuous(label = comma)

Hour

tbl(cn, "posts") %>%  
  count(hour) %>% 
  collect() %>% 
  mutate(hour = lubridate::make_datetime(hour = hour)) %>% 
  ggplot(aes(x = hour, y = n)) +
  geom_col() +
  hrbrthemes::theme_ipsum() +
  scale_x_datetime(date_labels = "%H:%M") +
  scale_y_continuous(label = comma)

Interactions

Total interactions per post (overall)

tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect() %>% 
  gather(key, val) %>% 
  summarize(mean_total_interactions_per_post = sum(val),
            sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
##   mean_total_interactions_per_post sd_total_interactions_per_post
##                              <dbl>                          <dbl>
## 1                            1039.                           108.

Mean total interactions per post (overall)

tbl(cn, "posts") %>% 
  select(likes:care, year) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect() %>% 
  select(-year_mean, -year_sd) %>% 
  gather(key, val) %>% 
  filter(!str_detect(key, "sd")) %>%
  summarize(mean_total_interactions_per_post = sum(val),
            sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
##   mean_total_interactions_per_post sd_total_interactions_per_post
##                              <dbl>                          <dbl>
## 1                             37.7                           7.60

Another way of presenting interactions

probably preferable to and redundant with the others for the purposes of exploring the data, at least the others that are not by time

descriptives_of_interactions <- tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(sum, mean, sd)) %>% 
  collect()
table_of_descriptives <- descriptives_of_interactions %>% 
  gather(key, val) %>% 
  separate(key, into = c("var", "stat")) %>% 
  spread(stat, val) %>% 
  arrange(desc(sum))

table_of_descriptives %>% 
  knitr::kable()
var mean sd sum
likes 23.5632329 367.946806 424024217
shares 6.1171739 304.766830 110079541
love 4.2413979 135.737827 76324647
comments 3.0795693 89.624197 55417351
haha 0.2073269 36.235316 3730882
sad 0.2015831 35.427633 3627520
wow 0.1729944 6.811638 3113063
care 0.0695198 11.250849 1251020
angry 0.0532505 12.999496 958251
table_of_descriptives %>% 
  summarize_if(is.numeric, sum)
## # A tibble: 1 x 3
##    mean    sd       sum
##   <dbl> <dbl>     <dbl>
## 1  37.7 1001. 678526492

Mean interactions by type per post (overall)

interactions <- tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect()

interactions %>% 
  select(contains("mean")) %>% 
  gather(key, val) %>% 
  ggplot(aes(x = reorder(key,val), y = val)) + 
  geom_col() + 
  coord_flip() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Mean interactions by type per post by Month (2010-September, 2020)

interactions_by_month <- tbl(cn, "posts") %>% 
  group_by(year, month) %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect()

interactions_by_month %>% 
  select(contains("mean"), year, month) %>%
  mutate(date = lubridate::make_date(year = year, month = month)) %>% 
  ungroup() %>% 
  select(-year, -month) %>% 
  gather(key, val, -date) %>% 
  filter(date >= lubridate::ymd("2010-01-01")) %>% 
  filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>% 
  ggplot(aes(x = date, y= val, group = key, color = key)) +
  geom_point(alpha = .2) +
  geom_line() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Mean interactions by type by month (2018-September, 2020)

interactions_by_month %>% 
  select(contains("mean"), year, month) %>%
  mutate(date = lubridate::make_date(year = year, month = month)) %>% 
  ungroup() %>% 
  select(-year, -month) %>% 
  gather(key, val, -date) %>% 
  filter(date >= lubridate::ymd("2018-01-01")) %>% 
  filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>% 
  ggplot(aes(x = date, y= val, group = key, color = key)) +
  geom_point(alpha = .2) +
  geom_line() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()