library(tidyverse) # analysis and processing
library(DBI) # database interface
library(scales) # plots
library(quanteda) # text analysis
cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))
total_unique_posts <- tbl(cn, "posts") %>%
tally()
URL is a unique identifier
tbl(cn, "posts") %>%
count(url) %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: sqlite 3.33.0
## # [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
## n
## <int>
## 1 17880239
tbl(cn, "posts") %>%
count(type) %>%
arrange(desc(n)) %>%
knitr::kable()
type | n |
---|---|
Photo | 9339271 |
Link | 3708186 |
Status | 3495985 |
Native Video | 885847 |
YouTube | 315467 |
Live Video Complete | 156938 |
Video | 88188 |
Live Video Scheduled | 4785 |
Vine | 304 |
Album | 115 |
Live Video | 77 |
tbl(cn, "posts") %>%
count(facebook_id) %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: sqlite 3.33.0
## # [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
## n
## <int>
## 1 14187
tbl(cn, "posts") %>% count(facebook_id) %>%
ggplot(aes(x = n)) +
geom_histogram(bins = 1000)
by_day <- tbl(cn, "posts") %>%
count(created_rounded_to_day) %>%
collect() %>%
mutate(day = as.POSIXct(created_rounded_to_day, origin = "1970-01-01")) %>%
filter(day >= lubridate::ymd("2005-01-01"))
by_day %>%
mutate(yday = lubridate::yday(day),
month = lubridate::month(day),
year = lubridate::year(day)) %>%
filter(year >= 2010) %>%
mutate(year = as.factor(year)) %>%
group_by(year, month) %>%
summarize(sum_n = sum(n)) %>%
ggplot(aes(x = month, y = sum_n, color = year, group = year)) +
geom_point() +
geom_line() +
hrbrthemes::theme_ipsum() +
scale_color_brewer(type = "div") +
scale_y_continuous(label = comma)
by_day %>%
ggplot(aes(x = day, y = n)) +
geom_point(alpha = .25) +
geom_line() +
theme_bw() +
scale_y_continuous(label = comma) +
geom_smooth(se = FALSE, color = "blue")
tbl(cn, "posts") %>%
count(day_of_week) %>%
collect() %>%
mutate(day_of_week_string = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) %>%
mutate(day_of_week_string = fct_inorder(day_of_week_string)) %>%
ggplot(aes(x = day_of_week_string, y = n)) +
geom_col() +
theme_bw() +
scale_y_continuous(label = comma)
tbl(cn, "posts") %>%
count(hour) %>%
collect() %>%
mutate(hour = lubridate::make_datetime(hour = hour)) %>%
ggplot(aes(x = hour, y = n)) +
geom_col() +
hrbrthemes::theme_ipsum() +
scale_x_datetime(date_labels = "%H:%M") +
scale_y_continuous(label = comma)
tbl(cn, "posts") %>%
select(likes:care) %>%
summarize_all(funs(mean, sd)) %>%
collect() %>%
gather(key, val) %>%
summarize(mean_total_interactions_per_post = sum(val),
sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
## mean_total_interactions_per_post sd_total_interactions_per_post
## <dbl> <dbl>
## 1 1039. 108.
tbl(cn, "posts") %>%
select(likes:care, year) %>%
summarize_all(funs(mean, sd)) %>%
collect() %>%
select(-year_mean, -year_sd) %>%
gather(key, val) %>%
filter(!str_detect(key, "sd")) %>%
summarize(mean_total_interactions_per_post = sum(val),
sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
## mean_total_interactions_per_post sd_total_interactions_per_post
## <dbl> <dbl>
## 1 37.7 7.60
probably preferable to and redundant with the others for the purposes of exploring the data, at least the others that are not by time
descriptives_of_interactions <- tbl(cn, "posts") %>%
select(likes:care) %>%
summarize_all(funs(sum, mean, sd)) %>%
collect()
table_of_descriptives <- descriptives_of_interactions %>%
gather(key, val) %>%
separate(key, into = c("var", "stat")) %>%
spread(stat, val) %>%
arrange(desc(sum))
table_of_descriptives %>%
knitr::kable()
var | mean | sd | sum |
---|---|---|---|
likes | 23.5632329 | 367.946806 | 424024217 |
shares | 6.1171739 | 304.766830 | 110079541 |
love | 4.2413979 | 135.737827 | 76324647 |
comments | 3.0795693 | 89.624197 | 55417351 |
haha | 0.2073269 | 36.235316 | 3730882 |
sad | 0.2015831 | 35.427633 | 3627520 |
wow | 0.1729944 | 6.811638 | 3113063 |
care | 0.0695198 | 11.250849 | 1251020 |
angry | 0.0532505 | 12.999496 | 958251 |
table_of_descriptives %>%
summarize_if(is.numeric, sum)
## # A tibble: 1 x 3
## mean sd sum
## <dbl> <dbl> <dbl>
## 1 37.7 1001. 678526492
interactions <- tbl(cn, "posts") %>%
select(likes:care) %>%
summarize_all(funs(mean, sd)) %>%
collect()
interactions %>%
select(contains("mean")) %>%
gather(key, val) %>%
ggplot(aes(x = reorder(key,val), y = val)) +
geom_col() +
coord_flip() +
geom_smooth(se = FALSE) +
hrbrthemes::theme_ipsum()
interactions_by_month <- tbl(cn, "posts") %>%
group_by(year, month) %>%
select(likes:care) %>%
summarize_all(funs(mean, sd)) %>%
collect()
interactions_by_month %>%
select(contains("mean"), year, month) %>%
mutate(date = lubridate::make_date(year = year, month = month)) %>%
ungroup() %>%
select(-year, -month) %>%
gather(key, val, -date) %>%
filter(date >= lubridate::ymd("2010-01-01")) %>%
filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>%
ggplot(aes(x = date, y= val, group = key, color = key)) +
geom_point(alpha = .2) +
geom_line() +
geom_smooth(se = FALSE) +
hrbrthemes::theme_ipsum()
interactions_by_month %>%
select(contains("mean"), year, month) %>%
mutate(date = lubridate::make_date(year = year, month = month)) %>%
ungroup() %>%
select(-year, -month) %>%
gather(key, val, -date) %>%
filter(date >= lubridate::ymd("2018-01-01")) %>%
filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>%
ggplot(aes(x = date, y= val, group = key, color = key)) +
geom_point(alpha = .2) +
geom_line() +
geom_smooth(se = FALSE) +
hrbrthemes::theme_ipsum()