library(tidyverse)
library(DBI)
library(scales)
cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))
URL is a unique identifier
tbl(cn, "posts") %>%
count(url) %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: sqlite 3.30.1
## # [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
## n
## <int>
## 1 17131373
Join un url
from the db
tbl(cn, "posts") %>%
select(page_name, url) %>%
head(20) %>%
knitr::kable()
Join on url
from all-institutional-facebook-urls.csv
:
all_institutional_facebook_urls <- read_csv("data-raw/all-institutional-facebook-urls.csv")
all_institutional_facebook_urls %>%
head(20) %>%
knitr::kable()
tbl(cn, "posts") %>%
count(facebook_id) %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: sqlite 3.30.1
## # [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
## n
## <int>
## 1 14178
tbl(cn, "posts") %>% count(facebook_id) %>%
ggplot(aes(x = n)) +
geom_histogram(bins = 1000)
by_day <- tbl(cn, "posts") %>%
count(created_rounded_to_day) %>%
collect() %>%
mutate(day = as.POSIXct(created_rounded_to_day, origin = "1970-01-01")) %>%
filter(day >= lubridate::ymd("2005-01-01"))
by_day %>%
summarize(sum_n = sum(n))
## # A tibble: 1 x 1
## sum_n
## <int>
## 1 17246268
by_day %>%
mutate(yday = lubridate::yday(day),
month = lubridate::month(day),
year = lubridate::year(day)) %>%
filter(year >= 2010) %>%
mutate(year = as.factor(year)) %>%
group_by(year, month) %>%
summarize(sum_n = sum(n)) %>%
ggplot(aes(x = month, y = sum_n, color = year, group = year)) +
geom_point() +
geom_line() +
hrbrthemes::theme_ipsum() +
scale_color_brewer(type = "div") +
scale_y_continuous(label = comma)
by_day %>%
ggplot(aes(x = day, y = n)) +
geom_point(alpha = .25) +
geom_line() +
theme_bw() +
scale_y_continuous(label = comma) +
geom_smooth(se = FALSE, color = "blue")
tbl(cn, "posts") %>%
count(day_of_week) %>%
collect() %>%
mutate(day_of_week_string = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) %>%
mutate(day_of_week_string = fct_inorder(day_of_week_string)) %>%
ggplot(aes(x = day_of_week_string, y = n)) +
geom_col() +
theme_bw() +
scale_y_continuous(label = comma)
tbl(cn, "posts") %>%
count(hour) %>%
collect() %>%
mutate(hour = lubridate::make_datetime(hour = hour)) %>%
ggplot(aes(x = hour, y = n)) +
geom_col() +
hrbrthemes::theme_ipsum() +
scale_x_datetime(date_labels = "%H:%M") +
scale_y_continuous(label = comma)
tbl(cn, "posts") %>%
select(likes:care) %>%
summarize_all(funs(mean, sd)) %>%
collect() %>%
gather(key, val) %>%
summarize(mean_total_interactions_per_post = sum(val),
sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
## mean_total_interactions_per_post sd_total_interactions_per_post
## <dbl> <dbl>
## 1 1035. 108.
tbl(cn, "posts") %>%
select(likes:care, year) %>%
summarize_all(funs(mean, sd)) %>%
collect() %>%
select(-year_mean, -year_sd) %>%
gather(key, val) %>%
filter(!str_detect(key, "sd")) %>%
summarize(mean_total_interactions_per_post = sum(val),
sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
## mean_total_interactions_per_post sd_total_interactions_per_post
## <dbl> <dbl>
## 1 37.6 7.61
probably preferable to and redundant with the others for the purposes of exploring the data, at least the others that are not by time
descriptives_of_interactions <- tbl(cn, "posts") %>%
select(likes:care) %>%
summarize_all(funs(sum, mean, sd)) %>%
collect()
table_of_descriptives <- descriptives_of_interactions %>%
gather(key, val) %>%
separate(key, into = c("var", "stat")) %>%
spread(stat, val) %>%
arrange(desc(sum))
table_of_descriptives %>%
knitr::kable()
var | mean | sd | sum |
---|---|---|---|
likes | 23.5881150 | 361.857310 | 406807637 |
shares | 6.1378886 | 310.722884 | 105855849 |
love | 4.1529660 | 134.946896 | 71623285 |
comments | 3.0359537 | 88.960075 | 52358959 |
haha | 0.2044435 | 36.358538 | 3525893 |
sad | 0.1963104 | 36.106653 | 3385628 |
wow | 0.1734847 | 6.798965 | 2991968 |
care | 0.0525257 | 10.144989 | 905873 |
angry | 0.0504053 | 11.941445 | 869305 |
table_of_descriptives %>%
summarize_if(is.numeric, sum)
## # A tibble: 1 x 3
## mean sd sum
## <dbl> <dbl> <dbl>
## 1 37.6 998. 648324397
interactions <- tbl(cn, "posts") %>%
select(likes:care) %>%
summarize_all(funs(mean, sd)) %>%
collect()
interactions %>%
select(contains("mean")) %>%
gather(key, val) %>%
ggplot(aes(x = reorder(key,val), y = val)) +
geom_col() +
coord_flip() +
geom_smooth(se = FALSE) +
hrbrthemes::theme_ipsum()
interactions_by_month <- tbl(cn, "posts") %>%
group_by(year, month) %>%
select(likes:care) %>%
summarize_all(funs(mean, sd)) %>%
collect()
interactions_by_month %>%
select(contains("mean"), year, month) %>%
mutate(date = lubridate::make_date(year = year, month = month)) %>%
ungroup() %>%
select(-year, -month) %>%
gather(key, val, -date) %>%
filter(date >= lubridate::ymd("2010-01-01")) %>%
filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>%
ggplot(aes(x = date, y= val, group = key, color = key)) +
geom_point(alpha = .2) +
geom_line() +
geom_smooth(se = FALSE) +
hrbrthemes::theme_ipsum()
interactions_by_month %>%
select(contains("mean"), year, month) %>%
mutate(date = lubridate::make_date(year = year, month = month)) %>%
ungroup() %>%
select(-year, -month) %>%
gather(key, val, -date) %>%
filter(date >= lubridate::ymd("2018-01-01")) %>%
filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>%
ggplot(aes(x = date, y= val, group = key, color = key)) +
geom_point(alpha = .2) +
geom_line() +
geom_smooth(se = FALSE) +
hrbrthemes::theme_ipsum()
Starting with a random sample of 1,000,000 messages
text <- tbl(cn, "posts") %>%
select(message, year) %>%
collect() %>%
sample_n(1000000) %>%
mutate(learning = str_detect(message, "(?i)learning"),
technology = str_detect(message, "(?i)technology"),
student = str_detect(message, "(?i)student"),
education = str_detect(message, "(?i)education"),
teaching = str_detect(message, "(?i)teaching"),
online = str_detect(message, "(?i)online"),
teacher = str_detect(message, "(?i)teacher"),
covid = str_detect(message, "(?i)covid")) %>%
filter(!is.na(message))
text_to_plot <- text %>%
select(year, learning:covid) %>%
gather(key, val, -year) %>%
group_by(key, year) %>%
summarize(mean_val = mean(val, na.rm = TRUE))
text_to_plot %>%
filter(year >= 2010) %>%
ggplot(aes(x = year, y = mean_val, color = key, group = key)) +
geom_point() +
geom_line() +
scale_color_brewer(type = "qual") +
hrbrthemes::theme_ipsum() +
scale_x_continuous(breaks = 2010:2020) +
ylab("proportion")
text <- tbl(cn, "posts") %>%
filter(year == 2020) %>%
select(message, month) %>%
collect() %>%
mutate(learning = str_detect(message, "(?i)learning"),
technology = str_detect(message, "(?i)technology"),
student = str_detect(message, "(?i)student"),
education = str_detect(message, "(?i)education"),
remote = str_detect(message, "(?i)remote"),
virtual = str_detect(message, "(?i)virtual"),
remote = str_detect(message, "(?i)remote"),
teacher = str_detect(message, "(?i)teacher"),
meal = str_detect(message, "(?i)meal"),
covid = str_detect(message, "(?i)covid")) %>%
filter(!is.na(message))
text_to_plot <- text %>%
select(month, learning:covid) %>%
gather(key, val, -month) %>%
group_by(key, month) %>%
summarize(mean_val = mean(val, na.rm = TRUE))
text_to_plot %>%
ggplot(aes(x = month, y = mean_val, color = key, group = key)) +
geom_point() +
geom_line() +
scale_color_brewer(type = "qual") +
hrbrthemes::theme_ipsum() +
scale_x_continuous(breaks = 1:9) +
ylab("proportion")
Only for ST math, a digital tool (of the kind that may be used more during the pandemic); what other tools can we examine? some ideas here: https://www.educationnext.org/google-search-data-reveals-about-learning-pandemic/
text <- tbl(cn, "posts") %>%
filter(year == 2020) %>%
select(message, month) %>%
collect() %>%
mutate(st_math= str_detect(message, "(?i)st math")) %>%
filter(!is.na(message))
text_to_plot <- text %>%
select(month, st_math) %>%
gather(key, val, -month) %>%
group_by(key, month) %>%
summarize(mean_val = sum(val, na.rm = TRUE))
text_to_plot %>%
ggplot(aes(x = month, y = mean_val, color = key, group = key)) +
geom_point() +
geom_line() +
scale_color_brewer(type = "qual") +
hrbrthemes::theme_ipsum() +
scale_x_continuous(breaks = 1:9) +
ylab("proportion")