Loading, setting up

library(tidyverse)
library(DBI)
library(scales)

cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))

URL is a unique identifier

tbl(cn, "posts") %>% 
  count(url) %>% 
  tally()
## # Source:   lazy query [?? x 1]
## # Database: sqlite 3.30.1
## #   [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
##          n
##      <int>
## 1 17131373

What we’ll need to merge on

Join un url from the db

tbl(cn, "posts") %>% 
  select(page_name, url) %>% 
  head(20) %>% 
  knitr::kable()
page_name url
KIPP SoCal Public Schools https://www.facebook.com/KIPPSoCal/posts/10151336401812074
Facing History and Ourselves https://www.facebook.com/FacingHistory/posts/10151194384564740
Facing History and Ourselves https://www.facebook.com/FacingHistory/posts/10151550837359740
Facing History and Ourselves https://www.facebook.com/FacingHistory/posts/10151550841404740
Kalamazoo RESA https://www.facebook.com/kalresa/posts/10150597628823386
Connections Academy https://www.facebook.com/ConnectionsAcademy/posts/73709802093
KIPP New Jersey https://www.facebook.com/KIPPNJ/posts/542265837903
Butler Tech https://www.facebook.com/ButlerTech/posts/521858510614
Butler Tech https://www.facebook.com/ButlerTech/posts/521858470694
Butler Tech https://www.facebook.com/ButlerTech/posts/521858400834
South Carolina Governor’s School for Science and Mathematics https://www.facebook.com/SCGSSM/posts/70158487414
KIPP New Jersey https://www.facebook.com/KIPPNJ/posts/40412043731
KIPP New Jersey https://www.facebook.com/KIPPNJ/posts/541827695943
The Learning Community https://www.facebook.com/learningcommunity/posts/41054255641
KIPP New Jersey https://www.facebook.com/KIPPNJ/posts/541811273853
South Carolina Governor’s School for Science and Mathematics https://www.facebook.com/SCGSSM/posts/40847787156
PA Virtual Charter School https://www.facebook.com/pavirtual/posts/1073286633288
Ozarks Technical Community College https://www.facebook.com/ozarkstech/posts/1064932258496
Ozarks Technical Community College https://www.facebook.com/ozarkstech/posts/1064925778334
The Learning Community https://www.facebook.com/learningcommunity/posts/40059890641

Join on url from all-institutional-facebook-urls.csv:

all_institutional_facebook_urls <- read_csv("data-raw/all-institutional-facebook-urls.csv")

all_institutional_facebook_urls %>% 
  head(20) %>% 
  knitr::kable()
nces_id url parsed_path
0100197 https://wwww.facebook.com/mymaefnetwork mymaefnetwork
0100190 https://wwww.facebook.com/alabastercityschools/ alabastercityschools/
0100005 https://wwww.facebook.com/albertvilleschools/ albertvilleschools/
0100030 https://wwww.facebook.com/alexander-city-schools-211441775676594/ alexander-city-schools-211441775676594/
0100090 https://wwww.facebook.com/acsbulldogs/ acsbulldogs/
0100210 https://wwww.facebook.com/auburncityedu/ auburncityedu/
0100240 https://wwww.facebook.com/autauga-county-school-system-569762506412543/ autauga-county-school-system-569762506412543/
0100330 https://wwww.facebook.com/bcstigers/ bcstigers/
0100390 https://wwww.facebook.com/birmingham-city-school-system-1417462398484288/ birmingham-city-school-system-1417462398484288/
0100510 https://wwww.facebook.com/butlercountyss/ butlercountyss/
0100540 https://wwww.facebook.com/calhouncountyboe calhouncountyboe
0100600 https://wwww.facebook.com/chamberscountyschooldistrict chamberscountyschooldistrict
0100188 https://wwww.facebook.com/chickasaw-city-schools-482235791805003/ chickasaw-city-schools-482235791805003/
0100660 https://wwww.facebook.com/pages/chilton-county-school-district/1418177841735719 pages/chilton-county-school-district/1418177841735719
0100660 https://wwww.facebook.com/chiltoncountyschooldistrict/ chiltoncountyschooldistrict/
0100750 https://wwww.facebook.com/pages/clay-county-board-of-education/926850330736055 pages/clay-county-board-of-education/926850330736055
0100780 https://wwww.facebook.com/cleburneschools/ cleburneschools/
0100840 https://wwww.facebook.com/colbertcountyschools colbertcountyschools
0100870 https://wwww.facebook.com/conecuhcountyal conecuhcountyal
0100900 https://wwww.facebook.com/coosa-county-school-system-1358964240820036/ coosa-county-school-system-1358964240820036/

Accounts

Total number of accounts

tbl(cn, "posts") %>% 
  count(facebook_id) %>% 
  tally()
## # Source:   lazy query [?? x 1]
## # Database: sqlite 3.30.1
## #   [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
##       n
##   <int>
## 1 14178

Distribution of the number of posts by accounts

tbl(cn, "posts") %>% count(facebook_id) %>% 
  ggplot(aes(x = n)) + 
  geom_histogram(bins = 1000)

Time series

by_day <- tbl(cn, "posts") %>% 
  count(created_rounded_to_day) %>% 
  collect() %>% 
  mutate(day = as.POSIXct(created_rounded_to_day, origin = "1970-01-01")) %>% 
  filter(day >= lubridate::ymd("2005-01-01"))

Total number of posts

by_day %>% 
  summarize(sum_n = sum(n))
## # A tibble: 1 x 1
##      sum_n
##      <int>
## 1 17246268

Month of year - within the year

by_day %>% 
  mutate(yday = lubridate::yday(day),
         month = lubridate::month(day),
         year = lubridate::year(day)) %>% 
  filter(year >= 2010) %>% 
  mutate(year = as.factor(year)) %>% 
  group_by(year, month) %>% 
  summarize(sum_n = sum(n)) %>% 
  ggplot(aes(x = month, y = sum_n, color = year, group = year)) +
  geom_point() +
  geom_line() +
  hrbrthemes::theme_ipsum() +
  scale_color_brewer(type = "div") +
  scale_y_continuous(label = comma)

Month of year - between years

by_day %>% 
  ggplot(aes(x = day, y = n)) + 
  geom_point(alpha = .25) +
  geom_line() +
  theme_bw() +
  scale_y_continuous(label = comma) +
  geom_smooth(se = FALSE, color = "blue")

Day of week

tbl(cn, "posts") %>%  
  count(day_of_week) %>% 
  collect() %>% 
  mutate(day_of_week_string = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) %>% 
  mutate(day_of_week_string = fct_inorder(day_of_week_string)) %>% 
  ggplot(aes(x = day_of_week_string, y = n)) +
  geom_col() +
  theme_bw() + 
  scale_y_continuous(label = comma)

Hour

tbl(cn, "posts") %>%  
  count(hour) %>% 
  collect() %>% 
  mutate(hour = lubridate::make_datetime(hour = hour)) %>% 
  ggplot(aes(x = hour, y = n)) +
  geom_col() +
  hrbrthemes::theme_ipsum() +
  scale_x_datetime(date_labels = "%H:%M") +
  scale_y_continuous(label = comma)

Interactions

Total interactions per post (overall)

tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect() %>% 
  gather(key, val) %>% 
  summarize(mean_total_interactions_per_post = sum(val),
            sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
##   mean_total_interactions_per_post sd_total_interactions_per_post
##                              <dbl>                          <dbl>
## 1                            1035.                           108.

Mean total interactions per post (overall)

tbl(cn, "posts") %>% 
  select(likes:care, year) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect() %>% 
  select(-year_mean, -year_sd) %>% 
  gather(key, val) %>% 
  filter(!str_detect(key, "sd")) %>%
  summarize(mean_total_interactions_per_post = sum(val),
            sd_total_interactions_per_post = sd(val))
## # A tibble: 1 x 2
##   mean_total_interactions_per_post sd_total_interactions_per_post
##                              <dbl>                          <dbl>
## 1                             37.6                           7.61

Another way of presenting interactions

probably preferable to and redundant with the others for the purposes of exploring the data, at least the others that are not by time

descriptives_of_interactions <- tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(sum, mean, sd)) %>% 
  collect()
table_of_descriptives <- descriptives_of_interactions %>% 
  gather(key, val) %>% 
  separate(key, into = c("var", "stat")) %>% 
  spread(stat, val) %>% 
  arrange(desc(sum))

table_of_descriptives %>% 
  knitr::kable()
var mean sd sum
likes 23.5881150 361.857310 406807637
shares 6.1378886 310.722884 105855849
love 4.1529660 134.946896 71623285
comments 3.0359537 88.960075 52358959
haha 0.2044435 36.358538 3525893
sad 0.1963104 36.106653 3385628
wow 0.1734847 6.798965 2991968
care 0.0525257 10.144989 905873
angry 0.0504053 11.941445 869305
table_of_descriptives %>% 
  summarize_if(is.numeric, sum)
## # A tibble: 1 x 3
##    mean    sd       sum
##   <dbl> <dbl>     <dbl>
## 1  37.6  998. 648324397

Mean interactions by type per post (overall)

interactions <- tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect()

interactions %>% 
  select(contains("mean")) %>% 
  gather(key, val) %>% 
  ggplot(aes(x = reorder(key,val), y = val)) + 
  geom_col() + 
  coord_flip() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Mean interactions by type per post by Month (2010-September, 2020)

interactions_by_month <- tbl(cn, "posts") %>% 
  group_by(year, month) %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect()

interactions_by_month %>% 
  select(contains("mean"), year, month) %>%
  mutate(date = lubridate::make_date(year = year, month = month)) %>% 
  ungroup() %>% 
  select(-year, -month) %>% 
  gather(key, val, -date) %>% 
  filter(date >= lubridate::ymd("2010-01-01")) %>% 
  filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>% 
  ggplot(aes(x = date, y= val, group = key, color = key)) +
  geom_point(alpha = .2) +
  geom_line() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Mean interactions by type by month (2018-September, 2020)

interactions_by_month %>% 
  select(contains("mean"), year, month) %>%
  mutate(date = lubridate::make_date(year = year, month = month)) %>% 
  ungroup() %>% 
  select(-year, -month) %>% 
  gather(key, val, -date) %>% 
  filter(date >= lubridate::ymd("2018-01-01")) %>% 
  filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>% 
  ggplot(aes(x = date, y= val, group = key, color = key)) +
  geom_point(alpha = .2) +
  geom_line() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Text

Starting with a random sample of 1,000,000 messages

text <- tbl(cn, "posts") %>% 
  select(message, year) %>% 
  collect() %>%  
  sample_n(1000000) %>% 
  mutate(learning = str_detect(message, "(?i)learning"),
         technology = str_detect(message, "(?i)technology"),
         student = str_detect(message, "(?i)student"),
         education = str_detect(message, "(?i)education"),
         teaching = str_detect(message, "(?i)teaching"),
         online = str_detect(message, "(?i)online"),
         teacher = str_detect(message, "(?i)teacher"),
         covid = str_detect(message, "(?i)covid")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(year, learning:covid) %>% 
  gather(key, val, -year) %>% 
  group_by(key, year) %>% 
  summarize(mean_val = mean(val, na.rm = TRUE))

text_to_plot %>% 
  filter(year >= 2010) %>% 
  ggplot(aes(x = year, y = mean_val, color = key, group = key)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(type = "qual") +
  hrbrthemes::theme_ipsum() + 
  scale_x_continuous(breaks = 2010:2020) +
  ylab("proportion")

text <- tbl(cn, "posts") %>% 
  filter(year == 2020) %>% 
  select(message, month) %>% 
  collect() %>%  
  mutate(learning = str_detect(message, "(?i)learning"),
         technology = str_detect(message, "(?i)technology"),
         student = str_detect(message, "(?i)student"),
         education = str_detect(message, "(?i)education"),
         remote  = str_detect(message, "(?i)remote"),
         virtual = str_detect(message, "(?i)virtual"),
         remote = str_detect(message, "(?i)remote"),
         teacher = str_detect(message, "(?i)teacher"),
         meal = str_detect(message, "(?i)meal"),
         covid = str_detect(message, "(?i)covid")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(month, learning:covid) %>% 
  gather(key, val, -month) %>% 
  group_by(key, month) %>% 
  summarize(mean_val = mean(val, na.rm = TRUE))

text_to_plot %>% 
  ggplot(aes(x = month, y = mean_val, color = key, group = key)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(type = "qual") +
  hrbrthemes::theme_ipsum() + 
  scale_x_continuous(breaks = 1:9) +
  ylab("proportion")

Only for ST math, a digital tool (of the kind that may be used more during the pandemic); what other tools can we examine? some ideas here: https://www.educationnext.org/google-search-data-reveals-about-learning-pandemic/

text <- tbl(cn, "posts") %>% 
  filter(year == 2020) %>% 
  select(message, month) %>% 
  collect() %>%  
  mutate(st_math= str_detect(message, "(?i)st math")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(month, st_math) %>% 
  gather(key, val, -month) %>% 
  group_by(key, month) %>% 
  summarize(mean_val = sum(val, na.rm = TRUE))

text_to_plot %>% 
  ggplot(aes(x = month, y = mean_val, color = key, group = key)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(type = "qual") +
  hrbrthemes::theme_ipsum() + 
  scale_x_continuous(breaks = 1:9) +
  ylab("proportion")