Loading, setting up

library(tidyverse)
library(DBI)
library(scales)

cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))

URL is a unique identifier

tbl(cn, "posts") %>% 
  count(url) %>% 
  tally()

## # Source:   lazy query [?? x 1]
## # Database: sqlite 3.30.1
## #   [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
##          n
##      <int>
## 1 17131373

What we’ll need to merge on

Join un url from the db

tbl(cn, "posts") %>% 
  select(page_name, url) %>% 
  head(20) %>% 
  knitr::kable()

page_name	url
KIPP SoCal Public Schools	https://www.facebook.com/KIPPSoCal/posts/10151336401812074
Facing History and Ourselves	https://www.facebook.com/FacingHistory/posts/10151194384564740
Facing History and Ourselves	https://www.facebook.com/FacingHistory/posts/10151550837359740
Facing History and Ourselves	https://www.facebook.com/FacingHistory/posts/10151550841404740
Kalamazoo RESA	https://www.facebook.com/kalresa/posts/10150597628823386
Connections Academy	https://www.facebook.com/ConnectionsAcademy/posts/73709802093
KIPP New Jersey	https://www.facebook.com/KIPPNJ/posts/542265837903
Butler Tech	https://www.facebook.com/ButlerTech/posts/521858510614
Butler Tech	https://www.facebook.com/ButlerTech/posts/521858470694
Butler Tech	https://www.facebook.com/ButlerTech/posts/521858400834
South Carolina Governor’s School for Science and Mathematics	https://www.facebook.com/SCGSSM/posts/70158487414
KIPP New Jersey	https://www.facebook.com/KIPPNJ/posts/40412043731
KIPP New Jersey	https://www.facebook.com/KIPPNJ/posts/541827695943
The Learning Community	https://www.facebook.com/learningcommunity/posts/41054255641
KIPP New Jersey	https://www.facebook.com/KIPPNJ/posts/541811273853
South Carolina Governor’s School for Science and Mathematics	https://www.facebook.com/SCGSSM/posts/40847787156
PA Virtual Charter School	https://www.facebook.com/pavirtual/posts/1073286633288
Ozarks Technical Community College	https://www.facebook.com/ozarkstech/posts/1064932258496
Ozarks Technical Community College	https://www.facebook.com/ozarkstech/posts/1064925778334
The Learning Community	https://www.facebook.com/learningcommunity/posts/40059890641

Join on url from all-institutional-facebook-urls.csv:

all_institutional_facebook_urls <- read_csv("data-raw/all-institutional-facebook-urls.csv")

all_institutional_facebook_urls %>% 
  head(20) %>% 
  knitr::kable()

nces_id	url	parsed_path
0100197	https://wwww.facebook.com/mymaefnetwork	mymaefnetwork
0100190	https://wwww.facebook.com/alabastercityschools/	alabastercityschools/
0100005	https://wwww.facebook.com/albertvilleschools/	albertvilleschools/
0100030	https://wwww.facebook.com/alexander-city-schools-211441775676594/	alexander-city-schools-211441775676594/
0100090	https://wwww.facebook.com/acsbulldogs/	acsbulldogs/
0100210	https://wwww.facebook.com/auburncityedu/	auburncityedu/
0100240	https://wwww.facebook.com/autauga-county-school-system-569762506412543/	autauga-county-school-system-569762506412543/
0100330	https://wwww.facebook.com/bcstigers/	bcstigers/
0100390	https://wwww.facebook.com/birmingham-city-school-system-1417462398484288/	birmingham-city-school-system-1417462398484288/
0100510	https://wwww.facebook.com/butlercountyss/	butlercountyss/
0100540	https://wwww.facebook.com/calhouncountyboe	calhouncountyboe
0100600	https://wwww.facebook.com/chamberscountyschooldistrict	chamberscountyschooldistrict
0100188	https://wwww.facebook.com/chickasaw-city-schools-482235791805003/	chickasaw-city-schools-482235791805003/
0100660	https://wwww.facebook.com/pages/chilton-county-school-district/1418177841735719	pages/chilton-county-school-district/1418177841735719
0100660	https://wwww.facebook.com/chiltoncountyschooldistrict/	chiltoncountyschooldistrict/
0100750	https://wwww.facebook.com/pages/clay-county-board-of-education/926850330736055	pages/clay-county-board-of-education/926850330736055
0100780	https://wwww.facebook.com/cleburneschools/	cleburneschools/
0100840	https://wwww.facebook.com/colbertcountyschools	colbertcountyschools
0100870	https://wwww.facebook.com/conecuhcountyal	conecuhcountyal
0100900	https://wwww.facebook.com/coosa-county-school-system-1358964240820036/	coosa-county-school-system-1358964240820036/

Accounts

Total number of accounts

tbl(cn, "posts") %>% 
  count(facebook_id) %>% 
  tally()

## # Source:   lazy query [?? x 1]
## # Database: sqlite 3.30.1
## #   [/Users/jrosenb8/districts-facebook/db/k12-institutions-fb-posts.sqlite]
##       n
##   <int>
## 1 14178

Distribution of the number of posts by accounts

tbl(cn, "posts") %>% count(facebook_id) %>% 
  ggplot(aes(x = n)) + 
  geom_histogram(bins = 1000)

Time series

by_day <- tbl(cn, "posts") %>% 
  count(created_rounded_to_day) %>% 
  collect() %>% 
  mutate(day = as.POSIXct(created_rounded_to_day, origin = "1970-01-01")) %>% 
  filter(day >= lubridate::ymd("2005-01-01"))

Total number of posts

by_day %>% 
  summarize(sum_n = sum(n))

## # A tibble: 1 x 1
##      sum_n
##      <int>
## 1 17246268

Month of year - within the year

by_day %>% 
  mutate(yday = lubridate::yday(day),
         month = lubridate::month(day),
         year = lubridate::year(day)) %>% 
  filter(year >= 2010) %>% 
  mutate(year = as.factor(year)) %>% 
  group_by(year, month) %>% 
  summarize(sum_n = sum(n)) %>% 
  ggplot(aes(x = month, y = sum_n, color = year, group = year)) +
  geom_point() +
  geom_line() +
  hrbrthemes::theme_ipsum() +
  scale_color_brewer(type = "div") +
  scale_y_continuous(label = comma)

Month of year - between years

by_day %>% 
  ggplot(aes(x = day, y = n)) + 
  geom_point(alpha = .25) +
  geom_line() +
  theme_bw() +
  scale_y_continuous(label = comma) +
  geom_smooth(se = FALSE, color = "blue")

Day of week

tbl(cn, "posts") %>%  
  count(day_of_week) %>% 
  collect() %>% 
  mutate(day_of_week_string = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) %>% 
  mutate(day_of_week_string = fct_inorder(day_of_week_string)) %>% 
  ggplot(aes(x = day_of_week_string, y = n)) +
  geom_col() +
  theme_bw() + 
  scale_y_continuous(label = comma)

Hour

tbl(cn, "posts") %>%  
  count(hour) %>% 
  collect() %>% 
  mutate(hour = lubridate::make_datetime(hour = hour)) %>% 
  ggplot(aes(x = hour, y = n)) +
  geom_col() +
  hrbrthemes::theme_ipsum() +
  scale_x_datetime(date_labels = "%H:%M") +
  scale_y_continuous(label = comma)

Interactions

Total interactions per post (overall)

tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect() %>% 
  gather(key, val) %>% 
  summarize(mean_total_interactions_per_post = sum(val),
            sd_total_interactions_per_post = sd(val))

## # A tibble: 1 x 2
##   mean_total_interactions_per_post sd_total_interactions_per_post
##                              <dbl>                          <dbl>
## 1                            1035.                           108.

Mean total interactions per post (overall)

tbl(cn, "posts") %>% 
  select(likes:care, year) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect() %>% 
  select(-year_mean, -year_sd) %>% 
  gather(key, val) %>% 
  filter(!str_detect(key, "sd")) %>%
  summarize(mean_total_interactions_per_post = sum(val),
            sd_total_interactions_per_post = sd(val))

## # A tibble: 1 x 2
##   mean_total_interactions_per_post sd_total_interactions_per_post
##                              <dbl>                          <dbl>
## 1                             37.6                           7.61

Another way of presenting interactions

probably preferable to and redundant with the others for the purposes of exploring the data, at least the others that are not by time

descriptives_of_interactions <- tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(sum, mean, sd)) %>% 
  collect()

table_of_descriptives <- descriptives_of_interactions %>% 
  gather(key, val) %>% 
  separate(key, into = c("var", "stat")) %>% 
  spread(stat, val) %>% 
  arrange(desc(sum))

table_of_descriptives %>% 
  knitr::kable()

var	mean	sd	sum
likes	23.5881150	361.857310	406807637
shares	6.1378886	310.722884	105855849
love	4.1529660	134.946896	71623285
comments	3.0359537	88.960075	52358959
haha	0.2044435	36.358538	3525893
sad	0.1963104	36.106653	3385628
wow	0.1734847	6.798965	2991968
care	0.0525257	10.144989	905873
angry	0.0504053	11.941445	869305

table_of_descriptives %>% 
  summarize_if(is.numeric, sum)

## # A tibble: 1 x 3
##    mean    sd       sum
##   <dbl> <dbl>     <dbl>
## 1  37.6  998. 648324397

Mean interactions by type per post (overall)

interactions <- tbl(cn, "posts") %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect()

interactions %>% 
  select(contains("mean")) %>% 
  gather(key, val) %>% 
  ggplot(aes(x = reorder(key,val), y = val)) + 
  geom_col() + 
  coord_flip() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Mean interactions by type per post by Month (2010-September, 2020)

interactions_by_month <- tbl(cn, "posts") %>% 
  group_by(year, month) %>% 
  select(likes:care) %>% 
  summarize_all(funs(mean, sd)) %>% 
  collect()

interactions_by_month %>% 
  select(contains("mean"), year, month) %>%
  mutate(date = lubridate::make_date(year = year, month = month)) %>% 
  ungroup() %>% 
  select(-year, -month) %>% 
  gather(key, val, -date) %>% 
  filter(date >= lubridate::ymd("2010-01-01")) %>% 
  filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>% 
  ggplot(aes(x = date, y= val, group = key, color = key)) +
  geom_point(alpha = .2) +
  geom_line() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Mean interactions by type by month (2018-September, 2020)

interactions_by_month %>% 
  select(contains("mean"), year, month) %>%
  mutate(date = lubridate::make_date(year = year, month = month)) %>% 
  ungroup() %>% 
  select(-year, -month) %>% 
  gather(key, val, -date) %>% 
  filter(date >= lubridate::ymd("2018-01-01")) %>% 
  filter(key != "care_mean", key != "angry_mean", key != "wow_mean") %>% 
  ggplot(aes(x = date, y= val, group = key, color = key)) +
  geom_point(alpha = .2) +
  geom_line() +
  geom_smooth(se = FALSE) + 
  hrbrthemes::theme_ipsum()

Text

Starting with a random sample of 1,000,000 messages

text <- tbl(cn, "posts") %>% 
  select(message, year) %>% 
  collect() %>%  
  sample_n(1000000) %>% 
  mutate(learning = str_detect(message, "(?i)learning"),
         technology = str_detect(message, "(?i)technology"),
         student = str_detect(message, "(?i)student"),
         education = str_detect(message, "(?i)education"),
         teaching = str_detect(message, "(?i)teaching"),
         online = str_detect(message, "(?i)online"),
         teacher = str_detect(message, "(?i)teacher"),
         covid = str_detect(message, "(?i)covid")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(year, learning:covid) %>% 
  gather(key, val, -year) %>% 
  group_by(key, year) %>% 
  summarize(mean_val = mean(val, na.rm = TRUE))

text_to_plot %>% 
  filter(year >= 2010) %>% 
  ggplot(aes(x = year, y = mean_val, color = key, group = key)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(type = "qual") +
  hrbrthemes::theme_ipsum() + 
  scale_x_continuous(breaks = 2010:2020) +
  ylab("proportion")

text <- tbl(cn, "posts") %>% 
  filter(year == 2020) %>% 
  select(message, month) %>% 
  collect() %>%  
  mutate(learning = str_detect(message, "(?i)learning"),
         technology = str_detect(message, "(?i)technology"),
         student = str_detect(message, "(?i)student"),
         education = str_detect(message, "(?i)education"),
         remote  = str_detect(message, "(?i)remote"),
         virtual = str_detect(message, "(?i)virtual"),
         remote = str_detect(message, "(?i)remote"),
         teacher = str_detect(message, "(?i)teacher"),
         meal = str_detect(message, "(?i)meal"),
         covid = str_detect(message, "(?i)covid")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(month, learning:covid) %>% 
  gather(key, val, -month) %>% 
  group_by(key, month) %>% 
  summarize(mean_val = mean(val, na.rm = TRUE))

text_to_plot %>% 
  ggplot(aes(x = month, y = mean_val, color = key, group = key)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(type = "qual") +
  hrbrthemes::theme_ipsum() + 
  scale_x_continuous(breaks = 1:9) +
  ylab("proportion")

Only for ST math, a digital tool (of the kind that may be used more during the pandemic); what other tools can we examine? some ideas here: https://www.educationnext.org/google-search-data-reveals-about-learning-pandemic/

text <- tbl(cn, "posts") %>% 
  filter(year == 2020) %>% 
  select(message, month) %>% 
  collect() %>%  
  mutate(st_math= str_detect(message, "(?i)st math")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(month, st_math) %>% 
  gather(key, val, -month) %>% 
  group_by(key, month) %>% 
  summarize(mean_val = sum(val, na.rm = TRUE))

text_to_plot %>% 
  ggplot(aes(x = month, y = mean_val, color = key, group = key)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(type = "qual") +
  hrbrthemes::theme_ipsum() + 
  scale_x_continuous(breaks = 1:9) +
  ylab("proportion")

Initial Exploration of FB Posts Database

2020-10-21