Loading, setting up

library(tidyverse) # analysis and processing
library(DBI) # database interface
library(scales) # plots
library(quanteda) # text analysis
library(skimr) # for descriptive stats

cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))

Let’s join!

getting data

posts_100k <- tbl(cn, "posts") %>% 
  select(page_name, url, year) %>% 
  mutate(url = tolower(url)) %>% 
  filter(year == 2020) %>% 
  collect()

posts_100k <- posts_100k %>% 
  separate(url, into = c("url", "post"), sep = "/posts")

posts_100k <- posts_100k %>% distinct(url, post, .keep_all = TRUE)
all_institutional_facebook_urls <- read_csv("data-raw/all-institutional-facebook-urls.csv",
                                            col_types = cols(nces_id = col_character()))

all_institutional_facebook_urls <- all_institutional_facebook_urls %>% 
  mutate(parsed_path = ifelse(str_sub(parsed_path, start = -1) == "/",
                              str_sub(parsed_path, end = -2),
                              parsed_path)) %>% 
  mutate(url = str_c("https://www.facebook.com/", parsed_path)) %>% 
  select(-parsed_path)
all_distinct_urls <- all_institutional_facebook_urls %>% 
  distinct(url, .keep_all = T)

data_with_nces_id_joined <- left_join(posts_100k, all_distinct_urls, by = "url")

initial join

data_with_nces_id_joined_not_matched <- data_with_nces_id_joined %>% 
  filter(is.na(nces_id))

data_with_nces_id_joined_match <- data_with_nces_id_joined %>% 
  filter(!is.na(nces_id))

str_c(round(nrow(data_with_nces_id_joined_match) / nrow(posts_100k) * 100, 2), "% joined")
## [1] "79.13% joined"

joining on other parts of the URL

# 4th
fourth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[4]], NA))

all_institutional_facebook_urls_fourth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(fourth_spot))) %>% 
  filter(!is.na(url))

joined_fourth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fourth)  %>% 
  filter(!is.na(nces_id))

# 5th
fifth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[5]], NA))

all_institutional_facebook_urls_fifth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(fifth_spot))) %>% 
  filter(!is.na(url))

joined_fifth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fifth)  %>% 
  filter(!is.na(nces_id))

# 6th
sixth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[6]], NA))

all_institutional_facebook_urls_sixth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(sixth_spot))) %>% 
  filter(!is.na(url))

joined_sixth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_sixth) %>% 
  filter(!is.na(nces_id))

# 7th
seventh_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[7]], NA))

all_institutional_facebook_urls_seventh <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(seventh_spot))) %>% 
  filter(!is.na(url))

joined_seventh <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_seventh) %>% 
  filter(!is.na(nces_id))

# 8th
eighth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[8]], NA))

all_institutional_facebook_urls_eighth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(eighth_spot))) %>% 
  filter(!is.na(url))

joined_eighth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_eighth)  %>% 
  filter(!is.na(nces_id))

# 9th
ninth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[9]], NA))

all_institutional_facebook_urls_ninth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(ninth_spot))) %>% 
  filter(!is.na(url))

joined_ninth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_ninth) %>% 
  filter(!is.na(nces_id))

new_joined_data <- bind_rows(joined_fourth) %>% 
  bind_rows(joined_fifth) %>% 
  bind_rows(joined_sixth) %>% 
  bind_rows(joined_seventh) %>% 
  bind_rows(joined_eighth) %>%
  bind_rows(joined_ninth) %>% 
  distinct(page_name, post, .keep_all = TRUE)

removing spaces

# fourth
all_institutional_facebook_urls_fourth_no_dash <- all_institutional_facebook_urls_fourth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_fourth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fourth_no_dash) %>% 
  filter(!is.na(nces_id))

# fifth
all_institutional_facebook_urls_fifth_no_dash <- all_institutional_facebook_urls_fifth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_fifth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fifth_no_dash) %>% 
  filter(!is.na(nces_id))

# sixth
all_institutional_facebook_urls_sixth_no_dash <- all_institutional_facebook_urls_sixth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_sixth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_sixth_no_dash) %>% 
  filter(!is.na(nces_id))

# seventh
all_institutional_facebook_urls_seventh_no_dash <- all_institutional_facebook_urls_seventh %>% 
  mutate(url = str_remove_all(url, "-"))

joined_seventh_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_seventh_no_dash) %>% 
  filter(!is.na(nces_id))

# eighth
all_institutional_facebook_urls_eighth_no_dash <- all_institutional_facebook_urls_eighth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_eighth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_eighth_no_dash) %>% 
  filter(!is.na(nces_id))

new_data_without_dashes <- bind_rows(joined_fourth_no_dash) %>% 
  bind_rows(joined_fifth_no_dash) %>% 
  bind_rows(joined_sixth_no_dash) %>% 
  bind_rows(joined_seventh_no_dash) %>% 
  bind_rows(joined_eighth_no_dash) %>% 
  distinct(page_name, post, .keep_all = TRUE)

joining based only on integers

all_institutional_facebook_urls_b <- all_distinct_urls

all_of_the_numbers <- all_institutional_facebook_urls_b %>% 
  mutate(id = stringr::str_extract_all(url, '\\(?[0-9,.]+')) %>% 
  pull(id)

find_15_digits <- function(x) {
  x[nchar(x) == 15]
}

length_f <- function(x) {
  length(x) > 0  
}

all_of_the_numbers <- all_of_the_numbers %>% 
  map(possibly(find_15_digits, NA))

all_institutional_facebook_urls_b$id <- map(all_of_the_numbers, function(x) if(identical(x, character(0))) NA_character_ else x)

all_institutional_facebook_urls_b <- all_institutional_facebook_urls_b %>% 
  unnest(id)

x <- all_institutional_facebook_urls_b %>% 
  mutate(url = str_c("https://www.facebook.com/", id)) %>% 
  filter(!is.na(url)) %>% 
  select(-id)

from_15_char_id <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(x) %>% 
  filter(!is.na(nces_id))

joining all back up - all done!

all_to_join <- bind_rows(new_joined_data) %>% 
  bind_rows(new_data_without_dashes) %>% 
  bind_rows(from_15_char_id)

all_to_join <- all_to_join %>% 
  distinct(url, post, nces_id, page_name, year)

data_with_nces_id_joined_match_all <- data_with_nces_id_joined_match %>% 
  bind_rows(all_to_join)

str_c("added ", nrow(data_with_nces_id_joined_match_all) - nrow(data_with_nces_id_joined_match), " rows; ",
      (nrow(data_with_nces_id_joined_match_all) - nrow(data_with_nces_id_joined_match)) * 100 / nrow(posts_100k), "% change")
## [1] "added 306574 rows; 10.4357126347848% change"

Joining additional NCES data

This data is from the ELSI Table Generator: https://nces.ed.gov/ccd/elsi/tableGenerator.aspx

nces_info_for_districts <- read_csv(here::here("data", "nces-info-for-districts.csv")) %>% 
  select(nces_id, free_and_reduced_lunch_students_public_school_2017_18, state)

nces_info_for_schools <- read_csv("data/ELSI_csv_export_6374077161361122015895.csv") %>% 
  janitor::clean_names() %>% 
  mutate(nces_id = str_extract(school_id_nces_assigned_public_school_latest_available_year, '\\(?[0-9,.]+')) %>% 
  select(nces_id, free_and_reduced_lunch_students_public_school_2018_19, state = state_name_public_school_latest_available_year)

nces_info <- nces_info_for_districts %>% 
  bind_rows(nces_info_for_schools)

nces_info$nces_id %>% nchar() %>% table() # this looks right
## .
##     7    12 
##  9949 99599
data_with_nces_id_joined_match_all$nces_id %>% nchar() %>% table() # so does this
## .
##       7      12 
## 1788106  843130
data_with_nces_id_joined_match_all %>% 
  left_join(nces_info, by = "nces_id")  %>% 
  filter(is.na(state)) # all joined!
## # A tibble: 8,552 x 8
##    page_name  url    post   year nces_id free_and_reduce… state free_and_reduce…
##    <chr>      <chr>  <chr> <dbl> <chr>   <chr>            <chr> <chr>           
##  1 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
##  2 Emerado P… https… /758…  2020 3806360 <NA>             <NA>  <NA>            
##  3 East Live… https… /176…  2020 3904391 <NA>             <NA>  <NA>            
##  4 Bluford U… https… /327…  2020 1701417 <NA>             <NA>  <NA>            
##  5 East Live… https… /176…  2020 3904391 <NA>             <NA>  <NA>            
##  6 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
##  7 South Hol… https… /102…  2020 2923190 <NA>             <NA>  <NA>            
##  8 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
##  9 East Live… https… /176…  2020 3904391 <NA>             <NA>  <NA>            
## 10 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
## # … with 8,542 more rows
data_with_nces_id_joined_match_all %>% 
  left_join(nces_info, by = "nces_id") %>% 
  mutate(state = tolower(state)) %>% 
  count(state) %>% 
  knitr::kable() # success!
state n
alabama 48681
alaska 14085
arizona 48136
arkansas 54050
california 173110
colorado 51655
connecticut 14118
delaware 12785
district of columbia 7045
florida 131289
georgia 86670
hawaii 822
idaho 12976
illinois 103363
indiana 70117
iowa 87696
kansas 82143
kentucky 38758
louisiana 36871
maine 24024
maryland 20089
massachusetts 59675
michigan 167390
minnesota 81597
mississippi 39368
missouri 149685
montana 19474
nebraska 54687
nevada 7291
new hampshire 20808
new jersey 69573
new mexico 23607
new york 138830
north carolina 125036
north dakota 20424
ohio 159620
oklahoma 70229
oregon 55923
pennsylvania 104584
rhode island 6823
south carolina 15876
south dakota 23082
tennessee 63480
texas 314916
utah 27214
vermont 23540
virginia 34040
washington 47557
west virginia 6683
wisconsin 156768
wyoming 18452
NA 8552
str_c("Matched ", nrow(data_with_nces_id_joined_match_all)/nrow(posts_100k) * 100, "% of rows with NCES data")
## [1] "Matched 89.5667041898548% of rows with NCES data"

Exploring NCES data

n accounts

nces_dist <- read_csv(here::here("data", "nces-info-for-districts.csv"))

nces_scho <- read_csv("data/ELSI_csv_export_6374077161361122015895.csv") %>% 
  janitor::clean_names() %>% 
  mutate(nces_id = str_extract(school_id_nces_assigned_public_school_latest_available_year, '\\(?[0-9,.]+'))

all_nces_ids <- data_with_nces_id_joined_match_all %>% distinct(nces_id)

n_districts <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  nrow()

n_districts/8193
## [1] 0.8009276

checking sample

nces_dist$frpl <- as.integer(nces_dist$free_and_reduced_lunch_students_public_school_2017_18) /
  as.integer(nces_dist$total_students_all_grades_excludes_ae_district_2017_18)

mean_fb_frpl <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  summarize(mean_frpl = mean(frpl, na.rm = TRUE))

mean_frpl <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  summarize(mean_frpl = mean(frpl, na.rm = TRUE))

mean_fb_frpl
## # A tibble: 1 x 1
##   mean_frpl
##       <dbl>
## 1     0.471
mean_frpl
## # A tibble: 1 x 1
##   mean_frpl
##       <dbl>
## 1     0.476
fb_ss <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  summarize(mean_ss = mean(as.integer(total_students_all_grades_excludes_ae_district_2017_18), na.rm = TRUE))

mean_ss <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  summarize(mean_ss = mean(as.integer(total_students_all_grades_excludes_ae_district_2017_18), na.rm = TRUE))

fb_ss
## # A tibble: 1 x 1
##   mean_ss
##     <dbl>
## 1   4881.
mean_ss
## # A tibble: 1 x 1
##   mean_ss
##     <dbl>
## 1   4517.

writing data

data_with_nces_id_joined_match_all %>% 
  left_join(nces_dist) %>% 
  select(url, nces_id, state, url, frpl, n_students = total_students_all_grades_excludes_ae_district_2017_18)
## # A tibble: 2,631,236 x 5
##    url                                         nces_id    state  frpl n_students
##    <chr>                                       <chr>      <chr> <dbl>      <dbl>
##  1 https://www.facebook.com/mountainburgps     051026000… <NA>     NA         NA
##  2 https://www.facebook.com/okaloosacountysch… 1201380    <NA>     NA         NA
##  3 https://www.facebook.com/paulrbrownleaders… 3700351    <NA>     NA         NA
##  4 https://www.facebook.com/westerncenteracad… 061692012… <NA>     NA         NA
##  5 https://www.facebook.com/englishvalleysele… 1910980    <NA>     NA         NA
##  6 https://www.facebook.com/dierksoutlaws      050534000… <NA>     NA         NA
##  7 https://www.facebook.com/muskogeeschools    4020970    <NA>     NA         NA
##  8 https://www.facebook.com/mountainburgps     051026000… <NA>     NA         NA
##  9 https://www.facebook.com/tsmsonline.org     360007605… <NA>     NA         NA
## 10 https://www.facebook.com/scv35schools       0407520    <NA>     NA         NA
## # … with 2,631,226 more rows

activity

to_join_for_activity <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union")

activity_d <- data_with_nces_id_joined_match_all %>% 
  count(nces_id) %>% 
  left_join(to_join_for_activity)

activity_d %>% 
  select(n, frpl, total_ss = total_students_all_grades_excludes_ae_district_2017_18) %>% 
  corrr::correlate() %>% 
  knitr::kable()
rowname n frpl total_ss
n NA 0.0899972 0.1307116
frpl 0.0899972 NA 0.0608803
total_ss 0.1307116 0.0608803 NA

Posts per account

activity_d %>% 
  select(n) %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 12812
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
n 0 1 247.13 275.7 1 88 175 315 4567 ▇▁▁▁▁
activity_d %>% 
  ggplot(aes(x = n)) +
  geom_histogram(bins = 100) +
  ggtitle("posts per account") +
  theme_bw()

activity_d %>% 
  filter(n < 50) %>% 
  ggplot(aes(x = n)) +
  geom_histogram(bins = 100) +
  ggtitle("posts per account for accounts with less than 50 posts") +
  theme_bw() +
  scale_x_continuous(breaks = seq(from = 0, to = 49, by = 2), seq(from = 0, to = 49, by = 2)) +
  xlab(NULL)

by state

fb_state <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  count(state)

all_state <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  count(state) %>% 
  rename(total_n = n)

left_join(fb_state, all_state) %>% 
  mutate(prop = n / total_n) %>% 
  rename(fb_n = n) %>% 
  arrange(desc(prop)) %>% 
  knitr::kable()
state fb_n total_n prop
district of columbia 1 1 1.0000000
hawaii 2 2 1.0000000
new hampshire 1 1 1.0000000
wyoming 66 69 0.9565217
delaware 13 14 0.9285714
florida 41 45 0.9111111
virginia 77 86 0.8953488
alaska 24 27 0.8888889
maine 58 66 0.8787879
iowa 264 307 0.8599349
michigan 331 386 0.8575130
maryland 16 19 0.8421053
washington 139 166 0.8373494
louisiana 46 55 0.8363636
colorado 84 101 0.8316832
minnesota 226 273 0.8278388
oregon 94 114 0.8245614
indiana 192 233 0.8240343
missouri 266 323 0.8235294
new jersey 237 288 0.8229167
new york 393 479 0.8204593
wisconsin 364 447 0.8143177
ohio 369 455 0.8109890
massachusetts 132 164 0.8048780
pennsylvania 279 350 0.7971429
kentucky 83 105 0.7904762
georgia 74 94 0.7872340
kansas 173 220 0.7863636
north carolina 66 84 0.7857143
north dakota 55 70 0.7857143
utah 22 28 0.7857143
arkansas 51 65 0.7846154
alabama 65 83 0.7831325
arizona 89 114 0.7807018
illinois 420 538 0.7806691
nevada 7 9 0.7777778
rhode island 14 18 0.7777778
texas 593 764 0.7761780
south dakota 79 102 0.7745098
south carolina 46 60 0.7666667
connecticut 41 54 0.7592593
california 362 477 0.7589099
montana 59 78 0.7564103
new mexico 37 49 0.7551020
west virginia 18 24 0.7500000
oklahoma 200 269 0.7434944
nebraska 147 198 0.7424242
tennessee 62 85 0.7294118
idaho 34 51 0.6666667
mississippi 50 83 0.6024096

using all possible NCES IDs (either not available or not joined)

all_possible_nces_ids <- all_institutional_facebook_urls %>% 
  distinct(nces_id)

fb_pos_n <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_possible_nces_ids) %>% 
  nrow()

fb_pos_n
## [1] 7631
fb_pos_n/8193
## [1] 0.9314049
data_with_nces_id_joined_match_all 
## # A tibble: 2,631,236 x 5
##    page_name                url                       post        year nces_id  
##    <chr>                    <chr>                     <chr>      <dbl> <chr>    
##  1 Mountainburg Public Sch… https://www.facebook.com… /32428930…  2020 05102600…
##  2 Okaloosa County School … https://www.facebook.com… /42663976…  2020 1201380  
##  3 Paul R. Brown Leadershi… https://www.facebook.com… /27185905…  2020 3700351  
##  4 Western Center Academy   https://www.facebook.com… /10157361…  2020 06169201…
##  5 English Valleys Element… https://www.facebook.com… /27125007…  2020 1910980  
##  6 Dierks High School       https://www.facebook.com… /11764706…  2020 05053400…
##  7 Muskogee Public Schools  https://www.facebook.com… /10157437…  2020 4020970  
##  8 Mountainburg Public Sch… https://www.facebook.com… /32428767…  2020 05102600…
##  9 Tompkins Square Middle … https://www.facebook.com… /33497313…  2020 36000760…
## 10 Santa Cruz Valley Unifi… https://www.facebook.com… /38965960…  2020 0407520  
## # … with 2,631,226 more rows