Loading, setting up

library(tidyverse) # analysis and processing
library(DBI) # database interface
library(scales) # plots
library(quanteda) # text analysis
library(skimr) # for descriptive stats

cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))

Let’s join!

getting data

posts_100k <- tbl(cn, "posts") %>% 
  select(page_name, url, year) %>% 
  mutate(url = tolower(url)) %>% 
  filter(year == 2020) %>% 
  collect()

posts_100k <- posts_100k %>% 
  separate(url, into = c("url", "post"), sep = "/posts")

posts_100k <- posts_100k %>% distinct(url, post, .keep_all = TRUE)

all_institutional_facebook_urls <- read_csv("data-raw/all-institutional-facebook-urls.csv",
                                            col_types = cols(nces_id = col_character()))

all_institutional_facebook_urls <- all_institutional_facebook_urls %>% 
  mutate(parsed_path = ifelse(str_sub(parsed_path, start = -1) == "/",
                              str_sub(parsed_path, end = -2),
                              parsed_path)) %>% 
  mutate(url = str_c("https://www.facebook.com/", parsed_path)) %>% 
  select(-parsed_path)

all_distinct_urls <- all_institutional_facebook_urls %>% 
  distinct(url, .keep_all = T)

data_with_nces_id_joined <- left_join(posts_100k, all_distinct_urls, by = "url")

initial join

data_with_nces_id_joined_not_matched <- data_with_nces_id_joined %>% 
  filter(is.na(nces_id))

data_with_nces_id_joined_match <- data_with_nces_id_joined %>% 
  filter(!is.na(nces_id))

str_c(round(nrow(data_with_nces_id_joined_match) / nrow(posts_100k) * 100, 2), "% joined")

## [1] "79.13% joined"

joining on other parts of the URL

# 4th
fourth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[4]], NA))

all_institutional_facebook_urls_fourth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(fourth_spot))) %>% 
  filter(!is.na(url))

joined_fourth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fourth)  %>% 
  filter(!is.na(nces_id))

# 5th
fifth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[5]], NA))

all_institutional_facebook_urls_fifth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(fifth_spot))) %>% 
  filter(!is.na(url))

joined_fifth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fifth)  %>% 
  filter(!is.na(nces_id))

# 6th
sixth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[6]], NA))

all_institutional_facebook_urls_sixth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(sixth_spot))) %>% 
  filter(!is.na(url))

joined_sixth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_sixth) %>% 
  filter(!is.na(nces_id))

# 7th
seventh_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[7]], NA))

all_institutional_facebook_urls_seventh <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(seventh_spot))) %>% 
  filter(!is.na(url))

joined_seventh <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_seventh) %>% 
  filter(!is.na(nces_id))

# 8th
eighth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[8]], NA))

all_institutional_facebook_urls_eighth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(eighth_spot))) %>% 
  filter(!is.na(url))

joined_eighth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_eighth)  %>% 
  filter(!is.na(nces_id))

# 9th
ninth_spot <- all_distinct_urls %>% 
  mutate(split_url = str_split(url, "/")) %>% 
  pull(split_url) %>% 
  map(possibly(~.[[9]], NA))

all_institutional_facebook_urls_ninth <- all_distinct_urls %>% 
  mutate(url = str_c("https://www.facebook.com/", unlist(ninth_spot))) %>% 
  filter(!is.na(url))

joined_ninth <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_ninth) %>% 
  filter(!is.na(nces_id))

new_joined_data <- bind_rows(joined_fourth) %>% 
  bind_rows(joined_fifth) %>% 
  bind_rows(joined_sixth) %>% 
  bind_rows(joined_seventh) %>% 
  bind_rows(joined_eighth) %>%
  bind_rows(joined_ninth) %>% 
  distinct(page_name, post, .keep_all = TRUE)

removing spaces

# fourth
all_institutional_facebook_urls_fourth_no_dash <- all_institutional_facebook_urls_fourth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_fourth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fourth_no_dash) %>% 
  filter(!is.na(nces_id))

# fifth
all_institutional_facebook_urls_fifth_no_dash <- all_institutional_facebook_urls_fifth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_fifth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_fifth_no_dash) %>% 
  filter(!is.na(nces_id))

# sixth
all_institutional_facebook_urls_sixth_no_dash <- all_institutional_facebook_urls_sixth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_sixth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_sixth_no_dash) %>% 
  filter(!is.na(nces_id))

# seventh
all_institutional_facebook_urls_seventh_no_dash <- all_institutional_facebook_urls_seventh %>% 
  mutate(url = str_remove_all(url, "-"))

joined_seventh_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_seventh_no_dash) %>% 
  filter(!is.na(nces_id))

# eighth
all_institutional_facebook_urls_eighth_no_dash <- all_institutional_facebook_urls_eighth %>% 
  mutate(url = str_remove_all(url, "-"))

joined_eighth_no_dash <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(all_institutional_facebook_urls_eighth_no_dash) %>% 
  filter(!is.na(nces_id))

new_data_without_dashes <- bind_rows(joined_fourth_no_dash) %>% 
  bind_rows(joined_fifth_no_dash) %>% 
  bind_rows(joined_sixth_no_dash) %>% 
  bind_rows(joined_seventh_no_dash) %>% 
  bind_rows(joined_eighth_no_dash) %>% 
  distinct(page_name, post, .keep_all = TRUE)

joining based only on integers

all_institutional_facebook_urls_b <- all_distinct_urls

all_of_the_numbers <- all_institutional_facebook_urls_b %>% 
  mutate(id = stringr::str_extract_all(url, '\\(?[0-9,.]+')) %>% 
  pull(id)

find_15_digits <- function(x) {
  x[nchar(x) == 15]
}

length_f <- function(x) {
  length(x) > 0  
}

all_of_the_numbers <- all_of_the_numbers %>% 
  map(possibly(find_15_digits, NA))

all_institutional_facebook_urls_b$id <- map(all_of_the_numbers, function(x) if(identical(x, character(0))) NA_character_ else x)

all_institutional_facebook_urls_b <- all_institutional_facebook_urls_b %>% 
  unnest(id)

x <- all_institutional_facebook_urls_b %>% 
  mutate(url = str_c("https://www.facebook.com/", id)) %>% 
  filter(!is.na(url)) %>% 
  select(-id)

from_15_char_id <- data_with_nces_id_joined_not_matched %>% 
  select(-nces_id) %>% 
  left_join(x) %>% 
  filter(!is.na(nces_id))

joining all back up - all done!

all_to_join <- bind_rows(new_joined_data) %>% 
  bind_rows(new_data_without_dashes) %>% 
  bind_rows(from_15_char_id)

all_to_join <- all_to_join %>% 
  distinct(url, post, nces_id, page_name, year)

data_with_nces_id_joined_match_all <- data_with_nces_id_joined_match %>% 
  bind_rows(all_to_join)

str_c("added ", nrow(data_with_nces_id_joined_match_all) - nrow(data_with_nces_id_joined_match), " rows; ",
      (nrow(data_with_nces_id_joined_match_all) - nrow(data_with_nces_id_joined_match)) * 100 / nrow(posts_100k), "% change")

## [1] "added 306574 rows; 10.4357126347848% change"

Joining additional NCES data

This data is from the ELSI Table Generator: https://nces.ed.gov/ccd/elsi/tableGenerator.aspx

nces_info_for_districts <- read_csv(here::here("data", "nces-info-for-districts.csv")) %>% 
  select(nces_id, free_and_reduced_lunch_students_public_school_2017_18, state)

nces_info_for_schools <- read_csv("data/ELSI_csv_export_6374077161361122015895.csv") %>% 
  janitor::clean_names() %>% 
  mutate(nces_id = str_extract(school_id_nces_assigned_public_school_latest_available_year, '\\(?[0-9,.]+')) %>% 
  select(nces_id, free_and_reduced_lunch_students_public_school_2018_19, state = state_name_public_school_latest_available_year)

nces_info <- nces_info_for_districts %>% 
  bind_rows(nces_info_for_schools)

nces_info$nces_id %>% nchar() %>% table() # this looks right

## .
##     7    12 
##  9949 99599

data_with_nces_id_joined_match_all$nces_id %>% nchar() %>% table() # so does this

## .
##       7      12 
## 1788106  843130

data_with_nces_id_joined_match_all %>% 
  left_join(nces_info, by = "nces_id")  %>% 
  filter(is.na(state)) # all joined!

## # A tibble: 8,552 x 8
##    page_name  url    post   year nces_id free_and_reduce… state free_and_reduce…
##    <chr>      <chr>  <chr> <dbl> <chr>   <chr>            <chr> <chr>           
##  1 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
##  2 Emerado P… https… /758…  2020 3806360 <NA>             <NA>  <NA>            
##  3 East Live… https… /176…  2020 3904391 <NA>             <NA>  <NA>            
##  4 Bluford U… https… /327…  2020 1701417 <NA>             <NA>  <NA>            
##  5 East Live… https… /176…  2020 3904391 <NA>             <NA>  <NA>            
##  6 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
##  7 South Hol… https… /102…  2020 2923190 <NA>             <NA>  <NA>            
##  8 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
##  9 East Live… https… /176…  2020 3904391 <NA>             <NA>  <NA>            
## 10 Lawrence … https… /169…  2020 2103240 <NA>             <NA>  <NA>            
## # … with 8,542 more rows

data_with_nces_id_joined_match_all %>% 
  left_join(nces_info, by = "nces_id") %>% 
  mutate(state = tolower(state)) %>% 
  count(state) %>% 
  knitr::kable() # success!

state	n
alabama	48681
alaska	14085
arizona	48136
arkansas	54050
california	173110
colorado	51655
connecticut	14118
delaware	12785
district of columbia	7045
florida	131289
georgia	86670
hawaii	822
idaho	12976
illinois	103363
indiana	70117
iowa	87696
kansas	82143
kentucky	38758
louisiana	36871
maine	24024
maryland	20089
massachusetts	59675
michigan	167390
minnesota	81597
mississippi	39368
missouri	149685
montana	19474
nebraska	54687
nevada	7291
new hampshire	20808
new jersey	69573
new mexico	23607
new york	138830
north carolina	125036
north dakota	20424
ohio	159620
oklahoma	70229
oregon	55923
pennsylvania	104584
rhode island	6823
south carolina	15876
south dakota	23082
tennessee	63480
texas	314916
utah	27214
vermont	23540
virginia	34040
washington	47557
west virginia	6683
wisconsin	156768
wyoming	18452
NA	8552

str_c("Matched ", nrow(data_with_nces_id_joined_match_all)/nrow(posts_100k) * 100, "% of rows with NCES data")

## [1] "Matched 89.5667041898548% of rows with NCES data"

Exploring NCES data

n accounts

nces_dist <- read_csv(here::here("data", "nces-info-for-districts.csv"))

nces_scho <- read_csv("data/ELSI_csv_export_6374077161361122015895.csv") %>% 
  janitor::clean_names() %>% 
  mutate(nces_id = str_extract(school_id_nces_assigned_public_school_latest_available_year, '\\(?[0-9,.]+'))

all_nces_ids <- data_with_nces_id_joined_match_all %>% distinct(nces_id)

n_districts <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  nrow()

n_districts/8193

## [1] 0.8009276

checking sample

nces_dist$frpl <- as.integer(nces_dist$free_and_reduced_lunch_students_public_school_2017_18) /
  as.integer(nces_dist$total_students_all_grades_excludes_ae_district_2017_18)

mean_fb_frpl <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  summarize(mean_frpl = mean(frpl, na.rm = TRUE))

mean_frpl <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  summarize(mean_frpl = mean(frpl, na.rm = TRUE))

mean_fb_frpl

## # A tibble: 1 x 1
##   mean_frpl
##       <dbl>
## 1     0.471

mean_frpl

## # A tibble: 1 x 1
##   mean_frpl
##       <dbl>
## 1     0.476

fb_ss <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  summarize(mean_ss = mean(as.integer(total_students_all_grades_excludes_ae_district_2017_18), na.rm = TRUE))

mean_ss <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  summarize(mean_ss = mean(as.integer(total_students_all_grades_excludes_ae_district_2017_18), na.rm = TRUE))

fb_ss

## # A tibble: 1 x 1
##   mean_ss
##     <dbl>
## 1   4881.

mean_ss

## # A tibble: 1 x 1
##   mean_ss
##     <dbl>
## 1   4517.

writing data

data_with_nces_id_joined_match_all %>% 
  left_join(nces_dist) %>% 
  select(url, nces_id, state, url, frpl, n_students = total_students_all_grades_excludes_ae_district_2017_18)

## # A tibble: 2,631,236 x 5
##    url                                         nces_id    state  frpl n_students
##    <chr>                                       <chr>      <chr> <dbl>      <dbl>
##  1 https://www.facebook.com/mountainburgps     051026000… <NA>     NA         NA
##  2 https://www.facebook.com/okaloosacountysch… 1201380    <NA>     NA         NA
##  3 https://www.facebook.com/paulrbrownleaders… 3700351    <NA>     NA         NA
##  4 https://www.facebook.com/westerncenteracad… 061692012… <NA>     NA         NA
##  5 https://www.facebook.com/englishvalleysele… 1910980    <NA>     NA         NA
##  6 https://www.facebook.com/dierksoutlaws      050534000… <NA>     NA         NA
##  7 https://www.facebook.com/muskogeeschools    4020970    <NA>     NA         NA
##  8 https://www.facebook.com/mountainburgps     051026000… <NA>     NA         NA
##  9 https://www.facebook.com/tsmsonline.org     360007605… <NA>     NA         NA
## 10 https://www.facebook.com/scv35schools       0407520    <NA>     NA         NA
## # … with 2,631,226 more rows

activity

to_join_for_activity <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union")

activity_d <- data_with_nces_id_joined_match_all %>% 
  count(nces_id) %>% 
  left_join(to_join_for_activity)

activity_d %>% 
  select(n, frpl, total_ss = total_students_all_grades_excludes_ae_district_2017_18) %>% 
  corrr::correlate() %>% 
  knitr::kable()

rowname	n	frpl	total_ss
n	NA	0.0899972	0.1307116
frpl	0.0899972	NA	0.0608803
total_ss	0.1307116	0.0608803	NA

Posts per account

activity_d %>% 
  select(n) %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	12812
Number of columns	1
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
n	0	1	247.13	275.7	1	88	175	315	4567	▇▁▁▁▁

activity_d %>% 
  ggplot(aes(x = n)) +
  geom_histogram(bins = 100) +
  ggtitle("posts per account") +
  theme_bw()

activity_d %>% 
  filter(n < 50) %>% 
  ggplot(aes(x = n)) +
  geom_histogram(bins = 100) +
  ggtitle("posts per account for accounts with less than 50 posts") +
  theme_bw() +
  scale_x_continuous(breaks = seq(from = 0, to = 49, by = 2), seq(from = 0, to = 49, by = 2)) +
  xlab(NULL)

by state

fb_state <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_nces_ids) %>% 
  count(state)

all_state <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  count(state) %>% 
  rename(total_n = n)

left_join(fb_state, all_state) %>% 
  mutate(prop = n / total_n) %>% 
  rename(fb_n = n) %>% 
  arrange(desc(prop)) %>% 
  knitr::kable()

state	fb_n	total_n	prop
district of columbia	1	1	1.0000000
hawaii	2	2	1.0000000
new hampshire	1	1	1.0000000
wyoming	66	69	0.9565217
delaware	13	14	0.9285714
florida	41	45	0.9111111
virginia	77	86	0.8953488
alaska	24	27	0.8888889
maine	58	66	0.8787879
iowa	264	307	0.8599349
michigan	331	386	0.8575130
maryland	16	19	0.8421053
washington	139	166	0.8373494
louisiana	46	55	0.8363636
colorado	84	101	0.8316832
minnesota	226	273	0.8278388
oregon	94	114	0.8245614
indiana	192	233	0.8240343
missouri	266	323	0.8235294
new jersey	237	288	0.8229167
new york	393	479	0.8204593
wisconsin	364	447	0.8143177
ohio	369	455	0.8109890
massachusetts	132	164	0.8048780
pennsylvania	279	350	0.7971429
kentucky	83	105	0.7904762
georgia	74	94	0.7872340
kansas	173	220	0.7863636
north carolina	66	84	0.7857143
north dakota	55	70	0.7857143
utah	22	28	0.7857143
arkansas	51	65	0.7846154
alabama	65	83	0.7831325
arizona	89	114	0.7807018
illinois	420	538	0.7806691
nevada	7	9	0.7777778
rhode island	14	18	0.7777778
texas	593	764	0.7761780
south dakota	79	102	0.7745098
south carolina	46	60	0.7666667
connecticut	41	54	0.7592593
california	362	477	0.7589099
montana	59	78	0.7564103
new mexico	37	49	0.7551020
west virginia	18	24	0.7500000
oklahoma	200	269	0.7434944
nebraska	147	198	0.7424242
tennessee	62	85	0.7294118
idaho	34	51	0.6666667
mississippi	50	83	0.6024096

using all possible NCES IDs (either not available or not joined)

all_possible_nces_ids <- all_institutional_facebook_urls %>% 
  distinct(nces_id)

fb_pos_n <- nces_dist %>% 
  filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>% 
  semi_join(all_possible_nces_ids) %>% 
  nrow()

fb_pos_n

## [1] 7631

fb_pos_n/8193

## [1] 0.9314049

data_with_nces_id_joined_match_all

## # A tibble: 2,631,236 x 5
##    page_name                url                       post        year nces_id  
##    <chr>                    <chr>                     <chr>      <dbl> <chr>    
##  1 Mountainburg Public Sch… https://www.facebook.com… /32428930…  2020 05102600…
##  2 Okaloosa County School … https://www.facebook.com… /42663976…  2020 1201380  
##  3 Paul R. Brown Leadershi… https://www.facebook.com… /27185905…  2020 3700351  
##  4 Western Center Academy   https://www.facebook.com… /10157361…  2020 06169201…
##  5 English Valleys Element… https://www.facebook.com… /27125007…  2020 1910980  
##  6 Dierks High School       https://www.facebook.com… /11764706…  2020 05053400…
##  7 Muskogee Public Schools  https://www.facebook.com… /10157437…  2020 4020970  
##  8 Mountainburg Public Sch… https://www.facebook.com… /32428767…  2020 05102600…
##  9 Tompkins Square Middle … https://www.facebook.com… /33497313…  2020 36000760…
## 10 Santa Cruz Valley Unifi… https://www.facebook.com… /38965960…  2020 0407520  
## # … with 2,631,226 more rows

Joining NCES IDs

2020-11-12

Loading, setting up

Let’s join!

getting data

initial join

joining on other parts of the URL

removing spaces

joining based only on integers

joining all back up - all done!

Joining additional NCES data

Exploring NCES data

n accounts

checking sample

writing data

activity

Posts per account

by state

using all possible NCES IDs (either not available or not joined)