Loading, setting up
library(tidyverse) # analysis and processing
library(DBI) # database interface
library(scales) # plots
library(quanteda) # text analysis
library(skimr) # for descriptive stats
cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))
Let’s join!
getting data
posts_100k <- tbl(cn, "posts") %>%
select(page_name, url, year) %>%
mutate(url = tolower(url)) %>%
filter(year == 2020) %>%
collect()
posts_100k <- posts_100k %>%
separate(url, into = c("url", "post"), sep = "/posts")
posts_100k <- posts_100k %>% distinct(url, post, .keep_all = TRUE)
all_institutional_facebook_urls <- read_csv("data-raw/all-institutional-facebook-urls.csv",
col_types = cols(nces_id = col_character()))
all_institutional_facebook_urls <- all_institutional_facebook_urls %>%
mutate(parsed_path = ifelse(str_sub(parsed_path, start = -1) == "/",
str_sub(parsed_path, end = -2),
parsed_path)) %>%
mutate(url = str_c("https://www.facebook.com/", parsed_path)) %>%
select(-parsed_path)
all_distinct_urls <- all_institutional_facebook_urls %>%
distinct(url, .keep_all = T)
data_with_nces_id_joined <- left_join(posts_100k, all_distinct_urls, by = "url")
initial join
data_with_nces_id_joined_not_matched <- data_with_nces_id_joined %>%
filter(is.na(nces_id))
data_with_nces_id_joined_match <- data_with_nces_id_joined %>%
filter(!is.na(nces_id))
str_c(round(nrow(data_with_nces_id_joined_match) / nrow(posts_100k) * 100, 2), "% joined")
## [1] "79.13% joined"
joining on other parts of the URL
# 4th
fourth_spot <- all_distinct_urls %>%
mutate(split_url = str_split(url, "/")) %>%
pull(split_url) %>%
map(possibly(~.[[4]], NA))
all_institutional_facebook_urls_fourth <- all_distinct_urls %>%
mutate(url = str_c("https://www.facebook.com/", unlist(fourth_spot))) %>%
filter(!is.na(url))
joined_fourth <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_fourth) %>%
filter(!is.na(nces_id))
# 5th
fifth_spot <- all_distinct_urls %>%
mutate(split_url = str_split(url, "/")) %>%
pull(split_url) %>%
map(possibly(~.[[5]], NA))
all_institutional_facebook_urls_fifth <- all_distinct_urls %>%
mutate(url = str_c("https://www.facebook.com/", unlist(fifth_spot))) %>%
filter(!is.na(url))
joined_fifth <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_fifth) %>%
filter(!is.na(nces_id))
# 6th
sixth_spot <- all_distinct_urls %>%
mutate(split_url = str_split(url, "/")) %>%
pull(split_url) %>%
map(possibly(~.[[6]], NA))
all_institutional_facebook_urls_sixth <- all_distinct_urls %>%
mutate(url = str_c("https://www.facebook.com/", unlist(sixth_spot))) %>%
filter(!is.na(url))
joined_sixth <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_sixth) %>%
filter(!is.na(nces_id))
# 7th
seventh_spot <- all_distinct_urls %>%
mutate(split_url = str_split(url, "/")) %>%
pull(split_url) %>%
map(possibly(~.[[7]], NA))
all_institutional_facebook_urls_seventh <- all_distinct_urls %>%
mutate(url = str_c("https://www.facebook.com/", unlist(seventh_spot))) %>%
filter(!is.na(url))
joined_seventh <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_seventh) %>%
filter(!is.na(nces_id))
# 8th
eighth_spot <- all_distinct_urls %>%
mutate(split_url = str_split(url, "/")) %>%
pull(split_url) %>%
map(possibly(~.[[8]], NA))
all_institutional_facebook_urls_eighth <- all_distinct_urls %>%
mutate(url = str_c("https://www.facebook.com/", unlist(eighth_spot))) %>%
filter(!is.na(url))
joined_eighth <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_eighth) %>%
filter(!is.na(nces_id))
# 9th
ninth_spot <- all_distinct_urls %>%
mutate(split_url = str_split(url, "/")) %>%
pull(split_url) %>%
map(possibly(~.[[9]], NA))
all_institutional_facebook_urls_ninth <- all_distinct_urls %>%
mutate(url = str_c("https://www.facebook.com/", unlist(ninth_spot))) %>%
filter(!is.na(url))
joined_ninth <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_ninth) %>%
filter(!is.na(nces_id))
new_joined_data <- bind_rows(joined_fourth) %>%
bind_rows(joined_fifth) %>%
bind_rows(joined_sixth) %>%
bind_rows(joined_seventh) %>%
bind_rows(joined_eighth) %>%
bind_rows(joined_ninth) %>%
distinct(page_name, post, .keep_all = TRUE)
removing spaces
# fourth
all_institutional_facebook_urls_fourth_no_dash <- all_institutional_facebook_urls_fourth %>%
mutate(url = str_remove_all(url, "-"))
joined_fourth_no_dash <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_fourth_no_dash) %>%
filter(!is.na(nces_id))
# fifth
all_institutional_facebook_urls_fifth_no_dash <- all_institutional_facebook_urls_fifth %>%
mutate(url = str_remove_all(url, "-"))
joined_fifth_no_dash <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_fifth_no_dash) %>%
filter(!is.na(nces_id))
# sixth
all_institutional_facebook_urls_sixth_no_dash <- all_institutional_facebook_urls_sixth %>%
mutate(url = str_remove_all(url, "-"))
joined_sixth_no_dash <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_sixth_no_dash) %>%
filter(!is.na(nces_id))
# seventh
all_institutional_facebook_urls_seventh_no_dash <- all_institutional_facebook_urls_seventh %>%
mutate(url = str_remove_all(url, "-"))
joined_seventh_no_dash <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_seventh_no_dash) %>%
filter(!is.na(nces_id))
# eighth
all_institutional_facebook_urls_eighth_no_dash <- all_institutional_facebook_urls_eighth %>%
mutate(url = str_remove_all(url, "-"))
joined_eighth_no_dash <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(all_institutional_facebook_urls_eighth_no_dash) %>%
filter(!is.na(nces_id))
new_data_without_dashes <- bind_rows(joined_fourth_no_dash) %>%
bind_rows(joined_fifth_no_dash) %>%
bind_rows(joined_sixth_no_dash) %>%
bind_rows(joined_seventh_no_dash) %>%
bind_rows(joined_eighth_no_dash) %>%
distinct(page_name, post, .keep_all = TRUE)
joining based only on integers
all_institutional_facebook_urls_b <- all_distinct_urls
all_of_the_numbers <- all_institutional_facebook_urls_b %>%
mutate(id = stringr::str_extract_all(url, '\\(?[0-9,.]+')) %>%
pull(id)
find_15_digits <- function(x) {
x[nchar(x) == 15]
}
length_f <- function(x) {
length(x) > 0
}
all_of_the_numbers <- all_of_the_numbers %>%
map(possibly(find_15_digits, NA))
all_institutional_facebook_urls_b$id <- map(all_of_the_numbers, function(x) if(identical(x, character(0))) NA_character_ else x)
all_institutional_facebook_urls_b <- all_institutional_facebook_urls_b %>%
unnest(id)
x <- all_institutional_facebook_urls_b %>%
mutate(url = str_c("https://www.facebook.com/", id)) %>%
filter(!is.na(url)) %>%
select(-id)
from_15_char_id <- data_with_nces_id_joined_not_matched %>%
select(-nces_id) %>%
left_join(x) %>%
filter(!is.na(nces_id))
joining all back up - all done!
all_to_join <- bind_rows(new_joined_data) %>%
bind_rows(new_data_without_dashes) %>%
bind_rows(from_15_char_id)
all_to_join <- all_to_join %>%
distinct(url, post, nces_id, page_name, year)
data_with_nces_id_joined_match_all <- data_with_nces_id_joined_match %>%
bind_rows(all_to_join)
str_c("added ", nrow(data_with_nces_id_joined_match_all) - nrow(data_with_nces_id_joined_match), " rows; ",
(nrow(data_with_nces_id_joined_match_all) - nrow(data_with_nces_id_joined_match)) * 100 / nrow(posts_100k), "% change")
## [1] "added 306574 rows; 10.4357126347848% change"
Joining additional NCES data
This data is from the ELSI Table Generator: https://nces.ed.gov/ccd/elsi/tableGenerator.aspx
nces_info_for_districts <- read_csv(here::here("data", "nces-info-for-districts.csv")) %>%
select(nces_id, free_and_reduced_lunch_students_public_school_2017_18, state)
nces_info_for_schools <- read_csv("data/ELSI_csv_export_6374077161361122015895.csv") %>%
janitor::clean_names() %>%
mutate(nces_id = str_extract(school_id_nces_assigned_public_school_latest_available_year, '\\(?[0-9,.]+')) %>%
select(nces_id, free_and_reduced_lunch_students_public_school_2018_19, state = state_name_public_school_latest_available_year)
nces_info <- nces_info_for_districts %>%
bind_rows(nces_info_for_schools)
nces_info$nces_id %>% nchar() %>% table() # this looks right
## .
## 7 12
## 9949 99599
data_with_nces_id_joined_match_all$nces_id %>% nchar() %>% table() # so does this
## .
## 7 12
## 1788106 843130
data_with_nces_id_joined_match_all %>%
left_join(nces_info, by = "nces_id") %>%
filter(is.na(state)) # all joined!
## # A tibble: 8,552 x 8
## page_name url post year nces_id free_and_reduce… state free_and_reduce…
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 Lawrence … https… /169… 2020 2103240 <NA> <NA> <NA>
## 2 Emerado P… https… /758… 2020 3806360 <NA> <NA> <NA>
## 3 East Live… https… /176… 2020 3904391 <NA> <NA> <NA>
## 4 Bluford U… https… /327… 2020 1701417 <NA> <NA> <NA>
## 5 East Live… https… /176… 2020 3904391 <NA> <NA> <NA>
## 6 Lawrence … https… /169… 2020 2103240 <NA> <NA> <NA>
## 7 South Hol… https… /102… 2020 2923190 <NA> <NA> <NA>
## 8 Lawrence … https… /169… 2020 2103240 <NA> <NA> <NA>
## 9 East Live… https… /176… 2020 3904391 <NA> <NA> <NA>
## 10 Lawrence … https… /169… 2020 2103240 <NA> <NA> <NA>
## # … with 8,542 more rows
data_with_nces_id_joined_match_all %>%
left_join(nces_info, by = "nces_id") %>%
mutate(state = tolower(state)) %>%
count(state) %>%
knitr::kable() # success!
alabama |
48681 |
alaska |
14085 |
arizona |
48136 |
arkansas |
54050 |
california |
173110 |
colorado |
51655 |
connecticut |
14118 |
delaware |
12785 |
district of columbia |
7045 |
florida |
131289 |
georgia |
86670 |
hawaii |
822 |
idaho |
12976 |
illinois |
103363 |
indiana |
70117 |
iowa |
87696 |
kansas |
82143 |
kentucky |
38758 |
louisiana |
36871 |
maine |
24024 |
maryland |
20089 |
massachusetts |
59675 |
michigan |
167390 |
minnesota |
81597 |
mississippi |
39368 |
missouri |
149685 |
montana |
19474 |
nebraska |
54687 |
nevada |
7291 |
new hampshire |
20808 |
new jersey |
69573 |
new mexico |
23607 |
new york |
138830 |
north carolina |
125036 |
north dakota |
20424 |
ohio |
159620 |
oklahoma |
70229 |
oregon |
55923 |
pennsylvania |
104584 |
rhode island |
6823 |
south carolina |
15876 |
south dakota |
23082 |
tennessee |
63480 |
texas |
314916 |
utah |
27214 |
vermont |
23540 |
virginia |
34040 |
washington |
47557 |
west virginia |
6683 |
wisconsin |
156768 |
wyoming |
18452 |
NA |
8552 |
str_c("Matched ", nrow(data_with_nces_id_joined_match_all)/nrow(posts_100k) * 100, "% of rows with NCES data")
## [1] "Matched 89.5667041898548% of rows with NCES data"
Exploring NCES data
n accounts
nces_dist <- read_csv(here::here("data", "nces-info-for-districts.csv"))
nces_scho <- read_csv("data/ELSI_csv_export_6374077161361122015895.csv") %>%
janitor::clean_names() %>%
mutate(nces_id = str_extract(school_id_nces_assigned_public_school_latest_available_year, '\\(?[0-9,.]+'))
all_nces_ids <- data_with_nces_id_joined_match_all %>% distinct(nces_id)
n_districts <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
semi_join(all_nces_ids) %>%
nrow()
n_districts/8193
## [1] 0.8009276
checking sample
nces_dist$frpl <- as.integer(nces_dist$free_and_reduced_lunch_students_public_school_2017_18) /
as.integer(nces_dist$total_students_all_grades_excludes_ae_district_2017_18)
mean_fb_frpl <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
semi_join(all_nces_ids) %>%
summarize(mean_frpl = mean(frpl, na.rm = TRUE))
mean_frpl <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
summarize(mean_frpl = mean(frpl, na.rm = TRUE))
mean_fb_frpl
## # A tibble: 1 x 1
## mean_frpl
## <dbl>
## 1 0.471
mean_frpl
## # A tibble: 1 x 1
## mean_frpl
## <dbl>
## 1 0.476
fb_ss <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
semi_join(all_nces_ids) %>%
summarize(mean_ss = mean(as.integer(total_students_all_grades_excludes_ae_district_2017_18), na.rm = TRUE))
mean_ss <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
summarize(mean_ss = mean(as.integer(total_students_all_grades_excludes_ae_district_2017_18), na.rm = TRUE))
fb_ss
## # A tibble: 1 x 1
## mean_ss
## <dbl>
## 1 4881.
mean_ss
## # A tibble: 1 x 1
## mean_ss
## <dbl>
## 1 4517.
writing data
data_with_nces_id_joined_match_all %>%
left_join(nces_dist) %>%
select(url, nces_id, state, url, frpl, n_students = total_students_all_grades_excludes_ae_district_2017_18)
## # A tibble: 2,631,236 x 5
## url nces_id state frpl n_students
## <chr> <chr> <chr> <dbl> <dbl>
## 1 https://www.facebook.com/mountainburgps 051026000… <NA> NA NA
## 2 https://www.facebook.com/okaloosacountysch… 1201380 <NA> NA NA
## 3 https://www.facebook.com/paulrbrownleaders… 3700351 <NA> NA NA
## 4 https://www.facebook.com/westerncenteracad… 061692012… <NA> NA NA
## 5 https://www.facebook.com/englishvalleysele… 1910980 <NA> NA NA
## 6 https://www.facebook.com/dierksoutlaws 050534000… <NA> NA NA
## 7 https://www.facebook.com/muskogeeschools 4020970 <NA> NA NA
## 8 https://www.facebook.com/mountainburgps 051026000… <NA> NA NA
## 9 https://www.facebook.com/tsmsonline.org 360007605… <NA> NA NA
## 10 https://www.facebook.com/scv35schools 0407520 <NA> NA NA
## # … with 2,631,226 more rows
activity
to_join_for_activity <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union")
activity_d <- data_with_nces_id_joined_match_all %>%
count(nces_id) %>%
left_join(to_join_for_activity)
activity_d %>%
select(n, frpl, total_ss = total_students_all_grades_excludes_ae_district_2017_18) %>%
corrr::correlate() %>%
knitr::kable()
n |
NA |
0.0899972 |
0.1307116 |
frpl |
0.0899972 |
NA |
0.0608803 |
total_ss |
0.1307116 |
0.0608803 |
NA |
Posts per account
activity_d %>%
select(n) %>%
skimr::skim()
Data summary
Name |
Piped data |
Number of rows |
12812 |
Number of columns |
1 |
_______________________ |
|
Column type frequency: |
|
numeric |
1 |
________________________ |
|
Group variables |
None |
Variable type: numeric
n |
0 |
1 |
247.13 |
275.7 |
1 |
88 |
175 |
315 |
4567 |
▇▁▁▁▁ |
activity_d %>%
ggplot(aes(x = n)) +
geom_histogram(bins = 100) +
ggtitle("posts per account") +
theme_bw()

activity_d %>%
filter(n < 50) %>%
ggplot(aes(x = n)) +
geom_histogram(bins = 100) +
ggtitle("posts per account for accounts with less than 50 posts") +
theme_bw() +
scale_x_continuous(breaks = seq(from = 0, to = 49, by = 2), seq(from = 0, to = 49, by = 2)) +
xlab(NULL)

by state
fb_state <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
semi_join(all_nces_ids) %>%
count(state)
all_state <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
count(state) %>%
rename(total_n = n)
left_join(fb_state, all_state) %>%
mutate(prop = n / total_n) %>%
rename(fb_n = n) %>%
arrange(desc(prop)) %>%
knitr::kable()
district of columbia |
1 |
1 |
1.0000000 |
hawaii |
2 |
2 |
1.0000000 |
new hampshire |
1 |
1 |
1.0000000 |
wyoming |
66 |
69 |
0.9565217 |
delaware |
13 |
14 |
0.9285714 |
florida |
41 |
45 |
0.9111111 |
virginia |
77 |
86 |
0.8953488 |
alaska |
24 |
27 |
0.8888889 |
maine |
58 |
66 |
0.8787879 |
iowa |
264 |
307 |
0.8599349 |
michigan |
331 |
386 |
0.8575130 |
maryland |
16 |
19 |
0.8421053 |
washington |
139 |
166 |
0.8373494 |
louisiana |
46 |
55 |
0.8363636 |
colorado |
84 |
101 |
0.8316832 |
minnesota |
226 |
273 |
0.8278388 |
oregon |
94 |
114 |
0.8245614 |
indiana |
192 |
233 |
0.8240343 |
missouri |
266 |
323 |
0.8235294 |
new jersey |
237 |
288 |
0.8229167 |
new york |
393 |
479 |
0.8204593 |
wisconsin |
364 |
447 |
0.8143177 |
ohio |
369 |
455 |
0.8109890 |
massachusetts |
132 |
164 |
0.8048780 |
pennsylvania |
279 |
350 |
0.7971429 |
kentucky |
83 |
105 |
0.7904762 |
georgia |
74 |
94 |
0.7872340 |
kansas |
173 |
220 |
0.7863636 |
north carolina |
66 |
84 |
0.7857143 |
north dakota |
55 |
70 |
0.7857143 |
utah |
22 |
28 |
0.7857143 |
arkansas |
51 |
65 |
0.7846154 |
alabama |
65 |
83 |
0.7831325 |
arizona |
89 |
114 |
0.7807018 |
illinois |
420 |
538 |
0.7806691 |
nevada |
7 |
9 |
0.7777778 |
rhode island |
14 |
18 |
0.7777778 |
texas |
593 |
764 |
0.7761780 |
south dakota |
79 |
102 |
0.7745098 |
south carolina |
46 |
60 |
0.7666667 |
connecticut |
41 |
54 |
0.7592593 |
california |
362 |
477 |
0.7589099 |
montana |
59 |
78 |
0.7564103 |
new mexico |
37 |
49 |
0.7551020 |
west virginia |
18 |
24 |
0.7500000 |
oklahoma |
200 |
269 |
0.7434944 |
nebraska |
147 |
198 |
0.7424242 |
tennessee |
62 |
85 |
0.7294118 |
idaho |
34 |
51 |
0.6666667 |
mississippi |
50 |
83 |
0.6024096 |
using all possible NCES IDs (either not available or not joined)
all_possible_nces_ids <- all_institutional_facebook_urls %>%
distinct(nces_id)
fb_pos_n <- nces_dist %>%
filter(agency_type_district_2017_18 == "1-Regular local school district that is NOT a component of a supervisory union") %>%
semi_join(all_possible_nces_ids) %>%
nrow()
fb_pos_n
## [1] 7631
fb_pos_n/8193
## [1] 0.9314049
data_with_nces_id_joined_match_all
## # A tibble: 2,631,236 x 5
## page_name url post year nces_id
## <chr> <chr> <chr> <dbl> <chr>
## 1 Mountainburg Public Sch… https://www.facebook.com… /32428930… 2020 05102600…
## 2 Okaloosa County School … https://www.facebook.com… /42663976… 2020 1201380
## 3 Paul R. Brown Leadershi… https://www.facebook.com… /27185905… 2020 3700351
## 4 Western Center Academy https://www.facebook.com… /10157361… 2020 06169201…
## 5 English Valleys Element… https://www.facebook.com… /27125007… 2020 1910980
## 6 Dierks High School https://www.facebook.com… /11764706… 2020 05053400…
## 7 Muskogee Public Schools https://www.facebook.com… /10157437… 2020 4020970
## 8 Mountainburg Public Sch… https://www.facebook.com… /32428767… 2020 05102600…
## 9 Tompkins Square Middle … https://www.facebook.com… /33497313… 2020 36000760…
## 10 Santa Cruz Valley Unifi… https://www.facebook.com… /38965960… 2020 0407520
## # … with 2,631,226 more rows