library(tidyverse) # data manipulation and plotting
library(DBI) # database interface
library(scales) # axis and label formatting for plots
library(quanteda) # text analysis
cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))
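To confirm the connection points at the expected table before querying, DBI can list the tables and fields in the SQLite file; a minimal check, assuming the table is named `posts` as used below:
dbListTables(cn) # should include "posts"
dbListFields(cn, "posts") # column names, e.g. page_name, message, created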
ss_collected <- tbl(cn, "posts") %>%
filter(year == 2019) %>%
collect()
ss_collected
## # A tibble: 3,262,779 x 37
## page_name user_name facebook_id likes_at_posting created type likes comments
## <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 Sumter S… SumterSC… 3.57e14 NA 1.58e9 Photo 91 5
## 2 Licking … LHLocalS… 1.05e14 1598 1.58e9 Stat… 89 4
## 3 West Jef… Westjeff… 4.76e14 NA 1.58e9 Photo 122 2
## 4 Walled L… wlcsd 3.59e14 5224 1.58e9 Photo 123 1
## 5 North Ro… Royalton… 1.66e14 3127 1.58e9 Photo 90 1
## 6 Leona Sc… LeonaSch… 1.34e14 NA 1.58e9 Photo 2 0
## 7 Maple Pr… maplepri… 1.97e14 NA 1.58e9 Photo 73 0
## 8 Cobb Cou… cobbscho… 8.47e10 40307 1.58e9 Photo 42 5
## 9 Worthing… Worthing… 1.02e14 6516 1.58e9 Photo 111 0
## 10 Chagrin … CFEVS 2.72e14 1831 1.58e9 Photo 56 2
## # … with 3,262,769 more rows, and 29 more variables: shares <dbl>, love <dbl>,
## # wow <dbl>, haha <dbl>, sad <dbl>, angry <dbl>, care <dbl>,
## # video_share_status <chr>, post_views <dbl>, total_views <dbl>,
## # total_views_for_all_crossposts <dbl>, video_length <chr>, url <chr>,
## # message <chr>, link <chr>, final_link <chr>, image_text <chr>,
## # link_text <chr>, description <chr>, sponsor_id <chr>, sponsor_name <chr>,
## # overperforming_score <dbl>, hour <int>, day <dbl>, year <dbl>, month <dbl>,
## # day_of_week <dbl>, day_of_month <int>, created_rounded_to_day <dbl>
ss <- ss_collected %>%
mutate(links_list = stringr::str_extract_all(message,
"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"))
unnested_urls <- ss %>%
unnest(links_list) %>%
filter(!is.na(links_list))
# my_long_urls <- longurl::expand_urls(unnested_urls$links_list) # is slow; should only do with short links
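One way to keep that expansion tractable, sketched below, is to expand only links whose domain is a known shortener; the shortener list here is an assumption based on the domains that show up in the counts further down (bit.ly, ow.ly, tinyurl.com, buff.ly, goo.gl, t.co).
shorteners <- c("bit.ly", "ow.ly", "tinyurl.com", "buff.ly", "goo.gl", "t.co") # assumed list of shortener domains
short_links <- unnested_urls %>%
filter(urltools::domain(links_list) %in% shorteners) %>%
pull(links_list) %>%
unique()
# my_long_urls <- longurl::expand_urls(short_links) # still slow, but over far fewer URLs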
my_long_urls_processed <- unnested_urls %>%
mutate(orig_url = str_split(links_list, ":=:")) %>% # handle links that contain two URLs joined by ":=:"
mutate(orig_url_second = map(orig_url, pluck, 2)) %>% # second URL if present, otherwise NULL
rowwise() %>%
mutate(url = ifelse(is.null(orig_url_second), orig_url, orig_url_second)) # prefer the second URL when there is one
parsed_urls <- urltools::url_parse(my_long_urls_processed$url)
parsed_urls %>%
count(domain) %>%
arrange(desc(n)) %>%
filter(!is.na(domain)) %>%
slice(1:40) %>%
knitr::kable()
domain | n |
---|---|
bit.ly | 22637 |
twitter.com | 17383 |
www.youtube.com | 15688 |
docs.google.com | 14306 |
www.facebook.com | 10317 |
www.google.com | 8606 |
ow.ly | 8409 |
drive.google.com | 8313 |
www.smore.com | 5425 |
www.signupgenius.com | 5351 |
mailchi.mp | 4079 |
www.eventbrite.com | 2745 |
www.nhaschools.com | 2678 |
content.parent-institute.com | 2434 |
www.surveymonkey.com | 2365 |
www.instagram.com | 2202 |
tinyurl.com | 2187 |
www.applitrack.com | 2070 |
buff.ly | 2036 |
vimeo.com | 1832 |
sites.google.com | 1706 |
bookfairs.scholastic.com | 1696 |
edne.tw | 1666 |
core-docs.s3.amazonaws.com | 1637 |
t.co | 1282 |
www.maranausd.org | 1179 |
accounts.google.com | 1138 |
myemail.constantcontact.com | 1089 |
youtu.be | 1033 |
edl.io | 980 |
uticak12.org | 936 |
goo.gl | 877 |
m.signupgenius.com | 874 |
www.donorschoose.org | 867 |
signup.com | 863 |
www.nfhsnetwork.com | 775 |
www.tulsaschools.org | 753 |
www.jostens.com | 733 |
forms.gle | 722 |
5il.co | 689 |
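Since scales is loaded for plotting, the same counts can also be shown graphically; a sketch of a bar chart of the 20 most common domains, reusing parsed_urls from above:
parsed_urls %>%
count(domain) %>%
arrange(desc(n)) %>%
filter(!is.na(domain)) %>%
slice(1:20) %>%
ggplot(aes(x = reorder(domain, n), y = n)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = scales::comma) +
labs(x = NULL, y = "number of links")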
parsed_urls$month <- my_long_urls_processed$month
parsed_urls %>%
as_tibble()
## # A tibble: 458,087 x 7
## scheme domain port path parameter fragment month
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 https www.bridge… <NA> <NA> <NA> <NA> 12
## 2 https www.bridge… <NA> <NA> <NA> <NA> 12
## 3 https www.facebo… <NA> login/ next=https%3A%2F%2Fw… <NA> 12
## 4 https khqa.com <NA> sports/content… <NA> <NA> 12
## 5 https www.riverc… <NA> public/genie/4… <NA> <NA> 12
## 6 https www.pinter… <NA> pin/3777394874… invite_code=417602ad… <NA> 12
## 7 https www.covele… <NA> news/autistic-… <NA> <NA> 12
## 8 https www.facebo… <NA> events/4299197… ti=cl <NA> 12
## 9 https dreamers.s… <NA> dreamers/noah-… <NA> <NA> 12
## 10 https www.facebo… <NA> 259617042211/p… d=n <NA> 12
## # … with 458,077 more rows
# could probably use table above for this
sum_by_domain <- parsed_urls %>%
group_by(month) %>%
count(domain) %>%
arrange(desc(n)) %>%
filter(!is.na(domain),
n > 0) %>%
ungroup() %>%
group_by(domain) %>%
summarize(sum_n = sum(n)) %>%
arrange(desc(sum_n)) %>%
slice(1:30) # grabbing 30 most common domains
parsed_urls %>%
group_by(month) %>%
count(domain) %>%
arrange(desc(n)) %>%
filter(!is.na(domain),
n > 0) %>%
spread(month, n, fill = 0) %>%
semi_join(sum_by_domain, by = "domain") # keep only the 30 most common domains
## # A tibble: 30 x 13
## domain `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 accou… 79 110 110 142 146 67 42 68 101 122 95
## 2 bit.ly 1819 1938 1924 2073 2204 1594 1501 1770 2066 2204 1949
## 3 bookf… 45 167 249 183 85 1 0 23 203 395 255
## 4 buff.… 128 155 169 150 199 180 155 174 194 251 141
## 5 conte… 249 203 228 223 210 212 187 138 175 181 187
## 6 core-… 114 123 120 109 97 81 122 189 171 214 168
## 7 docs.… 1184 1239 1375 1499 1519 611 672 1062 1299 1542 1267
## 8 drive… 716 697 759 806 862 491 458 643 750 829 721
## 9 edl.io 140 145 177 128 44 25 10 31 69 92 70
## 10 edne.… 173 184 228 206 219 89 51 92 115 131 99
## # … with 20 more rows, and 1 more variable: `12` <dbl>
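The wide table above can also be drawn as monthly trends; a sketch that keeps the 30 most common domains (via sum_by_domain) and plots one line per domain:
parsed_urls %>%
count(month, domain) %>%
semi_join(sum_by_domain, by = "domain") %>%
ggplot(aes(x = month, y = n, group = domain)) +
geom_line(alpha = .5) +
scale_x_continuous(breaks = 1:12) +
labs(x = "month (2019)", y = "links posted")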
my_corpus <- corpus(ss_collected, text_field = "message")
my_tokens <- tokens(my_corpus, remove_symbols = TRUE, remove_numbers = TRUE, remove_punct = TRUE, remove_url = TRUE)
my_dfm <- quanteda::dfm(my_tokens) %>%
dfm_remove(stopwords("en")) # newer quanteda versions prefer dfm_remove() over dfm(remove = ...)
Overall
textstat_frequency(my_dfm, n = 10, groups = "month") %>%
knitr::kable()
feature | frequency | rank | docfreq | group |
---|---|---|---|---|
school | 103207 | 1 | 65704 | 1 |
students | 66786 | 2 | 45755 | 1 |
january | 48551 | 3 | 38968 | 1 |
high | 30388 | 4 | 22458 | 1 |
day | 26288 | 5 | 20649 | 1 |
please | 25557 | 6 | 21338 | 1 |
today | 25166 | 7 | 23060 | 1 |
schools | 21933 | 8 | 18005 | 1 |
district | 19384 | 9 | 14831 | 1 |
basketball | 18790 | 10 | 14173 | 1 |
school | 123439 | 1 | 79047 | 10 |
students | 99756 | 2 | 68680 | 10 |
october | 50765 | 3 | 39367 | 10 |
day | 46329 | 4 | 36091 | 10 |
high | 40665 | 5 | 30965 | 10 |
week | 36885 | 6 | 29765 | 10 |
today | 33868 | 7 | 31259 | 10 |
please | 30575 | 8 | 25361 | 10 |
thank | 30113 | 9 | 27416 | 10 |
great | 25972 | 10 | 23609 | 10 |
school | 95363 | 1 | 62187 | 11 |
students | 82755 | 2 | 57212 | 11 |
november | 38866 | 3 | 30466 | 11 |
day | 37926 | 4 | 30865 | 11 |
high | 33955 | 5 | 25834 | 11 |
thank | 27948 | 6 | 24993 | 11 |
today | 27052 | 7 | 25003 | 11 |
please | 23150 | 8 | 19440 | 11 |
week | 21275 | 9 | 17923 | 11 |
grade | 20779 | 10 | 16174 | 11 |
school | 80006 | 1 | 52989 | 12 |
students | 71638 | 2 | 49328 | 12 |
december | 33260 | 3 | 26007 | 12 |
high | 28912 | 4 | 21863 | 12 |
holiday | 24383 | 5 | 20433 | 12 |
day | 24091 | 6 | 19243 | 12 |
today | 22823 | 7 | 21223 | 12 |
thank | 20615 | 8 | 18794 | 12 |
christmas | 20388 | 9 | 16648 | 12 |
grade | 20135 | 10 | 15359 | 12 |
school | 117857 | 1 | 74441 | 2 |
students | 79804 | 2 | 54522 | 2 |
february | 48261 | 3 | 39433 | 2 |
day | 39745 | 4 | 30402 | 2 |
high | 37118 | 5 | 27191 | 2 |
today | 32859 | 6 | 30176 | 2 |
please | 26934 | 7 | 22508 | 2 |
week | 23618 | 8 | 19090 | 2 |
schools | 23235 | 9 | 19131 | 2 |
district | 22555 | 10 | 17707 | 2 |
school | 103043 | 1 | 64468 | 3 |
students | 84923 | 2 | 57036 | 3 |
march | 53608 | 3 | 41098 | 3 |
high | 36494 | 4 | 26657 | 3 |
day | 32890 | 5 | 25720 | 3 |
today | 26096 | 6 | 24019 | 3 |
please | 23557 | 7 | 19556 | 3 |
team | 22236 | 8 | 15192 | 3 |
grade | 22052 | 9 | 16038 | 3 |
week | 21832 | 10 | 18147 | 3 |
school | 102974 | 1 | 64528 | 4 |
students | 84098 | 2 | 56440 | 4 |
april | 53835 | 3 | 40066 | 4 |
high | 39038 | 4 | 28459 | 4 |
day | 33110 | 5 | 25795 | 4 |
may | 27496 | 6 | 19354 | 4 |
today | 27369 | 7 | 25114 | 4 |
please | 24917 | 8 | 20723 | 4 |
grade | 22831 | 9 | 16575 | 4 |
year | 22159 | 10 | 17485 | 4 |
school | 122636 | 1 | 76986 | 5 |
students | 93888 | 2 | 63547 | 5 |
may | 59557 | 3 | 42854 | 5 |
day | 48075 | 4 | 37576 | 5 |
high | 46482 | 5 | 33963 | 5 |
year | 35984 | 6 | 27997 | 5 |
thank | 32584 | 7 | 29369 | 5 |
grade | 30466 | 8 | 22603 | 5 |
congratulations | 30282 | 9 | 29303 | 5 |
today | 30028 | 10 | 27726 | 5 |
school | 65478 | 1 | 40483 | 6 |
students | 40442 | 2 | 27139 | 6 |
summer | 26396 | 3 | 19828 | 6 |
high | 21312 | 4 | 15676 | 6 |
june | 20356 | 5 | 15579 | 6 |
year | 19797 | 6 | 15531 | 6 |
day | 18330 | 7 | 14555 | 6 |
congratulations | 12790 | 8 | 12469 | 6 |
class | 12366 | 9 | 10324 | 6 |
please | 11881 | 10 | 10088 | 6 |
school | 63036 | 1 | 36722 | 7 |
students | 28105 | 2 | 19112 | 7 |
july | 19038 | 3 | 14424 | 7 |
year | 17986 | 4 | 14232 | 7 |
summer | 16909 | 5 | 13380 | 7 |
new | 16011 | 6 | 12279 | 7 |
please | 14586 | 7 | 12182 | 7 |
high | 14399 | 8 | 10632 | 7 |
august | 14268 | 9 | 9985 | 7 |
day | 12080 | 10 | 9930 | 7 |
school | 126719 | 1 | 75257 | 8 |
students | 59467 | 2 | 41398 | 8 |
year | 38347 | 3 | 30835 | 8 |
day | 36048 | 4 | 28434 | 8 |
august | 31280 | 5 | 23541 | 8 |
new | 30871 | 6 | 23973 | 8 |
please | 27540 | 7 | 22026 | 8 |
first | 26565 | 8 | 22359 | 8 |
high | 24629 | 9 | 18553 | 8 |
back | 20065 | 10 | 17897 | 8 |
school | 107876 | 1 | 68223 | 9 |
students | 77116 | 2 | 53568 | 9 |
day | 41388 | 3 | 31648 | 9 |
september | 36070 | 4 | 28086 | 9 |
high | 33584 | 5 | 25410 | 9 |
please | 27652 | 6 | 22650 | 9 |
today | 27501 | 7 | 25294 | 9 |
week | 27484 | 8 | 22300 | 9 |
year | 22457 | 9 | 18196 | 9 |
great | 22186 | 10 | 20223 | 9 |
By month - weighted
my_dfm %>%
dfm_tfidf() %>%
textstat_frequency(n = 10, groups = "month", force = TRUE) %>%
as_tibble() %>%
select(feature, rank, group) %>%
group_split(group) %>% # group_split() takes an unquoted column, not a string
knitr::kable()
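An alternative presentation, sketched here, keeps the grouped tf-idf frequencies in a single table with one column of top terms per month, using spread() as in the domain table above:
my_dfm %>%
dfm_tfidf() %>%
textstat_frequency(n = 10, groups = "month", force = TRUE) %>%
as_tibble() %>%
select(group, rank, feature) %>%
spread(group, feature) %>%
knitr::kable()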
By month - not weighted
my_dfm %>%
textstat_frequency(n = 10, groups = "month", force = TRUE) %>%
as_tibble() %>%
select(feature, rank, group) %>%
group_split(group) %>% # as above, unquoted column name
knitr::kable()
From report
kwic(my_tokens, "learning") %>% slice(1:5)
##
## [text130, 12] year of happiness health success | learning |
## [text135, 29] wait for all the new | learning |
## [text167, 4] Happy to our | learning |
## [text215, 31] our school cafeterias 21st century | learning |
## [text247, 13] adventures new friends and new | learning |
##
## for you your family#happynewyear
## new friendships and new discoveries
## community
## in action
##
kwic(my_tokens, "technology") %>% slice(1:5)
##
## [text273, 32] School for the Arts and | Technology |
## [text617, 100] taught middle school science and | technology |
## [text987, 50] classroom projects and building needs | technology |
## [text1224, 20] students love learning about science | technology |
## [text1513, 34] heating telephones copiers and classroom | technology |
##
## Freshman Open House#WeAreSouthfield#ScholarsPositionedForSuccess
## traveled to students homes as
## social-emotional supports reading and math
## engineering and math
## District personnel and network server
kwic(my_tokens, "student") %>% slice(1:5)
##
## [text78, 65] with the principal teachers and | student |
## [text161, 42] the best in education one | student |
## [text264, 30] community Gracias Please join our | student |
## [text273, 3] Your 8th-grade | student |
## [text283, 15] new year Know that every | student |
##
## council Daily Herald#d25ItsPersonal#d25ALookBack
## at a time Happy New
## athletes athletic coaches and alumni
## is on his or her
## family and staff member who
kwic(my_tokens, "education") %>% slice(1:5)
##
## [text161, 40] continue providing the best in | education |
## [text224, 110] Some of our Board of | Education |
## [text224, 151] students with the very best | education |
## [text303, 18] you to our Board of | Education |
## [text542, 64] the Mississippi Alliance for Arts | Education |
##
## one student at a time
## members are alumni and even
## possible What a legacy- Welcome
## members for their service These
## in Ms Stanford is chair
kwic(my_tokens, "teaching") %>% slice(1:5)
##
## [text343, 2] Elementary | Teaching |
## [text529, 17] Academy Read how the Capital | Teaching |
## [text542, 94] been a member of the | Teaching |
## [text542, 131] has received recognition for choreography | teaching |
## [text542, 185] technical skill In addition to | teaching |
##
## Position
## Residency helped propel her into
## Artist Roster for the Mississippi
## and directing of over twenty
## and directing at MSA Ms
kwic(my_tokens, "online") %>% slice(1:5)
##
## [text186, 29] creation of your painting Register | online |
## [text469, 19] all of the videos posted | online |
## [text593, 67] about Dawn and Lance's awards | online |
## [text676, 116] through the Avenues for Hope | Online |
## [text804, 27] yearbook links to purchases then | online |
##
## at bit.ly BCE_Winter2020
## reached almost half a million
## at
## Giving Campaign
## Make sure to click on
kwic(my_tokens, "teacher") %>% slice(1:5)
##
## [text16, 22] Students return next Tuesday January | Teacher |
## [text253, 18] Schools family lost an amazing | teacher |
## [text319, 30] They are nominated by a | teacher |
## [text428, 19] Hollister Middle School Vocal Music | Teacher |
## [text441, 11] Mae Stevens Early Learning Academy | teacher |
##
## Workday January#KGLIFE#TogetherTowardsTomorrow
## coach mentor and friend Mr
## on the trail Students names
## She has worked in Hollister
## Patricia Crawford in the Copperas
Other terms
kwic(my_tokens, "covid") %>% slice(1:5)
## kwic object with 0 rows
kwic(my_tokens, "school") %>% slice(1:5)
##
## [text11, 15] the second half of the | school | year and we want to
## [text13, 15] the second half of the | school | year and we want to
## [text20, 18] all in the new year | School | resumes on Thursday January
## [text25, 13] you are part of our | school | community
## [text30, 4] Happy New Year | School | resumes Thursday January
kwic(my_tokens, "website") %>% slice(1:5)
##
## [text288, 94] by going to the district | website |
## [text422, 12] ages According to the USDA's | website |
## [text425, 2] District | Website |
## [text671, 53] are available on the district | website |
## [text694, 118] school Please go to our | website |
##
## at www.astoria.k12 or.us Budget Committee
## Choose My Plate school-age children
## New Family Portal www.cheltenham.org familyportal
## www.scrsd.org
## www.autismcharter.org and click the Maroon
kwic(my_tokens, "resource") %>% slice(1:5)
##
## [text924, 29] diversions at the new Homeless | Resource |
## [text1034, 47] Year#NYE2020 Here is a | resource |
## [text1803, 3] #ScienceTuesday Cultural | Resource |
## [text2316, 20] Officer Roddy and the School | Resource |
## [text2913, 41] degrees in healthcare set up | resource |
##
## Centers wrapped up our Holiday
## for firework education
## Survey Program Artifacts found in
## Officer Program here at KES
## tables for students to visit
kwic(my_tokens, "link") %>% slice(1:5)
##
## [text192, 4] Click on the | link | below to see Community Flyers
## [text296, 19] on IG Click on the | link | to start following our Instagram
## [text301, 19] on IG Click on the | link | to start following our Instagram
## [text312, 20] us yet Just click the | link | below and stay in the
## [text313, 19] on IG Click on the | link | below to start in
kwic(my_tokens, "laptop") %>% slice(1:5)
##
## [text2830, 9] Heath Shelton working on the | laptop |
## [text3081, 401] The technology package includes a | laptop |
## [text18346, 7] Santa bringing your family a | laptop |
## [text19457, 25] of a new smartphone or | laptop |
## [text21616, 39] bring up captions If on | laptop |
##
## during Athletes and Friends
## computer tablet monitor keyboard mouse
## an iPad or tablet a
## It's a great time to
## click the CC button on
kwic(my_tokens, "internet") %>% slice(1:5)
##
## [text736, 13] the halftime show but the | internet |
## [text1083, 50] home school or anywhere there's | internet |
## [text3887, 19] can get super-fast portable UNLIMITED | internet |
## [text6366, 37] new phone tablet or any | internet |
## [text6517, 29] a very informative presentation on | Internet |
##
## in the stadium is spotty
## service
## for per month There is
## accessing device#learngrowsucceed#committedtoexcellence
## Safety Parents AND children are
kwic(my_tokens, "access") %>% slice(1:5)
##
## [text257, 100] médico llame a NM Crisis | Access |
## [text968, 105] your doctor call NM Crisis | Access |
## [text1083, 26] and Elysian will now have | access |
## [text1083, 41] baking to drawing through Creativebug | Access |
## [text1340, 37] to improve the quality and | access |
##
## al 1-855-NMCRISIS o visite www.drugs.com
## at 1-855-NMCRISIS or visit www.drugs.com
## to thousands of video arts
## from the library home school
## of learning beyond the classroom
kwic(my_tokens, "packet") %>% slice(1:5)
##
## [text1622, 27] accepted until January Once a | packet |
## [text4384, 28] Please refer to the order | packet |
## [text6139, 44] need a VUSD completed physical | packet |
## [text9464, 45] Education can pick up a | packet |
## [text9665, 23] year sports physical consent form | packet |
##
## is completed and returned to
## provided by the Herff-Jones rep
## on file with the Athletic
## during normal business hours Filing
## on file at the school
Not working with the entire dataset
my_fcm <- fcm(my_dfm)
feat <- names(topfeatures(my_fcm, 50))
my_fcm_select <- fcm_select(my_fcm, pattern = feat, selection = "keep")
size <- log(colSums(dfm_select(my_dfm, feat, selection = "keep")))
textplot_network(my_fcm_select, min_freq = 0.8, vertex_size = size / max(size) * 3,
edge_alpha = .3,
edge_size = 2)
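The note above says this step is not run on the entire dataset; one way to keep the co-occurrence matrix tractable, sketched here, is to build it from a random sample of posts (the 100,000-document sample size is an assumption, not something from the original analysis):
set.seed(2019) # reproducible sample and network layout
my_dfm_sample <- dfm_sample(my_dfm, size = 100000) # assumed sample size
my_fcm_sample <- fcm(my_dfm_sample)
feat_sample <- names(topfeatures(my_fcm_sample, 50))
textplot_network(fcm_select(my_fcm_sample, pattern = feat_sample, selection = "keep"),
min_freq = 0.8, edge_alpha = .3, edge_size = 2)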
text <- ss %>%
select(message, year) %>%
mutate(learning = str_detect(message, "(?i)learning"),
technology = str_detect(message, "(?i)technology"),
student = str_detect(message, "(?i)student"),
education = str_detect(message, "(?i)education"),
teaching = str_detect(message, "(?i)teaching"),
online = str_detect(message, "(?i)online"),
teacher = str_detect(message, "(?i)teacher"),
covid = str_detect(message, "(?i)covid")) %>%
filter(!is.na(message))
text_to_plot <- text %>%
select(year, learning:covid) %>%
gather(key, val, -year) %>%
group_by(key, year) %>%
summarize(mean_val = mean(val, na.rm = TRUE))
text_to_plot
## # A tibble: 8 x 3
## # Groups: key [8]
## key year mean_val
## <chr> <dbl> <dbl>
## 1 covid 2019 0.00000505
## 2 education 2019 0.0469
## 3 learning 2019 0.0357
## 4 online 2019 0.0196
## 5 student 2019 0.254
## 6 teacher 2019 0.0682
## 7 teaching 2019 0.00873
## 8 technology 2019 0.00903
# text_to_plot %>%
# filter(year >= 2010) %>%
# ggplot(aes(x = year, y = mean_val, color = key, group = key)) +
# geom_point() +
# geom_line() +
# scale_color_brewer(type = "qual") +
# hrbrthemes::theme_ipsum() +
# scale_x_continuous(breaks = 2010:2020) +
# ylab("proportion")
Not run just yet; working on the API
ss_1000 <- tbl(cn, "posts") %>%
filter(year == 2020) %>%
select(facebook_id) %>%
collect() %>%
sample_n(1000)
# download a handful of photo links for inspection
dir.create("downloaded-photos", showWarnings = FALSE)
files_to_download <- ss_collected %>% filter(type == "Photo") %>% pull(link) %>% head(5)
walk2(files_to_download,
str_c("downloaded-photos/", 1:5, ".jpeg"),
~ download.file(.x, .y, mode = "wb")) # mode = "wb" so the images are not corrupted on Windows