Loading, setting up

library(tidyverse) # analysis and processing
library(DBI) # database interface
library(scales) # plots
library(quanteda) # text analysis

cn <- dbConnect(RSQLite::SQLite(), dbname = here::here("db", "k12-institutions-fb-posts.sqlite"))
ss_collected <- tbl(cn, "posts") %>% 
  filter(year == 2019) %>% 
  collect()
ss_collected
## # A tibble: 3,262,779 x 37
##    page_name user_name facebook_id likes_at_posting created type  likes comments
##    <chr>     <chr>           <dbl>            <int>   <dbl> <chr> <dbl>    <dbl>
##  1 Sumter S… SumterSC…     3.57e14               NA  1.58e9 Photo    91        5
##  2 Licking … LHLocalS…     1.05e14             1598  1.58e9 Stat…    89        4
##  3 West Jef… Westjeff…     4.76e14               NA  1.58e9 Photo   122        2
##  4 Walled L… wlcsd         3.59e14             5224  1.58e9 Photo   123        1
##  5 North Ro… Royalton…     1.66e14             3127  1.58e9 Photo    90        1
##  6 Leona Sc… LeonaSch…     1.34e14               NA  1.58e9 Photo     2        0
##  7 Maple Pr… maplepri…     1.97e14               NA  1.58e9 Photo    73        0
##  8 Cobb Cou… cobbscho…     8.47e10            40307  1.58e9 Photo    42        5
##  9 Worthing… Worthing…     1.02e14             6516  1.58e9 Photo   111        0
## 10 Chagrin … CFEVS         2.72e14             1831  1.58e9 Photo    56        2
## # … with 3,262,769 more rows, and 29 more variables: shares <dbl>, love <dbl>,
## #   wow <dbl>, haha <dbl>, sad <dbl>, angry <dbl>, care <dbl>,
## #   video_share_status <chr>, post_views <dbl>, total_views <dbl>,
## #   total_views_for_all_crossposts <dbl>, video_length <chr>, url <chr>,
## #   message <chr>, link <chr>, final_link <chr>, image_text <chr>,
## #   link_text <chr>, description <chr>, sponsor_id <chr>, sponsor_name <chr>,
## #   overperforming_score <dbl>, hour <int>, day <dbl>, year <dbl>, month <dbl>,
## #   day_of_week <dbl>, day_of_month <int>, created_rounded_to_day <dbl>

Text

my_corpus <- corpus(ss_collected, text_field = "message")

my_tokens <- tokens(my_corpus, remove_symbols = T, remove_numbers = T, remove_punct = T, remove_url = T)

my_dfm <- quanteda::dfm(my_tokens, remove = stopwords('en'))

Frequencies

Overall

textstat_frequency(my_dfm, n = 10, groups = "month") %>% 
  knitr::kable()
feature frequency rank docfreq group
school 103207 1 65704 1
students 66786 2 45755 1
january 48551 3 38968 1
high 30388 4 22458 1
day 26288 5 20649 1
please 25557 6 21338 1
today 25166 7 23060 1
schools 21933 8 18005 1
district 19384 9 14831 1
basketball 18790 10 14173 1
school 123439 1 79047 10
students 99756 2 68680 10
october 50765 3 39367 10
day 46329 4 36091 10
high 40665 5 30965 10
week 36885 6 29765 10
today 33868 7 31259 10
please 30575 8 25361 10
thank 30113 9 27416 10
great 25972 10 23609 10
school 95363 1 62187 11
students 82755 2 57212 11
november 38866 3 30466 11
day 37926 4 30865 11
high 33955 5 25834 11
thank 27948 6 24993 11
today 27052 7 25003 11
please 23150 8 19440 11
week 21275 9 17923 11
grade 20779 10 16174 11
school 80006 1 52989 12
students 71638 2 49328 12
december 33260 3 26007 12
high 28912 4 21863 12
holiday 24383 5 20433 12
day 24091 6 19243 12
today 22823 7 21223 12
thank 20615 8 18794 12
christmas 20388 9 16648 12
grade 20135 10 15359 12
school 117857 1 74441 2
students 79804 2 54522 2
february 48261 3 39433 2
day 39745 4 30402 2
high 37118 5 27191 2
today 32859 6 30176 2
please 26934 7 22508 2
week 23618 8 19090 2
schools 23235 9 19131 2
district 22555 10 17707 2
school 103043 1 64468 3
students 84923 2 57036 3
march 53608 3 41098 3
high 36494 4 26657 3
day 32890 5 25720 3
today 26096 6 24019 3
please 23557 7 19556 3
team 22236 8 15192 3
grade 22052 9 16038 3
week 21832 10 18147 3
school 102974 1 64528 4
students 84098 2 56440 4
april 53835 3 40066 4
high 39038 4 28459 4
day 33110 5 25795 4
may 27496 6 19354 4
today 27369 7 25114 4
please 24917 8 20723 4
grade 22831 9 16575 4
year 22159 10 17485 4
school 122636 1 76986 5
students 93888 2 63547 5
may 59557 3 42854 5
day 48075 4 37576 5
high 46482 5 33963 5
year 35984 6 27997 5
thank 32584 7 29369 5
grade 30466 8 22603 5
congratulations 30282 9 29303 5
today 30028 10 27726 5
school 65478 1 40483 6
students 40442 2 27139 6
summer 26396 3 19828 6
high 21312 4 15676 6
june 20356 5 15579 6
year 19797 6 15531 6
day 18330 7 14555 6
congratulations 12790 8 12469 6
class 12366 9 10324 6
please 11881 10 10088 6
school 63036 1 36722 7
students 28105 2 19112 7
july 19038 3 14424 7
year 17986 4 14232 7
summer 16909 5 13380 7
new 16011 6 12279 7
please 14586 7 12182 7
high 14399 8 10632 7
august 14268 9 9985 7
day 12080 10 9930 7
school 126719 1 75257 8
students 59467 2 41398 8
year 38347 3 30835 8
day 36048 4 28434 8
august 31280 5 23541 8
new 30871 6 23973 8
please 27540 7 22026 8
first 26565 8 22359 8
high 24629 9 18553 8
back 20065 10 17897 8
school 107876 1 68223 9
students 77116 2 53568 9
day 41388 3 31648 9
september 36070 4 28086 9
high 33584 5 25410 9
please 27652 6 22650 9
today 27501 7 25294 9
week 27484 8 22300 9
year 22457 9 18196 9
great 22186 10 20223 9

By month - weighted

my_dfm %>% 
  dfm_tfidf() %>% 
  textstat_frequency(n = 10, groups = "month", force = TRUE) %>% 
  as_tibble() %>% 
  select(feature, rank, group) %>% 
  group_split("group") %>% 
  knitr::kable()
feature rank group “group”
january 1 1 group
school 2 1 group
students 3 1 group
high 4 1 group
basketball 5 1 group
jan 6 1 group
please 7 1 group
schools 8 1 group
today 9 1 group
day 10 1 group
october 1 10 group
school 2 10 group
students 3 10 group
day 4 10 group
week 5 10 group
high 6 10 group
today 7 10 group
please 8 10 group
thank 9 10 group
oct 10 10 group
november 1 11 group
students 2 11 group
school 3 11 group
day 4 11 group
thanksgiving 5 11 group
veterans 6 11 group
high 7 11 group
thank 8 11 group
today 9 11 group
nov 10 11 group
december 1 12 group
students 2 12 group
school 3 12 group
holiday 4 12 group
christmas 5 12 group
high 6 12 group
winter 7 12 group
grade 8 12 group
dec 9 12 group
today 10 12 group
february 1 2 group
school 2 2 group
students 3 2 group
day 4 2 group
high 5 2 group
feb 6 2 group
today 7 2 group
march 8 2 group
basketball 9 2 group
please 10 2 group
march 1 3 group
school 2 3 group
students 3 3 group
high 4 3 group
april 5 3 group
day 6 3 group
team 7 3 group
today 8 3 group
grade 9 3 group
state 10 3 group
april 1 4 group
school 2 4 group
students 3 4 group
high 4 4 group
may 5 4 group
day 6 4 group
today 7 4 group
grade 8 4 group
please 9 4 group
team 10 4 group
may 1 5 group
school 2 5 group
students 3 5 group
day 4 5 group
high 5 5 group
year 6 5 group
grade 7 5 group
congratulations 8 5 group
thank 9 5 group
teacher 10 5 group
summer 1 6 group
school 2 6 group
june 3 6 group
students 4 6 group
year 5 6 group
high 6 6 group
day 7 6 group
class 8 6 group
congratulations 9 6 group
program 10 6 group
july 1 7 group
school 2 7 group
summer 3 7 group
august 4 7 group
year 5 7 group
new 6 7 group
students 7 7 group
please 8 7 group
high 9 7 group
registration 10 7 group
school 1 8 group
august 2 8 group
year 3 8 group
students 4 8 group
new 5 8 group
day 6 8 group
first 7 8 group
back 8 8 group
please 9 8 group
high 10 8 group
school 1 9 group
september 2 9 group
students 3 9 group
day 4 9 group
high 5 9 group
week 6 9 group
please 7 9 group
today 8 9 group
football 9 9 group
game 10 9 group

By month - not weighted

my_dfm %>% 
  textstat_frequency(n = 10, groups = "month", force = TRUE) %>% 
  as_tibble() %>% 
  select(feature, rank, group) %>% 
  group_split("group") %>% 
  knitr::kable()
feature rank group “group”
school 1 1 group
students 2 1 group
january 3 1 group
high 4 1 group
day 5 1 group
please 6 1 group
today 7 1 group
schools 8 1 group
district 9 1 group
basketball 10 1 group
school 1 10 group
students 2 10 group
october 3 10 group
day 4 10 group
high 5 10 group
week 6 10 group
today 7 10 group
please 8 10 group
thank 9 10 group
great 10 10 group
school 1 11 group
students 2 11 group
november 3 11 group
day 4 11 group
high 5 11 group
thank 6 11 group
today 7 11 group
please 8 11 group
week 9 11 group
grade 10 11 group
school 1 12 group
students 2 12 group
december 3 12 group
high 4 12 group
holiday 5 12 group
day 6 12 group
today 7 12 group
thank 8 12 group
christmas 9 12 group
grade 10 12 group
school 1 2 group
students 2 2 group
february 3 2 group
day 4 2 group
high 5 2 group
today 6 2 group
please 7 2 group
week 8 2 group
schools 9 2 group
district 10 2 group
school 1 3 group
students 2 3 group
march 3 3 group
high 4 3 group
day 5 3 group
today 6 3 group
please 7 3 group
team 8 3 group
grade 9 3 group
week 10 3 group
school 1 4 group
students 2 4 group
april 3 4 group
high 4 4 group
day 5 4 group
may 6 4 group
today 7 4 group
please 8 4 group
grade 9 4 group
year 10 4 group
school 1 5 group
students 2 5 group
may 3 5 group
day 4 5 group
high 5 5 group
year 6 5 group
thank 7 5 group
grade 8 5 group
congratulations 9 5 group
today 10 5 group
school 1 6 group
students 2 6 group
summer 3 6 group
high 4 6 group
june 5 6 group
year 6 6 group
day 7 6 group
congratulations 8 6 group
class 9 6 group
please 10 6 group
school 1 7 group
students 2 7 group
july 3 7 group
year 4 7 group
summer 5 7 group
new 6 7 group
please 7 7 group
high 8 7 group
august 9 7 group
day 10 7 group
school 1 8 group
students 2 8 group
year 3 8 group
day 4 8 group
august 5 8 group
new 6 8 group
please 7 8 group
first 8 8 group
high 9 8 group
back 10 8 group
school 1 9 group
students 2 9 group
day 3 9 group
september 4 9 group
high 5 9 group
please 6 9 group
today 7 9 group
week 8 9 group
year 9 9 group
great 10 9 group

Key words in context

From report

kwic(my_tokens, "learning") %>% slice(1:5)
##                                                               
##  [text130, 12]   year of happiness health success | learning |
##  [text135, 29]               wait for all the new | learning |
##   [text167, 4]                       Happy to our | learning |
##  [text215, 31] our school cafeterias 21st century | learning |
##  [text247, 13]     adventures new friends and new | learning |
##                                     
##  for you your family#happynewyear   
##  new friendships and new discoveries
##  community                          
##  in action                          
## 
kwic(my_tokens, "technology") %>% slice(1:5)
##                                                                        
##   [text273, 32]                  School for the Arts and | Technology |
##  [text617, 100]         taught middle school science and | technology |
##   [text987, 50]    classroom projects and building needs | technology |
##  [text1224, 20]     students love learning about science | technology |
##  [text1513, 34] heating telephones copiers and classroom | technology |
##                                                                  
##  Freshman Open House#WeAreSouthfield#ScholarsPositionedForSuccess
##  traveled to students homes as                                   
##  social-emotional supports reading and math                      
##  engineering and math                                            
##  District personnel and network server
kwic(my_tokens, "student") %>% slice(1:5)
##                                                             
##   [text78, 65]   with the principal teachers and | student |
##  [text161, 42]         the best in education one | student |
##  [text264, 30] community Gracias Please join our | student |
##   [text273, 3]                    Your 8th-grade | student |
##  [text283, 15]          new year Know that every | student |
##                                                  
##  council Daily Herald#d25ItsPersonal#d25ALookBack
##  at a time Happy New                             
##  athletes athletic coaches and alumni            
##  is on his or her                                
##  family and staff member who
kwic(my_tokens, "education") %>% slice(1:5)
##                                                                
##   [text161, 40]    continue providing the best in | education |
##  [text224, 110]              Some of our Board of | Education |
##  [text224, 151]       students with the very best | education |
##   [text303, 18]               you to our Board of | Education |
##   [text542, 64] the Mississippi Alliance for Arts | Education |
##                                 
##  one student at a time          
##  members are alumni and even    
##  possible What a legacy- Welcome
##  members for their service These
##  in Ms Stanford is chair
kwic(my_tokens, "teaching") %>% slice(1:5)
##                                                                       
##    [text343, 2]                                Elementary | Teaching |
##   [text529, 17]              Academy Read how the Capital | Teaching |
##   [text542, 94]                      been a member of the | Teaching |
##  [text542, 131] has received recognition for choreography | teaching |
##  [text542, 185]            technical skill In addition to | teaching |
##                                   
##  Position                         
##  Residency helped propel her into 
##  Artist Roster for the Mississippi
##  and directing of over twenty     
##  and directing at MSA Ms
kwic(my_tokens, "online") %>% slice(1:5)
##                                                              
##   [text186, 29] creation of your painting Register | online |
##   [text469, 19]           all of the videos posted | online |
##   [text593, 67]      about Dawn and Lance's awards | online |
##  [text676, 116]       through the Avenues for Hope | Online |
##   [text804, 27]   yearbook links to purchases then | online |
##                               
##  at bit.ly BCE_Winter2020     
##  reached almost half a million
##  at                           
##  Giving Campaign              
##  Make sure to click on
kwic(my_tokens, "teacher") %>% slice(1:5)
##                                                                
##   [text16, 22] Students return next Tuesday January | Teacher |
##  [text253, 18]       Schools family lost an amazing | teacher |
##  [text319, 30]              They are nominated by a | teacher |
##  [text428, 19]  Hollister Middle School Vocal Music | Teacher |
##  [text441, 11]   Mae Stevens Early Learning Academy | teacher |
##                                                
##  Workday January#KGLIFE#TogetherTowardsTomorrow
##  coach mentor and friend Mr                    
##  on the trail Students names                   
##  She has worked in Hollister                   
##  Patricia Crawford in the Copperas

Other terms

kwic(my_tokens, "covid") %>% slice(1:5)
## kwic object with 0 rows
kwic(my_tokens, "school") %>% slice(1:5)
##                                                                            
##  [text11, 15] the second half of the | school | year and we want to        
##  [text13, 15] the second half of the | school | year and we want to        
##  [text20, 18]    all in the new year | School | resumes on Thursday January
##  [text25, 13]    you are part of our | school | community                  
##   [text30, 4]         Happy New Year | School | resumes Thursday January
kwic(my_tokens, "website") %>% slice(1:5) 
##                                                          
##   [text288, 94]      by going to the district | website |
##   [text422, 12]  ages According to the USDA's | website |
##    [text425, 2]                      District | Website |
##   [text671, 53] are available on the district | website |
##  [text694, 118]       school Please go to our | website |
##                                                   
##  at www.astoria.k12 or.us Budget Committee        
##  Choose My Plate school-age children              
##  New Family Portal www.cheltenham.org familyportal
##  www.scrsd.org                                    
##  www.autismcharter.org and click the Maroon
kwic(my_tokens, "resource") %>% slice(1:5)
##                                                            
##   [text924, 29] diversions at the new Homeless | Resource |
##  [text1034, 47]         Year#NYE2020 Here is a | resource |
##   [text1803, 3]       #ScienceTuesday Cultural | Resource |
##  [text2316, 20]   Officer Roddy and the School | Resource |
##  [text2913, 41]   degrees in healthcare set up | resource |
##                                   
##  Centers wrapped up our Holiday   
##  for firework education           
##  Survey Program Artifacts found in
##  Officer Program here at KES      
##  tables for students to visit
kwic(my_tokens, "link") %>% slice(1:5)
##                                                                               
##   [text192, 4]          Click on the | link | below to see Community Flyers   
##  [text296, 19]    on IG Click on the | link | to start following our Instagram
##  [text301, 19]    on IG Click on the | link | to start following our Instagram
##  [text312, 20] us yet Just click the | link | below and stay in the           
##  [text313, 19]    on IG Click on the | link | below to start in
kwic(my_tokens, "laptop") %>% slice(1:5)
##                                                              
##    [text2830, 9]      Heath Shelton working on the | laptop |
##  [text3081, 401] The technology package includes a | laptop |
##   [text18346, 7]      Santa bringing your family a | laptop |
##  [text19457, 25]            of a new smartphone or | laptop |
##  [text21616, 39]           bring up captions If on | laptop |
##                                        
##  during Athletes and Friends           
##  computer tablet monitor keyboard mouse
##  an iPad or tablet a                   
##  It's a great time to                  
##  click the CC button on
kwic(my_tokens, "internet") %>% slice(1:5) 
##                                                                   
##   [text736, 13]             the halftime show but the | internet |
##  [text1083, 50]       home school or anywhere there's | internet |
##  [text3887, 19] can get super-fast portable UNLIMITED | internet |
##  [text6366, 37]               new phone tablet or any | internet |
##  [text6517, 29]    a very informative presentation on | Internet |
##                                                         
##  in the stadium is spotty                               
##  service                                                
##  for per month There is                                 
##  accessing device#learngrowsucceed#committedtoexcellence
##  Safety Parents AND children are
kwic(my_tokens, "access") %>% slice(1:5)
##                                                                 
##  [text257, 100]              médico llame a NM Crisis | Access |
##  [text968, 105]            your doctor call NM Crisis | Access |
##  [text1083, 26]             and Elysian will now have | access |
##  [text1083, 41] baking to drawing through Creativebug | Access |
##  [text1340, 37]            to improve the quality and | access |
##                                          
##  al 1-855-NMCRISIS o visite www.drugs.com
##  at 1-855-NMCRISIS or visit www.drugs.com
##  to thousands of video arts              
##  from the library home school            
##  of learning beyond the classroom
kwic(my_tokens, "packet") %>% slice(1:5)
##                                                             
##  [text1622, 27]     accepted until January Once a | packet |
##  [text4384, 28]         Please refer to the order | packet |
##  [text6139, 44]    need a VUSD completed physical | packet |
##  [text9464, 45]           Education can pick up a | packet |
##  [text9665, 23] year sports physical consent form | packet |
##                                     
##  is completed and returned to       
##  provided by the Herff-Jones rep    
##  on file with the Athletic          
##  during normal business hours Filing
##  on file at the school

Term co-occurence

not working with the entire dataset

my_fcm <- fcm(my_dfm)
feat <- names(topfeatures(my_fcm, 50))
my_fcm_select <- fcm_select(my_fcm, pattern = feat, selection = "keep")

size <- log(colSums(dfm_select(my_dfm, feat, selection = "keep")))
textplot_network(my_fcm_select, min_freq = 0.8, vertex_size = size / max(size) * 3,
                 edge_alpha = .3,
                 edge_size = 2)

Using regex

text <- ss %>% 
  select(message, year) %>% 
  collect() %>%  
  mutate(learning = str_detect(message, "(?i)learning"),
         technology = str_detect(message, "(?i)technology"),
         student = str_detect(message, "(?i)student"),
         education = str_detect(message, "(?i)education"),
         teaching = str_detect(message, "(?i)teaching"),
         online = str_detect(message, "(?i)online"),
         teacher = str_detect(message, "(?i)teacher"),
         covid = str_detect(message, "(?i)covid")) %>% 
  filter(!is.na(message))

text_to_plot <- text %>% 
  select(year, learning:covid) %>% 
  gather(key, val, -year) %>% 
  group_by(key, year) %>% 
  summarize(mean_val = mean(val, na.rm = TRUE))

text_to_plot 
## # A tibble: 8 x 3
## # Groups:   key [8]
##   key         year   mean_val
##   <chr>      <dbl>      <dbl>
## 1 covid       2019 0.00000505
## 2 education   2019 0.0469    
## 3 learning    2019 0.0357    
## 4 online      2019 0.0196    
## 5 student     2019 0.254     
## 6 teacher     2019 0.0682    
## 7 teaching    2019 0.00873   
## 8 technology  2019 0.00903
# text_to_plot %>% 
#   filter(year >= 2010) %>% 
#   ggplot(aes(x = year, y = mean_val, color = key, group = key)) +
#   geom_point() +
#   geom_line() +
#   scale_color_brewer(type = "qual") +
#   hrbrthemes::theme_ipsum() + 
#   scale_x_continuous(breaks = 2010:2020) +
#   ylab("proportion")

Photos

Not run just yet, working on API

ss_1000 <- tbl(cn, "posts") %>% 
  filter(year == 2020) %>% 
  select(facebook_id) %>% 
  collect() %>% 
  sample_n(1000)

files_to_download <- ss_collected %>% filter(type == "Photo") %>% pull(link) %>% head(5)

map2(files_to_download,
     str_c("downloaded-photos/", 1:5, ".jpeg"),
           download.file)