# Directory of the on-disk MonetDBLite database queried below.
dbdir <- "~/brexit/brexit_monet_db/"
# dbdir <- "/scratch/c.c1541911/brexit_monet_db" #directory of the Monet database, should be empty
# Open an embedded connection, then list the column names of the "brexit"
# table.
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
colnames(tbl(con, "brexit"))
##   [1] "text_long"                                    
##   [2] "id_str"                                       
##   [3] "text"                                         
##   [4] "created_at"                                   
##   [5] "id"                                           
##   [6] "lang"                                         
##   [7] "possibly_sensitive"                           
##   [8] "source"                                       
##   [9] "timestamp_ms"                                 
##  [10] "coord_longitude"                              
##  [11] "coord_latitude"                               
##  [12] "coordinates.type"                             
##  [13] "place.name"                                   
##  [14] "place.country"                                
##  [15] "place.country_code"                           
##  [16] "favorite_count"                               
##  [17] "retweet_count"                                
##  [18] "truncated"                                    
##  [19] "entities.urls.0.expanded_url"                 
##  [20] "withheld_in_countries.0"                      
##  [21] "user.created_at"                              
##  [22] "user.description"                             
##  [23] "user.followers_count"                         
##  [24] "user.friends_count"                           
##  [25] "user.id"                                      
##  [26] "user.id_str"                                  
##  [27] "user.location"                                
##  [28] "user.name"                                    
##  [29] "user.protected"                               
##  [30] "user.screen_name"                             
##  [31] "user.statuses_count"                          
##  [32] "user.favourites_count"                        
##  [33] "user.verified"                                
##  [34] "user_created_at"                              
##  [35] "user.time_zone"                               
##  [36] "user.utc_offset"                              
##  [37] "is_quote_status"                              
##  [38] "quoted_status.created_at"                     
##  [39] "quoted_status.id"                             
##  [40] "quoted_status.id_str"                         
##  [41] "quoted_status.text"                           
##  [42] "quoted_status.lang"                           
##  [43] "quoted_status.possibly_sensitive"             
##  [44] "quoted_status.coord_longitude"                
##  [45] "quoted_status.coord_latitude"                 
##  [46] "quoted_status.place.name"                     
##  [47] "quoted_status.place.country"                  
##  [48] "quoted_status.place.country_code"             
##  [49] "quoted_status.entities.urls.0.expanded_url"   
##  [50] "quoted_status.user.created_at"                
##  [51] "quoted_status.user.description"               
##  [52] "quoted_status.user.friends_count"             
##  [53] "quoted_status.user.followers_count"           
##  [54] "quoted_status.user.id"                        
##  [55] "quoted_status.user.id_str"                    
##  [56] "quoted_status.user.screen_name"               
##  [57] "quoted_status.user.name"                      
##  [58] "quoted_status.user.statuses_count"            
##  [59] "quoted_status.favorite_count"                 
##  [60] "quoted_status.retweet_count"                  
##  [61] "quoted_status.user.verified"                  
##  [62] "quoted_status.user.protected"                 
##  [63] "quoted_status.user.location"                  
##  [64] "quoted_status.user.time_zone"                 
##  [65] "retweeted_status.created_at"                  
##  [66] "retweeted_status.id"                          
##  [67] "retweeted_status.id_str"                      
##  [68] "retweeted_status.text"                        
##  [69] "retweeted_status.lang"                        
##  [70] "retweeted_status.possibly_sensitive"          
##  [71] "retweeted_status.coord_longitude"             
##  [72] "retweeted_status.coord_latitude"              
##  [73] "retweeted_status.place.name"                  
##  [74] "retweeted_status.place.country"               
##  [75] "retweeted_status.place.country_code"          
##  [76] "retweeted_status.favorite_count"              
##  [77] "retweeted_status.retweet_count"               
##  [78] "retweeted_status.entities.urls.0.expanded_url"
##  [79] "retweeted_status.user.created_at"             
##  [80] "retweeted_status.user.description"            
##  [81] "retweeted_status.user.followers_count"        
##  [82] "retweeted_status.user.friends_count"          
##  [83] "retweeted_status.user.id"                     
##  [84] "retweeted_status.user.id_str"                 
##  [85] "retweeted_status.user.screen_name"            
##  [86] "retweeted_status.user.name"                   
##  [87] "retweeted_status.user.statuses_count"         
##  [88] "retweeted_status.user.verified"               
##  [89] "retweeted_status.user.location"               
##  [90] "retweeted_status.user.protected"              
##  [91] "retweeted_status.user.time_zone"              
##  [92] "in_reply_to_screen_name"                      
##  [93] "in_reply_to_status_id"                        
##  [94] "in_reply_to_status_id_str"                    
##  [95] "in_reply_to_user_id"                          
##  [96] "in_reply_to_user_id_str"                      
##  [97] "quoted_status.extended_tweet.full_text"       
##  [98] "retweeted_status.extended_tweet.full_text"    
##  [99] "extended_tweet.full_text"                     
## [100] "is_quote"                                     
## [101] "is_retweet"                                   
## [102] "is_reply"                                     
## [103] "tweet_date"                                   
## [104] "spike_number"

Collect Tweets from DB

# Pull the four columns needed for the text analysis into memory and time the
# transfer. The retweet filter is left commented out, so both original tweets
# and retweets are collected.
tictoc::tic()
brexit_tweets <- tbl(con, "brexit") %>%
  select(id_str, text_long, is_retweet, retweeted_status.id_str) %>%
  # filter(is_retweet==FALSE) %>%
  collect()
# Keep the row and column counts in separate variables. The column count used
# to be assigned to brexit_nrow as well, which overwrote the row count.
brexit_nrow <- nrow(brexit_tweets)
brexit_ncol <- ncol(brexit_tweets)
tictoc::toc()
## 197.588 sec elapsed

Check dataset

There are 4 tweets in the dataset and we have 4 columns in the dataset. This is all three spikes combined. Attached are 10 rows sampled from the data.

# Render ten randomly sampled tweets as a paged table.
rmarkdown:::print.paged_df(sample_n(brexit_tweets, size = 10))

Filter tweets referring to Polish identity

Identify a pattern referring to Polish identity

# Terms referring to Polish identity, joined into one regex alternation.
# No word boundaries are added, so a term also matches inside a longer word
# (for example "polish" inside "polished").
polish_pattern <- paste0(
  c("polish", "polski", "Polack", "Polak", "Polock"),
  collapse = "|"
)

Filter the tweets matching polish_pattern, separately for original tweets and retweets. Then select the distinct texts within each of the two groups (non-RT-ed and RT-ed tweets). Lastly, merge the two groups and deduplicate once more to come up with the unique tweets referring to Polish identity.

# Original (non-retweet) tweets whose text matches the Polish-identity
# pattern, ignoring case. filter() drops rows where either condition is NA.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
polish_non_rt <- brexit_tweets %>%
  filter(
    is_retweet == FALSE,
    str_detect(text_long, pattern = regex(polish_pattern, ignore_case = TRUE))
  )


# Retweets whose text matches the Polish-identity pattern, ignoring case.
# filter() drops rows where either condition is NA. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
polish_rt <- brexit_tweets %>%
  filter(
    is_retweet == TRUE,
    str_detect(text_long, pattern = regex(polish_pattern, ignore_case = TRUE))
  )

# Build the set of unique tweet texts referring to Polish identity:
#   1. deduplicate the original tweets and the retweets separately,
#   2. stack them with the original tweets first,
#   3. deduplicate again across the stacked rows.
# distinct() keeps the first row for each text, so when an original tweet and
# a retweet share the same text the original tweet's row is the one retained.
polish_data <- bind_rows(
  distinct(polish_non_rt, text_long, .keep_all = TRUE),
  distinct(polish_rt, text_long, .keep_all = TRUE)
) %>%
  distinct(text_long, .keep_all = TRUE)

Basic stats for Polish identity sample

# Print skimr's per-column summary of the deduplicated Polish-identity sample.
polish_data %>% skimr::skim()
## Skim summary statistics
##  n obs: 12064 
##  n variables: 4 
## 
## ── Variable type:character ────────────────────────────────────────────────────────────────────────────────────────
##                 variable missing complete     n min max empty n_unique
##                   id_str       0    12064 12064  18  18     0    12064
##  retweeted_status.id_str   10145     1919 12064  18  18     0     1919
##                text_long       0    12064 12064  17 303     0    12064
## 
## ── Variable type:logical ──────────────────────────────────────────────────────────────────────────────────────────
##    variable missing complete     n mean                        count
##  is_retweet       0    12064 12064 0.16 FAL: 10145, TRU: 1919, NA: 0

First 10 rows of the Polish identity sample

# Render the first ten rows of the Polish-identity sample as a paged table.
rmarkdown:::print.paged_df(head(polish_data, n = 10))

Find Antagonistic Pattern in Polish identity sample

Create an antagonistic pattern with profanity words.

# Lexicon of profanity and "go home"-style phrases used to flag possibly
# antagonistic tweets. Terms are lower-cased and deduplicated here and are
# later joined into a single regex alternation.
#
# The former entry "Pissed/pissed off" fused two alternatives with a slash.
# As a regex branch it could only match the literal text "pissed/pissed off",
# so it is replaced by "Pissed", which also matches "pissed off" as a
# substring.
# NOTE(review): this changes the term list, so the printed listing and the
# downstream match count need to be regenerated when the report is re-knit.
possible_antagonistic_pattern <- unique(tolower(c(
  "Arsehole", "Bint", "Bitch", "Bollocks", "Bullshit", "Feck", "Munter",
  "Pissed", "Shit", "Son of a bitch", "Tits", "Bastard", "Beaver",
  "Beef curtains", "Bellend", "Bloodclaat", "Clunge", "Cock", "Dick",
  "Dickhead", "Fanny", "Flaps", "Gash", "Knob", "Minge", "Prick", "Punani",
  "Pussy", "Snatch", "Twat", "Cunt", "Fuck", "Motherfucker", "Cocksucker",
  "Nonce", "Prickteaser", "Rapey", "Skank", "Slag", "Slut", "Wanker", "Whore",
  "fuck", "fuck", "fucking", "fuckers", "cunt", "cunts", "bitch", "bitches",
  "bitching", "hate", "to hell", "go back", "go home", "send them home",
  "send them back"
)))
possible_antagonistic_pattern
##  [1] "arsehole"          "bint"              "bitch"            
##  [4] "bollocks"          "bullshit"          "feck"             
##  [7] "munter"            "pissed/pissed off" "shit"             
## [10] "son of a bitch"    "tits"              "bastard"          
## [13] "beaver"            "beef curtains"     "bellend"          
## [16] "bloodclaat"        "clunge"            "cock"             
## [19] "dick"              "dickhead"          "fanny"            
## [22] "flaps"             "gash"              "knob"             
## [25] "minge"             "prick"             "punani"           
## [28] "pussy"             "snatch"            "twat"             
## [31] "cunt"              "fuck"              "motherfucker"     
## [34] "cocksucker"        "nonce"             "prickteaser"      
## [37] "rapey"             "skank"             "slag"             
## [40] "slut"              "wanker"            "whore"            
## [43] "fucking"           "fuckers"           "cunts"            
## [46] "bitches"           "bitching"          "hate"             
## [49] "to hell"           "go back"           "go home"          
## [52] "send them home"    "send them back"
# Join the term vector into one regex alternation of the form "a|b|c".
possible_antagonistic_pattern <- paste(
  possible_antagonistic_pattern,
  collapse = "|"
)

Filter Polish identity data using possible_antagonistic_pattern.

# Keep the Polish-identity tweets whose text contains at least one term from
# the antagonistic lexicon, ignoring case. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
polish_possible_antag <- polish_data %>%
  filter(
    str_detect(
      text_long,
      pattern = regex(possible_antagonistic_pattern, ignore_case = TRUE)
    )
  )

Polish subset with antagonistic Terms

As a result, we end up with 521 tweets. The final data is attached here. These still need to be reviewed manually to identify the ones that are actually hateful towards Polish people and Polish identity.

# Render the possibly antagonistic subset as a paged table for manual review.
rmarkdown:::print.paged_df(polish_possible_antag)