# Directory of the MonetDBLite database that stores the tweets.
dbdir <- "~/brexit/brexit_monet_db/"
# dbdir <- "/scratch/c.c1541911/brexit_monet_db" #directory of the Monet database, should be empty

# Open the embedded database and list the columns of the `brexit` table.
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
colnames(tbl(con, "brexit"))
## [1] "text_long"
## [2] "id_str"
## [3] "text"
## [4] "created_at"
## [5] "id"
## [6] "lang"
## [7] "possibly_sensitive"
## [8] "source"
## [9] "timestamp_ms"
## [10] "coord_longitude"
## [11] "coord_latitude"
## [12] "coordinates.type"
## [13] "place.name"
## [14] "place.country"
## [15] "place.country_code"
## [16] "favorite_count"
## [17] "retweet_count"
## [18] "truncated"
## [19] "entities.urls.0.expanded_url"
## [20] "withheld_in_countries.0"
## [21] "user.created_at"
## [22] "user.description"
## [23] "user.followers_count"
## [24] "user.friends_count"
## [25] "user.id"
## [26] "user.id_str"
## [27] "user.location"
## [28] "user.name"
## [29] "user.protected"
## [30] "user.screen_name"
## [31] "user.statuses_count"
## [32] "user.favourites_count"
## [33] "user.verified"
## [34] "user_created_at"
## [35] "user.time_zone"
## [36] "user.utc_offset"
## [37] "is_quote_status"
## [38] "quoted_status.created_at"
## [39] "quoted_status.id"
## [40] "quoted_status.id_str"
## [41] "quoted_status.text"
## [42] "quoted_status.lang"
## [43] "quoted_status.possibly_sensitive"
## [44] "quoted_status.coord_longitude"
## [45] "quoted_status.coord_latitude"
## [46] "quoted_status.place.name"
## [47] "quoted_status.place.country"
## [48] "quoted_status.place.country_code"
## [49] "quoted_status.entities.urls.0.expanded_url"
## [50] "quoted_status.user.created_at"
## [51] "quoted_status.user.description"
## [52] "quoted_status.user.friends_count"
## [53] "quoted_status.user.followers_count"
## [54] "quoted_status.user.id"
## [55] "quoted_status.user.id_str"
## [56] "quoted_status.user.screen_name"
## [57] "quoted_status.user.name"
## [58] "quoted_status.user.statuses_count"
## [59] "quoted_status.favorite_count"
## [60] "quoted_status.retweet_count"
## [61] "quoted_status.user.verified"
## [62] "quoted_status.user.protected"
## [63] "quoted_status.user.location"
## [64] "quoted_status.user.time_zone"
## [65] "retweeted_status.created_at"
## [66] "retweeted_status.id"
## [67] "retweeted_status.id_str"
## [68] "retweeted_status.text"
## [69] "retweeted_status.lang"
## [70] "retweeted_status.possibly_sensitive"
## [71] "retweeted_status.coord_longitude"
## [72] "retweeted_status.coord_latitude"
## [73] "retweeted_status.place.name"
## [74] "retweeted_status.place.country"
## [75] "retweeted_status.place.country_code"
## [76] "retweeted_status.favorite_count"
## [77] "retweeted_status.retweet_count"
## [78] "retweeted_status.entities.urls.0.expanded_url"
## [79] "retweeted_status.user.created_at"
## [80] "retweeted_status.user.description"
## [81] "retweeted_status.user.followers_count"
## [82] "retweeted_status.user.friends_count"
## [83] "retweeted_status.user.id"
## [84] "retweeted_status.user.id_str"
## [85] "retweeted_status.user.screen_name"
## [86] "retweeted_status.user.name"
## [87] "retweeted_status.user.statuses_count"
## [88] "retweeted_status.user.verified"
## [89] "retweeted_status.user.location"
## [90] "retweeted_status.user.protected"
## [91] "retweeted_status.user.time_zone"
## [92] "in_reply_to_screen_name"
## [93] "in_reply_to_status_id"
## [94] "in_reply_to_status_id_str"
## [95] "in_reply_to_user_id"
## [96] "in_reply_to_user_id_str"
## [97] "quoted_status.extended_tweet.full_text"
## [98] "retweeted_status.extended_tweet.full_text"
## [99] "extended_tweet.full_text"
## [100] "is_quote"
## [101] "is_retweet"
## [102] "is_reply"
## [103] "tweet_date"
## [104] "spike_number"
# Pull the four columns used by this analysis into memory, timing the
# transfer with tictoc.
tictoc::tic()
brexit_tweets <- tbl(con, "brexit") %>%
  select(id_str, text_long, is_retweet, retweeted_status.id_str) %>%
  # filter(is_retweet==FALSE) %>%
  collect()

# Row and column counts live in separate variables so that the number of
# tweets is not overwritten by the number of columns.
brexit_nrow <- nrow(brexit_tweets)
brexit_ncol <- ncol(brexit_tweets)
tictoc::toc()
## 197.588 sec elapsed
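The collected data has 4 columns and combines all three spikes. (The tweet count printed here, 4, is the column count rather than the number of rows; use nrow(brexit_tweets) for the actual number of tweets.) Below are 10 rows sampled from the data.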
# Show a random sample of 10 collected tweets as a paged table.
rmarkdown:::print.paged_df(sample_n(brexit_tweets, 10))
Define a regex pattern that refers to Polish identity
# Regex alternation ("a|b|c") of the terms taken to signal Polish identity.
polish_pattern <- paste0(
  c("polish", "polski", "Polack", "Polak", "Polock"),
  collapse = "|"
)
Filter the tweets matching polish_pattern for both original tweets and retweets. Then select distinct texts from the non-retweeted and the retweeted tweets separately. Lastly, merge them to obtain the unique tweets referring to Polish identity.
# Build the case-insensitive matcher once so both subsets apply the same rule.
polish_regex <- regex(polish_pattern, ignore_case = TRUE)

# Original (non-retweet) tweets whose text matches a Polish-identity term.
polish_non_rt <- brexit_tweets %>%
  filter(is_retweet == FALSE) %>%
  filter(str_detect(text_long, pattern = polish_regex))

# Retweets whose text matches a Polish-identity term.
polish_rt <- brexit_tweets %>%
  filter(is_retweet == TRUE) %>%
  filter(str_detect(text_long, pattern = polish_regex))

# De-duplicate texts within each subset, stack originals ahead of retweets,
# then de-duplicate once more so a text present in both subsets is kept a
# single time (the original-tweet row is retained because it comes first).
polish_data <- bind_rows(
  distinct(polish_non_rt, text_long, .keep_all = TRUE),
  distinct(polish_rt, text_long, .keep_all = TRUE)
) %>%
  distinct(text_long, .keep_all = TRUE)
Basic stats for Polish identity sample
# Per-column summary statistics for the Polish identity sample.
skimr::skim(polish_data)
## Skim summary statistics
## n obs: 12064
## n variables: 4
##
## ── Variable type:character ────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## id_str 0 12064 12064 18 18 0 12064
## retweeted_status.id_str 10145 1919 12064 18 18 0 1919
## text_long 0 12064 12064 17 303 0 12064
##
## ── Variable type:logical ──────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean count
## is_retweet 0 12064 12064 0.16 FAL: 10145, TRU: 1919, NA: 0
First 10 rows of the Polish identity sample
# Show the first 10 rows of the Polish identity sample as a paged table.
rmarkdown:::print.paged_df(head(polish_data, 10))
Create a pattern of possibly antagonistic terms: profanity words plus hostile phrases such as "go home".
# Candidate antagonistic vocabulary: profanity plus hostile phrases.
# Terms are lower-cased and de-duplicated.
# "Pissed" and "Pissed off" are listed as separate terms: a combined
# "Pissed/pissed off" entry would only ever match that literal slash-joined
# string once the terms are turned into a regex alternation.
# NOTE(review): terms are later matched as plain substrings, so short entries
# such as "hate" or "cock" can also hit inside longer words; confirm whether
# word boundaries are wanted.
possible_antagonistic_pattern <- unique(tolower(c(
  "Arsehole", "Bint", "Bitch", "Bollocks", "Bullshit", "Feck", "Munter",
  "Pissed", "Pissed off", "Shit", "Son of a bitch", "Tits", "Bastard",
  "Beaver", "Beef curtains", "Bellend", "Bloodclaat", "Clunge", "Cock",
  "Dick", "Dickhead", "Fanny", "Flaps", "Gash", "Knob", "Minge", "Prick",
  "Punani", "Pussy", "Snatch", "Twat", "Cunt", "Fuck", "Motherfucker",
  "Cocksucker", "Nonce", "Prickteaser", "Rapey", "Skank", "Slag", "Slut",
  "Wanker", "Whore", "fucking", "fuckers", "cunts", "bitches", "bitching",
  "hate", "to hell", "go back", "go home", "send them home", "send them back"
)))
possible_antagonistic_pattern
## [1] "arsehole" "bint" "bitch"
## [4] "bollocks" "bullshit" "feck"
## [7] "munter" "pissed/pissed off" "shit"
## [10] "son of a bitch" "tits" "bastard"
## [13] "beaver" "beef curtains" "bellend"
## [16] "bloodclaat" "clunge" "cock"
## [19] "dick" "dickhead" "fanny"
## [22] "flaps" "gash" "knob"
## [25] "minge" "prick" "punani"
## [28] "pussy" "snatch" "twat"
## [31] "cunt" "fuck" "motherfucker"
## [34] "cocksucker" "nonce" "prickteaser"
## [37] "rapey" "skank" "slag"
## [40] "slut" "wanker" "whore"
## [43] "fucking" "fuckers" "cunts"
## [46] "bitches" "bitching" "hate"
## [49] "to hell" "go back" "go home"
## [52] "send them home" "send them back"
# Collapse the term vector into a single regex alternation ("a|b|c").
possible_antagonistic_pattern <- paste(
  possible_antagonistic_pattern,
  collapse = "|"
)
Filter the Polish identity data using possible_antagonistic_pattern.
# Keep only Polish-identity tweets whose text contains at least one of the
# candidate antagonistic terms (case-insensitive regex match).
polish_possible_antag <- polish_data %>%
  filter(
    str_detect(
      text_long,
      pattern = regex(possible_antagonistic_pattern, ignore_case = TRUE)
    )
  )
As a result we end up with 521 tweets. The resulting data is attached below. These tweets need to be reviewed to identify the ones that are actually hateful against Polish people and Polish identity.
# Render the candidate antagonistic tweets as a paged table for manual review.
rmarkdown:::print.paged_df(polish_possible_antag)