library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(twitteR)
## 
## Attaching package: 'twitteR'
## The following object is masked from 'package:rtweet':
## 
##     lookup_statuses
## The following objects are masked from 'package:dplyr':
## 
##     id, location
library(udpipe)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer

Set key

appname <- "xxxxxxxxxxxx"
key <- "xxxxxxxxxxxxxxxxxxxxxxxxxxx"
secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
access_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

Set token

twitter_token <- create_token(app = appname,
                              consumer_key = key,
                              consumer_secret = secret,
                              access_token = access_token,
                              access_secret = access_secret)
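
As an optional sanity check, rtweet can echo back the token it just stored:

# Optional: confirm rtweet registered the token we just created
get_token()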

Second: Does any hate word appear in COVID-19 tweets?

covid_twitter %>%
  filter(str_detect(word, "chinese")) %>%
  group_by(location, word) %>%
  count(word) %>%
  ggplot(aes(location, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word, scales = "free_y") +
  labs(y = "Occurrences", x = NULL) +
  theme_bw() +
  ggtitle("Chinese-related tweets") +
  coord_flip()

covid_twitter %>%
  filter(str_detect(word, "chinese")) %>%
  group_by(location, word) %>%
  select(twitter_number) -> num
## Adding missing grouping variables: `location`, `word`

# Pull the matching tweets from the raw table (column 5 holds the text)
covid_twitter_[num$twitter_number, 5] -> chinese_group

chinese_group %>%
  filter(str_detect(text, "virus")) %>%
  tidy_text() -> chinese_group

# udpipe::txt_freq() returns a key / freq / freq_pct frequency table
txt_freq(chinese_group$word) -> chinese_group

chinese_group %>%
  slice_max(order_by = freq, n = 10) %>%   # top 10 keywords by frequency
  mutate(key = reorder(key, freq)) %>%
  ggplot(aes(x = key, y = freq)) +
  geom_col() +
  theme_light() +
  coord_flip() +
  xlab("Key words") +
  ylab("Occurrences")

covid_twitter %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() -> covid_sent

covid_sent %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n=10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme_bw() +
  ggtitle("Covid sentiment analysis") +
  coord_flip()
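
The positive and negative counts can also be condensed into a single comparison cloud. A sketch, assuming the reshape2 package is available for acast():

library(reshape2)

# Rows are words, columns are sentiments; comparison.cloud() sizes each word
# by its count within its sentiment
covid_sent %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("red", "darkgreen"), max.words = 100)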

Third: What attitude do users express when their tweets include hate words?

Conduct the sentiment analysis

First step: search for and download the data

racist_word <- search_tweets("chinese virus OR Chinese Virus OR chinesevirus", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
## retry on rate limit...
## waiting about 13 minutes...
racist_word2 <- search_tweets("chink OR CHINK OR Chink", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
racist_word3 <- search_tweets("ching chong OR Ching Chong OR CHING CHONG", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
racist_word4 <- search_tweets("chinaman OR Chinaman", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
racist_word5 <- search_tweets("gook OR GOOK OR Gook", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)

Build a tidy function

tidy_text2 <- function(text){
  text %>%
    mutate(twitter_number = row_number()) %>%        # tag each tweet before tokenizing
    unnest_tokens(word, text) %>%                    # one row per word
    mutate(word = str_extract(word, "[a-z']+")) %>%  # strip digits and stray punctuation
    filter(!is.na(word))                             # drop tokens with no letters left
}
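
A quick check on a toy tibble (illustrative input, not from the searches above) shows what the function returns:

toy <- tibble(text = c("Stay safe out there", "Wash your hands"))
tidy_text2(toy)   # one row per word, each tagged with its tweet's twitter_number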

Tidy each group

racist_word %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word

racist_word %>%
  mutate(Group = "Chinese Virus") %>%
  select(word, Group, twitter_number, location) -> racist_word
racist_word2 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word2

racist_word2 %>%
  mutate(Group = "chink") %>%
  select(word, Group, twitter_number, location) -> racist_word2
racist_word3 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word3

racist_word3 %>%
  mutate(Group = "ching chong") %>%
  select(word, Group, twitter_number, location) -> racist_word3
racist_word4 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word4

racist_word4 %>%
  mutate(Group = "chinaman") %>%
  select(word, Group, twitter_number, location) -> racist_word4
racist_word5 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word5

racist_word5 %>%
  mutate(Group = "gook") %>%
  select(word, Group, twitter_number, location) -> racist_word5

Visualize each group

### Group 1

racist_word %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word
racist_word %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 100, sentiment) %>%   # bin every 100 words into one index
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment

racist_word_sentiment %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Chinese Virus Sentiment Analysis") +
  theme_bw()

### Group 2

racist_word2 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word2
racist_word2 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 40, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment2

racist_word_sentiment2 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Chink Sentiment Analysis") +
  theme_bw()

### Group 3

racist_word3 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word3
racist_word3 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 10, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment3

racist_word_sentiment3 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Ching Sentiment Analysis") +
  theme_bw()

### Group 4

racist_word4 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word4
racist_word4 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 10, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment4

racist_word_sentiment4 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Chinaman Sentiment Analysis") +
  theme_bw()

### Group 5

racist_word5 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word5
racist_word5 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 10, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment5

racist_word_sentiment5 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Gook Sentiment Analysis") +
  theme_bw()

### Combine all the data sets to compare

hateword <- do.call("rbind", list(racist_word, racist_word2, racist_word3, racist_word4, racist_word5))
hateword %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber) -> hateword
hateword %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 80, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> hateword_sentiment

hateword_sentiment %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Group, ncol = 2, scales = "free_y") +
  theme_bw()

Now try the NRC sentiment lexicon on the Chinese virus group, and again look at the word frequencies

get_sentiments("nrc") %>%
  filter(sentiment == "anger" | sentiment == "fear" | sentiment == "disgust" |
           sentiment == "trust" | sentiment == "sadness") ->
  nrcanger
racist_word %>%
  inner_join(nrcanger, by = "word") -> nrc_analysis

nrc_analysis %>% 
  count(word, sentiment, sort = TRUE) %>%
  ungroup() -> nrc_analysis
nrc_analysis %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme_bw() +
  coord_flip()

Use the Bing lexicon to take a deeper look at the first group, again checking word frequencies

racist_word %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() ->
  bing_word_counts
# Drop the top row: presumably the search term itself, which would dominate the plot
bing_word_counts <- bing_word_counts[-1, ]
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n=10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme_bw() +
  ggtitle("Hatewords sentiment analysis", subtitle = "Group of Chinese virus") +
  coord_flip()

Fourth: Is there any relationship between “hate tweets” and hate crimes?

After conducting the sentiment analysis, we combine the data frames for the different hate words and put them on a map, to see whether the places with many hate tweets also stand out in the FBI data.

mapcompare <- do.call("rbind", list(racist_word, racist_word2, racist_word3, racist_word4, racist_word5))

mapcompare %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  group_by(location) %>%
  count(location) %>%
  rename(state = location) %>%
  filter(str_detect(state, "CA"))
## # A tibble: 24 x 2
## # Groups:   state [24]
##    state                n
##    <chr>            <int>
##  1 Anaheim, CA          1
##  2 Bay Area, CA         1
##  3 Berkeley, CA         5
##  4 Campbell, CA         1
##  5 CANADA               1
##  6 Chino Hills, CA      1
##  7 Hidden Hills, CA     3
##  8 Lake Forest, CA      8
##  9 Los Angeles, CA     20
## 10 Oakland, CA          4
## # ... with 14 more rows
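
A stricter pattern avoids such false positives. A minimal sketch, assuming California locations end in ", CA":

mapcompare %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  count(location) %>%
  rename(state = location) %>%
  filter(str_detect(state, ", CA$"))   # anchored match, so "CANADA" no longer slips through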

Import the data

readxl::read_excel("Table_13_Hate_Crime_Incidents_per_Bias_Motivation_and_Quarter_by_State_Federal_and_Agency_2020.xlsx") -> fbi
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...

Clean and tidy the data

fbi %>%
  filter(...2 == "Total") %>%      # keep each state's "Total" row
  select(`Table 13`, ...5) %>%     # state name and the incident-count column
  rename(state = "Table 13", n = "...5") %>%
  mutate(n = as.numeric(n)) %>%
  arrange(desc(n)) -> hate_crime_data

We need the usmap package

library(usmap)
plot_usmap(data = hate_crime_data, values = "n", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "cases", label = scales::comma) +
  labs(title = "US States",
       subtitle = "Total hate crimes by state") + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

As we can see, California (CA) has the highest number of hate crimes.
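
Beyond eyeballing the two maps, we can put a rough number on the relationship. A sketch, assuming tweet locations end in a two-letter state code ("City, CA") and that hate_crime_data uses full state names:

# Per-state negative-tweet counts
mapcompare %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  mutate(abbr = str_extract(location, "[A-Z]{2}$")) %>%  # trailing state code, if any
  filter(abbr %in% state.abb) %>%                        # keep only real US states
  count(abbr) -> tweet_counts

# FBI counts keyed to the same abbreviations
hate_crime_data %>%
  ungroup() %>%
  mutate(abbr = state.abb[match(state, state.name)]) %>%
  filter(!is.na(abbr)) -> crime_counts

inner_join(tweet_counts, crime_counts, by = "abbr") %>%
  summarise(correlation = cor(n.x, n.y))   # n.x = tweets, n.y = crimes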