library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(twitteR)
## 
## Attaching package: 'twitteR'
## The following object is masked from 'package:rtweet':
## 
##     lookup_statuses
## The following objects are masked from 'package:dplyr':
## 
##     id, location
library(udpipe)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer

Set key

appname <- "xxxxxxxxxxxx"
key <- "xxxxxxxxxxxxxxxxxxxxxxxxxxx"
secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
access_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

Set token

twitter_token <- create_token(app = appname,
                              consumer_key = key,
                              consumer_secret = secret,
                              access_token = access_token,
                              access_secret = access_secret)
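
As an optional sanity check, rtweet can echo back the token it just stored:

# Optional: confirm rtweet registered the token we just created
get_token()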

Second: Does any hate word appear in COVID-19 tweets?

covid_twitter %>%
  filter(str_detect(word, "chinese")) %>%
  group_by(location, word) %>%
  count(word) %>%
  ggplot(aes(location, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word, scales = "free_y") +
  labs(y = "Occurrences", x = NULL) +
  theme_bw() +
  ggtitle("Chinese-related tweets") +
  coord_flip()

covid_twitter %>%
  filter(str_detect(word, "chinese")) %>%
  group_by(location, word) %>%
  select(twitter_number) -> num
## Adding missing grouping variables: `location`, `word`

# Pull the matching tweets from the raw table (column 5 holds the text)
covid_twitter_[num$twitter_number, 5] -> chinese_group

chinese_group %>%
  filter(str_detect(text, "virus")) %>%
  tidy_text() -> chinese_group

# udpipe::txt_freq() returns a key / freq / freq_pct frequency table
txt_freq(chinese_group$word) -> chinese_group

chinese_group %>%
  slice_max(order_by = freq, n = 10) %>%   # top 10 keywords by frequency
  mutate(key = reorder(key, freq)) %>%
  ggplot(aes(x = key, y = freq)) +
  geom_col() +
  theme_light() +
  coord_flip() +
  xlab("Key words") +
  ylab("Occurrences")

covid_twitter %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() -> covid_sent

covid_sent %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n=10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme_bw() +
  ggtitle("Covid sentiment analysis") +
  coord_flip()
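
The positive and negative counts can also be condensed into a single comparison cloud. A sketch, assuming the reshape2 package is available for acast():

library(reshape2)

# Rows are words, columns are sentiments; comparison.cloud() sizes each word
# by its count within its sentiment
covid_sent %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("red", "darkgreen"), max.words = 100)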

Third: What attitude do users express when their tweets include hate words?

Conduct the sentiment analysis

First step: search for and download the data

racist_word <- search_tweets("chinese virus OR Chinese Virus OR chinesevirus", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
## retry on rate limit...
## waiting about 13 minutes...
racist_word2 <- search_tweets("chink OR CHINK OR Chink", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
racist_word3 <- search_tweets("ching chong OR Ching Chong OR CHING CHONG", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
racist_word4 <- search_tweets("chinaman OR Chinaman", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)
racist_word5 <- search_tweets("gook OR GOOK OR Gook", n = 18000, geocode = lookup_coords("usa"), include_rts = FALSE, lang = "en", retryonratelimit = TRUE)

Build a tidy function

tidy_text2 <- function(text){
  text %>%
    mutate(twitter_number = row_number()) %>%        # tag each tweet before tokenizing
    unnest_tokens(word, text) %>%                    # one row per word
    mutate(word = str_extract(word, "[a-z']+")) %>%  # strip digits and stray punctuation
    filter(!is.na(word))                             # drop tokens with no letters left
}
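
A quick check on a toy tibble (illustrative input, not from the searches above) shows what the function returns:

toy <- tibble(text = c("Stay safe out there", "Wash your hands"))
tidy_text2(toy)   # one row per word, each tagged with its tweet's twitter_number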

Tidy each group

racist_word %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word

racist_word %>%
  mutate(Group = "Chinese Virus") %>%
  select(word, Group, twitter_number, location) -> racist_word
racist_word2 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word2

racist_word2 %>%
  mutate(Group = "chink") %>%
  select(word, Group, twitter_number, location) -> racist_word2
racist_word3 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word3

racist_word3 %>%
  mutate(Group = "ching chong") %>%
  select(word, Group, twitter_number, location) -> racist_word3
racist_word4 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word4

racist_word4 %>%
  mutate(Group = "chinaman") %>%
  select(word, Group, twitter_number, location) -> racist_word4
racist_word5 %>%
  tidy_text2() %>%
  select(twitter_number, word, location) -> racist_word5

racist_word5 %>%
  mutate(Group = "gook") %>%
  select(word, Group, twitter_number, location) -> racist_word5

Visualize each group

### Group 1

racist_word %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word
racist_word %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 100, sentiment) %>%   # bin every 100 words into one index
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment

racist_word_sentiment %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Chinese Virus Sentiment Analysis") +
  theme_bw()

### Group 2

racist_word2 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word2
racist_word2 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 40, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment2

racist_word_sentiment2 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Chink Sentiment Analysis") +
  theme_bw()

### Group 3

racist_word3 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word3
racist_word3 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 10, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment3

racist_word_sentiment3 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Ching Sentiment Analysis") +
  theme_bw()

### Group 4

racist_word4 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word4
racist_word4 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 10, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment4

racist_word_sentiment4 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Chinaman Sentiment Analysis") +
  theme_bw()

### Group 5

racist_word5 %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber, location) -> racist_word5
racist_word5 %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 10, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> racist_word_sentiment5

racist_word_sentiment5 %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Gook Sentiment Analysis") +
  theme_bw()

### Combine all the data sets to compare

hateword <- do.call("rbind", list(racist_word, racist_word2, racist_word3, racist_word4, racist_word5))
hateword %>%
  group_by(Group) %>%
  mutate(linenumber = row_number()) %>%
  select(word, Group, linenumber) -> hateword
hateword %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(Group, index = linenumber %/% 80, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n=0)) %>% 
  mutate(net = positive - negative) -> hateword_sentiment

hateword_sentiment %>%
  ggplot(aes(index, net, fill = Group)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Group, ncol = 2, scales = "free_y") +
  theme_bw()

Now try the NRC sentiment lexicon on the Chinese virus group, and again look at the word frequencies

get_sentiments("nrc") %>%
  filter(sentiment == "anger" | sentiment == "fear" | sentiment == "disgust" |
           sentiment == "trust" | sentiment == "sadness") ->
  nrcanger
racist_word %>%
  inner_join(nrcanger, by = "word") -> nrc_analysis

nrc_analysis %>% 
  count(word, sentiment, sort = TRUE) %>%
  ungroup() -> nrc_analysis
nrc_analysis %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme_bw() +
  coord_flip()

Use the Bing lexicon to take a deeper look at the first group, again checking word frequencies

racist_word %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() ->
  bing_word_counts
# Drop the top row: presumably the search term itself, which would dominate the plot
bing_word_counts <- bing_word_counts[-1, ]
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n=10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme_bw() +
  ggtitle("Hatewords sentiment analysis", subtitle = "Group of Chinese virus") +
  coord_flip()

Fourth: Is there any relationship between “hate tweets” and hate crimes?

After conducting the sentiment analysis, we combine the data frames for the different hate words and put them on a map, to see whether the places with many hate tweets also stand out in the FBI data.

mapcompare <- do.call("rbind", list(racist_word, racist_word2, racist_word3, racist_word4, racist_word5))

mapcompare %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  group_by(location) %>%
  count(location) %>%
  rename(state = location) %>%
  filter(str_detect(state, "CA"))
## # A tibble: 24 x 2
## # Groups:   state [24]
##    state                n
##    <chr>            <int>
##  1 Anaheim, CA          1
##  2 Bay Area, CA         1
##  3 Berkeley, CA         5
##  4 Campbell, CA         1
##  5 CANADA               1
##  6 Chino Hills, CA      1
##  7 Hidden Hills, CA     3
##  8 Lake Forest, CA      8
##  9 Los Angeles, CA     20
## 10 Oakland, CA          4
## # ... with 14 more rows
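
A stricter pattern avoids such false positives. A minimal sketch, assuming California locations end in ", CA":

mapcompare %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  count(location) %>%
  rename(state = location) %>%
  filter(str_detect(state, ", CA$"))   # anchored match, so "CANADA" no longer slips through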

Import the data

readxl::read_excel("Table_13_Hate_Crime_Incidents_per_Bias_Motivation_and_Quarter_by_State_Federal_and_Agency_2020.xlsx") -> fbi
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...

Clean and tidy the data

fbi %>%
  filter(...2 == "Total") %>%      # keep each state's "Total" row
  select(`Table 13`, ...5) %>%     # state name and the incident-count column
  rename(state = "Table 13", n = "...5") %>%
  mutate(n = as.numeric(n)) %>%
  arrange(desc(n)) -> hate_crime_data

We need the usmap package

library(usmap)
plot_usmap(data = hate_crime_data, values = "n", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "cases", label = scales::comma) +
  labs(title = "US States",
       subtitle = "Total hate crimes by state") + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

As we can see, California (CA) has the highest number of hate crimes.
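
Beyond eyeballing the two maps, we can put a rough number on the relationship. A sketch, assuming tweet locations end in a two-letter state code ("City, CA") and that hate_crime_data uses full state names:

# Per-state negative-tweet counts
mapcompare %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  mutate(abbr = str_extract(location, "[A-Z]{2}$")) %>%  # trailing state code, if any
  filter(abbr %in% state.abb) %>%                        # keep only real US states
  count(abbr) -> tweet_counts

# FBI counts keyed to the same abbreviations
hate_crime_data %>%
  ungroup() %>%
  mutate(abbr = state.abb[match(state, state.name)]) %>%
  filter(!is.na(abbr)) -> crime_counts

inner_join(tweet_counts, crime_counts, by = "abbr") %>%
  summarise(correlation = cor(n.x, n.y))   # n.x = tweets, n.y = crimes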