Final Project Description

1. Data Collection + Cleaning

Randomly fetch threads from three subreddits, r/politics, r/PoliticalHumor, and r/PoliticalDiscussion, keeping threads that appear in the top 10 pages of each subreddit's search results. Search date: Apr 2nd, 2021.
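
The code below assumes a handful of R packages are attached; based on the functions used (get_reddit() from RedditExtractoR, the tidyverse verbs, unnest_tokens() and stop_words from tidytext, lemmatize_words() from textstem, removePunctuation() from tm, and wordcloud2()), a minimal set of library() calls would be:

#packages assumed by the code in this report
library(RedditExtractoR)  #get_reddit()
library(tidyverse)        #dplyr, tidyr, stringr, ggplot2
library(tidytext)         #unnest_tokens(), stop_words
library(textstem)         #lemmatize_words()
library(tm)               #removePunctuation()
library(wordcloud2)       #wordcloud2()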

#top 10 pages of threads in r/politics, searched on Apr 2nd, 2021; remove duplicates by comment
#politics <- get_reddit(search_terms = "China", subreddit = "politics", cn_threshold = 10, sort_by = "comments", page_threshold = 1)
#politics_1 <- get_reddit(search_terms = "Chinese", subreddit = "politics", cn_threshold = 10, sort_by = "comments", page_threshold = 1)
#keyword_china_1 <- rbind(politics,politics_1)
#keyword_china_1 <- keyword_china_1 %>% distinct(comment, .keep_all= TRUE)
#write.csv(keyword_china_1,"keyword_china_1.csv", row.names = TRUE)

#top 10 pages of threads in r/PoliticalHumor, searched on Apr 2nd, 2021; remove duplicates by comment
#PoliticalHumor <- get_reddit(search_terms = "China", subreddit = "PoliticalHumor", cn_threshold = 10, sort_by = "comments", page_threshold = 1)
#PoliticalHumor_1 <- get_reddit(search_terms = "Chinese", subreddit = "PoliticalHumor", cn_threshold = 10, sort_by = "comments", page_threshold = 1)
#keyword_china_2 <- rbind(PoliticalHumor,PoliticalHumor_1)
#keyword_china_2 <- keyword_china_2 %>% distinct(comment, .keep_all= TRUE)
#write.csv(keyword_china_2,"keyword_china_2.csv", row.names = TRUE)

#top 10 pages of threads in r/PoliticalDiscussion, searched on Apr 2nd, 2021; remove duplicates by comment
#PoliticalDiscussion <- get_reddit(search_terms = "China", subreddit = "PoliticalDiscussion", cn_threshold = 10, sort_by = "comments", page_threshold = 1)
#PoliticalDiscussion_1 <- get_reddit(search_terms = "Chinese", subreddit = "PoliticalDiscussion", cn_threshold = 10, sort_by = "comments", page_threshold = 1)
#keyword_china_3 <- rbind(PoliticalDiscussion,PoliticalDiscussion_1)
#keyword_china_3 <- keyword_china_3 %>% distinct(comment, .keep_all= TRUE)
#write.csv(keyword_china_3,"keyword_china_3.csv", row.names = TRUE)

#combine data from three subreddits into one
#keyword_china_combine <- rbind(keyword_china_1,keyword_china_2,keyword_china_3)

#remove rows with NA values in the comment, comm_date, and post_date columns
#keyword_china_combine <- keyword_china_combine %>% drop_na(comment)
#keyword_china_combine <- keyword_china_combine %>% drop_na(comm_date)
#keyword_china_combine <- keyword_china_combine %>% drop_na(post_date)

#write.csv(keyword_china_combine,"keyword_china_combine.csv", row.names = TRUE)
#saveRDS(keyword_china_combine, file = "keyword_china_combine.rds")
#load the previously saved combined data set
keyword_china_combine <- readRDS(file = "keyword_china_combine.rds")

2. Data Preprocessing and Exploration

#regex for HTML-escaped symbols (&amp;, &lt;, &gt;) to strip from comments
remove_reg <- "&amp;|&lt;|&gt;"

#load the tidytext stop word list
data(stop_words)

#extend the stop word list with domain-specific terms
customize_stop_words <- bind_rows(
  tibble(word = c("china", "chinese", "country", "https", "world"),
         lexicon = "custom"),
  stop_words
)

#strip escaped symbols, tokenize all comments into words, remove stop words, keep alphabetic tokens, and lemmatize
tidy_china_combine <- keyword_china_combine %>%
  mutate(comment = str_remove_all(comment, remove_reg)) %>%
  unnest_tokens(word, comment) %>%
  filter(
    !word %in% customize_stop_words$word,
    !word %in% str_remove_all(customize_stop_words$word, "'"),
    str_detect(word, "[a-z]")
  ) %>%
  mutate(word = textstem::lemmatize_words(word))

#strip leftover punctuation (e.g., apostrophes) from the tokens
tidy_china_combine$word <- removePunctuation(tidy_china_combine$word)

#remove tokens that begin with a digit
nums <- tidy_china_combine %>% filter(str_detect(word, "^[0-9]")) %>% select(word) %>% unique()
tidy_china_combine <- tidy_china_combine %>% 
  anti_join(nums, by = "word")

#count word frequency within each thread (title)
word_freq <-tidy_china_combine %>%
  count(title, word, sort = TRUE) 

#count frequency by word only, across all threads
word_freq_byword <- tidy_china_combine %>%
  count(word, sort = TRUE) 

#total token count per thread (title), so each word's frequency can be compared
#with its thread's overall token count
wordfreq_bytitle <- word_freq %>%
  group_by(title) %>%
  summarize(total = sum(n))

word_freq <- left_join(word_freq, wordfreq_bytitle, by = "title")
#word_freq

2.1. Visualize the token frequency (i.e., n) in the 3 online political communities' discussions around China, for tokens where n > 2000
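
One way to produce this figure, sketched with ggplot2 and assuming the combined data frame still carries the subreddit column returned by get_reddit(): tokens whose overall frequency exceeds 2,000 are counted per community and drawn as faceted bars.

#sketch of the community-level frequency plot; assumes a subreddit column is present
top_tokens_2000 <- word_freq_byword %>%
  filter(n > 2000) %>%
  pull(word)

tidy_china_combine %>%
  filter(word %in% top_tokens_2000) %>%
  count(subreddit, word) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = subreddit)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~subreddit, scales = "free") +
  labs(x = NULL, y = "token frequency (n)")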

2.2. The data frame: frequent tokens (frequency > 200) together with the total token count of their threads

word_freq %>%
  group_by(title) %>%
  arrange(desc(n)) %>%
  filter(n > 200) %>%
  ungroup() %>%
  mutate(word = factor(word, levels = rev(unique(word)))) 
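
Because the joined total column holds each thread's overall token count, the same data frame can also express each frequent token as a share of its thread; a minimal sketch (the term_frequency column name is only illustrative):

#frequent tokens as a proportion of their thread's total token count (sketch)
word_freq %>%
  filter(n > 200) %>%
  mutate(term_frequency = n / total) %>%
  arrange(desc(term_frequency))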

2.3. Word cloud for tokens with frequency > 1000

#keep tokens with frequency above 1000, dropping hashtags and user mentions
top_terms_1000 <- word_freq_byword %>%
  filter(!str_detect(word, "^#")) %>% 
  filter(!str_detect(word, "^@")) %>% 
  filter(n > 1000) %>%
  mutate(word = reorder(word, n)) 
wordcloud2(data = top_terms_1000, color = "random-light", backgroundColor = "white")