How do you
This special topic was developed to help you address these questions using R, one of the most popular programming languages for statistical computing and machine learning.
After this workshop, you will have a better understanding of Twitter text data and online discussions centering around misinformation from a big data perspective.
The R Project for Statistical Computing. https://www.r-project.org/
R for Data Science - Hadley Wickham. https://r4ds.had.co.nz/
R Markdown. https://rmarkdown.rstudio.com/
R Markdown: The Definitive Guide - Bookdown. https://bookdown.org/yihui/rmarkdown/
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# text mining library
library(tidytext)
# plotting packages
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Loading required package: ggplot2
## load rtweet package
library(rtweet)
library(httpuv)
# NOTE(review): setwd() with an absolute, machine-specific path makes this
# script non-portable; it is kept so that relative paths keep resolving.
setwd("C:/Users/zxu3/Documents/R/fakenews")
# Read the collected tweets. Strings are marked as UTF-8 so that curly quotes
# and other non-ASCII characters are not garbled under a non-UTF-8 native
# locale (assumes rt.csv was saved as UTF-8 -- TODO confirm).
rt <- read.csv(file = "rt.csv", encoding = "UTF-8", stringsAsFactors = FALSE)
# head(rt)
# Remove URLs only. The pattern stops at the first whitespace, so any text
# that follows a link is kept ("http.*" would delete through to the end of the
# tweet). "https?" covers both http and https in a single pass.
rt$stripped_text <- gsub("https?://[^[:space:]]*", "", rt$text)
# Tokenize into one word per row; unnest_tokens() removes punctuation and
# converts to lowercase. No per-tweet id is kept in this table.
rt_clean <- rt %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
# unique() compares strings exactly: "Dog" and "dog" are different values,
# and a lone "," counts as an entry of its own.
a_list_of_words <- c("Dog", rep("dog", 2), rep("cat", 2), ",")
unique(x = a_list_of_words)
## [1] "Dog" "dog" "cat" ","
## [1] "Dog" "dog" "cat" ","
# Rebuild the word table from the URL-stripped tweet text: one lowercase word
# per row, with punctuation removed during tokenization.
rt_clean <- unnest_tokens(
  dplyr::select(rt, stripped_text),
  word,
  stripped_text
)
# plot the top 15 words -- notice any issues?
# Words are reordered by frequency so the bars are sorted, and coord_flip()
# draws them horizontally so the word labels stay readable.
rt_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # Labels follow the aesthetics, not the flipped screen position: x is the
  # word and y is its count.
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
## Selecting by n
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
## Joining, by = "word"
## [1] 3613
## Selecting by n
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
## word1 word2 n
## 1 na na 55
## 2 spreading misinformation 29
## 3 spread misinformation 15
## 4 鈥 檚 15
## 5 covid 19 11
## 6 stop spreading 8