# English
# Working directory that holds the input files.
# NOTE(review): hard-coded, machine-specific path. It is kept because the
# relative file names read by this script (the tweet CSV here and
# "stop-word-list.csv" further down) resolve against it.
setwd("F:/Coronavirus_Tweets")
library(data.table)
library(tidytext)
library(ggplot2)
library(dplyr)

# Read the tweet CSV and report its dimensions (rows, columns).
mm <- fread("03282020_Corona_Clean.csv")
dim(mm)
## [1] 770375 34

# List the column names. The output below was present in the transcript
# without the call that produces it; presumably it came from names(mm).
names(mm)
## [1] "coordinates" "created_at"
## [3] "hashtags" "media"
## [5] "urls" "favorite_count"
## [7] "id" "in_reply_to_screen_name"
## [9] "in_reply_to_status_id" "in_reply_to_user_id"
## [11] "lang" "place"
## [13] "possibly_sensitive" "retweet_count"
## [15] "reweet_id" "retweet_screen_name"
## [17] "source" "text"
## [19] "tweet_url" "user_created_at"
## [21] "user_screen_name" "user_default_profile_image"
## [23] "user_description" "user_favourites_count"
## [25] "user_followers_count" "user_friends_count"
## [27] "user_listed_count" "user_location"
## [29] "user_name" "user_screen_name"
## [31] "user_statuses_count" "user_time_zone"
## [33] "user_urls" "user_verified"
# Keep only tweets whose `lang` is "en". The filter runs before lowercasing
# so that tolower() is not applied to rows that are dropped straight away;
# the resulting table is the same either way.
mm <- subset(mm, lang == "en")

# Lowercase the tweet text in place.
# NOTE(review): tidytext::unnest_tokens() lowercases tokens by default, so this
# is probably redundant for the word counts; it is kept because it also
# changes mm$text itself.
mm$text <- tolower(mm$text)
dim(mm)
## [1] 422421 34
# Split each tweet's `text` into tokens stored in a new `word` column.
mm1 <- unnest_tokens(mm, word, text)

# Stop-word table read from the working directory.
stopwords1 <- fread("stop-word-list.csv")

# Drop every token row that has a match in the stop-word table.
# NOTE(review): no `by` is given, so the join keys are whatever column names
# the two tables share; confirm the CSV's header and consider an explicit
# `by` to make the key obvious.
mm1a <- anti_join(mm1, stopwords1)
# Word frequencies, most frequent first. Computed once and reused for both
# the printed table and the plot instead of counting the tokens twice.
word_counts <- mm1a %>%
  count(word, sort = TRUE)
word_counts
## # A tibble: 633,794 x 2
## word n
## <chr> <int>
## 1 people 33543
## 2 amp 26183
## 3 cases 24058
## 4 new 23271
## 5 now 22806
## 6 more 21679
## 7 via 21305
## 8 trump 21018
## 9 out 19402
## 10 up 16656
## # ... with 633,784 more rows

# Only words occurring more than this many times are plotted.
min_count <- 3000

# Horizontal bar chart of the frequent words. reorder() orders the factor
# levels by count so the bars are sorted, and coord_flip() puts the words on
# the vertical axis.
word_counts %>%
  filter(n > min_count) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  theme_bw() +
  xlab(NULL) +
  coord_flip()
