library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
Trump <- read.csv("C:/Users/xiayangxiao/downloads/trump.csv" , comment.char = "#")
summary(Trump)
## X
## Min. : 1
## 1st Qu.:1240
## Median :2478
## Mean :2478
## 3rd Qu.:3716
## Max. :4955
##
## MESSAGE_BODY
## RT @RickCanton: TAKE THE PLEDGE!\n\nTell the @GOP - @realDonaldTrump won't get your vote. No matter what.\n\n#NeverTrump #Election2016 https://… : 59
## RT @realDonaldTrump: "@iStandWithUSA: 62% said that @realDonaldTrump won #GOP Debate Vote your stand: https://t.co/l3XVkWXIEP #USA #Election2016 : 58
## RT @realDonaldTrump: "@bentomchik: @chucktodd: is there something to learn from @realDonaldTrump success? #Election2016" : 56
## RT @BNNDailyPoll: Is @realDonaldTrump v. @HillaryClinton inevitable? #Election2016 : 55
## RT @realDonaldTrump: "@ladycatherinecd @realDonaldTrump he has more smarts than any one of the politicians - he TRUMPED again ! #Trump #tcot #Election2016": 54
## RT @JudgeJeanine: After my interview with @realDonaldTrump. Am I standing with the next president of the United States? #Election2016 http://t.co/5NJX2JzLCj: 53
## (Other) :4620
## MESSAGE_COUNTRY MESSAGE_FAVORITES_COUNT MESSAGE_LOCATION
## United States : 150 Min. : 0.000 Manhattan : 10
## Canada : 9 1st Qu.: 0.000 St Paul : 9
## United Kingdom: 5 Median : 0.000 Chicago : 5
## België : 2 Mean : 2.442 Brooklyn : 4
## Colombia : 1 3rd Qu.: 0.000 Colorado Springs: 4
## (Other) : 4 Max. :1924.000 (Other) : 139
## NA's :4784 NA's :4784
## MESSAGE_LOCATION_DISPLAY_NAME MESSAGE_POSTED_TIME
## Manhattan, NY : 9 2016-02-21 17:17:08.000000: 15
## St Paul, MN : 9 2016-02-21 17:17:07.000000: 9
## Chicago, IL : 5 2016-02-21 17:17:09.000000: 7
## Brooklyn, NY : 4 2016-02-21 17:19:08.000000: 5
## Colorado Springs, CO: 4 2016-02-21 17:17:13.000000: 4
## (Other) : 140 2016-02-21 17:17:05.000000: 3
## NA's :4784 (Other) :4912
## MESSAGE_RETWEET_COUNT USER_CITY USER_COUNTRY
## Min. : 0.00 WASHINGTON : 75 United States :2798
## 1st Qu.: 0.00 San Diego : 74 UNITED KINGDOM: 66
## Median : 1.00 New York City: 62 Canada : 62
## Mean : 41.34 Los Angeles : 51 United Kingdom: 59
## 3rd Qu.: 13.00 York : 43 Australia : 26
## Max. :871.00 (Other) :1626 (Other) : 216
## NA's :3024 NA's :1728
## USER_DISPLAY_NAME USER_FOLLOWERS_COUNT USER_FRIENDS_COUNT
## ForPotus : 34 Min. : 0 Min. : 0
## NewsNetNews : 34 1st Qu.: 130 1st Qu.: 184
## BigGator5 : 29 Median : 534 Median : 592
## TheAmericanLifeStyle: 23 Mean : 12514 Mean : 2088
## USA Election2016 : 21 3rd Qu.: 2046 3rd Qu.: 1714
## CeeItTv : 16 Max. :4714925 Max. :282971
## (Other) :4798
## USER_GENDER USER_LOCATION_DISPLAY_NAME USER_SCREEN_NAME
## female : 847 United States : 130 ForPotus : 34
## male :1706 USA : 103 NewsNetNews : 34
## unknown:2137 Washington, DC: 52 BigGator5 : 31
## NA's : 265 San Diego : 47 atlaswon : 24
## Florida, USA : 42 TheUSALifeStyle: 23
## (Other) :3201 USAelection : 21
## NA's :1380 (Other) :4788
WASHINGTONTw <- subset(Trump, USER_CITY == "WASHINGTON")
NYCleTw <- subset(Trump, USER_CITY == "New York City")
saveRDS(WASHINGTONTw, "WASHINGTONTw.RDS")
WASHINGTONTw <- readRDS("WASHINGTONTw.RDS")
saveRDS(NYCleTw, "NYCleTw.RDS")
NYCTw <- readRDS("NYCleTw.RDS")
Wtweets <- WASHINGTONTw$MESSAGE_BODY
Ntweets <- NYCTw$MESSAGE_BODY
clean.text = function(x)
{
x = tolower(x)
x = gsub("rt", "", x)
x = gsub("@\\w+", "", x)
x = gsub("[[:punct:]]", "", x)
x = gsub("[[:digit:]]", "", x)
x = gsub("http\\w+", "", x)
x = gsub("[ |\t]{2,}", "", x)
x = gsub("^ ", "", x)
x = gsub(" $", "", x)
return(x)
}
Wtweets = clean.text(Wtweets)
Ntweets = clean.text(Ntweets)
corpus = Corpus(VectorSource(Wtweets))
tdm = TermDocumentMatrix(
corpus,
control = list(
wordLengths=c(3,20),
removePunctuation = TRUE,
stopwords = c("the", "a", stopwords("english")),
removeNumbers = TRUE, tolower = TRUE) )
tdm = as.matrix(tdm)
word_freqs = sort(rowSums(tdm), decreasing=TRUE)
dm = data.frame(word=names(word_freqs), freq=word_freqs)
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
head(word_freqs, 20)
## election trump gop debate primary just
## 48 13 10 6 3 3
## gopdebate lead policy amp point republican
## 3 3 3 3 3 3
## answer can new theyre michigan win
## 2 2 2 2 2 2
## beat amiright
## 2 2
corpus = Corpus(VectorSource(Ntweets))
tdm = TermDocumentMatrix(
corpus,
control = list(
wordLengths=c(3,20),
removePunctuation = TRUE,
stopwords = c("the", "a", stopwords("english")),
removeNumbers = TRUE, tolower = TRUE) )
tdm = as.matrix(tdm)
word_freqs = sort(rowSums(tdm), decreasing=TRUE)
dm = data.frame(word=names(word_freqs), freq=word_freqs)
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
head(word_freqs, 20)
## trump election republicans
## 49 30 23
## electiontrumptrain new vote
## 22 22 22
## will video one
## 3 3 3
## think gop againtrump
## 3 2 2
## politicianshe smas tcot
## 2 2 2
## trumped racistmania donaldtrump
## 2 2 2
## skills cruz’s
## 2 2
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.