week3

library('tm')

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')
Trump <- read.csv("C:/Users/xiayangxiao/downloads/trump.csv" , comment.char = "#")
summary(Trump)

##        X       
##  Min.   :   1  
##  1st Qu.:1240  
##  Median :2478  
##  Mean   :2478  
##  3rd Qu.:3716  
##  Max.   :4955  
##                
##                                                                                                                                                    MESSAGE_BODY     
##  RT @RickCanton: TAKE THE PLEDGE!\n\nTell the @GOP - @realDonaldTrump won't get your vote. No matter what.\n\n#NeverTrump #Election2016 https://â€¦          :  59  
##  RT @realDonaldTrump: "@iStandWithUSA: 62% said that @realDonaldTrump won #GOP Debate Vote your stand: https://t.co/l3XVkWXIEP #USA #Election2016            :  58  
##  RT @realDonaldTrump: "@bentomchik: @chucktodd: is there something to learn from @realDonaldTrump success? #Election2016"                                    :  56  
##  RT @BNNDailyPoll: Is @realDonaldTrump v. @HillaryClinton inevitable? #Election2016                                                                          :  55  
##  RT @realDonaldTrump: "@ladycatherinecd @realDonaldTrump he has more smarts than any one of the politicians - he TRUMPED again !  #Trump #tcot #Election2016":  54  
##  RT @JudgeJeanine: After my interview with @realDonaldTrump. Am I standing with the next president of the United States? #Election2016 http://t.co/5NJX2JzLCj:  53  
##  (Other)                                                                                                                                                     :4620  
##        MESSAGE_COUNTRY MESSAGE_FAVORITES_COUNT         MESSAGE_LOCATION
##  United States : 150   Min.   :   0.000        Manhattan       :  10   
##  Canada        :   9   1st Qu.:   0.000        St Paul         :   9   
##  United Kingdom:   5   Median :   0.000        Chicago         :   5   
##  BelgiÃ«       :   2   Mean   :   2.442        Brooklyn        :   4   
##  Colombia      :   1   3rd Qu.:   0.000        Colorado Springs:   4   
##  (Other)       :   4   Max.   :1924.000        (Other)         : 139   
##  NA's          :4784                           NA's            :4784   
##       MESSAGE_LOCATION_DISPLAY_NAME                 MESSAGE_POSTED_TIME
##  Manhattan, NY       :   9          2016-02-21 17:17:08.000000:  15    
##  St Paul, MN         :   9          2016-02-21 17:17:07.000000:   9    
##  Chicago, IL         :   5          2016-02-21 17:17:09.000000:   7    
##  Brooklyn, NY        :   4          2016-02-21 17:19:08.000000:   5    
##  Colorado Springs, CO:   4          2016-02-21 17:17:13.000000:   4    
##  (Other)             : 140          2016-02-21 17:17:05.000000:   3    
##  NA's                :4784          (Other)                   :4912    
##  MESSAGE_RETWEET_COUNT         USER_CITY            USER_COUNTRY 
##  Min.   :  0.00        WASHINGTON   :  75   United States :2798  
##  1st Qu.:  0.00        San Diego    :  74   UNITED KINGDOM:  66  
##  Median :  1.00        New York City:  62   Canada        :  62  
##  Mean   : 41.34        Los Angeles  :  51   United Kingdom:  59  
##  3rd Qu.: 13.00        York         :  43   Australia     :  26  
##  Max.   :871.00        (Other)      :1626   (Other)       : 216  
##                        NA's         :3024   NA's          :1728  
##             USER_DISPLAY_NAME USER_FOLLOWERS_COUNT USER_FRIENDS_COUNT
##  ForPotus            :  34    Min.   :      0      Min.   :     0    
##  NewsNetNews         :  34    1st Qu.:    130      1st Qu.:   184    
##  BigGator5           :  29    Median :    534      Median :   592    
##  TheAmericanLifeStyle:  23    Mean   :  12514      Mean   :  2088    
##  USA Election2016    :  21    3rd Qu.:   2046      3rd Qu.:  1714    
##  CeeItTv             :  16    Max.   :4714925      Max.   :282971    
##  (Other)             :4798                                           
##   USER_GENDER    USER_LOCATION_DISPLAY_NAME        USER_SCREEN_NAME
##  female : 847   United States : 130         ForPotus       :  34   
##  male   :1706   USA           : 103         NewsNetNews    :  34   
##  unknown:2137   Washington, DC:  52         BigGator5      :  31   
##  NA's   : 265   San Diego     :  47         atlaswon       :  24   
##                 Florida, USA  :  42         TheUSALifeStyle:  23   
##                 (Other)       :3201         USAelection    :  21   
##                 NA's          :1380         (Other)        :4788

WASHINGTONTw <- subset(Trump, USER_CITY == "WASHINGTON")
NYCleTw <- subset(Trump, USER_CITY == "New York City")
saveRDS(WASHINGTONTw, "WASHINGTONTw.RDS")
WASHINGTONTw <- readRDS("WASHINGTONTw.RDS")
saveRDS(NYCleTw, "NYCleTw.RDS")
NYCTw <- readRDS("NYCleTw.RDS")

Wtweets <- WASHINGTONTw$MESSAGE_BODY
Ntweets <- NYCTw$MESSAGE_BODY
clean.text = function(x)
{

  x = tolower(x)
  x = gsub("rt", "", x)
  x = gsub("@\\w+", "", x)
  x = gsub("[[:punct:]]", "", x)
  x = gsub("[[:digit:]]", "", x)
  x = gsub("http\\w+", "", x)
  x = gsub("[ |\t]{2,}", "", x)
  x = gsub("^ ", "", x)
  x = gsub(" $", "", x)
  return(x)
}

Wtweets = clean.text(Wtweets)
Ntweets = clean.text(Ntweets)

corpus = Corpus(VectorSource(Wtweets))
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, tolower = TRUE) )
tdm = as.matrix(tdm)
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 
dm = data.frame(word=names(word_freqs), freq=word_freqs)
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

head(word_freqs, 20)

##   election      trump        gop     debate    primary       just 
##         48         13         10          6          3          3 
##  gopdebate       lead     policy        amp      point republican 
##          3          3          3          3          3          3 
##     answer        can        new     theyre   michigan        win 
##          2          2          2          2          2          2 
##       beat   amiright 
##          2          2

corpus = Corpus(VectorSource(Ntweets))
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, tolower = TRUE) )
tdm = as.matrix(tdm)
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 
dm = data.frame(word=names(word_freqs), freq=word_freqs)
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

head(word_freqs, 20)

##              trump           election        republicans 
##                 49                 30                 23 
## electiontrumptrain                new               vote 
##                 22                 22                 22 
##               will              video                one 
##                  3                  3                  3 
##              think                gop         againtrump 
##                  3                  2                  2 
##      politicianshe               smas               tcot 
##                  2                  2                  2 
##            trumped        racistmania        donaldtrump 
##                  2                  2                  2 
##             skills           cruzâ€™s 
##                  2                  2

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

week3

Xiayang Xiao

February 6, 2018

Including Plots