#install.packages("tm")

# Print the working directory of the current R session.
getwd()
## [1] "C:/Users/Dell/Documents/R"
# Attach the tm text-mining package; every corpus call below comes from it.
library(tm)
## Loading required package: NLP
# List the reader functions tm offers for parsing documents.
getReaders()
##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"
# List the source types a corpus can be built from (DirSource and
# VectorSource are the two used later in this script).
getSources()
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"
# Leo Tolstoy: build a corpus from every file in the directory `new2`
# (the recorded run contained a single file, war.txt).
new2 <- "C:/Users/Dell/Desktop/test"
# Fail early with a clear message instead of an opaque DirSource error.
stopifnot(dir.exists(new2))
# Declare the file encoding explicitly. Without it the files are read in the
# platform's native encoding, and the recorded term list shows the resulting
# mojibake (curly quotes and accented letters split into several characters).
# NOTE(review): UTF-8 is assumed from that mojibake pattern -- confirm against
# the actual file.
Corpus1 <- Corpus(
  DirSource(new2, encoding = "UTF-8"),
  readerControl = list(language = "eng")
)
# Print a summary of the corpus and of each document in it.
inspect(Corpus1)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 3227440
# One row per document: length, class and storage mode.
summary(Corpus1)
##         Length Class             Mode
## war.txt 2      PlainTextDocument list
# Show the nested structure: each document is a list of `content` (the lines
# of text) and `meta` (author, timestamp, id, language, ...).
str(Corpus1)
## List of 1
##  $ war.txt:List of 2
##   ..$ content: chr [1:66055] "" "The Project Gutenberg EBook of War and Peace, by Leo Tolstoy" "" "This eBook is for the use of anyone anywhere at no cost and with almost" ...
##   ..$ meta   :List of 7
##   .. ..$ author       : chr(0) 
##   .. ..$ datetimestamp: POSIXlt[1:1], format: "2017-01-21 06:17:29"
##   .. ..$ description  : chr(0) 
##   .. ..$ heading      : chr(0) 
##   .. ..$ id           : chr "war.txt"
##   .. ..$ language     : chr "eng"
##   .. ..$ origin       : chr(0) 
##   .. ..- attr(*, "class")= chr "TextDocumentMeta"
##   ..- attr(*, "class")= chr [1:2] "PlainTextDocument" "TextDocument"
##  - attr(*, "class")= chr [1:2] "VCorpus" "Corpus"
# Auto-print the corpus object (short summary only).
Corpus1
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
# Normalise the corpus text, then build a document-term matrix from it.
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# tolower() is a plain base function, not a tm transformation. It must be
# wrapped in content_transformer() so that tm_map() keeps every element a
# text document instead of replacing it with a bare character vector.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))


Corpus1 <- tm_map(Corpus1, removeWords, c("the", "and"))

#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)

Corpus1 <- tm_map(Corpus1, stripWhitespace)

# The documents are still text documents at this point, so no
# tm_map(Corpus1, PlainTextDocument) repair step is needed; that step also
# discarded the document metadata (including its id).

dtm <- DocumentTermMatrix(Corpus1)
# Show the first document against (at most) the first ten terms; the bound
# avoids a subscript error on a very small vocabulary.
inspect(dtm[1, seq_len(min(10, ncol(dtm)))])
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 15
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           ‘ ‘a ‘against ‘ah ‘albanians’ ‘alexandre
##   character(0)  12    5          1     3               1            1
##               Terms
## Docs           ‘all ‘am ‘are ‘asis’
##   character(0)      1     1      2          1
# Build the term-document matrix (terms as rows, documents as columns)
# from the same corpus.
tdm <- TermDocumentMatrix(Corpus1)
# Auto-print its dimensions, sparsity and weighting.
tdm
## <<TermDocumentMatrix (terms: 25698, documents: 1)>>
## Non-/sparse entries: 25698/0
## Sparsity           : 0%
## Maximal term length: 29
## Weighting          : term frequency (tf)
# Show the counts of terms 10 to 100 in the first document.
inspect(tdm[10:100,1])
## <<TermDocumentMatrix (terms: 91, documents: 1)>>
## Non-/sparse entries: 91/0
## Sparsity           : 0%
## Maximal term length: 18
## Weighting          : term frequency (tf)
## 
##                         Docs
## Terms                    character(0)
##   ‘asis’                        1
##   ‘at                             1
##   ‘believe                        1
##   ‘boyars’                      1
##   ‘bravo’                       1
##   ‘but                            4
##   ‘but’                         1
##   ‘chit                           1
##   ‘chosen                         1
##   ‘christ                         1
##   ‘come                           1
##   ‘consul’                      1
##   ‘corner’                      1
##   ‘count                          1
##   ‘dãlokhov                       1
##   ‘dear                           1
##   ‘dearest                        1
##   ‘devils’                      1
##   ‘did                            1
##   ‘dog’                         1
##   ‘don’t                        2
##   ‘em                             2
##   ‘em”                          1
##   ‘emperor’                     1
##   ‘everyone                       1
##   ‘extend’                      1
##   ‘faithful                       1
##   ‘fire’                        1
##   ‘fool’                        1
##   ‘for                            2
##   ‘from                           6
##   ‘general                        1
##   ‘genius’                      1
##   ‘gentlemen’                   1
##   ‘girl’                        1
##   ‘go                             4
##   ‘god                            2
##   ‘god’s                        5
##   ‘he’s                         2
##   ‘heaven                         1
##   ‘here                           1
##   ‘hey                            1
##   ‘holy                           1
##   ‘hosanna                        1
##   ‘how                            4
##   ‘hurrah’                      1
##   ‘hurrah’—a                  1
##   ‘husbands’                    1
##   ‘i                             14
##   ‘i’ll                         2
##   ‘i’m                          1
##   ‘i’ve                         1
##   ‘if                             2
##   ‘impossible’                  1
##   ‘is                             3
##   ‘it                             1
##   ‘it’                          1
##   ‘it’s                         3
##   ‘j                              1
##   ‘je                             2
##   ‘jerome                         1
##   ‘join                           1
##   ‘king’”                     1
##   ‘kuzmãch’                     1
##   ‘lectures                       1
##   ‘let                            1
##   ‘lighten                        1
##   ‘little                         2
##   ‘lord                           1
##   ‘lucky                          1
##   ‘make                           2
##   ‘mamma’                       1
##   ‘marriages                      1
##   ‘michael’                     1
##   ‘military                       1
##   ‘minister’                    1
##   ‘monsieur                       1
##   ‘moreorderers’                1
##   ‘my                             1
##   ‘napolãon                       1
##   ‘no                             1
##   ‘no’                          1
##   ‘nonsense’”                 1
##   ‘on                             2
##   ‘one                            3
##   ‘our                            1
##   ‘overresist’                  1
##   ‘papa                           1
##   ‘papa’                        1
##   ‘pardon’                      1
##   ‘perhaps’                     1
# Turn the term-document matrix into a ranked word-frequency table and plot
# the most frequent words as a word cloud.
matx1 <- as.matrix(tdm)
# First ten cells of the dense matrix (column-major order).
matx1[1:10]
##  [1] 12  5  1  3  1  1  1  1  2  1
# Total count per term across all documents, most frequent first.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
# head() does not pad with NA when fewer than ten terms exist.
head(sort1, 10)
## that  his  was with  had  not  him  her  but  for 
## 7639 7241 6725 5675 5349 4616 4429 4150 3665 3460
# Keep Word as character (not factor) on every R version.
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)
head(di, 10)
##      Word Frequency
## that that      7639
## his   his      7241
## was   was      6725
## with with      5675
## had   had      5349
## not   not      4616
## him   him      4429
## her   her      4150
## but   but      3665
## for   for      3460
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Draw at most the 100 most frequent words, coloured with the "Reds" palette.
wordcloud(
  di$Word, di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Reds")
)

#install.packages("twitteR")
library(twitteR)
#Example from http://www.rdatamining.com/examples/text-mining
#https://dev.twitter.com/
#http://geoffjentry.hexdump.org/twitteR.pdf
#https://twitter.com/apps/new
#>
# Authenticate against the Twitter API.
# Secrets must never be committed to source: they are read from environment
# variables (set them in ~/.Renviron or the shell before running the script).
# Any key/token that was previously hard-coded in this file should be treated
# as compromised and regenerated in the Twitter developer console.
twitter_keys <- Sys.getenv(c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
))
# Sys.getenv() returns "" for unset variables; report all missing ones at once.
missing_keys <- names(twitter_keys)[!nzchar(twitter_keys)]
if (length(missing_keys) > 0) {
  stop(
    "Missing Twitter credentials in environment: ",
    paste(missing_keys, collapse = ", "),
    call. = FALSE
  )
}
# Argument order: consumer key, consumer secret, access token, access secret.
setup_twitter_oauth(
  twitter_keys[["TWITTER_CONSUMER_KEY"]],
  twitter_keys[["TWITTER_CONSUMER_SECRET"]],
  twitter_keys[["TWITTER_ACCESS_TOKEN"]],
  twitter_keys[["TWITTER_ACCESS_SECRET"]]
)
## [1] "Using direct authentication"
# Download up to 100 tweets from the account's timeline. The API may return
# fewer than requested (the recorded run below produced 63 rows).
rdmTweets <- userTimeline("holydatascience", n = 100)
# Preview the first tweets; head() does not pad with NULL when fewer than
# three were returned.
head(rdmTweets, 3)
## [[1]]
## [1] "holydatascience: Is he India;s number one data scientist for 2016? Read the interview #DataScience https://t.co/HZRvbmvtNQ"
## 
## [[2]]
## [1] "holydatascience: @AndrewBuncombe suspicious as in you think they did not do any strikes, or suspicious as in the strikes were not so surgical and  civilians"
## 
## [[3]]
## [1] "holydatascience: SAS and Jupyter work well together now https://t.co/XeZKjx6RUK via   #rstats #jupyter #python #sas #interfaces #analytics #datascience"
# Stop here if nothing came back, rather than failing later with an obscure
# error when the text column is read from a NULL data frame.
stopifnot(length(rdmTweets) > 0)
# twListToDF() is twitteR's own helper for turning a list of status objects
# into a data frame with one row per tweet.
df <- twListToDF(rdmTweets)
str(df)
## 'data.frame':    63 obs. of  16 variables:
##  $ text         : chr  "Is he India;s number one data scientist for 2016? Read the interview #DataScience https://t.co/HZRvbmvtNQ" "@AndrewBuncombe suspicious as in you think they did not do any strikes, or suspicious as in the strikes were not so surgical an"| __truncated__ "SAS and Jupyter work well together now https://t.co/XeZKjx6RUK via   #rstats #jupyter #python #sas #interfaces #analytics #data"| __truncated__ "ego getting used to lithium and bipolar verdict. still as Saleem said in Slumdog Millionaire- God is Great" ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: num  2 0 2 0 1 0 0 2 1 0 ...
##  $ replyToSN    : chr  NA "AndrewBuncombe" NA NA ...
##  $ created      : POSIXct, format: "2016-09-29 16:19:58" "2016-09-29 15:58:56" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : chr  NA "781481009418231808" NA NA ...
##  $ id           : chr  "781528959414448128" "781523667368878081" "764100231029989380" "762688099407233024" ...
##  $ replyToUID   : chr  NA "105998402" NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" ...
##  $ screenName   : chr  "holydatascience" "holydatascience" "holydatascience" "holydatascience" ...
##  $ retweetCount : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ isRetweet    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : logi  NA NA NA NA NA NA ...
##  $ latitude     : logi  NA NA NA NA NA NA ...
library(tm)
# Clean the tweet text, build term matrices, plot a word cloud and query the
# vocabulary.

# Drop URLs first; otherwise punctuation removal glues them into junk terms
# such as "httpstco...".
strip_urls <- content_transformer(function(x) gsub("http[^[:space:]]*", "", x))

Corpus1 <- Corpus(VectorSource(df$text))
Corpus1 <- tm_map(Corpus1, strip_urls)
# Base tolower() must be wrapped so the corpus elements stay text documents.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
# Remove stopwords while apostrophes are still present, so that contractions
# in the stopword list (e.g. "can't") actually match.
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
Corpus1 <- tm_map(Corpus1, stemDocument)
Corpus1 <- tm_map(Corpus1, stripWhitespace)
# No tm_map(Corpus1, PlainTextDocument) repair step is needed any more: the
# documents were never degraded to bare character vectors.

dtm <- DocumentTermMatrix(Corpus1)
tdm <- TermDocumentMatrix(Corpus1)
matx1 <- as.matrix(tdm)
# Total count per term, most frequent first.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)

library(wordcloud)
wordcloud(
  di$Word, di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Set1")
)

# Terms that occur at least 10 times overall.
findFreqTerms(dtm, lowfreq = 10)
## [1] "byteacademyco" "sbcnosleepny"
# The vocabulary is stemmed, so stem the lookup term the same way before
# asking for terms correlated with it (correlation >= 0.30).
findAssocs(dtm, stemDocument("rstats"), 0.30)
## $rstats
##                now          analytics                ceo 
##               0.81               0.70               0.70 
##          computing            datasci            founder 
##               0.70               0.70               0.70 
##               full httpstcomemtqltyze  httpstcoxezkjxruk 
##               0.70               0.70               0.70 
##           idealism         interfaces   interviewrichard 
##               0.70               0.70               0.70 
##            jupyter         revolution                sas 
##               0.70               0.70               0.70 
##            schultz               sold           together 
##               0.70               0.70               0.70 
##               well               work                via 
##               0.70               0.70               0.48 
##             python 
##               0.38
                    # Sentiment scoring with the GitHub-only 'sentiment' package.
                    library(devtools)
                    # Install only when the package is missing, and use the
                    # "user/repo" form: the separate username argument is
                    # deprecated (the original call triggered that warning).
                    if (!requireNamespace("sentiment", quietly = TRUE)) {
                      install_github("okugami79/sentiment140")
                    }
                    library(sentiment)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
## 
##     id
                    # Classify each word of the frequency table, then count the
                    # words per polarity class.
                    a <- sentiment(di$Word)
                    table(a$polarity)
## 
## negative  neutral positive 
##        1      420        1
                    #realDonaldTrump
                    
                    
                    # Download up to 1000 tweets from the account's timeline.
                    # The API may return fewer than requested (the recorded
                    # run below produced 631 rows).
                    rdmTweets <- userTimeline("realDonaldTrump", n = 1000)
                    # Preview the first tweets; head() does not pad with NULL
                    # when fewer than three were returned.
                    head(rdmTweets, 3)
## [[1]]
## [1] "realDonaldTrump: THANK YOU for another wonderful evening in Washington, D.C. TOGETHER, we will MAKE AMERICA GREAT AGAIN<ed><U+00A0><U+00BC><ed><U+00B7><U+00BA><ed><U+00A0><U+00BC><ed><U+00B7><U+00B8> https://t.co/V3aoj9RUh4"
## 
## [[2]]
## [1] "realDonaldTrump: TO ALL AMERICANS<ed><U+00A0><U+00BC><ed><U+00B7><U+00BA><ed><U+00A0><U+00BC><ed><U+00B7><U+00B8>\nhttps://t.co/D7Es6ie4fY"
## 
## [[3]]
## [1] "realDonaldTrump: So to all Americans, in every city near and far, small and large, from mountain to mountain...https://t.co/cZKkrGXLSi"
                    # Stop here if nothing came back, rather than failing later
                    # with an obscure error on a NULL data frame.
                    stopifnot(length(rdmTweets) > 0)
                    # twListToDF() is twitteR's own helper for turning a list
                    # of status objects into a one-row-per-tweet data frame.
                    df <- twListToDF(rdmTweets)
                    str(df)
## 'data.frame':    631 obs. of  16 variables:
##  $ text         : chr  "THANK YOU for another wonderful evening in Washington, D.C. TOGETHER, we will MAKE AMERICA GREAT AGAIN\xed<U+00A0><U+00BC>\xed<"| __truncated__ "TO ALL AMERICANS\xed<U+00A0><U+00BC>\xed<U+00B7><U+00BA>\xed<U+00A0><U+00BC>\xed<U+00B7><U+00B8>\nhttps://t.co/D7Es6ie4fY""| __truncated__ "So to all Americans, in every city near and far, small and large, from mountain to mountain...https://t.co/cZKkrGXLSi" "It is time to remember that...https://t.co/ZKyOiOor62" ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: num  54982 65566 61960 42976 119236 ...
##  $ replyToSN    : chr  NA NA NA NA ...
##  $ created      : POSIXct, format: "2017-01-21 04:56:15" "2017-01-20 18:13:48" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : chr  NA NA NA NA ...
##  $ id           : chr  "822669114237943808" "822507434396753921" "822504142178500608" "822503558369181697" ...
##  $ replyToUID   : chr  NA NA NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
##  $ screenName   : chr  "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" ...
##  $ retweetCount : num  12721 13367 11327 7559 31458 ...
##  $ isRetweet    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : logi  NA NA NA NA NA NA ...
##  $ latitude     : logi  NA NA NA NA NA NA ...
                    library(tm)
                    # Clean the tweet text, build term matrices, plot a word
                    # cloud and list the frequent terms.

                    # Drop URLs first; otherwise punctuation removal glues them
                    # into junk terms such as "httpstco...".
                    strip_urls <- content_transformer(
                      function(x) gsub("http[^[:space:]]*", "", x)
                    )

                    Corpus1 <- Corpus(VectorSource(df$text))
                    Corpus1 <- tm_map(Corpus1, strip_urls)
                    # Base tolower() must be wrapped so the corpus elements
                    # stay text documents.
                    Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
                    # Remove stopwords while apostrophes are still present, so
                    # contractions in the stopword list (e.g. "can't") match;
                    # the recorded term list otherwise contains "cant".
                    Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
                    Corpus1 <- tm_map(Corpus1, removePunctuation)
                    Corpus1 <- tm_map(Corpus1, removeNumbers)
                    Corpus1 <- tm_map(Corpus1, stemDocument)
                    Corpus1 <- tm_map(Corpus1, stripWhitespace)
                    # No tm_map(Corpus1, PlainTextDocument) repair step is
                    # needed: the documents were never degraded to bare
                    # character vectors.

                    dtm <- DocumentTermMatrix(Corpus1)
                    tdm <- TermDocumentMatrix(Corpus1)
                    matx1 <- as.matrix(tdm)
                    # Total count per term, most frequent first.
                    sort1 <- sort(rowSums(matx1), decreasing = TRUE)
                    di <- data.frame(
                      Word = names(sort1),
                      Frequency = sort1,
                      stringsAsFactors = FALSE
                    )

                    library(wordcloud)
                    wordcloud(
                      di$Word, di$Frequency,
                      max.words = 100,
                      colors = brewer.pal(6, "Set1")
                    )

                    # Terms that occur at least 10 times overall.
                    findFreqTerms(dtm, lowfreq = 10)
##   [1] "america"        "american"       "amp"            "back"          
##   [5] "bad"            "big"            "bigleaguetruth" "bill"          
##   [9] "business"       "called"         "campaign"       "can"           
##  [13] "cant"           "carolina"       "clinton"        "cnn"           
##  [17] "country"        "crooked"        "deb"            "debate"        
##  [21] "debates"        "dishonest"      "donald"         "draintheswamp" 
##  [25] "election"       "enjoy"          "even"           "evening"       
##  [29] "far"            "first"          "florida"        "get"           
##  [33] "going"          "good"           "great"          "hillary"       
##  [37] "hillaryclinton" "icymi"          "job"            "jobs"          
##  [41] "john"           "join"           "just"           "know"          
##  [45] "last"           "let"            "like"           "live"          
##  [49] "look"           "made"           "maga"           "make"          
##  [53] "many"           "media"          "mexico"         "michigan"      
##  [57] "movement"       "much"           "must"           "never"         
##  [61] "new"            "news"           "night"          "north"         
##  [65] "nothing"        "now"            "nytimes"        "obama"         
##  [69] "obamacare"      "ohio"           "one"            "people"        
##  [73] "polls"          "president"      "repeal"         "replace"       
##  [77] "report"         "russia"         "said"           "see"           
##  [81] "state"          "states"         "support"        "tax"           
##  [85] "thank"          "tickets"        "time"           "today"         
##  [89] "together"       "tomorrow"       "tonight"        "total"         
##  [93] "totally"        "trump"          "two"            "united"        
##  [97] "vote"           "washington"     "watch"          "way"           
## [101] "will"           "win"            "women"          "wonderful"     
## [105] "world"          "wow"            "wrong"          "year"          
## [109] "years"
                    # Terms correlated (>= 0.30) with "america". The corpus is
                    # lower-cased before the matrix is built, so the lookup
                    # term must be lower-case too: 'America' is not in the
                    # vocabulary and always yields numeric(0).
                    # (The previously recorded numeric(0) output was removed;
                    # re-run to regenerate it.)
                    findAssocs(dtm, "america", 0.30)
                    # Sentiment pass over the word-frequency table: classify
                    # every entry of the Word column, then tabulate how many
                    # fall into each polarity class.
                    library(devtools)
                    #install_github('sentiment140', 'okugami79')
                    library(sentiment)
                    a <- sentiment(di[["Word"]])
                    table(a[["polarity"]])
## 
## negative  neutral 
##        5     2678