# text_2.R

#install.packages("tm")

# Show the current working directory. Throughout this file the lines starting
# with `##` are the console output recorded when the script was last run.
getwd()

## [1] "C:/Users/Dell/Documents/R"

library(tm)

## Loading required package: NLP

# List the document readers tm provides.
getReaders()

##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"

# List the source types a corpus can be built from; DirSource and VectorSource
# are the two used further down.
getSources()

## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"

# Leo Tolstoy, "War and Peace" (Project Gutenberg text): build a corpus from
# every file found in the folder.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
new2 <- "C:/Users/Dell/Desktop/test"
# State the file encoding explicitly. Without it the UTF-8 text is decoded with
# the platform default, which is presumably what turns the byte-order mark and
# the curly quotes into the "ï»¿" / "â..." debris seen in the recorded output.
Corpus1 <- Corpus(
  DirSource(new2, encoding = "UTF-8"),
  readerControl = list(language = "eng")
)
inspect(Corpus1)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 3227440

# One summary row per document in the corpus.
summary(Corpus1)

##         Length Class             Mode
## war.txt 2      PlainTextDocument list

# Dump the internal structure: the text lines plus the per-document metadata.
str(Corpus1)

## List of 1
##  $ war.txt:List of 2
##   ..$ content: chr [1:66055] "ï»¿" "The Project Gutenberg EBook of War and Peace, by Leo Tolstoy" "" "This eBook is for the use of anyone anywhere at no cost and with almost" ...
##   ..$ meta   :List of 7
##   .. ..$ author       : chr(0) 
##   .. ..$ datetimestamp: POSIXlt[1:1], format: "2017-01-21 06:17:29"
##   .. ..$ description  : chr(0) 
##   .. ..$ heading      : chr(0) 
##   .. ..$ id           : chr "war.txt"
##   .. ..$ language     : chr "eng"
##   .. ..$ origin       : chr(0) 
##   .. ..- attr(*, "class")= chr "TextDocumentMeta"
##   ..- attr(*, "class")= chr [1:2] "PlainTextDocument" "TextDocument"
##  - attr(*, "class")= chr [1:2] "VCorpus" "Corpus"

# Auto-print the corpus for a brief overview (document and metadata counts).
Corpus1

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1

# Normalise the text: drop punctuation and digits, lower-case, remove two very
# common words, stem, and collapse runs of whitespace.
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# tolower() is a base function, not a tm transformation. Passed to tm_map()
# directly it returns bare character vectors and strips the document wrapper,
# so later steps (stemming, matrix construction) no longer see text documents.
# content_transformer() applies it to the content only and keeps the wrapper.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))

Corpus1 <- tm_map(Corpus1, removeWords, c("the", "and"))

#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)

Corpus1 <- tm_map(Corpus1, stripWhitespace)

# The former tm_map(Corpus1, PlainTextDocument) repair step is gone: it was only
# needed to re-wrap the character vectors left behind by the bare tolower(), and
# it presumably is what reset the document id to "character(0)" in the recorded
# output. That output predates this fix, so terms and labels will differ on a
# fresh run.
dtm <- DocumentTermMatrix(Corpus1)
inspect(dtm[1, 1:10])

## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 15
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           â âa âagainst âah âalbaniansâ âalexandre
##   character(0)  12    5          1     3               1            1
##               Terms
## Docs           âall âam âare âasisâ
##   character(0)      1     1      2          1

# Term-document matrix: terms as rows, documents as columns.
tdm <- TermDocumentMatrix(Corpus1)
tdm

## <<TermDocumentMatrix (terms: 25698, documents: 1)>>
## Non-/sparse entries: 25698/0
## Sparsity           : 0%
## Maximal term length: 29
## Weighting          : term frequency (tf)

# Look at the counts for terms 10 through 100 in the first document.
inspect(tdm[10:100,1])

## <<TermDocumentMatrix (terms: 91, documents: 1)>>
## Non-/sparse entries: 91/0
## Sparsity           : 0%
## Maximal term length: 18
## Weighting          : term frequency (tf)
## 
##                         Docs
## Terms                    character(0)
##   âasisâ                        1
##   âat                             1
##   âbelieve                        1
##   âboyarsâ                      1
##   âbravoâ                       1
##   âbut                            4
##   âbutâ                         1
##   âchit                           1
##   âchosen                         1
##   âchrist                         1
##   âcome                           1
##   âconsulâ                      1
##   âcornerâ                      1
##   âcount                          1
##   âdãlokhov                       1
##   âdear                           1
##   âdearest                        1
##   âdevilsâ                      1
##   âdid                            1
##   âdogâ                         1
##   âdonât                        2
##   âem                             2
##   âemâ                          1
##   âemperorâ                     1
##   âeveryone                       1
##   âextendâ                      1
##   âfaithful                       1
##   âfireâ                        1
##   âfoolâ                        1
##   âfor                            2
##   âfrom                           6
##   âgeneral                        1
##   âgeniusâ                      1
##   âgentlemenâ                   1
##   âgirlâ                        1
##   âgo                             4
##   âgod                            2
##   âgodâs                        5
##   âheâs                         2
##   âheaven                         1
##   âhere                           1
##   âhey                            1
##   âholy                           1
##   âhosanna                        1
##   âhow                            4
##   âhurrahâ                      1
##   âhurrahââa                  1
##   âhusbandsâ                    1
##   âi                             14
##   âiâll                         2
##   âiâm                          1
##   âiâve                         1
##   âif                             2
##   âimpossibleâ                  1
##   âis                             3
##   âit                             1
##   âitâ                          1
##   âitâs                         3
##   âj                              1
##   âje                             2
##   âjerome                         1
##   âjoin                           1
##   âkingââ                     1
##   âkuzmãchâ                     1
##   âlectures                       1
##   âlet                            1
##   âlighten                        1
##   âlittle                         2
##   âlord                           1
##   âlucky                          1
##   âmake                           2
##   âmammaâ                       1
##   âmarriages                      1
##   âmichaelâ                     1
##   âmilitary                       1
##   âministerâ                    1
##   âmonsieur                       1
##   âmoreorderersâ                1
##   âmy                             1
##   ânapolãon                       1
##   âno                             1
##   ânoâ                          1
##   ânonsenseââ                 1
##   âon                             2
##   âone                            3
##   âour                            1
##   âoverresistâ                  1
##   âpapa                           1
##   âpapaâ                        1
##   âpardonâ                      1
##   âperhapsâ                     1

# Densify the term-document matrix so ordinary matrix functions apply.
matx1 <- as.matrix(tdm)
matx1[1:10]

##  [1] 12  5  1  3  1  1  1  1  2  1

# Total count per term across all documents, most frequent first.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
sort1[1:10]

## that  his  was with  had  not  him  her  but  for 
## 7639 7241 6725 5675 5349 4616 4429 4150 3665 3460

# Two-column lookup table (term, count) used for the word cloud below.
di <- data.frame(Word = names(sort1), Frequency = sort1)
di[1:10, ]

##      Word Frequency
## that that      7639
## his   his      7241
## was   was      6725
## with with      5675
## had   had      5349
## not   not      4616
## him   him      4429
## her   her      4150
## but   but      3665
## for   for      3460

#install.packages("wordcloud")
library(wordcloud)

## Loading required package: RColorBrewer

# Draw at most the 100 most frequent terms, coloured with a six-step red palette.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(n = 6, name = "Reds")
)

#install.packages("twitteR")
library(twitteR)
#Example from http://www.rdatamining.com/examples/text-mining
#https://dev.twitter.com/
#http://geoffjentry.hexdump.org/twitteR.pdf
#https://twitter.com/apps/new
#>
# Authenticate against the Twitter API.
# Credentials are read from environment variables instead of being written into
# the script, so they do not end up in version control or in rendered reports.
# Define TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and
# TWITTER_ACCESS_SECRET (for example in ~/.Renviron) before running.
# NOTE(review): the keys that used to be hard-coded here have been exposed and
# should be revoked and regenerated.
twitter_credentials <- Sys.getenv(c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
))
missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
if (length(missing_credentials) > 0) {
  # Fail early with the variable names rather than with an opaque OAuth error.
  stop(
    "Missing Twitter credentials in environment: ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE
  )
}
setup_twitter_oauth(
  twitter_credentials[["TWITTER_CONSUMER_KEY"]],
  twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
  twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
  twitter_credentials[["TWITTER_ACCESS_SECRET"]]
)

## [1] "Using direct authentication"

# Request up to 100 tweets from the account's timeline, then show the first
# three. Fewer than n can come back: str(df) further down reports 63 rows.
rdmTweets <- userTimeline(user = "holydatascience", n = 100)
rdmTweets[seq_len(3)]

## [[1]]
## [1] "holydatascience: Is he India;s number one data scientist for 2016? Read the interview #DataScience https://t.co/HZRvbmvtNQ"
## 
## [[2]]
## [1] "holydatascience: @AndrewBuncombe suspicious as in you think they did not do any strikes, or suspicious as in the strikes were not so surgical and  civilians"
## 
## [[3]]
## [1] "holydatascience: SAS and Jupyter work well together now https://t.co/XeZKjx6RUK via   #rstats #jupyter #python #sas #interfaces #analytics #datascience"

# Convert each status object to a one-row data frame and stack the rows.
df <- do.call(
  what = "rbind",
  args = lapply(X = rdmTweets, FUN = as.data.frame)
)
str(df)

## 'data.frame':    63 obs. of  16 variables:
##  $ text         : chr  "Is he India;s number one data scientist for 2016? Read the interview #DataScience https://t.co/HZRvbmvtNQ" "@AndrewBuncombe suspicious as in you think they did not do any strikes, or suspicious as in the strikes were not so surgical an"| __truncated__ "SAS and Jupyter work well together now https://t.co/XeZKjx6RUK via   #rstats #jupyter #python #sas #interfaces #analytics #data"| __truncated__ "ego getting used to lithium and bipolar verdict. still as Saleem said in Slumdog Millionaire- God is Great" ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: num  2 0 2 0 1 0 0 2 1 0 ...
##  $ replyToSN    : chr  NA "AndrewBuncombe" NA NA ...
##  $ created      : POSIXct, format: "2016-09-29 16:19:58" "2016-09-29 15:58:56" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : chr  NA "781481009418231808" NA NA ...
##  $ id           : chr  "781528959414448128" "781523667368878081" "764100231029989380" "762688099407233024" ...
##  $ replyToUID   : chr  NA "105998402" NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" ...
##  $ screenName   : chr  "holydatascience" "holydatascience" "holydatascience" "holydatascience" ...
##  $ retweetCount : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ isRetweet    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : logi  NA NA NA NA NA NA ...
##  $ latitude     : logi  NA NA NA NA NA NA ...

library(tm)
# Build one document per tweet and normalise the text.
Corpus1 <- Corpus(VectorSource(df$text))
# Lower-case through content_transformer(): a bare tolower() returns plain
# character vectors and strips the document wrapper, which is what the old
# tm_map(Corpus1, PlainTextDocument) repair step was compensating for.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
# Drop links while they are still recognisable; once punctuation is removed
# they collapse into junk terms such as "httpstcoxezkjxruk".
Corpus1 <- tm_map(
  Corpus1,
  content_transformer(function(x) gsub("http[^[:space:]]*", "", x))
)
# Remove stop words before punctuation so that contractions in the stop-word
# list (e.g. "can't") still match.
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
Corpus1 <- tm_map(Corpus1, stemDocument)
Corpus1 <- tm_map(Corpus1, stripWhitespace)

dtm <- DocumentTermMatrix(Corpus1)
tdm <- TermDocumentMatrix(Corpus1)
# Total count per term, most frequent first, as a (Word, Frequency) table.
matx1 <- as.matrix(tdm)
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
di <- data.frame(Word = names(sort1), Frequency = sort1)

library(wordcloud)
wordcloud(
  di$Word, di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Set1")
)

# Terms that occur at least 10 times. The recorded output below predates the
# pipeline fixes above, so a fresh run will list stemmed terms instead.
findFreqTerms(dtm, lowfreq = 10)

## [1] "byteacademyco" "sbcnosleepny"

# The matrix now holds stemmed terms, so stem the query the same way.
findAssocs(dtm, stemDocument("rstats"), 0.30)

## $rstats
##                now          analytics                ceo 
##               0.81               0.70               0.70 
##          computing            datasci            founder 
##               0.70               0.70               0.70 
##               full httpstcomemtqltyze  httpstcoxezkjxruk 
##               0.70               0.70               0.70 
##           idealism         interfaces   interviewrichard 
##               0.70               0.70               0.70 
##            jupyter         revolution                sas 
##               0.70               0.70               0.70 
##            schultz               sold           together 
##               0.70               0.70               0.70 
##               well               work                via 
##               0.70               0.70               0.48 
##             python 
##               0.38

                    library(devtools)
                    # install_github() takes a single "user/repo" string; passing the
                    # username as a second argument is deprecated (see the warning
                    # recorded below). The repository installs a package named
                    # "sentiment", so skip the network round-trip when it is
                    # already available. Re-run install_github() by hand to update.
                    if (!requireNamespace("sentiment", quietly = TRUE)) {
                      install_github("okugami79/sentiment140")
                    }

## Warning: Username parameter is deprecated. Please use okugami79/
## sentiment140

## Skipping install of 'sentiment' from a github remote, the SHA1 (75be56d6) has not changed since last install.
##   Use `force = TRUE` to force installation

                    library(sentiment)

## Loading required package: RCurl

## Loading required package: bitops

## Loading required package: rjson

## Loading required package: plyr

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:twitteR':
## 
##     id

                    # Score each term of the frequency table, then count how many
                    # fall into each polarity class.
                    a <- sentiment(di$Word)
                    table(a[["polarity"]])

## 
## negative  neutral positive 
##        1      420        1

                    #realDonaldTrump
                    
                    
                    # Request up to 1000 tweets from the account's timeline, then show
                    # the first three. Fewer than n can come back: str(df) further
                    # down reports 631 rows.
                    rdmTweets <- userTimeline(user = "realDonaldTrump", n = 1000)
                    rdmTweets[seq_len(3)]

## [[1]]
## [1] "realDonaldTrump: THANK YOU for another wonderful evening in Washington, D.C. TOGETHER, we will MAKE AMERICA GREAT AGAIN<ed><U+00A0><U+00BC><ed><U+00B7><U+00BA><ed><U+00A0><U+00BC><ed><U+00B7><U+00B8> https://t.co/V3aoj9RUh4"
## 
## [[2]]
## [1] "realDonaldTrump: TO ALL AMERICANS<ed><U+00A0><U+00BC><ed><U+00B7><U+00BA><ed><U+00A0><U+00BC><ed><U+00B7><U+00B8>\nhttps://t.co/D7Es6ie4fY"
## 
## [[3]]
## [1] "realDonaldTrump: So to all Americans, in every city near and far, small and large, from mountain to mountain...https://t.co/cZKkrGXLSi"

                    # Convert each status object to a one-row data frame and stack
                    # the rows.
                    df <- do.call(
                      what = "rbind",
                      args = lapply(X = rdmTweets, FUN = as.data.frame)
                    )
                    str(df)

## 'data.frame':    631 obs. of  16 variables:
##  $ text         : chr  "THANK YOU for another wonderful evening in Washington, D.C. TOGETHER, we will MAKE AMERICA GREAT AGAIN\xed<U+00A0><U+00BC>\xed<"| __truncated__ "TO ALL AMERICANS\xed<U+00A0><U+00BC>\xed<U+00B7><U+00BA>\xed<U+00A0><U+00BC>\xed<U+00B7><U+00B8>\nhttps://t.co/D7Es6ie4fY""| __truncated__ "So to all Americans, in every city near and far, small and large, from mountain to mountain...https://t.co/cZKkrGXLSi" "It is time to remember that...https://t.co/ZKyOiOor62" ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: num  54982 65566 61960 42976 119236 ...
##  $ replyToSN    : chr  NA NA NA NA ...
##  $ created      : POSIXct, format: "2017-01-21 04:56:15" "2017-01-20 18:13:48" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : chr  NA NA NA NA ...
##  $ id           : chr  "822669114237943808" "822507434396753921" "822504142178500608" "822503558369181697" ...
##  $ replyToUID   : chr  NA NA NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
##  $ screenName   : chr  "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" ...
##  $ retweetCount : num  12721 13367 11327 7559 31458 ...
##  $ isRetweet    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : logi  NA NA NA NA NA NA ...
##  $ latitude     : logi  NA NA NA NA NA NA ...

                    library(tm)
                    # Build one document per tweet and normalise the text.
                    Corpus1 <- Corpus(VectorSource(df$text))
                    # Lower-case through content_transformer(): a bare tolower()
                    # returns plain character vectors and strips the document
                    # wrapper, which is what the old
                    # tm_map(Corpus1, PlainTextDocument) repair step compensated for.
                    Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
                    # Drop links while they are still recognisable; once punctuation
                    # is removed they collapse into junk "httpstco..." terms.
                    Corpus1 <- tm_map(
                      Corpus1,
                      content_transformer(function(x) gsub("http[^[:space:]]*", "", x))
                    )
                    # Remove stop words before punctuation so that contractions in
                    # the stop-word list still match ("cant" survives in the
                    # recorded output below).
                    Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
                    Corpus1 <- tm_map(Corpus1, removePunctuation)
                    Corpus1 <- tm_map(Corpus1, removeNumbers)
                    Corpus1 <- tm_map(Corpus1, stemDocument)
                    Corpus1 <- tm_map(Corpus1, stripWhitespace)

                    dtm <- DocumentTermMatrix(Corpus1)
                    tdm <- TermDocumentMatrix(Corpus1)
                    # Total count per term, most frequent first, as a
                    # (Word, Frequency) table.
                    matx1 <- as.matrix(tdm)
                    sort1 <- sort(rowSums(matx1), decreasing = TRUE)
                    di <- data.frame(Word = names(sort1), Frequency = sort1)

                    library(wordcloud)
                    wordcloud(
                      di$Word, di$Frequency,
                      max.words = 100,
                      colors = brewer.pal(6, "Set1")
                    )

                    # Terms that occur at least 10 times. The recorded output below
                    # predates the pipeline fixes above, so a fresh run will list
                    # stemmed terms instead.
                    findFreqTerms(dtm, lowfreq = 10)

##   [1] "america"        "american"       "amp"            "back"          
##   [5] "bad"            "big"            "bigleaguetruth" "bill"          
##   [9] "business"       "called"         "campaign"       "can"           
##  [13] "cant"           "carolina"       "clinton"        "cnn"           
##  [17] "country"        "crooked"        "deb"            "debate"        
##  [21] "debates"        "dishonest"      "donald"         "draintheswamp" 
##  [25] "election"       "enjoy"          "even"           "evening"       
##  [29] "far"            "first"          "florida"        "get"           
##  [33] "going"          "good"           "great"          "hillary"       
##  [37] "hillaryclinton" "icymi"          "job"            "jobs"          
##  [41] "john"           "join"           "just"           "know"          
##  [45] "last"           "let"            "like"           "live"          
##  [49] "look"           "made"           "maga"           "make"          
##  [53] "many"           "media"          "mexico"         "michigan"      
##  [57] "movement"       "much"           "must"           "never"         
##  [61] "new"            "news"           "night"          "north"         
##  [65] "nothing"        "now"            "nytimes"        "obama"         
##  [69] "obamacare"      "ohio"           "one"            "people"        
##  [73] "polls"          "president"      "repeal"         "replace"       
##  [77] "report"         "russia"         "said"           "see"           
##  [81] "state"          "states"         "support"        "tax"           
##  [85] "thank"          "tickets"        "time"           "today"         
##  [89] "together"       "tomorrow"       "tonight"        "total"         
##  [93] "totally"        "trump"          "two"            "united"        
##  [97] "vote"           "washington"     "watch"          "way"           
## [101] "will"           "win"            "women"          "wonderful"     
## [105] "world"          "wow"            "wrong"          "year"          
## [109] "years"

                    # Terms in dtm are lower-cased by the cleaning pipeline, so the
                    # query must be lower-case too. The capitalised 'America'
                    # matched nothing, which is the numeric(0) recorded below.
                    findAssocs(dtm, 'america', 0.30)

## $America
## numeric(0)

                    library(devtools)
                    #install_github('sentiment140', 'okugami79')
                    library(sentiment)
                    # Score each term of the frequency table, then count how many
                    # fall into each polarity class.
                    a <- sentiment(di$Word)
                    table(a[["polarity"]])

## 
## negative  neutral 
##        5     2678

# text_2.R

# Dell

# Sat Jan 21 11:47:28 2017