# One-time setup: install.packages("tm")

# Report the directory the session is running in.
base::getwd()
## [1] "C:/Users/Dell/Documents/R"

# Attach the text-mining package used throughout this script.
library(tm)
## Loading required package: NLP

# Names of the document readers tm ships with.
tm::getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"

# Names of the source constructors tm ships with.
tm::getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
# Leo Tolstoy, "War and Peace": build a corpus from every file found in the
# directory below (the recorded run holds a single document, war.txt).
new2 <- "C:/Users/Dell/Desktop/test"

# Declare the file encoding explicitly. The term listings recorded later in
# this script show mojibake ("â..." prefixes, "dãlokhov"), which is what
# UTF-8 text looks like when it is decoded with a single-byte native
# encoding. Reading the files as UTF-8 avoids that.
# NOTE(review): assumes the input files are UTF-8 encoded -- confirm before
# pointing this at other text.
Corpus1 <- Corpus(
  DirSource(new2, encoding = "UTF-8"),
  readerControl = list(language = "eng")
)

# The "##" output below was recorded before the encoding was set, so the
# character counts may differ slightly on a fresh run.
inspect(Corpus1)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3227440
summary(Corpus1)
## Length Class Mode
## war.txt 2 PlainTextDocument list
str(Corpus1)
## List of 1
## $ war.txt:List of 2
## ..$ content: chr [1:66055] "" "The Project Gutenberg EBook of War and Peace, by Leo Tolstoy" "" "This eBook is for the use of anyone anywhere at no cost and with almost" ...
## ..$ meta :List of 7
## .. ..$ author : chr(0)
## .. ..$ datetimestamp: POSIXlt[1:1], format: "2017-01-21 06:17:29"
## .. ..$ description : chr(0)
## .. ..$ heading : chr(0)
## .. ..$ id : chr "war.txt"
## .. ..$ language : chr "eng"
## .. ..$ origin : chr(0)
## .. ..- attr(*, "class")= chr "TextDocumentMeta"
## ..- attr(*, "class")= chr [1:2] "PlainTextDocument" "TextDocument"
## - attr(*, "class")= chr [1:2] "VCorpus" "Corpus"
Corpus1
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
# Normalise the corpus text before any term matrix is built.
#install.packages("SnowballC")
library(SnowballC)  # stemming backend used by stemDocument()

Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# tolower() is a plain base function that returns a bare character vector.
# Wrapping it in content_transformer() changes only the document content and
# keeps each element a tm text document with its metadata. This removes the
# need for the former trailing tm_map(Corpus1, PlainTextDocument) pass, which
# re-wrapped the text and discarded document ids (documents then printed as
# "character(0)" in the matrices).
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removeWords, c("the", "and"))
# The stemmer language is passed explicitly so stemming does not depend on
# the "eng" tag stored in the document metadata.
Corpus1 <- tm_map(Corpus1, stemDocument, language = "english")
Corpus1 <- tm_map(Corpus1, stripWhitespace)
# Document-term matrix: documents as rows, terms as columns.
dtm <- DocumentTermMatrix(Corpus1)
# Peek at the first ten terms of the first document.
inspect(dtm[1, seq_len(10)])
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 15
## Weighting : term frequency (tf)
##
## Terms
## Docs â âa âagainst âah âalbaniansâ âalexandre
## character(0) 12 5 1 3 1 1
## Terms
## Docs âall âam âare âasisâ
## character(0) 1 1 2 1
# Term-document matrix: terms as rows, documents as columns.
tdm <- TermDocumentMatrix(Corpus1)
# Print the matrix summary.
tdm
## <<TermDocumentMatrix (terms: 25698, documents: 1)>>
## Non-/sparse entries: 25698/0
## Sparsity : 0%
## Maximal term length: 29
## Weighting : term frequency (tf)
# Show terms 10 through 100 for the first document.
inspect(tdm[seq(10, 100), 1])
## <<TermDocumentMatrix (terms: 91, documents: 1)>>
## Non-/sparse entries: 91/0
## Sparsity : 0%
## Maximal term length: 18
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## âasisâ 1
## âat 1
## âbelieve 1
## âboyarsâ 1
## âbravoâ 1
## âbut 4
## âbutâ 1
## âchit 1
## âchosen 1
## âchrist 1
## âcome 1
## âconsulâ 1
## âcornerâ 1
## âcount 1
## âdãlokhov 1
## âdear 1
## âdearest 1
## âdevilsâ 1
## âdid 1
## âdogâ 1
## âdonât 2
## âem 2
## âemâ 1
## âemperorâ 1
## âeveryone 1
## âextendâ 1
## âfaithful 1
## âfireâ 1
## âfoolâ 1
## âfor 2
## âfrom 6
## âgeneral 1
## âgeniusâ 1
## âgentlemenâ 1
## âgirlâ 1
## âgo 4
## âgod 2
## âgodâs 5
## âheâs 2
## âheaven 1
## âhere 1
## âhey 1
## âholy 1
## âhosanna 1
## âhow 4
## âhurrahâ 1
## âhurrahââa 1
## âhusbandsâ 1
## âi 14
## âiâll 2
## âiâm 1
## âiâve 1
## âif 2
## âimpossibleâ 1
## âis 3
## âit 1
## âitâ 1
## âitâs 3
## âj 1
## âje 2
## âjerome 1
## âjoin 1
## âkingââ 1
## âkuzmãchâ 1
## âlectures 1
## âlet 1
## âlighten 1
## âlittle 2
## âlord 1
## âlucky 1
## âmake 2
## âmammaâ 1
## âmarriages 1
## âmichaelâ 1
## âmilitary 1
## âministerâ 1
## âmonsieur 1
## âmoreorderersâ 1
## âmy 1
## ânapolãon 1
## âno 1
## ânoâ 1
## ânonsenseââ 1
## âon 2
## âone 3
## âour 1
## âoverresistâ 1
## âpapa 1
## âpapaâ 1
## âpardonâ 1
## âperhapsâ 1
# Convert the term-document matrix to an ordinary base matrix.
matx1 <- as.matrix(tdm)
# First ten counts (linear indexing into the matrix).
matx1[1:10]
## [1] 12 5 1 3 1 1 1 1 2 1
# Total count of each term across documents, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
sort1[1:10]
## that his was with had not him her but for
## 7639 7241 6725 5675 5349 4616 4429 4150 3665 3460
# Word/frequency table. stringsAsFactors = FALSE keeps Word a character
# column on every R version (data.frame() converted strings to factors by
# default before R 4.0).
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)
di[1:10, ]
## Word Frequency
## that that 7639
## his his 7241
## was was 6725
## with with 5675
## had had 5349
## not not 4616
## him him 4429
## her her 4150
## but but 3665
## for for 3460
# One-time setup: install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Draw a cloud of at most 100 words, coloured with a six-step "Reds" palette.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Reds")
)

#install.packages("twitteR")
library(twitteR)
#Example from http://www.rdatamining.com/examples/text-mining
#https://dev.twitter.com/
#http://geoffjentry.hexdump.org/twitteR.pdf
#https://twitter.com/apps/new

# Authenticate against the Twitter API.
# API secrets must never live in source code: they are read from environment
# variables (e.g. set in ~/.Renviron) instead of string literals.
# SECURITY: the keys that used to be hard-coded here have been exposed and
# should be revoked and regenerated in the Twitter developer console.
twitter_env <- c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
)
twitter_creds <- Sys.getenv(twitter_env, unset = NA)
names(twitter_creds) <- twitter_env
twitter_missing <- is.na(twitter_creds) | !nzchar(twitter_creds)
if (any(twitter_missing)) {
  # Fail early with the variable names rather than with an opaque OAuth error.
  stop(
    "Missing Twitter credentials; set environment variable(s): ",
    paste(twitter_env[twitter_missing], collapse = ", "),
    call. = FALSE
  )
}
setup_twitter_oauth(
  consumer_key = twitter_creds[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_creds[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_creds[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_creds[["TWITTER_ACCESS_SECRET"]]
)
## [1] "Using direct authentication"
# Request up to 100 statuses from the "holydatascience" timeline.
rdmTweets <- userTimeline(user = "holydatascience", n = 100)
# Print the first three statuses.
rdmTweets[seq_len(3)]
## [[1]]
## [1] "holydatascience: Is he India;s number one data scientist for 2016? Read the interview #DataScience https://t.co/HZRvbmvtNQ"
##
## [[2]]
## [1] "holydatascience: @AndrewBuncombe suspicious as in you think they did not do any strikes, or suspicious as in the strikes were not so surgical and civilians"
##
## [[3]]
## [1] "holydatascience: SAS and Jupyter work well together now https://t.co/XeZKjx6RUK via #rstats #jupyter #python #sas #interfaces #analytics #datascience"
# Flatten the list of status objects into one data frame, one row per tweet.
# twListToDF() is twitteR's own converter for this: it row-binds the
# per-status data frames after checking that the list elements share a single
# twitteR class, instead of hand-rolling do.call("rbind", lapply(...)).
df <- twListToDF(rdmTweets)
str(df)
## 'data.frame': 63 obs. of 16 variables:
## $ text : chr "Is he India;s number one data scientist for 2016? Read the interview #DataScience https://t.co/HZRvbmvtNQ" "@AndrewBuncombe suspicious as in you think they did not do any strikes, or suspicious as in the strikes were not so surgical an"| __truncated__ "SAS and Jupyter work well together now https://t.co/XeZKjx6RUK via #rstats #jupyter #python #sas #interfaces #analytics #data"| __truncated__ "ego getting used to lithium and bipolar verdict. still as Saleem said in Slumdog Millionaire- God is Great" ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ favoriteCount: num 2 0 2 0 1 0 0 2 1 0 ...
## $ replyToSN : chr NA "AndrewBuncombe" NA NA ...
## $ created : POSIXct, format: "2016-09-29 16:19:58" "2016-09-29 15:58:56" ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : chr NA "781481009418231808" NA NA ...
## $ id : chr "781528959414448128" "781523667368878081" "764100231029989380" "762688099407233024" ...
## $ replyToUID : chr NA "105998402" NA NA ...
## $ statusSource : chr "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" ...
## $ screenName : chr "holydatascience" "holydatascience" "holydatascience" "holydatascience" ...
## $ retweetCount : num 0 0 0 0 0 0 0 0 0 0 ...
## $ isRetweet : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ retweeted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ longitude : logi NA NA NA NA NA NA ...
## $ latitude : logi NA NA NA NA NA NA ...
library(tm)
# Build a corpus with one document per tweet, then normalise the text.
Corpus1 <- Corpus(VectorSource(df$text))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# tolower() is a plain base function; content_transformer() applies it to the
# document content only, so every element stays a tm text document. That
# makes the former trailing tm_map(Corpus1, PlainTextDocument) repair step
# (which also discarded document ids) unnecessary.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, stemDocument)
Corpus1 <- tm_map(Corpus1, stripWhitespace)
# Term matrices in both orientations for the tweet corpus.
dtm <- DocumentTermMatrix(Corpus1)
tdm <- TermDocumentMatrix(Corpus1)
# Total count of each term across all tweets, most frequent first.
matx1 <- as.matrix(tdm)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
# stringsAsFactors = FALSE keeps Word a character column on every R version
# (data.frame() converted strings to factors by default before R 4.0).
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)
library(wordcloud)
# Cloud of at most 100 tweet terms, coloured with six "Set1" colours.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Set1")
)

# Terms that occur at least ten times in the tweet corpus.
findFreqTerms(x = dtm, lowfreq = 10)
## [1] "byteacademyco" "sbcnosleepny"
# Terms whose correlation with "rstats" is at least 0.30.
findAssocs(x = dtm, terms = 'rstats', corlimit = 0.30)
## $rstats
## now analytics ceo
## 0.81 0.70 0.70
## computing datasci founder
## 0.70 0.70 0.70
## full httpstcomemtqltyze httpstcoxezkjxruk
## 0.70 0.70 0.70
## idealism interfaces interviewrichard
## 0.70 0.70 0.70
## jupyter revolution sas
## 0.70 0.70 0.70
## schultz sold together
## 0.70 0.70 0.70
## well work via
## 0.70 0.70 0.48
## python
## 0.38
library(devtools)
# Install the sentiment140 client from GitHub only when the package (it
# installs under the name "sentiment") is not available yet, so re-running
# the script does not hit GitHub every time.
# The repository is given as "user/repo": the separate username argument is
# deprecated and produced a warning in the recorded run.
if (!requireNamespace("sentiment", quietly = TRUE)) {
  install_github("okugami79/sentiment140")
}
library(sentiment)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
# Score every term of the frequency table with the sentiment classifier.
a <- sentiment(di$Word)
# Tally how many terms fall into each polarity class.
table(a$polarity)
##
## negative neutral positive
## 1 420 1
# realDonaldTrump: request up to 1000 statuses from this timeline.
rdmTweets <- userTimeline(user = "realDonaldTrump", n = 1000)
# Print the first three statuses.
rdmTweets[seq_len(3)]
## [[1]]
## [1] "realDonaldTrump: THANK YOU for another wonderful evening in Washington, D.C. TOGETHER, we will MAKE AMERICA GREAT AGAIN<ed><U+00A0><U+00BC><ed><U+00B7><U+00BA><ed><U+00A0><U+00BC><ed><U+00B7><U+00B8> https://t.co/V3aoj9RUh4"
##
## [[2]]
## [1] "realDonaldTrump: TO ALL AMERICANS<ed><U+00A0><U+00BC><ed><U+00B7><U+00BA><ed><U+00A0><U+00BC><ed><U+00B7><U+00B8>\nhttps://t.co/D7Es6ie4fY"
##
## [[3]]
## [1] "realDonaldTrump: So to all Americans, in every city near and far, small and large, from mountain to mountain...https://t.co/cZKkrGXLSi"
# Flatten the list of status objects into one data frame, one row per tweet.
# twListToDF() is twitteR's own converter for this: it row-binds the
# per-status data frames after checking that the list elements share a single
# twitteR class, instead of hand-rolling do.call("rbind", lapply(...)).
df <- twListToDF(rdmTweets)
str(df)
## 'data.frame': 631 obs. of 16 variables:
## $ text : chr "THANK YOU for another wonderful evening in Washington, D.C. TOGETHER, we will MAKE AMERICA GREAT AGAIN\xed<U+00A0><U+00BC>\xed<"| __truncated__ "TO ALL AMERICANS\xed<U+00A0><U+00BC>\xed<U+00B7><U+00BA>\xed<U+00A0><U+00BC>\xed<U+00B7><U+00B8>\nhttps://t.co/D7Es6ie4fY""| __truncated__ "So to all Americans, in every city near and far, small and large, from mountain to mountain...https://t.co/cZKkrGXLSi" "It is time to remember that...https://t.co/ZKyOiOor62" ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ favoriteCount: num 54982 65566 61960 42976 119236 ...
## $ replyToSN : chr NA NA NA NA ...
## $ created : POSIXct, format: "2017-01-21 04:56:15" "2017-01-20 18:13:48" ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : chr NA NA NA NA ...
## $ id : chr "822669114237943808" "822507434396753921" "822504142178500608" "822503558369181697" ...
## $ replyToUID : chr NA NA NA NA ...
## $ statusSource : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
## $ screenName : chr "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" "realDonaldTrump" ...
## $ retweetCount : num 12721 13367 11327 7559 31458 ...
## $ isRetweet : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ retweeted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ longitude : logi NA NA NA NA NA NA ...
## $ latitude : logi NA NA NA NA NA NA ...
library(tm)
# Build a corpus with one document per tweet, then normalise the text.
Corpus1 <- Corpus(VectorSource(df$text))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# tolower() is a plain base function; content_transformer() applies it to the
# document content only, so every element stays a tm text document. That
# makes the former trailing tm_map(Corpus1, PlainTextDocument) repair step
# (which also discarded document ids) unnecessary.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, stemDocument)
Corpus1 <- tm_map(Corpus1, stripWhitespace)
# Term matrices in both orientations for the tweet corpus.
dtm <- DocumentTermMatrix(Corpus1)
tdm <- TermDocumentMatrix(Corpus1)
# Total count of each term across all tweets, most frequent first.
matx1 <- as.matrix(tdm)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
# stringsAsFactors = FALSE keeps Word a character column on every R version
# (data.frame() converted strings to factors by default before R 4.0).
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)
library(wordcloud)
# Cloud of at most 100 tweet terms, coloured with six "Set1" colours.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Set1")
)

# Terms that occur at least ten times in the tweet corpus.
findFreqTerms(x = dtm, lowfreq = 10)
## [1] "america" "american" "amp" "back"
## [5] "bad" "big" "bigleaguetruth" "bill"
## [9] "business" "called" "campaign" "can"
## [13] "cant" "carolina" "clinton" "cnn"
## [17] "country" "crooked" "deb" "debate"
## [21] "debates" "dishonest" "donald" "draintheswamp"
## [25] "election" "enjoy" "even" "evening"
## [29] "far" "first" "florida" "get"
## [33] "going" "good" "great" "hillary"
## [37] "hillaryclinton" "icymi" "job" "jobs"
## [41] "john" "join" "just" "know"
## [45] "last" "let" "like" "live"
## [49] "look" "made" "maga" "make"
## [53] "many" "media" "mexico" "michigan"
## [57] "movement" "much" "must" "never"
## [61] "new" "news" "night" "north"
## [65] "nothing" "now" "nytimes" "obama"
## [69] "obamacare" "ohio" "one" "people"
## [73] "polls" "president" "repeal" "replace"
## [77] "report" "russia" "said" "see"
## [81] "state" "states" "support" "tax"
## [85] "thank" "tickets" "time" "today"
## [89] "together" "tomorrow" "tonight" "total"
## [93] "totally" "trump" "two" "united"
## [97] "vote" "washington" "watch" "way"
## [101] "will" "win" "women" "wonderful"
## [105] "world" "wow" "wrong" "year"
## [109] "years"
# Terms whose correlation with "america" is at least 0.30.
# Every term was lower-cased during preprocessing (the frequent-term list
# contains "america"), so the lookup key must be lower-case as well; the
# capitalised 'America' matched no term and returned numeric(0).
findAssocs(dtm, 'america', 0.30)
library(devtools)
# Installed earlier in this script: install_github('sentiment140', 'okugami79')
library(sentiment)
# Score every term of the frequency table with the sentiment classifier.
a <- sentiment(di$Word)
# Tally how many terms fall into each polarity class.
table(a$polarity)
##
## negative neutral
## 5 2678