library(twitteR)
## Warning: package 'twitteR' was built under R version 3.5.1
library(ROAuth)
## Warning: package 'ROAuth' was built under R version 3.5.1
require(RCurl)
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 3.5.1
## Loading required package: bitops
# Connecting to Twitter API
# Credentials are read from environment variables (e.g. set in ~/.Renviron)
# instead of being written into the script, so secrets never end up in version
# control or shared notebooks. Any key that was ever hard-coded in this file
# should be treated as leaked and revoked in the Twitter developer console.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret <- Sys.getenv("TWITTER_API_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")
# Sys.getenv() returns "" for unset variables; stop early with a clear message
# rather than failing later inside the OAuth handshake.
if (!all(nzchar(c(api_key, api_secret, access_token, access_token_secret)))) {
  stop(
    "Twitter credentials missing: set TWITTER_API_KEY, TWITTER_API_SECRET, ",
    "TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET.",
    call. = FALSE
  )
}
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
## [1] "Using direct authentication"
## [1] "Using direct authentication"
# Extracting tweets containing 'bitcoin' keyword.
bitcoin <- searchTwitter("bitcoin", n = 1500)
#library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.1
## Loading required package: RColorBrewer
# Cleaning Data
# Flatten the list of tweets into a data frame and build a corpus with one
# document per tweet text.
df1 <- twListToDF(bitcoin)
myCorpus <- Corpus(VectorSource(df1$text))
# Strip URLs: "http" followed by any run of non-whitespace characters.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents
# Keep only letters and whitespace (digits, punctuation and symbols are dropped).
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents
myCorpus <- tm_map(myCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents
# Keep an unstemmed copy of the cleaned corpus before stemming.
myCorpusCopy <- myCorpus
myCorpus <- tm_map(myCorpus, stemDocument)
## Warning in tm_map.SimpleCorpus(myCorpus, stemDocument): transformation
## drops documents
# The stemmed corpus is used as-is. It used to be re-wrapped with
# Corpus(VectorSource(myCorpus)), but VectorSource() expects a vector of texts,
# not a corpus object; the re-wrap collapsed the tweets into 3 pseudo-documents
# (the "documents: 3" figure in the recorded TermDocumentMatrix output), so the
# matrix no longer had one column per tweet.
#Creating the term document matrix
# Count how many text elements in `corpus` contain a word starting with `word`.
#
# corpus: a list-like collection; each element is coerced with as.character()
#         and may hold one or several strings.
# word:   regular-expression fragment; it is anchored at a word start ("\\<").
# Returns a single integer, the number of strings that match.
wordFreq <- function(corpus, word) {
  pattern <- paste0("\\<", word)
  # grepl() yields one TRUE/FALSE per string, so summing counts matches.
  # (Summing grep()'s result would add up match *positions* instead, which
  # over-counts whenever an element holds more than one string.)
  hits <- lapply(corpus, function(x) grepl(pattern, as.character(x)))
  sum(unlist(hits))
}
# Build the term-document matrix from the cleaned corpus.
# wordLengths = c(1, Inf) sets the accepted term length to 1..Inf characters,
# so no term is discarded for being short.
tdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)
# Print the matrix summary (recorded output follows).
tdm
## <<TermDocumentMatrix (terms: 3687, documents: 3)>>
## Non-/sparse entries: 3690/7371
## Sparsity : 67%
## Maximal term length: 35
## Weighting : term frequency (tf)
# Creating Wordcloud
# Lower-case the text before removing stop words. removeWords() is applied to
# the text exactly as written, and the English stop word list is lower-case,
# so capitalised forms such as "The", "I" or "Its" used to survive the filter
# and then surfaced as "the", "i", "its" in the frequent-term list.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removeWords, stopwords(kind = "en"))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords(kind =
## "en")): transformation drops documents
library(wordcloud)
# Draw at most 150 words that occur at least 3 times, coloured with the
# current base graphics palette.
wordcloud(
  myCorpus,
  max.words = 150,
  min.freq = 3,
  scale = c(4, .5),
  colors = palette()
)
# Rebuild the term-document matrix now that stop words are gone; this replaces
# the matrix built before the filter.
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 3668, documents: 3)>>
## Non-/sparse entries: 3671/7333
## Sparsity : 67%
## Maximal term length: 35
## Weighting : term frequency (tf)
# Most frequent words after removing stop words
# The outer parentheses make the assignment print its value as well.
(freq.terms <- findFreqTerms(tdm, lowfreq = 50))
## [1] "airdrop" "banyak" "bitcoin" "blockchain" "bonus"
## [6] "btc" "crypto" "cryptocurr" "eth" "ethereum"
## [11] "exchang" "get" "googl" "i" "ico"
## [16] "its" "market" "new" "price" "read"
## [21] "rt" "the" "token" "trade" "will"
# Plot of most frequent words
# Total count per term = row sums of the dense matrix; keep terms seen >= 50 times.
term.freq <- rowSums(as.matrix(tdm))
term.freq <- term.freq[term.freq >= 50]
df2 <- data.frame(term = names(term.freq), freq = term.freq)
# Horizontal bar chart: one bar per term, bar length = count.
ggplot(df2, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))
# Calculating the sentiment score
# Rebuild the tweet data frame with its columns in alphabetical order and the
# creation timestamp reduced to a "YYYY-MM-DD" string.
df <- twListToDF(bitcoin)
df <- df[, order(names(df))]
df$created <- strftime(df$created, "%Y-%m-%d")
# The stack file accumulates tweets across runs. paste() joins with a space by
# default, so the file on disk is named "bitcoin _stack.csv".
if (!file.exists(paste("bitcoin", "_stack.csv"))) {
  # First run: seed the stack with the current batch.
  write.csv(df, file = paste("bitcoin", "_stack.csv"), row.names = FALSE)
}
stack <- read.csv(file = paste("bitcoin", "_stack.csv"))
# Append the current batch, then keep only the first occurrence of each text.
stack <- rbind(stack, df)
stack <- stack[!duplicated(stack$text), , drop = FALSE]
write.csv(stack, file = paste("bitcoin", "_stack.csv"), row.names = FALSE)
# Score sentences by counting opinion-lexicon hits.
#
# sentences: character vector of texts to score.
# pos.words: character vector of positive words (lower-case).
# neg.words: character vector of negative words (lower-case).
# .progress: name of a plyr progress bar ('none', 'text', ...), passed to laply().
#
# Returns a data frame with one row per sentence: `score` (number of positive
# matches minus number of negative matches) and `text` (the input sentence).
# Stops with an error if the plyr or stringr package is not installed.
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
  # requireNamespace() + pkg::fun() instead of require(): a missing package now
  # stops with a clear error instead of a later "could not find function", and
  # plyr is not attached to the search path, where it would mask dplyr verbs
  # such as mutate() and summarise().
  for (pkg in c("plyr", "stringr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("score.sentiment() needs the '", pkg, "' package.", call. = FALSE)
    }
  }

  # Score a single sentence.
  score_one <- function(sentence, pos.words, neg.words) {
    # Normalise: drop punctuation, control characters and digits, lower-case.
    sentence <- gsub('[[:punct:]]', "", sentence)
    sentence <- gsub('[[:cntrl:]]', "", sentence)
    sentence <- gsub('\\d+', "", sentence)
    sentence <- tolower(sentence)
    # Split on runs of whitespace into individual words.
    words <- unlist(stringr::str_split(sentence, '\\s+'))
    # match() gives NA for words absent from a lexicon; count the non-NA hits.
    pos.hits <- sum(!is.na(match(words, pos.words)))
    neg.hits <- sum(!is.na(match(words, neg.words)))
    pos.hits - neg.hits
  }

  scores <- plyr::laply(sentences, score_one, pos.words, neg.words,
                        .progress = .progress)
  data.frame(score = scores, text = sentences)
}
# Load the positive and negative opinion word lists as character vectors.
# comment.char = ";" makes scan() ignore everything after a ';' on a line.
pos <- scan(
  file = "D:\\DataScience\\DATA\\ExcelR\\TextMining\\TM R Data\\positive-words.txt",
  what = "character",
  comment.char = ";"
)
neg <- scan(
  file = "D:\\DataScience\\DATA\\ExcelR\\TextMining\\TM R Data\\negative-words.txt",
  what = "character",
  comment.char = ";"
)
# Work on a copy of the accumulated tweets, with the text column as a factor.
Dataset <- stack
Dataset[["text"]] <- as.factor(Dataset[["text"]])
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:twitteR':
##
## id
library(dplyr)
library(ggplot2)
library(ggmap)
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.1
# Replace one specific garbled character sequence (ending in "' ") with a
# single space before scoring.
# NOTE(review): the pattern looks like mis-encoded emoji bytes; confirm it
# still matches the data under the encoding this script is run with.
Dataset$text <- str_replace_all(Dataset$text,"í ½í²¸í ½í²°' "," ")
# Score every tweet text against the positive/negative word lists;
# .progress='text' requests a console progress bar while scoring.
scores <- score.sentiment(Dataset$text, pos, neg, .progress='text')
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|==================== | 32%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
# Show where the output files below will be written.
getwd()
## [1] "C:/Users/PRATAP/Documents"
# Save the per-tweet scores; paste() joins with a space, so the file is named
# "bitcoin _scores.csv".
write.csv(scores, file = paste("bitcoin", "_scores.csv"), row.names = TRUE)
# Start from the scores and attach each tweet's creation day as a Date.
# This relies on `scores` and `stack` having their rows in the same order.
stat <- scores
stat$created <- as.Date(stack$created)
# Label every tweet by the sign of its score.
stat$tweet <- ifelse(
  stat$score > 0,
  "positive",
  ifelse(stat$score < 0, "negative", "neutral")
)
library(bindrcpp)
## Warning: package 'bindrcpp' was built under R version 3.5.1
library(dplyr)
library(ggplot2)
library(ggmap)
# Group the labelled tweets by sentiment label and creation day.
# The summarise step below is commented out, so by.tweet still holds one row
# per tweet (grouped), not one count per label/day.
by.tweet <- stat %>% group_by(tweet, created)
#by.tweet <- summarise(by.tweet, number=n())
# Show where the output file below will be written.
getwd()
## [1] "C:/Users/PRATAP/Documents"
# paste() joins with a space, so the file is named "bitcoin _opin.csv".
write.csv(by.tweet, file = paste("bitcoin", "_opin.csv"), row.names = TRUE)
#Plot of the sentiment analysis
library(ggplot2)
library(ggmap)
#ggplot(by.tweet, aes(created, number)) + geom_line(aes(group=tweet, color=tweet), size=2) + geom_point(aes(group=tweet, color=tweet), size=4) +theme(text = element_text(size=18), axis.text.x = element_text(angle=90, vjust=1)) +ggtitle(bitcoin)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?