tweettextanalysis.R

###################################
#### TWEET TEXT ANALYSIS ##########
###################################

# Read the tweet export into a data frame and print a compact summary of its
# columns.
apple <- read.csv(file = "apple.csv")
str(object = apple)

## 'data.frame':    1000 obs. of  16 variables:
##  $ text         : chr  "RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB" "RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB" "Let's see this break all timers. $AAPL 156.89" "RT @SylvaCap: Things might get ugly for $aapl with the iphone delay. With $aapl down that means almost all of t"| __truncated__ ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ replyToSN    : chr  NA NA NA NA ...
##  $ created      : chr  "2017-08-01 20:31:56" "2017-08-01 20:31:55" "2017-08-01 20:31:55" "2017-08-01 20:31:55" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ id           : num  8.92e+17 8.92e+17 8.92e+17 8.92e+17 8.92e+17 ...
##  $ replyToUID   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://stocktwits.com\" rel=\"nofollow\">StockTwits Web</a>" "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>" ...
##  $ screenName   : chr  "KnowledgeMC" "Migcortina" "beckyhiu" "MarveiTheBoxer" ...
##  $ retweetCount : int  3 3 0 85 0 30 30 9 10 1 ...
##  $ isRetweet    : logi  TRUE TRUE FALSE TRUE FALSE TRUE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : logi  NA NA NA NA NA NA ...
##  $ latitude     : logi  NA NA NA NA NA NA ...

library(tm)

## Warning: package 'tm' was built under R version 4.0.5

## Loading required package: NLP

# Normalise the tweet text to UTF-8 and wrap it in a tm corpus.
#
# iconv() ignores any encoding mark on its input and, when `from` is omitted,
# assumes the session's native encoding. On a non-UTF-8 locale that re-encodes
# bytes that are already UTF-8, which is what produces the mojibake visible in
# the recorded output (the trailing ellipsis of tweet 4). Declaring the source
# encoding avoids the double conversion; `sub = ""` drops bytes that are not
# valid UTF-8 instead of turning the whole tweet into NA.
# NOTE(review): assumes apple.csv is stored as UTF-8 - confirm.
corpus <- iconv(apple$text, from = "UTF-8", to = "UTF-8", sub = "")
corpus <- Corpus(VectorSource(corpus))
# Preview the first five raw documents.
inspect(corpus[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB                              
## [2] RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB                              
## [3] Let's see this break all timers. $AAPL 156.89                                                                                                     
## [4] RT @SylvaCap: Things might get ugly for $aapl with the iphone delay. With $aapl down that means almost all of the FANG stocks were down posÃ¢â‚¬Â¦
## [5] $AAPL - wow! This was supposed to be a throw-away quarter and AAPL beats by over 500 million in revenue! Trillion dollar company by 2018!

# Lower-case every document. tolower() is a base R function, not a tm
# transformation, so it is wrapped in content_transformer() -- the same way
# this script applies its own removeURL() helper. The wrapper keeps the result
# a valid corpus even if the corpus is later built as a VCorpus, where a bare
# tolower would replace the documents with plain character strings.
corpus <- tm_map(corpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops documents

# Preview the first five documents after lower-casing.
inspect(corpus[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt @option_snipper: $aapl beat on both eps and revenues. sees 4q rev. $49b-$52b, est. $49.1b https://t.co/hfhxqj0iob                              
## [2] rt @option_snipper: $aapl beat on both eps and revenues. sees 4q rev. $49b-$52b, est. $49.1b https://t.co/hfhxqj0iob                              
## [3] let's see this break all timers. $aapl 156.89                                                                                                     
## [4] rt @sylvacap: things might get ugly for $aapl with the iphone delay. with $aapl down that means almost all of the fang stocks were down posã¢â‚¬â¦
## [5] $aapl - wow! this was supposed to be a throw-away quarter and aapl beats by over 500 million in revenue! trillion dollar company by 2018!

# Strip punctuation from every document. As the preview output shows, this
# also fuses URLs and @handles into single alphanumeric tokens
# (e.g. "httpstco..." and "optionsnipper").
corpus <- tm_map(corpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents

# Preview the first five documents after punctuation removal.
inspect(corpus[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt optionsnipper aapl beat on both eps and revenues sees 4q rev 49b52b est 491b httpstcohfhxqj0iob                                           
## [2] rt optionsnipper aapl beat on both eps and revenues sees 4q rev 49b52b est 491b httpstcohfhxqj0iob                                           
## [3] lets see this break all timers aapl 15689                                                                                                    
## [4] rt sylvacap things might get ugly for aapl with the iphone delay with aapl down that means almost all of the fang stocks were down posã¢â‚¬â¦
## [5] aapl  wow this was supposed to be a throwaway quarter and aapl beats by over 500 million in revenue trillion dollar company by 2018

# Delete all digits from every document; letters that were attached to a
# number are left behind (e.g. "4q" becomes "q" in the preview output).
corpus <- tm_map(corpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents

# Preview the first five documents after digit removal.
inspect(corpus[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt optionsnipper aapl beat on both eps and revenues sees q rev bb est b httpstcohfhxqjiob                                                    
## [2] rt optionsnipper aapl beat on both eps and revenues sees q rev bb est b httpstcohfhxqjiob                                                    
## [3] lets see this break all timers aapl                                                                                                          
## [4] rt sylvacap things might get ugly for aapl with the iphone delay with aapl down that means almost all of the fang stocks were down posã¢â‚¬â¦
## [5] aapl  wow this was supposed to be a throwaway quarter and aapl beats by over  million in revenue trillion dollar company by

# Remove tm's English stop words. The result is stored under a new name,
# cleanset, so `corpus` keeps the version that still contains stop words.
cleanset <- tm_map(corpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents

# Preview the first five documents after stop-word removal.
inspect(cleanset[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt optionsnipper aapl beat   eps  revenues sees q rev bb est b httpstcohfhxqjiob                       
## [2] rt optionsnipper aapl beat   eps  revenues sees q rev bb est b httpstcohfhxqjiob                       
## [3] lets see  break  timers aapl                                                                           
## [4] rt sylvacap things might get ugly  aapl   iphone delay  aapl   means almost    fang stocks   posã¢â‚¬â¦
## [5] aapl  wow   supposed    throwaway quarter  aapl beats    million  revenue trillion dollar company

# Delete every occurrence of "http" together with the run of alphanumeric
# characters that immediately follows it.
#
# x: character vector. Returns a character vector of the same length.
removeURL <- function(x) {
  gsub(pattern = "http[[:alnum:]]*", replacement = "", x = x)
}
# Apply removeURL() to every document. content_transformer() adapts a plain
# character-vector function so that tm_map() can use it as a transformation.
cleanset <- tm_map(cleanset, content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(cleanset, content_transformer(removeURL)):
## transformation drops documents

# Preview the first five documents after URL removal.
inspect(cleanset[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt optionsnipper aapl beat   eps  revenues sees q rev bb est b                                         
## [2] rt optionsnipper aapl beat   eps  revenues sees q rev bb est b                                         
## [3] lets see  break  timers aapl                                                                           
## [4] rt sylvacap things might get ugly  aapl   iphone delay  aapl   means almost    fang stocks   posã¢â‚¬â¦
## [5] aapl  wow   supposed    throwaway quarter  aapl beats    million  revenue trillion dollar company

# Remove the ticker symbol and company name from every document
# (presumably the search terms used to collect the tweets - confirm).
cleanset <- tm_map(cleanset, removeWords, c('aapl', 'apple'))

## Warning in tm_map.SimpleCorpus(cleanset, removeWords, c("aapl", "apple")):
## transformation drops documents

# Fold the plural "stocks" into "stock" so both count as one term. gsub() is
# a base R function, not a tm transformation, so it is wrapped in
# content_transformer() -- the same way removeURL() is applied earlier in the
# script -- which keeps the result a valid corpus for any corpus class.
# `pattern` and `replacement` are forwarded to gsub() by name; the document
# text is supplied as its `x` argument.
cleanset <- tm_map(cleanset,
                   content_transformer(gsub),
                   pattern = 'stocks',
                   replacement = 'stock')

## Warning in tm_map.SimpleCorpus(cleanset, gsub, pattern = "stocks", replacement =
## "stock"): transformation drops documents

# Reduce each word to its stem (e.g. "revenues" -> "revenu" in the preview
# output), so inflected forms are counted as one term.
cleanset <- tm_map(cleanset, stemDocument)

## Warning in tm_map.SimpleCorpus(cleanset, stemDocument): transformation drops
## documents

# Collapse the runs of blanks left behind by the earlier removals.
cleanset <- tm_map(cleanset, stripWhitespace)

## Warning in tm_map.SimpleCorpus(cleanset, stripWhitespace): transformation drops
## documents

# Preview the first five fully cleaned documents.
inspect(cleanset[1:5])

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt optionsnipp beat ep revenu see q rev bb est b                              
## [2] rt optionsnipp beat ep revenu see q rev bb est b                              
## [3] let see break timer                                                           
## [4] rt sylvacap thing might get ugli iphon delay mean almost fang stock posã¢â‚¬â¦
## [5] wow suppos throwaway quarter beat million revenu trillion dollar compani

# Count term occurrences per document (terms in rows, documents in columns)
# and convert the result to an ordinary matrix so base functions such as
# rowSums() can be used on it.
tdm <- as.matrix(TermDocumentMatrix(cleanset))
# Show the top-left corner: first 10 terms across the first 20 documents.
tdm[seq_len(10), seq_len(20)]

##              Docs
## Terms         1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##   beat        1 1 0 0 1 0 0 0 0  0  0  0  0  0  0  0  0  0  1  0
##   est         1 1 0 0 0 2 2 0 0  0  0  0  0  2  0  0  2  2  0  0
##   optionsnipp 1 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   rev         1 1 0 0 0 1 1 0 0  1  0  0  1  0  0  0  1  1  0  0
##   revenu      1 1 0 0 1 0 0 0 1  0  0  0  0  0  0  0  0  0  0  0
##   see         1 1 1 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   break       0 0 1 0 0 0 0 1 1  1  1  0  0  0  0  0  0  0  0  0
##   let         0 0 1 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   timer       0 0 1 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
##   almost      0 0 0 1 0 0 0 0 0  0  0  0  0  0  1  0  0  0  0  0

# Total occurrences of each term across all documents, restricted to terms
# seen at least 25 times, drawn as a bar chart with perpendicular axis labels.
w <- rowSums(tdm)
w <- w[w >= 25]
barplot(height = w, las = 2, col = rainbow(50))

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.0.5

## Loading required package: RColorBrewer

# Word cloud of the most frequent terms.
# Terms are ranked by their total count across all documents.
w <- sort(rowSums(tdm), decreasing = TRUE)
# Fix the RNG seed so the word placement is reproducible between runs.
set.seed(222)
# NOTE(review): in the recorded run a number of words "could not be fit on
# page" and were left out of the plot; lowering `scale` or using a larger
# graphics device would likely let them fit - confirm before changing.
wordcloud(words = names(w),
          freq = w,
          max.words = 150,
          # Spelled out as FALSE: `F` is an ordinary variable that can be
          # reassigned, FALSE is a reserved constant.
          random.order = FALSE,
          min.freq = 1,
          colors = brewer.pal(8, 'Dark2'),
          scale = c(5, 1),
          rot.per = 0.5)

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : discuss could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : premium could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : past could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : dollar could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : benzinga could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : long could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : thestreet could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : goog could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : green could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : estim could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : jimcram could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : amc could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : cnbcclosingbel could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : need could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : back could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : robinhoodapp could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : nvda could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : posit could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : analysi could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : may could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : peterlusk could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : skew could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : price could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(w), freq = w, max.words = 150, random.order =
## F, : last could not be fit on page. It will not be plotted.

library(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 4.0.5

# wordcloud2() takes a data frame of words and their frequencies, so reshape
# the named frequency vector into two columns: `word` and `freq`.
w <- data.frame(word = names(w), freq = w)
wordcloud2(
  data = w,
  size = 0.7,
  shape = "triangle",
  rotateRatio = 0.5,
  minSize = 1
)

tweettextanalysis.R

tariqm

2021-08-07