Name: Subhankar Pattnaik ID: 71710059
This project analyzes tweets and reports the top Twitter handles involved in the conversation, the top hashtags used, the top words used, and the top bigrams (two-word sequences) that occurred.
Load the required libraries
try(require(tm) || install.packages("tm"))
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
try(require(stringr) || install.packages("stringr", dependencies = TRUE))
## Loading required package: stringr
## [1] TRUE
try(require(dplyr) || install.packages("dplyr"))
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] TRUE
try(require(tidytext) || install.packages("tidytext"))
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.3.3
## [1] TRUE
try(require(tidyr) || install.packages("tidyr"))
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.3.3
## [1] TRUE
try(require(gsubfn) || install.packages("gsubfn", dependencies = TRUE))
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.3.3
## Loading required package: proto
## [1] TRUE
try(require(wordcloud) || install.packages("wordcloud"))
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
## [1] TRUE
library(tm)
library(stringr)
library(dplyr)
library(tidytext)
library(tidyr)
library(gsubfn)
library(wordcloud)
The function below cleans the tweets and reports some insights such as the top words, hashtags, handles and bigrams.
# Summarise a set of tweets.
#
# Prints the number of tweets and the five most frequent hashtags, handles,
# words and bigrams, draws a word cloud of the cleaned text, and returns the
# top entries as a data frame.
#
# Args:
#   temp.text: data frame (or list) with a `text` element holding one tweet
#     per entry.
#
# Returns:
#   A data frame with five rows and the character columns `top_hashtags`,
#   `top_handles`, `top_words` and `top_bigrams`. A column for which fewer
#   than five distinct entries were found is padded with NA, so the result
#   always has the same shape.
#
# Raises:
#   An error if `temp.text` has no `text` element.
twitter.analysis <- function(temp.text) {
  if (is.null(temp.text$text)) {
    stop("`temp.text` must contain a `text` column", call. = FALSE)
  }

  # Number of entries reported per category.
  top.n <- 5L

  # Pad a vector with NA up to `top.n` entries so that all result columns
  # have equal length even when a category has fewer than `top.n` values.
  pad_top <- function(values) {
    values <- as.character(values)
    length(values) <- top.n
    values
  }

  # Length of Tweets
  print("Length of tweets - ")
  print(length(temp.text$text))
  cat('\n')

  x <- temp.text$text

  # Extract all the twitter hashtags present in the list of tweets
  twitter.hashtag <- unlist(strapplyc(x, "\\#\\w+"))

  # Extract all the twitter handles present in the list of tweets
  twitter.handle <- unlist(strapplyc(x, "\\@\\w+"))

  # Function to clean tweets; returns a character vector of the same length.
  # NOTE(review): HTML entities such as "&amp;" are not decoded here, which is
  # presumably why "amp" shows up among the top words - confirm before changing.
  clean_Twitter_Corpus <- function(x) {
    x <- gsub("\\\\x[89a-f][0-9a-f]", "", x) # remove latin encoded characters
    x <- gsub("(ftp|http|https):\\/\\/(\\w+:{0,1}\\w*@)?(\\S+)(:[0-9]+)?(\\/|\\/([\\w#!:.?+=&%@!\\-\\/]))?", " ", x) # remove http urls
    x <- gsub("^(b\'RT|b\'|b\"RT|b\"|via)", "", x) # remove leading b'/b" (optionally followed by RT) or "via"
    x <- gsub('#\\w+|@\\w+', ' ', x) # remove hashtags and @handles
    x <- gsub("<.*?>", " ", x) # remove HTML tags
    x <- iconv(x, "latin1", "ASCII", sub = "") # keep only ASCII characters
    x <- gsub("[^[:alnum:]]", " ", x) # keep only alphanumeric characters
    x <- tolower(x) # convert to lower case
    x <- removeNumbers(x) # remove digits
    x <- stripWhitespace(x) # collapse runs of white space
    gsub("^\\s+|\\s+$", "", x) # remove leading and trailing white space
  }

  # Clean the tweet texts
  tweets <- clean_Twitter_Corpus(x)

  # Top hashtags by frequency
  tbl.tag <- table(twitter.hashtag)
  hashtags <- tbl.tag[order(tbl.tag, decreasing = TRUE)] %>% head(top.n)
  print(hashtags)
  cat('\n')

  # Top handles by frequency
  tbl.hdl <- table(twitter.handle)
  handles <- tbl.hdl[order(tbl.hdl, decreasing = TRUE)] %>% head(top.n)
  print(handles)
  cat('\n')

  # Store the cleaned tweets in a one-column table for tokenisation
  textdf <- data_frame(text = tweets)

  # Top unigrams, excluding stop words; `by` is explicit so the join does not
  # emit a "Joining, by = ..." message.
  unigrams <- textdf %>%
    unnest_tokens(word, text) %>%
    count(word, sort = TRUE) %>%
    rename(count = n) %>%
    anti_join(stop_words, by = "word") %>%
    head(top.n)
  print(unigrams)
  cat('\n')

  # Top bigrams in which neither word is a stop word. Missing bigrams
  # (tweets too short to form one) are dropped first so they cannot be
  # united into a literal "NA NA" entry.
  bigrams <- textdf %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    filter(!is.na(bigram)) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE) %>%
    head(top.n)
  print(bigrams)
  cat('\n')

  # Word cloud of the non-stop-word frequencies; skipped when nothing is left
  # to plot.
  word.freq <- textdf %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word") %>%
    count(word)
  if (nrow(word.freq) > 0) {
    with(word.freq, wordcloud(word, n, max.words = 100))
  }

  # Data frame with columns 'top_hashtags', 'top_handles', 'top_words' and
  # 'top_bigrams'
  data.frame(
    top_hashtags = pad_top(names(hashtags)),
    top_handles = pad_top(names(handles)),
    top_words = pad_top(unigrams$word),
    top_bigrams = pad_top(bigrams$bigram),
    stringsAsFactors = FALSE
  )
}
Read the input files containing the lists of tweets, then pass each one to the twitter.analysis function defined above to get a few insights on the topic concerned.
Read @IBMResearch tweets
# Load the @IBMResearch tweets from an interactively chosen CSV file and
# analyse them. `encoding` only marks strings when it is "latin1" or "UTF-8";
# any other spelling (such as "Latin-1") is silently ignored.
research.tweets <- read.csv(
  file.choose(),
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE,
  encoding = "latin1"
)
twitter.analysis(research.tweets)
## [1] "Length of tweets - "
## [1] 2000
##
## twitter.hashtag
## #ibmresearch #AI #IBMWatson #IBMResearch #tjbot
## 143 120 100 65 55
##
## twitter.handle
## @IBM @IBMResearch @IBMWatson @Ale_Curioni @banavar
## 215 203 151 40 37
##
## # A tibble: 5 × 2
## word count
## <chr> <int>
## 1 ibm 456
## 2 amp 350
## 3 cognitive 122
## 4 watson 122
## 5 lab 91
##
## # A tibble: 5 × 2
## bigram n
## <chr> <int>
## 1 ibm research 48
## 2 ibm fellow 34
## 3 cognitive computing 27
## 4 quantum computing 24
## 5 ibm watson 21

## top_hashtags top_handles top_words top_bigrams
## 1 #ibmresearch @IBM ibm ibm research
## 2 #AI @IBMResearch amp ibm fellow
## 3 #IBMWatson @IBMWatson cognitive cognitive computing
## 4 #IBMResearch @Ale_Curioni watson quantum computing
## 5 #tjbot @banavar lab ibm watson
Read @IBMWatson tweets
# Load the @IBMWatson tweets from an interactively chosen CSV file and
# analyse them. `encoding` only marks strings when it is "latin1" or "UTF-8";
# any other spelling (such as "Latin-1") is silently ignored.
watson.tweets <- read.csv(
  file.choose(),
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE,
  encoding = "latin1"
)
twitter.analysis(watson.tweets)
## [1] "Length of tweets - "
## [1] 3229
##
## twitter.hashtag
## #IBMWatson #AI #cognitive #WatsonDevCon #ChefWatson
## 363 222 217 112 75
##
## twitter.handle
## @IBMWatson @IBM @IBMBluemix @davidwkenny @coastw
## 333 280 72 61 53
##
## # A tibble: 5 × 2
## word count
## <chr> <int>
## 1 watson 1197
## 2 amp 428
## 3 cognitive 391
## 4 ibm 337
## 5 learn 321
##
## # A tibble: 5 × 2
## bigram n
## <chr> <int>
## 1 ibm watson 155
## 2 cognitive computing 81
## 3 watson apis 67
## 4 ginni rometty 56
## 5 watson conversation 56

## top_hashtags top_handles top_words top_bigrams
## 1 #IBMWatson @IBMWatson watson ibm watson
## 2 #AI @IBM amp cognitive computing
## 3 #cognitive @IBMBluemix cognitive watson apis
## 4 #WatsonDevCon @davidwkenny ibm ginni rometty
## 5 #ChefWatson @coastw learn watson conversation
From the above word clouds and tables for the two sets of tweets, i.e. those of IBMResearch and IBMWatson, we observe that the words ibm, amp, cognitive and watson occurred a lot. The popular handles are IBM, IBMWatson and IBMResearch, for obvious reasons. IBMWatson and AI are used as hashtags for both topics. We observe that in the Research tweets the words ‘ibm watson’ or ‘watson’ are used a lot, which suggests that Watson is used in research as well. We also noticed that handles like @Ale_Curioni & @banavar hold high positions in the IBM Research department, while @davidwkenny & @coastw do so for the IBM Watson department.