Name: Subhankar Pattnaik ID: 71710059

This project analyzes a set of tweets and reports the top Twitter handles involved in the conversation, the top hashtags used, the top words used, and the top bigrams (two-word sequences) that occurred.

Load the required libraries

try(require(tm) || install.packages("tm"))
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
try(require(stringr) || install.packages("stringr", dependencies = TRUE))
## Loading required package: stringr
## [1] TRUE
try(require(dplyr) || install.packages("dplyr"))
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## [1] TRUE
try(require(tidytext) || install.packages("tidytext"))
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.3.3
## [1] TRUE
try(require(tidyr) || install.packages("tidyr"))
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.3.3
## [1] TRUE
try(require(gsubfn) || install.packages("gsubfn", dependencies = TRUE))
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.3.3
## Loading required package: proto
## [1] TRUE
try(require(wordcloud) || install.packages("wordcloud"))
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
## [1] TRUE
library(tm)
library(stringr)
library(dplyr)
library(tidytext)
library(tidyr)
library(gsubfn)
library(wordcloud)
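A side note on the loading pattern above: require(pkg) || install.packages("pkg") installs a missing package but does not attach it within the same run, which is why the explicit library() calls follow. A minimal sketch of a loader that installs and attaches in one pass (same package list as above) could look like:

# install (if needed) and attach each package in one pass
load_packages <- function(pkgs) {
  for (p in pkgs) {
    if (!require(p, character.only = TRUE)) {
      install.packages(p, dependencies = TRUE)
      library(p, character.only = TRUE)
    }
  }
}
load_packages(c("tm", "stringr", "dplyr", "tidytext", "tidyr", "gsubfn", "wordcloud"))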

The function below cleans the tweets and reports insights such as the top words, hashtags, handles, and bigrams.

twitter.analysis <- function(temp.text) {
  
  # Length of Tweets
  print("Length of tweets - ")
  print(length(temp.text$text))
  cat('\n')
  x <- temp.text$text
  twitter.df <- NULL
  
  # Extract all the twitter hashtags present in the list of tweets
  twitter.hashtag <- unlist(strapplyc(x,"\\#\\w+"))   # extract twitter hashtags
  
  # Extract all the twitter handles present in the list of tweets
  twitter.handle <- unlist(strapplyc(x,"\\@\\w+"))  # extract twitter handles
  
  # function to clean tweets
  clean_Twitter_Corpus <- function(x) {
    
    x  =  gsub("\\\\x[89a-f][0-9a-f]", "", x) # remove latin encoded characters
    x  =  gsub("(ftp|http|https):\\/\\/(\\w+:{0,1}\\w*@)?(\\S+)(:[0-9]+)?(\\/|\\/([\\w#!:.?+=&%@!\\-\\/]))?", " ", x) # remove http urls
    x  =  gsub("^(b\'RT|b\'|b\"RT|b\"|via)", "", x) # remove b character
    x  =  gsub('#\\w+|@\\w+',' ',x)           # remove hastags and @ 
    x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
    x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
    x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric
    x  =  tolower(x)                          # convert to lower case characters
    x  =  removeNumbers(x)                    # removing numbers
    x  =  stripWhitespace(x)                  # removing white space
    x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  
    return(x)
  }    
  
  # clean the twitter texts. call the clean_Twitter_Corpus function
  tweets <- clean_Twitter_Corpus(x)
  
  # top 5 hashtags
  tbl.tag <- table(twitter.hashtag)
  hashtags <- tbl.tag[order(tbl.tag, decreasing = T)] %>% head(5)
  print(hashtags)
  twitter.df$top_hashtags <- row.names(hashtags)
  cat('\n')
  
  # top 5 handles
  tbl.hdl <- table(twitter.handle)
  handles <- tbl.hdl[order(tbl.hdl, decreasing = T)] %>% head(5)
  print(handles)
  twitter.df$top_handles <- row.names(handles)
  cat('\n')
  
  # store the tweets into dataframe
  textdf = data_frame(text = tweets) 
  
  # top 5 unigrams
  unigrams <- textdf %>% unnest_tokens(word, text) %>% count(word, sort = TRUE) %>% rename(count = n) %>% anti_join(stop_words) %>% head(5)
  print(unigrams)
  twitter.df$top_words <- unigrams$word
  cat('\n')
  
  # top 5 bigrams
  bigrams <- textdf %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% separate(bigram, c("word1", "word2"), sep = " ") %>% 
    filter(!word1 %in% stop_words$word) %>% filter(!word2 %in% stop_words$word) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE) %>% head(5)
  print(bigrams)
  twitter.df$top_bigrams <- bigrams$bigram
  cat('\n')
  
  # word cloud
  textdf %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words) %>%
    count(word) %>%
    with(wordcloud(word, n, max.words = 100))
  
  # data frame with columns 'top_words', 'top_bigrams', 'top_hashtags' and 'top_handles'
  twitter.df <- as.data.frame(twitter.df, stringsAsFactors=FALSE)
  twitter.df
}
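To see what the extraction and cleaning steps do, here is a quick sanity check on a made-up tweet. The sample text is invented for illustration, and since clean_Twitter_Corpus is defined inside twitter.analysis, its main steps are repeated standalone here (with a simplified URL pattern):

# made-up tweet for illustration only
sample.tweet <- "b'RT @IBMResearch: Watson &amp; #AI advance science https://t.co/abc <b>2017</b>"

# hashtag and handle extraction, as inside twitter.analysis
unlist(strapplyc(sample.tweet, "\\#\\w+"))   # "#AI"
unlist(strapplyc(sample.tweet, "\\@\\w+"))   # "@IBMResearch"

# the same cleaning chain, applied step by step
x <- gsub("(ftp|http|https):\\/\\/\\S+", " ", sample.tweet)  # drop URLs (simplified pattern)
x <- gsub("^(b\'RT|b\'|b\"RT|b\"|via)", "", x)               # drop leading byte-string marker
x <- gsub('#\\w+|@\\w+', ' ', x)                             # drop hashtags and handles
x <- gsub("<.*?>", " ", x)                                   # drop HTML tags
x <- gsub("[^[:alnum:]]", " ", x)                            # keep only alphanumerics
x <- tolower(removeNumbers(x))                               # lower-case, drop digits
x <- gsub("^\\s+|\\s+$", "", stripWhitespace(x))             # collapse and trim whitespace
x                                                            # "watson amp advance science"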

Read the input files containing the lists of tweets and pass them to the twitter.analysis function defined above to get a few insights for each topic.
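twitter.analysis expects a data frame with a text column, one tweet per row. file.choose() below opens an interactive file picker; for non-interactive runs a fixed path can be substituted (the path in this sketch is a hypothetical placeholder):

# hypothetical path; replace with the actual CSV location
tweets.df <- read.csv("path/to/tweets.csv", header = TRUE, sep = ",",
                      stringsAsFactors = FALSE, encoding = "Latin-1")
twitter.analysis(tweets.df)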

Read @IBMResearch tweets

research.tweets <- read.csv(file.choose(), header=TRUE, sep=",", stringsAsFactors=FALSE, encoding = "Latin-1")
twitter.analysis(research.tweets)
## [1] "Length of tweets - "
## [1] 2000
## 
## twitter.hashtag
## #ibmresearch          #AI   #IBMWatson #IBMResearch       #tjbot 
##          143          120          100           65           55 
## 
## twitter.handle
##         @IBM @IBMResearch   @IBMWatson @Ale_Curioni     @banavar 
##          215          203          151           40           37 
## 
## # A tibble: 5 × 2
##        word count
##       <chr> <int>
## 1       ibm   456
## 2       amp   350
## 3 cognitive   122
## 4    watson   122
## 5       lab    91
## 
## # A tibble: 5 × 2
##                bigram     n
##                 <chr> <int>
## 1        ibm research    48
## 2          ibm fellow    34
## 3 cognitive computing    27
## 4   quantum computing    24
## 5          ibm watson    21

##   top_hashtags  top_handles top_words         top_bigrams
## 1 #ibmresearch         @IBM       ibm        ibm research
## 2          #AI @IBMResearch       amp          ibm fellow
## 3   #IBMWatson   @IBMWatson cognitive cognitive computing
## 4 #IBMResearch @Ale_Curioni    watson   quantum computing
## 5       #tjbot     @banavar       lab          ibm watson

Read @IBMWatson tweets

watson.tweets <- read.csv(file.choose(), header=TRUE, sep=",", stringsAsFactors=FALSE, encoding = "Latin-1")
twitter.analysis(watson.tweets)
## [1] "Length of tweets - "
## [1] 3229
## 
## twitter.hashtag
##    #IBMWatson           #AI    #cognitive #WatsonDevCon   #ChefWatson 
##           363           222           217           112            75 
## 
## twitter.handle
##   @IBMWatson         @IBM  @IBMBluemix @davidwkenny      @coastw 
##          333          280           72           61           53 
## 
## # A tibble: 5 × 2
##        word count
##       <chr> <int>
## 1    watson  1197
## 2       amp   428
## 3 cognitive   391
## 4       ibm   337
## 5     learn   321
## 
## # A tibble: 5 × 2
##                bigram     n
##                 <chr> <int>
## 1          ibm watson   155
## 2 cognitive computing    81
## 3         watson apis    67
## 4       ginni rometty    56
## 5 watson conversation    56

##    top_hashtags  top_handles top_words         top_bigrams
## 1    #IBMWatson   @IBMWatson    watson          ibm watson
## 2           #AI         @IBM       amp cognitive computing
## 3    #cognitive  @IBMBluemix cognitive         watson apis
## 4 #WatsonDevCon @davidwkenny       ibm       ginni rometty
## 5   #ChefWatson      @coastw     learn watson conversation

From the word clouds and summaries above for the two sets of tweets, i.e. @IBMResearch and @IBMWatson, we observe that the words ibm, amp, cognitive, and watson occur very frequently (the token amp is residue of HTML-encoded ampersands, &amp;, that survive the cleaning step). The most popular handles are @IBM, @IBMWatson, and @IBMResearch, for obvious reasons. #IBMWatson and #AI appear among the top hashtags for both topics. We also observe that 'ibm watson' and 'watson' occur frequently in the @IBMResearch tweets, which suggests Watson is used in research as well. Finally, handles such as @Ale_Curioni and @banavar feature prominently in the IBM Research conversation, while @davidwkenny and @coastw do so for IBM Watson.