Homework 5

customWords = c("th","thi")  # additional stop words (stemmed fragments) removed on top of stop_words
country1 = 'india'
country2 = 'us'
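
The chunks below assume the following packages are installed and attached; this setup was not shown in the original output, so the list is a sketch inferred from the function calls used throughout (gridExtra, wordcloud, and purrr are attached later in the document where they are first used).

# Packages assumed by the code in this report (inferred; a setup sketch)
library(rtweet)          # create_token(), search_tweets()
library(tidytext)        # unnest_tokens(), stop_words, get_sentiments()
library(dplyr)           # %>%, count(), mutate(), anti_join(), ...
library(tidyr)           # separate()
library(SnowballC)       # wordStem()
library(ggplot2)         # ggplot(), geom_col(), labs(), ...
library(igraph)          # graph_from_data_frame()
library(ggraph)          # ggraph(), geom_edge_link(), geom_node_point(), geom_node_text()
library(RedditExtractoR) # reddit_urls(), reddit_content()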

# Method 2: Authenticate via access token
# No external Login required
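
The credential objects used below (app_name, consumer_key, consumer_secret, access_token, access_secret) are assumed to be defined earlier in the RMD. They would look roughly like this; the values are placeholders, and only the app name is taken from the token output shown below.

# Placeholder credential definitions (hypothetical values, not real keys)
app_name        <- "2019RushiElectionAnalysis"
consumer_key    <- "YOUR_CONSUMER_KEY"
consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token    <- "YOUR_ACCESS_TOKEN"
access_secret   <- "YOUR_ACCESS_SECRET"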

create_token(app = app_name,
             consumer_key = consumer_key,
             consumer_secret = consumer_secret,
             access_token = access_token, 
             access_secret = access_secret)
## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> 2019RushiElectionAnalysis
##   key:    DPGt2zSDlQQhRZzhu2r9YDGnd
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

Part 1 (50 points) Using the rtweet library, do the following (show the R code):

a) Pick two countries and search for the tweets associated with these two terms. (use a search tool, do not retrieve tweets from specific usernames)

b) Process each set of tweets into tidy text or corpus objects.

c) Use some of the pre-processing transformations described in the lecture.

#Below function searches tweets for the given term (a country name used as a hashtag) and cleans the text
getTwittesFromHandle <- function(twitterHande, topNumberOfTweets = 100, includeRetweets = FALSE){

  tweets.searched <- search_tweets(paste0("#", twitterHande), n = topNumberOfTweets, include_rts = includeRetweets)

  # Remove http elements manually
  tweets.searched$stripped_text <- gsub("http\\S+","",tweets.searched$text)
  # Remove non-ASCII characters (emoji, symbols, etc.)
  tweets.searched$stripped_text <- gsub("[^\u0020-\u007F]+","",tweets.searched$stripped_text)
  # Remove apostrophes
  tweets.searched$stripped_text <- gsub("'","",tweets.searched$stripped_text)

  return(tweets.searched)
}

#Below function tokenizes the cleaned tweet text, removes stop words and custom words, and stems each word
getWordsFromTwittes <- function(twitterText){

  tweets.clean_searched <- twitterText %>%
    select(stripped_text) %>%
    unnest_tokens(word, stripped_text) %>%
    anti_join(stop_words) %>%
    anti_join(tibble(word = customWords)) %>%
    mutate(word = wordStem(word))

  return(tweets.clean_searched)
}

country1.tweets = getTwittesFromHandle(country1)
## Searching for tweets...
## Finished collecting tweets!
country2.tweets = getTwittesFromHandle(country2)
## Searching for tweets...
## Finished collecting tweets!
country1.tweets.words = getWordsFromTwittes(country1.tweets)
## Joining, by = "word"
## Joining, by = "word"
country2.tweets.words = getWordsFromTwittes(country2.tweets)
## Joining, by = "word"
## Joining, by = "word"

d) Get a list of the most frequent terms from each country’s tweets. Compare them. Do the results make sense?

#Below function returns the most frequent words

getMostFrequentWords <- function(words, top = 10){
  frequentWords <- words %>%
                  count(word, sort = TRUE) %>%
                  top_n(top) %>% 
                  mutate(word = reorder(word, n))
  
   return (frequentWords)
}

country1.tweets.commonWords = getMostFrequentWords(country1.tweets.words)
## Selecting by n
country2.tweets.commonWords = getMostFrequentWords(country2.tweets.words)
## Selecting by n
country1.tweets.commonWords
## # A tibble: 12 x 2
##    word         n
##    <fct>    <int>
##  1 india       92
##  2 new         57
##  3 review      44
##  4 srilanka    36
##  5 japan       35
##  6 usa         32
##  7 canada      30
##  8 uk          30
##  9 africa      29
## 10 england     29
## 11 singapor    29
## 12 visit       29
country2.tweets.commonWords
## # A tibble: 15 x 2
##    word             n
##    <fct>        <int>
##  1 amp             13
##  2 usa              8
##  3 polit            7
##  4 japan            6
##  5 uk               6
##  6 war              6
##  7 world            6
##  8 808constitu      5
##  9 american         5
## 10 bakari_sel       5
## 11 china            5
## 12 iran             5
## 13 jcinca1          5
## 14 new              5
## 15 tulsigabbard     5
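
One simple way to compare the two lists (a sketch using the objects printed above) is to look at the terms they share:

# Terms that appear in both countries' top-word lists
intersect(as.character(country1.tweets.commonWords$word),
          as.character(country2.tweets.commonWords$word))

In the run shown above, both lists contain country names such as usa, uk, and japan (plus the generic term new), which is plausible, since tweets tagged with one country name often mention other countries as well.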
#Below function plots the 10 most frequent words and returns the ggplot object
getGGPlot <- function(words, handle){

  plot <- words %>%
    count(word, sort = TRUE) %>%
    top_n(10) %>%
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(x = word, y = n)) +
    geom_col() +
    coord_flip() +
    theme_classic() +
    labs(x = NULL,
         y = "Count",
         title = paste("Unique word counts for", handle))
  return(plot)
}

plot1 <- getGGPlot(country1.tweets.words, country1)
## Selecting by n
plot2 <- getGGPlot(country2.tweets.words, country2)
## Selecting by n
require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(plot1, plot2, ncol=2)

#Top-10 word counts for each country (country 1 on the left, country 2 on the right)

e) Show the word cloud for each country.

#Below function displays a word cloud for the given words (the handle argument is kept for a consistent interface)
displayFrequentWordCloud <- function(words, handle){
  tweets.counts = words %>% count(word)
  wordcloud(words = tweets.counts$word, freq = tweets.counts$n, min.freq = 2,
            max.words=200, random.order=FALSE, rot.per=0.35, 
            colors=brewer.pal(8, "Dark2"))
}


displayFrequentWordCloud(country1.tweets.words, country1)

displayFrequentWordCloud(country2.tweets.words, country2)

f) Show top word pairs (bigrams) for each country as described in the lecture.

#Below function generates paired words (bigrams) with their counts
getPairedWordsWithCounts <- function(words, handle){

  # Build bigrams from the cleaned word list
  tweets.pairs = words %>%
    select(word) %>%
    unnest_tokens(pairs, word, token = "ngrams", n = 2)

  # Split each bigram into its two words
  tweets.pairs_separate = tweets.pairs %>%
    separate(pairs, c("Word1", "Word2"), sep = " ")

  # Drop any pair that still contains a stop word
  tweets.pairs_clean <- tweets.pairs_separate %>%
    filter(!Word1 %in% stop_words$word) %>%
    filter(!Word2 %in% stop_words$word)

  # Count the remaining pairs
  tweets.pairs_counts <- tweets.pairs_clean %>%
    count(Word1, Word2, sort = TRUE)

  return(tweets.pairs_counts)
}

#Below function displays a bigram network plot for the given paired words
paintBiagramsPlot <- function(pairedWords, handle) {
  
  pairedWords %>% 
    filter(n >= 2) %>% # only keep pairs that occur at least twice
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
    geom_node_point(color = "darkslategray4", size = 3) +
    geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
    labs(title = paste("Word Network: " , handle, "Searched Tweets"),
         subtitle = "Pairs",
         x = "", y = "") +
    theme_bw()
  
}

country1.tweets.pairs_counts = getPairedWordsWithCounts(country1.tweets.words, country1)
paintBiagramsPlot(country1.tweets.pairs_counts, country1)

country2.tweets.pairs_counts = getPairedWordsWithCounts(country2.tweets.words, country2)
paintBiagramsPlot(country2.tweets.pairs_counts, country2)

g) Compute the sentiment score (as described in the lecture) for all the tweets for each country. Compare the sentiments for the two countries. Do the results make sense?

sentiment_bing = function(txt){
  #Step 1: perform basic text cleaning (on the tweet text), as seen earlier
  twt_tbl = tibble(text = txt) %>% 
    mutate(
      # Remove http elements manually
      stripped_text = gsub("http\\S+","",text)
    ) %>% 
    unnest_tokens(word,stripped_text) %>% 
    anti_join(stop_words, by="word") %>%  #remove stop words
    inner_join(get_sentiments("bing"), by="word") %>% # merge with bing sentiment
    count(word, sentiment, sort = TRUE) %>% 
    ungroup() %>% 
    ## Create a column "score": each word count n is made negative for negative words and kept positive for positive words
    mutate(
      score = case_when(
        sentiment == 'negative'~ n*(-1),
        sentiment == 'positive'~ n*1)
    )
  ## Calculate total score
  sent.score = case_when(
    nrow(twt_tbl)==0~0, # if there are no words, score is 0
    nrow(twt_tbl)>0~sum(twt_tbl$score) #otherwise, sum the positive and negatives
  )
  ## This is to keep track of which tweets contained no words at all from the bing list
  zero.type = case_when(
    nrow(twt_tbl)==0~"Type 1", # Type 1: the tweet had no bing words at all, so the zero score is a default
    nrow(twt_tbl)>0~"Type 2" # Type 2: a zero score means the positive and negative counts cancel out
  )
  list(score = sent.score, type = zero.type, twt_tbl = twt_tbl)
}

country1.tweets.sentiments = lapply(country1.tweets$text, function(x){sentiment_bing(x)})
country2.tweets.sentiments = lapply(country2.tweets$text, function(x){sentiment_bing(x)})

library(purrr)
twitter_sentiments = bind_rows(
  tibble(
    country = country1,
    score = unlist(map(country1.tweets.sentiments,'score')),
    type = unlist(map(country1.tweets.sentiments,'type'))
  ),
  tibble(
    country = country2,
    score = unlist(map(country2.tweets.sentiments,'score')),
    type = unlist(map(country2.tweets.sentiments,'type'))
  )
)

ggplot(twitter_sentiments %>% filter(type != "Type 1"),aes(x=score, fill = country)) + geom_histogram(bins = 15, alpha = .6) +
  facet_grid(~country) + theme_bw()
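
Beyond the histograms, a per-country numeric summary helps the comparison (a sketch using the twitter_sentiments tibble built above):

# Mean and median bing score per country, ignoring tweets with no sentiment words (Type 1)
twitter_sentiments %>%
  filter(type != "Type 1") %>%
  group_by(country) %>%
  summarise(tweets = n(),
            mean_score = mean(score),
            median_score = median(score))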


Part 2 (50 points) Use the RedditExtractoR library, do the following (show the R code):

a) Pick two countries and search for the Reddit comments on the Subreddit “World News” for these two countries.

#Below function returns Reddit links for the provided search term
getRedditTopicURLs <- function(searchTerm, subTopic = 'World News', limitTopURLs = 10, pageThreshold = 1){
  reddit_links = reddit_urls(
    search_terms = searchTerm,
    subreddit = subTopic,
    sort_by = 'new',
    page_threshold = pageThreshold
  )
  
  
  return (reddit_links %>% top_n(limitTopURLs))
}


country1.reddit_links = getRedditTopicURLs(country1)
## Selecting by URL
country2.reddit_links = getRedditTopicURLs(country2)
## Selecting by URL

b) Process each set of comments into tidy text or corpus objects.

#Below function returns Reddit comments for the provided Reddit links
getRedditsComments <- function(redditLinks, urlNumbers = 1, topComments = 100){

  topic_url = redditLinks$URL[urlNumbers]
  reddit_thread = reddit_content(topic_url)

  reddit_comments = reddit_thread %>% mutate(
    # Remove http elements manually
    stripped_text = gsub("http\\S+","",comment)) %>% 
    select(stripped_text)

  return(reddit_comments)
}

country1.reddit_comments = getRedditsComments(country1.reddit_links, urlNumbers = 3)
country2.reddit_comments = getRedditsComments(country2.reddit_links, urlNumbers = 6)

c) Use some of the pre-processing transformations described in the lecture.

#Below function tokenizes comments into words; for this assignment we pass the Reddit comments
getRedditsWordFromComments <- function(comments){

  reddit_words = comments %>% 
    unnest_tokens(word, stripped_text) %>%
    mutate(word = wordStem(word)) %>%
    anti_join(stop_words) %>%
    anti_join(tibble(word = customWords))

  return(reddit_words)
}


country1.words = getRedditsWordFromComments(country1.reddit_comments)
## Joining, by = "word"
## Joining, by = "word"
country2.words = getRedditsWordFromComments(country2.reddit_comments)
## Joining, by = "word"
## Joining, by = "word"

d) Get a list of the most frequent terms from each country’s Reddit “World News” comments. Compare them. Do the results make sense?

getTopCommonWords <- function(words, top = 10){
  words %>%
    count(word, sort = TRUE) %>%
    top_n(top)
}

#Get list of most frequent terms for each country
country1.commonWords = getTopCommonWords(country1.words)
## Selecting by n
country2.commonWords = getTopCommonWords(country2.words)
## Selecting by n
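
As with the tweets, one quick comparison (a sketch using the objects above) is the overlap and differences between the two top-ten lists:

# Words common to both countries' most frequent Reddit terms, and words unique to each
intersect(country1.commonWords$word, country2.commonWords$word)
setdiff(country1.commonWords$word, country2.commonWords$word)
setdiff(country2.commonWords$word, country1.commonWords$word)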

e) Show the word cloud for each country

library(wordcloud)

country1.commonWords = getTopCommonWords(country1.words, 500)
## Selecting by n
country2.commonWords = getTopCommonWords(country2.words, 500)
## Selecting by n
wordcloud(words = country1.commonWords$word, freq = country1.commonWords$n, min.freq = 2,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

wordcloud(words = country2.commonWords$word, freq = country2.commonWords$n, min.freq = 2,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

f) Compute the sentiment score (as described in the lecture) for the Reddit comments for each country. Compare the sentiments for the two countries. Do the results make sense?

# Reuse the sentiment_bing() function defined in Part 1g above.

country1.sentiments = lapply(country1.reddit_comments$stripped_text, function(x){sentiment_bing(x)})
country2.sentiments = lapply(country2.reddit_comments$stripped_text, function(x){sentiment_bing(x)})



library(purrr)
Reddit_sentiments = bind_rows(
  tibble(
    country = country1,
    score = unlist(map(country1.sentiments,'score')),
    type = unlist(map(country1.sentiments,'type'))
  ),
  tibble(
    country = country2,
    score = unlist(map(country2.sentiments,'score')),
    type = unlist(map(country2.sentiments,'type'))
  )
)

ggplot(Reddit_sentiments,aes(x=score, fill = country)) + geom_histogram(bins = 15, alpha = .6) +
  facet_grid(~country) + theme_bw()

g) Compare the sentiment results in Part 2f with the sentiment results in Part 1g. What similarities do you see? What differences? Why?

#TBD
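
A starting point for this comparison (a sketch, assuming the twitter_sentiments tibble from Part 1g and the Reddit_sentiments tibble from Part 2f are still in memory):

# Combine both sources and summarise the score distribution per source and country
all_sentiments <- bind_rows(
  twitter_sentiments %>% mutate(source = "Twitter"),
  Reddit_sentiments %>% mutate(source = "Reddit")
)

all_sentiments %>%
  filter(type != "Type 1") %>%
  group_by(source, country) %>%
  summarise(n = n(),
            mean_score = mean(score),
            sd_score = sd(score))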

Code

The RMD file contains all the code for this assignment. You can change the working directory in the first line of the code (for example, as below) and run it in RStudio.
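
For example (the path below is only a placeholder):

# First chunk of the RMD: set the working directory (replace with your own path)
setwd("~/path/to/homework5")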

Thank You