# Custom word fragments (stemming artifacts) to filter out along with stop words
customWords = c("th","thi")
country1 = 'india'
country2 = 'us'
# Method 2: Authenticate via access token
# (no interactive browser login required)
create_token(app = app_name,
             consumer_key = consumer_key,
             consumer_secret = consumer_secret,
             access_token = access_token,
             access_secret = access_secret)
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> 2019RushiElectionAnalysis
## key: DPGt2zSDlQQhRZzhu2r9YDGnd
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
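If the token was created successfully, the cached credentials can be inspected as a quick sanity check (get_token() is part of rtweet):
# Confirm that rtweet picked up the cached token
get_token()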
# Below function searches recent tweets for the given term as a hashtag
# and strips URLs, non-ASCII characters, and apostrophes from the text
getTweetsFromHandle <- function(twitterHandle, topNumberOfTweets = 100, includeRetweets = FALSE){
  tweets.searched <- search_tweets(paste0("#", twitterHandle), n = topNumberOfTweets, include_rts = includeRetweets)
  # Remove http elements manually
  tweets.searched$stripped_text <- gsub("http\\S+", "", tweets.searched$text)
  # Remove non-ASCII characters (emoji, smart quotes, etc.)
  tweets.searched$stripped_text <- gsub("[^\u0020-\u007F]+", "", tweets.searched$stripped_text)
  # Remove apostrophes
  tweets.searched$stripped_text <- gsub("'", "", tweets.searched$stripped_text)
  return(tweets.searched)
}
getWordsFromTweets <- function(twitterText){
  ## Use tidytext to tokenize the cleaned text into words,
  ## remove stop words and custom fragments, then stem each word
  tweets.clean_searched <- twitterText %>%
    select(stripped_text) %>%
    unnest_tokens(word, stripped_text) %>%
    anti_join(stop_words) %>%
    anti_join(tibble(word = customWords)) %>%
    mutate(word = wordStem(word))
  return(tweets.clean_searched)
}
country1.tweets = getTweetsFromHandle(country1)
## Searching for tweets...
## Finished collecting tweets!
country2.tweets = getTweetsFromHandle(country2)
## Searching for tweets...
## Finished collecting tweets!
country1.tweets.words = getWordsFromTweets(country1.tweets)
## Joining, by = "word"
## Joining, by = "word"
country2.tweets.words = getWordsFromTweets(country2.tweets)
## Joining, by = "word"
## Joining, by = "word"
# Below function returns the most frequent words
getMostFrequentWords <- function(words, top = 10){
  frequentWords <- words %>%
    count(word, sort = TRUE) %>%
    top_n(top) %>%
    mutate(word = reorder(word, n))
  return(frequentWords)
}
country1.tweets.commonWords = getMostFrequentWords(country1.tweets.words)
## Selecting by n
country2.tweets.commonWords = getMostFrequentWords(country2.tweets.words)
## Selecting by n
country1.tweets.commonWords
## # A tibble: 12 x 2
## word n
## <fct> <int>
## 1 india 92
## 2 new 57
## 3 review 44
## 4 srilanka 36
## 5 japan 35
## 6 usa 32
## 7 canada 30
## 8 uk 30
## 9 africa 29
## 10 england 29
## 11 singapor 29
## 12 visit 29
country2.tweets.commonWords
## # A tibble: 15 x 2
## word n
## <fct> <int>
## 1 amp 13
## 2 usa 8
## 3 polit 7
## 4 japan 6
## 5 uk 6
## 6 war 6
## 7 world 6
## 8 808constitu 5
## 9 american 5
## 10 bakari_sel 5
## 11 china 5
## 12 iran 5
## 13 jcinca1 5
## 14 new 5
## 15 tulsigabbard 5
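Note that "amp" tops the second list: the Twitter API HTML-encodes ampersands as "&amp;", and the entity fragment survives tokenization. One option is to add it to the custom list before re-running the pipeline:
# Treat the HTML-entity fragment "amp" as a custom stop word
customWords = c(customWords, "amp")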
# Below function plots the frequent words and returns the ggplot object
getGGPlot <- function(words, handle){
  plot <- words %>%
    count(word, sort = TRUE) %>%
    top_n(10) %>%
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(x = word, y = n)) +
    geom_col() +
    coord_flip() +
    theme_classic() +
    # After coord_flip(), the x aesthetic (the words) is drawn on the vertical axis
    labs(x = "Unique words",
         y = "Count",
         title = paste("Unique word counts for", handle))
  return(plot)
}
plot1 <- getGGPlot(country1.tweets.words, country1)
## Selecting by n
plot2 <- getGGPlot(country2.tweets.words, country2)
## Selecting by n
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(plot1, plot2, ncol=2)
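grid.arrange() only draws the combined figure. To also save it to disk, arrangeGrob() returns the combined grob without drawing it, and ggsave() accepts grobs (a small optional snippet; the file name is a placeholder):
# Save the side-by-side figure; "frequent_words.png" is a placeholder name
ggsave("frequent_words.png", arrangeGrob(plot1, plot2, ncol = 2), width = 10, height = 5)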
# Below function displays a word cloud of the frequent words
displayFrequentWordCloud <- function(words, handle){
  tweets.counts = words %>% count(word)
  # Pass the unique words with their counts (the raw word vector's length
  # would not match the frequency vector)
  wordcloud(words = tweets.counts$word, freq = tweets.counts$n, min.freq = 2,
            max.words = 200, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}
displayFrequentWordCloud(country1.tweets.words, country1)
displayFrequentWordCloud(country2.tweets.words, country2)
# Below function generates paired words (bigrams) with their counts
getPairedWordsWithCounts <- function(words, handle){
  # Rebuild bigrams (consecutive word pairs) from the cleaned word stream
  tweets.pairs = words %>%
    select(word) %>%
    unnest_tokens(pairs, word, token = "ngrams", n = 2)
  # Split each bigram, drop pairs containing stop words, then count the pairs
  tweets.pairs_counts <- tweets.pairs %>%
    separate(pairs, c("Word1", "Word2"), sep = " ") %>%
    filter(!Word1 %in% stop_words$word) %>%
    filter(!Word2 %in% stop_words$word) %>%
    count(Word1, Word2, sort = TRUE)
  return(tweets.pairs_counts)
}
# Below function displays a bigram network plot for the given word pairs
paintBigramsPlot <- function(pairedWords, handle) {
  pairedWords %>%
    filter(n >= 2) %>% # only keep pairs that occur at least twice
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
    geom_node_point(color = "darkslategray4", size = 3) +
    geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
    labs(title = paste("Word Network:", handle, "Searched Tweets"),
         subtitle = "Pairs",
         x = "", y = "") +
    theme_bw()
}
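The network plot relies on graph_from_data_frame() from igraph and on the ggraph package; if they were not loaded earlier in the Rmd, load them first:
library(igraph)  # provides graph_from_data_frame()
library(ggraph)  # grammar-of-graphics network plots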
country1.tweets.pairs_counts = getPairedWordsWithCounts(country1.tweets.words, country1)
paintBigramsPlot(country1.tweets.pairs_counts, country1)
country2.tweets.pairs_counts = getPairedWordsWithCounts(country2.tweets.words, country2)
paintBigramsPlot(country2.tweets.pairs_counts, country2)
# Below function computes a bing-lexicon sentiment score for one tweet
sentiment_bing = function(txt){
  # Step 1: perform basic text cleaning (on the tweet), as seen earlier
  twt_tbl = tibble(text = txt) %>%
    mutate(
      # Remove http elements manually
      stripped_text = gsub("http\\S+", "", text)
    ) %>%
    unnest_tokens(word, stripped_text) %>%
    anti_join(stop_words, by = "word") %>%              # remove stop words
    inner_join(get_sentiments("bing"), by = "word") %>% # merge with bing sentiment
    count(word, sentiment, sort = TRUE) %>%
    ungroup() %>%
    ## Create a column "score" that assigns -1 to each negative word and 1 to each positive word
    mutate(
      score = case_when(
        sentiment == 'negative' ~ n * (-1),
        sentiment == 'positive' ~ n * 1)
    )
  ## Calculate the total score
  sent.score = case_when(
    nrow(twt_tbl) == 0 ~ 0,                 # if there are no words, the score is 0
    nrow(twt_tbl) > 0 ~ sum(twt_tbl$score)  # otherwise, sum the positives and negatives
  )
  ## Keep track of which tweets contained no words at all from the bing list
  zero.type = case_when(
    nrow(twt_tbl) == 0 ~ "Type 1", # Type 1: no bing words at all
    nrow(twt_tbl) > 0 ~ "Type 2"   # Type 2: a zero score means the word scores sum to 0
  )
  list(score = sent.score, type = zero.type, twt_tbl = twt_tbl)
}
# Score each tweet individually (iterate over the text column, not the data frame's columns)
country1.tweets.sentiments = lapply(country1.tweets$text, function(x){sentiment_bing(x)})
country2.tweets.sentiments = lapply(country2.tweets$text, function(x){sentiment_bing(x)})
library(purrr)
twitter_sentiments = bind_rows(
  tibble(
    country = country1,
    score = unlist(map(country1.tweets.sentiments, 'score')),
    type = unlist(map(country1.tweets.sentiments, 'type'))
  ),
  tibble(
    country = country2,
    score = unlist(map(country2.tweets.sentiments, 'score')),
    type = unlist(map(country2.tweets.sentiments, 'type'))
  )
)
ggplot(twitter_sentiments %>% filter(type != "Type 1"), aes(x = score, fill = country)) +
  geom_histogram(bins = 15, alpha = .6) +
  facet_grid(~country) + theme_bw()
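To complement the histograms, the two score distributions can also be compared numerically (a small dplyr sketch over the twitter_sentiments table built above):
twitter_sentiments %>%
  filter(type != "Type 1") %>%
  group_by(country) %>%
  summarise(mean_score = mean(score), sd_score = sd(score), tweets = n())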
Classify text using the knn() function
We use the "class" library, whose knn() function performs k-nearest-neighbour classification.
To make the results reproducible we must set a seed; we set it to 4419 so that the results do not vary between executions.
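As a minimal sketch of how this could look, assuming a document-term matrix dtm and a label vector labels (both hypothetical placeholders, not objects built above):
library(class)
set.seed(4419)  # fix the seed so the train/test split and knn tie-breaking are reproducible
n <- nrow(dtm)                                  # dtm: hypothetical document-term matrix
train_idx <- sample(n, size = floor(0.7 * n))   # 70/30 train/test split
predictions <- knn(train = dtm[train_idx, ],
                   test  = dtm[-train_idx, ],
                   cl    = labels[train_idx],   # labels: hypothetical class vector
                   k     = 5)
mean(predictions == labels[-train_idx])         # classification accuracy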
# Below function returns Reddit links for the provided search term
getRedditTopicURLs <- function(searchTerm, subTopic = 'worldnews', limitTopURLs = 10, pageThreshold = 1){
  reddit_links = reddit_urls(
    search_terms = searchTerm,
    subreddit = subTopic,  # subreddit names contain no spaces, e.g. 'worldnews'
    sort_by = 'new',
    page_threshold = pageThreshold
  )
  return(reddit_links %>% top_n(limitTopURLs))
}
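reddit_urls() and reddit_content() come from the RedditExtractoR package (these functions belong to its pre-3.0 API); load it before calling the helpers:
library(RedditExtractoR)  # provides reddit_urls() and reddit_content()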
# Below function returns Reddit comments for the provided Reddit links
getRedditsComments <- function(redditLinks, urlNumbers = 1){
  topic_url = redditLinks$URL[urlNumbers]
  reddit_thread = reddit_content(topic_url)
  reddit_comments = reddit_thread %>%
    mutate(
      # Remove http elements manually
      stripped_text = gsub("http\\S+", "", comment)) %>%
    select(stripped_text)
  return(reddit_comments)
}
# Below function tokenizes comments into words; for this assignment we pass it the Reddit comments
getRedditsWordFromComments <- function(comments){
  comments %>%
    unnest_tokens(word, stripped_text) %>%
    mutate(word = wordStem(word)) %>%
    anti_join(stop_words) %>%
    anti_join(tibble(word = customWords))
}
country1.reddit_links = getRedditTopicURLs(country1)
## Selecting by URL
country2.reddit_links = getRedditTopicURLs(country2)
## Selecting by URL
country1.reddit_comments = getRedditsComments(country1.reddit_links, urlNumbers = 3)
country2.reddit_comments = getRedditsComments(country2.reddit_links, urlNumbers = 6)
country1.words = getRedditsWordFromComments(country1.reddit_comments)
## Joining, by = "word"
## Joining, by = "word"
country2.words = getRedditsWordFromComments(country2.reddit_comments)
## Joining, by = "word"
## Joining, by = "word"
getTopCommonWords <- function(words, top = 10){
  words %>%
    count(word, sort = TRUE) %>%
    top_n(top)
}
#Get list of most frequent terms for each country
country1.commonWords = getTopCommonWords(country1.words)
## Selecting by n
country2.commonWords = getTopCommonWords(country2.words)
## Selecting by n
library(wordcloud)
# Re-extract a larger set (top 500) of frequent words for the word clouds
country1.commonWords = getTopCommonWords(country1.words, 500)
## Selecting by n
country2.commonWords = getTopCommonWords(country2.words, 500)
## Selecting by n
wordcloud(words = country1.commonWords$word, freq = country1.commonWords$n, min.freq = 2,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
wordcloud(words = country2.commonWords$word, freq = country2.commonWords$n, min.freq = 2,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Reuse the sentiment_bing() function defined in the Twitter section above.
# Score each comment individually (iterate over the text column, not the data frame's columns)
country1.sentiments = lapply(country1.reddit_comments$stripped_text, function(x){sentiment_bing(x)})
country2.sentiments = lapply(country2.reddit_comments$stripped_text, function(x){sentiment_bing(x)})
Reddit_sentiments = bind_rows(
  tibble(
    country = country1,
    score = unlist(map(country1.sentiments, 'score')),
    type = unlist(map(country1.sentiments, 'type'))
  ),
  tibble(
    country = country2,
    score = unlist(map(country2.sentiments, 'score')),
    type = unlist(map(country2.sentiments, 'type'))
  )
)
ggplot(Reddit_sentiments, aes(x = score, fill = country)) +
  geom_histogram(bins = 15, alpha = .6) +
  facet_grid(~country) + theme_bw()
#TBD
The Rmd file contains all the code for this assignment. You may change the working directory in the first line of the code and run it in RStudio.