Data Science in context

#We are going to do a sentiment analysis on the tweets for Uber

The first step is to create a connection using the app registered on the Twitter developer account (developer.twitter.com). The access keys can be copied from there.

## Creating the connection with rtweet
## The API credentials are read from environment variables (for example set in
## ~/.Renviron) instead of being written into the script, so the secrets are
## never stored alongside the analysis.
## NOTE(review): any keys that were previously hard-coded here should be
## treated as compromised and regenerated in the Twitter developer console.
library(rtweet)

twitter_credentials <- Sys.getenv(
  c(
    "TWITTER_CONSUMER_KEY",
    "TWITTER_CONSUMER_SECRET",
    "TWITTER_ACCESS_TOKEN",
    "TWITTER_ACCESS_SECRET"
  ),
  unset = "",
  names = TRUE
)

## Fail early with a clear message rather than sending empty credentials
missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
if (length(missing_credentials) > 0) {
  stop(
    "Missing Twitter credentials in environment variables: ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE
  )
}

create_token(
  app = "607",
  consumer_key = twitter_credentials[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_credentials[["TWITTER_ACCESS_SECRET"]]
)

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> 607
##   key:    wQxZWZFTFTKuWLvKWgiIWV5pA
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

The Twitter API gives us 10 days of tweets. I restricted the search to English tweets.

## Downloading tweets about Uber
## NOTE(review): the query starts with a doubled "@" ("@@Uber") - confirm
## this is intended.
twitter_handle_string <- "@@Uber OR @UBERNews"
uber <- search_tweets(
  q = twitter_handle_string,
  n = 8000,
  lang = "en"
)

The data is then cleaned: several symbols that dominate the text column are removed, along with some objectionable words. All punctuation is removed and the text is tokenized into words using the unnest_tokens command.

##Cleaning the data

library(tidytext)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Strip URLs, ampersands and profanity from the tweet text.
## Only the URL token itself (up to the next whitespace) is removed, so any
## words that follow a link are kept for the sentiment analysis. The single
## pattern covers both http and https links.
uber$text <- gsub("http\\S*", "", uber$text)
## Drop the whole HTML entity first: if the text contains "&amp;", removing
## only "&" would leave a stray "amp;" behind that later becomes a token.
uber$text <- gsub("&amp;", "", uber$text, fixed = TRUE)
uber$text <- gsub("&", "", uber$text, fixed = TRUE)
## Remove the entire word containing the profanity, in any letter case, so
## no fragments of inflected forms survive as tokens.
uber$text <- gsub("\\w*fuck\\w*", "", uber$text, ignore.case = TRUE)

## Keep only the tweet text and split it into one word per row; the
## tokenizer removes punctuation and converts the words to lowercase
uber_clean <- unnest_tokens(
  dplyr::select(uber, text),
  output = word,
  input = text
)

Removed the stop words, calculated word frequencies, and inner-joined the result with the bing lexicon to identify positive vs. negative sentiments.

## Make the stop-word list from the tidytext package available
data("stop_words")

## Drop every token that appears in the stop-word list
cleaned_uber <- anti_join(uber_clean, stop_words)

## Joining, by = "word"

## Tally how often each word occurs, most frequent first
cleaned_uber <- count(cleaned_uber, word, sort = TRUE)

## Attach the bing lexicon's sentiment to each counted word; words that are
## not in the lexicon are dropped by the inner join
sentiment_uber <- inner_join(
  cleaned_uber,
  get_sentiments("bing"),
  by = "word"
)

Plotted the positive and negative sentiment words that occur 50 or more times.

## Bar chart of the sentiment words seen at least 50 times. Counts of
## negative words are negated so they extend in the opposite direction from
## the positive ones, and the words are ordered by that signed count.
sentiment_uber %>%
  filter(n >= 50) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  ylab("Contribution to Uber sentiment") +
  coord_flip()

# Sentiment analysis on the tweets for Lyft

Conducted a similar analysis on another ride-sharing company, Lyft.

## Downloading tweets about Lyft
## NOTE(review): the query starts with a doubled "@" ("@@Lyft") - confirm
## this is intended.
twitter_handle_string <- "@@Lyft OR @LYFTNews"
lyft <- search_tweets(
  q = twitter_handle_string,
  n = 8000,
  lang = "en"
)

##Cleaning the data

## Strip URLs, ampersands and profanity from the tweet text.
## Only the URL token itself (up to the next whitespace) is removed, so any
## words that follow a link are kept for the sentiment analysis. The single
## pattern covers both http and https links.
lyft$text <- gsub("http\\S*", "", lyft$text)
## Drop the whole HTML entity first: if the text contains "&amp;", removing
## only "&" would leave a stray "amp;" behind that later becomes a token.
lyft$text <- gsub("&amp;", "", lyft$text, fixed = TRUE)
lyft$text <- gsub("&", "", lyft$text, fixed = TRUE)
## Remove the entire word containing the profanity, in any letter case, so
## no fragments of inflected forms survive as tokens.
lyft$text <- gsub("\\w*fuck\\w*", "", lyft$text, ignore.case = TRUE)

## Keep only the tweet text and split it into one word per row; the
## tokenizer removes punctuation and converts the words to lowercase
lyft_clean <- unnest_tokens(
  dplyr::select(lyft, text),
  output = word,
  input = text
)

## Make the stop-word list from the tidytext package available
data("stop_words")

## Drop every token that appears in the stop-word list
cleaned_lyft <- anti_join(lyft_clean, stop_words)

## Joining, by = "word"

## Tally how often each word occurs, most frequent first
cleaned_lyft <- count(cleaned_lyft, word, sort = TRUE)

## Attach the bing lexicon's sentiment to each counted word; words that are
## not in the lexicon are dropped by the inner join
sentiment_lyft <- inner_join(
  cleaned_lyft,
  get_sentiments("bing"),
  by = "word"
)

## Bar chart of the sentiment words seen at least 55 times. Counts of
## negative words are negated so they extend in the opposite direction from
## the positive ones, and the words are ordered by that signed count.
sentiment_lyft %>%
  filter(n >= 55) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  ylab("Contribution to Lyft sentiment") +
  coord_flip()

Data Science in context

Farhana Zahir

17/11/2019