URL of the source corpus: https://data.world/crowdflower/weather-sentiment


Downloading and installing the tm package from CRAN

We use an R package called "tm". This package supports the core text mining steps: loading the data, cleaning it, and building a term-document matrix.

#install.packages("tm")

Import Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(textreadr)
## Warning: package 'textreadr' was built under R version 4.1.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(tidyr)
library(ggplot2)
library(tm)
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.1.1
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate

Loading Data

The text to be mined is first loaded into R; here it comes from a CSV file of weather tweets. To be used with the tm package, the text is turned into a "corpus", which tm creates with the Corpus() function.

#loading the tweet CSV file from GitHub

github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week10/weather-tweet.csv"

weather_data<- read.csv(github_link)
#weather_data

Loading data into a vector

data_vector <- weather_data$tweet_text                         
#data_vector

Loading data as corpus

#VectorSource() treats each element of the character vector as a document
mydata <- Corpus(VectorSource(data_vector))

mydata
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1000
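
To sanity-check the corpus before cleaning, we can peek at a few documents; a minimal sketch using tm's inspect() (the indices are arbitrary):

#inspect the first three documents of the corpus
inspect(mydata[1:3])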

Data Cleaning Steps

# convert to lower case
mydata <- tm_map(mydata, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(tolower)):
## transformation drops documents
# remove non-word characters such as emojis and other symbols
mydata <- tm_map(mydata, content_transformer(gsub), pattern = "\\W", replacement = " ")
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(gsub), pattern = "\
## \W", : transformation drops documents
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(removeURL)):
## transformation drops documents
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(removeNumPunct)):
## transformation drops documents
# remove stopwords
mydata <- tm_map(mydata, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(mydata, removeWords, stopwords("english")):
## transformation drops documents
# remove extra whitespace
mydata <- tm_map(mydata, stripWhitespace)
## Warning in tm_map.SimpleCorpus(mydata, stripWhitespace): transformation drops
## documents
# Remove numbers
mydata <- tm_map(mydata, removeNumbers)
## Warning in tm_map.SimpleCorpus(mydata, removeNumbers): transformation drops
## documents
# Remove punctuations
mydata <- tm_map(mydata, removePunctuation)
## Warning in tm_map.SimpleCorpus(mydata, removePunctuation): transformation drops
## documents
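
As a quick check that the cleaning pipeline behaved as expected, we can compare one original tweet with its cleaned counterpart (a small sketch; the first document is an arbitrary choice):

#original tweet text vs. the cleaned version of the same document
weather_data$tweet_text[1]
as.character(mydata[[1]])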

Stemming with the SnowballC library

Stemming collapses words with a common origin into a single form, for example "communication", "communicates", and "communicate". By stripping suffixes and reducing words to their base forms, stemming makes the term counts in the mined text more reliable.

library(SnowballC)
## Warning: package 'SnowballC' was built under R version 4.1.1
mydata <- tm_map(mydata, stemDocument)
## Warning in tm_map.SimpleCorpus(mydata, stemDocument): transformation drops
## documents
mydata
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1000
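
To see what the stemmer does to the example words mentioned above, SnowballC's wordStem() can be applied directly to a character vector (a quick illustration outside the pipeline):

#reduce related words to a common stem with the English (Porter) stemmer
wordStem(c("communication", "communicates", "communicate"), language = "english")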

Creating a term-document matrix and finding word frequencies:

The matrix records how many times each term appears in each document of the cleaned data set, which is why it is called a term-document matrix.

#create a term matrix and store it as dtm
dtm <- TermDocumentMatrix(mydata)
dtm
## <<TermDocumentMatrix (terms: 2640, documents: 1000)>>
## Non-/sparse entries: 8358/2631642
## Sparsity           : 100%
## Maximal term length: 31
## Weighting          : term frequency (tf)
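
Before moving to tidy tools, tm can report frequent terms straight from the term-document matrix; a small sketch with findFreqTerms() (the threshold of 20 is arbitrary):

#terms that appear at least 20 times across the corpus
findFreqTerms(dtm, lowfreq = 20)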

Analyzing with tidytext

To analyze the term-document matrix with tidy tools, we first turn it into a data frame with one term per document per row.

library(dplyr)
library(tidytext)

tweet_td <- tidy(dtm)

tweet_td
## # A tibble: 8,358 x 3
##    term    document count
##    <chr>   <chr>    <dbl>
##  1 amaz    1            1
##  2 final   1            1
##  3 glad    1            1
##  4 grill   1            2
##  5 kabob   1            1
##  6 last    1            1
##  7 night   1            1
##  8 weather 1            1
##  9 day     2            1
## 10 even    2            1
## # ... with 8,348 more rows
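
With the matrix in tidy form, ordinary dplyr verbs apply. For example, a quick sketch of the most frequent stems across all tweets:

#most common terms across all documents
tweet_td %>%
  count(term, wt = count, sort = TRUE)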

Performing Sentiment Analysis:

We now perform sentiment analysis using the lexicon from Bing Liu and collaborators, which assigns a positive or negative label to each word.

ap_sentiments <- tweet_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))

ap_sentiments
## # A tibble: 1,018 x 4
##    term     document count sentiment
##    <chr>    <chr>    <dbl> <chr>    
##  1 glad     1            1 positive 
##  2 slowest  2            1 negative 
##  3 enjoy    4            1 positive 
##  4 nice     4            1 positive 
##  5 pleasant 4            1 positive 
##  6 delay    5            1 negative 
##  7 work     5            1 positive 
##  8 great    6            1 positive 
##  9 nice     6            1 positive 
## 10 work     6            1 positive 
## # ... with 1,008 more rows
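
A quick way to gauge the overall balance is to total the positive and negative word occurrences (a minimal sketch):

#total positive vs. negative word occurrences across the corpus
ap_sentiments %>%
  count(sentiment, wt = count)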

We can find the most negative documents:

library(tidyr)

ap_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 673 x 4
##    document negative positive sentiment
##    <chr>       <dbl>    <dbl>     <dbl>
##  1 405             7        0        -7
##  2 531             4        0        -4
##  3 178             3        0        -3
##  4 300             3        0        -3
##  5 564             3        0        -3
##  6 714             3        0        -3
##  7 811             3        0        -3
##  8 83              3        0        -3
##  9 962             3        0        -3
## 10 109             2        0        -2
## # ... with 663 more rows
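
To read the tweets behind the lowest scores, the document id can be matched back to the row number of the original data frame; a sketch that relies on the corpus preserving the original row order (it does here, since it was built directly from weather_data$tweet_text) and on tidy() storing the document id as a character:

ap_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment) %>%
  #look up the original tweet for each document id
  mutate(tweet_text = weather_data$tweet_text[as.integer(document)]) %>%
  head(5)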

Visualization Analysis

We can visualize which words contributed most to positive and negative sentiment. The plot shows higher frequencies for negative words than for positive ones.

library(ggplot2)

ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 5) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")

Steps to Create Word Clouds:

Word clouds are most often used to analyze Twitter data or a corpus of text. We start from the term-document matrix dtm created above, convert it to a plain matrix, and sum each row to get a data frame with each word in the first column and its total frequency in the second.

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)
#df

Generate the word cloud:

set.seed(1234) # for reproducibility 

wordcloud(words = df$word, freq = df$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35,
          colors=brewer.pal(8, "Dark2"))
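
A variant worth trying is a comparison cloud that separates positive from negative terms. A sketch assuming the reshape2 package is installed; acast() reshapes the tidy sentiment counts into the term-by-sentiment matrix that wordcloud's comparison.cloud() expects:

library(reshape2)

ap_sentiments %>%
  count(term, sentiment, wt = count) %>%
  acast(term ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("firebrick", "forestgreen"), max.words = 100)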

Now trying the SentimentAnalysis package from CRAN

library(SentimentAnalysis)
## Warning: package 'SentimentAnalysis' was built under R version 4.1.3
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
## 
##     write

Passing the vector for sentiment analysis

sentiment <- analyzeSentiment(data_vector)

Extract dictionary-based sentiment according to the QDAP dictionary

#sentiment$SentimentQDAP

View sentiment direction (i.e. positive, neutral and negative)

#convertToDirection(sentiment$SentimentQDAP)
df <- data.frame(No = seq_along(data_vector), Sentiment = sentiment)
#convertToBinaryResponse(df)
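
As a quick summary of the QDAP scores, the sentiment directions can be tabulated; a minimal sketch:

#count how many tweets are classified as negative, neutral or positive
table(convertToDirection(sentiment$SentimentQDAP))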