Source data for the corpus: https://data.world/crowdflower/weather-sentiment
We use an R package called “tm”. This package supports the text-mining steps we need: loading the data, cleaning it, and building a term matrix.
#install.packages("tm")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(textreadr)
## Warning: package 'textreadr' was built under R version 4.1.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(tidyr)
library(ggplot2)
library(tm)
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.1.1
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
The text to be mined is loaded into R; here it comes from a CSV file of tweets. To be used with the tm package, the text is turned into a “corpus”, which the tm package creates with the Corpus() function.
#loading a text file from github
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week10/weather-tweet.csv"
weather_data <- read.csv(github_link)
#weather_data
data_vector <- weather_data$tweet_text
#data_vector
# VectorSource() treats each element of the character vector as a separate document
mydata <- Corpus(VectorSource(data_vector))
mydata
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1000
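To see what the corpus holds (the tweets shown will depend on the data), the first couple of documents can be inspected:
# print the content of the first two documents in the corpus
inspect(mydata[1:2])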
# convert to lower case
mydata <- tm_map(mydata, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(tolower)):
## transformation drops documents
# remove non-word characters, including what would be emojis
mydata <- tm_map(mydata, content_transformer(gsub), pattern = "\\W", replacement = " ")
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(gsub), pattern = "\
## \W", : transformation drops documents
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(removeURL)):
## transformation drops documents
# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(mydata, content_transformer(removeNumPunct)):
## transformation drops documents
# remove stopwords
mydata <- tm_map(mydata, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(mydata, removeWords, stopwords("english")):
## transformation drops documents
# remove extra whitespace
mydata <- tm_map(mydata, stripWhitespace)
## Warning in tm_map.SimpleCorpus(mydata, stripWhitespace): transformation drops
## documents
# Remove numbers
mydata <- tm_map(mydata, removeNumbers)
## Warning in tm_map.SimpleCorpus(mydata, removeNumbers): transformation drops
## documents
# Remove punctuations
mydata <- tm_map(mydata, removePunctuation)
## Warning in tm_map.SimpleCorpus(mydata, removePunctuation): transformation drops
## documents
Stemming is the process of reducing words of common origin to a single base form, for example “communication”, “communicates”, and “communicate”. It makes the mined text more consistent by stripping suffixes so that related word forms are counted together.
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 4.1.1
mydata <- tm_map(mydata, stemDocument)
## Warning in tm_map.SimpleCorpus(mydata, stemDocument): transformation drops
## documents
mydata
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1000
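As a quick illustration of what stemDocument() did (a minimal sketch using the same Porter stemmer from SnowballC that tm calls internally), wordStem() collapses the example words above to one stem:
library(SnowballC)
# the Porter stemmer strips suffixes so related word forms share a single stem
wordStem(c("communication", "communicates", "communicate"))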
The term-document matrix records how many times each term appears in each document of our cleaned corpus, which is why it is called a term matrix.
#create a term matrix and store it as dtm
dtm <- TermDocumentMatrix(mydata)
dtm
## <<TermDocumentMatrix (terms: 2640, documents: 1000)>>
## Non-/sparse entries: 8358/2631642
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency (tf)
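As a quick check on the matrix (a sketch; the cutoff of 20 is just an illustrative choice), tm's findFreqTerms() lists the stems that occur at least that many times across all tweets:
# list terms appearing at least 20 times in total (hypothetical cutoff)
findFreqTerms(dtm, lowfreq = 20)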
Before analyzing it with tidytext, we need to turn the matrix into a one-term-per-document-per-row data frame.
library(dplyr)
library(tidytext)
tweet_td <- tidy(dtm)
tweet_td
## # A tibble: 8,358 x 3
## term document count
## <chr> <chr> <dbl>
## 1 amaz 1 1
## 2 final 1 1
## 3 glad 1 1
## 4 grill 1 2
## 5 kabob 1 1
## 6 last 1 1
## 7 night 1 1
## 8 weather 1 1
## 9 day 2 1
## 10 even 2 1
## # ... with 8,348 more rows
We now perform sentiment analysis using the lexicon from Bing Liu and collaborators, which assigns a positive or negative label to each word.
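For reference, the bing lexicon itself is just a two-column word/sentiment table; a quick look at how many words carry each label (exact counts depend on the installed tidytext version):
# the bing lexicon: one row per word, labelled positive or negative
get_sentiments("bing") %>%
  count(sentiment)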
ap_sentiments <- tweet_td %>%
inner_join(get_sentiments("bing"), by = c(term = "word"))
ap_sentiments
## # A tibble: 1,018 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 glad 1 1 positive
## 2 slowest 2 1 negative
## 3 enjoy 4 1 positive
## 4 nice 4 1 positive
## 5 pleasant 4 1 positive
## 6 delay 5 1 negative
## 7 work 5 1 positive
## 8 great 6 1 positive
## 9 nice 6 1 positive
## 10 work 6 1 positive
## # ... with 1,008 more rows
We can find the most negative documents:
library(tidyr)
ap_sentiments %>%
count(document, sentiment, wt = count) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
arrange(sentiment)
## # A tibble: 673 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 405 7 0 -7
## 2 531 4 0 -4
## 3 178 3 0 -3
## 4 300 3 0 -3
## 5 564 3 0 -3
## 6 714 3 0 -3
## 7 811 3 0 -3
## 8 83 3 0 -3
## 9 962 3 0 -3
## 10 109 2 0 -2
## # ... with 663 more rows
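spread() still works here but has been superseded in newer tidyr; an equivalent sketch of the same step with pivot_wider() would be:
ap_sentiments %>%
  count(document, sentiment, wt = count) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)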
We can visualize which words contributed to positive and negative sentiment; the plot shows higher frequencies for negative words than for positive ones.
library(ggplot2)
ap_sentiments %>%
count(sentiment, term, wt = count) %>%
filter(n >= 5) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(term = reorder(term, n)) %>%
ggplot(aes(term, n, fill = sentiment)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ylab("Contribution to sentiment")
Word clouds are often used to explore Twitter data or any other text corpus. We start from the TermDocumentMatrix dtm created above: converting it to a matrix and summing across rows gives a data frame with each word in the first column and its frequency in the second.
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix),decreasing=TRUE)
df <- data.frame(word = names(words),freq=words)
#df
set.seed(1234) # for reproducibility
wordcloud(words = df$word, freq = df$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
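A quick look at the top of the frequency table feeding the cloud (a sketch; the actual words depend on the corpus):
# ten most frequent stems and their counts
head(df, 10)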
Finally, we score the raw tweets with the SentimentAnalysis package, which rates each document against several sentiment dictionaries.
library(SentimentAnalysis)
## Warning: package 'SentimentAnalysis' was built under R version 4.1.3
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
# score each tweet against the package's sentiment dictionaries (including QDAP)
sentiment <- analyzeSentiment(data_vector)
#sentiment$SentimentQDAP
#convertToDirection(sentiment$SentimentQDAP)
# attach a row number to each tweet's scores
df <- data.frame(No = 1:nrow(sentiment), Sentiment = sentiment)
#convertToBinaryResponse(df)
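To get an overall picture (a sketch; the counts depend on the data), the continuous QDAP scores can be collapsed into directions and tabulated:
# count how many tweets score negative, neutral, or positive under the QDAP dictionary
table(convertToDirection(sentiment$SentimentQDAP))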