Info about the data and how it was collected (when, where, why and how), possibly with some additional information and screenshots, plus a link to the GitHub repository containing the code that was used to gather it.
Before loading packages with `library(package_name)`, first make sure that they are installed. If they are not, use `install.packages("package_name")` to install them.
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.0.5
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Loading required package: stats4
library(fpc)
## Warning: package 'fpc' was built under R version 4.0.5
library(clustertend)
## Warning: package 'clustertend' was built under R version 4.0.5
library(cluster)
## Warning: package 'cluster' was built under R version 4.0.5
library(ClusterR)
## Warning: package 'ClusterR' was built under R version 4.0.5
## Loading required package: gtools
## Warning: package 'gtools' was built under R version 4.0.5
library(stats)
library(graphics)
library(purrr)
library(stringr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(syuzhet)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
library(plyr)
## Warning: package 'plyr' was built under R version 4.0.5
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## The following object is masked from 'package:modeltools':
##
## empty
After loading the needed libraries we can go through cleaning and exploring the data, before we get to the actual analysis using Unsupervised Learning algorithms.
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Loading data from csv files
# Read the tweet table and the two emoji lookup tables from local CSV files.
# NOTE(review): the paths are absolute and machine-specific, and the first
# path has no file extension - confirm it points at the intended file.
Data <- read.csv(
  file = "C:\\Users\\48799\\Desktop\\RStudio\\Projekt_licencjat\\Kopia\\Old_data"
)
Emoji_db <- read.csv(
  file = "C:\\Users\\48799\\Desktop\\RStudio\\Projekt_licencjat\\emoji_df.csv"
)
Emoji_sentiment <- read.csv(
  file = "C:\\Users\\48799\\Desktop\\RStudio\\Projekt_licencjat\\Emoji_Sentiment_Data.csv"
)
Cleaning the Twitter data by removing the additional special characters around emoji code points - the bare code points will be used later to check which emojis occur in each tweet and how many times each one appears.
# Rewrite Data$text in four successive substitutions so that only the bare
# code-point text is left where the emoji markup used to be.
Data$text <- local({
  txt <- gsub("[<>]", " ", Data$text)  # every "<" or ">" becomes a space
  txt <- gsub("000", "", txt)          # every literal "000" is removed
  txt <- gsub("\\+", "!", txt)         # every "+" becomes "!"
  gsub("U!", "", txt)                  # every "U!" (formerly "U+") is removed
})
Merging the emoji code points with their sentiment data
# Build the join key: format Unicode.codepoint as a lower-case hex string.
Emoji_sentiment$codepoints <- sprintf("%x", Emoji_sentiment$Unicode.codepoint)

# Join on "codepoints", keeping every row of the sentiment table even when
# the emoji table has no match for it.
Emoji <- merge(Emoji_db, Emoji_sentiment, by = "codepoints", all.y = TRUE)

# The two source tables are no longer needed once merged.
rm(Emoji_db, Emoji_sentiment)

# Keep column 1 and columns 8 to 12 of the merged table and give them
# fixed names.
# NOTE(review): selection is positional - confirm the column order of both
# CSV files before relying on these names.
Emoji <- Emoji[, c(1, 8:12)]
names(Emoji) <- c(
  "codepoints", "Ocurrences", "position", "negative", "neutral", "positive"
)

# Upper-case the hex code points.
Emoji$codepoints <- toupper(Emoji$codepoints)
Normalization of the sentiment data of each emoji
# Raw emoji score: positive column minus negative column.
Emoji$substract <- with(Emoji, positive - negative)

# Standardise the raw score: subtract its mean, divide by its standard
# deviation.
Emoji$sentiment <- local({
  gap <- Emoji$substract
  centred <- gap - mean(gap)
  centred / sd(gap)
})
Checking for outliers - a big sentiment outlier, especially for a very common emoji, is a very bad situation, as it can have a huge impact on the overall sentiment of a tweet.
Based on the plot I decided to set a threshold: if the sentiment is more than 1 or less than -1, it is set to 1 or -1 respectively (so the sentiment of every emoji lies in the interval from -1 to 1).
# Cap the standardised emoji sentiment: values >= 1 become 1, values <= -1
# become -1, everything in between is left unchanged.
Emoji$sentiment <- ifelse(
  Emoji$sentiment >= 1,
  1,
  ifelse(Emoji$sentiment <= -1, -1, Emoji$sentiment)
)
Plot after applying the threshold
Now I will check whether each tweet contains emojis and, if it does, count the occurrences of each emoji and multiply the count by that emoji's previously calculated sentiment. Then I sum these values for each tweet to obtain its overall emoji sentiment.
# Build a tweets x emojis matrix whose cell [t, k] is the number of times
# emoji code point k occurs in tweet t, multiplied by that emoji's sentiment.
#
# The columns are computed in one pass and assembled once, instead of growing
# a data frame inside the loop (which re-copies everything on each pass).
# Each emoji row is addressed by its index, so a code point that appears more
# than once in Emoji uses its own sentiment value rather than a
# multi-element lookup result that would be recycled down the tweets.
Emoji_sentiment_sum_mat <- local({
  n_tweets <- length(Data$text)
  n_emoji <- nrow(Emoji)
  per_emoji <- lapply(seq_len(n_emoji), function(k) {
    str_count(Data$text, Emoji$codepoints[k]) * Emoji$sentiment[k]
  })
  # Fill column by column; stays a matrix even for one tweet or zero emojis.
  matrix(as.numeric(unlist(per_emoji)), nrow = n_tweets, ncol = n_emoji)
})

# Data-frame view of the same values, kept under its original name.
matrix_Emoji <- as.data.frame(Emoji_sentiment_sum_mat)

# Total emoji sentiment per tweet.
Emoji_sentiment_sum <- rowSums(Emoji_sentiment_sum_mat)
We must take into consideration that some tweets contain the same emoji many times, which can create outliers. Let's plot the data and see whether there are tweets whose emoji sentiment falls outside the interval from -3 to 3.
# Scatter plot of the per-tweet emoji sentiment against the tweet index, with
# red reference lines at -3 and 3 marking the intended cut-offs.
# seq_along() is used for the index so it never yields the c(1, 0) sequence
# that 1:length(x) produces for an empty vector.
plot(
  seq_along(Data$text),
  Emoji_sentiment_sum,
  xlab = "N.o. of tweet",
  ylab = "Emoji sentiment"
)
abline(h = c(-3, 3), col = "red", lwd = 2)
We can see some outliers. Let's assume that if a tweet's emoji sentiment is more than 3 or less than -3, we set it to the limit value, 3 or -3 respectively.
# Cap the per-tweet emoji sentiment: values >= 3 become 3, values <= -3
# become -3, everything else is kept.
Emoji_sentiment_sum <- ifelse(
  Emoji_sentiment_sum >= 3,
  3,
  ifelse(Emoji_sentiment_sum <= -3, -3, Emoji_sentiment_sum)
)

# Wrap the vector in a data frame; the conversion is done on the variable
# itself so the resulting column is named "Emoji_sentiment_sum".
Emoji_sentiment_sum <- as.data.frame(Emoji_sentiment_sum)
First we need to get rid of special characters, stop words, numbers, blank spaces, etc.
# Wrap the tweet texts in a tm corpus and lower-case every document.
twitterCorpus <- tm_map(
  Corpus(VectorSource(Data$text)),
  FUN = content_transformer(tolower)
)
## Warning in tm_map.SimpleCorpus(twitterCorpus, content_transformer(tolower)):
## transformation drops documents
# Drop the English stop words supplied by tm from every document.
twitterCorpus <- tm_map(
  twitterCorpus,
  FUN = removeWords,
  words = stopwords("en")
)
## Warning in tm_map.SimpleCorpus(twitterCorpus, removeWords, stopwords("en")):
## transformation drops documents
# Strip digits from every document.
twitterCorpus <- tm_map(twitterCorpus, FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(twitterCorpus, removeNumbers): transformation
## drops documents
# Remove URLs from a character vector.
#
# Deletes every run that starts with "http" and continues up to the next
# whitespace character, so the "://host/path" part of a link is removed
# together with the scheme. Matching only alphanumerics after "http" would
# stop at the first ":" and leave the rest of the link in the text.
#
# x: character vector.
# Returns a character vector of the same length with the URLs deleted.
removeURL <- function(x) {
  gsub("http[^[:space:]]*", "", x)
}
# Apply removeURL to the content of every document.
twitterCorpus <- tm_map(
  twitterCorpus,
  FUN = content_transformer(removeURL)
)
## Warning in tm_map.SimpleCorpus(twitterCorpus, content_transformer(removeURL)):
## transformation drops documents
# Delete every run that starts with "edua" followed by any number of
# alphanumeric characters.
#
# x: character vector.
# Returns a character vector of the same length.
removeURL_2 <- function(x) {
  gsub(pattern = "edua[[:alnum:]]*", replacement = "", x = x)
}
# Apply removeURL_2 to the content of every document.
twitterCorpus <- tm_map(
  twitterCorpus,
  FUN = content_transformer(removeURL_2)
)
## Warning in tm_map.SimpleCorpus(twitterCorpus, content_transformer(removeURL_2)):
## transformation drops documents
# Thin wrapper that hands x to textclean::replace_non_ascii() and returns
# its result, so the call can be used through content_transformer().
removeNonAscii <- function(x) {
  textclean::replace_non_ascii(x)
}
# Apply removeNonAscii to the content of every document.
twitterCorpus <- tm_map(
  twitterCorpus,
  FUN = content_transformer(removeNonAscii)
)
## Warning in tm_map.SimpleCorpus(twitterCorpus,
## content_transformer(removeNonAscii)): transformation drops documents
# Drop a hand-picked list of leftover tokens from every document.
twitterCorpus <- tm_map(
  twitterCorpus,
  FUN = removeWords,
  words = c(
    "amp", "ufef", "ufeft", "uufefuufefuufef", "uufef", "s", "uffuffuufef"
  )
)
## Warning in tm_map.SimpleCorpus(twitterCorpus, removeWords, c("amp", "ufef", :
## transformation drops documents
# Collapse runs of whitespace in every document.
twitterCorpus <- tm_map(twitterCorpus, FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(twitterCorpus, stripWhitespace): transformation
## drops documents
# Strip punctuation characters from every document.
twitterCorpus <- tm_map(twitterCorpus, FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(twitterCorpus, removePunctuation): transformation
## drops documents
Now I will create one data frame with all needed data that contains tweets and sentiment from emojis for each tweet
# Overwrite the first column of Data with the cleaned corpus text.
# NOTE(review): this assumes column 1 is the tweet text column - confirm.
Data[[1]] <- twitterCorpus$content

# Add the capped per-tweet emoji sentiment as a new column.
Data[["EmojisSentiment"]] <- Emoji_sentiment_sum[["Emoji_sentiment_sum"]]
We can use some built-in packages in R with sentiment dictionaries to evaluate each tweet sentiment based on text. First we will use get_nrc_sentiment function to obtain emotions connected with text in each tweet such as anger, anticipation, disgust, fear, joy, sadness, surprise, trust, negative and positive.
# Score every tweet with the NRC lexicon, then copy the eight emotion columns
# and the two polarity columns into Data under the same names.
emotions <- get_nrc_sentiment(Data$text)
Data <- local({
  nrc_columns <- c(
    "anger", "anticipation", "disgust", "fear", "joy",
    "sadness", "surprise", "trust", "negative", "positive"
  )
  augmented <- Data
  augmented[nrc_columns] <- emotions[nrc_columns]
  augmented
})
Now I will use a second algorithm that evaluates sentiment on an interval scale. Positive values mean that the tweet's sentiment is positive, negative values mean that it is negative, and values around 0 mean that the tweet is neutral.
# Score every tweet with the "syuzhet" method and store the result in Data.
sentiment_syuzhet <- get_sentiment(char_v = Data$text, method = "syuzhet")
Data[["Sentiment"]] <- sentiment_syuzhet
# Replace Data with the contents of a CSV file on disk.
# NOTE(review): this overwrites everything computed into Data so far -
# presumably the file is a saved copy of that result; confirm.
Data <- read.csv(
  file = "C:/Users/48799/Desktop/Studia/Magisterka UW/Unsupervised Learning/Data.csv"
)
Adding month and hour as different columns and SentimentOverall which is sum of sentiment from text and from emoticons
# Drop the first column, then the third of the remaining columns.
# NOTE(review): columns are removed by position - confirm the layout of the
# CSV file before changing the order of these statements.
Data <- Data[, -1]
Data <- Data[, -3]

# Pull month and hour out of the `created` timestamp by character position.
# Both digits of the month (characters 6-7) are read; taking character 7
# alone would turn October, November and December into 0, 1 and 2.
# NOTE(review): assumes `created` looks like "YYYY-MM-DD HH:MM:SS" - confirm.
month <- as.numeric(substr(Data$created, 6, 7))
hour <- as.numeric(substr(Data$created, 12, 13))

# Drop the second remaining column before scaling, since scale() needs
# numeric input.
Data <- Data[, -2]

# Centre and scale every remaining column.
Data <- as.data.frame(scale(Data))

# Month and hour are added back unscaled.
Data$month <- month
Data$hour <- hour

# Overall sentiment = scaled emoji sentiment + scaled text sentiment.
Data$SentimentOverall <- Data$EmojisSentiment + Data$Sentiment
Now I would like to see some descriptive statistics for our data
# Print per-column descriptive statistics for the prepared data set.
summary(object = Data)