Analysis of Comments from VK

Install packages

Prepare text for analysis

# Read the comments file line by line from a fixed path in the working
# directory; every line of the file becomes one element of `text`.
# (No interactive file chooser is involved, the path is hard-coded.)
text <- readLines("vk_text_t.csv")
# Load the data as a corpus
TextDoc <- Corpus(VectorSource(text))
# Helper transformer: replace every match of `pattern` with a space.
# It is applied below to "/", "@" and "|" ("|" is escaped because the
# pattern is treated as a regular expression by gsub()).
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
# Convert the text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stop words
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Remove custom stop words, given as a character vector
TextDoc <- tm_map(TextDoc, removeWords, c("s", "company", "team")) 
# Remove punctuation
# NOTE(review): the association output further down still contains a token
# starting with a curly quote, so non-ASCII punctuation appears to survive
# this step.
TextDoc <- tm_map(TextDoc, removePunctuation)
# Collapse runs of white space
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Stem the text, i.e. reduce words to their root form; every later step
# therefore works on stems rather than on the original words
TextDoc <- tm_map(TextDoc, stemDocument)

Build a term document matrix

# Build a term-document matrix from the cleaned corpus
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
# Convert it to an ordinary matrix so that rowSums() can be used on it below
dtm_m <- as.matrix(TextDoc_dtm)

Identify most frequently used words

# Total count per term (row sums of the matrix), sorted from most to least
# frequent
dtm_v <- sort(rowSums(dtm_m),decreasing=TRUE)
# Two-column table: the term and its total count
dtm_d <- data.frame(word = names(dtm_v),freq=dtm_v)
# Bar plot of the first 50 rows of the sorted table; las = 2 turns the axis
# labels perpendicular to the axis so the words stay readable
barplot(dtm_d[1:50,]$freq, las = 2, names.arg = dtm_d[1:50,]$word,
        col ="lightgreen", main ="Top 50 most frequent words",
        ylab = "Word frequencies")

Word cloud of top 150 words

# Generate the word cloud
# Fix the random seed so the layout is the same on every run
set.seed(1234)
# Terms with a count below 5 are left out, at most 150 words are drawn,
# the most frequent words are placed first (random.order = FALSE) and
# 40% of the words are rotated
wordcloud(words = dtm_d$word, freq = dtm_d$freq, min.freq = 5,
          max.words=150, random.order=FALSE, rot.per=0.40, 
          colors=brewer.pal(8, "Dark2"))

Find associations between words

# Find, for each listed term, the terms whose correlation with it is at
# least 0.35
# NOTE(review): the matrix was built from stemmed text while the terms here
# are unstemmed; in the output below "ukraine" returns numeric(0) although
# the stem "ukrain" has associations in the frequent-term run.
findAssocs(TextDoc_dtm, terms = c("putin","ukraine","ukrainian","power","war","russia","russian","prigozhin","money","pay","work","job","food","children","death"), corlimit = 0.35)
[WARNING] This document format requires a nonempty <title> element.
  Defaulting to 'vk.knit' as the title.
  To specify a title, use 'title' in metadata or --metadata title="...".
$putin
   kalugin     kudrin   kussinen   liberman   ppoibwug   umohgsuu       gref      youtu nabiullina     chubai 
      0.38       0.38       0.38       0.38       0.38       0.38       0.37       0.37       0.37       0.36 
  yakovlev 
      0.36 

$ukraine
numeric(0)

$ukrainian
 ”kennedi dishonest 
     0.42      0.38 

$power
numeric(0)

$war
numeric(0)

$russia
numeric(0)

$russian
feder 
 0.46 

$prigozhin
yevgeni 
   0.41 

$money
numeric(0)

$pay
numeric(0)

$work
numeric(0)

$job
numeric(0)

$food
numeric(0)

$children
infant 
  0.36 

$death
numeric(0)
# Run garbage collection; the table printed below reports memory in use
gc()
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   1769130   94.5    4543567  242.7   4543567  242.7
Vcells 385726429 2942.9 1107033624 8446.0 780303040 5953.3
# Find associations (correlation of at least 0.35) for every term that
# occurs at least 1480 times; findFreqTerms() supplies that term list
findAssocs(TextDoc_dtm, terms = findFreqTerms(TextDoc_dtm, lowfreq = 1480), corlimit = 0.35)
[WARNING] This document format requires a nonempty <title> element.
  Defaulting to 'vk.knit' as the title.
  To specify a title, use 'title' in metadata or --metadata title="...".
$creatur
numeric(0)

$even
numeric(0)

$know
numeric(0)

$like
numeric(0)

$live
numeric(0)

$peopl
numeric(0)

$putin
   kalugin     kudrin   kussinen   liberman   ppoibwug   umohgsuu       gref      youtu nabiullina     chubai 
      0.38       0.38       0.38       0.38       0.38       0.38       0.37       0.37       0.37       0.36 
  yakovlev 
      0.36 

$russian
feder 
 0.46 

$will
numeric(0)

$fool
numeric(0)

$russia
numeric(0)

$ukrain
 arm 
0.35 

$war
numeric(0)

$world
numeric(0)

$year
 old 
0.42 

$need
numeric(0)

$power
numeric(0)

$everyth
numeric(0)

$one
numeric(0)

$time
numeric(0)

$author
numeric(0)

$just
numeric(0)

$can
numeric(0)

$now
numeric(0)

$countri
numeric(0)

$want
numeric(0)

$well
numeric(0)

$idiot
numeric(0)

$fuck
numeric(0)

Evaluate sentiments

Evaluate emotions

# Run the NRC sentiment analysis on the raw input lines (`text`, not the
# cleaned corpus). The result is a data frame with one row per input line
# and one column per emotion:
# anger, anticipation, disgust, fear, joy, sadness, surprise, trust
# It also counts the number of positive and negative emotions found in each row
d<-get_nrc_sentiment(text)
# Show the first 10 rows of the get_nrc_sentiment data frame
head (d,10)
# Transpose: afterwards each row is a sentiment category and each column is
# one input line
td<-data.frame(t(d))
# rowSums() adds up the values in each row, giving one total per sentiment
# category over the selected columns.
# NOTE(review): the column range 2:14809 is hard-coded; it presumably means
# "every line except the first" for an input of 14809 lines - confirm, since
# a shorter input makes this selection fail and a longer one is cut off.
td_new <- data.frame(rowSums(td[2:14809]))
# Transformation and cleaning: name the totals column, move the category
# names from the row names into a proper column, then drop the row names
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
# Keep the first eight rows only (the eight emotions listed above)
td_new2<-td_new[1:8,]
# Plot one - count of words associated with each sentiment
quickplot(sentiment, data=td_new2, weight=count, geom="bar", fill=sentiment, ylab="count")+ggtitle("Sentiments in Comments on VK")

# Plot two - share of each emotion among all emotion words.
# prop.table() divides the first eight columns of `d` by their grand total,
# so the plotted values are proportions between 0 and 1 that sum to 1
# (the axis label says "Percentage").
barplot(
  sort(colSums(prop.table(d[, 1:8]))), 
  horiz = TRUE, 
  cex.names = 0.7, 
  las = 1, 
  main = "Emotions in Text from VK Comments", xlab="Percentage"
)

# Show the source data as an interactive table
library(DT)
library(readr)
# Re-read the same CSV as a data frame; the column named `...1` is skipped
# NOTE(review): `...1` is presumably an unnamed index column - confirm
vk_text_t <- read_csv("vk_text_t.csv", col_types = cols(...1 = col_skip()))
datatable(vk_text_t)
---
pagetitle: "Analysis of Comments from VK"
output: 
  html_notebook:
    toc: true
    toc_float: true
---

# Analysis of Comments from VK {.unnumbered}

### Install packages

```{r packages, eval=FALSE, message=FALSE, warning=FALSE, include=FALSE, paged.print=TRUE}
# Install every package the notebook loads (one-off step; this chunk is
# marked eval=FALSE, so it is not run when the notebook is rendered).
# "readr" is included because later chunks call library(readr).
install.packages(c(
  "tm",           # for text mining
  "SnowballC",    # for text stemming
  "wordcloud",    # word-cloud generator
  "RColorBrewer", # color palettes
  "syuzhet",      # for sentiment analysis
  "ggplot2",      # for plotting graphs
  "DT",           # for displaying tables
  "readr"         # for reading the CSV files shown as tables
))
```

```{r include=FALSE}
# Load
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("syuzhet")
library("ggplot2")
```

### Prepare text for analysis

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
# Read the comments file line by line from a fixed path in the working
# directory (the path is hard-coded, no file chooser is opened)
text <- readLines(con = "vk_text_t.csv")
# Wrap the lines in a tm corpus, one document per element of `text`
TextDoc <- Corpus(x = VectorSource(x = text))
```

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
 #Replacing "/", "@" and "|" with space
# Helper transformer: replace every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
# "|" is a regex metacharacter, so it has to be escaped
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
# Convert the text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stop words
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Remove custom stop words, given as a character vector
TextDoc <- tm_map(TextDoc, removeWords, c("s", "company", "team"))
# Remove punctuation (default character class, as before)
TextDoc <- tm_map(TextDoc, removePunctuation)
# Second pass for Unicode punctuation such as curly quotes and long dashes.
# Without it, tokens like "”kennedi" end up in the term-document matrix as
# terms distinct from the bare word. The first pass is kept because the
# Unicode punctuation class does not cover every symbol the default class
# removes.
TextDoc <- tm_map(TextDoc, removePunctuation, ucp = TRUE)
# Collapse runs of white space left behind by the removals above
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Stem the text, i.e. reduce words to their root form; all later lookups
# against the matrix must therefore use stems
TextDoc <- tm_map(TextDoc, stemDocument)
```

### Build a term document matrix

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
# Term-document matrix of the cleaned corpus
TextDoc_dtm <- TermDocumentMatrix(x = TextDoc)
# Plain matrix copy, used for the rowSums() based frequency count
dtm_m <- as.matrix(x = TextDoc_dtm)
```

### Identify most frequently used words

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
# Total count per term, ordered from most to least frequent
dtm_v <- sort(x = rowSums(x = dtm_m), decreasing = TRUE)
# Two-column table: the term and its total count
dtm_d <- data.frame(
  word = names(dtm_v),
  freq = dtm_v
)
```

```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=TRUE}
# Display the 500 most frequent terms from the frequency table
# (the chunk is marked include=FALSE, so nothing here reaches the report)
(head(dtm_d, 500))

# View the top 500 as an interactive table.
# NOTE(review): the table is read from "vk_top_500.csv", which no visible
# code in this notebook writes - presumably an export of head(dtm_d, 500)
# made by hand; confirm the file exists and is up to date before relying
# on it.
library(DT)
library(readr)
# The column named `...1` is skipped on import
vk_top_500 <- read_csv("vk_top_500.csv", 
    col_types = cols(...1 = col_skip()))
datatable(vk_top_500)
```

```{r fig.height=5, fig.width=10}
# Plot the most frequent words.
# head() returns at most 50 rows, so a corpus with fewer than 50 distinct
# terms no longer produces NA bars and NA labels (indexing with 1:50 pads
# the table with NA rows in that case).
top_words <- head(dtm_d, 50)
# las = 2 turns the axis labels perpendicular to the axis
barplot(top_words$freq, las = 2, names.arg = top_words$word,
        col = "lightgreen", main = "Top 50 most frequent words",
        ylab = "Word frequencies")
```

### Word cloud of top 150 words

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
# Draw the word cloud; the seed is fixed so the layout is reproducible
set.seed(seed = 1234)
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 5,          # skip terms seen fewer than 5 times
  max.words = 150,       # draw at most 150 words
  random.order = FALSE,  # place the most frequent words first
  rot.per = 0.40,        # share of words drawn rotated
  colors = brewer.pal(n = 8, name = "Dark2")
)
```

### Find associations between words

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
# Find associations for a fixed list of terms of interest.
# The corpus was stemmed before the matrix was built, so the matrix holds
# stems only. The query terms are stemmed the same way; otherwise a term
# such as "ukraine" (stored as "ukrain") is never found and returns an
# empty result. unique() drops duplicates in case two terms share a stem.
# The result list is named by stem.
query_terms <- c("putin", "ukraine", "ukrainian", "power", "war", "russia",
                 "russian", "prigozhin", "money", "pay", "work", "job",
                 "food", "children", "death")
query_stems <- unique(stemDocument(query_terms))
# Report every term whose correlation with a query stem is at least 0.35
findAssocs(TextDoc_dtm, terms = query_stems, corlimit = 0.35)
```

```{r message=FALSE, warning=FALSE}
# Associations (correlation of at least 0.35) for every term that occurs
# at least 1480 times; findFreqTerms() supplies that term list
findAssocs(
  x = TextDoc_dtm,
  terms = findFreqTerms(x = TextDoc_dtm, lowfreq = 1480),
  corlimit = 0.35
)
```

### Evaluate sentiments

```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=TRUE}
# Sentiment score per input line with the "syuzhet" method.
# Different methods use different scales, so scores are not directly
# comparable across methods.
syuzhet_vector <- get_sentiment(text, method = "syuzhet")
# First few scores
head(x = syuzhet_vector)
# Summary statistics of the scores
summary(object = syuzhet_vector)
```

```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=TRUE}
# Same scoring with the "bing" method
bing_vector <- get_sentiment(text, method = "bing")
head(x = bing_vector)
summary(object = bing_vector)
# Same scoring with the "afinn" method
afinn_vector <- get_sentiment(text, method = "afinn")
head(x = afinn_vector)
summary(object = afinn_vector)
```

```{r message=FALSE, warning=FALSE, include=FALSE, paged.print=TRUE}
# Compare the three methods on the first few lines: stack the leading scores
# into one matrix (one row per method) and keep only their sign, since the
# methods use different scales
sign(
  rbind(
    head(syuzhet_vector),
    head(bing_vector),
    head(afinn_vector)
  )
)
```

### Evaluate emotions

```{r emotion, message=FALSE, warning=FALSE, paged.print=TRUE}
# NRC emotion analysis of the raw input lines (`text`, not the cleaned
# corpus). The result has one row per line and one column per emotion:
# anger, anticipation, disgust, fear, joy, sadness, surprise, trust
# plus counts of the negative and positive words found in each row.
d <- get_nrc_sentiment(text)
# Show the first 10 rows of the result
head(d, n = 10)
```

```{r sentiments, message=FALSE, warning=FALSE, paged.print=FALSE}
# Transpose: afterwards each row is a sentiment category and each column is
# one input line
td <- data.frame(t(d))
# rowSums() adds up each row, giving one total per sentiment category.
# Every column except the first is included. The previous hard-coded range
# 2:14809 selected the same columns only for an input of exactly 14809
# lines; it failed on shorter input and silently cut off longer input.
# NOTE(review): column 1 is presumably the CSV header line picked up by
# readLines() - confirm that it should stay excluded.
td_new <- data.frame(rowSums(td[-1]))
# Name the totals column, move the category names from the row names into a
# proper column, then drop the row names
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
# Keep the first eight rows, i.e. the eight emotions; the remaining rows
# (negative / positive) are left out of this plot
td_new2 <- td_new[1:8, ]
# Plot one - count of words associated with each sentiment
quickplot(sentiment, data = td_new2, weight = count, geom = "bar",
          fill = sentiment, ylab = "count") +
  ggtitle("Sentiments in Comments on VK")

```

```{r sentiment_vk, message=FALSE, warning=FALSE, paged.print=TRUE}
# Plot two - share of each emotion among all emotion words, in percent.
# prop.table() divides the eight emotion columns by their grand total, so
# the column sums are proportions between 0 and 1; they are scaled by 100
# so the values match the "Percentage" axis label.
barplot(
  sort(100 * colSums(prop.table(d[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Text from VK Comments", xlab = "Percentage"
)
```

```{r view, message=FALSE, warning=FALSE, paged.print=TRUE}
# Show the source data as an interactive table
library(DT)
library(readr)
# Re-read the CSV as a data frame, skipping the column named `...1`
vk_text_t <- read_csv(
  file = "vk_text_t.csv",
  col_types = cols(...1 = col_skip())
)
datatable(data = vk_text_t)
```
