text

# Load the pipe-separated posts file; every column is kept as data
# (row.names = NULL stops read.csv from using the first column as row names).
posts <- read.csv(
  "/home/voskresenskiiv/wwsss_project/posts.lem.csv",
  sep = "|",
  row.names = NULL
)

# Map the numeric `from_id` of each post to a short channel label.
# Any id that is not listed falls through to "Meduza"; a missing id stays NA.
channel_ids <- c(
  RIA = -15755094,
  Dojd = -17568841,
  Tass = -26284064,
  Pervy = -49388814,
  Echo = -60556804,
  New_gazeta = -6726778
)
channel_idx <- match(posts$from_id, channel_ids)
posts$group_name <- ifelse(
  is.na(channel_idx),
  "Meduza",
  names(channel_ids)[channel_idx]
)
posts$group_name[is.na(posts$from_id)] <- NA

# Split channels into two editorial camps: RIA, Tass and Pervy are labelled
# "governmental", everything else "independent" (NA labels stay NA).
state_channels <- c("RIA", "Tass", "Pervy")
posts$direction <- ifelse(
  posts$group_name %in% state_channels,
  "governmental",
  "independent"
)
posts$direction[is.na(posts$group_name)] <- NA

# Normalise the post text before tokenisation. Applied innermost-first:
#   1. punctuation        -> space
#   2. ASCII digits       -> space
#   3. anything that is not alphanumeric -> space
posts$text <- gsub(
  "[^[:alnum:]]", " ",
  gsub(
    "[0-9]", " ",
    gsub("[[:punct:]]", " ", posts$text)
  )
)

library(tidytext)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tm)

## Loading required package: NLP

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

# One row per token, annotated with the token's overall corpus frequency.
tidy_posts <- posts %>%
  unnest_tokens(word, text) %>%
  group_by(word) %>%
  mutate(count = n())

# The 50 most frequent tokens of the whole corpus.
freq_words <- tidy_posts %>%
  distinct(word, count) %>%
  arrange(desc(count)) %>%
  head(50)

# Stop list = Russian stop words from tm + the 50 most frequent corpus tokens.
# (The call still resolves to the tm function even though the result is stored
# under the same name, because R skips non-function objects when looking up a
# name in call position.)
stopwords <- c(stopwords("ru"), freq_words$word)

# Drop stop words, then attach the per-channel frequency of every remaining
# token. The result stays grouped by (group_name, word) and is sorted by that
# frequency, highest first.
tidy_posts <- tidy_posts %>%
  filter(!word %in% stopwords) %>%
  group_by(group_name, word) %>%
  mutate(docw_freq = n()) %>%
  select(-count) %>%
  arrange(desc(docw_freq))

# Ungrouped copy of the token table, sorted by per-channel frequency, with
# `word` turned into a factor whose levels follow that order (reversed).
plot_freq <- tidy_posts %>%
  ungroup() %>%
  arrange(desc(docw_freq)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

# Top 15 words per channel by in-channel frequency, one facet per channel.
plot_freq %>%
  distinct(group_name, word, docw_freq) %>%
  group_by(group_name) %>%
  # Name the ranking column explicitly instead of relying on top_n() picking
  # whichever column happens to be last.
  top_n(15, docw_freq) %>%
  ungroup() %>%
  # A plain reorder() sorts words once for the whole plot, so a word shared by
  # several channels ends up out of place in some facets. reorder_within()
  # sorts inside each facet; scale_x_reordered() strips the helper suffix from
  # the axis labels again.
  mutate(word = reorder_within(word, docw_freq, group_name)) %>%
  ggplot(aes(word, docw_freq)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~group_name, ncol = 4, scales = "free") +
  coord_flip()

## Selecting by docw_freq

# Top 30 words per direction (governmental / independent) by frequency.
#
# `docw_freq` in plot_freq is a per-channel count, so reusing it here would
# give one row per channel for the same (direction, word) pair, silently merge
# channels that happen to have equal counts, and rank words by single-channel
# counts. plot_freq has one row per token occurrence, so counting rows per
# (direction, word) yields the actual direction-level frequency.
plot_freq %>%
  count(direction, word) %>%
  rename(docw_freq = n) %>%
  group_by(direction) %>%
  top_n(30, docw_freq) %>%
  ungroup() %>%
  # Order bars inside each facet rather than globally across both facets.
  mutate(word = reorder_within(word, docw_freq, direction)) %>%
  ggplot(aes(word, docw_freq)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~direction, ncol = 4, scales = "free") +
  coord_flip()

## Selecting by docw_freq

# tf-idf of every word, treating each channel (group_name) as one document.
#
# bind_tf_idf() expects exactly one row per document-term pair. tidy_posts has
# one row per token occurrence, so feeding it directly inflates the document
# totals and makes idf count occurrences instead of documents (which can turn
# idf negative). Compute the scores on the de-duplicated (channel, word) table
# and join them back, so tidy_words keeps the same rows and grouping as
# tidy_posts plus the tf / idf / tf_idf columns.
tidy_words <- tidy_posts %>%
  left_join(
    tidy_posts %>%
      ungroup() %>%
      distinct(group_name, word, docw_freq) %>%
      bind_tf_idf(word, group_name, docw_freq),
    by = c("group_name", "word", "docw_freq")
  ) %>%
  arrange(desc(tf_idf))

# Ungrouped copy sorted by tf-idf, with `word` as a factor whose levels follow
# that order (reversed).
plot_tfidf <- tidy_words %>%
  ungroup() %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

# Top 15 words per channel by tf-idf, one facet per channel.
plot_tfidf %>%
  distinct(group_name, word, tf_idf) %>%
  group_by(group_name) %>%
  # Explicit ranking column instead of top_n()'s "last column" default.
  top_n(15, tf_idf) %>%
  ungroup() %>%
  # Sort bars inside each facet; scale_x_reordered() cleans the axis labels.
  mutate(word = reorder_within(word, tf_idf, group_name)) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~group_name, ncol = 4, scales = "free") +
  coord_flip()

## Selecting by tf_idf

# Top 30 words per direction by tf-idf, treating each direction as a document.
#
# The channel-level tf_idf in plot_tfidf is relative to a single channel, so
# showing it under direction facets would repeat the same word once per
# channel. Scores are therefore recomputed here from token counts per
# (direction, word).
# NOTE(review): with only two directions idf is either 0 (word occurs in both)
# or log(2) (word occurs in one), so this chart surfaces words that are
# exclusive to one direction - confirm this is the intended reading.
tidy_posts %>%
  ungroup() %>%
  count(direction, word) %>%
  bind_tf_idf(word, direction, n) %>%
  group_by(direction) %>%
  top_n(30, tf_idf) %>%
  ungroup() %>%
  # Order bars inside each facet rather than globally across both facets.
  mutate(word = reorder_within(word, tf_idf, direction)) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~direction, ncol = 4, scales = "free") +
  coord_flip()

## Selecting by tf_idf

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

# Token rows whose word is "алексей".
alexey <- tidy_posts %>%
  filter(word == "алексей")

# Convert the `date` column to a calendar date.
# NOTE(review): presumably `date` holds Unix epoch seconds - confirm against
# the input file.
alexey[["date.normal"]] <- as.Date(
  as.POSIXct(alexey[["date"]], origin = "1970-01-01")
)

# Likes over time for those rows, one line per direction; axis titles are
# blanked and dates are printed as day-month.
ggplot(alexey, aes(x = date.normal, y = count_likes, colour = direction)) +
  geom_line() +
  scale_x_date(labels = function(x) format(x, "%d-%b")) +
  labs(x = "", y = "")

library(tidyr)
# Smoothed log ratio of each word's share in independent vs. governmental
# posts: positive = relatively more frequent in independent channels.
word_ratios <- tidy_posts %>%
  # tidy_posts is still grouped by (group_name, word). count() keeps existing
  # groups, which would leave one row per channel and word; since a channel
  # belongs to exactly one direction, one side of every ratio would then be
  # zero. Drop the grouping first so counts are per (word, direction).
  ungroup() %>%
  count(word, direction) %>%
  spread(direction, n, fill = 0) %>%
  # Add-one smoothing, then normalise each direction to proportions. Written
  # out per column instead of mutate_if(..., funs(...)), which is deprecated.
  mutate(
    governmental = (governmental + 1) / sum(governmental + 1),
    independent = (independent + 1) / sum(independent + 1)
  ) %>%
  mutate(logratio = log(independent / governmental)) %>%
  arrange(desc(logratio))

# Bar chart of the 15 largest log ratios (by absolute value) on each side of
# zero; bars are ordered by log ratio and coloured by sign.
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(x = word, y = logratio, fill = logratio < 0)) +
  geom_col() +
  # Legend entries follow the FALSE / TRUE order of `logratio < 0`.
  scale_fill_discrete(name = "", labels = c("Independent", "Governmental")) +
  labs(y = "log odds ratio (independent / governmental)") +
  coord_flip()