library(tidyverse)
library(rvest)
library(jsonlite)
library(pbapply)
library(data.table)
library(urltools)
apr = readLines("data/april.csv")
Создаем функцию извлекающую названия статей из всех URL адресов
art_title <- function(url.x){
x <- read_html(url.x)
title_xpath <- '//*[@id="screen-reader-main-title"]/span'
x <- x %>%
html_node(xpath = title_xpath) %>%
html_text(trim = T)
#x <- html_text(x)
x <- data.table(x[[1]])
return(x)
Sys.sleep(5)
}
Объединяем их
a_t_apr <- pblapply(apr, art_title)
Создаем функцию извлекающую аннотации из всех URL адресов
art_abs <- function(url.y){
y <- read_html(url.y)
body_xpath <- "//*[@id='abstracts']"
y <- y %>%
html_nodes(xpath = body_xpath) %>%
html_text(trim = T) %>%
paste0(collapse = "\n")
y <- data.table(y[[1]])
return(y)
Sys.sleep(3)
}
a_a_apr <- pblapply(apr, art_abs)
df1 = data.frame(matrix(unlist(a_t_apr), nrow = length(a_t_apr), byrow = T))
df1 <- tibble::rowid_to_column(df1, "doc_id")
names(df1)[2] <- "text"
df2 = data.frame(matrix(unlist(a_a_apr), nrow = length(a_a_apr), byrow = T))
df2 <- tibble::rowid_to_column(df2, "doc_id")
names(df2)[2] <- "text"
md4 <-rbind(df1, df2)
Записываем полученные данные в файл
write.csv2(md4, file = "data/april_data.csv")
md4 <- read.csv2("data/april_data.csv", sep = ";", encoding = "UTF-8")
Сохраняем данные в файл
save(md4, file = "data/data_april.RData")
Размерность фрейма данных
dim(md4)
## [1] 534 3
Названия столбцов
colnames(md4)
## [1] "X" "doc_id" "text"
Удаляем пустой столбец в фрейме данных
md4 <- md4[-grep('X', colnames(md4))]
Проверяем
colnames(md4)
## [1] "doc_id" "text"
Загружаем библиотеки
library(NLP)
library(tm)
corpus_apr <- Corpus(DataframeSource(md4))
Смотрим на корпус
corpus_apr
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 534
Получим доступ к отдельному элементу
corpus_apr[[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 15
Посмотрим содержание текста 62 элемента
as.character(corpus_apr[[62]])
## [1] "The ICCP-SS technique for retrofitting reinforced concrete compressive members subjected to corrosion"
DTM <- DocumentTermMatrix(corpus_apr)
Посмотрим на основную информацию о матрице
DTM
## <<DocumentTermMatrix (documents: 534, terms: 9518)>>
## Non-/sparse entries: 33868/5048744
## Sparsity : 99%
## Maximal term length: 41
## Weighting : term frequency (tf)
Размеры DTM
dim(DTM)
## [1] 534 9518
Загрузим пакет для арифметических операций с разреженными матрицами
library(slam)
Подсчитаем частоту слов
freqs <- col_sums(DTM)
Получим вектор словаря
words <- colnames(DTM)
Объединим слова и их частоты в фрейме данных
wordlist <- data.frame(words, freqs)
Упорядочим список слов
wordIndexes <- order(wordlist[, "freqs"], decreasing = TRUE)
wordlist <- wordlist[wordIndexes, ]
Посмотрим самые часто встречающиеся слова
head(wordlist, 25)
## words freqs
## the the 4348
## and and 2401
## with with 697
## was was 579
## for for 552
## were were 549
## concrete concrete 500
## that that 386
## strength strength 320
## are are 291
## this this 269
## properties properties 262
## cement cement 261
## results results 224
## from from 205
## asphalt asphalt 202
## effect effect 197
## can can 197
## using using 196
## test test 193
## different different 192
## used used 181
## between between 164
## mechanical mechanical 163
## compressive compressive 155
Приведем распределение слов по частоте
plot(wordlist$freqs , type = "l", lwd=2,
main = "Rank frequency Plot", xlab="Rank", ylab ="Frequency")
Используем логарифмические оси
plot(wordlist$freqs , type = "l", log="xy", lwd=2,
main = "Rank-Frequency Plot", xlab="log-Rank", ylab ="log-Frequency")
Удалим стоп-слова
library(NLP)
plot(wordlist$freqs, type = "l", log="xy",lwd=2,
main = "Rank-Frequency plot", xlab="Rank", ylab = "Frequency")
englishStopwords <- stopwords("en")
stopwords_idx <- which(wordlist$words %in% englishStopwords)
low_frequent_idx <- which(wordlist$freqs < 10)
insignificant_idx <- union(stopwords_idx, low_frequent_idx)
meaningful_range_idx <- setdiff(1:nrow(wordlist), insignificant_idx)
lines(meaningful_range_idx, wordlist$freqs[meaningful_range_idx],
col = "green", lwd=2, type="p", pch=20)
Преобразуем слова в нижний регистр
corpus <- tm_map(corpus_apr, content_transformer(tolower))
Удаляем стоп-слова
corpus <- tm_map(corpus, removeWords, stopwords("en"))
Оставляем только буквенно-цифровые символы
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
Удаляем числовые символы
corpus <- tm_map(corpus, removeNumbers)
Удаляем специальные символы
for(j in seq(corpus))
{
corpus[[j]] <- gsub("•", " ", corpus[[j]])
}
for(j in seq(corpus))
{
corpus[[j]] <- gsub("\u0095", " ", corpus[[j]])
}
Удаляем лишние пробелы
corpus <- tm_map(corpus, stripWhitespace)
Загружаем библиотеку
library(SnowballC)
Удаляем специфические термины
corpus <- tm_map(corpus, removeWords, c("editorial", "article", "board",
"download", "google", "figure",
"fig", "groups", "however",
"high", "human", "levels",
"larger", "may", "number",
"shown", "study", "studies", "this",
"using", "two", "the", "highlights",
"view", "biol", "via", "image",
"doi", "one", "abstract", "well",
"use", "also", "paper"
))
Результаты предварительной обработки для 62 элемента
as.character(corpus[[62]])
## [1] " iccp-ss technique retrofitting reinforced concrete compressive members subjected corrosion"
DTM1 <- DocumentTermMatrix(corpus)
DTM1
## <<DocumentTermMatrix (documents: 534, terms: 5838)>>
## Non-/sparse entries: 25452/3092040
## Sparsity : 99%
## Maximal term length: 40
## Weighting : term frequency (tf)
freqs1 <- col_sums(DTM1)
words1 <- colnames(DTM1)
wordlist1 <- data.frame(words1, freqs1)
wordIndexes1 <- order(wordlist1[, "freqs1"], decreasing = TRUE)
wordlist1 <- wordlist1[wordIndexes1, ]
head(wordlist1, 25)
## words1 freqs1
## concrete concrete 605
## strength strength 401
## properties properties 326
## cement cement 299
## results results 264
## test test 243
## effect effect 223
## asphalt asphalt 211
## different different 197
## can can 197
## used used 194
## mechanical mechanical 172
## compressive compressive 160
## performance performance 148
## based based 136
## temperature temperature 130
## water water 129
## specimens specimens 126
## resistance resistance 125
## materials materials 123
## aggregate aggregate 122
## tests tests 122
## model model 120
## method method 117
## mortar mortar 116
tail(wordlist1, 25)
## words1 freqs1
## partitioned partitioned 1
## prominently prominently 1
## references references 1
## stagger stagger 1
## ar-glass ar-glass 1
## drop-weight drop-weight 1
## pre-tensioned pre-tensioned 1
## pretension pretension 1
## pretensioned pretensioned 1
## textiles textiles 1
## typeabstractthe typeabstractthe 1
## ms ms 1
## beamsabstractshear beamsabstractshear 1
## predictable predictable 1
## synergy synergy 1
## broadly broadly 1
## chloridesabstractthis chloridesabstractthis 1
## enlarges enlarges 1
## triggered triggered 1
## acrylic acrylic 1
## butadiene butadiene 1
## extreme extreme 1
## pae pae 1
## polyacrylic polyacrylic 1
## sae sae 1
positive_terms_all <- readLines("positive-words.txt")
negative_terms_all <- readLines("negative-words.txt")
positive_terms_in_suto <- intersect(colnames(DTM1), positive_terms_all)
counts_positive <- row_sums(DTM1[, positive_terms_in_suto])
negative_terms_in_suto <- intersect(colnames(DTM1), negative_terms_all)
counts_negative <- row_sums(DTM1[, negative_terms_in_suto])
counts_all_terms <- row_sums(DTM1)
relative_sentiment_frequencies <- data.frame(
positive = counts_positive / counts_all_terms,
negative = counts_negative / counts_all_terms
)
sentiments_per_article <- aggregate(relative_sentiment_frequencies,
by = list(doc_id = md4$doc_id), mean)
Посмотрим на первые пять элементов
head(sentiments_per_article)
## doc_id positive negative
## 1 1 NaN NaN
## 2 2 0.045454545 0.007177033
## 3 3 0.031007752 0.085755814
## 4 4 0.012820513 0.092735043
## 5 5 0.022900763 0.103435115
## 6 6 0.004237288 0.021186441
Преобразуем данные в подходящий для ggplot2 формат
library(reshape2)
data_f <- melt(sentiments_per_article, id.vars = "doc_id")
Удалим пустые строки и проверим их количество
row.has.na <- apply(data_f, 1, function(x){any(is.na(x))})
sum(row.has.na)
## [1] 8
data_f <- data_f[!row.has.na,]
Представим результат графически
library(ggplot2)
ggplot(data = data_f, aes(x = doc_id, y = value, fill = variable)) +
geom_bar(stat="identity", position=position_dodge()) + coord_flip()
Отсортируем термины по негативному/позитивному настроению
Позитивное
ggplot(data = data_f, aes(x = reorder(doc_id, data_f$value, head, 1), y = value, fill = variable)) +
geom_bar(stat="identity", position=position_dodge()) + coord_flip()
## Warning: Use of `data_f$value` is discouraged. Use `value` instead.
Негативное
ggplot(data = data_f, aes(x = reorder(doc_id, data_f$value, tail, 1), y = value, fill = variable)) +
geom_bar(stat="identity", position=position_dodge()) + coord_flip()
## Warning: Use of `data_f$value` is discouraged. Use `value` instead.
library(wordcloud2)
freqs1 <- col_sums(DTM1)
words1 <- colnames(DTM1)
top300 <- sort(freqs1, decreasing = TRUE)[1:300]
top300_df <- data.frame(words1 = names(top300), count = top300)
wordcloud2(top300_df, shuffle = F, shape = 'pentagon')