1 Загружаем требуемые библиотеки
2 Конвертируем ссылки из csv файла в строку
3 Создаем корпус
4 Создаем матрицу терминов документа
5 Предварительная обработка корпуса
6 Создадим новый DTM дл предварительно обработанного корпуса
7 Группировка настроений
8 Создадим облако слов

1 Загружаем требуемые библиотеки

library(tidyverse)
library(rvest)
library(jsonlite)
library(pbapply)
library(data.table)
library(urltools)

2 Конвертируем ссылки из csv файла в строку

apr = readLines("data/april.csv")

Создаем функцию извлекающую названия статей из всех URL адресов

art_title <- function(url.x){
        x <- read_html(url.x)
        
        title_xpath <- '//*[@id="screen-reader-main-title"]/span'
        x <- x %>%
                html_node(xpath = title_xpath) %>%
                html_text(trim = T)
        
        #x <- html_text(x)
        x <- data.table(x[[1]])
        return(x)
        Sys.sleep(5)
}

Объединяем их

a_t_apr <- pblapply(apr, art_title)

Создаем функцию извлекающую аннотации из всех URL адресов

art_abs <- function(url.y){
        y <- read_html(url.y)
        
        body_xpath <- "//*[@id='abstracts']"
        y <- y %>%
                html_nodes(xpath = body_xpath) %>%
                html_text(trim = T) %>%
                paste0(collapse = "\n")
        y <- data.table(y[[1]])
        return(y)
        Sys.sleep(3)
}

a_a_apr <- pblapply(apr, art_abs)

df1 = data.frame(matrix(unlist(a_t_apr), nrow = length(a_t_apr), byrow = T))
df1 <- tibble::rowid_to_column(df1, "doc_id")
names(df1)[2] <- "text"

df2 = data.frame(matrix(unlist(a_a_apr), nrow = length(a_a_apr), byrow = T))
df2 <- tibble::rowid_to_column(df2, "doc_id")
names(df2)[2] <- "text"

md4 <-rbind(df1, df2)

Записываем полученные данные в файл

write.csv2(md4, file = "data/april_data.csv")
md4 <- read.csv2("data/april_data.csv", sep = ";", encoding = "UTF-8")

Сохраняем данные в файл

save(md4, file = "data/data_april.RData")

Размерность фрейма данных

dim(md4)

## [1] 534   3

Названия столбцов

colnames(md4)

## [1] "X"      "doc_id" "text"

Удаляем пустой столбец в фрейме данных

md4 <- md4[-grep('X', colnames(md4))]

Проверяем

colnames(md4)

## [1] "doc_id" "text"

Загружаем библиотеки

library(NLP)
library(tm)

3 Создаем корпус

corpus_apr <- Corpus(DataframeSource(md4))

Смотрим на корпус

corpus_apr

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 534

Получим доступ к отдельному элементу

corpus_apr[[1]]

## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 15

Посмотрим содержание текста 62 элемента

as.character(corpus_apr[[62]])

## [1] "The ICCP-SS technique for retrofitting reinforced concrete compressive members subjected to corrosion"

4 Создаем матрицу терминов документа

DTM <- DocumentTermMatrix(corpus_apr)

Посмотрим на основную информацию о матрице

DTM

## <<DocumentTermMatrix (documents: 534, terms: 9518)>>
## Non-/sparse entries: 33868/5048744
## Sparsity           : 99%
## Maximal term length: 41
## Weighting          : term frequency (tf)

Размеры DTM

dim(DTM)

## [1]  534 9518

Загрузим пакет для арифметических операций с разреженными матрицами

library(slam)

Подсчитаем частоту слов

freqs <- col_sums(DTM)

Получим вектор словаря

words <- colnames(DTM)

Объединим слова и их частоты в фрейме данных

wordlist <- data.frame(words, freqs)

Упорядочим список слов

wordIndexes <- order(wordlist[, "freqs"], decreasing = TRUE)
wordlist <- wordlist[wordIndexes, ]

Посмотрим самые часто встречающиеся слова

head(wordlist, 25)

##                   words freqs
## the                 the  4348
## and                 and  2401
## with               with   697
## was                 was   579
## for                 for   552
## were               were   549
## concrete       concrete   500
## that               that   386
## strength       strength   320
## are                 are   291
## this               this   269
## properties   properties   262
## cement           cement   261
## results         results   224
## from               from   205
## asphalt         asphalt   202
## effect           effect   197
## can                 can   197
## using             using   196
## test               test   193
## different     different   192
## used               used   181
## between         between   164
## mechanical   mechanical   163
## compressive compressive   155

Приведем распределение слов по частоте

plot(wordlist$freqs , type = "l", lwd=2, 
     main = "Rank frequency Plot", xlab="Rank", ylab ="Frequency")

Используем логарифмические оси

plot(wordlist$freqs , type = "l", log="xy", lwd=2, 
     main = "Rank-Frequency Plot", xlab="log-Rank", ylab ="log-Frequency")

5 Предварительная обработка корпуса

Удалим стоп-слова

library(NLP)
plot(wordlist$freqs, type = "l", log="xy",lwd=2, 
     main = "Rank-Frequency plot", xlab="Rank", ylab = "Frequency")
englishStopwords <- stopwords("en")
stopwords_idx <- which(wordlist$words %in% englishStopwords)
low_frequent_idx <- which(wordlist$freqs < 10)
insignificant_idx <- union(stopwords_idx, low_frequent_idx)
meaningful_range_idx <- setdiff(1:nrow(wordlist), insignificant_idx)
lines(meaningful_range_idx, wordlist$freqs[meaningful_range_idx], 
      col = "green", lwd=2, type="p", pch=20)

Преобразуем слова в нижний регистр

corpus <- tm_map(corpus_apr, content_transformer(tolower))

Удаляем стоп-слова

corpus <- tm_map(corpus, removeWords, stopwords("en"))

Оставляем только буквенно-цифровые символы

corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)

Удаляем числовые символы

corpus <- tm_map(corpus, removeNumbers)

Удаляем специальные символы

for(j in seq(corpus))
{
        corpus[[j]] <- gsub("•", " ", corpus[[j]])
}

for(j in seq(corpus))
{
        corpus[[j]] <- gsub("\u0095", " ", corpus[[j]])
}

Удаляем лишние пробелы

corpus <- tm_map(corpus, stripWhitespace)

Загружаем библиотеку

library(SnowballC)

Удаляем специфические термины

corpus <- tm_map(corpus, removeWords, c("editorial", "article", "board",
                                              "download", "google", "figure",
                                              "fig", "groups", "however",
                                              "high", "human", "levels",
                                              "larger", "may", "number",
                                              "shown", "study", "studies", "this",
                                              "using", "two", "the", "highlights",
                                              "view", "biol",  "via", "image", 
                                              "doi", "one", "abstract", "well",
                                              "use", "also", "paper"
))

Результаты предварительной обработки для 62 элемента

as.character(corpus[[62]])

## [1] " iccp-ss technique retrofitting reinforced concrete compressive members subjected corrosion"

6 Создадим новый DTM дл предварительно обработанного корпуса

DTM1 <- DocumentTermMatrix(corpus)
DTM1

## <<DocumentTermMatrix (documents: 534, terms: 5838)>>
## Non-/sparse entries: 25452/3092040
## Sparsity           : 99%
## Maximal term length: 40
## Weighting          : term frequency (tf)

freqs1 <- col_sums(DTM1)
words1 <- colnames(DTM1)
wordlist1 <- data.frame(words1, freqs1)
wordIndexes1 <- order(wordlist1[, "freqs1"], decreasing = TRUE)
wordlist1 <- wordlist1[wordIndexes1, ]

head(wordlist1, 25)

##                  words1 freqs1
## concrete       concrete    605
## strength       strength    401
## properties   properties    326
## cement           cement    299
## results         results    264
## test               test    243
## effect           effect    223
## asphalt         asphalt    211
## different     different    197
## can                 can    197
## used               used    194
## mechanical   mechanical    172
## compressive compressive    160
## performance performance    148
## based             based    136
## temperature temperature    130
## water             water    129
## specimens     specimens    126
## resistance   resistance    125
## materials     materials    123
## aggregate     aggregate    122
## tests             tests    122
## model             model    120
## method           method    117
## mortar           mortar    116

tail(wordlist1, 25)

##                                      words1 freqs1
## partitioned                     partitioned      1
## prominently                     prominently      1
## references                       references      1
## stagger                             stagger      1
## ar-glass                           ar-glass      1
## drop-weight                     drop-weight      1
## pre-tensioned                 pre-tensioned      1
## pretension                       pretension      1
## pretensioned                   pretensioned      1
## textiles                           textiles      1
## typeabstractthe             typeabstractthe      1
##  ms                                      ms      1
## beamsabstractshear       beamsabstractshear      1
## predictable                     predictable      1
## synergy                             synergy      1
## broadly                             broadly      1
## chloridesabstractthis chloridesabstractthis      1
## enlarges                           enlarges      1
## triggered                         triggered      1
## acrylic                             acrylic      1
## butadiene                         butadiene      1
## extreme                             extreme      1
## pae                                     pae      1
## polyacrylic                     polyacrylic      1
## sae                                     sae      1

7 Группировка настроений

positive_terms_all <- readLines("positive-words.txt")
negative_terms_all <- readLines("negative-words.txt")

positive_terms_in_suto <- intersect(colnames(DTM1), positive_terms_all)
counts_positive <- row_sums(DTM1[, positive_terms_in_suto])

negative_terms_in_suto <- intersect(colnames(DTM1), negative_terms_all)
counts_negative <- row_sums(DTM1[, negative_terms_in_suto])

counts_all_terms <- row_sums(DTM1)

relative_sentiment_frequencies <- data.frame(
        positive = counts_positive / counts_all_terms,
        negative = counts_negative / counts_all_terms
)

sentiments_per_article <- aggregate(relative_sentiment_frequencies, 
                                      by = list(doc_id = md4$doc_id), mean)

Посмотрим на первые пять элементов

head(sentiments_per_article)

##   doc_id    positive    negative
## 1      1         NaN         NaN
## 2      2 0.045454545 0.007177033
## 3      3 0.031007752 0.085755814
## 4      4 0.012820513 0.092735043
## 5      5 0.022900763 0.103435115
## 6      6 0.004237288 0.021186441

Преобразуем данные в подходящий для ggplot2 формат

library(reshape2)
data_f <- melt(sentiments_per_article, id.vars = "doc_id")

Удалим пустые строки и проверим их количество

row.has.na <- apply(data_f, 1, function(x){any(is.na(x))})

sum(row.has.na)

## [1] 8

data_f <- data_f[!row.has.na,]

Представим результат графически

library(ggplot2)
ggplot(data = data_f, aes(x = doc_id, y = value, fill = variable)) + 
        geom_bar(stat="identity", position=position_dodge()) + coord_flip()

Отсортируем термины по негативному/позитивному настроению

Позитивное

ggplot(data = data_f, aes(x = reorder(doc_id, data_f$value, head, 1), y = value, fill = variable)) + 
        geom_bar(stat="identity", position=position_dodge()) + coord_flip()

## Warning: Use of `data_f$value` is discouraged. Use `value` instead.

Негативное

ggplot(data = data_f, aes(x = reorder(doc_id, data_f$value, tail, 1), y = value, fill = variable)) + 
        geom_bar(stat="identity", position=position_dodge()) + coord_flip()

## Warning: Use of `data_f$value` is discouraged. Use `value` instead.

8 Создадим облако слов

library(wordcloud2)
freqs1 <- col_sums(DTM1)
words1 <- colnames(DTM1)
top300 <- sort(freqs1, decreasing = TRUE)[1:300]
top300_df <- data.frame(words1 = names(top300), count = top300)
wordcloud2(top300_df, shuffle = F, shape = 'pentagon')

Scraping and Text Mining in R

pr7391

25 april 2019 г