R Markdown

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
require(NLP)
require(jiebaR)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
# Read the AKI review records from the project-relative CSV as UTF-8 text.
aki_review <- fread("./data/AKI_review.csv", encoding = "UTF-8")

# Split the records into two plain data frames keyed by `id`:
# one carrying the titles, one carrying the abstracts.
aki_title <- as.data.frame(select(aki_review, id, title))

aki_abstract <- as.data.frame(select(aki_review, id, abstract))
# Tokenize every title into one lower-cased word per row, then discard
# tokens that carry no lexical meaning for the analysis.
aki_title_tokens <- aki_title %>%
  unnest_tokens(word, title) %>%
  filter(
    !grepl("[[:punct:]]", word), # drop tokens containing punctuation
    # The previous pattern "['^0-9']" also listed quote and caret characters
    # inside the class; the digit range alone expresses the intent.
    !grepl("[0-9]", word) # drop tokens containing digits
  ) %>%
  # Name the join key explicitly so the join does not depend on whichever
  # columns the two tables happen to share.
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Tokenize every abstract into one lower-cased word per row, then discard
# tokens that carry no lexical meaning for the analysis.
aki_abstract_tokens <- aki_abstract %>%
  unnest_tokens(word, abstract) %>%
  filter(
    !grepl("[[:punct:]]", word), # drop tokens containing punctuation
    # The previous pattern "['^0-9']" also listed quote and caret characters
    # inside the class; the digit range alone expresses the intent.
    !grepl("[0-9]", word) # drop tokens containing digits
  ) %>%
  # Name the join key explicitly so the join does not depend on whichever
  # columns the two tables happen to share.
  anti_join(stop_words, by = "word")
## Joining, by = "word"

title wordcloud

# Corpus-specific stop words: terms removed from the word counts in addition
# to the general `stop_words` list applied during tokenization.
my_stopwords <- tibble(
  word = c("patient", "patients", "aki", "acute", "kidney", "injury")
)

# Frequency table of title words, most frequent first.
# `count(sort = TRUE)` already orders rows by descending `n`; the former
# trailing `arrange(desc(n()))` sorted by the constant row count and
# therefore had no effect, so it has been removed.
aki_title_tokens %>%
  anti_join(my_stopwords, by = "word") %>%
  count(word, sort = TRUE) %>%
  filter(n > 3) # keep words that occur more than 3 times
## Joining, by = "word"
##               word  n
## 1       prediction 51
## 2             risk 44
## 3          surgery 31
## 4            score 28
## 5            model 24
## 6          predict 21
## 7          cardiac 20
## 8         contrast 20
## 9         coronary 20
## 10         induced 19
## 11      validation 19
## 12      predicting 17
## 13        nomogram 16
## 14      undergoing 16
## 15      predictive 15
## 16        clinical 14
## 17     development 14
## 18          models 14
## 19           renal 13
## 20           study 13
## 21      biomarkers 11
## 22          scores 11
## 23     angiography  9
## 24    percutaneous  9
## 25          cohort  8
## 26         disease  8
## 27         failure  8
## 28    intervention  8
## 29        learning  8
## 30           based  7
## 31         urinary  7
## 32        analysis  6
## 33            care  6
## 34         chinese  6
## 35            data  6
## 36      derivation  6
## 37      electronic  6
## 38           heart  6
## 39         machine  6
## 40     nephropathy  6
## 41   postoperative  6
## 42      critically  5
## 43         factors  5
## 44        hospital  5
## 45    hospitalized  5
## 46             ill  5
## 47           liver  5
## 48         records  5
## 49          simple  5
## 50         chronic  4
## 51      comparison  4
## 52          health  4
## 53      infarction  4
## 54       intensive  4
## 55  intraoperative  4
## 56           major  4
## 57      myocardial  4
## 58        outcomes  4
## 59        predicts  4
## 60 transplantation  4
## 61            unit  4
  # arrange(desc(n()))  %>% 
  # wordcloud2() 

title wordcloud

abstract wordcloud

# Word cloud of abstract words after removing the corpus-specific stop words.
# `count(sort = TRUE)` already orders rows by descending `n`; the former
# `arrange(desc(n()))` sorted by the constant row count and had no effect,
# so it has been removed.
aki_abstract_tokens %>%
  anti_join(my_stopwords, by = "word") %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>% # keep words that occur more than 30 times
  wordcloud2()
## Joining, by = "word"

abstract wordcloud

# Count how many records (`id`) contain each pair of title words.
# `upper = FALSE` is passed so each pair is not reported in both orders;
# `sort = TRUE` puts the most frequent pairs first.
title_word_pairs <- pairwise_count(
  aki_title_tokens,
  word,
  id,
  sort = TRUE,
  upper = FALSE
)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
# Preview the 20 most frequent title word pairs.
head(title_word_pairs, 20)
## # A tibble: 20 x 3
##    item1  item2          n
##    <chr>  <chr>      <dbl>
##  1 kidney injury       100
##  2 acute  injury        99
##  3 acute  kidney        98
##  4 acute  prediction    43
##  5 kidney prediction    42
##  6 injury prediction    42
##  7 acute  patients      41
##  8 kidney patients      40
##  9 injury patients      40
## 10 acute  risk          34
## 11 kidney risk          34
## 12 injury risk          34
## 13 acute  surgery       25
## 14 kidney surgery       25
## 15 injury surgery       25
## 16 acute  score         22
## 17 kidney score         22
## 18 injury score         22
## 19 acute  model         22
## 20 kidney model         22
# Count how many records (`id`) contain each pair of abstract words.
# `upper = FALSE` is passed so each pair is not reported in both orders;
# `sort = TRUE` puts the most frequent pairs first.
abstract_word_pairs <- pairwise_count(
  aki_abstract_tokens,
  word,
  id,
  sort = TRUE,
  upper = FALSE
)

# Preview the 20 most frequent abstract word pairs.
head(abstract_word_pairs, 20)
## # A tibble: 20 x 3
##    item1   item2        n
##    <chr>   <chr>    <dbl>
##  1 acute   patients   117
##  2 kidney  patients   112
##  3 acute   kidney     111
##  4 kidney  injury     111
##  5 risk    patients   111
##  6 acute   injury     110
##  7 injury  patients   109
##  8 acute   risk       108
##  9 kidney  risk       103
## 10 kidney  aki        102
## 11 aki     patients   101
## 12 injury  aki        100
## 13 injury  risk       100
## 14 results patients   100
## 15 acute   aki         99
## 16 acute   results     98
## 17 aki     risk        94
## 18 kidney  results     93
## 19 risk    results     92
## 20 injury  results     91

title word network

# Network of title words that co-occur in at least 6 records.
# The seed is fixed before plotting so the "fr" layout is reproducible.
set.seed(1234)
ggraph(
  graph_from_data_frame(filter(title_word_pairs, n >= 6)),
  layout = "fr"
) +
  # Edge opacity and width both scale with the co-occurrence count.
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  # Repel the labels so they do not sit on top of the node markers.
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

> title word network

abstract word network

# Network of abstract words that co-occur in at least 40 records.
# The seed is fixed before plotting so the "fr" layout is reproducible.
set.seed(1234)
ggraph(
  graph_from_data_frame(filter(abstract_word_pairs, n >= 40)),
  layout = "fr"
) +
  # Edge opacity and width both scale with the co-occurrence count.
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "darkred"
  ) +
  geom_node_point(size = 5) +
  # Repel the labels so they do not sit on top of the node markers.
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

abstract word network