library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
require(NLP)
require(jiebaR)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
# Read ./data/AKI_review.csv with fread, declaring the file as UTF-8.
aki_review <- fread("./data/AKI_review.csv", encoding = "UTF-8")

# Plain data frames pairing each record id with its title / its abstract.
aki_title <- as.data.frame(select(aki_review, id, title))
aki_abstract <- as.data.frame(select(aki_review, id, abstract))
# Split every title into one word per row, then drop noise tokens.
aki_title_tokens <- aki_title %>%
  unnest_tokens(word, title) %>%
  # Drop any token that contains punctuation (the whole token is removed).
  filter(!grepl("[[:punct:]]", word)) %>%
  # Drop any token that contains a digit. The earlier class "['^0-9']" also
  # listed a quote and a caret as literals: it read like a negation, but those
  # characters are already excluded by the punctuation filter above.
  filter(!grepl("[0-9]", word)) %>%
  # Remove rows whose word appears in the `stop_words` table.
  anti_join(stop_words)
## Joining, by = "word"
# Split every abstract into one word per row, then drop noise tokens.
aki_abstract_tokens <- aki_abstract %>%
  unnest_tokens(word, abstract) %>%
  # Drop any token that contains punctuation (the whole token is removed).
  filter(!grepl("[[:punct:]]", word)) %>%
  # Drop any token that contains a digit. The earlier class "['^0-9']" also
  # listed a quote and a caret as literals: it read like a negation, but those
  # characters are already excluded by the punctuation filter above.
  filter(!grepl("[0-9]", word)) %>%
  # Remove rows whose word appears in the `stop_words` table.
  anti_join(stop_words)
## Joining, by = "word"
# Extra domain-specific stop words excluded from the frequency summaries.
my_stopwords <- tibble(
  word = c("patient", "patients", "aki", "acute", "kidney", "injury")
)

# Title word frequencies, most frequent first.
aki_title_tokens %>%
  anti_join(my_stopwords) %>%
  count(word, sort = TRUE) %>%
  filter(n > 3) %>% # keep words that occur more than 3 times
  # Order by the count column `n`. The earlier `desc(n())` ordered by the
  # (constant) number of rows, so it never changed the row order.
  arrange(desc(n))
## Joining, by = "word"
## word n
## 1 prediction 51
## 2 risk 44
## 3 surgery 31
## 4 score 28
## 5 model 24
## 6 predict 21
## 7 cardiac 20
## 8 contrast 20
## 9 coronary 20
## 10 induced 19
## 11 validation 19
## 12 predicting 17
## 13 nomogram 16
## 14 undergoing 16
## 15 predictive 15
## 16 clinical 14
## 17 development 14
## 18 models 14
## 19 renal 13
## 20 study 13
## 21 biomarkers 11
## 22 scores 11
## 23 angiography 9
## 24 percutaneous 9
## 25 cohort 8
## 26 disease 8
## 27 failure 8
## 28 intervention 8
## 29 learning 8
## 30 based 7
## 31 urinary 7
## 32 analysis 6
## 33 care 6
## 34 chinese 6
## 35 data 6
## 36 derivation 6
## 37 electronic 6
## 38 heart 6
## 39 machine 6
## 40 nephropathy 6
## 41 postoperative 6
## 42 critically 5
## 43 factors 5
## 44 hospital 5
## 45 hospitalized 5
## 46 ill 5
## 47 liver 5
## 48 records 5
## 49 simple 5
## 50 chronic 4
## 51 comparison 4
## 52 health 4
## 53 infarction 4
## 54 intensive 4
## 55 intraoperative 4
## 56 major 4
## 57 myocardial 4
## 58 outcomes 4
## 59 predicts 4
## 60 transplantation 4
## 61 unit 4
# arrange(desc(n())) %>%
# wordcloud2()
# Figure: title word cloud
# Word cloud of abstract words, after removing the extra domain stop words.
aki_abstract_tokens %>%
  anti_join(my_stopwords) %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>% # keep words that occur more than 30 times
  # Order by the count column `n`. The earlier `desc(n())` ordered by the
  # (constant) number of rows, so it never changed the row order.
  arrange(desc(n)) %>%
  wordcloud2()
## Joining, by = "word"
# Figure: abstract word cloud
# For each pair of title words, count the records (ids) in which both occur.
# Called with sort = TRUE and upper = FALSE, as recorded in the output below.
title_word_pairs <- pairwise_count(
  aki_title_tokens, word, id,
  sort = TRUE, upper = FALSE
)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
# Preview the first 20 title word pairs.
head(title_word_pairs, 20)
## # A tibble: 20 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 kidney injury 100
## 2 acute injury 99
## 3 acute kidney 98
## 4 acute prediction 43
## 5 kidney prediction 42
## 6 injury prediction 42
## 7 acute patients 41
## 8 kidney patients 40
## 9 injury patients 40
## 10 acute risk 34
## 11 kidney risk 34
## 12 injury risk 34
## 13 acute surgery 25
## 14 kidney surgery 25
## 15 injury surgery 25
## 16 acute score 22
## 17 kidney score 22
## 18 injury score 22
## 19 acute model 22
## 20 kidney model 22
# For each pair of abstract words, count the records (ids) in which both occur.
abstract_word_pairs <- pairwise_count(
  aki_abstract_tokens, word, id,
  sort = TRUE, upper = FALSE
)

# Preview the first 20 abstract word pairs.
head(abstract_word_pairs, 20)
## # A tibble: 20 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 acute patients 117
## 2 kidney patients 112
## 3 acute kidney 111
## 4 kidney injury 111
## 5 risk patients 111
## 6 acute injury 110
## 7 injury patients 109
## 8 acute risk 108
## 9 kidney risk 103
## 10 kidney aki 102
## 11 aki patients 101
## 12 injury aki 100
## 13 injury risk 100
## 14 results patients 100
## 15 acute aki 99
## 16 acute results 98
## 17 aki risk 94
## 18 kidney results 93
## 19 risk results 92
## 20 injury results 91
# Network of title words: an edge links two words that co-occur in at least
# 6 records; edge opacity and width both scale with the co-occurrence count.
set.seed(1234) # fix the RNG state before the "fr" layout is computed
ggraph(
  graph_from_data_frame(filter(title_word_pairs, n >= 6)),
  layout = "fr"
) +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
> title word network
# Network of abstract words: an edge links two words that co-occur in at least
# 40 records; edge opacity and width both scale with the co-occurrence count.
set.seed(1234) # fix the RNG state before the "fr" layout is computed
ggraph(
  graph_from_data_frame(filter(abstract_word_pairs, n >= 40)),
  layout = "fr"
) +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "darkred"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
# Figure: abstract word network