datasciencewc.R

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(RISmed)
# Install wordcloud on first use. requireNamespace() only tests whether the
# package can be loaded; it does not attach it, so the library() call below
# remains the single place where wordcloud is put on the search path.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}

library(wordcloud)

## Loading required package: RColorBrewer
library(tidytext)
library(tidyr)

# Run a PubMed search (esearch) for the data science / public health query.
# Matching is restricted by publication date (datetype = "pdat") to the years
# 2000 through 2016, and retmax is set to 5000.
res1 <- EUtilsSummary(
  "data + science, public + health",
  db = "pubmed",
  type = "esearch",
  datetype = "pdat",
  retmax = 5000,
  mindate = 2000,
  maxdate = 2016
)

# Show the search summary object.
res1

## [1] "((\"EPJ Data Sci\"[Journal] OR (\"data\"[All Fields] AND \"science\"[All Fields]) OR \"data science\"[All Fields]) AND (\"public health\"[MeSH Terms] OR (\"public\"[All Fields] AND \"health\"[All Fields]) OR \"public health\"[All Fields])) AND 2000[PDAT] : 2016[PDAT]"

# Download the records (efetch) for the hits described by the search summary.
fetch <- EUtilsGet(res1, type = "efetch", db = "pubmed")

## Warning in Medline(Result, query): NAs introduced by coercion

## Warning in Medline(Result, query): NAs introduced by coercion

# Article titles as a plain character vector, read straight from the
# ArticleTitle slot rather than round-tripping through a data frame.
titles <- as.character(fetch@ArticleTitle)

# One row per article with a single character column `titles`, which is the
# input the tokenising steps below work on. stringsAsFactors = FALSE keeps the
# column as character on every R version, so no conversion back from factor
# is needed.
tidyabst <- data.frame(titles = titles, stringsAsFactors = FALSE)
# Word frequencies across all titles: one token per row, stop words removed,
# most frequent first. The join column is named explicitly so the anti-join
# does not depend on (or print a message about) whichever column names the
# two tables happen to share.
cloud <- tidyabst %>%
  unnest_tokens(word, titles) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

# Draw the word cloud from the word/count columns: at most 100 words, each
# occurring at least 10 times, coloured with the 8-colour "Dark2" palette.
cloud %>%
  with(
    wordcloud(
      word, n,
      min.freq = 10,
      max.words = 100,
      colors = brewer.pal(8, "Dark2")
    )
  )

# Count two-word phrases (bigrams) in the titles, most frequent first.
#
# Missing titles are dropped before tokenising, and NA bigrams are dropped
# afterwards, so that missing values are not carried into the counts.
# NOTE(review): the run recorded with this script returned a 0-row tibble at
# this step. tidytext releases that collapse all rows into one string for
# n-gram tokens would turn the whole corpus into NA if a single title is
# missing -- presumably the cause; confirm against the installed tidytext
# version.
ds_bigrams <- tidyabst %>%
  filter(!is.na(titles)) %>%
  unnest_tokens(bigrams, titles, token = "ngrams", n = 2) %>%
  filter(!is.na(bigrams)) %>%
  count(bigrams, sort = TRUE)

ds_bigrams

# Split every bigram at the space into its first and second word.
bigrams_separated <- separate(
  ds_bigrams,
  bigrams,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only the bigrams in which neither word is a stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !(word1 %in% stop_words$word | word2 %in% stop_words$word)
)

# Bigrams whose first word begins with the character "0".
g <- bigrams_filtered %>%
  filter(grepl("^0", word1))

g

## # A tibble: 0 × 3
## # ... with 3 variables: word1 <chr>, word2 <chr>, n <int>

# Rank the remaining bigrams by how often they occur, skipping those whose
# first word is exactly "0".
#
# ds_bigrams already holds one row per distinct bigram with its frequency in
# `n`, so counting rows here would give 1 for every pair. Weighting by `n`
# carries the real frequencies through, and sort = TRUE orders the result
# without referring to the count column by name (it was `nn` in the run
# recorded with this script; the name can differ between dplyr versions).
bigrams_filtered %>%
  filter(word1 != "0") %>%
  count(word1, word2, wt = n, sort = TRUE)

# Re-join the two words of each filtered bigram into a single string column.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

# Frequency table of the filtered bigrams, most frequent first. The table
# already carries each bigram's frequency in `n`, so the tally is weighted by
# `n`; an unweighted count would only report how many rows each bigram has.
bigrams_united %>%
  count(bigram, wt = n, sort = TRUE)

datasciencewc.R

julianflowers

Sun Nov 27 09:28:48 2016