library("RISmed")
?EUtilsSummary
res <-EUtilsSummary("alternative splicing", type="esearch", db="pubmed", datetype='pdat', mindate=2012, maxdate=2015, retmax=500)
head(res@PMID)
[1] "26463470" "26942228" "26923414"
[4] "26904376" "26891005" "26885001"
QueryCount(res)
[1] 5491
summary(res)
Query:
("alternative splicing"[MeSH Terms] OR ("alternative"[All Fields] AND "splicing"[All Fields]) OR "alternative splicing"[All Fields]) AND 2012[PDAT] : 2015[PDAT] 

Result count:  5491
?EUtilsGet
records <- EUtilsGet(res)
pubmed_data<-data.frame('year'=YearPubmed(records), 'Title'=ArticleTitle(records),'Abstract'=AbstractText(records))
View(pubmed_data)
head(pubmed_data,3)
table(pubmed_data$year)

2015 2016 
 466   34 
barplot(table(pubmed_data$year))

pubmed_date<-data.frame('month'=MonthPubmed(records), 'year'=YearPubmed(records))
barplot(table(pubmed_date[pubmed_date$year == 2015, ]$month))

將文字轉成語料(Corpus)

library(tm)
sentence <- 'Hello, I am David. I have taken over 100 courses ~~~'
sentence_source <- VectorSource(sentence)
sentence_corpus <- VCorpus(sentence_source)
sentence_corpus[[1]]$content
[1] "Hello, I am David. I have taken over 100 courses ~~~"
getTransformations()
[1] "removeNumbers"     "removePunctuation" "removeWords"      
[4] "stemDocument"      "stripWhitespace"  
doc <- tm_map(sentence_corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
doc[[1]]$content
[1] "Hello I am David I have taken over  courses "
## Transformer
removetilde<-content_transformer(
  function(x, pattern){
    return(gsub("~", "", x))
    }
)
sentence_source <- VectorSource(sentence)
sentence_corpus <- VCorpus(sentence_source)
sentence_corpus[[1]]$content
[1] "Hello, I am David. I have taken over 100 courses ~~~"
doc <- tm_map(sentence_corpus, removetilde)
doc[[1]]$content
[1] "Hello, I am David. I have taken over 100 courses "
# install.packages("SnowballC")
library(SnowballC)
stem_words <- stemDocument(c("complicatedly", "complicated", "complication"))
stem_words
[1] "complic" "complic" "complic"
# install.packages("jiebaR")
library(jiebaR)
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
  input string 1 is invalid in this locale
a <-  '隞予<e5><8e>擃<ab><88><88><88>'
mixseg <- worker()
mixseg <= a
[1] "隞予"    
1, <e5><8e>    
1, 擃<ab><88><88><88>"
1,
a <-  '李玉璽2014年就以歌手身分出道,演出《我的少女時代》後知名度大開,他坦言星二代身分讓他有壓力,但他說:「如果標籤沒有撕下來,就代表努力還不夠。」'
strsplit(a, ',|:|「|」|《|》')
#edit_dict()
#USERPATH
library(NLP)
a <-  '復興航空宣布解散後,傳出遠東航空董事長張綱維表態有不少投資人拱他接下興航重整案,他願意考慮。張綱維今天上午10時出面召開記者會說明。不過最後張綱維僅說,「呼籲想要航權的國籍航空兩大龍頭華航與長榮出面接手!若兩家航空公司不願意,他才會考慮。'
s <-strsplit(x=a, split='')
bigram <- ngrams(unlist(s), 3)
terms <- sapply(bigram, function(e) paste(e, collapse=''))
tb<-table(terms)
tb[tb >=2]

a
mixseg <- worker()
mixseg <= a

edit_dict()

建立詞頻矩陣

s1 <-'this is a book'
s2 <-'this is my car'
split_string<-strsplit(c(s1, s2), ' ')
split_string
[[1]]
[1] "this" "is"   "a"    "book"

[[2]]
[1] "this" "is"   "my"   "car" 
corpus <- Corpus(VectorSource(split_string))
dtm<-DocumentTermMatrix(corpus)
inspect(dtm)
<<DocumentTermMatrix (documents: 2, terms: 3)>>
Non-/sparse entries: 4/2
Sparsity           : 33%
Maximal term length: 4
Weighting          : term frequency (tf)

    Terms
Docs book car this
   1    1   0    1
   2    0   1    1
control.list<-list(wordLengths=c(1, 20))
dtm<-DocumentTermMatrix(corpus, control=control.list)
inspect(dtm)
<<DocumentTermMatrix (documents: 2, terms: 6)>>
Non-/sparse entries: 8/4
Sparsity           : 33%
Maximal term length: 4
Weighting          : term frequency (tf)

    Terms
Docs a book car is my this
   1 1    1   0  1  0    1
   2 0    0   1  1  1    1
stopwords("english")
  [1] "i"          "me"         "my"         "myself"    
  [5] "we"         "our"        "ours"       "ourselves" 
  [9] "you"        "your"       "yours"      "yourself"  
 [13] "yourselves" "he"         "him"        "his"       
 [17] "himself"    "she"        "her"        "hers"      
 [21] "herself"    "it"         "its"        "itself"    
 [25] "they"       "them"       "their"      "theirs"    
 [29] "themselves" "what"       "which"      "who"       
 [33] "whom"       "this"       "that"       "these"     
 [37] "those"      "am"         "is"         "are"       
 [41] "was"        "were"       "be"         "been"      
 [45] "being"      "have"       "has"        "had"       
 [49] "having"     "do"         "does"       "did"       
 [53] "doing"      "would"      "should"     "could"     
 [57] "ought"      "i'm"        "you're"     "he's"      
 [61] "she's"      "it's"       "we're"      "they're"   
 [65] "i've"       "you've"     "we've"      "they've"   
 [69] "i'd"        "you'd"      "he'd"       "she'd"     
 [73] "we'd"       "they'd"     "i'll"       "you'll"    
 [77] "he'll"      "she'll"     "we'll"      "they'll"   
 [81] "isn't"      "aren't"     "wasn't"     "weren't"   
 [85] "hasn't"     "haven't"    "hadn't"     "doesn't"   
 [89] "don't"      "didn't"     "won't"      "wouldn't"  
 [93] "shan't"     "shouldn't"  "can't"      "cannot"    
 [97] "couldn't"   "mustn't"    "let's"      "that's"    
[101] "who's"      "what's"     "here's"     "there's"   
[105] "when's"     "where's"    "why's"      "how's"     
[109] "a"          "an"         "the"        "and"       
[113] "but"        "if"         "or"         "because"   
[117] "as"         "until"      "while"      "of"        
[121] "at"         "by"         "for"        "with"      
[125] "about"      "against"    "between"    "into"      
[129] "through"    "during"     "before"     "after"     
[133] "above"      "below"      "to"         "from"      
[137] "up"         "down"       "in"         "out"       
[141] "on"         "off"        "over"       "under"     
[145] "again"      "further"    "then"       "once"      
[149] "here"       "there"      "when"       "where"     
[153] "why"        "how"        "all"        "any"       
[157] "both"       "each"       "few"        "more"      
[161] "most"       "other"      "some"       "such"      
[165] "no"         "nor"        "not"        "only"      
[169] "own"        "same"       "so"         "than"      
[173] "too"        "very"      
str(pubmed_data)
'data.frame':   500 obs. of  3 variables:
 $ year    : num  2016 2016 2016 2016 2016 ...
 $ Title   : Factor w/ 500 levels "[Alternative Splicing Detection as a Biomarker for Cancer Diagnosis: A Novel Progressive Mechanism of Acute Lymphoblastic Leuke"| __truncated__,..: 90 50 491 120 183 373 474 465 271 79 ...
 $ Abstract: chr  "The composition and function of the central nervous system (CNS) is extremely complex. In addition to hundreds of subtypes of n"| __truncated__ "Neurodegenerative diseases have a variety of different genes contributing to their underlying pathology. Unfortunately, for man"| __truncated__ "An astonishing number of neurological diseases result from expansion of unstable repetitive sequences causing alterations in ke"| __truncated__ "Composite retrotransposons are widely distributed in the plant and animal kingdoms. Some of the most complex of these are found"| __truncated__ ...
pubmed_data$Abstract <- as.character(pubmed_data$Abstract)
#strsplit(pubmed_data$Abstract, ' ')
split_string<-strsplit(pubmed_data$Abstract, ' ')
corpus <-Corpus(VectorSource(split_string))
doc <-tm_map(corpus, removeNumbers)
doc <-tm_map(doc, removePunctuation)
doc <-tm_map(doc, stemDocument)
doc <-tm_map(doc, removeWords, stopwords("english"))
doc <-tm_map(doc, removeWords, c('the', 'use', 'also'))
dtm<-DocumentTermMatrix(doc)
?findFreqTerms
findFreqTerms(dtm, 200,Inf)
 [1] "activ"      "altern"     "analysi"    "associ"    
 [5] "cancer"     "cell"       "data"       "develop"   
 [9] "differ"     "diseas"     "exon"       "express"   
[13] "factor"     "function"   "gene"       "human"     
[17] "identifi"   "isoform"    "level"      "mechan"    
[21] "mutat"      "protein"    "regul"      "result"    
[25] "rna"        "role"       "sequenc"    "show"      
[29] "splice"     "studi"      "the"        "transcript"
[33] "variant"   
findAssocs(dtm, 'exon', 0.2)
$exon
                  skip                 splice 
                  0.47                   0.39 
                inclus                  chose 
                  0.34                   0.33 
               duchenn           exonsresults 
                  0.33                   0.33 
      humanconclusions                 richer 
                  0.33                   0.33 
            skipenhanc         splicingtarget 
                  0.33                   0.33 
               tginduc                 chemic 
                  0.33                   0.29 
              insensit          polypyrimidin 
                  0.28                   0.27 
               cassett                  mutat 
                  0.26                   0.26 
                  pair                   rule 
                  0.26                   0.26 
                  ball                 bypass 
                  0.25                   0.25 
                  cart                 epitop 
                  0.25                   0.25 
                 escap   isoformsabstracttext 
                  0.25                   0.25 
  isoformssignificance               preexist 
                  0.25                   0.25 
         pulldownsirna            receptorarm 
                  0.25                   0.25 
                exonic                   lack 
                  0.24                   0.24 
               minigen                synonym 
                  0.24                   0.24 
                 tract                  bcell 
                  0.24                   0.23 
              compound                   fail 
                  0.23                   0.23 
                  kill                 relaps 
                  0.23                   0.23 
                  site                antigen 
                  0.23                   0.22 
            combinator                    eci 
                  0.22                   0.22 
                featur                halflif 
                  0.22                   0.22 
        leukemiaspecif               powerlaw 
                  0.22                   0.22 
           prerequisit quotattractivenessquot 
                  0.22                   0.22 
     quotexonshubsquot              runxrunxt 
                  0.22                   0.22 
              scalefre               stochast 
                  0.22                   0.22 
                tposit              unproduct 
                  0.22                   0.22 
               varianc                   ensu 
                  0.22                   0.21 
               hemizyg                attempt 
                  0.21                   0.20 
                enhanc            lymphoblast 
                  0.20                   0.20 
dim(dtm)
[1]  500 7517
dtm.remove<-removeSparseTerms(dtm, 0.95)
dim(dtm.remove)
[1] 500 348
head(dtm.remove$dimnames$Terms)
[1] "aberr" "abund" "acid"  "act"   "activ" "addit"
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
edit_dict()

mixseg <- worker()
s <- '林明昇超跑基地曝光! 30輛頂級保時捷藏在這裡'
s1 <- '復興航空一路騙 董事長林明昇被譙翻'
s2 <- '熱愛超跑 林明昇被批敗家'

s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
s2.vec <- segment(code= s2 , jiebar = mixseg)
s.corpus = CNCorpus(list(s.vec, s1.vec, s2.vec))

control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
doc   <-tm_map(s.corpus, removeNumbers)
s.dtm <- DocumentTermMatrix(doc, control=control.list)
inspect(s.dtm)
tb[order(tb, decreasing = TRUE)][1:20]

         of         the         and          in 
       4480        4358        3583        2961 
         to           a        that          is 
       1758        1575        1220        1004 
       with    splicing         for          by 
        864         840         765         693 
alternative         are  expression          we 
        580         544         544         512 
         as         The        were        gene 
        507         494         457         421 

Word Frequency Matrix

wfm(pubmed_data$Abstract)
Error: could not find function "wfm"

使用詞頻矩陣的結果計算詞頻(續)

doc <-tm_map(doc, stemDocument)
doc <-tm_map(doc, stemDocument)
doc <-tm_map(doc, removeWords, stopwords("english"))
dtm<-DocumentTermMatrix(doc)
terms <- colSums(as.matrix(dtm))
sort(terms, decreasing = TRUE)[1:10]
    splice       gene    express       cell 
      1356        903        860        745 
    altern    protein        the transcript 
       658        528        493        486 
     regul   function 
       450        416 

使用barplot呈現詞頻最高的字詞

term_sum<-colSums(as.matrix(dtm))
Warning message:
In file(con, "rb") :
  cannot open file 'C:/Users/wilsonhsu/AppData/Local/RStudio-Desktop/notebooks/23299BFB-Demo20161125/1/s/ceylbzotjaoi1/temp': Permission denied
barplot(sort(term_sum, decreasing =TRUE)[1:10])

#dtm.remove$dimnames$Terms
assocation <- data.frame()
Warning messages:
1: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
2: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
for (t in dtm.remove$dimnames$Terms){
   for (assoc in names(findAssocs(dtm.remove, t, 0.2)[[1]])){
     assocation <- rbind(assocation, data.frame(source = t,  target= assoc))
   }
} 
write.csv(x = assocation, file='association.csv')
getwd()
[1] "C:/Users/wilsonhsu/Desktop"
res2 <-EUtilsSummary("gene expression", type="esearch", db="pubmed", datetype='pdat', mindate=2012, maxdate=2015, retmax=500)records2<-EUtilsGet(res2)gene_expression<-data.frame('year'=YearPubmed(records2), 'Title'=ArticleTitle(records2),'Abstract'=AbstractText(records2))
Error: unexpected symbol in "res2 <-EUtilsSummary("gene expression", type="esearch", db="pubmed", datetype='pdat', mindate=2012, maxdate=2015, retmax=500)records2"
clean_corpus<-function(corpus){
  corpus <-tm_map(corpus, removePunctuation)corpus <-tm_map(corpus, stripWhitespace)
Error: unexpected symbol in:
"clean_corpus<-function(corpus){
  corpus <-tm_map(corpus, removePunctuation)corpus"

pyramid.plot(top25_df$x, top25_df$y,labels=top25_df$labels,main ="Words in Common",gap =150,space=0.2,raxlab=NULL, unit=NULL,top.labels=c("Alternative Splicing","Words","Gene Expression"))
[1] 5.1 4.1 2.1 2.1

