# Load the RISmed PubMed client and open the help page of the search helper.
library(RISmed)
help("EUtilsSummary")

# Query PubMed for "alternative splicing", filtering on publication date
# (pdat) between 2012 and 2015 and keeping at most 500 hits.
res <- EUtilsSummary(
  "alternative splicing",
  type = "esearch",
  db = "pubmed",
  datetype = "pdat",
  mindate = 2012,
  maxdate = 2015,
  retmax = 500
)

# Peek at the first PubMed IDs stored in the PMID slot of the result.
head(res@PMID)
[1] "26463470" "26942228" "26923414"
[4] "26904376" "26891005" "26885001"
QueryCount(res)
[1] 5491
summary(res)
Query:
("alternative splicing"[MeSH Terms] OR ("alternative"[All Fields] AND "splicing"[All Fields]) OR "alternative splicing"[All Fields]) AND 2012[PDAT] : 2015[PDAT]
Result count: 5491
?EUtilsGet
# Download the full records for the hits found by the search stored in `res`.
records <- EUtilsGet(res)

# One row per article: publication year, title and abstract text.
# stringsAsFactors = FALSE keeps the text columns as character vectors on
# every R version (before R 4.0, data.frame() converted strings to factors),
# which is what the later text-mining steps need.
pubmed_data <- data.frame(
  "year" = YearPubmed(records),
  "Title" = ArticleTitle(records),
  "Abstract" = AbstractText(records),
  stringsAsFactors = FALSE
)
View(pubmed_data)
head(pubmed_data,3)
table(pubmed_data$year)
2015 2016
466 34
barplot(table(pubmed_data$year))

# Publication month and year of every downloaded record.
pubmed_date <- data.frame(
  "month" = MonthPubmed(records),
  "year" = YearPubmed(records)
)

# Bar chart of the number of articles per month, restricted to 2015.
barplot(table(pubmed_date$month[pubmed_date$year == 2015]))

將文字轉成語料(Corpus)
# Load the tm text-mining package.
library(tm)
# Sample sentence containing punctuation, a number and trailing tildes.
sentence <- 'Hello, I am David. I have taken over 100 courses ~~~'
# Wrap the character vector as a source, then build a corpus from it.
sentence_source <- VectorSource(sentence)
sentence_corpus <- VCorpus(sentence_source)
# Show the text of the first (and only) document in the corpus.
sentence_corpus[[1]]$content
[1] "Hello, I am David. I have taken over 100 courses ~~~"
getTransformations()
[1] "removeNumbers" "removePunctuation" "removeWords"
[4] "stemDocument" "stripWhitespace"
# Apply two of the built-in transformations in sequence: first remove the
# digits, then remove the punctuation.
doc <- tm_map(sentence_corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
# Show the cleaned text of the first document.
doc[[1]]$content
[1] "Hello I am David I have taken over courses "
## Transformer
# Custom tm transformation that deletes every match of `pattern` from a
# document's text. By default it strips tilde characters.
#   x       - character content of a document (passed in by the wrapper that
#             content_transformer() creates)
#   pattern - regular expression to delete; defaults to "~"
# Usage: tm_map(corpus, removetilde) or tm_map(corpus, removetilde, "<regex>")
removetilde <- content_transformer(
  function(x, pattern = "~") {
    gsub(pattern, "", x)
  }
)
sentence_source <- VectorSource(sentence)
sentence_corpus <- VCorpus(sentence_source)
sentence_corpus[[1]]$content
[1] "Hello, I am David. I have taken over 100 courses ~~~"
doc <- tm_map(sentence_corpus, removetilde)
doc[[1]]$content
[1] "Hello, I am David. I have taken over 100 courses "
# SnowballC must be installed once before it can be loaded:
# install.packages("SnowballC")
library(SnowballC)
# Stem three related word forms with stemDocument().
stem_words <- stemDocument(c("complicatedly", "complicated", "complication"))
# Print the resulting stems.
stem_words
[1] "complic" "complic" "complic"
# install.packages("jiebaR")
library(jiebaR)
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
a <- '隞予<e5><8e>擃<ab><88><88><88>'
mixseg <- worker()
mixseg <= a
[1] "隞予"
1, <e5><8e>
1, 擃<ab><88><88><88>"
1,
a <- '李玉璽2014年就以歌手身分出道,演出《我的少女時代》後知名度大開,他坦言星二代身分讓他有壓力,但他說:「如果標籤沒有撕下來,就代表努力還不夠。」'
strsplit(a, ',|:|「|」|《|》')
#edit_dict()
#USERPATH
library(NLP)
a <- '復興航空宣布解散後,傳出遠東航空董事長張綱維表態有不少投資人拱他接下興航重整案,他願意考慮。張綱維今天上午10時出面召開記者會說明。不過最後張綱維僅說,「呼籲想要航權的國籍航空兩大龍頭華航與長榮出面接手!若兩家航空公司不願意,他才會考慮。'
# Split the text in `a` into single characters.
s <- strsplit(x = a, split = "")
# Build character n-grams. NOTE: n = 3, so these are trigrams even though the
# variable is called `bigram`; the name is kept so existing references work.
bigram <- ngrams(unlist(s), 3)
# Glue each n-gram back into one string. vapply() guarantees a character
# vector even when the text is too short to produce any n-gram (sapply()
# would return an empty list in that case).
terms <- vapply(bigram, function(e) paste(e, collapse = ""), character(1))
# Count the n-grams and keep those that occur at least twice.
tb <- table(terms)
tb[tb >= 2]
a
mixseg <- worker()
mixseg <= a
edit_dict()
建立詞頻矩陣
s1 <-'this is a book'
s2 <-'this is my car'
split_string<-strsplit(c(s1, s2), ' ')
split_string
[[1]]
[1] "this" "is" "a" "book"
[[2]]
[1] "this" "is" "my" "car"
corpus <- Corpus(VectorSource(split_string))
dtm<-DocumentTermMatrix(corpus)
inspect(dtm)
<<DocumentTermMatrix (documents: 2, terms: 3)>>
Non-/sparse entries: 4/2
Sparsity : 33%
Maximal term length: 4
Weighting : term frequency (tf)
Terms
Docs book car this
1 1 0 1
2 0 1 1
control.list<-list(wordLengths=c(1, 20))
dtm<-DocumentTermMatrix(corpus, control=control.list)
inspect(dtm)
<<DocumentTermMatrix (documents: 2, terms: 6)>>
Non-/sparse entries: 8/4
Sparsity : 33%
Maximal term length: 4
Weighting : term frequency (tf)
Terms
Docs a book car is my this
1 1 1 0 1 0 1
2 0 0 1 1 1 1
stopwords("english")
[1] "i" "me" "my" "myself"
[5] "we" "our" "ours" "ourselves"
[9] "you" "your" "yours" "yourself"
[13] "yourselves" "he" "him" "his"
[17] "himself" "she" "her" "hers"
[21] "herself" "it" "its" "itself"
[25] "they" "them" "their" "theirs"
[29] "themselves" "what" "which" "who"
[33] "whom" "this" "that" "these"
[37] "those" "am" "is" "are"
[41] "was" "were" "be" "been"
[45] "being" "have" "has" "had"
[49] "having" "do" "does" "did"
[53] "doing" "would" "should" "could"
[57] "ought" "i'm" "you're" "he's"
[61] "she's" "it's" "we're" "they're"
[65] "i've" "you've" "we've" "they've"
[69] "i'd" "you'd" "he'd" "she'd"
[73] "we'd" "they'd" "i'll" "you'll"
[77] "he'll" "she'll" "we'll" "they'll"
[81] "isn't" "aren't" "wasn't" "weren't"
[85] "hasn't" "haven't" "hadn't" "doesn't"
[89] "don't" "didn't" "won't" "wouldn't"
[93] "shan't" "shouldn't" "can't" "cannot"
[97] "couldn't" "mustn't" "let's" "that's"
[101] "who's" "what's" "here's" "there's"
[105] "when's" "where's" "why's" "how's"
[109] "a" "an" "the" "and"
[113] "but" "if" "or" "because"
[117] "as" "until" "while" "of"
[121] "at" "by" "for" "with"
[125] "about" "against" "between" "into"
[129] "through" "during" "before" "after"
[133] "above" "below" "to" "from"
[137] "up" "down" "in" "out"
[141] "on" "off" "over" "under"
[145] "again" "further" "then" "once"
[149] "here" "there" "when" "where"
[153] "why" "how" "all" "any"
[157] "both" "each" "few" "more"
[161] "most" "other" "some" "such"
[165] "no" "nor" "not" "only"
[169] "own" "same" "so" "than"
[173] "too" "very"
str(pubmed_data)
'data.frame': 500 obs. of 3 variables:
$ year : num 2016 2016 2016 2016 2016 ...
$ Title : Factor w/ 500 levels "[Alternative Splicing Detection as a Biomarker for Cancer Diagnosis: A Novel Progressive Mechanism of Acute Lymphoblastic Leuke"| __truncated__,..: 90 50 491 120 183 373 474 465 271 79 ...
$ Abstract: chr "The composition and function of the central nervous system (CNS) is extremely complex. In addition to hundreds of subtypes of n"| __truncated__ "Neurodegenerative diseases have a variety of different genes contributing to their underlying pathology. Unfortunately, for man"| __truncated__ "An astonishing number of neurological diseases result from expansion of unstable repetitive sequences causing alterations in ke"| __truncated__ "Composite retrotransposons are widely distributed in the plant and animal kingdoms. Some of the most complex of these are found"| __truncated__ ...
# Make sure the abstracts are plain character strings.
pubmed_data$Abstract <- as.character(pubmed_data$Abstract)
#strsplit(pubmed_data$Abstract, ' ')
# One document per abstract, stored as a vector of space-separated words.
split_string <- strsplit(pubmed_data$Abstract, ' ')
corpus <- Corpus(VectorSource(split_string))
doc <- tm_map(corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
# removeWords() matches case-sensitively and the stop-word list is lower case,
# so lower-case the text first; otherwise capitalised stop words such as "The"
# survive and show up among the most frequent terms.
doc <- tm_map(doc, content_transformer(tolower))
# Drop stop words before stemming: stemming first would alter some of them so
# that they no longer match the stop-word list.
doc <- tm_map(doc, removeWords, stopwords("english"))
doc <- tm_map(doc, stemDocument)
# Extra words are removed after stemming so that inflected forms reduced to
# these stems are caught as well.
doc <- tm_map(doc, removeWords, c('the', 'use', 'also'))
dtm <- DocumentTermMatrix(doc)
?findFreqTerms
findFreqTerms(dtm, 200,Inf)
[1] "activ" "altern" "analysi" "associ"
[5] "cancer" "cell" "data" "develop"
[9] "differ" "diseas" "exon" "express"
[13] "factor" "function" "gene" "human"
[17] "identifi" "isoform" "level" "mechan"
[21] "mutat" "protein" "regul" "result"
[25] "rna" "role" "sequenc" "show"
[29] "splice" "studi" "the" "transcript"
[33] "variant"
findAssocs(dtm, 'exon', 0.2)
$exon
skip splice
0.47 0.39
inclus chose
0.34 0.33
duchenn exonsresults
0.33 0.33
humanconclusions richer
0.33 0.33
skipenhanc splicingtarget
0.33 0.33
tginduc chemic
0.33 0.29
insensit polypyrimidin
0.28 0.27
cassett mutat
0.26 0.26
pair rule
0.26 0.26
ball bypass
0.25 0.25
cart epitop
0.25 0.25
escap isoformsabstracttext
0.25 0.25
isoformssignificance preexist
0.25 0.25
pulldownsirna receptorarm
0.25 0.25
exonic lack
0.24 0.24
minigen synonym
0.24 0.24
tract bcell
0.24 0.23
compound fail
0.23 0.23
kill relaps
0.23 0.23
site antigen
0.23 0.22
combinator eci
0.22 0.22
featur halflif
0.22 0.22
leukemiaspecif powerlaw
0.22 0.22
prerequisit quotattractivenessquot
0.22 0.22
quotexonshubsquot runxrunxt
0.22 0.22
scalefre stochast
0.22 0.22
tposit unproduct
0.22 0.22
varianc ensu
0.22 0.21
hemizyg attempt
0.21 0.20
enhanc lymphoblast
0.20 0.20
dim(dtm)
[1] 500 7517
dtm.remove<-removeSparseTerms(dtm, 0.95)
dim(dtm.remove)
[1] 500 348
head(dtm.remove$dimnames$Terms)
[1] "aberr" "abund" "acid" "act" "activ" "addit"
# NOTE(review): this downloads and executes R code straight from GitHub at run
# time; consider vendoring a reviewed copy or pinning a commit hash.
# CNCorpus() and space_tokenizer used further down are not defined in the
# visible part of this file - presumably this script provides them; confirm.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
# Interactive jiebaR call - presumably opens the user dictionary for editing
# (TODO confirm); not suitable for unattended runs.
edit_dict()
# Three Chinese headlines used as a tiny demo corpus.
s <- '林明昇超跑基地曝光! 30輛頂級保時捷藏在這裡'
s1 <- '復興航空一路騙 董事長林明昇被譙翻'
s2 <- '熱愛超跑 林明昇被批敗家'

# Create a jiebaR segmentation worker and segment each headline with it.
mixseg <- worker()
s.vec <- segment(jiebar = mixseg, code = s)
s1.vec <- segment(jiebar = mixseg, code = s1)
s2.vec <- segment(jiebar = mixseg, code = s2)

# Bundle the segmented headlines into a corpus and strip the digits.
s.corpus <- CNCorpus(list(s.vec, s1.vec, s2.vec))
doc <- tm_map(s.corpus, removeNumbers)

# Keep terms of at least two characters and tokenize with space_tokenizer.
control.list <- list(
  wordLengths = c(2, Inf),
  tokenize = space_tokenizer
)
s.dtm <- DocumentTermMatrix(doc, control = control.list)
inspect(s.dtm)
tb[order(tb, decreasing = TRUE)][1:20]
of the and in
4480 4358 3583 2961
to a that is
1758 1575 1220 1004
with splicing for by
864 840 765 693
alternative are expression we
580 544 544 512
as The were gene
507 494 457 421
Word Frequency Matrix
wfm(pubmed_data$Abstract)
Error: could not find function "wfm"
使用詞頻矩陣的結果計算詞頻(續)
# removeWords() is case-sensitive and the stop-word list is lower case, so
# lower-case the text first; otherwise "The" is kept and later counted as "the".
doc <- tm_map(doc, content_transformer(tolower))
doc <- tm_map(doc, removeWords, stopwords("english"))
# Stem exactly once (the original applied stemDocument twice in a row), and
# only after stop-word removal so that stop words are matched unaltered.
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc)
# Total count of every term across all documents.
terms <- colSums(as.matrix(dtm))
# Ten most frequent terms; head() avoids NA padding when fewer than ten exist.
head(sort(terms, decreasing = TRUE), 10)
splice gene express cell
1356 903 860 745
altern protein the transcript
658 528 493 486
regul function
450 416
使用barplot呈現詞頻最高的字詞
term_sum<-colSums(as.matrix(dtm))
Warning message:
In file(con, "rb") :
cannot open file 'C:/Users/wilsonhsu/AppData/Local/RStudio-Desktop/notebooks/23299BFB-Demo20161125/1/s/ceylbzotjaoi1/temp': Permission denied
barplot(sort(term_sum, decreasing =TRUE)[1:10])

#dtm.remove$dimnames$Terms
assocation <- data.frame()
Warning messages:
1: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
2: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
# Build an edge list (source term -> associated term) from every term of
# dtm.remove, using findAssocs() with a lower limit of 0.2.
# The pieces are collected per term and bound together once, instead of
# growing the data frame with rbind() on every iteration.
assocation <- do.call(
  rbind,
  lapply(dtm.remove$dimnames$Terms, function(term) {
    targets <- names(findAssocs(dtm.remove, term, 0.2)[[1]])
    if (length(targets) == 0) {
      # This term has no association above the limit: contribute no rows.
      return(NULL)
    }
    data.frame(source = term, target = targets)
  })
)
# No term had any association: fall back to an empty data frame, which is what
# the row-by-row loop would have left behind.
if (is.null(assocation)) {
  assocation <- data.frame()
}
write.csv(x = assocation, file='association.csv')
getwd()
[1] "C:/Users/wilsonhsu/Desktop"
# Repeat the PubMed search and download for "gene expression".
# Each statement sits on its own line: fused onto a single line they are a
# syntax error ("unexpected symbol").
res2 <- EUtilsSummary(
  "gene expression",
  type = "esearch",
  db = "pubmed",
  datetype = "pdat",
  mindate = 2012,
  maxdate = 2015,
  retmax = 500
)
records2 <- EUtilsGet(res2)
# One row per article: publication year, title and abstract text.
gene_expression <- data.frame(
  "year" = YearPubmed(records2),
  "Title" = ArticleTitle(records2),
  "Abstract" = AbstractText(records2)
)
clean_corpus<-function(corpus){
corpus <-tm_map(corpus, removePunctuation)corpus <-tm_map(corpus, stripWhitespace)
Error: unexpected symbol in:
"clean_corpus<-function(corpus){
corpus <-tm_map(corpus, removePunctuation)corpus"


# Pyramid plot of the x and y columns of top25_df, labelled with its `labels`
# column. top25_df is not defined in the visible part of this file.
pyramid.plot(
  top25_df$x,
  top25_df$y,
  labels = top25_df$labels,
  main = "Words in Common",
  gap = 150,
  space = 0.2,
  raxlab = NULL,
  unit = NULL,
  top.labels = c("Alternative Splicing", "Words", "Gene Expression")
)
[1] 5.1 4.1 2.1 2.1

