wordCount.R

wada_kazuya — Sep 5, 2013, 8:09 PM

# wordCount
# Settings: environment variables naming the Hadoop command and the
# Hadoop streaming jar.
Sys.setenv(
    HADOOP_CMD = "/usr/bin/hadoop",
    HADOOP_STREAMING = "/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.3.0.jar"
)

#library
library(rmr2)

Loading required package: Rcpp Loading required package: RJSONIO Loading
required package: bitops Loading required package: digest Loading required
package: functional Loading required package: stringr Loading required
package: plyr Loading required package: reshape2

library(sqldf)

Loading required package: DBI Loading required package: gsubfn Loading
required package: proto Loading required namespace: tcltk

Warning: couldn't connect to display ":0"

Loading required package: chron Loading required package: RSQLite Loading
required package: RSQLite.extfuns

source("pubmed.R")

# Display the function: evaluating the bare name prints its definition
get.pmid

# Look up PubMed IDs for a search term through the NCBI E-utilities
# esearch endpoint.
#
# term: query string appended verbatim to the esearch URL. It is not
#       URL-encoded here, so the caller supplies it in URL form.
# Returns an integer vector with one ID per entry of
# eSearchResult/IdList, or integer(0) when that list has no entries.
function (term = "wada+kazuya[author]") 
{
    url.str <- paste0("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=", 
        term)
    xml1 <- xmlTreeParse(getURL(url.str))
    pmids.list <- xml1[["doc"]][["eSearchResult"]][["IdList"]]
    # seq_len() instead of 1:length(): for an empty IdList, 1:0 expands to
    # c(1, 0) and the loop body would index entries that do not exist.
    vapply(seq_len(length(pmids.list)), function(i) {
        as.integer(xmlValue(pmids.list[[i]]))
    }, integer(1))
}

get.pmsummary

# Fetch article abstracts for a set of PubMed IDs through the NCBI
# E-utilities efetch endpoint.
#
# pmids: vector of PubMed IDs; they are joined with "," into the request URL.
# Returns a character vector with one element per child of
# PubmedArticleSet, holding the text of that record's first
# Article/Abstract/AbstractText node. Returns character(0) when `pmids`
# is empty or the response holds no records.
function (pmids = c(21799770, 21416533)) 
{
    # Nothing to look up: skip the request instead of sending an empty id list.
    if (length(pmids) == 0) {
        return(character(0))
    }
    url.str <- paste0("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=", 
        paste(pmids, collapse = ","), "&retmode=xml")
    xml1 <- xmlTreeParse(getURL(url.str))
    pm.summaries <- xml1[["doc"]][["PubmedArticleSet"]]
    # seq_len() instead of 1:length(): for an empty article set, 1:0 expands
    # to c(1, 0) and the loop body would index records that do not exist.
    vapply(seq_len(length(pm.summaries)), function(i) {
        xmlValue(pm.summaries[[i]][[1]][["Article"]][["Abstract"]][["AbstractText"]])
    }, character(1))
}


# PubMed: look up the IDs for the author query, then fetch their abstracts
pubmed.summary <- get.pmsummary(
    pmids = get.pmid(term = "wada+kazuya[author]")
)

# Print the retrieved abstracts
print(x = pubmed.summary)

[1] "Interstitial lung disease (ILD) events have been reported in Japanese non-small-cell lung cancer (NSCLC) patients receiving EGFR tyrosine kinase inhibitors. We investigated proteomic biomarkers for mechanistic insights and improved prediction of ILD. Blood plasma was collected from 43 gefitinib-treated NSCLC patients developing acute ILD (confirmed by blinded diagnostic review) and 123 randomly selected controls in a nested case-control study within a pharmacoepidemiological cohort study in Japan. We generated ∼7 million tandem mass spectrometry (MS/MS) measurements with extensive quality control and validation, producing one of the largest proteomic lung cancer datasets to date, incorporating rigorous study design, phenotype definition, and evaluation of sample processing. After alignment, scaling, and measurement batch adjustment, we identified 41 peptide peaks representing 29 proteins best predicting ILD. Multivariate peptide, protein, and pathway modeling achieved ILD prediction comparable to previously identified clinical variables; combining the two provided some improvement. The acute phase response pathway was strongly represented (17 of 29 proteins, p = 1.0×10(-25)), suggesting a key role with potential utility as a marker for increased risk of acute ILD events. Validation by Western blotting showed correlation for identified proteins, confirming that robust results can be generated from an MS/MS platform implementing strict quality control."                                                                                                                                                                                                                   
[2] "We developed a novel software named i-RUBY (identification-Related qUantification-Based strategY algorithm for liquid chromatography/tandem mass spectrometry (LC/MS/MS) data) that enables us to perform fully automatic ion current-based spectral feature analysis of highly accurate data obtained by LC/MS/MS. At the 1st step, this software utilizes accurate peptide/protein identification information for peak detection and peak matching among measurements. Then, at the 2nd step, it picks yet unidentified peaks and matches them to the peaks identified at the 1st step by a linear interpolation algorithm. The analysis of human plasma externally spiked with a known amount of yeast alcohol dehydrogenase 1 showed a good linear relationship between the amount of protein spiked and the quantitative values obtained by i-RUBY analysis. Experiment using human plasma digests spiked with a mixture of known amounts of synthetic peptides derived from two yeast proteins, alcohol dehydrogenase 1 and glucose-6-phospate isomerase, showed the expansion by the 2nd step of i-RUBY of the lower quantification limits to 1/10 to 1/1000 of those reached only by identified peaks at the 1st step. Good correlations between the i-RUBY results and the amount of proteins were confirmed by the analysis of real samples, i.e., sera of normal subjects and cancer patients, by comparing quantitative values of acute-phase proteins obtained by i-RUBY analysis of LC/MS/MS data with those obtained by an immunological method using Bio-Plex. These results taken together show that i-RUBY is a useful tool for obtaining dependable quantitative information from highly accurate shotgun-proteomics LC/MS/MS data."


# File upload: write the abstracts to the DFS.
# The vector is wrapped in keyval() with a NULL key explicitly; passing the
# bare vector makes to.dfs() perform this conversion itself and warn about it.
pm.hdfs <- to.dfs(keyval(NULL, pubmed.summary))

Warning: Converting to.dfs argument to keyval with a NULL key


# wordcount function, run on Hadoop
#
# Counts how often each token occurs in `input` using an rmr2 map-reduce job.
#
# input:   passed to mapreduce() as its input (read with the "native" format)
# output:  passed to mapreduce() as its output; NULL by default
# pattern: regular expression given to strsplit() to break each value
#          into tokens
# Returns the value of the mapreduce() call.
wordcount <- function(input, output = NULL, pattern = " "){
    # Map: split the values into tokens and emit each token with a count of 1.
    wc.map <- function(k, v) {
        keyval(unlist(strsplit(x = v, split = pattern)), 1)
    }
    # Reduce: add up the counts collected for a word.
    wc.reduce <- function(word, counts) {
        keyval(word, sum(counts))
    }
    # combine is spelled TRUE rather than T, because T is an ordinary
    # variable that can be reassigned.
    # NOTE(review): rmr2 reports backend.parameters as deprecated; it is kept
    # so the job still requests 10 reduce tasks.
    mapreduce(input = input, output = output, input.format = "native",
      map = wc.map, reduce = wc.reduce, combine = TRUE,
      backend.parameters = list(hadoop = list(D = 'mapred.reduce.tasks=10')))
}

# Run wordcount (the result is saved on HDFS and its path is obtained)
wc.dfs <- wordcount(
    input = pm.hdfs,
    pattern = "( |\n)"
)

Warning: backend.parameters is deprecated.


# Retrieve the wordcount result as a data frame
wc_pubmed <- as.data.frame(x = from.dfs(wc.dfs))

# Show the top 20 rows, ordered by count (descending)
head(
    x = sqldf("select * from wc_pubmed order by val desc"),
    n = 20
)

Loading required package: tcltk

          key val
1          of  19
2         the  15
3         and  12
4          by  11
5           a  10
6         for   6
7      i-RUBY   6
8          to   6
9  identified   5
10       with   5
11       from   4
12      peaks   4
13   analysis   4
14   obtained   4
15         We   3
16     amount   3
17     cancer   3
18     showed   3
19     spiked   3
20   accurate   3