loading it

loadFromFiles=F
if (loadFromFiles) system.time({
            concept     <-read.delim('inst/extdata/concept.csv',as.is=T,quote = "")
            vocabulary  <-read.delim('inst/extdata/vocabulary.csv',as.is=T,quote = "")
            cRelationship <-read.delim('inst/extdata/concept_relationship.csv',as.is=T,quote = "")
            relationship<-read.delim('inst/extdata/relationship.csv',as.is=T,quote = "")
            cAncestor   <-read.delim('inst/extdata/concept_ancestor.csv',as.is=T,quote = "")

  })

#186s (3min)

#save it for later
  fName<-'inst/extdata/cdmv-Nov2015.RData'
  #save.image(file=fName)
  if (!loadFromFiles) load(file=fName)


#library(readr)
#rm(concept)
  #system.time({  concept     <- read_tsv('inst/extdata/concept.csv',n_max = 10)  })
  



  library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

What version it is?

  print(filter(vocabulary,VOCABULARY_ID=='None'))

##   VOCABULARY_ID                VOCABULARY_NAME VOCABULARY_REFERENCE
## 1          None OMOP Standardized Vocabularies       OMOP generated
##   VOCABULARY_VERSION VOCABULARY_CONCEPT_ID
## 1     v5.0 10-NOV-15              44819096

  #printing just one column
  print(filter(vocabulary,VOCABULARY_ID=='None')$VOCABULARY_VERSION)

## [1] "v5.0 10-NOV-15"

  stats<- concept %>% group_by(VOCABULARY_ID) %>% summarise(count= n()) %>% arrange(-count)
  #install.packages("magrittr")
  #u2 %>% group_by(NIH_Login) %>% summarise(count = n()) %>% arrange(-count)
  #library(magrittr)
  write.csv(stats,file="stats.csv",row.names=F,quote=F,na='')
  #print to terminal
  library(pander);panderOptions('table.split.table', Inf);options(knitr.table.format = 'pandoc');panderOptions("table.alignment.default", "left")
  pander(stats)

VOCABULARY_ID	count
SNOMED	734950
NDC	676286
RxNorm	230765
SPL	168506
LOINC	115189
Read	108259
MedDRA	98138
VA Product	53545
CIEL	50881
NDFRT	36734
Genseqno	22033
ICD9CM	18672
Gemscript	14404
CPT4	14202
ICD10	12855
MeSH	11092
OPCS4	10892
Multum	9770
OXMIS	8118
HCPCS	7584
ATC	5964
ICD9Proc	4657
DRG	1362
APC	1358
UCUM	971
NUCC	829
Revenue Code	538
VA Class	486
Relationship	384
SMQ	324
Concept Class	255
Currency	180
HES Specialty	148
Specialty	111
Condition Type	99
Procedure Type	93
ABMS	89
PCORNet	81
Cohort	78
Vocabulary	64
Domain	54
Race	53
Place of Service	49
MDC	26
Drug Type	13
Observation Type	12
Note Type	10
Death Type	9
Obs Period Type	6
Gender	5
Meas Type	5
Visit	4
Device Type	3
Visit Type	3
Ethnicity	2
None	1

  #get to clipboard for pasting to excel
  write.table(stats,file="clipboard", sep="\t",row.names=F) #paste to excel
  
  #nicer print to a report 
  library(pander);panderOptions('table.split.table', Inf)
  options(knitr.table.format = 'pandoc');panderOptions("table.alignment.default", "left")
  pander(stats)

VOCABULARY_ID	count
SNOMED	734950
NDC	676286
RxNorm	230765
SPL	168506
LOINC	115189
Read	108259
MedDRA	98138
VA Product	53545
CIEL	50881
NDFRT	36734
Genseqno	22033
ICD9CM	18672
Gemscript	14404
CPT4	14202
ICD10	12855
MeSH	11092
OPCS4	10892
Multum	9770
OXMIS	8118
HCPCS	7584
ATC	5964
ICD9Proc	4657
DRG	1362
APC	1358
UCUM	971
NUCC	829
Revenue Code	538
VA Class	486
Relationship	384
SMQ	324
Concept Class	255
Currency	180
HES Specialty	148
Specialty	111
Condition Type	99
Procedure Type	93
ABMS	89
PCORNet	81
Cohort	78
Vocabulary	64
Domain	54
Race	53
Place of Service	49
MDC	26
Drug Type	13
Observation Type	12
Note Type	10
Death Type	9
Obs Period Type	6
Gender	5
Meas Type	5
Visit	4
Device Type	3
Visit Type	3
Ethnicity	2
None	1

  #is CPT there? not by d
  library(stringr)
  str_detect(stats$VOCABULARY_ID,'C')

##  [1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [12]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [23] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [34] FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE

  str_detect(tolower(stats$VOCABULARY_ID),'cpt')

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE

  #relationships
  
  
  library(tidyr)
  glimpse(cRelationship)

## Observations: 14,601,794
## Variables: 6
## $ CONCEPT_ID_1     (int) 19089168, 19126485, 40231922, 40102010, 42800...
## $ CONCEPT_ID_2     (int) 19124279, 501419, 1128998, 40155459, 19133116...
## $ RELATIONSHIP_ID  (chr) "Concept replaces", "Concept replaces", "Conc...
## $ VALID_START_DATE (int) 20121001, 20121001, 20121001, 20121001, 20121...
## $ VALID_END_DATE   (int) 20991231, 20991231, 20991231, 20991231, 20991...
## $ INVALID_REASON   (chr) "", "", "", "", "", "", "", "D", "D", "", "",...

  rel_stats<- cRelationship  %>% group_by(RELATIONSHIP_ID) %>% summarise(count= n()) %>% arrange(-count) %>% left_join(relationship) %>% separate(col=RELATIONSHIP_NAME,into=c('rel','source'),sep='\\(',remove=F)

## Joining by: "RELATIONSHIP_ID"

## Warning: Too few values at 6 locations: 3, 4, 25, 26, 175, 176

  rel_stats

## Source: local data frame [300 x 9]
## 
##     RELATIONSHIP_ID   count                   RELATIONSHIP_NAME
##               (chr)   (int)                               (chr)
## 1       Mapped from 1837212 Standard to Non-standard map (OMOP)
## 2           Maps to 1837212 Non-standard to Standard map (OMOP)
## 3              Is a 1475948                                Is a
## 4          Subsumes 1475948                            Subsumes
## 5      RxNorm - SPL 1015683                 RxNorm to SPL (NLM)
## 6   Finding site of  212347            Finding site of (SNOMED)
## 7  Has finding site  212347           Has finding site (SNOMED)
## 8       Consists of  179961                Consists of (RxNorm)
## 9       Constitutes  179961                Constitutes (RxNorm)
## 10   RxNorm has ing  168262             Has ingredient (RxNorm)
## ..              ...     ...                                 ...
## Variables not shown: rel (chr), source (chr), IS_HIERARCHICAL (int),
##   DEFINES_ANCESTRY (int), REVERSE_RELATIONSHIP_ID (chr),
##   RELATIONSHIP_CONCEPT_ID (int)

  #rel_stats2 <- rel_stats %>% separate(col=RELATIONSHIP_NAME,into=c('rel','source'),sep='\\(',remove=F)
  #rm(rel_stats2)
  rel2<-rel_stats %>% group_by(source) %>% summarize(distinct_rels = n(),row_cnt=sum(count))
  write.csv(rel2,file="statsTWO.csv",row.names=F,quote=F,na='')
  pander(rel2)

source	distinct_rels	row_cnt
CMS)	6	4388
DM+D)	8	137636
LOINC)	6	149124
MedDRA)	2	33278
MSSO)	2	33588
NDF-RT)	48	914734
NDF)	6	195188
NHS)	2	21026
NLM)	2	1169684
OMOP)	32	4011590
RxNorm)	34	1699510
SNOMED)	146	3084502
NA	6	3147546

  rm(rel_stats)
  rm(rel2)
  rm(stats)

cdmvA-01

Vojtech Huser

November 2, 2015

loading it

end