Exeriments with CDMV

loading it

loadFromFiles=F
if (loadFromFiles) system.time({
            concept     <-read.delim('inst/extdata/concept.csv',as.is=T,quote = "")
            vocabulary  <-read.delim('inst/extdata/vocabulary.csv',as.is=T,quote = "")
            cRelationship <-read.delim('inst/extdata/concept_relationship.csv',as.is=T,quote = "")
            relationship<-read.delim('inst/extdata/relationship.csv',as.is=T,quote = "")
            cAncestor   <-read.delim('inst/extdata/concept_ancestor.csv',as.is=T,quote = "")

  })

#186s (3min)

#save it for later
  fName<-'inst/extdata/cdmv-Nov2015.RData'
  #save.image(file=fName)
  if (!loadFromFiles) load(file=fName)


#library(readr)
#rm(concept)
  #system.time({  concept     <- read_tsv('inst/extdata/concept.csv',n_max = 10)  })
  



  library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

What version it is?

  print(filter(vocabulary,VOCABULARY_ID=='None'))
##   VOCABULARY_ID                VOCABULARY_NAME VOCABULARY_REFERENCE
## 1          None OMOP Standardized Vocabularies       OMOP generated
##   VOCABULARY_VERSION VOCABULARY_CONCEPT_ID
## 1     v5.0 10-NOV-15              44819096
  #printing just one column
  print(filter(vocabulary,VOCABULARY_ID=='None')$VOCABULARY_VERSION)
## [1] "v5.0 10-NOV-15"
  stats<- concept %>% group_by(VOCABULARY_ID) %>% summarise(count= n()) %>% arrange(-count)
  #install.packages("magrittr")
  #u2 %>% group_by(NIH_Login) %>% summarise(count = n()) %>% arrange(-count)
  #library(magrittr)
  write.csv(stats,file="stats.csv",row.names=F,quote=F,na='')
  #print to terminal
  library(pander);panderOptions('table.split.table', Inf);options(knitr.table.format = 'pandoc');panderOptions("table.alignment.default", "left")
  pander(stats)
VOCABULARY_ID count
SNOMED 734950
NDC 676286
RxNorm 230765
SPL 168506
LOINC 115189
Read 108259
MedDRA 98138
VA Product 53545
CIEL 50881
NDFRT 36734
Genseqno 22033
ICD9CM 18672
Gemscript 14404
CPT4 14202
ICD10 12855
MeSH 11092
OPCS4 10892
Multum 9770
OXMIS 8118
HCPCS 7584
ATC 5964
ICD9Proc 4657
DRG 1362
APC 1358
UCUM 971
NUCC 829
Revenue Code 538
VA Class 486
Relationship 384
SMQ 324
Concept Class 255
Currency 180
HES Specialty 148
Specialty 111
Condition Type 99
Procedure Type 93
ABMS 89
PCORNet 81
Cohort 78
Vocabulary 64
Domain 54
Race 53
Place of Service 49
MDC 26
Drug Type 13
Observation Type 12
Note Type 10
Death Type 9
Obs Period Type 6
Gender 5
Meas Type 5
Visit 4
Device Type 3
Visit Type 3
Ethnicity 2
None 1
  #get to clipboard for pasting to excel
  write.table(stats,file="clipboard", sep="\t",row.names=F) #paste to excel
  
  #nicer print to a report 
  library(pander);panderOptions('table.split.table', Inf)
  options(knitr.table.format = 'pandoc');panderOptions("table.alignment.default", "left")
  pander(stats)
VOCABULARY_ID count
SNOMED 734950
NDC 676286
RxNorm 230765
SPL 168506
LOINC 115189
Read 108259
MedDRA 98138
VA Product 53545
CIEL 50881
NDFRT 36734
Genseqno 22033
ICD9CM 18672
Gemscript 14404
CPT4 14202
ICD10 12855
MeSH 11092
OPCS4 10892
Multum 9770
OXMIS 8118
HCPCS 7584
ATC 5964
ICD9Proc 4657
DRG 1362
APC 1358
UCUM 971
NUCC 829
Revenue Code 538
VA Class 486
Relationship 384
SMQ 324
Concept Class 255
Currency 180
HES Specialty 148
Specialty 111
Condition Type 99
Procedure Type 93
ABMS 89
PCORNet 81
Cohort 78
Vocabulary 64
Domain 54
Race 53
Place of Service 49
MDC 26
Drug Type 13
Observation Type 12
Note Type 10
Death Type 9
Obs Period Type 6
Gender 5
Meas Type 5
Visit 4
Device Type 3
Visit Type 3
Ethnicity 2
None 1
  #is CPT there? not by d
  library(stringr)
  str_detect(stats$VOCABULARY_ID,'C')
##  [1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [12]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [23] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [34] FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE
  str_detect(tolower(stats$VOCABULARY_ID),'cpt')
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE
  #relationships
  
  
  library(tidyr)
  glimpse(cRelationship)
## Observations: 14,601,794
## Variables: 6
## $ CONCEPT_ID_1     (int) 19089168, 19126485, 40231922, 40102010, 42800...
## $ CONCEPT_ID_2     (int) 19124279, 501419, 1128998, 40155459, 19133116...
## $ RELATIONSHIP_ID  (chr) "Concept replaces", "Concept replaces", "Conc...
## $ VALID_START_DATE (int) 20121001, 20121001, 20121001, 20121001, 20121...
## $ VALID_END_DATE   (int) 20991231, 20991231, 20991231, 20991231, 20991...
## $ INVALID_REASON   (chr) "", "", "", "", "", "", "", "D", "D", "", "",...
  rel_stats<- cRelationship  %>% group_by(RELATIONSHIP_ID) %>% summarise(count= n()) %>% arrange(-count) %>% left_join(relationship) %>% separate(col=RELATIONSHIP_NAME,into=c('rel','source'),sep='\\(',remove=F)
## Joining by: "RELATIONSHIP_ID"
## Warning: Too few values at 6 locations: 3, 4, 25, 26, 175, 176
  rel_stats
## Source: local data frame [300 x 9]
## 
##     RELATIONSHIP_ID   count                   RELATIONSHIP_NAME
##               (chr)   (int)                               (chr)
## 1       Mapped from 1837212 Standard to Non-standard map (OMOP)
## 2           Maps to 1837212 Non-standard to Standard map (OMOP)
## 3              Is a 1475948                                Is a
## 4          Subsumes 1475948                            Subsumes
## 5      RxNorm - SPL 1015683                 RxNorm to SPL (NLM)
## 6   Finding site of  212347            Finding site of (SNOMED)
## 7  Has finding site  212347           Has finding site (SNOMED)
## 8       Consists of  179961                Consists of (RxNorm)
## 9       Constitutes  179961                Constitutes (RxNorm)
## 10   RxNorm has ing  168262             Has ingredient (RxNorm)
## ..              ...     ...                                 ...
## Variables not shown: rel (chr), source (chr), IS_HIERARCHICAL (int),
##   DEFINES_ANCESTRY (int), REVERSE_RELATIONSHIP_ID (chr),
##   RELATIONSHIP_CONCEPT_ID (int)
  #rel_stats2 <- rel_stats %>% separate(col=RELATIONSHIP_NAME,into=c('rel','source'),sep='\\(',remove=F)
  #rm(rel_stats2)
  rel2<-rel_stats %>% group_by(source) %>% summarize(distinct_rels = n(),row_cnt=sum(count))
  write.csv(rel2,file="statsTWO.csv",row.names=F,quote=F,na='')
  pander(rel2)
source distinct_rels row_cnt
CMS) 6 4388
DM+D) 8 137636
LOINC) 6 149124
MedDRA) 2 33278
MSSO) 2 33588
NDF-RT) 48 914734
NDF) 6 195188
NHS) 2 21026
NLM) 2 1169684
OMOP) 32 4011590
RxNorm) 34 1699510
SNOMED) 146 3084502
NA 6 3147546
  rm(rel_stats)
  rm(rel2)
  rm(stats)

end