loading it
loadFromFiles=F
if (loadFromFiles) system.time({
concept <-read.delim('inst/extdata/concept.csv',as.is=T,quote = "")
vocabulary <-read.delim('inst/extdata/vocabulary.csv',as.is=T,quote = "")
cRelationship <-read.delim('inst/extdata/concept_relationship.csv',as.is=T,quote = "")
relationship<-read.delim('inst/extdata/relationship.csv',as.is=T,quote = "")
cAncestor <-read.delim('inst/extdata/concept_ancestor.csv',as.is=T,quote = "")
})
#186s (3min)
#save it for later
fName<-'inst/extdata/cdmv-Nov2015.RData'
#save.image(file=fName)
if (!loadFromFiles) load(file=fName)
#library(readr)
#rm(concept)
#system.time({ concept <- read_tsv('inst/extdata/concept.csv',n_max = 10) })
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
What version it is?
print(filter(vocabulary,VOCABULARY_ID=='None'))
## VOCABULARY_ID VOCABULARY_NAME VOCABULARY_REFERENCE
## 1 None OMOP Standardized Vocabularies OMOP generated
## VOCABULARY_VERSION VOCABULARY_CONCEPT_ID
## 1 v5.0 10-NOV-15 44819096
#printing just one column
print(filter(vocabulary,VOCABULARY_ID=='None')$VOCABULARY_VERSION)
## [1] "v5.0 10-NOV-15"
stats<- concept %>% group_by(VOCABULARY_ID) %>% summarise(count= n()) %>% arrange(-count)
#install.packages("magrittr")
#u2 %>% group_by(NIH_Login) %>% summarise(count = n()) %>% arrange(-count)
#library(magrittr)
write.csv(stats,file="stats.csv",row.names=F,quote=F,na='')
#print to terminal
library(pander);panderOptions('table.split.table', Inf);options(knitr.table.format = 'pandoc');panderOptions("table.alignment.default", "left")
pander(stats)
| SNOMED |
734950 |
| NDC |
676286 |
| RxNorm |
230765 |
| SPL |
168506 |
| LOINC |
115189 |
| Read |
108259 |
| MedDRA |
98138 |
| VA Product |
53545 |
| CIEL |
50881 |
| NDFRT |
36734 |
| Genseqno |
22033 |
| ICD9CM |
18672 |
| Gemscript |
14404 |
| CPT4 |
14202 |
| ICD10 |
12855 |
| MeSH |
11092 |
| OPCS4 |
10892 |
| Multum |
9770 |
| OXMIS |
8118 |
| HCPCS |
7584 |
| ATC |
5964 |
| ICD9Proc |
4657 |
| DRG |
1362 |
| APC |
1358 |
| UCUM |
971 |
| NUCC |
829 |
| Revenue Code |
538 |
| VA Class |
486 |
| Relationship |
384 |
| SMQ |
324 |
| Concept Class |
255 |
| Currency |
180 |
| HES Specialty |
148 |
| Specialty |
111 |
| Condition Type |
99 |
| Procedure Type |
93 |
| ABMS |
89 |
| PCORNet |
81 |
| Cohort |
78 |
| Vocabulary |
64 |
| Domain |
54 |
| Race |
53 |
| Place of Service |
49 |
| MDC |
26 |
| Drug Type |
13 |
| Observation Type |
12 |
| Note Type |
10 |
| Death Type |
9 |
| Obs Period Type |
6 |
| Gender |
5 |
| Meas Type |
5 |
| Visit |
4 |
| Device Type |
3 |
| Visit Type |
3 |
| Ethnicity |
2 |
| None |
1 |
#get to clipboard for pasting to excel
write.table(stats,file="clipboard", sep="\t",row.names=F) #paste to excel
#nicer print to a report
library(pander);panderOptions('table.split.table', Inf)
options(knitr.table.format = 'pandoc');panderOptions("table.alignment.default", "left")
pander(stats)
| SNOMED |
734950 |
| NDC |
676286 |
| RxNorm |
230765 |
| SPL |
168506 |
| LOINC |
115189 |
| Read |
108259 |
| MedDRA |
98138 |
| VA Product |
53545 |
| CIEL |
50881 |
| NDFRT |
36734 |
| Genseqno |
22033 |
| ICD9CM |
18672 |
| Gemscript |
14404 |
| CPT4 |
14202 |
| ICD10 |
12855 |
| MeSH |
11092 |
| OPCS4 |
10892 |
| Multum |
9770 |
| OXMIS |
8118 |
| HCPCS |
7584 |
| ATC |
5964 |
| ICD9Proc |
4657 |
| DRG |
1362 |
| APC |
1358 |
| UCUM |
971 |
| NUCC |
829 |
| Revenue Code |
538 |
| VA Class |
486 |
| Relationship |
384 |
| SMQ |
324 |
| Concept Class |
255 |
| Currency |
180 |
| HES Specialty |
148 |
| Specialty |
111 |
| Condition Type |
99 |
| Procedure Type |
93 |
| ABMS |
89 |
| PCORNet |
81 |
| Cohort |
78 |
| Vocabulary |
64 |
| Domain |
54 |
| Race |
53 |
| Place of Service |
49 |
| MDC |
26 |
| Drug Type |
13 |
| Observation Type |
12 |
| Note Type |
10 |
| Death Type |
9 |
| Obs Period Type |
6 |
| Gender |
5 |
| Meas Type |
5 |
| Visit |
4 |
| Device Type |
3 |
| Visit Type |
3 |
| Ethnicity |
2 |
| None |
1 |
#is CPT there? not by d
library(stringr)
str_detect(stats$VOCABULARY_ID,'C')
## [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [12] TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [23] FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [34] FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE
str_detect(tolower(stats$VOCABULARY_ID),'cpt')
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE
#relationships
library(tidyr)
glimpse(cRelationship)
## Observations: 14,601,794
## Variables: 6
## $ CONCEPT_ID_1 (int) 19089168, 19126485, 40231922, 40102010, 42800...
## $ CONCEPT_ID_2 (int) 19124279, 501419, 1128998, 40155459, 19133116...
## $ RELATIONSHIP_ID (chr) "Concept replaces", "Concept replaces", "Conc...
## $ VALID_START_DATE (int) 20121001, 20121001, 20121001, 20121001, 20121...
## $ VALID_END_DATE (int) 20991231, 20991231, 20991231, 20991231, 20991...
## $ INVALID_REASON (chr) "", "", "", "", "", "", "", "D", "D", "", "",...
rel_stats<- cRelationship %>% group_by(RELATIONSHIP_ID) %>% summarise(count= n()) %>% arrange(-count) %>% left_join(relationship) %>% separate(col=RELATIONSHIP_NAME,into=c('rel','source'),sep='\\(',remove=F)
## Joining by: "RELATIONSHIP_ID"
## Warning: Too few values at 6 locations: 3, 4, 25, 26, 175, 176
rel_stats
## Source: local data frame [300 x 9]
##
## RELATIONSHIP_ID count RELATIONSHIP_NAME
## (chr) (int) (chr)
## 1 Mapped from 1837212 Standard to Non-standard map (OMOP)
## 2 Maps to 1837212 Non-standard to Standard map (OMOP)
## 3 Is a 1475948 Is a
## 4 Subsumes 1475948 Subsumes
## 5 RxNorm - SPL 1015683 RxNorm to SPL (NLM)
## 6 Finding site of 212347 Finding site of (SNOMED)
## 7 Has finding site 212347 Has finding site (SNOMED)
## 8 Consists of 179961 Consists of (RxNorm)
## 9 Constitutes 179961 Constitutes (RxNorm)
## 10 RxNorm has ing 168262 Has ingredient (RxNorm)
## .. ... ... ...
## Variables not shown: rel (chr), source (chr), IS_HIERARCHICAL (int),
## DEFINES_ANCESTRY (int), REVERSE_RELATIONSHIP_ID (chr),
## RELATIONSHIP_CONCEPT_ID (int)
#rel_stats2 <- rel_stats %>% separate(col=RELATIONSHIP_NAME,into=c('rel','source'),sep='\\(',remove=F)
#rm(rel_stats2)
rel2<-rel_stats %>% group_by(source) %>% summarize(distinct_rels = n(),row_cnt=sum(count))
write.csv(rel2,file="statsTWO.csv",row.names=F,quote=F,na='')
pander(rel2)
| CMS) |
6 |
4388 |
| DM+D) |
8 |
137636 |
| LOINC) |
6 |
149124 |
| MedDRA) |
2 |
33278 |
| MSSO) |
2 |
33588 |
| NDF-RT) |
48 |
914734 |
| NDF) |
6 |
195188 |
| NHS) |
2 |
21026 |
| NLM) |
2 |
1169684 |
| OMOP) |
32 |
4011590 |
| RxNorm) |
34 |
1699510 |
| SNOMED) |
146 |
3084502 |
| NA |
6 |
3147546 |
rm(rel_stats)
rm(rel2)
rm(stats)