This file
folder='k:/w/loinc'
folder=getwd()
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 0.8.5
## v tidyr 1.0.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
library(DT)
a<-fs::path(folder,'l','LoincAnswerListLink.csv')
l<-read_csv(a)
## Parsed with column specification:
## cols(
## LoincNumber = col_character(),
## LongCommonName = col_character(),
## AnswerListId = col_character(),
## AnswerListName = col_character(),
## AnswerListLinkType = col_character(),
## ApplicableContext = col_character()
## )
b<-fs::path(folder,'l','AnswerList.csv')
al<-read_csv(b)
## Parsed with column specification:
## cols(
## AnswerListId = col_character(),
## AnswerListName = col_character(),
## AnswerListOID = col_character(),
## ExtDefinedYN = col_character(),
## ExtDefinedAnswerListCodeSystem = col_logical(),
## ExtDefinedAnswerListLink = col_logical(),
## AnswerStringId = col_character(),
## LocalAnswerCode = col_character(),
## LocalAnswerCodeSystem = col_logical(),
## SequenceNumber = col_double(),
## DisplayText = col_character(),
## ExtCodeId = col_double(),
## ExtCodeDisplayName = col_character(),
## ExtCodeSystem = col_character(),
## ExtCodeSystemVersion = col_character(),
## ExtCodeSystemCopyrightNotice = col_character(),
## SubsequentTextPrompt = col_logical(),
## Description = col_character(),
## Score = col_double()
## )
## Warning: 497 parsing failures.
## row col expected actual file
## 3110 SubsequentTextPrompt 1/0/T/F/TRUE/FALSE # of months: 'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5659 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE http://www.cdc.gov/nchs/icd.htm 'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5660 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE http://www.cdc.gov/nchs/icd.htm 'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5661 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE http://www.nlm.nih.gov/research/umls/rxnorm/ 'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5864 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE https://www.cms.gov/manuals/downloads/clm104c15.pdf 'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## .... ........................ .................. ................................................... ..................................................
## See problems(...) for more details.
#install.packages("glue")
names(l)
## [1] "LoincNumber" "LongCommonName" "AnswerListId"
## [4] "AnswerListName" "AnswerListLinkType" "ApplicableContext"
names(al)
## [1] "AnswerListId" "AnswerListName"
## [3] "AnswerListOID" "ExtDefinedYN"
## [5] "ExtDefinedAnswerListCodeSystem" "ExtDefinedAnswerListLink"
## [7] "AnswerStringId" "LocalAnswerCode"
## [9] "LocalAnswerCodeSystem" "SequenceNumber"
## [11] "DisplayText" "ExtCodeId"
## [13] "ExtCodeDisplayName" "ExtCodeSystem"
## [15] "ExtCodeSystemVersion" "ExtCodeSystemCopyrightNotice"
## [17] "SubsequentTextPrompt" "Description"
## [19] "Score"
c<-b<-fs::path(folder,'l','Loinc.csv')
#all problems are related to all chars as type
codes<-read_csv(c,col_types = cols(.default = "c"))
za<-problems(codes)
#names(za)
#za
za %>% count(col)
## # A tibble: 0 x 2
## # ... with 2 variables: col <int>, n <int>
zb<-za %>% filter(col=='PanelType')
#make l data to be one row per AL
l2<-l %>% group_by(AnswerListId,AnswerListName) %>% summarise( loincNumber=paste(LoincNumber,collapse = '|') ,loincCommonName=paste(LongCommonName,collapse = '|')
,n=n()
)
#join them
#just one line per list
d<-al %>% left_join(l2)
## Joining, by = c("AnswerListId", "AnswerListName")
#multiple lines per list for each loinc code it is linked to
big<-l %>% left_join(al)
## Joining, by = c("AnswerListId", "AnswerListName")
#get just lists from two different directions
lkup1<-al %>% count(AnswerListId,AnswerListName)
lkup2<-l %>% count(AnswerListId,AnswerListName,AnswerListLinkType) %>%
arrange(AnswerListLinkType) %>%
group_by(AnswerListId,AnswerListName) %>% summarise(AnswerListLinkType=paste(AnswerListLinkType,collapse = '|')) %>% ungroup
#majority are normative
lkup2 %>% count(AnswerListLinkType,sort = TRUE)
## # A tibble: 7 x 2
## AnswerListLinkType n
## <chr> <int>
## 1 NORMATIVE 2119
## 2 EXAMPLE 1040
## 3 PREFERRED 299
## 4 EXAMPLE|NORMATIVE 159
## 5 EXAMPLE|PREFERRED 39
## 6 EXAMPLE|NORMATIVE|PREFERRED 14
## 7 NORMATIVE|PREFERRED 8
#names(al)
#al %>% count(ExtCodeId)
ca0<-al %>% count(is.na(ExtCodeDisplayName))
100*1948/nrow(al)
## [1] 7.871025
# ca<-al %>% filter(!is.na(ExtCodeDisplayName))%>% count(ExtCodeDisplayName,sort = TRUE)
ca<-al %>% filter(!is.na(ExtCodeSystem))%>% count(ExtCodeSystem,sort = TRUE)%>% mutate(p=100*n/sum(n))
#what external terminologies (ppoint25)
ca
## # A tibble: 3 x 3
## ExtCodeSystem n p
## <chr> <int> <dbl>
## 1 http://snomed.info/sct 1943 98.8
## 2 CMS/HCPCS 18 0.916
## 3 Unicode 5 0.254
#just to check
ca2<-al %>% count(ExtCodeSystem,sort = TRUE)%>% mutate(p=100*n/sum(n))
#snomed is the answer in 1943
ca
## # A tibble: 3 x 3
## ExtCodeSystem n p
## <chr> <int> <dbl>
## 1 http://snomed.info/sct 1943 98.8
## 2 CMS/HCPCS 18 0.916
## 3 Unicode 5 0.254
#in 7.8% snomed value is the answer for a loinc list
#prop.table(table(ca$n))
#if snomed, is it all snomed
options(scipen = 999)
sal<-al %>% filter(ExtCodeSystem=='http://snomed.info/sct')
names(big)
## [1] "LoincNumber" "LongCommonName"
## [3] "AnswerListId" "AnswerListName"
## [5] "AnswerListLinkType" "ApplicableContext"
## [7] "AnswerListOID" "ExtDefinedYN"
## [9] "ExtDefinedAnswerListCodeSystem" "ExtDefinedAnswerListLink"
## [11] "AnswerStringId" "LocalAnswerCode"
## [13] "LocalAnswerCodeSystem" "SequenceNumber"
## [15] "DisplayText" "ExtCodeId"
## [17] "ExtCodeDisplayName" "ExtCodeSystem"
## [19] "ExtCodeSystemVersion" "ExtCodeSystemCopyrightNotice"
## [21] "SubsequentTextPrompt" "Description"
## [23] "Score"
names(codes)
## [1] "LOINC_NUM" "COMPONENT"
## [3] "PROPERTY" "TIME_ASPCT"
## [5] "SYSTEM" "SCALE_TYP"
## [7] "METHOD_TYP" "CLASS"
## [9] "VersionLastChanged" "CHNG_TYPE"
## [11] "DefinitionDescription" "STATUS"
## [13] "CONSUMER_NAME" "CLASSTYPE"
## [15] "FORMULA" "SPECIES"
## [17] "EXMPL_ANSWERS" "SURVEY_QUEST_TEXT"
## [19] "SURVEY_QUEST_SRC" "UNITSREQUIRED"
## [21] "SUBMITTED_UNITS" "RELATEDNAMES2"
## [23] "SHORTNAME" "ORDER_OBS"
## [25] "CDISC_COMMON_TESTS" "HL7_FIELD_SUBFIELD_ID"
## [27] "EXTERNAL_COPYRIGHT_NOTICE" "EXAMPLE_UNITS"
## [29] "LONG_COMMON_NAME" "UnitsAndRange"
## [31] "EXAMPLE_UCUM_UNITS" "EXAMPLE_SI_UCUM_UNITS"
## [33] "STATUS_REASON" "STATUS_TEXT"
## [35] "CHANGE_REASON_PUBLIC" "COMMON_TEST_RANK"
## [37] "COMMON_ORDER_RANK" "COMMON_SI_TEST_RANK"
## [39] "HL7_ATTACHMENT_STRUCTURE" "EXTERNAL_COPYRIGHT_LINK"
## [41] "PanelType" "AskAtOrderEntry"
## [43] "AssociatedObservations" "VersionFirstReleased"
## [45] "ValidHL7AttachmentRequest" "DisplayName"
big2=big %>% left_join(codes, by = c("LoincNumber"="LOINC_NUM"))
sbig<-big2 %>% filter(ExtCodeSystem=='http://snomed.info/sct')
fa<-sbig %>% select(LoincNumber,CLASSTYPE) %>% distinct
fa %>% count(CLASSTYPE)
## # A tibble: 4 x 2
## CLASSTYPE n
## <chr> <int>
## 1 1 3498
## 2 2 2320
## 3 3 67
## 4 4 1431
#ppoint 30
#fa
codesWSct=fa %>% filter(CLASSTYPE==1)
nrow(codesWSct)
## [1] 3498
#in all LOINC there are that many lab codes (value 1 for type)
fb<-codes %>% count(CLASSTYPE)
fb
## # A tibble: 4 x 2
## CLASSTYPE n
## <chr> <int>
## 1 1 56364
## 2 2 24190
## 3 3 1157
## 4 4 10658
lab_sbig = sbig %>% inner_join(codesWSct)
## Joining, by = c("LoincNumber", "CLASSTYPE")
# Type Allowed values:
# 1
#
# 2
#
# 3
#
# 4
#
# Numeric representation of the class type. To search for all the LOINC codes containing glucose with a class type of 2 use the query:
# glucose Type:2
#
# See the LOINC Users' Guide for definitions of each class type.
# TypeName Allowed values:
# Lab
# Clinical
# Attachment
# Survey
# String representation of the class type. To search for all the LOINC codes containing glucose with a class type of clinical use the query:
# glucose TypeName:clinical
#LL construct (interim) between lab test and PV
ga=codes %>% group_by(CLASSTYPE,SCALE_TYP) %>% summarize(n=n()) %>% ungroup() %>% group_by(CLASSTYPE) %>% mutate(perc=100*n/sum(n))
ga
## # A tibble: 32 x 4
## # Groups: CLASSTYPE [4]
## CLASSTYPE SCALE_TYP n perc
## <chr> <chr> <int> <dbl>
## 1 1 - 1043 1.85
## 2 1 Doc 690 1.22
## 3 1 Nar 434 0.770
## 4 1 Nom 3540 6.28
## 5 1 Ord 14711 26.1
## 6 1 OrdQn 1437 2.55
## 7 1 Qn 34509 61.2
## 8 2 - 1030 4.26
## 9 2 * 1 0.00413
## 10 2 Doc 9512 39.3
## # ... with 22 more rows
#lab_sbig #has one row per answer
lcnt<-lab_sbig %>% select(LoincNumber) %>% distinct() %>% nrow()
#percentage of all lab concepts
lcnt/56364 #ppoint 40
## [1] 0.06206089
#what type of tests refence SCT
ua<-lab_sbig %>% select(LoincNumber,PROPERTY) %>% distinct() %>% count(PROPERTY, sort=TRUE)
ub<-lab_sbig %>% select(LoincNumber,SCALE_TYP) %>% distinct() %>% count(SCALE_TYP, sort=TRUE)
ub
## # A tibble: 6 x 2
## SCALE_TYP n
## <chr> <int>
## 1 Ord 3204
## 2 Nom 287
## 3 Qn 3
## 4 Nar 2
## 5 - 1
## 6 Doc 1
#
# Ord
# 3204
#
# Nom
# 287
(3204+287)/lcnt
## [1] 0.9979989
#what value is most represented
names(lab_sbig)
## [1] "LoincNumber" "LongCommonName"
## [3] "AnswerListId" "AnswerListName"
## [5] "AnswerListLinkType" "ApplicableContext"
## [7] "AnswerListOID" "ExtDefinedYN"
## [9] "ExtDefinedAnswerListCodeSystem" "ExtDefinedAnswerListLink"
## [11] "AnswerStringId" "LocalAnswerCode"
## [13] "LocalAnswerCodeSystem" "SequenceNumber"
## [15] "DisplayText" "ExtCodeId"
## [17] "ExtCodeDisplayName" "ExtCodeSystem"
## [19] "ExtCodeSystemVersion" "ExtCodeSystemCopyrightNotice"
## [21] "SubsequentTextPrompt" "Description"
## [23] "Score" "COMPONENT"
## [25] "PROPERTY" "TIME_ASPCT"
## [27] "SYSTEM" "SCALE_TYP"
## [29] "METHOD_TYP" "CLASS"
## [31] "VersionLastChanged" "CHNG_TYPE"
## [33] "DefinitionDescription" "STATUS"
## [35] "CONSUMER_NAME" "CLASSTYPE"
## [37] "FORMULA" "SPECIES"
## [39] "EXMPL_ANSWERS" "SURVEY_QUEST_TEXT"
## [41] "SURVEY_QUEST_SRC" "UNITSREQUIRED"
## [43] "SUBMITTED_UNITS" "RELATEDNAMES2"
## [45] "SHORTNAME" "ORDER_OBS"
## [47] "CDISC_COMMON_TESTS" "HL7_FIELD_SUBFIELD_ID"
## [49] "EXTERNAL_COPYRIGHT_NOTICE" "EXAMPLE_UNITS"
## [51] "LONG_COMMON_NAME" "UnitsAndRange"
## [53] "EXAMPLE_UCUM_UNITS" "EXAMPLE_SI_UCUM_UNITS"
## [55] "STATUS_REASON" "STATUS_TEXT"
## [57] "CHANGE_REASON_PUBLIC" "COMMON_TEST_RANK"
## [59] "COMMON_ORDER_RANK" "COMMON_SI_TEST_RANK"
## [61] "HL7_ATTACHMENT_STRUCTURE" "EXTERNAL_COPYRIGHT_LINK"
## [63] "PanelType" "AskAtOrderEntry"
## [65] "AssociatedObservations" "VersionFirstReleased"
## [67] "ValidHL7AttachmentRequest" "DisplayName"
va<-lab_sbig %>% group_by(DisplayText) %>% summarize(n=n()
,shortnames=paste(SHORTNAME,collapse = '|')
,LONG_COMMON_NAMEs=paste(LONG_COMMON_NAME,collapse = '|')
) %>% arrange(desc(n)) %>% ungroup() %>% mutate(perc=n/sum(n))
va %>% datatable(rownames = FALSE)
va %>% write_csv('LA-answers.csv')
#how many SCT answers appear on only 1 loinc code?
va %>% count(n)
## # A tibble: 29 x 2
## n nn
## <int> <int>
## 1 1 80
## 2 2 4
## 3 3 28
## 4 4 86
## 5 5 3
## 6 6 8
## 7 7 1
## 8 8 1
## 9 9 2
## 10 11 4
## # ... with 19 more rows
names(l)
## [1] "LoincNumber" "LongCommonName" "AnswerListId"
## [4] "AnswerListName" "AnswerListLinkType" "ApplicableContext"
xa<-l %>% count(AnswerListId,AnswerListName)
xb<-l %>% count(AnswerListId,AnswerListName,AnswerListLinkType)
xc<-l %>% count(AnswerListLinkType)
xd <- l %>% select(LoincNumber,AnswerListId,AnswerListName,AnswerListLinkType) %>% distinct()
xe <- lab_sbig %>% select(LoincNumber,AnswerListId,AnswerListName,AnswerListLinkType) %>% distinct()
xf<-xe %>% count(AnswerListLinkType, sort=TRUE)
xf %<>% mutate(perc=n/sum(n))
xf
## # A tibble: 3 x 3
## AnswerListLinkType n perc
## <chr> <int> <dbl>
## 1 EXAMPLE 3392 0.969
## 2 PREFERRED 97 0.0277
## 3 NORMATIVE 10 0.00286
#LAs not linked to SCT
non<-big2 %>% filter(is.na(ExtCodeSystem))
nonfa<-non %>% select(LoincNumber,CLASSTYPE) %>% distinct
nonfa %>% count(CLASSTYPE)
## # A tibble: 4 x 2
## CLASSTYPE n
## <chr> <int>
## 1 1 3859
## 2 2 3561
## 3 3 162
## 4 4 6411
#ppoint 130
nonfa
## # A tibble: 13,993 x 2
## LoincNumber CLASSTYPE
## <chr> <chr>
## 1 10061-0 2
## 2 10389-5 1
## 3 10390-3 1
## 4 10393-7 1
## 5 10395-2 1
## 6 10401-8 1
## 7 10410-9 1
## 8 10568-4 1
## 9 10570-0 1
## 10 10571-8 1
## # ... with 13,983 more rows
# codesWSct=fa %>% filter(CLASSTYPE==1)
# nrow(codesWSct)
#
#
# lab_sbig = sbig %>% inner_join(codesWSct)
ya<-non %>% count(CLASSTYPE,DisplayText, sort=TRUE)
nonterms<-non %>% count(DisplayText, sort=TRUE) %>% mutate(term=tolower(DisplayText))
nonterms2<-non %>% count(CLASSTYPE,DisplayText, sort=TRUE) %>% mutate(term=tolower(DisplayText))
load('C:/c/OneDrive - National Institutes of Health/snomed/snomedct.rda')
SYNONYM=900000000000013056
SCTSyns<-d %>% filter(typeId==SYNONYM) %>% filter(active==1) %>% mutate(term=tolower(term))
yc<-nonterms %>% left_join(SCTSyns)
## Joining, by = "term"
#additional grouping by loinc type (lab vs clinical vs survey)
yc2<-nonterms2 %>% left_join(SCTSyns) %>% filter(!is.na(conceptId))
## Joining, by = "term"
yc2 %>% write_csv('LA-nonMapped.csv')
nrow(yc)
## [1] 17741
names(yc)
## [1] "DisplayText" "n" "term"
## [4] "id" "effectiveTime" "active"
## [7] "moduleId" "conceptId" "languageCode"
## [10] "typeId" "caseSignificanceId"
yd=yc %>% count(!is.na(conceptId))
yd
## # A tibble: 2 x 2
## `!is.na(conceptId)` n
## <lgl> <int>
## 1 FALSE 13815
## 2 TRUE 3926