This file

folder='k:/w/loinc'
folder=getwd()
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   0.8.5
## v tidyr   1.0.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
library(DT)

a<-fs::path(folder,'l','LoincAnswerListLink.csv')
l<-read_csv(a)
## Parsed with column specification:
## cols(
##   LoincNumber = col_character(),
##   LongCommonName = col_character(),
##   AnswerListId = col_character(),
##   AnswerListName = col_character(),
##   AnswerListLinkType = col_character(),
##   ApplicableContext = col_character()
## )
b<-fs::path(folder,'l','AnswerList.csv')
al<-read_csv(b)
## Parsed with column specification:
## cols(
##   AnswerListId = col_character(),
##   AnswerListName = col_character(),
##   AnswerListOID = col_character(),
##   ExtDefinedYN = col_character(),
##   ExtDefinedAnswerListCodeSystem = col_logical(),
##   ExtDefinedAnswerListLink = col_logical(),
##   AnswerStringId = col_character(),
##   LocalAnswerCode = col_character(),
##   LocalAnswerCodeSystem = col_logical(),
##   SequenceNumber = col_double(),
##   DisplayText = col_character(),
##   ExtCodeId = col_double(),
##   ExtCodeDisplayName = col_character(),
##   ExtCodeSystem = col_character(),
##   ExtCodeSystemVersion = col_character(),
##   ExtCodeSystemCopyrightNotice = col_character(),
##   SubsequentTextPrompt = col_logical(),
##   Description = col_character(),
##   Score = col_double()
## )
## Warning: 497 parsing failures.
##  row                      col           expected                                              actual                                               file
## 3110 SubsequentTextPrompt     1/0/T/F/TRUE/FALSE # of months:                                        'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5659 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE http://www.cdc.gov/nchs/icd.htm                     'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5660 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE http://www.cdc.gov/nchs/icd.htm                     'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5661 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE http://www.nlm.nih.gov/research/umls/rxnorm/        'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## 5864 ExtDefinedAnswerListLink 1/0/T/F/TRUE/FALSE https://www.cms.gov/manuals/downloads/clm104c15.pdf 'C:/c/OneDrive/rproject/osandbox/l/AnswerList.csv'
## .... ........................ .................. ................................................... ..................................................
## See problems(...) for more details.
#install.packages("glue")
names(l)
## [1] "LoincNumber"        "LongCommonName"     "AnswerListId"      
## [4] "AnswerListName"     "AnswerListLinkType" "ApplicableContext"
names(al)
##  [1] "AnswerListId"                   "AnswerListName"                
##  [3] "AnswerListOID"                  "ExtDefinedYN"                  
##  [5] "ExtDefinedAnswerListCodeSystem" "ExtDefinedAnswerListLink"      
##  [7] "AnswerStringId"                 "LocalAnswerCode"               
##  [9] "LocalAnswerCodeSystem"          "SequenceNumber"                
## [11] "DisplayText"                    "ExtCodeId"                     
## [13] "ExtCodeDisplayName"             "ExtCodeSystem"                 
## [15] "ExtCodeSystemVersion"           "ExtCodeSystemCopyrightNotice"  
## [17] "SubsequentTextPrompt"           "Description"                   
## [19] "Score"
c<-b<-fs::path(folder,'l','Loinc.csv')
#all problems are related to all chars as type
codes<-read_csv(c,col_types = cols(.default = "c"))
za<-problems(codes)
#names(za)
#za
za %>% count(col)  
## # A tibble: 0 x 2
## # ... with 2 variables: col <int>, n <int>
zb<-za %>% filter(col=='PanelType')

#make l data to be one row per AL
l2<-l %>% group_by(AnswerListId,AnswerListName) %>% summarise( loincNumber=paste(LoincNumber,collapse = '|') ,loincCommonName=paste(LongCommonName,collapse = '|')
,n=n()
                                               )


#join them
#just one line per list
d<-al %>% left_join(l2)
## Joining, by = c("AnswerListId", "AnswerListName")
#multiple lines per list for each loinc code it is linked to
big<-l %>% left_join(al)
## Joining, by = c("AnswerListId", "AnswerListName")
#get just lists from two different directions
lkup1<-al %>% count(AnswerListId,AnswerListName)
lkup2<-l %>% count(AnswerListId,AnswerListName,AnswerListLinkType) %>% 
  arrange(AnswerListLinkType) %>% 
  group_by(AnswerListId,AnswerListName) %>% summarise(AnswerListLinkType=paste(AnswerListLinkType,collapse = '|')) %>% ungroup

#majority are normative
lkup2 %>% count(AnswerListLinkType,sort = TRUE)
## # A tibble: 7 x 2
##   AnswerListLinkType              n
##   <chr>                       <int>
## 1 NORMATIVE                    2119
## 2 EXAMPLE                      1040
## 3 PREFERRED                     299
## 4 EXAMPLE|NORMATIVE             159
## 5 EXAMPLE|PREFERRED              39
## 6 EXAMPLE|NORMATIVE|PREFERRED    14
## 7 NORMATIVE|PREFERRED             8

Ext codes

#names(al)
#al %>% count(ExtCodeId)
ca0<-al %>% count(is.na(ExtCodeDisplayName))
100*1948/nrow(al)
## [1] 7.871025
# ca<-al %>%  filter(!is.na(ExtCodeDisplayName))%>% count(ExtCodeDisplayName,sort = TRUE)

ca<-al %>%  filter(!is.na(ExtCodeSystem))%>% count(ExtCodeSystem,sort = TRUE)%>% mutate(p=100*n/sum(n))
#what external terminologies (ppoint25)
ca
## # A tibble: 3 x 3
##   ExtCodeSystem              n      p
##   <chr>                  <int>  <dbl>
## 1 http://snomed.info/sct  1943 98.8  
## 2 CMS/HCPCS                 18  0.916
## 3 Unicode                    5  0.254
#just to check
ca2<-al %>% count(ExtCodeSystem,sort = TRUE)%>% mutate(p=100*n/sum(n))


#snomed is the answer in 1943
ca
## # A tibble: 3 x 3
##   ExtCodeSystem              n      p
##   <chr>                  <int>  <dbl>
## 1 http://snomed.info/sct  1943 98.8  
## 2 CMS/HCPCS                 18  0.916
## 3 Unicode                    5  0.254
#in 7.8% snomed value is the answer for a loinc list 
#prop.table(table(ca$n))

#if snomed, is it all snomed

options(scipen = 999)
sal<-al %>% filter(ExtCodeSystem=='http://snomed.info/sct')
names(big)
##  [1] "LoincNumber"                    "LongCommonName"                
##  [3] "AnswerListId"                   "AnswerListName"                
##  [5] "AnswerListLinkType"             "ApplicableContext"             
##  [7] "AnswerListOID"                  "ExtDefinedYN"                  
##  [9] "ExtDefinedAnswerListCodeSystem" "ExtDefinedAnswerListLink"      
## [11] "AnswerStringId"                 "LocalAnswerCode"               
## [13] "LocalAnswerCodeSystem"          "SequenceNumber"                
## [15] "DisplayText"                    "ExtCodeId"                     
## [17] "ExtCodeDisplayName"             "ExtCodeSystem"                 
## [19] "ExtCodeSystemVersion"           "ExtCodeSystemCopyrightNotice"  
## [21] "SubsequentTextPrompt"           "Description"                   
## [23] "Score"
names(codes)
##  [1] "LOINC_NUM"                 "COMPONENT"                
##  [3] "PROPERTY"                  "TIME_ASPCT"               
##  [5] "SYSTEM"                    "SCALE_TYP"                
##  [7] "METHOD_TYP"                "CLASS"                    
##  [9] "VersionLastChanged"        "CHNG_TYPE"                
## [11] "DefinitionDescription"     "STATUS"                   
## [13] "CONSUMER_NAME"             "CLASSTYPE"                
## [15] "FORMULA"                   "SPECIES"                  
## [17] "EXMPL_ANSWERS"             "SURVEY_QUEST_TEXT"        
## [19] "SURVEY_QUEST_SRC"          "UNITSREQUIRED"            
## [21] "SUBMITTED_UNITS"           "RELATEDNAMES2"            
## [23] "SHORTNAME"                 "ORDER_OBS"                
## [25] "CDISC_COMMON_TESTS"        "HL7_FIELD_SUBFIELD_ID"    
## [27] "EXTERNAL_COPYRIGHT_NOTICE" "EXAMPLE_UNITS"            
## [29] "LONG_COMMON_NAME"          "UnitsAndRange"            
## [31] "EXAMPLE_UCUM_UNITS"        "EXAMPLE_SI_UCUM_UNITS"    
## [33] "STATUS_REASON"             "STATUS_TEXT"              
## [35] "CHANGE_REASON_PUBLIC"      "COMMON_TEST_RANK"         
## [37] "COMMON_ORDER_RANK"         "COMMON_SI_TEST_RANK"      
## [39] "HL7_ATTACHMENT_STRUCTURE"  "EXTERNAL_COPYRIGHT_LINK"  
## [41] "PanelType"                 "AskAtOrderEntry"          
## [43] "AssociatedObservations"    "VersionFirstReleased"     
## [45] "ValidHL7AttachmentRequest" "DisplayName"
big2=big %>% left_join(codes, by = c("LoincNumber"="LOINC_NUM"))

sbig<-big2 %>% filter(ExtCodeSystem=='http://snomed.info/sct')
fa<-sbig %>% select(LoincNumber,CLASSTYPE) %>% distinct
fa %>% count(CLASSTYPE)
## # A tibble: 4 x 2
##   CLASSTYPE     n
##   <chr>     <int>
## 1 1          3498
## 2 2          2320
## 3 3            67
## 4 4          1431
#ppoint 30
#fa
codesWSct=fa %>% filter(CLASSTYPE==1)
nrow(codesWSct)
## [1] 3498
#in all LOINC there are that many lab codes (value 1 for type)
fb<-codes %>% count(CLASSTYPE)
fb
## # A tibble: 4 x 2
##   CLASSTYPE     n
##   <chr>     <int>
## 1 1         56364
## 2 2         24190
## 3 3          1157
## 4 4         10658
lab_sbig = sbig %>% inner_join(codesWSct)
## Joining, by = c("LoincNumber", "CLASSTYPE")
# Type  Allowed values:
# 1
# 
# 2
# 
# 3
# 
# 4
# 
# Numeric representation of the class type. To search for all the LOINC codes containing glucose with a class type of 2 use the query:
# glucose Type:2
# 
# See the LOINC Users' Guide for definitions of each class type.
# TypeName  Allowed values:
# Lab
# Clinical
# Attachment
# Survey
# String representation of the class type. To search for all the LOINC codes containing glucose with a class type of clinical use the query:
# glucose TypeName:clinical

#LL construct (interim) between lab test and PV

Type

ga=codes %>% group_by(CLASSTYPE,SCALE_TYP) %>% summarize(n=n()) %>% ungroup() %>%  group_by(CLASSTYPE) %>% mutate(perc=100*n/sum(n))
ga
## # A tibble: 32 x 4
## # Groups:   CLASSTYPE [4]
##    CLASSTYPE SCALE_TYP     n     perc
##    <chr>     <chr>     <int>    <dbl>
##  1 1         -          1043  1.85   
##  2 1         Doc         690  1.22   
##  3 1         Nar         434  0.770  
##  4 1         Nom        3540  6.28   
##  5 1         Ord       14711 26.1    
##  6 1         OrdQn      1437  2.55   
##  7 1         Qn        34509 61.2    
##  8 2         -          1030  4.26   
##  9 2         *             1  0.00413
## 10 2         Doc        9512 39.3    
## # ... with 22 more rows

LinkType

names(l)
## [1] "LoincNumber"        "LongCommonName"     "AnswerListId"      
## [4] "AnswerListName"     "AnswerListLinkType" "ApplicableContext"
xa<-l %>% count(AnswerListId,AnswerListName)
xb<-l %>% count(AnswerListId,AnswerListName,AnswerListLinkType)
xc<-l %>% count(AnswerListLinkType)

xd <- l %>% select(LoincNumber,AnswerListId,AnswerListName,AnswerListLinkType) %>% distinct()
xe <- lab_sbig %>% select(LoincNumber,AnswerListId,AnswerListName,AnswerListLinkType) %>% distinct()
xf<-xe %>% count(AnswerListLinkType, sort=TRUE)
xf %<>% mutate(perc=n/sum(n))
xf
## # A tibble: 3 x 3
##   AnswerListLinkType     n    perc
##   <chr>              <int>   <dbl>
## 1 EXAMPLE             3392 0.969  
## 2 PREFERRED             97 0.0277 
## 3 NORMATIVE             10 0.00286
#LAs not linked to SCT

non<-big2 %>% filter(is.na(ExtCodeSystem))
nonfa<-non %>% select(LoincNumber,CLASSTYPE) %>% distinct
nonfa %>% count(CLASSTYPE)
## # A tibble: 4 x 2
##   CLASSTYPE     n
##   <chr>     <int>
## 1 1          3859
## 2 2          3561
## 3 3           162
## 4 4          6411
#ppoint 130
nonfa
## # A tibble: 13,993 x 2
##    LoincNumber CLASSTYPE
##    <chr>       <chr>    
##  1 10061-0     2        
##  2 10389-5     1        
##  3 10390-3     1        
##  4 10393-7     1        
##  5 10395-2     1        
##  6 10401-8     1        
##  7 10410-9     1        
##  8 10568-4     1        
##  9 10570-0     1        
## 10 10571-8     1        
## # ... with 13,983 more rows
# codesWSct=fa %>% filter(CLASSTYPE==1)
# nrow(codesWSct)
# 
# 
# lab_sbig = sbig %>% inner_join(codesWSct)

ya<-non %>% count(CLASSTYPE,DisplayText, sort=TRUE)
nonterms<-non %>% count(DisplayText, sort=TRUE) %>% mutate(term=tolower(DisplayText))
nonterms2<-non %>% count(CLASSTYPE,DisplayText, sort=TRUE) %>% mutate(term=tolower(DisplayText))

load('C:/c/OneDrive - National Institutes of Health/snomed/snomedct.rda')
SYNONYM=900000000000013056

SCTSyns<-d %>% filter(typeId==SYNONYM) %>% filter(active==1) %>% mutate(term=tolower(term))

yc<-nonterms %>% left_join(SCTSyns)
## Joining, by = "term"
#additional grouping by loinc type (lab vs clinical vs survey)
yc2<-nonterms2 %>% left_join(SCTSyns) %>% filter(!is.na(conceptId))
## Joining, by = "term"
yc2 %>% write_csv('LA-nonMapped.csv')
nrow(yc)
## [1] 17741
names(yc)
##  [1] "DisplayText"        "n"                  "term"              
##  [4] "id"                 "effectiveTime"      "active"            
##  [7] "moduleId"           "conceptId"          "languageCode"      
## [10] "typeId"             "caseSignificanceId"
yd=yc %>% count(!is.na(conceptId))
yd
## # A tibble: 2 x 2
##   `!is.na(conceptId)`     n
##   <lgl>               <int>
## 1 FALSE               13815
## 2 TRUE                 3926