1 General inforamtion

  • I completed this practical using my personal computer, using rstudio. I do not have sql on my machine, so as an example of my sql abilities some sql code is provided.

2 Task 1

  • Task: 1: Spend some time exploring the data and show or discuss what you find. What are the types of data quality issues will you have to consider?
  • Exploring each provided dataset by getting descriptive stats
  • Each file has a partial overlap of ID’s, but not complete info for each pt
  • Each individual file has limited missingness for each variable
  • Patient History
    • Includes questionnaire data, I should assume data here may be incomplete or potentially inaccurate
    • Smoking is commonly under reported
    • Includes a mix of continuous and categorical variables
    • Seems to include markers for cardiac conditions
  • Patient clinical
    • Patient clinical diagnosis_code column includes ICD and snomed codes
  • Patient note
    • Includes medical notes
    • Appears to have multiple empty columns
  • Patient demographics
    • I updated the column ‘male’ to sex to easier visual the sex differences among pts
    • Without a data dictionary I’m going to assume the education variables correspond to level of education with 1 being perhaps not finishing high school and 4 being an advanced degree

2.1 Patient History

pt_hist_explore_tbl <- raw_pt_history %>%
  select(-patient_id) %>%
  tbl_summary(
    statistic = list(
      all_categorical() ~ "{n} ({p}%)",
      all_continuous() ~ c("{mean} ({sd})",
                           "{median} ({p25}, {p75})",
                           "{min}, {max}")
    ),
    type = list(all_continuous() ~ 'continuous2'),
    digits = list(all_categorical() ~ c(0, 1),
                  all_continuous2() ~ c(1, 1))
  ) 

pt_hist_explore_tbl %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Explore Patient History")
Explore Patient History
Characteristic N = 4,2291
currentSmoker 2,090 (49.4%)
cigsPerDay
    Mean (SD) 9.0 (11.9)
    Median (Q1, Q3) 0.0 (0.0, 20.0)
    Min, Max 0.0, 70.0
    Unknown 29
BPMeds 124 (3.0%)
    Unknown 53
prevalentStroke 25 (0.6%)
prevalentHyp 1,313 (31.0%)
diabetes 109 (2.6%)
1 n (%)

2.2 Patient Clinical EHR

pt_clinic_explore_tbl <- raw_pt_clinic %>%
  select(-patient_id) %>%
  tbl_summary(
    statistic = list(
      all_categorical() ~ "{n} ({p}%)",
      all_continuous() ~ c("{mean} ({sd})",
                           "{median} ({p25}, {p75})",
                           "{min}, {max}")
    ),
    type = list(all_continuous() ~ 'continuous2'),
    digits = list(all_categorical() ~ c(0, 1),
                  all_continuous2() ~ c(1, 1))
  ) 

pt_clinic_explore_tbl %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Explore Patient Clinical EHR")
Explore Patient Clinical EHR
Characteristic N = 4,2311
diagnosis_code
     287 (6.8%)
    124111000119102 112 (2.6%)
    15699201000119104 52 (1.2%)
    15699241000119102 52 (1.2%)
    15699281000119107 52 (1.2%)
    16320791000119106 39 (0.9%)
    193410003 78 (1.8%)
    193411004 39 (0.9%)
    193413001 52 (1.2%)
    193687000 39 (0.9%)
    231996009 78 (1.8%)
    232050001 78 (1.8%)
    232051002 78 (1.8%)
    232052009 52 (1.2%)
    232053004 52 (1.2%)
    232054005 51 (1.2%)
    232055006 52 (1.2%)
    232061009 39 (0.9%)
    232063007 61 (1.4%)
    232064001 61 (1.4%)
    232065000 61 (1.4%)
    24704003 39 (0.9%)
    267613004 52 (1.2%)
    28835009 52 (1.2%)
    312898002 39 (0.9%)
    312921000 78 (1.8%)
    312925009 78 (1.8%)
    312927001 39 (0.9%)
    314270008 61 (1.4%)
    360455002 73 (1.7%)
    362.53 78 (1.8%)
    362.74 39 (0.9%)
    368.54 39 (0.9%)
    368.61 39 (0.9%)
    390936003 38 (0.9%)
    408847006 99 (2.3%)
    408848001 99 (2.3%)
    408849009 87 (2.1%)
    408850009 87 (2.1%)
    408851008 87 (2.1%)
    415297005 112 (2.6%)
    54122009 38 (0.9%)
    56852002 38 (0.9%)
    66892 28 (0.7%)
    66894 28 (0.7%)
    66999 42 (1.0%)
    70099003 65 (1.5%)
    711162004 73 (1.7%)
    711482008 38 (0.9%)
    715562001 39 (0.9%)
    716663009 39 (0.9%)
    718718009 38 (0.9%)
    719297006 39 (0.9%)
    719431007 52 (1.2%)
    719520001 78 (1.8%)
    722066001 39 (0.9%)
    737579002 39 (0.9%)
    763387005 78 (1.8%)
    764452004 38 (0.9%)
    764939004 39 (0.9%)
    765191009 39 (0.9%)
    95501007 38 (0.9%)
    H35.35 78 (1.8%)
    H35.351 78 (1.8%)
    H35.352 78 (1.8%)
    H35.353 78 (1.8%)
    H35.359 77 (1.8%)
    H35.52 39 (0.9%)
    H53.51 39 (0.9%)
    H53.63 39 (0.9%)
totChol
    Mean (SD) 236.7 (44.6)
    Median (Q1, Q3) 234.0 (206.0, 263.0)
    Min, Max 107.0, 696.0
    Unknown 50
sysBP
    Mean (SD) 132.4 (22.0)
    Median (Q1, Q3) 128.0 (117.0, 144.0)
    Min, Max 83.5, 295.0
diaBP
    Mean (SD) 82.9 (11.9)
    Median (Q1, Q3) 82.0 (75.0, 90.0)
    Min, Max 48.0, 142.5
BMI
    Mean (SD) 25.8 (4.1)
    Median (Q1, Q3) 25.4 (23.1, 28.0)
    Min, Max 15.5, 56.8
    Unknown 19
heartRate
    Mean (SD) 75.9 (12.0)
    Median (Q1, Q3) 75.0 (68.0, 83.0)
    Min, Max 44.0, 143.0
    Unknown 1
glucose
    Mean (SD) 82.0 (24.0)
    Median (Q1, Q3) 78.0 (71.0, 87.0)
    Min, Max 40.0, 394.0
    Unknown 386
TenYearCHD 641 (15.2%)
1 n (%)

2.3 Patient Note

pt_note_explore_tbl <- raw_pt_note %>%
  select(Note) %>%
  tbl_summary(
    statistic = list(
      all_categorical() ~ "{n} ({p}%)",
      all_continuous() ~ c("{mean} ({sd})",
                           "{median} ({p25}, {p75})",
                           "{min}, {max}")
    ),
    type = list(all_continuous() ~ 'continuous2'),
    digits = list(all_categorical() ~ c(0, 1),
                  all_continuous2() ~ c(1, 1))
  ) 
pt_note_explore_tbl %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Explore Patient Notes")
Explore Patient Notes
Characteristic N = 2,6111
Note
    chest pain 47 (1.8%)
    F/U to procedure 286 (11.0%)
    insurance denied new medicaiton 190 (7.3%)
    medication review 380 (14.6%)
    Nausea, indigestion, heartburn (PRN) 239 (9.2%)
    patient often fatigued 47 (1.8%)
    patient reports being out of breath, feeling pain in the chest 190 (7.3%)
    patient reports feeling pressure in the chest off and on 190 (7.3%)
    patient reports fluttering and irregular pain 190 (7.3%)
    patient reports pain in arm but otherwise says they feel fine 48 (1.8%)
    patient reports pain in foot but otherwise says they feel fine 95 (3.6%)
    patient reports pain in leg but otherwise says they feel fine 47 (1.8%)
    PRO: tightness in chest 188 (7.2%)
    routine visit and labwork 286 (11.0%)
    we discussed improving diet and getting more exercise 47 (1.8%)
    we discussed smoking and family history and I counseled them to lose weight 141 (5.4%)
1 n (%)

2.4 Patient Demographics

pt_demo_explore_tbl <- raw_pt_demo  %>%
  select(-patient_id, -male) %>%
  tbl_summary(
    statistic = list(
      all_categorical() ~ "{n} ({p}%)",
      all_continuous() ~ c("{mean} ({sd})",
                           "{median} ({p25}, {p75})",
                           "{min}, {max}")
    ),
    type = list(all_continuous() ~ 'continuous2'),
    digits = list(all_categorical() ~ c(0, 1),
                  all_continuous2() ~ c(1, 1))
  ) 
pt_demo_explore_tbl %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Explore Patient Demographics")
Explore Patient Demographics
Characteristic N = 4,2401
age
    Mean (SD) 49.6 (8.6)
    Median (Q1, Q3) 49.0 (42.0, 56.0)
    Min, Max 32.0, 70.0
education
    1 1,720 (41.6%)
    2 1,253 (30.3%)
    3 689 (16.7%)
    4 473 (11.4%)
    Unknown 105
sex
    female 2,420 (57.1%)
    male 1,820 (42.9%)
1 n (%)

3 Task 2

  • Task 2: Medical Device Company A comes to us and wants to find out how many patients with diabetes are under 75, have the following diagnostic codes: 408850009, 232063007, 232053004, a total Cholesterol reading between 185 and 230, and a diastolic blood pressure reading of over 100. How many patients meet this criteria? How would you report this information back to Medical Device Company A?
  • How I would complete in sql: drop table if EXISTS diabetes_count;

create table diabetes_count as

WITH history AS ( SELECT patient_id FROM raw_pt_history WHERE diabetes = 1 )

, demo AS ( SELECT patient_id FROM raw_pt_demo WHERE age < 75 —- not including age 75 )

, clinic AS ( SELECT patient_id FROM raw_pt_clinic WHERE diagnosis_code IN (‘408850009’, ‘232063007’, ‘232053004’) AND totChol BETWEEN 185 and 230 AND diaBP > 100 —- not including 100 )

select count(h.patient_id) FROM history as h INNER JOIN demo as d ON h.patient_id = d.patient_id INNER JOIN clinic as C ON h.patient_id = c.patient_id

  • Answer: There are no pts that meet all of those criteria
    • I would let the client know that no patients meet all of these criteria and provide a table of counts of each of the criteria in case they want to adjust their definition.
    • I would return results in the format that we had discussed previously ex: excel doc, rmarkdwon, etc.
all_pts_all_tbls <- rbind(raw_pt_history %>% select(patient_id), 
                          raw_pt_demo %>% select(patient_id),
                          raw_pt_clinic %>% select(patient_id), 
                          raw_pt_note %>% select(patient_id) ) %>%
  distinct()

# there are no patients that meet all criteria
diabetes_pts <- raw_pt_history %>%
  filter(diabetes == 1) %>%
  inner_join(
    raw_pt_demo %>% filter(age < 75), by = 'patient_id'
  ) %>%
  inner_join(
    raw_pt_clinic %>% 
      filter(diagnosis_code == '408850009' | 
               diagnosis_code == '232063007' | 
               diagnosis_code == '232053004') %>%
      filter(totChol >= 185 & totChol <= 230 & diaBP > 100 ), by = 'patient_id'
  ) %>%
  mutate( diab_flag = 1) %>%
  distinct(patient_id, diab_flag)
# lets give counts of each criteria instead:

diabetes_tbl <- raw_pt_history %>%
  inner_join(
    raw_pt_demo, by = 'patient_id'
  ) %>%
  inner_join(raw_pt_clinic, by = 'patient_id') %>%
  mutate(
    code408850009 = case_when(
      diagnosis_code == '408850009' ~ 1,
      T ~ 0 
    ), 
    code232063007 = case_when(
      diagnosis_code == '232063007' ~ 1,
      T ~ 0 
    ),
    code232053004 = case_when(
      diagnosis_code == '232053004' ~ 1,
      T ~ 0 
    ),
    diabetes_cholesteral = case_when(
      totChol >= 185 & totChol <= 230 ~ 1,
      T ~ 0
    ),
    diabetes_bp = case_when(
      diaBP > 100 ~ 1,
      T ~ 0
    ),
    age_under75 = case_when(
      age < 75 ~ 1,
      T ~ 0
    )
  ) %>%
  select( diabetes, code408850009, code232063007, code232053004, diabetes_cholesteral, 
          diabetes_bp, age_under75) 

label(diabetes_tbl$diabetes) <- "Diabetes"
label(diabetes_tbl$code408850009) <- "408850009"
label(diabetes_tbl$code232063007) <- "232063007"
label(diabetes_tbl$code232053004) <- "232053004"
label(diabetes_tbl$diabetes_cholesteral) <- "Total cholesterol between 185 and 230"
label(diabetes_tbl$diabetes_bp) <- "Diastolic blood pressure reading of over 100"
label(diabetes_tbl$age_under75) <- "Age under 75"

diabetes_counts <- diabetes_tbl %>%
  tbl_summary()

diabetes_counts  %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Counts of diabetes criteria")
Counts of diabetes criteria
Characteristic N = 4,2201
Diabetes 109 (2.6%)
408850009 87 (2.1%)
232063007 61 (1.4%)
232053004 52 (1.2%)
Total cholesterol between 185 and 230 1,519 (36%)
Diastolic blood pressure reading of over 100 317 (7.5%)
Age under 75
    1 4,220 (100%)
1 n (%)

4 Task 3

  • Task 3: Pharma Co. Z has a product on the market to prevent heart attacks. They want to study patients who complain of signs of heart attacks to their doctors. Specifically, they are interested in knowing how many patients complain of pain, fluttering, pressure, or tightness in their chest have that documented in the notes of their record. Write code for and provide counts of how many unique patients match this criteria. Write a short summary to the client communicating what you did. If we wanted to recommend a more advanced text search, what would you suggest doing?
  • Answer:
    • Of the 2,611 pts with notes available, 995 include the verbiage Pharma Co. Z is interested in including in the analysis.
    • To get this answer I made sure all the text was lower case and did a stringr_detect search for each key word.
    • Hello Pharma Co. Z, I did a basic word search through the patients notes and identified 995 patients that have notes matching the words you are interested in. However, I did not search for context. So If a note included the word pain but was referring to non cardiac pain, it was still included in the results. To grab patients that may be more relevant to your area of interest, I might recommend a more complex natural language processing algorithm. I can do some research on what may be available and best for your needs.
ha_notes <- raw_pt_note %>%
  select(patient_id, Note) %>%
  mutate(Note = tolower(Note) ) %>%
 filter(str_detect(Note, paste0(c("pain","fluttering","pressure","tightness"), collapse = '|') )) %>%
  mutate( HA_flag = 1)

ha_notes_count <- ha_notes %>%
  summarize( HA_count = n_distinct(patient_id))

5 Task 4

5.1 Task 4a

  • How many patients are male, have diagnoses: 232065000 or H35.52, and the physician noted the patient complains of pain, fluttering, pressure or tightness in their chest?
  • Answer: Of the 995 pts with these code words in their notes 13 additional have either of these diagnosis and are male.
task4a <- all_tbls %>%
  inner_join(ha_notes %>% select(patient_id, HA_flag), by = 'patient_id') %>%
  filter(diagnosis_code == '232065000' | diagnosis_code == 'H35.52' ) %>%
  filter(HA_flag == 1) %>%
  filter(sex == 'male') %>%
  summarize( HA_count = n_distinct(patient_id))

5.2 Task 4b

  • How many patients with either of the two diagnoses might have experienced those symptoms, but cannot be confirmed using this data? Write 1-2 sentences for Pharma Co. Z about why this might be the case.
  • Answer: Of the patients that had a diagnosis of 232065000 or H35.52, 72 either didn’t have a patient note or the note did not include the words of interest.
  • Hello Pharma Co. Z, When we look at the data of who has those diagnosis of interest, but were not confirmed by patient notes. We find 72 individuals. This individuals either didn’t have the word criteria in their notes or did not have any patient notes available. If you’re trying to build as robust a cohort as possible, I would recommend a definition that is inclusive of multiple data sources, diagnostics, history, and notes. To Capture as many patients as possible.
task4b <- all_tbls %>%
  left_join(ha_notes %>% select(patient_id, HA_flag), by = 'patient_id') %>%
  filter(diagnosis_code == '232065000' | diagnosis_code == 'H35.52' ) %>%
  filter(is.na(HA_flag)) %>%
  summarize( HA_count = n_distinct(patient_id))

6 Task 5

  • Assess the distribution of TenYearCHD among men and women. Run a statistical test to determine significance and interpret the results.
  • Answer: The difference between the distribution of ten year CHD by sex is statistically significant with a p-value < 0.001. Identified via chi-square test.
chd_tbl <- all_tbls %>%
  filter(!is.na(TenYearCHD) & !is.na(sex))

chd_by_sex <- chd_tbl %>%
  tbl_summary(
    include = c(TenYearCHD, sex),
    by = 'sex'
  ) %>%
  add_p()

chd_by_sex  %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Ten year CHD by sex")
Ten year CHD by sex
Characteristic female
N = 2,415
1
male
N = 1,816
1
p-value2
TenYearCHD 299 (12%) 342 (19%) <0.001
1 n (%)
2 Pearson’s Chi-squared test

7 Task 6

  • Is there a relationship between the number of cigarettes smoked per day and the prevalence of diabetes? Run a statistical test to determine significance, interpret the results and create a visualization to display this.
  • I excluded those that did not have results for diabetes status or cigarettes per day. I then ran a binary model predicting diabetes status. Those that smoked more cigarettes were slightly protective against diabetes. Though the p-value is significant the CI includes 1 so I would not say that this result was significant.
  • When we look at the box plot we see that those that are non diabetic do have some outliers that could be driving the results we see
diabetes_cig_tbl <- all_tbls %>%
  filter(!is.na(cigsPerDay) & !is.na(diabetes)) %>%
  mutate(Diabetes = case_when(
    diabetes == 1 ~ 'Diabetic',
    T ~ 'Non-Diabetic'
  ))

# model
m1 <- glm(diabetes ~ cigsPerDay, diabetes_cig_tbl, family = binomial)
cigs_diabets_tbl <- tbl_regression(m1, exponentiate = TRUE)

# figure
p <- ggplot(diabetes_cig_tbl, aes(x=Diabetes, y=cigsPerDay, fill=Diabetes)) + 
  geom_boxplot() + 
  stat_summary(fun.y=mean, geom="point", shape=23, size=4) +
  theme_classic()

cigs_diabets_tbl  %>%
  gtsummary::as_gt() %>%
  gt::tab_header("Prediction model diabetes by cigarettes per day")
Prediction model diabetes by cigarettes per day
Characteristic OR 95% CI p-value
cigsPerDay 0.98 0.96, 1.00 0.017
Abbreviations: CI = Confidence Interval, OR = Odds Ratio
p