The purpose of this notebook is to examine EHR data to better understand the complex relationships between heart failure, air pollution, and family history. Here, we match ~21,000 individual EHRs with ~11,700 instances of family health history. Below, this data is loaded into the workspace. The family history is converted from long to wide format.
In the Family History data, some family members report multiple diseases, which creates duplicate columns for a single individual. Presently, the dataframe fam_wide takes each potential family member as a factor and reports the first listed instance of disease, or reports NA for none listed.
Here, the two dataframes are merged so that each EHR is paired with the affiliated family history (fam_wide). The family history contained extraneous records for which we do not having corresponding EHR data, so these were removed.
library(faraway); library(gtsummary);library(ggplot2);library(gridExtra);library(kableExtra);library(gt);library(tidyverse);library(glue);library(formattable);library(DT)
load("alice_data.RData")
fam <- read.csv("fam_hx.csv")
fam_wide <- reshape(fam, idvar = "MRN", timevar = "FAM_RLTNP", direction = "wide")
x <- merge(alice_data,fam_wide, all=TRUE)
df <- x[!is.na(x$AGE),]
Below, a summary table examines the patient history data by whether or not the patient is living. To be honest, I can’t remember off the top of my head which package we are using here, but will confirm on Thursday.
tbl_summary(df[,2:27], by= ALL_CAUSE) %>% #this is grabbing only the factors I care about from the patient data
add_p() %>% #add_p will add a column with a p-value of some inferential stat between the two variables
modify_spanning_header(everything() ~ "**SUMMARY OF PT DATA BY ALIVENESS**") %>%
modify_header(label = "**Variable**", stat_1 = "**LIVING, N = 15,599**",stat_2="**DECEASED, N = 5,321**") %>%
bold_labels()
| SUMMARY OF PT DATA BY ALIVENESS | |||
|---|---|---|---|
| Variable | LIVING, N = 15,5991 | DECEASED, N = 5,3211 | p-value2 |
| AGE | 69 (58, 79) | 74 (63, 84) | <0.001 |
| SEX_CD | <0.001 | ||
| F | 8,342 (53%) | 2,656 (50%) | |
| M | 7,257 (47%) | 2,665 (50%) | |
| RACE | <0.001 | ||
| BLACK | 4,227 (27%) | 1,337 (25%) | |
| OTHER | 1,177 (7.5%) | 304 (5.7%) | |
| WHITE | 10,195 (65%) | 3,680 (69%) | |
| ANNUAL_AVG_PM | 9.30 (8.60, 10.06) | 10.54 (9.34, 12.76) | <0.001 |
| visit_year | 2,015 (2,012, 2,016) | 2,009 (2,006, 2,014) | <0.001 |
| income | 48,276 (35,000, 65,333) | 52,550 (36,967, 70,795) | <0.001 |
| med_house_value | 146,300 (105,600, 214,700) | 167,400 (114,200, 263,400) | <0.001 |
| pubassist | 0.92 (0.00, 2.93) | 0.57 (0.00, 2.91) | <0.001 |
| urban | 89 (13, 100) | 87 (11, 100) | 0.8 |
| poverty | 14 (7, 25) | 13 (6, 23) | <0.001 |
| NEW_TOT_VISITS | 11 (4, 26) | 10 (3, 31) | 0.001 |
| N_OUTPATIENT | 9 (3, 24) | 8 (1, 27) | <0.001 |
| NEW_INPATIENT | 1.00 (0.00, 2.00) | 1.00 (1.00, 3.00) | <0.001 |
| N_EMERGENCY | 0.00 (0.00, 1.00) | 1.00 (0.00, 3.00) | <0.001 |
| EMERG_NOT_INPATIENT | 0.00 (0.00, 0.00) | 0.00 (0.00, 1.00) | <0.001 |
| log_FU | 0.64 (-0.29, 1.47) | 0.02 (-1.54, 1.04) | <0.001 |
| PM_5day_avg | 8.8 (7.0, 10.9) | 9.8 (7.7, 12.6) | <0.001 |
| HARMONIZED_SMK | <0.001 | ||
| CURRENT | 1,785 (11%) | 244 (4.6%) | |
| FORMER | 5,686 (36%) | 829 (16%) | |
| NEVER | 5,545 (36%) | 631 (12%) | |
| UNKNOWN | 2,583 (17%) | 3,617 (68%) | |
| CKD | 9,234 (59%) | 3,949 (74%) | <0.001 |
| IHD | 9,635 (62%) | 3,803 (71%) | <0.001 |
| BP_PRI | 12,248 (79%) | 4,354 (82%) | <0.001 |
| COPD | 6,468 (41%) | 2,557 (48%) | <0.001 |
| T2D | 5,480 (35%) | 2,147 (40%) | <0.001 |
| LIPID | 12,590 (81%) | 4,452 (84%) | <0.001 |
| PAD | 6,631 (43%) | 2,598 (49%) | <0.001 |
|
1
Statistics presented: median (IQR); n (%)
2
Statistical tests performed: Wilcoxon rank-sum test; chi-square test of independence
|
|||
gghisto <- list(
theme(axis.text.x = element_text(face="bold", color="cornflowerblue", size=14,angle=17),
axis.text.y = element_text(face="bold", color="royalblue4",
size=16, angle=25),
axis.title=element_text(size=17,face="italic"),
plot.title = element_text(size=20,face="bold.italic"))
)
ggbar <- list(
theme(axis.text.x = element_text(face="bold", color="gray18", size=10,angle=20),
axis.text.y = element_text(face="bold", color="royalblue4",
size=12, angle=25),
axis.title=element_text(size=12,face="italic"),
plot.title = element_text(size=16,face="bold.italic"))
)
p1 <- ggplot(df, aes(x=NEW_TOT_VISITS)) + #grabs the total visits from the df
geom_histogram(aes(y=..density..), #creates a histogram of dist (bars)
binwidth=6,
color="gray18",fill="white")+
geom_density(alpha=.2, fill="red",size=.5, color="gray14") + #creates a density distribution (pink)
ggtitle("Frequency Dist of Total Hospital Visits")+
scale_x_continuous(limits = c(0, 150))+ #I added x-axis limits because there are some funky outliers that go out to over 700.
gghisto
p1 #display p1
t <- as.data.frame(table(df$NEW_TOT_VISITS)) #create a frequency table to accompany the plot we made
colnames(t) <- c("No. of Hospital Admissions","Frequency")
as.datatable(formattable(t, align=c("c","r"),list(Frequency = normalize_bar(color="pink"), title)),rownames=FALSE,options = list(
autoWidth=TRUE,
columnDefs = list(list(className = 'dt-center', targets = 0)),
columnDefs = list(list(width = '150px', targets = list(0:1)))))
p2 <- ggplot(df, aes(x=N_OUTPATIENT)) +
geom_histogram(aes(y=..density..),
binwidth=3,
color="gray18",fill="white")+
geom_density(alpha=.2, fill="red",size=.5, color="gray14") +
ggtitle("Frequency Dist of Outpatient Visits")+
scale_x_continuous(limits = c(-1, 100))+ #Using `-1` in the parameter is cosmetic and technically makes the distribution inaccurate, but here I like the way it displays the plot better
gghisto
p2
t <- as.data.frame(table(df$N_OUTPATIENT))
colnames(t) <- c("No. of OutPatient Admissions","Frequency")
as.datatable(formattable(t, align=c("c","r"),list(Frequency = normalize_bar(color="pink"), title)),rownames=FALSE,options = list(
autoWidth=TRUE,
columnDefs = list(list(className = 'dt-center', targets = 0)),
columnDefs = list(list(width = '100px', targets = list(0:1)))))
pa <- ggplot(df, aes(x=ANNUAL_AVG_PM)) +
geom_histogram(aes(y=..density..),
binwidth=.5,
color="gray18",fill="white")+
geom_density(alpha=.2, fill="red",size=.5, color="gray14") +
ggtitle("pm exposure distribution for entire population")+
gghisto
pb <- ggplot(df, aes(x=income)) +
geom_histogram(aes(y=..density..),
binwidth=10000,
color="gray18",fill="white")+
geom_density(alpha=.2, fill="red",size=.5, color="gray14") +
ggtitle("Data Distribution of Pt Income")+
gghisto
pc <- ggplot(df, aes(x=med_house_value)) +
geom_histogram(aes(y=..density..),
binwidth=28000,
color="gray18",fill="white")+
geom_density(alpha=.2, fill="red",size=.5, color="gray14") +
ggtitle("Data Dist of Median House Val")+
gghisto
gen_data_plots <- grid.arrange(pa,pb,pc) #put plots on page
a <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = HARMONIZED_SMK)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x SMOKING")+gghisto
b <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = RACE)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x RACE")+gghisto
c <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = ALL_CAUSE)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x DEATH")+gghisto
ppm_plots <- grid.arrange(a,b,c) #put plots on page
a <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = CKD)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x CKD")+gghisto
b <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = IHD)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x ihd")+gghisto
c <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = COPD)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x copd")+gghisto
d <- ggplot(df, aes(x = ANNUAL_AVG_PM, fill = T2D)) +geom_density(alpha = .3)+ ggtitle("AVG ANNUAL PM x t2d")+gghisto
disease_ppm_plots1 <- grid.arrange(a,b,c,d, nrow=3,ncol=2) #put plots on page
a <- table(df[,21])["TRUE"]
b <- table(df[,21])["FALSE"]
c <- a/(a+b)
ckd <- c(a,b,c)
a <- table(df[,22])["TRUE"]
b <- table(df[,22])["FALSE"]
c <- a/(a+b)
ihd <- c(a,b,c)
a <- table(df[,23])["TRUE"]
b <- table(df[,23])["FALSE"]
c <- a/(a+b)
bp_pri <- c(a,b,c)
a <- table(df[,24])["TRUE"]
b <- table(df[,24])["FALSE"]
c <- a/(a+b)
copd <- c(a,b,c)
a <- table(df[,25])["TRUE"]
b <- table(df[,25])["FALSE"]
c <- a/(a+b)
t2d <- c(a,b,c)
a <- table(df[,26])["TRUE"]
b <- table(df[,26])["FALSE"]
c <- a/(a+b)
lipid <- c(a,b,c)
a <- table(df[,27])["TRUE"]
b <- table(df[,27])["FALSE"]
c <- a/(a+b)
pad <- c(a,b,c)
d <- as.data.frame(rbind(ckd,ihd,bp_pri,copd,t2d,lipid,pad))
colnames(d) <- c("True","False","Proportion with Disease")
formattable(d,list(`True`= color_bar("#B1F4FF"),`False` = color_bar("pink"),`Proportion with Disease`= percent), caption="General Distributions of Diseases for All Patients")
| True | False | Proportion with Disease | |
|---|---|---|---|
| ckd | 13183 | 7737 | 63.02% |
| ihd | 13438 | 7482 | 64.24% |
| bp_pri | 16602 | 4318 | 79.36% |
| copd | 9025 | 11895 | 43.14% |
| t2d | 7627 | 13293 | 36.46% |
| lipid | 17042 | 3878 | 81.46% |
| pad | 9229 | 11691 | 44.12% |
t <- table(df$FAM_MDCL_HIST.Father)
dad <- sort(t, decreasing=TRUE)[1:10]
dad <- data.frame(dad)
colnames(dad) <- c("disease","freq")
t <- table(df$FAM_MDCL_HIST.Mother)
mom <- sort(t, decreasing=TRUE)[1:10]
mom <- data.frame(mom)
colnames(mom) <- c("disease","freq")
fam <- cbind(mom,dad)
colnames(fam) <- c("Top 10 Mom Diseases","frequency","Top 10 Dad Diseases","freq (dad)")
formattable(fam,list(`frequency` = color_bar("#FF78A4"),`freq (dad)` = color_bar("#59E7FF")),caption="Top Ten Diseases by Parents")
| Top 10 Mom Diseases | frequency | Top 10 Dad Diseases | freq (dad) |
|---|---|---|---|
| Cancer | 965 | Heart disease | 1089 |
| Diabetes | 886 | Cancer | 946 |
| Heart disease | 817 | Heart attack | 578 |
| Hypertension | 483 | Diabetes | 569 |
| Heart attack | 276 | Hypertension | 368 |
| Arthritis | 256 | Stroke | 214 |
| Stroke | 230 | Coronary artery disease | 190 |
| Breast cancer | 199 | Alcohol abuse | 186 |
| Heart failure | 165 | Heart failure | 145 |
| Coronary artery disease | 125 | COPD | 141 |