# Read the main data set from its CSV file.
morgan_data <- read.csv(
  file = "/Volumes/WardCaviness_Cavin_IRB17-0150/Hospitalizations and Family History/Heart Failure/mnarain/morgan_datacsv.csv"
)
# load("/Volumes/WardCaviness_Cavin_IRB17-0150/Hospitalizations and Family History/Heart Failure/mnarain/morgan_data.RData")
# Read the family-history table; it is merged onto morgan_data by MRN further down.
famhist <- read.csv(
  file = "/Volumes/WardCaviness_Cavin_IRB17-0150/Hospitalizations and Family History/Heart Failure/FamHistory Data/HF Family History.csv"
)
# Print a per-column summary of the main data set.
summary(morgan_data)
## X MRN AGE SEX_CD
## Min. : 6 Min. :2.400e+02 Min. : 20.02 Length:12474
## 1st Qu.:11988 1st Qu.:6.375e+06 1st Qu.: 58.94 Class :character
## Median :19609 Median :1.544e+07 Median : 70.14 Mode :character
## Mean :19360 Mean :2.446e+10 Mean : 69.01
## 3rd Qu.:28406 3rd Qu.:4.256e+07 3rd Qu.: 80.85
## Max. :35714 Max. :1.001e+11 Max. :107.16
## RACE ANNUAL_AVG_PM visit_year income
## Length:12474 Min. : 2.785 Min. :2004 Min. : 9330
## Class :character 1st Qu.: 8.894 1st Qu.:2008 1st Qu.: 36136
## Mode :character Median : 9.708 Median :2013 Median : 50852
## Mean :10.162 Mean :2012 Mean : 55592
## 3rd Qu.:11.503 3rd Qu.:2015 3rd Qu.: 68750
## Max. :16.628 Max. :2016 Max. :233438
## med_house_value pubassist urban poverty
## Min. : 10000 Min. : 0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.:113400 1st Qu.: 0.0000 1st Qu.: 18.80 1st Qu.: 6.299
## Median :158900 Median : 0.7353 Median : 91.15 Median :13.008
## Mean :193088 Mean : 1.9649 Mean : 64.87 Mean :16.603
## 3rd Qu.:236000 3rd Qu.: 2.9412 3rd Qu.:100.00 3rd Qu.:23.402
## Max. :976000 Max. :28.2946 Max. :100.00 Max. :82.222
## N_30day_readmit N_7day_readmit N_60day_readmit N_90day_readmit
## Min. : 0.0000 Min. : 0.0000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.0000 Median : 0.000 Median : 0.00
## Mean : 0.5754 Mean : 0.1419 Mean : 0.858 Mean : 1.02
## 3rd Qu.: 1.0000 3rd Qu.: 0.0000 3rd Qu.: 1.000 3rd Qu.: 1.00
## Max. :108.0000 Max. :105.0000 Max. :110.000 Max. :110.00
## log_FU ALL_CAUSE PM_5day_avg HARMONIZED_SMK
## Min. :-5.8999 Mode :logical Min. : 0.4737 Length:12474
## 1st Qu.:-0.5480 FALSE:8124 1st Qu.: 7.2912 Class :character
## Median : 0.6029 TRUE :4350 Median : 9.2761 Mode :character
## Mean : 0.2801 Mean : 9.9379
## 3rd Qu.: 1.5091 3rd Qu.:11.5844
## Max. : 2.5245 Max. :46.7985
## CKD IHD BP_PRI COPD
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:3969 FALSE:3824 FALSE:2199 FALSE:6544
## TRUE :8505 TRUE :8650 TRUE :10275 TRUE :5930
##
##
##
## T2D LIPID PAD
## Mode :logical Mode :logical Mode :logical
## FALSE:7527 FALSE:1984 FALSE:6505
## TRUE :4947 TRUE :10490 TRUE :5969
##
##
##
To distinguish patients who have a family (genetic) history of disease from those who do not, the two data frames had to be joined and then subset.
# Left-join the family-history table onto the cohort by MRN. all.x = TRUE keeps
# every row of morgan_data; rows with no match in famhist get NA in the
# family-history columns.
famhist_join <- merge(x = morgan_data, y = famhist, by = "MRN", all.x = TRUE)
# Rows with family history: both FAM_MDCL_HIST and FAM_RLTNP must be present.
# Missingness is tested with is.na() instead of comparing against the string
# "NA"; the string comparison only appeared to work because subset() drops rows
# whose condition evaluates to NA.
# NOTE(review): if famhist holds several rows per MRN, the same patient appears
# once per matching row here -- confirm that is intended for the plots.
onlyfamhist <- subset(
  famhist_join,
  !is.na(FAM_MDCL_HIST) & !is.na(FAM_RLTNP),
  select = MRN:FAM_MDCL_HIST
)
# Rows without family history: FAM_MDCL_HIST is missing after the join.
# NOTE(review): rows with FAM_MDCL_HIST present but FAM_RLTNP missing fall into
# neither subset -- confirm whether that is intended.
nofamhist <- subset(
  famhist_join,
  is.na(FAM_MDCL_HIST),
  select = MRN:FAM_MDCL_HIST
)
Now, we can plot histograms for those who do have history versus those who do not. This is done using ggplot2.
# Age histogram (bin width 5) for the rows with family history
ggplot(onlyfamhist, aes(x = AGE)) +
  geom_histogram(
    binwidth = 5,
    colour = "black",
    fill = "turquoise",
    linetype = "solid",
    alpha = 0.2
  ) +
  labs(
    title = "Age Distribution Among HF Patients w/ history",
    x = "Age(yrs)",
    y = "Count"
  )
# Same age histogram for the rows without family history
ggplot(nofamhist, aes(x = AGE)) +
  geom_histogram(
    binwidth = 5,
    colour = "black",
    fill = "turquoise",
    linetype = "solid",
    alpha = 0.2
  ) +
  labs(
    title = "Age Distribution Among HF Patients w/o history",
    x = "Age(yrs)",
    y = "Count"
  )
# PM_5day_avg histogram (bin width 1) for the rows with family history
ggplot(onlyfamhist, aes(x = PM_5day_avg)) +
  geom_histogram(
    binwidth = 1,
    colour = "black",
    fill = "turquoise",
    linetype = "solid",
    alpha = 0.2
  ) +
  labs(
    title = "PM 5-Day Avg Distribution w/ family history",
    x = "PM Exposure",
    y = "Count"
  )
# Same PM_5day_avg histogram for the rows without family history
ggplot(nofamhist, aes(x = PM_5day_avg)) +
  geom_histogram(
    binwidth = 1,
    colour = "black",
    fill = "turquoise",
    linetype = "solid",
    alpha = 0.2
  ) +
  labs(
    title = "PM 5-Day Avg Distribution w/o family history",
    x = "PM Exposure",
    y = "Count"
  )
# N_7day_readmit histogram (bin width 1) for the rows with family history
ggplot(onlyfamhist, aes(x = N_7day_readmit)) +
  geom_histogram(
    binwidth = 1,
    colour = "black",
    fill = "turquoise",
    linetype = "solid",
    alpha = 0.2
  ) +
  labs(
    title = "7 day readmission w/ family history",
    x = "Number of 7 day Admissions",
    y = "Count"
  )
# Same N_7day_readmit histogram for the rows without family history
ggplot(nofamhist, aes(x = N_7day_readmit)) +
  geom_histogram(
    binwidth = 1,
    colour = "black",
    fill = "turquoise",
    linetype = "solid",
    alpha = 0.2
  ) +
  labs(
    title = "7 day readmission w/o family history",
    x = "Number of 7 day Admissions",
    y = "Count"
  )
Then I can compare race to the binary disease variables and the factored variables such as smoking status.
# Stacked bar charts of row counts per RACE, one chart per binary disease flag.
# Each flag is converted to a factor so ggplot treats it as a discrete fill,
# and labs(fill = ...) replaces the default legend title.
# IHD
ggplot(data = morgan_data, aes(x = RACE, fill = as.factor(IHD))) +
  geom_bar() +
  labs(fill = "IHD")
# COPD
ggplot(data = morgan_data, aes(x = RACE, fill = as.factor(COPD))) +
  geom_bar() +
  labs(fill = "COPD")
# PAD (legend title spelling corrected from "Peripherial")
ggplot(data = morgan_data, aes(x = RACE, fill = as.factor(PAD))) +
  geom_bar() +
  labs(fill = "Peripheral Artery Disease")
# T2D
ggplot(data = morgan_data, aes(x = RACE, fill = as.factor(T2D))) +
  geom_bar() +
  labs(fill = "Type 2 Diabetes")
# LIPID
ggplot(data = morgan_data, aes(x = RACE, fill = as.factor(LIPID))) +
  geom_bar() +
  labs(fill = "Dyslipidemia")
# BP_PRI
ggplot(data = morgan_data, aes(x = RACE, fill = as.factor(BP_PRI))) +
  geom_bar() +
  labs(fill = "High Blood Pressure")
# Stacked bar chart of row counts per RACE, filled by HARMONIZED_SMK
ggplot(morgan_data) +
  geom_bar(aes(x = RACE, fill = as.factor(HARMONIZED_SMK))) +
  labs(fill = "smoker status")
# Bar charts of readmission counts, stacked by RACE. The fill expression is kept
# as as.factor(RACE) because it supplies the default legend title.
# 7-day readmissions
ggplot(morgan_data) +
  geom_bar(aes(x = N_7day_readmit, fill = as.factor(RACE)))
# 30-day readmissions
ggplot(morgan_data) +
  geom_bar(aes(x = N_30day_readmit, fill = as.factor(RACE)))
# 60-day readmissions
ggplot(morgan_data) +
  geom_bar(aes(x = N_60day_readmit, fill = as.factor(RACE)))
# 90-day readmissions
ggplot(morgan_data) +
  geom_bar(aes(x = N_90day_readmit, fill = as.factor(RACE)))
Here, I compare PM exposure across races to see whether or not there is a race-based difference.
# Box plots of PM exposure by RACE.
# These are drawn from morgan_data rather than famhist_join: the plots use no
# family-history columns, and famhist_join is a left join that can repeat a
# cohort row once per matching famhist row, which would over-weight those rows.
# This also matches the other race plots, which all use morgan_data.
# Box plot of ANNUAL_AVG_PM by race
ggplot(data = morgan_data, aes(x = RACE, y = ANNUAL_AVG_PM)) +
  geom_boxplot(
    fill = "turquoise",
    alpha = 0.5
  )
# Box plot of PM_5day_avg by race
ggplot(data = morgan_data, aes(x = RACE, y = PM_5day_avg)) +
  geom_boxplot(
    fill = "turquoise",
    alpha = 0.5
  )