# Attach ggplot2 up front: every figure further down is built with ggplot(),
# and nothing else in this script brings it into scope.
library(ggplot2)

# Cohort table used throughout the analysis (columns are listed in the
# summary() output below).
morgan_data <- read.csv("/Volumes/WardCaviness_Cavin_IRB17-0150/Hospitalizations and Family History/Heart Failure/mnarain/morgan_datacsv.csv")

# load("/Volumes/WardCaviness_Cavin_IRB17-0150/Hospitalizations and Family History/Heart Failure/mnarain/morgan_data.RData")
# Family-history table; it is merged onto the cohort by MRN later on.
famhist <- read.csv("/Volumes/WardCaviness_Cavin_IRB17-0150/Hospitalizations and Family History/Heart Failure/FamHistory Data/HF Family History.csv")

# Print per-column summary statistics for the cohort table.
summary(morgan_data)
##        X              MRN                 AGE            SEX_CD         
##  Min.   :    6   Min.   :2.400e+02   Min.   : 20.02   Length:12474      
##  1st Qu.:11988   1st Qu.:6.375e+06   1st Qu.: 58.94   Class :character  
##  Median :19609   Median :1.544e+07   Median : 70.14   Mode  :character  
##  Mean   :19360   Mean   :2.446e+10   Mean   : 69.01                     
##  3rd Qu.:28406   3rd Qu.:4.256e+07   3rd Qu.: 80.85                     
##  Max.   :35714   Max.   :1.001e+11   Max.   :107.16                     
##      RACE           ANNUAL_AVG_PM      visit_year       income      
##  Length:12474       Min.   : 2.785   Min.   :2004   Min.   :  9330  
##  Class :character   1st Qu.: 8.894   1st Qu.:2008   1st Qu.: 36136  
##  Mode  :character   Median : 9.708   Median :2013   Median : 50852  
##                     Mean   :10.162   Mean   :2012   Mean   : 55592  
##                     3rd Qu.:11.503   3rd Qu.:2015   3rd Qu.: 68750  
##                     Max.   :16.628   Max.   :2016   Max.   :233438  
##  med_house_value    pubassist           urban           poverty      
##  Min.   : 10000   Min.   : 0.0000   Min.   :  0.00   Min.   : 0.000  
##  1st Qu.:113400   1st Qu.: 0.0000   1st Qu.: 18.80   1st Qu.: 6.299  
##  Median :158900   Median : 0.7353   Median : 91.15   Median :13.008  
##  Mean   :193088   Mean   : 1.9649   Mean   : 64.87   Mean   :16.603  
##  3rd Qu.:236000   3rd Qu.: 2.9412   3rd Qu.:100.00   3rd Qu.:23.402  
##  Max.   :976000   Max.   :28.2946   Max.   :100.00   Max.   :82.222  
##  N_30day_readmit    N_7day_readmit     N_60day_readmit   N_90day_readmit 
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.:  0.00  
##  Median :  0.0000   Median :  0.0000   Median :  0.000   Median :  0.00  
##  Mean   :  0.5754   Mean   :  0.1419   Mean   :  0.858   Mean   :  1.02  
##  3rd Qu.:  1.0000   3rd Qu.:  0.0000   3rd Qu.:  1.000   3rd Qu.:  1.00  
##  Max.   :108.0000   Max.   :105.0000   Max.   :110.000   Max.   :110.00  
##      log_FU        ALL_CAUSE        PM_5day_avg      HARMONIZED_SMK    
##  Min.   :-5.8999   Mode :logical   Min.   : 0.4737   Length:12474      
##  1st Qu.:-0.5480   FALSE:8124      1st Qu.: 7.2912   Class :character  
##  Median : 0.6029   TRUE :4350      Median : 9.2761   Mode  :character  
##  Mean   : 0.2801                   Mean   : 9.9379                     
##  3rd Qu.: 1.5091                   3rd Qu.:11.5844                     
##  Max.   : 2.5245                   Max.   :46.7985                     
##     CKD             IHD            BP_PRI           COPD        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:3969      FALSE:3824      FALSE:2199      FALSE:6544     
##  TRUE :8505      TRUE :8650      TRUE :10275     TRUE :5930     
##                                                                 
##                                                                 
##                                                                 
##     T2D            LIPID            PAD         
##  Mode :logical   Mode :logical   Mode :logical  
##  FALSE:7527      FALSE:1984      FALSE:6505     
##  TRUE :4947      TRUE :10490     TRUE :5969     
##                                                 
##                                                 
## 

In order to compare the patients who have a family (genetic) history of disease with those who do not, the two data frames had to be joined and then subset.

# Left-join the family-history records onto the cohort by MRN. all.x = TRUE
# keeps every row of morgan_data; rows without a match in famhist get NA in
# the family-history columns.
# NOTE(review): if famhist holds several rows per MRN, this join repeats the
# matching morgan_data rows once per family-history record -- confirm whether
# downstream counts should be de-duplicated.
famhist_join <- merge(x = morgan_data, y = famhist, by = "MRN", all.x = TRUE)

# Rows with a recorded family history: both the condition and the relationship
# must be present. Missingness is tested with is.na(); comparing against the
# string "NA" only appeared to work because subset() silently drops rows whose
# condition evaluates to NA.
onlyfamhist <- subset(
  famhist_join,
  !is.na(FAM_MDCL_HIST) & !is.na(FAM_RLTNP),
  select = MRN:FAM_MDCL_HIST
)

# Rows with no recorded family-history condition.
nofamhist <- subset(
  famhist_join,
  is.na(FAM_MDCL_HIST),
  select = MRN:FAM_MDCL_HIST
)

Now, we can plot histograms for those who do have history versus those who do not. This is done using ggplot2.

# Draw one histogram with the styling shared by every with/without
# family-history comparison in this section.
#
#   data      data frame to plot
#   column    name (string) of the column mapped to the x axis
#   binwidth  width of each histogram bin, in the units of `column`
#   title     plot title
#   x_label   x-axis label
#
# Returns the ggplot object; calling it at top level prints the figure.
plot_history_histogram <- function(data, column, binwidth, title, x_label) {
  ggplot(data = data) +
    geom_histogram(
      # .data[[column]] looks the column up by its string name.
      aes(x = .data[[column]]),
      binwidth = binwidth,
      color = "black",
      fill = "turquoise",
      linetype = "solid",
      alpha = 0.2
    ) +
    labs(title = title, x = x_label, y = "Count")
}

# age histogram of those with family history
plot_history_histogram(
  onlyfamhist, "AGE", 5,
  title = "Age Distribution Among HF Patients w/ history",
  x_label = "Age(yrs)"
)

# age histogram of those without family history
plot_history_histogram(
  nofamhist, "AGE", 5,
  title = "Age Distribution Among HF Patients w/o history",
  x_label = "Age(yrs)"
)

# histogram of PM exposure with family history
plot_history_histogram(
  onlyfamhist, "PM_5day_avg", 1,
  title = "PM 5-Day Avg Distribution w/ family history",
  x_label = "PM Exposure"
)

# histogram of PM exposure without family history
plot_history_histogram(
  nofamhist, "PM_5day_avg", 1,
  title = "PM 5-Day Avg Distribution w/o family history",
  x_label = "PM Exposure"
)

# histogram of 7 day readmission with family history
plot_history_histogram(
  onlyfamhist, "N_7day_readmit", 1,
  title = "7 day readmission w/ family history",
  x_label = "Number of 7 day Admissions"
)

# histogram of 7 day readmission w/o family history
plot_history_histogram(
  nofamhist, "N_7day_readmit", 1,
  title = "7 day readmission w/o family history",
  x_label = "Number of 7 day Admissions"
)

Then I can compare race to the binary disease variables and the factored variables such as smoking status.

# Stacked bar chart of row counts per RACE, with each bar split by the values
# of one other column.
#
#   data          data frame containing RACE and `column`
#   column        name (string) of the column used for the fill colour
#   legend_title  title shown on the fill legend
#
# Returns the ggplot object; calling it at top level prints the figure.
plot_race_breakdown <- function(data, column, legend_title) {
  # .data[[column]] looks the fill column up by its string name.
  ggplot(data = data, aes(x = RACE, fill = as.factor(.data[[column]]))) +
    geom_bar() +
    labs(fill = legend_title)
}

# bar graph of race vs binary disease variables
plot_race_breakdown(morgan_data, "IHD", "IHD")

plot_race_breakdown(morgan_data, "COPD", "COPD")

# Legend title spelling corrected (was "Peripherial").
plot_race_breakdown(morgan_data, "PAD", "Peripheral Artery Disease")

plot_race_breakdown(morgan_data, "T2D", "Type 2 Diabetes")

plot_race_breakdown(morgan_data, "LIPID", "Dyslipidemia")

plot_race_breakdown(morgan_data, "BP_PRI", "High Blood Pressure")

# race vs smoking status
plot_race_breakdown(morgan_data, "HARMONIZED_SMK", "smoker status")

# Bar charts of readmission counts, one per follow-up window. Each bar is the
# number of rows with that readmission count, stacked by race.

# 7-day window
ggplot(morgan_data) +
  geom_bar(aes(x = N_7day_readmit, fill = as.factor(RACE)))

# 30-day window
ggplot(morgan_data) +
  geom_bar(aes(x = N_30day_readmit, fill = as.factor(RACE)))

# 60-day window
ggplot(morgan_data) +
  geom_bar(aes(x = N_60day_readmit, fill = as.factor(RACE)))

# 90-day window
ggplot(morgan_data) +
  geom_bar(aes(x = N_90day_readmit, fill = as.factor(RACE)))

Here, I compare PM exposure to race to see whether or not there is a race-based difference.

# PM exposure by race.
# These plots read morgan_data directly, like the other race plots in this
# script. RACE, ANNUAL_AVG_PM and PM_5day_avg are all morgan_data columns, and
# the joined table (famhist_join) may repeat a cohort row once per matching
# family-history record, which would over-weight those rows in the box plots.

# box plot of annual pm exposure vs race
ggplot(data = morgan_data, aes(x = RACE, y = ANNUAL_AVG_PM)) +
  geom_boxplot(fill = "turquoise", alpha = 0.5)

# box plot of 5 day avg pm exposure by race
ggplot(data = morgan_data, aes(x = RACE, y = PM_5day_avg)) +
  geom_boxplot(fill = "turquoise", alpha = 0.5)