library(tidyverse)
library(knitr)
library(psych)
library(ggplot2)
library(plotly)
# Load the diabetes encounter data from the course repository.
# stringsAsFactors = TRUE is set explicitly: read.csv() stopped converting
# strings to factors by default in R 4.0.0, and the per-level counts printed by
# summary() further down rely on the text columns being factors.
# NOTE(review): missing values show up as a literal "?" level (see race, weight
# and payer_code in the summary output); they are not converted to NA here.
patient.data.df <- read.csv(
  "https://raw.githubusercontent.com/niteen11/CUNY_DATA_698/master/dataset_diabetes/diabetic_data.csv",
  stringsAsFactors = TRUE
)
kable(head(patient.data.df))
encounter_id patient_nbr race gender age weight admission_type_id discharge_disposition_id admission_source_id time_in_hospital payer_code medical_specialty num_lab_procedures num_procedures num_medications number_outpatient number_emergency number_inpatient diag_1 diag_2 diag_3 number_diagnoses max_glu_serum A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone acarbose miglitol troglitazone tolazamide examide citoglipton insulin glyburide.metformin glipizide.metformin glimepiride.pioglitazone metformin.rosiglitazone metformin.pioglitazone change diabetesMed readmitted
2278392 8222157 Caucasian Female [0-10) ? 6 25 1 1 ? Pediatrics-Endocrinology 41 0 1 0 0 0 250.83 ? ? 1 None None No No No No No No No No No No No No No No No No No No No No No No No No No NO
149190 55629189 Caucasian Female [10-20) ? 1 1 7 3 ? ? 59 0 18 0 0 0 276 250.01 255 9 None None No No No No No No No No No No No No No No No No No Up No No No No No Ch Yes >30
64410 86047875 AfricanAmerican Female [20-30) ? 1 1 7 2 ? ? 11 5 13 2 0 1 648 250 V27 6 None None No No No No No No Steady No No No No No No No No No No No No No No No No No Yes NO
500364 82442376 Caucasian Male [30-40) ? 1 1 7 2 ? ? 44 1 16 0 0 0 8 250.43 403 7 None None No No No No No No No No No No No No No No No No No Up No No No No No Ch Yes NO
16680 42519267 Caucasian Male [40-50) ? 1 1 7 1 ? ? 51 0 8 0 0 0 197 157 250 5 None None No No No No No No Steady No No No No No No No No No No Steady No No No No No Ch Yes NO
35754 82637451 Caucasian Male [50-60) ? 2 1 2 3 ? ? 31 6 16 0 0 0 414 411 250 9 None None No No No No No No No No No No No No No No No No No Steady No No No No No No Yes >30
# Row and column counts of the raw data frame.
patient.data.df %>% dim()
## [1] 101766     50
# Summary of the gender column on its own.
summary(patient.data.df[["gender"]])
##          Female            Male Unknown/Invalid 
##           54708           47055               3
# Column-by-column summary of the whole data frame.
patient.data.df %>% summary()
##   encounter_id        patient_nbr                     race      
##  Min.   :    12522   Min.   :      135   ?              : 2273  
##  1st Qu.: 84961194   1st Qu.: 23413221   AfricanAmerican:19210  
##  Median :152388987   Median : 45505143   Asian          :  641  
##  Mean   :165201646   Mean   : 54330401   Caucasian      :76099  
##  3rd Qu.:230270888   3rd Qu.: 87545950   Hispanic       : 2037  
##  Max.   :443867222   Max.   :189502619   Other          : 1506  
##                                                                 
##              gender           age              weight     
##  Female         :54708   [70-80):26068   ?        :98569  
##  Male           :47055   [60-70):22483   [75-100) : 1336  
##  Unknown/Invalid:    3   [50-60):17256   [50-75)  :  897  
##                          [80-90):17197   [100-125):  625  
##                          [40-50): 9685   [125-150):  145  
##                          [30-40): 3775   [25-50)  :   97  
##                          (Other): 5302   (Other)  :   97  
##  admission_type_id discharge_disposition_id admission_source_id
##  Min.   :1.000     Min.   : 1.000           Min.   : 1.000     
##  1st Qu.:1.000     1st Qu.: 1.000           1st Qu.: 1.000     
##  Median :1.000     Median : 1.000           Median : 7.000     
##  Mean   :2.024     Mean   : 3.716           Mean   : 5.754     
##  3rd Qu.:3.000     3rd Qu.: 4.000           3rd Qu.: 7.000     
##  Max.   :8.000     Max.   :28.000           Max.   :25.000     
##                                                                
##  time_in_hospital   payer_code                 medical_specialty
##  Min.   : 1.000   ?      :40256   ?                     :49949  
##  1st Qu.: 2.000   MC     :32439   InternalMedicine      :14635  
##  Median : 4.000   HM     : 6274   Emergency/Trauma      : 7565  
##  Mean   : 4.396   SP     : 5007   Family/GeneralPractice: 7440  
##  3rd Qu.: 6.000   BC     : 4655   Cardiology            : 5352  
##  Max.   :14.000   MD     : 3532   Surgery-General       : 3099  
##                   (Other): 9603   (Other)               :13726  
##  num_lab_procedures num_procedures num_medications number_outpatient
##  Min.   :  1.0      Min.   :0.00   Min.   : 1.00   Min.   : 0.0000  
##  1st Qu.: 31.0      1st Qu.:0.00   1st Qu.:10.00   1st Qu.: 0.0000  
##  Median : 44.0      Median :1.00   Median :15.00   Median : 0.0000  
##  Mean   : 43.1      Mean   :1.34   Mean   :16.02   Mean   : 0.3694  
##  3rd Qu.: 57.0      3rd Qu.:2.00   3rd Qu.:20.00   3rd Qu.: 0.0000  
##  Max.   :132.0      Max.   :6.00   Max.   :81.00   Max.   :42.0000  
##                                                                     
##  number_emergency  number_inpatient      diag_1          diag_2     
##  Min.   : 0.0000   Min.   : 0.0000   428    : 6862   276    : 6752  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   414    : 6581   428    : 6662  
##  Median : 0.0000   Median : 0.0000   786    : 4016   250    : 6071  
##  Mean   : 0.1978   Mean   : 0.6356   410    : 3614   427    : 5036  
##  3rd Qu.: 0.0000   3rd Qu.: 1.0000   486    : 3508   401    : 3736  
##  Max.   :76.0000   Max.   :21.0000   427    : 2766   496    : 3305  
##                                      (Other):74419   (Other):70204  
##      diag_3      number_diagnoses max_glu_serum A1Cresult   
##  250    :11555   Min.   : 1.000   >200: 1485    >7  : 3812  
##  401    : 8289   1st Qu.: 6.000   >300: 1264    >8  : 8216  
##  276    : 5175   Median : 8.000   None:96420    None:84748  
##  428    : 4577   Mean   : 7.423   Norm: 2597    Norm: 4990  
##  427    : 3955   3rd Qu.: 9.000                             
##  414    : 3664   Max.   :16.000                             
##  (Other):64551                                              
##   metformin     repaglinide     nateglinide     chlorpropamide 
##  Down  :  575   Down  :    45   Down  :    11   Down  :     1  
##  No    :81778   No    :100227   No    :101063   No    :101680  
##  Steady:18346   Steady:  1384   Steady:   668   Steady:    79  
##  Up    : 1067   Up    :   110   Up    :    24   Up    :     6  
##                                                                
##                                                                
##                                                                
##  glimepiride    acetohexamide    glipizide      glyburide    
##  Down  :  194   No    :101765   Down  :  560   Down  :  564  
##  No    :96575   Steady:     1   No    :89080   No    :91116  
##  Steady: 4670                   Steady:11356   Steady: 9274  
##  Up    :  327                   Up    :  770   Up    :  812  
##                                                              
##                                                              
##                                                              
##  tolbutamide     pioglitazone   rosiglitazone    acarbose     
##  No    :101743   Down  :  118   Down  :   87   Down  :     3  
##  Steady:    23   No    :94438   No    :95401   No    :101458  
##                  Steady: 6976   Steady: 6100   Steady:   295  
##                  Up    :  234   Up    :  178   Up    :    10  
##                                                               
##                                                               
##                                                               
##    miglitol      troglitazone     tolazamide     examide     citoglipton
##  Down  :     5   No    :101763   No    :101727   No:101766   No:101766  
##  No    :101728   Steady:     3   Steady:    38                          
##  Steady:    31                   Up    :     1                          
##  Up    :     2                                                          
##                                                                         
##                                                                         
##                                                                         
##    insulin      glyburide.metformin glipizide.metformin
##  Down  :12218   Down  :     6       No    :101753      
##  No    :47383   No    :101060       Steady:    13      
##  Steady:30849   Steady:   692                          
##  Up    :11316   Up    :     8                          
##                                                        
##                                                        
##                                                        
##  glimepiride.pioglitazone metformin.rosiglitazone metformin.pioglitazone
##  No    :101765            No    :101764           No    :101765         
##  Steady:     1            Steady:     2           Steady:     1         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  change     diabetesMed readmitted 
##  Ch:47011   No :23403   <30:11357  
##  No:54755   Yes:78363   >30:35545  
##                         NO :54864  
##                                    
##                                    
##                                    
## 
# List the column names of the data frame.
names(patient.data.df)
##  [1] "encounter_id"             "patient_nbr"             
##  [3] "race"                     "gender"                  
##  [5] "age"                      "weight"                  
##  [7] "admission_type_id"        "discharge_disposition_id"
##  [9] "admission_source_id"      "time_in_hospital"        
## [11] "payer_code"               "medical_specialty"       
## [13] "num_lab_procedures"       "num_procedures"          
## [15] "num_medications"          "number_outpatient"       
## [17] "number_emergency"         "number_inpatient"        
## [19] "diag_1"                   "diag_2"                  
## [21] "diag_3"                   "number_diagnoses"        
## [23] "max_glu_serum"            "A1Cresult"               
## [25] "metformin"                "repaglinide"             
## [27] "nateglinide"              "chlorpropamide"          
## [29] "glimepiride"              "acetohexamide"           
## [31] "glipizide"                "glyburide"               
## [33] "tolbutamide"              "pioglitazone"            
## [35] "rosiglitazone"            "acarbose"                
## [37] "miglitol"                 "troglitazone"            
## [39] "tolazamide"               "examide"                 
## [41] "citoglipton"              "insulin"                 
## [43] "glyburide.metformin"      "glipizide.metformin"     
## [45] "glimepiride.pioglitazone" "metformin.rosiglitazone" 
## [47] "metformin.pioglitazone"   "change"                  
## [49] "diabetesMed"              "readmitted"
# Mean time in hospital per race, with races ordered from the longest to the
# shortest mean stay (reorder() uses the mean of its second argument).
# Columns are referenced by bare name inside aes(): ggplot2 evaluates
# aesthetics within `data`, whereas `df$col` bypasses that and also ends up as
# the legend title.
# `fun` replaces `fun.y`, which was deprecated in ggplot2 3.3.0.
p <- ggplot(
  data = patient.data.df,
  aes(
    x = reorder(race, -time_in_hospital),
    y = time_in_hospital,
    fill = race
  )
) +
  geom_bar(stat = "summary", fun = "mean") +
  ylab("Time in Hospital") +
  xlab("Race") +
  ggtitle("Time in Hospital Vs. Race")

# Convert the static ggplot into an interactive plotly object.
p <- ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Auto-print the plotly object held in `p` so that it is rendered.
p
# Smoothed trends of the number of procedures and the number of medications
# against time in hospital, drawn as two coloured lines on one panel.
# `se` and `show.legend` are geom_smooth() arguments, not aesthetics; placed
# inside aes() they were ignored, so the confidence band was still drawn.
ggplot(data = patient.data.df, aes(x = time_in_hospital)) +
  geom_smooth(
    aes(y = num_procedures, colour = "num of procedures"),
    se = FALSE, show.legend = TRUE
  ) +
  geom_smooth(
    aes(y = num_medications, colour = "num of medications"),
    se = FALSE, show.legend = TRUE
  ) +
  # time_in_hospital is numeric (1 to 14 in the summary output): use a
  # continuous axis with one tick per observed value instead of a discrete
  # scale that was handed the raw, duplicated column as its limits.
  scale_x_continuous(breaks = sort(unique(patient.data.df$time_in_hospital))) +
  # Name the colours so the mapping does not depend on label sort order.
  scale_colour_manual(
    name = "legend",
    values = c("num of medications" = "blue", "num of procedures" = "red")
  ) +
  ylab("Number of Procedures") +
  xlab("Time in Hospital") +
  ggtitle("Time in Hospital Vs. Number of Procedures and Medications")

# Research Question 1 ----
# Narrow the raw data to the seven columns this question works with.
patient.rq1 <- patient.data.df %>%
  select(
    age, gender, race, admission_source_id,
    time_in_hospital, change, readmitted
  )
patient.rq1 %>% head()
##       age gender            race admission_source_id time_in_hospital
## 1  [0-10) Female       Caucasian                   1                1
## 2 [10-20) Female       Caucasian                   7                3
## 3 [20-30) Female AfricanAmerican                   7                2
## 4 [30-40)   Male       Caucasian                   7                2
## 5 [40-50)   Male       Caucasian                   7                1
## 6 [50-60)   Male       Caucasian                   2                3
##   change readmitted
## 1     No         NO
## 2     Ch        >30
## 3     No         NO
## 4     Ch         NO
## 5     Ch         NO
## 6     No        >30
# Rows whose `change` value is "No".
patient.med.no.change <- filter(patient.rq1, change == "No")
patient.med.no.change %>% head()
##       age gender            race admission_source_id time_in_hospital
## 1  [0-10) Female       Caucasian                   1                1
## 2 [20-30) Female AfricanAmerican                   7                2
## 3 [50-60)   Male       Caucasian                   2                3
## 4 [70-80)   Male       Caucasian                   7                5
## 5 [40-50) Female AfricanAmerican                   7                9
## 6 [80-90)   Male       Caucasian                   7               10
##   change readmitted
## 1     No         NO
## 2     No         NO
## 3     No        >30
## 4     No        >30
## 5     No        >30
## 6     No         NO
# Rows whose `change` value is anything other than "No".
patient.med.change <- filter(patient.rq1, change != "No")
patient.med.change %>%
  head() %>%
  kable()
age gender race admission_source_id time_in_hospital change readmitted
[10-20) Female Caucasian 7 3 Ch >30
[30-40) Male Caucasian 7 2 Ch NO
[40-50) Male Caucasian 7 1 Ch NO
[60-70) Male Caucasian 2 4 Ch NO
[80-90) Female Caucasian 4 13 Ch NO
[90-100) Female Caucasian 4 12 Ch NO
# Descriptive statistics for every RQ1 column, rendered as a table.
patient.rq1 %>%
  describe() %>%
  kable()
vars n mean sd median trimmed mad min max range skew kurtosis se
age* 1 101766 7.096702 1.5940838 7 7.207016 1.4826 1 10 9 -0.6305202 0.2812491 0.0049970
gender* 2 101766 1.462443 0.4986491 1 1.453018 0.0000 1 3 2 0.1513636 -1.9752019 0.0015631
race* 3 101766 3.598776 0.9384152 4 3.714361 0.0000 1 6 5 -1.0364629 0.6639945 0.0029417
admission_source_id 4 101766 5.754437 4.0640808 7 5.327843 0.0000 1 25 24 1.0299045 1.7447514 0.0127398
time_in_hospital 5 101766 4.395987 2.9851078 4 3.993859 2.9652 1 14 13 1.1339653 0.8500744 0.0093575
change* 6 101766 1.538048 0.4985527 2 1.547559 0.0000 1 2 1 -0.1526326 -1.9767227 0.0015628
readmitted* 7 101766 2.427520 0.6840684 3 2.534392 0.0000 1 3 2 -0.7834405 -0.5659045 0.0021444
# Descriptive statistics for the no-medication-change subset.
patient.med.no.change %>%
  describe() %>%
  kable()
vars n mean sd median trimmed mad min max range skew kurtosis se
age* 1 54755 7.149904 1.6149143 7 7.270540 1.4826 1 10 9 -0.6856491 0.4071192 0.0069014
gender* 2 54755 1.455557 0.4980621 1 1.444424 0.0000 1 3 2 0.1789188 -1.9668371 0.0021285
race* 3 54755 3.590667 0.9316863 4 3.709348 0.0000 1 6 5 -1.0655339 0.6064163 0.0039816
admission_source_id 4 54755 5.745265 4.0458276 7 5.324872 0.0000 1 22 21 1.0371237 1.7822736 0.0172900
time_in_hospital 5 54755 4.094019 2.8735766 3 3.676110 2.9652 1 14 13 1.2431397 1.2370554 0.0122804
change* 6 54755 2.000000 0.0000000 2 2.000000 0.0000 2 2 0 NaN NaN 0.0000000
readmitted* 7 54755 2.454461 0.6780493 3 2.568063 0.0000 1 3 2 -0.8539526 -0.4555402 0.0028977
# Descriptive statistics for the medication-change subset.
patient.med.change %>%
  describe() %>%
  kable()
vars n mean sd median trimmed mad min max range skew kurtosis se
age* 1 47011 7.034737 1.5672154 7 7.133027 1.4826 1 10 9 -0.5698760 0.1393703 0.0072282
gender* 2 47011 1.470464 0.4992174 1 1.463027 0.0000 1 3 2 0.1193713 -1.9830541 0.0023024
race* 3 47011 3.608219 0.9461145 4 3.720200 0.0000 1 6 5 -1.0050952 0.7248451 0.0043636
admission_source_id 4 47011 5.765119 4.0852551 7 5.331304 0.0000 1 25 24 1.0214967 1.7014673 0.0188417
time_in_hospital 5 47011 4.747697 3.0728184 4 4.379324 2.9652 1 14 13 1.0228509 0.5077545 0.0141722
change* 6 47011 1.000000 0.0000000 1 1.000000 0.0000 1 1 0 NaN NaN 0.0000000
readmitted* 7 47011 2.396141 0.6896947 3 2.495174 0.0000 1 3 2 -0.7050878 -0.6725262 0.0031810
# Readmission counts for rows with change == "No": bars are stacked by gender,
# one facet per race, and each bar carries a text label with its total count.
# after_stat(count) replaces the `..count..` notation, which was deprecated in
# ggplot2 3.4.0.
p <- ggplot(data = filter(patient.rq1, change == "No"), aes(readmitted)) +
  geom_bar(aes(fill = gender)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.2) +
  xlab("Readmission time range ") +
  facet_wrap(~race) +
  ggtitle("Readmission for No Medication Changes") +
  theme_bw()
# Convert the static ggplot into an interactive plotly object.
p <- ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Auto-print the plotly object held in `p` so that it is rendered.
p
# Bar chart of how many rows fall into each `change` category.
barplot(
  height = table(patient.rq1[["change"]]),
  main = "Readmission - Patient Medication ",
  col = c("#ADD8E6", "#ff7373")
)

# Mosaic plot of the change x readmitted cross-tabulation.
mosaicplot(
  ~ change + readmitted,
  data = patient.rq1,
  main = "medication change Vs Readmission",
  color = c("#ADD8E6", "#ff7373")
)

# Density-scaled histogram of time_in_hospital with a normal curve (same mean
# and standard deviation as the data) overlaid for comparison.
# The title previously read "Histogram of Readmissions" although the plotted
# variable is time_in_hospital.
hist(
  patient.rq1$time_in_hospital,
  probability = TRUE,
  main = "Histogram of Time in Hospital",
  xlab = "Time in Hospital"
)
# Evaluate the normal density on the integer grid 1..15 and draw it in blue.
x <- 1:15
y <- dnorm(
  x = x,
  mean = mean(patient.rq1$time_in_hospital),
  sd = sd(patient.rq1$time_in_hospital)
)
lines(x = x, y = y, col = "blue")

# Three boxplots side by side. par() returns the settings it replaces; they are
# restored afterwards so that later plots are not drawn into the 1x3 layout.
old_par <- par(mfrow = c(1, 3))
boxplot(patient.data.df$admission_source_id, main = "Admission Source ID", col = "#ADD8E6")
boxplot(patient.data.df$time_in_hospital, main = "Time in Hospital", col = "#ff7373")
boxplot(patient.data.df$num_medications, main = "Number of Medications", col = "#98FB98")
par(old_par)

# Normal Q-Q plot of time_in_hospital. The reference line has to be computed
# from the same variable as the points; it previously used number_diagnoses.
qqnorm(patient.data.df$time_in_hospital)
qqline(patient.data.df$time_in_hospital)