library(tidyverse)
library(knitr)
library(psych)
library(ggplot2)
library(plotly)
# Load the diabetes encounter data from the course repository and preview the
# first rows as a table.
# stringsAsFactors = TRUE is set explicitly: the summaries further down report
# per-level counts (e.g. for `gender`), which requires factor columns. R >= 4.0
# reads strings as character by default, so relying on the default makes the
# results depend on the R version.
patient.data.df <- read.csv(
  "https://raw.githubusercontent.com/niteen11/CUNY_DATA_698/master/dataset_diabetes/diabetic_data.csv",
  stringsAsFactors = TRUE
)
kable(head(patient.data.df))
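| encounter_id | patient_nbr | race | gender | age | weight | admission_type_id | discharge_disposition_id | admission_source_id | time_in_hospital | payer_code | medical_specialty | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | diag_1 | diag_2 | diag_3 | number_diagnoses | max_glu_serum | A1Cresult | metformin | repaglinide | nateglinide | chlorpropamide | glimepiride | acetohexamide | glipizide | glyburide | tolbutamide | pioglitazone | rosiglitazone | acarbose | miglitol | troglitazone | tolazamide | examide | citoglipton | insulin | glyburide.metformin | glipizide.metformin | glimepiride.pioglitazone | metformin.rosiglitazone | metformin.pioglitazone | change | diabetesMed | readmitted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2278392 | 8222157 | Caucasian | Female | [0-10) | ? | 6 | 25 | 1 | 1 | ? | Pediatrics-Endocrinology | 41 | 0 | 1 | 0 | 0 | 0 | 250.83 | ? | ? | 1 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | NO |
| 149190 | 55629189 | Caucasian | Female | [10-20) | ? | 1 | 1 | 7 | 3 | ? | ? | 59 | 0 | 18 | 0 | 0 | 0 | 276 | 250.01 | 255 | 9 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Up | No | No | No | No | No | Ch | Yes | >30 |
| 64410 | 86047875 | AfricanAmerican | Female | [20-30) | ? | 1 | 1 | 7 | 2 | ? | ? | 11 | 5 | 13 | 2 | 0 | 1 | 648 | 250 | V27 | 6 | None | None | No | No | No | No | No | No | Steady | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Yes | NO |
| 500364 | 82442376 | Caucasian | Male | [30-40) | ? | 1 | 1 | 7 | 2 | ? | ? | 44 | 1 | 16 | 0 | 0 | 0 | 8 | 250.43 | 403 | 7 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Up | No | No | No | No | No | Ch | Yes | NO |
| 16680 | 42519267 | Caucasian | Male | [40-50) | ? | 1 | 1 | 7 | 1 | ? | ? | 51 | 0 | 8 | 0 | 0 | 0 | 197 | 157 | 250 | 5 | None | None | No | No | No | No | No | No | Steady | No | No | No | No | No | No | No | No | No | No | Steady | No | No | No | No | No | Ch | Yes | NO |
| 35754 | 82637451 | Caucasian | Male | [50-60) | ? | 2 | 1 | 2 | 3 | ? | ? | 31 | 6 | 16 | 0 | 0 | 0 | 414 | 411 | 250 | 9 | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Steady | No | No | No | No | No | No | Yes | >30 |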
# Report the number of rows and columns in the loaded data frame.
patient.data.df %>% dim()
## [1] 101766 50
# Summarise the `gender` column on its own.
patient.data.df[["gender"]] %>% summary()
## Female Male Unknown/Invalid
## 54708 47055 3
# Per-column summary of the whole data frame.
patient.data.df %>% summary()
## encounter_id patient_nbr race
## Min. : 12522 Min. : 135 ? : 2273
## 1st Qu.: 84961194 1st Qu.: 23413221 AfricanAmerican:19210
## Median :152388987 Median : 45505143 Asian : 641
## Mean :165201646 Mean : 54330401 Caucasian :76099
## 3rd Qu.:230270888 3rd Qu.: 87545950 Hispanic : 2037
## Max. :443867222 Max. :189502619 Other : 1506
##
## gender age weight
## Female :54708 [70-80):26068 ? :98569
## Male :47055 [60-70):22483 [75-100) : 1336
## Unknown/Invalid: 3 [50-60):17256 [50-75) : 897
## [80-90):17197 [100-125): 625
## [40-50): 9685 [125-150): 145
## [30-40): 3775 [25-50) : 97
## (Other): 5302 (Other) : 97
## admission_type_id discharge_disposition_id admission_source_id
## Min. :1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median :1.000 Median : 1.000 Median : 7.000
## Mean :2.024 Mean : 3.716 Mean : 5.754
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.: 7.000
## Max. :8.000 Max. :28.000 Max. :25.000
##
## time_in_hospital payer_code medical_specialty
## Min. : 1.000 ? :40256 ? :49949
## 1st Qu.: 2.000 MC :32439 InternalMedicine :14635
## Median : 4.000 HM : 6274 Emergency/Trauma : 7565
## Mean : 4.396 SP : 5007 Family/GeneralPractice: 7440
## 3rd Qu.: 6.000 BC : 4655 Cardiology : 5352
## Max. :14.000 MD : 3532 Surgery-General : 3099
## (Other): 9603 (Other) :13726
## num_lab_procedures num_procedures num_medications number_outpatient
## Min. : 1.0 Min. :0.00 Min. : 1.00 Min. : 0.0000
## 1st Qu.: 31.0 1st Qu.:0.00 1st Qu.:10.00 1st Qu.: 0.0000
## Median : 44.0 Median :1.00 Median :15.00 Median : 0.0000
## Mean : 43.1 Mean :1.34 Mean :16.02 Mean : 0.3694
## 3rd Qu.: 57.0 3rd Qu.:2.00 3rd Qu.:20.00 3rd Qu.: 0.0000
## Max. :132.0 Max. :6.00 Max. :81.00 Max. :42.0000
##
## number_emergency number_inpatient diag_1 diag_2
## Min. : 0.0000 Min. : 0.0000 428 : 6862 276 : 6752
## 1st Qu.: 0.0000 1st Qu.: 0.0000 414 : 6581 428 : 6662
## Median : 0.0000 Median : 0.0000 786 : 4016 250 : 6071
## Mean : 0.1978 Mean : 0.6356 410 : 3614 427 : 5036
## 3rd Qu.: 0.0000 3rd Qu.: 1.0000 486 : 3508 401 : 3736
## Max. :76.0000 Max. :21.0000 427 : 2766 496 : 3305
## (Other):74419 (Other):70204
## diag_3 number_diagnoses max_glu_serum A1Cresult
## 250 :11555 Min. : 1.000 >200: 1485 >7 : 3812
## 401 : 8289 1st Qu.: 6.000 >300: 1264 >8 : 8216
## 276 : 5175 Median : 8.000 None:96420 None:84748
## 428 : 4577 Mean : 7.423 Norm: 2597 Norm: 4990
## 427 : 3955 3rd Qu.: 9.000
## 414 : 3664 Max. :16.000
## (Other):64551
## metformin repaglinide nateglinide chlorpropamide
## Down : 575 Down : 45 Down : 11 Down : 1
## No :81778 No :100227 No :101063 No :101680
## Steady:18346 Steady: 1384 Steady: 668 Steady: 79
## Up : 1067 Up : 110 Up : 24 Up : 6
##
##
##
## glimepiride acetohexamide glipizide glyburide
## Down : 194 No :101765 Down : 560 Down : 564
## No :96575 Steady: 1 No :89080 No :91116
## Steady: 4670 Steady:11356 Steady: 9274
## Up : 327 Up : 770 Up : 812
##
##
##
## tolbutamide pioglitazone rosiglitazone acarbose
## No :101743 Down : 118 Down : 87 Down : 3
## Steady: 23 No :94438 No :95401 No :101458
## Steady: 6976 Steady: 6100 Steady: 295
## Up : 234 Up : 178 Up : 10
##
##
##
## miglitol troglitazone tolazamide examide citoglipton
## Down : 5 No :101763 No :101727 No:101766 No:101766
## No :101728 Steady: 3 Steady: 38
## Steady: 31 Up : 1
## Up : 2
##
##
##
## insulin glyburide.metformin glipizide.metformin
## Down :12218 Down : 6 No :101753
## No :47383 No :101060 Steady: 13
## Steady:30849 Steady: 692
## Up :11316 Up : 8
##
##
##
## glimepiride.pioglitazone metformin.rosiglitazone metformin.pioglitazone
## No :101765 No :101764 No :101765
## Steady: 1 Steady: 2 Steady: 1
##
##
##
##
##
## change diabetesMed readmitted
## Ch:47011 No :23403 <30:11357
## No:54755 Yes:78363 >30:35545
## NO :54864
##
##
##
##
# List the column names of the data frame.
names(patient.data.df)
## [1] "encounter_id" "patient_nbr"
## [3] "race" "gender"
## [5] "age" "weight"
## [7] "admission_type_id" "discharge_disposition_id"
## [9] "admission_source_id" "time_in_hospital"
## [11] "payer_code" "medical_specialty"
## [13] "num_lab_procedures" "num_procedures"
## [15] "num_medications" "number_outpatient"
## [17] "number_emergency" "number_inpatient"
## [19] "diag_1" "diag_2"
## [21] "diag_3" "number_diagnoses"
## [23] "max_glu_serum" "A1Cresult"
## [25] "metformin" "repaglinide"
## [27] "nateglinide" "chlorpropamide"
## [29] "glimepiride" "acetohexamide"
## [31] "glipizide" "glyburide"
## [33] "tolbutamide" "pioglitazone"
## [35] "rosiglitazone" "acarbose"
## [37] "miglitol" "troglitazone"
## [39] "tolazamide" "examide"
## [41] "citoglipton" "insulin"
## [43] "glyburide.metformin" "glipizide.metformin"
## [45] "glimepiride.pioglitazone" "metformin.rosiglitazone"
## [47] "metformin.pioglitazone" "change"
## [49] "diabetesMed" "readmitted"
# Mean time in hospital per race, drawn as bars ordered from the longest to the
# shortest average stay, then converted to an interactive plotly widget.
# Columns are referenced by bare name inside aes() so they are resolved in
# `data`; `patient.data.df$...` bypasses the layer data and leaks the whole
# expression into the legend title.
p <- ggplot(
  data = patient.data.df,
  aes(x = reorder(race, -time_in_hospital), y = time_in_hospital, fill = race)
) +
  # `fun` replaces the deprecated `fun.y` argument of the summary stat.
  geom_bar(stat = "summary", fun = "mean") +
  ylab("Time in Hospital") +
  xlab("Race") +
  ggtitle("Time in Hospital Vs. Race")
p <- ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Display the plotly widget currently stored in `p`.
p
# Smoothed trends of the number of procedures and the number of medications
# against time in hospital, one coloured line per measure.
ggplot(data = patient.data.df, aes(x = time_in_hospital)) +
  # `se` and `show.legend` are layer arguments, not aesthetics: placed inside
  # aes() they are ignored, so they are passed to geom_smooth() directly.
  geom_smooth(
    aes(y = num_procedures, colour = "num of procedures"),
    se = FALSE, show.legend = TRUE
  ) +
  geom_smooth(
    aes(y = num_medications, colour = "num of medications"),
    se = FALSE, show.legend = TRUE
  ) +
  # time_in_hospital is numeric, so use a continuous scale with one tick per
  # observed value instead of a discrete scale fed the raw column.
  scale_x_continuous(breaks = sort(unique(patient.data.df$time_in_hospital))) +
  # Name the colours explicitly. Unnamed values are matched to the labels in
  # sorted order, which is the assignment reproduced here.
  scale_colour_manual(
    name = "legend",
    values = c("num of medications" = "blue", "num of procedures" = "red")
  ) +
  # The y axis carries both series, so the label names both.
  ylab("Number of Procedures / Medications") +
  xlab("Time in Hospital") +
  ggtitle("Time in Hospital Vs. Number of Procedures and Medications")

# Research Question 1
# Keep only the columns used for this question and preview the first rows.
patient.rq1 <- patient.data.df %>%
  select(
    age, gender, race, admission_source_id,
    time_in_hospital, change, readmitted
  )
patient.rq1 %>% head()
## age gender race admission_source_id time_in_hospital
## 1 [0-10) Female Caucasian 1 1
## 2 [10-20) Female Caucasian 7 3
## 3 [20-30) Female AfricanAmerican 7 2
## 4 [30-40) Male Caucasian 7 2
## 5 [40-50) Male Caucasian 7 1
## 6 [50-60) Male Caucasian 2 3
## change readmitted
## 1 No NO
## 2 Ch >30
## 3 No NO
## 4 Ch NO
## 5 Ch NO
## 6 No >30
# Rows of patient.rq1 whose `change` value is "No", followed by a preview.
patient.med.no.change <- filter(patient.rq1, change == "No")
patient.med.no.change %>% head()
## age gender race admission_source_id time_in_hospital
## 1 [0-10) Female Caucasian 1 1
## 2 [20-30) Female AfricanAmerican 7 2
## 3 [50-60) Male Caucasian 2 3
## 4 [70-80) Male Caucasian 7 5
## 5 [40-50) Female AfricanAmerican 7 9
## 6 [80-90) Male Caucasian 7 10
## change readmitted
## 1 No NO
## 2 No NO
## 3 No >30
## 4 No >30
## 5 No >30
## 6 No NO
# Rows of patient.rq1 whose `change` value is anything other than "No",
# followed by a tabular preview.
patient.med.change <- filter(patient.rq1, change != "No")
patient.med.change %>%
  head() %>%
  kable()
| age | gender | race | admission_source_id | time_in_hospital | change | readmitted |
|---|---|---|---|---|---|---|
| [10-20) | Female | Caucasian | 7 | 3 | Ch | >30 |
| [30-40) | Male | Caucasian | 7 | 2 | Ch | NO |
| [40-50) | Male | Caucasian | 7 | 1 | Ch | NO |
| [60-70) | Male | Caucasian | 2 | 4 | Ch | NO |
| [80-90) | Female | Caucasian | 4 | 13 | Ch | NO |
| [90-100) | Female | Caucasian | 4 | 12 | Ch | NO |
# Descriptive statistics for every column of patient.rq1, shown as a table.
patient.rq1 %>%
  describe() %>%
  kable()
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age* | 1 | 101766 | 7.096702 | 1.5940838 | 7 | 7.207016 | 1.4826 | 1 | 10 | 9 | -0.6305202 | 0.2812491 | 0.0049970 |
| gender* | 2 | 101766 | 1.462443 | 0.4986491 | 1 | 1.453018 | 0.0000 | 1 | 3 | 2 | 0.1513636 | -1.9752019 | 0.0015631 |
| race* | 3 | 101766 | 3.598776 | 0.9384152 | 4 | 3.714361 | 0.0000 | 1 | 6 | 5 | -1.0364629 | 0.6639945 | 0.0029417 |
| admission_source_id | 4 | 101766 | 5.754437 | 4.0640808 | 7 | 5.327843 | 0.0000 | 1 | 25 | 24 | 1.0299045 | 1.7447514 | 0.0127398 |
| time_in_hospital | 5 | 101766 | 4.395987 | 2.9851078 | 4 | 3.993859 | 2.9652 | 1 | 14 | 13 | 1.1339653 | 0.8500744 | 0.0093575 |
| change* | 6 | 101766 | 1.538048 | 0.4985527 | 2 | 1.547559 | 0.0000 | 1 | 2 | 1 | -0.1526326 | -1.9767227 | 0.0015628 |
| readmitted* | 7 | 101766 | 2.427520 | 0.6840684 | 3 | 2.534392 | 0.0000 | 1 | 3 | 2 | -0.7834405 | -0.5659045 | 0.0021444 |
# Descriptive statistics for the no-medication-change subset, shown as a table.
patient.med.no.change %>%
  describe() %>%
  kable()
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age* | 1 | 54755 | 7.149904 | 1.6149143 | 7 | 7.270540 | 1.4826 | 1 | 10 | 9 | -0.6856491 | 0.4071192 | 0.0069014 |
| gender* | 2 | 54755 | 1.455557 | 0.4980621 | 1 | 1.444424 | 0.0000 | 1 | 3 | 2 | 0.1789188 | -1.9668371 | 0.0021285 |
| race* | 3 | 54755 | 3.590667 | 0.9316863 | 4 | 3.709348 | 0.0000 | 1 | 6 | 5 | -1.0655339 | 0.6064163 | 0.0039816 |
| admission_source_id | 4 | 54755 | 5.745265 | 4.0458276 | 7 | 5.324872 | 0.0000 | 1 | 22 | 21 | 1.0371237 | 1.7822736 | 0.0172900 |
| time_in_hospital | 5 | 54755 | 4.094019 | 2.8735766 | 3 | 3.676110 | 2.9652 | 1 | 14 | 13 | 1.2431397 | 1.2370554 | 0.0122804 |
| change* | 6 | 54755 | 2.000000 | 0.0000000 | 2 | 2.000000 | 0.0000 | 2 | 2 | 0 | NaN | NaN | 0.0000000 |
| readmitted* | 7 | 54755 | 2.454461 | 0.6780493 | 3 | 2.568063 | 0.0000 | 1 | 3 | 2 | -0.8539526 | -0.4555402 | 0.0028977 |
# Descriptive statistics for the medication-change subset, shown as a table.
patient.med.change %>%
  describe() %>%
  kable()
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age* | 1 | 47011 | 7.034737 | 1.5672154 | 7 | 7.133027 | 1.4826 | 1 | 10 | 9 | -0.5698760 | 0.1393703 | 0.0072282 |
| gender* | 2 | 47011 | 1.470464 | 0.4992174 | 1 | 1.463027 | 0.0000 | 1 | 3 | 2 | 0.1193713 | -1.9830541 | 0.0023024 |
| race* | 3 | 47011 | 3.608219 | 0.9461145 | 4 | 3.720200 | 0.0000 | 1 | 6 | 5 | -1.0050952 | 0.7248451 | 0.0043636 |
| admission_source_id | 4 | 47011 | 5.765119 | 4.0852551 | 7 | 5.331304 | 0.0000 | 1 | 25 | 24 | 1.0214967 | 1.7014673 | 0.0188417 |
| time_in_hospital | 5 | 47011 | 4.747697 | 3.0728184 | 4 | 4.379324 | 2.9652 | 1 | 14 | 13 | 1.0228509 | 0.5077545 | 0.0141722 |
| change* | 6 | 47011 | 1.000000 | 0.0000000 | 1 | 1.000000 | 0.0000 | 1 | 1 | 0 | NaN | NaN | 0.0000000 |
| readmitted* | 7 | 47011 | 2.396141 | 0.6896947 | 3 | 2.495174 | 0.0000 | 1 | 3 | 2 | -0.7050878 | -0.6725262 | 0.0031810 |
# Readmission counts for encounters without a medication change: stacked bars
# coloured by gender, one panel per race, with the total count printed above
# each bar, then converted to an interactive plotly widget.
# Reuses patient.med.no.change (the change == 'No' subset built earlier)
# instead of repeating the same filter.
p <- ggplot(data = patient.med.no.change, aes(x = readmitted)) +
  geom_bar(aes(fill = gender)) +
  # after_stat(count) replaces the deprecated `..count..` notation.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.2) +
  xlab("Readmission time range ") +
  facet_wrap(~race) +
  ggtitle("Readmission for No Medication Changes") +
  theme_bw()
p <- ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Display the plotly widget currently stored in `p`.
p
# Bar chart of how many encounters fall into each `change` level.
barplot(
  height = table(patient.rq1[["change"]]),
  main = "Readmission - Patient Medication ",
  col = c("#ADD8E6", "#ff7373")
)

# Mosaic plot of medication change against readmission status.
mosaicplot(
  ~ change + readmitted,
  data = patient.rq1,
  main = "medication change Vs Readmission",
  color = c("#ADD8E6", "#ff7373")
)

# Density histogram of time in hospital with a normal curve overlaid, using the
# sample mean and standard deviation. The title names the plotted variable
# (time in hospital); the histogram does not show readmissions.
stay_days <- patient.rq1$time_in_hospital
hist(stay_days, probability = TRUE, main = "Histogram of Time in Hospital", xlab = "Time in Hospital")
# Evaluate the normal density on a fine grid over the observed range so the
# overlay is a smooth curve rather than a polyline through integer points.
x <- seq(min(stay_days), max(stay_days), length.out = 200)
y <- dnorm(x = x, mean = mean(stay_days), sd = sd(stay_days))
lines(x = x, y = y, col = "blue")

# Three side-by-side boxplots: admission source id, time in hospital and
# number of medications.
# par() returns the previous settings; they are restored afterwards so the
# 1x3 layout does not leak into later plots.
old_par <- par(mfrow = c(1, 3))
boxplot(patient.data.df$admission_source_id, main = "Admission Source ID", col = "#ADD8E6")
boxplot(patient.data.df$time_in_hospital, main = "Time in Hospital", col = "#ff7373")
boxplot(patient.data.df$num_medications, main = "Number of Medications", col = "#98FB98")
par(old_par)

# Normal Q-Q plot of time in hospital with its reference line.
# The reference line must be computed from the same variable as the points;
# drawing it from number_diagnoses put an unrelated line on the plot.
qqnorm(patient.data.df$time_in_hospital)
qqline(patient.data.df$time_in_hospital)
