You can also embed plots, for example:
# Show the dimensions of diab2 plus each column's type and first few values.
glimpse(diab2)
## Rows: 768
## Columns: 9
## $ Pregnancies <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
# Per-column minimum, quartiles, median, mean and maximum for diab2.
summary(diab2)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
EDA
# Plot the correlations among the columns of diab2.
plot_correlation(diab2)

# Tabulate how many rows fall under each Outcome value and reshape the
# table into a two-column data frame for plotting.
outcome_counts <- table(diab2$Outcome)
outcome_df <- data.frame(
  Outcome = names(outcome_counts),
  Count = as.numeric(outcome_counts)
)

# Bar chart of the Outcome counts (bar heights are taken directly from Count).
ggplot(outcome_df, aes(x = Outcome, y = Count)) +
  geom_col(fill = "purple", colour = "red") +
  labs(
    title = "Distribution of Diabetes Outcomes",
    x = "Outcome",
    y = "Count"
  ) +
  theme_tq() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold")
  )
# Keep only the columns used in the plots below.
diabetes_subset <- diab2[
  c("Pregnancies", "Glucose", "BloodPressure", "BMI", "Age", "Outcome")
]

# Histogram of Pregnancies, one panel per Outcome value. Each panel gets its
# own y scale so the two groups can be compared despite different sizes.
ggplot(diabetes_subset, aes(x = Pregnancies, fill = factor(Outcome))) +
  geom_histogram(position = "identity", bins = 30, alpha = 0.7) +
  labs(title = "Distribution of Pregnancies by Outcome") +
  facet_wrap(vars(Outcome), scales = "free_y") +
  theme_minimal()

# Side-by-side boxplots of BMI for each Outcome value.
ggplot(diab2, aes(x = factor(Outcome), y = BMI, fill = factor(Outcome))) +
  geom_boxplot() +
  labs(title = "BMI Distribution by Outcome") +
  theme_tq()
survival analysis
# Survival curve with no grouping (~ 1): Age is used as the time variable
# and Outcome as the event indicator.
kmpd <- survfit(Surv(Age, Outcome) ~ 1, data = diab2)
print(kmpd)
## Call: survfit(formula = Surv(Age, Outcome) ~ 1, data = diab2)
##
## n events median 0.95LCL 0.95UCL
## [1,] 768 268 45 43 47
# Report the survival estimate at time (Age) 24. The argument is named and
# written as a plain number so the requested time point is obvious.
summary(kmpd, times = 24)
## Call: survfit(formula = Surv(Age, Outcome) ~ 1, data = diab2)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 24 595 31 0.954 0.00807 0.938 0.97
# Survival curve for kmpd with confidence band, a percentage risk table,
# and horizontal/vertical guide lines at the median survival time.
# NOTE(review): conf.type is an argument of survfit(), not a documented
# ggsurvplot() argument, so it is probably ignored here; set it in the
# survfit() call if log-log intervals are intended -- confirm.
gg_diab <- ggsurvplot(
  kmpd,
  data = diab2, # explicit data instead of lookup through the fit's call
  conf.int = TRUE,
  conf.type = "log-log",
  risk.table = "percentage",
  risk.table.height = 0.28,
  surv.median.line = "hv",
  ggtheme = theme_tq(),
  censor.shape = "|",
  xlab = "Years"
)
gg_diab
coxph regression
# Cox proportional hazards model: Age is the time variable, Outcome == 1
# marks an event, and Glucose, BloodPressure and BMI are the covariates.
# NOTE(review): summary(diab2) reports a minimum of 0 for Glucose,
# BloodPressure and BMI; if those zeros stand for missing measurements they
# should be recoded to NA before fitting -- confirm.
creg <- coxph(
  Surv(Age, Outcome == 1) ~ Glucose + BloodPressure + BMI,
  data = diab2
)
print(summary(creg))
## Call:
## coxph(formula = Surv(Age, Outcome == 1) ~ Glucose + BloodPressure +
## BMI, data = diab2)
##
## n= 768, number of events= 268
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Glucose 0.009011 1.009052 0.001904 4.733 2.21e-06 ***
## BloodPressure -0.012825 0.987257 0.002615 -4.904 9.39e-07 ***
## BMI 0.066488 1.068748 0.008760 7.590 3.21e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Glucose 1.0091 0.9910 1.0053 1.0128
## BloodPressure 0.9873 1.0129 0.9822 0.9923
## BMI 1.0687 0.9357 1.0506 1.0873
##
## Concordance= 0.695 (se = 0.019 )
## Likelihood ratio test= 99.53 on 3 df, p=<2e-16
## Wald test = 108.9 on 3 df, p=<2e-16
## Score (logrank) test = 103.7 on 3 df, p=<2e-16
# Forest plot of the hazard ratios from creg. The data is passed explicitly
# so ggforest() does not have to extract it from the model fit.
ggforest(creg, data = diab2)

# Survival curve derived from the fitted Cox model.
# NOTE(review): survfit() on a coxph fit documents `newdata`, not `data`;
# the argument is kept unchanged here -- confirm whether it is needed.
gcx <- survfit(creg, data = diab2)

# NOTE(review): conf.type is a survfit() argument and is probably ignored by
# ggsurvplot(); set it in survfit() if log-log intervals are intended.
ggsurvplot(
  gcx,
  data = diab2,
  conf.int = TRUE,
  conf.type = "log-log",
  surv.median.line = "hv",
  risk.table = "percentage",
  ggtheme = theme_tq()
)

# Default ggplot rendering of the same fitted curve.
autoplot(gcx)
Glucose is statistically significant, with a p-value below 0.05 (p = 2.21e-06).
BloodPressure is statistically significant, with a p-value below 0.05 (p = 9.39e-07).
BMI is statistically significant, with a p-value below 0.05 (p = 3.21e-14).
The Glucose coefficient is positive, so higher Glucose increases one’s chance (hazard) of diabetes. The BloodPressure coefficient is negative, so higher BloodPressure reduces one’s chance of diabetes. The BMI coefficient is positive, so higher BMI increases one’s chance of diabetes.
Glucose exp(coef) = 1.0091: a one-unit increase in Glucose increases one’s hazard of diabetes by about 0.9%, holding the other covariates constant. BloodPressure exp(coef) = 0.9873: a one-unit increase in BloodPressure reduces one’s hazard of diabetes by about 1.3%. BMI exp(coef) = 1.0687: a one-unit increase in BMI increases one’s hazard of diabetes by about 6.9%.
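The concordance is 0.695 (nearly 0.70), which is OK: for roughly 70% of comparable pairs of individuals, the model correctly ranks which one experiences the Outcome first. Note that concordance measures the overall discriminative ability of the model as a whole, not the impact of each individual covariate on the Outcome.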
In summary