Including Plots

You can also embed plots, for example:

# Compact column-wise overview of diab2: one line per column showing its type
# and the first few values.
glimpse(diab2)
## Rows: 768
## Columns: 9
## $ Pregnancies              <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose                  <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure            <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness            <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin                  <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI                      <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age                      <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome                  <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
# Per-column summary statistics (min, quartiles, mean, max) for diab2.
# NOTE(review): the 0 minimums reported for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI look like placeholders for missing
# measurements rather than real readings — confirm, and consider recoding
# them as NA before modelling.
summary(diab2)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

EDA

# Correlation plot across the columns of diab2.
# NOTE(review): plot_correlation() is presumably DataExplorer's — confirm the
# package is loaded earlier in the document.
plot_correlation(diab2)

# Tally the rows in each Outcome class, then reshape the tally into a
# two-column data frame that ggplot can consume directly.
outcome_counts <- table(diab2[["Outcome"]])
outcome_df <- data.frame(
  Outcome = names(outcome_counts),
  Count = as.numeric(outcome_counts)
)

# Bar chart of the class balance; bar heights are taken directly from Count.
ggplot(data = outcome_df, mapping = aes(x = Outcome, y = Count)) +
  geom_col(fill = "purple", colour = "red") +
  labs(
    title = "Distribution of Diabetes Outcomes",
    x = "Outcome",
    y = "Count"
  ) +
  theme_tq() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold")
  )

# Keep only the columns of interest for the plots below.
diabetes_subset <- diab2[
  ,
  c("Pregnancies", "Glucose", "BloodPressure", "BMI", "Age", "Outcome")
]

# Histogram of Pregnancies, one panel per Outcome class, each panel with its
# own y scale so the smaller class is still readable.
ggplot(
  data = diabetes_subset,
  mapping = aes(x = Pregnancies, fill = factor(Outcome))
) +
  geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
  labs(title = "Distribution of Pregnancies by Outcome") +
  facet_wrap(~Outcome, scales = "free_y") +
  theme_minimal()

# Box plot comparing the BMI distribution of the two Outcome classes.
ggplot(
  data = diab2,
  mapping = aes(x = factor(Outcome), y = BMI, fill = factor(Outcome))
) +
  geom_boxplot() +
  labs(title = "BMI Distribution by Outcome") +
  theme_tq()

survival analysis

# Kaplan-Meier fit with no grouping (~ 1): Age is the time axis and Outcome is
# the event indicator.
# NOTE(review): Age looks like age at measurement rather than a follow-up time
# to diagnosis — confirm that treating it as a survival time is intended.
kmpd <- survfit(Surv(Age, Outcome) ~ 1, data = diab2)
kmpd
## Call: survfit(formula = Surv(Age, Outcome) ~ 1, data = diab2)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 768    268     45      43      47
# Report the Kaplan-Meier estimate at the single time point 24.
summary(kmpd, times = 24)
## Call: survfit(formula = Surv(Age, Outcome) ~ 1, data = diab2)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    24    595      31    0.954 0.00807        0.938         0.97
# Kaplan-Meier curve for kmpd with a confidence band, a median-survival guide
# line, and a percentage risk table beneath the plot.
# NOTE(review): conf.type is normally a survfit() argument — confirm that
# ggsurvplot() actually honours it here.
gg_diab <- ggsurvplot(
  kmpd,
  conf.int = TRUE, # spelled out: T is an ordinary variable and can be reassigned
  conf.type = "log-log",
  risk.table = "percentage",
  risk.table.height = 0.28,
  surv.median.line = "hv",
  ggtheme = theme_tq(),
  censor.shape = "|",
  xlab = "Years"
)

gg_diab

coxph regression

# Cox proportional hazards model: Age is the time axis, Outcome == 1 marks the
# event, and Glucose, BloodPressure and BMI are the covariates.
creg <- coxph(
  Surv(Age, Outcome == 1) ~ Glucose + BloodPressure + BMI,
  data = diab2
)
summary(creg)
## Call:
## coxph(formula = Surv(Age, Outcome == 1) ~ Glucose + BloodPressure + 
##     BMI, data = diab2)
## 
##   n= 768, number of events= 268 
## 
##                    coef exp(coef)  se(coef)      z Pr(>|z|)    
## Glucose        0.009011  1.009052  0.001904  4.733 2.21e-06 ***
## BloodPressure -0.012825  0.987257  0.002615 -4.904 9.39e-07 ***
## BMI            0.066488  1.068748  0.008760  7.590 3.21e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## Glucose          1.0091     0.9910    1.0053    1.0128
## BloodPressure    0.9873     1.0129    0.9822    0.9923
## BMI              1.0687     0.9357    1.0506    1.0873
## 
## Concordance= 0.695  (se = 0.019 )
## Likelihood ratio test= 99.53  on 3 df,   p=<2e-16
## Wald test            = 108.9  on 3 df,   p=<2e-16
## Score (logrank) test = 103.7  on 3 df,   p=<2e-16
# Forest plot of the hazard ratios from the fitted Cox model.
ggforest(creg)

# Survival curve derived from the Cox model.
# NOTE(review): no newdata is supplied, so this is presumably a single curve at
# reference covariate values — confirm against the survival documentation.
gcx <- survfit(creg, data = diab2)

ggsurvplot(
  gcx,
  conf.int = TRUE, # spelled out: T is an ordinary variable and can be reassigned
  conf.type = "log-log",
  surv.median.line = "hv",
  risk.table = "percentage",
  ggtheme = theme_tq()
)

# The same fitted curve drawn through autoplot().
autoplot(gcx)

Glucose is statistically significant with a p-value less than .05. BloodPressure is statistically significant with a p-value less than .05. BMI is statistically significant with a p-value less than .05.

The Glucose coef value is positive and thereby increases one’s chance of diabetes. The BloodPressure coef value is negative and thereby reduces one’s chance of diabetes. The BMI coef value is positive and thereby increases one’s chance of diabetes.

Glucose exp(coef) = 1.0091: a one-unit increase in Glucose increases one’s chance of diabetes by 0.9%. BloodPressure exp(coef) = 0.9873: a one-unit increase in BloodPressure reduces one’s chance of diabetes by 1.3%. BMI exp(coef) = 1.0687: a one-unit increase in BMI increases one’s chance of diabetes by 6.9%.

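The concordance score is 0.695 (nearly 0.70), so the model does an OK job of ranking subjects by their risk of the Outcome; it measures the model’s overall discrimination using these covariates rather than the impact of any individual covariate.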

In summary
  1. To help treat those most at risk for diabetes we should recommend a diet low to fully free of Glucose
  2. More exercise and less stress in order to bring down their BMI
  3. According to the Cox survival curve plot, we can predict that by 46 to 47 years of age 50% of subjects will have diabetes