library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
# Absolute path to the input CSV file.
data_path <- "C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
# Read the CSV into a data frame; the first row supplies the column names.
data <- read.csv(file = data_path, header = TRUE)

Selecting a binary column:

I have chosen the column “DRK_YN”, which contains binary values indicating whether a person drinks: “Y” means the person drinks and “N” means the person does not drink.

Explanatory variables:

The four explanatory variables that I have chosen are: 1) Hemoglobin (hemoglobin) 2) Systolic blood pressure (SBP) 3) Age (age) 4) Total cholesterol (tot_chole)

Building a logistic regression model:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)

Changing “Y” and “N” to 1 and 0

# Add a numeric drinking indicator: 1 where DRK_YN is "Y", 0 otherwise.
data <- within(data, DRK_BINARY <- ifelse(DRK_YN == "Y", 1, 0))
# Preview the first rows to confirm the new column is present.
head(data, n = 6L)
##    sex age height weight waistline sight_left sight_right hear_left hear_right
## 1 Male  35    170     75        90        1.0         1.0         1          1
## 2 Male  30    180     80        89        0.9         1.2         1          1
## 3 Male  40    165     75        91        1.2         1.5         1          1
## 4 Male  50    175     80        91        1.5         1.2         1          1
## 5 Male  50    165     60        80        1.0         1.2         1          1
## 6 Male  50    165     55        75        1.2         1.5         1          1
##   SBP DBP BLDS tot_chole HDL_chole LDL_chole triglyceride hemoglobin
## 1 120  80   99       193        48       126           92       17.1
## 2 130  82  106       228        55       148          121       15.8
## 3 120  70   98       136        41        74          104       15.8
## 4 145  87   95       201        76       104          106       17.6
## 5 138  82  101       199        61       117          104       13.8
## 6 142  92   99       218        77        95          232       13.8
##   urine_protein serum_creatinine SGOT_AST SGOT_ALT gamma_GTP SMK_stat_type_cd
## 1             1              1.0       21       35        40                1
## 2             1              0.9       20       36        27                3
## 3             1              0.9       47       32        68                1
## 4             1              1.1       29       34        18                1
## 5             1              0.8       19       12        25                1
## 6             3              0.8       29       40        37                3
##   DRK_YN DRK_BINARY
## 1      Y          1
## 2      N          0
## 3      N          0
## 4      N          0
## 5      N          0
## 6      Y          1

Creating logistic regression model:

# Logistic regression of the drinking indicator on hemoglobin, systolic blood
# pressure (SBP), age and total cholesterol.
model <- glm(
  formula = DRK_BINARY ~ hemoglobin + SBP + age + tot_chole,
  family = binomial,
  data = data
)
# Fitted coefficients, on the log-odds scale.
coef(model)
##   (Intercept)    hemoglobin           SBP           age     tot_chole 
## -4.2565126514  0.3634780148  0.0097200802 -0.0419405233 -0.0005829611

Interpretations:

  1. Intercept (-4.2565): The log-odds of drinking (DRK_BINARY = 1) when hemoglobin, SBP, age, and total cholesterol are all zero. Since a value of zero is not realistic for any of these variables, the intercept has no meaningful interpretation on its own; it simply anchors the model.

  2. Hemoglobin (0.3635): For each one-unit increase in hemoglobin, the log-odds of drinking increase by approximately 0.3635, holding all other variables constant. This means that higher hemoglobin levels are associated with a higher likelihood of drinking, assuming the other variables do not change.

  3. Systolic blood Pressure (0.0097): For each one-unit increase in systolic blood pressure (SBP), the log-odds of drinking increase by approximately 0.0097, holding all other variables constant. This suggests that higher SBP is associated with a slightly higher likelihood of drinking.

  4. Age (-0.0419): For each one-year increase in age, the log-odds of drinking decrease by approximately 0.0419, holding all other variables constant. This means that as people get older, the likelihood of drinking tends to decrease.

  5. Cholesterol (-0.0006): For each one-unit increase in total cholesterol, the log-odds of drinking decrease by approximately 0.0006, holding all other variables constant. This suggests that higher total cholesterol levels are associated with a slightly lower likelihood of drinking.

Confidence Interval:

# Summary of the fitted model; its coefficient table includes standard errors.
summary_model <- summary(model)

# Standard error of the hemoglobin estimate, taken from the coefficient table.
se_hemoglobin <- coef(summary_model)["hemoglobin", "Std. Error"]

# Confidence interval (default 95% level) for the hemoglobin coefficient.
ci_hemoglobin <- confint(model, parm = "hemoglobin")
## Waiting for profiling to be done...
# Print the standard error of the hemoglobin coefficient.
cat(
  "Standard Error for hemoglobin coefficient:",
  se_hemoglobin,
  "\n",
  sep = " "
)
## Standard Error for hemoglobin coefficient: 0.001539004
# Print the lower and upper confidence limits for the hemoglobin coefficient.
cat(
  "95% Confidence Interval for hemoglobin coefficient:",
  ci_hemoglobin,
  "\n",
  sep = " "
)
## 95% Confidence Interval for hemoglobin coefficient: 0.3604631 0.3664959

Interpretation:

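We are 95% confident that the true hemoglobin coefficient lies between about 0.360 and 0.366; that is, each one-unit increase in hemoglobin is associated with an increase in the log-odds of drinking of between 0.360 and 0.366, holding other variables constant. Because this interval does not contain zero, the positive association between hemoglobin and drinking is statistically significant at the 5% level.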

Transforming Explanatory variable:

Systolic Blood Pressure Vs Hemoglobin

# Simple linear regression of systolic blood pressure on hemoglobin, used only
# to report R-squared in the plot subtitle. It is stored under its own name so
# the logistic regression held in `model` is not overwritten.
sbp_fit <- lm(SBP ~ hemoglobin, data = data)

rsquared <- summary(sbp_fit)$r.squared

# Scatter plot of SBP against hemoglobin with two trend lines: the dashed gray
# line is the linear fit, the other is geom_smooth()'s default smoother.
data |>
  ggplot(mapping = aes(x = hemoglobin, y = SBP)) +
  geom_point() +
  geom_smooth(
    method = "lm", color = "gray", linetype = "dashed", se = FALSE
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "systolic Blood Pressure Vs Hemoglobin.",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Trying square root of Systolic blood pressure:

# Add the square root of systolic blood pressure as a new column.
# NOTE: despite the `_2` suffix, SBP_2 holds sqrt(SBP), not SBP squared; the
# column name is kept so any code that already refers to it keeps working.
data <- data |>
  mutate(SBP_2 = sqrt(SBP))

# Linear fit of the transformed response on hemoglobin, used for the R-squared
# shown in the subtitle.
# NOTE(review): this reassigns `model`, replacing the fit it held before; the
# name is left unchanged in case later code depends on it.
model <- lm(SBP_2 ~ hemoglobin, data = data)

rsquared <- summary(model)$r.squared

# Scatter plot of sqrt(SBP) against hemoglobin with a dashed linear fit and
# geom_smooth()'s default smoother. The title names the transformation that is
# actually plotted (square root of SBP; hemoglobin is untransformed).
data |>
  ggplot(mapping = aes(x = hemoglobin, y = SBP_2)) +
  geom_point() +
  geom_smooth(
    method = "lm", color = "gray", linetype = "dashed", se = FALSE
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Square Root of Systolic Blood Pressure Vs Hemoglobin.",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'