library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
# Path to the smoking/drinking CSV file on the local machine.
data_path <- "C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
# Read the CSV into the data frame used by the rest of the analysis.
data <- read.csv(file = data_path)
I would like to choose the column “DRK_YN” as the response variable. It contains binary values indicating whether a person drinks or not: “Y” indicates that the person drinks and “N” indicates that the person does not drink.
The 4 explanatory variables that I have chosen are: 1) Hemoglobin (hemoglobin), 2) Systolic Blood Pressure (SBP), 3) Age (age), and 4) Total Cholesterol (tot_chole).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
# Encode the drinking flag as a numeric indicator: 1 when DRK_YN is "Y",
# 0 for any other non-missing value.
data[["DRK_BINARY"]] <- ifelse(data[["DRK_YN"]] == "Y", 1, 0)
# Show the first rows so the new column can be checked against DRK_YN.
head(data, n = 6L)
## sex age height weight waistline sight_left sight_right hear_left hear_right
## 1 Male 35 170 75 90 1.0 1.0 1 1
## 2 Male 30 180 80 89 0.9 1.2 1 1
## 3 Male 40 165 75 91 1.2 1.5 1 1
## 4 Male 50 175 80 91 1.5 1.2 1 1
## 5 Male 50 165 60 80 1.0 1.2 1 1
## 6 Male 50 165 55 75 1.2 1.5 1 1
## SBP DBP BLDS tot_chole HDL_chole LDL_chole triglyceride hemoglobin
## 1 120 80 99 193 48 126 92 17.1
## 2 130 82 106 228 55 148 121 15.8
## 3 120 70 98 136 41 74 104 15.8
## 4 145 87 95 201 76 104 106 17.6
## 5 138 82 101 199 61 117 104 13.8
## 6 142 92 99 218 77 95 232 13.8
## urine_protein serum_creatinine SGOT_AST SGOT_ALT gamma_GTP SMK_stat_type_cd
## 1 1 1.0 21 35 40 1
## 2 1 0.9 20 36 27 3
## 3 1 0.9 47 32 68 1
## 4 1 1.1 29 34 18 1
## 5 1 0.8 19 12 25 1
## 6 3 0.8 29 40 37 3
## DRK_YN DRK_BINARY
## 1 Y 1
## 2 N 0
## 3 N 0
## 4 N 0
## 5 N 0
## 6 Y 1
# Logistic regression of the drinking indicator on hemoglobin, systolic blood
# pressure, age and total cholesterol.
model <- glm(
  formula = DRK_BINARY ~ hemoglobin + SBP + age + tot_chole,
  family = binomial,
  data = data
)
# Print the fitted coefficients (log-odds scale).
coef(model)
## (Intercept) hemoglobin SBP age tot_chole
## -4.2565126514 0.3634780148 0.0097200802 -0.0419405233 -0.0005829611
Intercept (-4.2565): The log-odds of drinking (DRK_BINARY = 1) when hemoglobin, SBP, age, and total cholesterol are all zero. Since a value of zero is not realistic for these variables, the intercept does not have a meaningful interpretation by itself.
Hemoglobin (0.3635): For each one-unit increase in hemoglobin, the log-odds of drinking increase by approximately 0.3635, holding all other variables constant. This means that higher hemoglobin levels are associated with a higher likelihood of drinking, assuming the other variables do not change.
Systolic blood Pressure (0.0097): For each one-unit increase in systolic blood pressure (SBP), the log-odds of drinking increase by approximately 0.0097, holding all other variables constant. This suggests that higher SBP is associated with a slightly higher likelihood of drinking.
Age (-0.0419): For each one-year increase in age, the log-odds of drinking decrease by approximately 0.0419, holding all other variables constant. This means that as people get older, the likelihood of drinking tends to decrease.
Cholesterol (-0.0006): For each one-unit increase in total cholesterol, the log-odds of drinking decrease by approximately 0.0006, holding all other variables constant. This suggests that higher total cholesterol levels are associated with a slightly lower likelihood of drinking.
# Summary of the fitted logistic model; its coefficient table holds the
# standard errors.
summary_model <- summary(model)
# Standard error of the hemoglobin coefficient.
se_hemoglobin <- coef(summary_model)["hemoglobin", "Std. Error"]
# 95% confidence interval for the hemoglobin coefficient.
ci_hemoglobin <- confint(model, parm = "hemoglobin", level = 0.95)
## Waiting for profiling to be done...
cat(
  "Standard Error for hemoglobin coefficient:",
  se_hemoglobin,
  "\n"
)
## Standard Error for hemoglobin coefficient: 0.001539004
cat(
  "95% Confidence Interval for hemoglobin coefficient:",
  ci_hemoglobin,
  "\n"
)
## 95% Confidence Interval for hemoglobin coefficient: 0.3604631 0.3664959
We are 95% confident that, for each one-unit increase in hemoglobin, the log-odds of drinking increase by between approximately 0.360 and 0.366, holding other variables constant. Because this interval lies entirely above zero, higher hemoglobin levels are associated with a statistically significant and higher likelihood of drinking.
# Simple linear regression of systolic blood pressure on hemoglobin; its
# R-squared is reported in the plot subtitle.
model <- lm(formula = SBP ~ hemoglobin, data = data)
rsquared <- summary(model)[["r.squared"]]
# Scatter plot with a dashed gray linear fit and ggplot2's default smoother
# drawn on top for comparison.
ggplot(data = data, mapping = aes(x = hemoglobin, y = SBP)) +
  geom_point() +
  geom_smooth(
    method = 'lm',
    se = FALSE,
    color = 'gray',
    linetype = 'dashed'
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "systolic Blood Pressure Vs Hemoglobin.",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Square-root transform the response and refit the linear model against
# hemoglobin. Note: SBP_2 holds sqrt(SBP); hemoglobin itself is not
# transformed, so the plot labels describe the square root of SBP.
data <- data |>
  mutate(SBP_2 = sqrt(SBP))
model <- lm(SBP_2 ~ hemoglobin, data = data)
rsquared <- summary(model)$r.squared
# Scatter plot with a dashed gray linear fit and ggplot2's default smoother
# drawn on top for comparison.
data |>
  ggplot(mapping = aes(x = hemoglobin,
                       y = SBP_2)) +
  geom_point() +
  geom_smooth(method = 'lm', color = 'gray', linetype = 'dashed',
              se = FALSE) +
  geom_smooth(se = FALSE) +
  labs(title = "Square Root of Systolic Blood Pressure Vs Hemoglobin.",
       subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3)),
       y = "sqrt(SBP)") +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'