library(boot)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(broom)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)

df['BMI'] <- df['Weight']/(df['Height']**2)

Select an interesting binary column of data, or one which can be reasonably converted into a binary variable. This should be something worth modeling

df['is_obese'] <- ifelse( df$BMI >30 ,1,0)

Build a logistic regression model for this variable, using between 1-4 explanatory variables

model <- glm(is_obese ~ FAF + NCP + CH2O, data = df, family = binomial(link = 'logit'))
model$coefficients
## (Intercept)         FAF         NCP        CH2O 
##  -0.9641686  -0.4167630   0.1528797   0.4048754
sigmoid <- function(x, model) {
  intercept <- model$coefficients[1]
  coef_FA <- model$coefficients['FAF']
  coef_NCP <- model$coefficients['NCP']
  coef_CH2O <- model$coefficients['CH2O']
  log_odds <- intercept + coef_FA * mean(df$FAF) + coef_NCP * mean(df$NCP) + coef_CH2O * x
  probability <- 1 / (1 + exp(-log_odds))
  return(probability)
}
df |> 
  ggplot(mapping = aes(x = CH2O, y = is_obese)) + 
  geom_jitter(width = 0, height = 0.1, size = 1) + 
  geom_function(fun = function(x) sigmoid(x, model), color = 'blue', linewidth = 1) +
  scale_y_continuous(breaks = c(0, 0.5, 1)) + 
  theme_classic()

CH2O_decision_threshold  <- -model$coefficients["(Intercept)"]/model$coefficients["CH2O"]
CH2O_decision_threshold
## (Intercept) 
##    2.381396

Interpret the coefficients, and explain what they mean in your notebook

summary(model)
## 
## Call:
## glm(formula = is_obese ~ FAF + NCP + CH2O, family = binomial(link = "logit"), 
##     data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.96417    0.21277  -4.531 5.86e-06 ***
## FAF         -0.41676    0.05498  -7.580 3.46e-14 ***
## NCP          0.15288    0.05816   2.628  0.00858 ** 
## CH2O         0.40488    0.07449   5.435 5.48e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2913.9  on 2110  degrees of freedom
## Residual deviance: 2832.4  on 2107  degrees of freedom
## AIC: 2840.4
## 
## Number of Fisher Scoring iterations: 4

Using the Standard Error for at least one coefficient, build a C.I. for that coefficient, and translate its meaning

tidy(model, conf.int  = TRUE)
## # A tibble: 4 × 7
##   term        estimate std.error statistic  p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)   -0.964    0.213      -4.53 5.86e- 6  -1.38      -0.549
## 2 FAF           -0.417    0.0550     -7.58 3.46e-14  -0.525     -0.310
## 3 NCP            0.153    0.0582      2.63 8.58e- 3   0.0393     0.267
## 4 CH2O           0.405    0.0745      5.44 5.48e- 8   0.259      0.552
tidy(model, conf.int = TRUE) |>
  ggplot(mapping = aes(x = estimate, y = term))+
  geom_point()+
  geom_vline(xintercept = 0, linetype = 'dotted', color = 'gray')+
  geom_errorbarh(mapping = aes(xmin = conf.low,
                               xmax = conf.high,
                               height = 0.5)) +
  labs(title = 'Model Coefficient C.I.')