library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
mepsData <-read.csv("C:/Users/DELL/Desktop/Naomi_work/Assignment 4/mepsData.csv")
str(mepsData)
## 'data.frame':    30461 obs. of  20 variables:
##  $ X                       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Person_ID               : num  2.29e+09 2.29e+09 2.29e+09 2.29e+09 2.29e+09 ...
##  $ FluVaccination          : int  2 1 2 2 NA NA NA NA 2 NA ...
##  $ Age                     : int  26 25 33 39 11 8 4 2 36 36 ...
##  $ Sex                     : int  2 1 2 1 1 1 1 1 2 1 ...
##  $ RaceEthnicity           : int  2 2 1 1 1 1 1 1 2 2 ...
##  $ HealthInsurance         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NotAffordHealthCare     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ FamIncome_Continuous    : num  32000 32000 55000 55000 55000 ...
##  $ MentalHealth            : int  3 2 3 3 3 3 3 3 3 3 ...
##  $ FamIncome_Categorical   : int  3 3 3 3 3 3 3 3 5 5 ...
##  $ FamIncome_PercentPoverty: num  190 190 164 164 164 ...
##  $ HealthStatus            : int  2 2 3 3 3 3 3 3 2 3 ...
##  $ HaveProvider            : int  1 1 1 2 1 1 1 1 1 1 ...
##  $ CensusRegion            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ TotalHealthExpenditure  : int  2368 2040 173 0 103 0 0 63 535 7023 ...
##  $ HasHypertension         : int  2 2 2 2 NA NA NA NA 2 2 ...
##  $ HasDiabetes             : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ BMI                     : num  21.4 30.6 28.2 28.7 NA NA NA NA 21.5 NA ...
##  $ Drinks5Day              : int  NA 1 NA 2 NA NA NA NA NA NA ...

Preliminary Analysis

Recoding

mepsData_1 <- mepsData %>% 
  mutate(BMI_Category = case_when(
    BMI < 16.0 ~ "Underweight-Severe",
    BMI >= 16.0 & BMI <= 16.9 ~ "Underweight-Moderate",
    BMI >= 17.0 & BMI <= 18.4 ~ "Undereight-Mild",
    BMI >= 18.5 & BMI <= 24.9 ~ "Normal",
    BMI >= 25.0 & BMI <= 29.9 ~ "Overweight",
    BMI >= 30.0 & BMI <= 34.9 ~ "Obese-i",
    BMI >= 35.0 & BMI <= 39.9 ~ "Obese-ii",
    TRUE ~ "Obese-iii"  # For any other value of BMI
  ))

Exploratory Analysis

head(mepsData_1,5)
##   X  Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101              2  26   2             2               1
## 2 2 2290001102              1  25   1             2               1
## 3 3 2290002101              2  33   2             1               1
## 4 4 2290002102              2  39   1             1               1
## 5 5 2290002103             NA  11   1             1               1
##   NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1                   2                32000            3                     3
## 2                   2                32000            2                     3
## 3                   2                55000            3                     3
## 4                   2                55000            3                     3
## 5                   2                55000            3                     3
##   FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1                   190.31            2            1            2
## 2                   190.31            2            1            2
## 3                   163.92            3            1            2
## 4                   163.92            3            2            2
## 5                   163.92            3            1            2
##   TotalHealthExpenditure HasHypertension HasDiabetes  BMI Drinks5Day
## 1                   2368               2           2 21.4         NA
## 2                   2040               2           2 30.6          1
## 3                    173               2           2 28.2         NA
## 4                      0               2           2 28.7          2
## 5                    103              NA           2   NA         NA
##   BMI_Category
## 1       Normal
## 2      Obese-i
## 3   Overweight
## 4   Overweight
## 5    Obese-iii
Two_bmi <- mepsData_1%>%group_by(BMI_Category)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
                      Twentyfive_percentile = quantile(TotalHealthExpenditure, 0.25),
                    Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))

Two_income <- mepsData_1 %>%group_by(FamIncome_Categorical)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
                                                              Twenty_percentile = quantile(TotalHealthExpenditure, 0.25),
                                                              Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))


Two_healthstatus <- mepsData_1 %>%group_by(HealthStatus)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
                                                              Twenty_percentile = quantile(TotalHealthExpenditure, 0.25),
                                                              Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))

Two_healthstatus <- as.data.frame(Two_healthstatus)
Two_healthstatus%>%ggplot(aes(x=HealthStatus,y=average))+geom_line()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Two_income%>%ggplot(aes(x=FamIncome_Categorical, y=average))+geom_line()

Two_bmi%>%ggplot()

Descritpive Analysis

```{ r analysis}

mepsData_1%>%ggplot(aes(y=TotalHealthExpenditure, fill=as.factor(HealthStatus)))+geom_boxplot() #Question three mepsData_1%>%na.omit()%>%group_by(HealthStatus)%>%summarise(count=n()) mepsData_1%>%na.omit()%>%group_by(MentalHealth)%>%summarise(count=n())


## Check preliminary evaluation
```{ r check}

summary(mepsData_1$HealthStatus)
summary(mepsData_1$MentalHealth)

Perform a chi-squared test for independence

chi_sq_result <- chisq.test(mepsData_1$HealthStatus, mepsData_1$MentalHealth)
print(chi_sq_result)
## 
##  Pearson's Chi-squared test
## 
## data:  mepsData_1$HealthStatus and mepsData_1$MentalHealth
## X-squared = 25532, df = 16, p-value < 2.2e-16

Hypothesis b: Medical Expenditure varies by whether the person has high blood pressure (Has Hypertension)

Perform a t-test or Mann-Whitney U test

t_test_result <- t.test(mepsData_1$TotalHealthExpenditure ~ mepsData_1$HasHypertension)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  mepsData_1$TotalHealthExpenditure by mepsData_1$HasHypertension
## t = 23.992, df = 11499, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  6608.840 7784.801
## sample estimates:
## mean in group 1 mean in group 2 
##       12050.802        4853.982

Hypothesis c: Medical Expenditure varies by Census Region

Check preliminary evaluation

Create a box plot

mepsData_1%>%ggplot(aes(x = as.factor(CensusRegion), y = TotalHealthExpenditure)) +
  geom_boxplot()

Perform an ANOVA or Kruskal-Wallis test

anova_result <- aov(TotalHealthExpenditure ~ CensusRegion, data = mepsData_1)
print(summary(anova_result))
##                 Df    Sum Sq   Mean Sq F value   Pr(>F)    
## CensusRegion     1 1.324e+10 1.324e+10   42.96 5.69e-11 ***
## Residuals    30084 9.274e+12 3.083e+08                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 375 observations deleted due to missingness

Hypothesis d: Medical Expenditure varies by Age

Create a scatterplot

mepsData_1%>% ggplot(aes(x = Age, y = log(TotalHealthExpenditure))) +
  geom_point() + geom_smooth(method="lm", se =FALSE, color ="red")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4822 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 375 rows containing missing values or values outside the scale range
## (`geom_point()`).

mepsData_1%>% ggplot(aes(x = sqrt(BMI), y = sqrt(TotalHealthExpenditure))) +
  geom_point() + geom_smooth(method="lm", se =FALSE, color ="red")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 12209 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12209 rows containing missing values or values outside the scale range
## (`geom_point()`).

Perform a correlation analysis

correlation_result <- cor.test(mepsData_1$Age, mepsData_1$TotalHealthExpenditure)
print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  mepsData_1$Age and mepsData_1$TotalHealthExpenditure
## t = 38.333, df = 30084, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2049986 0.2265458
## sample estimates:
##       cor 
## 0.2157985

Create a correlation matrix for numerical variables

numerical_data <- mepsData_1[, c("Age", "BMI", "FamIncome_Continuous","FamIncome_PercentPoverty", "TotalHealthExpenditure" )]%>%na.omit()
head(mepsData)
##   X  Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101              2  26   2             2               1
## 2 2 2290001102              1  25   1             2               1
## 3 3 2290002101              2  33   2             1               1
## 4 4 2290002102              2  39   1             1               1
## 5 5 2290002103             NA  11   1             1               1
## 6 6 2290002104             NA   8   1             1               1
##   NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1                   2                32000            3                     3
## 2                   2                32000            2                     3
## 3                   2                55000            3                     3
## 4                   2                55000            3                     3
## 5                   2                55000            3                     3
## 6                   2                55000            3                     3
##   FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1                   190.31            2            1            2
## 2                   190.31            2            1            2
## 3                   163.92            3            1            2
## 4                   163.92            3            2            2
## 5                   163.92            3            1            2
## 6                   163.92            3            1            2
##   TotalHealthExpenditure HasHypertension HasDiabetes  BMI Drinks5Day
## 1                   2368               2           2 21.4         NA
## 2                   2040               2           2 30.6          1
## 3                    173               2           2 28.2         NA
## 4                      0               2           2 28.7          2
## 5                    103              NA           2   NA         NA
## 6                      0              NA           2   NA         NA
correlation_matrix <- cor(numerical_data)
print(correlation_matrix)
##                                    Age           BMI FamIncome_Continuous
## Age                       1.0000000000 -0.0002200695          -0.04774148
## BMI                      -0.0002200695  1.0000000000          -0.05935660
## FamIncome_Continuous     -0.0477414774 -0.0593565997           1.00000000
## FamIncome_PercentPoverty  0.0830837604 -0.0558400269           0.91169200
## TotalHealthExpenditure    0.1922121028  0.0534125709          -0.02371020
##                          FamIncome_PercentPoverty TotalHealthExpenditure
## Age                                   0.083083760            0.192212103
## BMI                                  -0.055840027            0.053412571
## FamIncome_Continuous                  0.911691997           -0.023710200
## FamIncome_PercentPoverty              1.000000000            0.006211844
## TotalHealthExpenditure                0.006211844            1.000000000
plot(mepsData_1$TotalHealthExpenditure, mepsData_1$Sex)

cor(mepsData_1$TotalHealthExpenditure, mepsData_1$Sex)
## [1] 0.02617498
mepsData_1 <- mepsData_1 %>% na.omit()

summary(mepsData_1$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   34.00   50.00   49.51   64.00   85.00

Fit a linear regression model

mepsData_model <- mepsData_1 %>% 
  mutate(Age_Category = case_when(
    Age < 20 ~ 0,
    Age >= 21 & Age <= 40 ~ 1,
    Age >= 41 & Age <= 60 ~ 2,
    Age >= 61 & Age <= 80 ~ 3,
    Age >= 81 & Age <= 100 ~ 4
  ), BMI_2 = case_when(
      BMI < 16.0 ~ 0,
      BMI >= 16.0 & BMI <= 16.9 ~ 1,
      BMI >= 17.0 & BMI <= 18.4 ~ 2,
      BMI >= 18.5 & BMI <= 24.9 ~ 3,
      BMI >= 25.0 & BMI <= 29.9 ~ 4,
      BMI >= 30.0 & BMI <= 34.9 ~ 5,
      BMI >= 35.0 & BMI <= 39.9 ~ 6,
      TRUE ~ 7  # For any other value of BMI
    ))
model <- lm(TotalHealthExpenditure ~ factor(Age_Category) + factor(Sex) + factor(BMI_2) + factor(RaceEthnicity) + factor(MentalHealth), data = mepsData_model)

Summarize the model

summary(model)
## 
## Call:
## lm(formula = TotalHealthExpenditure ~ factor(Age_Category) + 
##     factor(Sex) + factor(BMI_2) + factor(RaceEthnicity) + factor(MentalHealth), 
##     data = mepsData_model)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21792  -6430  -2975    172 546948 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1558.2     2058.8  -0.757 0.449151    
## factor(Age_Category)1    1416.4     1372.4   1.032 0.302099    
## factor(Age_Category)2    5075.3     1377.7   3.684 0.000231 ***
## factor(Age_Category)3    9923.4     1390.0   7.139 1.03e-12 ***
## factor(Age_Category)4   13338.0     1694.3   7.872 3.95e-15 ***
## factor(Sex)2            12583.2    10984.3   1.146 0.252009    
## factor(BMI_2)1            885.4     3799.3   0.233 0.815730    
## factor(BMI_2)2          -1464.7     2749.4  -0.533 0.594234    
## factor(BMI_2)3           -535.6     1608.6  -0.333 0.739168    
## factor(BMI_2)4           -541.7     1589.4  -0.341 0.733260    
## factor(BMI_2)5           -700.7     1622.4  -0.432 0.665835    
## factor(BMI_2)6           1362.7     1729.0   0.788 0.430635    
## factor(BMI_2)7           2033.1     1861.0   1.092 0.274655    
## factor(RaceEthnicity)2   2952.3      577.1   5.116 3.20e-07 ***
## factor(RaceEthnicity)3   1670.2      777.4   2.148 0.031715 *  
## factor(RaceEthnicity)4   1118.2     1120.0   0.998 0.318100    
## factor(RaceEthnicity)5    204.5     1339.9   0.153 0.878692    
## factor(MentalHealth)2     817.3      532.1   1.536 0.124550    
## factor(MentalHealth)3    2483.4      567.7   4.375 1.23e-05 ***
## factor(MentalHealth)4    7739.9      880.1   8.794  < 2e-16 ***
## factor(MentalHealth)5    9178.1     1839.6   4.989 6.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18990 on 7812 degrees of freedom
##   (108 observations deleted due to missingness)
## Multiple R-squared:  0.06327,    Adjusted R-squared:  0.06087 
## F-statistic: 26.38 on 20 and 7812 DF,  p-value: < 2.2e-16

Model Evaluation

Steps for model evealuation Step 1: Check for linearity using a residual vs. fitted plot

plot(model, which = 1)

Step 2: Check for independence using a residual vs. order plot

plot(model, which = 2)

Step 3: Check for homoscedasticity using a scale-location plot

plot(model, which = 3)

Step 4: Check for normality using a Q-Q plot

plot(model, which = 2)

qqnorm(model$residuals)

Split the data into training and testing datasets

library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(123)  # for reproducibility
trainIndex <- createDataPartition(mepsData$TotalHealthExpenditure, p = 0.9, list = FALSE)
train_data <- mepsData[trainIndex, ]
test_data <- mepsData[-trainIndex, ]

Fit a new linear regression model using the training dataset

model_training <- lm(TotalHealthExpenditure ~ Age + factor(Sex) + BMI + factor(RaceEthnicity) + factor(MentalHealth)+factor(FamIncome_Categorical), data = train_data)

Predict on the training dataset

predicted_training <- predict(model_training, newdata = train_data)

Calculate Mean Squared Error (MSE) on the training data

mse_training <- mean((train_data$TotalHealthExpenditure - predicted_training)^2)
print(paste("MSE on training data:", mse_training))
## [1] "MSE on training data: NA"

Create a data frame for two individuals

new_data <- data.frame(
  Age = c(60, 60),
  Sex = c(1,2),
  BMI = c(27, 27),  # Replace with actual values
  RaceEthnicity = c(1, 1),  # Replace with actual values
  MentalHealth = c(2, 2),  # Replace with actual values
  FamIncome_Categorical = c(4, 4)  # Replace with actual values
)

Predict average and confidence intervals

predicted_values <- predict(model_training, newdata = new_data, interval = "confidence")
print(predicted_values)
##        fit      lwr      upr
## 1 5239.717 4266.585 6212.849
## 2 6262.529 5302.963 7222.095

Predict on the testing dataset

predicted_testing <- predict(model_training, newdata = test_data)

Calculate Mean Squared Error (MSE) on the testing data

mse_testing <- mean((test_data$TotalHealthExpenditure - predicted_testing)^2)
print(paste("MSE on testing data:", mse_testing))
## [1] "MSE on testing data: NA"

PART B of the Analysis

Recode the variable to 0 (for 2) and 1 (for 1)

mepsData$FluVaccination <- ifelse(mepsData$FluVaccination == 1, 1, 0)

Create a subset to include only adults (>18 years) with valid answers to flu vaccination status (Yes or No)

adults_subset <- subset(mepsData, Age > 18 & FluVaccination %in% c(1, 0))
head(adults_subset,5)
##   X  Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101              0  26   2             2               1
## 2 2 2290001102              1  25   1             2               1
## 3 3 2290002101              0  33   2             1               1
## 4 4 2290002102              0  39   1             1               1
## 9 9 2290003101              0  36   2             2               1
##   NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1                   2                32000            3                     3
## 2                   2                32000            2                     3
## 3                   2                55000            3                     3
## 4                   2                55000            3                     3
## 9                   2               258083            3                     5
##   FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1                   190.31            2            1            2
## 2                   190.31            2            1            2
## 3                   163.92            3            1            2
## 4                   163.92            3            2            2
## 9                  1013.48            2            1            2
##   TotalHealthExpenditure HasHypertension HasDiabetes  BMI Drinks5Day
## 1                   2368               2           2 21.4         NA
## 2                   2040               2           2 30.6          1
## 3                    173               2           2 28.2         NA
## 4                      0               2           2 28.7          2
## 9                    535               2           2 21.5         NA

Remove observations with NA in any of these variables

adults_subset <- na.omit(adults_subset)

Calculate the proportion of individuals who received a flu vaccine stratified by different variables

prop_by_provider <- (table(adults_subset$FluVaccination, adults_subset$HaveProvider)/length(adults_subset$FluVaccination))*100
prop_by_insurance <- (table(adults_subset$FluVaccination, adults_subset$HealthInsurance)/length(adults_subset$FluVaccination))*100
prop_by_health_status <- (table(adults_subset$FluVaccination, adults_subset$HealthStatus)/length(adults_subset$FluVaccination))*100
prop_by_income <- (table(adults_subset$FluVaccination, adults_subset$FamIncome_Categorical)/length(adults_subset$FluVaccination))*100
prop_by_provider
##    
##             1         2
##   0 35.984704 23.339707
##   1 34.786488  5.889101
prop_by_insurance
##    
##              1          2          3
##   0 38.0369662 12.6067559  8.6806883
##   1 27.9923518 11.7144678  0.9687699
prop_by_health_status
##    
##             1         2         3         4         5
##   0 14.671765 20.701083 17.221160  5.391969  1.338432
##   1  7.801147 13.652008 12.160612  5.634162  1.427661
prop_by_income
##    
##             1         2         3         4         5
##   0  7.469726  2.804334  8.272785 18.763544 22.014022
##   1  3.518164  1.287444  4.359465 10.860421 20.650096

Fit a logistic model

original_model <- glm(FluVaccination ~ Age + Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous, 
                     family = binomial, data = adults_subset)

Model with Age, Sex, and HealthInsurance

model1 <- glm(FluVaccination ~ Age + Sex + HealthInsurance, family = binomial, data = adults_subset)

Model with Age, RaceEthnicity, and FamIncome_Continuous

model2 <- glm(FluVaccination ~ Age + RaceEthnicity + FamIncome_Continuous, family = binomial, data = adults_subset)

Compare AIC values

aic_original <- AIC(original_model)
aic_model1 <- AIC(model1)
aic_model2 <- AIC(model2)

Choose the model with the lowest AIC

if (aic_model1 < aic_model2) {
  selected_model <- model1
} else {
  selected_model <- model2
}

Stepwise variable selection

stepwise_model <- step(original_model, direction = "both")
## Start:  AIC=9390.03
## FluVaccination ~ Age + Sex + RaceEthnicity + HealthInsurance + 
##     FamIncome_Continuous
## 
##                        Df Deviance     AIC
## - Sex                   1   9379.3  9389.3
## <none>                      9378.0  9390.0
## - RaceEthnicity         1   9384.7  9394.7
## - FamIncome_Continuous  1   9455.0  9465.0
## - HealthInsurance       1   9460.8  9470.8
## - Age                   1  10406.2 10416.2
## 
## Step:  AIC=9389.27
## FluVaccination ~ Age + RaceEthnicity + HealthInsurance + FamIncome_Continuous
## 
##                        Df Deviance     AIC
## <none>                      9379.3  9389.3
## + Sex                   1   9378.0  9390.0
## - RaceEthnicity         1   9386.1  9394.1
## - FamIncome_Continuous  1   9456.2  9464.2
## - HealthInsurance       1   9462.0  9470.0
## - Age                   1  10406.8 10414.8

Interaction between Age and Sex

interaction_model <- glm(FluVaccination ~ Age * Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous, 
                         family = binomial, data = adults_subset)

Quadratic transformation of Age

quadratic_age_model <- glm(FluVaccination ~ poly(Age, 2) + Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous, 
                           family = binomial, data = adults_subset)

Compare models based on AIC

aic_original <- AIC(original_model)
aic_selected <- AIC(selected_model)
aic_stepwise <- AIC(stepwise_model)
aic_interaction <- AIC(interaction_model)
aic_quadratic <- AIC(quadratic_age_model)

Choose the model with the lowest AIC

best_model <- selected_model  # Replace with the model that has the lowest AIC

Calculate odds ratios and their 95% confidence intervals

library(broom)
## Warning: package 'broom' was built under R version 4.3.2
odds_ratios <- tidy(model, exponentiate = TRUE)
conf_intervals <- confint(model)

Combine coefficients, odds ratios, and confidence intervals

coefficients_odds_ratios <- data.frame(
  Coefficient = summary(model)$coefficients[, 1],
  OddsRatio = odds_ratios$estimate,
  CI_Lower = conf_intervals[, 1],
  CI_Upper = conf_intervals[, 2]
)

coefficients_odds_ratios
##                        Coefficient     OddsRatio   CI_Lower  CI_Upper
## (Intercept)             -1558.2441  0.000000e+00 -5594.0444  2477.556
## factor(Age_Category)1    1416.3596           Inf -1273.9625  4106.682
## factor(Age_Category)2    5075.2518           Inf  2374.5665  7775.937
## factor(Age_Category)3    9923.4444           Inf  7198.5744 12648.315
## factor(Age_Category)4   13338.0056           Inf 10016.7113 16659.300
## factor(Sex)2            12583.2438           Inf -8948.8880 34115.375
## factor(BMI_2)1            885.4291           Inf -6562.2934  8333.152
## factor(BMI_2)2          -1464.6815  0.000000e+00 -6854.1903  3924.827
## factor(BMI_2)3           -535.6030 2.457909e-233 -3688.8431  2617.637
## factor(BMI_2)4           -541.6726 5.683044e-236 -3657.3109  2573.966
## factor(BMI_2)5           -700.6970 4.910758e-305 -3881.0505  2479.656
## factor(BMI_2)6           1362.6880           Inf -2026.5713  4751.947
## factor(BMI_2)7           2033.1480           Inf -1614.9778  5681.274
## factor(RaceEthnicity)2   2952.3316           Inf  1821.0701  4083.593
## factor(RaceEthnicity)3   1670.1791           Inf   146.2340  3194.124
## factor(RaceEthnicity)4   1118.2435           Inf -1077.2518  3313.739
## factor(RaceEthnicity)5    204.5188  6.628214e+88 -2422.1356  2831.173
## factor(MentalHealth)2     817.2963           Inf  -225.6703  1860.263
## factor(MentalHealth)3    2483.4385           Inf  1370.6028  3596.274
## factor(MentalHealth)4    7739.9157           Inf  6014.6110  9465.220
## factor(MentalHealth)5    9178.0917           Inf  5571.9036 12784.280

Create a data frame for 4 individuals

Predict probabilities and 95% confidence intervals

predicted_probs <- predict(model, newdata = mepsData_model, type = "response", se.fit = TRUE)

Choose a prediction threshold

threshold <- 0.5

Predicted probabilities

predicted_y <- ifelse(predicted_probs$fit > threshold, 1, 0)

Actual outcomes (flu vaccination status)

actual_y <- ifelse(adults_subset$FluVaccination == 1, 1, 0)

Calculate prediction accuracy

accuracy <- mean(predicted_y == actual_y)
## Warning in predicted_y == actual_y: longer object length is not a multiple of
## shorter object length
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
improved_model = quadratic_age_model

Calculate ROC curves for both models

roc_improved <- roc(adults_subset$FluVaccination, predict(improved_model, type="response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_original <- roc(adults_subset$FluVaccination, predict(original_model, type = "response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Plot the ROC curves

plot(roc_improved, col = "blue", main = "ROC Curves")
lines(roc_original, col = "red")
legend("bottomright", legend = c("Improved Model", "Original Model"), col = c("blue", "red"))

library(coefplot)
## Warning: package 'coefplot' was built under R version 4.3.3

Assuming ‘model’ is your logistic regression model

coefplot(original_model)

## Assuming ‘model’ is your logistic regression model

plot(original_model)

Calculate AUC for both models

auc_improved <- auc(roc_improved)
auc_original <- auc(roc_original)

cat("AUC for Improved Model:", auc_improved, "\n")
## AUC for Improved Model: 0.729134
cat("AUC for Original Model:", auc_original, "\n")
## AUC for Original Model: 0.7253676