library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
mepsData <-read.csv("C:/Users/DELL/Desktop/Naomi_work/Assignment 4/mepsData.csv")
str(mepsData)
## 'data.frame': 30461 obs. of 20 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Person_ID : num 2.29e+09 2.29e+09 2.29e+09 2.29e+09 2.29e+09 ...
## $ FluVaccination : int 2 1 2 2 NA NA NA NA 2 NA ...
## $ Age : int 26 25 33 39 11 8 4 2 36 36 ...
## $ Sex : int 2 1 2 1 1 1 1 1 2 1 ...
## $ RaceEthnicity : int 2 2 1 1 1 1 1 1 2 2 ...
## $ HealthInsurance : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NotAffordHealthCare : int 2 2 2 2 2 2 2 2 2 2 ...
## $ FamIncome_Continuous : num 32000 32000 55000 55000 55000 ...
## $ MentalHealth : int 3 2 3 3 3 3 3 3 3 3 ...
## $ FamIncome_Categorical : int 3 3 3 3 3 3 3 3 5 5 ...
## $ FamIncome_PercentPoverty: num 190 190 164 164 164 ...
## $ HealthStatus : int 2 2 3 3 3 3 3 3 2 3 ...
## $ HaveProvider : int 1 1 1 2 1 1 1 1 1 1 ...
## $ CensusRegion : int 2 2 2 2 2 2 2 2 2 2 ...
## $ TotalHealthExpenditure : int 2368 2040 173 0 103 0 0 63 535 7023 ...
## $ HasHypertension : int 2 2 2 2 NA NA NA NA 2 2 ...
## $ HasDiabetes : int 2 2 2 2 2 2 2 2 2 2 ...
## $ BMI : num 21.4 30.6 28.2 28.7 NA NA NA NA 21.5 NA ...
## $ Drinks5Day : int NA 1 NA 2 NA NA NA NA NA NA ...
mepsData_1 <- mepsData %>%
mutate(BMI_Category = case_when(
BMI < 16.0 ~ "Underweight-Severe",
BMI >= 16.0 & BMI <= 16.9 ~ "Underweight-Moderate",
BMI >= 17.0 & BMI <= 18.4 ~ "Undereight-Mild",
BMI >= 18.5 & BMI <= 24.9 ~ "Normal",
BMI >= 25.0 & BMI <= 29.9 ~ "Overweight",
BMI >= 30.0 & BMI <= 34.9 ~ "Obese-i",
BMI >= 35.0 & BMI <= 39.9 ~ "Obese-ii",
TRUE ~ "Obese-iii" # For any other value of BMI
))
head(mepsData_1,5)
## X Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101 2 26 2 2 1
## 2 2 2290001102 1 25 1 2 1
## 3 3 2290002101 2 33 2 1 1
## 4 4 2290002102 2 39 1 1 1
## 5 5 2290002103 NA 11 1 1 1
## NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1 2 32000 3 3
## 2 2 32000 2 3
## 3 2 55000 3 3
## 4 2 55000 3 3
## 5 2 55000 3 3
## FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1 190.31 2 1 2
## 2 190.31 2 1 2
## 3 163.92 3 1 2
## 4 163.92 3 2 2
## 5 163.92 3 1 2
## TotalHealthExpenditure HasHypertension HasDiabetes BMI Drinks5Day
## 1 2368 2 2 21.4 NA
## 2 2040 2 2 30.6 1
## 3 173 2 2 28.2 NA
## 4 0 2 2 28.7 2
## 5 103 NA 2 NA NA
## BMI_Category
## 1 Normal
## 2 Obese-i
## 3 Overweight
## 4 Overweight
## 5 Obese-iii
Two_bmi <- mepsData_1%>%group_by(BMI_Category)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
Twentyfive_percentile = quantile(TotalHealthExpenditure, 0.25),
Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))
Two_income <- mepsData_1 %>%group_by(FamIncome_Categorical)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
Twenty_percentile = quantile(TotalHealthExpenditure, 0.25),
Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))
Two_healthstatus <- mepsData_1 %>%group_by(HealthStatus)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
Twenty_percentile = quantile(TotalHealthExpenditure, 0.25),
Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))
Two_healthstatus <- as.data.frame(Two_healthstatus)
Two_healthstatus%>%ggplot(aes(x=HealthStatus,y=average))+geom_line()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Two_income%>%ggplot(aes(x=FamIncome_Categorical, y=average))+geom_line()
Two_bmi%>%ggplot()
```{ r analysis}
mepsData_1%>%ggplot(aes(y=TotalHealthExpenditure, fill=as.factor(HealthStatus)))+geom_boxplot() #Question three mepsData_1%>%na.omit()%>%group_by(HealthStatus)%>%summarise(count=n()) mepsData_1%>%na.omit()%>%group_by(MentalHealth)%>%summarise(count=n())
## Check preliminary evaluation
```{ r check}
summary(mepsData_1$HealthStatus)
summary(mepsData_1$MentalHealth)
chi_sq_result <- chisq.test(mepsData_1$HealthStatus, mepsData_1$MentalHealth)
print(chi_sq_result)
##
## Pearson's Chi-squared test
##
## data: mepsData_1$HealthStatus and mepsData_1$MentalHealth
## X-squared = 25532, df = 16, p-value < 2.2e-16
t_test_result <- t.test(mepsData_1$TotalHealthExpenditure ~ mepsData_1$HasHypertension)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: mepsData_1$TotalHealthExpenditure by mepsData_1$HasHypertension
## t = 23.992, df = 11499, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## 6608.840 7784.801
## sample estimates:
## mean in group 1 mean in group 2
## 12050.802 4853.982
mepsData_1%>%ggplot(aes(x = as.factor(CensusRegion), y = TotalHealthExpenditure)) +
geom_boxplot()
anova_result <- aov(TotalHealthExpenditure ~ CensusRegion, data = mepsData_1)
print(summary(anova_result))
## Df Sum Sq Mean Sq F value Pr(>F)
## CensusRegion 1 1.324e+10 1.324e+10 42.96 5.69e-11 ***
## Residuals 30084 9.274e+12 3.083e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 375 observations deleted due to missingness
mepsData_1%>% ggplot(aes(x = Age, y = log(TotalHealthExpenditure))) +
geom_point() + geom_smooth(method="lm", se =FALSE, color ="red")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4822 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 375 rows containing missing values or values outside the scale range
## (`geom_point()`).
mepsData_1%>% ggplot(aes(x = sqrt(BMI), y = sqrt(TotalHealthExpenditure))) +
geom_point() + geom_smooth(method="lm", se =FALSE, color ="red")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 12209 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12209 rows containing missing values or values outside the scale range
## (`geom_point()`).
correlation_result <- cor.test(mepsData_1$Age, mepsData_1$TotalHealthExpenditure)
print(correlation_result)
##
## Pearson's product-moment correlation
##
## data: mepsData_1$Age and mepsData_1$TotalHealthExpenditure
## t = 38.333, df = 30084, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2049986 0.2265458
## sample estimates:
## cor
## 0.2157985
numerical_data <- mepsData_1[, c("Age", "BMI", "FamIncome_Continuous","FamIncome_PercentPoverty", "TotalHealthExpenditure" )]%>%na.omit()
head(mepsData)
## X Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101 2 26 2 2 1
## 2 2 2290001102 1 25 1 2 1
## 3 3 2290002101 2 33 2 1 1
## 4 4 2290002102 2 39 1 1 1
## 5 5 2290002103 NA 11 1 1 1
## 6 6 2290002104 NA 8 1 1 1
## NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1 2 32000 3 3
## 2 2 32000 2 3
## 3 2 55000 3 3
## 4 2 55000 3 3
## 5 2 55000 3 3
## 6 2 55000 3 3
## FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1 190.31 2 1 2
## 2 190.31 2 1 2
## 3 163.92 3 1 2
## 4 163.92 3 2 2
## 5 163.92 3 1 2
## 6 163.92 3 1 2
## TotalHealthExpenditure HasHypertension HasDiabetes BMI Drinks5Day
## 1 2368 2 2 21.4 NA
## 2 2040 2 2 30.6 1
## 3 173 2 2 28.2 NA
## 4 0 2 2 28.7 2
## 5 103 NA 2 NA NA
## 6 0 NA 2 NA NA
correlation_matrix <- cor(numerical_data)
print(correlation_matrix)
## Age BMI FamIncome_Continuous
## Age 1.0000000000 -0.0002200695 -0.04774148
## BMI -0.0002200695 1.0000000000 -0.05935660
## FamIncome_Continuous -0.0477414774 -0.0593565997 1.00000000
## FamIncome_PercentPoverty 0.0830837604 -0.0558400269 0.91169200
## TotalHealthExpenditure 0.1922121028 0.0534125709 -0.02371020
## FamIncome_PercentPoverty TotalHealthExpenditure
## Age 0.083083760 0.192212103
## BMI -0.055840027 0.053412571
## FamIncome_Continuous 0.911691997 -0.023710200
## FamIncome_PercentPoverty 1.000000000 0.006211844
## TotalHealthExpenditure 0.006211844 1.000000000
plot(mepsData_1$TotalHealthExpenditure, mepsData_1$Sex)
cor(mepsData_1$TotalHealthExpenditure, mepsData_1$Sex)
## [1] 0.02617498
mepsData_1 <- mepsData_1 %>% na.omit()
summary(mepsData_1$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 34.00 50.00 49.51 64.00 85.00
mepsData_model <- mepsData_1 %>%
mutate(Age_Category = case_when(
Age < 20 ~ 0,
Age >= 21 & Age <= 40 ~ 1,
Age >= 41 & Age <= 60 ~ 2,
Age >= 61 & Age <= 80 ~ 3,
Age >= 81 & Age <= 100 ~ 4
), BMI_2 = case_when(
BMI < 16.0 ~ 0,
BMI >= 16.0 & BMI <= 16.9 ~ 1,
BMI >= 17.0 & BMI <= 18.4 ~ 2,
BMI >= 18.5 & BMI <= 24.9 ~ 3,
BMI >= 25.0 & BMI <= 29.9 ~ 4,
BMI >= 30.0 & BMI <= 34.9 ~ 5,
BMI >= 35.0 & BMI <= 39.9 ~ 6,
TRUE ~ 7 # For any other value of BMI
))
model <- lm(TotalHealthExpenditure ~ factor(Age_Category) + factor(Sex) + factor(BMI_2) + factor(RaceEthnicity) + factor(MentalHealth), data = mepsData_model)
summary(model)
##
## Call:
## lm(formula = TotalHealthExpenditure ~ factor(Age_Category) +
## factor(Sex) + factor(BMI_2) + factor(RaceEthnicity) + factor(MentalHealth),
## data = mepsData_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21792 -6430 -2975 172 546948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1558.2 2058.8 -0.757 0.449151
## factor(Age_Category)1 1416.4 1372.4 1.032 0.302099
## factor(Age_Category)2 5075.3 1377.7 3.684 0.000231 ***
## factor(Age_Category)3 9923.4 1390.0 7.139 1.03e-12 ***
## factor(Age_Category)4 13338.0 1694.3 7.872 3.95e-15 ***
## factor(Sex)2 12583.2 10984.3 1.146 0.252009
## factor(BMI_2)1 885.4 3799.3 0.233 0.815730
## factor(BMI_2)2 -1464.7 2749.4 -0.533 0.594234
## factor(BMI_2)3 -535.6 1608.6 -0.333 0.739168
## factor(BMI_2)4 -541.7 1589.4 -0.341 0.733260
## factor(BMI_2)5 -700.7 1622.4 -0.432 0.665835
## factor(BMI_2)6 1362.7 1729.0 0.788 0.430635
## factor(BMI_2)7 2033.1 1861.0 1.092 0.274655
## factor(RaceEthnicity)2 2952.3 577.1 5.116 3.20e-07 ***
## factor(RaceEthnicity)3 1670.2 777.4 2.148 0.031715 *
## factor(RaceEthnicity)4 1118.2 1120.0 0.998 0.318100
## factor(RaceEthnicity)5 204.5 1339.9 0.153 0.878692
## factor(MentalHealth)2 817.3 532.1 1.536 0.124550
## factor(MentalHealth)3 2483.4 567.7 4.375 1.23e-05 ***
## factor(MentalHealth)4 7739.9 880.1 8.794 < 2e-16 ***
## factor(MentalHealth)5 9178.1 1839.6 4.989 6.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18990 on 7812 degrees of freedom
## (108 observations deleted due to missingness)
## Multiple R-squared: 0.06327, Adjusted R-squared: 0.06087
## F-statistic: 26.38 on 20 and 7812 DF, p-value: < 2.2e-16
Steps for model evealuation Step 1: Check for linearity using a residual vs. fitted plot
plot(model, which = 1)
plot(model, which = 2)
plot(model, which = 3)
plot(model, which = 2)
qqnorm(model$residuals)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123) # for reproducibility
trainIndex <- createDataPartition(mepsData$TotalHealthExpenditure, p = 0.9, list = FALSE)
train_data <- mepsData[trainIndex, ]
test_data <- mepsData[-trainIndex, ]
model_training <- lm(TotalHealthExpenditure ~ Age + factor(Sex) + BMI + factor(RaceEthnicity) + factor(MentalHealth)+factor(FamIncome_Categorical), data = train_data)
predicted_training <- predict(model_training, newdata = train_data)
mse_training <- mean((train_data$TotalHealthExpenditure - predicted_training)^2)
print(paste("MSE on training data:", mse_training))
## [1] "MSE on training data: NA"
new_data <- data.frame(
Age = c(60, 60),
Sex = c(1,2),
BMI = c(27, 27), # Replace with actual values
RaceEthnicity = c(1, 1), # Replace with actual values
MentalHealth = c(2, 2), # Replace with actual values
FamIncome_Categorical = c(4, 4) # Replace with actual values
)
predicted_values <- predict(model_training, newdata = new_data, interval = "confidence")
print(predicted_values)
## fit lwr upr
## 1 5239.717 4266.585 6212.849
## 2 6262.529 5302.963 7222.095
predicted_testing <- predict(model_training, newdata = test_data)
mse_testing <- mean((test_data$TotalHealthExpenditure - predicted_testing)^2)
print(paste("MSE on testing data:", mse_testing))
## [1] "MSE on testing data: NA"
mepsData$FluVaccination <- ifelse(mepsData$FluVaccination == 1, 1, 0)
adults_subset <- subset(mepsData, Age > 18 & FluVaccination %in% c(1, 0))
head(adults_subset,5)
## X Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101 0 26 2 2 1
## 2 2 2290001102 1 25 1 2 1
## 3 3 2290002101 0 33 2 1 1
## 4 4 2290002102 0 39 1 1 1
## 9 9 2290003101 0 36 2 2 1
## NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1 2 32000 3 3
## 2 2 32000 2 3
## 3 2 55000 3 3
## 4 2 55000 3 3
## 9 2 258083 3 5
## FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1 190.31 2 1 2
## 2 190.31 2 1 2
## 3 163.92 3 1 2
## 4 163.92 3 2 2
## 9 1013.48 2 1 2
## TotalHealthExpenditure HasHypertension HasDiabetes BMI Drinks5Day
## 1 2368 2 2 21.4 NA
## 2 2040 2 2 30.6 1
## 3 173 2 2 28.2 NA
## 4 0 2 2 28.7 2
## 9 535 2 2 21.5 NA
adults_subset <- na.omit(adults_subset)
prop_by_provider <- (table(adults_subset$FluVaccination, adults_subset$HaveProvider)/length(adults_subset$FluVaccination))*100
prop_by_insurance <- (table(adults_subset$FluVaccination, adults_subset$HealthInsurance)/length(adults_subset$FluVaccination))*100
prop_by_health_status <- (table(adults_subset$FluVaccination, adults_subset$HealthStatus)/length(adults_subset$FluVaccination))*100
prop_by_income <- (table(adults_subset$FluVaccination, adults_subset$FamIncome_Categorical)/length(adults_subset$FluVaccination))*100
prop_by_provider
##
## 1 2
## 0 35.984704 23.339707
## 1 34.786488 5.889101
prop_by_insurance
##
## 1 2 3
## 0 38.0369662 12.6067559 8.6806883
## 1 27.9923518 11.7144678 0.9687699
prop_by_health_status
##
## 1 2 3 4 5
## 0 14.671765 20.701083 17.221160 5.391969 1.338432
## 1 7.801147 13.652008 12.160612 5.634162 1.427661
prop_by_income
##
## 1 2 3 4 5
## 0 7.469726 2.804334 8.272785 18.763544 22.014022
## 1 3.518164 1.287444 4.359465 10.860421 20.650096
original_model <- glm(FluVaccination ~ Age + Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous,
family = binomial, data = adults_subset)
model1 <- glm(FluVaccination ~ Age + Sex + HealthInsurance, family = binomial, data = adults_subset)
model2 <- glm(FluVaccination ~ Age + RaceEthnicity + FamIncome_Continuous, family = binomial, data = adults_subset)
aic_original <- AIC(original_model)
aic_model1 <- AIC(model1)
aic_model2 <- AIC(model2)
if (aic_model1 < aic_model2) {
selected_model <- model1
} else {
selected_model <- model2
}
stepwise_model <- step(original_model, direction = "both")
## Start: AIC=9390.03
## FluVaccination ~ Age + Sex + RaceEthnicity + HealthInsurance +
## FamIncome_Continuous
##
## Df Deviance AIC
## - Sex 1 9379.3 9389.3
## <none> 9378.0 9390.0
## - RaceEthnicity 1 9384.7 9394.7
## - FamIncome_Continuous 1 9455.0 9465.0
## - HealthInsurance 1 9460.8 9470.8
## - Age 1 10406.2 10416.2
##
## Step: AIC=9389.27
## FluVaccination ~ Age + RaceEthnicity + HealthInsurance + FamIncome_Continuous
##
## Df Deviance AIC
## <none> 9379.3 9389.3
## + Sex 1 9378.0 9390.0
## - RaceEthnicity 1 9386.1 9394.1
## - FamIncome_Continuous 1 9456.2 9464.2
## - HealthInsurance 1 9462.0 9470.0
## - Age 1 10406.8 10414.8
interaction_model <- glm(FluVaccination ~ Age * Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous,
family = binomial, data = adults_subset)
quadratic_age_model <- glm(FluVaccination ~ poly(Age, 2) + Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous,
family = binomial, data = adults_subset)
aic_original <- AIC(original_model)
aic_selected <- AIC(selected_model)
aic_stepwise <- AIC(stepwise_model)
aic_interaction <- AIC(interaction_model)
aic_quadratic <- AIC(quadratic_age_model)
best_model <- selected_model # Replace with the model that has the lowest AIC
library(broom)
## Warning: package 'broom' was built under R version 4.3.2
odds_ratios <- tidy(model, exponentiate = TRUE)
conf_intervals <- confint(model)
coefficients_odds_ratios <- data.frame(
Coefficient = summary(model)$coefficients[, 1],
OddsRatio = odds_ratios$estimate,
CI_Lower = conf_intervals[, 1],
CI_Upper = conf_intervals[, 2]
)
coefficients_odds_ratios
## Coefficient OddsRatio CI_Lower CI_Upper
## (Intercept) -1558.2441 0.000000e+00 -5594.0444 2477.556
## factor(Age_Category)1 1416.3596 Inf -1273.9625 4106.682
## factor(Age_Category)2 5075.2518 Inf 2374.5665 7775.937
## factor(Age_Category)3 9923.4444 Inf 7198.5744 12648.315
## factor(Age_Category)4 13338.0056 Inf 10016.7113 16659.300
## factor(Sex)2 12583.2438 Inf -8948.8880 34115.375
## factor(BMI_2)1 885.4291 Inf -6562.2934 8333.152
## factor(BMI_2)2 -1464.6815 0.000000e+00 -6854.1903 3924.827
## factor(BMI_2)3 -535.6030 2.457909e-233 -3688.8431 2617.637
## factor(BMI_2)4 -541.6726 5.683044e-236 -3657.3109 2573.966
## factor(BMI_2)5 -700.6970 4.910758e-305 -3881.0505 2479.656
## factor(BMI_2)6 1362.6880 Inf -2026.5713 4751.947
## factor(BMI_2)7 2033.1480 Inf -1614.9778 5681.274
## factor(RaceEthnicity)2 2952.3316 Inf 1821.0701 4083.593
## factor(RaceEthnicity)3 1670.1791 Inf 146.2340 3194.124
## factor(RaceEthnicity)4 1118.2435 Inf -1077.2518 3313.739
## factor(RaceEthnicity)5 204.5188 6.628214e+88 -2422.1356 2831.173
## factor(MentalHealth)2 817.2963 Inf -225.6703 1860.263
## factor(MentalHealth)3 2483.4385 Inf 1370.6028 3596.274
## factor(MentalHealth)4 7739.9157 Inf 6014.6110 9465.220
## factor(MentalHealth)5 9178.0917 Inf 5571.9036 12784.280
predicted_probs <- predict(model, newdata = mepsData_model, type = "response", se.fit = TRUE)
threshold <- 0.5
predicted_y <- ifelse(predicted_probs$fit > threshold, 1, 0)
actual_y <- ifelse(adults_subset$FluVaccination == 1, 1, 0)
accuracy <- mean(predicted_y == actual_y)
## Warning in predicted_y == actual_y: longer object length is not a multiple of
## shorter object length
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
improved_model = quadratic_age_model
roc_improved <- roc(adults_subset$FluVaccination, predict(improved_model, type="response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_original <- roc(adults_subset$FluVaccination, predict(original_model, type = "response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_improved, col = "blue", main = "ROC Curves")
lines(roc_original, col = "red")
legend("bottomright", legend = c("Improved Model", "Original Model"), col = c("blue", "red"))
library(coefplot)
## Warning: package 'coefplot' was built under R version 4.3.3
coefplot(original_model)
##
Assuming ‘model’ is your logistic regression model
plot(original_model)
auc_improved <- auc(roc_improved)
auc_original <- auc(roc_original)
cat("AUC for Improved Model:", auc_improved, "\n")
## AUC for Improved Model: 0.729134
cat("AUC for Original Model:", auc_original, "\n")
## AUC for Original Model: 0.7253676