library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
mepsData <-read.csv("C:/Users/DELL/Desktop/Naomi_work/Assignment 4/mepsData.csv")
str(mepsData)

## 'data.frame':    30461 obs. of  20 variables:
##  $ X                       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Person_ID               : num  2.29e+09 2.29e+09 2.29e+09 2.29e+09 2.29e+09 ...
##  $ FluVaccination          : int  2 1 2 2 NA NA NA NA 2 NA ...
##  $ Age                     : int  26 25 33 39 11 8 4 2 36 36 ...
##  $ Sex                     : int  2 1 2 1 1 1 1 1 2 1 ...
##  $ RaceEthnicity           : int  2 2 1 1 1 1 1 1 2 2 ...
##  $ HealthInsurance         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NotAffordHealthCare     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ FamIncome_Continuous    : num  32000 32000 55000 55000 55000 ...
##  $ MentalHealth            : int  3 2 3 3 3 3 3 3 3 3 ...
##  $ FamIncome_Categorical   : int  3 3 3 3 3 3 3 3 5 5 ...
##  $ FamIncome_PercentPoverty: num  190 190 164 164 164 ...
##  $ HealthStatus            : int  2 2 3 3 3 3 3 3 2 3 ...
##  $ HaveProvider            : int  1 1 1 2 1 1 1 1 1 1 ...
##  $ CensusRegion            : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ TotalHealthExpenditure  : int  2368 2040 173 0 103 0 0 63 535 7023 ...
##  $ HasHypertension         : int  2 2 2 2 NA NA NA NA 2 2 ...
##  $ HasDiabetes             : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ BMI                     : num  21.4 30.6 28.2 28.7 NA NA NA NA 21.5 NA ...
##  $ Drinks5Day              : int  NA 1 NA 2 NA NA NA NA NA NA ...

Preliminary Analysis

Recoding

mepsData_1 <- mepsData %>% 
  mutate(BMI_Category = case_when(
    BMI < 16.0 ~ "Underweight-Severe",
    BMI >= 16.0 & BMI <= 16.9 ~ "Underweight-Moderate",
    BMI >= 17.0 & BMI <= 18.4 ~ "Undereight-Mild",
    BMI >= 18.5 & BMI <= 24.9 ~ "Normal",
    BMI >= 25.0 & BMI <= 29.9 ~ "Overweight",
    BMI >= 30.0 & BMI <= 34.9 ~ "Obese-i",
    BMI >= 35.0 & BMI <= 39.9 ~ "Obese-ii",
    TRUE ~ "Obese-iii"  # For any other value of BMI
  ))

Exploratory Analysis

head(mepsData_1,5)

##   X  Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101              2  26   2             2               1
## 2 2 2290001102              1  25   1             2               1
## 3 3 2290002101              2  33   2             1               1
## 4 4 2290002102              2  39   1             1               1
## 5 5 2290002103             NA  11   1             1               1
##   NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1                   2                32000            3                     3
## 2                   2                32000            2                     3
## 3                   2                55000            3                     3
## 4                   2                55000            3                     3
## 5                   2                55000            3                     3
##   FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1                   190.31            2            1            2
## 2                   190.31            2            1            2
## 3                   163.92            3            1            2
## 4                   163.92            3            2            2
## 5                   163.92            3            1            2
##   TotalHealthExpenditure HasHypertension HasDiabetes  BMI Drinks5Day
## 1                   2368               2           2 21.4         NA
## 2                   2040               2           2 30.6          1
## 3                    173               2           2 28.2         NA
## 4                      0               2           2 28.7          2
## 5                    103              NA           2   NA         NA
##   BMI_Category
## 1       Normal
## 2      Obese-i
## 3   Overweight
## 4   Overweight
## 5    Obese-iii

Two_bmi <- mepsData_1%>%group_by(BMI_Category)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
                      Twentyfive_percentile = quantile(TotalHealthExpenditure, 0.25),
                    Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))

Two_income <- mepsData_1 %>%group_by(FamIncome_Categorical)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
                                                              Twenty_percentile = quantile(TotalHealthExpenditure, 0.25),
                                                              Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))


Two_healthstatus <- mepsData_1 %>%group_by(HealthStatus)%>%summarise(average =mean(TotalHealthExpenditure),std = sd(TotalHealthExpenditure),
                                                              Twenty_percentile = quantile(TotalHealthExpenditure, 0.25),
                                                              Seventyfive_percentile=quantile(TotalHealthExpenditure, 0.75))

Two_healthstatus <- as.data.frame(Two_healthstatus)
Two_healthstatus%>%ggplot(aes(x=HealthStatus,y=average))+geom_line()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Two_income%>%ggplot(aes(x=FamIncome_Categorical, y=average))+geom_line()

Two_bmi%>%ggplot()

Descritpive Analysis

```{ r analysis}

mepsData_1%>%ggplot(aes(y=TotalHealthExpenditure, fill=as.factor(HealthStatus)))+geom_boxplot() #Question three mepsData_1%>%na.omit()%>%group_by(HealthStatus)%>%summarise(count=n()) mepsData_1%>%na.omit()%>%group_by(MentalHealth)%>%summarise(count=n())


## Check preliminary evaluation
```{ r check}

summary(mepsData_1$HealthStatus)
summary(mepsData_1$MentalHealth)

Perform a chi-squared test for independence

chi_sq_result <- chisq.test(mepsData_1$HealthStatus, mepsData_1$MentalHealth)
print(chi_sq_result)

## 
##  Pearson's Chi-squared test
## 
## data:  mepsData_1$HealthStatus and mepsData_1$MentalHealth
## X-squared = 25532, df = 16, p-value < 2.2e-16

Hypothesis b: Medical Expenditure varies by whether the person has high blood pressure (Has Hypertension)

Perform a t-test or Mann-Whitney U test

t_test_result <- t.test(mepsData_1$TotalHealthExpenditure ~ mepsData_1$HasHypertension)
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  mepsData_1$TotalHealthExpenditure by mepsData_1$HasHypertension
## t = 23.992, df = 11499, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  6608.840 7784.801
## sample estimates:
## mean in group 1 mean in group 2 
##       12050.802        4853.982

Hypothesis c: Medical Expenditure varies by Census Region

Check preliminary evaluation

Create a box plot

mepsData_1%>%ggplot(aes(x = as.factor(CensusRegion), y = TotalHealthExpenditure)) +
  geom_boxplot()

Perform an ANOVA or Kruskal-Wallis test

anova_result <- aov(TotalHealthExpenditure ~ CensusRegion, data = mepsData_1)
print(summary(anova_result))

##                 Df    Sum Sq   Mean Sq F value   Pr(>F)    
## CensusRegion     1 1.324e+10 1.324e+10   42.96 5.69e-11 ***
## Residuals    30084 9.274e+12 3.083e+08                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 375 observations deleted due to missingness

Hypothesis d: Medical Expenditure varies by Age

Create a scatterplot

mepsData_1%>% ggplot(aes(x = Age, y = log(TotalHealthExpenditure))) +
  geom_point() + geom_smooth(method="lm", se =FALSE, color ="red")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 4822 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 375 rows containing missing values or values outside the scale range
## (`geom_point()`).

mepsData_1%>% ggplot(aes(x = sqrt(BMI), y = sqrt(TotalHealthExpenditure))) +
  geom_point() + geom_smooth(method="lm", se =FALSE, color ="red")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 12209 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 12209 rows containing missing values or values outside the scale range
## (`geom_point()`).

Perform a correlation analysis

correlation_result <- cor.test(mepsData_1$Age, mepsData_1$TotalHealthExpenditure)
print(correlation_result)

## 
##  Pearson's product-moment correlation
## 
## data:  mepsData_1$Age and mepsData_1$TotalHealthExpenditure
## t = 38.333, df = 30084, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2049986 0.2265458
## sample estimates:
##       cor 
## 0.2157985

Create a correlation matrix for numerical variables

numerical_data <- mepsData_1[, c("Age", "BMI", "FamIncome_Continuous","FamIncome_PercentPoverty", "TotalHealthExpenditure" )]%>%na.omit()
head(mepsData)

##   X  Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101              2  26   2             2               1
## 2 2 2290001102              1  25   1             2               1
## 3 3 2290002101              2  33   2             1               1
## 4 4 2290002102              2  39   1             1               1
## 5 5 2290002103             NA  11   1             1               1
## 6 6 2290002104             NA   8   1             1               1
##   NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1                   2                32000            3                     3
## 2                   2                32000            2                     3
## 3                   2                55000            3                     3
## 4                   2                55000            3                     3
## 5                   2                55000            3                     3
## 6                   2                55000            3                     3
##   FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1                   190.31            2            1            2
## 2                   190.31            2            1            2
## 3                   163.92            3            1            2
## 4                   163.92            3            2            2
## 5                   163.92            3            1            2
## 6                   163.92            3            1            2
##   TotalHealthExpenditure HasHypertension HasDiabetes  BMI Drinks5Day
## 1                   2368               2           2 21.4         NA
## 2                   2040               2           2 30.6          1
## 3                    173               2           2 28.2         NA
## 4                      0               2           2 28.7          2
## 5                    103              NA           2   NA         NA
## 6                      0              NA           2   NA         NA

correlation_matrix <- cor(numerical_data)
print(correlation_matrix)

##                                    Age           BMI FamIncome_Continuous
## Age                       1.0000000000 -0.0002200695          -0.04774148
## BMI                      -0.0002200695  1.0000000000          -0.05935660
## FamIncome_Continuous     -0.0477414774 -0.0593565997           1.00000000
## FamIncome_PercentPoverty  0.0830837604 -0.0558400269           0.91169200
## TotalHealthExpenditure    0.1922121028  0.0534125709          -0.02371020
##                          FamIncome_PercentPoverty TotalHealthExpenditure
## Age                                   0.083083760            0.192212103
## BMI                                  -0.055840027            0.053412571
## FamIncome_Continuous                  0.911691997           -0.023710200
## FamIncome_PercentPoverty              1.000000000            0.006211844
## TotalHealthExpenditure                0.006211844            1.000000000

plot(mepsData_1$TotalHealthExpenditure, mepsData_1$Sex)

cor(mepsData_1$TotalHealthExpenditure, mepsData_1$Sex)

## [1] 0.02617498

mepsData_1 <- mepsData_1 %>% na.omit()

summary(mepsData_1$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   34.00   50.00   49.51   64.00   85.00

Fit a linear regression model

mepsData_model <- mepsData_1 %>% 
  mutate(Age_Category = case_when(
    Age < 20 ~ 0,
    Age >= 21 & Age <= 40 ~ 1,
    Age >= 41 & Age <= 60 ~ 2,
    Age >= 61 & Age <= 80 ~ 3,
    Age >= 81 & Age <= 100 ~ 4
  ), BMI_2 = case_when(
      BMI < 16.0 ~ 0,
      BMI >= 16.0 & BMI <= 16.9 ~ 1,
      BMI >= 17.0 & BMI <= 18.4 ~ 2,
      BMI >= 18.5 & BMI <= 24.9 ~ 3,
      BMI >= 25.0 & BMI <= 29.9 ~ 4,
      BMI >= 30.0 & BMI <= 34.9 ~ 5,
      BMI >= 35.0 & BMI <= 39.9 ~ 6,
      TRUE ~ 7  # For any other value of BMI
    ))
model <- lm(TotalHealthExpenditure ~ factor(Age_Category) + factor(Sex) + factor(BMI_2) + factor(RaceEthnicity) + factor(MentalHealth), data = mepsData_model)

Summarize the model

summary(model)

## 
## Call:
## lm(formula = TotalHealthExpenditure ~ factor(Age_Category) + 
##     factor(Sex) + factor(BMI_2) + factor(RaceEthnicity) + factor(MentalHealth), 
##     data = mepsData_model)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21792  -6430  -2975    172 546948 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1558.2     2058.8  -0.757 0.449151    
## factor(Age_Category)1    1416.4     1372.4   1.032 0.302099    
## factor(Age_Category)2    5075.3     1377.7   3.684 0.000231 ***
## factor(Age_Category)3    9923.4     1390.0   7.139 1.03e-12 ***
## factor(Age_Category)4   13338.0     1694.3   7.872 3.95e-15 ***
## factor(Sex)2            12583.2    10984.3   1.146 0.252009    
## factor(BMI_2)1            885.4     3799.3   0.233 0.815730    
## factor(BMI_2)2          -1464.7     2749.4  -0.533 0.594234    
## factor(BMI_2)3           -535.6     1608.6  -0.333 0.739168    
## factor(BMI_2)4           -541.7     1589.4  -0.341 0.733260    
## factor(BMI_2)5           -700.7     1622.4  -0.432 0.665835    
## factor(BMI_2)6           1362.7     1729.0   0.788 0.430635    
## factor(BMI_2)7           2033.1     1861.0   1.092 0.274655    
## factor(RaceEthnicity)2   2952.3      577.1   5.116 3.20e-07 ***
## factor(RaceEthnicity)3   1670.2      777.4   2.148 0.031715 *  
## factor(RaceEthnicity)4   1118.2     1120.0   0.998 0.318100    
## factor(RaceEthnicity)5    204.5     1339.9   0.153 0.878692    
## factor(MentalHealth)2     817.3      532.1   1.536 0.124550    
## factor(MentalHealth)3    2483.4      567.7   4.375 1.23e-05 ***
## factor(MentalHealth)4    7739.9      880.1   8.794  < 2e-16 ***
## factor(MentalHealth)5    9178.1     1839.6   4.989 6.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18990 on 7812 degrees of freedom
##   (108 observations deleted due to missingness)
## Multiple R-squared:  0.06327,    Adjusted R-squared:  0.06087 
## F-statistic: 26.38 on 20 and 7812 DF,  p-value: < 2.2e-16

Model Evaluation

Steps for model evealuation Step 1: Check for linearity using a residual vs. fitted plot

plot(model, which = 1)

Step 2: Check for independence using a residual vs. order plot

plot(model, which = 2)

Step 3: Check for homoscedasticity using a scale-location plot

plot(model, which = 3)

Step 4: Check for normality using a Q-Q plot

plot(model, which = 2)

qqnorm(model$residuals)

Split the data into training and testing datasets

library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(123)  # for reproducibility
trainIndex <- createDataPartition(mepsData$TotalHealthExpenditure, p = 0.9, list = FALSE)
train_data <- mepsData[trainIndex, ]
test_data <- mepsData[-trainIndex, ]

Fit a new linear regression model using the training dataset

model_training <- lm(TotalHealthExpenditure ~ Age + factor(Sex) + BMI + factor(RaceEthnicity) + factor(MentalHealth)+factor(FamIncome_Categorical), data = train_data)

Predict on the training dataset

predicted_training <- predict(model_training, newdata = train_data)

Calculate Mean Squared Error (MSE) on the training data

mse_training <- mean((train_data$TotalHealthExpenditure - predicted_training)^2)
print(paste("MSE on training data:", mse_training))

## [1] "MSE on training data: NA"

Create a data frame for two individuals

new_data <- data.frame(
  Age = c(60, 60),
  Sex = c(1,2),
  BMI = c(27, 27),  # Replace with actual values
  RaceEthnicity = c(1, 1),  # Replace with actual values
  MentalHealth = c(2, 2),  # Replace with actual values
  FamIncome_Categorical = c(4, 4)  # Replace with actual values
)

Predict average and confidence intervals

predicted_values <- predict(model_training, newdata = new_data, interval = "confidence")
print(predicted_values)

##        fit      lwr      upr
## 1 5239.717 4266.585 6212.849
## 2 6262.529 5302.963 7222.095

Predict on the testing dataset

predicted_testing <- predict(model_training, newdata = test_data)

Calculate Mean Squared Error (MSE) on the testing data

mse_testing <- mean((test_data$TotalHealthExpenditure - predicted_testing)^2)
print(paste("MSE on testing data:", mse_testing))

## [1] "MSE on testing data: NA"

PART B of the Analysis

Recode the variable to 0 (for 2) and 1 (for 1)

mepsData$FluVaccination <- ifelse(mepsData$FluVaccination == 1, 1, 0)

Create a subset to include only adults (>18 years) with valid answers to flu vaccination status (Yes or No)

adults_subset <- subset(mepsData, Age > 18 & FluVaccination %in% c(1, 0))
head(adults_subset,5)

##   X  Person_ID FluVaccination Age Sex RaceEthnicity HealthInsurance
## 1 1 2290001101              0  26   2             2               1
## 2 2 2290001102              1  25   1             2               1
## 3 3 2290002101              0  33   2             1               1
## 4 4 2290002102              0  39   1             1               1
## 9 9 2290003101              0  36   2             2               1
##   NotAffordHealthCare FamIncome_Continuous MentalHealth FamIncome_Categorical
## 1                   2                32000            3                     3
## 2                   2                32000            2                     3
## 3                   2                55000            3                     3
## 4                   2                55000            3                     3
## 9                   2               258083            3                     5
##   FamIncome_PercentPoverty HealthStatus HaveProvider CensusRegion
## 1                   190.31            2            1            2
## 2                   190.31            2            1            2
## 3                   163.92            3            1            2
## 4                   163.92            3            2            2
## 9                  1013.48            2            1            2
##   TotalHealthExpenditure HasHypertension HasDiabetes  BMI Drinks5Day
## 1                   2368               2           2 21.4         NA
## 2                   2040               2           2 30.6          1
## 3                    173               2           2 28.2         NA
## 4                      0               2           2 28.7          2
## 9                    535               2           2 21.5         NA

Remove observations with NA in any of these variables

adults_subset <- na.omit(adults_subset)

Calculate the proportion of individuals who received a flu vaccine stratified by different variables

prop_by_provider <- (table(adults_subset$FluVaccination, adults_subset$HaveProvider)/length(adults_subset$FluVaccination))*100
prop_by_insurance <- (table(adults_subset$FluVaccination, adults_subset$HealthInsurance)/length(adults_subset$FluVaccination))*100
prop_by_health_status <- (table(adults_subset$FluVaccination, adults_subset$HealthStatus)/length(adults_subset$FluVaccination))*100
prop_by_income <- (table(adults_subset$FluVaccination, adults_subset$FamIncome_Categorical)/length(adults_subset$FluVaccination))*100
prop_by_provider

##    
##             1         2
##   0 35.984704 23.339707
##   1 34.786488  5.889101

prop_by_insurance

##    
##              1          2          3
##   0 38.0369662 12.6067559  8.6806883
##   1 27.9923518 11.7144678  0.9687699

prop_by_health_status

##    
##             1         2         3         4         5
##   0 14.671765 20.701083 17.221160  5.391969  1.338432
##   1  7.801147 13.652008 12.160612  5.634162  1.427661

prop_by_income

##    
##             1         2         3         4         5
##   0  7.469726  2.804334  8.272785 18.763544 22.014022
##   1  3.518164  1.287444  4.359465 10.860421 20.650096

Fit a logistic model

original_model <- glm(FluVaccination ~ Age + Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous, 
                     family = binomial, data = adults_subset)

Model with Age, Sex, and HealthInsurance

model1 <- glm(FluVaccination ~ Age + Sex + HealthInsurance, family = binomial, data = adults_subset)

Model with Age, RaceEthnicity, and FamIncome_Continuous

model2 <- glm(FluVaccination ~ Age + RaceEthnicity + FamIncome_Continuous, family = binomial, data = adults_subset)

Compare AIC values

aic_original <- AIC(original_model)
aic_model1 <- AIC(model1)
aic_model2 <- AIC(model2)

Choose the model with the lowest AIC

if (aic_model1 < aic_model2) {
  selected_model <- model1
} else {
  selected_model <- model2
}

Stepwise variable selection

stepwise_model <- step(original_model, direction = "both")

## Start:  AIC=9390.03
## FluVaccination ~ Age + Sex + RaceEthnicity + HealthInsurance + 
##     FamIncome_Continuous
## 
##                        Df Deviance     AIC
## - Sex                   1   9379.3  9389.3
## <none>                      9378.0  9390.0
## - RaceEthnicity         1   9384.7  9394.7
## - FamIncome_Continuous  1   9455.0  9465.0
## - HealthInsurance       1   9460.8  9470.8
## - Age                   1  10406.2 10416.2
## 
## Step:  AIC=9389.27
## FluVaccination ~ Age + RaceEthnicity + HealthInsurance + FamIncome_Continuous
## 
##                        Df Deviance     AIC
## <none>                      9379.3  9389.3
## + Sex                   1   9378.0  9390.0
## - RaceEthnicity         1   9386.1  9394.1
## - FamIncome_Continuous  1   9456.2  9464.2
## - HealthInsurance       1   9462.0  9470.0
## - Age                   1  10406.8 10414.8

Interaction between Age and Sex

interaction_model <- glm(FluVaccination ~ Age * Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous, 
                         family = binomial, data = adults_subset)

Quadratic transformation of Age

quadratic_age_model <- glm(FluVaccination ~ poly(Age, 2) + Sex + RaceEthnicity + HealthInsurance + FamIncome_Continuous, 
                           family = binomial, data = adults_subset)

Compare models based on AIC

aic_original <- AIC(original_model)
aic_selected <- AIC(selected_model)
aic_stepwise <- AIC(stepwise_model)
aic_interaction <- AIC(interaction_model)
aic_quadratic <- AIC(quadratic_age_model)

Choose the model with the lowest AIC

best_model <- selected_model  # Replace with the model that has the lowest AIC

Calculate odds ratios and their 95% confidence intervals

library(broom)

## Warning: package 'broom' was built under R version 4.3.2

odds_ratios <- tidy(model, exponentiate = TRUE)
conf_intervals <- confint(model)

Combine coefficients, odds ratios, and confidence intervals

coefficients_odds_ratios <- data.frame(
  Coefficient = summary(model)$coefficients[, 1],
  OddsRatio = odds_ratios$estimate,
  CI_Lower = conf_intervals[, 1],
  CI_Upper = conf_intervals[, 2]
)

coefficients_odds_ratios

##                        Coefficient     OddsRatio   CI_Lower  CI_Upper
## (Intercept)             -1558.2441  0.000000e+00 -5594.0444  2477.556
## factor(Age_Category)1    1416.3596           Inf -1273.9625  4106.682
## factor(Age_Category)2    5075.2518           Inf  2374.5665  7775.937
## factor(Age_Category)3    9923.4444           Inf  7198.5744 12648.315
## factor(Age_Category)4   13338.0056           Inf 10016.7113 16659.300
## factor(Sex)2            12583.2438           Inf -8948.8880 34115.375
## factor(BMI_2)1            885.4291           Inf -6562.2934  8333.152
## factor(BMI_2)2          -1464.6815  0.000000e+00 -6854.1903  3924.827
## factor(BMI_2)3           -535.6030 2.457909e-233 -3688.8431  2617.637
## factor(BMI_2)4           -541.6726 5.683044e-236 -3657.3109  2573.966
## factor(BMI_2)5           -700.6970 4.910758e-305 -3881.0505  2479.656
## factor(BMI_2)6           1362.6880           Inf -2026.5713  4751.947
## factor(BMI_2)7           2033.1480           Inf -1614.9778  5681.274
## factor(RaceEthnicity)2   2952.3316           Inf  1821.0701  4083.593
## factor(RaceEthnicity)3   1670.1791           Inf   146.2340  3194.124
## factor(RaceEthnicity)4   1118.2435           Inf -1077.2518  3313.739
## factor(RaceEthnicity)5    204.5188  6.628214e+88 -2422.1356  2831.173
## factor(MentalHealth)2     817.2963           Inf  -225.6703  1860.263
## factor(MentalHealth)3    2483.4385           Inf  1370.6028  3596.274
## factor(MentalHealth)4    7739.9157           Inf  6014.6110  9465.220
## factor(MentalHealth)5    9178.0917           Inf  5571.9036 12784.280

Create a data frame for 4 individuals

Predict probabilities and 95% confidence intervals

predicted_probs <- predict(model, newdata = mepsData_model, type = "response", se.fit = TRUE)

Choose a prediction threshold

threshold <- 0.5

Predicted probabilities

predicted_y <- ifelse(predicted_probs$fit > threshold, 1, 0)

Actual outcomes (flu vaccination status)

actual_y <- ifelse(adults_subset$FluVaccination == 1, 1, 0)

Calculate prediction accuracy

accuracy <- mean(predicted_y == actual_y)

## Warning in predicted_y == actual_y: longer object length is not a multiple of
## shorter object length

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

improved_model = quadratic_age_model

Calculate ROC curves for both models

roc_improved <- roc(adults_subset$FluVaccination, predict(improved_model, type="response"))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

roc_original <- roc(adults_subset$FluVaccination, predict(original_model, type = "response"))

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Plot the ROC curves

plot(roc_improved, col = "blue", main = "ROC Curves")
lines(roc_original, col = "red")
legend("bottomright", legend = c("Improved Model", "Original Model"), col = c("blue", "red"))

library(coefplot)

## Warning: package 'coefplot' was built under R version 4.3.3

Assuming ‘model’ is your logistic regression model

coefplot(original_model)

## Assuming ‘model’ is your logistic regression model

plot(original_model)

Calculate AUC for both models

auc_improved <- auc(roc_improved)
auc_original <- auc(roc_original)

cat("AUC for Improved Model:", auc_improved, "\n")

## AUC for Improved Model: 0.729134

cat("AUC for Original Model:", auc_original, "\n")

## AUC for Original Model: 0.7253676

Factors contribute to the high cost of healthcare

Adekunle Joseph Damilare

2024-04-18

Preliminary Analysis

Recoding

Exploratory Analysis

Descritpive Analysis

Perform a chi-squared test for independence

Hypothesis b: Medical Expenditure varies by whether the person has high blood pressure (Has Hypertension)

Perform a t-test or Mann-Whitney U test

Hypothesis c: Medical Expenditure varies by Census Region

Check preliminary evaluation

Create a box plot

Perform an ANOVA or Kruskal-Wallis test

Hypothesis d: Medical Expenditure varies by Age

Create a scatterplot

Perform a correlation analysis

Create a correlation matrix for numerical variables

Fit a linear regression model

Summarize the model

Model Evaluation

Step 2: Check for independence using a residual vs. order plot

Step 3: Check for homoscedasticity using a scale-location plot

Step 4: Check for normality using a Q-Q plot

Split the data into training and testing datasets

Fit a new linear regression model using the training dataset

Predict on the training dataset

Calculate Mean Squared Error (MSE) on the training data

Create a data frame for two individuals

Predict average and confidence intervals

Predict on the testing dataset

Calculate Mean Squared Error (MSE) on the testing data

PART B of the Analysis

Recode the variable to 0 (for 2) and 1 (for 1)

Create a subset to include only adults (>18 years) with valid answers to flu vaccination status (Yes or No)

Remove observations with NA in any of these variables

Calculate the proportion of individuals who received a flu vaccine stratified by different variables

Fit a logistic model

Model with Age, Sex, and HealthInsurance

Model with Age, RaceEthnicity, and FamIncome_Continuous

Compare AIC values

Choose the model with the lowest AIC

Stepwise variable selection

Interaction between Age and Sex

Quadratic transformation of Age

Compare models based on AIC

Choose the model with the lowest AIC

Calculate odds ratios and their 95% confidence intervals

Combine coefficients, odds ratios, and confidence intervals

Create a data frame for 4 individuals

Predict probabilities and 95% confidence intervals

Choose a prediction threshold

Predicted probabilities

Actual outcomes (flu vaccination status)

Calculate prediction accuracy

Calculate ROC curves for both models

Plot the ROC curves

Assuming ‘model’ is your logistic regression model

Calculate AUC for both models