#We’ll study the relationship between the number of doctor visits (count data) and two predictors:
#Age
#Gender
#Healthstatus_num
#InsuranceCoverage
#____________________________
#Healthstatus_num
#Type: Categorical (e.g., “Excellent”, “Good”, “Fair”, “Poor”)
#People with poorer health are expected to visit doctors more often
#InsuranceCoverage
#Type: Binary (0 = No, 1 = Yes)
#Individuals with health insurance are more likely to seek medical care because costs are subsidized.
set.seed(123)
n <- 100
age <- round(runif(n, 20, 70)) # random ages between 20–70
gender <- rbinom(n, 1, 0.5) #0 = female, 1 = male
Insurancecoverage <- rbinom(n, 1, 0.7) # Assume probability (p) = 0.7 means 70% have insurance
p <- c(0.2, 0.4, 0.25, 0.15) # Excellent, Good, Fair, Poor
Healthstatus_num <- apply(rmultinom(n, size = 1, prob = p), 2, which.max)
#true parameters
beta0 <- 0.5
beta1 <- 0.03
beta2 <- -0.4
beta3 <- 0.5
beta4 <- -0.4
# Generate Poisson counts
lambda <- exp(beta0 + beta1 * age + beta2 * gender+beta3 *Insurancecoverage + beta4 * Healthstatus_num )
visits <- rpois(n, lambda)
mydata <- data.frame(visits, age, gender,Insurancecoverage,Healthstatus_num)
head(mydata)
## visits age gender Insurancecoverage Healthstatus_num
## 1 0 34 1 1 4
## 2 6 59 0 0 2
## 3 2 40 0 1 3
## 4 6 64 1 1 3
## 5 9 67 0 1 2
## 6 0 22 1 0 4
#fit poisson
pois_model <- glm(visits ~ age + gender + Insurancecoverage + Healthstatus_num,family = poisson, data = mydata)
summary(pois_model)
##
## Call:
## glm(formula = visits ~ age + gender + Insurancecoverage + Healthstatus_num,
## family = poisson, data = mydata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.663257 0.284871 2.328 0.019898 *
## age 0.028379 0.004015 7.068 1.57e-12 ***
## gender -0.368519 0.109750 -3.358 0.000786 ***
## Insurancecoverage 0.378605 0.136045 2.783 0.005387 **
## Healthstatus_num -0.380588 0.054850 -6.939 3.96e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 276.15 on 99 degrees of freedom
## Residual deviance: 122.44 on 95 degrees of freedom
## AIC: 396.17
##
## Number of Fisher Scoring iterations: 5
confint(pois_model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.09738297 1.21436718
## age 0.02058437 0.03633322
## gender -0.58520310 -0.15460852
## Insurancecoverage 0.11833949 0.65247910
## Healthstatus_num -0.48901050 -0.27387774
exp(confint(pois_model)) # CI for incidence rate ratios
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.1022824 3.3681620
## age 1.0207977 1.0370013
## gender 0.5569927 0.8567505
## Insurancecoverage 1.1256262 1.9202955
## Healthstatus_num 0.6132329 0.7604250
#Likelihood Ratio Test (LRT)
model0 <- glm(visits ~ 1, family = poisson, data = mydata) # reduced (intercept only)
pois_model <- glm(visits ~ age + gender+ Insurancecoverage + Healthstatus_num, family = poisson, data = mydata)
anova(model0, pois_model, test = "Chisq") #anova compares two nested models, test = "Chisq" tells R to use the Chi-square Likelihood Ratio Test.
## Analysis of Deviance Table
##
## Model 1: visits ~ 1
## Model 2: visits ~ age + gender + Insurancecoverage + Healthstatus_num
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 99 276.15
## 2 95 122.44 4 153.71 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model0$deviance
## [1] 276.1518
pois_model$deviance
## [1] 122.4379