#We’ll study the relationship between the number of doctor visits (count data) and two predictors:
#Age
#Gender
#Healthstatus_num
#InsuranceCoverage

#____________________________

#Healthstatus_num
#Type: Categorical (e.g., “Excellent”, “Good”, “Fair”, “Poor”)
#People with poorer health are expected to visit doctors more often
#InsuranceCoverage
#Type: Binary (0 = No, 1 = Yes)
#Individuals with health insurance are more likely to seek medical care because costs are subsidized.

set.seed(123)
n <- 100
age <- round(runif(n, 20, 70))      # random ages between 20–70
gender <- rbinom(n, 1, 0.5) #0 = female, 1 = male
Insurancecoverage <- rbinom(n, 1, 0.7)   # Assume probability (p) = 0.7 means 70% have insurance 
p <- c(0.2, 0.4, 0.25, 0.15)  # Excellent, Good, Fair, Poor
Healthstatus_num <- apply(rmultinom(n, size = 1, prob = p), 2, which.max)

#true parameters 
beta0 <- 0.5
beta1 <- 0.03    
beta2 <- -0.4
beta3 <- 0.5
beta4 <- -0.4
# Generate Poisson counts
lambda <- exp(beta0 + beta1 * age + beta2 * gender+beta3 *Insurancecoverage + beta4 * Healthstatus_num )
visits <- rpois(n, lambda)
mydata <- data.frame(visits, age, gender,Insurancecoverage,Healthstatus_num)
head(mydata)
##   visits age gender Insurancecoverage Healthstatus_num
## 1      0  34      1                 1                4
## 2      6  59      0                 0                2
## 3      2  40      0                 1                3
## 4      6  64      1                 1                3
## 5      9  67      0                 1                2
## 6      0  22      1                 0                4
#fit poisson
pois_model <- glm(visits ~ age + gender + Insurancecoverage + Healthstatus_num,family = poisson, data = mydata)
summary(pois_model)
## 
## Call:
## glm(formula = visits ~ age + gender + Insurancecoverage + Healthstatus_num, 
##     family = poisson, data = mydata)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.663257   0.284871   2.328 0.019898 *  
## age                0.028379   0.004015   7.068 1.57e-12 ***
## gender            -0.368519   0.109750  -3.358 0.000786 ***
## Insurancecoverage  0.378605   0.136045   2.783 0.005387 ** 
## Healthstatus_num  -0.380588   0.054850  -6.939 3.96e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 276.15  on 99  degrees of freedom
## Residual deviance: 122.44  on 95  degrees of freedom
## AIC: 396.17
## 
## Number of Fisher Scoring iterations: 5
confint(pois_model)
## Waiting for profiling to be done...
##                         2.5 %      97.5 %
## (Intercept)        0.09738297  1.21436718
## age                0.02058437  0.03633322
## gender            -0.58520310 -0.15460852
## Insurancecoverage  0.11833949  0.65247910
## Healthstatus_num  -0.48901050 -0.27387774
exp(confint(pois_model))  # CI for incidence rate ratios
## Waiting for profiling to be done...
##                       2.5 %    97.5 %
## (Intercept)       1.1022824 3.3681620
## age               1.0207977 1.0370013
## gender            0.5569927 0.8567505
## Insurancecoverage 1.1256262 1.9202955
## Healthstatus_num  0.6132329 0.7604250
#Likelihood Ratio Test (LRT)
model0 <- glm(visits ~ 1, family = poisson, data = mydata) # reduced (intercept only)
pois_model <- glm(visits ~ age + gender+ Insurancecoverage + Healthstatus_num, family = poisson, data = mydata)
anova(model0, pois_model, test = "Chisq") #anova compares two nested models, test = "Chisq" tells R to use the Chi-square Likelihood Ratio Test.
## Analysis of Deviance Table
## 
## Model 1: visits ~ 1
## Model 2: visits ~ age + gender + Insurancecoverage + Healthstatus_num
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        99     276.15                          
## 2        95     122.44  4   153.71 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model0$deviance   
## [1] 276.1518
pois_model$deviance
## [1] 122.4379