# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 6 ~
# ~ Count Regression ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Number of Births Example:
# A study was performed to describe the frequency of births.
# Researchers were interested in determining whether the
# number of births to women living in rural versus urban
# areas differed. Researchers were able to recruit 49 women to
# participate in the study. Working with the study participants
# and their physicians, research retrospectively collected the
# number births over a 3 year period.
# Data Set: births
# Data Dictionary:
# (1) ID Subject identifier
# (2) NBirth Number of births
# (3) Rural Location of home (1 = Rural area; 0 = Urban area)
# (4) SurvPeriod Length of retrospective medical review (years)
## Download and load the data file = bpdata
load(url("http://www.duke.edu/~sgrambow/crp241data/births.RData"))
# Question 1:
# Is there evidence of a relationship
# between the number of births and whether
# the woman lives in a rural vs. urban
# location? Create a plot to address this
# question.
# ANSWER:
# - In a regression setting, typically create a scatter plot of X vs. Y
plot(births$Rural,births$NBirth)
# But scatter plots are not always informative if BOTH variables
# are categorical # - Some alternative plotting approaches
# that are more informative are ...
# -- (1) Jitter the points in the scatter plot
plot(jitter(births$Rural),births$NBirth)
# -- (2) Create a split-level bar plot instead
# information on creating bar plots here:
# https://www.statmethods.net/graphs/bar.html
c.tab <- table(births$Rural,births$NBirth)
rownames(c.tab) <- c("Urban","Rural")
barplot(c.tab, main="Distribution of Births (Rural vs Urban)",
xlab="Number of births over 3 year period",
ylab="Frequency",
legend=rownames(c.tab),
col=c("darkblue","red"))
# NOTE Questions 2-4 can all be answered with
# same model fit.
fit.poisson <- glm(NBirth ~ Rural, data=births, family='poisson')
summary(fit.poisson)
##
## Call:
## glm(formula = NBirth ~ Rural, family = "poisson", data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6870 -1.3513 -0.3749 0.4555 1.7646
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09097 0.21822 -0.417 0.677
## Rural 0.44379 0.27321 1.624 0.104
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 56.033 on 48 degrees of freedom
## Residual deviance: 53.306 on 47 degrees of freedom
## AIC: 138.05
##
## Number of Fisher Scoring iterations: 5
# Question 2:
# Is there evidence of an association
# between the number of births and whether
# the woman lives in a rural vs. urban
# location at significance level 0.05?
# Address this question by fitting a
# Poisson regression model.
# ANSWER:
# For this question, we need to examine
# the p-value for the slope coefficient
# for the Rural variable.
# p-value = 0.104
# Since 0.104 > 0.05, we conclude
# that there is NO STATISTICAL ASSOCIATION!
# Question 3:
# Consider the model fit in Question 2.
# What does the estimated slope
# coefficient for the predictor 'Rural'
# represent?
# ANSWER:
# The slope coefficient = 0.44.
# This is the difference in the log mean
# count of the rural births minus the log mean count
# of the urban births (a 1 unit change in X here
# is simply the shift from urban to rural)
# Can we verify this result by examining
# the raw data?
# There are a number of ways we can accomplish this.
# As done before, we can use the subset function
# to create two new data frames, one for
# those who are urban and one for those who are rural
urban <- subset(births,Rural==0)
rural <- subset(births,Rural==1)
# Now simply calculate the mean and sd of each group
mean(urban$NBirth)
## [1] 0.9130435
# [1] 0.9130435
sd(urban$NBirth)
## [1] 0.9001537
# [1] 0.9001537
mean(rural$NBirth)
## [1] 1.423077
# [1] 1.423077
sd(rural$NBirth)
## [1] 1.172112
# [1] 1.172112
# Now we can simply check the math with the raw
# numbers
# log Rural - log Urban
log(1.42)-log(.913)
## [1] 0.4416763
# yields 0.44168 which is equal to the beta coefficient
# for rural.
# Another way to generate both the mean
# and SD for each group is to use the table1
# function we have used previously
# Let's create a table 1
# using the table 1 function
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table.data <- births
table.data$Rural <- factor(table.data$Rural,
labels = c("Urban","Rural"))
# We can create a nicely formatted table with
# a single function call to table1
table1(~ NBirth | Rural, data=table.data)
| Urban (N=23) |
Rural (N=26) |
Overall (N=49) |
|
|---|---|---|---|
| NBirth | |||
| Mean (SD) | 0.913 (0.900) | 1.42 (1.17) | 1.18 (1.07) |
| Median [Min, Max] | 1.00 [0, 3.00] | 1.00 [0, 4.00] | 1.00 [0, 4.00] |
# This table provides us with the mean and
# standard deviation for each group
# from here, we can use the log means to
# check the answer as we did above
# Question 4:
# Consider the model fit in Question 2.
# Does the data suggest that the mean
# number of births is numerically higher
# among women living in rural vs. urban
# areas? Why?
# ANSWER:
# Yes. Rural group has a higher mean count.
# Using our model, we can exponentiate the slope
# coefficient
exp(fit.poisson$coefficients)
## (Intercept) Rural
## 0.9130435 1.5586081
# which yields
# (Intercept) Rural
# 0.9130435 1.5586081
# Interpretation:
# mean number of births is 56% higher in Rural vs Urban areas
# mean number of births is 1.56 times higher in Rural vs Urban areas
# Question 5:
# Consider the model fit in Question 2.
# What is the estimated mean number of
# births among women living in rural
# areas? Among women living in
# urban areas?
# this is a question about prediction
# We want the predicted mean number
# of births when x=1 (rural) and wen x=0 (urban)
# we need to 'untransform' the data as shown
# in the lecture slides. so we need to evalute
# the exponential function of b0+b1X
# So:
# - Among women living in rural areas (x=1)
exp(-0.09097 + (0.44379*1))
## [1] 1.423075
# [1] 1.423075
# Thus, the predicated mean number of births
# for women in rural areas is 1.42
# - Among women living in urban areas
exp(-0.09097 + (0.44379*0))
## [1] 0.9130451
# [1] 0.9130451
# Thus, the predicated mean number of births
# for women in rural areas is 0.91
# - Double check estimates using descriptive
# statistics. We can do this by simply
# calculating the sample mean fore each group
mean(urban$NBirth)
## [1] 0.9130435
# [1] 0.9130435
mean(rural$NBirth)
## [1] 1.423077
# [1] 1.423077
# We can also use the predict function to generate
# these predicted values.
# Type = response below specifies that we want the prediction
# on the probability scale.
# for Rural = 1
predict(fit.poisson,newdata=data.frame(Rural=1),type="response")
## 1
## 1.423077
# for Rural = 0
predict(fit.poisson,newdata=data.frame(Rural=0),type="response")
## 1
## 0.9130435
# Question 6:
# Consider the model fit in Question 2.
# Estimate the ratio-of-means birth counts
# for women living in rural vs. urban areas
# and it's 95% confidence interval.
# What does this point estimate represent?
# ANSWER:
# Here we need to exponentiate both the
# point estimate and confidence limits.
exp(fit.poisson$coefficients)
## (Intercept) Rural
## 0.9130435 1.5586081
exp(confint(fit.poisson))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.5760851 1.361045
## Rural 0.9215483 2.707077
# this yields
# > exp(fit$coefficients)
# (Intercept) Rural
# 0.9130435 1.5586081
# 1.56 is the *** Ratio-of-Means point estimate ***
# > exp(confint(fit))
# Waiting for profiling to be done...
# 2.5 % 97.5 %
# (Intercept) 0.5760851 1.361045
# Rural 0.9215483 2.707077
# [0.92, 2.71] is the *** Ratio-of-Means confidence interval ***
# - We can Double check this estimate using
# raw data once again and simply calculate
# the ratio of the means
mean(rural$NBirth)/mean(urban$NBirth)
## [1] 1.558608
# [1] 1.558608
# *** Same as Ratio-of-Means point estimate ***
# Question 7:
# Consider the model fit in Question 2.
# Suppose it was discovered that the
# researchers had access to 6 years of
# medical records for women living in
# urban areas but only 3 years of medical
# records for women in living rural areas.
# Does this new information impact the
# validity of the modeling results in Question 2?
# If so, how?
# This is problematic as the basic model assumes
# that the exposure/surveillance period is uniform.
# Therefore, we want to use an offset term so we can
# incorporate information about the length of surveillance
# for each subject.
fit.poisson.off <- glm(NBirth ~ Rural + offset(log(SurvPeriod)),
data=births, family='poisson')
summary(fit.poisson.off)
##
## Call:
## glm(formula = NBirth ~ Rural + offset(log(SurvPeriod)), family = "poisson",
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6870 -1.3513 -0.3749 0.4555 1.7646
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8827 0.2182 -8.628 < 2e-16 ***
## Rural 1.1369 0.2732 4.161 3.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 71.563 on 48 degrees of freedom
## Residual deviance: 53.306 on 47 degrees of freedom
## AIC: 138.05
##
## Number of Fisher Scoring iterations: 5
# Rate-Ratio under Model 1 (which assumes uniform surveillance)
# -- Using summary stats and 3 year exposure period for all observations
(1.423/3)/(0.913/3)
## [1] 1.558598
# -- Using Poisson regression model estimate
exp(0.44379)
## [1] 1.558603
# [1] 1.558598
# mean number of births is 55% higher in Rural vs Urban areas
# - Rate-Ratio under Model 2 (which incorporates offset information)
# -- Using summary stats and 3/6 year exposure period for
# rural/urban living areas
(1.423/3)/(0.913/6)
## [1] 3.117196
# [1] 3.117196
# -- Using Poisson regression model estimate
exp(1.1369)
## [1] 3.11709
# [1] 3.11709
# End of Program