# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 6 ~
# ~ Count Regression     ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# 

# Number of Births Example: 
# A study was performed to describe the frequency of births. 
# Researchers were interested in determining whether the 
# number of births to women living in rural versus urban 
# areas differed. Researchers were able to recruit 49 women to 
# participate in the study. Working with the study participants 
# and their physicians, research retrospectively collected the 
# number births over a 3 year period.

# Data Set: births

# Data Dictionary: 
# (1) ID          Subject identifier 
# (2) NBirth      Number of births 
# (3) Rural       Location of home (1 = Rural area; 0 = Urban area)
# (4) SurvPeriod  Length of retrospective medical review (years) 

## Download and load the data file = bpdata
load(url("http://www.duke.edu/~sgrambow/crp241data/births.RData"))
# Question 1:  
# Is there evidence of a relationship 
# between the number of births and whether
# the woman lives in a rural vs. urban 
# location? Create a plot to address this
# question.
# ANSWER:
# - In a regression setting, typically create a scatter plot of X vs. Y
plot(births$Rural,births$NBirth)

# But scatter plots are not always informative if BOTH variables 
# are categorical # - Some alternative plotting approaches 
# that are more informative are ... 
# -- (1) Jitter the points in the scatter plot 

plot(jitter(births$Rural),births$NBirth)

# -- (2) Create a split-level bar plot instead 
# information on creating bar plots here:
# https://www.statmethods.net/graphs/bar.html

c.tab <- table(births$Rural,births$NBirth)
rownames(c.tab) <- c("Urban","Rural") 
barplot(c.tab, main="Distribution of Births (Rural vs Urban)",
        xlab="Number of births over 3 year period",
        ylab="Frequency",
        legend=rownames(c.tab),
        col=c("darkblue","red"))

# NOTE Questions 2-4 can all be answered with
# same model fit.

fit.poisson <- glm(NBirth ~ Rural, data=births, family='poisson')
summary(fit.poisson)
## 
## Call:
## glm(formula = NBirth ~ Rural, family = "poisson", data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6870  -1.3513  -0.3749   0.4555   1.7646  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09097    0.21822  -0.417    0.677
## Rural        0.44379    0.27321   1.624    0.104
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 56.033  on 48  degrees of freedom
## Residual deviance: 53.306  on 47  degrees of freedom
## AIC: 138.05
## 
## Number of Fisher Scoring iterations: 5
# Question 2:  
# Is there evidence of an association 
# between the number of births and whether
# the woman lives in a rural vs. urban 
# location at significance level 0.05?
# Address this question by fitting a 
# Poisson regression model. 
# ANSWER:
# For this question, we need to examine
# the p-value for the slope coefficient 
# for the Rural variable.

# p-value = 0.104 
# Since 0.104 > 0.05, we conclude
# that there is NO STATISTICAL ASSOCIATION!
# Question 3:  
# Consider the model fit in Question 2.
# What does the estimated slope 
# coefficient for the predictor 'Rural' 
# represent?
# ANSWER:
# The slope coefficient = 0.44.
# This is the difference in the log mean
# count of the rural births minus the log mean count 
# of the urban births (a 1 unit change in X here
# is simply the shift from urban to rural)

# Can we verify this result by examining 
# the raw data?

# There are a number of ways we can accomplish this.
# As done before, we can use the subset function
# to create two new data frames, one for
# those who are urban and one for those who are rural

urban <- subset(births,Rural==0)
rural <- subset(births,Rural==1)

# Now simply calculate the mean and sd of each group

mean(urban$NBirth)
## [1] 0.9130435
# [1] 0.9130435
sd(urban$NBirth)
## [1] 0.9001537
# [1] 0.9001537


mean(rural$NBirth)
## [1] 1.423077
# [1] 1.423077
sd(rural$NBirth)
## [1] 1.172112
# [1] 1.172112

# Now we can simply check the math with the raw 
# numbers
# log Rural - log Urban

log(1.42)-log(.913)
## [1] 0.4416763
# yields 0.44168 which is equal to the beta coefficient
# for rural.

# Another way to generate both the mean
# and SD for each group is to use the table1
# function we have used previously
# Let's create a table 1
# using the table 1 function
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table.data <- births
table.data$Rural <- factor(table.data$Rural,
                           labels = c("Urban","Rural"))
# We can create a nicely formatted table with 
# a single function call to table1
table1(~ NBirth | Rural, data=table.data)
Urban
(N=23)
Rural
(N=26)
Overall
(N=49)
NBirth
Mean (SD) 0.913 (0.900) 1.42 (1.17) 1.18 (1.07)
Median [Min, Max] 1.00 [0, 3.00] 1.00 [0, 4.00] 1.00 [0, 4.00]
# This table provides us with the mean and 
# standard deviation for each group
# from here, we can use the log means to 
# check the answer as we did above
# Question 4:  
# Consider the model fit in Question 2.
# Does the data suggest that the mean 
# number of births is numerically higher 
# among women living in rural vs. urban 
# areas? Why?
# ANSWER:
# Yes.  Rural group has a higher mean count.
# Using our model, we can exponentiate the slope  
# coefficient

exp(fit.poisson$coefficients)
## (Intercept)       Rural 
##   0.9130435   1.5586081
# which yields
# (Intercept)       Rural 
# 0.9130435   1.5586081 
# Interpretation:
# mean number of births is 56% higher in Rural vs Urban areas
# mean number of births is 1.56 times higher in Rural vs Urban areas
# Question 5:  
# Consider the model fit in Question 2.
# What is the estimated mean number of 
# births among women living in rural 
# areas? Among women living in 
# urban areas?
# this is a question about prediction
# We want the predicted mean number 
# of births when x=1 (rural) and wen x=0 (urban)
# we need to 'untransform' the data as shown
# in the lecture slides. so we need to evalute 
# the exponential function of b0+b1X
# So:
# - Among women living in rural areas (x=1)

exp(-0.09097 + (0.44379*1))
## [1] 1.423075
# [1] 1.423075
# Thus, the predicated mean number of births
# for women in rural areas is 1.42
# - Among women living in urban areas

exp(-0.09097 + (0.44379*0))
## [1] 0.9130451
# [1] 0.9130451
# Thus, the predicated mean number of births
# for women in rural areas is 0.91

# - Double check estimates using descriptive
# statistics.  We can do this by simply
# calculating the sample mean fore each group

mean(urban$NBirth)
## [1] 0.9130435
# [1] 0.9130435

mean(rural$NBirth)
## [1] 1.423077
# [1] 1.423077

# We can also use the predict function to generate
# these predicted values.

# Type = response below specifies that we want the prediction
# on the probability scale.

# for Rural = 1
predict(fit.poisson,newdata=data.frame(Rural=1),type="response")
##        1 
## 1.423077
# for Rural = 0
predict(fit.poisson,newdata=data.frame(Rural=0),type="response")
##         1 
## 0.9130435
# Question 6:  
# Consider the model fit in Question 2.
# Estimate the ratio-of-means birth counts 
# for women living in rural vs. urban areas 
# and it's 95% confidence interval. 
# What does this point estimate represent?
# ANSWER:
# Here we need to exponentiate both the 
# point estimate and confidence limits.

exp(fit.poisson$coefficients)
## (Intercept)       Rural 
##   0.9130435   1.5586081
exp(confint(fit.poisson))
## Waiting for profiling to be done...
##                 2.5 %   97.5 %
## (Intercept) 0.5760851 1.361045
## Rural       0.9215483 2.707077
# this yields
# > exp(fit$coefficients)
# (Intercept)       Rural 
#   0.9130435   1.5586081       
# 1.56 is the *** Ratio-of-Means point estimate *** 
# > exp(confint(fit))
# Waiting for profiling to be done...
#                 2.5 %   97.5 %
# (Intercept) 0.5760851 1.361045
# Rural       0.9215483 2.707077  
# [0.92, 2.71] is the *** Ratio-of-Means confidence interval *** 

# - We can Double check this estimate using
# raw data once again and simply calculate 
# the ratio of the means
mean(rural$NBirth)/mean(urban$NBirth)
## [1] 1.558608
# [1] 1.558608
# *** Same as Ratio-of-Means point estimate ***
# Question 7:  
# Consider the model fit in Question 2.
# Suppose it was discovered that the 
# researchers had access to 6 years of 
# medical records for women living in 
# urban areas but only 3 years of medical 
# records for women in living rural areas. 
# Does this new information impact the 
# validity of the modeling results in Question 2? 
# If so, how?
# This is problematic as the basic model assumes
# that the exposure/surveillance period is uniform.
# Therefore, we want to use an offset term so we can
# incorporate information about the length of surveillance
# for each subject.

fit.poisson.off <- glm(NBirth ~ Rural + offset(log(SurvPeriod)),
                       data=births, family='poisson')
summary(fit.poisson.off)
## 
## Call:
## glm(formula = NBirth ~ Rural + offset(log(SurvPeriod)), family = "poisson", 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6870  -1.3513  -0.3749   0.4555   1.7646  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8827     0.2182  -8.628  < 2e-16 ***
## Rural         1.1369     0.2732   4.161 3.16e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 71.563  on 48  degrees of freedom
## Residual deviance: 53.306  on 47  degrees of freedom
## AIC: 138.05
## 
## Number of Fisher Scoring iterations: 5
# Rate-Ratio under Model 1 (which assumes uniform surveillance)
# -- Using summary stats and 3 year exposure period for all observations

(1.423/3)/(0.913/3)
## [1] 1.558598
# -- Using Poisson regression model estimate

exp(0.44379)
## [1] 1.558603
# [1] 1.558598 
# mean number of births is 55% higher in Rural vs Urban areas
# - Rate-Ratio under Model 2 (which incorporates offset information)
# -- Using summary stats and 3/6 year exposure period for 
#    rural/urban living areas

(1.423/3)/(0.913/6)
## [1] 3.117196
# [1] 3.117196

# -- Using Poisson regression model estimate

exp(1.1369)
## [1] 3.11709
# [1] 3.11709
# End of Program