# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 1 Day 2 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Fair's Affair Data Example: 
# A cross-sectional study was performed to 
# understand the factors that correlate with an 
# individuals decision to engage in extramarital 
# affairs. 601 individuals participated 
# in the study. The response of interest was 
# whether or not the subject engaged in any 
# extramarital sexual relations at least once in 
# the last 12 months. Researchers wanted 
# to understand the relationship between a 
#subjects decision to engage in an affair and 
# the length of their marriage. 

# Data Set: affairs2

# Data Dictionary: 
# (1) id          Subject identifier 
# (2) anyaffair   Subject engaged in extramarital 
#                 sexual relations at least once 
#                 in the last 12 months (1 = Yes; 0 = No)
# (3) yrsmarried  Number of years subject has been married 
#                 at time of study (in years)

## Download and load the data file = affairs2
download.file("http://www.duke.edu/~sgrambow/crp241data/affairs2.RData",
              destfile = "affairs2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("affairs2.RData")
# Q1:
# Is there evidence of an association between 
# the number of years married and the decision 
# to engage in an extra-marital affair? 
# Create a scatterplot to address this question.

#visualize
# NOTE the use of jitter() function to scatter overlying
# points so you get a better sense of the density of 
# points in the plot.  We demonstrated using jitter
# in both the x and y directions.  Generally, people
# tend to use jitter in the y direction.
plot(affairs2$yrsmarried,affairs2$anyaffair)

plot(affairs2$yrsmarried,jitter(affairs2$anyaffair))

plot(jitter(affairs2$yrsmarried),affairs2$anyaffair)

# Q2:
# Is there evidence of an association between 
# the number of years married and the decision 
# to engage in an extra-marital affair at 
# significance level 0.05? Address this question 
# by fitting a logistic regression model.

#fit model
fit <- glm(anyaffair ~ yrsmarried, data=affairs2, family='binomial')
summary(fit)
## 
## Call:
## glm(formula = anyaffair ~ yrsmarried, family = "binomial", data = affairs2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8883  -0.7846  -0.6720  -0.6061   1.8894  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
## yrsmarried   0.05882    0.01724   3.412 0.000646 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 663.50  on 599  degrees of freedom
## AIC: 667.5
## 
## Number of Fisher Scoring iterations: 4
exp(fit$coefficients)
## (Intercept)  yrsmarried 
##   0.2001702   1.0605864
# ANSWER: Based on the p-value of 0.000646 for 
# the yrsmarried coefficient, we conclude there is 
# an association between the number of years married
# and the decision to engage in an extra-marital affair.
# 
# NOTE 1 -- if you forget to indicate the family in the 
# glm() function call, it will default to linear regression
# and will happily provide you with the WRONG results -- it will
# not give you an error.
# NOTE 2 -- The key output we want is provided by the
# summary() function which yields the beta coefficients
# and the p-values for the betas. Remember that the 
# betas here are describing changes on the log odds scale.
# Q3:
# Consider the model fit in Question 2.
# Do these data suggest that the association 
# between the number of years married and 
# the decision to engage in an extra-marital 
# affair is positive or negative?

# output from summary(fit) above
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

# ANSWER: coefficient is positive; beta1 > 0 and OR > 1; 
# positive association the longer your are married, 
# the higher the chance you will have an affair.
# Q4:
# Consider the model fit in Question 2.
# What does the estimated slope coefficient 
# for the predictor number of years married 
# represent?

# output from fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

# ANSWER: Change in log-odds of Y=1 (having an affair) for a 
# unit increase in X; here X=yrsmarried so a unit increase 
# in X is additional year of marriage. So, it's the change 
# in the log-odds of having an affair for each additional 
# increase of 1 year in the length of marriage.
# Q5: 
# Consider the model fit in Question 2.
# Estimate the odds ratio for the decision to 
# engage in an extra-marital affair per decade 
# of marriage and it's 95% confidence interval. 
# What does the point estimate represent?

# output from fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

#ANSWER: So the OR for a 1 year increase in marriage is OR=1.06.
# for a 1 decade increase...
# NOTE that the code immediately below is pulling the 
# beta coefficients from the fit object. It exponentiates
# both betas but we are only interested in the slope coefficient
# for yrsmarried.  Note also the we multiple by 10 BEFORE 
# exponentiating.
exp(10*fit$coefficients)
##  (Intercept)   yrsmarried 
## 1.032749e-07 1.800780e+00
# (Intercept)   yrsmarried 
# 1.032749e-07 1.800780e+00 

# To get the correct confidence interaval
# limits for a 1 decade increase, 
# calculate the standard confidence interval
# for a 1 year change, multiply that by 10 and 
# exponent iate to get it on the OR scale.
confint(fit)
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept) -1.97824219 -1.25863037
## yrsmarried   0.02525637  0.09293219
exp(10*confint(fit))
## Waiting for profiling to be done...
##                    2.5 %       97.5 %
## (Intercept) 2.562143e-09 3.418517e-06
## yrsmarried  1.287322e+00 2.532791e+00
# Thus, the OR for having an affair per decade of marriage is 1.8 with
# 95% CI from 1.29 to 2.53. Therefore, we can say that the odds of
# having an affair increases by 80% for each decade of marriage.  We
# could also express this as, "the odds of having an affair increase
# by 1.8 times for each decade of marriage."
# Q6:
# Consider the model fit in Question 2.
# What is the predicted probability that 
# someone who has been married 1 year decides to 
# engage in an extra-marital affair?

# output from fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

#ANSWER:
# Getting predicted probabilities from a Logistic Regression Model: 
# NOTE helpful to refer back to Slide 34 in the mod1day2 slides
xb <- -1.60859 + (0.05882*1)
exp(xb)/(1+exp(xb))
## [1] 0.1751195
# [1] 0.1751195
# Thus the predicted probability of deciding to engage
# in an affair for someone who is married for a single year is 17.5%
#
# ALTERNATE SOLUTION USING PREDICT() FUNCTION
# Note that fit below refers to the original fit object we
# created above and newdata is the new x value we want to create
# a predicted probability for...
# Type = response below specifies that we want the prediction
# on the probability scale.
predict(fit,newdata=data.frame(yrsmarried=1),type="response")
##         1 
## 0.1751202
# This yields:
# 1 
# 0.1751202
# 
# we can extend this and plot the predicted 
# probabilities for a range of x values corresponding to the observed 
# x's for our data. First, generate a range of value based on what 
# we observed in the data
range(affairs2$yrsmarried)
## [1]  0.125 15.000
# This yields
# [1]  0.125 15.000
# let's create a sequence of 100 evenly spaced values between
# 0.125 and 15
xyrsmarried <- seq(0.125,15,length=100)

# Now let's use the predict function to create the model
# for all of the values of xyrsmarried.
yyrsmarried <- predict(fit,list(yrsmarried=xyrsmarried),type="response")

# Now we plot. Note that I have added jitter in the y-axis
# to help better visualize the data structure in the plot
plot(affairs2$yrsmarried,jitter(affairs2$anyaffair),pch=1,col="blue",
     xlab="YEARS MARRIED (yrs)",ylab="Prob(Affair)")
# note pch is plotting symbol and 1 is open circle
# col is color of the plotting symbols
# now add the predicted probabilities
lines(xyrsmarried,yyrsmarried,col="dark red",lwd=2)

# note that lwd is line width, col is color
# End of Program