# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 Day 2 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Fair's Affair Data Example: 
# A cross-sectional study was performed to 
# understand the factors that correlate with an 
# individuals decision to engage in extramarital 
# affairs. 601 individuals participated 
# in the study. The response of interest was 
# whether or not the subject engaged in any 
# extramarital sexual relations at least once in 
# the last 12 months. Researchers wanted 
# to understand the relationship between a 
# subjects decision to engage in an affair and 
# the length of their marriage. 

# Data Set: affairs2

# Data Dictionary: 
# (1) id          Subject identifier 
# (2) anyaffair   Subject engaged in extramarital 
#                 sexual relations at least once 
#                 in the last 12 months (1 = Yes; 0 = No)
# (3) yrsmarried  Number of years subject has been married 
#                 at time of study (in years)

load(url("http://www.duke.edu/~sgrambow/crp241data/affairs2.RData"))
# Q1:
# Is there evidence of an association between 
# the number of years married and the decision 
# to engage in an extra-marital affair? 
# Create a scatterplot to address this question.

# visualize
# NOTE the use of jitter() function to scatter overlying
# points so you get a better sense of the density of 
# points in the plot.  We demonstrated using jitter
# in both the x and y directions.  Generally, people
# tend to use jitter in the y direction.

plot(affairs2$yrsmarried,affairs2$anyaffair) # No jitter

plot(affairs2$yrsmarried,jitter(affairs2$anyaffair)) # jitter in Y

plot(jitter(affairs2$yrsmarried),affairs2$anyaffair) # jitter in X

# Q2:
# Is there evidence of an association between 
# the number of years married and the decision 
# to engage in an extra-marital affair at 
# significance level 0.05? Address this question 
# by fitting a logistic regression model.

#fit model
ucheat.fit <- glm(anyaffair ~ yrsmarried, data=affairs2, family='binomial')
summary(ucheat.fit)
## 
## Call:
## glm(formula = anyaffair ~ yrsmarried, family = "binomial", data = affairs2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8883  -0.7846  -0.6720  -0.6061   1.8894  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
## yrsmarried   0.05882    0.01724   3.412 0.000646 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 663.50  on 599  degrees of freedom
## AIC: 667.5
## 
## Number of Fisher Scoring iterations: 4
exp(ucheat.fit$coefficients)
## (Intercept)  yrsmarried 
##   0.2001702   1.0605864
# ANSWER: Based on the p-value of 0.000646 for 
# the yrsmarried coefficient, we conclude there is 
# an association between the number of years married
# and the decision to engage in an extra-marital affair.
# 
# NOTE 1 -- if you forget to indicate the family in the 
# glm() function call, it will default to linear regression
# and will happily provide you with the WRONG results -- it will
# not give you an error.
# NOTE 2 -- The key output we want is provided by the
# summary() function which yields the beta coefficients
# and the p-values for the betas. 
# Q3:
# Consider the model fit in Question 2.
# Do these data suggest that the association 
# between the number of years married and 
# the decision to engage in an extra-marital 
# affair is positive or negative?

# output from summary(ucheat.fit) above
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(ucheat.fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

# ANSWER: coefficient is positive; beta1 > 0 and OR > 1; 
# positive association the longer your are married, 
# the higher the chance you will have an affair.
# Q4:
# Consider the model fit in Question 2.
# What does the estimated slope coefficient 
# for the predictor number of years married 
# represent?

# output from ucheat.fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(ucheat.fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

# ANSWER: Change in log-odds of Y=1 (having an affair) for a 
# unit increase in X; here X=yrsmarried so a unit increase 
# in X is an additional year of marriage. So, it's the change 
# in the log-odds of having an affair for each additional 
# increase of 1 year in the length of marriage.

# Beta1 is 0.06 so this means that there is 
# a .06 change (increase) in the log-odds of 
# having an affair for each additional year of marriage.
# Q5: 
# Consider the model fit in Question 2.
# Estimate the odds ratio for the decision to 
# engage in an extra-marital affair per decade 
# of marriage and it's 95% confidence interval. 
# What does the point estimate represent?

# output from ucheat.fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(ucheat.fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

#ANSWER: 
# So the OR for a 1 year increase in marriage is OR=1.06.
# for a 1 decade increase...
# NOTE that the code immediately below is pulling the 
# beta coefficients from the fit object. It exponentiates
# both betas but we are only interested in the slope coefficient
# for yrsmarried.  Note also the we multiply by 10 BEFORE 
# exponentiating. We could of course do this multiplication
# manually with the output from above.

exp(10*ucheat.fit$coefficients)
##  (Intercept)   yrsmarried 
## 1.032749e-07 1.800780e+00
# (Intercept)   yrsmarried 
# 1.032749e-07 1.800780e+00 

# To get the correct confidence interval limits 
# for a 1 decade increase, calculate the standard 
# confidence interval for a 1 year change, 
# multiply that by 10 and exponentiate to 
# get it on the OR scale.

confint(ucheat.fit)
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept) -1.97824219 -1.25863037
## yrsmarried   0.02525637  0.09293219
exp(10*confint(ucheat.fit))
## Waiting for profiling to be done...
##                    2.5 %       97.5 %
## (Intercept) 2.562143e-09 3.418517e-06
## yrsmarried  1.287322e+00 2.532791e+00
# Thus, the OR for having an affair per decade of marriage is 1.8 with
# 95% CI from 1.29 to 2.53. Therefore, we can say that the odds of
# having an affair increases by 80% for each decade of marriage.  We
# could also express this as, "the odds of having an affair increase
# by 1.8 times for each decade of marriage."
# Q6:
# Consider the model fit in Question 2.
# What is the predicted probability that 
# someone who has been married 1 year decides to 
# engage in an extra-marital affair?

# output from ucheat.fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -1.60859    0.18327  -8.777  < 2e-16 ***
# yrsmarried   0.05882    0.01724   3.412 0.000646 ***
# > exp(ucheat.fit$coefficients)
# (Intercept)  yrsmarried 
# 0.2001702   1.0605864 

#ANSWER:
# Getting predicted probabilities from a Logistic Regression Model: 
# NOTE helpful to refer back to Slide 34 in the mod1day2 slides
xb <- -1.60859 + (0.05882*1)
exp(xb)/(1+exp(xb))
## [1] 0.1751195
# [1] 0.1751195
# Thus the predicted probability of deciding to engage
# in an affair for someone who is married for a single year is 17.5%
#
# ALTERNATE SOLUTION USING PREDICT() FUNCTION
# Note that fit below refers to the original fit object we
# created above and newdata is the new x value we want to create
# a predicted probability for...
# Type = response below specifies that we want the prediction
# on the probability scale. 
predict(ucheat.fit,newdata=data.frame(yrsmarried=1),type="response")
##         1 
## 0.1751202
# This yields:
# 1 
# 0.1751202
# 
# we can extend this and plot the predicted 
# probabilities for a range of x values corresponding to the observed 
# x's for our data. First, generate a range of value based on what 
# we observed in the data
range(affairs2$yrsmarried)
## [1]  0.125 15.000
# This yields
# [1]  0.125 15.000
# let's create a sequence of 100 evenly spaced values between
# 0.125 and 15
xyrsmarried <- seq(0.125,15,length=100)

# Now let's use the predict function to create the model
# for all of the values of xyrsmarried.
yyrsmarried <- predict(ucheat.fit,list(yrsmarried=xyrsmarried),
                       type="response")

# Now we plot. Note that I have added jitter in the y-axis
# to help better visualize the data structure in the plot
plot(affairs2$yrsmarried,jitter(affairs2$anyaffair),pch=1,col="blue",
     xlab="YEARS MARRIED (yrs)",ylab="Prob(Affair)")
# note pch is plotting symbol and 1 is open circle
# col is color of the plotting symbols
# now add the predicted probabilities
lines(xyrsmarried,yyrsmarried,col="dark red",lwd=2)

# note that lwd is line width, col is color
# End of Program