# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 1 Day 2 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Fair's Affair Data Example:
# A cross-sectional study was performed to
# understand the factors that correlate with an
# individuals decision to engage in extramarital
# affairs. 601 individuals participated
# in the study. The response of interest was
# whether or not the subject engaged in any
# extramarital sexual relations at least once in
# the last 12 months. Researchers wanted
# to understand the relationship between a
#subjects decision to engage in an affair and
# the length of their marriage.
# Data Set: affairs2
# Data Dictionary:
# (1) id Subject identifier
# (2) anyaffair Subject engaged in extramarital
# sexual relations at least once
# in the last 12 months (1 = Yes; 0 = No)
# (3) yrsmarried Number of years subject has been married
# at time of study (in years)
## Download and load the data file = affairs2
download.file("http://www.duke.edu/~sgrambow/crp241data/affairs2.RData",
destfile = "affairs2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("affairs2.RData")
# Q1:
# Is there evidence of an association between
# the number of years married and the decision
# to engage in an extra-marital affair?
# Create a scatterplot to address this question.
#visualize
# NOTE the use of jitter() function to scatter overlying
# points so you get a better sense of the density of
# points in the plot. We demonstrated using jitter
# in both the x and y directions. Generally, people
# tend to use jitter in the y direction.
plot(affairs2$yrsmarried,affairs2$anyaffair)

plot(affairs2$yrsmarried,jitter(affairs2$anyaffair))

plot(jitter(affairs2$yrsmarried),affairs2$anyaffair)

# Q2:
# Is there evidence of an association between
# the number of years married and the decision
# to engage in an extra-marital affair at
# significance level 0.05? Address this question
# by fitting a logistic regression model.
#fit model
fit <- glm(anyaffair ~ yrsmarried, data=affairs2, family='binomial')
summary(fit)
##
## Call:
## glm(formula = anyaffair ~ yrsmarried, family = "binomial", data = affairs2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8883 -0.7846 -0.6720 -0.6061 1.8894
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.60859 0.18327 -8.777 < 2e-16 ***
## yrsmarried 0.05882 0.01724 3.412 0.000646 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 663.50 on 599 degrees of freedom
## AIC: 667.5
##
## Number of Fisher Scoring iterations: 4
exp(fit$coefficients)
## (Intercept) yrsmarried
## 0.2001702 1.0605864
# ANSWER: Based on the p-value of 0.000646 for
# the yrsmarried coefficient, we conclude there is
# an association between the number of years married
# and the decision to engage in an extra-marital affair.
#
# NOTE 1 -- if you forget to indicate the family in the
# glm() function call, it will default to linear regression
# and will happily provide you with the WRONG results -- it will
# not give you an error.
# NOTE 2 -- The key output we want is provided by the
# summary() function which yields the beta coefficients
# and the p-values for the betas. Remember that the
# betas here are describing changes on the log odds scale.
# Q3:
# Consider the model fit in Question 2.
# Do these data suggest that the association
# between the number of years married and
# the decision to engage in an extra-marital
# affair is positive or negative?
# output from summary(fit) above
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.60859 0.18327 -8.777 < 2e-16 ***
# yrsmarried 0.05882 0.01724 3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept) yrsmarried
# 0.2001702 1.0605864
# ANSWER: coefficient is positive; beta1 > 0 and OR > 1;
# positive association the longer your are married,
# the higher the chance you will have an affair.
# Q4:
# Consider the model fit in Question 2.
# What does the estimated slope coefficient
# for the predictor number of years married
# represent?
# output from fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.60859 0.18327 -8.777 < 2e-16 ***
# yrsmarried 0.05882 0.01724 3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept) yrsmarried
# 0.2001702 1.0605864
# ANSWER: Change in log-odds of Y=1 (having an affair) for a
# unit increase in X; here X=yrsmarried so a unit increase
# in X is additional year of marriage. So, it's the change
# in the log-odds of having an affair for each additional
# increase of 1 year in the length of marriage.
# Q5:
# Consider the model fit in Question 2.
# Estimate the odds ratio for the decision to
# engage in an extra-marital affair per decade
# of marriage and it's 95% confidence interval.
# What does the point estimate represent?
# output from fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.60859 0.18327 -8.777 < 2e-16 ***
# yrsmarried 0.05882 0.01724 3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept) yrsmarried
# 0.2001702 1.0605864
#ANSWER: So the OR for a 1 year increase in marriage is OR=1.06.
# for a 1 decade increase...
# NOTE that the code immediately below is pulling the
# beta coefficients from the fit object. It exponentiates
# both betas but we are only interested in the slope coefficient
# for yrsmarried. Note also the we multiple by 10 BEFORE
# exponentiating.
exp(10*fit$coefficients)
## (Intercept) yrsmarried
## 1.032749e-07 1.800780e+00
# (Intercept) yrsmarried
# 1.032749e-07 1.800780e+00
# To get the correct confidence interaval
# limits for a 1 decade increase,
# calculate the standard confidence interval
# for a 1 year change, multiply that by 10 and
# exponent iate to get it on the OR scale.
confint(fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.97824219 -1.25863037
## yrsmarried 0.02525637 0.09293219
exp(10*confint(fit))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.562143e-09 3.418517e-06
## yrsmarried 1.287322e+00 2.532791e+00
# Thus, the OR for having an affair per decade of marriage is 1.8 with
# 95% CI from 1.29 to 2.53. Therefore, we can say that the odds of
# having an affair increases by 80% for each decade of marriage. We
# could also express this as, "the odds of having an affair increase
# by 1.8 times for each decade of marriage."
# Q6:
# Consider the model fit in Question 2.
# What is the predicted probability that
# someone who has been married 1 year decides to
# engage in an extra-marital affair?
# output from fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.60859 0.18327 -8.777 < 2e-16 ***
# yrsmarried 0.05882 0.01724 3.412 0.000646 ***
# > exp(fit$coefficients)
# (Intercept) yrsmarried
# 0.2001702 1.0605864
#ANSWER:
# Getting predicted probabilities from a Logistic Regression Model:
# NOTE helpful to refer back to Slide 34 in the mod1day2 slides
xb <- -1.60859 + (0.05882*1)
exp(xb)/(1+exp(xb))
## [1] 0.1751195
# [1] 0.1751195
# Thus the predicted probability of deciding to engage
# in an affair for someone who is married for a single year is 17.5%
#
# ALTERNATE SOLUTION USING PREDICT() FUNCTION
# Note that fit below refers to the original fit object we
# created above and newdata is the new x value we want to create
# a predicted probability for...
# Type = response below specifies that we want the prediction
# on the probability scale.
predict(fit,newdata=data.frame(yrsmarried=1),type="response")
## 1
## 0.1751202
# This yields:
# 1
# 0.1751202
#
# we can extend this and plot the predicted
# probabilities for a range of x values corresponding to the observed
# x's for our data. First, generate a range of value based on what
# we observed in the data
range(affairs2$yrsmarried)
## [1] 0.125 15.000
# This yields
# [1] 0.125 15.000
# let's create a sequence of 100 evenly spaced values between
# 0.125 and 15
xyrsmarried <- seq(0.125,15,length=100)
# Now let's use the predict function to create the model
# for all of the values of xyrsmarried.
yyrsmarried <- predict(fit,list(yrsmarried=xyrsmarried),type="response")
# Now we plot. Note that I have added jitter in the y-axis
# to help better visualize the data structure in the plot
plot(affairs2$yrsmarried,jitter(affairs2$anyaffair),pch=1,col="blue",
xlab="YEARS MARRIED (yrs)",ylab="Prob(Affair)")
# note pch is plotting symbol and 1 is open circle
# col is color of the plotting symbols
# now add the predicted probabilities
lines(xyrsmarried,yyrsmarried,col="dark red",lwd=2)

# note that lwd is line width, col is color
# End of Program