# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 01-13-22 ~
# ~ Review Logistic Regression ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Logistic Data Example -- Prostate Cancer Data
# When a patient is diagnosed as having cancer of the prostate,
# an important question in deciding on treatment strategy for
# the patient is whether or not the cancer has spread to
# neighboring lymph nodes. For a sample of 53 prostate cancer patients,
# a number of possible predictor variables were measured before surgery.
# The patients then had surgery to determine nodal involvement.
# Assume the principal investigator was interested in investigating the
# relationship between nodal involvement found at surgery and serum
# acid phosphatase level.
# Data Dictionary
# X -- Patient No
# Xray -- Measure of the seriousness of the cancer taken
# from an X-ray reading with a value of 1 indicating
# a more serious case of the cancer.
# Grade -- Dichotomized tumor measure with a value of 1 indicating
# a more serious cancer.
# Stage -- Dichotomized tumor measure with a value of 1 indicating
# a more serious cancer.
# Age -- Age at diagnosis in years
# Acid -- Serum acid phosphatase (x 100)
# Nodes -- Nodal involvement found at surgery
# (1 = nodal involvement; 0 = no nodal involvement)
# load the dataset from the web:
load(url("http://www.duke.edu/~sgrambow/crp241data/prostate_node.RData"))
# Q1:
# Is there evidence of an association between
# nodal involvement and serum acid phosphatase level
# Create a scatterplot to address this question.
# quick summary to check for missing values and overall distribution
# of each variable
summary(prostate_node)
## X Xray Grade Stage Age
## Min. : 1 Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :45.00
## 1st Qu.:14 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:56.00
## Median :27 Median :0.000 Median :0.0000 Median :1.0000 Median :60.00
## Mean :27 Mean :0.283 Mean :0.3774 Mean :0.5094 Mean :59.38
## 3rd Qu.:40 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:65.00
## Max. :53 Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :68.00
## Acid Nodes
## Min. : 40.00 Min. :0.0000
## 1st Qu.: 50.00 1st Qu.:0.0000
## Median : 65.00 Median :0.0000
## Mean : 69.42 Mean :0.3774
## 3rd Qu.: 78.00 3rd Qu.:1.0000
## Max. :187.00 Max. :1.0000
# visualize
# NOTE the use of jitter() function to scatter overlying
# points so you get a better sense of the density of
# points in the plot. We demonstrated using jitter
# in both the x and y directions. Generally, people
# tend to use jitter in the y direction.
# 3 slightly different versions of the same plot
plot(prostate_node$Acid,prostate_node$Nodes) # No jitter

plot(prostate_node$Acid,jitter(prostate_node$Nodes)) # jitter in Y

plot(jitter(prostate_node$Acid),prostate_node$Nodes) # jitter in X

# Q2:
# Is there evidence of an association between
# serum acid phosphatase and nodal involvement at surgery at
# significance level 0.05? Address this question
# by fitting a logistic regression model.
#fit model
acid.fit <- glm(Nodes ~ Acid,data=prostate_node, family='binomial')
summary(acid.fit)
##
## Call:
## glm(formula = Nodes ~ Acid, family = "binomial", data = prostate_node)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0144 -0.9120 -0.8165 1.2871 1.5971
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.92703 0.92104 -2.092 0.0364 *
## Acid 0.02040 0.01257 1.624 0.1045
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 70.252 on 52 degrees of freedom
## Residual deviance: 67.116 on 51 degrees of freedom
## AIC: 71.116
##
## Number of Fisher Scoring iterations: 4
# This yields:
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.92703 0.92104 -2.092 0.0364 *
# Acid 0.02040 0.01257 1.624 0.1045
# ANSWER: Based on the p-value of 0.1045 for
# the Acid coefficient, we conclude there is
# NOT an association at significance level 0.05
# between serum acid phosphatase and nodal involvement
# at surgery.
#
# NOTE 1 -- if you forget to indicate the family in the
# glm() function call, it will default to linear regression
# and will happily provide you with the WRONG results -- it will
# not give you an error.
nofamily.acid.fit <- glm(Nodes ~ Acid,data=prostate_node)
summary(nofamily.acid.fit)
##
## Call:
## glm(formula = Nodes ~ Acid, data = prostate_node)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9100 -0.3438 -0.2849 0.5656 0.7196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.062943 0.188029 0.335 0.7392
## Acid 0.004530 0.002537 1.785 0.0802 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2298121)
##
## Null deviance: 12.453 on 52 degrees of freedom
## Residual deviance: 11.720 on 51 degrees of freedom
## AIC: 76.433
##
## Number of Fisher Scoring iterations: 2
# NOTE 2 -- The key output we want is provided by the
# summary() function which yields the beta coefficients
# and the p-values for the betas.
# Q3:
# Consider the model fit in Question 2.
# Do these data suggest that the association
# between serum acid phosphatase and
# nodal involvement at surgery is positive or negative?
# NOTE1: In question 2 we did not calculate the Odds Ratio
# which is the usual effect measure calculated from the
# slope coefficient for logistic regression.
# we can do this with the following code:
exp(acid.fit$coefficients)
## (Intercept) Acid
## 0.1455796 1.0206103
# This yields:
# > exp(acid.fit$coefficients)
# (Intercept) Acid
# 0.1455796 1.0206103
# ANSWER: So from the summary in Q2 above and the
# odds ratio code above, we see that the coefficient
# is positive; beta1 > 0 and OR > 1;
# positive association the higher your serum acid phosphatase,
# the higher the chance nodal involvement will be found at surgery.
# Q4:
# Consider the model fit in Question 2.
# What does the estimated slope coefficient
# for the predictor serum acid phosphatase
# represent?
# output from acid.fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.92703 0.92104 -2.092 0.0364 *
# Acid 0.02040 0.01257 1.624 0.1045
# > exp(acid.fit$coefficients)
# (Intercept) Acid
# 0.1455796 1.0206103
# ANSWER: Change in log-odds of Y=1 (nodal involvement) for a
# unit increase in X; here X=serum acid phosphatase so a unit increase
# in X is an additional 1 unit increase in serum acid phosphatase.
# So, it's the change in the log-odds of nodal involvement found
# at surgery for each additional 1 unit increase in serum acid phosphatase.
# Beta1 is 0.004530 so this means that there is
# a 0.005 change (increase) in the log-odds of
# nodal involvement for each 1 unit increase in serum acid phosphatase.
# Q5:
# Consider the model fit in Question 2.
# Estimate the odds ratio for nodal involvement per 50 unit
# increase in serum acid phosphatase and the associated 95% CI
# What does the point estimate represent?
# output from acid.fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.92703 0.92104 -2.092 0.0364 *
# Acid 0.02040 0.01257 1.624 0.1045
# > exp(acid.fit$coefficients)
# (Intercept) Acid
# 0.1455796 1.0206103
#ANSWER:
# So the OR for a 1 year unit increase in serum acid phosphatase
# is OR=1.02. For a 10 unit increase...
# NOTE that the code immediately below is pulling the
# beta coefficients from the fit object. It exponentiates
# both betas but we are only interested in the slope coefficient
# for Acid. Note also the we multiply by 10 BEFORE
# exponentiating. We could of course do this multiplication
# manually with the output from above.
exp(10*acid.fit$coefficients)
## (Intercept) Acid
## 4.275675e-09 1.226308e+00
# (Intercept) Acid
# 4.275675e-09 1.226308e+00
# So the OR for a 10 unit increase is 1.23
# To get the correct confidence interval limits
# for a 10 unit increase, calculate the standard
# confidence interval for a 1 unit change,
# multiply that by 10 and exponentiate to
# get it on the OR scale.
confint(acid.fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -3.937742047 -0.25838113
## Acid -0.002072047 0.04836487
exp(10*confint(acid.fit))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 7.917783e-18 0.07548575
## Acid 9.794927e-01 1.62198170
# > exp(10*confint(acid.fit))
# Waiting for profiling to be done...
# 2.5 % 97.5 %
# (Intercept) 7.917783e-18 0.07548575
# Acid 9.794927e-01 1.62198170
# Thus, the OR for nodal involvement per 10 unit change in
# serum acid phosphatase is 1.23 with 95% CI from 0.98 to 1.62.
# Therefore, we can say that the odds of nodal involvement at surgery
# increases by 23% for each 10 unit increase in serum acid phosphatase.
# We could also express this as, "the odds of nodal involvement increase
# by 1.23 times for each 10 unit increase in serum acid phosphatase."
# Q6:
# Consider the model fit in Question 2.
# What is the predicted probability that
# someone who with a serum acid phosphatase of 78 (Q3) will have
# nodal involvement found at surgery?
# output from acid.fit
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.92703 0.92104 -2.092 0.0364 *
# Acid 0.02040 0.01257 1.624 0.1045
# > exp(acid.fit$coefficients)
# (Intercept) Acid
# 0.1455796 1.0206103
#ANSWER:
# Getting predicted probabilities from a Logistic Regression Model:
# (see slide 30 from calss notes)
xb <- -1.92703 + (0.02040*78)
exp(xb)/(1+exp(xb))
## [1] 0.4168228
# [1] 0.4168228
# Thus the predicted probability is 41.7%.
# ALTERNATE SOLUTION USING PREDICT() FUNCTION
# Note that fit below refers to the original fit object we
# created above and newdata is the new x value we want to create
# a predicted probability for...
# Type = response below specifies that we want the prediction
# on the probability scale.
predict(acid.fit,newdata=data.frame(Acid=78),type="response")
## 1
## 0.4168367
# This yields:
# 1
# 0.4168367
#
# we can extend this and plot the predicted
# probabilities for a range of x values corresponding to the observed
# x's for our data. First, generate a range of value based on what
# we observed in the data
range(prostate_node$Acid)
## [1] 40 187
# This yields
# [1] 40 187
# let's create a sequence of 100 evenly spaced values between
# 40 and 187
xAcid <- seq(40,187,length=100)
# Now let's use the predict function to create the model
# for all of the values of xacid.
yxAcid <- predict(acid.fit,list(Acid=xAcid),
type="response")
# Now we plot. Note that I have added jitter in the y-axis
# to help better visualize the data structure in the plot
plot(prostate_node$Acid,jitter(prostate_node$Nodes),pch=1,col="blue",
xlab="Serum Acid Phosphatase",ylab="Prob(Nodal Involvement)")
# note pch is plotting symbol and 1 is open circle
# col is color of the plotting symbols
# now add the predicted probabilities
lines(xAcid,yxAcid,col="dark red",lwd=2)

# note that lwd is line width, col is color
# Instead of overlaying, we can just plot the predicted points
plot(xAcid,yxAcid,col="dark red",pch=1,
xlab="Serum Acid Phosphatase",ylab="Prob(Nodal Involvement)")

# End of Program