# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Part 1 01-13-22    ~
# ~ Review Logistic Regression ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Logistic Data Example -- Prostate Cancer Data

# When a patient is diagnosed as having cancer of the prostate, 
# an important question in deciding on treatment strategy for 
# the patient is whether or not the cancer has spread to 
# neighboring lymph nodes. For a sample of 53 prostate cancer patients,
# a number of possible predictor variables were measured before surgery. 
# The patients then had surgery to determine nodal involvement. 
# Assume the principal investigator was interested in investigating the 
# relationship between nodal involvement found at surgery and serum
# acid phosphatase level.

# Data Dictionary
# X         -- Patient No
# Xray      -- Measure of the seriousness of the cancer taken 
#              from an X-ray reading with a value of 1 indicating 
#              a more serious case of the cancer.
# Grade     -- Dichotomized tumor measure with a value of 1 indicating
#              a more serious cancer.
# Stage     -- Dichotomized tumor measure with a value of 1 indicating
#              a more serious cancer.
# Age       -- Age at diagnosis in years
# Acid      -- Serum acid phosphatase (x 100)
# Nodes     -- Nodal involvement found at surgery 
#              (1 = nodal involvement; 0 = no nodal involvement)

# load the dataset from the web:
load(url("http://www.duke.edu/~sgrambow/crp241data/prostate_node.RData"))
# Q1:
# Is there evidence of an association between 
# nodal involvement and serum acid phosphatase level
# Create a scatterplot to address this question.

# quick summary to check for missing values and overall distribution
# of each variable
summary(prostate_node)
##        X           Xray           Grade            Stage             Age       
##  Min.   : 1   Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :45.00  
##  1st Qu.:14   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:56.00  
##  Median :27   Median :0.000   Median :0.0000   Median :1.0000   Median :60.00  
##  Mean   :27   Mean   :0.283   Mean   :0.3774   Mean   :0.5094   Mean   :59.38  
##  3rd Qu.:40   3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:65.00  
##  Max.   :53   Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :68.00  
##       Acid            Nodes       
##  Min.   : 40.00   Min.   :0.0000  
##  1st Qu.: 50.00   1st Qu.:0.0000  
##  Median : 65.00   Median :0.0000  
##  Mean   : 69.42   Mean   :0.3774  
##  3rd Qu.: 78.00   3rd Qu.:1.0000  
##  Max.   :187.00   Max.   :1.0000
# visualize
# NOTE the use of jitter() function to scatter overlying
# points so you get a better sense of the density of 
# points in the plot.  We demonstrated using jitter
# in both the x and y directions.  Generally, people
# tend to use jitter in the y direction.
# 3 slightly different versions of the same plot

plot(prostate_node$Acid,prostate_node$Nodes) # No jitter

plot(prostate_node$Acid,jitter(prostate_node$Nodes)) # jitter in Y

plot(jitter(prostate_node$Acid),prostate_node$Nodes) # jitter in X

# Q2:
# Is there evidence of an association between 
# serum acid phosphatase and nodal involvement at surgery at 
# significance level 0.05? Address this question 
# by fitting a logistic regression model.

#fit model
acid.fit <- glm(Nodes ~ Acid,data=prostate_node, family='binomial')
summary(acid.fit)
## 
## Call:
## glm(formula = Nodes ~ Acid, family = "binomial", data = prostate_node)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0144  -0.9120  -0.8165   1.2871   1.5971  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -1.92703    0.92104  -2.092   0.0364 *
## Acid         0.02040    0.01257   1.624   0.1045  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.252  on 52  degrees of freedom
## Residual deviance: 67.116  on 51  degrees of freedom
## AIC: 71.116
## 
## Number of Fisher Scoring iterations: 4
# This yields:
# Coefficients:
#   Estimate   Std. Error   z value           Pr(>|z|)  
#   (Intercept) -1.92703    0.92104  -2.092   0.0364 *
#   Acid         0.02040    0.01257   1.624   0.1045  

# ANSWER: Based on the p-value of 0.1045 for 
# the Acid coefficient, we conclude there is 
# NOT an association at significance level 0.05
# between serum acid phosphatase and nodal involvement
# at surgery.
# 
# NOTE 1 -- if you forget to indicate the family in the 
# glm() function call, it will default to linear regression
# and will happily provide you with the WRONG results -- it will
# not give you an error.

nofamily.acid.fit <- glm(Nodes ~ Acid,data=prostate_node)
summary(nofamily.acid.fit)
## 
## Call:
## glm(formula = Nodes ~ Acid, data = prostate_node)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9100  -0.3438  -0.2849   0.5656   0.7196  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 0.062943   0.188029   0.335   0.7392  
## Acid        0.004530   0.002537   1.785   0.0802 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2298121)
## 
##     Null deviance: 12.453  on 52  degrees of freedom
## Residual deviance: 11.720  on 51  degrees of freedom
## AIC: 76.433
## 
## Number of Fisher Scoring iterations: 2
# NOTE 2 -- The key output we want is provided by the
# summary() function which yields the beta coefficients
# and the p-values for the betas. 
# Q3:
# Consider the model fit in Question 2.
# Do these data suggest that the association 
# between serum acid phosphatase and 
# nodal involvement at surgery is positive or negative?

# NOTE1: In question 2 we did not calculate the Odds Ratio 
# which is the usual effect measure calculated from the
# slope coefficient for logistic regression.
# we can do this with the following code:

exp(acid.fit$coefficients)
## (Intercept)        Acid 
##   0.1455796   1.0206103
# This yields:
# > exp(acid.fit$coefficients)
# (Intercept)        Acid 
#   0.1455796   1.0206103

# ANSWER: So from the summary in Q2 above and the 
# odds ratio code above, we see that the coefficient 
# is positive; beta1 > 0 and OR > 1; 
# positive association the higher your serum acid phosphatase, 
# the higher the chance nodal involvement will be found at surgery.
# Q4:
# Consider the model fit in Question 2.
# What does the estimated slope coefficient 
# for the predictor serum acid phosphatase 
# represent?

# output from acid.fit
# Coefficients:
#   Estimate   Std. Error   z value           Pr(>|z|)  
#   (Intercept) -1.92703    0.92104  -2.092   0.0364 *
#   Acid         0.02040    0.01257   1.624   0.1045  

# > exp(acid.fit$coefficients)
# (Intercept)        Acid 
#   0.1455796   1.0206103


# ANSWER: Change in log-odds of Y=1 (nodal involvement) for a 
# unit increase in X; here X=serum acid phosphatase so a unit increase 
# in X is an additional 1 unit increase in serum acid phosphatase. 
# So, it's the change in the log-odds of nodal involvement found 
# at surgery for each additional 1 unit increase in serum acid phosphatase.

# Beta1 is 0.004530 so this means that there is 
# a 0.005 change (increase) in the log-odds of 
# nodal involvement for each 1 unit increase in serum acid phosphatase.
# Q5: 
# Consider the model fit in Question 2.
# Estimate the odds ratio for nodal involvement per 50 unit
# increase in serum acid phosphatase and the associated 95% CI
# What does the point estimate represent?

# output from acid.fit
# Coefficients:
#   Estimate   Std. Error   z value           Pr(>|z|)  
#   (Intercept) -1.92703    0.92104  -2.092   0.0364 *
#   Acid         0.02040    0.01257   1.624   0.1045  

# > exp(acid.fit$coefficients)
# (Intercept)        Acid 
#   0.1455796   1.0206103


#ANSWER: 
# So the OR for a 1 year unit increase in serum acid phosphatase 
# is OR=1.02. For a 10 unit increase...
# NOTE that the code immediately below is pulling the 
# beta coefficients from the fit object. It exponentiates
# both betas but we are only interested in the slope coefficient
# for Acid.  Note also the we multiply by 10 BEFORE 
# exponentiating. We could of course do this multiplication
# manually with the output from above.

exp(10*acid.fit$coefficients)
##  (Intercept)         Acid 
## 4.275675e-09 1.226308e+00
# (Intercept)         Acid 
# 4.275675e-09 1.226308e+00  
# So the OR for a 10 unit increase is 1.23

# To get the correct confidence interval limits 
# for a 10 unit increase, calculate the standard 
# confidence interval for a 1 unit change, 
# multiply that by 10 and exponentiate to 
# get it on the OR scale.

confint(acid.fit)
## Waiting for profiling to be done...
##                    2.5 %      97.5 %
## (Intercept) -3.937742047 -0.25838113
## Acid        -0.002072047  0.04836487
exp(10*confint(acid.fit))
## Waiting for profiling to be done...
##                    2.5 %     97.5 %
## (Intercept) 7.917783e-18 0.07548575
## Acid        9.794927e-01 1.62198170
# > exp(10*confint(acid.fit))
# Waiting for profiling to be done...
# 2.5 %     97.5 %
# (Intercept) 7.917783e-18 0.07548575
# Acid        9.794927e-01 1.62198170

# Thus, the OR for nodal involvement per 10 unit change in 
# serum acid phosphatase is 1.23 with 95% CI from 0.98 to 1.62. 
# Therefore, we can say that the odds of nodal involvement at surgery
# increases by 23% for each 10 unit increase in serum acid phosphatase.
# We could also express this as, "the odds of nodal involvement increase
# by 1.23 times for each 10 unit increase in serum acid phosphatase."
# Q6:
# Consider the model fit in Question 2.
# What is the predicted probability that 
# someone who with a serum acid phosphatase of 78 (Q3) will have  
# nodal involvement found at surgery?

# output from acid.fit
# Coefficients:
#   Estimate   Std. Error   z value           Pr(>|z|)  
#   (Intercept) -1.92703    0.92104  -2.092   0.0364 *
#   Acid         0.02040    0.01257   1.624   0.1045  

# > exp(acid.fit$coefficients)
# (Intercept)        Acid 
#   0.1455796   1.0206103

#ANSWER:
# Getting predicted probabilities from a Logistic Regression Model:
# (see slide 30 from calss notes)
xb <- -1.92703 + (0.02040*78)
exp(xb)/(1+exp(xb))
## [1] 0.4168228
# [1] 0.4168228
# Thus the predicted probability is 41.7%.

# ALTERNATE SOLUTION USING PREDICT() FUNCTION
# Note that fit below refers to the original fit object we
# created above and newdata is the new x value we want to create
# a predicted probability for...
# Type = response below specifies that we want the prediction
# on the probability scale. 
predict(acid.fit,newdata=data.frame(Acid=78),type="response")
##         1 
## 0.4168367
# This yields:
# 1 
# 0.4168367
# 
# we can extend this and plot the predicted 
# probabilities for a range of x values corresponding to the observed 
# x's for our data. First, generate a range of value based on what 
# we observed in the data
range(prostate_node$Acid)
## [1]  40 187
# This yields
# [1]  40 187
# let's create a sequence of 100 evenly spaced values between
# 40 and 187
xAcid <- seq(40,187,length=100)

# Now let's use the predict function to create the model
# for all of the values of xacid.
yxAcid <- predict(acid.fit,list(Acid=xAcid),
                       type="response")

# Now we plot. Note that I have added jitter in the y-axis
# to help better visualize the data structure in the plot
plot(prostate_node$Acid,jitter(prostate_node$Nodes),pch=1,col="blue",
     xlab="Serum Acid Phosphatase",ylab="Prob(Nodal Involvement)")
# note pch is plotting symbol and 1 is open circle
# col is color of the plotting symbols
# now add the predicted probabilities
lines(xAcid,yxAcid,col="dark red",lwd=2)

# note that lwd is line width, col is color

# Instead of overlaying, we can just plot the predicted points
plot(xAcid,yxAcid,col="dark red",pch=1,
     xlab="Serum Acid Phosphatase",ylab="Prob(Nodal Involvement)")

# End of Program