Prostate Cancer

Part I: Exploratory Data Analysis

  • Read data into R. Conduct a visual inspection of the data in ”prostate.txt”

  • Check for missing values in the data replace missing values with “NA”

  • Drop irrelevant variables

  • Check variable types and determine which variable(s) should be treated as qualitative/categorical and which should be treated as quantitative

  • Change variable type for quantitative ad qualitative variables if necessary

  • Draw pie chart (with class percentage) for each categorical variable

  • Draw histograms each quantitative variable to determine their distributions

  • Decide on whether variables should be transformed

  • Draw scatter plot matrix among quantitative variables with the lower panel showing correlation coefficients to determine their relationships

  • Obtain the pairwise correlation matrix for all quantitative variables

  • Draw side-by-side box plots for ”PSA” with respect to each categorical variable

###DATA PROCESSING/MANIPULATION

##Read data into R
data <- read.table("C:/Users/kayan/R-Projects/STA206/Project/data/prostate.txt", header=F)

##Assign header names 
names(data) = c("patient_id","psa_level","cancer_vol","weight", "age", "benign_prostatic_hyperplasia", "seminal_vesicle_invasion", "capsular_penetration", "gleason_score")

##Change data to dataframe
data = as.data.frame(data)

##Check summary statistic for each variable:
summary(data)
##    patient_id   psa_level         cancer_vol          weight      
##  Min.   : 1   Min.   :  0.651   Min.   : 0.2592   Min.   : 10.70  
##  1st Qu.:25   1st Qu.:  5.641   1st Qu.: 1.6653   1st Qu.: 29.37  
##  Median :49   Median : 13.330   Median : 4.2631   Median : 37.34  
##  Mean   :49   Mean   : 23.730   Mean   : 6.9987   Mean   : 45.49  
##  3rd Qu.:73   3rd Qu.: 21.328   3rd Qu.: 8.4149   3rd Qu.: 48.42  
##  Max.   :97   Max.   :265.072   Max.   :45.6042   Max.   :450.34  
##       age        benign_prostatic_hyperplasia seminal_vesicle_invasion
##  Min.   :41.00   Min.   : 0.000               Min.   :0.0000          
##  1st Qu.:60.00   1st Qu.: 0.000               1st Qu.:0.0000          
##  Median :65.00   Median : 1.350               Median :0.0000          
##  Mean   :63.87   Mean   : 2.535               Mean   :0.2165          
##  3rd Qu.:68.00   3rd Qu.: 4.759               3rd Qu.:0.0000          
##  Max.   :79.00   Max.   :10.278               Max.   :1.0000          
##  capsular_penetration gleason_score  
##  Min.   : 0.0000      Min.   :6.000  
##  1st Qu.: 0.0000      1st Qu.:6.000  
##  Median : 0.4493      Median :7.000  
##  Mean   : 2.2454      Mean   :6.876  
##  3rd Qu.: 3.2544      3rd Qu.:7.000  
##  Max.   :18.1741      Max.   :8.000
##check for variable types 
sapply(data, class)
##                   patient_id                    psa_level 
##                    "integer"                    "numeric" 
##                   cancer_vol                       weight 
##                    "numeric"                    "numeric" 
##                          age benign_prostatic_hyperplasia 
##                    "integer"                    "numeric" 
##     seminal_vesicle_invasion         capsular_penetration 
##                    "integer"                    "numeric" 
##                gleason_score 
##                    "integer"
##Check number of missing values for each variable:
sapply(data, function(x) sum(is.na(x)))
##                   patient_id                    psa_level 
##                            0                            0 
##                   cancer_vol                       weight 
##                            0                            0 
##                          age benign_prostatic_hyperplasia 
##                            0                            0 
##     seminal_vesicle_invasion         capsular_penetration 
##                            0                            0 
##                gleason_score 
##                            0
##Drop patient_id 
drops=c("patient_id")
prostate=data[,!(names(data)%in%drops)]

##Change variables to categorical
prostate$gleason_score <- as.factor(prostate$gleason_score) 
prostate$seminal_vesicle_invasion <- as.factor(prostate$seminal_vesicle_invasion) 

##check for variable types 
sapply(prostate, class)
##                    psa_level                   cancer_vol 
##                    "numeric"                    "numeric" 
##                       weight                          age 
##                    "numeric"                    "integer" 
## benign_prostatic_hyperplasia     seminal_vesicle_invasion 
##                    "numeric"                     "factor" 
##         capsular_penetration                gleason_score 
##                    "numeric"                     "factor"
###DATA VISUALIZATION - 1

##Pie charts (with class percentage) for seminal_vesicle_invasion
table(prostate$seminal_vesicle_invasion)
## 
##  0  1 
## 76 21
levels(prostate$seminal_vesicle_invasion)
## [1] "0" "1"
n <- nrow(prostate)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
svi <- data.frame("Seminal" = c("Absence", "Presence"), "Count" = c(76, 21))
colors <- c('#FC8D62', "#8DA0CB")

fig <- plot_ly(svi, labels = ~Seminal, values = ~Count, type = 'pie',
        textposition = 'inside',
        textinfo = 'label+percent',
        insidetextfont = list(color = '#FFFFFF'),
        hoverinfo = 'text',
        text = ~paste(Count),
        marker = list(colors = colors,
                      line = list(color = '#FFFFFF', width = 1)),
                      #The 'pull' attribute can also be used to create space between the sectors
        showlegend = FALSE)
(fig <- fig %>% layout(title = 'Seminal Vesicle Invasion',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)))
##Pie charts (with class percentage) for gleason_score
table(prostate$gleason_score)
## 
##  6  7  8 
## 33 43 21
levels(prostate$gleason_score)
## [1] "6" "7" "8"
gs <- data.frame("Gleason" = c("Bad Prognosis (6)","Worse Prognosis (7)", "Worst Prognosis (8)"), "count" = c(33 , 43, 21  ))
colors <- c('#khaki', "palegreen", "plum")

fig1 <- plot_ly(gs, labels = ~Gleason, values = ~count, type = 'pie',
        textposition = 'inside',
        textinfo = 'label+percent',
        insidetextfont = list(color = '#FFFFFF'),
        hoverinfo = 'text',
        text = ~paste(count),
        marker = list(colors = colors,
                      line = list(color = '#FFFFFF', width = 1)),
                      #The 'pull' attribute can also be used to create space between the sectors
        showlegend = FALSE)
(fig1 <- fig1 %>% layout(title = 'Gleason Score',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)))
###DATA VISUALIZATION - 2

##Histograms of each quantitative variable

##function to combine plots in one panel
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }
 if (numPlots==1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

#Plot 1 - PSA level
library(ggplot2)
ggp1 <- ggplot(prostate, aes(x = psa_level)) + xlab("PSA Level") +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "white") +
    geom_vline(xintercept=mean(prostate$psa_level), col="red", lwd=1.5) 
#Plot 2 - Cancer Volume
ggp2<-ggplot(prostate, aes(x = cancer_vol)) + xlab("Cancer Volume") +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "white") +
    geom_vline(xintercept=mean(prostate$cancer_vol), col="red", lwd=1.5)
#Plot 3 - Weight
ggp3<-ggplot(prostate, aes(x = weight)) + xlab("Weight") +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "white") +
    geom_vline(xintercept=mean(prostate$weight), col="red", lwd=1.5)
#Plot 4 - age
ggp4<-ggplot(prostate, aes(x = age)) + xlab("Age") +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "white") +
    geom_vline(xintercept=mean(prostate$age), col="red", lwd=1.5)
#Plot 5 - Benign prostatic Hyperplasia
ggp5<-ggplot(prostate, aes(x = benign_prostatic_hyperplasia)) + xlab("Benign Prostatic Hyperplasia") +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "white") +
    geom_vline(xintercept=mean(prostate$benign_prostatic_hyperplasia), col="red", lwd=1.5)
#Plot 6 - Capsular Penetration
ggp6<-ggplot(prostate, aes(x = capsular_penetration)) + xlab("Capsular Penetration") +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "white") +
    geom_vline(xintercept=mean(prostate$capsular_penetration), col="red", lwd=1.5)

myplots <- list(ggp1, ggp2, ggp3, ggp4, ggp5, ggp6) 
multiplot(plotlist = myplots, cols = 3)

##transform PSA level to eliminate right skewed distribution 

##Box-cox transformation
fit1<-lm(psa_level~., data=prostate) #first-order model with the (non-transformed) variables
par(mfrow=c(2,2))
plot(fit1,pch =20, cex = 2, col = "aquamarine2")

###The Box-Cox likelihood suggests a power transformation with power -0.1.
bc<-MASS::boxcox(fit1)
(lambda <- bc$x[which.max(bc$y)])
## [1] 0.1010101

##psa_level transformation 
ggp0 <- ggplot(prostate, aes(x = psa_level^(-0.1))) + xlab("Histogram of psa_level^(-0.1))") +
    geom_histogram(aes(y = ..density..), bins = 30, fill = "gold", color = "white") + 
                stat_function(fun = dnorm,
                args = list(mean = mean(prostate$psa_level^(-0.1)),
                            sd = sd(prostate$psa_level^(-0.1))),
                col = "grey40",
                size = 0.5)
ggp7 <- ggplot(prostate, aes(x = log(psa_level))) + xlab("Histogram of log(psa_level)") +
    geom_histogram(aes(y = ..density..),bins = 30, fill = "rosybrown", color = "white") +
                stat_function(fun = dnorm,
                args = list(mean = mean(log(prostate$psa_level)),
                            sd = sd(log(prostate$psa_level))),
                col = "grey40",
                size = 0.5)
#Plot 2 - Cancer Volume
ggp8<-ggplot(prostate, aes(x = sqrt(psa_level))) + xlab("Histogram of sqrt(psa_level)") +
    geom_histogram(aes(y = ..density..),bins = 30, fill = "springgreen", color = "white")+
                stat_function(fun = dnorm,
                args = list(mean = mean(sqrt(prostate$psa_level)),
                            sd = sd(sqrt(prostate$psa_level))),
                col = "grey40",
                size = 0.5) 
#Plot 3 - Weight
ggp9<-ggplot(prostate, aes(x = 1/psa_level)) + xlab("Histogram of 1/psa_level") +
    geom_histogram(aes(y = ..density..),bins = 30, fill = "darkmagenta", color = "white")  +
                stat_function(fun = dnorm,
                args = list(mean = mean(1/prostate$psa_level),
                            sd = sd(1/prostate$psa_level)),
                col = "grey40",
                size = 0.5) 
myplots2 <- list(ggp0,ggp7, ggp8, ggp9) 
multiplot(plotlist = myplots2, cols = 2)

##transformation of  PSA level 
prostate$psa_level<-log(prostate$psa_level)

##Model diagnostic
fit2 <-lm(psa_level~., data=prostate) #first-order model with the (transformed) variable
par(mfrow=c(2,2))
plot(fit2,pch =20, cex = 2, col = "aquamarine2")

##DATA VISUALIZATION - 3 

##scatter plot matrix among quantitative variables with the lower panel showing correlation
# Correlation panel
panel.cor <- function(x, y) {
    #usr <- par('usr') on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- round(cor(x, y, use = "complete.obs"), 2)
    txt <- paste0("R = ", r)
    cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * r)
}

# Customize upper panel
upper.panel<-function(x, y){
  #my_cols <- c("#00AFBB", "#E7B800", "#FC4E07") 
  points(x,y, pch = 19, col = "red")
}

# Create the plots
pairs(~psa_level + cancer_vol + weight + age + benign_prostatic_hyperplasia + capsular_penetration, data = prostate,lower.panel = panel.cor, upper.panel = upper.panel)

##Scatter plot 2
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#p <- ggpairs(dat_psa)
# put scatterplots on top so y axis is vertical
(p <- ggpairs(prostate %>% select(psa_level, cancer_vol, weight, age, benign_prostatic_hyperplasia, capsular_penetration)
            #, upper = list(continuous = wrap("points", alpha = 0.2, size = 0.5))
            , aes(color = prostate$seminal_vesicle_invasion)
            , upper = list(continuous = "points")
            , lower = list(continuous = "cor")
            ))

##Pairwise correlation matrix for all quantitative variables
quantitative<-c('cancer_vol','weight','age','benign_prostatic_hyperplasia' , 'capsular_penetration')
cor(prostate[,c("psa_level",quantitative)], use = "pairwise.complete.obs")
##                              psa_level   cancer_vol      weight        age
## psa_level                    1.0000000  0.657073936 0.121720757 0.16990682
## cancer_vol                   0.6570739  1.000000000 0.005107148 0.03909442
## weight                       0.1217208  0.005107148 1.000000000 0.16432371
## age                          0.1699068  0.039094423 0.164323714 1.00000000
## benign_prostatic_hyperplasia 0.1574016 -0.133209431 0.321848748 0.36634121
## capsular_penetration         0.5180231  0.692896688 0.001578905 0.09955535
##                              benign_prostatic_hyperplasia capsular_penetration
## psa_level                                      0.15740158          0.518023108
## cancer_vol                                    -0.13320943          0.692896688
## weight                                         0.32184875          0.001578905
## age                                            0.36634121          0.099555351
## benign_prostatic_hyperplasia                   1.00000000         -0.083008649
## capsular_penetration                          -0.08300865          1.000000000
##DATA VISUALIZATION - 4 

##side-by-side box plots for ”PSA” with respect to each categorical variable
fig4 <- plot_ly(prostate, y = ~psa_level, color = ~gleason_score, type = "box")
fig5 <- plot_ly(prostate, y = ~psa_level, color = ~seminal_vesicle_invasion, type = "box")
# subplot
fig6 <- subplot(fig4, fig5, nrows = 1) 
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
fig6 <- fig6 %>% layout(title = "PSA Level: Side-by-Side Boxplot by Gleason Score (Left) and Seminal Vesicle Invasion (Right)")
fig6
##plot the distribution of PSA level 
##by seminal invasion in using kernel density plots
ggp7<-ggplot(prostate, 
       aes(x = psa_level, 
           fill = seminal_vesicle_invasion)) +
  geom_density(alpha = 0.4) +
  labs(title = "PSA Level distribution by Seminal Vesicle Invasion")
ggp8<-ggplot(prostate, 
       aes(x = psa_level, 
           fill = gleason_score)) +
  geom_density(alpha = 0.4) +
  labs(title = "PSA Level distribution by Gleason Score")

myplots1 <- list(ggp7, ggp8) 
multiplot(plotlist = myplots1, cols = 1)

Observations:

  • PSA level increases from low to high gleason score and from absence to presence of seminal vesicle invasion

  • There are no obvious nonlinearity

  • The Box-Cox likelihood suggests a power transformation with power -0.1 which produces similar results as log transformation of response variable so we can use either one

  • R-squared increases from Model 1 to Model 2 after log transformation

Part II: Multiple Regression with Categorical Variables

  • Response variable: psa_level
  • Predictor variables: cancer_vol, weight, age, benign_prostatic_hyperplasia, seminal_vesicle_invasion, capsular_penetration, gleason_score
  • Fit model with interactions.
  • Compare the two models using the function anova()
  • Construct a 95% prediction interval of PSA
  • Determine which variables are not significant
##Preliminary Model Fitting
summary(fit1) #model with all predictors (non-transformed psa_level) from Part I
## 
## Call:
## lm(formula = psa_level ~ ., data = prostate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.153  -7.323  -0.177   6.403 161.547 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                  31.849265  28.958981   1.100  0.27442   
## cancer_vol                    1.748107   0.615858   2.838  0.00563 **
## weight                       -0.004546   0.074038  -0.061  0.95118   
## age                          -0.537278   0.471991  -1.138  0.25808   
## benign_prostatic_hyperplasia  1.530782   1.201007   1.275  0.20581   
## seminal_vesicle_invasion1    21.108723  10.844893   1.946  0.05479 . 
## capsular_penetration          1.097882   1.322879   0.830  0.40883   
## gleason_score7               -1.661862   7.570741  -0.220  0.82676   
## gleason_score8               18.423157  10.661795   1.728  0.08750 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.91 on 88 degrees of freedom
## Multiple R-squared:  0.4733, Adjusted R-squared:  0.4254 
## F-statistic: 9.886 on 8 and 88 DF,  p-value: 1.037e-09
summary(fit2) #model with all predictors (transformed psa_level) from part I
## 
## Call:
## lm(formula = psa_level ~ ., data = prostate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85118 -0.45409  0.07018  0.45549  1.50935 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.496462   0.722596   2.071  0.04129 *  
## cancer_vol                    0.067454   0.015367   4.389 3.15e-05 ***
## weight                        0.001268   0.001847   0.687  0.49419    
## age                          -0.002799   0.011777  -0.238  0.81267    
## benign_prostatic_hyperplasia  0.089106   0.029968   2.973  0.00380 ** 
## seminal_vesicle_invasion1     0.793175   0.270606   2.931  0.00430 ** 
## capsular_penetration         -0.026527   0.033009  -0.804  0.42378    
## gleason_score7                0.296768   0.188908   1.571  0.11978    
## gleason_score8                0.746606   0.266037   2.806  0.00617 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7714 on 88 degrees of freedom
## Multiple R-squared:  0.5902, Adjusted R-squared:  0.5529 
## F-statistic: 15.84 on 8 and 88 DF,  p-value: 3.135e-14

Observations:

  • First order model (fit2 - includes all predictors): \(\beta_2\), \(\beta_3\), and \(\beta_6\) are not significant \[H_0:\beta_2=0~~vs.~~H_a:\beta_2\neq0,~T^*=0.687, ~Under H_0, T^*\sim t_{(88)}, p-value=0.49419\] \[H_0:\beta_3=0~~vs.~~H_a:\beta_3\neq0,~T^*=-0.238, ~Under H_0, T^*\sim t_{(88)}, p-value=0.81267\] \[H_0:\beta_6=0~~vs.~~H_a:\beta_6\neq0,~T^*=-0.804 , ~Under H_0, T^*\sim t_{(88)}, p-value=0.42378 \]
  • Log transformation of response variable increased \(R^2\)
##Check for multicollinearity in this data

##Variance inflation factors VIF_k
#Obtain R-squared_k by regressing X_k to {X_j : 1≤j not equal to k≤4} (k = 1, 2, 3, 4)
(vif_1 <- 1/(1-(summary(lm(cancer_vol~weight+age+benign_prostatic_hyperplasia+capsular_penetration, data = prostate))$r.squared)))
## [1] 1.948535
(vif_2 <- 1/(1-(summary(lm(weight~cancer_vol+age+benign_prostatic_hyperplasia+capsular_penetration, data = prostate))$r.squared)))
## [1] 1.121247
(vif_3 <- 1/(1-(summary(lm(age~cancer_vol+weight+benign_prostatic_hyperplasia+capsular_penetration, data = prostate))$r.squared)))
## [1] 1.181007
(vif_4 <- 1/(1-(summary(lm(benign_prostatic_hyperplasia~cancer_vol+weight+age+capsular_penetration, data = prostate))$r.squared)))
## [1] 1.293057
(vif_5 <- 1/(1-(summary(lm(capsular_penetration~cancer_vol+weight+age+ benign_prostatic_hyperplasia, data = prostate))$r.squared)))
## [1] 1.944823
  • All five VIF values are a higher than 1 and far less than 10, so we can conclude that there is not much multicollinearity in the model
##First order model with significant X variables
fit3a <- lm(psa_level~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion , data=prostate)
summary(fit3a)
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9867 -0.4996  0.1032  0.5545  1.4993 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.51484    0.13206  11.471  < 2e-16 ***
## cancer_vol                    0.07618    0.01256   6.067 2.78e-08 ***
## benign_prostatic_hyperplasia  0.09971    0.02674   3.729 0.000331 ***
## seminal_vesicle_invasion1     0.82194    0.23858   3.445 0.000858 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7861 on 93 degrees of freedom
## Multiple R-squared:  0.5502, Adjusted R-squared:  0.5357 
## F-statistic: 37.92 on 3 and 93 DF,  p-value: 4.247e-16
fit3aa <- lm(psa_level~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion +gleason_score, data=prostate)
summary(fit3aa)
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + gleason_score, data = prostate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85235 -0.45777  0.06741  0.51651  1.53204 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.38817    0.15609   8.894 5.27e-14 ***
## cancer_vol                    0.06241    0.01367   4.566 1.55e-05 ***
## benign_prostatic_hyperplasia  0.09265    0.02627   3.527  0.00066 ***
## seminal_vesicle_invasion1     0.69646    0.23837   2.922  0.00439 ** 
## gleason_score7                0.26028    0.18280   1.424  0.15790    
## gleason_score8                0.70545    0.25712   2.744  0.00732 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7636 on 91 degrees of freedom
## Multiple R-squared:  0.5848, Adjusted R-squared:  0.5619 
## F-statistic: 25.63 on 5 and 91 DF,  p-value: 4.722e-16
##Model with two-way interactions (significant predictors)
drops1=c("weight", "age", "capsular_penetration")
pd=prostate[,!(names(prostate)%in%drops1)]
fit3b <- lm(psa_level~ .^2, data=pd)
summary(fit3b)
## 
## Call:
## lm(formula = psa_level ~ .^2, data = pd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.71044 -0.41531 -0.00348  0.48621  1.80370 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                             1.080795   0.202661
## cancer_vol                                              0.118417   0.027680
## benign_prostatic_hyperplasia                            0.141754   0.052268
## seminal_vesicle_invasion1                              -0.004472   1.263746
## gleason_score7                                          0.410471   0.312494
## gleason_score8                                          1.197254   0.518413
## cancer_vol:benign_prostatic_hyperplasia                 0.001194   0.008732
## cancer_vol:seminal_vesicle_invasion1                   -0.020573   0.036158
## cancer_vol:gleason_score7                              -0.020164   0.053961
## cancer_vol:gleason_score8                              -0.073630   0.042757
## benign_prostatic_hyperplasia:seminal_vesicle_invasion1 -0.166088   0.090083
## benign_prostatic_hyperplasia:gleason_score7            -0.053534   0.062762
## benign_prostatic_hyperplasia:gleason_score8            -0.052336   0.105125
## seminal_vesicle_invasion1:gleason_score7                1.141611   1.178306
## seminal_vesicle_invasion1:gleason_score8                1.493889   1.077867
##                                                        t value Pr(>|t|)    
## (Intercept)                                              5.333 8.36e-07 ***
## cancer_vol                                               4.278 5.07e-05 ***
## benign_prostatic_hyperplasia                             2.712  0.00814 ** 
## seminal_vesicle_invasion1                               -0.004  0.99719    
## gleason_score7                                           1.314  0.19267    
## gleason_score8                                           2.309  0.02343 *  
## cancer_vol:benign_prostatic_hyperplasia                  0.137  0.89159    
## cancer_vol:seminal_vesicle_invasion1                    -0.569  0.57093    
## cancer_vol:gleason_score7                               -0.374  0.70962    
## cancer_vol:gleason_score8                               -1.722  0.08883 .  
## benign_prostatic_hyperplasia:seminal_vesicle_invasion1  -1.844  0.06884 .  
## benign_prostatic_hyperplasia:gleason_score7             -0.853  0.39616    
## benign_prostatic_hyperplasia:gleason_score8             -0.498  0.61992    
## seminal_vesicle_invasion1:gleason_score7                 0.969  0.33547    
## seminal_vesicle_invasion1:gleason_score8                 1.386  0.16951    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7489 on 82 degrees of freedom
## Multiple R-squared:  0.6401, Adjusted R-squared:  0.5787 
## F-statistic: 10.42 on 14 and 82 DF,  p-value: 5.1e-13
##Model with two-way interactions 
fit3b <- lm(psa_level~ .^2, data=prostate)
summary(fit3b)
## 
## Call:
## lm(formula = psa_level ~ .^2, data = prostate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6692 -0.2934  0.0000  0.3874  1.6488 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                             4.233800   2.570730
## cancer_vol                                              0.303276   0.173336
## weight                                                 -0.086133   0.081243
## age                                                    -0.064713   0.042944
## benign_prostatic_hyperplasia                            0.389719   0.435132
## seminal_vesicle_invasion1                              -1.420907   4.851800
## capsular_penetration                                   -0.049000   0.546142
## gleason_score7                                         -0.030655   1.802806
## gleason_score8                                          3.454056   4.332597
## cancer_vol:weight                                      -0.002764   0.001897
## cancer_vol:age                                         -0.000597   0.002230
## cancer_vol:benign_prostatic_hyperplasia                 0.011300   0.010969
## cancer_vol:seminal_vesicle_invasion1                   -0.031776   0.069049
## cancer_vol:capsular_penetration                         0.005215   0.005983
## cancer_vol:gleason_score7                              -0.125014   0.087691
## cancer_vol:gleason_score8                              -0.130589   0.058787
## weight:age                                              0.001707   0.001286
## weight:benign_prostatic_hyperplasia                    -0.003891   0.001476
## weight:seminal_vesicle_invasion1                       -0.043008   0.052745
## weight:capsular_penetration                             0.003770   0.005860
## weight:gleason_score7                                   0.004122   0.010257
## weight:gleason_score8                                   0.020481   0.029580
## age:benign_prostatic_hyperplasia                       -0.001387   0.006858
## age:seminal_vesicle_invasion1                           0.068016   0.076388
## age:capsular_penetration                               -0.004708   0.009147
## age:gleason_score7                                      0.009721   0.029847
## age:gleason_score8                                     -0.048420   0.079074
## benign_prostatic_hyperplasia:seminal_vesicle_invasion1  0.129029   0.180185
## benign_prostatic_hyperplasia:capsular_penetration      -0.034542   0.021356
## benign_prostatic_hyperplasia:gleason_score7            -0.066955   0.082549
## benign_prostatic_hyperplasia:gleason_score8            -0.057507   0.145909
## seminal_vesicle_invasion1:capsular_penetration         -0.135395   0.120265
## seminal_vesicle_invasion1:gleason_score7                0.017252   2.245275
## seminal_vesicle_invasion1:gleason_score8                0.076708   2.105042
## capsular_penetration:gleason_score7                     0.274529   0.213993
## capsular_penetration:gleason_score8                     0.309074   0.212771
##                                                        t value Pr(>|t|)  
## (Intercept)                                              1.647   0.1047  
## cancer_vol                                               1.750   0.0852 .
## weight                                                  -1.060   0.2932  
## age                                                     -1.507   0.1370  
## benign_prostatic_hyperplasia                             0.896   0.3740  
## seminal_vesicle_invasion1                               -0.293   0.7706  
## capsular_penetration                                    -0.090   0.9288  
## gleason_score7                                          -0.017   0.9865  
## gleason_score8                                           0.797   0.4284  
## cancer_vol:weight                                       -1.457   0.1503  
## cancer_vol:age                                          -0.268   0.7898  
## cancer_vol:benign_prostatic_hyperplasia                  1.030   0.3070  
## cancer_vol:seminal_vesicle_invasion1                    -0.460   0.6470  
## cancer_vol:capsular_penetration                          0.872   0.3868  
## cancer_vol:gleason_score7                               -1.426   0.1591  
## cancer_vol:gleason_score8                               -2.221   0.0300 *
## weight:age                                               1.327   0.1893  
## weight:benign_prostatic_hyperplasia                     -2.637   0.0106 *
## weight:seminal_vesicle_invasion1                        -0.815   0.4180  
## weight:capsular_penetration                              0.643   0.5224  
## weight:gleason_score7                                    0.402   0.6892  
## weight:gleason_score8                                    0.692   0.4913  
## age:benign_prostatic_hyperplasia                        -0.202   0.8404  
## age:seminal_vesicle_invasion1                            0.890   0.3768  
## age:capsular_penetration                                -0.515   0.6086  
## age:gleason_score7                                       0.326   0.7458  
## age:gleason_score8                                      -0.612   0.5426  
## benign_prostatic_hyperplasia:seminal_vesicle_invasion1   0.716   0.4767  
## benign_prostatic_hyperplasia:capsular_penetration       -1.617   0.1109  
## benign_prostatic_hyperplasia:gleason_score7             -0.811   0.4205  
## benign_prostatic_hyperplasia:gleason_score8             -0.394   0.6949  
## seminal_vesicle_invasion1:capsular_penetration          -1.126   0.2647  
## seminal_vesicle_invasion1:gleason_score7                 0.008   0.9939  
## seminal_vesicle_invasion1:gleason_score8                 0.036   0.9711  
## capsular_penetration:gleason_score7                      1.283   0.2044  
## capsular_penetration:gleason_score8                      1.453   0.1515  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7434 on 61 degrees of freedom
## Multiple R-squared:  0.7361, Adjusted R-squared:  0.5847 
## F-statistic: 4.862 on 35 and 61 DF,  p-value: 3.611e-08
fit3 <- lm(psa_level~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion + gleason_score+ cancer_vol:benign_prostatic_hyperplasia + cancer_vol:seminal_vesicle_invasion + cancer_vol:gleason_score, data=prostate)
summary(fit3)
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + gleason_score + cancer_vol:benign_prostatic_hyperplasia + 
##     cancer_vol:seminal_vesicle_invasion + cancer_vol:gleason_score, 
##     data = prostate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.67536 -0.45692  0.05281  0.43829  1.70583 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              1.188041   0.182582   6.507 4.71e-09
## cancer_vol                               0.103718   0.024185   4.289 4.64e-05
## benign_prostatic_hyperplasia             0.129957   0.040745   3.190  0.00198
## seminal_vesicle_invasion1                0.946895   0.418419   2.263  0.02612
## gleason_score7                           0.229407   0.253334   0.906  0.36767
## gleason_score8                           1.105168   0.404354   2.733  0.00760
## cancer_vol:benign_prostatic_hyperplasia -0.006877   0.005386  -1.277  0.20507
## cancer_vol:seminal_vesicle_invasion1    -0.025957   0.030557  -0.849  0.39795
## cancer_vol:gleason_score7                0.003900   0.041014   0.095  0.92446
## cancer_vol:gleason_score8               -0.039213   0.032541  -1.205  0.23146
##                                            
## (Intercept)                             ***
## cancer_vol                              ***
## benign_prostatic_hyperplasia            ** 
## seminal_vesicle_invasion1               *  
## gleason_score7                             
## gleason_score8                          ** 
## cancer_vol:benign_prostatic_hyperplasia    
## cancer_vol:seminal_vesicle_invasion1       
## cancer_vol:gleason_score7                  
## cancer_vol:gleason_score8                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7541 on 87 degrees of freedom
## Multiple R-squared:  0.6128, Adjusted R-squared:  0.5728 
## F-statistic:  15.3 on 9 and 87 DF,  p-value: 1.251e-14
##Polynomial Model
fit4 <- lm(psa_level~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion + gleason_score+cancer_vol:benign_prostatic_hyperplasia+I(cancer_vol^2)+I(benign_prostatic_hyperplasia^2), data=prostate)
summary(fit4)
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + gleason_score + cancer_vol:benign_prostatic_hyperplasia + 
##     I(cancer_vol^2) + I(benign_prostatic_hyperplasia^2), data = prostate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.63690 -0.47919  0.09209  0.50902  1.74263 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              1.1369391  0.1834337   6.198 1.80e-08
## cancer_vol                               0.1272796  0.0304641   4.178 6.91e-05
## benign_prostatic_hyperplasia             0.1947783  0.0794516   2.452  0.01620
## seminal_vesicle_invasion1                0.6221195  0.2399560   2.593  0.01115
## gleason_score7                           0.2325434  0.1807751   1.286  0.20169
## gleason_score8                           0.6879983  0.2573059   2.674  0.00894
## I(cancer_vol^2)                         -0.0017616  0.0007775  -2.266  0.02592
## I(benign_prostatic_hyperplasia^2)       -0.0077762  0.0087863  -0.885  0.37855
## cancer_vol:benign_prostatic_hyperplasia -0.0078307  0.0054541  -1.436  0.15462
##                                            
## (Intercept)                             ***
## cancer_vol                              ***
## benign_prostatic_hyperplasia            *  
## seminal_vesicle_invasion1               *  
## gleason_score7                             
## gleason_score8                          ** 
## I(cancer_vol^2)                         *  
## I(benign_prostatic_hyperplasia^2)          
## cancer_vol:benign_prostatic_hyperplasia    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7493 on 88 degrees of freedom
## Multiple R-squared:  0.6133, Adjusted R-squared:  0.5782 
## F-statistic: 17.45 on 8 and 88 DF,  p-value: 2.714e-15
par(mfrow = c(3, 2))
plot(fit1, which = 1, pch =20, cex = 2, col = "aquamarine2")
plot(fit2, which = 1, pch =20, cex = 2, col = "olivedrab1")
plot(fit3a, which = 1, pch =20, cex = 2, col = "goldenrod")
plot(fit3aa, which = 1, pch =20, cex = 2, col = "aquamarine2")
plot(fit3b, which = 1, pch =20, cex = 2, col = "dodgerblue")
plot(fit3, which = 1, pch =20, cex = 2, col = "coral")

plot(fit4, which = 1, pch =20, cex = 2, col = "orchid")

Part III: Model building and model selection

  • Split the data into 2: training and validation data

  • Draw side-by-side boxplots for PSA and other variables, in training data and validation data, respectively to see if they have similar distributions in these two sets.

  • Apply box-cox procedure on first-order effects of all predictors (Model 1) and determine if it appears that any (further) transformation of the response variable is still needed

  • Apply the forward stepwise procedure using R function stepAIC(), starting from the null-model and using the AICp criterion.

  • Determine if the best model produced by prior step matches with the best model determined using AICp in the best subsets selection method.

###split data into two halves (training and validation)
set.seed(10)
n <- nrow(prostate)/2
ind <- sample(1:(2*n), n, replace=FALSE)
prostate.t <- prostate[ind, ] #training set
prostate.v <- prostate[-ind, ] #validation/test set

par(mfrow=c(2,3))
for (col_name in c('psa_level', 'cancer_vol', 'weight',
'age', 'benign_prostatic_hyperplasia', 'capsular_penetration')){
boxplot(prostate.t[, col_name],prostate.v[, col_name],main=col_name,names=c('training','validation'), col=c("coral","lightblue"))
}

  • The training data and validation data appear to have similar distribution.

I. Best Subsets Regression

  • Perform best subsets selection using the R function regsubsets() from the leaps library with Model 1 as the full model

  • Identify the best model according to each criterion: \(SSE_p\),\(R^2_p\), \(R^2_{a,p}\), \(C_p\), \(AIC_p\),\(BIC_p\)

fit5<-lm(psa_level~.,data=prostate.t)
summary(fit5)
## 
## Call:
## lm(formula = psa_level ~ ., data = prostate.t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39781 -0.42545 -0.02081  0.43310  1.45613 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                   4.321e-01  1.229e+00   0.352  0.72698   
## cancer_vol                    4.602e-02  2.417e-02   1.904  0.06436 . 
## weight                        2.396e-05  1.694e-03   0.014  0.98879   
## age                           1.810e-02  1.956e-02   0.926  0.36033   
## benign_prostatic_hyperplasia  7.640e-02  4.039e-02   1.892  0.06596 . 
## seminal_vesicle_invasion1     1.011e+00  3.109e-01   3.251  0.00237 **
## capsular_penetration         -4.094e-02  4.545e-02  -0.901  0.37330   
## gleason_score7                2.291e-01  2.457e-01   0.933  0.35672   
## gleason_score8                7.121e-01  3.068e-01   2.321  0.02561 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6727 on 39 degrees of freedom
## Multiple R-squared:  0.6087, Adjusted R-squared:  0.5284 
## F-statistic: 7.584 on 8 and 39 DF,  p-value: 4.642e-06
length(fit5$coef) #8 regression coefficients
## [1] 9
anova(fit5)['Residuals',3] #MSE
## [1] 0.4525879
par(mfrow=c(2,2))
plot(fit5,which=1:3, pch =20, cex = 2, col = "aquamarine2")
MASS::boxcox(fit5)

library(leaps)
sub_set<-regsubsets(psa_level~.,data=prostate.t,nbest=3,nvmax=8,method="exhaustive")
(sum_sub<-summary(sub_set))
## Subset selection object
## Call: regsubsets.formula(psa_level ~ ., data = prostate.t, nbest = 3, 
##     nvmax = 8, method = "exhaustive")
## 8 Variables  (and intercept)
##                              Forced in Forced out
## cancer_vol                       FALSE      FALSE
## weight                           FALSE      FALSE
## age                              FALSE      FALSE
## benign_prostatic_hyperplasia     FALSE      FALSE
## seminal_vesicle_invasion1        FALSE      FALSE
## capsular_penetration             FALSE      FALSE
## gleason_score7                   FALSE      FALSE
## gleason_score8                   FALSE      FALSE
## 3 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          cancer_vol weight age benign_prostatic_hyperplasia
## 1  ( 1 ) "*"        " "    " " " "                         
## 1  ( 2 ) " "        " "    " " " "                         
## 1  ( 3 ) " "        " "    " " " "                         
## 2  ( 1 ) " "        " "    " " " "                         
## 2  ( 2 ) "*"        " "    " " " "                         
## 2  ( 3 ) " "        " "    " " "*"                         
## 3  ( 1 ) " "        " "    " " "*"                         
## 3  ( 2 ) "*"        " "    " " "*"                         
## 3  ( 3 ) " "        " "    "*" " "                         
## 4  ( 1 ) "*"        " "    " " "*"                         
## 4  ( 2 ) " "        " "    "*" "*"                         
## 4  ( 3 ) " "        " "    " " "*"                         
## 5  ( 1 ) "*"        " "    " " "*"                         
## 5  ( 2 ) "*"        " "    "*" "*"                         
## 5  ( 3 ) "*"        " "    " " "*"                         
## 6  ( 1 ) "*"        " "    "*" "*"                         
## 6  ( 2 ) "*"        " "    " " "*"                         
## 6  ( 3 ) "*"        " "    "*" "*"                         
## 7  ( 1 ) "*"        " "    "*" "*"                         
## 7  ( 2 ) "*"        "*"    "*" "*"                         
## 7  ( 3 ) "*"        "*"    " " "*"                         
## 8  ( 1 ) "*"        "*"    "*" "*"                         
##          seminal_vesicle_invasion1 capsular_penetration gleason_score7
## 1  ( 1 ) " "                       " "                  " "           
## 1  ( 2 ) "*"                       " "                  " "           
## 1  ( 3 ) " "                       " "                  " "           
## 2  ( 1 ) "*"                       " "                  " "           
## 2  ( 2 ) "*"                       " "                  " "           
## 2  ( 3 ) "*"                       " "                  " "           
## 3  ( 1 ) "*"                       " "                  " "           
## 3  ( 2 ) "*"                       " "                  " "           
## 3  ( 3 ) "*"                       " "                  " "           
## 4  ( 1 ) "*"                       " "                  " "           
## 4  ( 2 ) "*"                       " "                  " "           
## 4  ( 3 ) "*"                       " "                  "*"           
## 5  ( 1 ) "*"                       " "                  "*"           
## 5  ( 2 ) "*"                       " "                  " "           
## 5  ( 3 ) "*"                       "*"                  " "           
## 6  ( 1 ) "*"                       " "                  "*"           
## 6  ( 2 ) "*"                       "*"                  "*"           
## 6  ( 3 ) "*"                       "*"                  " "           
## 7  ( 1 ) "*"                       "*"                  "*"           
## 7  ( 2 ) "*"                       " "                  "*"           
## 7  ( 3 ) "*"                       "*"                  "*"           
## 8  ( 1 ) "*"                       "*"                  "*"           
##          gleason_score8
## 1  ( 1 ) " "           
## 1  ( 2 ) " "           
## 1  ( 3 ) "*"           
## 2  ( 1 ) "*"           
## 2  ( 2 ) " "           
## 2  ( 3 ) " "           
## 3  ( 1 ) "*"           
## 3  ( 2 ) " "           
## 3  ( 3 ) "*"           
## 4  ( 1 ) "*"           
## 4  ( 2 ) "*"           
## 4  ( 3 ) "*"           
## 5  ( 1 ) "*"           
## 5  ( 2 ) "*"           
## 5  ( 3 ) "*"           
## 6  ( 1 ) "*"           
## 6  ( 2 ) "*"           
## 6  ( 3 ) "*"           
## 7  ( 1 ) "*"           
## 7  ( 2 ) "*"           
## 7  ( 3 ) "*"           
## 8  ( 1 ) "*"
n <-nrow(prostate.t)
## number of coefficients in each model: p
p.m<-as.integer(as.numeric(rownames(sum_sub$which))+1)
sse<-sum_sub$rss
aic<-n*log(sse/n)+2*p.m
bic<-n*log(sse/n)+log(n)*p.m
res_sub<-cbind(sum_sub$which,sse,sum_sub$rsq,sum_sub$adjr2,sum_sub$cp, aic, bic)
fit<-lm(psa_level~1,data=prostate.t) ##fit the model with only intercept
full <- lm(psa_level ~., data = prostate.t)  #full first-order model
full1 <- lm(psa_level~ .^2, data=prostate.t) ##full first-order model with interactions 
sse1<-sum(fit$residuals^2)
p<-1
c1<-sse1/summary(full)$sigma^2 - (n - 2 * p)
aic1<-n*log(sse1/n)+2*p
bic1<-n*log(sse1/n)+log(n)*p
none<-c(1,rep(0,8),sse1,0,0,c1,bic1,aic1)
res_sub<-rbind(none,res_sub) ##combine the results with other models
colnames(res_sub)<-c(colnames(sum_sub$which),"sse", "R^2", "R^2_a", "Cp", "aic", "bic")
res_sub
##      (Intercept) cancer_vol weight age benign_prostatic_hyperplasia
## none           1          0      0   0                            0
## 1              1          1      0   0                            0
## 1              1          0      0   0                            0
## 1              1          0      0   0                            0
## 2              1          0      0   0                            0
## 2              1          1      0   0                            0
## 2              1          0      0   0                            1
## 3              1          0      0   0                            1
## 3              1          1      0   0                            1
## 3              1          0      0   1                            0
## 4              1          1      0   0                            1
## 4              1          0      0   1                            1
## 4              1          0      0   0                            1
## 5              1          1      0   0                            1
## 5              1          1      0   1                            1
## 5              1          1      0   0                            1
## 6              1          1      0   1                            1
## 6              1          1      0   0                            1
## 6              1          1      0   1                            1
## 7              1          1      0   1                            1
## 7              1          1      1   1                            1
## 7              1          1      1   0                            1
## 8              1          1      1   1                            1
##      seminal_vesicle_invasion1 capsular_penetration gleason_score7
## none                         0                    0              0
## 1                            0                    0              0
## 1                            1                    0              0
## 1                            0                    0              0
## 2                            1                    0              0
## 2                            1                    0              0
## 2                            1                    0              0
## 3                            1                    0              0
## 3                            1                    0              0
## 3                            1                    0              0
## 4                            1                    0              0
## 4                            1                    0              0
## 4                            1                    0              1
## 5                            1                    0              1
## 5                            1                    0              0
## 5                            1                    1              0
## 6                            1                    0              1
## 6                            1                    1              1
## 6                            1                    1              0
## 7                            1                    1              1
## 7                            1                    0              1
## 7                            1                    1              1
## 8                            1                    1              1
##      gleason_score8      sse       R^2     R^2_a        Cp         aic
## none              0 45.10856 0.0000000 0.0000000 53.668051   0.8890059
## 1                 0 28.50702 0.3680352 0.3542968 18.986704 -21.0104279
## 1                 0 30.61089 0.3213949 0.3066427 23.635243 -17.5925620
## 1                 1 34.90068 0.2262957 0.2094760 33.113600 -11.2973397
## 2                 1 24.26454 0.4620857 0.4381784 11.612865 -26.7448887
## 2                 0 25.18281 0.4417288 0.4169167 13.641802 -24.9618948
## 2                 0 25.96976 0.4242832 0.3986957 15.380576 -23.4848831
## 3                 1 20.21460 0.5518677 0.5213132  4.664474 -33.5101997
## 3                 0 21.01723 0.5340744 0.5023067  6.437896 -31.6412022
## 3                 1 21.55065 0.5222493 0.4896754  7.616483 -30.4381718
## 4                 1 18.88099 0.5814321 0.5424956  3.717845 -34.7861729
## 4                 1 19.61030 0.5652642 0.5248236  5.329271 -32.9669997
## 4                 1 19.87350 0.5594294 0.5184461  5.910815 -32.3270522
## 5                 1 18.35819 0.5930221 0.5445723  4.562699 -34.1340165
## 5                 1 18.47101 0.5905208 0.5417733  4.811991 -33.8399187
## 5                 1 18.51809 0.5894773 0.5406056  4.915995 -33.7177528
## 6                 1 18.02045 0.6005092 0.5420472  5.816467 -33.0252958
## 6                 1 18.03874 0.6001039 0.5415825  5.856864 -32.9766208
## 6                 1 18.05368 0.5997727 0.5412028  5.889876 -32.9368807
## 7                 1 17.65102 0.6086991 0.5402214  7.000200 -32.0195582
## 7                 1 18.01806 0.6005622 0.5306606  7.811184 -31.0316658
## 7                 1 18.03870 0.6001047 0.5301230  7.856788 -30.9767124
## 8                 1 17.65093 0.6087011 0.5284346  9.000000 -30.0198044
##              bic
## none  -0.9821951
## 1    -17.2680258
## 1    -13.8501600
## 1     -7.5549377
## 2    -21.1312857
## 2    -19.3482918
## 2    -17.8712800
## 3    -26.0253956
## 3    -24.1563981
## 3    -22.9533678
## 4    -25.4301678
## 4    -23.6109947
## 4    -22.9710472
## 5    -22.9068105
## 5    -22.6127127
## 5    -22.4905467
## 6    -19.9268888
## 6    -19.8782137
## 6    -19.8384737
## 7    -17.0499501
## 7    -16.0620577
## 7    -16.0071043
## 8    -13.1789953
##The best model according to each criterion are extracted as follows: 
#result.sum = summary(sub_set)
#(criteria = data.frame(Nvar = 1:7, R2adj=result.sum$adjr2, CP=result.sum$cp, BIC=result.sum$bic))
##Estimated best subset byeach criterion
#(which.best.subset = data.frame(R2adj=which.max(result.sum$adjr2), CP=which.min(result.sum$cp), BIC=which.min(result.sum$bic)))

(PRESS_none <- sum((fit$residuals/(1-influence(fit)$hat))^2))
## [1] 47.04849
(PRESS_full <- sum((full$residuals/(1-influence(full)$hat))^2))
## [1] 26.62755
#plot results
p.plot <- res_sub[, 1] + res_sub[, 2] + res_sub[, 3] + res_sub[, 4] + res_sub[, 5]+ res_sub[, 6]+ res_sub[, 7]+ res_sub[, 8]
res.sub.plot <- as.data.frame(cbind(p.plot, res_sub))
best.plot <- res.sub.plot[c(1), ]
par(mfrow = c(3, 2))
plot(res.sub.plot$p.plot, res.sub.plot$`R^2`, xlab = "p", ylab = "R^2")
lines(best.plot$p.plot, best.plot$`R^2`, lwd = 2)

plot(res.sub.plot$p.plot, res.sub.plot$`R^2_a`, xlab = "p", ylab = "R^2_a")
lines(best.plot$p.plot, best.plot$`R^2_a`, lwd = 2)

plot(res.sub.plot$p.plot, res.sub.plot$Cp, xlab = "p", ylab = "Cp")
lines(best.plot$p.plot, best.plot$Cp, lwd = 2)
lines(best.plot$p.plot, best.plot$p.plot, col = "red")

plot(res.sub.plot$p.plot, res.sub.plot$aic, xlab = "p", ylab = "aic")
lines(best.plot$p.plot, best.plot$aic, lwd = 2)

plot(res.sub.plot$p.plot, res.sub.plot$bic, xlab = "p", ylab = "bic")
lines(best.plot$p.plot, best.plot$bic, lwd = 2)

Observations:

  • Best subsets selection
  • Best model according to \(SSEp\): Model 8
  • Best model according to \(R^2\): Model 8
  • Best model according to \(R^2_{a}\): Model 5 ( cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion1 + gleason_score7 + gleason_score8)
  • Best model according to \(C_p\), \(AIC_p\), \(BIC_p\): Model 4 (cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion1 + gleason_score8) _ The box-cox log-likelihood does not suggest any power transformation since lambda~1

II: Stepwise Procedures

  • Perform best subsets selection using the R function regsubsets() from the leaps library

  • Identify the best model according to \(AIC_p\) and \(BIC_p\)

  • Apply the forward stepwise procedure using R function stepAIC(), starting from the null-model and using the AICp criterion.

  • Determine if the best model produced by prior step matches with the best model determined using AICp in the best subsets selection method.

  • Draw residual vs. fitted value plot and the residual Q-Q plot of this model and determine if the model appears to be adequate.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:plotly':
## 
##     select
n <-nrow(prostate.t)
## forward selection based on AIC:
(step.f<-stepAIC(fit, scope = list(upper = full, lower = ~1), direction = "forward", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.f$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev        AIC
## 1                                                    47   45.10856   1.017805
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.010428
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -18.961895
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -23.641202
(step.fa<-stepAIC(fit, scope = list(upper = full1, lower = ~1), direction = "forward", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.fa$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev        AIC
## 1                                                    47   45.10856   1.017805
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.010428
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -18.961895
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -23.641202
## backward elimination based on AIC
(step.b <- stepAIC(full, scope = list(upper = full, lower = ~1), direction = "backward", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.b$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -12.01980
## 2               - weight  1 9.055792e-05        40   17.65102 -16.01956
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.02530
## 4                  - age  1 3.377354e-01        42   18.35819 -22.13402
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -23.64120
(step.ba <- stepAIC(full, scope = list(upper = full1, lower = ~1), direction = "backward", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.ba$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -12.01980
## 2               - weight  1 9.055792e-05        40   17.65102 -16.01956
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.02530
## 4                  - age  1 3.377354e-01        42   18.35819 -22.13402
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -23.64120
## forward stepwise based on AIC
(step.fs<-stepAIC(fit, scope = list(upper = full, lower = ~1), direction = "both", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.fs$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev        AIC
## 1                                                    47   45.10856   1.017805
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.010428
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -18.961895
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -23.641202
(step.fsa<-stepAIC(fit, scope = list(upper = full1, lower = ~1), direction = "both", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.fsa$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev        AIC
## 1                                                    47   45.10856   1.017805
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.010428
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -18.961895
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -23.641202
## backward stepwise based on AIC
(step.bs<-stepAIC(full, scope = list(upper = full, lower = ~1), direction = "both", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.bs$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -12.01980
## 2               - weight  1 9.055792e-05        40   17.65102 -16.01956
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.02530
## 4                  - age  1 3.377354e-01        42   18.35819 -22.13402
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -23.64120
(step.bsa<-stepAIC(full, scope = list(upper = full1, lower = ~1), direction = "both", 
    k = 4, trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.bsa$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -12.01980
## 2               - weight  1 9.055792e-05        40   17.65102 -16.01956
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.02530
## 4                  - age  1 3.377354e-01        42   18.35819 -22.13402
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -23.64120
## selection based on BIC: set option 'k=log(n)'
(step.f1<-stepAIC(fit, scope = list(upper = full, lower = ~1), direction = "forward", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.f1$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev         AIC
## 1                                                    47   45.10856   0.8890059
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.2680258
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -19.3482918
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -24.1563981
(step.f1a<-stepAIC(fit, scope = list(upper = full1, lower = ~1), direction = "forward", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.f1a$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev         AIC
## 1                                                    47   45.10856   0.8890059
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.2680258
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -19.3482918
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -24.1563981
(step.b1 <- stepAIC(full, scope = list(upper = full, lower = ~1), direction = "backward", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.b1$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -13.17900
## 2               - weight  1 9.055792e-05        40   17.65102 -17.04995
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.92689
## 4                  - age  1 3.377354e-01        42   18.35819 -22.90681
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -24.15640
(step.b1a <- stepAIC(full, scope = list(upper = full1, lower = ~1), direction = "backward", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.b1a$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -13.17900
## 2               - weight  1 9.055792e-05        40   17.65102 -17.04995
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.92689
## 4                  - age  1 3.377354e-01        42   18.35819 -22.90681
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -24.15640
(step.fs1<-stepAIC(fit, scope = list(upper = full, lower = ~1), direction = "both", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.fs1$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev         AIC
## 1                                                    47   45.10856   0.8890059
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.2680258
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -19.3482918
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -24.1563981
(step.fs2<-stepAIC(fit, scope = list(upper = full1, lower = ~1), direction = "both", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
##    seminal_vesicle_invasion1  benign_prostatic_hyperplasia  
##                      0.90993                       0.10150
step.fs2$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ 1
## 
## Final Model:
## psa_level ~ cancer_vol + seminal_vesicle_invasion + benign_prostatic_hyperplasia
## 
## 
##                             Step Df  Deviance Resid. Df Resid. Dev         AIC
## 1                                                    47   45.10856   0.8890059
## 2                   + cancer_vol  1 16.601535        46   28.50702 -17.2680258
## 3     + seminal_vesicle_invasion  1  3.324214        45   25.18281 -19.3482918
## 4 + benign_prostatic_hyperplasia  1  4.165577        44   21.01723 -24.1563981
(step.bs1<-stepAIC(full, scope = list(upper = full1, lower = ~1), direction = "both", 
    k = log(n), trace = FALSE))
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate.t)
## 
## Coefficients:
##                  (Intercept)                    cancer_vol  
##                      1.69605                       0.05786  
## benign_prostatic_hyperplasia     seminal_vesicle_invasion1  
##                      0.10150                       0.90993
step.bs1$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## psa_level ~ cancer_vol + weight + age + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + capsular_penetration + gleason_score
## 
## Final Model:
## psa_level ~ cancer_vol + benign_prostatic_hyperplasia + seminal_vesicle_invasion
## 
## 
##                     Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                               39   17.65093 -13.17900
## 2               - weight  1 9.055792e-05        40   17.65102 -17.04995
## 3 - capsular_penetration  1 3.694326e-01        41   18.02045 -19.92689
## 4                  - age  1 3.377354e-01        42   18.35819 -22.90681
## 5        - gleason_score  2 2.659044e+00        44   21.01723 -24.15640
  • The “best” model according to all stepwise procedures include predictors: cancer_vol, seminal_vesicle_invasion, and benign_prostatic_hyperplasia

Part IV: Model validation

  • Internal validation of model

  • External validation using the validation set

  • Choose the “final model” based on the internal and external model validation results

  • Fit the “final model” using the entire data set (training and validation data set combined)

###use best model according to stepwise procedures 
train1 <- lm(psa_level ~ cancer_vol  + benign_prostatic_hyperplasia  + seminal_vesicle_invasion , data = prostate.t)
# re-run model using validation data
valid1 <- lm(psa_level ~ cancer_vol  + benign_prostatic_hyperplasia  + seminal_vesicle_invasion, data = prostate.v)

mod_sum <- cbind(coef(summary(train1))[, 1], coef(summary(valid1))[, 1], coef(summary(train1))[, 
    2], coef(summary(valid1))[, 2])
colnames(mod_sum) <- c("Train Est", "Valid Est", "Train s.e.", "Valid s.e.")
mod_sum
##                               Train Est  Valid Est Train s.e. Valid s.e.
## (Intercept)                  1.69604835 1.38587311 0.17802214 0.20198456
## cancer_vol                   0.05785642 0.08685467 0.01796800 0.01884337
## benign_prostatic_hyperplasia 0.10149702 0.10406554 0.03436981 0.04286352
## seminal_vesicle_invasion1    0.90992732 0.71425373 0.27981314 0.44023801
##examine the SSE and adjusted R squares using both the training data and validation data
sse_t <- sum(train1$residuals^2)
sse_v <- sum(valid1$residuals^2)
Radj_t <- summary(train1)$adj.r.squared
Radj_v <- summary(valid1)$adj.r.squared
train_sum <- c(sse_t,Radj_t)
valid_sum <- c(sse_v,Radj_v)
criteria <- rbind(train_sum,valid_sum)
colnames(criteria) <- c("SSE","R2_adj")
criteria
##                SSE    R2_adj
## train_sum 21.01723 0.5023067
## valid_sum 35.18765 0.5333025
#Get MSPE_v from new data
newdata <- prostate.v[, -1]
newdata$seminal_vesicle_invasion<-as.factor(newdata$seminal_vesicle_invasion)
sapply(newdata, class)
##                   cancer_vol                       weight 
##                    "numeric"                    "numeric" 
##                          age benign_prostatic_hyperplasia 
##                    "integer"                    "numeric" 
##     seminal_vesicle_invasion         capsular_penetration 
##                     "factor"                    "numeric" 
##                gleason_score 
##                     "factor"
n <- nrow(prostate)/2
y.hat <- predict(train1, newdata)
(MSPE <- mean((prostate.v$psa_level - y.hat)^2))
## [1] 0.7873197
sse_t/n
## [1] 0.433345
  • Most of the estimated coefficients as well as their standard errors agree quite closely on the training and validation data sets
  • The SSE and adjusted R squares values are close
  • Since \(MSPE_v\) is somewhat close to \(SSE/n\), then there is no severe overfitting in the model.
##Internal validation  - 1
(names(step.fs1$coefficients)[-1])
## [1] "cancer_vol"                   "seminal_vesicle_invasion1"   
## [3] "benign_prostatic_hyperplasia"
modelfs1 <- lm(psa_level~cancer_vol+ benign_prostatic_hyperplasia  + seminal_vesicle_invasion , data = prostate.t)
drops2=c("weight", "age", "capsular_penetration", "gleason-score")
data.tt=prostate.t[,!(names(prostate)%in%drops2)]
#data.tt<-prostate.t[ c("psa_level","cancer_vol" , "benign_prostatic_hyperplasia", "seminal_vesicle_invasion1"),  ]
fit5<- lm (psa_level~., data = data.tt)
length(fit5$coef) #number of coefficents in Model
## [1] 6
mse5<-anova(fit5)["Residuals",3]
mse5#MSE for Model
## [1] 0.4370997
sse.fs1<-anova(step.fs1)["Residuals",2] #first order selected
sse.fs1
## [1] 21.01723
mse.fs1<-anova(step.fs1)["Residuals",3] #MSE for Model fs1
mse.fs1
## [1] 0.4776643
p.fs1<-length(step.fs1$coefficients) #4
p.fs1
## [1] 4
cp.fs1<-sse.fs1/mse5-(n-2*p.fs1) #C_p for Model fs1
cp.fs1
## [1] 7.58338
press.fs1<-sum(step.fs1$residuals^2/(1-influence(step.fs1)$hat)^2)
press.fs1
## [1] 24.58292
##Internal validation -2
###using best model according to best subset procedures based on criterion 
model2 <- lm(psa_level~cancer_vol+ benign_prostatic_hyperplasia  + seminal_vesicle_invasion + gleason_score, data = prostate.t)
drops2a=c("weight", "age", "capsular_penetration")
data.tt2=prostate.t[,!(names(prostate)%in%drops2a)]
#data.tt<-prostate.t[ c("psa_level","cancer_vol" , "benign_prostatic_hyperplasia", "seminal_vesicle_invasion1"),  ]
fit6<- lm (psa_level~., data = data.tt2)
length(fit6$coef) #number of coefficents in Model
## [1] 6
(mse6<-anova(fit6)["Residuals",3])#MSE for Model
## [1] 0.4370997
(sse.fs2<-anova(model2)["Residuals",2]) #first order selected
## [1] 18.35819
(mse.fs2<-anova(model2)["Residuals",3]) #MSE for Model2
## [1] 0.4370997
(p.fs2<-length(model2$coefficients)) #5
## [1] 6
(cp.fs2<-sse.fs2/mse6-(n-2*p.fs2)) #C_p for Model 2
## [1] 5.5
(press.fs2<-sum(model2$residuals^2/(1-influence(model2)$hat)^2))
## [1] 23.25294
  • For both models, \(Cp ≈p\) and \(Press_p\) and \(SSE_p\) are somewhat close, which supports their validity: little bias and not much overfitting.
##External  validation - 1 
fit.fs1.v<-lm(step.fs1,data=prostate.v) #Model fs1 on validation data
summary(step.fs1) #summary on training data
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + seminal_vesicle_invasion + 
##     benign_prostatic_hyperplasia, data = prostate.t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.62251 -0.48856 -0.01258  0.43652  1.39462 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.69605    0.17802   9.527 2.91e-12 ***
## cancer_vol                    0.05786    0.01797   3.220  0.00241 ** 
## seminal_vesicle_invasion1     0.90993    0.27981   3.252  0.00220 ** 
## benign_prostatic_hyperplasia  0.10150    0.03437   2.953  0.00503 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6911 on 44 degrees of freedom
## Multiple R-squared:  0.5341, Adjusted R-squared:  0.5023 
## F-statistic: 16.81 on 3 and 44 DF,  p-value: 2.022e-07
summary(fit.fs1.v) #summary on validation data
## 
## Call:
## lm(formula = step.fs1, data = prostate.v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8638 -0.5089  0.1578  0.5782  1.5188 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.38587    0.20198   6.861 1.64e-08 ***
## cancer_vol                    0.08685    0.01884   4.609 3.34e-05 ***
## seminal_vesicle_invasion1     0.71425    0.44024   1.622   0.1117    
## benign_prostatic_hyperplasia  0.10407    0.04286   2.428   0.0193 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8843 on 45 degrees of freedom
## Multiple R-squared:  0.5625, Adjusted R-squared:  0.5333 
## F-statistic: 19.28 on 3 and 45 DF,  p-value: 3.471e-08
#percent change in parameter estimation
round(abs(coef(step.fs1)-coef(fit.fs1.v))/abs(coef(step.fs1))*100,3)
##                  (Intercept)                   cancer_vol 
##                       18.288                       50.121 
##    seminal_vesicle_invasion1 benign_prostatic_hyperplasia 
##                       21.504                        2.531
sd.fs1<- summary(step.fs1)$coefficients[,"Std. Error"]
sd.fs1.v<- summary(fit.fs1.v)$coefficients[,"Std. Error"]
#percent change in standard errors
round(abs(sd.fs1-sd.fs1.v)/sd.fs1*100,3)
##                  (Intercept)                   cancer_vol 
##                       13.460                        4.872 
##    seminal_vesicle_invasion1 benign_prostatic_hyperplasia 
##                       57.333                       24.713
##mean squared prediction error
pred.fs1<-predict.lm(step.fs1, prostate.v[,-3])
pred.fs1
##        1        2        3        4        5        6       10       12 
## 1.728442 1.717548 1.730791 1.713475 1.818530 1.716292 1.768143 2.072465 
##       16       17       18       19       20       21       22       23 
## 1.965925 2.084793 2.267375 1.729096 2.299117 1.878770 2.591421 1.775364 
##       31       35       36       37       38       40       43       44 
## 2.493925 1.753326 2.799417 1.992234 1.787699 2.084640 2.360557 2.035718 
##       45       46       47       48       49       52       53       56 
## 2.536838 2.189005 4.158243 2.441782 2.028989 2.804676 2.242749 2.747651 
##       57       58       61       62       63       64       66       67 
## 1.848668 2.207602 2.830875 3.546357 2.628636 3.804003 2.325122 2.734055 
##       69       82       85       90       91       93       94       95 
## 1.732938 2.565560 2.526960 3.141135 3.188183 3.586382 5.244471 3.668034 
##       97 
## 4.622884
mspe.fs1<-mean((pred.fs1-prostate.v[,3])^2)
mspe.fs1
## [1] 1698.921
press.fs1/n
## [1] 0.5068644
mse.fs1
## [1] 0.4776643
##External  validation - 2 
fit.fs2.v<-lm(model2,data=prostate.v) #Model2 on validation data
summary(model2) #summary on training data
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion + gleason_score, data = prostate.t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43793 -0.46912 -0.04104  0.43159  1.37643 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.55126    0.22390   6.928 1.84e-08 ***
## cancer_vol                    0.03759    0.02019   1.862  0.06962 .  
## benign_prostatic_hyperplasia  0.09697    0.03294   2.944  0.00526 ** 
## seminal_vesicle_invasion1     0.91864    0.27903   3.292  0.00202 ** 
## gleason_score7                0.25904    0.23686   1.094  0.28034    
## gleason_score8                0.73913    0.30001   2.464  0.01793 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6611 on 42 degrees of freedom
## Multiple R-squared:  0.593,  Adjusted R-squared:  0.5446 
## F-statistic: 12.24 on 5 and 42 DF,  p-value: 2.388e-07
summary(fit.fs2.v) #summary on validation data
## 
## Call:
## lm(formula = model2, data = prostate.v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7509 -0.5067  0.1642  0.6007  1.6324 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.27905    0.22835   5.601 1.39e-06 ***
## cancer_vol                    0.07610    0.01970   3.863 0.000372 ***
## benign_prostatic_hyperplasia  0.10556    0.04407   2.395 0.021049 *  
## seminal_vesicle_invasion1     0.33023    0.48883   0.676 0.502942    
## gleason_score7                0.18969    0.28902   0.656 0.515109    
## gleason_score8                0.85107    0.48734   1.746 0.087888 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8741 on 43 degrees of freedom
## Multiple R-squared:  0.5915, Adjusted R-squared:  0.544 
## F-statistic: 12.45 on 5 and 43 DF,  p-value: 1.702e-07
#percent change in parameter estimation
round(abs(coef(model2)-coef(fit.fs2.v))/abs(coef(model2))*100,3)
##                  (Intercept)                   cancer_vol 
##                       17.547                      102.440 
## benign_prostatic_hyperplasia    seminal_vesicle_invasion1 
##                        8.853                       64.052 
##               gleason_score7               gleason_score8 
##                       26.773                       15.145
sd.fs2<- summary(model2)$coefficients[,"Std. Error"]
sd.fs2.v<- summary(fit.fs2.v)$coefficients[,"Std. Error"]
#percent change in standard errors
round(abs(sd.fs2-sd.fs2.v)/sd.fs2*100,3)
##                  (Intercept)                   cancer_vol 
##                        1.990                        2.434 
## benign_prostatic_hyperplasia    seminal_vesicle_invasion1 
##                       33.791                       75.188 
##               gleason_score7               gleason_score8 
##                       22.020                       62.438
##mean squared prediction error
(pred.fs2<-predict.lm(model2, prostate.v[,-3]))
##        1        2        3        4        5        6       10       12 
## 1.572303 1.824268 1.832872 1.562579 1.630834 1.564409 1.598097 1.906309 
##       16       17       18       19       20       21       22       23 
## 1.726598 2.170094 1.922452 1.572728 2.365309 1.669973 2.526982 1.616730 
##       31       35       36       37       38       40       43       44 
## 2.289928 1.588471 2.798928 2.500197 1.869846 2.142205 2.154552 2.030986 
##       45       46       47       48       49       52       53       56 
## 2.535127 2.188258 4.420897 2.466368 1.767571 2.517440 2.303174 2.752040 
##       57       58       61       62       63       64       66       67 
## 1.909458 2.011987 2.607475 3.496705 2.896299 3.738885 2.335168 2.668700 
##       69       82       85       90       91       93       94       95 
## 1.575224 2.423408 3.000089 3.636160 2.520709 3.846006 4.923281 3.899057 
##       97 
## 4.567610
(mspe.fs2<-mean((pred.fs2-prostate.v[,3])^2))
## [1] 1703.327
(press.fs2/n)
## [1] 0.479442
(mse.fs2)
## [1] 0.4370997
  • Model 1 is preferred based on smaller \(MSPE_v\) and more consistent parameter estimation in training and validation data set

Part V: Model diagnostics: Outlying and influential cases

  • Conduct model diagnostics for the “final model” (fitted on the entire data set).
  • Draw residual vs fitted value plot and residual Q-Q plot
  • Obtain the studentized deleted residuals and identify any outlying Y observations. Use the Bonferroni outlier test procedure at α = 0.1.
  • Obtain the leverage and identify any outlying X observations. Draw residual vs. leverage plot.
  • Draw an influence index plot using Cook’s distance. Are there any influential cases according to this measure?
  • Calculate the average absolute percent difference in the fitted values with and without the most influential case identified from the previous question. What does this measure indicate the influence of this case?
final_mod <- lm(psa_level~cancer_vol+ benign_prostatic_hyperplasia + 
    seminal_vesicle_invasion, data = prostate)
summary(final_mod)
## 
## Call:
## lm(formula = psa_level ~ cancer_vol + benign_prostatic_hyperplasia + 
##     seminal_vesicle_invasion, data = prostate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9867 -0.4996  0.1032  0.5545  1.4993 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.51484    0.13206  11.471  < 2e-16 ***
## cancer_vol                    0.07618    0.01256   6.067 2.78e-08 ***
## benign_prostatic_hyperplasia  0.09971    0.02674   3.729 0.000331 ***
## seminal_vesicle_invasion1     0.82194    0.23858   3.445 0.000858 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7861 on 93 degrees of freedom
## Multiple R-squared:  0.5502, Adjusted R-squared:  0.5357 
## F-statistic: 37.92 on 3 and 93 DF,  p-value: 4.247e-16
anova(final_mod)
## Analysis of Variance Table
## 
## Response: psa_level
##                              Df Sum Sq Mean Sq F value    Pr(>F)    
## cancer_vol                    1 55.164  55.164  89.271 3.006e-15 ***
## benign_prostatic_hyperplasia  1  7.803   7.803  12.628 0.0005992 ***
## seminal_vesicle_invasion      1  7.334   7.334  11.868 0.0008583 ***
## Residuals                    93 57.468   0.618                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(3,2))
plot(final_mod,pch =20, cex = 2, col = "aquamarine2")

## check outliers in Y
res<-residuals(final_mod)# residuals of the final model
p <- length(final_mod$coefficients)
n.s<-nrow(prostate)
h1 <- influence(final_mod)$hat
d.res.std<-studres(final_mod) #studentized deleted residuals
qt(1-0.1/(2*n.s),n.s-p) # bonferronis thresh hold
## [1] 3.38885
idx.Y <- as.vector(which(abs(d.res.std)>=qt(1-0.1/(2*n.s),n.s-p)))
idx.Y ## outliers
## integer(0)
idx.X <- as.vector(which(h1>(2*p/n.s)))
idx.X ## outliers
## [1] 55 64 68 73 78 86 91 94 97
plot(h1,res,xlab="leverage",ylab="residuals", pch =20, cex = 2, col = "aquamarine2")
plot(final_mod, which=4, col = "aquamarine2") #Case 94 is an influential case according to Cook’s distance.

final_mod2<-lm(final_mod, data=prostate[-94,])
f1<-fitted(final_mod)
f2<-fitted(final_mod2)
SUM<-sum(abs((f1[-94]-f2)/f1[-94]))
SUM<-SUM+abs((f1[94]-predict(final_mod,newdata = prostate[94,]))/f1[94])
per.average<-SUM/n.s
per.average
##         94 
## 0.01872144
  • The potential influential case identified previously is the 94th case, we fit the model without 94th case and calculate the average absolute difference in the fitted values as 1.87%. For 94th case, the percentage change on the fitted value with or without the case is very small. Therefore, no case have a large influence on prediction. Hence, all cases can be retained.