R Markdown

######################################################################################
#######                                                                      #########
####### ******* Prostate cancer dataset, biopsy results for 97 men ********* #########
#######                                                                      #########
######################################################################################

# change working directory to where prostate.csv is located
setwd("~/..")

# load the prostate.csv data
prostate <- read.csv("prostate.csv", header = TRUE)

# preview prostate data
head(prostate,3)
##       lcavol age      lbph       lcp gleason       lpsa
## 1 -0.5798185  50 -1.386294 -1.386294       6 -0.4307829
## 2 -0.9942523  58 -1.386294 -1.386294       6 -0.1625189
## 3 -0.5108256  74 -1.386294 -1.386294       7 -0.1625189
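# quick structural check (a sketch; per the banner above, the data should
# contain 97 rows, one per man)
dim(prostate)
str(prostate)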
####### some exploratory analysis ####################################################
library(corrplot)
## corrplot 0.84 loaded
prostate.corr <- cor(prostate)
corrplot.mixed(prostate.corr)

####### split to train and test set ##################################################
if (!require("caret")) install.packages("caret", dependencies=TRUE)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2

if (!require("caTools")) install.packages("caTools", dependencies=TRUE)
## Loading required package: caTools
library(caret) 
library(caTools)

# splitting the prostate data; sample.split() expects the outcome vector,
# not the whole data frame, so split on 'lpsa'
set.seed(123)   # for a reproducible split (the seed value is an assumption)
split <- sample.split(prostate$lpsa, SplitRatio = 0.9)
train <- subset(prostate, split == TRUE)
test <-  subset(prostate, split == FALSE)
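# sanity-check the resulting split sizes (a sketch; exact counts depend on
# the seed set above)
nrow(train)
nrow(test)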

####### The goal here is to build a linear model for predicting  ####################
####### 'lpsa' as a function of the other five predictors        ####################
# build a simple linear model ('age' is left out here); the object is named
# 'lm.fit' to avoid shadowing the base lm() function
lm.fit <- lm(lpsa ~ lbph + gleason + lcp + lcavol, data = train)

# model summary
summary(lm.fit)
## 
## Call:
## lm(formula = lpsa ~ lbph + gleason + lcp + lcavol, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50266 -0.48708  0.08128  0.59388  1.95615 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.58992    0.93113   1.708   0.0918 .  
## lbph         0.13914    0.06099   2.281   0.0253 *  
## gleason      0.00723    0.13704   0.053   0.9581    
## lcp          0.10914    0.09050   1.206   0.2316    
## lcavol       0.63313    0.10118   6.258 2.12e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7889 on 76 degrees of freedom
## Multiple R-squared:  0.5607, Adjusted R-squared:  0.5376 
## F-statistic: 24.26 on 4 and 76 DF,  p-value: 5.911e-13
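# Per the summary, 'lcavol' and 'lbph' are the only significant predictors.
# A compact way to inspect just the coefficient table (a sketch):
coef(summary(lm.fit))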
par(mfrow = c(2, 2))
plot(lm.fit)   # diagnostic plots: residuals, Q-Q, scale-location, leverage

# compute a confusion matrix with the caret package; note that a confusion
# matrix is designed for classification, so applying it to a continuous
# prediction is expected to give a meaningless "accuracy"
predicted <- predict(lm.fit, test[, c("lbph", "gleason", "lcp", "lcavol")],
                     interval = "confidence")[, "fit"]
actual <- test[, "lpsa"]
u <- union(predicted, actual)
t <- table(factor(predicted, u), factor(actual, u))
confusionMatrix(t)$overall # we need only overall statistics
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.0000000      0.0000000      0.0000000      0.2059072      0.0625000 
## AccuracyPValue  McnemarPValue 
##      1.0000000            NaN
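# As expected, the confusion matrix is useless here. For a continuous
# response like 'lpsa', an error metric is the appropriate accuracy measure;
# a minimal sketch using RMSE and MAE:
rmse <- sqrt(mean((predicted - actual)^2))   # root mean squared error
mae  <- mean(abs(predicted - actual))        # mean absolute error
rmse
mae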
####### Due to the poor result obtained from the regular 'lm' above, ################
####### I decided to use the lasso, a regularization method that     ################
####### shrinks coefficients and performs variable selection         ################

if (!require("lar")) install.packages("lar", dependencies=TRUE)
## Loading required package: lar
## Warning: package 'lar' was built under R version 3.5.3
library(lars)
## Loaded lars 1.2
X <- train[, 1:5]   # the five predictors
y <- train[, 6]     # the response, 'lpsa'
lasso.model <- lars(as.matrix(X), y, type = "lasso")
plot(lasso.model, breaks = FALSE, xvar = "norm")

# now apply the one-standard-error rule: choose the most regularized model
# whose cross-validated error is within one standard error of the minimum

cv.result <- cv.lars(as.matrix(X), y, type = "lasso")        # cross-validated error at each fraction

minimum.index <- which.min(cv.result$cv)                     # fraction with the lowest CV error
standard.error.index <- which.min(abs(cv.result$cv -         # fraction whose CV error is closest to
                 (cv.result$cv[minimum.index] +              # the minimum plus one standard error
                  cv.result$cv.error[minimum.index])))
best.s.e <- cv.result$fraction[standard.error.index]         # fraction chosen by the one-SE rule


# extract the lasso coefficients at the chosen fraction ('newx' is not
# required when type = "coefficients")
predict.lars(lasso.model, s = best.s.e,
             type = "coefficients", mode = "fraction")$coefficients
##      lcavol age lbph lcp gleason
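# Finally, a sketch of how the selected lasso model could be scored on the
# held-out test set, for comparison against the plain 'lm' fit above:
lasso.pred <- predict.lars(lasso.model, newx = as.matrix(test[, 1:5]),
                           s = best.s.e, type = "fit", mode = "fraction")$fit
sqrt(mean((lasso.pred - test[, "lpsa"])^2))   # test-set RMSE of the lasso model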