R Markdown
######################################################################################
####### #########
####### ******* Prostate cancer dataset, biopsy results for 97 men ********* #########
####### #########
######################################################################################
# chagne working directory to where prostate.data is at
setwd("~/..")
# load prostate.csv Data
prostate <- read.csv("prostate.csv", sep = "," , header = T)
# preview prostate data
head(prostate,3)
## lcavol age lbph lcp gleason lpsa
## 1 -0.5798185 50 -1.386294 -1.386294 6 -0.4307829
## 2 -0.9942523 58 -1.386294 -1.386294 6 -0.1625189
## 3 -0.5108256 74 -1.386294 -1.386294 7 -0.1625189
####### some exploratory analysis ####################################################
library(corrplot)
## corrplot 0.84 loaded
prostate.corr <- cor(prostate)
corrplot.mixed(prostate.corr)
####### split to train and test set ##################################################
if (!require("caret")) install.packages("caret", dependencies=TRUE)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2

if (!require("caTools")) install.packages("caTools", dependencies=TRUE)
## Loading required package: caTools
library(caret)
library(caTools)
# spliting the prostate data
split <- sample.split(prostate, SplitRatio = 0.9)
train <- subset(prostate, split == TRUE)
test <- subset(prostate, split == FALSE)
####### The goal here is, to build a linear model for predicting ####################
####### 'lpsa' as a function of the other 5 predictors ####################
# build simple linear model
lm <- lm(lpsa ~ lbph + gleason + lcp +lcavol, data = train)
# model summary
summary(lm)
##
## Call:
## lm(formula = lpsa ~ lbph + gleason + lcp + lcavol, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50266 -0.48708 0.08128 0.59388 1.95615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.58992 0.93113 1.708 0.0918 .
## lbph 0.13914 0.06099 2.281 0.0253 *
## gleason 0.00723 0.13704 0.053 0.9581
## lcp 0.10914 0.09050 1.206 0.2316
## lcavol 0.63313 0.10118 6.258 2.12e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7889 on 76 degrees of freedom
## Multiple R-squared: 0.5607, Adjusted R-squared: 0.5376
## F-statistic: 24.26 on 4 and 76 DF, p-value: 5.911e-13
par(mfrow=c(2,2))
plot(lm)

# using the Caret package let us now compute Confusion matrix to measure model accuracy
predicted <- predict(lm, test[,c("lbph","gleason","lcp","lcavol","lpsa")], interval = "confidence")[,"fit"]
actual <- test[,"lpsa"]
u <- union(predicted, actual)
t <- table(factor(predicted, u), factor(actual, u))
confusionMatrix(t)$overall # we need only overall statistics
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.0000000 0.0000000 0.0000000 0.2059072 0.0625000
## AccuracyPValue McnemarPValue
## 1.0000000 NaN
####### Due to the most terrible result I obtained from regular ####################
####### 'lm' I decided to use an enssamble method with lasso ####################
####### The plan here is to use machine learning regularization ####################
if (!require("lar")) install.packages("lar", dependencies=TRUE)
## Loading required package: lar
## Warning: package 'lar' was built under R version 3.5.3
library(lars)
## Loaded lars 1.2
X <- train[,1:5]
y <- train[,6]
lasso.model <- lars(as.matrix(X), y, type="lasso")
plot(lasso.model, breaks=F, xvar="norm")
# now let us find the 'best.model' with a predicted model less than 1 standard error
coefficient.vector <- cv.lars(as.matrix(X), y, type="lasso") # find the optimal 'coeffient.vector'
minimum.index <- which.min(coefficient.vector$cv) # find 'minimum.index'
standard.error.index <- which.min(abs(coefficient.vector$cv - # find 'standard.error.index'
(coefficient.vector$cv[minimum.index]+
coefficient.vector$cv.error[minimum.index])))
best.s.e <- coefficient.vector$fraction[standard.error.index] # find best standard error 'best.s.e'
# now to find the optimal coefficients for new 'lasso.model'
predict.lars(lasso.model, newx = as.matrix(test[,1:5]), s = best.s.e,
type = "coefficients", mode ="fraction")$coefficients
## lcavol age lbph lcp gleason
