library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the tab-delimited prostate data set from the local Dropbox folder.
prostate.data <- read.delim(
  file = "~/Dropbox/UFCG/AD2/M3/prostate.data.txt"
)

# Partition the rows using the `train` indicator column that ships with the
# data: flagged rows form the training set, unflagged rows the test set.
train <- prostate.data[prostate.data$train == TRUE, ]
test  <- prostate.data[prostate.data$train == FALSE, ]

Aplicando Cross Validation

Cross validation (validação cruzada) é uma técnica utilizada para estimar o erro do modelo em dados não vistos e, assim, evitar overfitting, ou seja, o ajuste excessivo do algoritmo aos dados de treino. Os dados são divididos em k partições (folds). A cada iteração, uma partição é separada para teste e as demais são usadas para treino. Este processo é repetido k vezes, sendo k o número de partições, de modo que cada partição é usada exatamente uma vez como teste.

# Resampling scheme passed to train() below: 10-fold cross-validation.
fitControl <- trainControl(
  method = "cv",
  number = 10
)

Aplicando o Lasso

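O lasso é um método de shrinkage (regularização) que penaliza a soma dos valores absolutos dos coeficientes da regressão, encolhendo-os em direção a zero e podendo zerar alguns deles, o que também funciona como seleção de variáveis.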

# Fit a lasso regression of `lpsa` on eight predictors, tuning the `fraction`
# parameter over 10 candidate values with the resampling scheme in
# `fitControl` and selecting the value with the smallest RMSE.
#
# The response is referenced by its bare column name and is included in
# `data`, instead of being pulled from the global `train` frame with
# `train$lpsa`. Keeping response and predictors in the same data frame
# guarantees they stay row-aligned and lets caret identify the outcome column
# in the data it stores with the fit.
#
# NOTE(review): no seed is set before this call in the visible code, so the
# fold assignment (and therefore the reported RMSE) can vary between runs;
# consider calling set.seed() earlier in the script.
lasso.fit <- train(
  lpsa ~ .,
  data = select(
    train,
    lpsa, lcavol, lweight, pgg45, age, lbph, svi, lcp, gleason
  ),
  method = "lasso",
  metric = "RMSE",
  tuneLength = 10,
  trControl = fitControl
)
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2

Plot do Lasso

# Plot the fitted train object (for caret fits this shows the resampled
# performance across the tuning grid). The plot is rendered through top-level
# auto-printing of the returned value.
plot(lasso.fit)

Valores Estimados

# Auto-print the fitted model: sample/predictor counts, the resampling results
# for each `fraction` value, and the final value selected by smallest RMSE.
lasso.fit
## The lasso 
## 
## 67 samples
##  7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 61, 61, 59, 62, 61, 59, ... 
## Resampling results across tuning parameters:
## 
##   fraction   RMSE       Rsquared   RMSE SD    Rsquared SD
##   0.1000000  1.0291342  0.6400350  0.2863298  0.1630571  
##   0.1888889  0.9165176  0.6426413  0.2864238  0.1674569  
##   0.2777778  0.8399055  0.6755766  0.2872709  0.1779984  
##   0.3666667  0.7844351  0.6933819  0.2848517  0.1775586  
##   0.4555556  0.7507244  0.7200065  0.2661199  0.1735123  
##   0.5444444  0.7383927  0.7323360  0.2540268  0.1796649  
##   0.6333333  0.7334976  0.7381928  0.2545808  0.1826935  
##   0.7222222  0.7237915  0.7447951  0.2540808  0.1823334  
##   0.8111111  0.7182931  0.7470008  0.2542696  0.1836815  
##   0.9000000  0.7168761  0.7458975  0.2552912  0.1858821  
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was fraction = 0.9.
# Predict `lpsa` for the held-out test rows and pair each prediction with the
# observed value, using the `pred`/`obs` column names defaultSummary() expects.
lasso_prediction <- data.frame(
  pred = predict(
    lasso.fit,
    select(test, lcavol, lweight, pgg45, age, lbph, svi, lcp, gleason)
  ),
  obs = test$lpsa
)

# Test-set performance summary, rounded to three decimals.
round(defaultSummary(lasso_prediction), digits = 3)
##     RMSE Rsquared 
##    0.706    0.527

Plotar intervalo de confiança para várias validações cruzadas