library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the prostate data set (read.delim defaults to tab-separated input with
# a header row) and split it using the file's own `train` indicator column.
prostate.data <- read.delim("~/Dropbox/UFCG/AD2/M3/prostate.data.txt")

# which() keeps only the rows where the comparison is TRUE. Indexing with the
# bare logical vector would turn every NA in `train` into an all-NA row in
# both subsets; with which() such rows are simply left out.
train <- prostate.data[which(prostate.data$train == TRUE), ]
test <- prostate.data[which(prostate.data$train == FALSE), ]
Aplicando Cross Validation
A validação cruzada (cross validation) é uma técnica utilizada para estimar o erro de generalização do modelo e, assim, evitar o overfitting, ou seja, o ajuste excessivo do algoritmo aos dados de treino. Os dados são divididos em n partições (folds). A cada iteração, uma partição é separada para validação e as demais são usadas para treino. Este processo é repetido n vezes, sendo n o número de partições, de modo que cada partição seja usada exatamente uma vez para validação.
# Resampling scheme shared by the model fits below: 10-fold cross-validation.
fitControl <- trainControl(
  method = "cv",
  number = 10
)
Aplicando o Lasso
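O lasso é um método de regularização (shrinkage method) que penaliza a soma dos valores absolutos dos coeficientes da regressão, encolhendo-os em direção a zero e podendo zerar alguns deles, o que também funciona como seleção de variáveis.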
# Fit a lasso regression of `lpsa` on the eight predictors, tuning the
# `fraction` parameter over 10 candidate values and selecting by RMSE under
# the resampling scheme in `fitControl`.
#
# The response is named in the formula and supplied through `data` (instead of
# `train$lpsa`), so the outcome and the predictors always come from the same
# rows of the same data frame.
#
# The seed makes the random fold assignment, and therefore the selected
# `fraction`, reproducible from run to run.
set.seed(123)
lasso.fit <- train(
  lpsa ~ .,
  data = select(
    train,
    lpsa, lcavol, lweight, pgg45, age, lbph, svi, lcp, gleason
  ),
  method = "lasso",
  metric = "RMSE",
  tuneLength = 10,
  trControl = fitControl
)
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2
Plot do Lasso
# Draw the resampling profile of the tuned lasso model.
plot(x = lasso.fit)
Valores Estimados
# Show the cross-validated results for each candidate `fraction` and the
# value that was selected.
print(lasso.fit)
## The lasso
##
## 67 samples
## 7 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 61, 61, 59, 62, 61, 59, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared RMSE SD Rsquared SD
## 0.1000000 1.0291342 0.6400350 0.2863298 0.1630571
## 0.1888889 0.9165176 0.6426413 0.2864238 0.1674569
## 0.2777778 0.8399055 0.6755766 0.2872709 0.1779984
## 0.3666667 0.7844351 0.6933819 0.2848517 0.1775586
## 0.4555556 0.7507244 0.7200065 0.2661199 0.1735123
## 0.5444444 0.7383927 0.7323360 0.2540268 0.1796649
## 0.6333333 0.7334976 0.7381928 0.2545808 0.1826935
## 0.7222222 0.7237915 0.7447951 0.2540808 0.1823334
## 0.8111111 0.7182931 0.7470008 0.2542696 0.1836815
## 0.9000000 0.7168761 0.7458975 0.2552912 0.1858821
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
# Score the held-out rows: pair the model's predictions (`pred`) with the
# observed `lpsa` values (`obs`), the column names defaultSummary() reads,
# then report the resulting metrics rounded to three decimals.
lasso_prediction <- data.frame(
  pred = predict(
    lasso.fit,
    select(test, lcavol, lweight, pgg45, age, lbph, svi, lcp, gleason)
  ),
  obs = test$lpsa
)
round(defaultSummary(lasso_prediction), digits = 3)
## RMSE Rsquared
## 0.706 0.527