require(ggplot2)
## Loading required package: ggplot2
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
require(GGally)
## Loading required package: GGally
require(plyr)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:GGally':
##
## nasa
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
We will use the wine quality data from http://www3.dsi.uminho.pt/pcortez/wine/
wines <- read.csv("winequality/winequality-red.csv", sep=";")
summary(wines)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
if (!file.exists("ggpairs.pdf")) {
  pdf("ggpairs.pdf", width = 15, height = 15)
  # inside a block, ggplot objects must be printed explicitly
  print(ggpairs(wines, alpha = 0.3))
  dev.off()
}
# from caret:
featurePlot(wines[, 1:11], wines[, 12])
Training and test sets
split <- createDataPartition(y = wines$quality,
                             p = 0.7,
                             list = FALSE)
wines.dev <- wines[split,]
wines.val <- wines[-split,]
The assumptions of regression for prediction are:
- the relationship between predictors and the response variable is approximately linear;
- no predictor can be derived as a linear combination of the others;
- there are at least as many observations as predictors.
Predictors that are strongly correlated with one another (a.k.a. collinearity) also make the model unstable. To remove predictors with high collinearity:
correlationMatrix <- cor(wines[,1:11])
print(correlationMatrix)
## fixed.acidity volatile.acidity citric.acid
## fixed.acidity 1.00000000 -0.256130895 0.67170343
## volatile.acidity -0.25613089 1.000000000 -0.55249568
## citric.acid 0.67170343 -0.552495685 1.00000000
## residual.sugar 0.11477672 0.001917882 0.14357716
## chlorides 0.09370519 0.061297772 0.20382291
## free.sulfur.dioxide -0.15379419 -0.010503827 -0.06097813
## total.sulfur.dioxide -0.11318144 0.076470005 0.03553302
## density 0.66804729 0.022026232 0.36494718
## pH -0.68297819 0.234937294 -0.54190414
## sulphates 0.18300566 -0.260986685 0.31277004
## alcohol -0.06166827 -0.202288027 0.10990325
## residual.sugar chlorides free.sulfur.dioxide
## fixed.acidity 0.114776724 0.093705186 -0.153794193
## volatile.acidity 0.001917882 0.061297772 -0.010503827
## citric.acid 0.143577162 0.203822914 -0.060978129
## residual.sugar 1.000000000 0.055609535 0.187048995
## chlorides 0.055609535 1.000000000 0.005562147
## free.sulfur.dioxide 0.187048995 0.005562147 1.000000000
## total.sulfur.dioxide 0.203027882 0.047400468 0.667666450
## density 0.355283371 0.200632327 -0.021945831
## pH -0.085652422 -0.265026131 0.070377499
## sulphates 0.005527121 0.371260481 0.051657572
## alcohol 0.042075437 -0.221140545 -0.069408354
## total.sulfur.dioxide density pH
## fixed.acidity -0.11318144 0.66804729 -0.68297819
## volatile.acidity 0.07647000 0.02202623 0.23493729
## citric.acid 0.03553302 0.36494718 -0.54190414
## residual.sugar 0.20302788 0.35528337 -0.08565242
## chlorides 0.04740047 0.20063233 -0.26502613
## free.sulfur.dioxide 0.66766645 -0.02194583 0.07037750
## total.sulfur.dioxide 1.00000000 0.07126948 -0.06649456
## density 0.07126948 1.00000000 -0.34169933
## pH -0.06649456 -0.34169933 1.00000000
## sulphates 0.04294684 0.14850641 -0.19664760
## alcohol -0.20565394 -0.49617977 0.20563251
## sulphates alcohol
## fixed.acidity 0.183005664 -0.06166827
## volatile.acidity -0.260986685 -0.20228803
## citric.acid 0.312770044 0.10990325
## residual.sugar 0.005527121 0.04207544
## chlorides 0.371260481 -0.22114054
## free.sulfur.dioxide 0.051657572 -0.06940835
## total.sulfur.dioxide 0.042946836 -0.20565394
## density 0.148506412 -0.49617977
## pH -0.196647602 0.20563251
## sulphates 1.000000000 0.09359475
## alcohol 0.093594750 1.00000000
# find attributes that are highly correlated (here using a cutoff of 0.75)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## integer(0)
# if there were any, we would do:
# wines.filtered <- wines[,-highlyCorrelated]
If there are nonlinear relationships between predictors and the response, we can add transformed versions of the predictors as extra features.
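For example, a minimal sketch of adding transformed terms directly in the model formula (the squared and log terms below are illustrative choices, not derived from this data):
# I() protects arithmetic inside a formula; residual.sugar is
# strictly positive here, so log() is safe
lmNonlin <- lm(quality ~ . + I(alcohol^2) + log(residual.sugar),
               data = wines.dev)
summary(lmNonlin)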
Another important point: outliers have a strong influence on linear regression, because the least-squares criterion penalizes large residuals quadratically.
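One hedge against that is a robust fit. A minimal sketch using MASS::rlm (Huber M-estimation) in place of ordinary least squares:
# Huber M-estimation down-weights observations with large
# residuals, so outliers pull the fit less than under OLS
rlmFit <- MASS::rlm(quality ~ ., data = wines.dev)
summary(rlmFit)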
There is no parameter to tune in plain linear regression.
ctrl <- trainControl(method = "cv", number = 10)
lmFit <- train(quality ~ .,
               data = wines.dev,
               method = "lm",
               trControl = ctrl,
               metric = "RMSE")
lmFit
## Linear Regression
##
## 1120 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 1009, 1008, 1008, 1008, 1009, 1007, ...
##
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 0.6583694 0.3404559 0.04507074 0.09056368
##
##
summary(lmFit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.69392 -0.36613 -0.05366 0.46019 2.04682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.570e+01 2.553e+01 -0.615 0.53862
## fixed.acidity -6.526e-03 3.077e-02 -0.212 0.83209
## volatile.acidity -1.099e+00 1.473e-01 -7.456 1.80e-13 ***
## citric.acid -2.151e-01 1.773e-01 -1.213 0.22529
## residual.sugar -1.464e-02 1.842e-02 -0.795 0.42675
## chlorides -2.177e+00 5.115e-01 -4.256 2.25e-05 ***
## free.sulfur.dioxide 6.344e-03 2.664e-03 2.382 0.01739 *
## total.sulfur.dioxide -3.364e-03 8.982e-04 -3.745 0.00019 ***
## density 2.070e+01 2.605e+01 0.795 0.42706
## pH -6.370e-01 2.304e-01 -2.765 0.00579 **
## sulphates 8.744e-01 1.377e-01 6.352 3.10e-10 ***
## alcohol 3.086e-01 3.199e-02 9.647 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6533 on 1108 degrees of freedom
## Multiple R-squared: 0.3534, Adjusted R-squared: 0.347
## F-statistic: 55.05 on 11 and 1108 DF, p-value: < 2.2e-16
plot(varImp(lmFit))
# Using the training data!
avaliacao <- data.frame(obs = wines.dev$quality,
                        pred = predict(lmFit),
                        res = resid(lmFit))
ggplot(avaliacao, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  geom_abline(colour = "blue") +
  ggtitle("Observed vs. predicted (training)")
ggplot(avaliacao, aes(y = res, x = pred)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.1)) +
  geom_abline(slope = 0, intercept = 0, colour = "darkred") +
  ggtitle("Residuals (training)")
# Another way to do it:
#
# xyplot(wines.dev$quality ~ predict(lmFit),
# ## plot the points (type = 'p') and a background grid ('g')
# type = c("p", "g"),
# xlab = "Predicted", ylab = "Observed")
#
# xyplot(resid(lmFit) ~ predict(lmFit),
# type = c("p", "g"),
# xlab = "Predicted", ylab = "Residuals")
Performance on the validation set:
predictedVal <- predict(lmFit, wines.val)
modelvalues <- data.frame(obs = wines.val$quality, pred = predictedVal)
defaultSummary(modelvalues)
## RMSE Rsquared
## 0.6398463 0.3692433
compare <- data.frame(obs = wines.val$quality, pred = predictedVal)
ggplot(compare, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  geom_abline() +
  ggtitle("Observed vs. predicted (validation)")
ggplot(compare, aes(y = (pred - obs), x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  ggtitle("Residuals (validation)")
We will use MARS (multivariate adaptive regression splines).
require(earth)
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.1.3
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.1.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.1.3
## Loading required package: TeachingDemos
Without model tuning:
marsSimpleFit <- earth(quality ~ ., data = wines.dev)
marsSimpleFit
## Selected 14 of 19 terms, and 7 of 11 predictors
## Termination condition: Reached nk 23
## Importance: alcohol, sulphates, volatile.acidity, pH, chlorides, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.406612 RSS 433.7131 GRSq 0.3784381 RSq 0.4069866
summary(marsSimpleFit)
## Call: earth(formula=quality~., data=wines.dev)
##
## coefficients
## (Intercept) 3.4114803
## h(1.005-volatile.acidity) 0.9323240
## h(0.49-citric.acid) 0.4104112
## h(chlorides-0.049) 29.2333849
## h(chlorides-0.091) -18.9882200
## h(0.143-chlorides) 25.1245610
## h(chlorides-0.213) -13.3100263
## h(total.sulfur.dioxide-24) -0.0020810
## h(3.47-pH) 0.5808026
## h(pH-3.47) -1.9331333
## h(0.76-sulphates) -2.4884343
## h(alcohol-11.1) 0.3229972
## h(12.7-alcohol) -0.2159715
## h(alcohol-12.7) -0.7384290
##
## Selected 14 of 19 terms, and 7 of 11 predictors
## Termination condition: Reached nk 23
## Importance: alcohol, sulphates, volatile.acidity, pH, chlorides, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.406612 RSS 433.7131 GRSq 0.3784381 RSq 0.4069866
plotmo(marsSimpleFit)
## grid: fixed.acidity volatile.acidity citric.acid residual.sugar
## 7.9 0.52 0.25 2.2
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 0.08 13 37 0.996755 3.31
## sulphates alcohol
## 0.62 10.1
Now with caret. There are two parameters to tune: the degree of the model (how many variables may interact within a single term of the equation?) and the number of terms in the final model.
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:40)
marsFit <- train(quality ~ .,
                 data = wines.dev,
                 method = "earth",
                 trControl = ctrl,
                 # new here:
                 tuneGrid = marsGrid,
                 metric = "RMSE")
marsFit
## Multivariate Adaptive Regression Spline
##
## 1120 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 1008, 1007, 1008, 1009, 1008, 1008, ...
##
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared RMSE SD Rsquared SD
## 1 2 0.7118244 0.2344693 0.05940003 0.12392549
## 1 3 0.6785644 0.2976706 0.06137521 0.11462296
## 1 4 0.6514260 0.3517015 0.06084369 0.11029751
## 1 5 0.6524230 0.3507537 0.06053572 0.11049875
## 1 6 0.6514900 0.3538741 0.05833570 0.10689815
## 1 7 0.6506111 0.3549043 0.05489033 0.10245261
## 1 8 0.6513546 0.3549378 0.05523432 0.10181724
## 1 9 0.6511403 0.3558790 0.05776018 0.10433596
## 1 10 0.6492265 0.3599048 0.05938403 0.10453710
## 1 11 0.6506602 0.3570202 0.05640550 0.09801561
## 1 12 0.6515258 0.3552798 0.05374173 0.09454401
## 1 13 0.6533397 0.3516454 0.05313897 0.09141514
## 1 14 0.6670305 0.3316272 0.06736493 0.10305016
## 1 15 0.6669519 0.3313858 0.06676860 0.10316958
## 1 16 0.6667792 0.3309639 0.06661583 0.10335914
## 1 17 0.6667792 0.3309639 0.06661583 0.10335914
## 1 18 0.6667792 0.3309639 0.06661583 0.10335914
## 1 19 0.6667792 0.3309639 0.06661583 0.10335914
## 1 20 0.6667792 0.3309639 0.06661583 0.10335914
## 1 21 0.6667792 0.3309639 0.06661583 0.10335914
## 1 22 0.6667792 0.3309639 0.06661583 0.10335914
## 1 23 0.6667792 0.3309639 0.06661583 0.10335914
## 1 24 0.6667792 0.3309639 0.06661583 0.10335914
## 1 25 0.6667792 0.3309639 0.06661583 0.10335914
## 1 26 0.6667792 0.3309639 0.06661583 0.10335914
## 1 27 0.6667792 0.3309639 0.06661583 0.10335914
## 1 28 0.6667792 0.3309639 0.06661583 0.10335914
## 1 29 0.6667792 0.3309639 0.06661583 0.10335914
## 1 30 0.6667792 0.3309639 0.06661583 0.10335914
## 1 31 0.6667792 0.3309639 0.06661583 0.10335914
## 1 32 0.6667792 0.3309639 0.06661583 0.10335914
## 1 33 0.6667792 0.3309639 0.06661583 0.10335914
## 1 34 0.6667792 0.3309639 0.06661583 0.10335914
## 1 35 0.6667792 0.3309639 0.06661583 0.10335914
## 1 36 0.6667792 0.3309639 0.06661583 0.10335914
## 1 37 0.6667792 0.3309639 0.06661583 0.10335914
## 1 38 0.6667792 0.3309639 0.06661583 0.10335914
## 1 39 0.6667792 0.3309639 0.06661583 0.10335914
## 1 40 0.6667792 0.3309639 0.06661583 0.10335914
## 2 2 0.7261255 0.2018730 0.06313604 0.11735867
## 2 3 0.6825500 0.2889358 0.05794540 0.10434710
## 2 4 0.6585776 0.3371816 0.06444895 0.11306296
## 2 5 0.6622942 0.3325108 0.06642791 0.11646667
## 2 6 0.6574566 0.3416775 0.05709707 0.10373988
## 2 7 0.6617611 0.3346005 0.06170858 0.10984493
## 2 8 0.6589548 0.3401732 0.06696283 0.11729593
## 2 9 0.6586616 0.3412774 0.06687693 0.11705426
## 2 10 0.6599339 0.3393151 0.06610801 0.11820519
## 2 11 0.6608447 0.3379116 0.06440963 0.11563978
## 2 12 0.6590193 0.3411240 0.06452301 0.11670505
## 2 13 0.6581327 0.3434370 0.06350979 0.11394885
## 2 14 0.6568625 0.3456528 0.06352425 0.11358979
## 2 15 0.6571627 0.3448212 0.06162320 0.11108838
## 2 16 0.6571627 0.3448212 0.06162320 0.11108838
## 2 17 0.6571627 0.3448212 0.06162320 0.11108838
## 2 18 0.6571627 0.3448212 0.06162320 0.11108838
## 2 19 0.6571627 0.3448212 0.06162320 0.11108838
## 2 20 0.6571627 0.3448212 0.06162320 0.11108838
## 2 21 0.6571627 0.3448212 0.06162320 0.11108838
## 2 22 0.6571627 0.3448212 0.06162320 0.11108838
## 2 23 0.6571627 0.3448212 0.06162320 0.11108838
## 2 24 0.6571627 0.3448212 0.06162320 0.11108838
## 2 25 0.6571627 0.3448212 0.06162320 0.11108838
## 2 26 0.6571627 0.3448212 0.06162320 0.11108838
## 2 27 0.6571627 0.3448212 0.06162320 0.11108838
## 2 28 0.6571627 0.3448212 0.06162320 0.11108838
## 2 29 0.6571627 0.3448212 0.06162320 0.11108838
## 2 30 0.6571627 0.3448212 0.06162320 0.11108838
## 2 31 0.6571627 0.3448212 0.06162320 0.11108838
## 2 32 0.6571627 0.3448212 0.06162320 0.11108838
## 2 33 0.6571627 0.3448212 0.06162320 0.11108838
## 2 34 0.6571627 0.3448212 0.06162320 0.11108838
## 2 35 0.6571627 0.3448212 0.06162320 0.11108838
## 2 36 0.6571627 0.3448212 0.06162320 0.11108838
## 2 37 0.6571627 0.3448212 0.06162320 0.11108838
## 2 38 0.6571627 0.3448212 0.06162320 0.11108838
## 2 39 0.6571627 0.3448212 0.06162320 0.11108838
## 2 40 0.6571627 0.3448212 0.06162320 0.11108838
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
Diagnostics
plot(varImp(marsFit))
# Using the training data!
avaliacao$modelo <- "LM"
pred_mars <- data.frame(obs = wines.dev$quality,
                        pred = predict(marsFit),
                        res = wines.dev$quality - predict(marsFit),
                        modelo = "MARS")
avaliacao <- rbind(avaliacao, pred_mars)
ggplot(avaliacao, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  geom_abline(colour = "blue") +
  facet_grid(. ~ modelo) +
  ggtitle("Observed vs. predicted (training)")
ggplot(avaliacao, aes(y = res, x = pred)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.1)) +
  geom_abline(slope = 0, intercept = 0, colour = "darkred") +
  facet_grid(. ~ modelo) +
  ggtitle("Residuals (training)")
Did performance improve?
predictedVal <- predict(marsFit, wines.val)
modelvalues <- data.frame(obs = wines.val$quality, pred = predictedVal)
defaultSummary(modelvalues)
## RMSE Rsquared
## 0.6339553 0.3810186
compare$modelo <- "LM"
pred_mars <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "MARS")
compare <- rbind(compare, pred_mars)
ggplot(compare, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline() +
  ggtitle("Observed vs. predicted (validation)")
ggplot(compare, aes(y = (pred - obs), x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline(slope = 0, intercept = 0, colour = "darkred") +
  ggtitle("Residuals (validation)")
MARS + Bootstrap AGGregation (bagging): fit the model on many bootstrap resamples of the training data and average their predictions.
bMarsFit <- train(quality ~ .,
                  data = wines.dev,
                  method = "bagEarthGCV",
                  trControl = ctrl,
                  tuneGrid = expand.grid(.degree = 1:2),
                  metric = "RMSE")
bMarsFit
## Bagged MARS using gCV Pruning
##
## 1120 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 1008, 1008, 1007, 1007, 1009, 1008, ...
##
## Resampling results across tuning parameters:
##
## degree RMSE Rsquared RMSE SD Rsquared SD
## 1 0.6550211 0.3520736 0.03157344 0.04854529
## 2 0.6400212 0.3771018 0.02947601 0.04313088
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was degree = 2.
predictedVal <- predict(bMarsFit, wines.val)
modelvalues <- data.frame(obs = wines.val$quality, pred = predictedVal)
defaultSummary(modelvalues)
## RMSE Rsquared
## 0.6122403 0.4223664
Next, a boosted linear model:
bstLsFit <- train(quality ~ .,
                  data = wines.dev,
                  method = "bstLs",
                  trControl = ctrl,
                  metric = "RMSE")
## Loading required package: bst
bstLsFit
## Boosted Linear Model
##
## 1120 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 1008, 1008, 1008, 1008, 1009, 1008, ...
##
## Resampling results across tuning parameters:
##
## mstop RMSE Rsquared RMSE SD Rsquared SD
## 50 0.7744332 0.1246540 0.03709427 0.06084528
## 100 0.7585361 0.1695485 0.03729673 0.06388378
## 150 0.7459287 0.1999767 0.03733727 0.06562179
##
## Tuning parameter 'nu' was held constant at a value of 0.1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mstop = 150 and nu = 0.1.
summary(bstLsFit)
## Length Class Mode
## y 1120 -none- numeric
## x 12320 -none- numeric
## cost 1 -none- numeric
## family 1 -none- character
## learner 1 -none- character
## yhat 1120 -none- numeric
## offset 1 -none- numeric
## ens 150 -none- list
## control.tree 1 -none- list
## risk 150 -none- numeric
## ctrl 4 -none- list
## maxdepth 1 -none- numeric
## xselect 6 -none- numeric
## coef 150 -none- numeric
## ensemble 150 -none- numeric
## ml.fit 12 lm list
## call 6 -none- call
## xNames 11 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 1 -none- logical
predictedVal <- predict(bstLsFit, wines.val)
modelvalues <- data.frame(obs = wines.val$quality, pred = predictedVal)
defaultSummary(modelvalues)
## RMSE Rsquared
## 0.7354052 0.2211663
pred_blm <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "Boosted LM")
compare <- rbind(compare, pred_blm)
ggplot(compare, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline() +
  ggtitle("Observed vs. predicted (validation)")
ggplot(compare, aes(y = (pred - obs), x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline(slope = 0, intercept = 0, colour = "darkred") +
  ggtitle("Residuals (validation)")
Now k-nearest neighbors, with centered and scaled predictors (distances are only meaningful when the predictors share a scale):
knnFit <- train(quality ~ .,
                data = wines.dev,
                method = "knn",
                trControl = ctrl,
                preProcess = c("center", "scale"),
                tuneGrid = expand.grid(.k = 3:6),
                metric = "RMSE")
knnFit
## k-Nearest Neighbors
##
## 1120 samples
## 11 predictor
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 1009, 1007, 1008, 1006, 1008, 1008, ...
##
## Resampling results across tuning parameters:
##
## k RMSE Rsquared RMSE SD Rsquared SD
## 3 0.7083258 0.2946669 0.07114445 0.09042016
## 4 0.6976602 0.2962700 0.06738799 0.08367464
## 5 0.6898862 0.3004407 0.06935318 0.08136818
## 6 0.6851462 0.3019338 0.06750080 0.08057502
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.
predictedVal <- predict(knnFit, wines.val)
modelvalues <- data.frame(obs = wines.val$quality, pred = predictedVal)
defaultSummary(modelvalues)
## RMSE Rsquared
## 0.6641049 0.3377955
pred_knn <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "kNN")
compare <- rbind(compare, pred_knn)
ggplot(compare, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline(colour = "darkblue") +
  #ylim(3, 8) +
  ggtitle("Observed vs. predicted (validation)")
ggplot(compare, aes(y = (pred - obs), x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline(slope = 0, intercept = 0, colour = "darkred") +
  ggtitle("Residuals (validation)")
Finally, a Cubist rule-based model:
cubFit <- train(quality ~ .,
                data = wines.dev,
                method = "cubist",
                trControl = ctrl,
                metric = "RMSE")
## Loading required package: Cubist
cubFit
## Cubist
##
## 1120 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 1009, 1009, 1008, 1007, 1008, 1007, ...
##
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared RMSE SD Rsquared SD
## 1 0 0.6615767 0.3462724 0.03643841 0.05543391
## 1 5 0.6527261 0.3718818 0.04175233 0.06420414
## 1 9 0.6423597 0.3811342 0.03925410 0.06255311
## 10 0 0.6468007 0.3685528 0.03149673 0.04399655
## 10 5 0.6478554 0.3774355 0.03687477 0.05832914
## 10 9 0.6368622 0.3885977 0.03363137 0.05374157
## 20 0 0.6432439 0.3749506 0.03259530 0.04635415
## 20 5 0.6457803 0.3811723 0.03677562 0.05828424
## 20 9 0.6342645 0.3937789 0.03457025 0.05598567
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
predictedVal <- predict(cubFit, wines.val)
modelvalues <- data.frame(obs = wines.val$quality, pred = predictedVal)
defaultSummary(modelvalues)
## RMSE Rsquared
## 0.6143765 0.4266906
pred_cub <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "Cubist")
compare <- rbind(compare, pred_cub)
ggplot(compare, aes(y = pred, x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline(colour = "darkblue") +
  #ylim(3, 8) +
  ggtitle("Observed vs. predicted (validation)")
ggplot(compare, aes(y = (pred - obs), x = obs)) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.2)) +
  facet_grid(. ~ modelo) +
  geom_abline(slope = 0, intercept = 0, colour = "darkred") +
  ggtitle("Residuals (validation)")
plot(varImp(cubFit))
From Applied Predictive Modeling, by Kuhn (the author of caret) and Johnson:
1. Start with several models that are the least interpretable and most flexible, such as boosted trees or support vector machines. Across many problem domains, these models have a high likelihood of producing the empirically optimum results (i.e., most accurate).
2. Investigate simpler models that are less opaque (e.g., not complete black boxes), such as multivariate adaptive regression splines (MARS), partial least squares, generalized additive models, or naïve Bayes models.
3. Consider using the simplest model that reasonably approximates the performance of the more complex methods.
Using this methodology, the modeler can discover the “performance ceiling” for the data set before settling on a model. In many cases, a range of models will be equivalent in terms of performance so the practitioner can weigh the benefits of different methodologies (e.g., computational complexity, ease of prediction, interpretability). For example, a nonlinear support vector machine or random forest model might have superior accuracy, but the complexity and scope of the prediction equation may prohibit exporting the prediction equation to a production system. However, if a more interpretable model, such as a MARS model, yielded similar accuracy, the implementation of the prediction equation would be trivial and would also have superior execution time.
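Following that advice, caret's resamples() collects the cross-validation results of models trained on the same data, which makes it easy to look for the “performance ceiling”. A minimal sketch, assuming the fit objects from earlier in this document are still in the workspace:
results <- resamples(list(LM     = lmFit,
                          MARS   = marsFit,
                          bMARS  = bMarsFit,
                          bstLs  = bstLsFit,
                          kNN    = knnFit,
                          Cubist = cubFit))
summary(results)
# lattice dot plot of RMSE and R-squared across models
dotplot(results)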