require(ggplot2)
## Loading required package: ggplot2
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
require(GGally)
## Loading required package: GGally
require(plyr)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:GGally':
## 
##     nasa
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Preliminaries

We will use the wine quality data: http://www3.dsi.uminho.pt/pcortez/wine/

wines <- read.csv("winequality/winequality-red.csv", sep=";")
summary(wines)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00      
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00      
##  Median :0.07900   Median :14.00       Median : 38.00      
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47      
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00      
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00      
##     density             pH          sulphates         alcohol     
##  Min.   :0.9901   Min.   :2.740   Min.   :0.3300   Min.   : 8.40  
##  1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50  
##  Median :0.9968   Median :3.310   Median :0.6200   Median :10.20  
##  Mean   :0.9967   Mean   :3.311   Mean   :0.6581   Mean   :10.42  
##  3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10  
##  Max.   :1.0037   Max.   :4.010   Max.   :2.0000   Max.   :14.90  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.636  
##  3rd Qu.:6.000  
##  Max.   :8.000
if (!file.exists("ggpairs.pdf")) {
  pdf("ggpairs.pdf", width = 15, height = 15)
  # inside an if block the plot is not auto-printed, so call print() explicitly
  print(ggpairs(wines, alpha = 0.3))
  dev.off()
}

# from caret:
featurePlot(wines[,1:11], wines[,12])

Training and test sets

split<-createDataPartition(y = wines$quality, 
                           p = 0.7, 
                           list = FALSE)

wines.dev <- wines[split,]
wines.val <- wines[-split,]

Linear regression

The assumptions of linear regression for prediction are:
- The relationship between the predictors and the response is approximately linear
- No predictor can be derived as a linear combination of the others
- There are at least as many observations as predictors
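
A quick way to check the last two assumptions (a sketch; findLinearCombos() is from caret and flags columns that are exact linear combinations of the others):

# more observations than predictors?
nrow(wines) > 11

# indices of predictors that are linear combinations of other predictors (if any)
findLinearCombos(as.matrix(wines[, 1:11]))$remove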

Predictors that are highly correlated with one another (a.k.a. collinearity) also make the model unstable. To remove predictors with high collinearity:

correlationMatrix <- cor(wines[,1:11])
print(correlationMatrix)
##                      fixed.acidity volatile.acidity citric.acid
## fixed.acidity           1.00000000     -0.256130895  0.67170343
## volatile.acidity       -0.25613089      1.000000000 -0.55249568
## citric.acid             0.67170343     -0.552495685  1.00000000
## residual.sugar          0.11477672      0.001917882  0.14357716
## chlorides               0.09370519      0.061297772  0.20382291
## free.sulfur.dioxide    -0.15379419     -0.010503827 -0.06097813
## total.sulfur.dioxide   -0.11318144      0.076470005  0.03553302
## density                 0.66804729      0.022026232  0.36494718
## pH                     -0.68297819      0.234937294 -0.54190414
## sulphates               0.18300566     -0.260986685  0.31277004
## alcohol                -0.06166827     -0.202288027  0.10990325
##                      residual.sugar    chlorides free.sulfur.dioxide
## fixed.acidity           0.114776724  0.093705186        -0.153794193
## volatile.acidity        0.001917882  0.061297772        -0.010503827
## citric.acid             0.143577162  0.203822914        -0.060978129
## residual.sugar          1.000000000  0.055609535         0.187048995
## chlorides               0.055609535  1.000000000         0.005562147
## free.sulfur.dioxide     0.187048995  0.005562147         1.000000000
## total.sulfur.dioxide    0.203027882  0.047400468         0.667666450
## density                 0.355283371  0.200632327        -0.021945831
## pH                     -0.085652422 -0.265026131         0.070377499
## sulphates               0.005527121  0.371260481         0.051657572
## alcohol                 0.042075437 -0.221140545        -0.069408354
##                      total.sulfur.dioxide     density          pH
## fixed.acidity                 -0.11318144  0.66804729 -0.68297819
## volatile.acidity               0.07647000  0.02202623  0.23493729
## citric.acid                    0.03553302  0.36494718 -0.54190414
## residual.sugar                 0.20302788  0.35528337 -0.08565242
## chlorides                      0.04740047  0.20063233 -0.26502613
## free.sulfur.dioxide            0.66766645 -0.02194583  0.07037750
## total.sulfur.dioxide           1.00000000  0.07126948 -0.06649456
## density                        0.07126948  1.00000000 -0.34169933
## pH                            -0.06649456 -0.34169933  1.00000000
## sulphates                      0.04294684  0.14850641 -0.19664760
## alcohol                       -0.20565394 -0.49617977  0.20563251
##                         sulphates     alcohol
## fixed.acidity         0.183005664 -0.06166827
## volatile.acidity     -0.260986685 -0.20228803
## citric.acid           0.312770044  0.10990325
## residual.sugar        0.005527121  0.04207544
## chlorides             0.371260481 -0.22114054
## free.sulfur.dioxide   0.051657572 -0.06940835
## total.sulfur.dioxide  0.042946836 -0.20565394
## density               0.148506412 -0.49617977
## pH                   -0.196647602  0.20563251
## sulphates             1.000000000  0.09359475
## alcohol               0.093594750  1.00000000
# find attributes that are highly correlated (|correlation| > 0.75)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## integer(0)
# if there were any, we would do:
# wines.filtered <- wines[,-highlyCorrelated]

If there are non-linear relationships between predictors and the response, we can add transformed predictors as extra terms.
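
For example, a quadratic or log term can be added directly to the formula (the specific transforms below are purely illustrative, not something these data call for):

# illustrative only: a quadratic term for alcohol and a log for residual.sugar
lmTransFit <- lm(quality ~ . + I(alcohol^2) + log(residual.sugar), data = wines.dev)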

Another important point: outliers have a strong influence on linear regression (because of the least-squares criterion).
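
One way to flag observations with a large influence on the least-squares fit (a sketch; the 4/n cutoff is a common rule of thumb, not a hard rule):

fit.ols <- lm(quality ~ ., data = wines.dev)
cooks   <- cooks.distance(fit.ols)
which(cooks > 4 / nrow(wines.dev))   # candidates worth inspecting before trusting the fit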

Training

There is no tuning parameter to adjust for plain linear regression.

ctrl <- trainControl(method = "cv", number = 10)

lmFit <- train(quality ~. , 
               data = wines.dev, 
               method = "lm", 
               trControl = ctrl,
               metric = "RMSE")

lmFit
## Linear Regression 
## 
## 1120 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 1009, 1008, 1008, 1008, 1009, 1007, ... 
## 
## Resampling results
## 
##   RMSE       Rsquared   RMSE SD     Rsquared SD
##   0.6583694  0.3404559  0.04507074  0.09056368 
## 
## 
summary(lmFit)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.69392 -0.36613 -0.05366  0.46019  2.04682 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.570e+01  2.553e+01  -0.615  0.53862    
## fixed.acidity        -6.526e-03  3.077e-02  -0.212  0.83209    
## volatile.acidity     -1.099e+00  1.473e-01  -7.456 1.80e-13 ***
## citric.acid          -2.151e-01  1.773e-01  -1.213  0.22529    
## residual.sugar       -1.464e-02  1.842e-02  -0.795  0.42675    
## chlorides            -2.177e+00  5.115e-01  -4.256 2.25e-05 ***
## free.sulfur.dioxide   6.344e-03  2.664e-03   2.382  0.01739 *  
## total.sulfur.dioxide -3.364e-03  8.982e-04  -3.745  0.00019 ***
## density               2.070e+01  2.605e+01   0.795  0.42706    
## pH                   -6.370e-01  2.304e-01  -2.765  0.00579 ** 
## sulphates             8.744e-01  1.377e-01   6.352 3.10e-10 ***
## alcohol               3.086e-01  3.199e-02   9.647  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6533 on 1108 degrees of freedom
## Multiple R-squared:  0.3534, Adjusted R-squared:  0.347 
## F-statistic: 55.05 on 11 and 1108 DF,  p-value: < 2.2e-16

Model diagnostics

plot(varImp(lmFit))

# Using the training data!

avaliacao <- data.frame(obs = wines.dev$quality, pred = predict(lmFit), res = resid(lmFit))

ggplot(avaliacao, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  geom_abline(slope = 1, intercept = 0, colour = "blue") + 
  ggtitle("Observed vs. predicted (training)")

ggplot(avaliacao, aes(y = res, x = pred)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.1)) + 
  geom_abline(slope = 0, intercept = 0, colour = "darkred") + 
  ggtitle("Resíduos na validação")

# Another way to do it:
#
# xyplot(wines.dev$quality ~ predict(lmFit),
#        ## plot the points (type = 'p') and a background grid ('g')
#        type = c("p", "g"),
#        xlab = "Predicted", ylab = "Observed")
# 
# xyplot(resid(lmFit) ~ predict(lmFit),
#        type = c("p", "g"),
#        xlab = "Predicted", ylab = "Residuals")

Performance:

predictedVal <- predict(lmFit, wines.val)
modelvalues<-data.frame(obs = wines.val$quality, pred = predictedVal)

defaultSummary(modelvalues)
##      RMSE  Rsquared 
## 0.6398463 0.3692433
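
For reference, the same two numbers computed by hand (a sketch assuming caret's usual definitions: root mean squared error and the squared correlation between observed and predicted):

sqrt(mean((modelvalues$obs - modelvalues$pred)^2))   # RMSE
cor(modelvalues$obs, modelvalues$pred)^2             # Rsquared
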
compare <- data.frame(obs = wines.val$quality, pred = predictedVal)
ggplot(compare, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  geom_abline(slope = 1, intercept = 0) + 
  ggtitle("Observed vs. predicted (validation)")

ggplot(compare, aes(y = (pred - obs), x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  ggtitle("Resíduos na validação")


Another type of regression

We will use MARS (Multivariate Adaptive Regression Splines).

require(earth)
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.1.3
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.1.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.1.3
## Loading required package: TeachingDemos

Without model tuning:

marsSimpleFit <- earth(quality ~ ., data = wines.dev) 
marsSimpleFit
## Selected 14 of 19 terms, and 7 of 11 predictors
## Termination condition: Reached nk 23
## Importance: alcohol, sulphates, volatile.acidity, pH, chlorides, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.406612    RSS 433.7131    GRSq 0.3784381    RSq 0.4069866
summary(marsSimpleFit)
## Call: earth(formula=quality~., data=wines.dev)
## 
##                            coefficients
## (Intercept)                   3.4114803
## h(1.005-volatile.acidity)     0.9323240
## h(0.49-citric.acid)           0.4104112
## h(chlorides-0.049)           29.2333849
## h(chlorides-0.091)          -18.9882200
## h(0.143-chlorides)           25.1245610
## h(chlorides-0.213)          -13.3100263
## h(total.sulfur.dioxide-24)   -0.0020810
## h(3.47-pH)                    0.5808026
## h(pH-3.47)                   -1.9331333
## h(0.76-sulphates)            -2.4884343
## h(alcohol-11.1)               0.3229972
## h(12.7-alcohol)              -0.2159715
## h(alcohol-12.7)              -0.7384290
## 
## Selected 14 of 19 terms, and 7 of 11 predictors
## Termination condition: Reached nk 23
## Importance: alcohol, sulphates, volatile.acidity, pH, chlorides, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.406612    RSS 433.7131    GRSq 0.3784381    RSq 0.4069866
plotmo(marsSimpleFit)
##  grid:    fixed.acidity volatile.acidity citric.acid residual.sugar
##                     7.9             0.52        0.25            2.2
##  chlorides free.sulfur.dioxide total.sulfur.dioxide  density   pH
##       0.08                  13                   37 0.996755 3.31
##  sulphates alcohol
##       0.62    10.1
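
The h(...) terms in the summary above are hinge functions: zero on one side of a knot and linear on the other. A minimal illustration, using the alcohol knot at 11.1 that appears in the coefficients:

h <- function(x) pmax(0, x)          # earth's hinge function
curve(h(x - 11.1), from = 8, to = 15,
      xlab = "alcohol", ylab = "h(alcohol - 11.1)")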

Now with caret. There are two tuning parameters: the degree of the model (how many variables may interact in a single term of the equation?) and the number of terms in the final model.

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:40)

marsFit <- train(quality ~. , 
                 data = wines.dev, 
                 method = "earth", 
                 trControl = ctrl,
                 # new here:
                 tuneGrid = marsGrid,
                 metric = "RMSE")

marsFit
## Multivariate Adaptive Regression Spline 
## 
## 1120 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 1008, 1007, 1008, 1009, 1008, 1008, ... 
## 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   RMSE SD     Rsquared SD
##   1        2      0.7118244  0.2344693  0.05940003  0.12392549 
##   1        3      0.6785644  0.2976706  0.06137521  0.11462296 
##   1        4      0.6514260  0.3517015  0.06084369  0.11029751 
##   1        5      0.6524230  0.3507537  0.06053572  0.11049875 
##   1        6      0.6514900  0.3538741  0.05833570  0.10689815 
##   1        7      0.6506111  0.3549043  0.05489033  0.10245261 
##   1        8      0.6513546  0.3549378  0.05523432  0.10181724 
##   1        9      0.6511403  0.3558790  0.05776018  0.10433596 
##   1       10      0.6492265  0.3599048  0.05938403  0.10453710 
##   1       11      0.6506602  0.3570202  0.05640550  0.09801561 
##   1       12      0.6515258  0.3552798  0.05374173  0.09454401 
##   1       13      0.6533397  0.3516454  0.05313897  0.09141514 
##   1       14      0.6670305  0.3316272  0.06736493  0.10305016 
##   1       15      0.6669519  0.3313858  0.06676860  0.10316958 
##   1       16      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       17      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       18      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       19      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       20      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       21      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       22      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       23      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       24      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       25      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       26      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       27      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       28      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       29      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       30      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       31      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       32      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       33      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       34      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       35      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       36      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       37      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       38      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       39      0.6667792  0.3309639  0.06661583  0.10335914 
##   1       40      0.6667792  0.3309639  0.06661583  0.10335914 
##   2        2      0.7261255  0.2018730  0.06313604  0.11735867 
##   2        3      0.6825500  0.2889358  0.05794540  0.10434710 
##   2        4      0.6585776  0.3371816  0.06444895  0.11306296 
##   2        5      0.6622942  0.3325108  0.06642791  0.11646667 
##   2        6      0.6574566  0.3416775  0.05709707  0.10373988 
##   2        7      0.6617611  0.3346005  0.06170858  0.10984493 
##   2        8      0.6589548  0.3401732  0.06696283  0.11729593 
##   2        9      0.6586616  0.3412774  0.06687693  0.11705426 
##   2       10      0.6599339  0.3393151  0.06610801  0.11820519 
##   2       11      0.6608447  0.3379116  0.06440963  0.11563978 
##   2       12      0.6590193  0.3411240  0.06452301  0.11670505 
##   2       13      0.6581327  0.3434370  0.06350979  0.11394885 
##   2       14      0.6568625  0.3456528  0.06352425  0.11358979 
##   2       15      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       16      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       17      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       18      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       19      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       20      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       21      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       22      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       23      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       24      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       25      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       26      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       27      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       28      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       29      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       30      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       31      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       32      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       33      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       34      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       35      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       36      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       37      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       38      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       39      0.6571627  0.3448212  0.06162320  0.11108838 
##   2       40      0.6571627  0.3448212  0.06162320  0.11108838 
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.

Diagnostics

plot(varImp(marsFit))

# Using the training data!
avaliacao$modelo <- "RL"

pred_mars <- data.frame(obs = wines.dev$quality, 
                        pred = predict(marsFit), 
                        res = wines.dev$quality - predict(marsFit),
                        modelo = "MARS")
avaliacao <- rbind(avaliacao, pred_mars)

ggplot(avaliacao, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  geom_abline(slope = 1, intercept = 0, colour = "blue") + 
  facet_grid(. ~ modelo) + 
  ggtitle("Observed vs. predicted (training)")

ggplot(avaliacao, aes(y = res, x = pred)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.1)) + 
  geom_abline(slope = 0, intercept = 0, colour = "darkred") + 
  facet_grid(. ~ modelo) + 
  ggtitle("Resíduos na validação")

Did performance improve?

predictedVal <- predict(marsFit, wines.val)
modelvalues<-data.frame(obs = wines.val$quality, pred = predictedVal)

defaultSummary(modelvalues)
##      RMSE  Rsquared 
## 0.6339553 0.3810186
compare$modelo <- "RL" 
pred_mars <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "MARS")
compare <- rbind(compare, pred_mars)

ggplot(compare, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 1, intercept = 0) +
  ggtitle("Observed vs. predicted (validation)")

ggplot(compare, aes(y = (pred - obs), x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 0, intercept = 0, colour = "darkred") + 
  ggtitle("Resíduos na validação")

Bagged MARS

MARS + Bootstrap AGGregation
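
The idea, in a hand-rolled sketch (illustrative only; in practice we just use method = "bagEarthGCV" below): fit earth on several bootstrap resamples of the training set and average the predictions.

set.seed(42)
B <- 10
boot.preds <- sapply(seq_len(B), function(b) {
  idx <- sample(nrow(wines.dev), replace = TRUE)   # bootstrap resample
  fit <- earth(quality ~ ., data = wines.dev[idx, ])
  predict(fit, wines.val)
})
bagged.pred <- rowMeans(boot.preds)                # aggregated (averaged) prediction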

bMarsFit <- train(quality ~. , 
                 data = wines.dev, 
                 method = "bagEarthGCV", 
                 trControl = ctrl,
                 tuneGrid = expand.grid(.degree = 1:2),
                 metric = "RMSE")

bMarsFit
## Bagged MARS using gCV Pruning 
## 
## 1120 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 1008, 1008, 1007, 1007, 1009, 1008, ... 
## 
## Resampling results across tuning parameters:
## 
##   degree  RMSE       Rsquared   RMSE SD     Rsquared SD
##   1       0.6550211  0.3520736  0.03157344  0.04854529 
##   2       0.6400212  0.3771018  0.02947601  0.04313088 
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was degree = 2.
predictedVal <- predict(bMarsFit, wines.val)
modelvalues<-data.frame(obs = wines.val$quality, pred = predictedVal)

defaultSummary(modelvalues)
##      RMSE  Rsquared 
## 0.6122403 0.4223664

Boosted LM
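
Here method = "bstLs" (from the bst package) boosts a linear least-squares base learner: at each of mstop iterations a model is fit to the current residuals and a shrunken fraction (nu) of it is added to the ensemble. A rough conceptual sketch of that loop, not the bst package's exact algorithm:

boost_ls <- function(x, y, mstop = 150, nu = 0.1) {
  f <- rep(mean(y), length(y))                # start from the mean prediction
  for (m in seq_len(mstop)) {
    res  <- y - f                             # current residuals
    base <- lm(res ~ ., data = data.frame(x, res = res))
    f    <- f + nu * fitted(base)             # shrunken update
  }
  f
}
# e.g.: boost_ls(wines.dev[, 1:11], wines.dev$quality)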

bstLsFit <- train(quality ~. , 
                 data = wines.dev, 
                 method = "bstLs", 
                 trControl = ctrl,
                 metric = "RMSE")
## Loading required package: bst
bstLsFit
## Boosted Linear Model 
## 
## 1120 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 1008, 1008, 1008, 1008, 1009, 1008, ... 
## 
## Resampling results across tuning parameters:
## 
##   mstop  RMSE       Rsquared   RMSE SD     Rsquared SD
##    50    0.7744332  0.1246540  0.03709427  0.06084528 
##   100    0.7585361  0.1695485  0.03729673  0.06388378 
##   150    0.7459287  0.1999767  0.03733727  0.06562179 
## 
## Tuning parameter 'nu' was held constant at a value of 0.1
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were mstop = 150 and nu = 0.1.
summary(bstLsFit)
##              Length Class      Mode     
## y             1120  -none-     numeric  
## x            12320  -none-     numeric  
## cost             1  -none-     numeric  
## family           1  -none-     character
## learner          1  -none-     character
## yhat          1120  -none-     numeric  
## offset           1  -none-     numeric  
## ens            150  -none-     list     
## control.tree     1  -none-     list     
## risk           150  -none-     numeric  
## ctrl             4  -none-     list     
## maxdepth         1  -none-     numeric  
## xselect          6  -none-     numeric  
## coef           150  -none-     numeric  
## ensemble       150  -none-     numeric  
## ml.fit          12  lm         list     
## call             6  -none-     call     
## xNames          11  -none-     character
## problemType      1  -none-     character
## tuneValue        2  data.frame list     
## obsLevels        1  -none-     logical
predictedVal <- predict(bstLsFit, wines.val)
modelvalues<-data.frame(obs = wines.val$quality, pred = predictedVal)

defaultSummary(modelvalues)
##      RMSE  Rsquared 
## 0.7354052 0.2211663
pred_blm <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "Boosted LM")
compare <- rbind(compare, pred_blm)

ggplot(compare, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 1, intercept = 0) +
  ggtitle("Observed vs. predicted (validation)")

ggplot(compare, aes(y = (pred - obs), x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 0, intercept = 0, colour = "darkred") + 
  ggtitle("Resíduos na validação")

kNN
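
kNN works on distances, so the predictors (which live on very different scales here) are centered and scaled via preProcess below. A rough sketch of a single prediction, assuming Euclidean distance and the k = 6 that ends up selected:

x.tr  <- scale(wines.dev[, 1:11])             # center and scale the training predictors
x.new <- scale(wines.val[1, 1:11],
               center = attr(x.tr, "scaled:center"),
               scale  = attr(x.tr, "scaled:scale"))
dists <- sqrt(rowSums(sweep(x.tr, 2, as.numeric(x.new))^2))
mean(wines.dev$quality[order(dists)[1:6]])    # average quality of the 6 nearest training wines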

knnFit <- train(quality ~. , 
                data = wines.dev, 
                method = "knn", 
                trControl = ctrl,
                preProcess = c("center","scale"), 
                tuneGrid = expand.grid(.k = 3:6),
                metric = "RMSE")

knnFit
## k-Nearest Neighbors 
## 
## 1120 samples
##   11 predictor
## 
## Pre-processing: centered, scaled 
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 1009, 1007, 1008, 1006, 1008, 1008, ... 
## 
## Resampling results across tuning parameters:
## 
##   k  RMSE       Rsquared   RMSE SD     Rsquared SD
##   3  0.7083258  0.2946669  0.07114445  0.09042016 
##   4  0.6976602  0.2962700  0.06738799  0.08367464 
##   5  0.6898862  0.3004407  0.06935318  0.08136818 
##   6  0.6851462  0.3019338  0.06750080  0.08057502 
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was k = 6.
predictedVal <- predict(knnFit, wines.val)
modelvalues<-data.frame(obs = wines.val$quality, pred = predictedVal)

defaultSummary(modelvalues)
##      RMSE  Rsquared 
## 0.6641049 0.3377955
pred_knn <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "kNN")
compare <- rbind(compare, pred_knn)

ggplot(compare, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 1, intercept = 0, colour = "darkblue") +
  #ylim(3, 8) + 
  ggtitle("Observed vs. predicted (validation)")

ggplot(compare, aes(y = (pred - obs), x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 0, intercept = 0, colour = "darkred") + 
  ggtitle("Resíduos na validação")

Cubist

cubFit <- train(quality ~. , 
                data = wines.dev, 
                method = "cubist", 
                trControl = ctrl,
                metric = "RMSE")
## Loading required package: Cubist
cubFit
## Cubist 
## 
## 1120 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 1009, 1009, 1008, 1007, 1008, 1007, ... 
## 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE       Rsquared   RMSE SD     Rsquared SD
##    1          0          0.6615767  0.3462724  0.03643841  0.05543391 
##    1          5          0.6527261  0.3718818  0.04175233  0.06420414 
##    1          9          0.6423597  0.3811342  0.03925410  0.06255311 
##   10          0          0.6468007  0.3685528  0.03149673  0.04399655 
##   10          5          0.6478554  0.3774355  0.03687477  0.05832914 
##   10          9          0.6368622  0.3885977  0.03363137  0.05374157 
##   20          0          0.6432439  0.3749506  0.03259530  0.04635415 
##   20          5          0.6457803  0.3811723  0.03677562  0.05828424 
##   20          9          0.6342645  0.3937789  0.03457025  0.05598567 
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
predictedVal <- predict(cubFit, wines.val)
modelvalues<-data.frame(obs = wines.val$quality, pred = predictedVal)

defaultSummary(modelvalues)
##      RMSE  Rsquared 
## 0.6143765 0.4266906
pred_cub <- data.frame(obs = wines.val$quality, pred = predictedVal, modelo = "Cubist")
compare <- rbind(compare, pred_cub)

ggplot(compare, aes(y = pred, x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 1, intercept = 0, colour = "darkblue") +
  #ylim(3, 8) + 
  ggtitle("Observed vs. predicted (validation)")

ggplot(compare, aes(y = (pred - obs), x = obs)) + 
  geom_point(alpha = 0.5, position = position_jitter(width=0.2)) + 
  facet_grid(. ~ modelo) + 
  geom_abline(slope = 0, intercept = 0, colour = "darkred") + 
  ggtitle("Resíduos na validação")

plot(varImp(cubFit))


Methodology advice

From the book by Kuhn (the author of caret):

  1. Start with several models that are the least interpretable and most flexible, such as boosted trees or support vector machines. Across many problem domains, these models have a high likelihood of producing the empirically optimum results (i.e., most accurate).

  2. Investigate simpler models that are less opaque (e.g., not complete black boxes), such as multivariate adaptive regression splines (MARS), partial least squares, generalized additive models, or naïve Bayes models.

  3. Consider using the simplest model that reasonably approximates the performance of the more complex methods.

Using this methodology, the modeler can discover the “performance ceiling” for the data set before settling on a model. In many cases, a range of models will be equivalent in terms of performance so the practitioner can weight the benefits of different methodologies (e.g., computational complexity, easy of prediction, interpretability). For example, a nonlinear support vector machine or random forest model might have superior accuracy, but the complexity and scope of the prediction equation may prohibit exporting the prediction equation to a production system. However, if a more interpretable model, such as a MARS model, yielded similar accuracy, the implementation of the prediction equation would be trivial and would also have superior execution time.