# Caret
# Instalando os pacotes
#install.packages("caret")
#install.packages("randomForest")
# Carregando os pacotes
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(datasets)
# Load the built-in mtcars dataset
data(mtcars)
# Open the help page for caret's data-partitioning function
?createDataPartition
## starting httpd help server ...
## done
# createDataPartition() picks rows at random, so fix the RNG seed first;
# otherwise every run yields a different train/test split and therefore
# different model results further down the script.
set.seed(123)
# Row indices for roughly 70% of the data, using mpg as the outcome;
# list = FALSE asks for the indices as a matrix instead of a list
split <- createDataPartition(y = mtcars$mpg, p = 0.7, list = FALSE)
# Training rows are the selected indices; test rows are all remaining rows
dados_treino <- mtcars[split, ]
dados_teste <- mtcars[-split, ]
# Model training
# Show the documentation for caret's train()
help("train")
# Print the names of every model method that caret knows about
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "bdk"
## [19] "binda" "blackboost" "blasso"
## [22] "blassoAveraged" "Boruta" "bridge"
## [25] "brnn" "BstLm" "bstSm"
## [28] "bstTree" "C5.0" "C5.0Cost"
## [31] "C5.0Rules" "C5.0Tree" "cforest"
## [34] "chaid" "CSimca" "ctree"
## [37] "ctree2" "cubist" "dda"
## [40] "deepboost" "DENFIS" "dnn"
## [43] "dwdLinear" "dwdPoly" "dwdRadial"
## [46] "earth" "elm" "enet"
## [49] "enpls.fs" "enpls" "evtree"
## [52] "extraTrees" "fda" "FH.GBML"
## [55] "FIR.DM" "foba" "FRBCS.CHI"
## [58] "FRBCS.W" "FS.HGD" "gam"
## [61] "gamboost" "gamLoess" "gamSpline"
## [64] "gaussprLinear" "gaussprPoly" "gaussprRadial"
## [67] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [70] "GFS.GCCL" "GFS.LT.RS" "GFS.THRIFT"
## [73] "glm" "glmboost" "glmnet"
## [76] "glmStepAIC" "gpls" "hda"
## [79] "hdda" "hdrda" "HYFIS"
## [82] "icr" "J48" "JRip"
## [85] "kernelpls" "kknn" "knn"
## [88] "krlsPoly" "krlsRadial" "lars"
## [91] "lars2" "lasso" "lda"
## [94] "lda2" "leapBackward" "leapForward"
## [97] "leapSeq" "Linda" "lm"
## [100] "lmStepAIC" "LMT" "loclda"
## [103] "logicBag" "LogitBoost" "logreg"
## [106] "lssvmLinear" "lssvmPoly" "lssvmRadial"
## [109] "lvq" "M5" "M5Rules"
## [112] "manb" "mda" "Mlda"
## [115] "mlp" "mlpML" "mlpSGD"
## [118] "mlpWeightDecay" "mlpWeightDecayML" "multinom"
## [121] "nb" "nbDiscrete" "nbSearch"
## [124] "neuralnet" "nnet" "nnls"
## [127] "nodeHarvest" "oblique.tree" "OneR"
## [130] "ordinalNet" "ORFlog" "ORFpls"
## [133] "ORFridge" "ORFsvm" "ownn"
## [136] "pam" "parRF" "PART"
## [139] "partDSA" "pcaNNet" "pcr"
## [142] "pda" "pda2" "penalized"
## [145] "PenalizedLDA" "plr" "pls"
## [148] "plsRglm" "polr" "ppr"
## [151] "protoclass" "pythonKnnReg" "qda"
## [154] "QdaCov" "qrf" "qrnn"
## [157] "randomGLM" "ranger" "rbf"
## [160] "rbfDDA" "Rborist" "rda"
## [163] "relaxo" "rf" "rFerns"
## [166] "RFlda" "rfRules" "ridge"
## [169] "rlda" "rlm" "rmda"
## [172] "rocc" "rotationForest" "rotationForestCp"
## [175] "rpart" "rpart1SE" "rpart2"
## [178] "rpartCost" "rpartScore" "rqlasso"
## [181] "rqnc" "RRF" "RRFglobal"
## [184] "rrlda" "RSimca" "rvmLinear"
## [187] "rvmPoly" "rvmRadial" "SBC"
## [190] "sda" "sddaLDA" "sddaQDA"
## [193] "sdwd" "simpls" "SLAVE"
## [196] "slda" "smda" "snn"
## [199] "sparseLDA" "spikeslab" "spls"
## [202] "stepLDA" "stepQDA" "superpc"
## [205] "svmBoundrangeString" "svmExpoString" "svmLinear"
## [208] "svmLinear2" "svmLinear3" "svmLinearWeights"
## [211] "svmLinearWeights2" "svmPoly" "svmRadial"
## [214] "svmRadialCost" "svmRadialSigma" "svmRadialWeights"
## [217] "svmSpectrumString" "tan" "tanSearch"
## [220] "treebag" "vbmpRadial" "vglmAdjCat"
## [223] "vglmContRatio" "vglmCumulative" "widekernelpls"
## [226] "WM" "wsrf" "xgbLinear"
## [229] "xgbTree" "xyf"
# train() resamples the data at random, so the seed is fixed before each
# call: re-using the same seed makes the results repeatable and gives the
# three models the same resamples, so their estimates are comparable.
# Linear regression
set.seed(123)
modelolm <- train(mpg ~ ., data = dados_treino, method = "lm")
# Generalized linear model. mpg is numeric, so this is a gaussian GLM
# (equivalent to the linear regression above), not a logistic regression.
set.seed(123)
modelolm2 <- train(mpg ~ ., data = dados_treino, method = "glm")
# Random forest
set.seed(123)
modelolm3 <- train(mpg ~ ., data = dados_treino, method = "rf")
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Coefficient table and fit statistics of the linear model
summary(object = modelolm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0504 -1.7144 0.1626 1.3853 4.1936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.031897 28.206510 -1.065 0.3064
## cyl 0.584525 1.055176 0.554 0.5890
## disp 0.006436 0.021130 0.305 0.7655
## hp -0.006903 0.026111 -0.264 0.7956
## drat 2.363192 2.108162 1.121 0.2826
## wt -5.090450 2.574063 -1.978 0.0696 .
## qsec 2.815753 1.230073 2.289 0.0394 *
## vs -3.121273 2.694013 -1.159 0.2675
## am 2.186772 2.275734 0.961 0.3541
## gear 0.795180 1.763481 0.451 0.6595
## carb 0.337487 0.939505 0.359 0.7252
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.484 on 13 degrees of freedom
## Multiple R-squared: 0.8969, Adjusted R-squared: 0.8175
## F-statistic: 11.31 on 10 and 13 DF, p-value: 7.048e-05
# Same summary for the model fitted with method = "glm"
summary(object = modelolm2)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0504 -1.7144 0.1626 1.3853 4.1936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.031897 28.206510 -1.065 0.3064
## cyl 0.584525 1.055176 0.554 0.5890
## disp 0.006436 0.021130 0.305 0.7655
## hp -0.006903 0.026111 -0.264 0.7956
## drat 2.363192 2.108162 1.121 0.2826
## wt -5.090450 2.574063 -1.978 0.0696 .
## qsec 2.815753 1.230073 2.289 0.0394 *
## vs -3.121273 2.694013 -1.159 0.2675
## am 2.186772 2.275734 0.961 0.3541
## gear 0.795180 1.763481 0.451 0.6595
## carb 0.337487 0.939505 0.359 0.7252
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 6.169274)
##
## Null deviance: 777.650 on 23 degrees of freedom
## Residual deviance: 80.201 on 13 degrees of freedom
## AIC: 121.06
##
## Number of Fisher Scoring iterations: 2
# Summary of the random forest fit; for this model it prints a listing of
# the fitted object's components rather than a coefficient table
summary(object = modelolm3)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 24 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 24 -none- numeric
## importance 10 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 24 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 10 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
# Model tuning
?expand.grid
?trainControl
# Resampling scheme: 10-fold cross-validation
controle1 <- trainControl(method = "cv", number = 10)
# The folds are assigned at random, so fix the RNG seed to make the
# cross-validated results repeatable between runs.
set.seed(123)
# NOTE(review): this model is fitted on the full mtcars data, not on
# dados_treino, so the rows in dados_teste are part of its training data;
# confirm that is intended before scoring it on dados_teste.
modelolm_v2 <- train(
  mpg ~ .,
  data = mtcars,
  method = "lm",
  trControl = controle1,
  metric = "Rsquared"
)
# Resumo do modelo
summary(modelolm_v2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
# Residuals of the linear model.
# NOTE: this variable shares its name with the stats::residuals() function;
# the name is kept so any later code that reads `residuals` still works.
residuals <- resid(modelolm)
# Predictions
?predict
# No newdata is supplied, so these are predictions for the training rows
# (they are plotted against the training mpg values below)
predictedValues <- predict(modelolm)
plot(dados_treino$mpg, predictedValues)
# The plot above only shows in-sample fit. Also score the model on the
# held-out rows, so dados_teste is actually used to estimate how the model
# performs on data it has not seen (prints RMSE, R-squared and MAE).
previsoes_teste <- predict(modelolm, newdata = dados_teste)
postResample(pred = previsoes_teste, obs = dados_teste$mpg)

# Variable importance: relative contribution of each predictor to the model
help("varImp")
varImp(object = modelolm)
## lm variable importance
##
## Overall
## qsec 100.000
## wt 84.615
## vs 44.165
## drat 42.307
## am 34.402
## cyl 14.303
## gear 9.214
## carb 4.685
## disp 1.986
## hp 0.000
# Plot the variable importance scores of the linear model
plot(x = varImp(object = modelolm))
