Juan Carlos Carmona Calvo
April 28, 2019
Dataset: mtcars, outcome variable: am
Steps:
# Work on a data-frame copy of the built-in mtcars data set and report how
# many columns it has before any filtering.
Dat <- as.data.frame(mtcars)
paste("Covariates before:", ncol(Dat))
[1] "Covariates before: 11"
# Fix the RNG seed so the random steps that follow are reproducible.
set.seed(13)

# Name of the outcome column. It is referred to by name (not by position)
# because the filters below can shift column positions.
outcome <- "am"

# First, drop the columns whose variance is near zero.
nz <- nearZeroVar(Dat)
if (length(nz) > 0) {
  Dat <- Dat[, -nz, drop = FALSE]
}

# Second, drop the columns in which more than 90% of the values are NA.
vna <- vapply(Dat, function(x) mean(is.na(x)), numeric(1)) > 0.9
Dat <- Dat[, !vna, drop = FALSE]

# Finally, drop predictors that are highly correlated with one another.
# The outcome is left out of the correlation matrix so it is never removed.
predictors <- setdiff(names(Dat), outcome)
if (length(predictors) > 1) {
  descrCor <- cor(Dat[, predictors, drop = FALSE])
  highlyCorDescr <- findCorrelation(descrCor, cutoff = 0.8)
  # findCorrelation() returns positions within descrCor, which does not
  # contain the outcome column, so translate them to column names before
  # subsetting Dat. Dropping by name is also a safe no-op when nothing is
  # flagged (a negative empty index would select zero columns).
  drop_cols <- colnames(descrCor)[highlyCorDescr]
  Dat <- Dat[, setdiff(names(Dat), drop_cols), drop = FALSE]
}
paste("Covariates after:", ncol(Dat))
[1] "Covariates after: 8"
# Locate the outcome column by exact name. A pattern search such as
# grep("am", ...) would also match any column whose name merely contains
# "am", turning vf into a vector and breaking the steps below.
vf <- match("am", names(Dat))
stopifnot(!is.na(vf))

# Treat the outcome as a class label rather than a number.
Dat[[vf]] <- as.factor(Dat[[vf]])

# Split the data: about 60% for training, then the remainder is halved into
# a validation set and a test set.
Spl <- createDataPartition(y = Dat[[vf]], p = 0.6, list = FALSE)
Dat.train.train <- Dat[Spl, ]
Dat.2 <- Dat[-Spl, ]
Spl2 <- createDataPartition(y = Dat.2[[vf]], p = 0.5, list = FALSE)
Dat.train.val <- Dat.2[Spl2, ]
Dat.test <- Dat.2[-Spl2, ]

# Report the size of each partition.
paste("Train:", nrow(Dat.train.train))
paste("Val:", nrow(Dat.train.val))
paste("Test:", nrow(Dat.test))
[1] "Train: 20"
[1] "Val: 7"
[1] "Test: 5"
# Fit a random forest ("rf") on the training partition, resampling with
# 4-fold cross-validation.
vControl <- trainControl(method = "cv", number = 4, verboseIter = FALSE)
Modfit.rf <- train(
  am ~ .,
  data = Dat.train.train,
  method = "rf",
  trControl = vControl
)

# Score the validation partition. Accuracy is extracted once, by name, so the
# accuracy and the out-of-sample error are derived from the same value.
Pre.rf.val <- predict(Modfit.rf, Dat.train.val)
Acc.rf.val <- confusionMatrix(
  Pre.rf.val,
  Dat.train.val$am
)$overall[["Accuracy"]]
paste("Validation Accuracy :", round(Acc.rf.val, 2) * 100, "%")
paste("Validation Error out-of-sample :", round(1 - Acc.rf.val, 2) * 100, "%")
[1] "Validation Accuracy : 71 %"
[1] "Validation Error out-of-sample : 29 %"
# Score the held-out test partition with the fitted model. Accuracy is
# extracted once, by name, so the accuracy and the out-of-sample error are
# derived from the same value.
Pre.rf.test <- predict(Modfit.rf, Dat.test)
Acc.rf.test <- confusionMatrix(
  Pre.rf.test,
  Dat.test$am
)$overall[["Accuracy"]]
paste("Testing Accuracy :", round(Acc.rf.test, 2) * 100, "%")
paste("Testing Error out-of-sample :", round(1 - Acc.rf.test, 2) * 100, "%")
[1] "Testing Accuracy : 80 %"
[1] "Testing Error out-of-sample : 20 %"