XGBoost is a form of gradient boosting. Lecture 290 https://www.udemy.com/machinelearning/learn/lecture/6453764
High performance
Fast execution
We keep all the interpretation of the problem: there is no need to expand categorical features like country into dummy variables.
dataset = read.csv('Churn_Modelling.csv')
# keep columns 4-14: the features (independent variables) plus the dependent variable (Exited)
# the first three columns (row number, customer id, surname) have no impact on the outcome
dataset = dataset[4:14]
knitr::include_graphics("BankCustomerData.png")
First we encode the categorical features (Geography and Gender) as numbers, since XGBoost needs a numeric matrix:
dataset$Geography = as.numeric(factor(dataset$Geography,
                                      levels = c('France', 'Spain', 'Germany'),
                                      labels = c(1, 2, 3)))
dataset$Gender = as.numeric(factor(dataset$Gender,
                                   levels = c('Female', 'Male'),
                                   labels = c(1, 2)))
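As a quick sanity check (this step is not in the lecture), we can confirm that every column is now numeric, since as.matrix() would otherwise coerce the whole matrix to character:

sapply(dataset, class)  # every column should now be numeric or integer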
# install.packages('caTools')
library(caTools)
set.seed(123)
# split on the dependent variable so both sets keep roughly the same churn ratio
split = sample.split(dataset$Exited, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
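To see that sample.split preserved the class balance (a check I'm adding, not part of the lecture), compare the churn rate in each set:

prop.table(table(training_set$Exited))  # proportion of churners in the training set
prop.table(table(test_set$Exited))      # should be close to the training proportion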
The fitting output below reports the training root mean squared error (RMSE) after each boosting round.
# install.packages('xgboost')
library(xgboost)
# [-11] drops column 11 (the dependent variable, Exited) from the feature set
# xgboost expects a matrix, not a data frame, so we convert with as.matrix
# label is the dependent variable as a vector
# nrounds is the number of boosting iterations; we'll choose 10
classifier = xgboost(data = as.matrix(training_set[-11]), label = training_set$Exited, nrounds = 10)
## [1] train-rmse:0.417732
## [2] train-rmse:0.369591
## [3] train-rmse:0.342098
## [4] train-rmse:0.325681
## [5] train-rmse:0.316159
## [6] train-rmse:0.310497
## [7] train-rmse:0.305414
## [8] train-rmse:0.303013
## [9] train-rmse:0.300684
## [10] train-rmse:0.298272
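Note that xgboost reports train-rmse here because we did not specify an objective, so it defaults to squared-error regression on the 0/1 labels. A variation (my suggestion, not from the lecture) is to make the classification objective explicit and silence the per-round log:

classifier = xgboost(data = as.matrix(training_set[-11]),
                     label = training_set$Exited,
                     nrounds = 10,
                     objective = 'binary:logistic',  # model churn probabilities directly
                     verbose = 0)                    # suppress the per-round training log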
# predict returns probabilities, so we threshold at 0.5 to get a binary prediction
y_pred = predict(classifier, newdata = as.matrix(test_set[-11]))
y_pred = (y_pred >= 0.5)
# confusion matrix: actual outcomes (rows) vs predictions (columns)
cm = table(test_set[, 11], y_pred)
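From the confusion matrix we can read off the test-set accuracy the same way we will inside the cross-validation loop below (this line is my addition):

(cm[1,1] + cm[2,2]) / sum(cm)  # correct predictions over all test observations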
We use k-fold cross-validation to validate the accuracy of our XGBoost model: the training set is split into 10 folds, and each fold in turn serves as a validation set for a classifier trained on the other nine. xgboost prints a 10-round training log for each fold below.
# install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
folds = createFolds(training_set$Exited, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  # fit the same XGBoost classifier on this fold's training data
  classifier = xgboost(data = as.matrix(training_fold[-11]), label = training_fold$Exited, nrounds = 10)
  y_pred = predict(classifier, newdata = as.matrix(test_fold[-11])) # again, predict needs a matrix
  y_pred = (y_pred >= 0.5) # threshold the probabilities into a binary outcome
  cm = table(test_fold[, 11], y_pred)
  accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  return(accuracy)
})
## [1] train-rmse:0.417751
## [2] train-rmse:0.369226
## [3] train-rmse:0.340978
## [4] train-rmse:0.324698
## [5] train-rmse:0.315062
## [6] train-rmse:0.308865
## [7] train-rmse:0.304597
## [8] train-rmse:0.301234
## [9] train-rmse:0.299177
## [10] train-rmse:0.296997
## [1] train-rmse:0.416580
## [2] train-rmse:0.367659
## [3] train-rmse:0.339303
## [4] train-rmse:0.321607
## [5] train-rmse:0.311742
## [6] train-rmse:0.306015
## [7] train-rmse:0.299935
## [8] train-rmse:0.297774
## [9] train-rmse:0.294960
## [10] train-rmse:0.292531
## [1] train-rmse:0.416725
## [2] train-rmse:0.367874
## [3] train-rmse:0.340352
## [4] train-rmse:0.324469
## [5] train-rmse:0.314764
## [6] train-rmse:0.308515
## [7] train-rmse:0.302797
## [8] train-rmse:0.299819
## [9] train-rmse:0.294961
## [10] train-rmse:0.293289
## [1] train-rmse:0.418281
## [2] train-rmse:0.370448
## [3] train-rmse:0.343018
## [4] train-rmse:0.326706
## [5] train-rmse:0.315882
## [6] train-rmse:0.309408
## [7] train-rmse:0.304730
## [8] train-rmse:0.302643
## [9] train-rmse:0.299575
## [10] train-rmse:0.296431
## [1] train-rmse:0.418011
## [2] train-rmse:0.369630
## [3] train-rmse:0.341924
## [4] train-rmse:0.326216
## [5] train-rmse:0.316442
## [6] train-rmse:0.310661
## [7] train-rmse:0.306469
## [8] train-rmse:0.301762
## [9] train-rmse:0.300215
## [10] train-rmse:0.297390
## [1] train-rmse:0.417158
## [2] train-rmse:0.369087
## [3] train-rmse:0.341431
## [4] train-rmse:0.325114
## [5] train-rmse:0.315444
## [6] train-rmse:0.309110
## [7] train-rmse:0.304607
## [8] train-rmse:0.299609
## [9] train-rmse:0.296509
## [10] train-rmse:0.295154
## [1] train-rmse:0.417483
## [2] train-rmse:0.368896
## [3] train-rmse:0.340769
## [4] train-rmse:0.324137
## [5] train-rmse:0.314588
## [6] train-rmse:0.306912
## [7] train-rmse:0.301667
## [8] train-rmse:0.299551
## [9] train-rmse:0.295551
## [10] train-rmse:0.293667
## [1] train-rmse:0.417165
## [2] train-rmse:0.368203
## [3] train-rmse:0.339787
## [4] train-rmse:0.322932
## [5] train-rmse:0.313179
## [6] train-rmse:0.307549
## [7] train-rmse:0.301477
## [8] train-rmse:0.299353
## [9] train-rmse:0.297144
## [10] train-rmse:0.294549
## [1] train-rmse:0.418012
## [2] train-rmse:0.369551
## [3] train-rmse:0.341373
## [4] train-rmse:0.325821
## [5] train-rmse:0.316044
## [6] train-rmse:0.309670
## [7] train-rmse:0.304819
## [8] train-rmse:0.301259
## [9] train-rmse:0.298292
## [10] train-rmse:0.296204
## [1] train-rmse:0.417991
## [2] train-rmse:0.369705
## [3] train-rmse:0.342000
## [4] train-rmse:0.325743
## [5] train-rmse:0.316375
## [6] train-rmse:0.309956
## [7] train-rmse:0.304910
## [8] train-rmse:0.301956
## [9] train-rmse:0.299017
## [10] train-rmse:0.296713
# average the 10 fold accuracies into a single estimate
accuracy = mean(as.numeric(cv))
accuracy
## [1] 0.8585
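To gauge how stable this estimate is across folds (an extra step, not from the lecture), we can also look at the spread of the fold accuracies:

sd(as.numeric(cv))  # standard deviation of the 10 fold accuracies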