# Load the customer churn data from its CSV file and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the script will only run as-is where this file exists.
churn_csv <- "G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 8 - Deep Learning\\Section 39 - Artificial Neural Networks (ANN)\\Artificial_Neural_Networks\\Churn_Modelling.csv"
df <- read.csv(churn_csv)
head(df)
The goal is to predict ‘Exited’ (the dependent variable): 1 = exited, 0 = stayed. Columns not needed for the analysis (the row number, customer ID, and surname) are removed.
# Keep columns 4 through 14 only. The dependent variable is among the kept
# columns, so it stays in the data frame alongside the predictors.
kept_columns <- seq(from = 4, to = 14)
df <- df[, kept_columns]
head(df)
# Recode the two categorical columns as numeric codes given by each value's
# position in its level vector (France = 1, Spain = 2, Germany = 3;
# Female = 1, Male = 2). Values outside the listed levels become NA.
geography_levels <- c("France", "Spain", "Germany")
gender_levels <- c("Female", "Male")
df$Geography <- as.numeric(match(df$Geography, geography_levels))
df$Gender <- as.numeric(match(df$Gender, gender_levels))
# Print the recoded Gender column to inspect the encoding.
df$Gender
[1] 1 1 1 1 1 2 2 1 2 2 2 2 1 1 1 2 2 1 2 1 2 1 1 2 1 2 2 2 1 2 1 2 2 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 2 1 2 2 1
[54] 2 2 2 2 2 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 2 2 2 2 2 2 1 1 2 2 1 1
[107] 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 1 2 2 2 1 1 2 1 1 2 2 1 2 1 1 1 2 2 2 1 1 2 2 1 2 1 1 1
[160] 1 1 2 1 1 2 2 1 1 1 2 1 2 2 2 2 1 1 2 1 1 1 2 1 2 1 2 1 1 2 1 1 1 2 2 2 2 1 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2
[213] 1 1 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 1 2 1 2 1 1 1 2 2 1 2 2 2 1 2 1 2 2 1 1 1 2 2 2 2 1 1 2 2 1 1 1 2
[266] 1 2 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2 2 2 1 2 2 1 1 2 2 1 2 1 2 2 1 1 2 2 1 1 2 1 2 2 2 2 2 2 1 2 2 1 2 2 2 2
[319] 1 2 1 1 2 1 2 2 2 1 2 2 2 1 2 1 1 2 1 2 1 1 1 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 2 2 2 1 1 1 2 1 1 1 2 1 2
[372] 1 2 2 2 1 2 2 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 2 1 1 2 2 1 1 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 2 1 1 1 1 1 1 2 1
[425] 2 2 2 2 1 2 2 2 1 1 2 1 1 2 1 1 1 1 1 2 2 1 2 2 1 1 1 2 2 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 2 2 2 2 2 2 1 2 1
[478] 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 1 1 1 2 1 2 2 1 2 2 2 2 1 2 2 1 2 1 1 1 2 2 1 2 1 2 2 2 1 2 2 2 2 1 2 2 1 2
[531] 1 1 2 2 1 2 2 2 1 1 2 2 1 2 1 1 2 1 2 2 1 2 1 2 2 1 2 2 2 1 1 2 2 1 1 2 1 1 2 2 2 2 2 1 2 1 1 2 1 2 2 1 1
[584] 1 1 1 2 2 2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 2 2 2 2 2 1 2 1 1 2 2 1 2 1 2 1 2 1 1 2 1 2 2 2 2 1 1 1 1 2 2 1
[637] 1 2 1 1 2 1 1 1 1 1 1 2 1 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 1 2 1 2 2 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1
[690] 1 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 2 2 1 1 2 2 2 1 1 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 2 1 2 2 2 2 1 1 2
[743] 2 1 2 2 2 1 2 2 2 1 1 2 1 2 1 1 1 2 1 1 1 1 2 1 1 2 2 1 1 2 2 2 2 2 1 2 2 1 2 1 2 1 2 2 2 2 1 2 2 2 1 1 2
[796] 2 1 2 1 1 2 1 1 1 2 1 1 2 2 1 1 2 1 2 1 2 2 1 2 1 2 1 2 1 2 2 1 2 1 1 1 1 1 2 1 2 1 1 1 2 2 1 2 1 2 2 2 1
[849] 2 2 2 1 1 1 1 2 1 2 2 2 2 1 2 2 2 1 2 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 1 2 2 2 1 1 2 2 1 1 2 2
[902] 2 1 2 2 1 2 1 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 1 1 1 2 2 2 1 2 2 1 2 2 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 2
[955] 1 1 1 1 2 1 1 2 2 2 1 1 1 1 2 2 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 2 1 1
[ reached getOption("max.print") -- omitted 9000 entries ]
# Print the recoded Geography column (France = 1, Spain = 2, Germany = 3)
# to inspect the encoding.
df$Geography
[1] 1 2 1 1 2 2 1 3 1 1 1 2 1 1 2 3 3 2 2 1 1 2 2 1 1 1 3 1 3 1 2 1 3 2 2 1 2 2 1 3 2 1 1 1 2 3 3 3 3 3 3 3 1
[54] 3 3 1 1 3 2 1 3 3 2 2 3 3 3 3 3 1 3 1 2 3 1 1 1 1 2 3 1 1 1 1 1 2 2 1 1 3 2 1 1 1 2 2 2 3 2 1 1 1 1 2 2 2
[107] 3 3 2 3 3 3 1 2 3 3 3 1 3 3 1 2 1 3 1 1 1 3 1 1 1 3 1 1 3 3 3 1 2 2 3 1 1 2 1 2 2 1 1 1 2 1 2 1 1 2 1 2 1
[160] 3 2 1 1 2 2 3 3 2 2 3 1 3 3 1 1 2 1 3 2 3 2 1 3 1 1 1 1 2 2 3 1 1 1 1 2 1 2 3 1 1 2 1 2 1 2 1 2 1 1 1 3 1
[213] 1 1 3 3 1 2 1 1 1 2 1 1 3 1 1 3 3 3 1 1 1 3 1 3 1 3 1 1 2 3 1 3 2 1 3 1 3 3 3 2 1 3 2 2 1 3 1 3 3 2 1 1 2
[266] 2 1 1 2 3 2 3 3 1 3 2 1 1 1 1 1 2 3 1 1 1 2 1 2 2 3 2 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 3 1 3 1 3 1 3 2 2 1 2
[319] 2 1 2 1 1 1 1 1 1 1 3 3 1 2 3 3 2 1 3 1 2 3 3 1 2 1 1 1 3 3 3 1 2 2 2 2 1 1 2 2 1 1 2 3 1 3 1 2 3 3 3 1 2
[372] 3 3 3 3 1 3 2 2 2 1 1 3 3 3 1 2 2 2 1 3 3 2 2 2 3 1 3 1 1 1 3 2 2 1 1 2 2 3 3 2 1 2 3 2 2 3 1 1 1 3 1 3 1
[425] 1 1 3 1 3 3 3 3 3 3 1 1 3 1 1 3 1 1 1 1 2 1 1 1 1 2 3 1 2 2 1 1 1 1 3 2 3 1 2 1 2 1 2 3 1 1 2 1 1 1 1 3 3
[478] 1 1 1 2 1 1 3 2 1 1 1 1 3 2 1 1 1 1 3 1 1 3 1 2 1 1 2 2 1 3 3 1 2 3 3 2 1 1 2 1 3 1 1 1 1 2 3 1 1 1 1 3 1
[531] 2 2 1 2 1 1 3 2 3 3 1 1 1 2 1 2 1 3 1 1 3 3 2 1 3 2 3 3 2 2 3 2 2 1 2 1 2 2 3 1 1 1 1 2 2 1 2 1 1 1 2 2 2
[584] 3 3 1 2 1 1 2 3 1 1 2 2 3 2 3 3 3 1 2 1 1 1 1 1 1 1 3 1 1 1 2 3 2 2 3 2 2 1 2 3 3 2 2 1 1 1 2 1 3 2 3 1 2
[637] 2 1 1 1 2 1 1 3 1 1 1 1 1 1 1 1 1 3 2 3 2 3 1 1 3 2 1 1 1 1 1 1 2 1 3 1 1 2 2 1 2 3 2 2 1 2 3 3 1 2 1 3 1
[690] 1 3 1 3 1 1 1 2 3 2 3 1 1 2 2 3 3 3 1 1 1 1 3 2 1 2 1 3 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 3 3 1 2 1 1 1 1 1
[743] 1 1 1 1 1 2 2 3 1 3 1 3 2 1 3 2 1 1 1 1 3 3 3 2 1 3 2 1 3 1 1 1 3 1 3 2 1 2 1 1 1 1 3 2 1 3 2 2 1 1 3 3 1
[796] 3 1 3 1 3 1 3 3 2 1 2 1 1 3 1 1 2 2 3 1 1 1 3 2 1 3 1 2 2 2 3 1 2 3 1 1 2 3 3 1 1 2 3 2 1 3 1 3 2 1 1 3 1
[849] 2 1 3 3 3 1 1 2 3 2 2 1 1 1 1 1 1 3 1 1 3 1 1 3 1 1 1 1 3 2 1 1 1 1 1 1 1 1 2 1 3 1 2 3 1 1 3 3 1 1 1 3 3
[902] 1 1 1 1 1 1 1 2 2 1 2 2 3 1 1 1 1 2 2 1 2 1 3 1 3 2 3 3 1 2 3 3 2 2 1 1 3 1 1 2 1 1 3 1 2 2 1 1 1 1 3 3 2
[955] 2 1 2 2 2 1 3 2 2 2 3 1 1 2 1 3 1 1 3 2 1 2 1 1 1 1 1 3 1 3 1 3 1 2 2 1 1 1 3 2 2 2 2 2 3 1
[ reached getOption("max.print") -- omitted 9000 entries ]
library(caTools)
package ‘caTools’ was built under R version 3.3.3
# Seeded 80/20 partition of df driven by the Exited column.
# `split` is TRUE for rows that go to the training set and FALSE for rows
# that go to the test set.
set.seed(123)
split <- sample.split(df$Exited, SplitRatio = 0.80)
training_set <- df[split, ]
test_set <- df[!split, ]
# Feature scaling
# Not necessary for XGBoost, so it is skipped here.
# install.packages("xgboost")
library(xgboost)
package ‘xgboost’ was built under R version 3.3.3
# Fit a 10-round xgboost model on the training set. Column 11 (Exited, the
# dependent variable) is dropped from the feature matrix and passed as the
# label instead.
# NOTE(review): no `objective` is passed, so xgboost's default is used; the
# training log reports train-rmse, i.e. a regression loss on a 0/1 label.
# Consider objective = "binary:logistic" -- confirm before changing, since it
# alters the logged metric and the predictions.
train_features <- as.matrix(training_set[-11])
classifier <- xgboost(
  data = train_features,
  label = training_set$Exited,
  nrounds = 10
)
[1] train-rmse:0.417724
[2] train-rmse:0.369587
[3] train-rmse:0.342099
[4] train-rmse:0.325681
[5] train-rmse:0.316158
[6] train-rmse:0.310497
[7] train-rmse:0.305414
[8] train-rmse:0.303013
[9] train-rmse:0.300683
[10] train-rmse:0.298272
# Applying K-fold cross validation
# install.packages("caret")
library(caret)
package ‘caret’ was built under R version 3.3.3
Loading required package: lattice
package ‘lattice’ was built under R version 3.3.3
Loading required package: ggplot2
# 10-fold cross validation on the training set.
# `folds` is a list of row-index vectors, one per held-out fold. For each
# fold, a model is trained on the remaining rows and scored on the held-out
# rows. `cv` is a list with one accuracy value per fold.
folds <- createFolds(training_set$Exited, k = 10)
cv <- lapply(folds, function(x) {
  training_fold <- training_set[-x, ]
  test_fold <- training_set[x, ]
  # Column 11 is Exited: excluded from the features, used as the label.
  classifier <- xgboost(
    data = as.matrix(training_fold[-11]),
    label = training_fold$Exited,
    nrounds = 10
  )
  # Threshold the numeric predictions at 0.5 to get a predicted class.
  y_pred <- predict(classifier, newdata = as.matrix(test_fold[-11])) >= 0.5
  # Accuracy is the share of held-out rows where the predicted class agrees
  # with the actual one. This equals (cm[1,1] + cm[2,2]) / sum(cm) for a 2x2
  # confusion table, but does not fail with "subscript out of bounds" when a
  # fold's predictions (or actuals) contain only one class and the table
  # therefore has a single column (or row).
  # na.rm = TRUE mirrors table(), which drops NA pairs by default.
  mean(y_pred == (test_fold[, 11] == 1), na.rm = TRUE)
})
[1] train-rmse:0.417511
[2] train-rmse:0.368548
[3] train-rmse:0.340471
[4] train-rmse:0.324878
[5] train-rmse:0.314721
[6] train-rmse:0.306865
[7] train-rmse:0.301697
[8] train-rmse:0.298052
[9] train-rmse:0.295592
[10] train-rmse:0.293040
[1] train-rmse:0.417244
[2] train-rmse:0.368158
[3] train-rmse:0.340676
[4] train-rmse:0.324912
[5] train-rmse:0.314037
[6] train-rmse:0.306583
[7] train-rmse:0.302554
[8] train-rmse:0.299224
[9] train-rmse:0.296799
[10] train-rmse:0.294110
[1] train-rmse:0.416136
[2] train-rmse:0.367195
[3] train-rmse:0.338752
[4] train-rmse:0.322656
[5] train-rmse:0.312869
[6] train-rmse:0.306559
[7] train-rmse:0.302208
[8] train-rmse:0.299689
[9] train-rmse:0.295676
[10] train-rmse:0.293186
[1] train-rmse:0.417261
[2] train-rmse:0.369349
[3] train-rmse:0.341684
[4] train-rmse:0.325994
[5] train-rmse:0.316336
[6] train-rmse:0.308903
[7] train-rmse:0.304136
[8] train-rmse:0.300419
[9] train-rmse:0.297376
[10] train-rmse:0.295398
[1] train-rmse:0.417071
[2] train-rmse:0.368179
[3] train-rmse:0.340170
[4] train-rmse:0.323932
[5] train-rmse:0.314532
[6] train-rmse:0.308267
[7] train-rmse:0.303902
[8] train-rmse:0.300467
[9] train-rmse:0.297037
[10] train-rmse:0.296005
[1] train-rmse:0.418153
[2] train-rmse:0.370097
[3] train-rmse:0.342438
[4] train-rmse:0.327157
[5] train-rmse:0.318110
[6] train-rmse:0.312157
[7] train-rmse:0.307803
[8] train-rmse:0.303832
[9] train-rmse:0.301520
[10] train-rmse:0.300574
[1] train-rmse:0.418025
[2] train-rmse:0.369733
[3] train-rmse:0.342694
[4] train-rmse:0.326682
[5] train-rmse:0.315787
[6] train-rmse:0.309104
[7] train-rmse:0.305890
[8] train-rmse:0.302110
[9] train-rmse:0.297851
[10] train-rmse:0.296200
[1] train-rmse:0.417692
[2] train-rmse:0.369005
[3] train-rmse:0.340888
[4] train-rmse:0.324117
[5] train-rmse:0.314281
[6] train-rmse:0.308024
[7] train-rmse:0.304913
[8] train-rmse:0.302051
[9] train-rmse:0.299052
[10] train-rmse:0.297183
[1] train-rmse:0.417432
[2] train-rmse:0.368861
[3] train-rmse:0.340627
[4] train-rmse:0.323917
[5] train-rmse:0.313854
[6] train-rmse:0.307355
[7] train-rmse:0.301953
[8] train-rmse:0.299758
[9] train-rmse:0.296595
[10] train-rmse:0.294788
[1] train-rmse:0.418314
[2] train-rmse:0.370496
[3] train-rmse:0.342442
[4] train-rmse:0.326794
[5] train-rmse:0.316660
[6] train-rmse:0.310425
[7] train-rmse:0.305719
[8] train-rmse:0.303115
[9] train-rmse:0.300370
[10] train-rmse:0.297201
# Average the per-fold accuracies into a single cross-validated estimate
# and print it.
fold_accuracies <- unlist(cv, use.names = FALSE)
accuracy <- mean(fold_accuracies)
accuracy
[1] 0.85925