# ML in R

library(AUC)

## AUC 0.3.0

## Type AUCNews() to see the change log and ?AUC to get an overview.

# Load the churn example data; its `predictions` and `labels` columns feed
# every curve evaluated in this section.
data(churn)

# Area under the sensitivity curve.
with(churn, auc(sensitivity(predictions, labels)))

## [1] 0.8026259

# Area under the specificity curve.
with(churn, auc(specificity(predictions, labels)))

## [1] 0.4591936

# Area under the accuracy curve.
with(churn, auc(accuracy(predictions, labels)))

## [1] 0.5034279

# Area under the ROC curve.
with(churn, auc(roc(predictions, labels)))

## [1] 0.8439201

# Draw one plot per curve type, in the order sensitivity, specificity,
# accuracy, ROC. invisible() suppresses the list returned by lapply().
invisible(lapply(
  list(sensitivity, specificity, accuracy, roc),
  function(curve_fn) plot(curve_fn(churn$predictions, churn$labels))
))

#feature selection
library(mlbench)
# Load the Ozone data from mlbench and give its 13 columns descriptive names.
data(Ozone, package = "mlbench")
inputData <- setNames(
  Ozone,
  c(
    "Month", "Day_of_month", "Day_of_week", "ozone_reading",
    "pressure_height", "Wind_speed", "Humidity", "Temperature_Sandburg",
    "Temperature_ElMonte", "Inversion_base_height", "Pressure_gradient",
    "Inversion_temperature", "Visibility"
  )
)

#NA imputation
library(DMwR)

## Loading required package: lattice

## Loading required package: grid

# Fill in missing values with k-nearest-neighbour imputation.
inputData <- knnImputation(inputData)

# Name of the response column, and the response itself as a plain vector.
response_name <- "ozone_reading"
response <- inputData[[response_name]]

# The same response wrapped in a one-column data frame.
inputData_response <- data.frame(ozone_reading = response)

# Continuous predictors.
inputData_cont <- inputData[
  ,
  c(
    "pressure_height", "Wind_speed", "Humidity", "Temperature_Sandburg",
    "Temperature_ElMonte", "Inversion_base_height", "Pressure_gradient",
    "Inversion_temperature", "Visibility"
  )
]

# Calendar (categorical) predictors.
inputData_cat <- inputData[, c("Month", "Day_of_month", "Day_of_week")]

#1. random forest method
library(party)

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

# Fit a conditional-inference random forest of ozone_reading on all other
# columns, using the unbiased control settings with mtry = 2 and 50 trees.
cf1 <- cforest(
  ozone_reading ~ .,
  data = inputData,
  control = cforest_unbiased(mtry = 2, ntree = 50)
)

# Permutation-based variable importance of the fitted forest.
varimp(cf1)

##                 Month          Day_of_month           Day_of_week 
##             3.7062044            -0.1382535            -0.2174799 
##       pressure_height            Wind_speed              Humidity 
##             2.9864310             0.1739157             4.0674793 
##  Temperature_Sandburg   Temperature_ElMonte Inversion_base_height 
##            10.9701438            11.5560742             6.0519120 
##     Pressure_gradient Inversion_temperature            Visibility 
##             2.3410749             9.4198043             1.5428216

# Conditional variable importance (conditional = TRUE), which adjusts for
# correlations between predictors.
varimp(object = cf1, conditional = TRUE)

##                 Month          Day_of_month           Day_of_week 
##            0.56589104           -0.08416212           -0.01287185 
##       pressure_height            Wind_speed              Humidity 
##            0.17733925           -0.04787313            0.43068883 
##  Temperature_Sandburg   Temperature_ElMonte Inversion_base_height 
##            0.91190947            2.18890604            0.87562096 
##     Pressure_gradient Inversion_temperature            Visibility 
##            0.29950532            0.90544473            0.21006645

# AUC-based variant of the variable importance for the same forest.
# NOTE(review): ozone_reading is fitted with lm() elsewhere in this script,
# i.e. treated as a numeric response, so the class-imbalance rationale given
# for this call may not apply here — confirm.
varimpAUC(cf1)  # more robust towards class imbalance

##                 Month          Day_of_month           Day_of_week 
##            3.81324789            0.03788994           -0.04052355 
##       pressure_height            Wind_speed              Humidity 
##            3.09446180            0.02728874            3.89792164 
##  Temperature_Sandburg   Temperature_ElMonte Inversion_base_height 
##           10.92087203           12.04959959            5.06100818 
##     Pressure_gradient Inversion_temperature            Visibility 
##            2.26375315           10.22862976            1.62766000

#2. relative importance
library(relaimpo)

## Loading required package: MASS

## Loading required package: boot

## 
## Attaching package: 'boot'

## The following object is masked from 'package:lattice':
## 
##     melanoma

## Loading required package: survey

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:boot':
## 
##     aml

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

## Loading required package: mitools

## This is the global version of package relaimpo.

## If you are a non-US user, a version with the interesting additional metric pmvd is available

## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.

# Linear model of ozone_reading on every other column.
lmMod <- lm(ozone_reading ~ ., data = inputData)

# LMG relative importance; with rela = TRUE the shares are rescaled so that
# they sum to 1 (i.e. 100%).
relImportance <- calc.relimp(lmMod, type = "lmg", rela = TRUE)

# Importance shares, largest first.
sort(relImportance$lmg, decreasing = TRUE)

##   Temperature_ElMonte  Temperature_Sandburg                 Month 
##           0.184722438           0.164540381           0.163371978 
## Inversion_temperature       pressure_height Inversion_base_height 
##           0.137890248           0.087594494           0.083696664 
##              Humidity            Visibility          Day_of_month 
##           0.068573808           0.039202230           0.031248599 
##     Pressure_gradient           Day_of_week            Wind_speed 
##           0.026557629           0.008371262           0.004230269

#3. MARS in earth package
library(earth)

## Loading required package: plotmo

## Loading required package: plotrix

## Loading required package: TeachingDemos

# Fit a MARS model on all predictors, estimate variable importance from it,
# and plot the result.
marsModel <- earth(ozone_reading ~ ., data = inputData)
ev <- evimp(marsModel)
plot(ev)

#4. stepwise regression
# Intercept-only and all-predictor models; they bound the stepwise search.
base.mod <- lm(ozone_reading ~ 1, data = inputData)
all.mod <- lm(ozone_reading ~ ., data = inputData)

# Stepwise selection in both directions, starting from the intercept-only
# model, tracing every step, with at most 1000 steps.
stepMod <- step(
  base.mod,
  scope = list(lower = base.mod, upper = all.mod),
  direction = "both",
  trace = 1,
  steps = 1000
)

## Start:  AIC=1514.09
## ozone_reading ~ 1
## 
##                         Df Sum of Sq     RSS    AIC
## + Temperature_Sandburg   1   13565.4  9223.7 1185.0
## + Temperature_ElMonte    1   13383.8  9405.3 1192.2
## + Inversion_temperature  1   12085.6 10703.5 1239.5
## + Month                 11    9210.1 13579.0 1346.6
## + pressure_height        1    7906.1 14883.1 1360.2
## + Inversion_base_height  1    7638.0 15151.2 1366.7
## + Humidity               1    4697.7 18091.5 1431.6
## + Visibility             1    3943.0 18846.1 1446.6
## + Pressure_gradient      1    1220.0 21569.1 1496.0
## <none>                               22789.2 1514.1
## + Wind_speed             1       0.6 22788.6 1516.1
## + Day_of_week            6     279.0 22510.1 1521.6
## + Day_of_month          30     772.7 22016.4 1561.5
## 
## Step:  AIC=1185.05
## ozone_reading ~ Temperature_Sandburg
## 
##                         Df Sum of Sq     RSS    AIC
## + Month                 11    1355.1  7868.6 1148.9
## + Humidity               1     877.7  8346.0 1150.5
## + Inversion_base_height  1     868.8  8355.0 1150.8
## + Temperature_ElMonte    1     633.3  8590.4 1161.0
## + Visibility             1     374.9  8848.8 1171.9
## + Inversion_temperature  1     353.9  8869.8 1172.7
## + Pressure_gradient      1     159.5  9064.2 1180.7
## + pressure_height        1      68.4  9155.3 1184.3
## <none>                                9223.7 1185.0
## + Wind_speed             1       1.5  9222.2 1187.0
## + Day_of_week            6     120.8  9102.9 1192.2
## + Day_of_month          30     630.3  8593.4 1219.1
## - Temperature_Sandburg   1   13565.4 22789.2 1514.1
## 
## Step:  AIC=1148.89
## ozone_reading ~ Temperature_Sandburg + Month
## 
##                         Df Sum of Sq     RSS    AIC
## + Temperature_ElMonte    1    1445.9  6422.7 1076.6
## + Inversion_temperature  1     864.3  7004.4 1108.3
## + Inversion_base_height  1     849.1  7019.5 1109.1
## + Visibility             1     641.5  7227.1 1119.8
## + Humidity               1     593.2  7275.4 1122.2
## + Wind_speed             1     161.6  7707.0 1143.3
## + pressure_height        1     102.1  7766.5 1146.1
## + Pressure_gradient      1      93.1  7775.6 1146.5
## <none>                                7868.6 1148.9
## + Day_of_week            6     120.0  7748.6 1155.3
## + Day_of_month          30     632.7  7235.9 1178.2
## - Month                 11    1355.1  9223.7 1185.0
## - Temperature_Sandburg   1    5710.4 13579.0 1346.6
## 
## Step:  AIC=1076.58
## ozone_reading ~ Temperature_Sandburg + Month + Temperature_ElMonte
## 
##                         Df Sum of Sq    RSS    AIC
## + Humidity               1    731.20 5691.5 1034.3
## + Visibility             1    375.27 6047.5 1056.5
## + Inversion_base_height  1    262.08 6160.7 1063.3
## + pressure_height        1    119.72 6303.0 1071.7
## + Wind_speed             1     75.73 6347.0 1074.2
## + Inversion_temperature  1     52.17 6370.6 1075.6
## <none>                               6422.7 1076.6
## + Pressure_gradient      1     26.95 6395.8 1077.0
## - Temperature_Sandburg   1    149.49 6572.2 1083.0
## + Day_of_week            6     80.32 6342.4 1084.0
## + Day_of_month          30    523.90 5898.8 1105.4
## - Temperature_ElMonte    1   1445.90 7868.6 1148.9
## - Month                 11   2167.69 8590.4 1161.0
## 
## Step:  AIC=1034.34
## ozone_reading ~ Temperature_Sandburg + Month + Temperature_ElMonte + 
##     Humidity
## 
##                         Df Sum of Sq    RSS    AIC
## + Pressure_gradient      1    113.67 5577.9 1029.0
## + Visibility             1     73.23 5618.3 1031.6
## + Inversion_base_height  1     65.14 5626.4 1032.1
## + Wind_speed             1     54.82 5636.7 1032.8
## <none>                               5691.5 1034.3
## + pressure_height        1     24.71 5666.8 1034.8
## + Inversion_temperature  1     23.77 5667.8 1034.8
## + Day_of_week            6    113.25 5578.3 1039.0
## - Temperature_Sandburg   1    129.49 5821.0 1040.6
## + Day_of_month          30    521.20 5170.3 1059.2
## - Humidity               1    731.20 6422.7 1076.6
## - Month                 11   1735.93 7427.5 1109.8
## - Temperature_ElMonte    1   1583.91 7275.4 1122.2
## 
## Step:  AIC=1028.96
## ozone_reading ~ Temperature_Sandburg + Month + Temperature_ElMonte + 
##     Humidity + Pressure_gradient
## 
##                         Df Sum of Sq    RSS    AIC
## + Visibility             1     90.83 5487.0 1025.0
## + Wind_speed             1     49.09 5528.8 1027.7
## + Inversion_base_height  1     44.43 5533.4 1028.0
## <none>                               5577.9 1029.0
## + pressure_height        1     23.03 5554.8 1029.4
## + Inversion_temperature  1      4.56 5573.3 1030.7
## + Day_of_week            6    106.13 5471.7 1033.9
## - Pressure_gradient      1    113.67 5691.5 1034.3
## - Temperature_Sandburg   1    158.47 5736.3 1037.2
## + Day_of_month          30    492.28 5085.6 1055.1
## - Humidity               1    817.92 6395.8 1077.0
## - Temperature_ElMonte    1   1033.34 6611.2 1089.2
## - Month                 11   1810.91 7388.8 1109.9
## 
## Step:  AIC=1024.95
## ozone_reading ~ Temperature_Sandburg + Month + Temperature_ElMonte + 
##     Humidity + Pressure_gradient + Visibility
## 
##                         Df Sum of Sq    RSS    AIC
## + Wind_speed             1     35.66 5451.4 1024.6
## + pressure_height        1     31.20 5455.8 1024.9
## <none>                               5487.0 1025.0
## + Inversion_base_height  1     29.90 5457.1 1025.0
## + Inversion_temperature  1      0.70 5486.3 1026.9
## - Visibility             1     90.83 5577.9 1029.0
## + Day_of_week            6    111.27 5375.8 1029.5
## - Pressure_gradient      1    131.27 5618.3 1031.6
## - Temperature_Sandburg   1    160.10 5647.1 1033.5
## + Day_of_month          30    469.31 5017.7 1052.2
## - Humidity               1    560.14 6047.2 1058.5
## - Temperature_ElMonte    1    848.40 6335.4 1075.6
## - Month                 11   1886.76 7373.8 1111.1
## 
## Step:  AIC=1024.56
## ozone_reading ~ Temperature_Sandburg + Month + Temperature_ElMonte + 
##     Humidity + Pressure_gradient + Visibility + Wind_speed
## 
##                         Df Sum of Sq    RSS    AIC
## + pressure_height        1     49.37 5402.0 1023.2
## <none>                               5451.4 1024.6
## - Wind_speed             1     35.66 5487.0 1025.0
## + Inversion_base_height  1     20.40 5431.0 1025.2
## + Inversion_temperature  1      0.00 5451.4 1026.6
## - Visibility             1     77.40 5528.8 1027.7
## + Day_of_week            6    103.01 5348.4 1029.6
## - Pressure_gradient      1    124.52 5575.9 1030.8
## - Temperature_Sandburg   1    151.32 5602.7 1032.6
## + Day_of_month          30    464.47 4986.9 1052.0
## - Humidity               1    554.48 6005.8 1058.0
## - Temperature_ElMonte    1    825.34 6276.7 1074.2
## - Month                 11   1921.47 7372.8 1113.1
## 
## Step:  AIC=1023.23
## ozone_reading ~ Temperature_Sandburg + Month + Temperature_ElMonte + 
##     Humidity + Pressure_gradient + Visibility + Wind_speed + 
##     pressure_height
## 
##                         Df Sum of Sq    RSS    AIC
## <none>                               5402.0 1023.2
## + Inversion_base_height  1     22.66 5379.3 1023.7
## - pressure_height        1     49.37 5451.4 1024.6
## - Wind_speed             1     53.83 5455.8 1024.9
## + Inversion_temperature  1      2.53 5399.5 1025.1
## - Visibility             1     84.40 5486.4 1026.9
## + Day_of_week            6     95.02 5307.0 1028.7
## - Pressure_gradient      1    121.40 5523.4 1029.4
## - Temperature_Sandburg   1    190.02 5592.0 1033.9
## + Day_of_month          30    468.21 4933.8 1050.0
## - Humidity               1    456.50 5858.5 1050.9
## - Temperature_ElMonte    1    825.63 6227.6 1073.3
## - Month                 11   1624.84 7026.8 1097.5

# Names of the predictors retained by the stepwise search.
# Term labels are used rather than coefficient names: a factor predictor
# (Month enters the model with 11 degrees of freedom) contributes one
# coefficient per non-reference level, so coefficient names would contain
# per-level entries that do not correspond to any variable. Term labels list
# each variable once and never include the intercept, so no "(Intercept)"
# filtering is needed.
shortlistedVars <- attr(terms(stepMod), "term.labels")

#5. Boruta
library(Boruta)

## Loading required package: ranger

# Run Boruta feature selection for ozone_reading.
# The response is named on the left-hand side so that `.` expands to the
# remaining columns only. The previous form `response ~ .` took the response
# from a separate vector, so `.` still included the ozone_reading column and
# the target was offered to Boruta as one of its own predictors (and was
# reported as a confirmed attribute). A run log recorded with the previous
# formula will therefore still mention ozone_reading.
# Naming the column also keeps the response aligned with the rows that
# survive na.omit(). doTrace = 2 prints progress while the search runs.
boruta_output <- Boruta(
  ozone_reading ~ .,
  data = na.omit(inputData),
  doTrace = 2
)

##  1. run of importance source...

##  2. run of importance source...

##  3. run of importance source...

##  4. run of importance source...

##  5. run of importance source...

##  6. run of importance source...

##  7. run of importance source...

##  8. run of importance source...

##  9. run of importance source...

##  10. run of importance source...

##  11. run of importance source...

## After 11 iterations, +3 secs:

##  confirmed 10 attributes: Humidity, Inversion_base_height, Inversion_temperature, Month, ozone_reading and 5 more;

##  rejected 1 attribute: Day_of_week;

##  still have 2 attributes left.

##  12. run of importance source...

##  13. run of importance source...

##  14. run of importance source...

##  15. run of importance source...

## After 15 iterations, +3.8 secs:

##  rejected 1 attribute: Day_of_month;

##  still have 1 attribute left.

##  16. run of importance source...

##  17. run of importance source...

##  18. run of importance source...

##  19. run of importance source...

##  20. run of importance source...

##  21. run of importance source...

##  22. run of importance source...

##  23. run of importance source...

##  24. run of importance source...

##  25. run of importance source...

##  26. run of importance source...

##  27. run of importance source...

##  28. run of importance source...

##  29. run of importance source...

##  30. run of importance source...

##  31. run of importance source...

##  32. run of importance source...

##  33. run of importance source...

##  34. run of importance source...

##  35. run of importance source...

##  36. run of importance source...

##  37. run of importance source...

##  38. run of importance source...

##  39. run of importance source...

##  40. run of importance source...

##  41. run of importance source...

##  42. run of importance source...

##  43. run of importance source...

##  44. run of importance source...

##  45. run of importance source...

##  46. run of importance source...

##  47. run of importance source...

##  48. run of importance source...

##  49. run of importance source...

##  50. run of importance source...

##  51. run of importance source...

##  52. run of importance source...

##  53. run of importance source...

##  54. run of importance source...

##  55. run of importance source...

##  56. run of importance source...

##  57. run of importance source...

##  58. run of importance source...

##  59. run of importance source...

##  60. run of importance source...

##  61. run of importance source...

##  62. run of importance source...

##  63. run of importance source...

##  64. run of importance source...

##  65. run of importance source...

##  66. run of importance source...

##  67. run of importance source...

##  68. run of importance source...

##  69. run of importance source...

##  70. run of importance source...

##  71. run of importance source...

##  72. run of importance source...

##  73. run of importance source...

##  74. run of importance source...

##  75. run of importance source...

##  76. run of importance source...

##  77. run of importance source...

##  78. run of importance source...

##  79. run of importance source...

##  80. run of importance source...

##  81. run of importance source...

##  82. run of importance source...

##  83. run of importance source...

##  84. run of importance source...

##  85. run of importance source...

##  86. run of importance source...

##  87. run of importance source...

##  88. run of importance source...

##  89. run of importance source...

##  90. run of importance source...

##  91. run of importance source...

## After 91 iterations, +20 secs:

##  rejected 1 attribute: Wind_speed;

##  no more attributes left.

# Names of the attributes whose final Boruta decision is "Confirmed" or
# "Tentative".
boruta_signif <- with(
  boruta_output,
  names(finalDecision)[finalDecision %in% c("Confirmed", "Tentative")]
)
boruta_signif

##  [1] "Month"                 "ozone_reading"        
##  [3] "pressure_height"       "Humidity"             
##  [5] "Temperature_Sandburg"  "Temperature_ElMonte"  
##  [7] "Inversion_base_height" "Pressure_gradient"    
##  [9] "Inversion_temperature" "Visibility"

#6. information value and woe
library(woe)
library(riv)

## Loading required package: rrcov

## Loading required package: robustbase

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## The following object is masked from 'package:boot':
## 
##     salinity

## Scalable Robust Estimators with High Breakdown Point (version 1.4-3)

## Loading required package: quantreg

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## 
## Attaching package: 'quantreg'

## The following object is masked from 'package:survival':
## 
##     untangle.specials

# Information value of each variable in german_data with respect to the
# outcome column "gb", requested in summary form (summary = TRUE) with
# progress messages enabled.
iv_df <- iv.mult(
  german_data,
  y = "gb",
  summary = TRUE,
  verbose = TRUE
)

## Started processing of data frame: german_data 
## Calling iv.str for variable: ca_status 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.67 
## Calling iv.num for variable: duration 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 5
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.26 
##   Formatting output
## Calling iv.str for variable: credit_history 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.29 
## Calling iv.str for variable: purpose 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.17 
## Calling iv.num for variable: credit_amount 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 5
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.21 
##   Formatting output
## Calling iv.str for variable: savings 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.2 
## Calling iv.str for variable: present_employment_since 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.09 
## Calling iv.num for variable: installment_rate_income 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 2
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.02 
##   Formatting output
## Calling iv.str for variable: status_sex 
## Assuming good = level 'good' and bad = level 'bad'

## Warning in iv.str(df, x, y, verbose = verbose): Some group for outcome 0 has zero count. This will result in -Inf or Inf WOE. Replacing - ODDS=1, WoE=0, MIV=0. 
##  The bin is either too small or suspiciously predictive. 
##  You should fix this before running any model. It does not make any sense to keep WoE = 0 for such bin.

## Information Value NaN 
## Calling iv.str for variable: other_debtors 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.03 
## Calling iv.num for variable: present_residence_since 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 1
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0 
##   Formatting output
## Calling iv.str for variable: property 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.11 
## Calling iv.num for variable: age 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 5
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.13 
##   Formatting output
## Calling iv.str for variable: other_installment 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.06 
## Calling iv.str for variable: housing 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.08 
## Calling iv.num for variable: existing_credits 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 2
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.01 
##   Formatting output
## Calling iv.str for variable: job 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.01 
## Calling iv.num for variable: liable_maintenance_people 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 1
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0 
##   Formatting output
## Calling iv.str for variable: telephone 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.01 
## Calling iv.str for variable: foreign_worker 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.04 
## Preparing summary

# Same information-value computation for outcome "gb", this time without
# summarising (summary = FALSE); progress messages stay enabled.
iv <- iv.mult(
  german_data,
  y = "gb",
  summary = FALSE,
  verbose = TRUE
)

## Started processing of data frame: german_data 
## Calling iv.str for variable: ca_status 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.67 
## Calling iv.num for variable: duration 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 5
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.26 
##   Formatting output
## Calling iv.str for variable: credit_history 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.29 
## Calling iv.str for variable: purpose 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.17 
## Calling iv.num for variable: credit_amount 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 5
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.21 
##   Formatting output
## Calling iv.str for variable: savings 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.2 
## Calling iv.str for variable: present_employment_since 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.09 
## Calling iv.num for variable: installment_rate_income 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 2
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.02 
##   Formatting output
## Calling iv.str for variable: status_sex 
## Assuming good = level 'good' and bad = level 'bad'

## Warning in iv.str(df, x, y, verbose = verbose): Some group for outcome 0 has zero count. This will result in -Inf or Inf WOE. Replacing - ODDS=1, WoE=0, MIV=0. 
##  The bin is either too small or suspiciously predictive. 
##  You should fix this before running any model. It does not make any sense to keep WoE = 0 for such bin.

## Information Value NaN 
## Calling iv.str for variable: other_debtors 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.03 
## Calling iv.num for variable: present_residence_since 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 1
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0 
##   Formatting output
## Calling iv.str for variable: property 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.11 
## Calling iv.num for variable: age 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 5
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.13 
##   Formatting output
## Calling iv.str for variable: other_installment 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.06 
## Calling iv.str for variable: housing 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.08 
## Calling iv.num for variable: existing_credits 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 2
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0.01 
##   Formatting output
## Calling iv.str for variable: job 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.01 
## Calling iv.num for variable: liable_maintenance_people 
##   Building rpart model
##   Model finished
##   Sending model to tree parser
##   Rules parsed: 1
##   Mapping nodes to data
##     SQL Merge
##     DF Merge
##   Calling iv.str for nodes
## Information Value 0 
##   Formatting output
## Calling iv.str for variable: telephone 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.01 
## Calling iv.str for variable: foreign_worker 
## Assuming good = level 'good' and bad = level 'bad' 
## Information Value 0.04

# Plot the information-value summary.
iv.plot.summary(iv_df)

# Add weight-of-evidence (WOE) values to the data frame, using the
# per-variable results held in `iv`.
german_data_iv <- iv.replace.woe(german_data, iv, verbose = TRUE)

## Var Name: ca_status
## WOE Name: ca_status_woe
## Var Name: duration
## WOE Name: duration_woe
## Var Name: credit_history
## WOE Name: credit_history_woe
## Var Name: purpose
## WOE Name: purpose_woe
## Var Name: credit_amount
## WOE Name: credit_amount_woe
## Var Name: savings
## WOE Name: savings_woe
## Var Name: present_employment_since
## WOE Name: present_employment_since_woe
## Var Name: installment_rate_income
## WOE Name: installment_rate_income_woe
## Var Name: status_sex
## WOE Name: status_sex_woe
## Var Name: other_debtors
## WOE Name: other_debtors_woe
## Var Name: present_residence_since
## WOE Name: present_residence_since_woe
## Var Name: property
## WOE Name: property_woe
## Var Name: age
## WOE Name: age_woe
## Var Name: other_installment
## WOE Name: other_installment_woe
## Var Name: housing
## WOE Name: housing_woe
## Var Name: existing_credits
## WOE Name: existing_credits_woe
## Var Name: job
## WOE Name: job_woe
## Var Name: liable_maintenance_people
## WOE Name: liable_maintenance_people_woe
## Var Name: telephone
## WOE Name: telephone_woe
## Var Name: foreign_worker
## WOE Name: foreign_worker_woe

# ML in R
#
# Kushan De Silva
#
# October 12, 2017