Team members: Priyanka Veeranki, Lixia Chen, Amod Panchal, Ikeahor Ezewele

Load packages

library(mlr)
## Warning: package 'mlr' was built under R version 4.2.1
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 4.2.1
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(parallel)
library(e1071)
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:mlr':
## 
##     impute
library(caTools)
library(parallelMap)
## Warning: package 'parallelMap' was built under R version 4.2.1
library(dplyr)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
packages <-c("randomForestSRC", "party")
for(x in packages){
  library(x,character.only = TRUE)
}
## Warning: package 'randomForestSRC' was built under R version 4.2.1
## 
##  randomForestSRC 3.1.1 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
##  
## 
## 
## Attaching package: 'randomForestSRC'
## 
## The following objects are masked from 'package:e1071':
## 
##     impute, tune
## 
## The following object is masked from 'package:purrr':
## 
##     partial
## 
## The following objects are masked from 'package:mlr':
## 
##     impute, subsample
## Warning: package 'party' was built under R version 4.2.1
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.2.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.1
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.1
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
library(randomForestSRC)
library(party)

#Load train dataset

sbank <- read.csv("C:/Users/Public/train.csv", nrows=5000)
sbank_test <- read.csv("C:/Users/Public/test.csv",  nrows=2000)

Here we load only part of the training data, because hyperparameter tuning for SVM models is computationally expensive on large datasets. Apart from its size, this is the same data set that was used in the first task.

#Dimension of train and test data set

dim(sbank)
## [1] 5000  202
dim(sbank_test)
## [1] 2000  201

#Structure of sbank

str(sbank)
## 'data.frame':    5000 obs. of  202 variables:
##  $ ID_code: chr  "train_0" "train_1" "train_2" "train_3" ...
##  $ target : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ var_0  : num  8.93 11.5 8.61 11.06 9.84 ...
##  $ var_1  : num  -6.79 -4.15 -2.75 -2.15 -1.48 ...
##  $ var_2  : num  11.91 13.86 12.08 8.95 12.87 ...
##  $ var_3  : num  5.09 5.39 7.89 7.2 6.64 ...
##  $ var_4  : num  11.5 12.4 10.6 12.6 12.3 ...
##  $ var_5  : num  -9.28 7.04 -9.08 -1.84 2.45 ...
##  $ var_6  : num  5.12 5.62 6.94 5.84 5.94 ...
##  $ var_7  : num  18.6 16.5 14.6 14.9 19.3 ...
##  $ var_8  : num  -4.92 3.15 -4.92 -5.86 6.27 ...
##  $ var_9  : num  5.75 8.09 5.95 8.24 7.68 ...
##  $ var_10 : num  2.925 -0.403 -0.325 2.306 -9.446 ...
##  $ var_11 : num  3.18 8.06 -11.26 2.81 -12.14 ...
##  $ var_12 : num  14 14 14.2 13.8 13.8 ...
##  $ var_13 : num  0.575 8.414 7.312 11.97 7.889 ...
##  $ var_14 : num  8.8 5.43 7.52 6.46 7.79 ...
##  $ var_15 : num  14.6 13.7 14.6 14.8 15.1 ...
##  $ var_16 : num  5.75 13.83 7.68 10.74 8.49 ...
##  $ var_17 : num  -7.24 -15.58 -1.74 -0.43 -3.07 ...
##  $ var_18 : num  4.28 7.8 4.7 15.94 6.53 ...
##  $ var_19 : num  30.7 28.6 20.5 13.7 11.3 ...
##  $ var_20 : num  10.54 3.43 17.76 20.3 21.42 ...
##  $ var_21 : num  16.22 2.74 18.14 12.56 18.96 ...
##  $ var_22 : num  2.58 8.55 1.21 6.82 10.11 ...
##  $ var_23 : num  2.47 3.37 3.51 2.72 2.71 ...
##  $ var_24 : num  14.38 6.98 5.68 12.14 14.21 ...
##  $ var_25 : num  13.4 13.9 13.2 13.7 13.5 ...
##  $ var_26 : num  -5.149 -11.768 -7.994 0.814 3.174 ...
##  $ var_27 : num  -0.407 -2.559 -2.903 -0.906 -3.342 ...
##  $ var_28 : num  4.93 5.05 5.85 5.91 5.9 ...
##  $ var_29 : num  5.997 0.548 6.144 2.841 7.935 ...
##  $ var_30 : num  -0.308 -9.299 -11.102 -15.24 -3.158 ...
##  $ var_31 : num  12.9 7.88 12.49 10.44 9.47 ...
##  $ var_32 : num  -3.8766 1.2859 -2.2871 -2.5731 -0.0083 ...
##  $ var_33 : num  16.89 19.37 19.04 6.18 19.32 ...
##  $ var_34 : num  11.2 11.4 11 10.6 12.4 ...
##  $ var_35 : num  10.579 0.74 4.109 -5.916 0.633 ...
##  $ var_36 : num  0.676 2.8 4.697 8.172 2.792 ...
##  $ var_37 : num  7.89 5.84 6.93 2.85 5.82 ...
##  $ var_38 : num  4.67 10.82 10.89 9.17 19.3 ...
##  $ var_39 : num  3.874 3.678 0.9 0.666 1.445 ...
##  $ var_40 : num  -5.24 -11.11 -13.52 -3.83 -5.6 ...
##  $ var_41 : num  7.37 1.87 2.24 -1.04 14.07 ...
##  $ var_42 : num  11.58 9.88 11.53 11.78 11.92 ...
##  $ var_43 : num  12 11.8 12 11.3 11.5 ...
##  $ var_44 : num  11.64 1.24 4.1 8.05 6.91 ...
##  $ var_45 : num  -7.02 -47.38 -7.91 -24.68 -65.49 ...
##  $ var_46 : num  5.92 7.37 11.14 12.74 13.87 ...
##  $ var_47 : num  -14.2136 0.1948 -5.7864 -35.1659 0.0444 ...
##  $ var_48 : num  16.028 34.401 20.748 0.761 -0.135 ...
##  $ var_49 : num  5.33 25.7 6.89 8.38 14.43 ...
##  $ var_50 : num  12.9 11.8 12.9 12.7 13.3 ...
##  $ var_51 : num  29.05 13.23 19.59 9.55 10.49 ...
##  $ var_52 : num  -0.694 -4.108 0.727 1.79 -1.437 ...
##  $ var_53 : num  5.17 6.69 6.41 5.21 5.76 ...
##  $ var_54 : num  -0.747 -8.095 9.312 8.091 -8.541 ...
##  $ var_55 : num  14.83 18.6 6.28 12.4 14.15 ...
##  $ var_56 : num  11.3 19.3 15.6 14.5 17 ...
##  $ var_57 : num  5.38 7.01 5.82 6.58 6.18 ...
##  $ var_58 : num  2.02 1.92 1.1 3.32 1.95 ...
##  $ var_59 : num  10.12 8.87 9.19 9.46 9.2 ...
##  $ var_60 : num  16.18 8.01 12.6 15.78 8.66 ...
##  $ var_61 : num  4.96 -7.24 -10.37 -25.02 -27.74 ...
##  $ var_62 : num  2.077 1.794 0.875 3.442 -0.495 ...
##  $ var_63 : num  -0.215 -1.315 5.804 -4.392 -1.784 ...
##  $ var_64 : num  8.67 8.1 3.72 8.65 5.27 ...
##  $ var_65 : num  9.53 1.54 -1.1 6.31 -4.32 ...
##  $ var_66 : num  5.81 5.4 7.37 5.62 6.99 ...
##  $ var_67 : num  22.43 7.93 9.86 23.61 1.62 ...
##  $ var_68 : num  5.01 5.02 5.02 5.02 5.03 ...
##  $ var_69 : num  -4.7 2.23 -5.78 -4 -3.24 ...
##  $ var_70 : num  21.64 40.56 2.36 4.05 40.12 ...
##  $ var_71 : num  0.566 0.513 0.852 0.25 0.774 ...
##  $ var_72 : num  5.2 3.17 6.358 1.252 -0.726 ...
##  $ var_73 : num  8.86 20.11 12.17 24.42 4.59 ...
##  $ var_74 : num  43.11 7.78 19.73 4.53 -4.53 ...
##  $ var_75 : num  18.38 7.05 19.45 15.42 23.35 ...
##  $ var_76 : num  -2.34 3.27 4.5 11.69 1.03 ...
##  $ var_77 : num  23.4 23.5 23.2 23.6 19.2 ...
##  $ var_78 : num  6.52 5.51 6.32 4.08 7.17 ...
##  $ var_79 : num  12.2 13.8 12.8 15.3 14.4 ...
##  $ var_80 : num  13.647 2.546 7.473 0.784 2.96 ...
##  $ var_81 : num  13.8 18.2 15.8 10.5 13.3 ...
##  $ var_82 : num  1.367 0.368 13.353 1.621 -9.259 ...
##  $ var_83 : num  2.94 -4.82 10.19 -5.29 -6.71 ...
##  $ var_84 : num  -4.52 -5.49 5.46 1.6 7.9 ...
##  $ var_85 : num  21.5 13.8 19.1 18 14.5 ...
##  $ var_86 : num  9.32 -13.59 -4.46 -2.32 7.08 ...
##  $ var_87 : num  16.46 11.1 9.54 15.63 20.17 ...
##  $ var_88 : num  8 7.9 11.91 4.55 8.01 ...
##  $ var_89 : num  -1.71 12.23 2.14 7.55 3.8 ...
##  $ var_90 : num  -21.449 0.477 -22.404 -7.587 -39.8 ...
##  $ var_91 : num  6.78 6.89 7.09 7.04 7.01 ...
##  $ var_92 : num  11.09 8.09 14.16 14.4 9.36 ...
##  $ var_93 : num  9.99 10.96 10.51 10.78 10.43 ...
##  $ var_94 : num  14.84 11.76 14.26 7.29 14.06 ...
##  $ var_95 : num  0.1812 -1.2722 0.2647 -1.093 0.0213 ...
##  $ var_96 : num  8.96 24.79 20.4 11.36 14.72 ...
##   [list output truncated]

The statement above shows the structure of the training dataset, so we can check the type of each column before modelling.
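
As a complementary check (a small sketch, not part of the original report output), we can look at the class balance of the target and confirm there are no missing values before building the task:

# Class balance of the target variable in the training subset
table(sbank$target)
prop.table(table(sbank$target))
# Total count of missing values across all columns
sum(is.na(sbank))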

#Creating a task, model and learner with mlr We will use the mlr package, which gathers most machine learning methods in one convenient place.

#Create a task First, we create a classification task on the training data and set the positive class to 1 (which means that the customer made a transaction).

set.seed(48748674)
sbank = sbank %>% sample_n(size = 1000)
sbank1 = sbank %>% mutate_at(vars(one_of("ID_code")), funs( as.factor)) 
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
train_task <- makeClassifTask(data = sbank1, target = "target",positive = "1")
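
To verify the task was created as intended, we can print it and peek at the feature names; this is a small verification sketch and its output is not reproduced in this report:

# Printing the task shows the target, the number of observations and
# features, and the class distribution.
train_task
head(getTaskFeatureNames(train_task))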

#Ranking Task We could also rank and filter the features before training. The filterFeatures() function with abs = 10 would keep the ten most important features and train the model on them only; the step is commented out below because computing cforest importance is slow.

#ranking_task <- filterFeatures(train_task,
#                               method = "party_cforest.importance",
#                               abs = 10)
#ranking_task
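
If feature filtering were revisited, mlr can list the filter methods it supports, and a cheaper built-in filter (for example the simple variance filter) could stand in for cforest importance. The method chosen below is only illustrative, not the approach actually used in this report:

# Filter methods known to mlr (some need extra back-end packages installed)
head(listFilterMethods())
# Illustrative cheaper alternative, kept commented like the chunk above:
# ranking_task <- filterFeatures(train_task, method = "variance", abs = 10)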

#Define a Learner

svm.learner <- makeLearner("classif.svm")
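
If we later wanted probability estimates (for example to evaluate AUC instead of mmce), the learner could instead be created with predict.type = "prob". This is an optional variation, not the learner used below:

# SVM learner that also returns class probabilities
svm.prob.learner <- makeLearner("classif.svm", predict.type = "prob")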

#Tuning hyperparameters We need to tune the hyperparameters before training the final model. We use getParamSet() to see which hyperparameters are available for the algorithm.

getParamSet("classif.svm")
##                        Type  len             Def
## type               discrete    - C-classifica...
## cost                numeric    -               1
## nu                  numeric    -             0.5
## class.weights numericvector <NA>               -
## kernel             discrete    -          radial
## degree              integer    -               3
## coef0               numeric    -               0
## gamma               numeric    -               -
## cachesize           numeric    -              40
## tolerance           numeric    -           0.001
## shrinking           logical    -            TRUE
## cross               integer    -               0
## fitted              logical    -            TRUE
## scale         logicalvector <NA>            TRUE
##                                           Constr Req Tunable Trafo
## type          C-classification,nu-classification   -    TRUE     -
## cost                                    0 to Inf   Y    TRUE     -
## nu                                   -Inf to Inf   Y    TRUE     -
## class.weights                           0 to Inf   -    TRUE     -
## kernel          linear,polynomial,radial,sigmoid   -    TRUE     -
## degree                                  1 to Inf   Y    TRUE     -
## coef0                                -Inf to Inf   Y    TRUE     -
## gamma                                   0 to Inf   Y    TRUE     -
## cachesize                            -Inf to Inf   -    TRUE     -
## tolerance                               0 to Inf   -    TRUE     -
## shrinking                                      -   -    TRUE     -
## cross                                   0 to Inf   -   FALSE     -
## fitted                                         -   -   FALSE     -
## scale                                          -   -    TRUE     -

The output above lists each hyperparameter together with its type and allowed range. Kernel, cost, degree and gamma are the most important ones for us to tune.

#Define the hyperparameter space for tuning To define the hyperparameter space we want to search over, we use makeParamSet(). The kernel hyperparameter is a discrete parameter taking the kernel names, degree is an integer parameter, and cost and gamma are numeric parameters with lower and upper bounds.

kernels <- c("polynomial", "radial", "sigmoid")
svmParamSpace <- makeParamSet(makeDiscreteParam("kernel", values = kernels),
                makeIntegerParam("degree", lower = 1, upper = 3),
                makeNumericParam("cost", lower = 0.1, upper = 10),
                makeNumericParam("gamma", lower = 0.1, 10))

#Define the random search We will use a random search to look for the best hyperparameter combination, because an exhaustive grid search over this space would take a lot of time and computing resources.

randSearch <- makeTuneControlRandom(maxit = 10)
cvForTuning <- makeResampleDesc("Holdout", split=2/3)
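
For comparison, an exhaustive grid search could be defined as shown below. We do not use it here because crossing even three values per numeric parameter with the three kernels already requires far more model fits than ten random draws; this control object is only a sketch and is never passed to the tuner:

# Grid-search alternative (not used in this report)
gridSearch <- makeTuneControlGrid(resolution = 3)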

#Performing hyperparameter tuning

#parallelStartSocket(cpus = detectCores()-1, setup_strategy = #"sequential")
tunedSvmPars <- tuneParams("classif.svm", 
                           task = train_task,
                           resampling = cvForTuning,
                           par.set = svmParamSpace,
                           control = randSearch)
## [Tune] Started tuning learner classif.svm for parameter set:
##            Type len Def                    Constr Req Tunable Trafo
## kernel discrete   -   - polynomial,radial,sigmoid   -    TRUE     -
## degree  integer   -   -                    1 to 3   -    TRUE     -
## cost    numeric   -   -                 0.1 to 10   -    TRUE     -
## gamma   numeric   -   -                 0.1 to 10   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: 1
## [Tune-x] 1: kernel=radial; degree=1; cost=3.57; gamma=5.56
## [Tune-y] 1: mmce.test.mean=0.0868263; time: 0.0 min
## [Tune-x] 2: kernel=radial; degree=2; cost=9.56; gamma=5.54
## [Tune-y] 2: mmce.test.mean=0.0868263; time: 0.0 min
## [Tune-x] 3: kernel=polynomial; degree=1; cost=2.78; gamma=3.3
## [Tune-y] 3: mmce.test.mean=0.1497006; time: 0.0 min
## [Tune-x] 4: kernel=sigmoid; degree=3; cost=0.24; gamma=3.33
## [Tune-y] 4: mmce.test.mean=0.1257485; time: 0.0 min
## [Tune-x] 5: kernel=sigmoid; degree=3; cost=6.12; gamma=4.16
## [Tune-y] 5: mmce.test.mean=0.1556886; time: 0.0 min
## [Tune-x] 6: kernel=radial; degree=2; cost=3.55; gamma=0.899
## [Tune-y] 6: mmce.test.mean=0.0868263; time: 0.0 min
## [Tune-x] 7: kernel=polynomial; degree=1; cost=6.36; gamma=9.45
## [Tune-y] 7: mmce.test.mean=0.1497006; time: 0.0 min
## [Tune-x] 8: kernel=sigmoid; degree=1; cost=4.78; gamma=0.626
## [Tune-y] 8: mmce.test.mean=0.1137725; time: 0.0 min
## [Tune-x] 9: kernel=radial; degree=3; cost=1.11; gamma=4.43
## [Tune-y] 9: mmce.test.mean=0.0868263; time: 0.0 min
## [Tune-x] 10: kernel=radial; degree=1; cost=1.57; gamma=7.93
## [Tune-y] 10: mmce.test.mean=0.0868263; time: 0.0 min
## [Tune] Result: kernel=radial; degree=3; cost=1.11; gamma=4.43 : mmce.test.mean=0.0868263

We can observe that the best-performing hyperparameter values are a radial kernel of degree three with a cost of 1.114311 and a gamma of 4.434755, as shown above.
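
The winning values and the error they achieved can also be read directly from the tuning result object; this short sketch simply re-displays what the log above already reports:

# Best hyperparameter combination found by the random search
tunedSvmPars$x
# Mean misclassification error of that combination on the holdout set
tunedSvmPars$y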

#Training the model

tunedSvm <- setHyperPars(makeLearner("classif.svm"),
                         par.vals = tunedSvmPars$x)
tunedSvmModel <- train(tunedSvm, train_task)
tunedSvmModel
## Model for learner.id=classif.svm; learner.class=classif.svm
## Trained on: task.id = sbank1; obs = 1000; features = 201
## Hyperparameters: kernel=radial,degree=3,cost=1.11,gamma=4.43
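
If needed, the fitted e1071 svm object can be extracted from the mlr wrapper, for example to inspect the number of support vectors; this is an optional sketch whose output is not shown here:

# Underlying e1071::svm fit wrapped by the mlr model
svmFit <- getLearnerModel(tunedSvmModel)
summary(svmFit)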

#Cross validation of SVM Here we cross-validate the whole model-building process by wrapping the learner together with its hyperparameter tuning step. The inner resampling loop reuses the 2/3 holdout split defined above for tuning, and it is nested inside the outer three-fold cross-validation loop created below.

outer <- makeResampleDesc("CV", iters = 3)
svmWrapper <- makeTuneWrapper("classif.svm",
                              resampling = cvForTuning,
                              par.set = svmParamSpace,
                              control = randSearch)
parallelStartSocket(cpus = detectCores()-1,
                    setup_strategy = "sequential")
## Starting parallelization in mode=socket with cpus=7.
cvWithTuning <- resample(svmWrapper, train_task, resampling = outer)
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Resampling: cross-validation
## Measures:             mmce
## Mapping in parallel: mode = socket; level = mlr.resample; cpus = 7; elements = 3.
## 
## Aggregated Result: mmce.test.mean=0.0970162
## 
parallelStop()
## Stopped parallelization. All cleaned up.

#Results of Cross Validation

cvWithTuning
## Resample Result
## Task: sbank1
## Learner: classif.svm.tuned
## Aggr perf: mmce.test.mean=0.0970162
## Runtime: 6.92207

Because the tuning step is wrapped inside the outer resampling loop, this cross-validation gives an honest estimate of how the tuned SVM will perform on new data. Looking at the result, the mean misclassification error is about 0.097, so roughly 90.3 percent of customers are correctly classified as having made a transaction or not. This is similar to the result from the first task, but slightly better, at the cost of more computing resources.
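
To make the headline number explicit, the accuracy can be derived from the aggregated mmce, and a confusion matrix gives a more detailed breakdown of the pooled cross-validated predictions. This sketch uses standard mlr helpers; the exact counts are not reproduced here:

# Accuracy implied by the aggregated misclassification error (about 0.903)
1 - cvWithTuning$aggr["mmce.test.mean"]
# Confusion matrix over the pooled cross-validation predictions
calculateConfusionMatrix(cvWithTuning$pred)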