setwd("C:\\Users\\user\\Desktop\\Blogs\\Publish")
voice=read.csv("voice.csv")
head(voice)
## meanfreq sd median Q25 Q75 IQR
## 1 0.05978099 0.06424127 0.03202691 0.015071489 0.09019344 0.07512195
## 2 0.06600874 0.06731003 0.04022874 0.019413867 0.09266619 0.07325232
## 3 0.07731550 0.08382942 0.03671846 0.008701057 0.13190802 0.12320696
## 4 0.15122809 0.07211059 0.15801119 0.096581728 0.20795525 0.11137352
## 5 0.13512039 0.07914610 0.12465623 0.078720218 0.20604493 0.12732471
## 6 0.13278641 0.07955687 0.11908985 0.067957993 0.20959160 0.14163361
## skew kurt sp.ent sfm mode centroid
## 1 12.863462 274.402905 0.8933694 0.4919178 0.00000000 0.05978099
## 2 22.423285 634.613855 0.8921932 0.5137238 0.00000000 0.06600874
## 3 30.757155 1024.927705 0.8463891 0.4789050 0.00000000 0.07731550
## 4 1.232831 4.177296 0.9633225 0.7272318 0.08387818 0.15122809
## 5 1.101174 4.333713 0.9719551 0.7835681 0.10426140 0.13512039
## 6 1.932562 8.308895 0.9631813 0.7383070 0.11255543 0.13278641
## meanfun minfun maxfun meandom mindom maxdom
## 1 0.08427911 0.01570167 0.2758621 0.007812500 0.0078125 0.0078125
## 2 0.10793655 0.01582591 0.2500000 0.009014423 0.0078125 0.0546875
## 3 0.09870626 0.01565558 0.2711864 0.007990057 0.0078125 0.0156250
## 4 0.08896485 0.01779755 0.2500000 0.201497396 0.0078125 0.5625000
## 5 0.10639785 0.01693122 0.2666667 0.712812500 0.0078125 5.4843750
## 6 0.11013192 0.01711230 0.2539683 0.298221983 0.0078125 2.7265625
## dfrange modindx label
## 1 0.0000000 0.00000000 male
## 2 0.0468750 0.05263158 male
## 3 0.0078125 0.04651163 male
## 4 0.5546875 0.24711908 male
## 5 5.4765625 0.20827389 male
## 6 2.7187500 0.12515964 male
dim(voice)
## [1] 3168 21
colSums(is.na(voice))
## meanfreq sd median Q25 Q75 IQR skew kurt
## 0 0 0 0 0 0 0 0
## sp.ent sfm mode centroid meanfun minfun maxfun meandom
## 0 0 0 0 0 0 0 0
## mindom maxdom dfrange modindx label
## 0 0 0 0 0
table(voice$label)
##
## female male
## 1584 1584
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Warning: package 'car' was built under R version 3.4.3
library(DiscriMiner)
## Warning: package 'DiscriMiner' was built under R version 3.4.2
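#Note: a set.seed() call before createDataPartition() would make this
#train/test split reproducible across runs.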
trainIndex=createDataPartition(voice$label, p=0.75, list = FALSE,times = 1)
voice.train=voice[trainIndex,]
voice.test=voice[-trainIndex,]
voice.train.x=voice[trainIndex,1:20]
voice.train.y=voice[trainIndex,21]
voice.test.x=voice[-trainIndex,1:20]
voice.test.y=voice[-trainIndex,21]
###Random Forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.4.4
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.4.4
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
voice.BT=Boruta(label~., data = voice, doTrace=2, ntree=500)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +39 secs:
## confirmed 20 attributes: centroid, dfrange, IQR, kurt, maxdom and 15 more;
## no more attributes left.
plot(voice.BT, xlab = "", xaxt = "n", main="Variable Importance")
k <- lapply(1:ncol(voice.BT$ImpHistory), function(i)
  voice.BT$ImpHistory[is.finite(voice.BT$ImpHistory[,i]), i])
names(k) <- colnames(voice.BT$ImpHistory)
Labels <- sort(sapply(k, median))
axis(side = 1, las = 2, labels = names(Labels),
     at = 1:ncol(voice.BT$ImpHistory), cex.axis = 0.7)

getSelectedAttributes(voice.BT)
## [1] "meanfreq" "sd" "median" "Q25" "Q75" "IQR"
## [7] "skew" "kurt" "sp.ent" "sfm" "mode" "centroid"
## [13] "meanfun" "minfun" "maxfun" "meandom" "mindom" "maxdom"
## [19] "dfrange" "modindx"
attStats(voice.BT)
## meanImp medianImp minImp maxImp normHits decision
## meanfreq 15.62322 15.61245 14.57645 16.87907 1 Confirmed
## sd 21.56208 21.43479 20.28532 23.02301 1 Confirmed
## median 15.22047 15.11179 14.26540 16.21802 1 Confirmed
## Q25 28.32153 28.58427 26.94125 29.57518 1 Confirmed
## Q75 15.63257 15.58216 14.50351 16.86030 1 Confirmed
## IQR 38.77793 38.02985 37.01546 41.81356 1 Confirmed
## skew 12.98145 13.20309 11.46838 13.84315 1 Confirmed
## kurt 12.84411 12.81247 11.52440 14.20167 1 Confirmed
## sp.ent 16.96122 16.60883 15.91170 18.26139 1 Confirmed
## sfm 18.30454 18.28308 16.95814 19.12684 1 Confirmed
## mode 13.43629 13.52252 12.62999 14.05887 1 Confirmed
## centroid 15.25943 15.46352 14.21894 16.03348 1 Confirmed
## meanfun 66.64167 66.70269 63.27066 70.69585 1 Confirmed
## minfun 13.88203 13.84892 12.13081 15.14099 1 Confirmed
## maxfun 11.84117 11.86825 11.09517 12.66145 1 Confirmed
## meandom 13.88778 13.92948 12.89561 14.62034 1 Confirmed
## mindom 12.54708 12.38866 11.37744 13.57965 1 Confirmed
## maxdom 14.46009 14.29613 13.07611 15.71666 1 Confirmed
## dfrange 14.64478 15.00449 12.58160 16.48005 1 Confirmed
## modindx 15.29686 15.23233 14.28087 16.39455 1 Confirmed
set.seed(1234)
ctrl=trainControl(method = "repeatedcv", number = 10, repeats = 1)
set.seed(1234)
#Grid search over mtry (the number of predictors sampled at each split)
#to obtain the best parameters for the random forest model
tunegrid <- expand.grid(.mtry=seq(2,18,1))
rf_gridsearch <- train(label~., data = voice.train, method = "rf",
                       tuneGrid = tunegrid, trControl = ctrl)
print(rf_gridsearch)
## Random Forest
##
## 2376 samples
## 20 predictor
## 2 classes: 'female', 'male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 2139, 2138, 2139, 2138, 2139, 2138, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9772755 0.9545490
## 3 0.9768553 0.9537090
## 4 0.9772755 0.9545490
## 5 0.9781176 0.9562336
## 6 0.9772755 0.9545494
## 7 0.9772737 0.9545461
## 8 0.9776956 0.9553902
## 9 0.9760132 0.9520251
## 10 0.9760150 0.9520289
## 11 0.9764369 0.9528730
## 12 0.9764369 0.9528730
## 13 0.9747491 0.9494969
## 14 0.9751711 0.9503409
## 15 0.9760132 0.9520254
## 16 0.9743307 0.9486604
## 17 0.9747509 0.9495008
## 18 0.9751729 0.9503445
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
plot(rf_gridsearch)

#The grid search selected mtry = 5; mtry = 6 performed almost identically
#and is used for the final model
voice.RF=randomForest(label~., data = voice.train, mtry = 6, importance = TRUE)
plot(voice.RF, main="")
legend("topright", c("OOB", "female", "male"), text.col = 1:3, lty = 1:3, col = 1:3)
title(main="Error Rates Random Forest-Final")

impVar <- round(randomForest::importance(voice.RF), 2)
impVar[order(impVar[,3], decreasing=TRUE),]
## female male MeanDecreaseAccuracy MeanDecreaseGini
## meanfun 49.28 80.75 87.51 512.30
## IQR 20.67 42.89 44.42 250.42
## Q25 19.42 20.03 26.70 161.16
## modindx 17.33 8.46 18.77 7.47
## sd 14.37 12.79 17.84 82.92
## sfm 14.80 12.86 17.68 30.73
## Q75 10.30 11.58 15.86 9.98
## maxdom 12.27 8.68 14.96 8.91
## minfun 10.54 11.26 14.62 10.61
## dfrange 12.09 8.00 14.54 8.07
## meanfreq 9.45 9.67 13.76 11.64
## sp.ent 12.43 8.25 13.73 30.26
## median 7.60 10.10 12.61 8.93
## meandom 10.45 7.66 12.46 6.46
## centroid 9.07 8.44 11.90 14.35
## skew 8.53 6.09 10.06 7.71
## mode 6.95 7.94 9.80 10.73
## maxfun 8.24 5.86 9.76 3.66
## mindom 6.04 7.52 8.82 5.41
## kurt 9.60 4.50 8.47 5.77
voice.pred.class=predict(voice.RF, voice.test, type = "class")
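#Note: caret's confusionMatrix(data, reference) expects the predictions first
#and the true labels second. Here (and in the later models) the order is
#reversed, which transposes the table: Accuracy is unaffected, but the values
#reported as Sensitivity/Specificity are effectively predictive values.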
CM=confusionMatrix(voice.test[,21],voice.pred.class)
fourfoldplot(CM$table)

Accuracy_RF=CM$overall[1]
Sensitivity_RF=CM$byClass[1]
Specificity_RF=CM$byClass[2]
voice.prediction=prediction(as.numeric(voice.pred.class),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_RF=auc@y.values[[1]]
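#A possible refinement (sketch, not part of the original analysis): building
#the ROC curve from the forest's class probabilities rather than hard class
#labels traces a full curve instead of a single operating point. voice.prob
#and voice.prob.pred are new, illustrative names.
voice.prob=predict(voice.RF, voice.test, type = "prob")[,"male"]
voice.prob.pred=prediction(voice.prob, voice.test$label)
plot(performance(voice.prob.pred,"tpr","fpr"))
performance(voice.prob.pred, measure = "auc")@y.values[[1]]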
##################
#Build an SVM model using a linear kernel
###Tuning the parameter C for an optimized model
grid=expand.grid(C = c(0.01, 0.02,0.05, 0.075, 0.1, 0.25, 0.5, 1, 1.25, 1.5, 1.75, 2,5))
set.seed(1234)
voice.SVM_Lin=train(voice.train.x, voice.train.y,
                    tuneGrid = grid, trControl = ctrl,
                    preProcess = c("scale","center"),
                    method = 'svmLinear')
voice.SVM_Lin$bestTune
## C
## 9 1.25
plot(voice.SVM_Lin)

voice.pred=predict(voice.SVM_Lin, voice.test.x)
voice.prediction=prediction(as.numeric(voice.pred),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_SVM_Lin=auc@y.values[[1]]
CM=confusionMatrix(voice.test[,21],voice.pred)
fourfoldplot(CM$table)

Accuracy_SVM_Lin=CM$overall[1]
Sensitivity_SVM_Lin=CM$byClass[1]
Specificity_SVM_Lin=CM$byClass[2]
#######
#Build an SVM model using a polynomial kernel
###Tuning the parameters C and degree for an optimized model
grid=expand.grid(C = c(0.01, 0.02, 0.05, 0.075, 0.1,
                       0.25, 0.5, 1, 1.25, 1.5, 1.75, 2, 5),
                 degree = c(2,3), scale = 1)
set.seed(12345)
voice.SVM_Pol=train(voice.train.x, voice.train.y,
                    tuneGrid = grid, trControl = ctrl,
                    method = 'svmPoly')
voice.SVM_Pol$bestTune
## degree scale C
## 3 2 1 0.02
plot(voice.SVM_Pol)

voice.pred=predict(voice.SVM_Pol, voice.test.x)
voice.prediction=prediction(as.numeric(voice.pred),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_SVM_Pol=auc@y.values[[1]]
CM=confusionMatrix(voice.test[,21],voice.pred)
fourfoldplot(CM$table)

Accuracy_SVM_Pol=CM$overall[1]
Sensitivity_SVM_Pol=CM$byClass[1]
Specificity_SVM_Pol=CM$byClass[2]
#######
#Build an SVM model using a radial (RBF) kernel
###Tuning the parameters C and sigma for an optimized model
grid=expand.grid(C = c(0.01, 0.1, 0.20, 0.5),
                 sigma = c(0.005, 0.01, 0.02, 0.025))
set.seed(123465)
voice.SVM_Rad=train(voice.train.x, voice.train.y,
                    tuneGrid = grid, trControl = ctrl,
                    method = 'svmRadial')
voice.SVM_Rad$bestTune
## sigma C
## 16 0.025 0.5
plot(voice.SVM_Rad)

voice.pred=predict(voice.SVM_Rad, voice.test.x)
voice.prediction=prediction(as.numeric(voice.pred),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_SVM_Rad=auc@y.values[[1]]
CM=confusionMatrix(voice.test[,21],voice.pred)
fourfoldplot(CM$table)

Accuracy_SVM_Rad=CM$overall[1]
Sensitivity_SVM_Rad=CM$byClass[1]
Specificity_SVM_Rad=CM$byClass[2]
###############################################
#Building a Logistic Regression Model
str(voice.train)
## 'data.frame': 2376 obs. of 21 variables:
## $ meanfreq: num 0.0598 0.066 0.1512 0.1351 0.1328 ...
## $ sd : num 0.0642 0.0673 0.0721 0.0791 0.0796 ...
## $ median : num 0.032 0.0402 0.158 0.1247 0.1191 ...
## $ Q25 : num 0.0151 0.0194 0.0966 0.0787 0.068 ...
## $ Q75 : num 0.0902 0.0927 0.208 0.206 0.2096 ...
## $ IQR : num 0.0751 0.0733 0.1114 0.1273 0.1416 ...
## $ skew : num 12.86 22.42 1.23 1.1 1.93 ...
## $ kurt : num 274.4 634.61 4.18 4.33 8.31 ...
## $ sp.ent : num 0.893 0.892 0.963 0.972 0.963 ...
## $ sfm : num 0.492 0.514 0.727 0.784 0.738 ...
## $ mode : num 0 0 0.0839 0.1043 0.1126 ...
## $ centroid: num 0.0598 0.066 0.1512 0.1351 0.1328 ...
## $ meanfun : num 0.0843 0.1079 0.089 0.1064 0.1101 ...
## $ minfun : num 0.0157 0.0158 0.0178 0.0169 0.0171 ...
## $ maxfun : num 0.276 0.25 0.25 0.267 0.254 ...
## $ meandom : num 0.00781 0.00901 0.2015 0.71281 0.29822 ...
## $ mindom : num 0.00781 0.00781 0.00781 0.00781 0.00781 ...
## $ maxdom : num 0.00781 0.05469 0.5625 5.48438 2.72656 ...
## $ dfrange : num 0 0.0469 0.5547 5.4766 2.7188 ...
## $ modindx : num 0 0.0526 0.2471 0.2083 0.1252 ...
## $ label : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
voice.train$label=factor(ifelse(voice.train$label=="male",1,0))
library(corrplot)
M=cor(na.omit(voice.train[,-21]))
corrplot(M, method = "circle", type = "lower",
         tl.srt = 45, tl.col = "black", tl.cex = 0.75)

#meanfreq and centroid are identical (correlation of 1), so centroid can be removed.
#median and Q25 are also highly correlated with meanfreq.
#maxdom and dfrange are highly correlated, so one of them should be removed.
#Removing centroid, median, dfrange and Q25 reduces multicollinearity.
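#A programmatic alternative (sketch): caret's findCorrelation() flags the
#predictors whose pairwise correlation exceeds a cutoff, using the correlation
#matrix M computed above. highCorr is a new, illustrative name.
highCorr=findCorrelation(M, cutoff = 0.9, names = TRUE)
highCorr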
voice.train.Cleaned=voice.train[,-c(3,4,12,19)]
voice.test.Cleaned=voice.test[,-c(3,4,12,19)]
#Build the model and check for multicollinearity using VIF
LogMod1=glm(label~.,data = voice.train.Cleaned,family = 'binomial')
summary(LogMod1)
##
## Call:
## glm(formula = label ~ ., family = "binomial", data = voice.train.Cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1161 -0.0390 0.0002 0.1129 4.2282
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.700e+01 1.020e+01 -1.667 0.095603 .
## meanfreq -3.519e+01 2.486e+01 -1.416 0.156880
## sd 2.760e+00 3.472e+01 0.079 0.936655
## Q75 2.165e+01 2.038e+01 1.063 0.287931
## IQR 4.790e+01 1.073e+01 4.466 7.98e-06 ***
## skew 2.272e-01 1.806e-01 1.258 0.208389
## kurt -9.783e-03 4.777e-03 -2.048 0.040557 *
## sp.ent 4.562e+01 1.143e+01 3.991 6.59e-05 ***
## sfm -1.261e+01 2.885e+00 -4.372 1.23e-05 ***
## mode 5.176e+00 2.483e+00 2.085 0.037062 *
## meanfun -1.661e+02 9.974e+00 -16.650 < 2e-16 ***
## minfun 3.803e+01 1.044e+01 3.642 0.000271 ***
## maxfun -2.606e+00 7.684e+00 -0.339 0.734535
## meandom 1.937e-01 4.740e-01 0.409 0.682773
## mindom -1.025e+00 2.376e+00 -0.431 0.666135
## maxdom -1.762e-02 7.477e-02 -0.236 0.813708
## modindx -3.627e+00 1.809e+00 -2.006 0.044902 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3293.84 on 2375 degrees of freedom
## Residual deviance: 439.66 on 2359 degrees of freedom
## AIC: 473.66
##
## Number of Fisher Scoring iterations: 8
options(scipen = 9999)
car::vif(LogMod1)
## meanfreq sd Q75 IQR skew kurt sp.ent
## 34.660781 14.485075 17.882649 8.774593 42.638715 34.322152 8.652440
## sfm mode meanfun minfun maxfun meandom mindom
## 14.389097 2.441855 1.438337 1.858984 1.566805 4.151739 1.317469
## maxdom modindx
## 4.023023 2.065868
#Variables with very high VIF (greater than 10) could be removed
#to eliminate multicollinearity.
#meanfreq, sd, Q75, IQR, skew, kurt and sfm could be removed while building the model
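#An automated alternative (sketch, assuming the data frames built above):
#iteratively refit the model and drop the predictor with the highest VIF
#until every VIF falls below 10. vif.prune is a new, illustrative name.
vif.prune=voice.train.Cleaned
repeat {
  fit=glm(label~., data = vif.prune, family = 'binomial')
  v=car::vif(fit)
  if (max(v) < 10) break
  vif.prune=vif.prune[, names(vif.prune) != names(which.max(v))]
}
names(vif.prune)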
names(voice.train.Cleaned)
## [1] "meanfreq" "sd" "Q75" "IQR" "skew" "kurt"
## [7] "sp.ent" "sfm" "mode" "meanfun" "minfun" "maxfun"
## [13] "meandom" "mindom" "maxdom" "modindx" "label"
voice.train.Cleaned=voice.train.Cleaned[,-c(1:6,8)]
voice.test.Cleaned=voice.test.Cleaned[,-c(1:6,8)]
LogMod2=glm(label~.,data = voice.train.Cleaned,family = 'binomial')
summary(LogMod2)
##
## Call:
## glm(formula = label ~ ., family = "binomial", data = voice.train.Cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1036 -0.0781 0.0005 0.1469 3.7806
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.415551 3.926999 0.360 0.71850
## sp.ent 22.868141 3.972221 5.757 0.0000000085613 ***
## mode 4.730990 1.675405 2.824 0.00475 **
## meanfun -181.593289 9.469489 -19.177 < 0.0000000000000002 ***
## minfun 54.822629 8.132723 6.741 0.0000000000157 ***
## maxfun 6.449256 5.686845 1.134 0.25677
## meandom -0.007311 0.351213 -0.021 0.98339
## mindom -4.165008 1.871228 -2.226 0.02603 *
## maxdom -0.053264 0.057243 -0.930 0.35211
## modindx -2.881131 1.443607 -1.996 0.04596 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3293.84 on 2375 degrees of freedom
## Residual deviance: 613.86 on 2366 degrees of freedom
## AIC: 633.86
##
## Number of Fisher Scoring iterations: 8
options(scipen = 9999)
car::vif(LogMod2)
## sp.ent mode meanfun minfun maxfun meandom mindom maxdom
## 1.310424 1.364971 1.237013 1.691014 1.487267 3.283409 1.155321 3.666829
## modindx
## 1.824442
#All VIFs are now below 4, so multicollinearity is under control.
#Search for the best model by stepwise removal of non-significant variables
step(LogMod2, direction = "both")
## Start: AIC=633.86
## label ~ sp.ent + mode + meanfun + minfun + maxfun + meandom +
## mindom + maxdom + modindx
##
## Df Deviance AIC
## - meandom 1 613.86 631.86
## - maxdom 1 614.72 632.72
## - maxfun 1 615.13 633.13
## <none> 613.86 633.86
## - modindx 1 617.71 635.71
## - mindom 1 618.88 636.88
## - mode 1 621.87 639.87
## - sp.ent 1 648.24 666.24
## - minfun 1 652.75 670.75
## - meanfun 1 2538.99 2556.99
##
## Step: AIC=631.86
## label ~ sp.ent + mode + meanfun + minfun + maxfun + mindom +
## maxdom + modindx
##
## Df Deviance AIC
## - maxfun 1 615.17 631.17
## - maxdom 1 615.66 631.66
## <none> 613.86 631.86
## + meandom 1 613.86 633.86
## - modindx 1 618.35 634.35
## - mindom 1 619.07 635.07
## - mode 1 622.02 638.02
## - sp.ent 1 648.36 664.36
## - minfun 1 655.38 671.38
## - meanfun 1 2541.44 2557.44
##
## Step: AIC=631.17
## label ~ sp.ent + mode + meanfun + minfun + mindom + maxdom +
## modindx
##
## Df Deviance AIC
## - maxdom 1 616.42 630.42
## <none> 615.17 631.17
## + maxfun 1 613.86 631.86
## + meandom 1 615.13 633.13
## - modindx 1 621.13 635.13
## - mindom 1 621.47 635.47
## - mode 1 623.03 637.03
## - sp.ent 1 651.75 665.75
## - minfun 1 659.22 673.22
## - meanfun 1 2614.88 2628.88
##
## Step: AIC=630.42
## label ~ sp.ent + mode + meanfun + minfun + mindom + modindx
##
## Df Deviance AIC
## <none> 616.42 630.42
## + maxdom 1 615.17 631.17
## + maxfun 1 615.66 631.66
## + meandom 1 615.96 631.96
## - modindx 1 621.18 633.18
## - mindom 1 622.90 634.90
## - mode 1 623.30 635.30
## - sp.ent 1 653.65 665.65
## - minfun 1 660.71 672.71
## - meanfun 1 2636.32 2648.32
##
## Call: glm(formula = label ~ sp.ent + mode + meanfun + minfun + mindom +
## modindx, family = "binomial", data = voice.train.Cleaned)
##
## Coefficients:
## (Intercept) sp.ent mode meanfun minfun
## 2.308 23.556 4.232 -181.467 53.439
## mindom modindx
## -4.603 -2.704
##
## Degrees of Freedom: 2375 Total (i.e. Null); 2369 Residual
## Null Deviance: 3294
## Residual Deviance: 616.4 AIC: 630.4
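#Note: step() settled on a model without meandom, maxfun and maxdom; meandom is
#nevertheless retained in the final model below, where it remains non-significant.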
LogModF=glm(formula = label ~ sp.ent + mode + meanfun + minfun + meandom +
              mindom + modindx, family = "binomial", data = voice.train.Cleaned)
summary(LogModF)
##
## Call:
## glm(formula = label ~ sp.ent + mode + meanfun + minfun + meandom +
## mindom + modindx, family = "binomial", data = voice.train.Cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0932 -0.0781 0.0005 0.1454 3.7725
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.3963 3.7802 0.634 0.52614
## sp.ent 23.4285 3.9341 5.955 0.00000000259602 ***
## mode 4.5025 1.6644 2.705 0.00683 **
## meanfun -181.0224 9.4284 -19.200 < 0.0000000000000002 ***
## minfun 55.3009 7.9480 6.958 0.00000000000345 ***
## meandom -0.1581 0.2343 -0.675 0.49981
## mindom -4.4516 1.8365 -2.424 0.01535 *
## modindx -2.7356 1.2229 -2.237 0.02529 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3293.84 on 2375 degrees of freedom
## Residual deviance: 615.96 on 2368 degrees of freedom
## AIC: 631.96
##
## Number of Fisher Scoring iterations: 8
###Prediction
voice.pred=predict(LogModF, voice.test.Cleaned, type = "response")
voice.class=ifelse(voice.pred>0.5,"male","female")
voice.prediction=prediction(as.numeric(voice.pred),voice.test.Cleaned$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_Log=auc@y.values[[1]]
CM=confusionMatrix(voice.test.Cleaned$label,as.factor(voice.class))
fourfoldplot(CM$table)

Accuracy_Log=CM$overall[1]
Sensitivity_Log=CM$byClass[1]
Specificity_Log=CM$byClass[2]
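#Sketch (not part of the original analysis): the 0.5 cutoff is a default; a
#cutoff maximizing Youden's J (TPR - FPR) can be read off the ROCR objects
#computed above. cutoffs, J and best.cutoff are new, illustrative names.
cutoffs=voice.perf@alpha.values[[1]]
J=voice.perf@y.values[[1]] - voice.perf@x.values[[1]]
best.cutoff=cutoffs[which.max(J)]
best.cutoff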
####################################
#Building a Linear Discriminant Model
#Since multicollinearity affects a linear discriminant model just as it does
#the logistic model, the same cleaned data set is used as in the logistic model
library(MASS)
library(DiscriMiner)
DiscM=lda(label~.,data = voice.train.Cleaned)
names(voice.train.Cleaned)
## [1] "sp.ent" "mode" "meanfun" "minfun" "maxfun" "meandom" "mindom"
## [8] "maxdom" "modindx" "label"
M=manova(as.matrix(voice.train.Cleaned[,c(1:9)])
         ~as.matrix(voice.train.Cleaned[,10]))
summary(M)
## Df Pillai approx F num Df den Df
## as.matrix(voice.train.Cleaned[, 10]) 1 0.73632 734.12 9 2366
## Residuals 2374
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(M)
## Response sp.ent :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 1.1617 1.16169 740.51
## Residuals 2374 3.7242 0.00157
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response mode :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.4641 0.46413 79.238
## Residuals 2374 13.9056 0.00586
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response meanfun :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 1.68957 1.68957 5288.5
## Residuals 2374 0.75844 0.00032
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response minfun :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.01460 0.014595 43.304
## Residuals 2374 0.80014 0.000337
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) 0.00000000005748 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response maxfun :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.05365 0.053653 61.222
## Residuals 2374 2.08050 0.000876
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) 0.000000000000007613 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response meandom :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 23.52 23.52 87.103
## Residuals 2374 641.03 0.27
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response mindom :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.4385 0.43847 112.31
## Residuals 2374 9.2687 0.00390
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response maxdom :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 1113.8 1113.77 93.672
## Residuals 2374 28226.9 11.89
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response modindx :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) 1 0.029 0.029487 2.1534 0.1424
## Residuals 2374 32.508 0.013693
###Prediction
voice.pred=predict(DiscM, voice.test.Cleaned)
voice.prediction=prediction(as.numeric(voice.pred$class),voice.test.Cleaned$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_LDA=auc@y.values[[1]]
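#Sketch: an AUC from the LDA posterior probabilities (column 2 corresponds to
#level "1" = male, since the training labels were recoded to 0/1) rather than
#from the hard class labels. lda.prob and lda.pred are new, illustrative names.
lda.prob=voice.pred$posterior[,2]
lda.pred=prediction(lda.prob, voice.test.Cleaned$label)
performance(lda.pred, measure = "auc")@y.values[[1]]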
levels(voice.test.Cleaned$label)=c(0,1)
CM=confusionMatrix(factor(voice.test.Cleaned$label),(voice.pred$class))
fourfoldplot(CM$table)

Accuracy_LDA=CM$overall[1]
Sensitivity_LDA=CM$byClass[1]
Specificity_LDA=CM$byClass[2]
########################################
Compare=data.frame(Classifier=c("RF","SVM_Lin","SVM_Pol","SVM_Rad","Logistic","LDA"),
                   Acc=c(Accuracy_RF,Accuracy_SVM_Lin,Accuracy_SVM_Pol,Accuracy_SVM_Rad,Accuracy_Log,Accuracy_LDA),
                   Sensitivity=c(Sensitivity_RF,Sensitivity_SVM_Lin,Sensitivity_SVM_Pol,Sensitivity_SVM_Rad,Sensitivity_Log,Sensitivity_LDA),
                   Specificity=c(Specificity_RF,Specificity_SVM_Lin,Specificity_SVM_Pol,Specificity_SVM_Rad,Specificity_Log,Specificity_LDA),
                   AUC_All=c(AUC_RF,AUC_SVM_Lin,AUC_SVM_Pol,AUC_SVM_Rad,AUC_Log,AUC_LDA))
Compare
## Classifier Acc Sensitivity Specificity AUC_All
## 1 RF 0.9848485 0.9873096 0.9824121 0.9848485
## 2 SVM_Lin 0.9823232 0.9798995 0.9847716 0.9823232
## 3 SVM_Pol 0.9785354 0.9773300 0.9797468 0.9785354
## 4 SVM_Rad 0.9747475 0.9771574 0.9723618 0.9747475
## 5 Logistic 0.9722222 0.9746193 0.9698492 0.9911297
## 6 LDA 0.9671717 0.9842932 0.9512195 0.9671717
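#Note: the logistic AUC was computed from predicted probabilities, while the
#other AUCs were computed from hard class labels; a label-based "AUC" collapses
#to balanced accuracy, which is why those values simply mirror the accuracies.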
library(reshape)
## Warning: package 'reshape' was built under R version 3.4.3
ggplot(melt(Compare, id.vars = "Classifier"), aes(Classifier, value, col=variable, group=variable))+
  geom_line()+
  geom_point(size=4, shape=21, fill="white")+
  labs(x="", y="Values", title="Evaluation Metric Comparison", color="Metrics")+
  theme(legend.key = element_rect(colour = "black", fill = "lightblue"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 15, hjust = 0.5))

ggplot(melt(Compare, id.vars = "Classifier"), aes(x=variable, y=value, fill=Classifier))+
  geom_bar(stat = "identity", position = "dodge")+
  coord_flip()+
  labs(x="", y="Values", title="Evaluation Metric Comparison", fill="Classifier")+
  theme(legend.key = element_rect(colour = "black", fill = "lightblue"),
        axis.text.y = element_text(size = 10, hjust = 1, face = "bold"),
        plot.title = element_text(size = 15, hjust = 0.5),
        legend.key.size = unit(0.5,"cm"),
        legend.position = "bottom",
        legend.background = element_rect(fill="grey"))
