setwd("C:\\Users\\user\\Desktop\\Blogs\\Publish")
voice=read.csv("voice.csv")
head(voice)
## meanfreq sd median Q25 Q75 IQR
## 1 0.05978099 0.06424127 0.03202691 0.015071489 0.09019344 0.07512195
## 2 0.06600874 0.06731003 0.04022874 0.019413867 0.09266619 0.07325232
## 3 0.07731550 0.08382942 0.03671846 0.008701057 0.13190802 0.12320696
## 4 0.15122809 0.07211059 0.15801119 0.096581728 0.20795525 0.11137352
## 5 0.13512039 0.07914610 0.12465623 0.078720218 0.20604493 0.12732471
## 6 0.13278641 0.07955687 0.11908985 0.067957993 0.20959160 0.14163361
## skew kurt sp.ent sfm mode centroid
## 1 12.863462 274.402905 0.8933694 0.4919178 0.00000000 0.05978099
## 2 22.423285 634.613855 0.8921932 0.5137238 0.00000000 0.06600874
## 3 30.757155 1024.927705 0.8463891 0.4789050 0.00000000 0.07731550
## 4 1.232831 4.177296 0.9633225 0.7272318 0.08387818 0.15122809
## 5 1.101174 4.333713 0.9719551 0.7835681 0.10426140 0.13512039
## 6 1.932562 8.308895 0.9631813 0.7383070 0.11255543 0.13278641
## meanfun minfun maxfun meandom mindom maxdom
## 1 0.08427911 0.01570167 0.2758621 0.007812500 0.0078125 0.0078125
## 2 0.10793655 0.01582591 0.2500000 0.009014423 0.0078125 0.0546875
## 3 0.09870626 0.01565558 0.2711864 0.007990057 0.0078125 0.0156250
## 4 0.08896485 0.01779755 0.2500000 0.201497396 0.0078125 0.5625000
## 5 0.10639785 0.01693122 0.2666667 0.712812500 0.0078125 5.4843750
## 6 0.11013192 0.01711230 0.2539683 0.298221983 0.0078125 2.7265625
## dfrange modindx label
## 1 0.0000000 0.00000000 male
## 2 0.0468750 0.05263158 male
## 3 0.0078125 0.04651163 male
## 4 0.5546875 0.24711908 male
## 5 5.4765625 0.20827389 male
## 6 2.7187500 0.12515964 male
dim(voice)
## [1] 3168 21
colSums(is.na(voice))
## meanfreq sd median Q25 Q75 IQR skew kurt
## 0 0 0 0 0 0 0 0
## sp.ent sfm mode centroid meanfun minfun maxfun meandom
## 0 0 0 0 0 0 0 0
## mindom maxdom dfrange modindx label
## 0 0 0 0 0
table(voice$label)
##
## female male
## 1584 1584
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Warning: package 'car' was built under R version 3.4.3
library(DiscriMiner)
## Warning: package 'DiscriMiner' was built under R version 3.4.2
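#Note: a set.seed() call before createDataPartition() would make this
#train/test split reproducible across runs.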
trainIndex=createDataPartition(voice$label, p=0.75, list = FALSE,times = 1)
voice.train=voice[trainIndex,]
voice.test=voice[-trainIndex,]
voice.train.x=voice[trainIndex,1:20]
voice.train.y=voice[trainIndex,21]
voice.test.x=voice[-trainIndex,1:20]
voice.test.y=voice[-trainIndex,21]
###Random Forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.4.4
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.4.4
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
voice.BT=Boruta(label~., data = voice, doTrace=2, ntree=500)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +39 secs:
## confirmed 20 attributes: centroid, dfrange, IQR, kurt, maxdom and 15 more;
## no more attributes left.
plot(voice.BT, xlab = "", xaxt = "n", main="Variable Importance")
k <- lapply(1:ncol(voice.BT$ImpHistory), function(i)
  voice.BT$ImpHistory[is.finite(voice.BT$ImpHistory[,i]), i])
names(k) <- colnames(voice.BT$ImpHistory)
Labels <- sort(sapply(k, median))
axis(side = 1, las = 2, labels = names(Labels),
     at = 1:ncol(voice.BT$ImpHistory), cex.axis = 0.7)

getSelectedAttributes(voice.BT)
## [1] "meanfreq" "sd" "median" "Q25" "Q75" "IQR"
## [7] "skew" "kurt" "sp.ent" "sfm" "mode" "centroid"
## [13] "meanfun" "minfun" "maxfun" "meandom" "mindom" "maxdom"
## [19] "dfrange" "modindx"
attStats(voice.BT)
## meanImp medianImp minImp maxImp normHits decision
## meanfreq 15.62322 15.61245 14.57645 16.87907 1 Confirmed
## sd 21.56208 21.43479 20.28532 23.02301 1 Confirmed
## median 15.22047 15.11179 14.26540 16.21802 1 Confirmed
## Q25 28.32153 28.58427 26.94125 29.57518 1 Confirmed
## Q75 15.63257 15.58216 14.50351 16.86030 1 Confirmed
## IQR 38.77793 38.02985 37.01546 41.81356 1 Confirmed
## skew 12.98145 13.20309 11.46838 13.84315 1 Confirmed
## kurt 12.84411 12.81247 11.52440 14.20167 1 Confirmed
## sp.ent 16.96122 16.60883 15.91170 18.26139 1 Confirmed
## sfm 18.30454 18.28308 16.95814 19.12684 1 Confirmed
## mode 13.43629 13.52252 12.62999 14.05887 1 Confirmed
## centroid 15.25943 15.46352 14.21894 16.03348 1 Confirmed
## meanfun 66.64167 66.70269 63.27066 70.69585 1 Confirmed
## minfun 13.88203 13.84892 12.13081 15.14099 1 Confirmed
## maxfun 11.84117 11.86825 11.09517 12.66145 1 Confirmed
## meandom 13.88778 13.92948 12.89561 14.62034 1 Confirmed
## mindom 12.54708 12.38866 11.37744 13.57965 1 Confirmed
## maxdom 14.46009 14.29613 13.07611 15.71666 1 Confirmed
## dfrange 14.64478 15.00449 12.58160 16.48005 1 Confirmed
## modindx 15.29686 15.23233 14.28087 16.39455 1 Confirmed
set.seed(1234)
ctrl=trainControl(method = "repeatedcv", number = 10, repeats = 1)
set.seed(1234)
#Grid search over mtry (the number of predictors sampled at each split)
#to obtain the best parameters for the random forest model
tunegrid <- expand.grid(.mtry=seq(2,18,1))
rf_gridsearch <- train(label~., data = voice.train, method = "rf",
                       tuneGrid = tunegrid, trControl = ctrl)
print(rf_gridsearch)
## Random Forest
##
## 2376 samples
## 20 predictor
## 2 classes: 'female', 'male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 2139, 2138, 2139, 2138, 2139, 2138, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9772755 0.9545490
## 3 0.9768553 0.9537090
## 4 0.9772755 0.9545490
## 5 0.9781176 0.9562336
## 6 0.9772755 0.9545494
## 7 0.9772737 0.9545461
## 8 0.9776956 0.9553902
## 9 0.9760132 0.9520251
## 10 0.9760150 0.9520289
## 11 0.9764369 0.9528730
## 12 0.9764369 0.9528730
## 13 0.9747491 0.9494969
## 14 0.9751711 0.9503409
## 15 0.9760132 0.9520254
## 16 0.9743307 0.9486604
## 17 0.9747509 0.9495008
## 18 0.9751729 0.9503445
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
plot(rf_gridsearch)

#The grid search selected mtry = 5; mtry = 6 performed almost identically
#and is used for the final model
voice.RF=randomForest(label~., data = voice.train, mtry = 6, importance = TRUE)
plot(voice.RF, main="")
legend("topright", c("OOB", "female", "male"), text.col = 1:3, lty = 1:3, col = 1:3)
title(main="Error Rates Random Forest-Final")

impVar <- round(randomForest::importance(voice.RF), 2)
impVar[order(impVar[,3], decreasing=TRUE),]
## female male MeanDecreaseAccuracy MeanDecreaseGini
## meanfun 49.28 80.75 87.51 512.30
## IQR 20.67 42.89 44.42 250.42
## Q25 19.42 20.03 26.70 161.16
## modindx 17.33 8.46 18.77 7.47
## sd 14.37 12.79 17.84 82.92
## sfm 14.80 12.86 17.68 30.73
## Q75 10.30 11.58 15.86 9.98
## maxdom 12.27 8.68 14.96 8.91
## minfun 10.54 11.26 14.62 10.61
## dfrange 12.09 8.00 14.54 8.07
## meanfreq 9.45 9.67 13.76 11.64
## sp.ent 12.43 8.25 13.73 30.26
## median 7.60 10.10 12.61 8.93
## meandom 10.45 7.66 12.46 6.46
## centroid 9.07 8.44 11.90 14.35
## skew 8.53 6.09 10.06 7.71
## mode 6.95 7.94 9.80 10.73
## maxfun 8.24 5.86 9.76 3.66
## mindom 6.04 7.52 8.82 5.41
## kurt 9.60 4.50 8.47 5.77
voice.pred.class=predict(voice.RF, voice.test, type = "class")
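#Note: caret's confusionMatrix(data, reference) expects the predictions first
#and the true labels second. Here (and in the later models) the order is
#reversed, which transposes the table: Accuracy is unaffected, but the values
#reported as Sensitivity/Specificity are effectively predictive values.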
CM=confusionMatrix(voice.test[,21],voice.pred.class)
fourfoldplot(CM$table)

Accuracy_RF=CM$overall[1]
Sensitivity_RF=CM$byClass[1]
Specificity_RF=CM$byClass[2]
voice.prediction=prediction(as.numeric(voice.pred.class),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_RF=auc@y.values[[1]]
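#A possible refinement (sketch, not part of the original analysis): building
#the ROC curve from the forest's class probabilities rather than hard class
#labels traces a full curve instead of a single operating point. voice.prob
#and voice.prob.pred are new, illustrative names.
voice.prob=predict(voice.RF, voice.test, type = "prob")[,"male"]
voice.prob.pred=prediction(voice.prob, voice.test$label)
plot(performance(voice.prob.pred,"tpr","fpr"))
performance(voice.prob.pred, measure = "auc")@y.values[[1]]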
##################
#Build an SVM model using a linear kernel
###Tuning the parameter C for an optimized model
grid=expand.grid(C = c(0.01, 0.02,0.05, 0.075, 0.1, 0.25, 0.5, 1, 1.25, 1.5, 1.75, 2,5))
set.seed(1234)
voice.SVM_Lin=train(voice.train.x, voice.train.y,
                    tuneGrid = grid, trControl = ctrl,
                    preProcess = c("scale","center"),
                    method = 'svmLinear')
voice.SVM_Lin$bestTune
## C
## 9 1.25
plot(voice.SVM_Lin)

voice.pred=predict(voice.SVM_Lin, voice.test.x)
voice.prediction=prediction(as.numeric(voice.pred),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_SVM_Lin=auc@y.values[[1]]
CM=confusionMatrix(voice.test[,21],voice.pred)
fourfoldplot(CM$table)

Accuracy_SVM_Lin=CM$overall[1]
Sensitivity_SVM_Lin=CM$byClass[1]
Specificity_SVM_Lin=CM$byClass[2]
#######
#Build an SVM model using a polynomial kernel
###Tuning the parameters C and degree for an optimized model
grid=expand.grid(C = c(0.01, 0.02, 0.05, 0.075, 0.1,
                       0.25, 0.5, 1, 1.25, 1.5, 1.75, 2, 5),
                 degree = c(2,3), scale = 1)
set.seed(12345)
voice.SVM_Pol=train(voice.train.x, voice.train.y,
                    tuneGrid = grid, trControl = ctrl,
                    method = 'svmPoly')
voice.SVM_Pol$bestTune
## degree scale C
## 3 2 1 0.02
plot(voice.SVM_Pol)

voice.pred=predict(voice.SVM_Pol, voice.test.x)
voice.prediction=prediction(as.numeric(voice.pred),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_SVM_Pol=auc@y.values[[1]]
CM=confusionMatrix(voice.test[,21],voice.pred)
fourfoldplot(CM$table)

Accuracy_SVM_Pol=CM$overall[1]
Sensitivity_SVM_Pol=CM$byClass[1]
Specificity_SVM_Pol=CM$byClass[2]
#######
#Build an SVM model using a radial (RBF) kernel
###Tuning the parameters C and sigma for an optimized model
grid=expand.grid(C = c(0.01, 0.1, 0.20, 0.5),
                 sigma = c(0.005, 0.01, 0.02, 0.025))
set.seed(123465)
voice.SVM_Rad=train(voice.train.x, voice.train.y,
                    tuneGrid = grid, trControl = ctrl,
                    method = 'svmRadial')
voice.SVM_Rad$bestTune
## sigma C
## 16 0.025 0.5
plot(voice.SVM_Rad)

voice.pred=predict(voice.SVM_Rad, voice.test.x)
voice.prediction=prediction(as.numeric(voice.pred),voice.test$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_SVM_Rad=auc@y.values[[1]]
CM=confusionMatrix(voice.test[,21],voice.pred)
fourfoldplot(CM$table)

Accuracy_SVM_Rad=CM$overall[1]
Sensitivity_SVM_Rad=CM$byClass[1]
Specificity_SVM_Rad=CM$byClass[2]
###############################################
#Building a Logistic Regression Model
str(voice.train)
## 'data.frame': 2376 obs. of 21 variables:
## $ meanfreq: num 0.0598 0.066 0.1512 0.1351 0.1328 ...
## $ sd : num 0.0642 0.0673 0.0721 0.0791 0.0796 ...
## $ median : num 0.032 0.0402 0.158 0.1247 0.1191 ...
## $ Q25 : num 0.0151 0.0194 0.0966 0.0787 0.068 ...
## $ Q75 : num 0.0902 0.0927 0.208 0.206 0.2096 ...
## $ IQR : num 0.0751 0.0733 0.1114 0.1273 0.1416 ...
## $ skew : num 12.86 22.42 1.23 1.1 1.93 ...
## $ kurt : num 274.4 634.61 4.18 4.33 8.31 ...
## $ sp.ent : num 0.893 0.892 0.963 0.972 0.963 ...
## $ sfm : num 0.492 0.514 0.727 0.784 0.738 ...
## $ mode : num 0 0 0.0839 0.1043 0.1126 ...
## $ centroid: num 0.0598 0.066 0.1512 0.1351 0.1328 ...
## $ meanfun : num 0.0843 0.1079 0.089 0.1064 0.1101 ...
## $ minfun : num 0.0157 0.0158 0.0178 0.0169 0.0171 ...
## $ maxfun : num 0.276 0.25 0.25 0.267 0.254 ...
## $ meandom : num 0.00781 0.00901 0.2015 0.71281 0.29822 ...
## $ mindom : num 0.00781 0.00781 0.00781 0.00781 0.00781 ...
## $ maxdom : num 0.00781 0.05469 0.5625 5.48438 2.72656 ...
## $ dfrange : num 0 0.0469 0.5547 5.4766 2.7188 ...
## $ modindx : num 0 0.0526 0.2471 0.2083 0.1252 ...
## $ label : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
voice.train$label=factor(ifelse(voice.train$label=="male",1,0))
library(corrplot)
M=cor(na.omit(voice.train[,-21]))
corrplot(M, method = "circle", type = "lower",
         tl.srt = 45, tl.col = "black", tl.cex = 0.75)

#meanfreq and centroid are identical (correlation of 1), so centroid can be removed.
#median and Q25 are also highly correlated with meanfreq.
#maxdom and dfrange are highly correlated, so one of them should be removed.
#Removing centroid, median, dfrange and Q25 reduces multicollinearity.
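#A programmatic alternative (sketch): caret's findCorrelation() flags the
#predictors whose pairwise correlation exceeds a cutoff, using the correlation
#matrix M computed above. highCorr is a new, illustrative name.
highCorr=findCorrelation(M, cutoff = 0.9, names = TRUE)
highCorr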
voice.train.Cleaned=voice.train[,-c(3,4,12,19)]
voice.test.Cleaned=voice.test[,-c(3,4,12,19)]
#Build the model and check for multicollinearity using VIF
LogMod1=glm(label~.,data = voice.train.Cleaned,family = 'binomial')
summary(LogMod1)
##
## Call:
## glm(formula = label ~ ., family = "binomial", data = voice.train.Cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1161 -0.0390 0.0002 0.1129 4.2282
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.700e+01 1.020e+01 -1.667 0.095603 .
## meanfreq -3.519e+01 2.486e+01 -1.416 0.156880
## sd 2.760e+00 3.472e+01 0.079 0.936655
## Q75 2.165e+01 2.038e+01 1.063 0.287931
## IQR 4.790e+01 1.073e+01 4.466 7.98e-06 ***
## skew 2.272e-01 1.806e-01 1.258 0.208389
## kurt -9.783e-03 4.777e-03 -2.048 0.040557 *
## sp.ent 4.562e+01 1.143e+01 3.991 6.59e-05 ***
## sfm -1.261e+01 2.885e+00 -4.372 1.23e-05 ***
## mode 5.176e+00 2.483e+00 2.085 0.037062 *
## meanfun -1.661e+02 9.974e+00 -16.650 < 2e-16 ***
## minfun 3.803e+01 1.044e+01 3.642 0.000271 ***
## maxfun -2.606e+00 7.684e+00 -0.339 0.734535
## meandom 1.937e-01 4.740e-01 0.409 0.682773
## mindom -1.025e+00 2.376e+00 -0.431 0.666135
## maxdom -1.762e-02 7.477e-02 -0.236 0.813708
## modindx -3.627e+00 1.809e+00 -2.006 0.044902 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3293.84 on 2375 degrees of freedom
## Residual deviance: 439.66 on 2359 degrees of freedom
## AIC: 473.66
##
## Number of Fisher Scoring iterations: 8
options(scipen = 9999)
car::vif(LogMod1)
## meanfreq sd Q75 IQR skew kurt sp.ent
## 34.660781 14.485075 17.882649 8.774593 42.638715 34.322152 8.652440
## sfm mode meanfun minfun maxfun meandom mindom
## 14.389097 2.441855 1.438337 1.858984 1.566805 4.151739 1.317469
## maxdom modindx
## 4.023023 2.065868
#Variables with very high VIF (greater than 10) could be removed
#to eliminate multicollinearity.
#meanfreq, sd, Q75, IQR, skew, kurt and sfm could be removed while building the model
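#An automated alternative (sketch, assuming the data frames built above):
#iteratively refit the model and drop the predictor with the highest VIF
#until every VIF falls below 10. vif.prune is a new, illustrative name.
vif.prune=voice.train.Cleaned
repeat {
  fit=glm(label~., data = vif.prune, family = 'binomial')
  v=car::vif(fit)
  if (max(v) < 10) break
  vif.prune=vif.prune[, names(vif.prune) != names(which.max(v))]
}
names(vif.prune)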
names(voice.train.Cleaned)
## [1] "meanfreq" "sd" "Q75" "IQR" "skew" "kurt"
## [7] "sp.ent" "sfm" "mode" "meanfun" "minfun" "maxfun"
## [13] "meandom" "mindom" "maxdom" "modindx" "label"
voice.train.Cleaned=voice.train.Cleaned[,-c(1:6,8)]
voice.test.Cleaned=voice.test.Cleaned[,-c(1:6,8)]
LogMod2=glm(label~.,data = voice.train.Cleaned,family = 'binomial')
summary(LogMod2)
##
## Call:
## glm(formula = label ~ ., family = "binomial", data = voice.train.Cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1036 -0.0781 0.0005 0.1469 3.7806
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.415551 3.926999 0.360 0.71850
## sp.ent 22.868141 3.972221 5.757 0.0000000085613 ***
## mode 4.730990 1.675405 2.824 0.00475 **
## meanfun -181.593289 9.469489 -19.177 < 0.0000000000000002 ***
## minfun 54.822629 8.132723 6.741 0.0000000000157 ***
## maxfun 6.449256 5.686845 1.134 0.25677
## meandom -0.007311 0.351213 -0.021 0.98339
## mindom -4.165008 1.871228 -2.226 0.02603 *
## maxdom -0.053264 0.057243 -0.930 0.35211
## modindx -2.881131 1.443607 -1.996 0.04596 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3293.84 on 2375 degrees of freedom
## Residual deviance: 613.86 on 2366 degrees of freedom
## AIC: 633.86
##
## Number of Fisher Scoring iterations: 8
options(scipen = 9999)
car::vif(LogMod2)
## sp.ent mode meanfun minfun maxfun meandom mindom maxdom
## 1.310424 1.364971 1.237013 1.691014 1.487267 3.283409 1.155321 3.666829
## modindx
## 1.824442
#All VIFs are now below 4, so multicollinearity is under control.
#Search for the best model by stepwise removal of non-significant variables
step(LogMod2, direction = "both")
## Start: AIC=633.86
## label ~ sp.ent + mode + meanfun + minfun + maxfun + meandom +
## mindom + maxdom + modindx
##
## Df Deviance AIC
## - meandom 1 613.86 631.86
## - maxdom 1 614.72 632.72
## - maxfun 1 615.13 633.13
## <none> 613.86 633.86
## - modindx 1 617.71 635.71
## - mindom 1 618.88 636.88
## - mode 1 621.87 639.87
## - sp.ent 1 648.24 666.24
## - minfun 1 652.75 670.75
## - meanfun 1 2538.99 2556.99
##
## Step: AIC=631.86
## label ~ sp.ent + mode + meanfun + minfun + maxfun + mindom +
## maxdom + modindx
##
## Df Deviance AIC
## - maxfun 1 615.17 631.17
## - maxdom 1 615.66 631.66
## <none> 613.86 631.86
## + meandom 1 613.86 633.86
## - modindx 1 618.35 634.35
## - mindom 1 619.07 635.07
## - mode 1 622.02 638.02
## - sp.ent 1 648.36 664.36
## - minfun 1 655.38 671.38
## - meanfun 1 2541.44 2557.44
##
## Step: AIC=631.17
## label ~ sp.ent + mode + meanfun + minfun + mindom + maxdom +
## modindx
##
## Df Deviance AIC
## - maxdom 1 616.42 630.42
## <none> 615.17 631.17
## + maxfun 1 613.86 631.86
## + meandom 1 615.13 633.13
## - modindx 1 621.13 635.13
## - mindom 1 621.47 635.47
## - mode 1 623.03 637.03
## - sp.ent 1 651.75 665.75
## - minfun 1 659.22 673.22
## - meanfun 1 2614.88 2628.88
##
## Step: AIC=630.42
## label ~ sp.ent + mode + meanfun + minfun + mindom + modindx
##
## Df Deviance AIC
## <none> 616.42 630.42
## + maxdom 1 615.17 631.17
## + maxfun 1 615.66 631.66
## + meandom 1 615.96 631.96
## - modindx 1 621.18 633.18
## - mindom 1 622.90 634.90
## - mode 1 623.30 635.30
## - sp.ent 1 653.65 665.65
## - minfun 1 660.71 672.71
## - meanfun 1 2636.32 2648.32
##
## Call: glm(formula = label ~ sp.ent + mode + meanfun + minfun + mindom +
## modindx, family = "binomial", data = voice.train.Cleaned)
##
## Coefficients:
## (Intercept) sp.ent mode meanfun minfun
## 2.308 23.556 4.232 -181.467 53.439
## mindom modindx
## -4.603 -2.704
##
## Degrees of Freedom: 2375 Total (i.e. Null); 2369 Residual
## Null Deviance: 3294
## Residual Deviance: 616.4 AIC: 630.4
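#Note: step() settled on a model without meandom, maxfun and maxdom; meandom is
#nevertheless retained in the final model below, where it remains non-significant.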
LogModF=glm(formula = label ~ sp.ent + mode + meanfun + minfun + meandom +
              mindom + modindx, family = "binomial", data = voice.train.Cleaned)
summary(LogModF)
##
## Call:
## glm(formula = label ~ sp.ent + mode + meanfun + minfun + meandom +
## mindom + modindx, family = "binomial", data = voice.train.Cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0932 -0.0781 0.0005 0.1454 3.7725
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.3963 3.7802 0.634 0.52614
## sp.ent 23.4285 3.9341 5.955 0.00000000259602 ***
## mode 4.5025 1.6644 2.705 0.00683 **
## meanfun -181.0224 9.4284 -19.200 < 0.0000000000000002 ***
## minfun 55.3009 7.9480 6.958 0.00000000000345 ***
## meandom -0.1581 0.2343 -0.675 0.49981
## mindom -4.4516 1.8365 -2.424 0.01535 *
## modindx -2.7356 1.2229 -2.237 0.02529 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3293.84 on 2375 degrees of freedom
## Residual deviance: 615.96 on 2368 degrees of freedom
## AIC: 631.96
##
## Number of Fisher Scoring iterations: 8
###Prediction
voice.pred=predict(LogModF, voice.test.Cleaned, type = "response")
voice.class=ifelse(voice.pred>0.5,"male","female")
voice.prediction=prediction(as.numeric(voice.pred),voice.test.Cleaned$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_Log=auc@y.values[[1]]
CM=confusionMatrix(voice.test.Cleaned$label,as.factor(voice.class))
fourfoldplot(CM$table)

Accuracy_Log=CM$overall[1]
Sensitivity_Log=CM$byClass[1]
Specificity_Log=CM$byClass[2]
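#Sketch (not part of the original analysis): the 0.5 cutoff is a default; a
#cutoff maximizing Youden's J (TPR - FPR) can be read off the ROCR objects
#computed above. cutoffs, J and best.cutoff are new, illustrative names.
cutoffs=voice.perf@alpha.values[[1]]
J=voice.perf@y.values[[1]] - voice.perf@x.values[[1]]
best.cutoff=cutoffs[which.max(J)]
best.cutoff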
####################################
#Building a Linear Discriminant Model
#Since multicollinearity affects a linear discriminant model just as it does
#the logistic model, the same cleaned data set is used as in the logistic model
library(MASS)
library(DiscriMiner)
DiscM=lda(label~.,data = voice.train.Cleaned)
names(voice.train.Cleaned)
## [1] "sp.ent" "mode" "meanfun" "minfun" "maxfun" "meandom" "mindom"
## [8] "maxdom" "modindx" "label"
M=manova(as.matrix(voice.train.Cleaned[,c(1:9)])
         ~as.matrix(voice.train.Cleaned[,10]))
summary(M)
## Df Pillai approx F num Df den Df
## as.matrix(voice.train.Cleaned[, 10]) 1 0.73632 734.12 9 2366
## Residuals 2374
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(M)
## Response sp.ent :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 1.1617 1.16169 740.51
## Residuals 2374 3.7242 0.00157
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response mode :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.4641 0.46413 79.238
## Residuals 2374 13.9056 0.00586
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response meanfun :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 1.68957 1.68957 5288.5
## Residuals 2374 0.75844 0.00032
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response minfun :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.01460 0.014595 43.304
## Residuals 2374 0.80014 0.000337
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) 0.00000000005748 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response maxfun :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.05365 0.053653 61.222
## Residuals 2374 2.08050 0.000876
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) 0.000000000000007613 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response meandom :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 23.52 23.52 87.103
## Residuals 2374 641.03 0.27
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response mindom :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 0.4385 0.43847 112.31
## Residuals 2374 9.2687 0.00390
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response maxdom :
## Df Sum Sq Mean Sq F value
## as.matrix(voice.train.Cleaned[, 10]) 1 1113.8 1113.77 93.672
## Residuals 2374 28226.9 11.89
## Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) < 0.00000000000000022 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response modindx :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.matrix(voice.train.Cleaned[, 10]) 1 0.029 0.029487 2.1534 0.1424
## Residuals 2374 32.508 0.013693
###Prediction
voice.pred=predict(DiscM, voice.test.Cleaned)
voice.prediction=prediction(as.numeric(voice.pred$class),voice.test.Cleaned$label)
voice.perf=performance(voice.prediction,"tpr","fpr")
plot(voice.perf)

auc=performance(voice.prediction,measure = "auc")
AUC_LDA=auc@y.values[[1]]
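#Sketch: an AUC from the LDA posterior probabilities (column 2 corresponds to
#level "1" = male, since the training labels were recoded to 0/1) rather than
#from the hard class labels. lda.prob and lda.pred are new, illustrative names.
lda.prob=voice.pred$posterior[,2]
lda.pred=prediction(lda.prob, voice.test.Cleaned$label)
performance(lda.pred, measure = "auc")@y.values[[1]]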
levels(voice.test.Cleaned$label)=c(0,1)
CM=confusionMatrix(factor(voice.test.Cleaned$label),(voice.pred$class))
fourfoldplot(CM$table)

Accuracy_LDA=CM$overall[1]
Sensitivity_LDA=CM$byClass[1]
Specificity_LDA=CM$byClass[2]
########################################
Compare=data.frame(Classifier=c("RF","SVM_Lin","SVM_Pol","SVM_Rad","Logistic","LDA"),
                   Acc=c(Accuracy_RF,Accuracy_SVM_Lin,Accuracy_SVM_Pol,Accuracy_SVM_Rad,Accuracy_Log,Accuracy_LDA),
                   Sensitivity=c(Sensitivity_RF,Sensitivity_SVM_Lin,Sensitivity_SVM_Pol,Sensitivity_SVM_Rad,Sensitivity_Log,Sensitivity_LDA),
                   Specificity=c(Specificity_RF,Specificity_SVM_Lin,Specificity_SVM_Pol,Specificity_SVM_Rad,Specificity_Log,Specificity_LDA),
                   AUC_All=c(AUC_RF,AUC_SVM_Lin,AUC_SVM_Pol,AUC_SVM_Rad,AUC_Log,AUC_LDA))
Compare
## Classifier Acc Sensitivity Specificity AUC_All
## 1 RF 0.9848485 0.9873096 0.9824121 0.9848485
## 2 SVM_Lin 0.9823232 0.9798995 0.9847716 0.9823232
## 3 SVM_Pol 0.9785354 0.9773300 0.9797468 0.9785354
## 4 SVM_Rad 0.9747475 0.9771574 0.9723618 0.9747475
## 5 Logistic 0.9722222 0.9746193 0.9698492 0.9911297
## 6 LDA 0.9671717 0.9842932 0.9512195 0.9671717
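#Note: the logistic AUC was computed from predicted probabilities, while the
#other AUCs were computed from hard class labels; a label-based "AUC" collapses
#to balanced accuracy, which is why those values simply mirror the accuracies.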
library(reshape)
## Warning: package 'reshape' was built under R version 3.4.3
ggplot(melt(Compare, id.vars = "Classifier"), aes(Classifier, value, col=variable, group=variable))+
  geom_line()+
  geom_point(size=4, shape=21, fill="white")+
  labs(x="", y="Values", title="Evaluation Metric Comparison", color="Metrics")+
  theme(legend.key = element_rect(colour = "black", fill = "lightblue"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 15, hjust = 0.5))

ggplot(melt(Compare, id.vars = "Classifier"), aes(x=variable, y=value, fill=Classifier))+
  geom_bar(stat = "identity", position = "dodge")+
  coord_flip()+
  labs(x="", y="Values", title="Evaluation Metric Comparison", fill="Classifier")+
  theme(legend.key = element_rect(colour = "black", fill = "lightblue"),
        axis.text.y = element_text(size = 10, hjust = 1, face = "bold"),
        plot.title = element_text(size = 15, hjust = 0.5),
        legend.key.size = unit(0.5,"cm"),
        legend.position = "bottom",
        legend.background = element_rect(fill="grey"))
