#Load the file as our Dataset for the Project and do some data cleaning
Data <- read.csv("C:/Users/Priya/Downloads/ReplicatedAcousticFeatures-ParkinsonDatabase.csv")
sum(is.na(Data))
## [1] 2
Data <- na.omit(Data)
Data2 <- Data[,(1:22)]
summary(Data)
## ID Recording Status Gender
## CONT-01: 3 Min. :1 Min. :0.0000 Min. :0.0000
## CONT-02: 3 1st Qu.:1 1st Qu.:0.0000 1st Qu.:0.0000
## CONT-03: 3 Median :2 Median :0.0000 Median :0.0000
## CONT-04: 3 Mean :2 Mean :0.4958 Mean :0.3992
## CONT-05: 3 3rd Qu.:3 3rd Qu.:1.0000 3rd Qu.:1.0000
## CONT-06: 3 Max. :3 Max. :1.0000 Max. :1.0000
## (Other):220
## Jitter_rel Jitter_abs Jitter_RAP
## Min. :0.1480 Min. :7.070e-06 Min. :0.0006783
## 1st Qu.:0.2974 1st Qu.:1.924e-05 1st Qu.:0.0015562
## Median :0.4815 Median :3.563e-05 Median :0.0023365
## Mean :0.5845 Mean :4.463e-05 Mean :0.0031738
## 3rd Qu.:0.6809 3rd Qu.:5.638e-05 3rd Qu.:0.0036443
## Max. :6.8382 Max. :5.499e-04 Max. :0.0438430
##
## Jitter_PPQ Shim_loc Shim_dB
## Min. :0.001036 Min. :0.007444 Min. :0.06499
## 1st Qu.:0.001872 1st Qu.:0.024255 1st Qu.:0.21138
## Median :0.002870 Median :0.032924 Median :0.28704
## Mean :0.003539 Mean :0.038410 Mean :0.33676
## 3rd Qu.:0.003993 3rd Qu.:0.045644 3rd Qu.:0.39990
## Max. :0.065199 Max. :0.192600 Max. :1.74760
##
## Shim_APQ3 Shim_APQ5 Shi_APQ11 HNR05
## Min. :0.003344 Min. :0.004103 Min. :0.006459 Min. : 22.22
## 1st Qu.:0.012821 1st Qu.:0.014970 1st Qu.:0.018911 1st Qu.: 50.77
## Median :0.018517 Median :0.019841 Median :0.024775 Median : 59.83
## Mean :0.021482 Mean :0.023467 Mean :0.028671 Mean : 59.51
## 3rd Qu.:0.025781 3rd Qu.:0.028101 3rd Qu.:0.033795 3rd Qu.: 68.75
## Max. :0.113240 Max. :0.120760 Max. :0.142440 Max. :101.21
##
## HNR15 HNR25 HNR35 HNR38
## Min. : 26.27 Min. : 33.16 Min. : 36.49 Min. : 36.91
## 1st Qu.: 54.65 1st Qu.: 64.23 1st Qu.: 69.98 1st Qu.: 71.69
## Median : 65.10 Median : 75.72 Median : 81.21 Median : 82.41
## Mean : 63.89 Mean : 74.43 Mean : 80.43 Mean : 81.75
## 3rd Qu.: 73.45 3rd Qu.: 85.70 3rd Qu.: 92.38 3rd Qu.: 93.51
## Max. :109.65 Max. :120.71 Max. :128.29 Max. :129.99
##
## RPDE DFA PPE GNE
## Min. :0.1628 Min. :0.4114 Min. :0.004127 Min. :0.8473
## 1st Qu.:0.2625 1st Qu.:0.5575 1st Qu.:0.039455 1st Qu.:0.8915
## Median :0.3044 Median :0.6085 Median :0.246486 Median :0.9128
## Mean :0.3103 Mean :0.6139 Mean :0.271949 Mean :0.9177
## 3rd Qu.:0.3493 3rd Qu.:0.6612 3rd Qu.:0.408552 3rd Qu.:0.9464
## Max. :0.5360 Max. :0.7844 Max. :0.908395 Max. :0.9873
##
## MFCC0 MFCC1 MFCC2 MFCC3
## Min. :0.7702 Min. :0.7255 Min. :0.5695 Min. :0.7276
## 1st Qu.:1.2098 1st Qu.:1.1522 1st Qu.:1.1242 1st Qu.:1.2139
## Median :1.3409 Median :1.2968 Median :1.2995 Median :1.3435
## Mean :1.3473 Mean :1.3037 Mean :1.2904 Mean :1.3512
## 3rd Qu.:1.4915 3rd Qu.:1.4745 3rd Qu.:1.4822 3rd Qu.:1.4977
## Max. :1.9491 Max. :1.8357 Max. :1.9284 Max. :1.8571
##
## MFCC4 MFCC5 MFCC6 MFCC7
## Min. :0.7713 Min. :0.6115 Min. :0.8291 Min. :0.6536
## 1st Qu.:1.2198 1st Qu.:1.2017 1st Qu.:1.2052 1st Qu.:1.1911
## Median :1.3453 Median :1.3333 Median :1.3243 Median :1.3233
## Mean :1.3568 Mean :1.3400 Mean :1.3368 Mean :1.3414
## 3rd Qu.:1.4996 3rd Qu.:1.4750 3rd Qu.:1.4575 3rd Qu.:1.4884
## Max. :1.8408 Max. :1.9762 Max. :2.0008 Max. :2.0167
##
## MFCC8 MFCC9 MFCC10 MFCC11
## Min. :0.8395 Min. :0.8236 Min. :0.8136 Min. :0.8232
## 1st Qu.:1.2217 1st Qu.:1.2231 1st Qu.:1.2298 1st Qu.:1.2060
## Median :1.3414 Median :1.3374 Median :1.3403 Median :1.3579
## Mean :1.3563 Mean :1.3473 Mean :1.3476 Mean :1.3576
## 3rd Qu.:1.4830 3rd Qu.:1.4886 3rd Qu.:1.4688 3rd Qu.:1.5150
## Max. :1.9184 Max. :2.0396 Max. :2.0713 Max. :1.9836
##
## MFCC12 Delta0 Delta1 Delta2
## Min. :0.8444 Min. :0.6208 Min. :0.6474 Min. :0.6281
## 1st Qu.:1.1985 1st Qu.:1.1998 1st Qu.:1.1856 1st Qu.:1.1980
## Median :1.3404 Median :1.3346 Median :1.3439 Median :1.3246
## Mean :1.3398 Mean :1.3405 Mean :1.3369 Mean :1.3442
## 3rd Qu.:1.4765 3rd Qu.:1.4831 3rd Qu.:1.4872 3rd Qu.:1.4944
## Max. :2.0300 Max. :2.0281 Max. :2.0213 Max. :1.9799
##
## Delta3 Delta4 Delta5 Delta6
## Min. :0.7665 Min. :0.8401 Min. :0.7417 1.698341898: 2
## 1st Qu.:1.2100 1st Qu.:1.2224 1st Qu.:1.1981 0.75968897 : 1
## Median :1.3510 Median :1.3439 Median :1.3372 0.860440731: 1
## Mean :1.3446 Mean :1.3509 Mean :1.3391 0.891382409: 1
## 3rd Qu.:1.4891 3rd Qu.:1.4704 3rd Qu.:1.4867 0.903131747: 1
## Max. :1.8606 Max. :2.0382 Max. :1.7860 0.914237303: 1
## (Other) :231
## Delta7 Delta8 Delta9 Delta10
## Min. :0.7646 Min. :0.7628 Min. :0.8119 Min. :0.777
## 1st Qu.:1.1957 1st Qu.:1.1945 1st Qu.:1.1899 1st Qu.:1.194
## Median :1.3444 Median :1.3369 Median :1.3512 Median :1.333
## Mean :1.3423 Mean :1.3457 Mean :1.3436 Mean :1.332
## 3rd Qu.:1.4911 3rd Qu.:1.4968 3rd Qu.:1.4756 3rd Qu.:1.473
## Max. :1.8728 Max. :1.9201 Max. :1.9433 Max. :1.950
##
## Delta11 Delta12
## Min. :0.6431 Min. :0.7484
## 1st Qu.:1.2034 1st Qu.:1.2069
## Median :1.3478 Median :1.3312
## Mean :1.3482 Mean :1.3476
## 3rd Qu.:1.5079 3rd Qu.:1.4774
## Max. :1.9184 Max. :1.9301
##
str(Data)
## 'data.frame': 238 obs. of 48 variables:
## $ ID : Factor w/ 80 levels "CONT-01","CONT-02",..: 1 1 1 2 2 2 3 3 3 4 ...
## $ Recording : int 1 2 3 1 2 3 1 2 3 1 ...
## $ Status : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gender : int 1 1 1 0 0 0 1 1 1 1 ...
## $ Jitter_rel: num 0.255 0.37 0.235 0.293 0.231 ...
## $ Jitter_abs: num 1.46e-05 2.17e-05 1.31e-05 1.73e-05 1.46e-05 ...
## $ Jitter_RAP: num 0.00147 0.00193 0.00135 0.0011 0.00107 ...
## $ Jitter_PPQ: num 0.00167 0.00224 0.00155 0.00144 0.0014 ...
## $ Shim_loc : num 0.0303 0.0231 0.0193 0.0247 0.0131 ...
## $ Shim_dB : num 0.263 0.202 0.167 0.209 0.116 ...
## $ Shim_APQ3 : num 0.01746 0.01301 0.01105 0.01452 0.00646 ...
## $ Shim_APQ5 : num 0.01966 0.0141 0.01268 0.0157 0.00838 ...
## $ Shi_APQ11 : num 0.0219 0.0168 0.013 0.0183 0.011 ...
## $ HNR05 : num 59.4 59.8 57.3 62.2 67.5 ...
## $ HNR15 : num 60.7 62.7 61.9 68.7 75 ...
## $ HNR25 : num 71.9 74.4 73.7 79.8 85.7 ...
## $ HNR35 : num 77.6 80 78.9 84.4 91 ...
## $ HNR38 : num 77.5 80.3 79 84.3 91.1 ...
## $ RPDE : num 0.25 0.271 0.204 0.261 0.222 ...
## $ DFA : num 0.599 0.581 0.601 0.625 0.602 ...
## $ PPE : num 0.0054 0.07562 0.00528 0.03052 0.0356 ...
## $ GNE : num 0.9 0.887 0.889 0.876 0.902 ...
## $ MFCC0 : num 1.32 1.22 1.43 1.5 1.52 ...
## $ MFCC1 : num 1.34 1.22 1.22 1.6 1.58 ...
## $ MFCC2 : num 1.3 1.27 1.34 1.57 1.54 ...
## $ MFCC3 : num 1.33 1.19 1.35 1.5 1.61 ...
## $ MFCC4 : num 1.28 1.26 1.34 1.56 1.5 ...
## $ MFCC5 : num 1.21 1.25 1.33 1.59 1.26 ...
## $ MFCC6 : num 1.24 1.28 1.39 1.63 1.39 ...
## $ MFCC7 : num 1.28 1.3 1.35 1.65 1.45 ...
## $ MFCC8 : num 1.33 1.31 1.35 1.67 1.5 ...
## $ MFCC9 : num 1.35 1.32 1.34 1.62 1.42 ...
## $ MFCC10 : num 1.37 1.37 1.36 1.49 1.56 ...
## $ MFCC11 : num 1.4 1.37 1.31 1.46 1.19 ...
## $ MFCC12 : num 1.39 1.34 1.28 1.38 1.58 ...
## $ Delta0 : num 1.4 1.3 1.31 1.38 1.25 ...
## $ Delta1 : num 1.37 1.35 1.35 1.22 1.58 ...
## $ Delta2 : num 1.39 1.37 1.24 1.31 1.58 ...
## $ Delta3 : num 1.41 1.33 1.41 1.5 1.51 ...
## $ Delta4 : num 1.42 1.23 1.32 1.53 1.33 ...
## $ Delta5 : num 1.38 1.21 1.28 1.32 1.61 ...
## $ Delta6 : Factor w/ 239 levels "0.75968897","0.860440731",..: 150 127 155 179 226 230 183 81 144 154 ...
## $ Delta7 : num 1.45 1.35 1.46 1.47 1.42 ...
## $ Delta8 : num 1.44 1.37 1.37 1.64 1.57 ...
## $ Delta9 : num 1.4 1.32 1.44 1.55 1.64 ...
## $ Delta10 : num 1.41 1.31 1.39 1.64 1.53 ...
## $ Delta11 : num 1.42 1.32 1.31 1.6 1.3 ...
## $ Delta12 : num 1.35 1.32 1.31 1.62 1.38 ...
## - attr(*, "na.action")= 'omit' Named int 181 228
## ..- attr(*, "names")= chr "181" "228"
#removing ID and converting gender and status to a factor
Data <- Data[,(2:48)]
Data <- Data[,-41]
Data$Gender <- as.factor(Data$Gender)
Data$Status <- as.factor(Data$Status)
#A bit about the variables in our dataset
###Set seed and split the data into train and test. 80:20 split.
set.seed(12121)
Data_rand <- Data[order(runif(238)), ]
Data_train <- Data_rand[1:190, ]
Data_test <- Data_rand[191:238, ]
#Check if the data has been proportioned properly after splitting the data in train and test sets. Generate a naive bayes model on the Train dataset.
prop.table(table(Data_train$Status))
##
## 0 1
## 0.5 0.5
prop.table(table(Data_test$Status))
##
## 0 1
## 0.5208333 0.4791667
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
naive_model <- naive_bayes(as.character(Status) ~ ., data= Data_train)
naive_model
##
## ================================ Naive Bayes =================================
##
## Call:
## naive_bayes.formula(formula = as.character(Status) ~ ., data = Data_train)
##
## ------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.5 0.5
##
## ------------------------------------------------------------------------------
##
## Tables:
##
## ------------------------------------------------------------------------------
## ::: Recording (Gaussian)
## ------------------------------------------------------------------------------
##
## Recording 0 1
## mean 1.9368421 1.9789474
## sd 0.8226905 0.7852218
##
## ------------------------------------------------------------------------------
## ::: Gender (Bernoulli)
## ------------------------------------------------------------------------------
##
## Gender 0 1
## 0 0.5368421 0.6105263
## 1 0.4631579 0.3894737
##
## ------------------------------------------------------------------------------
## ::: Jitter_rel (Gaussian)
## ------------------------------------------------------------------------------
##
## Jitter_rel 0 1
## mean 0.4956831 0.6876674
## sd 0.2832267 0.7604290
##
## ------------------------------------------------------------------------------
## ::: Jitter_abs (Gaussian)
## ------------------------------------------------------------------------------
##
## Jitter_abs 0 1
## mean 4.140846e-05 4.778915e-05
## sd 3.046618e-05 6.144964e-05
##
## ------------------------------------------------------------------------------
## ::: Jitter_RAP (Gaussian)
## ------------------------------------------------------------------------------
##
## Jitter_RAP 0 1
## mean 0.002554235 0.003876533
## sd 0.001658089 0.004833633
##
## ------------------------------------------------------------------------------
##
## # ... and 40 more tables
##
## ------------------------------------------------------------------------------
#The model output is generated as above. Now, lets check the accuracy of the model.
Data_acc <- table(predict(naive_model, Data_test), Data_test$Status)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
Accuracy <- sum(diag(Data_acc))/sum(Data_acc)*100
Accuracy
## [1] 83.33333
#We can see that the model has an accuracy of 83.33% which is considered as good performance of the model. Now lets use another method to create another machine learning model on the same train and test sets and compare accuracy of both models.
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.5.2
Data_classifier <- ksvm(Status ~., data= Data_train,kernel="vanilladot")
## Setting default kernel parameters
summary(Data_classifier)
## Length Class Mode
## 1 ksvm S4
Data_predictions <- predict(Data_classifier, Data_test)
(p<- table(Data_predictions,Data_test$Status))
##
## Data_predictions 0 1
## 0 22 6
## 1 3 17
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 81.25
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
## Loading required package: rpart
fit <- rpart(Status ~ ., data=Data_train, method="class")
rpart.plot(fit)
#Now lets access the model performace
pred<-predict(fit, Data_test, type="class")
print(table(pred, Data_test$Status))
##
## pred 0 1
## 0 16 7
## 1 9 16
p<-table(pred, Data_test$Status)
(Accuracy <- (sum(diag(p))/sum(p)*100))
## [1] 66.66667
set.seed(1234)
Data2$Status <- as.factor(Data2$Status)
fit.km <- kmeans(Data, 6, nstart=44)
print(fit.km$size)
## [1] 34 44 52 51 35 22
print(fit.km$centers)
## Recording Status Gender Jitter_rel Jitter_abs Jitter_RAP
## 1 1.882353 0.1764706 0.3823529 0.6118276 5.275747e-05 0.003205326
## 2 2.159091 0.5227273 0.4545455 0.7392245 5.325725e-05 0.004296461
## 3 2.057692 0.1923077 0.2884615 0.5096096 4.163588e-05 0.002601305
## 4 1.941176 0.5098039 0.3529412 0.4990947 4.191790e-05 0.002614542
## 5 1.885714 0.8857143 0.4857143 0.5783537 3.962094e-05 0.003184261
## 6 2.045455 1.0000000 0.5454545 0.6175670 3.614077e-05 0.003512991
## Jitter_PPQ Shim_loc Shim_dB Shim_APQ3 Shim_APQ5 Shi_APQ11
## 1 0.003478232 0.03344895 0.2936789 0.01812278 0.02012975 0.02584419
## 2 0.004896720 0.04705570 0.4133968 0.02703926 0.02847381 0.03333711
## 3 0.002912900 0.03652262 0.3179847 0.02013100 0.02213304 0.02808770
## 4 0.002955553 0.03862500 0.3372124 0.02129611 0.02391960 0.02983998
## 5 0.003571914 0.03699544 0.3302214 0.02081336 0.02307129 0.02718369
## 6 0.003694698 0.03499373 0.3037880 0.02024749 0.02134483 0.02473866
## HNR05 HNR15 HNR25 HNR35 HNR38 RPDE DFA
## 1 80.41057 87.32158 98.68322 105.90856 107.48508 0.3039304 0.6517635
## 2 54.58130 57.70287 68.24170 74.26046 75.51915 0.3258406 0.5951196
## 3 69.45061 74.34322 85.45881 92.17105 93.71099 0.3115780 0.6149760
## 4 60.83959 66.08176 77.05004 82.60471 83.81548 0.3059521 0.6229899
## 5 46.61847 49.24998 58.68668 63.75885 65.11901 0.3043146 0.6025471
## 6 30.95813 33.54456 42.17983 47.11318 47.87071 0.3054083 0.5871637
## PPE GNE MFCC0 MFCC1 MFCC2 MFCC3 MFCC4
## 1 0.3146891 0.9168083 1.484459 1.382355 1.330645 1.510957 1.539247
## 2 0.2622407 0.9160310 1.314902 1.318794 1.310268 1.335058 1.337651
## 3 0.2979775 0.9304011 1.496845 1.437982 1.418833 1.495955 1.491525
## 4 0.2611002 0.9149709 1.346767 1.287619 1.296291 1.322794 1.355833
## 5 0.2607332 0.9101494 1.221919 1.202470 1.188798 1.242942 1.208992
## 6 0.2067840 0.9110303 1.047280 1.032692 1.033387 1.032332 1.032577
## MFCC5 MFCC6 MFCC7 MFCC8 MFCC9 MFCC10 MFCC11 MFCC12
## 1 1.507922 1.486117 1.500558 1.531053 1.487759 1.515915 1.535200 1.505596
## 2 1.324332 1.313718 1.322991 1.327059 1.332317 1.337923 1.335190 1.332458
## 3 1.483720 1.477887 1.454757 1.480831 1.462290 1.456926 1.469633 1.445001
## 4 1.345909 1.333235 1.357288 1.358926 1.347754 1.363250 1.375866 1.340120
## 5 1.165823 1.195538 1.205875 1.224911 1.243198 1.208944 1.219147 1.217839
## 6 1.035967 1.051403 1.042632 1.052857 1.053151 1.032669 1.041389 1.043262
## Delta0 Delta1 Delta2 Delta3 Delta4 Delta5 Delta7 Delta8
## 1 1.504702 1.492119 1.530450 1.488144 1.510448 1.501960 1.519925 1.502571
## 2 1.300194 1.299299 1.303248 1.342536 1.333771 1.331380 1.346209 1.342574
## 3 1.484899 1.464461 1.494599 1.467128 1.477135 1.468902 1.462529 1.461643
## 4 1.341754 1.348447 1.331450 1.342680 1.356382 1.333673 1.333741 1.349822
## 5 1.208993 1.223084 1.202055 1.222866 1.225048 1.212277 1.195346 1.219054
## 6 1.032254 1.025455 1.038871 1.035850 1.027261 1.009878 1.029522 1.027197
## Delta9 Delta10 Delta11 Delta12
## 1 1.503811 1.459851 1.532789 1.484787
## 2 1.334598 1.285286 1.313074 1.318790
## 3 1.457561 1.461580 1.476071 1.488190
## 4 1.351172 1.377224 1.349177 1.358228
## 5 1.210233 1.194788 1.214723 1.219988
## 6 1.038767 1.040231 1.041108 1.039147
library(cluster)
clusplot(Data, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)
(df_km<-table(Data$Status,fit.km$cluster))
##
## 1 2 3 4 5 6
## 0 28 21 42 25 4 0
## 1 6 23 10 26 31 22
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 21.42857