#Load the file as our Dataset for the Project and do some data cleaning

Data <- read.csv("C:/Users/Priya/Downloads/ReplicatedAcousticFeatures-ParkinsonDatabase.csv")

sum(is.na(Data))
## [1] 2
Data <- na.omit(Data)
Data2 <- Data[,(1:22)]

summary(Data)
##        ID        Recording     Status           Gender      
##  CONT-01:  3   Min.   :1   Min.   :0.0000   Min.   :0.0000  
##  CONT-02:  3   1st Qu.:1   1st Qu.:0.0000   1st Qu.:0.0000  
##  CONT-03:  3   Median :2   Median :0.0000   Median :0.0000  
##  CONT-04:  3   Mean   :2   Mean   :0.4958   Mean   :0.3992  
##  CONT-05:  3   3rd Qu.:3   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  CONT-06:  3   Max.   :3   Max.   :1.0000   Max.   :1.0000  
##  (Other):220                                                
##    Jitter_rel       Jitter_abs          Jitter_RAP       
##  Min.   :0.1480   Min.   :7.070e-06   Min.   :0.0006783  
##  1st Qu.:0.2974   1st Qu.:1.924e-05   1st Qu.:0.0015562  
##  Median :0.4815   Median :3.563e-05   Median :0.0023365  
##  Mean   :0.5845   Mean   :4.463e-05   Mean   :0.0031738  
##  3rd Qu.:0.6809   3rd Qu.:5.638e-05   3rd Qu.:0.0036443  
##  Max.   :6.8382   Max.   :5.499e-04   Max.   :0.0438430  
##                                                          
##    Jitter_PPQ          Shim_loc           Shim_dB       
##  Min.   :0.001036   Min.   :0.007444   Min.   :0.06499  
##  1st Qu.:0.001872   1st Qu.:0.024255   1st Qu.:0.21138  
##  Median :0.002870   Median :0.032924   Median :0.28704  
##  Mean   :0.003539   Mean   :0.038410   Mean   :0.33676  
##  3rd Qu.:0.003993   3rd Qu.:0.045644   3rd Qu.:0.39990  
##  Max.   :0.065199   Max.   :0.192600   Max.   :1.74760  
##                                                         
##    Shim_APQ3          Shim_APQ5          Shi_APQ11            HNR05       
##  Min.   :0.003344   Min.   :0.004103   Min.   :0.006459   Min.   : 22.22  
##  1st Qu.:0.012821   1st Qu.:0.014970   1st Qu.:0.018911   1st Qu.: 50.77  
##  Median :0.018517   Median :0.019841   Median :0.024775   Median : 59.83  
##  Mean   :0.021482   Mean   :0.023467   Mean   :0.028671   Mean   : 59.51  
##  3rd Qu.:0.025781   3rd Qu.:0.028101   3rd Qu.:0.033795   3rd Qu.: 68.75  
##  Max.   :0.113240   Max.   :0.120760   Max.   :0.142440   Max.   :101.21  
##                                                                           
##      HNR15            HNR25            HNR35            HNR38       
##  Min.   : 26.27   Min.   : 33.16   Min.   : 36.49   Min.   : 36.91  
##  1st Qu.: 54.65   1st Qu.: 64.23   1st Qu.: 69.98   1st Qu.: 71.69  
##  Median : 65.10   Median : 75.72   Median : 81.21   Median : 82.41  
##  Mean   : 63.89   Mean   : 74.43   Mean   : 80.43   Mean   : 81.75  
##  3rd Qu.: 73.45   3rd Qu.: 85.70   3rd Qu.: 92.38   3rd Qu.: 93.51  
##  Max.   :109.65   Max.   :120.71   Max.   :128.29   Max.   :129.99  
##                                                                     
##       RPDE             DFA              PPE                GNE        
##  Min.   :0.1628   Min.   :0.4114   Min.   :0.004127   Min.   :0.8473  
##  1st Qu.:0.2625   1st Qu.:0.5575   1st Qu.:0.039455   1st Qu.:0.8915  
##  Median :0.3044   Median :0.6085   Median :0.246486   Median :0.9128  
##  Mean   :0.3103   Mean   :0.6139   Mean   :0.271949   Mean   :0.9177  
##  3rd Qu.:0.3493   3rd Qu.:0.6612   3rd Qu.:0.408552   3rd Qu.:0.9464  
##  Max.   :0.5360   Max.   :0.7844   Max.   :0.908395   Max.   :0.9873  
##                                                                       
##      MFCC0            MFCC1            MFCC2            MFCC3       
##  Min.   :0.7702   Min.   :0.7255   Min.   :0.5695   Min.   :0.7276  
##  1st Qu.:1.2098   1st Qu.:1.1522   1st Qu.:1.1242   1st Qu.:1.2139  
##  Median :1.3409   Median :1.2968   Median :1.2995   Median :1.3435  
##  Mean   :1.3473   Mean   :1.3037   Mean   :1.2904   Mean   :1.3512  
##  3rd Qu.:1.4915   3rd Qu.:1.4745   3rd Qu.:1.4822   3rd Qu.:1.4977  
##  Max.   :1.9491   Max.   :1.8357   Max.   :1.9284   Max.   :1.8571  
##                                                                     
##      MFCC4            MFCC5            MFCC6            MFCC7       
##  Min.   :0.7713   Min.   :0.6115   Min.   :0.8291   Min.   :0.6536  
##  1st Qu.:1.2198   1st Qu.:1.2017   1st Qu.:1.2052   1st Qu.:1.1911  
##  Median :1.3453   Median :1.3333   Median :1.3243   Median :1.3233  
##  Mean   :1.3568   Mean   :1.3400   Mean   :1.3368   Mean   :1.3414  
##  3rd Qu.:1.4996   3rd Qu.:1.4750   3rd Qu.:1.4575   3rd Qu.:1.4884  
##  Max.   :1.8408   Max.   :1.9762   Max.   :2.0008   Max.   :2.0167  
##                                                                     
##      MFCC8            MFCC9            MFCC10           MFCC11      
##  Min.   :0.8395   Min.   :0.8236   Min.   :0.8136   Min.   :0.8232  
##  1st Qu.:1.2217   1st Qu.:1.2231   1st Qu.:1.2298   1st Qu.:1.2060  
##  Median :1.3414   Median :1.3374   Median :1.3403   Median :1.3579  
##  Mean   :1.3563   Mean   :1.3473   Mean   :1.3476   Mean   :1.3576  
##  3rd Qu.:1.4830   3rd Qu.:1.4886   3rd Qu.:1.4688   3rd Qu.:1.5150  
##  Max.   :1.9184   Max.   :2.0396   Max.   :2.0713   Max.   :1.9836  
##                                                                     
##      MFCC12           Delta0           Delta1           Delta2      
##  Min.   :0.8444   Min.   :0.6208   Min.   :0.6474   Min.   :0.6281  
##  1st Qu.:1.1985   1st Qu.:1.1998   1st Qu.:1.1856   1st Qu.:1.1980  
##  Median :1.3404   Median :1.3346   Median :1.3439   Median :1.3246  
##  Mean   :1.3398   Mean   :1.3405   Mean   :1.3369   Mean   :1.3442  
##  3rd Qu.:1.4765   3rd Qu.:1.4831   3rd Qu.:1.4872   3rd Qu.:1.4944  
##  Max.   :2.0300   Max.   :2.0281   Max.   :2.0213   Max.   :1.9799  
##                                                                     
##      Delta3           Delta4           Delta5               Delta6   
##  Min.   :0.7665   Min.   :0.8401   Min.   :0.7417   1.698341898:  2  
##  1st Qu.:1.2100   1st Qu.:1.2224   1st Qu.:1.1981   0.75968897 :  1  
##  Median :1.3510   Median :1.3439   Median :1.3372   0.860440731:  1  
##  Mean   :1.3446   Mean   :1.3509   Mean   :1.3391   0.891382409:  1  
##  3rd Qu.:1.4891   3rd Qu.:1.4704   3rd Qu.:1.4867   0.903131747:  1  
##  Max.   :1.8606   Max.   :2.0382   Max.   :1.7860   0.914237303:  1  
##                                                     (Other)    :231  
##      Delta7           Delta8           Delta9          Delta10     
##  Min.   :0.7646   Min.   :0.7628   Min.   :0.8119   Min.   :0.777  
##  1st Qu.:1.1957   1st Qu.:1.1945   1st Qu.:1.1899   1st Qu.:1.194  
##  Median :1.3444   Median :1.3369   Median :1.3512   Median :1.333  
##  Mean   :1.3423   Mean   :1.3457   Mean   :1.3436   Mean   :1.332  
##  3rd Qu.:1.4911   3rd Qu.:1.4968   3rd Qu.:1.4756   3rd Qu.:1.473  
##  Max.   :1.8728   Max.   :1.9201   Max.   :1.9433   Max.   :1.950  
##                                                                    
##     Delta11          Delta12      
##  Min.   :0.6431   Min.   :0.7484  
##  1st Qu.:1.2034   1st Qu.:1.2069  
##  Median :1.3478   Median :1.3312  
##  Mean   :1.3482   Mean   :1.3476  
##  3rd Qu.:1.5079   3rd Qu.:1.4774  
##  Max.   :1.9184   Max.   :1.9301  
## 
str(Data)
## 'data.frame':    238 obs. of  48 variables:
##  $ ID        : Factor w/ 80 levels "CONT-01","CONT-02",..: 1 1 1 2 2 2 3 3 3 4 ...
##  $ Recording : int  1 2 3 1 2 3 1 2 3 1 ...
##  $ Status    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gender    : int  1 1 1 0 0 0 1 1 1 1 ...
##  $ Jitter_rel: num  0.255 0.37 0.235 0.293 0.231 ...
##  $ Jitter_abs: num  1.46e-05 2.17e-05 1.31e-05 1.73e-05 1.46e-05 ...
##  $ Jitter_RAP: num  0.00147 0.00193 0.00135 0.0011 0.00107 ...
##  $ Jitter_PPQ: num  0.00167 0.00224 0.00155 0.00144 0.0014 ...
##  $ Shim_loc  : num  0.0303 0.0231 0.0193 0.0247 0.0131 ...
##  $ Shim_dB   : num  0.263 0.202 0.167 0.209 0.116 ...
##  $ Shim_APQ3 : num  0.01746 0.01301 0.01105 0.01452 0.00646 ...
##  $ Shim_APQ5 : num  0.01966 0.0141 0.01268 0.0157 0.00838 ...
##  $ Shi_APQ11 : num  0.0219 0.0168 0.013 0.0183 0.011 ...
##  $ HNR05     : num  59.4 59.8 57.3 62.2 67.5 ...
##  $ HNR15     : num  60.7 62.7 61.9 68.7 75 ...
##  $ HNR25     : num  71.9 74.4 73.7 79.8 85.7 ...
##  $ HNR35     : num  77.6 80 78.9 84.4 91 ...
##  $ HNR38     : num  77.5 80.3 79 84.3 91.1 ...
##  $ RPDE      : num  0.25 0.271 0.204 0.261 0.222 ...
##  $ DFA       : num  0.599 0.581 0.601 0.625 0.602 ...
##  $ PPE       : num  0.0054 0.07562 0.00528 0.03052 0.0356 ...
##  $ GNE       : num  0.9 0.887 0.889 0.876 0.902 ...
##  $ MFCC0     : num  1.32 1.22 1.43 1.5 1.52 ...
##  $ MFCC1     : num  1.34 1.22 1.22 1.6 1.58 ...
##  $ MFCC2     : num  1.3 1.27 1.34 1.57 1.54 ...
##  $ MFCC3     : num  1.33 1.19 1.35 1.5 1.61 ...
##  $ MFCC4     : num  1.28 1.26 1.34 1.56 1.5 ...
##  $ MFCC5     : num  1.21 1.25 1.33 1.59 1.26 ...
##  $ MFCC6     : num  1.24 1.28 1.39 1.63 1.39 ...
##  $ MFCC7     : num  1.28 1.3 1.35 1.65 1.45 ...
##  $ MFCC8     : num  1.33 1.31 1.35 1.67 1.5 ...
##  $ MFCC9     : num  1.35 1.32 1.34 1.62 1.42 ...
##  $ MFCC10    : num  1.37 1.37 1.36 1.49 1.56 ...
##  $ MFCC11    : num  1.4 1.37 1.31 1.46 1.19 ...
##  $ MFCC12    : num  1.39 1.34 1.28 1.38 1.58 ...
##  $ Delta0    : num  1.4 1.3 1.31 1.38 1.25 ...
##  $ Delta1    : num  1.37 1.35 1.35 1.22 1.58 ...
##  $ Delta2    : num  1.39 1.37 1.24 1.31 1.58 ...
##  $ Delta3    : num  1.41 1.33 1.41 1.5 1.51 ...
##  $ Delta4    : num  1.42 1.23 1.32 1.53 1.33 ...
##  $ Delta5    : num  1.38 1.21 1.28 1.32 1.61 ...
##  $ Delta6    : Factor w/ 239 levels "0.75968897","0.860440731",..: 150 127 155 179 226 230 183 81 144 154 ...
##  $ Delta7    : num  1.45 1.35 1.46 1.47 1.42 ...
##  $ Delta8    : num  1.44 1.37 1.37 1.64 1.57 ...
##  $ Delta9    : num  1.4 1.32 1.44 1.55 1.64 ...
##  $ Delta10   : num  1.41 1.31 1.39 1.64 1.53 ...
##  $ Delta11   : num  1.42 1.32 1.31 1.6 1.3 ...
##  $ Delta12   : num  1.35 1.32 1.31 1.62 1.38 ...
##  - attr(*, "na.action")= 'omit' Named int  181 228
##   ..- attr(*, "names")= chr  "181" "228"
#removing ID and converting gender and status to a factor
Data <- Data[,(2:48)]
Data <- Data[,-41]

Data$Gender <- as.factor(Data$Gender)
Data$Status <- as.factor(Data$Status)

#A bit about the variables in our dataset

ID: Subjects’s identifier.

Recording: Number of the recording.

Status: 0=Healthy; 1=PD

Gender: 0=Man; 1=Woman

Pitch local perturbation measures: relative jitter (Jitter_rel), absolute jitter (Jitter_abs), relative average perturbation (Jitter_RAP), and pitch perturbation quotient (Jitter_PPQ).

Amplitude perturbation measures: local shimmer (Shim_loc), shimmer in dB (Shim_dB), 3-point amplitude perturbation quotient (Shim_APQ3), 5-point amplitude perturbation quotient (Shim_APQ5), and 11-point amplitude perturbation quotient (Shim_APQ11).

Harmonic-to-noise ratio measures: harmonic-to-noise ratio in the frequency band 0-500 Hz (HNR05), in 0-1500 Hz (HNR15), in 0-2500 Hz (HNR25), in 0-3500 Hz (HNR35), and in 0-3800 Hz (HNR38).

Mel frequency cepstral coefficient-based spectral measures of order 0 to 12 (MFCC0, MFCC1,…, MFCC12) and their derivatives (Delta0, Delta1,…, Delta12).

Recurrence period density entropy (RPDE).

Detrended fluctuation analysis (DFA).

Pitch period entropy (PPE).

Glottal-to-noise excitation ratio (GNE).

###Set seed and split the data into train and test. 80:20 split.

set.seed(12121)
Data_rand <- Data[order(runif(238)), ]
Data_train <- Data_rand[1:190, ]
Data_test <- Data_rand[191:238, ]

#Check if the data has been proportioned properly after splitting the data in train and test sets. Generate a naive bayes model on the Train dataset.

prop.table(table(Data_train$Status))
## 
##   0   1 
## 0.5 0.5
prop.table(table(Data_test$Status))
## 
##         0         1 
## 0.5208333 0.4791667
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
naive_model <- naive_bayes(as.character(Status) ~ ., data= Data_train)
naive_model
## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = as.character(Status) ~ ., data = Data_train)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 0
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
##   0   1 
## 0.5 0.5 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: Recording (Gaussian) 
## ------------------------------------------------------------------------------ 
##          
## Recording         0         1
##      mean 1.9368421 1.9789474
##      sd   0.8226905 0.7852218
## 
## ------------------------------------------------------------------------------ 
##  ::: Gender (Bernoulli) 
## ------------------------------------------------------------------------------ 
##       
## Gender         0         1
##      0 0.5368421 0.6105263
##      1 0.4631579 0.3894737
## 
## ------------------------------------------------------------------------------ 
##  ::: Jitter_rel (Gaussian) 
## ------------------------------------------------------------------------------ 
##           
## Jitter_rel         0         1
##       mean 0.4956831 0.6876674
##       sd   0.2832267 0.7604290
## 
## ------------------------------------------------------------------------------ 
##  ::: Jitter_abs (Gaussian) 
## ------------------------------------------------------------------------------ 
##           
## Jitter_abs            0            1
##       mean 4.140846e-05 4.778915e-05
##       sd   3.046618e-05 6.144964e-05
## 
## ------------------------------------------------------------------------------ 
##  ::: Jitter_RAP (Gaussian) 
## ------------------------------------------------------------------------------ 
##           
## Jitter_RAP           0           1
##       mean 0.002554235 0.003876533
##       sd   0.001658089 0.004833633
## 
## ------------------------------------------------------------------------------
## 
## # ... and 40 more tables
## 
## ------------------------------------------------------------------------------

#The model output is generated as above. Now, lets check the accuracy of the model.

Data_acc <- table(predict(naive_model, Data_test), Data_test$Status)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
Accuracy <- sum(diag(Data_acc))/sum(Data_acc)*100
Accuracy
## [1] 83.33333

#We can see that the model has an accuracy of 83.33% which is considered as good performance of the model. Now lets use another method to create another machine learning model on the same train and test sets and compare accuracy of both models.

library(kernlab)
## Warning: package 'kernlab' was built under R version 3.5.2
Data_classifier <- ksvm(Status ~., data= Data_train,kernel="vanilladot")
##  Setting default kernel parameters
summary(Data_classifier)
## Length  Class   Mode 
##      1   ksvm     S4
Data_predictions <- predict(Data_classifier, Data_test)
(p<- table(Data_predictions,Data_test$Status))
##                 
## Data_predictions  0  1
##                0 22  6
##                1  3 17
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 81.25
library(rpart.plot) 
## Warning: package 'rpart.plot' was built under R version 3.5.3
## Loading required package: rpart
fit <- rpart(Status ~ ., data=Data_train, method="class")

rpart.plot(fit)

#Now lets access the model performace
pred<-predict(fit, Data_test, type="class")
print(table(pred, Data_test$Status))
##     
## pred  0  1
##    0 16  7
##    1  9 16
p<-table(pred, Data_test$Status)
(Accuracy <- (sum(diag(p))/sum(p)*100))
## [1] 66.66667
set.seed(1234)
Data2$Status <- as.factor(Data2$Status)
fit.km <- kmeans(Data, 6, nstart=44) 
print(fit.km$size)
## [1] 34 44 52 51 35 22
print(fit.km$centers)
##   Recording    Status    Gender Jitter_rel   Jitter_abs  Jitter_RAP
## 1  1.882353 0.1764706 0.3823529  0.6118276 5.275747e-05 0.003205326
## 2  2.159091 0.5227273 0.4545455  0.7392245 5.325725e-05 0.004296461
## 3  2.057692 0.1923077 0.2884615  0.5096096 4.163588e-05 0.002601305
## 4  1.941176 0.5098039 0.3529412  0.4990947 4.191790e-05 0.002614542
## 5  1.885714 0.8857143 0.4857143  0.5783537 3.962094e-05 0.003184261
## 6  2.045455 1.0000000 0.5454545  0.6175670 3.614077e-05 0.003512991
##    Jitter_PPQ   Shim_loc   Shim_dB  Shim_APQ3  Shim_APQ5  Shi_APQ11
## 1 0.003478232 0.03344895 0.2936789 0.01812278 0.02012975 0.02584419
## 2 0.004896720 0.04705570 0.4133968 0.02703926 0.02847381 0.03333711
## 3 0.002912900 0.03652262 0.3179847 0.02013100 0.02213304 0.02808770
## 4 0.002955553 0.03862500 0.3372124 0.02129611 0.02391960 0.02983998
## 5 0.003571914 0.03699544 0.3302214 0.02081336 0.02307129 0.02718369
## 6 0.003694698 0.03499373 0.3037880 0.02024749 0.02134483 0.02473866
##      HNR05    HNR15    HNR25     HNR35     HNR38      RPDE       DFA
## 1 80.41057 87.32158 98.68322 105.90856 107.48508 0.3039304 0.6517635
## 2 54.58130 57.70287 68.24170  74.26046  75.51915 0.3258406 0.5951196
## 3 69.45061 74.34322 85.45881  92.17105  93.71099 0.3115780 0.6149760
## 4 60.83959 66.08176 77.05004  82.60471  83.81548 0.3059521 0.6229899
## 5 46.61847 49.24998 58.68668  63.75885  65.11901 0.3043146 0.6025471
## 6 30.95813 33.54456 42.17983  47.11318  47.87071 0.3054083 0.5871637
##         PPE       GNE    MFCC0    MFCC1    MFCC2    MFCC3    MFCC4
## 1 0.3146891 0.9168083 1.484459 1.382355 1.330645 1.510957 1.539247
## 2 0.2622407 0.9160310 1.314902 1.318794 1.310268 1.335058 1.337651
## 3 0.2979775 0.9304011 1.496845 1.437982 1.418833 1.495955 1.491525
## 4 0.2611002 0.9149709 1.346767 1.287619 1.296291 1.322794 1.355833
## 5 0.2607332 0.9101494 1.221919 1.202470 1.188798 1.242942 1.208992
## 6 0.2067840 0.9110303 1.047280 1.032692 1.033387 1.032332 1.032577
##      MFCC5    MFCC6    MFCC7    MFCC8    MFCC9   MFCC10   MFCC11   MFCC12
## 1 1.507922 1.486117 1.500558 1.531053 1.487759 1.515915 1.535200 1.505596
## 2 1.324332 1.313718 1.322991 1.327059 1.332317 1.337923 1.335190 1.332458
## 3 1.483720 1.477887 1.454757 1.480831 1.462290 1.456926 1.469633 1.445001
## 4 1.345909 1.333235 1.357288 1.358926 1.347754 1.363250 1.375866 1.340120
## 5 1.165823 1.195538 1.205875 1.224911 1.243198 1.208944 1.219147 1.217839
## 6 1.035967 1.051403 1.042632 1.052857 1.053151 1.032669 1.041389 1.043262
##     Delta0   Delta1   Delta2   Delta3   Delta4   Delta5   Delta7   Delta8
## 1 1.504702 1.492119 1.530450 1.488144 1.510448 1.501960 1.519925 1.502571
## 2 1.300194 1.299299 1.303248 1.342536 1.333771 1.331380 1.346209 1.342574
## 3 1.484899 1.464461 1.494599 1.467128 1.477135 1.468902 1.462529 1.461643
## 4 1.341754 1.348447 1.331450 1.342680 1.356382 1.333673 1.333741 1.349822
## 5 1.208993 1.223084 1.202055 1.222866 1.225048 1.212277 1.195346 1.219054
## 6 1.032254 1.025455 1.038871 1.035850 1.027261 1.009878 1.029522 1.027197
##     Delta9  Delta10  Delta11  Delta12
## 1 1.503811 1.459851 1.532789 1.484787
## 2 1.334598 1.285286 1.313074 1.318790
## 3 1.457561 1.461580 1.476071 1.488190
## 4 1.351172 1.377224 1.349177 1.358228
## 5 1.210233 1.194788 1.214723 1.219988
## 6 1.038767 1.040231 1.041108 1.039147
library(cluster)
clusplot(Data, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)

(df_km<-table(Data$Status,fit.km$cluster))
##    
##      1  2  3  4  5  6
##   0 28 21 42 25  4  0
##   1  6 23 10 26 31 22
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 21.42857