Here I’ll show the impact of decorrelating high-dimensional data sets.
library("FRESA.CAD")
library(whitening)
library(mRMRe)
library(readxl)
decorrelated_ML_Method <- function(theformula,
data = NULL,
mlMethod = NULL,
adjust.parameters=list(thr=0.85,
unipvalue=0.05,
type="NZLM"),
...)
{
if (class(theformula)!="formula")
{
theformula = formula(theformula)
}
varlist <- attr(terms(theformula,data=data),"variables")
dependent <- as.character(varlist[[2]])
Outcome = dependent[1];
scaledMethod <- FRESAScale(data,method = "OrderLogit")
data <- scaledMethod$scaledData
refData <- data
scaledMethod$scaledData <- NULL;
data <- do.call(featureDecorrelation,c(list(data,Outcome,refData),adjust.parameters));
DeCorrmatrix <- attr(data,"DeCorrmatrix")
attr(data,"DeCorrmatrix") <- NULL
fit <- mlMethod(formula=theformula,data,...)
usedfeatures <- unique(c(fit$selectedfeatures,attr(data,"varincluded")))
baseFeatures <- attr(data,"baseFeatures")
result <- list(fit=fit,
refdata=refData,
scaledMethod = scaledMethod,
selectedfeatures=fit$selectedfeatures,
usedfeatures = usedfeatures,
Outcome=Outcome,
adjust.parameters=adjust.parameters,
baseFeatures=baseFeatures,
DeCorrmatrix=DeCorrmatrix,
decorrelated=data);
class(result) <- "DecorrelatedFit"
return(result)
}
predict.DecorrelatedFit <- function(object,...)
{
parameters <- list(...);
testframe <- parameters[[1]];
usfeat <- unique(c(object$Outcome,object$usedfeatures));
nottoadjust <- (rownames(testframe) %in% rownames(object$decorrelated))
if (sum(nottoadjust) < nrow(testframe))
{
idstoAdjust <- rownames(testframe)[!nottoadjust]
testframe[idstoAdjust,] <- FRESAScale(testframe[idstoAdjust,],
method = "OrderLogit",
refMean =object$scaledMethod$refMean ,
refDisp =object$scaledMethod$refDisp ,
)$scaledData
object$refdata[,object$Outcome] <- as.numeric(object$refdata[,object$Outcome])
testframe[,object$Outcome] <- as.numeric(testframe[,object$Outcome])
decormat <- object$DeCorrmatrix;
featdecor <- colnames(decormat)
testframe[,featdecor] <- as.matrix(testframe[,featdecor]) %*% decormat
# testframe[idstoAdjust,usfeat] <- do.call(featureDecorrelation,
# c(list(testframe[idstoAdjust,usfeat],
# object$Outcome,
# refdata=object$refdata[,usfeat]),
# baseFeatures=object$baseFeatures,
# object$adjust.parameters));
# attr(testframe,"featureMatrix") <- NULL
toreplace <- rownames(object$decorrelated) %in% rownames(testframe)
if (sum(toreplace) > 0)
{
toreplace <- rownames(object$decorrelated)[toreplace]
cat("Replace:",length(toreplace))
testframe[toreplace,] <- object$decorrelated[toreplace,]
}
}
else
{
# cat("Not Adjusting")
testframe <- object$decorrelated[rownames(testframe),]
}
prediction <- predict(object$fit,testframe)
return (prediction)
}
C. Okan Sakar, Gorkem Serbes, Aysegul Gunduz, Hunkar C. Tunc, Hatice Nizam, Betul Erdogdu Sakar, Melih Tutuncu, Tarkan Aydin, M. Erdem Isenkul, Hulya Apaydin, A comparative analysis of speech signal processing algorithms for Parkinson’s disease classification and the use of the tunable Q-factor wavelet transform, Applied Soft Computing, Volume 74, 2019, Pages 255-263, ISSN 1568-4946,
https://doi.org/10.1016/j.asoc.2018.10.022.
(https://www.sciencedirect.com/science/article/pii/S1568494618305799)
Data From:
https://archive.ics.uci.edu/ml/datasets/Parkinson%27s+Disease+Classification
ParkisonsData <- as.data.frame(read_excel("DataSets/IDspd_speech_features.xlsx"))
ParkisonsData$UID <- NULL
ParkisonsData[,1:ncol(ParkisonsData)] <- sapply(ParkisonsData,as.numeric)
set1Pakisons <- subset(ParkisonsData,Sample == 1)
set2Pakisons <- subset(ParkisonsData,Sample == 2)
set3Pakisons <- subset(ParkisonsData,Sample == 3)
rownames(set1Pakisons) <- set1Pakisons$id
set1Pakisons$id <- NULL
set1Pakisons$Sample <- NULL
rownames(set2Pakisons) <- set2Pakisons$id
set2Pakisons$id <- NULL
set2Pakisons$Sample <- NULL
rownames(set3Pakisons) <- set3Pakisons$id
set3Pakisons$id <- NULL
set3Pakisons$Sample <- NULL
pander::pander(table(set1Pakisons$class,set1Pakisons$gender))
| 0 | 1 | |
|---|---|---|
| 0 | 41 | 23 |
| 1 | 81 | 107 |
avgsetPakisons <- (set1Pakisons + set2Pakisons + set3Pakisons)/3
logPakisons <- as.data.frame(apply(abs(avgsetPakisons[,!(colnames(avgsetPakisons) %in% c("gender","class"))])+1,2,log))
boxplot(logPakisons)
logPakisons$class <- avgsetPakisons$class
logPakisons$gender <- avgsetPakisons$gender
logPakisons$gender <- 2*logPakisons$gender - 1
LMDecorrelated <- featureDecorrelation(logPakisons,Outcome="class")
features <- attr(LMDecorrelated,"varincluded")
cormat1 <- abs(cor(logPakisons[features]))
diag(cormat1) <- 0
gplots::heatmap.2(cormat1,trace = "none",mar = c(10,10),col=rev(heat.colors(5)),main = "Raw Correlation",cexRow = 0.5,cexCol = 0.5,key.xlab="Correlation matrix",xlab="Feature", ylab="Feature")
DeCorrmatrix <- attr(LMDecorrelated,"DeCorrmatrix")
cormat1 <- abs(cor(LMDecorrelated[features],method="spearman"))
diag(cormat1) <- 0
gplots::heatmap.2(cormat1,trace = "none",mar = c(10,10),col=rev(heat.colors(5)),main = "Decorrelation",cexRow = 0.5,cexCol = 0.5,key.xlab="correlation matrix",xlab="Feature", ylab="Feature")
max(cormat1)
#> [1] 0.9977556
decorrmatw <- as.matrix(logPakisons[,features]) %*% DeCorrmatrix
#for (ft in colnames(decorrmatw))
#{
# cat("(",cor(decorrmatw[,ft],LMDecorrelated[,ft]),":",mean(decorrmatw[,ft]-LMDecorrelated[,ft]),")\n")
#}
cormat1 <- abs(cor(decorrmatw,method="spearman"))
diag(cormat1) <- 0
colmax <- apply(cormat1,2,max)
names(colmax[colmax>0.95])
#> [1] "rapJitter" "ppq5Jitter"
#> [3] "minIntensity" "meanIntensity"
#> [5] "IMF_SNR_SEO" "IMF_SNR_entropy"
#> [7] "app_LT_entropy_shannon_4_coef" "app_LT_entropy_shannon_8_coef"
#> [9] "app_LT_entropy_log_1_coef" "app_LT_TKEO_mean_3_coef"
#> [11] "tqwt_entropy_shannon_dec_5" "tqwt_entropy_log_dec_12"
#> [13] "tqwt_entropy_log_dec_22" "tqwt_TKEO_mean_dec_5"
#> [15] "tqwt_TKEO_mean_dec_6" "tqwt_TKEO_mean_dec_32"
#> [17] "tqwt_TKEO_std_dec_5" "tqwt_TKEO_std_dec_6"
#> [19] "tqwt_stdValue_dec_12" "tqwt_stdValue_dec_22"
#> [21] "tqwt_stdValue_dec_32" "tqwt_maxValue_dec_5"
gplots::heatmap.2(cormat1,trace = "none",mar = c(10,10),col=rev(heat.colors(5)),main = "Decorrelation",cexRow = 0.5,cexCol = 0.5,key.xlab="correlation matrix",xlab="Feature", ylab="Feature")
max(cormat1)
#> [1] 0.9977556
plot(as.data.frame(decorrmatw[,names(colmax[colmax>0.99])]))
op <- par(no.readonly = TRUE,pty="m")
RawParkisonCV <- randomCV(logPakisons,
"class",
fittingFunction= filteredFit,
asFactor = TRUE,
trainFraction = 0.90,
classSamplingType = "Pro",
repetitions = 200,
fitmethod = e1071::svm,
filtermethod = mRMR.classic_FRESA,
filtermethod.control = list(feature_count=50),
probability = TRUE
)
……….10 Tested: 176 Avg. Selected: 50 Min Tests: 1 Max Tests: 4 Mean Tests: 1.477273 . MAD: 0.2432809 ……….20 Tested: 229 Avg. Selected: 50 Min Tests: 1 Max Tests: 6 Mean Tests: 2.270742 . MAD: 0.2441392 ……….30 Tested: 243 Avg. Selected: 50 Min Tests: 1 Max Tests: 8 Mean Tests: 3.209877 . MAD: 0.2373913 ……….40 Tested: 249 Avg. Selected: 50 Min Tests: 1 Max Tests: 11 Mean Tests: 4.176707 . MAD: 0.2375405 ……….50 Tested: 250 Avg. Selected: 50 Min Tests: 1 Max Tests: 14 Mean Tests: 5.2 . MAD: 0.2378382 ……….60 Tested: 252 Avg. Selected: 50 Min Tests: 1 Max Tests: 14 Mean Tests: 6.190476 . MAD: 0.2382549 ……….70 Tested: 252 Avg. Selected: 50 Min Tests: 1 Max Tests: 17 Mean Tests: 7.222222 . MAD: 0.2391956 ……….80 Tested: 252 Avg. Selected: 50 Min Tests: 1 Max Tests: 17 Mean Tests: 8.253968 . MAD: 0.2409151 ……….90 Tested: 252 Avg. Selected: 50 Min Tests: 1 Max Tests: 18 Mean Tests: 9.285714 . MAD: 0.2409633 ……….100 Tested: 252 Avg. Selected: 50 Min Tests: 1 Max Tests: 21 Mean Tests: 10.31746 . MAD: 0.2421591 ……….110 Tested: 252 Avg. Selected: 50 Min Tests: 1 Max Tests: 24 Mean Tests: 11.34921 . MAD: 0.2406284 ……….120 Tested: 252 Avg. Selected: 50 Min Tests: 2 Max Tests: 25 Mean Tests: 12.38095 . MAD: 0.2391411 ……….130 Tested: 252 Avg. Selected: 50 Min Tests: 3 Max Tests: 25 Mean Tests: 13.4127 . MAD: 0.2394732 ……….140 Tested: 252 Avg. Selected: 50 Min Tests: 4 Max Tests: 26 Mean Tests: 14.44444 . MAD: 0.2413311 ……….150 Tested: 252 Avg. Selected: 50 Min Tests: 4 Max Tests: 27 Mean Tests: 15.47619 . MAD: 0.2407454 ……….160 Tested: 252 Avg. Selected: 50 Min Tests: 4 Max Tests: 29 Mean Tests: 16.50794 . MAD: 0.2413003 ……….170 Tested: 252 Avg. Selected: 50 Min Tests: 7 Max Tests: 29 Mean Tests: 17.53968 . MAD: 0.2410206 ……….180 Tested: 252 Avg. Selected: 50 Min Tests: 7 Max Tests: 33 Mean Tests: 18.57143 . MAD: 0.2411989 ……….190 Tested: 252 Avg. Selected: 50 Min Tests: 7 Max Tests: 35 Mean Tests: 19.60317 . MAD: 0.2402621 ……….200 Tested: 252 Avg. Selected: 50 Min Tests: 7 Max Tests: 38 Mean Tests: 20.63492 . MAD: 0.2400171
par(op)
DeCorrParkisonCV <- randomCV(logPakisons,
"class",
fittingFunction= decorrelated_ML_Method,
asFactor = TRUE,
trainSampleSets = RawParkisonCV$trainSamplesSets,
mlMethod = filteredFit,
adjust.parameters=list(thr=0.85,unipvalue=0.05,type="NZLM"),
fitmethod = e1071::svm,
filtermethod = mRMR.classic_FRESA,
filtermethod.control = list(feature_count=25),
probability = TRUE,
scale=FALSE,
)
……….10 Tested: 176 Avg. Selected: 25 Min Tests: 1 Max Tests: 4 Mean Tests: 1.477273 . MAD: 0.2149682 ……….20 Tested: 229 Avg. Selected: 25 Min Tests: 1 Max Tests: 6 Mean Tests: 2.270742 . MAD: 0.2061591 ……….30 Tested: 243 Avg. Selected: 25 Min Tests: 1 Max Tests: 8 Mean Tests: 3.209877 . MAD: 0.2031371 ……….40 Tested: 249 Avg. Selected: 25 Min Tests: 1 Max Tests: 11 Mean Tests: 4.176707 . MAD: 0.2049641 ……….50 Tested: 250 Avg. Selected: 25 Min Tests: 1 Max Tests: 14 Mean Tests: 5.2 . MAD: 0.2025217 ……….60 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 14 Mean Tests: 6.190476 . MAD: 0.2034254 ……….70 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 17 Mean Tests: 7.222222 . MAD: 0.2037848 ……….80 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 17 Mean Tests: 8.253968 . MAD: 0.2030797 ……….90 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 18 Mean Tests: 9.285714 . MAD: 0.2024452 ……….100 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 21 Mean Tests: 10.31746 . MAD: 0.2015627 ……….110 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 24 Mean Tests: 11.34921 . MAD: 0.2039776 ……….120 Tested: 252 Avg. Selected: 25 Min Tests: 2 Max Tests: 25 Mean Tests: 12.38095 . MAD: 0.2036254 ……….130 Tested: 252 Avg. Selected: 25 Min Tests: 3 Max Tests: 25 Mean Tests: 13.4127 . MAD: 0.2035676 ……….140 Tested: 252 Avg. Selected: 25 Min Tests: 4 Max Tests: 26 Mean Tests: 14.44444 . MAD: 0.2037296 ……….150 Tested: 252 Avg. Selected: 25 Min Tests: 4 Max Tests: 27 Mean Tests: 15.47619 . MAD: 0.2039409 ……….160 Tested: 252 Avg. Selected: 25 Min Tests: 4 Max Tests: 29 Mean Tests: 16.50794 . MAD: 0.204645 ……….170 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 29 Mean Tests: 17.53968 . MAD: 0.204881 ……….180 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 33 Mean Tests: 18.57143 . MAD: 0.2050119 ……….190 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 35 Mean Tests: 19.60317 . MAD: 0.2051853 ……….200 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 38 Mean Tests: 20.63492 . MAD: 0.2056593
DeCorrParkisonRLMCV <- randomCV(logPakisons,
"class",
fittingFunction= decorrelated_ML_Method,
asFactor = TRUE,
trainSampleSets = RawParkisonCV$trainSamplesSets,
mlMethod = filteredFit,
adjust.parameters=list(thr=0.85,unipvalue=0.05,type="RLM"),
fitmethod = e1071::svm,
filtermethod = mRMR.classic_FRESA,
filtermethod.control = list(feature_count=25),
probability = TRUE,
scale=FALSE,
)
……….10 Tested: 176 Avg. Selected: 25 Min Tests: 1 Max Tests: 4 Mean Tests: 1.477273 . MAD: 0.2147932 ……….20 Tested: 229 Avg. Selected: 25 Min Tests: 1 Max Tests: 6 Mean Tests: 2.270742 . MAD: 0.2112763 ……….30 Tested: 243 Avg. Selected: 25 Min Tests: 1 Max Tests: 8 Mean Tests: 3.209877 . MAD: 0.2085286 ……….40 Tested: 249 Avg. Selected: 25 Min Tests: 1 Max Tests: 11 Mean Tests: 4.176707 . MAD: 0.209713 ……….50 Tested: 250 Avg. Selected: 25 Min Tests: 1 Max Tests: 14 Mean Tests: 5.2 . MAD: 0.2085083 ……….60 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 14 Mean Tests: 6.190476 . MAD: 0.207523 ……….70 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 17 Mean Tests: 7.222222 . MAD: 0.2086716 ……….80 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 17 Mean Tests: 8.253968 . MAD: 0.2084372 ……….90 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 18 Mean Tests: 9.285714 . MAD: 0.207126 ……….100 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 21 Mean Tests: 10.31746 . MAD: 0.2069109 ……….110 Tested: 252 Avg. Selected: 25 Min Tests: 1 Max Tests: 24 Mean Tests: 11.34921 . MAD: 0.2083483 ……….120 Tested: 252 Avg. Selected: 25 Min Tests: 2 Max Tests: 25 Mean Tests: 12.38095 . MAD: 0.2073671 ……….130 Tested: 252 Avg. Selected: 25 Min Tests: 3 Max Tests: 25 Mean Tests: 13.4127 . MAD: 0.2069197 ……….140 Tested: 252 Avg. Selected: 25 Min Tests: 4 Max Tests: 26 Mean Tests: 14.44444 . MAD: 0.2078809 ……….150 Tested: 252 Avg. Selected: 25 Min Tests: 4 Max Tests: 27 Mean Tests: 15.47619 . MAD: 0.2075634 ……….160 Tested: 252 Avg. Selected: 25 Min Tests: 4 Max Tests: 29 Mean Tests: 16.50794 . MAD: 0.2079679 ……….170 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 29 Mean Tests: 17.53968 . MAD: 0.2075518 ……….180 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 33 Mean Tests: 18.57143 . MAD: 0.2078517 ……….190 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 35 Mean Tests: 19.60317 . MAD: 0.2079881 ……….200 Tested: 252 Avg. Selected: 25 Min Tests: 7 Max Tests: 38 Mean Tests: 20.63492 . MAD: 0.2082941
par(mfrow = c(1,1),cex = 0.5);
bsFilteredSVM <- predictionStats_binary(RawParkisonCV$medianTest,
"RAW Set One Filtered:SVM",
cex = 1.0)
RAW Set One Filtered:SVM
bsRLMOneFilteredSVM <- predictionStats_binary(DeCorrParkisonRLMCV$medianTest,
"RLM Decorrelated One Filtered:SVM",
cex = 1.0)
RLM Decorrelated One Filtered:SVM
bsOneFilteredSVM <- predictionStats_binary(DeCorrParkisonCV$medianTest,
"LM Decorrelated One Filtered:SVM",
cex = 1.0)
LM Decorrelated One Filtered:SVM
par(op)
pander::pander(bsFilteredSVM$accc,caption="Accuracy")
| est | lower | upper |
|---|---|---|
| 0.8492 | 0.7989 | 0.891 |
pander::pander(as.data.frame(t(bsFilteredSVM$aucs)),caption="ROC AUC")
| est | lower | upper |
|---|---|---|
| 0.8216 | 0.7537 | 0.8894 |
pander::pander(bsFilteredSVM$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.8492 | 0.8056 | 0.8929 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7655 | 0.7022 | 0.8281 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7655 | 0.7022 | 0.8281 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.2345 | 0.1719 | 0.2978 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.8169 | 0.7502 | 0.8774 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7855 | 0.721 | 0.8447 |
pander::pander(bsFilteredSVM$ROC.analysis$F1)
0.9026
pander::pander(bsOneFilteredSVM$accc,caption="Accuracy")
| est | lower | upper |
|---|---|---|
| 0.8571 | 0.8078 | 0.8979 |
pander::pander(as.data.frame(t(bsOneFilteredSVM$aucs)),caption="ROC AUC")
| est | lower | upper |
|---|---|---|
| 0.8684 | 0.8132 | 0.9235 |
pander::pander(bsOneFilteredSVM$ClassMetrics)
accci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.8571 | 0.8134 | 0.8968 |
senci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7694 | 0.7029 | 0.8279 |
aucci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.7694 | 0.7029 | 0.8279 |
berci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.2306 | 0.1721 | 0.2971 |
preci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.8314 | 0.7678 | 0.8904 |
F1ci:
| 50% | 2.5% | 97.5% |
|---|---|---|
| 0.791 | 0.7252 | 0.8474 |
pander::pander(bsOneFilteredSVM$ROC.analysis$F1)
0.9082