# Two-way ANOVA of AUC on domain, Learner and their interaction, followed by
# an HSD multiple-comparison test over every domain:Learner combination.
# The results are printed to the console (console = TRUE).
# NOTE(review): `data` is built earlier in the script and must already carry
# the AUC, domain and Learner columns; HSD.test is presumably agricolae's.
data.aov <- aov(
  AUC ~ domain * Learner,
  data = data
)
HSD.test(
  data.aov,
  trt = c("domain", "Learner"),
  console = TRUE
)
##
## Study: data.aov ~ c("domain", "Learner")
##
## HSD Test for AUC
##
## Mean Square Error: 0.004226521
##
## domain:Learner, means
##
## AUC std r Min Max
## doctor:5-NN 0.6313289 0.04061558 20 0.566725 0.716901
## doctor:C4.5 0.6608827 0.04994291 20 0.557042 0.756424
## doctor:C4.5_Original 0.6255333 0.07649738 20 0.450528 0.740451
## doctor:Logistic_Regression 0.8749526 0.03683307 20 0.805282 0.952817
## doctor:MLP 0.7952378 0.02917185 20 0.742254 0.849306
## doctor:Naive_Bayes 0.8946762 0.01736294 20 0.857746 0.919792
## doctor:Random_Forest_(100) 0.8884482 0.02718574 20 0.826042 0.919894
## doctor:RBF_Network 0.7212954 0.04787920 20 0.633275 0.802083
## doctor:SVM 0.8314018 0.03015308 20 0.768750 0.888732
## hotel:5-NN 0.6266993 0.02764574 20 0.575825 0.663064
## hotel:C4.5 0.7695898 0.03000123 20 0.698900 0.805396
## hotel:C4.5_Original 0.7000094 0.03253486 20 0.645486 0.757176
## hotel:Logistic_Regression 0.7136357 0.13117911 20 0.564482 0.900622
## hotel:MLP 0.8029145 0.15776847 20 0.164497 0.904167
## hotel:Naive_Bayes 0.8889128 0.02058352 20 0.845197 0.922367
## hotel:Random_Forest_(100) 0.9107104 0.01561730 20 0.872584 0.932697
## hotel:RBF_Network 0.8136813 0.01948143 20 0.766479 0.839207
## hotel:SVM 0.8631352 0.01859478 20 0.818721 0.905584
## restraunt:5-NN 0.7650625 0.06111589 20 0.586875 0.856875
## restraunt:C4.5 0.7419689 0.06245266 20 0.601875 0.838750
## restraunt:C4.5_Original 0.6929846 0.06850049 20 0.537813 0.807500
## restraunt:Logistic_Regression 0.8563127 0.04138117 20 0.781875 0.926875
## restraunt:MLP 0.7566875 0.17477131 20 0.164375 0.910000
## restraunt:Naive_Bayes 0.9170157 0.03190490 20 0.843125 0.963750
## restraunt:Random_Forest_(100) 0.9088597 0.02632488 20 0.845938 0.945625
## restraunt:RBF_Network 0.6234220 0.05994822 20 0.493125 0.737500
## restraunt:SVM 0.8670938 0.03957199 20 0.803750 0.939375
##
## alpha: 0.05 ; Df Error: 513
## Critical Value of Studentized Range: 5.2573
##
## Honestly Significant Difference: 0.07642567
##
## Means with the same letter are not significantly different.
##
## Groups, Treatments and means
## a restraunt:Naive_Bayes 0.917
## a hotel:Random_Forest_(100) 0.9107
## a restraunt:Random_Forest_(100) 0.9089
## ab doctor:Naive_Bayes 0.8947
## abc hotel:Naive_Bayes 0.8889
## abc doctor:Random_Forest_(100) 0.8884
## abcd doctor:Logistic_Regression 0.875
## abcde restraunt:SVM 0.8671
## abcde hotel:SVM 0.8631
## abcde restraunt:Logistic_Regression 0.8563
## bcdef doctor:SVM 0.8314
## cdefg hotel:RBF_Network 0.8137
## defg hotel:MLP 0.8029
## efgh doctor:MLP 0.7952
## fghi hotel:C4.5 0.7696
## fghij restraunt:5-NN 0.7651
## fghij restraunt:MLP 0.7567
## ghij restraunt:C4.5 0.742
## hijk doctor:RBF_Network 0.7213
## ijk hotel:Logistic_Regression 0.7136
## ijkl hotel:C4.5_Original 0.7
## jklm restraunt:C4.5_Original 0.693
## klm doctor:C4.5 0.6609
## lm doctor:5-NN 0.6313
## lm hotel:5-NN 0.6267
## lm doctor:C4.5_Original 0.6255
## m restraunt:RBF_Network 0.6234
# Cross-domain results: load the test-set run, attach short domain labels for
# the dataset each model was fitted on and tested on, then list the runs
# fitted on the hotel data ordered by best F-measure (highest first).
data <- mapKeys(
  "feature-ranking-key.xls",
  "testData/newreview-wordvector-testset-output.zip",
  "output/output2.arff"
)

# ARFF file name -> short label. The label spellings are kept exactly as they
# are because they appear verbatim in the printed tables.
cross_domain_labels <- c(
  "reviewSpamData-string-hotel.arff" = "hotel",
  "reviewSpamData-string-restaurant.arff" = "restraunt",
  "reviewSpamData-string-doctor.arff" = "doctor"
)
data$trainDomain <- plyr::revalue(data$FitDataset, cross_domain_labels)
data$testDomain <- plyr::revalue(data$TestDataset, cross_domain_labels)

# Keep only the columns of interest for models trained on the hotel domain.
ddata <- data %>%
  select(trainDomain, testDomain, Learner, BestF_FMeasure, AUC) %>%
  filter(trainDomain == "hotel") %>%
  arrange(desc(BestF_FMeasure))
ddata
## trainDomain testDomain Learner BestF_FMeasure AUC
## 1 hotel restraunt Naive_Bayes 0.763326 0.790150
## 2 hotel restraunt MLP 0.725118 0.763100
## 3 hotel restraunt SVM 0.710407 0.736150
## 4 hotel restraunt Random_Forest_(100) 0.702913 0.748363
## 5 hotel restraunt RBF_Network 0.690909 0.644850
## 6 hotel restraunt C4.5 0.672727 0.583313
## 7 hotel restraunt 5-NN 0.667780 0.493263
## 8 hotel restraunt Logistic_Regression 0.666667 0.500600
## 9 hotel restraunt C4.5_Original 0.666667 0.557975
## 10 hotel doctor SVM 0.584392 0.647844
## 11 hotel doctor MLP 0.573826 0.643961
## 12 hotel doctor Naive_Bayes 0.561562 0.614242
## 13 hotel doctor Random_Forest_(100) 0.537931 0.555597
## 14 hotel doctor 5-NN 0.531544 0.485302
## 15 hotel doctor Logistic_Regression 0.529801 0.463989
## 16 hotel doctor RBF_Network 0.529501 0.538511
## 17 hotel doctor C4.5 0.529101 0.545302
## 18 hotel doctor C4.5_Original 0.529101 0.539508
# Feature-count analysis: load the within-dataset run, label each dataset,
# keep only the rows labelled "all", and test how AUC varies with Learner,
# WordsToKeep and their interaction.
data <- mapKeys(
  "feature-ranking-key.xls",
  "testData/newreview-wordvector-output 2.zip",
  "output/normal/output2.arff"
)

# ARFF file name -> short label; the noSentiment file is labelled "all".
dataset_labels <- c(
  "reviewSpamData-string-hotel.arff" = "hotel",
  "reviewSpamData-string-restaurant.arff" = "restraunt",
  "reviewSpamData-string-doctor.arff" = "doctor",
  "reviewSpamData-string-noSentiment.arff" = "all"
)
data$domain <- plyr::revalue(data$Dataset, dataset_labels)

# Explicit type coercions, left disabled.
# NOTE(review): presumably mapKeys already returns these columns in a form
# aov treats as categorical -- confirm before re-enabling.
#data$WordsToKeep <- as.character(data$WordsToKeep)
#data$domain <- as.factor(data$domain)
#data$WordsToKeep <- as.factor(data$WordsToKeep)

data <- filter(data, domain == "all")

# Two-way ANOVA with interaction, then an HSD test over each
# Learner:WordsToKeep combination, printed to the console.
data.aov <- aov(
  AUC ~ Learner * WordsToKeep,
  data = data
)
HSD.test(
  data.aov,
  trt = c("Learner", "WordsToKeep"),
  console = TRUE
)
##
## Study: data.aov ~ c("Learner", "WordsToKeep")
##
## HSD Test for AUC
##
## Mean Square Error: 0.0005161914
##
## Learner:WordsToKeep, means
##
## AUC std r Min Max
## C4.5:100 0.6653927 0.02986633 20 0.619075 0.733244
## C4.5:1000 0.7299589 0.02074435 20 0.687392 0.765609
## C4.5:1100 0.7311963 0.01978110 20 0.691488 0.767393
## C4.5:1200 0.7310596 0.01993955 20 0.689781 0.767393
## C4.5:1300 0.7305792 0.01977812 20 0.689781 0.765889
## C4.5:1400 0.7300697 0.01968751 20 0.689781 0.765889
## C4.5:1500 0.7300919 0.01987954 20 0.690284 0.765889
## C4.5:1600 0.7293202 0.01982863 20 0.690284 0.766055
## C4.5:1700 0.7293789 0.01987210 20 0.690246 0.766501
## C4.5:1800 0.7293789 0.01987210 20 0.690246 0.766501
## C4.5:1900 0.7300039 0.01981626 20 0.689456 0.766501
## C4.5:200 0.6951332 0.02279499 20 0.651733 0.739183
## C4.5:2000 0.7300039 0.01981626 20 0.689456 0.766501
## C4.5:300 0.7081515 0.01741704 20 0.679543 0.740991
## C4.5:400 0.7196797 0.02126731 20 0.690488 0.762627
## C4.5:500 0.7263006 0.02156981 20 0.691030 0.771279
## C4.5:600 0.7262031 0.01845421 20 0.698541 0.764768
## C4.5:700 0.7300733 0.02255201 20 0.686557 0.765431
## C4.5:800 0.7285308 0.02210812 20 0.685538 0.765233
## C4.5:900 0.7307915 0.02002356 20 0.691393 0.765233
## Logistic_Regression:100 0.7735741 0.01589683 20 0.735894 0.814156
## Logistic_Regression:1000 0.7364933 0.02241736 20 0.691692 0.795292
## Logistic_Regression:1100 0.7114746 0.02011643 20 0.669423 0.744623
## Logistic_Regression:1200 0.6571497 0.02253972 20 0.608232 0.694196
## Logistic_Regression:1300 0.6186704 0.02420277 20 0.580297 0.685277
## Logistic_Regression:1400 0.6019425 0.02162649 20 0.554122 0.640182
## Logistic_Regression:1500 0.5962591 0.02100249 20 0.545700 0.629173
## Logistic_Regression:1600 0.5938256 0.01803604 20 0.571305 0.641902
## Logistic_Regression:1700 0.6016516 0.03233980 20 0.533180 0.650720
## Logistic_Regression:1800 0.5998516 0.03153286 20 0.533180 0.654339
## Logistic_Regression:1900 0.5899047 0.03905139 20 0.493502 0.648742
## Logistic_Regression:200 0.8252570 0.01733961 20 0.784021 0.850306
## Logistic_Regression:2000 0.6117388 0.07072708 20 0.538542 0.872687
## Logistic_Regression:300 0.8455820 0.01403629 20 0.807238 0.864093
## Logistic_Regression:400 0.8409437 0.01342741 20 0.805632 0.860449
## Logistic_Regression:500 0.7937861 0.01519020 20 0.759735 0.817081
## Logistic_Regression:600 0.7809333 0.01730153 20 0.747426 0.807711
## Logistic_Regression:700 0.7697041 0.01998184 20 0.721063 0.799573
## Logistic_Regression:800 0.7509569 0.01938067 20 0.717151 0.780262
## Logistic_Regression:900 0.7344063 0.02358776 20 0.687360 0.785060
## Naive_Bayes:100 0.7426301 0.02823651 20 0.674261 0.793578
## Naive_Bayes:1000 0.8261807 0.02630069 20 0.762525 0.876707
## Naive_Bayes:1100 0.8266173 0.02625750 20 0.763214 0.877090
## Naive_Bayes:1200 0.8268743 0.02631298 20 0.762895 0.877383
## Naive_Bayes:1300 0.8272493 0.02632090 20 0.763163 0.877714
## Naive_Bayes:1400 0.8274357 0.02637318 20 0.763220 0.877854
## Naive_Bayes:1500 0.8276653 0.02635740 20 0.763239 0.877982
## Naive_Bayes:1600 0.8279321 0.02629711 20 0.763609 0.877982
## Naive_Bayes:1700 0.8281666 0.02638746 20 0.763660 0.878198
## Naive_Bayes:1800 0.8282318 0.02635944 20 0.763940 0.878338
## Naive_Bayes:1900 0.8284149 0.02633759 20 0.764112 0.878389
## Naive_Bayes:200 0.7864800 0.02667158 20 0.715902 0.828874
## Naive_Bayes:2000 0.8284967 0.02632915 20 0.764271 0.878593
## Naive_Bayes:300 0.8118838 0.02601424 20 0.742495 0.857480
## Naive_Bayes:400 0.8207689 0.02493250 20 0.760780 0.863749
## Naive_Bayes:500 0.8208435 0.02708785 20 0.756167 0.873586
## Naive_Bayes:600 0.8228372 0.02656817 20 0.759964 0.874860
## Naive_Bayes:700 0.8242331 0.02638465 20 0.760614 0.875217
## Naive_Bayes:800 0.8247463 0.02621734 20 0.761366 0.875765
## Naive_Bayes:900 0.8254480 0.02634730 20 0.761557 0.876032
## SVM:100 0.7724263 0.01725180 20 0.735410 0.817036
## SVM:1000 0.8329224 0.01429270 20 0.807480 0.865201
## SVM:1100 0.8395938 0.01234784 20 0.819355 0.870247
## SVM:1200 0.8433699 0.01253048 20 0.823496 0.870865
## SVM:1300 0.8465466 0.01381359 20 0.824758 0.876102
## SVM:1400 0.8518777 0.01354587 20 0.827472 0.876854
## SVM:1500 0.8536197 0.01339570 20 0.833601 0.884703
## SVM:1600 0.8563945 0.01445342 20 0.835270 0.884314
## SVM:1700 0.8571042 0.01450558 20 0.832862 0.880836
## SVM:1800 0.8570707 0.01428386 20 0.832862 0.880836
## SVM:1900 0.8576928 0.01503530 20 0.837258 0.880594
## SVM:200 0.8170338 0.01788749 20 0.777051 0.848662
## SVM:2000 0.8586635 0.01450109 20 0.833945 0.880632
## SVM:300 0.8422479 0.01392605 20 0.804995 0.860321
## SVM:400 0.8376615 0.01381645 20 0.805110 0.857518
## SVM:500 0.8275756 0.01041695 20 0.797452 0.840571
## SVM:600 0.8288414 0.01271284 20 0.793998 0.849809
## SVM:700 0.8248404 0.01403390 20 0.790635 0.842391
## SVM:800 0.8256575 0.01330207 20 0.795674 0.846678
## SVM:900 0.8298854 0.01560775 20 0.796923 0.863513
##
## alpha: 0.05 ; Df Error: 1520
## Critical Value of Studentized Range: 5.960946
##
## Honestly Significant Difference: 0.03028347
##
## Means with the same letter are not significantly different.
##
## Groups, Treatments and means
## a SVM:2000 0.8587
## ab SVM:1900 0.8577
## abc SVM:1700 0.8571
## abc SVM:1800 0.8571
## abcd SVM:1600 0.8564
## abcde SVM:1500 0.8536
## abcdef SVM:1400 0.8519
## abcdefg SVM:1300 0.8465
## abcdefg Logistic_Regression:300 0.8456
## abcdefg SVM:1200 0.8434
## abcdefg SVM:300 0.8422
## abcdefgh Logistic_Regression:400 0.8409
## abcdefgh SVM:1100 0.8396
## abcdefgh SVM:400 0.8377
## abcdefgh SVM:1000 0.8329
## abcdefgh SVM:900 0.8299
## abcdefgh SVM:600 0.8288
## abcdefgh Naive_Bayes:2000 0.8285
## abcdefgh Naive_Bayes:1900 0.8284
## bcdefgh Naive_Bayes:1800 0.8282
## bcdefgh Naive_Bayes:1700 0.8282
## bcdefgh Naive_Bayes:1600 0.8279
## bcdefgh Naive_Bayes:1500 0.8277
## bcdefgh SVM:500 0.8276
## bcdefgh Naive_Bayes:1400 0.8274
## cdefgh Naive_Bayes:1300 0.8272
## cdefgh Naive_Bayes:1200 0.8269
## defgh Naive_Bayes:1100 0.8266
## defgh Naive_Bayes:1000 0.8262
## efgh SVM:800 0.8257
## efgh Naive_Bayes:900 0.8254
## efgh Logistic_Regression:200 0.8253
## efgh SVM:700 0.8248
## efgh Naive_Bayes:800 0.8247
## efgh Naive_Bayes:700 0.8242
## fghi Naive_Bayes:600 0.8228
## ghi Naive_Bayes:500 0.8208
## ghi Naive_Bayes:400 0.8208
## ghi SVM:200 0.817
## hij Naive_Bayes:300 0.8119
## ijk Logistic_Regression:500 0.7938
## jk Naive_Bayes:200 0.7865
## kl Logistic_Regression:600 0.7809
## kl Logistic_Regression:100 0.7736
## klm SVM:100 0.7724
## klm Logistic_Regression:700 0.7697
## lmn Logistic_Regression:800 0.751
## mno Naive_Bayes:100 0.7426
## nop Logistic_Regression:1000 0.7365
## nop Logistic_Regression:900 0.7344
## nop C4.5:1100 0.7312
## nop C4.5:1200 0.7311
## nop C4.5:900 0.7308
## nop C4.5:1300 0.7306
## nop C4.5:1500 0.7301
## nop C4.5:700 0.7301
## nop C4.5:1400 0.7301
## nop C4.5:1900 0.73
## nop C4.5:2000 0.73
## nop C4.5:1000 0.73
## nop C4.5:1700 0.7294
## nop C4.5:1800 0.7294
## nop C4.5:1600 0.7293
## nop C4.5:800 0.7285
## nop C4.5:500 0.7263
## nop C4.5:600 0.7262
## opq C4.5:400 0.7197
## pq Logistic_Regression:1100 0.7115
## pq C4.5:300 0.7082
## qr C4.5:200 0.6951
## rs C4.5:100 0.6654
## s Logistic_Regression:1200 0.6571
## t Logistic_Regression:1300 0.6187
## t Logistic_Regression:2000 0.6117
## t Logistic_Regression:1400 0.6019
## t Logistic_Regression:1700 0.6017
## t Logistic_Regression:1800 0.5999
## t Logistic_Regression:1500 0.5963
## t Logistic_Regression:1600 0.5938
## t Logistic_Regression:1900 0.5899