Review Spam Results Analysis

# Fit a two-way ANOVA of AUC on domain, Learner, and their interaction,
# then run the HSD comparison over every domain:Learner combination.
# `data` is expected to already exist in the session (built earlier).
data.aov <- aov(formula = AUC ~ domain * Learner, data = data)
HSD.test(
  data.aov,
  trt = c("domain", "Learner"),
  console = TRUE  # print the means table and letter groupings
)

## 
## Study: data.aov ~ c("domain", "Learner")
## 
## HSD Test for AUC 
## 
## Mean Square Error:  0.004226521 
## 
## domain:Learner,  means
## 
##                                     AUC        std  r      Min      Max
## doctor:5-NN                   0.6313289 0.04061558 20 0.566725 0.716901
## doctor:C4.5                   0.6608827 0.04994291 20 0.557042 0.756424
## doctor:C4.5_Original          0.6255333 0.07649738 20 0.450528 0.740451
## doctor:Logistic_Regression    0.8749526 0.03683307 20 0.805282 0.952817
## doctor:MLP                    0.7952378 0.02917185 20 0.742254 0.849306
## doctor:Naive_Bayes            0.8946762 0.01736294 20 0.857746 0.919792
## doctor:Random_Forest_(100)    0.8884482 0.02718574 20 0.826042 0.919894
## doctor:RBF_Network            0.7212954 0.04787920 20 0.633275 0.802083
## doctor:SVM                    0.8314018 0.03015308 20 0.768750 0.888732
## hotel:5-NN                    0.6266993 0.02764574 20 0.575825 0.663064
## hotel:C4.5                    0.7695898 0.03000123 20 0.698900 0.805396
## hotel:C4.5_Original           0.7000094 0.03253486 20 0.645486 0.757176
## hotel:Logistic_Regression     0.7136357 0.13117911 20 0.564482 0.900622
## hotel:MLP                     0.8029145 0.15776847 20 0.164497 0.904167
## hotel:Naive_Bayes             0.8889128 0.02058352 20 0.845197 0.922367
## hotel:Random_Forest_(100)     0.9107104 0.01561730 20 0.872584 0.932697
## hotel:RBF_Network             0.8136813 0.01948143 20 0.766479 0.839207
## hotel:SVM                     0.8631352 0.01859478 20 0.818721 0.905584
## restraunt:5-NN                0.7650625 0.06111589 20 0.586875 0.856875
## restraunt:C4.5                0.7419689 0.06245266 20 0.601875 0.838750
## restraunt:C4.5_Original       0.6929846 0.06850049 20 0.537813 0.807500
## restraunt:Logistic_Regression 0.8563127 0.04138117 20 0.781875 0.926875
## restraunt:MLP                 0.7566875 0.17477131 20 0.164375 0.910000
## restraunt:Naive_Bayes         0.9170157 0.03190490 20 0.843125 0.963750
## restraunt:Random_Forest_(100) 0.9088597 0.02632488 20 0.845938 0.945625
## restraunt:RBF_Network         0.6234220 0.05994822 20 0.493125 0.737500
## restraunt:SVM                 0.8670938 0.03957199 20 0.803750 0.939375
## 
## alpha: 0.05 ; Df Error: 513 
## Critical Value of Studentized Range: 5.2573 
## 
## Honestly Significant Difference: 0.07642567 
## 
## Means with the same letter are not significantly different.
## 
## Groups, Treatments and means
## a     restraunt:Naive_Bayes           0.917 
## a     hotel:Random_Forest_(100)       0.9107 
## a     restraunt:Random_Forest_(100)   0.9089 
## ab    doctor:Naive_Bayes              0.8947 
## abc   hotel:Naive_Bayes               0.8889 
## abc   doctor:Random_Forest_(100)      0.8884 
## abcd      doctor:Logistic_Regression      0.875 
## abcde     restraunt:SVM                   0.8671 
## abcde     hotel:SVM                       0.8631 
## abcde     restraunt:Logistic_Regression   0.8563 
## bcdef     doctor:SVM                      0.8314 
## cdefg     hotel:RBF_Network               0.8137 
## defg      hotel:MLP                       0.8029 
## efgh      doctor:MLP                      0.7952 
## fghi      hotel:C4.5                      0.7696 
## fghij     restraunt:5-NN                  0.7651 
## fghij     restraunt:MLP                   0.7567 
## ghij      restraunt:C4.5                  0.742 
## hijk      doctor:RBF_Network              0.7213 
## ijk   hotel:Logistic_Regression       0.7136 
## ijkl      hotel:C4.5_Original             0.7 
## jklm      restraunt:C4.5_Original         0.693 
## klm   doctor:C4.5                     0.6609 
## lm    doctor:5-NN                     0.6313 
## lm    hotel:5-NN                      0.6267 
## lm    doctor:C4.5_Original            0.6255 
## m     restraunt:RBF_Network           0.6234

# Cross-domain experiment: load the results, label the training and test
# datasets with short domain names, and list the hotel-trained runs ordered
# by best F-measure (highest first).
data <- mapKeys(
  "feature-ranking-key.xls",
  "testData/newreview-wordvector-testset-output.zip",
  "output/output2.arff"
)

# Short label for the dataset each model was fitted on.
# NOTE: the "restraunt" spelling is the label used throughout this report's
# output; it is kept unchanged so results stay comparable.
data$trainDomain <- plyr::revalue(
  data$FitDataset,
  c(
    "reviewSpamData-string-hotel.arff" = "hotel",
    "reviewSpamData-string-restaurant.arff" = "restraunt",
    "reviewSpamData-string-doctor.arff" = "doctor"
  )
)

# Short label for the dataset each model was evaluated on.
data$testDomain <- plyr::revalue(
  data$TestDataset,
  c(
    "reviewSpamData-string-hotel.arff" = "hotel",
    "reviewSpamData-string-restaurant.arff" = "restraunt",
    "reviewSpamData-string-doctor.arff" = "doctor"
  )
)

# Keep only the columns of interest, restrict to models trained on the hotel
# domain, and sort by descending best F-measure.
ddata <- data %>%
  select(trainDomain, testDomain, Learner, BestF_FMeasure, AUC) %>%
  filter(trainDomain == "hotel") %>%
  arrange(desc(BestF_FMeasure))
ddata

##    trainDomain testDomain             Learner BestF_FMeasure      AUC
## 1        hotel  restraunt         Naive_Bayes       0.763326 0.790150
## 2        hotel  restraunt                 MLP       0.725118 0.763100
## 3        hotel  restraunt                 SVM       0.710407 0.736150
## 4        hotel  restraunt Random_Forest_(100)       0.702913 0.748363
## 5        hotel  restraunt         RBF_Network       0.690909 0.644850
## 6        hotel  restraunt                C4.5       0.672727 0.583313
## 7        hotel  restraunt                5-NN       0.667780 0.493263
## 8        hotel  restraunt Logistic_Regression       0.666667 0.500600
## 9        hotel  restraunt       C4.5_Original       0.666667 0.557975
## 10       hotel     doctor                 SVM       0.584392 0.647844
## 11       hotel     doctor                 MLP       0.573826 0.643961
## 12       hotel     doctor         Naive_Bayes       0.561562 0.614242
## 13       hotel     doctor Random_Forest_(100)       0.537931 0.555597
## 14       hotel     doctor                5-NN       0.531544 0.485302
## 15       hotel     doctor Logistic_Regression       0.529801 0.463989
## 16       hotel     doctor         RBF_Network       0.529501 0.538511
## 17       hotel     doctor                C4.5       0.529101 0.545302
## 18       hotel     doctor       C4.5_Original       0.529101 0.539508

# Feature-count experiment: load the results, label each dataset with a short
# domain name, keep only the combined ("all") dataset, and compare every
# Learner:WordsToKeep combination with an ANOVA followed by the HSD test.
data <- mapKeys(
  "feature-ranking-key.xls",
  "testData/newreview-wordvector-output 2.zip",
  "output/normal/output2.arff"
)

# Short domain label derived from the source ARFF file name.
data$domain <- plyr::revalue(
  data$Dataset,
  c(
    "reviewSpamData-string-hotel.arff" = "hotel",
    "reviewSpamData-string-restaurant.arff" = "restraunt",
    "reviewSpamData-string-doctor.arff" = "doctor",
    "reviewSpamData-string-noSentiment.arff" = "all"
  )
)

# Disabled type conversions, retained for reference.
# NOTE(review): presumably unnecessary because WordsToKeep already arrives as
# a categorical column -- confirm against mapKeys.
# data$WordsToKeep <- as.character(data$WordsToKeep)
# data$domain <- as.factor(data$domain)
# data$WordsToKeep <- as.factor(data$WordsToKeep)

# Restrict the analysis to the combined dataset.
data <- filter(data, domain == "all")

data.aov <- aov(formula = AUC ~ Learner * WordsToKeep, data = data)
HSD.test(
  data.aov,
  trt = c("Learner", "WordsToKeep"),
  console = TRUE  # print the means table and letter groupings
)

## 
## Study: data.aov ~ c("Learner", "WordsToKeep")
## 
## HSD Test for AUC 
## 
## Mean Square Error:  0.0005161914 
## 
## Learner:WordsToKeep,  means
## 
##                                AUC        std  r      Min      Max
## C4.5:100                 0.6653927 0.02986633 20 0.619075 0.733244
## C4.5:1000                0.7299589 0.02074435 20 0.687392 0.765609
## C4.5:1100                0.7311963 0.01978110 20 0.691488 0.767393
## C4.5:1200                0.7310596 0.01993955 20 0.689781 0.767393
## C4.5:1300                0.7305792 0.01977812 20 0.689781 0.765889
## C4.5:1400                0.7300697 0.01968751 20 0.689781 0.765889
## C4.5:1500                0.7300919 0.01987954 20 0.690284 0.765889
## C4.5:1600                0.7293202 0.01982863 20 0.690284 0.766055
## C4.5:1700                0.7293789 0.01987210 20 0.690246 0.766501
## C4.5:1800                0.7293789 0.01987210 20 0.690246 0.766501
## C4.5:1900                0.7300039 0.01981626 20 0.689456 0.766501
## C4.5:200                 0.6951332 0.02279499 20 0.651733 0.739183
## C4.5:2000                0.7300039 0.01981626 20 0.689456 0.766501
## C4.5:300                 0.7081515 0.01741704 20 0.679543 0.740991
## C4.5:400                 0.7196797 0.02126731 20 0.690488 0.762627
## C4.5:500                 0.7263006 0.02156981 20 0.691030 0.771279
## C4.5:600                 0.7262031 0.01845421 20 0.698541 0.764768
## C4.5:700                 0.7300733 0.02255201 20 0.686557 0.765431
## C4.5:800                 0.7285308 0.02210812 20 0.685538 0.765233
## C4.5:900                 0.7307915 0.02002356 20 0.691393 0.765233
## Logistic_Regression:100  0.7735741 0.01589683 20 0.735894 0.814156
## Logistic_Regression:1000 0.7364933 0.02241736 20 0.691692 0.795292
## Logistic_Regression:1100 0.7114746 0.02011643 20 0.669423 0.744623
## Logistic_Regression:1200 0.6571497 0.02253972 20 0.608232 0.694196
## Logistic_Regression:1300 0.6186704 0.02420277 20 0.580297 0.685277
## Logistic_Regression:1400 0.6019425 0.02162649 20 0.554122 0.640182
## Logistic_Regression:1500 0.5962591 0.02100249 20 0.545700 0.629173
## Logistic_Regression:1600 0.5938256 0.01803604 20 0.571305 0.641902
## Logistic_Regression:1700 0.6016516 0.03233980 20 0.533180 0.650720
## Logistic_Regression:1800 0.5998516 0.03153286 20 0.533180 0.654339
## Logistic_Regression:1900 0.5899047 0.03905139 20 0.493502 0.648742
## Logistic_Regression:200  0.8252570 0.01733961 20 0.784021 0.850306
## Logistic_Regression:2000 0.6117388 0.07072708 20 0.538542 0.872687
## Logistic_Regression:300  0.8455820 0.01403629 20 0.807238 0.864093
## Logistic_Regression:400  0.8409437 0.01342741 20 0.805632 0.860449
## Logistic_Regression:500  0.7937861 0.01519020 20 0.759735 0.817081
## Logistic_Regression:600  0.7809333 0.01730153 20 0.747426 0.807711
## Logistic_Regression:700  0.7697041 0.01998184 20 0.721063 0.799573
## Logistic_Regression:800  0.7509569 0.01938067 20 0.717151 0.780262
## Logistic_Regression:900  0.7344063 0.02358776 20 0.687360 0.785060
## Naive_Bayes:100          0.7426301 0.02823651 20 0.674261 0.793578
## Naive_Bayes:1000         0.8261807 0.02630069 20 0.762525 0.876707
## Naive_Bayes:1100         0.8266173 0.02625750 20 0.763214 0.877090
## Naive_Bayes:1200         0.8268743 0.02631298 20 0.762895 0.877383
## Naive_Bayes:1300         0.8272493 0.02632090 20 0.763163 0.877714
## Naive_Bayes:1400         0.8274357 0.02637318 20 0.763220 0.877854
## Naive_Bayes:1500         0.8276653 0.02635740 20 0.763239 0.877982
## Naive_Bayes:1600         0.8279321 0.02629711 20 0.763609 0.877982
## Naive_Bayes:1700         0.8281666 0.02638746 20 0.763660 0.878198
## Naive_Bayes:1800         0.8282318 0.02635944 20 0.763940 0.878338
## Naive_Bayes:1900         0.8284149 0.02633759 20 0.764112 0.878389
## Naive_Bayes:200          0.7864800 0.02667158 20 0.715902 0.828874
## Naive_Bayes:2000         0.8284967 0.02632915 20 0.764271 0.878593
## Naive_Bayes:300          0.8118838 0.02601424 20 0.742495 0.857480
## Naive_Bayes:400          0.8207689 0.02493250 20 0.760780 0.863749
## Naive_Bayes:500          0.8208435 0.02708785 20 0.756167 0.873586
## Naive_Bayes:600          0.8228372 0.02656817 20 0.759964 0.874860
## Naive_Bayes:700          0.8242331 0.02638465 20 0.760614 0.875217
## Naive_Bayes:800          0.8247463 0.02621734 20 0.761366 0.875765
## Naive_Bayes:900          0.8254480 0.02634730 20 0.761557 0.876032
## SVM:100                  0.7724263 0.01725180 20 0.735410 0.817036
## SVM:1000                 0.8329224 0.01429270 20 0.807480 0.865201
## SVM:1100                 0.8395938 0.01234784 20 0.819355 0.870247
## SVM:1200                 0.8433699 0.01253048 20 0.823496 0.870865
## SVM:1300                 0.8465466 0.01381359 20 0.824758 0.876102
## SVM:1400                 0.8518777 0.01354587 20 0.827472 0.876854
## SVM:1500                 0.8536197 0.01339570 20 0.833601 0.884703
## SVM:1600                 0.8563945 0.01445342 20 0.835270 0.884314
## SVM:1700                 0.8571042 0.01450558 20 0.832862 0.880836
## SVM:1800                 0.8570707 0.01428386 20 0.832862 0.880836
## SVM:1900                 0.8576928 0.01503530 20 0.837258 0.880594
## SVM:200                  0.8170338 0.01788749 20 0.777051 0.848662
## SVM:2000                 0.8586635 0.01450109 20 0.833945 0.880632
## SVM:300                  0.8422479 0.01392605 20 0.804995 0.860321
## SVM:400                  0.8376615 0.01381645 20 0.805110 0.857518
## SVM:500                  0.8275756 0.01041695 20 0.797452 0.840571
## SVM:600                  0.8288414 0.01271284 20 0.793998 0.849809
## SVM:700                  0.8248404 0.01403390 20 0.790635 0.842391
## SVM:800                  0.8256575 0.01330207 20 0.795674 0.846678
## SVM:900                  0.8298854 0.01560775 20 0.796923 0.863513
## 
## alpha: 0.05 ; Df Error: 1520 
## Critical Value of Studentized Range: 5.960946 
## 
## Honestly Significant Difference: 0.03028347 
## 
## Means with the same letter are not significantly different.
## 
## Groups, Treatments and means
## a     SVM:2000                    0.8587 
## ab    SVM:1900                    0.8577 
## abc   SVM:1700                    0.8571 
## abc   SVM:1800                    0.8571 
## abcd      SVM:1600                    0.8564 
## abcde     SVM:1500                    0.8536 
## abcdef    SVM:1400                    0.8519 
## abcdefg   SVM:1300                    0.8465 
## abcdefg   Logistic_Regression:300     0.8456 
## abcdefg   SVM:1200                    0.8434 
## abcdefg   SVM:300                     0.8422 
## abcdefgh      Logistic_Regression:400     0.8409 
## abcdefgh      SVM:1100                    0.8396 
## abcdefgh      SVM:400                     0.8377 
## abcdefgh      SVM:1000                    0.8329 
## abcdefgh      SVM:900                     0.8299 
## abcdefgh      SVM:600                     0.8288 
## abcdefgh      Naive_Bayes:2000            0.8285 
## abcdefgh      Naive_Bayes:1900            0.8284 
## bcdefgh   Naive_Bayes:1800            0.8282 
## bcdefgh   Naive_Bayes:1700            0.8282 
## bcdefgh   Naive_Bayes:1600            0.8279 
## bcdefgh   Naive_Bayes:1500            0.8277 
## bcdefgh   SVM:500                     0.8276 
## bcdefgh   Naive_Bayes:1400            0.8274 
## cdefgh    Naive_Bayes:1300            0.8272 
## cdefgh    Naive_Bayes:1200            0.8269 
## defgh     Naive_Bayes:1100            0.8266 
## defgh     Naive_Bayes:1000            0.8262 
## efgh      SVM:800                     0.8257 
## efgh      Naive_Bayes:900             0.8254 
## efgh      Logistic_Regression:200     0.8253 
## efgh      SVM:700                     0.8248 
## efgh      Naive_Bayes:800             0.8247 
## efgh      Naive_Bayes:700             0.8242 
## fghi      Naive_Bayes:600             0.8228 
## ghi   Naive_Bayes:500             0.8208 
## ghi   Naive_Bayes:400             0.8208 
## ghi   SVM:200                     0.817 
## hij   Naive_Bayes:300             0.8119 
## ijk   Logistic_Regression:500     0.7938 
## jk    Naive_Bayes:200             0.7865 
## kl    Logistic_Regression:600     0.7809 
## kl    Logistic_Regression:100     0.7736 
## klm   SVM:100                     0.7724 
## klm   Logistic_Regression:700     0.7697 
## lmn   Logistic_Regression:800     0.751 
## mno   Naive_Bayes:100             0.7426 
## nop   Logistic_Regression:1000    0.7365 
## nop   Logistic_Regression:900     0.7344 
## nop   C4.5:1100                   0.7312 
## nop   C4.5:1200                   0.7311 
## nop   C4.5:900                    0.7308 
## nop   C4.5:1300                   0.7306 
## nop   C4.5:1500                   0.7301 
## nop   C4.5:700                    0.7301 
## nop   C4.5:1400                   0.7301 
## nop   C4.5:1900                   0.73 
## nop   C4.5:2000                   0.73 
## nop   C4.5:1000                   0.73 
## nop   C4.5:1700                   0.7294 
## nop   C4.5:1800                   0.7294 
## nop   C4.5:1600                   0.7293 
## nop   C4.5:800                    0.7285 
## nop   C4.5:500                    0.7263 
## nop   C4.5:600                    0.7262 
## opq   C4.5:400                    0.7197 
## pq    Logistic_Regression:1100    0.7115 
## pq    C4.5:300                    0.7082 
## qr    C4.5:200                    0.6951 
## rs    C4.5:100                    0.6654 
## s     Logistic_Regression:1200    0.6571 
## t     Logistic_Regression:1300    0.6187 
## t     Logistic_Regression:2000    0.6117 
## t     Logistic_Regression:1400    0.6019 
## t     Logistic_Regression:1700    0.6017 
## t     Logistic_Regression:1800    0.5999 
## t     Logistic_Regression:1500    0.5963 
## t     Logistic_Regression:1600    0.5938 
## t     Logistic_Regression:1900    0.5899

Review Spam Results Analysis

Michael Crawford

September 13, 2015