imbalance: Oversampling Algorithms for Imbalanced Classification in R

imbalance and Dataset

The newthyroid1 dataset, which ships with the imbalance package, is used for the examples.

library(imbalance)
# Preview the first 10 rows of the example dataset. The recorded output shows
# five numeric attribute columns plus a factor column named Class.
head(newthyroid1, 10)
## # A tibble: 10 x 6
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 105 7.3 1.5 1.5 -0.1 negative
## 2 67 23.3 7.4 1.8 -0.6 positive
## 3 111 8.4 1.5 0.8 1.2 negative
## 4 89 14.3 4.1 0.5 0.2 positive
## 5 105 9.5 1.8 1.6 3.6 negative
## 6 110 20.3 3.7 0.6 0.2 positive
## 7 84 21.5 2.7 1.1 -0.6 positive
## 8 113 11.1 1.7 0.8 2.3 negative
## 9 97 7.8 1.3 1.2 0.9 negative
## 10 106 13.4 3 1.1 0 positive
# Count the instances per class. In the recorded output "positive" is the
# minority class (35 rows against 180 "negative" rows).
table(newthyroid1$Class)
##
## negative positive
## 180 35
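You can compute the imbalance ratio of the dataset (here 35 / 180, the positive count divided by the negative count) with imbalanceRatio: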
# Imbalance ratio of the dataset. The recorded value 0.1944444 equals 35 / 180,
# i.e. the positive count divided by the negative count of newthyroid1$Class.
imbalanceRatio(newthyroid1)
## [1] 0.1944444
# Ask mwmote() for 100 synthetic instances. The recorded result is a 100-row
# tibble with the same six columns as newthyroid1; every visible row is
# labelled "positive".
# NOTE(review): no set.seed() call is visible, so a re-run will presumably
# produce different synthetic rows than the recorded output -- confirm.
newMWMOTE <- mwmote(newthyroid1, numInstances = 100)
newMWMOTE
## # A tibble: 100 x 6
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 99 14.6 2.98 0.955 0.0457 positive
## 2 96 13.3 3.73 0.648 -0.0221 positive
## 3 90 22.2 5.13 0.530 0.0704 positive
## 4 87 20.8 3.94 1.16 -0.0625 positive
## 5 109 15.9 2.06 0.935 -0.112 positive
## 6 91 14.2 3.92 0.596 0.168 positive
## 7 96 16.5 5.53 1.1 -0.112 positive
## 8 106 13.4 3 1.1 0 positive
## 9 105 17.6 2.64 0.977 0.0684 positive
## 10 73 14.2 6.39 0.462 -0.0892 positive
## # ... with 90 more rows
# Compare the original data with the original data plus the synthetic rows.
# attrs receives the first three column names of newthyroid1 (presumably the
# attributes that get plotted -- confirm against the plotComparison docs).
plotComparison(newthyroid1, rbind(newthyroid1, newMWMOTE), attrs = names(newthyroid1)[1:3])
# Ask racog() for 100 synthetic instances. The recorded result is a 100-row
# tibble; every visible row is labelled "positive". Visible rows 2 and 7 are
# identical to rows 2 and 10 of the original data preview.
# NOTE(review): no set.seed() call is visible, so the recorded rows are
# presumably not reproducible on a re-run -- confirm.
newRACOG <- racog(newthyroid1, numInstances = 100)
newRACOG
## # A tibble: 100 x 6
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 65 25.3 5.8 0.5 0.1 positive
## 2 67 23.3 7.4 1.8 -0.6 positive
## 3 84 11.1 2.7 1.8 -0.6 positive
## 4 98 16.7 4.3 0.6 0.2 positive
## 5 89 21.8 7.1 0.5 0.2 positive
## 6 105 22.3 3.3 1.1 0 positive
## 7 106 13.4 3 1.1 0 positive
## 8 139 16.4 3.8 0.7 -0.2 positive
## 9 97 17.2 1.8 1.2 -0.2 positive
## 10 111 16 2.1 0.7 -0.1 positive
## # ... with 90 more rows
# Compare the original data with the original data plus the RACOG rows, passing
# the first three column names of newthyroid1 as attrs.
plotComparison(newthyroid1, rbind(newthyroid1, newRACOG), attrs = names(newthyroid1)[1:3])
# Ask rwo() for 100 synthetic instances. The recorded result is a 100-row
# tibble; every visible row is labelled "positive". The visible T3resin values
# (e.g. 193 and 14) spread well beyond those in the original data preview.
# NOTE(review): no set.seed() call is visible, so the recorded rows are
# presumably not reproducible on a re-run -- confirm.
newRWO <- rwo(newthyroid1, numInstances = 100)
newRWO
## # A tibble: 100 x 6
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 193 17.6 6.13 1.69 0.190 positive
## 2 125 12.0 2.60 0.106 0.191 positive
## 3 159 21.3 4.81 0.916 0.297 positive
## 4 14 14.2 1.99 1.23 -0.198 positive
## 5 60 22.7 2.09 1.37 -0.497 positive
## 6 93 10.5 1.69 0.701 -0.207 positive
## 7 38 25.0 5.60 0.920 0.302 positive
## 8 117 13.1 3.36 0.493 0.207 positive
## 9 93 13.6 1.59 1.46 0.296 positive
## 10 146 16.7 1.05 0.334 0.406 positive
## # ... with 90 more rows
# Compare the original data with the original data plus the RWO rows, passing
# the first three column names of newthyroid1 as attrs.
plotComparison(newthyroid1, rbind(newthyroid1, newRWO), attrs = names(newthyroid1)[1:3])
# Ask pdfos() for 100 synthetic instances. The recorded result is a 100-row
# tibble; every visible row is labelled "positive". Note that the recorded rows
# include negative Triiodothyronine and Thyroidstimulating values (rows 4, 7
# and 10), whereas the original data preview shows only positive values in
# those columns.
# NOTE(review): no set.seed() call is visible, so the recorded rows are
# presumably not reproducible on a re-run -- confirm.
newPDFOS <- pdfos(newthyroid1, numInstances = 100)
newPDFOS
## # A tibble: 100 x 6
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 155 20.1 1.64 1.10 0.660 positive
## 2 96 12.3 3.07 0.286 -0.218 positive
## 3 105 9.15 3.92 1.72 -0.317 positive
## 4 98 10.3 3.83 -0.302 0.760 positive
## 5 91 19.4 4.40 0.398 0.110 positive
## 6 66 15.5 4.27 0.0874 0.103 positive
## 7 106 9.93 -3.03 0.816 0.0440 positive
## 8 98 14.0 4.98 0.765 -0.0553 positive
## 9 98 11.8 3.89 1.30 0.156 positive
## 10 100 14.7 -0.169 0.890 0.526 positive
## # ... with 90 more rows
# Compare the original data with the original data plus the PDFOS rows, passing
# the first three column names of newthyroid1 as attrs.
plotComparison(newthyroid1, rbind(newthyroid1, newPDFOS), attrs = names(newthyroid1)[1:3])
# Pass the PDFOS-generated samples through neater() with iterations = 500. The
# recorded message reports that 21 samples were filtered in that run.
filtered <- neater(newthyroid1, newSamples = newPDFOS, iterations = 500)
## [1] "21 samples filtered by NEATER"
# Compare the original data with the original data plus the rows returned by
# neater(), passing the first three column names of newthyroid1 as attrs.
plotComparison(newthyroid1, rbind(newthyroid1, filtered), attrs = names(newthyroid1)[1:3])
oversample

imbalance includes the function oversample, a wrapper that eases calls to the methods described above and to the already existing ones. Possible methods are: RACOG, wRACOG, PDFOS, RWO, ADASYN, ANSMOTE, SMOTE, MWMOTE, BLSMOTE, DBSMOTE, SLMOTE, RSLSMOTE.

filtered2 <- oversample(newthyroid1, ratio = 1, method = "PDFOS", filtering = TRUE, iterations = 500)
## [1] "24 samples filtered by NEATER"
# Show the first 20 rows of filtered2. The recorded rows carry both "negative"
# and "positive" labels, and the first 10 match the original data preview, so
# the result presumably contains the original rows followed by the new ones.
head(filtered2, 20)
## # A tibble: 20 x 6
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 105 7.3 1.5 1.5 -0.1 negative
## 2 67 23.3 7.4 1.8 -0.6 positive
## 3 111 8.4 1.5 0.8 1.2 negative
## 4 89 14.3 4.1 0.5 0.2 positive
## 5 105 9.5 1.8 1.6 3.6 negative
## 6 110 20.3 3.7 0.6 0.2 positive
## 7 84 21.5 2.7 1.1 -0.6 positive
## 8 113 11.1 1.7 0.8 2.3 negative
## 9 97 7.8 1.3 1.2 0.9 negative
## 10 106 13.4 3 1.1 0 positive
## 11 104 6.3 2 1.2 4 negative
## 12 112 5.9 1.7 2 1.3 negative
## 13 120 1.9 0.7 18.5 24 negative
## 14 118 3.6 1.5 11.6 48.8 negative
## 15 106 9.4 1.7 0.9 3.1 negative
## 16 99 13 3.6 0.7 -0.1 positive
## 17 107 13.8 1.5 1 1.9 negative
## 18 111 16 2.1 0.9 -0.1 positive
## 19 129 11.9 2.7 1.2 3.5 negative
## 20 115 6.3 1.2 4.7 14.4 negative
# Dimensions of filtered2: 336 rows by 6 columns in the recorded run.
dim(filtered2)
## [1] 336 6
# Class counts of filtered2: 180 negative + 156 positive = 336 rows.
# NOTE(review): 156 is consistent with 35 original positives plus 145 generated
# ones (enough to reach 180) minus the 24 reported as filtered by NEATER --
# presumably how ratio = 1 with filtering = TRUE works; confirm in the docs.
table(filtered2$Class)
##
## negative positive
## 180 156