library(mlbench)
library(corrplot)
library(ggplot2)
library(gridExtra)
library(car)
library(caret)
library(tidyverse)
library(DT)
The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories (type 4 does not occur in the sample, so the Type factor has only six levels). There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via:
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
grid.arrange(ggplot(Glass, aes(RI)) + geom_histogram(binwidth = .001), #very narrow, roughly normal distribution
             ggplot(Glass, aes(Na)) + geom_histogram(binwidth = .5),   #roughly normal distribution
             ggplot(Glass, aes(Mg)) + geom_histogram(binwidth = .25),  #left-skewed, with a large cluster at zero
             ggplot(Glass, aes(Al)) + geom_histogram(binwidth = .25),  #roughly normal distribution
             ggplot(Glass, aes(Si)) + geom_histogram(binwidth = .5),   #roughly normal distribution
             ggplot(Glass, aes(K)) + geom_histogram(binwidth = .25),   #right-skewed, seems to have an outlier
             ggplot(Glass, aes(Ca)) + geom_histogram(binwidth = 1),    #right-skewed distribution
             ggplot(Glass, aes(Ba)) + geom_histogram(binwidth = .25),  #right-skewed, mostly zero
             ggplot(Glass, aes(Fe)) + geom_histogram(binwidth = .05),  #right-skewed, mostly zero
             ncol=3)
A closer look at these distributions shows that Mg, Fe, Ba, and Ca are highly skewed.
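To quantify what the histograms suggest, we can compute the sample skewness of each predictor; a quick sketch using e1071::skewness:
library(e1071)
sapply(subset(Glass, select = -c(Type)), skewness) # values far from zero indicate strong skew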
colnames(subset(Glass, select = -c(Type)))
## [1] "RI" "Na" "Mg" "Al" "Si" "K" "Ca" "Ba" "Fe"
par(mfrow=c(3,3), cex=.8, mai=c(0,0,0.2,0))
invisible(qqPlot(~ RI, data = Glass, main = "RI"))
invisible(qqPlot(~ Na, data = Glass, main = "Na"))
invisible(qqPlot(~ Mg, data = Glass, main = "Mg"))
invisible(qqPlot(~ Al, data = Glass, main = "Al"))
invisible(qqPlot(~ Si, data = Glass, main = "Si"))
invisible(qqPlot(~ K, data = Glass, main = "K"))
invisible(qqPlot(~ Ca, data = Glass, main = "Ca"))
invisible(qqPlot(~ Ba, data = Glass, main = "Ba"))
invisible(qqPlot(~ Fe, data = Glass, main = "Fe"))
A few outliers were already visible in the histograms. Boxplots of the features make them easier to identify: it looks like every field except Mg has outliers.
grid.arrange(ggplot(Glass, aes(x="x", y = RI)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Na)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Mg)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Al)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Si)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = K)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Ca)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Ba)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Fe)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ncol=3)
The correlation plot below highlights relationships among the glass features: Ca is strongly positively correlated with RI, while Si is negatively correlated with RI. A VIF statistic can also indicate multicollinearity in the data.
corrplot(cor(subset(Glass, select = -c(Type)), use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)
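As a sketch of the VIF check, one predictor can be regressed on the others and car::vif applied to the fit; treating RI as the response here is purely an illustrative assumption:
vif(lm(RI ~ ., data = subset(Glass, select = -c(Type)))) # values well above 5-10 suggest multicollinearity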
Let’s take a look at scatter plots of some of the highly correlated features. From these scatter plots, it seems that the Ca/RI and Si/RI correlations may cause issues in the model.
grid.arrange(ggplot(Glass, aes(Ca, RI)) + geom_point(),
ggplot(Glass, aes(Si, RI)) + geom_point(),
ggplot(Glass, aes(Mg, Ba)) + geom_point(),
ggplot(Glass, aes(Al, Ba)) + geom_point(),
ncol=2)
Based on the boxplots, it looks like all fields other than Mg have outliers. The histograms and Q-Q plots indicate that the distributions of Mg, Fe, Ba, and Ca are highly skewed.
A common way to address skewed features is a Box-Cox, log, square-root, or inverse transformation. However, none of these transformations seemed to significantly improve this model. Scaling may also help center a distribution, but it did not help in this case. Since some features have many values near 0, it may help to bin those predictors, as sketched below.
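A minimal sketch of such binning, assuming a simple zero/positive split is reasonable for a mostly-zero predictor like Ba:
ba_bin <- cut(Glass$Ba, breaks = c(-Inf, 0, Inf), labels = c("zero", "positive")) # presence/absence indicator
table(ba_bin) # most samples fall in the "zero" bin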
BoxCoxTrans(Glass$Na)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.73 12.91 13.30 13.41 13.82 17.38
##
## Largest/Smallest: 1.62
## Sample Skewness: 0.448
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
BoxCoxTrans(Glass$Ca)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.430 8.240 8.600 8.957 9.172 16.190
##
## Largest/Smallest: 2.98
## Sample Skewness: 2.02
##
## Estimated Lambda: -1.1
BoxCoxTrans(Glass$Si)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.81 72.28 72.79 72.65 73.09 75.41
##
## Largest/Smallest: 1.08
## Sample Skewness: -0.72
##
## Estimated Lambda: 2
BoxCoxTrans(Glass$Al)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.290 1.190 1.360 1.445 1.630 3.500
##
## Largest/Smallest: 12.1
## Sample Skewness: 0.895
##
## Estimated Lambda: 0.5
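To actually apply one of these transformations, the object returned by BoxCoxTrans can be passed to predict(); for example, for Al (estimated lambda = 0.5, roughly a square root):
al_trans <- predict(BoxCoxTrans(Glass$Al), Glass$Al) # transformed copy of Al; Glass itself is unchanged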
RI exhibits multicollinearity with other features in the dataset. One approach to resolve this is to remove the predictor; another is to apply PCA to the dataset.
Glass = subset(Glass, select = -c(RI)) # drop RI to address the multicollinearity
pca <- prcomp(subset(Glass, select = -c(Type)), scale. = TRUE, center = TRUE)$x # principal component scores
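Before carrying all of the component scores forward, it is worth checking how much variance each component captures; summary() on the prcomp object (before extracting $x) reports the cumulative proportions:
summary(prcomp(subset(Glass, select = -c(Type)), scale. = TRUE, center = TRUE)) # proportion of variance per component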
We can also apply a spatial sign transformation, which projects each sample onto the unit sphere and dampens the influence of outliers. (The model below uses the PCA scores rather than the spatial-sign data.)
sst <- spatialSign(subset(Glass, select = -c(Type)))
Glass_sst <- data.frame(sst, Type = Glass$Type) # data.frame() keeps Type a factor; cbind() would coerce it to integer codes
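Each spatially signed sample lies on the unit sphere, which can be verified directly:
range(rowSums(sst^2)) # every row should have a squared norm of 1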
library(caTools)
set.seed(123)
Type <- Glass$Type
Glass <- data.frame(pca, Type) # again use data.frame() so Type keeps its factor levels (1, 2, 3, 5, 6, 7)
split = sample.split(Glass$Type, SplitRatio = 0.75)
training_set = subset(Glass, split == TRUE)
test_set = subset(Glass, split == FALSE)
library("e1071")
classifier = svm(formula = Type ~ .,
data = training_set,
type = 'C-classification',
kernel = 'linear')
y_pred = predict(classifier, newdata = test_set[-9]) # drop the Type column before predicting
cm = table(test_set[, 9], y_pred) # rows: actual class, columns: predicted class
prop.table(cm, margin = 2) # column-wise proportions
## y_pred
##              1          2          3          5          6
##   1 0.70000000 0.18181818 0.00000000 0.00000000 0.00000000
##   2 0.20000000 0.59090909 0.40000000 0.00000000 0.00000000
##   3 0.05000000 0.13636364 0.00000000 0.00000000 0.00000000
##   5 0.00000000 0.00000000 0.60000000 0.00000000 0.00000000
##   6 0.00000000 0.04545455 0.00000000 1.00000000 0.00000000
##   7 0.05000000 0.04545455 0.00000000 0.00000000 1.00000000
#check how many accurate predictions there were
summary(test_set[, 9] == y_pred)
## Mode FALSE TRUE
## logical 17 36
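That is 36 correct predictions out of 53 test samples, an accuracy of roughly 68%.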
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
data(Soybean)
## See ?Soybean for details
For the following predictors the fraction of unique values is low, i.e., they are degenerate: leaf.malf, leaf.mild, lodging, mycelium, sclerotia, and shriveling. These predictors should probably be removed.
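One way to flag such degenerate predictors programmatically is caret's near-zero-variance check, which applies the same idea; a sketch:
nzv <- nearZeroVar(Soybean, saveMetrics = TRUE) # freqRatio and percentUnique for every column
rownames(nzv)[nzv$nzv] # names of the near-zero-variance predictors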
summary(Soybean)
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
library(tidyr)
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
res <- Soybean %>%
  gather("predictor", "value", 2:36) %>% # long format: one row per (sample, predictor) pair
  group_by(Class, predictor) %>%
  summarise(na.count=sum(is.na(value))) %>% # count missing values per class/predictor
  filter(na.count > 1)
datatable(cast(res, predictor ~ Class))
## Using na.count as value column. Use the value argument to cast to override this choice
There are 19 classes, only the first 15 of which have been used in prior work. The folklore seems to be that the last four classes are unjustified by the data since they have so few examples. There are 35 categorical attributes, some nominal and some ordered. The value “dna” means does not apply. The values for attributes are encoded numerically, with the first value encoded as “0,” the second as “1,” and so forth.
The following table shows the missing rate of each predictor.
res <- Soybean %>%
  gather("predictor", "value", 2:36) %>%
  group_by(predictor) %>%
  summarise(na.percent=sum(is.na(value))/n()) %>%
  arrange(desc(na.percent))
## Warning: attributes are not identical across measure variables;
## they will be dropped
res
## # A tibble: 35 x 2
## predictor na.percent
## <chr> <dbl>
## 1 hail 0.177
## 2 lodging 0.177
## 3 seed.tmt 0.177
## 4 sever 0.177
## 5 germ 0.164
## 6 leaf.mild 0.158
## 7 fruit.spots 0.155
## 8 fruiting.bodies 0.155
## 9 seed.discolor 0.155
## 10 shriveling 0.155
## # ... with 25 more rows
There is considerable overlap among the rows that are missing data, so one option would be to drop incomplete rows. The concern with dropping rows with multiple missing values, however, is that certain classes would be left with too little data to classify. Instead, I will drop the columns that are degenerate and have many missing values, and impute the rest.
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Drop the degenerate predictors identified above, then impute the remaining NAs with k nearest neighbours
new_Soybean <- knnImputation(subset(Soybean, select = -c(leaf.malf, leaf.mild, lodging, mycelium, sclerotia, shriveling)))
summary(new_Soybean)
## Class date plant.stand precip temp hail
## brown-spot : 92 0: 26 0:383 0: 86 0: 96 0:542
## alternarialeaf-spot: 91 1: 76 1:300 1:136 1:382 1:141
## frog-eye-leaf-spot : 91 2: 93 2:461 2:205
## phytophthora-rot : 88 3:118
## anthracnose : 44 4:131
## brown-stem-rot : 44 5:149
## (Other) :233 6: 90
## crop.hist area.dam sever seed.tmt germ plant.growth leaves leaf.halo
## 0: 71 0:124 0:242 0:356 0:176 0:444 0: 77 0:284
## 1:167 1:227 1:390 1:292 1:263 1:239 1:606 1: 57
## 2:227 2:145 2: 51 2: 35 2:244 2:342
## 3:218 3:187
##
##
##
## leaf.marg leaf.size leaf.shread stem stem.cankers canker.lesion
## 0:431 0:107 0:587 0:300 0:397 0:327
## 1: 31 1:351 1: 96 1:383 1: 54 1:105
## 2:221 2:225 2: 38 2:185
## 3:194 3: 66
##
##
##
## fruiting.bodies ext.decay int.discolor fruit.pods fruit.spots seed
## 0:574 0:525 0:614 0:484 0:429 0:568
## 1:109 1:145 1: 44 1:137 1: 92 1:115
## 2: 13 2: 25 2: 14 2: 62
## 3: 48 4:100
##
##
##
## mold.growth seed.discolor seed.size roots
## 0:616 0:617 0:624 0:582
## 1: 67 1: 66 1: 59 1: 86
## 2: 15
##
##
##
##
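As a final sanity check, the imputed data should contain no remaining missing values:
sum(is.na(new_Soybean)) # expected to be 0 after imputation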