library(mlbench)
library(psych)
library(e1071)
library(ggplot2)
library(knitr)
library(mlbench)
library(reshape2)
library(corrplot)
library(caret)
library(DMwR)
library(GGally)
The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.
About the dataset
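This is the Glass Identification Data Set from UCI. It contains 10 attributes including the id (the copy loaded below has no id column: it holds the nine predictors plus the response `Type`). The response is the glass type, a discrete variable with 7 possible values, of which only 6 occur in the data.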
Attribute Information:
# Load the Glass data set from mlbench and preview its first six rows.
data("Glass", package = "mlbench")
kable(head(Glass, n = 6L))
RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | Type |
---|---|---|---|---|---|---|---|---|---|
1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0 | 0.00 | 1 |
1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0 | 0.00 | 1 |
1.51618 | 13.53 | 3.55 | 1.54 | 72.99 | 0.39 | 7.78 | 0 | 0.00 | 1 |
1.51766 | 13.21 | 3.69 | 1.29 | 72.61 | 0.57 | 8.22 | 0 | 0.00 | 1 |
1.51742 | 13.27 | 3.62 | 1.24 | 73.08 | 0.55 | 8.07 | 0 | 0.00 | 1 |
1.51596 | 12.79 | 3.61 | 1.62 | 72.97 | 0.64 | 8.07 | 0 | 0.26 | 1 |
# Compact overview of the column types and leading values of Glass.
utils::str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column descriptive statistics (mean, sd, skew, kurtosis, ...) via psych.
psych::describe(Glass)
## vars n mean sd median trimmed mad min max range skew kurtosis
## RI 1 214 1.52 0.00 1.52 1.52 0.00 1.51 1.53 0.02 1.60 4.72
## Na 2 214 13.41 0.82 13.30 13.38 0.64 10.73 17.38 6.65 0.45 2.90
## Mg 3 214 2.68 1.44 3.48 2.87 0.30 0.00 4.49 4.49 -1.14 -0.45
## Al 4 214 1.44 0.50 1.36 1.41 0.31 0.29 3.50 3.21 0.89 1.94
## Si 5 214 72.65 0.77 72.79 72.71 0.57 69.81 75.41 5.60 -0.72 2.82
## K 6 214 0.50 0.65 0.56 0.43 0.17 0.00 6.21 6.21 6.46 52.87
## Ca 7 214 8.96 1.42 8.60 8.74 0.66 5.43 16.19 10.76 2.02 6.41
## Ba 8 214 0.18 0.50 0.00 0.03 0.00 0.00 3.15 3.15 3.37 12.08
## Fe 9 214 0.06 0.10 0.00 0.04 0.00 0.00 0.51 0.51 1.73 2.52
## Type* 10 214 2.54 1.71 2.00 2.31 1.48 1.00 6.00 5.00 1.04 -0.29
## se
## RI 0.00
## Na 0.06
## Mg 0.10
## Al 0.03
## Si 0.05
## K 0.04
## Ca 0.10
## Ba 0.03
## Fe 0.01
## Type* 0.12
# Histogram of every predictor, one facet per variable.
# melt() reshapes Glass to long format (Type kept as the id column) so that
# facet_wrap() can split on the generated `variable` column. Free scales are
# used because the predictors cover very different ranges.
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_histogram(
    colour = "gray", bins = 50, alpha = 0.5, position = "identity"
  ) +
  ggtitle("Histogram") +
  # `scales` is spelled out in full instead of relying on partial matching.
  facet_wrap(~variable, scales = "free")
# Density curve of every predictor, one facet per variable, to compare the
# shapes (and skewness) of the distributions side by side.
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_density(colour = "gray", alpha = 0.5, position = "identity") +
  # Title typo fixed ("Companision" -> "Comparison").
  ggtitle("Density Plot Comparison") +
  # `scales` is spelled out in full instead of relying on partial matching.
  facet_wrap(~variable, scales = "free")
# Skewness of every predictor (all columns except the Type factor).
# The names are taken from the data frame itself, so the table cannot drift
# out of sync with the columns the way a hand-typed name vector can.
glass_predictors <- Glass[, names(Glass) != "Type"]
TypeValue <- names(glass_predictors)
# unname() keeps data.frame() from turning the vector names into row names.
Skewness <- unname(vapply(glass_predictors, skewness, numeric(1)))
ds1 <- data.frame(TypeValue, Skewness)
kable(ds1)
TypeValue | Skewness |
---|---|
RI | 1.6027151 |
Na | 0.4478343 |
Mg | -1.1364523 |
Al | 0.8946104 |
Si | -0.7202392 |
K | 6.4600889 |
Ca | 2.0184463 |
Ba | 3.3686800 |
Fe | 1.7298107 |
# Scatterplot matrix with correlations for the predictors; the Type factor
# (the last column) is left out.
pairs.panels(subset(Glass, select = -Type))
From the visualizations and data above we can see that the shape of the distribution is different for each predictor. The density plots show the skewness of the data. For example, Na and Al appear roughly normal, whereas Ba, Fe, Ca, etc. are right-skewed: most of the mass sits on the left with a long tail to the right. The skewness table shows the skew values.
# RI values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["RI"]])$out
## [1] 1.52667 1.52320 1.51215 1.52725 1.52410 1.52475 1.53125 1.53393 1.52664
## [10] 1.52739 1.52777 1.52614 1.52369 1.51115 1.51131 1.52315 1.52365
# Na values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Na"]])$out
## [1] 11.45 10.73 11.23 11.02 11.03 17.38 15.79
# Mg values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Mg"]])$out
## numeric(0)
# Al values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Al"]])$out
## [1] 0.29 0.47 0.47 0.51 3.50 3.04 3.02 0.34 2.38 2.79 2.68 2.54 2.34 2.66 2.51
## [16] 2.42 2.74 2.88
# Si values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Si"]])$out
## [1] 70.57 69.81 70.16 74.45 69.89 70.48 70.70 74.55 75.41 70.26 70.43 75.18
# K values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["K"]])$out
## [1] 1.68 6.21 6.21 1.76 1.46 2.70 1.41
# Ca values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Ca"]])$out
## [1] 11.64 10.79 13.24 13.30 16.19 11.52 10.99 14.68 14.96 14.40 11.14 13.44
## [13] 5.87 11.41 11.62 11.53 11.32 12.24 12.50 11.27 10.88 11.22 6.65 5.43
## [25] 5.79 6.47
# Ba values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Ba"]])$out
## [1] 0.09 0.11 0.69 0.14 0.11 3.15 0.27 0.09 0.06 0.15 2.20 0.24 1.19 1.63 1.68
## [16] 0.76 0.64 0.40 1.59 1.57 0.61 0.81 0.66 0.64 0.53 0.63 0.56 1.71 0.67 1.55
## [31] 1.38 2.88 0.54 1.06 1.59 1.64 1.57 1.67
# Fe values that fall beyond the boxplot whiskers.
boxplot.stats(Glass[["Fe"]])$out
## [1] 0.26 0.30 0.31 0.32 0.34 0.28 0.29 0.28 0.35 0.37 0.51 0.28
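To improve the model we can apply a transformation such as Box-Cox to the highly skewed variables and normalize the variables by centering and scaling. Note that Box-Cox requires strictly positive values, so no lambda can be estimated for predictors that contain zeros (Mg, K, Ba and Fe below). Along with these techniques we should take a look at the outliers and try to remove them.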
# Estimate a Box-Cox lambda for every predictor (column 10, Type, excluded).
# lapply() walks the data-frame columns directly; apply() would first coerce
# the data frame to a matrix, which is unnecessary here.
lapply(Glass[, -10], BoxCoxTrans)
## $RI
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.511 1.517 1.518 1.518 1.519 1.534
##
## Largest/Smallest: 1.02
## Sample Skewness: 1.6
##
## Estimated Lambda: -2
##
##
## $Na
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.73 12.91 13.30 13.41 13.82 17.38
##
## Largest/Smallest: 1.62
## Sample Skewness: 0.448
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
##
##
## $Mg
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.115 3.480 2.685 3.600 4.490
##
## Lambda could not be estimated; no transformation is applied
##
##
## $Al
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.290 1.190 1.360 1.445 1.630 3.500
##
## Largest/Smallest: 12.1
## Sample Skewness: 0.895
##
## Estimated Lambda: 0.5
##
##
## $Si
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.81 72.28 72.79 72.65 73.09 75.41
##
## Largest/Smallest: 1.08
## Sample Skewness: -0.72
##
## Estimated Lambda: 2
##
##
## $K
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1225 0.5550 0.4971 0.6100 6.2100
##
## Lambda could not be estimated; no transformation is applied
##
##
## $Ca
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.430 8.240 8.600 8.957 9.172 16.190
##
## Largest/Smallest: 2.98
## Sample Skewness: 2.02
##
## Estimated Lambda: -1.1
##
##
## $Ba
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.175 0.000 3.150
##
## Lambda could not be estimated; no transformation is applied
##
##
## $Fe
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05701 0.10000 0.51000
##
## Lambda could not be estimated; no transformation is applied
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
# Load the Soybean data set from mlbench and preview its first ten rows.
data("Soybean", package = "mlbench")
kable(head(Soybean, n = 10L))
Class | date | plant.stand | precip | temp | hail | crop.hist | area.dam | sever | seed.tmt | germ | plant.growth | leaves | leaf.halo | leaf.marg | leaf.size | leaf.shread | leaf.malf | leaf.mild | stem | lodging | stem.cankers | canker.lesion | fruiting.bodies | ext.decay | mycelium | int.discolor | sclerotia | fruit.pods | fruit.spots | seed | mold.growth | seed.discolor | seed.size | shriveling | roots |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 1 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 4 | 0 | 2 | 1 | 0 | 2 | 0 | 2 | 1 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 3 | 0 | 2 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 3 | 0 | 2 | 1 | 0 | 1 | 0 | 2 | 0 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 2 | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 5 | 0 | 2 | 1 | 0 | 3 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 5 | 0 | 2 | 1 | 0 | 2 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 1 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 4 | 0 | 2 | 1 | 1 | 1 | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 3 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 4 | 0 | 2 | 1 | 0 | 2 | 0 | 2 | 0 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
# Compact overview of the column types and factor levels of Soybean.
utils::str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
# Bar chart of level counts for every categorical predictor, one facet each.
# geom_bar() counts rows per x value, which is what
# geom_histogram(stat = "count") did, without the "ignoring unknown
# parameters" warning that the histogram wrapper triggers.
ggplot(melt(Soybean, id.vars = "Class"), aes(x = value)) +
  geom_bar() +
  # `scales` is spelled out in full instead of relying on partial matching.
  facet_wrap(~variable, scales = "free")
We can use the nearZeroVar function in the caret package to identify the predictors with degenerate distributions.
# Column indices of the predictors flagged as near-zero variance.
nzv.cols <- nearZeroVar(Soybean)
# Full diagnostics (frequency ratio, percent unique, ...) for those columns.
nzv.out <- nearZeroVar(Soybean, saveMetrics = TRUE)[nzv.cols, ]
# Share of rows taken by the most frequent level of each flagged column.
# vapply() works on the factor columns directly; apply() would first coerce
# the whole data frame to a character matrix.
nzv.prop.high <- vapply(
  Soybean[nzv.cols],
  function(col) max(table(col)) / length(col),
  numeric(1)
)
summary(Soybean[, nzv.cols])
## leaf.mild mycelium sclerotia
## 0 :535 0 :639 0 :625
## 1 : 20 1 : 6 1 : 20
## 2 : 20 NA's: 38 NA's: 38
## NA's:108
Frequency Distribution
# Frequency table of every predictor (Class, the first column, excluded).
# lapply() always returns a list; apply() would collapse the result into a
# matrix if every table happened to have the same length, which would break
# the kable() step. table() is mapped directly (not wrapped in a function) so
# the rendered tables keep the default `Var1` header.
lapply(lapply(Soybean[-1], table), kable)
## $date
##
##
## Var1 Freq
## ----- -----
## 0 26
## 1 75
## 2 93
## 3 118
## 4 131
## 5 149
## 6 90
##
## $plant.stand
##
##
## Var1 Freq
## ----- -----
## 0 354
## 1 293
##
## $precip
##
##
## Var1 Freq
## ----- -----
## 0 74
## 1 112
## 2 459
##
## $temp
##
##
## Var1 Freq
## ----- -----
## 0 80
## 1 374
## 2 199
##
## $hail
##
##
## Var1 Freq
## ----- -----
## 0 435
## 1 127
##
## $crop.hist
##
##
## Var1 Freq
## ----- -----
## 0 65
## 1 165
## 2 219
## 3 218
##
## $area.dam
##
##
## Var1 Freq
## ----- -----
## 0 123
## 1 227
## 2 145
## 3 187
##
## $sever
##
##
## Var1 Freq
## ----- -----
## 0 195
## 1 322
## 2 45
##
## $seed.tmt
##
##
## Var1 Freq
## ----- -----
## 0 305
## 1 222
## 2 35
##
## $germ
##
##
## Var1 Freq
## ----- -----
## 0 165
## 1 213
## 2 193
##
## $plant.growth
##
##
## Var1 Freq
## ----- -----
## 0 441
## 1 226
##
## $leaves
##
##
## Var1 Freq
## ----- -----
## 0 77
## 1 606
##
## $leaf.halo
##
##
## Var1 Freq
## ----- -----
## 0 221
## 1 36
## 2 342
##
## $leaf.marg
##
##
## Var1 Freq
## ----- -----
## 0 357
## 1 21
## 2 221
##
## $leaf.size
##
##
## Var1 Freq
## ----- -----
## 0 51
## 1 327
## 2 221
##
## $leaf.shread
##
##
## Var1 Freq
## ----- -----
## 0 487
## 1 96
##
## $leaf.malf
##
##
## Var1 Freq
## ----- -----
## 0 554
## 1 45
##
## $leaf.mild
##
##
## Var1 Freq
## ----- -----
## 0 535
## 1 20
## 2 20
##
## $stem
##
##
## Var1 Freq
## ----- -----
## 0 296
## 1 371
##
## $lodging
##
##
## Var1 Freq
## ----- -----
## 0 520
## 1 42
##
## $stem.cankers
##
##
## Var1 Freq
## ----- -----
## 0 379
## 1 39
## 2 36
## 3 191
##
## $canker.lesion
##
##
## Var1 Freq
## ----- -----
## 0 320
## 1 83
## 2 177
## 3 65
##
## $fruiting.bodies
##
##
## Var1 Freq
## ----- -----
## 0 473
## 1 104
##
## $ext.decay
##
##
## Var1 Freq
## ----- -----
## 0 497
## 1 135
## 2 13
##
## $mycelium
##
##
## Var1 Freq
## ----- -----
## 0 639
## 1 6
##
## $int.discolor
##
##
## Var1 Freq
## ----- -----
## 0 581
## 1 44
## 2 20
##
## $sclerotia
##
##
## Var1 Freq
## ----- -----
## 0 625
## 1 20
##
## $fruit.pods
##
##
## Var1 Freq
## ----- -----
## 0 407
## 1 130
## 2 14
## 3 48
##
## $fruit.spots
##
##
## Var1 Freq
## ----- -----
## 0 345
## 1 75
## 2 57
## 4 100
##
## $seed
##
##
## Var1 Freq
## ----- -----
## 0 476
## 1 115
##
## $mold.growth
##
##
## Var1 Freq
## ----- -----
## 0 524
## 1 67
##
## $seed.discolor
##
##
## Var1 Freq
## ----- -----
## 0 513
## 1 64
##
## $seed.size
##
##
## Var1 Freq
## ----- -----
## 0 532
## 1 59
##
## $shriveling
##
##
## Var1 Freq
## ----- -----
## 0 539
## 1 38
##
## $roots
##
##
## Var1 Freq
## ----- -----
## 0 551
## 1 86
## 2 15
Proportion of rows with at least one missing value
# Proportion of rows that contain at least one NA: the mean of a logical
# vector is the share of TRUE values.
mean(!complete.cases(Soybean))
## [1] 0.1771596
Proportion of incomplete cases (rows with at least one NA), followed by the overall proportion of missing cells
# Share of incomplete cases, i.e. rows where complete.cases() is FALSE.
mean(!complete.cases(Soybean))
## [1] 0.1771596
# Overall share of missing cells: is.na() yields a logical matrix with one
# entry per cell, and its mean is the fraction of TRUE (missing) entries.
mean(is.na(Soybean))
## [1] 0.09504636
Missing values per column
# Number of missing values in each column.
# vapply() iterates over the data-frame columns as they are and guarantees an
# integer result; apply() would first coerce the data frame to a matrix.
the.na.Soybean <- vapply(
  Soybean,
  function(col) sum(is.na(col)),
  integer(1)
)
the.na.Soybean
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 84 100 84 108 16
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 121 38 38 106 38
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 38 38 38 84 106
## seed mold.growth seed.discolor seed.size shriveling
## 92 92 106 92 106
## roots
## 31
From the output above we see that nearly every predictor has missing values. About 18% of the rows contain at least one missing value.
There are many strategies that can be used to deal with missing data, and many of the predictors here contain NA values. For example, the missing values in the phytophthora-rot rows can be filled in with the `knnImputation` function from the DMwR package. After that we look at how many NAs are still left and whether it makes sense to drop the remaining incomplete rows. We have to be careful here, because the treatment should not change the meaning of what we are trying to achieve.
# First five rows of the phytophthora-rot class, to inspect its NA pattern.
kable(head(subset(Soybean, Class == "phytophthora-rot"), n = 5L))
Class | date | plant.stand | precip | temp | hail | crop.hist | area.dam | sever | seed.tmt | germ | plant.growth | leaves | leaf.halo | leaf.marg | leaf.size | leaf.shread | leaf.malf | leaf.mild | stem | lodging | stem.cankers | canker.lesion | fruiting.bodies | ext.decay | mycelium | int.discolor | sclerotia | fruit.pods | fruit.spots | seed | mold.growth | seed.discolor | seed.size | shriveling | roots | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
31 | phytophthora-rot | 0 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 0 | 1 | 0 | 0 | 0 | 3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
32 | phytophthora-rot | 1 | 1 | 2 | 1 | NA | 3 | 1 | NA | NA | NA | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | NA | 2 | 2 | NA | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA | 1 |
33 | phytophthora-rot | 2 | 1 | 2 | 2 | NA | 2 | 1 | NA | NA | NA | 1 | 1 | NA | NA | NA | NA | NA | NA | 1 | NA | 3 | 2 | NA | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA | 1 |
34 | phytophthora-rot | 1 | 1 | 2 | 0 | 0 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
35 | phytophthora-rot | 2 | 1 | 2 | 2 | NA | 2 | 1 | NA | NA | NA | 1 | 1 | NA | NA | NA | NA | NA | NA | 1 | NA | 2 | 2 | NA | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA | 1 |
# Fill in the missing values from the 10 nearest neighbours, then preview the
# first five rows of the imputed data.
imputed_data <- knnImputation(data = Soybean, k = 10)
kable(head(imputed_data, n = 5L))
Class | date | plant.stand | precip | temp | hail | crop.hist | area.dam | sever | seed.tmt | germ | plant.growth | leaves | leaf.halo | leaf.marg | leaf.size | leaf.shread | leaf.malf | leaf.mild | stem | lodging | stem.cankers | canker.lesion | fruiting.bodies | ext.decay | mycelium | int.discolor | sclerotia | fruit.pods | fruit.spots | seed | mold.growth | seed.discolor | seed.size | shriveling | roots |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 1 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 4 | 0 | 2 | 1 | 0 | 2 | 0 | 2 | 1 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 3 | 0 | 2 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 3 | 0 | 2 | 1 | 0 | 1 | 0 | 2 | 0 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 2 | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |