# Load the Glass data set and inspect its structure (column names, types and
# the first few values of each column).
library(mlbench)
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
library(knitr)
# Render the first 10 rows of Glass as a formatted table.
kable(head(Glass, 10))
RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | Type |
---|---|---|---|---|---|---|---|---|---|
1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0 | 0.00 | 1 |
1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0 | 0.00 | 1 |
1.51618 | 13.53 | 3.55 | 1.54 | 72.99 | 0.39 | 7.78 | 0 | 0.00 | 1 |
1.51766 | 13.21 | 3.69 | 1.29 | 72.61 | 0.57 | 8.22 | 0 | 0.00 | 1 |
1.51742 | 13.27 | 3.62 | 1.24 | 73.08 | 0.55 | 8.07 | 0 | 0.00 | 1 |
1.51596 | 12.79 | 3.61 | 1.62 | 72.97 | 0.64 | 8.07 | 0 | 0.26 | 1 |
1.51743 | 13.30 | 3.60 | 1.14 | 73.09 | 0.58 | 8.17 | 0 | 0.00 | 1 |
1.51756 | 13.15 | 3.61 | 1.05 | 73.24 | 0.57 | 8.24 | 0 | 0.00 | 1 |
1.51918 | 14.04 | 3.58 | 1.37 | 72.08 | 0.56 | 8.30 | 0 | 0.00 | 1 |
1.51755 | 13.00 | 3.60 | 1.36 | 72.99 | 0.57 | 8.40 | 0 | 0.11 | 1 |
# Per-column summary: quartiles, mean and range for the numeric predictors,
# and the number of observations per level for Type.
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
library(psych)
# Extended descriptive statistics per column, including skew and kurtosis.
describe(Glass)
## vars n mean sd median trimmed mad min max range skew
## RI 1 214 1.52 0.00 1.52 1.52 0.00 1.51 1.53 0.02 1.60
## Na 2 214 13.41 0.82 13.30 13.38 0.64 10.73 17.38 6.65 0.45
## Mg 3 214 2.68 1.44 3.48 2.87 0.30 0.00 4.49 4.49 -1.14
## Al 4 214 1.44 0.50 1.36 1.41 0.31 0.29 3.50 3.21 0.89
## Si 5 214 72.65 0.77 72.79 72.71 0.57 69.81 75.41 5.60 -0.72
## K 6 214 0.50 0.65 0.56 0.43 0.17 0.00 6.21 6.21 6.46
## Ca 7 214 8.96 1.42 8.60 8.74 0.66 5.43 16.19 10.76 2.02
## Ba 8 214 0.18 0.50 0.00 0.03 0.00 0.00 3.15 3.15 3.37
## Fe 9 214 0.06 0.10 0.00 0.04 0.00 0.00 0.51 0.51 1.73
## Type* 10 214 2.54 1.71 2.00 2.31 1.48 1.00 6.00 5.00 1.04
## kurtosis se
## RI 4.72 0.00
## Na 2.90 0.06
## Mg -0.45 0.10
## Al 1.94 0.03
## Si 2.82 0.05
## K 52.87 0.04
## Ca 6.41 0.10
## Ba 12.08 0.03
## Fe 2.52 0.01
## Type* -0.29 0.12
library(ggplot2)
library(reshape2)
# Histogram of every predictor, one facet per variable. `Type` is kept as the
# id column so that only the predictors are melted into long format.
# `scales` is spelled out in full: the original `scale=` only worked through
# partial argument matching. Free scales are used so each facet gets its own
# axis ranges.
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_histogram(bins = 50) +
  facet_wrap(~variable, scales = "free")
library(corrplot)
library(caret)
# Pairwise correlations between the predictors; column 10 (Type) is excluded.
corelation <- cor(Glass[, -10])
# Correlation plot with the variables ordered by hierarchical clustering.
corrplot(corelation, order = "hclust")
# findCorrelation() returns the column indices it recommends removing at the
# given cutoff.
highCorr <- findCorrelation(corelation, cutoff = 0.75)
print(paste0(
  "Total number of Predictor Variables with Pearson Correlation > 0.75: ",
  length(highCorr)
))
## [1] "Total number of Predictor Variables with Pearson Correlation > 0.75: 1"
# Correlation matrix restricted to the Ca and RI columns.
cor(Glass[,c('Ca','RI')])
## Ca RI
## Ca 1.0000000 0.8104027
## RI 0.8104027 1.0000000
Also, there is a strong correlation (about 0.81) between Ca and RI.
Do there appear to be any outliers in the data? Are any predictors skewed?
# Report the boxplot-rule outliers of every numeric predictor.
# Columns are selected by type instead of the hard-coded range 1:9, so the
# factor column `Type` is skipped without relying on its position.
predictor_names <- names(Glass)[vapply(Glass, is.numeric, logical(1))]
for (predictor in predictor_names) {
  print(paste0("Predictor Variable (outlier values): ", predictor))
  # boxplot.stats() gives the same `out` values as boxplot(plot = FALSE)
  # without building the full boxplot object. as.character() keeps the
  # printed format that the original paste0() call produced.
  print(as.character(boxplot.stats(Glass[[predictor]])$out))
}
## [1] "Predictor Variable (outlier values): RI"
## [1] "1.52667" "1.5232" "1.51215" "1.52725" "1.5241" "1.52475" "1.53125"
## [8] "1.53393" "1.52664" "1.52739" "1.52777" "1.52614" "1.52369" "1.51115"
## [15] "1.51131" "1.52315" "1.52365"
## [1] "Predictor Variable (outlier values): Na"
## [1] "11.45" "10.73" "11.23" "11.02" "11.03" "17.38" "15.79"
## [1] "Predictor Variable (outlier values): Mg"
## character(0)
## [1] "Predictor Variable (outlier values): Al"
## [1] "0.29" "0.47" "0.47" "0.51" "3.5" "3.04" "3.02" "0.34" "2.38" "2.79"
## [11] "2.68" "2.54" "2.34" "2.66" "2.51" "2.42" "2.74" "2.88"
## [1] "Predictor Variable (outlier values): Si"
## [1] "70.57" "69.81" "70.16" "74.45" "69.89" "70.48" "70.7" "74.55"
## [9] "75.41" "70.26" "70.43" "75.18"
## [1] "Predictor Variable (outlier values): K"
## [1] "1.68" "6.21" "6.21" "1.76" "1.46" "2.7" "1.41"
## [1] "Predictor Variable (outlier values): Ca"
## [1] "11.64" "10.79" "13.24" "13.3" "16.19" "11.52" "10.99" "14.68"
## [9] "14.96" "14.4" "11.14" "13.44" "5.87" "11.41" "11.62" "11.53"
## [17] "11.32" "12.24" "12.5" "11.27" "10.88" "11.22" "6.65" "5.43"
## [25] "5.79" "6.47"
## [1] "Predictor Variable (outlier values): Ba"
## [1] "0.09" "0.11" "0.69" "0.14" "0.11" "3.15" "0.27" "0.09" "0.06" "0.15"
## [11] "2.2" "0.24" "1.19" "1.63" "1.68" "0.76" "0.64" "0.4" "1.59" "1.57"
## [21] "0.61" "0.81" "0.66" "0.64" "0.53" "0.63" "0.56" "1.71" "0.67" "1.55"
## [31] "1.38" "2.88" "0.54" "1.06" "1.59" "1.64" "1.57" "1.67"
## [1] "Predictor Variable (outlier values): Fe"
## [1] "0.26" "0.3" "0.31" "0.32" "0.34" "0.28" "0.29" "0.28" "0.35" "0.37"
## [11] "0.51" "0.28"
Yes, by the boxplot rule every predictor except Mg has outliers.
library(e1071)
# Sample skewness of each predictor (column 10, Type, is excluded).
# vapply() iterates over the data frame's columns directly and guarantees one
# numeric value per column, instead of coercing the data frame to a matrix as
# apply() does.
vapply(Glass[-10], skewness, numeric(1))
## RI Na Mg Al Si K
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889
## Ca Ba Fe
## 2.0184463 3.3686800 1.7298107
Several predictors are also strongly skewed (for example K, Ba and Ca, each with skewness above 2), so transformations that reduce skewness should be considered.
Are there any relevant transformations of one or more predictors that might improve the classification model?
# Estimate a Box-Cox transformation for each predictor (Type excluded).
# lapply() over the columns returns the same named list of BoxCoxTrans
# objects without first coercing the data frame to a matrix.
transform_glass <- lapply(Glass[-10], BoxCoxTrans)
transform_glass
## $RI
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.511 1.517 1.518 1.518 1.519 1.534
##
## Largest/Smallest: 1.02
## Sample Skewness: 1.6
##
## Estimated Lambda: -2
##
##
## $Na
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.73 12.91 13.30 13.41 13.82 17.38
##
## Largest/Smallest: 1.62
## Sample Skewness: 0.448
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
##
##
## $Mg
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.115 3.480 2.685 3.600 4.490
##
## Lambda could not be estimated; no transformation is applied
##
##
## $Al
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.290 1.190 1.360 1.445 1.630 3.500
##
## Largest/Smallest: 12.1
## Sample Skewness: 0.895
##
## Estimated Lambda: 0.5
##
##
## $Si
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.81 72.28 72.79 72.65 73.09 75.41
##
## Largest/Smallest: 1.08
## Sample Skewness: -0.72
##
## Estimated Lambda: 2
##
##
## $K
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1225 0.5550 0.4971 0.6100 6.2100
##
## Lambda could not be estimated; no transformation is applied
##
##
## $Ca
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.430 8.240 8.600 8.957 9.172 16.190
##
## Largest/Smallest: 2.98
## Sample Skewness: 2.02
##
## Estimated Lambda: -1.1
##
##
## $Ba
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.175 0.000 3.150
##
## Lambda could not be estimated; no transformation is applied
##
##
## $Fe
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05701 0.10000 0.51000
##
## Lambda could not be estimated; no transformation is applied
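Yes, transformations such as the log or Box-Cox transformation could help improve the classification model. Note that Box-Cox requires strictly positive values, so no lambda could be estimated for Mg, K, Ba and Fe (each has a minimum of zero); a transformation that accepts zeros, such as Yeo-Johnson, would be needed for those predictors. Reducing skewness also pulls in the extreme values in the long tails, which lessens the influence of outliers and can improve the model’s performance as well.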
# Load the Soybean data set into the workspace.
data(Soybean)
## See ?Soybean for details
Description: There are 19 classes, only the first 15 of which have been used in prior work. The folklore seems to be that the last four classes are unjustified by the data since they have so few examples. There are 35 categorical attributes, some nominal and some ordered. The value “dna” means does not apply. The values for attributes are encoded numerically, with the first value encoded as “0,” the second as “1,” and so forth.
The data frame has 683 observations on 36 variables: 35 categorical attributes, all numerically encoded, and one nominal variable denoting the class.
Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
# Render the first rows of Soybean (head()'s default row count) as a table.
kable(head(Soybean))
Class | date | plant.stand | precip | temp | hail | crop.hist | area.dam | sever | seed.tmt | germ | plant.growth | leaves | leaf.halo | leaf.marg | leaf.size | leaf.shread | leaf.malf | leaf.mild | stem | lodging | stem.cankers | canker.lesion | fruiting.bodies | ext.decay | mycelium | int.discolor | sclerotia | fruit.pods | fruit.spots | seed | mold.growth | seed.discolor | seed.size | shriveling | roots |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 1 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 4 | 0 | 2 | 1 | 0 | 2 | 0 | 2 | 1 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 3 | 0 | 2 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 3 | 0 | 2 | 1 | 0 | 1 | 0 | 2 | 0 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 6 | 0 | 2 | 1 | 0 | 2 | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
diaporthe-stem-canker | 5 | 0 | 2 | 1 | 0 | 3 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
# Bar chart of level counts for every categorical predictor, one facet each.
# geom_bar() counts observations per level by default, which is what
# geom_histogram(stat = "count") was being used for. `scales` is spelled out
# in full instead of relying on partial matching of `scale`.
ggplot(melt(Soybean, id.vars = "Class"), aes(x = value)) +
  geom_bar() +
  facet_wrap(~variable, scales = "free")
Let’s remove near-zero variance predictors using the caret package and the nearZeroVar() function.
# Column indices of the variables that nearZeroVar() flags as near-zero
# variance.
nearZeroVar(Soybean)
## [1] 19 26 28
So, the degenerate distributions are columns 19, 26, and 28 (leaf.mild, mycelium, and sclerotia).
Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?
# Number of missing values in each column. colSums() on the logical NA mask
# replaces apply(), which coerces the whole data frame to a matrix before
# calling the counting function once per column.
Soybean.na <- colSums(is.na(Soybean))
Soybean.na
## Class date plant.stand precip
## 0 1 36 38
## temp hail crop.hist area.dam
## 30 121 16 1
## sever seed.tmt germ plant.growth
## 121 121 112 16
## leaves leaf.halo leaf.marg leaf.size
## 0 84 84 84
## leaf.shread leaf.malf leaf.mild stem
## 100 84 108 16
## lodging stem.cankers canker.lesion fruiting.bodies
## 121 38 38 106
## ext.decay mycelium int.discolor sclerotia
## 38 38 38 38
## fruit.pods fruit.spots seed mold.growth
## 84 106 92 92
## seed.discolor seed.size shriveling roots
## 106 92 106 31
# Count the missing values in each observation and pair each count with the
# observation's class. rowSums() on the logical NA mask replaces the row-wise
# apply(), which coerces the data frame to a matrix and loops over every row.
number.of.na <- rowSums(is.na(Soybean))
class.soybean <- Soybean$Class
soybean.na.df <- data.frame(class.soybean, number.of.na)
kable(head(soybean.na.df, 10))
class.soybean | number.of.na |
---|---|
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
diaporthe-stem-canker | 0 |
# Total number of missing values per class, listed from fewest to most.
results <- aggregate(
  soybean.na.df$number.of.na,
  by = list(class.soybean = soybean.na.df$class.soybean),
  FUN = sum
)
kable(results[order(results$x), ])
class.soybean | x | |
---|---|---|
2 | alternarialeaf-spot | 0 |
3 | anthracnose | 0 |
4 | bacterial-blight | 0 |
5 | bacterial-pustule | 0 |
6 | brown-spot | 0 |
7 | brown-stem-rot | 0 |
8 | charcoal-rot | 0 |
11 | diaporthe-stem-canker | 0 |
12 | downy-mildew | 0 |
13 | frog-eye-leaf-spot | 0 |
15 | phyllosticta-leaf-spot | 0 |
17 | powdery-mildew | 0 |
18 | purple-seed-stain | 0 |
19 | rhizoctonia-root-rot | 0 |
14 | herbicide-injury | 160 |
10 | diaporthe-pod-&-stem-blight | 177 |
9 | cyst-nematode | 336 |
1 | 2-4-d-injury | 450 |
16 | phytophthora-rot | 1214 |
Develop a strategy for handling missing data, either by eliminating predictors or imputation.
# First 10 observations of the class with the most missing values.
is_phytophthora <- Soybean$Class == "phytophthora-rot"
head(Soybean[is_phytophthora, ], 10)
## Class date plant.stand precip temp hail crop.hist area.dam
## 31 phytophthora-rot 0 1 2 1 1 1 1
## 32 phytophthora-rot 1 1 2 1 <NA> 3 1
## 33 phytophthora-rot 2 1 2 2 <NA> 2 1
## 34 phytophthora-rot 1 1 2 0 0 2 1
## 35 phytophthora-rot 2 1 2 2 <NA> 2 1
## 36 phytophthora-rot 3 1 2 1 <NA> 2 1
## 37 phytophthora-rot 0 1 1 1 0 1 1
## 38 phytophthora-rot 3 1 2 0 0 2 1
## 39 phytophthora-rot 2 1 1 1 <NA> 0 1
## 40 phytophthora-rot 2 1 2 0 0 1 1
## sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 31 1 0 0 1 1 0 2 2
## 32 <NA> <NA> <NA> 1 1 0 2 2
## 33 <NA> <NA> <NA> 1 1 <NA> <NA> <NA>
## 34 2 1 1 1 1 0 2 2
## 35 <NA> <NA> <NA> 1 1 <NA> <NA> <NA>
## 36 <NA> <NA> <NA> 1 1 <NA> <NA> <NA>
## 37 1 0 0 1 1 0 2 2
## 38 2 1 1 1 1 0 2 2
## 39 <NA> <NA> <NA> 1 1 0 2 2
## 40 2 0 1 1 1 0 2 2
## leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 31 0 0 0 1 0 1 2
## 32 0 0 0 1 <NA> 2 2
## 33 <NA> <NA> <NA> 1 <NA> 3 2
## 34 0 0 0 1 0 2 2
## 35 <NA> <NA> <NA> 1 <NA> 2 2
## 36 <NA> <NA> <NA> 1 <NA> 3 2
## 37 0 0 0 1 0 1 2
## 38 0 0 0 1 0 2 2
## 39 0 0 0 1 <NA> 2 2
## 40 0 0 0 1 0 1 2
## fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 31 0 1 0 0 0 3
## 32 <NA> 0 0 0 0 <NA>
## 33 <NA> 0 0 0 0 <NA>
## 34 0 0 0 0 0 3
## 35 <NA> 0 0 0 0 <NA>
## 36 <NA> 0 0 0 0 <NA>
## 37 0 0 0 0 0 3
## 38 0 0 0 0 0 3
## 39 <NA> 0 0 0 0 <NA>
## 40 0 0 0 0 0 3
## fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 31 4 0 0 0 0 0 0
## 32 <NA> <NA> <NA> <NA> <NA> <NA> 1
## 33 <NA> <NA> <NA> <NA> <NA> <NA> 1
## 34 4 0 0 0 0 0 0
## 35 <NA> <NA> <NA> <NA> <NA> <NA> 1
## 36 <NA> <NA> <NA> <NA> <NA> <NA> 1
## 37 4 0 0 0 0 0 0
## 38 4 0 0 0 0 0 0
## 39 <NA> <NA> <NA> <NA> <NA> <NA> 1
## 40 4 0 0 0 0 0 0
(reference: page 54) To impute missing values, the impute package has a function, impute.knn, that uses K-nearest neighbors to estimate the missing data. The previously mentioned preProcess function applies imputation methods based on K-nearest neighbors or bagged trees.
# reference : https://www.rdocumentation.org/packages/DMwR/versions/0.4.1/topics/knnImputation
# NOTE(review): DMwR may no longer be installable from CRAN; confirm the
# installation source before re-running.
library(DMwR)
# Fill in the missing values of Soybean using the 10 nearest neighbours.
imputed.data <- knnImputation(Soybean,k=10)
head(imputed.data,10)
## Class date plant.stand precip temp hail crop.hist
## 1 diaporthe-stem-canker 6 0 2 1 0 1
## 2 diaporthe-stem-canker 4 0 2 1 0 2
## 3 diaporthe-stem-canker 3 0 2 1 0 1
## 4 diaporthe-stem-canker 3 0 2 1 0 1
## 5 diaporthe-stem-canker 6 0 2 1 0 2
## 6 diaporthe-stem-canker 5 0 2 1 0 3
## 7 diaporthe-stem-canker 5 0 2 1 0 2
## 8 diaporthe-stem-canker 4 0 2 1 1 1
## 9 diaporthe-stem-canker 6 0 2 1 0 3
## 10 diaporthe-stem-canker 4 0 2 1 0 2
## area.dam sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg
## 1 1 1 0 0 1 1 0 2
## 2 0 2 1 1 1 1 0 2
## 3 0 2 1 2 1 1 0 2
## 4 0 2 0 1 1 1 0 2
## 5 0 1 0 2 1 1 0 2
## 6 0 1 0 1 1 1 0 2
## 7 0 1 1 0 1 1 0 2
## 8 0 1 0 2 1 1 0 2
## 9 0 1 1 1 1 1 0 2
## 10 0 2 0 2 1 1 0 2
## leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers
## 1 2 0 0 0 1 1 3
## 2 2 0 0 0 1 0 3
## 3 2 0 0 0 1 0 3
## 4 2 0 0 0 1 0 3
## 5 2 0 0 0 1 0 3
## 6 2 0 0 0 1 0 3
## 7 2 0 0 0 1 1 3
## 8 2 0 0 0 1 0 3
## 9 2 0 0 0 1 0 3
## 10 2 0 0 0 1 0 3
## canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia
## 1 1 1 1 0 0 0
## 2 1 1 1 0 0 0
## 3 0 1 1 0 0 0
## 4 0 1 1 0 0 0
## 5 1 1 1 0 0 0
## 6 0 1 1 0 0 0
## 7 1 1 1 0 0 0
## 8 1 1 1 0 0 0
## 9 1 1 1 0 0 0
## 10 1 1 1 0 0 0
## fruit.pods fruit.spots seed mold.growth seed.discolor seed.size
## 1 0 4 0 0 0 0
## 2 0 4 0 0 0 0
## 3 0 4 0 0 0 0
## 4 0 4 0 0 0 0
## 5 0 4 0 0 0 0
## 6 0 4 0 0 0 0
## 7 0 4 0 0 0 0
## 8 0 4 0 0 0 0
## 9 0 4 0 0 0 0
## 10 0 4 0 0 0 0
## shriveling roots
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
# Check whether any missing values remain after imputation.
anyNA(imputed.data)
## [1] FALSE
Reference: