library(mlbench) 
library(psych)
library(e1071)
library(ggplot2)
library(knitr)
library(mlbench)
library(reshape2)
library(corrplot)
library(caret)
library(DMwR)
library(GGally)

Problem 3.1

The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

About the Dataset

About the dataset

This is the Glass Identification Data Set from UCI. It contains 10 attributes, including an id. The response is the glass type (a discrete variable with 7 possible values).

Attribute Information:

  1. Id number: 1 to 214
  2. RI: refractive index
  3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
  4. Mg: Magnesium
  5. Al: Aluminum
  6. Si: Silicon
  7. K: Potassium
  8. Ca: Calcium
  9. Ba: Barium
  10. Fe: Iron
  11. Type of glass: (class attribute) – 1 building_windows_float_processed – 2 building_windows_non_float_processed – 3 vehicle_windows_float_processed – 4 vehicle_windows_non_float_processed (none in this database) – 5 containers – 6 tableware – 7 headlamps

Examine Data

# Load the Glass data set and preview its first six rows as a table.
data("Glass")
kable(head(Glass, n = 6L))
RI Na Mg Al Si K Ca Ba Fe Type
1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
# Compact overview of the column types in Glass.
str(object = Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Descriptive statistics (mean, sd, skew, kurtosis, ...) for every column.
psych::describe(Glass)
##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## RI       1 214  1.52 0.00   1.52    1.52 0.00  1.51  1.53  0.02  1.60     4.72
## Na       2 214 13.41 0.82  13.30   13.38 0.64 10.73 17.38  6.65  0.45     2.90
## Mg       3 214  2.68 1.44   3.48    2.87 0.30  0.00  4.49  4.49 -1.14    -0.45
## Al       4 214  1.44 0.50   1.36    1.41 0.31  0.29  3.50  3.21  0.89     1.94
## Si       5 214 72.65 0.77  72.79   72.71 0.57 69.81 75.41  5.60 -0.72     2.82
## K        6 214  0.50 0.65   0.56    0.43 0.17  0.00  6.21  6.21  6.46    52.87
## Ca       7 214  8.96 1.42   8.60    8.74 0.66  5.43 16.19 10.76  2.02     6.41
## Ba       8 214  0.18 0.50   0.00    0.03 0.00  0.00  3.15  3.15  3.37    12.08
## Fe       9 214  0.06 0.10   0.00    0.04 0.00  0.00  0.51  0.51  1.73     2.52
## Type*   10 214  2.54 1.71   2.00    2.31 1.48  1.00  6.00  5.00  1.04    -0.29
##         se
## RI    0.00
## Na    0.06
## Mg    0.10
## Al    0.03
## Si    0.05
## K     0.04
## Ca    0.10
## Ba    0.03
## Fe    0.01
## Type* 0.12
  1. Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

Histogram

# Histogram of every Glass predictor, one facet per predictor.
# melt() keeps Type as the id column and stacks the remaining columns into
# `variable`/`value` pairs, which is the long form facet_wrap() needs.
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_histogram(
    colour = "gray", bins = 50, alpha = 0.5, position = "identity"
  ) +
  ggtitle("Histogram") +
  # `scales` is spelled out in full rather than relying on partial argument
  # matching; "free" gives each facet its own x and y ranges.
  facet_wrap(~variable, scales = "free")

Density

# Density estimate of every Glass predictor, one facet per predictor.
# melt() keeps Type as the id column and stacks the remaining columns into
# `variable`/`value` pairs, which is the long form facet_wrap() needs.
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_density(colour = "gray", alpha = 0.5, position = "identity") +
  ggtitle("Density Plot Comparison") +
  # `scales` is spelled out in full rather than relying on partial argument
  # matching; "free" gives each facet its own x and y ranges.
  facet_wrap(~variable, scales = "free")

Skewness

# Sample skewness of each Glass predictor, tabulated by predictor name.
# The same name vector selects the columns and labels the rows, so the
# labels cannot drift out of step with the values.
TypeValue <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")
# vapply() enforces exactly one numeric skewness per column; unname() keeps
# the default 1..n row names in the data frame built below.
Skewness <- unname(vapply(Glass[TypeValue], skewness, numeric(1)))
ds1 <- data.frame(TypeValue, Skewness)
kable(ds1)
TypeValue Skewness
RI 1.6027151
Na 0.4478343
Mg -1.1364523
Al 0.8946104
Si -0.7202392
K 6.4600889
Ca 2.0184463
Ba 3.3686800
Fe 1.7298107

Pairs Plot

# Scatterplot matrix of the predictors (every column except Type).
pairs.panels(Glass[setdiff(names(Glass), "Type")])

  1. Do there appear to be any outliers in the data? Are any predictors skewed?

From the visualizations and summary statistics above, we can see that the shape of the distribution differs from predictor to predictor. The density plots show the skewness of the data: for example, Na and Al look roughly normal, whereas Ba, Fe, and Ca (among others) are right-skewed, with most of their mass on the left. The skewness table above lists the skew value for each predictor.

Predictor Variable Outlier

# Outliers of each predictor: the points beyond the boxplot whiskers,
# taken from the `out` component of boxplot.stats().
boxplot.stats(Glass$RI)$out
##  [1] 1.52667 1.52320 1.51215 1.52725 1.52410 1.52475 1.53125 1.53393 1.52664
## [10] 1.52739 1.52777 1.52614 1.52369 1.51115 1.51131 1.52315 1.52365
boxplot.stats(Glass$Na)$out
## [1] 11.45 10.73 11.23 11.02 11.03 17.38 15.79
boxplot.stats(Glass$Mg)$out
## numeric(0)
boxplot.stats(Glass$Al)$out
##  [1] 0.29 0.47 0.47 0.51 3.50 3.04 3.02 0.34 2.38 2.79 2.68 2.54 2.34 2.66 2.51
## [16] 2.42 2.74 2.88
boxplot.stats(Glass$Si)$out
##  [1] 70.57 69.81 70.16 74.45 69.89 70.48 70.70 74.55 75.41 70.26 70.43 75.18
boxplot.stats(Glass$K)$out
## [1] 1.68 6.21 6.21 1.76 1.46 2.70 1.41
boxplot.stats(Glass$Ca)$out
##  [1] 11.64 10.79 13.24 13.30 16.19 11.52 10.99 14.68 14.96 14.40 11.14 13.44
## [13]  5.87 11.41 11.62 11.53 11.32 12.24 12.50 11.27 10.88 11.22  6.65  5.43
## [25]  5.79  6.47
boxplot.stats(Glass$Ba)$out
##  [1] 0.09 0.11 0.69 0.14 0.11 3.15 0.27 0.09 0.06 0.15 2.20 0.24 1.19 1.63 1.68
## [16] 0.76 0.64 0.40 1.59 1.57 0.61 0.81 0.66 0.64 0.53 0.63 0.56 1.71 0.67 1.55
## [31] 1.38 2.88 0.54 1.06 1.59 1.64 1.57 1.67
boxplot.stats(Glass$Fe)$out
##  [1] 0.26 0.30 0.31 0.32 0.34 0.28 0.29 0.28 0.35 0.37 0.51 0.28
  1. Are there any relevant transformations of one or more predictors that might improve the classification model?

Transformation

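To improve the model, we can apply transformations such as Box-Cox to the highly skewed variables, and normalize the variables by centering and scaling. Along with these techniques, we should examine the outliers and consider removing them. Note that Box-Cox requires strictly positive values, which is why no lambda could be estimated below for Mg, K, Ba, and Fe: each of them has a minimum of 0.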

# Estimate a Box-Cox transformation for each predictor (every column except
# Type). lapply() iterates over the data-frame columns directly, instead of
# first coercing the whole frame to a matrix as apply() does, and the
# response is excluded by name rather than by position.
lapply(Glass[, names(Glass) != "Type"], BoxCoxTrans)
## $RI
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.511   1.517   1.518   1.518   1.519   1.534 
## 
## Largest/Smallest: 1.02 
## Sample Skewness: 1.6 
## 
## Estimated Lambda: -2 
## 
## 
## $Na
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.73   12.91   13.30   13.41   13.82   17.38 
## 
## Largest/Smallest: 1.62 
## Sample Skewness: 0.448 
## 
## Estimated Lambda: -0.1 
## With fudge factor, Lambda = 0 will be used for transformations
## 
## 
## $Mg
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.115   3.480   2.685   3.600   4.490 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Al
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.290   1.190   1.360   1.445   1.630   3.500 
## 
## Largest/Smallest: 12.1 
## Sample Skewness: 0.895 
## 
## Estimated Lambda: 0.5 
## 
## 
## $Si
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69.81   72.28   72.79   72.65   73.09   75.41 
## 
## Largest/Smallest: 1.08 
## Sample Skewness: -0.72 
## 
## Estimated Lambda: 2 
## 
## 
## $K
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1225  0.5550  0.4971  0.6100  6.2100 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Ca
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.430   8.240   8.600   8.957   9.172  16.190 
## 
## Largest/Smallest: 2.98 
## Sample Skewness: 2.02 
## 
## Estimated Lambda: -1.1 
## 
## 
## $Ba
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.175   0.000   3.150 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Fe
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05701 0.10000 0.51000 
## 
## Lambda could not be estimated; no transformation is applied

Problem 3.2

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes.

Examine Data

# Load the Soybean data set and preview its first ten rows as a table.
data("Soybean")
kable(head(Soybean, n = 10))
Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
diaporthe-stem-canker 6 0 2 1 0 1 1 1 0 0 1 1 0 2 2 0 0 0 1 1 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 4 0 2 1 0 2 0 2 1 1 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 3 0 2 1 0 1 0 2 1 2 1 1 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 3 0 2 1 0 1 0 2 0 1 1 1 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 6 0 2 1 0 2 0 1 0 2 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 5 0 2 1 0 3 0 1 0 1 1 1 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 5 0 2 1 0 2 0 1 1 0 1 1 0 2 2 0 0 0 1 1 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 4 0 2 1 1 1 0 1 0 2 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 6 0 2 1 0 3 0 1 1 1 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 4 0 2 1 0 2 0 2 0 2 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
# Compact overview of the column types in Soybean.
str(object = Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
  1. Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
# Bar chart of level counts for every Soybean predictor, one facet each.
# melt() keeps Class as the id column and stacks the remaining columns into
# `variable`/`value` pairs.
ggplot(melt(Soybean, id.vars = "Class"), aes(x = value)) +
  # geom_bar() counts rows per level by default, which is the intended plot
  # for categorical values (a histogram is meant for binned continuous data).
  geom_bar() +
  # `scales` is spelled out in full rather than relying on partial argument
  # matching; "free" gives each facet its own x and y ranges.
  facet_wrap(~variable, scales = "free")

We can use the nearZeroVar function in the caret package to identify the predictors with degenerate distributions.

# Identify near-zero-variance predictors with caret::nearZeroVar().
# The metrics table is computed once; its logical `nzv` column marks the
# flagged predictors, so the column indices are derived from it instead of
# running nearZeroVar() a second time.
nzv.metrics <- nearZeroVar(Soybean, saveMetrics = TRUE)
nzv.cols <- which(nzv.metrics$nzv)
nzv.out <- nzv.metrics[nzv.cols, ]
# Share of all rows (NA rows included in the denominator) taken by the most
# frequent level of each flagged predictor. vapply() works on the factor
# columns directly and guarantees one numeric value per column.
nzv.prop.high <- vapply(
  Soybean[nzv.cols],
  function(x) max(table(x)) / length(x),
  numeric(1)
)

summary(Soybean[, nzv.cols])
##  leaf.mild  mycelium   sclerotia 
##  0   :535   0   :639   0   :625  
##  1   : 20   1   :  6   1   : 20  
##  2   : 20   NA's: 38   NA's: 38  
##  NA's:108

Frequency Distribution

# Frequency table of every predictor (all columns except Class), each
# rendered with kable(). The inner lapply() tabulates the factor columns
# directly instead of coercing the data frame to a character matrix, as
# apply() would. table() is passed as-is (not wrapped in a function) so the
# rendered column header stays "Var1".
lapply(lapply(Soybean[-1], table), kable)
## $date
## 
## 
## Var1    Freq
## -----  -----
## 0         26
## 1         75
## 2         93
## 3        118
## 4        131
## 5        149
## 6         90
## 
## $plant.stand
## 
## 
## Var1    Freq
## -----  -----
## 0        354
## 1        293
## 
## $precip
## 
## 
## Var1    Freq
## -----  -----
## 0         74
## 1        112
## 2        459
## 
## $temp
## 
## 
## Var1    Freq
## -----  -----
## 0         80
## 1        374
## 2        199
## 
## $hail
## 
## 
## Var1    Freq
## -----  -----
## 0        435
## 1        127
## 
## $crop.hist
## 
## 
## Var1    Freq
## -----  -----
## 0         65
## 1        165
## 2        219
## 3        218
## 
## $area.dam
## 
## 
## Var1    Freq
## -----  -----
## 0        123
## 1        227
## 2        145
## 3        187
## 
## $sever
## 
## 
## Var1    Freq
## -----  -----
## 0        195
## 1        322
## 2         45
## 
## $seed.tmt
## 
## 
## Var1    Freq
## -----  -----
## 0        305
## 1        222
## 2         35
## 
## $germ
## 
## 
## Var1    Freq
## -----  -----
## 0        165
## 1        213
## 2        193
## 
## $plant.growth
## 
## 
## Var1    Freq
## -----  -----
## 0        441
## 1        226
## 
## $leaves
## 
## 
## Var1    Freq
## -----  -----
## 0         77
## 1        606
## 
## $leaf.halo
## 
## 
## Var1    Freq
## -----  -----
## 0        221
## 1         36
## 2        342
## 
## $leaf.marg
## 
## 
## Var1    Freq
## -----  -----
## 0        357
## 1         21
## 2        221
## 
## $leaf.size
## 
## 
## Var1    Freq
## -----  -----
## 0         51
## 1        327
## 2        221
## 
## $leaf.shread
## 
## 
## Var1    Freq
## -----  -----
## 0        487
## 1         96
## 
## $leaf.malf
## 
## 
## Var1    Freq
## -----  -----
## 0        554
## 1         45
## 
## $leaf.mild
## 
## 
## Var1    Freq
## -----  -----
## 0        535
## 1         20
## 2         20
## 
## $stem
## 
## 
## Var1    Freq
## -----  -----
## 0        296
## 1        371
## 
## $lodging
## 
## 
## Var1    Freq
## -----  -----
## 0        520
## 1         42
## 
## $stem.cankers
## 
## 
## Var1    Freq
## -----  -----
## 0        379
## 1         39
## 2         36
## 3        191
## 
## $canker.lesion
## 
## 
## Var1    Freq
## -----  -----
## 0        320
## 1         83
## 2        177
## 3         65
## 
## $fruiting.bodies
## 
## 
## Var1    Freq
## -----  -----
## 0        473
## 1        104
## 
## $ext.decay
## 
## 
## Var1    Freq
## -----  -----
## 0        497
## 1        135
## 2         13
## 
## $mycelium
## 
## 
## Var1    Freq
## -----  -----
## 0        639
## 1          6
## 
## $int.discolor
## 
## 
## Var1    Freq
## -----  -----
## 0        581
## 1         44
## 2         20
## 
## $sclerotia
## 
## 
## Var1    Freq
## -----  -----
## 0        625
## 1         20
## 
## $fruit.pods
## 
## 
## Var1    Freq
## -----  -----
## 0        407
## 1        130
## 2         14
## 3         48
## 
## $fruit.spots
## 
## 
## Var1    Freq
## -----  -----
## 0        345
## 1         75
## 2         57
## 4        100
## 
## $seed
## 
## 
## Var1    Freq
## -----  -----
## 0        476
## 1        115
## 
## $mold.growth
## 
## 
## Var1    Freq
## -----  -----
## 0        524
## 1         67
## 
## $seed.discolor
## 
## 
## Var1    Freq
## -----  -----
## 0        513
## 1         64
## 
## $seed.size
## 
## 
## Var1    Freq
## -----  -----
## 0        532
## 1         59
## 
## $shriveling
## 
## 
## Var1    Freq
## -----  -----
## 0        539
## 1         38
## 
## $roots
## 
## 
## Var1    Freq
## -----  -----
## 0        551
## 1         86
## 2         15

Missing data problem

  1. Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

Missing Values %

# Share of rows that contain at least one missing value.
1 - length(which(complete.cases(Soybean))) / nrow(Soybean)
## [1] 0.1771596

Proportion of incomplete rows and of missing cells

# Proportion of observations that are not complete cases.
1 - (sum(complete.cases(Soybean)) / nrow(Soybean))
## [1] 0.1771596
# Proportion of individual cells (rows x columns) that are missing.
sum(colSums(is.na(Soybean))) / ncol(Soybean) / nrow(Soybean)
## [1] 0.09504636

Missing values per column

# Number of missing values in each column, named by column.
# vapply() walks the data-frame columns directly (no coercion to a matrix,
# as apply() would do) and guarantees one integer count per column.
the.na.Soybean <- vapply(Soybean, function(x) sum(is.na(x)), integer(1))
the.na.Soybean
##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##             121              16               1             121             121 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##             112              16               0              84              84 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##              84             100              84             108              16 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##             121              38              38             106              38 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##              38              38              38              84             106 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##              92              92             106              92             106 
##           roots 
##              31

The output above shows that nearly every predictor has some missing values; only Class and leaves are complete. Nearly 18% of the rows contain at least one missing value.

Solution to Missing data problem

  1. Develop a strategy for handling missing data, either by eliminating predictors or imputation.

There are many strategies for handling missing data. Many observations contain NA values; for example, the phytophthora-rot rows shown below can be imputed with the knnImputation function from the DMwR package. After imputing, we check how many NAs remain and whether it makes sense to drop the remaining incomplete rows. We have to be careful here, because the handling of missing values should not change the meaning of what we are trying to achieve.

# Preview the first five phytophthora-rot observations.
kable(head(Soybean[which(Soybean$Class == "phytophthora-rot"), ], n = 5))
Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
31 phytophthora-rot 0 1 2 1 1 1 1 1 0 0 1 1 0 2 2 0 0 0 1 0 1 2 0 1 0 0 0 3 4 0 0 0 0 0 0
32 phytophthora-rot 1 1 2 1 NA 3 1 NA NA NA 1 1 0 2 2 0 0 0 1 NA 2 2 NA 0 0 0 0 NA NA NA NA NA NA NA 1
33 phytophthora-rot 2 1 2 2 NA 2 1 NA NA NA 1 1 NA NA NA NA NA NA 1 NA 3 2 NA 0 0 0 0 NA NA NA NA NA NA NA 1
34 phytophthora-rot 1 1 2 0 0 2 1 2 1 1 1 1 0 2 2 0 0 0 1 0 2 2 0 0 0 0 0 3 4 0 0 0 0 0 0
35 phytophthora-rot 2 1 2 2 NA 2 1 NA NA NA 1 1 NA NA NA NA NA NA 1 NA 2 2 NA 0 0 0 0 NA NA NA NA NA NA NA 1
# Fill in the missing values using the 10 nearest neighbours.
# NOTE(review): the whole data frame, including the Class response, is
# passed to the imputation -- confirm that using the outcome here is intended.
imputed_data <- knnImputation(data = Soybean, k = 10)
kable(head(imputed_data, n = 5))
Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
diaporthe-stem-canker 6 0 2 1 0 1 1 1 0 0 1 1 0 2 2 0 0 0 1 1 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 4 0 2 1 0 2 0 2 1 1 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 3 0 2 1 0 1 0 2 1 2 1 1 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 3 0 2 1 0 1 0 2 0 1 1 1 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0 0 4 0 0 0 0 0 0
diaporthe-stem-canker 6 0 2 1 0 2 0 1 0 2 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0