DATA 624 Homework 4

library(mlbench) 
library(psych)
library(e1071)
library(ggplot2)
library(knitr)
library(mlbench)
library(reshape2)
library(corrplot)
library(caret)
library(DMwR)
library(GGally)

Problem 3.1

The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

About the Dataset

About the dataset

This is the Glass Identification Data Set from UCI. It contains 10 attributes including the id number. The response is the glass type, a discrete variable with 7 possible values (type 4 does not occur in this data).

Attribute Information:

Id number: 1 to 214
RI: refractive index
Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
Mg: Magnesium
Al: Aluminum
Si: Silicon
K: Potassium
Ca: Calcium
Ba: Barium
Fe: Iron
Type of glass: (class attribute) – 1 building_windows_float_processed – 2 building_windows_non_float_processed – 3 vehicle_windows_float_processed – 4 vehicle_windows_non_float_processed (none in this database) – 5 containers – 6 tableware – 7 headlamps

Examine Data

data(Glass)
kable(head(Glass))

RI	Na	Mg	Al	Si	K	Ca	Fe	Type
1.52101	13.64	4.49	1.10	71.78	0.06	8.75	0.00	1
1.51761	13.89	3.60	1.36	72.73	0.48	7.83	0.00	1
1.51618	13.53	3.55	1.54	72.99	0.39	7.78	0.00	1
1.51766	13.21	3.69	1.29	72.61	0.57	8.22	0.00	1
1.51742	13.27	3.62	1.24	73.08	0.55	8.07	0.00	1
1.51596	12.79	3.61	1.62	72.97	0.64	8.07	0.26	1

str(Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

describe(Glass)

##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## RI       1 214  1.52 0.00   1.52    1.52 0.00  1.51  1.53  0.02  1.60     4.72
## Na       2 214 13.41 0.82  13.30   13.38 0.64 10.73 17.38  6.65  0.45     2.90
## Mg       3 214  2.68 1.44   3.48    2.87 0.30  0.00  4.49  4.49 -1.14    -0.45
## Al       4 214  1.44 0.50   1.36    1.41 0.31  0.29  3.50  3.21  0.89     1.94
## Si       5 214 72.65 0.77  72.79   72.71 0.57 69.81 75.41  5.60 -0.72     2.82
## K        6 214  0.50 0.65   0.56    0.43 0.17  0.00  6.21  6.21  6.46    52.87
## Ca       7 214  8.96 1.42   8.60    8.74 0.66  5.43 16.19 10.76  2.02     6.41
## Ba       8 214  0.18 0.50   0.00    0.03 0.00  0.00  3.15  3.15  3.37    12.08
## Fe       9 214  0.06 0.10   0.00    0.04 0.00  0.00  0.51  0.51  1.73     2.52
## Type*   10 214  2.54 1.71   2.00    2.31 1.48  1.00  6.00  5.00  1.04    -0.29
##         se
## RI    0.00
## Na    0.06
## Mg    0.10
## Al    0.03
## Si    0.05
## K     0.04
## Ca    0.10
## Ba    0.03
## Fe    0.01
## Type* 0.12

Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

Histogram

# Histogram of every Glass predictor, one panel per variable.
# melt() reshapes the data to long format (Type kept as the id column) so
# facet_wrap() can split on `variable`; scales = "free" gives each panel its
# own axis ranges. The argument is spelled out in full rather than relying on
# partial matching of `scale`.
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_histogram(
    colour = "gray",
    bins = 50,
    alpha = 0.5,
    position = "identity"
  ) +
  ggtitle("Histogram") +
  facet_wrap(~variable, scales = "free")

Density

# Kernel density estimate of every Glass predictor, one panel per variable.
# The data are melted to long format (Type kept as the id column) and each
# panel gets its own axis ranges via scales = "free" (spelled out in full
# rather than relying on partial matching of `scale`).
ggplot(melt(Glass, id.vars = "Type"), aes(x = value)) +
  geom_density(colour = "gray", alpha = 0.5, position = "identity") +
  ggtitle("Density Plot Companision") +
  facet_wrap(~variable, scales = "free")

Skewness

# Sample skewness of each numeric predictor in Glass, shown as a two-column
# table (predictor name, skewness).
# The nine predictors are processed in one vapply() call instead of nine
# copy-pasted skewness() calls; unname() keeps ds1 free of row names so the
# printed table has the same two columns as before.
TypeValue <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")
Skewness <- unname(vapply(Glass[TypeValue], skewness, numeric(1)))
ds1 <- data.frame(TypeValue, Skewness)
kable(ds1)

TypeValue	Skewness
RI	1.6027151
Na	0.4478343
Mg	-1.1364523
Al	0.8946104
Si	-0.7202392
K	6.4600889
Ca	2.0184463
Ba	3.3686800
Fe	1.7298107

Pairs Plot

pairs.panels(Glass[,-10])

Do there appear to be any outliers in the data? Are any predictors skewed?

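From the visualizations and the skewness table above, we can see that the shape of the distribution differs from predictor to predictor. Na and Al are the closest to symmetric (skewness 0.45 and 0.89). K, Ba, Ca, Fe, and RI are right-skewed: most of their mass sits at low values with a long right tail, and K is the most extreme (skewness 6.46). Mg and Si are left-skewed (skewness -1.14 and -0.72). The box-plot checks below show points beyond the whiskers for every predictor except Mg, so outliers do appear to be present.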

Predictor Variable Outlier

boxplot(Glass["RI"],plot=FALSE)$out

##  [1] 1.52667 1.52320 1.51215 1.52725 1.52410 1.52475 1.53125 1.53393 1.52664
## [10] 1.52739 1.52777 1.52614 1.52369 1.51115 1.51131 1.52315 1.52365

boxplot(Glass["Na"],plot=FALSE)$out

## [1] 11.45 10.73 11.23 11.02 11.03 17.38 15.79

boxplot(Glass["Mg"],plot=FALSE)$out

## numeric(0)

boxplot(Glass["Al"],plot=FALSE)$out

##  [1] 0.29 0.47 0.47 0.51 3.50 3.04 3.02 0.34 2.38 2.79 2.68 2.54 2.34 2.66 2.51
## [16] 2.42 2.74 2.88

boxplot(Glass["Si"],plot=FALSE)$out

##  [1] 70.57 69.81 70.16 74.45 69.89 70.48 70.70 74.55 75.41 70.26 70.43 75.18

boxplot(Glass["K"],plot=FALSE)$out

## [1] 1.68 6.21 6.21 1.76 1.46 2.70 1.41

boxplot(Glass["Ca"],plot=FALSE)$out

##  [1] 11.64 10.79 13.24 13.30 16.19 11.52 10.99 14.68 14.96 14.40 11.14 13.44
## [13]  5.87 11.41 11.62 11.53 11.32 12.24 12.50 11.27 10.88 11.22  6.65  5.43
## [25]  5.79  6.47

boxplot(Glass["Ba"],plot=FALSE)$out

##  [1] 0.09 0.11 0.69 0.14 0.11 3.15 0.27 0.09 0.06 0.15 2.20 0.24 1.19 1.63 1.68
## [16] 0.76 0.64 0.40 1.59 1.57 0.61 0.81 0.66 0.64 0.53 0.63 0.56 1.71 0.67 1.55
## [31] 1.38 2.88 0.54 1.06 1.59 1.64 1.57 1.67

boxplot(Glass["Fe"],plot=FALSE)$out

##  [1] 0.26 0.30 0.31 0.32 0.34 0.28 0.29 0.28 0.35 0.37 0.51 0.28

Are there any relevant transformations of one or more predictors that might improve the classification model?

Transformation

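To improve the model, we can apply a Box-Cox transformation to the highly skewed predictors and normalize the predictors by centering and scaling. Alongside these techniques, we should examine the outliers and consider removing them. Note that Box-Cox requires strictly positive values, so lambda cannot be estimated for Mg, K, Ba, and Fe, which contain zeros (see the output below).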

# Estimate a Box-Cox lambda for each predictor (column 10, Type, is dropped).
# lapply() walks the data-frame columns directly instead of first coercing
# the data frame to a matrix as apply() does; the result is still a list with
# one BoxCoxTrans object per predictor, named by column.
lapply(Glass[, -10], BoxCoxTrans)

## $RI
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.511   1.517   1.518   1.518   1.519   1.534 
## 
## Largest/Smallest: 1.02 
## Sample Skewness: 1.6 
## 
## Estimated Lambda: -2 
## 
## 
## $Na
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.73   12.91   13.30   13.41   13.82   17.38 
## 
## Largest/Smallest: 1.62 
## Sample Skewness: 0.448 
## 
## Estimated Lambda: -0.1 
## With fudge factor, Lambda = 0 will be used for transformations
## 
## 
## $Mg
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.115   3.480   2.685   3.600   4.490 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Al
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.290   1.190   1.360   1.445   1.630   3.500 
## 
## Largest/Smallest: 12.1 
## Sample Skewness: 0.895 
## 
## Estimated Lambda: 0.5 
## 
## 
## $Si
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69.81   72.28   72.79   72.65   73.09   75.41 
## 
## Largest/Smallest: 1.08 
## Sample Skewness: -0.72 
## 
## Estimated Lambda: 2 
## 
## 
## $K
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1225  0.5550  0.4971  0.6100  6.2100 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Ca
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.430   8.240   8.600   8.957   9.172  16.190 
## 
## Largest/Smallest: 2.98 
## Sample Skewness: 2.02 
## 
## Estimated Lambda: -1.1 
## 
## 
## $Ba
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.175   0.000   3.150 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Fe
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05701 0.10000 0.51000 
## 
## Lambda could not be estimated; no transformation is applied

Problem 3.2

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes.

Examine Data

data(Soybean)

kable(head(Soybean, 10))

Class	date	precip	temp	hail	crop.hist	area.dam	sever	seed.tmt	germ	plant.growth	leaves	leaf.marg	leaf.size	stem	lodging	stem.cankers	canker.lesion	fruiting.bodies	ext.decay	fruit.spots
diaporthe-stem-canker	6	2	1	0	1	1	1	0	0	1	1	2	2	1	1	3	1	1	1	4
diaporthe-stem-canker	4	2	1	0	2	0	2	1	1	1	1	2	2	1	0	3	1	1	1	4
diaporthe-stem-canker	3	2	1	0	1	0	2	1	2	1	1	2	2	1	0	3	0	1	1	4
diaporthe-stem-canker	3	2	1	0	1	0	2	0	1	1	1	2	2	1	0	3	0	1	1	4
diaporthe-stem-canker	6	2	1	0	2	0	1	0	2	1	1	2	2	1	0	3	1	1	1	4
diaporthe-stem-canker	5	2	1	0	3	0	1	0	1	1	1	2	2	1	0	3	0	1	1	4
diaporthe-stem-canker	5	2	1	0	2	0	1	1	0	1	1	2	2	1	1	3	1	1	1	4
diaporthe-stem-canker	4	2	1	1	1	0	1	0	2	1	1	2	2	1	0	3	1	1	1	4
diaporthe-stem-canker	6	2	1	0	3	0	1	1	1	1	1	2	2	1	0	3	1	1	1	4
diaporthe-stem-canker	4	2	1	0	2	0	2	0	2	1	1	2	2	1	0	3	1	1	1	4

str(Soybean)

## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

# Bar chart of level counts for every Soybean predictor, one panel each.
# The predictors are categorical, so geom_bar() (which counts rows per level)
# is used instead of routing a histogram layer through stat = "count".
# scales = "free" is spelled out rather than relying on partial matching.
ggplot(melt(Soybean, id.vars = "Class"), aes(x = value)) +
  geom_bar() +
  facet_wrap(~variable, scales = "free")

We can use the nearZeroVar function in the caret package to identify the predictors with degenerate distributions.

# Identify predictors with near-zero variance (caret defaults).
# nzv.cols      - column indices flagged by nearZeroVar()
# nzv.out       - the frequency-ratio / percent-unique metrics for those columns
# nzv.prop.high - share of rows taken by the most common level of each flagged
#                 column (NAs are excluded from the count but not from the
#                 denominator, because table() drops NA and length() does not)
nzv.cols <- nearZeroVar(Soybean)
nzv.out <- nearZeroVar(Soybean, saveMetrics = TRUE)[nzv.cols, ]
# vapply() over just the flagged columns avoids coercing the whole factor
# data frame to a character matrix and computing values that are then dropped.
nzv.prop.high <- vapply(
  Soybean[nzv.cols],
  function(x) max(table(x)) / length(x),
  numeric(1)
)

summary(Soybean[ , nzv.cols])

##  leaf.mild  mycelium   sclerotia 
##  0   :535   0   :639   0   :625  
##  1   : 20   1   :  6   1   : 20  
##  2   : 20   NA's: 38   NA's: 38  
##  NA's:108

Frequency Distribution

# Frequency table of every predictor (Class, the first column, is dropped),
# each rendered with kable(). The inner lapply() tabulates the factor columns
# directly instead of coercing the data frame to a character matrix via apply().
lapply(lapply(Soybean[-1], table), kable)

## $date
## 
## 
## Var1    Freq
## -----  -----
## 0         26
## 1         75
## 2         93
## 3        118
## 4        131
## 5        149
## 6         90
## 
## $plant.stand
## 
## 
## Var1    Freq
## -----  -----
## 0        354
## 1        293
## 
## $precip
## 
## 
## Var1    Freq
## -----  -----
## 0         74
## 1        112
## 2        459
## 
## $temp
## 
## 
## Var1    Freq
## -----  -----
## 0         80
## 1        374
## 2        199
## 
## $hail
## 
## 
## Var1    Freq
## -----  -----
## 0        435
## 1        127
## 
## $crop.hist
## 
## 
## Var1    Freq
## -----  -----
## 0         65
## 1        165
## 2        219
## 3        218
## 
## $area.dam
## 
## 
## Var1    Freq
## -----  -----
## 0        123
## 1        227
## 2        145
## 3        187
## 
## $sever
## 
## 
## Var1    Freq
## -----  -----
## 0        195
## 1        322
## 2         45
## 
## $seed.tmt
## 
## 
## Var1    Freq
## -----  -----
## 0        305
## 1        222
## 2         35
## 
## $germ
## 
## 
## Var1    Freq
## -----  -----
## 0        165
## 1        213
## 2        193
## 
## $plant.growth
## 
## 
## Var1    Freq
## -----  -----
## 0        441
## 1        226
## 
## $leaves
## 
## 
## Var1    Freq
## -----  -----
## 0         77
## 1        606
## 
## $leaf.halo
## 
## 
## Var1    Freq
## -----  -----
## 0        221
## 1         36
## 2        342
## 
## $leaf.marg
## 
## 
## Var1    Freq
## -----  -----
## 0        357
## 1         21
## 2        221
## 
## $leaf.size
## 
## 
## Var1    Freq
## -----  -----
## 0         51
## 1        327
## 2        221
## 
## $leaf.shread
## 
## 
## Var1    Freq
## -----  -----
## 0        487
## 1         96
## 
## $leaf.malf
## 
## 
## Var1    Freq
## -----  -----
## 0        554
## 1         45
## 
## $leaf.mild
## 
## 
## Var1    Freq
## -----  -----
## 0        535
## 1         20
## 2         20
## 
## $stem
## 
## 
## Var1    Freq
## -----  -----
## 0        296
## 1        371
## 
## $lodging
## 
## 
## Var1    Freq
## -----  -----
## 0        520
## 1         42
## 
## $stem.cankers
## 
## 
## Var1    Freq
## -----  -----
## 0        379
## 1         39
## 2         36
## 3        191
## 
## $canker.lesion
## 
## 
## Var1    Freq
## -----  -----
## 0        320
## 1         83
## 2        177
## 3         65
## 
## $fruiting.bodies
## 
## 
## Var1    Freq
## -----  -----
## 0        473
## 1        104
## 
## $ext.decay
## 
## 
## Var1    Freq
## -----  -----
## 0        497
## 1        135
## 2         13
## 
## $mycelium
## 
## 
## Var1    Freq
## -----  -----
## 0        639
## 1          6
## 
## $int.discolor
## 
## 
## Var1    Freq
## -----  -----
## 0        581
## 1         44
## 2         20
## 
## $sclerotia
## 
## 
## Var1    Freq
## -----  -----
## 0        625
## 1         20
## 
## $fruit.pods
## 
## 
## Var1    Freq
## -----  -----
## 0        407
## 1        130
## 2         14
## 3         48
## 
## $fruit.spots
## 
## 
## Var1    Freq
## -----  -----
## 0        345
## 1         75
## 2         57
## 4        100
## 
## $seed
## 
## 
## Var1    Freq
## -----  -----
## 0        476
## 1        115
## 
## $mold.growth
## 
## 
## Var1    Freq
## -----  -----
## 0        524
## 1         67
## 
## $seed.discolor
## 
## 
## Var1    Freq
## -----  -----
## 0        513
## 1         64
## 
## $seed.size
## 
## 
## Var1    Freq
## -----  -----
## 0        532
## 1         59
## 
## $shriveling
## 
## 
## Var1    Freq
## -----  -----
## 0        539
## 1         38
## 
## $roots
## 
## 
## Var1    Freq
## -----  -----
## 0        551
## 1         86
## 2         15

Missing data problem

Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

Proportion of rows with missing values

# Proportion of rows that contain at least one missing value.
1-(sum(complete.cases(Soybean))/nrow(Soybean))

## [1] 0.1771596

Incomplete cases proportion and overall missing-cell proportion

# Proportion of incomplete rows: one minus the share of complete cases.
1 - sum(complete.cases(Soybean)) / nrow(Soybean)

## [1] 0.1771596

# Proportion of all cells (rows x columns) that are NA.
sum(is.na(Soybean)) / ncol(Soybean) / nrow(Soybean)

## [1] 0.09504636

Missing values per column

# Number of missing values in each column, as a vector named by column.
# colSums() over the logical is.na() matrix replaces the per-column apply().
the.na.Soybean <- colSums(is.na(Soybean))
the.na.Soybean

##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##             121              16               1             121             121 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##             112              16               0              84              84 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##              84             100              84             108              16 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##             121              38              38             106              38 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##              38              38              38              84             106 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##              92              92             106              92             106 
##           roots 
##              31

From the counts above, nearly every column has missing values; only Class and leaves are complete. About 18% of the rows have at least one missing value, while about 9.5% of all cells are missing.

Solution to Missing data problem

Develop a strategy for handling missing data, either by eliminating predictors or imputation.

There are many strategies for handling missing data. Many rows contain NA values; for example, the phytophthora-rot rows shown below are missing many predictors. One option is to impute them with the knnImputation function from the DMwR package. After imputing, we check how many NAs remain and whether it makes sense to drop the remaining incomplete rows. We have to be careful here, because the handling of missing data should not change the meaning of what we are trying to achieve.

kable(head(Soybean[Soybean$Class=='phytophthora-rot',],5))

	Class	date	plant.stand	precip	temp	hail	crop.hist	area.dam	sever	seed.tmt	germ	plant.growth	leaves	leaf.halo	leaf.marg	leaf.size	leaf.shread	leaf.malf	leaf.mild	stem	lodging	stem.cankers	canker.lesion	fruiting.bodies	ext.decay	fruit.pods	fruit.spots	seed	mold.growth	seed.discolor	seed.size	shriveling	roots
31	phytophthora-rot	0	1	2	1	1	1	1	1	0	0	1	1	0	2	2	0	0	0	1	0	1	2	0	1	3	4	0	0	0	0	0	0
32	phytophthora-rot	1	1	2	1	NA	3	1	NA	NA	NA	1	1	0	2	2	0	0	0	1	NA	2	2	NA	0	NA	NA	NA	NA	NA	NA	NA	1
33	phytophthora-rot	2	1	2	2	NA	2	1	NA	NA	NA	1	1	NA	NA	NA	NA	NA	NA	1	NA	3	2	NA	0	NA	NA	NA	NA	NA	NA	NA	1
34	phytophthora-rot	1	1	2	0	0	2	1	2	1	1	1	1	0	2	2	0	0	0	1	0	2	2	0	0	3	4	0	0	0	0	0	0
35	phytophthora-rot	2	1	2	2	NA	2	1	NA	NA	NA	1	1	NA	NA	NA	NA	NA	NA	1	NA	2	2	NA	0	NA	NA	NA	NA	NA	NA	NA	1

# Fill in the missing Soybean values with knnImputation() using k = 10, then
# preview the first five rows of the imputed data.
# NOTE(review): knnImputation is presumably the DMwR function (library(DMwR)
# is loaded at the top of the file) - confirm.
imputed_data <- knnImputation(Soybean,k=10)
kable(head(imputed_data,5))

Class	date	precip	temp	crop.hist	area.dam	sever	seed.tmt	germ	plant.growth	leaves	leaf.marg	leaf.size	stem	lodging	stem.cankers	canker.lesion	fruiting.bodies	ext.decay	fruit.spots
diaporthe-stem-canker	6	2	1	1	1	1	0	0	1	1	2	2	1	1	3	1	1	1	4
diaporthe-stem-canker	4	2	1	2	0	2	1	1	1	1	2	2	1	0	3	1	1	1	4
diaporthe-stem-canker	3	2	1	1	0	2	1	2	1	1	2	2	1	0	3	0	1	1	4
diaporthe-stem-canker	3	2	1	1	0	2	0	1	1	1	2	2	1	0	3	0	1	1	4
diaporthe-stem-canker	6	2	1	2	0	1	0	2	1	1	2	2	1	0	3	1	1	1	4

DATA 624 Homework 4

Monu Chacko

3/7/2021

Problem 3.1

About the Dataset

Examine Data

Histogram

Density

Skewness

Pairs Plot

Predictor Variable Outlier

Transformation

Problem 3.2

Examine Data

Missing data problem

Solution to Missing data problem