library(gmodels)
library(ggpubr)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.90 loaded
library(caret)
## Loading required package: lattice
library(e1071)
library(lattice)
library(AppliedPredictiveModeling)
library(mlbench)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(reshape2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
3.1 The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via:
# Attach mlbench, load its Glass data set into the workspace, and show the
# type and first values of every column.
library(mlbench)
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first rows of the data.
head(Glass)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
# Five-number summary and mean for each predictor, plus counts per Type level.
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
# Working copy: derived columns are added to Glass1 later, leaving Glass as loaded.
Glass1<-Glass
Visualizations:
# Scatterplot matrix of the class label and the nine predictors.
# Each predictor appears exactly once in the formula (the original listed K
# twice, which was a typo).
pairs(
  Type ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
  data = Glass1, gap = 0.4, cex.labels = 1.5
)
# Second view of the same ten columns using pairs.panels() from another package.
pairs.panels(
  Glass1[, c("RI", "Na", "Mg", "Al", "Si", "Ca", "K", "Ba", "Fe", "Type")]
)
There appears to be a correlation between RI and Ca, and a moderate correlation between RI and Si.
Take a closer look at the histogram, boxplot, and skewness of each predictor:
# RI (refractive index): histogram, boxplot, and sample skewness.
hist(Glass1$RI)
boxplot(Glass1$RI)
skewness(Glass1$RI)
## [1] 1.602715
RI - the histogram is slightly right skewed with a skewness=1.6. Possibly investigate point of 1.53 as outlier.
# Na: histogram, boxplot, and sample skewness.
hist(Glass1$Na)
boxplot(Glass1$Na)
skewness(Glass1$Na)
## [1] 0.4478343
Na - The outlier near 17.4 (the maximum) should be investigated. The distribution is close to symmetric.
# Mg: histogram, boxplot, and sample skewness.
hist(Glass1$Mg)
boxplot(Glass1$Mg)
skewness(Glass1$Mg)
## [1] -1.136452
Mg - The distribution is bimodal, with one of the modes at 0. Is a value of 0 plausible, or does it mean the element was undetectable? Perhaps this variable needs to be encoded with dummy variables.
# Al: histogram, boxplot, and sample skewness.
hist(Glass1$Al)
boxplot(Glass1$Al)
skewness(Glass1$Al)
## [1] 0.8946104
Al- distribution has a slight right skew with no cause for concern with outliers.
# Si: histogram, boxplot, and sample skewness.
hist(Glass1$Si)
boxplot(Glass1$Si)
skewness(Glass1$Si)
## [1] -0.7202392
Si - The distribution has a slight left skew. No cause for concern relating to outliers.
# Ca: histogram, boxplot, and sample skewness.
hist(Glass1$Ca)
boxplot(Glass1$Ca)
skewness(Glass1$Ca)
## [1] 2.018446
Ca - distribution is right skewed. Investigate outliers at 13, 14, 16.
# K: histogram, boxplot, and sample skewness.
hist(Glass1$K)
boxplot(Glass1$K)
skewness(Glass1$K)
## [1] 6.460089
K - Outlier at 6 needs to be investigated. Right skewed.
# Ba: histogram, boxplot, and sample skewness.
hist(Glass1$Ba)
boxplot(Glass1$Ba)
skewness(Glass1$Ba)
## [1] 3.36868
Ba - distribution is right skewed. Are these plausible values?
# Fe: histogram, boxplot, and sample skewness.
hist(Glass1$Fe)
boxplot(Glass1$Fe)
skewness(Glass1$Fe)
## [1] 1.729811
Fe - distribution is right skewed with most values at 0. Is 0 plausible or undetectable? Fe may warrant encoding.
DISCUSSION:
Several of the variables are skewed. A transformation may help to remove the skew.
The highest skewness is found in
Ca, K, and Ba.
# Add log-scale versions of the three most skewed predictors.
# Ca is strictly positive (minimum 5.43), so a plain log is safe. K and Ba
# contain exact zeros, where log() returns -Inf; hist() discards non-finite
# values, so those samples would silently vanish from the plots. log1p(x),
# i.e. log(1 + x), keeps them finite. Column names are unchanged.
Glass1 <- Glass1 %>%
  mutate(
    log_Ca = log(Ca),
    log_K = log1p(K),
    log_Ba = log1p(Ba)
  )
DISCUSSION: Take a look at the histogram for the original variable and then the log transformed variable.
# Compare each predictor's histogram before and after the log-scale transform.
# NOTE(review): hist() ignores non-finite values, so confirm the log_* columns
# contain none before reading these panels.
hist(Glass1$Ca)
hist(Glass1$log_Ca)
# The log transformation improved the symmetry
hist(Glass1$K)
hist(Glass1$log_K)
# The log transformation improved the symmetry
hist(Glass1$Ba)
hist(Glass1$log_Ba)
# The log transformation improved the symmetry
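All of the log transformations improved the skewness. Note that K and Ba contain zeros (see the summary above), where a plain log() is undefined, so a shifted transform such as log1p() is needed for those two predictors.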
# mlbench is already attached, so the Soybean data can be loaded directly.
data(Soybean)
## See ?Soybean for details
# Dimensions (rows, columns), then the type and levels of every column.
dim(Soybean)
## [1] 683 36
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
# Level counts (including NA counts) for every column.
summary(Soybean)
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
# Bar chart of level counts for every Soybean predictor, one facet per variable.
# The predictors are converted to character before melting: their factor level
# sets differ, which otherwise makes melt() warn and drop the attributes itself.
# geom_bar() is the counting geom for discrete values; the original
# geom_histogram(stat = "count") overrode the binning stat and warned about
# ignored bin parameters.
Soybean %>%
  mutate(across(-Class, as.character)) %>%
  melt(id.vars = "Class") %>%
  ggplot(aes(x = value)) +
  geom_bar() +
  # facet_wrap(~variable, scale = "free")
  facet_wrap(~variable)
## Warning: attributes are not identical across measure variables; they will be
## dropped
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Column names with their positions, for matching against index-based results.
colnames(Soybean)
## [1] "Class" "date" "plant.stand" "precip"
## [5] "temp" "hail" "crop.hist" "area.dam"
## [9] "sever" "seed.tmt" "germ" "plant.growth"
## [13] "leaves" "leaf.halo" "leaf.marg" "leaf.size"
## [17] "leaf.shread" "leaf.malf" "leaf.mild" "stem"
## [21] "lodging" "stem.cankers" "canker.lesion" "fruiting.bodies"
## [25] "ext.decay" "mycelium" "int.discolor" "sclerotia"
## [29] "fruit.pods" "fruit.spots" "seed" "mold.growth"
## [33] "seed.discolor" "seed.size" "shriveling" "roots"
# Column indices that nearZeroVar() flags as near-zero-variance predictors.
nearZeroVar(Soybean)
## [1] 19 26 28
dim(Soybean)
## [1] 683 36
# There are 36 columns (variables).
# Near-zero-variance predictors have a single value for most samples and add
# little information.
# Look up the names of the three flagged columns by position.
colnames(Soybean)[19]
## [1] "leaf.mild"
colnames(Soybean)[26]
## [1] "mycelium"
colnames(Soybean)[28]
## [1] "sclerotia"
Let’s take a closer look:
# Plot the distribution of each of the three flagged predictors.
histogram(Soybean$leaf.mild)
histogram(Soybean$mycelium)
histogram(Soybean$sclerotia)
These predictors carry little meaningful information and should not be used.
# Drop the near-zero-variance predictors by name instead of by hard-coded
# column positions, so the selection stays correct if the column order changes.
# nearZeroVar(names = TRUE) returns the flagged column names
# (leaf.mild, mycelium, sclerotia for this data).
nzv_cols <- nearZeroVar(Soybean, names = TRUE)
Soybean2 <- Soybean %>%
  select(-all_of(nzv_cols))
dim(Soybean2)
## [1] 683 33
# Soybean2 has dropped the three variables: leaf.mild, mycelium, sclerotia
There are 19 classes, only the first 15 of which have been used in prior work. The folklore seems to be that the last four classes are unjustified by the data since they have so few examples. There are 35 categorical attributes, some nominal and some ordered. The value “dna” means does not apply. The values for attributes are encoded numerically, with the first value encoded as “0,” the second as “1,” and so forth.
# What are the last 4 classes? How many records are in each class?
# Tally the rows per Class into a column called `number`.
Soybean %>%
  as_tibble() %>%
  count(Class, name = "number")
## # A tibble: 19 x 2
## Class number
## <fct> <int>
## 1 2-4-d-injury 16
## 2 alternarialeaf-spot 91
## 3 anthracnose 44
## 4 bacterial-blight 20
## 5 bacterial-pustule 20
## 6 brown-spot 92
## 7 brown-stem-rot 44
## 8 charcoal-rot 20
## 9 cyst-nematode 14
## 10 diaporthe-pod-&-stem-blight 15
## 11 diaporthe-stem-canker 20
## 12 downy-mildew 20
## 13 frog-eye-leaf-spot 91
## 14 herbicide-injury 8
## 15 phyllosticta-leaf-spot 20
## 16 phytophthora-rot 88
## 17 powdery-mildew 20
## 18 purple-seed-stain 20
## 19 rhizoctonia-root-rot 20
# Missing-value count for every column of Soybean2.
countNA <- Soybean2 %>%
  is.na() %>%
  colSums()
print(countNA)
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf stem lodging
## 84 100 84 16 121
## stem.cankers canker.lesion fruiting.bodies ext.decay int.discolor
## 38 38 106 38 38
## fruit.pods fruit.spots seed mold.growth seed.discolor
## 84 106 92 92 106
## seed.size shriveling roots
## 92 106 31
# Treat any variable with more than 10% missing values (NA) as a problem.
dim(Soybean2)
## [1] 683 33
# 10% of the 683 rows.
683*.10
## [1] 68.3
# List the variables whose NA count exceeds that threshold. The cutoff is
# written as 68; counts are whole numbers, so > 68 and > 68.3 select the same
# variables.
df<-as.data.frame(countNA)
df%>%filter(countNA>68)
## countNA
## hail 121
## sever 121
## seed.tmt 121
## germ 112
## leaf.halo 84
## leaf.marg 84
## leaf.size 84
## leaf.shread 100
## leaf.malf 84
## lodging 121
## fruiting.bodies 106
## fruit.pods 84
## fruit.spots 106
## seed 92
## mold.growth 92
## seed.discolor 106
## seed.size 92
## shriveling 106
# Missing values broken down by class: one row per Class, one column per
# predictor holding its NA count, plus TotalNA summed across date:roots.
# across() replaces summarise_all(funs(...)), which dplyr has deprecated.
miss_by_class <- Soybean2 %>%
  group_by(Class) %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  mutate(TotalNA = select(., date:roots) %>% rowSums())
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
# Classes ranked by their total number of missing values, largest first.
miss_by_class %>%
  select(Class, TotalNA) %>%
  arrange(desc(TotalNA))
## # A tibble: 19 x 2
## Class TotalNA
## <fct> <dbl>
## 1 phytophthora-rot 1159
## 2 2-4-d-injury 402
## 3 cyst-nematode 294
## 4 diaporthe-pod-&-stem-blight 162
## 5 herbicide-injury 136
## 6 alternarialeaf-spot 0
## 7 anthracnose 0
## 8 bacterial-blight 0
## 9 bacterial-pustule 0
## 10 brown-spot 0
## 11 brown-stem-rot 0
## 12 charcoal-rot 0
## 13 diaporthe-stem-canker 0
## 14 downy-mildew 0
## 15 frog-eye-leaf-spot 0
## 16 phyllosticta-leaf-spot 0
## 17 powdery-mildew 0
## 18 purple-seed-stain 0
## 19 rhizoctonia-root-rot 0
# phytophthora-rot has 1159 NAs; maybe this class should be analyzed
# separately. Possibly remove it now and run a separate analysis later, based
# on a small number of predictors, alongside the other classes.
# Number of rows in each class, largest first.
Soybean %>%
  as_tibble() %>%
  count(Class, name = "number", sort = TRUE)
## # A tibble: 19 x 2
## Class number
## <fct> <int>
## 1 brown-spot 92
## 2 alternarialeaf-spot 91
## 3 frog-eye-leaf-spot 91
## 4 phytophthora-rot 88
## 5 anthracnose 44
## 6 brown-stem-rot 44
## 7 bacterial-blight 20
## 8 bacterial-pustule 20
## 9 charcoal-rot 20
## 10 diaporthe-stem-canker 20
## 11 downy-mildew 20
## 12 phyllosticta-leaf-spot 20
## 13 powdery-mildew 20
## 14 purple-seed-stain 20
## 15 rhizoctonia-root-rot 20
## 16 2-4-d-injury 16
## 17 diaporthe-pod-&-stem-blight 15
## 18 cyst-nematode 14
## 19 herbicide-injury 8
# phytophthora-rot has 88 rows with many NAs. It would be a candidate for
# imputation, but imputing is difficult when this much is missing, so consider
# setting the class aside for a later analysis because of its many
# observations and missing values.
# herbicide-injury, 2-4-d-injury, cyst-nematode and diaporthe-pod-&-stem-blight
# have few examples and a fair share of missing values; consider dropping
# their rows too.
Soybean3 <- filter(Soybean2, Class != "phytophthora-rot")
dim(Soybean3)
## [1] 595 33
# Missing-value count per column once phytophthora-rot has been removed.
countNA3 <- Soybean3 %>%
  is.na() %>%
  colSums()
print(countNA3)
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 53 16 1 53 53
## germ plant.growth leaves leaf.halo leaf.marg
## 44 16 0 29 29
## leaf.size leaf.shread leaf.malf stem lodging
## 29 45 29 16 53
## stem.cankers canker.lesion fruiting.bodies ext.decay int.discolor
## 38 38 38 38 38
## fruit.pods fruit.spots seed mold.growth seed.discolor
## 16 38 24 24 38
## seed.size shriveling roots
## 24 38 31
#This count of NAs looks more reasonable
The strategy for some missing has been developed above. Based on near zero variance, three predictors have been removed.
In addition, the class phytophthora-rot had an inordinate number of NAs across all variables; removing this class reduces the data set by 88 rows.
At this point we are left with 595 observations and 33 variables.
Let’s see which variables have a high NA left:
# Per-column NA counts for Soybean3 (the same computation as countNA3 above),
# kept under its own name for the sorted table that follows.
countNA2<-colSums(is.na(Soybean3))
countNA2
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 53 16 1 53 53
## germ plant.growth leaves leaf.halo leaf.marg
## 44 16 0 29 29
## leaf.size leaf.shread leaf.malf stem lodging
## 29 45 29 16 53
## stem.cankers canker.lesion fruiting.bodies ext.decay int.discolor
## 38 38 38 38 38
## fruit.pods fruit.spots seed mold.growth seed.discolor
## 16 38 24 24 38
## seed.size shriveling roots
## 24 38 31
# Turn the named count vector into a one-column data frame and list the
# variables from most to fewest missing values.
df2 <- as.data.frame(countNA2)
arrange(df2, desc(countNA2))
## countNA2
## hail 53
## sever 53
## seed.tmt 53
## lodging 53
## leaf.shread 45
## germ 44
## precip 38
## stem.cankers 38
## canker.lesion 38
## fruiting.bodies 38
## ext.decay 38
## int.discolor 38
## fruit.spots 38
## seed.discolor 38
## shriveling 38
## plant.stand 36
## roots 31
## temp 30
## leaf.halo 29
## leaf.marg 29
## leaf.size 29
## leaf.malf 29
## seed 24
## mold.growth 24
## seed.size 24
## crop.hist 16
## plant.growth 16
## stem 16
## fruit.pods 16
## date 1
## area.dam 1
## Class 0
## leaves 0
The missingness has improved.
We are faced with the question of whether or not to impute. At this point I would run the analysis with the justification that missingness was addressed by thought out deletion.
Imputation can reduce the quality of the data, because you are making an educated guess. All of the variables are nominal or ordinal categorical.
An imputation based on the mode may be a good choice.
For instance, taking the mode of hail (which is not hailing) is a pretty safe imputation because hail is a rarer occurrence.
# Impute missing hail values with the most frequent observed level (the mode),
# storing the result in a new column hail_imp so the original hail is kept.
# hail is a factor, so the replacement must be one of its levels: tidyr >= 1.2.0
# casts the replace_na() value to the column type, and a numeric 0 cannot be
# cast to a factor. Assigning the level label works on any version.
hail_mode <- names(which.max(table(Soybean3$hail)))
Soybean4 <- Soybean3 %>%
  mutate(hail_imp = hail)
Soybean5 <- Soybean4 %>%
  mutate(hail_imp = replace(hail_imp, is.na(hail_imp), hail_mode))
# Confirm that hail_imp has no NAs left while the other columns are untouched.
countNA5 <- colSums(is.na(Soybean5))
countNA5
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 53 16 1 53 53
## germ plant.growth leaves leaf.halo leaf.marg
## 44 16 0 29 29
## leaf.size leaf.shread leaf.malf stem lodging
## 29 45 29 16 53
## stem.cankers canker.lesion fruiting.bodies ext.decay int.discolor
## 38 38 38 38 38
## fruit.pods fruit.spots seed mold.growth seed.discolor
## 16 38 24 24 38
## seed.size shriveling roots hail_imp
## 24 38 31 0
DISCUSSION: USE hail_imp variable instead of hail.
Imputing can also be done on sever, seed.tmt, and lodging; however, a clearer understanding of these variables is necessary before going forward. We need to consult and collaborate to gain a better understanding before imputation.
Let's try the mice imputation package for R.
# Pattern of missingness.
# NOTE(review): this runs on the original Soybean data, not on the reduced
# Soybean3 / Soybean5 frames built earlier — confirm that is intended.
md.pattern(Soybean)
## Class leaves date area.dam crop.hist plant.growth stem temp roots
## 562 1 1 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 1 1 1
## 55 1 1 1 1 1 1 1 1 1
## 8 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 0
## 6 1 1 1 1 1 1 1 1 0
## 14 1 1 1 1 1 1 1 0 1
## 15 1 1 1 1 0 0 0 0 0
## 1 1 1 0 0 0 0 0 0 0
## 0 0 1 1 16 16 16 30 31
## plant.stand precip stem.cankers canker.lesion ext.decay mycelium
## 562 1 1 1 1 1 1
## 13 1 1 1 1 1 1
## 55 1 1 1 1 1 1
## 8 1 0 0 0 0 0
## 9 1 1 1 1 1 1
## 6 0 1 1 1 1 1
## 14 0 0 0 0 0 0
## 15 0 0 0 0 0 0
## 1 0 0 0 0 0 0
## 36 38 38 38 38 38
## int.discolor sclerotia leaf.halo leaf.marg leaf.size leaf.malf fruit.pods
## 562 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 0
## 55 1 1 0 0 0 0 0
## 8 0 0 1 1 1 1 1
## 9 1 1 0 0 0 0 1
## 6 1 1 0 0 0 0 1
## 14 0 0 0 0 0 0 1
## 15 0 0 1 1 1 1 0
## 1 0 0 1 1 1 1 0
## 38 38 84 84 84 84 84
## seed mold.growth seed.size leaf.shread fruiting.bodies fruit.spots
## 562 1 1 1 1 1 1
## 13 0 0 0 1 0 0
## 55 0 0 0 0 0 0
## 8 0 0 0 1 0 0
## 9 1 1 1 0 1 1
## 6 1 1 1 0 1 1
## 14 1 1 1 0 0 0
## 15 0 0 0 0 0 0
## 1 0 0 0 0 0 0
## 92 92 92 100 106 106
## seed.discolor shriveling leaf.mild germ hail sever seed.tmt lodging
## 562 1 1 1 1 1 1 1 1 0
## 13 0 0 1 0 0 0 0 0 13
## 55 0 0 0 0 0 0 0 0 19
## 8 0 0 0 0 0 0 0 0 20
## 9 1 1 0 1 0 0 0 0 11
## 6 1 1 0 0 0 0 0 0 13
## 14 0 0 0 0 0 0 0 0 24
## 15 0 0 0 0 0 0 0 0 28
## 1 0 0 0 0 0 0 0 0 30
## 106 106 108 112 121 121 121 121 2337
# Impute with predictive mean matching (method = 'pmm'); the seed is fixed so
# the run is reproducible, and progress printing is switched off.
# NOTE(review): the input is the original Soybean data (all 36 columns, every
# class), and every column is a factor per str(Soybean) — confirm pmm is the
# intended method for these categorical variables.
imp<- mice(Soybean, method = 'pmm', seed = 123, printFlag = FALSE)
## Warning: Number of logged events: 1684
# Alternative call using the package defaults:
#imp<-mice(Soybean)
# Components stored in the returned object
attributes(imp)
## $names
## [1] "data" "imp" "m" "where"
## [5] "blocks" "call" "nmis" "method"
## [9] "predictorMatrix" "visitSequence" "formulas" "post"
## [13] "blots" "ignore" "seed" "iteration"
## [17] "lastSeedValue" "chainMean" "chainVar" "loggedEvents"
## [21] "version" "date"
##
## $class
## [1] "mids"
# Extract the completed data in "long" format; the summary below shows the
# added .imp and .id columns alongside the original variables.
c.long <- complete(imp, "long")
# Summary of the imputed data frame
summary(c.long)
## .imp .id Class date plant.stand
## Min. :1 Min. : 1 brown-spot : 460 0:130 0:1913
## 1st Qu.:2 1st Qu.:171 alternarialeaf-spot: 455 1:377 1:1502
## Median :3 Median :342 frog-eye-leaf-spot : 455 2:466
## Mean :3 Mean :342 phytophthora-rot : 440 3:591
## 3rd Qu.:4 3rd Qu.:513 anthracnose : 220 4:655
## Max. :5 Max. :683 brown-stem-rot : 220 5:745
## (Other) :1165 6:451
## precip temp hail crop.hist area.dam sever seed.tmt germ
## 0: 499 0: 450 0:2569 0: 374 0: 617 0:1225 0:1694 0: 995
## 1: 568 1:1896 1: 846 1: 842 1:1135 1:1826 1:1296 1:1085
## 2:2348 2:1069 2:1102 2: 725 2: 364 2: 425 2:1335
## 3:1097 3: 938
##
##
##
## plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf
## 0:2228 0: 385 0:1376 0:2045 0: 505 0:2749 0:3051
## 1:1187 1:3030 1: 201 1: 111 1:1711 1: 666 1: 364
## 2:1838 2:1259 2:1199
##
##
##
##
## leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies
## 0:3112 0:1518 0:2938 0:2017 0:1719 0:2629
## 1: 100 1:1897 1: 477 1: 196 1: 415 1: 786
## 2: 203 2: 180 2: 885
## 3:1022 3: 396
##
##
##
## ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0:2611 0:3346 0:3010 0:3192 0:2157 0:1967 0:2612
## 1: 698 1: 69 1: 232 1: 223 1: 650 1: 390 1: 803
## 2: 106 2: 173 2: 89 2: 302
## 3: 519 4: 756
##
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0:2947 0:2871 0:2990 0:2967 0:2853
## 1: 468 1: 544 1: 425 1: 448 1: 430
## 2: 132
##
##
##
##
#missing resolved