HW4 624 pre-processing

library(gmodels)
library(ggpubr)

## Loading required package: ggplot2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(corrplot)

## corrplot 0.90 loaded

library(caret)

## Loading required package: lattice

library(e1071)
library(lattice)
library(AppliedPredictiveModeling)
library(mlbench)
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(reshape2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.4     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()    masks ggplot2::%+%()
## x psych::alpha()  masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()

library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

3.1 The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via:

library(mlbench) 
data(Glass)
str(Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

head(Glass)

##        RI    Na   Mg   Al    Si    K   Ca Ba   Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0 0.00    1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0 0.00    1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0 0.00    1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0 0.00    1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0 0.00    1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07  0 0.26    1

summary(Glass)

##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29

Glass1<-Glass

Visualizations…..

# Scatterplot matrix of the class label (Type) and the nine predictors.
# Each predictor is listed once; the formula previously repeated K.
pairs(Type ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
      data = Glass1, gap = 0.4, cex.labels = 1.5)

# Second view of the same data with psych::pairs.panels, which draws the
# columns in the order they are listed here.
panel_cols <- c("RI", "Na", "Mg", "Al", "Si", "Ca", "K", "Ba", "Fe", "Type")
pairs.panels(Glass1[, panel_cols])

There appears to be a correlation between RI and Ca, and a moderate correlation between RI and Si.

Do there appear to be any outliers in the data? Are any predictors skewed?

Take a close look at the histogram,boxplots, skewness by predictor:

hist(Glass1$RI)

boxplot(Glass1$RI)

skewness(Glass1$RI)

## [1] 1.602715

RI - the histogram is right skewed (skewness = 1.6). Possibly investigate the point near 1.53 as an outlier.

hist(Glass1$Na)

boxplot(Glass1$Na)

skewness(Glass1$Na)

## [1] 0.4478343

Na - the largest value (about 17.4) should be investigated as a possible outlier. The distribution is close to symmetrical.

hist(Glass1$Mg)

boxplot(Glass1$Mg)

skewness(Glass1$Mg)

## [1] -1.136452

Mg - the distribution is bimodal, with one of the modes at 0. Is a value of 0 plausible, or does it mean the element was undetectable? Perhaps this variable needs to be encoded with dummy variables.

hist(Glass1$Al)

boxplot(Glass1$Al)

skewness(Glass1$Al)

## [1] 0.8946104

Al- distribution has a slight right skew with no cause for concern with outliers.

hist(Glass1$Si)

boxplot(Glass1$Si)

skewness(Glass1$Si)

## [1] -0.7202392

Si - the distribution has a slight left skew. No cause for concern relating to outliers.

hist(Glass1$Ca)

boxplot(Glass1$Ca)

skewness(Glass1$Ca)

## [1] 2.018446

Ca - distribution is right skewed. Investigate outliers at 13, 14, 16.

hist(Glass1$K)

boxplot(Glass1$K)

skewness(Glass1$K)

## [1] 6.460089

K - Outlier at 6 needs to be investigated. Right skewed.

hist(Glass1$Ba)

boxplot(Glass1$Ba)

skewness(Glass1$Ba)

## [1] 3.36868

Ba - distribution is right skewed. Are these plausible values?

hist(Glass1$Fe)

boxplot(Glass1$Fe)

skewness(Glass1$Fe)

## [1] 1.729811

Fe - distribution is right skewed with most values at 0. Is 0 plausible or undetectable? Fe may warrant encoding.

Are there any relevant transformations of one or more predictors that might improve the classification model?

DISCUSSION:

Several of the variables are skewed. A transformation may help to remove the skew.

The predictors with the highest skewness are:

Ca, K, Ba

# Log-transform the three most right-skewed predictors.
# Ca is strictly positive (minimum 5.43), so a plain log() is safe.
# K and Ba contain exact zeros (see summary(Glass) above); log(0) is -Inf and
# hist() silently discards non-finite values, so those samples would vanish
# from the plots. log1p(x) = log(1 + x) keeps every sample finite.
Glass1 <- Glass1 %>%
  mutate(
    log_Ca = log(Ca),
    log_K = log1p(K),
    log_Ba = log1p(Ba)
  )

DISCUSSION: Take a look at the histogram for the original variable and then the log transformed variable.

hist(Glass1$Ca)

hist(Glass1$log_Ca)

#The log transformation improved the symmetry

hist(Glass1$K)

hist(Glass1$log_K)

#The log transformation improved the symmetry

hist(Glass1$Ba)

hist(Glass1$log_Ba)

#The log transformation improved the symmetry

All of the log transformations reduced the skewness.

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes. The data can be loaded via:

#library(mlbench) already loaded
data(Soybean)
## See ?Soybean for details
dim(Soybean)

## [1] 683  36

str(Soybean)

## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

summary(Soybean)

##                  Class          date     plant.stand  precip      temp    
##  brown-spot         : 92   5      :149   0   :354    0   : 74   0   : 80  
##  alternarialeaf-spot: 91   4      :131   1   :293    1   :112   1   :374  
##  frog-eye-leaf-spot : 91   3      :118   NA's: 36    2   :459   2   :199  
##  phytophthora-rot   : 88   2      : 93               NA's: 38   NA's: 30  
##  anthracnose        : 44   6      : 90                                    
##  brown-stem-rot     : 44   (Other):101                                    
##  (Other)            :233   NA's   :  1                                    
##    hail     crop.hist  area.dam    sever     seed.tmt     germ     plant.growth
##  0   :435   0   : 65   0   :123   0   :195   0   :305   0   :165   0   :441    
##  1   :127   1   :165   1   :227   1   :322   1   :222   1   :213   1   :226    
##  NA's:121   2   :219   2   :145   2   : 45   2   : 35   2   :193   NA's: 16    
##             3   :218   3   :187   NA's:121   NA's:121   NA's:112               
##             NA's: 16   NA's:  1                                                
##                                                                                
##                                                                                
##  leaves  leaf.halo  leaf.marg  leaf.size  leaf.shread leaf.malf  leaf.mild 
##  0: 77   0   :221   0   :357   0   : 51   0   :487    0   :554   0   :535  
##  1:606   1   : 36   1   : 21   1   :327   1   : 96    1   : 45   1   : 20  
##          2   :342   2   :221   2   :221   NA's:100    NA's: 84   2   : 20  
##          NA's: 84   NA's: 84   NA's: 84                          NA's:108  
##                                                                            
##                                                                            
##                                                                            
##    stem     lodging    stem.cankers canker.lesion fruiting.bodies ext.decay 
##  0   :296   0   :520   0   :379     0   :320      0   :473        0   :497  
##  1   :371   1   : 42   1   : 39     1   : 83      1   :104        1   :135  
##  NA's: 16   NA's:121   2   : 36     2   :177      NA's:106        2   : 13  
##                        3   :191     3   : 65                      NA's: 38  
##                        NA's: 38     NA's: 38                                
##                                                                             
##                                                                             
##  mycelium   int.discolor sclerotia  fruit.pods fruit.spots   seed    
##  0   :639   0   :581     0   :625   0   :407   0   :345    0   :476  
##  1   :  6   1   : 44     1   : 20   1   :130   1   : 75    1   :115  
##  NA's: 38   2   : 20     NA's: 38   2   : 14   2   : 57    NA's: 92  
##             NA's: 38                3   : 48   4   :100              
##                                     NA's: 84   NA's:106              
##                                                                      
##                                                                      
##  mold.growth seed.discolor seed.size  shriveling  roots    
##  0   :524    0   :513      0   :532   0   :539   0   :551  
##  1   : 67    1   : 64      1   : 59   1   : 38   1   : 86  
##  NA's: 92    NA's:106      NA's: 92   NA's:106   2   : 15  
##                                                  NA's: 31  
##                                                            
##                                                            
##

Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

# One bar chart per predictor showing how often each level occurs.
# geom_bar() counts the rows for each x value by default. The previous
# geom_histogram(stat = "count") only worked by overriding the histogram's
# binning stat, which is what produced the
# "Ignoring unknown parameters: binwidth, bins, pad" warning.
# For per-panel axes, use facet_wrap(~variable, scales = "free") instead.
ggplot(melt(Soybean, id.vars = "Class"), aes(x = value)) +
  geom_bar() +
  facet_wrap(~variable)

## Warning: attributes are not identical across measure variables; they will be
## dropped

## Warning: Ignoring unknown parameters: binwidth, bins, pad

colnames(Soybean)

##  [1] "Class"           "date"            "plant.stand"     "precip"         
##  [5] "temp"            "hail"            "crop.hist"       "area.dam"       
##  [9] "sever"           "seed.tmt"        "germ"            "plant.growth"   
## [13] "leaves"          "leaf.halo"       "leaf.marg"       "leaf.size"      
## [17] "leaf.shread"     "leaf.malf"       "leaf.mild"       "stem"           
## [21] "lodging"         "stem.cankers"    "canker.lesion"   "fruiting.bodies"
## [25] "ext.decay"       "mycelium"        "int.discolor"    "sclerotia"      
## [29] "fruit.pods"      "fruit.spots"     "seed"            "mold.growth"    
## [33] "seed.discolor"   "seed.size"       "shriveling"      "roots"

nearZeroVar(Soybean)

## [1] 19 26 28

dim(Soybean)

## [1] 683  36

# There are 36 columns (variables)
#Near zero variance predictors have a single value for most samples and add little information

# which variables are these

colnames(Soybean)[19]

## [1] "leaf.mild"

colnames(Soybean)[26]

## [1] "mycelium"

colnames(Soybean)[28]

## [1] "sclerotia"

Let’s take a closer look:

histogram(Soybean$leaf.mild)

histogram(Soybean$mycelium)

histogram(Soybean$sclerotia)

These predictors provide little meaningful information and should not be used.

# Drop the near-zero-variance predictors by name instead of by hard-coded
# column positions, so the selection stays correct if the column order or the
# set of flagged predictors changes.
nzv_cols <- nearZeroVar(Soybean, names = TRUE)
Soybean2 <- Soybean %>%
  select(-all_of(nzv_cols))
dim(Soybean2)

## [1] 683  33

#Soybean2 has dropped the three variables: leaf.mild, mycelium, sclerotia

Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

There are 19 classes, only the first 15 of which have been used in prior work. The folklore seems to be that the last four classes are unjustified by the data since they have so few examples. There are 35 categorical attributes, some nominal and some ordered. The value “dna” means does not apply. The values for attributes are encoded numerically, with the first value encoded as “0,” the second as “1,” and so forth.

# What are the last 4 classes?  How many records are in each class?
summarise(
  group_by(Soybean, Class),
  number = n()
)

## # A tibble: 19 x 2
##    Class                       number
##    <fct>                        <int>
##  1 2-4-d-injury                    16
##  2 alternarialeaf-spot             91
##  3 anthracnose                     44
##  4 bacterial-blight                20
##  5 bacterial-pustule               20
##  6 brown-spot                      92
##  7 brown-stem-rot                  44
##  8 charcoal-rot                    20
##  9 cyst-nematode                   14
## 10 diaporthe-pod-&-stem-blight     15
## 11 diaporthe-stem-canker           20
## 12 downy-mildew                    20
## 13 frog-eye-leaf-spot              91
## 14 herbicide-injury                 8
## 15 phyllosticta-leaf-spot          20
## 16 phytophthora-rot                88
## 17 powdery-mildew                  20
## 18 purple-seed-stain               20
## 19 rhizoctonia-root-rot            20

# Missing-value count for every remaining column; the outer parentheses
# print the result as well as storing it.
(countNA <- colSums(is.na(Soybean2)))

##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##             121              16               1             121             121 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##             112              16               0              84              84 
##       leaf.size     leaf.shread       leaf.malf            stem         lodging 
##              84             100              84              16             121 
##    stem.cankers   canker.lesion fruiting.bodies       ext.decay    int.discolor 
##              38              38             106              38              38 
##      fruit.pods     fruit.spots            seed     mold.growth   seed.discolor 
##              84             106              92              92             106 
##       seed.size      shriveling           roots 
##              92             106              31

#Let's assume any variable with more than 10% missing (NA) is an issue

dim(Soybean2)

## [1] 683  33

683*.10

## [1] 68.3

# How many of these variables have over 10% missing?
# The cut-off is computed from the row count rather than typed in by hand,
# so it stays correct if the number of rows changes.
df <- as.data.frame(countNA)
df %>% filter(countNA > 0.10 * nrow(Soybean2))

##                 countNA
## hail                121
## sever               121
## seed.tmt            121
## germ                112
## leaf.halo            84
## leaf.marg            84
## leaf.size            84
## leaf.shread         100
## leaf.malf            84
## lodging             121
## fruiting.bodies     106
## fruit.pods           84
## fruit.spots         106
## seed                 92
## mold.growth          92
## seed.discolor       106
## seed.size            92
## shriveling          106

# Missing values by class: count the NAs in every predictor within each
# class, then add those counts across predictors for one total per class.
# across() replaces the deprecated summarise_all(funs(...)) idiom.
miss_by_class <- Soybean2 %>%
  group_by(Class) %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  mutate(TotalNA = rowSums(across(date:roots)))

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

# Show the per-class totals only, with the largest total first.
miss_by_class %>%
  select(Class, TotalNA) %>%
  arrange(desc(TotalNA))

## # A tibble: 19 x 2
##    Class                       TotalNA
##    <fct>                         <dbl>
##  1 phytophthora-rot               1159
##  2 2-4-d-injury                    402
##  3 cyst-nematode                   294
##  4 diaporthe-pod-&-stem-blight     162
##  5 herbicide-injury                136
##  6 alternarialeaf-spot               0
##  7 anthracnose                       0
##  8 bacterial-blight                  0
##  9 bacterial-pustule                 0
## 10 brown-spot                        0
## 11 brown-stem-rot                    0
## 12 charcoal-rot                      0
## 13 diaporthe-stem-canker             0
## 14 downy-mildew                      0
## 15 frog-eye-leaf-spot                0
## 16 phyllosticta-leaf-spot            0
## 17 powdery-mildew                    0
## 18 purple-seed-stain                 0
## 19 rhizoctonia-root-rot              0

#phytophthora-rot has 1159 NAs; maybe this class should be analyzed separately. Possibly remove this class now and, later, do a separate analysis of it against the other classes based on a small number of predictors.

#lets see how many rows each class is comprised of

# Number of rows in each class, largest class first.
arrange(
  summarise(group_by(Soybean, Class), number = n()),
  desc(number)
)

## # A tibble: 19 x 2
##    Class                       number
##    <fct>                        <int>
##  1 brown-spot                      92
##  2 alternarialeaf-spot             91
##  3 frog-eye-leaf-spot              91
##  4 phytophthora-rot                88
##  5 anthracnose                     44
##  6 brown-stem-rot                  44
##  7 bacterial-blight                20
##  8 bacterial-pustule               20
##  9 charcoal-rot                    20
## 10 diaporthe-stem-canker           20
## 11 downy-mildew                    20
## 12 phyllosticta-leaf-spot          20
## 13 powdery-mildew                  20
## 14 purple-seed-stain               20
## 15 rhizoctonia-root-rot            20
## 16 2-4-d-injury                    16
## 17 diaporthe-pod-&-stem-blight     15
## 18 cyst-nematode                   14
## 19 herbicide-injury                 8

#phytophthora-rot has 88 rows of data, many of them with NAs. This would be a candidate for imputation; however, it is difficult to impute when too much is missing. Again, consider removing this class and analyzing it later because of its high number of observations and missing values.

#herbicide-injury, 2-4-d-injury, cyst-nematode, and diaporthe-pod-&-stem-blight have very few examples and a fair share of missing values; consider dropping these rows.

# Keep every class except phytophthora-rot, then check the new dimensions.
Soybean3 <- filter(Soybean2, Class != "phytophthora-rot")

dim(Soybean3)

## [1] 595  33

#Lets see what we have as far as NA
# Per-column NA counts after removing the class; parentheses print the value.
(countNA3 <- colSums(is.na(Soybean3)))

##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##              53              16               1              53              53 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##              44              16               0              29              29 
##       leaf.size     leaf.shread       leaf.malf            stem         lodging 
##              29              45              29              16              53 
##    stem.cankers   canker.lesion fruiting.bodies       ext.decay    int.discolor 
##              38              38              38              38              38 
##      fruit.pods     fruit.spots            seed     mold.growth   seed.discolor 
##              16              38              24              24              38 
##       seed.size      shriveling           roots 
##              24              38              31

#This count of NAs looks more reasonable

Develop a strategy for handling missing data, either by eliminating predictors or imputation.

The strategy for some missing has been developed above. Based on near zero variance, three predictors have been removed.

In addition, the class phytophthora-rot had an inordinate number of NAs across all variables; removing this class reduces the dataset by 88 rows.

At this point we are left with 595 observations and 33 variables.

Let’s see which variables have a high NA left:

# Per-column NA counts for the reduced data set; parentheses print the value.
(countNA2 <- colSums(is.na(Soybean3)))

##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##              53              16               1              53              53 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##              44              16               0              29              29 
##       leaf.size     leaf.shread       leaf.malf            stem         lodging 
##              29              45              29              16              53 
##    stem.cankers   canker.lesion fruiting.bodies       ext.decay    int.discolor 
##              38              38              38              38              38 
##      fruit.pods     fruit.spots            seed     mold.growth   seed.discolor 
##              16              38              24              24              38 
##       seed.size      shriveling           roots 
##              24              38              31

# Turn the NA counts into a one-column data frame and list them from the
# most-missing variable to the least.
df2 <- as.data.frame(countNA2)
arrange(df2, desc(countNA2))

##                 countNA2
## hail                  53
## sever                 53
## seed.tmt              53
## lodging               53
## leaf.shread           45
## germ                  44
## precip                38
## stem.cankers          38
## canker.lesion         38
## fruiting.bodies       38
## ext.decay             38
## int.discolor          38
## fruit.spots           38
## seed.discolor         38
## shriveling            38
## plant.stand           36
## roots                 31
## temp                  30
## leaf.halo             29
## leaf.marg             29
## leaf.size             29
## leaf.malf             29
## seed                  24
## mold.growth           24
## seed.size             24
## crop.hist             16
## plant.growth          16
## stem                  16
## fruit.pods            16
## date                   1
## area.dam               1
## Class                  0
## leaves                 0

The missingness has improved.

We are faced with the question of whether or not to impute. At this point I would run the analysis with the justification that missingness was addressed by thought out deletion.

Imputation can reduce the quality of the data, because you are making an educated guess. All of the variables are nominal or ordinal categorical.

An imputation based on the mode may be a good choice.

For instance, taking the mode of hail (which is not hailing) is a pretty safe imputation because hail is a rarer occurrence.

# Copy hail into hail_imp so the original column is kept for comparison.
Soybean4 <- Soybean3 %>%
  mutate(hail_imp = hail)

# Fill the missing hail_imp values with the most frequent observed level (the
# mode), computed from the data rather than typed in.
# hail is a factor, so the replacement is supplied as a level label. The
# numeric 0 used previously is not a factor value, and newer, type-strict
# versions of tidyr::replace_na() refuse to mix the two types.
hail_mode <- names(which.max(table(Soybean4$hail_imp)))
Soybean5 <- Soybean4 %>%
  mutate(hail_imp = replace(hail_imp, is.na(hail_imp), hail_mode))

# Confirm that hail_imp no longer contains missing values.
countNA5 <- colSums(is.na(Soybean5))
countNA5

##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##              53              16               1              53              53 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##              44              16               0              29              29 
##       leaf.size     leaf.shread       leaf.malf            stem         lodging 
##              29              45              29              16              53 
##    stem.cankers   canker.lesion fruiting.bodies       ext.decay    int.discolor 
##              38              38              38              38              38 
##      fruit.pods     fruit.spots            seed     mold.growth   seed.discolor 
##              16              38              24              24              38 
##       seed.size      shriveling           roots        hail_imp 
##              24              38              31               0

DISCUSSION: USE hail_imp variable instead of hail.

Imputation could also be done on sever, seed.tmt, and lodging; however, a clearer understanding of these variables is necessary before going forward. We need to consult and collaborate to gain a better understanding before imputing them.

Let's try using the imputation package mice from R.

#pattern of missingness

md.pattern(Soybean)

##     Class leaves date area.dam crop.hist plant.growth stem temp roots
## 562     1      1    1        1         1            1    1    1     1
## 13      1      1    1        1         1            1    1    1     1
## 55      1      1    1        1         1            1    1    1     1
## 8       1      1    1        1         1            1    1    1     1
## 9       1      1    1        1         1            1    1    1     0
## 6       1      1    1        1         1            1    1    1     0
## 14      1      1    1        1         1            1    1    0     1
## 15      1      1    1        1         0            0    0    0     0
## 1       1      1    0        0         0            0    0    0     0
##         0      0    1        1        16           16   16   30    31
##     plant.stand precip stem.cankers canker.lesion ext.decay mycelium
## 562           1      1            1             1         1        1
## 13            1      1            1             1         1        1
## 55            1      1            1             1         1        1
## 8             1      0            0             0         0        0
## 9             1      1            1             1         1        1
## 6             0      1            1             1         1        1
## 14            0      0            0             0         0        0
## 15            0      0            0             0         0        0
## 1             0      0            0             0         0        0
##              36     38           38            38        38       38
##     int.discolor sclerotia leaf.halo leaf.marg leaf.size leaf.malf fruit.pods
## 562            1         1         1         1         1         1          1
## 13             1         1         1         1         1         1          0
## 55             1         1         0         0         0         0          0
## 8              0         0         1         1         1         1          1
## 9              1         1         0         0         0         0          1
## 6              1         1         0         0         0         0          1
## 14             0         0         0         0         0         0          1
## 15             0         0         1         1         1         1          0
## 1              0         0         1         1         1         1          0
##               38        38        84        84        84        84         84
##     seed mold.growth seed.size leaf.shread fruiting.bodies fruit.spots
## 562    1           1         1           1               1           1
## 13     0           0         0           1               0           0
## 55     0           0         0           0               0           0
## 8      0           0         0           1               0           0
## 9      1           1         1           0               1           1
## 6      1           1         1           0               1           1
## 14     1           1         1           0               0           0
## 15     0           0         0           0               0           0
## 1      0           0         0           0               0           0
##       92          92        92         100             106         106
##     seed.discolor shriveling leaf.mild germ hail sever seed.tmt lodging     
## 562             1          1         1    1    1     1        1       1    0
## 13              0          0         1    0    0     0        0       0   13
## 55              0          0         0    0    0     0        0       0   19
## 8               0          0         0    0    0     0        0       0   20
## 9               1          1         0    1    0     0        0       0   11
## 6               1          1         0    0    0     0        0       0   13
## 14              0          0         0    0    0     0        0       0   24
## 15              0          0         0    0    0     0        0       0   28
## 1               0          0         0    0    0     0        0       0   30
##               106        106       108  112  121   121      121     121 2337

# Impute the missing Soybean values with predictive mean matching ("pmm").
# A fixed seed is passed so the imputations can be reproduced, and
# printFlag = FALSE keeps the iteration history out of the console.
imp <- mice(
  data = Soybean,
  method = "pmm",
  seed = 123,
  printFlag = FALSE
)

## Warning: Number of logged events: 1684

# Alternative kept for reference (not run): the same call without an explicit
# method, seed, or printFlag argument.
#imp<-mice(Soybean)

# Inspect what is stored on the object returned by mice(): the names of its
# components and its class.
attributes(imp)

## $names
##  [1] "data"            "imp"             "m"               "where"          
##  [5] "blocks"          "call"            "nmis"            "method"         
##  [9] "predictorMatrix" "visitSequence"   "formulas"        "post"           
## [13] "blots"           "ignore"          "seed"            "iteration"      
## [17] "lastSeedValue"   "chainMean"       "chainVar"        "loggedEvents"   
## [21] "version"         "date"           
## 
## $class
## [1] "mids"

# Stack the completed data sets into one "long" data frame; the added .imp
# and .id columns index the imputation and the row within each data set.
c.long <- complete(imp, action = "long")
# Summarise every column of the stacked data. Counts are totals across all
# stacked imputations, not figures for a single completed data set.
summary(c.long)

##       .imp        .id                      Class      date    plant.stand
##  Min.   :1   Min.   :  1   brown-spot         : 460   0:130   0:1913     
##  1st Qu.:2   1st Qu.:171   alternarialeaf-spot: 455   1:377   1:1502     
##  Median :3   Median :342   frog-eye-leaf-spot : 455   2:466              
##  Mean   :3   Mean   :342   phytophthora-rot   : 440   3:591              
##  3rd Qu.:4   3rd Qu.:513   anthracnose        : 220   4:655              
##  Max.   :5   Max.   :683   brown-stem-rot     : 220   5:745              
##                            (Other)            :1165   6:451              
##  precip   temp     hail     crop.hist area.dam sever    seed.tmt germ    
##  0: 499   0: 450   0:2569   0: 374    0: 617   0:1225   0:1694   0: 995  
##  1: 568   1:1896   1: 846   1: 842    1:1135   1:1826   1:1296   1:1085  
##  2:2348   2:1069            2:1102    2: 725   2: 364   2: 425   2:1335  
##                             3:1097    3: 938                             
##                                                                          
##                                                                          
##                                                                          
##  plant.growth leaves   leaf.halo leaf.marg leaf.size leaf.shread leaf.malf
##  0:2228       0: 385   0:1376    0:2045    0: 505    0:2749      0:3051   
##  1:1187       1:3030   1: 201    1: 111    1:1711    1: 666      1: 364   
##                        2:1838    2:1259    2:1199                         
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##  leaf.mild stem     lodging  stem.cankers canker.lesion fruiting.bodies
##  0:3112    0:1518   0:2938   0:2017       0:1719        0:2629         
##  1: 100    1:1897   1: 477   1: 196       1: 415        1: 786         
##  2: 203                      2: 180       2: 885                       
##                              3:1022       3: 396                       
##                                                                        
##                                                                        
##                                                                        
##  ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed    
##  0:2611    0:3346   0:3010       0:3192    0:2157     0:1967      0:2612  
##  1: 698    1:  69   1: 232       1: 223    1: 650     1: 390      1: 803  
##  2: 106             2: 173                 2:  89     2: 302              
##                                            3: 519     4: 756              
##                                                                           
##                                                                           
##                                                                           
##  mold.growth seed.discolor seed.size shriveling roots   
##  0:2947      0:2871        0:2990    0:2967     0:2853  
##  1: 468      1: 544        1: 425    1: 448     1: 430  
##                                                 2: 132  
##                                                         
##                                                         
##                                                         
##

# Missing values resolved: the summary above reports no NA counts for any column.

HW4 624 pre-processing

Lisa Szydziak

9/28/2021