DATA624

3.1 The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via: library(mlbench), data(Glass), str(Glass)

library(mlbench)

## Warning: package 'mlbench' was built under R version 4.2.2

library(car)

## Warning: package 'car' was built under R version 4.2.2

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.2

# Load the Glass data set into the workspace and show each column's type.
data(Glass)
str(Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

a. Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

library(knitr)
# Render the first ten glass samples as a formatted table.
first_rows <- head(Glass, n = 10)
kable(first_rows)

RI	Na	Mg	Al	Si	K	Ca	Fe	Type
1.52101	13.64	4.49	1.10	71.78	0.06	8.75	0.00	1
1.51761	13.89	3.60	1.36	72.73	0.48	7.83	0.00	1
1.51618	13.53	3.55	1.54	72.99	0.39	7.78	0.00	1
1.51766	13.21	3.69	1.29	72.61	0.57	8.22	0.00	1
1.51742	13.27	3.62	1.24	73.08	0.55	8.07	0.00	1
1.51596	12.79	3.61	1.62	72.97	0.64	8.07	0.26	1
1.51743	13.30	3.60	1.14	73.09	0.58	8.17	0.00	1
1.51756	13.15	3.61	1.05	73.24	0.57	8.24	0.00	1
1.51918	14.04	3.58	1.37	72.08	0.56	8.30	0.00	1
1.51755	13.00	3.60	1.36	72.99	0.57	8.40	0.11	1

# Quartiles and mean for each predictor, plus the sample count per glass Type.
summary(Glass)

##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:car':
## 
##     logit

# psych::describe() reports spread and shape statistics per column
# (sd, mad, skew, kurtosis, se) in addition to the location measures.
describe(Glass)

##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## RI       1 214  1.52 0.00   1.52    1.52 0.00  1.51  1.53  0.02  1.60     4.72
## Na       2 214 13.41 0.82  13.30   13.38 0.64 10.73 17.38  6.65  0.45     2.90
## Mg       3 214  2.68 1.44   3.48    2.87 0.30  0.00  4.49  4.49 -1.14    -0.45
## Al       4 214  1.44 0.50   1.36    1.41 0.31  0.29  3.50  3.21  0.89     1.94
## Si       5 214 72.65 0.77  72.79   72.71 0.57 69.81 75.41  5.60 -0.72     2.82
## K        6 214  0.50 0.65   0.56    0.43 0.17  0.00  6.21  6.21  6.46    52.87
## Ca       7 214  8.96 1.42   8.60    8.74 0.66  5.43 16.19 10.76  2.02     6.41
## Ba       8 214  0.18 0.50   0.00    0.03 0.00  0.00  3.15  3.15  3.37    12.08
## Fe       9 214  0.06 0.10   0.00    0.04 0.00  0.00  0.51  0.51  1.73     2.52
## Type*   10 214  2.54 1.71   2.00    2.31 1.48  1.00  6.00  5.00  1.04    -0.29
##         se
## RI    0.00
## Na    0.06
## Mg    0.10
## Al    0.03
## Si    0.05
## K     0.04
## Ca    0.10
## Ba    0.03
## Fe    0.01
## Type* 0.12

library(ggplot2)
library(reshape2)
# Histogram of every predictor, one panel per variable.
# Type is kept as the id column, so only the numeric predictors are stacked
# into the long `variable` / `value` layout that facet_wrap() needs.
# reshape2::melt is called by namespace because data.table, attached later in
# this report, masks melt().
glass_long <- reshape2::melt(Glass, id.vars = "Type")

# `scales` is spelled out in full: the original `scale = "free"` only worked
# through partial argument matching. Free scales give each predictor its own
# axes, which matters because the predictors span very different ranges.
ggplot(glass_long, aes(x = value)) +
  geom_histogram(bins = 50, fill = "blue") +
  facet_wrap(~variable, scales = "free")

library(corrplot)

## corrplot 0.92 loaded

# Correlation matrix of the nine predictors (columns 1-9), drawn with the
# variables reordered by hierarchical clustering.
predictor_cor <- cor(Glass[, 1:9])
corrplot(predictor_cor, order = "hclust")

library(caret)

## Warning: package 'caret' was built under R version 4.2.2

## Loading required package: lattice

# Pearson correlations among the predictors; column 10 (Type) is dropped.
corelation <- cor(Glass[, -10])

# NOTE(review): findCorrelation() returns the columns caret suggests removing
# at this cutoff, so the count printed below is presumably the number of
# predictors to drop rather than the number involved in a high correlation.
highCorr <- findCorrelation(corelation, cutoff = 0.75)

print(
  paste0(
    "Total number of Predictor Variables with Pearson Correlation > 0.75: ",
    length(highCorr)
  )
)

## [1] "Total number of Predictor Variables with Pearson Correlation > 0.75: 1"

# Pairwise correlation between calcium content and refractive index.
ca_ri <- Glass[, c("Ca", "RI")]
cor(ca_ri)

##           Ca        RI
## Ca 1.0000000 0.8104027
## RI 0.8104027 1.0000000

There is a strong positive correlation (r ≈ 0.81) between Ca and RI.

b. Do there appear to be any outliers in the data? Are any predictors skewed?

# For each of the nine predictors (columns 1-9), print the values that
# boxplot() reports as lying beyond the whiskers. plot = FALSE returns the
# statistics without drawing anything.
for (predictor in colnames(Glass)[1:9]) {
  print(paste0("Predictor Variable (outlier values): ", predictor))
  flagged <- boxplot(Glass[predictor], plot = FALSE)$out
  print(paste0(flagged))
}

## [1] "Predictor Variable (outlier values): RI"
##  [1] "1.52667" "1.5232"  "1.51215" "1.52725" "1.5241"  "1.52475" "1.53125"
##  [8] "1.53393" "1.52664" "1.52739" "1.52777" "1.52614" "1.52369" "1.51115"
## [15] "1.51131" "1.52315" "1.52365"
## [1] "Predictor Variable (outlier values): Na"
## [1] "11.45" "10.73" "11.23" "11.02" "11.03" "17.38" "15.79"
## [1] "Predictor Variable (outlier values): Mg"
## character(0)
## [1] "Predictor Variable (outlier values): Al"
##  [1] "0.29" "0.47" "0.47" "0.51" "3.5"  "3.04" "3.02" "0.34" "2.38" "2.79"
## [11] "2.68" "2.54" "2.34" "2.66" "2.51" "2.42" "2.74" "2.88"
## [1] "Predictor Variable (outlier values): Si"
##  [1] "70.57" "69.81" "70.16" "74.45" "69.89" "70.48" "70.7"  "74.55" "75.41"
## [10] "70.26" "70.43" "75.18"
## [1] "Predictor Variable (outlier values): K"
## [1] "1.68" "6.21" "6.21" "1.76" "1.46" "2.7"  "1.41"
## [1] "Predictor Variable (outlier values): Ca"
##  [1] "11.64" "10.79" "13.24" "13.3"  "16.19" "11.52" "10.99" "14.68" "14.96"
## [10] "14.4"  "11.14" "13.44" "5.87"  "11.41" "11.62" "11.53" "11.32" "12.24"
## [19] "12.5"  "11.27" "10.88" "11.22" "6.65"  "5.43"  "5.79"  "6.47" 
## [1] "Predictor Variable (outlier values): Ba"
##  [1] "0.09" "0.11" "0.69" "0.14" "0.11" "3.15" "0.27" "0.09" "0.06" "0.15"
## [11] "2.2"  "0.24" "1.19" "1.63" "1.68" "0.76" "0.64" "0.4"  "1.59" "1.57"
## [21] "0.61" "0.81" "0.66" "0.64" "0.53" "0.63" "0.56" "1.71" "0.67" "1.55"
## [31] "1.38" "2.88" "0.54" "1.06" "1.59" "1.64" "1.57" "1.67"
## [1] "Predictor Variable (outlier values): Fe"
##  [1] "0.26" "0.3"  "0.31" "0.32" "0.34" "0.28" "0.29" "0.28" "0.35" "0.37"
## [11] "0.51" "0.28"

Yes, every predictor except Mg has values that the boxplot rule flags as outliers.

library(e1071)

## Warning: package 'e1071' was built under R version 4.2.2

# Sample skewness of each predictor; column 10 (Type) is dropped.
# vapply() walks the data frame column by column and guarantees one numeric
# value per predictor. apply() would first coerce the whole data frame to a
# matrix, which only works when every column is numeric.
vapply(Glass[, -10], skewness, numeric(1))

##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107

Several predictors are highly skewed (most notably K, Ba and Ca), so transformations should be applied to reduce the skewness.

c. Are there any relevant transformations of one or more predictors that might improve the classification model?

# Estimate a Box-Cox lambda for each predictor; column 10 (Type) is dropped.
# lapply() iterates over the data frame's columns directly and returns a list
# named by predictor. apply() would first coerce the data frame to a matrix.
transform_glass <- lapply(Glass[, -10], BoxCoxTrans)
transform_glass

## $RI
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.511   1.517   1.518   1.518   1.519   1.534 
## 
## Largest/Smallest: 1.02 
## Sample Skewness: 1.6 
## 
## Estimated Lambda: -2 
## 
## 
## $Na
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.73   12.91   13.30   13.41   13.82   17.38 
## 
## Largest/Smallest: 1.62 
## Sample Skewness: 0.448 
## 
## Estimated Lambda: -0.1 
## With fudge factor, Lambda = 0 will be used for transformations
## 
## 
## $Mg
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.115   3.480   2.685   3.600   4.490 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Al
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.290   1.190   1.360   1.445   1.630   3.500 
## 
## Largest/Smallest: 12.1 
## Sample Skewness: 0.895 
## 
## Estimated Lambda: 0.5 
## 
## 
## $Si
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69.81   72.28   72.79   72.65   73.09   75.41 
## 
## Largest/Smallest: 1.08 
## Sample Skewness: -0.72 
## 
## Estimated Lambda: 2 
## 
## 
## $K
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1225  0.5550  0.4971  0.6100  6.2100 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Ca
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.430   8.240   8.600   8.957   9.172  16.190 
## 
## Largest/Smallest: 2.98 
## Sample Skewness: 2.02 
## 
## Estimated Lambda: -1.1 
## 
## 
## $Ba
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.175   0.000   3.150 
## 
## Lambda could not be estimated; no transformation is applied
## 
## 
## $Fe
## Box-Cox Transformation
## 
## 214 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05701 0.10000 0.51000 
## 
## Lambda could not be estimated; no transformation is applied

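Yes, some transformations could improve the classification model. A Box-Cox transformation (of which the log transformation is the special case λ = 0) can reduce the skewness of RI, Na, Al, Si and Ca. Box-Cox requires strictly positive values, so λ could not be estimated for Mg, K, Ba and Fe, which contain zeros; these predictors would need a transformation that accepts zeros, such as Yeo-Johnson. Removing outliers can also improve model performance.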

3.2 The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.

# Load the Soybean data set into the workspace.
data(Soybean)

# Level counts for every column; the NA's rows show how many values each
# predictor is missing.
summary(Soybean)

##                  Class          date     plant.stand  precip      temp    
##  brown-spot         : 92   5      :149   0   :354    0   : 74   0   : 80  
##  alternarialeaf-spot: 91   4      :131   1   :293    1   :112   1   :374  
##  frog-eye-leaf-spot : 91   3      :118   NA's: 36    2   :459   2   :199  
##  phytophthora-rot   : 88   2      : 93               NA's: 38   NA's: 30  
##  anthracnose        : 44   6      : 90                                    
##  brown-stem-rot     : 44   (Other):101                                    
##  (Other)            :233   NA's   :  1                                    
##    hail     crop.hist  area.dam    sever     seed.tmt     germ     plant.growth
##  0   :435   0   : 65   0   :123   0   :195   0   :305   0   :165   0   :441    
##  1   :127   1   :165   1   :227   1   :322   1   :222   1   :213   1   :226    
##  NA's:121   2   :219   2   :145   2   : 45   2   : 35   2   :193   NA's: 16    
##             3   :218   3   :187   NA's:121   NA's:121   NA's:112               
##             NA's: 16   NA's:  1                                                
##                                                                                
##                                                                                
##  leaves  leaf.halo  leaf.marg  leaf.size  leaf.shread leaf.malf  leaf.mild 
##  0: 77   0   :221   0   :357   0   : 51   0   :487    0   :554   0   :535  
##  1:606   1   : 36   1   : 21   1   :327   1   : 96    1   : 45   1   : 20  
##          2   :342   2   :221   2   :221   NA's:100    NA's: 84   2   : 20  
##          NA's: 84   NA's: 84   NA's: 84                          NA's:108  
##                                                                            
##                                                                            
##                                                                            
##    stem     lodging    stem.cankers canker.lesion fruiting.bodies ext.decay 
##  0   :296   0   :520   0   :379     0   :320      0   :473        0   :497  
##  1   :371   1   : 42   1   : 39     1   : 83      1   :104        1   :135  
##  NA's: 16   NA's:121   2   : 36     2   :177      NA's:106        2   : 13  
##                        3   :191     3   : 65                      NA's: 38  
##                        NA's: 38     NA's: 38                                
##                                                                             
##                                                                             
##  mycelium   int.discolor sclerotia  fruit.pods fruit.spots   seed    
##  0   :639   0   :581     0   :625   0   :407   0   :345    0   :476  
##  1   :  6   1   : 44     1   : 20   1   :130   1   : 75    1   :115  
##  NA's: 38   2   : 20     NA's: 38   2   : 14   2   : 57    NA's: 92  
##             NA's: 38                3   : 48   4   :100              
##                                     NA's: 84   NA's:106              
##                                                                      
##                                                                      
##  mold.growth seed.discolor seed.size  shriveling  roots    
##  0   :524    0   :513      0   :532   0   :539   0   :551  
##  1   : 67    1   : 64      1   : 59   1   : 38   1   : 86  
##  NA's: 92    NA's:106      NA's: 92   NA's:106   2   : 15  
##                                                  NA's: 31  
##                                                            
##                                                            
##

The data frame has 683 observations of 36 variables: the Class outcome, which has 19 classes, and 35 categorical predictors, some nominal and some ordered. The predictor values are encoded numerically, with the first value encoded as “0,” the second as “1,” and so forth.

a. Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

# Working copy of Soybean; the frequency loop only reads its column names.
# Missing values are deliberately left as NA. The earlier zero fill was never
# read by the frequency counts, and it would have recoded every missing value
# as the observed level "0".
df <- Soybean
library(sqldf)

## Warning: package 'sqldf' was built under R version 4.2.2

## Loading required package: gsubfn

## Warning: package 'gsubfn' was built under R version 4.2.2

## Loading required package: proto

## Warning: package 'proto' was built under R version 4.2.2

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 4.2.2

# Flag columns whose most frequent level dominates the runner-up.
#
# Frequencies are counted over observed (non-missing) values only. Grouping the
# raw data with SQL treated NA as a level of its own, so for predictors such as
# mycelium and sclerotia the block of 38 missing values was reported as the
# "second most frequent" level and hid the real imbalance.
f <- colnames(df)

for (i in seq_along(f)) {
  # Counts per level, largest first; useNA = "no" keeps NA out of the ranking.
  counts <- sort(table(Soybean[[f[i]]], useNA = "no"), decreasing = TRUE)

  # A column with fewer than two levels has no runner-up to compare against.
  if (length(counts) < 2) {
    next
  }

  most <- counts[[1]]
  second <- counts[[2]]
  ratio <- round(most / second, 1)

  # isTRUE() guards against an NA/NaN ratio (e.g. an all-missing column),
  # which would otherwise make if() fail.
  if (isTRUE(ratio > 10)) {
    print(paste0(f[i], "  Most Frequent: ", most, "   Second Most: ", second, "   Ratio: ", ratio))
  }
}

## [1] "mycelium  Most Frequent: 639   Second Most: 38   Ratio: 16.8"
## [1] "int.discolor  Most Frequent: 581   Second Most: 44   Ratio: 13.2"
## [1] "sclerotia  Most Frequent: 625   Second Most: 38   Ratio: 16.4"

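The output above flags mycelium, int.discolor and sclerotia, but for mycelium and sclerotia the "second most frequent" count of 38 is the number of missing values, not an observed level. Using the observed levels shown by summary(Soybean), the ratios of the most frequent to the second most frequent level are 639/6 ≈ 106.5 for mycelium, 625/20 ≈ 31.3 for sclerotia and 535/20 ≈ 26.8 for leaf.mild (whose 108 missing values likewise outnumber its second level), all well above twenty. These three predictors are therefore degenerate (near-zero variance). int.discolor (581/44 ≈ 13.2) is strongly unbalanced but stays below the threshold.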

b. Roughly 18 % of the data are missing. Are there particular predictors that are more likely to be missing?

df <- Soybean # copy to new data frame so can edit without changing original

# Number of missing values in each column. sum(is.na(y)) counts the TRUE
# entries directly, and vapply() pins the result to one integer per column.
na_count <- vapply(df, function(y) sum(is.na(y)), integer(1))

# Convert the named integer vector to a data frame; the column names of df
# become its row names.
na_count <- data.frame(na_count)

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

# Convert na_count to a data.table in place. keep.rownames = TRUE moves the
# predictor names out of the row names into a regular column (printed as `rn`
# in the table that follows). invisible() suppresses the printout that the
# trailing [] would otherwise trigger.
invisible(setDT(na_count, keep.rownames = TRUE)[]) # convert the data frame index to a data column
# NOTE(review): inside the data.table brackets, `na_count` in order() presumably
# resolves to the count column rather than to the table itself — confirm.
na_count <- na_count[order(-na_count),] # sort by missing values count, descending

library(knitr)
# Show the 15 predictors with the most missing values.
top_missing <- na_count[seq_len(15), ]
kable(top_missing)

rn	na_count
hail	121
sever	121
seed.tmt	121
lodging	121
germ	112
leaf.mild	108
fruiting.bodies	106
fruit.spots	106
seed.discolor	106
shriveling	106
leaf.shread	100
seed	92
mold.growth	92
seed.size	92
leaf.halo	84

The table above lists the predictors with the most missing values. hail, sever, seed.tmt and lodging top the list and all have the same number of missing values (121), which suggests that they are missing together in the same samples, possibly within particular classes. fruiting.bodies, fruit.spots, seed.discolor and shriveling are also missing the same number of values (106 each).

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Count, for each class, how many values are missing in the predictors that
# share the highest missing-value counts.
missing_cols <- c(
  "hail", "sever", "seed.tmt", "lodging",
  "fruiting.bodies", "fruit.spots", "seed.discolor", "shriveling"
)
df <- Soybean[, c("Class", missing_cols)]
DT <- data.table(df)
DT[, lapply(.SD, function(col) sum(is.na(col))), by = .(Class)]

##                           Class hail sever seed.tmt lodging fruiting.bodies
##  1:       diaporthe-stem-canker    0     0        0       0               0
##  2:                charcoal-rot    0     0        0       0               0
##  3:        rhizoctonia-root-rot    0     0        0       0               0
##  4:            phytophthora-rot   68    68       68      68              68
##  5:              brown-stem-rot    0     0        0       0               0
##  6:              powdery-mildew    0     0        0       0               0
##  7:                downy-mildew    0     0        0       0               0
##  8:                  brown-spot    0     0        0       0               0
##  9:            bacterial-blight    0     0        0       0               0
## 10:           bacterial-pustule    0     0        0       0               0
## 11:           purple-seed-stain    0     0        0       0               0
## 12:                 anthracnose    0     0        0       0               0
## 13:      phyllosticta-leaf-spot    0     0        0       0               0
## 14:         alternarialeaf-spot    0     0        0       0               0
## 15:          frog-eye-leaf-spot    0     0        0       0               0
## 16: diaporthe-pod-&-stem-blight   15    15       15      15               0
## 17:               cyst-nematode   14    14       14      14              14
## 18:                2-4-d-injury   16    16       16      16              16
## 19:            herbicide-injury    8     8        8       8               8
##     fruit.spots seed.discolor shriveling
##  1:           0             0          0
##  2:           0             0          0
##  3:           0             0          0
##  4:          68            68         68
##  5:           0             0          0
##  6:           0             0          0
##  7:           0             0          0
##  8:           0             0          0
##  9:           0             0          0
## 10:           0             0          0
## 11:           0             0          0
## 12:           0             0          0
## 13:           0             0          0
## 14:           0             0          0
## 15:           0             0          0
## 16:           0             0          0
## 17:          14            14         14
## 18:          16            16         16
## 19:           8             8          8

# Number of missing values in each row (sample). rowSums() on the logical NA
# mask avoids apply(Soybean, 1, ...), which first converts the whole data
# frame of factors into a character matrix.
number.of.na <- rowSums(is.na(Soybean))
class.soybean <- Soybean$Class

# Pair every sample's class with its missing-value count and preview the
# first ten rows.
soybean.na.df <- data.frame(class.soybean, number.of.na)
kable(head(soybean.na.df, 10))

class.soybean	number.of.na
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0
diaporthe-stem-canker	0

# Total number of missing values per class, listed from fewest to most.
results <- with(
  soybean.na.df,
  aggregate(number.of.na, by = list(class.soybean = class.soybean), FUN = sum)
)
kable(results[order(results$x), ])

	class.soybean	x
2	alternarialeaf-spot	0
3	anthracnose	0
4	bacterial-blight	0
5	bacterial-pustule	0
6	brown-spot	0
7	brown-stem-rot	0
8	charcoal-rot	0
11	diaporthe-stem-canker	0
12	downy-mildew	0
13	frog-eye-leaf-spot	0
15	phyllosticta-leaf-spot	0
17	powdery-mildew	0
18	purple-seed-stain	0
19	rhizoctonia-root-rot	0
14	herbicide-injury	160
10	diaporthe-pod-&-stem-blight	177
9	cyst-nematode	336
1	2-4-d-injury	450
16	phytophthora-rot	1214

c. Develop a strategy for handling missing data, either by eliminating predictors or imputation.

# First ten phytophthora-rot samples, to inspect which predictors are missing.
phytophthora_rows <- subset(Soybean, Class == "phytophthora-rot")
head(phytophthora_rows, 10)

##               Class date plant.stand precip temp hail crop.hist area.dam sever
## 31 phytophthora-rot    0           1      2    1    1         1        1     1
## 32 phytophthora-rot    1           1      2    1 <NA>         3        1  <NA>
## 33 phytophthora-rot    2           1      2    2 <NA>         2        1  <NA>
## 34 phytophthora-rot    1           1      2    0    0         2        1     2
## 35 phytophthora-rot    2           1      2    2 <NA>         2        1  <NA>
## 36 phytophthora-rot    3           1      2    1 <NA>         2        1  <NA>
## 37 phytophthora-rot    0           1      1    1    0         1        1     1
## 38 phytophthora-rot    3           1      2    0    0         2        1     2
## 39 phytophthora-rot    2           1      1    1 <NA>         0        1  <NA>
## 40 phytophthora-rot    2           1      2    0    0         1        1     2
##    seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread
## 31        0    0            1      1         0         2         2           0
## 32     <NA> <NA>            1      1         0         2         2           0
## 33     <NA> <NA>            1      1      <NA>      <NA>      <NA>        <NA>
## 34        1    1            1      1         0         2         2           0
## 35     <NA> <NA>            1      1      <NA>      <NA>      <NA>        <NA>
## 36     <NA> <NA>            1      1      <NA>      <NA>      <NA>        <NA>
## 37        0    0            1      1         0         2         2           0
## 38        1    1            1      1         0         2         2           0
## 39     <NA> <NA>            1      1         0         2         2           0
## 40        0    1            1      1         0         2         2           0
##    leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies
## 31         0         0    1       0            1             2               0
## 32         0         0    1    <NA>            2             2            <NA>
## 33      <NA>      <NA>    1    <NA>            3             2            <NA>
## 34         0         0    1       0            2             2               0
## 35      <NA>      <NA>    1    <NA>            2             2            <NA>
## 36      <NA>      <NA>    1    <NA>            3             2            <NA>
## 37         0         0    1       0            1             2               0
## 38         0         0    1       0            2             2               0
## 39         0         0    1    <NA>            2             2            <NA>
## 40         0         0    1       0            1             2               0
##    ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 31         1        0            0         0          3           4    0
## 32         0        0            0         0       <NA>        <NA> <NA>
## 33         0        0            0         0       <NA>        <NA> <NA>
## 34         0        0            0         0          3           4    0
## 35         0        0            0         0       <NA>        <NA> <NA>
## 36         0        0            0         0       <NA>        <NA> <NA>
## 37         0        0            0         0          3           4    0
## 38         0        0            0         0          3           4    0
## 39         0        0            0         0       <NA>        <NA> <NA>
## 40         0        0            0         0          3           4    0
##    mold.growth seed.discolor seed.size shriveling roots
## 31           0             0         0          0     0
## 32        <NA>          <NA>      <NA>       <NA>     1
## 33        <NA>          <NA>      <NA>       <NA>     1
## 34           0             0         0          0     0
## 35        <NA>          <NA>      <NA>       <NA>     1
## 36        <NA>          <NA>      <NA>       <NA>     1
## 37           0             0         0          0     0
## 38           0             0         0          0     0
## 39        <NA>          <NA>      <NA>       <NA>     1
## 40           0             0         0          0     0

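The tables show that the missing values are concentrated in a few classes. phytophthora-rot accounts for the largest share (1214 missing values), and many of its samples are missing the same block of predictors; it is possible that these attributes were not recorded at the time the data was collected. Data is also missing in 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight and herbicide-injury, while the remaining 14 classes have no missing values. The missing data is therefore related to the class.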

The preferred strategy would be to collect the missing data so that the analysis can be performed properly.

If it is not possible to collect the missing data, classification models should be built both with and without imputation (for example, using the MICE package) to check whether imputation improves the predictive results.

# Per-class missing-value counts for the most affected predictors, repeated
# here to support the missing-data strategy.
affected <- c(
  "Class", "hail", "sever", "seed.tmt", "lodging",
  "fruiting.bodies", "fruit.spots", "seed.discolor", "shriveling"
)
df <- Soybean[, affected]
DT <- data.table(df)
count_missing <- function(values) sum(is.na(values))
DT[, lapply(.SD, count_missing), by = .(Class)]

##                           Class hail sever seed.tmt lodging fruiting.bodies
##  1:       diaporthe-stem-canker    0     0        0       0               0
##  2:                charcoal-rot    0     0        0       0               0
##  3:        rhizoctonia-root-rot    0     0        0       0               0
##  4:            phytophthora-rot   68    68       68      68              68
##  5:              brown-stem-rot    0     0        0       0               0
##  6:              powdery-mildew    0     0        0       0               0
##  7:                downy-mildew    0     0        0       0               0
##  8:                  brown-spot    0     0        0       0               0
##  9:            bacterial-blight    0     0        0       0               0
## 10:           bacterial-pustule    0     0        0       0               0
## 11:           purple-seed-stain    0     0        0       0               0
## 12:                 anthracnose    0     0        0       0               0
## 13:      phyllosticta-leaf-spot    0     0        0       0               0
## 14:         alternarialeaf-spot    0     0        0       0               0
## 15:          frog-eye-leaf-spot    0     0        0       0               0
## 16: diaporthe-pod-&-stem-blight   15    15       15      15               0
## 17:               cyst-nematode   14    14       14      14              14
## 18:                2-4-d-injury   16    16       16      16              16
## 19:            herbicide-injury    8     8        8       8               8
##     fruit.spots seed.discolor shriveling
##  1:           0             0          0
##  2:           0             0          0
##  3:           0             0          0
##  4:          68            68         68
##  5:           0             0          0
##  6:           0             0          0
##  7:           0             0          0
##  8:           0             0          0
##  9:           0             0          0
## 10:           0             0          0
## 11:           0             0          0
## 12:           0             0          0
## 13:           0             0          0
## 14:           0             0          0
## 15:           0             0          0
## 16:           0             0          0
## 17:          14            14         14
## 18:          16            16         16
## 19:           8             8          8

DATA624_HW4

Gabriel Santos

2023-02-24