The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors: the refractive index (RI) and the percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
library(mlbench)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(AppliedPredictiveModeling)
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:e1071':
##
## impute
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
# Load the Glass data set (mlbench is attached above) into the workspace.
data(Glass)
# Show the structure: 214 rows, nine numeric predictors and the Type factor.
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first rows of the data.
head(Glass)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
# ggpairs() from the GGally package (used in Data 606) conveniently draws the
# pairwise plots of all columns and reports the correlations between them.
Glass %>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# hist.data.frame() from Hmisc is a convenient way to display a histogram of
# every column of the data frame in a single call.
hist.data.frame(Glass)
# To see how the histogram of each column compares to the normal distribution,
# a normal density with the column's own mean and standard deviation is drawn
# over a density-scaled histogram; the sample skewness (e1071::skewness) is
# printed after each plot.
#
# density_hist(): build one histogram-with-normal-overlay plot.
#   data   - data frame holding the predictor
#   column - name of the numeric column to plot (character scalar)
#   center - mean of the normal curve to overlay
#   spread - standard deviation of the normal curve to overlay
# Returns a ggplot object (nothing is drawn until it is printed).
#
# Changes versus the nine copy-pasted stanzas this replaces:
#   * after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
#   * `args` is a list, the form stat_function() documents.
#   * the geom_blank() layer was dropped; it drew nothing and the histogram
#     layer already trains the x scale on the same data.
density_hist <- function(data, column, center, spread) {
  ggplot(data = data, aes(x = .data[[column]])) +
    geom_histogram(aes(y = after_stat(density))) +
    stat_function(
      fun = dnorm,
      args = list(mean = center, sd = spread),
      col = "tomato"
    ) +
    xlab(column) # keep the plain column name as the axis label
}

rimean <- mean(Glass$RI)
risd <- sd(Glass$RI)
rihist <- density_hist(Glass, "RI", rimean, risd)
rihist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$RI)
## [1] 1.602715
namean <- mean(Glass$Na)
nasd <- sd(Glass$Na)
nahist <- density_hist(Glass, "Na", namean, nasd)
nahist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Na)
## [1] 0.4478343
mgmean <- mean(Glass$Mg)
mgsd <- sd(Glass$Mg)
mghist <- density_hist(Glass, "Mg", mgmean, mgsd)
mghist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Mg)
## [1] -1.136452
almean <- mean(Glass$Al)
alsd <- sd(Glass$Al)
alhist <- density_hist(Glass, "Al", almean, alsd)
alhist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Al)
## [1] 0.8946104
simean <- mean(Glass$Si)
sisd <- sd(Glass$Si)
sihist <- density_hist(Glass, "Si", simean, sisd)
sihist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Si)
## [1] -0.7202392
kmean <- mean(Glass$K)
ksd <- sd(Glass$K)
khist <- density_hist(Glass, "K", kmean, ksd)
khist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$K)
## [1] 6.460089
camean <- mean(Glass$Ca)
casd <- sd(Glass$Ca)
cahist <- density_hist(Glass, "Ca", camean, casd)
cahist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Ca)
## [1] 2.018446
bamean <- mean(Glass$Ba)
basd <- sd(Glass$Ba)
bahist <- density_hist(Glass, "Ba", bamean, basd)
bahist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Ba)
## [1] 3.36868
femean <- mean(Glass$Fe)
fesd <- sd(Glass$Fe)
fehist <- density_hist(Glass, "Fe", femean, fesd)
fehist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(Glass$Fe)
## [1] 1.729811
# Boxplots of every predictor, split by glass Type, to see how the
# distributions and outliers differ between classes.
type_glass <- Glass |> group_by(Type)

# type_boxplot(): horizontal boxplot of one predictor for each glass type.
#   data   - data frame with a `Type` column and the predictor
#   column - name of the numeric predictor (character scalar)
# Returns a ggplot object.
# The original stanzas passed the data frame as a stray positional argument
# inside aes() (`aes(Glass, x = ..., y = Type)`), which creates an unnamed
# aesthetic; only x and y are mapped here.
type_boxplot <- function(data, column) {
  ggplot(data, aes(x = .data[[column]], y = Type)) +
    geom_boxplot() +
    xlab(column)
}

boxplot_ri <- type_boxplot(type_glass, "RI")
boxplot_ri
boxplot_na <- type_boxplot(type_glass, "Na")
boxplot_na
boxplot_mg <- type_boxplot(type_glass, "Mg")
boxplot_mg
boxplot_al <- type_boxplot(type_glass, "Al")
boxplot_al
boxplot_si <- type_boxplot(type_glass, "Si")
boxplot_si
boxplot_k <- type_boxplot(type_glass, "K")
boxplot_k
boxplot_Ca <- type_boxplot(type_glass, "Ca")
boxplot_Ca
boxplot_ba <- type_boxplot(type_glass, "Ba")
boxplot_ba
boxplot_fe <- type_boxplot(type_glass, "Fe")
boxplot_fe
b) Do there appear to be any outliers in the data? Are any predictors
skewed? Judging visually from the histograms and from the skewness() function
in the e1071 package:
RI is right skewed, Na is right skewed, Mg is left skewed, Al is right skewed, Si is left skewed, K is right skewed, Ca is right skewed, Ba is right skewed, and Fe is right skewed.
# Boxplot of each predictor (element % or RI) over all samples together,
# regardless of glass type, to look for outliers.
#
# overall_boxplot(): single horizontal boxplot of one predictor.
#   data   - data frame holding the predictor
#   column - name of the numeric predictor (character scalar)
# Returns a ggplot object. y is fixed at 0 so that all samples fall into one
# box. The stray positional `Glass` the original stanzas passed inside aes()
# is removed; it created an unnamed aesthetic.
# NOTE: these assignments reuse the names of the by-type boxplots
# (boxplot_ri, ...), exactly as the original code did.
overall_boxplot <- function(data, column) {
  ggplot(data, aes(x = .data[[column]], y = 0)) +
    geom_boxplot() +
    xlab(column)
}

boxplot_ri <- overall_boxplot(Glass, "RI")
boxplot_ri
boxplot_na <- overall_boxplot(Glass, "Na")
boxplot_na
boxplot_mg <- overall_boxplot(Glass, "Mg")
boxplot_mg
boxplot_al <- overall_boxplot(Glass, "Al")
boxplot_al
boxplot_si <- overall_boxplot(Glass, "Si")
boxplot_si
boxplot_k <- overall_boxplot(Glass, "K")
boxplot_k
boxplot_Ca <- overall_boxplot(Glass, "Ca")
boxplot_Ca
boxplot_ba <- overall_boxplot(Glass, "Ba")
boxplot_ba
boxplot_fe <- overall_boxplot(Glass, "Fe")
boxplot_fe
From the box plots, it appears that only Mg has no outliers when the Mg % values of all glass types are considered together.
# Perform a Box-Cox transformation (caret::BoxCoxTrans) on each predictor, plot
# the histogram of the transformed values against a normal curve, and print the
# histogram of the untransformed data (rihist, nahist, ...) for comparison.
#
# boxcox_hist(): density-scaled histogram of one column with a normal overlay.
#   data   - data frame holding the transformed values
#   column - name of the column to plot (character scalar)
#   center - mean of the normal curve to overlay
#   spread - standard deviation of the normal curve to overlay
# Returns a ggplot object; called at top level it is printed immediately.
# after_stat(density) replaces the deprecated `..density..` notation, `args`
# is passed as a list, and the no-op geom_blank() layer is dropped.
boxcox_hist <- function(data, column, center, spread) {
  ggplot(data = data, aes(x = .data[[column]])) +
    geom_histogram(aes(y = after_stat(density))) +
    stat_function(
      fun = dnorm,
      args = list(mean = center, sd = spread),
      col = "tomato"
    ) +
    xlab(column)
}

# ---- RI ----
riglasstrans <- BoxCoxTrans(Glass$RI)
riglasstrans
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.511 1.517 1.518 1.518 1.519 1.534
##
## Largest/Smallest: 1.02
## Sample Skewness: 1.6
##
## Estimated Lambda: -2
RItrans <- predict(riglasstrans, Glass$RI)
head(RItrans)
## [1] 0.2838746 0.2829051 0.2824954 0.2829194 0.2828507 0.2824323
RItrans <- data.frame(RItrans = RItrans)
head(RItrans)
## RItrans
## 1 0.2838746
## 2 0.2829051
## 3 0.2824954
## 4 0.2829194
## 5 0.2828507
## 6 0.2824323
rimean <- mean(RItrans$RItrans)
risd <- sd(RItrans$RItrans)
boxcox_hist(RItrans, "RItrans", rimean, risd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(RItrans$RItrans)
## [1] 1.56566
rihist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Na ----
naglasstrans <- BoxCoxTrans(Glass$Na)
naglasstrans
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.73 12.91 13.30 13.41 13.82 17.38
##
## Largest/Smallest: 1.62
## Sample Skewness: 0.448
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
natrans <- data.frame(natrans = predict(naglasstrans, Glass$Na))
head(natrans)
## natrans
## 1 2.613007
## 2 2.631169
## 3 2.604909
## 4 2.580974
## 5 2.585506
## 6 2.548664
namean <- mean(natrans$natrans)
nasd <- sd(natrans$natrans)
boxcox_hist(natrans, "natrans", namean, nasd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nahist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Mg ----
# NOTE(review): the values printed below equal the raw Mg values, i.e. no
# transformation was applied. BoxCoxTrans leaves data unchanged when it cannot
# estimate lambda (Box-Cox needs strictly positive data) - presumably Mg
# contains zeros; confirm with mgglasstrans$lambda.
mgglasstrans <- BoxCoxTrans(Glass$Mg)
mgtrans <- data.frame(mgtrans = predict(mgglasstrans, Glass$Mg))
head(mgtrans)
## mgtrans
## 1 4.49
## 2 3.60
## 3 3.55
## 4 3.69
## 5 3.62
## 6 3.61
mgmean <- mean(mgtrans$mgtrans)
mgsd <- sd(mgtrans$mgtrans)
boxcox_hist(mgtrans, "mgtrans", mgmean, mgsd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mghist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Al ----
alglasstrans <- BoxCoxTrans(Glass$Al)
altrans <- data.frame(altrans = predict(alglasstrans, Glass$Al))
head(altrans)
## altrans
## 1 0.0976177
## 2 0.3323808
## 3 0.4819347
## 4 0.2715633
## 5 0.2271057
## 6 0.5455844
almean <- mean(altrans$altrans)
alsd <- sd(altrans$altrans)
boxcox_hist(altrans, "altrans", almean, alsd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
alhist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Si ----
siglasstrans <- BoxCoxTrans(Glass$Si)
sitrans <- data.frame(sitrans = predict(siglasstrans, Glass$Si))
head(sitrans)
## sitrans
## 1 2575.684
## 2 2644.326
## 3 2663.270
## 4 2635.606
## 5 2669.843
## 6 2661.810
simean <- mean(sitrans$sitrans)
sisd <- sd(sitrans$sitrans)
boxcox_hist(sitrans, "sitrans", simean, sisd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
sihist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- K ----
# NOTE(review): output equals the raw K values - no transformation applied
# (see the Mg note); confirm with kglasstrans$lambda.
kglasstrans <- BoxCoxTrans(Glass$K)
ktrans <- data.frame(ktrans = predict(kglasstrans, Glass$K))
head(ktrans)
## ktrans
## 1 0.06
## 2 0.48
## 3 0.39
## 4 0.57
## 5 0.55
## 6 0.64
kmean <- mean(ktrans$ktrans)
ksd <- sd(ktrans$ktrans)
boxcox_hist(ktrans, "ktrans", kmean, ksd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
khist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Ca ----
caglasstrans <- BoxCoxTrans(Glass$Ca)
catrans <- data.frame(catrans = predict(caglasstrans, Glass$Ca))
head(catrans)
## catrans
## 1 0.8254539
## 2 0.8145827
## 3 0.8139144
## 4 0.8195032
## 5 0.8176698
## 6 0.8176698
camean <- mean(catrans$catrans)
casd <- sd(catrans$catrans)
boxcox_hist(catrans, "catrans", camean, casd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cahist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Ba ----
# NOTE(review): Ba contains zeros (see output), so no transformation is
# applied; confirm with baglasstrans$lambda.
baglasstrans <- BoxCoxTrans(Glass$Ba)
batrans <- data.frame(batrans = predict(baglasstrans, Glass$Ba))
head(batrans)
## batrans
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Fixed copy-paste slip: these statistics were stored in namean/nasd, which
# silently overwrote the Na values.
bamean <- mean(batrans$batrans)
basd <- sd(batrans$batrans)
boxcox_hist(batrans, "batrans", bamean, basd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
bahist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ---- Fe ----
# NOTE(review): Fe contains zeros (see output), so no transformation is
# applied; confirm with feglasstrans$lambda.
feglasstrans <- BoxCoxTrans(Glass$Fe)
fetrans <- data.frame(fetrans = predict(feglasstrans, Glass$Fe))
head(fetrans)
## fetrans
## 1 0.00
## 2 0.00
## 3 0.00
## 4 0.00
## 5 0.00
## 6 0.26
femean <- mean(fetrans$fetrans)
fesd <- sd(fetrans$fetrans)
boxcox_hist(fetrans, "fetrans", femean, fesd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fehist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
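To make the predictors more normally distributed, a Box-Cox transformation can be performed. Some predictors, such as Fe, do not benefit: the transformed values shown above for Mg, K, Ba, and Fe are identical to the raw values, which is what BoxCoxTrans produces when it cannot estimate lambda (Box-Cox requires strictly positive data, and these predictors presumably contain zeros). Other predictors, such as Al, showed marginal improvement.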
3.2 The soybean data. a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
library(mlbench)
# Load the Soybean data set (mlbench) into the workspace.
data(Soybean)
# Open the data set's help page; this is an interactive side effect that
# starts the local help server (see the message below).
?Soybean
## starting httpd help server ... done
# Employing hist.data.frame and ggpairs results in an error due to the high
# number of factors:
#Soybean|> group_by(Class) |>
# ggpairs(cardinality_threshold = 20)
#
# Instead, plot one bar chart (frequency distribution) per categorical
# predictor of the soybean data.
#
# The loop iterates over column names rather than column values, so:
#   * no manual index counter has to be kept in sync with the loop,
#   * the Class column is excluded by name instead of by position,
#   * each plot maps its own column through .data[[...]] instead of capturing
#     the loop variable, so a stored plot stays correct after the loop moves on.
predictor_names <- setdiff(names(Soybean), "Class")
for (predictor in predictor_names) {
  factorplot <- ggplot(data = Soybean, aes(x = .data[[predictor]])) +
    geom_bar() +
    xlab(predictor)
  # Explicit print() is required: ggplot objects are not auto-printed in a loop.
  print(factorplot)
}
b) Roughly 18% of the data are missing. Are there particular
predictors that are more likely to be missing? Is the pattern of missing
data related to the classes?
# Tally the missing values in every column of Soybean: is.na() flags each
# cell and colSums() adds the flags up per column.
nacount <- Soybean |>
  is.na() |>
  colSums()
print(nacount)
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 84 100 84 108 16
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 121 38 38 106 38
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 38 38 38 84 106
## seed mold.growth seed.discolor seed.size shriveling
## 92 92 106 92 106
## roots
## 31
# Missing values per class: one row per Class and one `na_<column>` count for
# every other column.
classnacount <- Soybean |>
  group_by(Class) |>
  summarise(
    across(
      everything(),
      \(values) sum(is.na(values)),
      .names = "na_{.col}"
    )
  )
print(classnacount)
## # A tibble: 19 × 36
## Class na_date na_plant.stand na_precip na_temp na_hail na_crop.hist
## <fct> <int> <int> <int> <int> <int> <int>
## 1 2-4-d-injury 1 16 16 16 16 16
## 2 alternarialeaf… 0 0 0 0 0 0
## 3 anthracnose 0 0 0 0 0 0
## 4 bacterial-blig… 0 0 0 0 0 0
## 5 bacterial-pust… 0 0 0 0 0 0
## 6 brown-spot 0 0 0 0 0 0
## 7 brown-stem-rot 0 0 0 0 0 0
## 8 charcoal-rot 0 0 0 0 0 0
## 9 cyst-nematode 0 14 14 14 14 0
## 10 diaporthe-pod-… 0 6 0 0 15 0
## 11 diaporthe-stem… 0 0 0 0 0 0
## 12 downy-mildew 0 0 0 0 0 0
## 13 frog-eye-leaf-… 0 0 0 0 0 0
## 14 herbicide-inju… 0 0 8 0 8 0
## 15 phyllosticta-l… 0 0 0 0 0 0
## 16 phytophthora-r… 0 0 0 0 68 0
## 17 powdery-mildew 0 0 0 0 0 0
## 18 purple-seed-st… 0 0 0 0 0 0
## 19 rhizoctonia-ro… 0 0 0 0 0 0
## # ℹ 29 more variables: na_area.dam <int>, na_sever <int>, na_seed.tmt <int>,
## # na_germ <int>, na_plant.growth <int>, na_leaves <int>, na_leaf.halo <int>,
## # na_leaf.marg <int>, na_leaf.size <int>, na_leaf.shread <int>,
## # na_leaf.malf <int>, na_leaf.mild <int>, na_stem <int>, na_lodging <int>,
## # na_stem.cankers <int>, na_canker.lesion <int>, na_fruiting.bodies <int>,
## # na_ext.decay <int>, na_mycelium <int>, na_int.discolor <int>,
## # na_sclerotia <int>, na_fruit.pods <int>, na_fruit.spots <int>, …
# Total number of missing cells per class, summed over all predictor columns
# (the grouping column Class is not part of the selection).
na_total <- Soybean |>
  group_by(Class) |>
  summarise(total_na = sum(is.na(pick(everything()))))
print(na_total)
## # A tibble: 19 × 2
## Class total_na
## <fct> <int>
## 1 2-4-d-injury 450
## 2 alternarialeaf-spot 0
## 3 anthracnose 0
## 4 bacterial-blight 0
## 5 bacterial-pustule 0
## 6 brown-spot 0
## 7 brown-stem-rot 0
## 8 charcoal-rot 0
## 9 cyst-nematode 336
## 10 diaporthe-pod-&-stem-blight 177
## 11 diaporthe-stem-canker 0
## 12 downy-mildew 0
## 13 frog-eye-leaf-spot 0
## 14 herbicide-injury 160
## 15 phyllosticta-leaf-spot 0
## 16 phytophthora-rot 1214
## 17 powdery-mildew 0
## 18 purple-seed-stain 0
## 19 rhizoctonia-root-rot 0
The classes 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight, herbicide-injury, and phytophthora-rot account for all of the missing values. Predictors such as hail, sever, seed.tmt, and lodging are missing the most values (121 each).
Some of the predictors do not have any values for a particular class, so providing a reasonable value would be difficult. Potentially, the missing values could be predicted from the values of the other predictors. The data may be missing because the class does not have the attribute. I would replace the NAs only for predictors where the class has at least some observed values for that predictor; using the mode of the predictor's factor values within that class may be useful.