library(fpp3)
library(RColorBrewer)
library(seasonal)
library(mlbench)
library(fmsb)
Do problems 3.1 and 3.2 in the Kuhn and Johnson book Applied Predictive Modeling.
The data can be accessed via:
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
To explore the predictor values, I first averaged the predictors based on Glass type and then plotted them on radarcharts to visualize the average composition.
I first created an overall dataframe format, which created dataframe containing the max and min values for each column. This is the same for all of the Glass plots, so that they are viewable on the same scale.
maxval <- apply(Glass,2,max) #find max values
minval <- apply(Glass,2,min) #find min values
max_row <- data.frame(
RI = 1.6, Na = 20, Mg = 5, Al = 4,
Si = 76, K = 7, Ca = 17, Ba = 4, Fe = .6)
min_row <- data.frame(
RI = 1.5, Na = 10, Mg = 0, Al = 0,
Si = 69, K = 0, Ca = 5, Ba = 0, Fe = 0)
Glass_df <- data.frame(
RI = numeric(0), Na = numeric(0), Mg = numeric(0),
Al = numeric(0), Si = numeric(0), K = numeric(0),
Ca = numeric(0), Ba = numeric(0), Fe = numeric(0))
Glass_df <- rbind(Glass_df, max_row)
Glass_df <- rbind(Glass_df, min_row)
The following is a radarchart for Glass type 1.
Glass1_df <- Glass_df
Glass1 <- Glass %>% filter(Type == 1) %>% select(-c(Type)) %>% colMeans()
Glass1
## RI Na Mg Al Si K
## 1.51871829 13.24228571 3.55242857 1.16385714 72.61914286 0.44742857
## Ca Ba Fe
## 8.79728571 0.01271429 0.05700000
Glass1_df <- rbind(Glass1_df, Glass1)
radarchart(Glass1_df, axistype=1 ,
#custom polygon
pcol=rgb(0.8,0.0,0.0,0.9) , pfcol=rgb(0.8,0.0,0.0,0.5) , plwd=2 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
Glass2_df <- Glass_df
Glass2 <- Glass %>% filter(Type == 2) %>% select(-c(Type)) %>% colMeans()
Glass2
## RI Na Mg Al Si K
## 1.51861855 13.11171053 3.00210526 1.40815789 72.59802632 0.52105263
## Ca Ba Fe
## 9.07368421 0.05026316 0.07973684
Glass2_df <- rbind(Glass2_df, Glass2)
radarchart(Glass2_df, axistype=1 ,
#custom polygon
pcol=rgb(0.8,0.2,0.0,0.9) , pfcol=rgb(0.8,0.2,0.0,0.5) , plwd=2 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
Glass3_df <- Glass_df
Glass3 <- Glass %>% filter(Type == 3) %>% select(-c(Type)) %>% colMeans()
Glass3
## RI Na Mg Al Si K
## 1.517963529 13.437058824 3.543529412 1.201176471 72.404705882 0.406470588
## Ca Ba Fe
## 8.782941176 0.008823529 0.057058824
Glass3_df <- rbind(Glass3_df, Glass3)
radarchart(Glass3_df, axistype=1 ,
#custom polygon
pcol=rgb(0.8,0.5,0.0,0.9) , pfcol=rgb(0.8,0.5,0.0,0.5) , plwd=2 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
Glass5_df <- Glass_df
Glass5 <- Glass %>% filter(Type == 5) %>% select(-c(Type)) %>% colMeans()
Glass5
## RI Na Mg Al Si K
## 1.51892769 12.82769231 0.77384615 2.03384615 72.36615385 1.47000000
## Ca Ba Fe
## 10.12384615 0.18769231 0.06076923
Glass5_df <- rbind(Glass5_df, Glass5)
radarchart(Glass5_df, axistype=1 ,
#custom polygon
pcol=rgb(0.5,0.5,0.0,0.9) , pfcol=rgb(0.5,0.5,0.0,0.5) , plwd=2 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
Glass6_df <- Glass_df
Glass6 <- Glass %>% filter(Type == 6) %>% select(-c(Type)) %>% colMeans()
Glass6
## RI Na Mg Al Si K Ca Ba
## 1.517456 14.646667 1.305556 1.366667 73.206667 0.000000 9.356667 0.000000
## Fe
## 0.000000
Glass6_df <- rbind(Glass6_df, Glass6)
radarchart(Glass6_df, axistype=1 ,
#custom polygon
pcol=rgb(0.0,0.5,0.5,0.9) , pfcol=rgb(0.0,0.5,0.5,0.5) , plwd=2 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
Glass7_df <- Glass_df
Glass7 <- Glass %>% filter(Type == 7) %>% select(-c(Type)) %>% colMeans()
Glass7
## RI Na Mg Al Si K
## 1.51711621 14.44206897 0.53827586 2.12275862 72.96586207 0.32517241
## Ca Ba Fe
## 8.49137931 1.04000000 0.01344828
Glass7_df <- rbind(Glass7_df, Glass7)
radarchart(Glass7_df, axistype=1 ,
#custom polygon
pcol=rgb(0.0,0.0,0.8,0.9) , pfcol=rgb(0.0,0.0,0.8,0.5) , plwd=2 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
Since Type 1, 2, and 3 Glass appear very close in average composition, I changed the scale while taking into account only the min and max values of these glass types to more easily distinguish between them. I also overlaid them on the radarchart for ease of comparison.
Glass_123 <- Glass %>% filter(Type == 1|Type == 2|Type == 3)
maxval <- apply(Glass_123,2,max) #find max values
minval <- apply(Glass_123,2,min) #find min values
max_row1 <- data.frame(
RI = 1.54, Na = 15, Mg = 5, Al = 2.2,
Si = 76, K = 2, Ca = 17, Ba = 4, Fe = .6)
min_row1 <- data.frame(
RI = 1.51, Na = 10.5, Mg = 0, Al = 0.2,
Si = 69, K = 0, Ca = 7, Ba = 0, Fe = 0)
Glass_df1 <- data.frame(
RI = numeric(0), Na = numeric(0), Mg = numeric(0), Al = numeric(0),
Si = numeric(0), K = numeric(0), Ca = numeric(0), Ba = numeric(0), Fe = numeric(0))
Glass_df1 <- rbind(Glass_df1, max_row1)
Glass_df1 <- rbind(Glass_df1, min_row1)
Glass_df1 <- rbind(Glass_df1,Glass1)
Glass_df1 <- rbind(Glass_df1,Glass2)
Glass_df1 <- rbind(Glass_df1,Glass3)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.3), rgb(0.8,0.2,0.5,0.3) , rgb(0.7,0.5,0.1,0.3) )
radarchart(Glass_df1, axistype=1 ,
#custom polygon
pcol=colors_border , pfcol=colors_in , plwd=2 , plty=1,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
legend(x=1.5, y=1, legend = c("Type 1","Type 2","Type 3"), bty = "n", pch=20 , col=colors_in , text.col = "grey", cex=1.2, pt.cex=3)
I also plotted Glass types 5, 6, and 7 together on one radarchart, though the scales from the original plots remained the same.
Glass_df2 <- Glass_df
Glass_df2 <- rbind(Glass_df2,Glass5)
Glass_df2 <- rbind(Glass_df2,Glass6)
Glass_df2 <- rbind(Glass_df2,Glass7)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.3), rgb(0.8,0.2,0.5,0.3) , rgb(0.7,0.5,0.1,0.3) )
radarchart(Glass_df2, axistype=1 ,
#custom polygon
pcol=colors_border , pfcol=colors_in , plwd=2 , plty=1,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
)
legend(x=1.5, y=1, legend = c("Type 5","Type 6","Type 7"),
bty = "n", pch=20 , col=colors_in , text.col = "grey", cex=1.2, pt.cex=3)
The following graphs show the distribution of values for each property, grouped by glass type.
For Glass Type 1, the boxplots show that Al composition is slightly left skewed. Ba composition is centered on 0%, although it appears that the existence of Ba in the glass sample does not preclude it from classification into Type 1, given that there are samples with Ba included. Fe composition is centered on 0 as well, although it has a greater variance than Ba. Fe composition is also skewed right. K composition is skewed left with no outliers. Mg composition is not skewed although it has multiple outliers. Most samples generally fall within the approximate 3.3-3.9% range. Na composition has very slight right skew. Refractive Index is also skewed right. Si composition has no outliers, and all samples fall within the 71-74% range. There are 70 samples in the Type 1 glass group, so average composition as calculated above and graphed with the radarchart should not be incredibly affected by outliers.
Glass1 <- Glass %>% filter(Type == 1) %>% mutate(sampleID = row_number())
Glass1 <- Glass1 %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass1 %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
For Glass Type 2, there are more outliers in general than in Glass Type 1. All plots show many (4+) outliers except for Fe composition. Al exhibits slight left skew. Ba is centered on 0, with right skew; there is 1 extreme outlier at 3%+. Ca is right skewed, much more so than with Type 1 glass. Fe composition is again centered on 0, with right skew. K composition exhibits left skew, with many outliers in the 0-0.3% range. Mg also exhibits left skew, although most samples fall within the 2.25-4% range. Na is mostly unskewed though with outliers on both ends. Refractive Index is right skewed, with many outliers like with the Ca plot. Si is slightly left skewed. Like Type 1 glass, there were 70+ samples in type 2 glass. However, in this case, a significant percentage of the samples represent outliers.
Glass2 <- Glass %>% filter(Type == 2) %>% mutate(sampleID = row_number())
Glass2 <- Glass2 %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass2 %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
For Glass Type 3, there are fewer outliers in general than in Glass Type 1 and 2. There are 9 total outliers across the 9 plots. However, there are also only 17 samples in total, which may indicate a high percentage of outliers. This would also explain the fewer number of outliers, as each sample has more influence in the plot. There are no outliers for Al, Mg, and Si composition. Ba and Fe composition are both centered on 0, with right skew. Ba composition appears to be 0 in all samples except for 1.
Glass3 <- Glass %>% filter(Type == 3) %>% mutate(sampleID = row_number())
Glass3 <- Glass3 %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass3 %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
For Glass Type 5, there are only 13 samples, so the same discussion (from Glass Type 3) on outliers and sample influence applies. Ba and Fe are again centered on 0. Mg composition is also centered on 0. All plots exhibit some level of skew, with the least skewed plots being K and Na composition.
Glass5 <- Glass %>% filter(Type == 5) %>% mutate(sampleID = row_number())
Glass5 <- Glass5 %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass5 %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
For Glass Type 6, there are only 9 samples. In all samples, Ba, Fe, and K composition are 0. Looking at the plots, one might be tempted to conclude that samples with 0 Ba, Fe, and K composition fall under Type 6 category. However, given that there are only 9 samples and given that all other plots thus far have centered on 0 for Ba and Fe, this conclusion may be premature.
Glass6 <- Glass %>% filter(Type == 6) %>% mutate(sampleID = row_number())
Glass6 <- Glass6 %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass6 %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
Finally, for Glass Type 7, there are many outliers for Ca, Fe, K, Mg,
RI, and Si. Na composition has 2 outliers while Al and Ba have none.
Unlike all other glass types, Ba composition is centered above 0, at
about 0.8-0.9%. Fe, K, and Mg composition are all centered on 0.
Glass7 <- Glass %>% filter(Type == 7) %>% mutate(sampleID = row_number())
Glass7 <- Glass7 %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass7 %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
Looking at the data as a whole, we can see that Ba and Fe content is generally centered on 0. Ba and Fe are generally right skewed. Mg is left skewed, though potentailly this is due to the various samples at 0% Mg composition from Type 7. Na and RI exhibit some slight right skew, while Si exhibits some slight left skew. K is also left skewed, due to the various samples with 0 and nearly 0 K composition. Al and Ca appear nearly normal in the histoggram, but from the boxplot we can see that both have various outliers on both ends.
Glass_full <- Glass %>%
pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe),
names_to = "Property", values_to = "value")
Glass_full %>% ggplot(aes(x = value)) +
geom_boxplot() + facet_wrap(~ Property, scales = "free_x")
Glass_full %>% ggplot(aes(x = value)) +
geom_histogram() + facet_wrap(~ Property, scales = "free")
All of the data is positive, as a negative composition would not make sense. As there is a lot of fractional data between 0-1, along with skewed distributions, it may make sense to perform a log transform.
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes.
The data can be loaded via:
data(Soybean)
Degenerate distributions have only one possible value. From the below graph, we can see that no category has only one possible value. However, there are multiple categories that are close to having only one possible value, such as mycelium and sclerotia. Lodging, shriveling, leaf.malf, and leaf.mild also heavily favor a single value, though not as extreme as mycelium and schlerotia.
Soybean_full <- Soybean %>% mutate(across(
c(date,plant.stand,precip,temp,hail,crop.hist,area.dam,sever,seed.tmt,
germ,plant.growth,leaves,leaf.halo,leaf.marg,leaf.size,leaf.shread,
leaf.malf,leaf.mild,stem,lodging,stem.cankers,canker.lesion,
fruiting.bodies,ext.decay,mycelium,int.discolor,sclerotia,fruit.pods,
fruit.spots,seed,mold.growth,seed.discolor,seed.size,shriveling,roots), ~ as.numeric(.))) %>%
mutate(sampleid = row_number()) #was not able to pivot longer unless convert to numeric
Soybean_full <- Soybean_full %>%
pivot_longer(c(date,plant.stand,precip,temp,hail,crop.hist,
area.dam,sever,seed.tmt,germ,plant.growth,leaves,
leaf.halo,leaf.marg,leaf.size,leaf.shread,
leaf.malf,leaf.mild,stem,lodging,stem.cankers,
canker.lesion,fruiting.bodies,ext.decay,mycelium,
int.discolor,sclerotia,fruit.pods,fruit.spots,seed
,mold.growth,seed.discolor,seed.size,shriveling,roots),
names_to = "Property", values_to = "value")
# Soybean_full$value <- as.factor(Soybean_full$value) Saving the NA analysis for the next graph
Soybean_full %>% ggplot(aes(x = value)) + geom_bar() + facet_wrap(~ Property, scales = "free_y")
## Warning: Removed 2337 rows containing non-finite outside the scale range
## (`stat_count()`).
Germ, seed.tmt, sever, and fruit.spots appear to have the highest percentage of missing values.
Soybean_Summary <- Soybean_full %>% group_by(Property, value) %>% count()
Soybean_Summary$value <- as.factor(Soybean_Summary$value)
Soybean_Summary %>% ggplot(aes(x = value, y = n, fill = value)) + geom_bar(stat= "identity") +
facet_wrap(~ Property, scales = "free_y")
Soybean_Percentages <- Soybean_full %>% group_by(Property, value) %>% count()
Soybean_Percentages <- Soybean_Percentages %>% group_by(Property) %>%
mutate(total = sum(n)) %>%
mutate(percentage = 100 * n/total)
Soybean_Percentages$value <- as.factor(Soybean_Percentages$value)
Soybean_Percentages %>%
ggplot(aes(x = value, y = percentage, fill = value)) + geom_bar(stat= "identity") +
facet_wrap(~ Property, scales = "free_y")
The pattern of missing data does appear related to class, with 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight, herbicide-injury, and phytophthora-rot having the highest percentage of NA values.
Soybean_Summary <- Soybean_full %>% group_by(Class,value) %>% count()
Soybean_Summary$value <- as.factor(Soybean_Summary$value)
Soybean_Summary %>% filter(is.na(value)) %>% ggplot(aes(x = Class, y = n, fill = Class)) + geom_bar(stat= "identity")
columns <- c("2-4-d-injury","cyst-nematode","diaporthe-pod-&-stem-blight","herbicide-injury","phytophthora-rot")
Soybean_full %>% group_by(Class) %>% count() %>% mutate(total_samples = n/35) %>% filter(Class %in% columns)
## # A tibble: 5 × 3
## # Groups: Class [5]
## Class n total_samples
## <fct> <int> <dbl>
## 1 2-4-d-injury 560 16
## 2 cyst-nematode 490 14
## 3 diaporthe-pod-&-stem-blight 525 15
## 4 herbicide-injury 280 8
## 5 phytophthora-rot 3080 88
Missing data is isolated to the five classes stated above.
For 2-4-d-injury, the following predictors are all NA across this class.
Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 16 , Class == "2-4-d-injury")
## # A tibble: 28 × 4
## # Groups: Class, Property, value [28]
## Class Property value n
## <fct> <chr> <dbl> <int>
## 1 2-4-d-injury canker.lesion NA 16
## 2 2-4-d-injury crop.hist NA 16
## 3 2-4-d-injury ext.decay NA 16
## 4 2-4-d-injury fruit.pods NA 16
## 5 2-4-d-injury fruit.spots NA 16
## 6 2-4-d-injury fruiting.bodies NA 16
## 7 2-4-d-injury germ NA 16
## 8 2-4-d-injury hail NA 16
## 9 2-4-d-injury int.discolor NA 16
## 10 2-4-d-injury leaf.mild NA 16
## # ℹ 18 more rows
For cyst-nematode, the following predictors are all NA across this class.
Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 14, Class == "cyst-nematode")
## # A tibble: 24 × 4
## # Groups: Class, Property, value [24]
## Class Property value n
## <fct> <chr> <dbl> <int>
## 1 cyst-nematode canker.lesion NA 14
## 2 cyst-nematode ext.decay NA 14
## 3 cyst-nematode fruit.spots NA 14
## 4 cyst-nematode fruiting.bodies NA 14
## 5 cyst-nematode germ NA 14
## 6 cyst-nematode hail NA 14
## 7 cyst-nematode int.discolor NA 14
## 8 cyst-nematode leaf.halo NA 14
## 9 cyst-nematode leaf.malf NA 14
## 10 cyst-nematode leaf.marg NA 14
## # ℹ 14 more rows
For diaporthe-pod-&-stem-blight, the following predictors are all NA across this class.
Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 15, Class == "diaporthe-pod-&-stem-blight")
## # A tibble: 11 × 4
## # Groups: Class, Property, value [11]
## Class Property value n
## <fct> <chr> <dbl> <int>
## 1 diaporthe-pod-&-stem-blight hail NA 15
## 2 diaporthe-pod-&-stem-blight leaf.halo NA 15
## 3 diaporthe-pod-&-stem-blight leaf.malf NA 15
## 4 diaporthe-pod-&-stem-blight leaf.marg NA 15
## 5 diaporthe-pod-&-stem-blight leaf.mild NA 15
## 6 diaporthe-pod-&-stem-blight leaf.shread NA 15
## 7 diaporthe-pod-&-stem-blight leaf.size NA 15
## 8 diaporthe-pod-&-stem-blight lodging NA 15
## 9 diaporthe-pod-&-stem-blight roots NA 15
## 10 diaporthe-pod-&-stem-blight seed.tmt NA 15
## 11 diaporthe-pod-&-stem-blight sever NA 15
For herbicide-injury, the following predictors are all NA across this class.
Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 8, Class == "herbicide-injury")
## # A tibble: 20 × 4
## # Groups: Class, Property, value [20]
## Class Property value n
## <fct> <chr> <dbl> <int>
## 1 herbicide-injury canker.lesion NA 8
## 2 herbicide-injury ext.decay NA 8
## 3 herbicide-injury fruit.spots NA 8
## 4 herbicide-injury fruiting.bodies NA 8
## 5 herbicide-injury germ NA 8
## 6 herbicide-injury hail NA 8
## 7 herbicide-injury int.discolor NA 8
## 8 herbicide-injury leaf.mild NA 8
## 9 herbicide-injury lodging NA 8
## 10 herbicide-injury mold.growth NA 8
## 11 herbicide-injury mycelium NA 8
## 12 herbicide-injury precip NA 8
## 13 herbicide-injury sclerotia NA 8
## 14 herbicide-injury seed NA 8
## 15 herbicide-injury seed.discolor NA 8
## 16 herbicide-injury seed.size NA 8
## 17 herbicide-injury seed.tmt NA 8
## 18 herbicide-injury sever NA 8
## 19 herbicide-injury shriveling NA 8
## 20 herbicide-injury stem.cankers NA 8
For phytophthora-rot, there are no predictors that are missing across the entire class.
Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 88, Class == "phytophthora-rot")
## # A tibble: 0 × 4
## # Groups: Class, Property, value [0]
## # ℹ 4 variables: Class <fct>, Property <chr>, value <dbl>, n <int>
For 2-4-d-injury, cyst-nematode, and herbicide-injury, as there are many (>=20) predictors missing and an already small sample set, I would propose to remove these classes entirely. This is because over 50% of the predictors would be missing complete, making it difficult to interpolate missing values. For the Phytophtora-rot class, no predictors are missing in its entirety. As such, it may be possible to imputate these values. One way to do this would be by using a simple replacement for mean/median within their own class. Alternatively, a more calculated approach may employ K Nearest Neighbor imputation by determining its correlation with other features and then interpolating based on these populated values.