Data 624 Assignment 4

Load packages

library(fpp3)
library(RColorBrewer)
library(seasonal)
library(mlbench)
library(fmsb)

Instructions

Do problems 3.1 and 3.2 in the Kuhn and Johnson book Applied Predictive Modeling.

Exercise 3.1

The UC Irvine Machine Learning Repository6 contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

The data can be accessed via:

data(Glass)
str(Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

To explore the predictor values, I first averaged the predictors based on Glass type and then plotted them on radarcharts to visualize the average composition.

I first created an overall dataframe format, which created dataframe containing the max and min values for each column. This is the same for all of the Glass plots, so that they are viewable on the same scale.

maxval <- apply(Glass,2,max) #find max values
minval <- apply(Glass,2,min) #find min values

max_row <- data.frame(
  RI = 1.6, Na = 20, Mg = 5, Al = 4, 
  Si = 76, K = 7, Ca = 17, Ba = 4, Fe = .6)
min_row <- data.frame(
  RI = 1.5, Na = 10, Mg = 0, Al = 0, 
  Si = 69, K = 0, Ca = 5, Ba = 0, Fe = 0)

Glass_df <- data.frame(
  RI = numeric(0), Na = numeric(0), Mg = numeric(0), 
  Al = numeric(0), Si = numeric(0), K = numeric(0), 
  Ca = numeric(0), Ba = numeric(0), Fe = numeric(0))

Glass_df <- rbind(Glass_df, max_row)
Glass_df <- rbind(Glass_df, min_row)

The following is a radarchart for Glass type 1.

Glass1_df <- Glass_df
Glass1 <- Glass %>% filter(Type == 1) %>% select(-c(Type)) %>% colMeans()
Glass1

##          RI          Na          Mg          Al          Si           K 
##  1.51871829 13.24228571  3.55242857  1.16385714 72.61914286  0.44742857 
##          Ca          Ba          Fe 
##  8.79728571  0.01271429  0.05700000

Glass1_df <- rbind(Glass1_df, Glass1)
radarchart(Glass1_df, axistype=1 , 
    #custom polygon
    pcol=rgb(0.8,0.0,0.0,0.9) , pfcol=rgb(0.8,0.0,0.0,0.5) , plwd=2 , 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

Glass2_df <- Glass_df
Glass2 <- Glass %>% filter(Type == 2) %>% select(-c(Type)) %>% colMeans()
Glass2

##          RI          Na          Mg          Al          Si           K 
##  1.51861855 13.11171053  3.00210526  1.40815789 72.59802632  0.52105263 
##          Ca          Ba          Fe 
##  9.07368421  0.05026316  0.07973684

Glass2_df <- rbind(Glass2_df, Glass2)
radarchart(Glass2_df, axistype=1 , 
    #custom polygon
    pcol=rgb(0.8,0.2,0.0,0.9) , pfcol=rgb(0.8,0.2,0.0,0.5) , plwd=2 , 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

Glass3_df <- Glass_df
Glass3 <- Glass %>% filter(Type == 3) %>% select(-c(Type)) %>% colMeans()
Glass3

##           RI           Na           Mg           Al           Si            K 
##  1.517963529 13.437058824  3.543529412  1.201176471 72.404705882  0.406470588 
##           Ca           Ba           Fe 
##  8.782941176  0.008823529  0.057058824

Glass3_df <- rbind(Glass3_df, Glass3)
radarchart(Glass3_df, axistype=1 , 
    #custom polygon
    pcol=rgb(0.8,0.5,0.0,0.9) , pfcol=rgb(0.8,0.5,0.0,0.5) , plwd=2 , 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

Glass5_df <- Glass_df
Glass5 <- Glass %>% filter(Type == 5) %>% select(-c(Type)) %>% colMeans()
Glass5

##          RI          Na          Mg          Al          Si           K 
##  1.51892769 12.82769231  0.77384615  2.03384615 72.36615385  1.47000000 
##          Ca          Ba          Fe 
## 10.12384615  0.18769231  0.06076923

Glass5_df <- rbind(Glass5_df, Glass5)
radarchart(Glass5_df, axistype=1 , 
    #custom polygon
    pcol=rgb(0.5,0.5,0.0,0.9) , pfcol=rgb(0.5,0.5,0.0,0.5) , plwd=2 , 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

Glass6_df <- Glass_df
Glass6 <- Glass %>% filter(Type == 6) %>% select(-c(Type)) %>% colMeans()
Glass6

##        RI        Na        Mg        Al        Si         K        Ca        Ba 
##  1.517456 14.646667  1.305556  1.366667 73.206667  0.000000  9.356667  0.000000 
##        Fe 
##  0.000000

Glass6_df <- rbind(Glass6_df, Glass6)
radarchart(Glass6_df, axistype=1 , 
    #custom polygon
    pcol=rgb(0.0,0.5,0.5,0.9) , pfcol=rgb(0.0,0.5,0.5,0.5) , plwd=2 , 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

Glass7_df <- Glass_df
Glass7 <- Glass %>% filter(Type == 7) %>% select(-c(Type)) %>% colMeans()
Glass7

##          RI          Na          Mg          Al          Si           K 
##  1.51711621 14.44206897  0.53827586  2.12275862 72.96586207  0.32517241 
##          Ca          Ba          Fe 
##  8.49137931  1.04000000  0.01344828

Glass7_df <- rbind(Glass7_df, Glass7)
radarchart(Glass7_df, axistype=1 , 
    #custom polygon
    pcol=rgb(0.0,0.0,0.8,0.9) , pfcol=rgb(0.0,0.0,0.8,0.5) , plwd=2 , 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

Since Type 1, 2, and 3 Glass appear very close in average composition, I changed the scale while taking into account only the min and max values of these glass types to more easily distinguish between them. I also overlaid them on the radarchart for ease of comparison.

Glass_123 <- Glass %>% filter(Type == 1|Type == 2|Type == 3) 
maxval <- apply(Glass_123,2,max) #find max values
minval <- apply(Glass_123,2,min) #find min values

max_row1 <- data.frame(
  RI = 1.54, Na = 15, Mg = 5, Al = 2.2, 
  Si = 76, K = 2, Ca = 17, Ba = 4, Fe = .6)
min_row1 <- data.frame(
  RI = 1.51, Na = 10.5, Mg = 0, Al = 0.2, 
  Si = 69, K = 0, Ca = 7, Ba = 0, Fe = 0)

Glass_df1 <- data.frame(
  RI = numeric(0), Na = numeric(0), Mg = numeric(0), Al = numeric(0), 
  Si = numeric(0), K = numeric(0), Ca = numeric(0), Ba = numeric(0), Fe = numeric(0))
Glass_df1 <- rbind(Glass_df1, max_row1)
Glass_df1 <- rbind(Glass_df1, min_row1)
Glass_df1 <- rbind(Glass_df1,Glass1)
Glass_df1 <- rbind(Glass_df1,Glass2)
Glass_df1 <- rbind(Glass_df1,Glass3)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.3), rgb(0.8,0.2,0.5,0.3) , rgb(0.7,0.5,0.1,0.3) )
radarchart(Glass_df1, axistype=1 , 
    #custom polygon
    pcol=colors_border , pfcol=colors_in , plwd=2 , plty=1,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    )

legend(x=1.5, y=1, legend = c("Type 1","Type 2","Type 3"), bty = "n", pch=20 , col=colors_in , text.col = "grey", cex=1.2, pt.cex=3)

I also plotted Glass types 5, 6, and 7 together on one radarchart, though the scales from the original plots remained the same.

Glass_df2 <- Glass_df
Glass_df2 <- rbind(Glass_df2,Glass5)
Glass_df2 <- rbind(Glass_df2,Glass6)
Glass_df2 <- rbind(Glass_df2,Glass7)
colors_border=c( rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9) )
colors_in=c( rgb(0.2,0.5,0.5,0.3), rgb(0.8,0.2,0.5,0.3) , rgb(0.7,0.5,0.1,0.3) )
radarchart(Glass_df2, axistype=1 , 
    #custom polygon
    pcol=colors_border , pfcol=colors_in , plwd=2 , plty=1,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,20,5), cglwd=0.8,
    ) 

legend(x=1.5, y=1, legend = c("Type 5","Type 6","Type 7"), 
       bty = "n", pch=20 , col=colors_in , text.col = "grey", cex=1.2, pt.cex=3)

Do there appear to be any outliers in the data? Are any predictors skewed?

The following graphs show the distribution of values for each property, grouped by glass type.

For Glass Type 1, the boxplots show that Al composition is slightly left skewed. Ba composition is centered on 0%, although it appears that the existence of Ba in the glass sample does not preclude it from classification into Type 1, given that there are samples with Ba included. Fe composition is centered on 0 as well, although it has a greater variance than Ba. Fe composition is also skewed right. K composition is skewed left with no outliers. Mg composition is not skewed although it has multiple outliers. Most samples generally fall within the approximate 3.3-3.9% range. Na composition has very slight right skew. Refractive Index is also skewed right. Si composition has no outliers, and all samples fall within the 71-74% range. There are 70 samples in the Type 1 glass group, so average composition as calculated above and graphed with the radarchart should not be incredibly affected by outliers.

Glass1 <- Glass %>% filter(Type == 1) %>% mutate(sampleID = row_number())
Glass1 <- Glass1 %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass1 %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

For Glass Type 2, there are more outliers in general than in Glass Type 1. All plots show many (4+) outliers except for Fe composition. Al exhibits slight left skew. Ba is centered on 0, with right skew; there is 1 extreme outlier at 3%+. Ca is right skewed, much more so than with Type 1 glass. Fe composition is again centered on 0, with right skew. K composition exhibits left skew, with many outliers in the 0-0.3% range. Mg also exhibits left skew, although most samples fall within the 2.25-4% range. Na is mostly unskewed though with outliers on both ends. Refractive Index is right skewed, with many outliers like with the Ca plot. Si is slightly left skewed. Like Type 1 glass, there were 70+ samples in type 2 glass. However, in this case, a significant percentage of the samples represent outliers.

Glass2 <- Glass %>% filter(Type == 2) %>% mutate(sampleID = row_number())
Glass2 <- Glass2 %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass2 %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

For Glass Type 3, there are fewer outliers in general than in Glass Type 1 and 2. There are 9 total outliers across the 9 plots. However, there are also only 17 samples in total, which may indicate a high percentage of outliers. This would also explain the fewer number of outliers, as each sample has more influence in the plot. There are no outliers for Al, Mg, and Si composition. Ba and Fe composition are both centered on 0, with right skew. Ba composition appears to be 0 in all samples except for 1.

Glass3 <- Glass %>% filter(Type == 3) %>% mutate(sampleID = row_number())
Glass3 <- Glass3 %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass3 %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

For Glass Type 5, there are only 13 samples, so the same discussion (from Glass Type 3) on outliers and sample influence applies. Ba and Fe are again centered on 0. Mg composition is also centered on 0. All plots exhibit some level of skew, with the least skewed plots being K and Na composition.

Glass5 <- Glass %>% filter(Type == 5) %>% mutate(sampleID = row_number())
Glass5 <- Glass5 %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass5 %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

For Glass Type 6, there are only 9 samples. In all samples, Ba, Fe, and K composition are 0. Looking at the plots, one might be tempted to conclude that samples with 0 Ba, Fe, and K composition fall under Type 6 category. However, given that there are only 9 samples and given that all other plots thus far have centered on 0 for Ba and Fe, this conclusion may be premature.

Glass6 <- Glass %>% filter(Type == 6) %>% mutate(sampleID = row_number())
Glass6 <- Glass6 %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass6 %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

Finally, for Glass Type 7, there are many outliers for Ca, Fe, K, Mg, RI, and Si. Na composition has 2 outliers while Al and Ba have none. Unlike all other glass types, Ba composition is centered above 0, at about 0.8-0.9%. Fe, K, and Mg composition are all centered on 0.

Glass7 <- Glass %>% filter(Type == 7) %>% mutate(sampleID = row_number())
Glass7 <- Glass7 %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass7 %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

Looking at the data as a whole, we can see that Ba and Fe content is generally centered on 0. Ba and Fe are generally right skewed. Mg is left skewed, though potentailly this is due to the various samples at 0% Mg composition from Type 7. Na and RI exhibit some slight right skew, while Si exhibits some slight left skew. K is also left skewed, due to the various samples with 0 and nearly 0 K composition. Al and Ca appear nearly normal in the histoggram, but from the boxplot we can see that both have various outliers on both ends.

Glass_full <- Glass %>% 
  pivot_longer(c(RI,Na,Mg,Al,Si,K,Ca,Ba,Fe), 
               names_to = "Property", values_to = "value") 
Glass_full %>% ggplot(aes(x = value)) + 
  geom_boxplot() + facet_wrap(~ Property, scales = "free_x")

Glass_full %>% ggplot(aes(x = value)) + 
  geom_histogram() + facet_wrap(~ Property, scales = "free")

Are there any relevant transformations of one or more predictors that might improve the classification model?

All of the data is positive, as a negative composition would not make sense. As there is a lot of fractional data between 0-1, along with skewed distributions, it may make sense to perform a log transform.

Exercise 3.2

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes.

The data can be loaded via:

data(Soybean)

Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

Degenerate distributions have only one possible value. From the below graph, we can see that no category has only one possible value. However, there are multiple categories that are close to having only one possible value, such as mycelium and sclerotia. Lodging, shriveling, leaf.malf, and leaf.mild also heavily favor a single value, though not as extreme as mycelium and schlerotia.

Soybean_full <- Soybean %>% mutate(across(
  c(date,plant.stand,precip,temp,hail,crop.hist,area.dam,sever,seed.tmt,
    germ,plant.growth,leaves,leaf.halo,leaf.marg,leaf.size,leaf.shread,
    leaf.malf,leaf.mild,stem,lodging,stem.cankers,canker.lesion,
    fruiting.bodies,ext.decay,mycelium,int.discolor,sclerotia,fruit.pods,
    fruit.spots,seed,mold.growth,seed.discolor,seed.size,shriveling,roots), ~ as.numeric(.))) %>% 
  mutate(sampleid = row_number()) #was not able to pivot longer unless convert to numeric

Soybean_full <- Soybean_full %>% 
  pivot_longer(c(date,plant.stand,precip,temp,hail,crop.hist,
                 area.dam,sever,seed.tmt,germ,plant.growth,leaves,
                 leaf.halo,leaf.marg,leaf.size,leaf.shread,
                 leaf.malf,leaf.mild,stem,lodging,stem.cankers,
                 canker.lesion,fruiting.bodies,ext.decay,mycelium,
                 int.discolor,sclerotia,fruit.pods,fruit.spots,seed
                 ,mold.growth,seed.discolor,seed.size,shriveling,roots), 
               names_to = "Property", values_to = "value")
# Soybean_full$value <- as.factor(Soybean_full$value) Saving the NA analysis for the next graph

Soybean_full %>% ggplot(aes(x = value)) + geom_bar() + facet_wrap(~ Property, scales = "free_y")

## Warning: Removed 2337 rows containing non-finite outside the scale range
## (`stat_count()`).

Roughly 18 % of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

Germ, seed.tmt, sever, and fruit.spots appear to have the highest percentage of missing values.

Soybean_Summary <- Soybean_full %>% group_by(Property, value) %>% count()
Soybean_Summary$value <- as.factor(Soybean_Summary$value)
Soybean_Summary %>% ggplot(aes(x = value, y = n, fill = value)) + geom_bar(stat= "identity") +
  facet_wrap(~ Property, scales = "free_y")

Soybean_Percentages <- Soybean_full %>% group_by(Property, value) %>% count()
Soybean_Percentages <- Soybean_Percentages %>% group_by(Property) %>% 
  mutate(total = sum(n)) %>% 
  mutate(percentage = 100 * n/total)
Soybean_Percentages$value <- as.factor(Soybean_Percentages$value)
Soybean_Percentages %>% 
  ggplot(aes(x = value, y = percentage, fill = value)) + geom_bar(stat= "identity") +
  facet_wrap(~ Property, scales = "free_y")

The pattern of missing data does appear related to class, with 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight, herbicide-injury, and phytophthora-rot having the highest percentage of NA values.

Soybean_Summary <- Soybean_full %>% group_by(Class,value) %>% count()
Soybean_Summary$value <- as.factor(Soybean_Summary$value)
Soybean_Summary %>% filter(is.na(value)) %>% ggplot(aes(x = Class, y = n, fill = Class)) + geom_bar(stat= "identity")

Develop a strategy for handling missing data, either by eliminating predictors or imputation.

columns <- c("2-4-d-injury","cyst-nematode","diaporthe-pod-&-stem-blight","herbicide-injury","phytophthora-rot")
Soybean_full %>% group_by(Class) %>% count() %>% mutate(total_samples = n/35) %>% filter(Class %in% columns)

## # A tibble: 5 × 3
## # Groups:   Class [5]
##   Class                           n total_samples
##   <fct>                       <int>         <dbl>
## 1 2-4-d-injury                  560            16
## 2 cyst-nematode                 490            14
## 3 diaporthe-pod-&-stem-blight   525            15
## 4 herbicide-injury              280             8
## 5 phytophthora-rot             3080            88

Missing data is isolated to the five classes stated above.

For 2-4-d-injury, the following predictors are all NA across this class.

Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 16 , Class == "2-4-d-injury")

## # A tibble: 28 × 4
## # Groups:   Class, Property, value [28]
##    Class        Property        value     n
##    <fct>        <chr>           <dbl> <int>
##  1 2-4-d-injury canker.lesion      NA    16
##  2 2-4-d-injury crop.hist          NA    16
##  3 2-4-d-injury ext.decay          NA    16
##  4 2-4-d-injury fruit.pods         NA    16
##  5 2-4-d-injury fruit.spots        NA    16
##  6 2-4-d-injury fruiting.bodies    NA    16
##  7 2-4-d-injury germ               NA    16
##  8 2-4-d-injury hail               NA    16
##  9 2-4-d-injury int.discolor       NA    16
## 10 2-4-d-injury leaf.mild          NA    16
## # ℹ 18 more rows

For cyst-nematode, the following predictors are all NA across this class.

Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 14, Class == "cyst-nematode")

## # A tibble: 24 × 4
## # Groups:   Class, Property, value [24]
##    Class         Property        value     n
##    <fct>         <chr>           <dbl> <int>
##  1 cyst-nematode canker.lesion      NA    14
##  2 cyst-nematode ext.decay          NA    14
##  3 cyst-nematode fruit.spots        NA    14
##  4 cyst-nematode fruiting.bodies    NA    14
##  5 cyst-nematode germ               NA    14
##  6 cyst-nematode hail               NA    14
##  7 cyst-nematode int.discolor       NA    14
##  8 cyst-nematode leaf.halo          NA    14
##  9 cyst-nematode leaf.malf          NA    14
## 10 cyst-nematode leaf.marg          NA    14
## # ℹ 14 more rows

For diaporthe-pod-&-stem-blight, the following predictors are all NA across this class.

Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 15, Class == "diaporthe-pod-&-stem-blight")

## # A tibble: 11 × 4
## # Groups:   Class, Property, value [11]
##    Class                       Property    value     n
##    <fct>                       <chr>       <dbl> <int>
##  1 diaporthe-pod-&-stem-blight hail           NA    15
##  2 diaporthe-pod-&-stem-blight leaf.halo      NA    15
##  3 diaporthe-pod-&-stem-blight leaf.malf      NA    15
##  4 diaporthe-pod-&-stem-blight leaf.marg      NA    15
##  5 diaporthe-pod-&-stem-blight leaf.mild      NA    15
##  6 diaporthe-pod-&-stem-blight leaf.shread    NA    15
##  7 diaporthe-pod-&-stem-blight leaf.size      NA    15
##  8 diaporthe-pod-&-stem-blight lodging        NA    15
##  9 diaporthe-pod-&-stem-blight roots          NA    15
## 10 diaporthe-pod-&-stem-blight seed.tmt       NA    15
## 11 diaporthe-pod-&-stem-blight sever          NA    15

For herbicide-injury, the following predictors are all NA across this class.

Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 8, Class == "herbicide-injury")

## # A tibble: 20 × 4
## # Groups:   Class, Property, value [20]
##    Class            Property        value     n
##    <fct>            <chr>           <dbl> <int>
##  1 herbicide-injury canker.lesion      NA     8
##  2 herbicide-injury ext.decay          NA     8
##  3 herbicide-injury fruit.spots        NA     8
##  4 herbicide-injury fruiting.bodies    NA     8
##  5 herbicide-injury germ               NA     8
##  6 herbicide-injury hail               NA     8
##  7 herbicide-injury int.discolor       NA     8
##  8 herbicide-injury leaf.mild          NA     8
##  9 herbicide-injury lodging            NA     8
## 10 herbicide-injury mold.growth        NA     8
## 11 herbicide-injury mycelium           NA     8
## 12 herbicide-injury precip             NA     8
## 13 herbicide-injury sclerotia          NA     8
## 14 herbicide-injury seed               NA     8
## 15 herbicide-injury seed.discolor      NA     8
## 16 herbicide-injury seed.size          NA     8
## 17 herbicide-injury seed.tmt           NA     8
## 18 herbicide-injury sever              NA     8
## 19 herbicide-injury shriveling         NA     8
## 20 herbicide-injury stem.cankers       NA     8

For phytophthora-rot, there are no predictors that are missing across the entire class.

Soybean_full %>% group_by(Class,Property,value) %>% count() %>% filter(is.na(value) & n == 88, Class == "phytophthora-rot")

## # A tibble: 0 × 4
## # Groups:   Class, Property, value [0]
## # ℹ 4 variables: Class <fct>, Property <chr>, value <dbl>, n <int>

For 2-4-d-injury, cyst-nematode, and herbicide-injury, as there are many (>=20) predictors missing and an already small sample set, I would propose to remove these classes entirely. This is because over 50% of the predictors would be missing complete, making it difficult to interpolate missing values. For the Phytophtora-rot class, no predictors are missing in its entirety. As such, it may be possible to imputate these values. One way to do this would be by using a simple replacement for mean/median within their own class. Alternatively, a more calculated approach may employ K Nearest Neighbor imputation by determining its correlation with other features and then interpolating based on these populated values.

Data 624 Assignment 4

Kim Koon

2024-09-28

Load packages

Instructions

Exercise 3.1

Exercise 3.2