Pre-Processing Data

# Load required libraries
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(RColorBrewer)

# ========================================
# READ AND PREPARE DATA
# ========================================

# Read the soybean.csv file
soybean <- read.csv("~/Desktop/soybean.csv", header = TRUE)

# View structure
str(soybean)

## 'data.frame':    306 obs. of  36 variables:
##  $ diaporthe.stem.canker: chr  "diaporthe-stem-canker" "diaporthe-stem-canker" "diaporthe-stem-canker" "diaporthe-stem-canker" ...
##  $ X6                   : chr  "4" "3" "3" "6" ...
##  $ X0                   : chr  "0" "0" "0" "0" ...
##  $ X2                   : chr  "2" "2" "2" "2" ...
##  $ X1                   : chr  "1" "1" "1" "1" ...
##  $ X0.1                 : chr  "0" "0" "0" "0" ...
##  $ X1.1                 : chr  "2" "1" "1" "2" ...
##  $ X1.2                 : chr  "0" "0" "0" "0" ...
##  $ X1.3                 : chr  "2" "2" "2" "1" ...
##  $ X0.2                 : chr  "1" "1" "0" "0" ...
##  $ X0.3                 : chr  "1" "2" "1" "2" ...
##  $ X1.4                 : chr  "1" "1" "1" "1" ...
##  $ X1.5                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ X0.4                 : chr  "0" "0" "0" "0" ...
##  $ X2.1                 : chr  "2" "2" "2" "2" ...
##  $ X2.2                 : chr  "2" "2" "2" "2" ...
##  $ X0.5                 : chr  "0" "0" "0" "0" ...
##  $ X0.6                 : chr  "0" "0" "0" "0" ...
##  $ X0.7                 : chr  "0" "0" "0" "0" ...
##  $ X1.6                 : chr  "1" "1" "1" "1" ...
##  $ X1.7                 : chr  "0" "0" "0" "0" ...
##  $ X3                   : chr  "3" "3" "3" "3" ...
##  $ X1.8                 : chr  "1" "0" "0" "1" ...
##  $ X1.9                 : chr  "1" "1" "1" "1" ...
##  $ X1.10                : chr  "1" "1" "1" "1" ...
##  $ X0.8                 : chr  "0" "0" "0" "0" ...
##  $ X0.9                 : chr  "0" "0" "0" "0" ...
##  $ X0.10                : chr  "0" "0" "0" "0" ...
##  $ X0.11                : chr  "0" "0" "0" "0" ...
##  $ X4                   : chr  "4" "4" "4" "4" ...
##  $ X0.12                : chr  "0" "0" "0" "0" ...
##  $ X0.13                : chr  "0" "0" "0" "0" ...
##  $ X0.14                : chr  "0" "0" "0" "0" ...
##  $ X0.15                : chr  "0" "0" "0" "0" ...
##  $ X0.16                : chr  "0" "0" "0" "0" ...
##  $ X0.17                : chr  "0" "0" "0" "0" ...

head(soybean)

##   diaporthe.stem.canker X6 X0 X2 X1 X0.1 X1.1 X1.2 X1.3 X0.2 X0.3 X1.4 X1.5
## 1 diaporthe-stem-canker  4  0  2  1    0    2    0    2    1    1    1    1
## 2 diaporthe-stem-canker  3  0  2  1    0    1    0    2    1    2    1    1
## 3 diaporthe-stem-canker  3  0  2  1    0    1    0    2    0    1    1    1
## 4 diaporthe-stem-canker  6  0  2  1    0    2    0    1    0    2    1    1
## 5 diaporthe-stem-canker  5  0  2  1    0    3    0    1    0    1    1    1
## 6 diaporthe-stem-canker  5  0  2  1    0    2    0    1    1    0    1    1
##   X0.4 X2.1 X2.2 X0.5 X0.6 X0.7 X1.6 X1.7 X3 X1.8 X1.9 X1.10 X0.8 X0.9 X0.10
## 1    0    2    2    0    0    0    1    0  3    1    1     1    0    0     0
## 2    0    2    2    0    0    0    1    0  3    0    1     1    0    0     0
## 3    0    2    2    0    0    0    1    0  3    0    1     1    0    0     0
## 4    0    2    2    0    0    0    1    0  3    1    1     1    0    0     0
## 5    0    2    2    0    0    0    1    0  3    0    1     1    0    0     0
## 6    0    2    2    0    0    0    1    1  3    1    1     1    0    0     0
##   X0.11 X4 X0.12 X0.13 X0.14 X0.15 X0.16 X0.17
## 1     0  4     0     0     0     0     0     0
## 2     0  4     0     0     0     0     0     0
## 3     0  4     0     0     0     0     0     0
## 4     0  4     0     0     0     0     0     0
## 5     0  4     0     0     0     0     0     0
## 6     0  4     0     0     0     0     0     0

# The first column contains disease names, rest are features
colnames(soybean)[1] <- "Disease"

# ========================================
# 1. BAR PLOT: Disease Class Distribution
# ========================================

# Count samples per disease
disease_counts <- soybean %>%
  group_by(Disease) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))

ggplot(disease_counts, aes(x = reorder(Disease, Count), y = Count, fill = Disease)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Distribution of Soybean Diseases",
       x = "Disease Type",
       y = "Number of Samples") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

# ========================================
# 2. BOX PLOTS: Feature Distributions by Disease
# ========================================

# Select a few numeric features for box plots
# Assuming columns 2-6 are numeric features
feature_cols <- colnames(soybean)[2:6]

# Create box plots for each feature
for(feature in feature_cols) {
  p <- ggplot(soybean, aes(x = Disease, y = .data[[feature]], fill = Disease)) +
    geom_boxplot(show.legend = FALSE) +
    coord_flip() +
    labs(title = paste("Distribution of", feature, "by Disease"),
         x = "Disease Type",
         y = feature) +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 7))
  
  print(p)
}

# ========================================
# 4. HEATMAP: Feature Correlation
# ========================================

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# Select only numeric columns for correlation
numeric_data <- soybean %>% select(where(is.numeric))

# Calculate correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Melt for ggplot
cor_melted <- melt(cor_matrix)

ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1)) +
  labs(title = "Feature Correlation Heatmap",
       x = "", y = "", fill = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
        axis.text.y = element_text(size = 6))

# ========================================
# 5. SCATTER PLOT: Relationship Between Features
# ========================================

# Scatter plot of two features colored by disease
feature1 <- colnames(soybean)[2]
feature2 <- colnames(soybean)[3]

ggplot(soybean, aes(x = .data[[feature1]], y = .data[[feature2]], 
                    color = Disease)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = paste(feature1, "vs", feature2),
       x = feature1,
       y = feature2,
       color = "Disease") +
  theme_minimal() +
  theme(legend.position = "right",
        legend.text = element_text(size = 7))

# ========================================
# 6. STACKED BAR PLOT: Disease by Feature Value
# ========================================

# Group by a categorical feature and disease
# Assuming column 2 has discrete values
soybean_grouped <- soybean %>%
  group_by(Disease, .data[[feature1]]) %>%
  summarise(Count = n(), .groups = 'drop')

ggplot(soybean_grouped, aes(x = factor(.data[[feature1]]), y = Count, 
                            fill = Disease)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = paste("Disease Distribution by", feature1),
       x = feature1,
       y = "Count",
       fill = "Disease") +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 6))

# ========================================
# 7. VIOLIN PLOT: Feature Distribution
# ========================================

ggplot(soybean, aes(x = Disease, y = .data[[feature1]], fill = Disease)) +
  geom_violin(show.legend = FALSE) +
  coord_flip() +
  labs(title = paste("Violin Plot of", feature1, "by Disease"),
       x = "Disease Type",
       y = feature1) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 7))

## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# ========================================
# 9. CLASS IMBALANCE VISUALIZATION
# ========================================

# Highlight class imbalance
disease_counts$Imbalanced <- ifelse(disease_counts$Count < 10, "Underrepresented", 
                                    "Well-represented")

ggplot(disease_counts, aes(x = reorder(Disease, Count), y = Count, 
                           fill = Imbalanced)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("Underrepresented" = "red", 
                                "Well-represented" = "steelblue")) +
  labs(title = "Class Imbalance in Soybean Disease Dataset",
       x = "Disease Type",
       y = "Number of Samples",
       fill = "Class Balance") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

# ========================================
# 10. SUMMARY STATISTICS TABLE
# ========================================

# Summary by disease
summary_stats <- soybean %>%
  group_by(Disease) %>%
  summarise(
    Count = n(),
    Mean_Feature1 = mean(.data[[feature1]], na.rm = TRUE),
    SD_Feature1 = sd(.data[[feature1]], na.rm = TRUE)
  )

## Warning: There were 20 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Mean_Feature1 = mean(.data[["X6"]], na.rm = TRUE)`.
## ℹ In group 1: `Disease = "2-4-d-injury"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 19 remaining warnings.

print(summary_stats)

## # A tibble: 19 × 4
##    Disease                     Count Mean_Feature1 SD_Feature1
##    <chr>                       <int>         <dbl>       <dbl>
##  1 2-4-d-injury                    1            NA      NA    
##  2 alternarialeaf-spot            40            NA       0.891
##  3 anthracnose                    20            NA       1.73 
##  4 bacterial-blight               10            NA       0.949
##  5 bacterial-pustule              10            NA       0.919
##  6 brown-spot                     40            NA       1.33 
##  7 brown-stem-rot                 20            NA       0.786
##  8 charcoal-rot                   10            NA       1.23 
##  9 cyst-nematode                   6            NA       0.753
## 10 diaporthe-pod-&-stem-blight     6            NA       1.76 
## 11 diaporthe-stem-canker           9            NA       1.13 
## 12 downy-mildew                   10            NA       1.73 
## 13 frog-eye-leaf-spot             40            NA       0.841
## 14 herbicide-injury                4            NA       0.5  
## 15 phyllosticta-leaf-spot         10            NA       0.699
## 16 phytophthora-rot               40            NA       1.06 
## 17 powdery-mildew                 10            NA       2.10 
## 18 purple-seed-stain              10            NA       1.18 
## 19 rhizoctonia-root-rot           10            NA       0.919

library(ggplot2)
library(dplyr)

# Read the soybean dataset
soybean <- read.csv("~/Desktop/soybean.csv", header = TRUE)

# Rename first column
colnames(soybean)[1] <- "Disease"

# ========================================
# OUTLIER DETECTION FUNCTIONS
# ========================================

# Function to detect outliers using IQR method
detect_outliers_iqr <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  outliers <- x < lower_bound | x > upper_bound
  
  return(list(
    outliers = outliers,
    lower_bound = lower_bound,
    upper_bound = upper_bound,
    num_outliers = sum(outliers, na.rm = TRUE)
  ))
}

# ========================================
# ANALYZE ALL NUMERIC COLUMNS FOR OUTLIERS
# ========================================

# Get numeric columns only
numeric_cols <- names(soybean)[sapply(soybean, is.numeric)]

cat("=== OUTLIER ANALYSIS FOR SOYBEAN DATASET ===\n\n")

## === OUTLIER ANALYSIS FOR SOYBEAN DATASET ===

# Check each numeric column
outlier_summary <- data.frame(
  Column = character(),
  Min = numeric(),
  Max = numeric(),
  Range = numeric(),
  Unique_Values = integer(),
  Num_Outliers = integer(),
  Percent_Outliers = numeric(),
  stringsAsFactors = FALSE
)

for(col in numeric_cols) {
  values <- soybean[[col]]
  outlier_info <- detect_outliers_iqr(values)
  
  unique_vals <- length(unique(values))
  value_range <- max(values, na.rm = TRUE) - min(values, na.rm = TRUE)
  pct_outliers <- (outlier_info$num_outliers / length(values)) * 100
  
  outlier_summary <- rbind(outlier_summary, data.frame(
    Column = col,
    Min = min(values, na.rm = TRUE),
    Max = max(values, na.rm = TRUE),
    Range = value_range,
    Unique_Values = unique_vals,
    Num_Outliers = outlier_info$num_outliers,
    Percent_Outliers = round(pct_outliers, 2)
  ))
  
  if(outlier_info$num_outliers > 0) {
    cat(sprintf("%s:\n", col))
    cat(sprintf("  Min: %.2f, Max: %.2f\n", min(values, na.rm = TRUE), max(values, na.rm = TRUE)))
    cat(sprintf("  Unique values: %d\n", unique_vals))
    cat(sprintf("  Outlier bounds: [%.2f, %.2f]\n", 
                outlier_info$lower_bound, outlier_info$upper_bound))
    cat(sprintf("  Number of outliers: %d (%.2f%%)\n", 
                outlier_info$num_outliers, pct_outliers))
    cat(sprintf("  Outlier values: %s\n\n", 
                paste(unique(values[outlier_info$outliers]), collapse = ", ")))
  }
}

## X1.5:
##   Min: 0.00, Max: 1.00
##   Unique values: 2
##   Outlier bounds: [1.00, 1.00]
##   Number of outliers: 33 (10.78%)
##   Outlier values: 0

# Print summary table
print(outlier_summary)

##   Column Min Max Range Unique_Values Num_Outliers Percent_Outliers
## 1   X1.5   0   1     1             2           33            10.78

# Columns with outliers
cat("\n=== SUMMARY ===\n")

## 
## === SUMMARY ===

cat(sprintf("Total numeric columns: %d\n", length(numeric_cols)))

## Total numeric columns: 1

cat(sprintf("Columns with outliers: %d\n", sum(outlier_summary$Num_Outliers > 0)))

## Columns with outliers: 1

cat(sprintf("Total outliers across all columns: %d\n", sum(outlier_summary$Num_Outliers)))

## Total outliers across all columns: 33

# ========================================
# VISUALIZE OUTLIERS: BOX PLOTS
# ========================================

# Box plots for columns with most outliers
top_outlier_cols <- outlier_summary %>%
  arrange(desc(Num_Outliers)) %>%
  head(6) %>%
  pull(Column)

if(length(top_outlier_cols) > 0) {
  for(col in top_outlier_cols) {
    p <- ggplot(soybean, aes(y = .data[[col]])) +
      geom_boxplot(fill = "steelblue", outlier.color = "red", outlier.size = 3) +
      labs(title = paste("Box Plot with Outliers:", col),
           y = col) +
      theme_minimal() +
      coord_flip()
    
    print(p)
  }
}

# ========================================
# OUTLIERS BY DISEASE CLASS
# ========================================

# Check if outliers are associated with specific diseases
cat("\n=== OUTLIERS BY DISEASE CLASS ===\n")

## 
## === OUTLIERS BY DISEASE CLASS ===

for(col in head(top_outlier_cols, 3)) {
  outlier_info <- detect_outliers_iqr(soybean[[col]])
  outlier_diseases <- soybean$Disease[outlier_info$outliers]
  
  if(length(outlier_diseases) > 0) {
    cat(sprintf("\n%s outliers by disease:\n", col))
    print(table(outlier_diseases))
  }
}

## 
## X1.5 outliers by disease:
## outlier_diseases
##                 anthracnose              brown-stem-rot 
##                           9                           3 
## diaporthe-pod-&-stem-blight           purple-seed-stain 
##                           6                           5 
##        rhizoctonia-root-rot 
##                          10

# ========================================
# VISUALIZE: SCATTER PLOT WITH OUTLIERS HIGHLIGHTED
# ========================================

# For first two numeric columns
if(length(numeric_cols) >= 2) {
  col1 <- numeric_cols[1]
  col2 <- numeric_cols[2]
  
  outliers1 <- detect_outliers_iqr(soybean[[col1]])
  outliers2 <- detect_outliers_iqr(soybean[[col2]])
  
  soybean$is_outlier <- outliers1$outliers | outliers2$outliers
  
  ggplot(soybean, aes(x = .data[[col1]], y = .data[[col2]], 
                      color = is_outlier, shape = is_outlier)) +
    geom_point(size = 3, alpha = 0.7) +
    scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "red"),
                       labels = c("Normal", "Outlier")) +
    scale_shape_manual(values = c("FALSE" = 16, "TRUE" = 17),
                       labels = c("Normal", "Outlier")) +
    labs(title = paste("Outlier Detection:", col1, "vs", col2),
         x = col1,
         y = col2,
         color = "Status",
         shape = "Status") +
    theme_minimal()
}

# ========================================
# CHECK DATA DISTRIBUTION
# ========================================

# Since soybean features are likely categorical/ordinal (0,1,2,3,4)
# Let's check the unique values
cat("\n=== DATA TYPE ANALYSIS ===\n")

## 
## === DATA TYPE ANALYSIS ===

for(col in head(numeric_cols, 5)) {
  unique_vals <- sort(unique(soybean[[col]]))
  cat(sprintf("%s: [%s]\n", col, paste(unique_vals, collapse = ", ")))
}

## X1.5: [0, 1]

Narrative Overview

The soybean dataset contains 306 samples across 19 disease classes with 35 categorical/ordinal features (symptom severity coded 0-4). The critical challenge is severe class imbalance - some diseases have only 1-6 samples while others have 40. Unlike the glass dataset, these features are already discrete and bounded, so traditional transformations like log scaling are inappropriate. Instead, the focus must be on addressing class imbalance through resampling and proper train-test stratification.Key Transformations NeededCritical Transformations:

SMOTE (Synthetic Minority Over-sampling) - Generate synthetic samples for underrepresented diseases to balance the dataset
Stratified train-test split - Ensures all 19 disease classes appear in both training and test sets
Handle missing values with mode imputation - Use most frequent value for categorical features
One-hot encoding - Only for neural networks and some linear models; keep ordinal (0-4) for tree models

Why These Work:

SMOTE creates synthetic samples by interpolating between existing minority class samples, preventing models from ignoring rare diseases
Stratified splitting is essential with 19 classes and severe imbalance - random splits could leave some diseases entirely in training or test sets
Mode imputation preserves the categorical nature of symptom data better than mean/median
Tree models naturally handle ordinal features (0=none, 1=mild, 2=moderate, 3=severe, 4=very severe), but neural networks need binary features

Pre-Processing Data - Soybean

Candace Grant

2025-09-29