# Load required libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(RColorBrewer)
# ========================================
# READ AND PREPARE DATA
# ========================================
# Read the soybean.csv file
soybean <- read.csv("~/Desktop/soybean.csv", header = TRUE)
# View structure
str(soybean)
## 'data.frame': 306 obs. of 36 variables:
## $ diaporthe.stem.canker: chr "diaporthe-stem-canker" "diaporthe-stem-canker" "diaporthe-stem-canker" "diaporthe-stem-canker" ...
## $ X6 : chr "4" "3" "3" "6" ...
## $ X0 : chr "0" "0" "0" "0" ...
## $ X2 : chr "2" "2" "2" "2" ...
## $ X1 : chr "1" "1" "1" "1" ...
## $ X0.1 : chr "0" "0" "0" "0" ...
## $ X1.1 : chr "2" "1" "1" "2" ...
## $ X1.2 : chr "0" "0" "0" "0" ...
## $ X1.3 : chr "2" "2" "2" "1" ...
## $ X0.2 : chr "1" "1" "0" "0" ...
## $ X0.3 : chr "1" "2" "1" "2" ...
## $ X1.4 : chr "1" "1" "1" "1" ...
## $ X1.5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ X0.4 : chr "0" "0" "0" "0" ...
## $ X2.1 : chr "2" "2" "2" "2" ...
## $ X2.2 : chr "2" "2" "2" "2" ...
## $ X0.5 : chr "0" "0" "0" "0" ...
## $ X0.6 : chr "0" "0" "0" "0" ...
## $ X0.7 : chr "0" "0" "0" "0" ...
## $ X1.6 : chr "1" "1" "1" "1" ...
## $ X1.7 : chr "0" "0" "0" "0" ...
## $ X3 : chr "3" "3" "3" "3" ...
## $ X1.8 : chr "1" "0" "0" "1" ...
## $ X1.9 : chr "1" "1" "1" "1" ...
## $ X1.10 : chr "1" "1" "1" "1" ...
## $ X0.8 : chr "0" "0" "0" "0" ...
## $ X0.9 : chr "0" "0" "0" "0" ...
## $ X0.10 : chr "0" "0" "0" "0" ...
## $ X0.11 : chr "0" "0" "0" "0" ...
## $ X4 : chr "4" "4" "4" "4" ...
## $ X0.12 : chr "0" "0" "0" "0" ...
## $ X0.13 : chr "0" "0" "0" "0" ...
## $ X0.14 : chr "0" "0" "0" "0" ...
## $ X0.15 : chr "0" "0" "0" "0" ...
## $ X0.16 : chr "0" "0" "0" "0" ...
## $ X0.17 : chr "0" "0" "0" "0" ...
head(soybean)
## diaporthe.stem.canker X6 X0 X2 X1 X0.1 X1.1 X1.2 X1.3 X0.2 X0.3 X1.4 X1.5
## 1 diaporthe-stem-canker 4 0 2 1 0 2 0 2 1 1 1 1
## 2 diaporthe-stem-canker 3 0 2 1 0 1 0 2 1 2 1 1
## 3 diaporthe-stem-canker 3 0 2 1 0 1 0 2 0 1 1 1
## 4 diaporthe-stem-canker 6 0 2 1 0 2 0 1 0 2 1 1
## 5 diaporthe-stem-canker 5 0 2 1 0 3 0 1 0 1 1 1
## 6 diaporthe-stem-canker 5 0 2 1 0 2 0 1 1 0 1 1
## X0.4 X2.1 X2.2 X0.5 X0.6 X0.7 X1.6 X1.7 X3 X1.8 X1.9 X1.10 X0.8 X0.9 X0.10
## 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0
## 2 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0
## 3 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0
## 4 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0
## 5 0 2 2 0 0 0 1 0 3 0 1 1 0 0 0
## 6 0 2 2 0 0 0 1 1 3 1 1 1 0 0 0
## X0.11 X4 X0.12 X0.13 X0.14 X0.15 X0.16 X0.17
## 1 0 4 0 0 0 0 0 0
## 2 0 4 0 0 0 0 0 0
## 3 0 4 0 0 0 0 0 0
## 4 0 4 0 0 0 0 0 0
## 5 0 4 0 0 0 0 0 0
## 6 0 4 0 0 0 0 0 0
# The first column contains disease names, rest are features
colnames(soybean)[1] <- "Disease"
# ========================================
# 1. BAR PLOT: Disease Class Distribution
# ========================================
# Count samples per disease
disease_counts <- soybean %>%
group_by(Disease) %>%
summarise(Count = n()) %>%
arrange(desc(Count))
ggplot(disease_counts, aes(x = reorder(Disease, Count), y = Count, fill = Disease)) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
labs(title = "Distribution of Soybean Diseases",
x = "Disease Type",
y = "Number of Samples") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8))
# ========================================
# 2. BOX PLOTS: Feature Distributions by Disease
# ========================================
# Select a few numeric features for box plots
# Assuming columns 2-6 are numeric features
feature_cols <- colnames(soybean)[2:6]
# Create box plots for each feature
for(feature in feature_cols) {
p <- ggplot(soybean, aes(x = Disease, y = .data[[feature]], fill = Disease)) +
geom_boxplot(show.legend = FALSE) +
coord_flip() +
labs(title = paste("Distribution of", feature, "by Disease"),
x = "Disease Type",
y = feature) +
theme_minimal() +
theme(axis.text.y = element_text(size = 7))
print(p)
}
# ========================================
# 4. HEATMAP: Feature Correlation
# ========================================
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Select only numeric columns for correlation
numeric_data <- soybean %>% select(where(is.numeric))
# Calculate correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Melt for ggplot
cor_melted <- melt(cor_matrix)
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1)) +
labs(title = "Feature Correlation Heatmap",
x = "", y = "", fill = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
axis.text.y = element_text(size = 6))
# ========================================
# 5. SCATTER PLOT: Relationship Between Features
# ========================================
# Scatter plot of two features colored by disease
feature1 <- colnames(soybean)[2]
feature2 <- colnames(soybean)[3]
ggplot(soybean, aes(x = .data[[feature1]], y = .data[[feature2]],
color = Disease)) +
geom_point(size = 3, alpha = 0.7) +
labs(title = paste(feature1, "vs", feature2),
x = feature1,
y = feature2,
color = "Disease") +
theme_minimal() +
theme(legend.position = "right",
legend.text = element_text(size = 7))
# ========================================
# 6. STACKED BAR PLOT: Disease by Feature Value
# ========================================
# Group by a categorical feature and disease
# Assuming column 2 has discrete values
soybean_grouped <- soybean %>%
group_by(Disease, .data[[feature1]]) %>%
summarise(Count = n(), .groups = 'drop')
ggplot(soybean_grouped, aes(x = factor(.data[[feature1]]), y = Count,
fill = Disease)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = paste("Disease Distribution by", feature1),
x = feature1,
y = "Count",
fill = "Disease") +
theme_minimal() +
theme(legend.position = "bottom",
legend.text = element_text(size = 6))
# ========================================
# 7. VIOLIN PLOT: Feature Distribution
# ========================================
ggplot(soybean, aes(x = Disease, y = .data[[feature1]], fill = Disease)) +
geom_violin(show.legend = FALSE) +
coord_flip() +
labs(title = paste("Violin Plot of", feature1, "by Disease"),
x = "Disease Type",
y = feature1) +
theme_minimal() +
theme(axis.text.y = element_text(size = 7))
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# ========================================
# 9. CLASS IMBALANCE VISUALIZATION
# ========================================
# Highlight class imbalance
disease_counts$Imbalanced <- ifelse(disease_counts$Count < 10, "Underrepresented",
"Well-represented")
ggplot(disease_counts, aes(x = reorder(Disease, Count), y = Count,
fill = Imbalanced)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("Underrepresented" = "red",
"Well-represented" = "steelblue")) +
labs(title = "Class Imbalance in Soybean Disease Dataset",
x = "Disease Type",
y = "Number of Samples",
fill = "Class Balance") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8))
# ========================================
# 10. SUMMARY STATISTICS TABLE
# ========================================
# Summary by disease
summary_stats <- soybean %>%
group_by(Disease) %>%
summarise(
Count = n(),
Mean_Feature1 = mean(.data[[feature1]], na.rm = TRUE),
SD_Feature1 = sd(.data[[feature1]], na.rm = TRUE)
)
## Warning: There were 20 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Mean_Feature1 = mean(.data[["X6"]], na.rm = TRUE)`.
## ℹ In group 1: `Disease = "2-4-d-injury"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 19 remaining warnings.
print(summary_stats)
## # A tibble: 19 × 4
## Disease Count Mean_Feature1 SD_Feature1
## <chr> <int> <dbl> <dbl>
## 1 2-4-d-injury 1 NA NA
## 2 alternarialeaf-spot 40 NA 0.891
## 3 anthracnose 20 NA 1.73
## 4 bacterial-blight 10 NA 0.949
## 5 bacterial-pustule 10 NA 0.919
## 6 brown-spot 40 NA 1.33
## 7 brown-stem-rot 20 NA 0.786
## 8 charcoal-rot 10 NA 1.23
## 9 cyst-nematode 6 NA 0.753
## 10 diaporthe-pod-&-stem-blight 6 NA 1.76
## 11 diaporthe-stem-canker 9 NA 1.13
## 12 downy-mildew 10 NA 1.73
## 13 frog-eye-leaf-spot 40 NA 0.841
## 14 herbicide-injury 4 NA 0.5
## 15 phyllosticta-leaf-spot 10 NA 0.699
## 16 phytophthora-rot 40 NA 1.06
## 17 powdery-mildew 10 NA 2.10
## 18 purple-seed-stain 10 NA 1.18
## 19 rhizoctonia-root-rot 10 NA 0.919
library(ggplot2)
library(dplyr)
# Read the soybean dataset
soybean <- read.csv("~/Desktop/soybean.csv", header = TRUE)
# Rename first column
colnames(soybean)[1] <- "Disease"
# ========================================
# OUTLIER DETECTION FUNCTIONS
# ========================================
# Function to detect outliers using IQR method
detect_outliers_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- x < lower_bound | x > upper_bound
return(list(
outliers = outliers,
lower_bound = lower_bound,
upper_bound = upper_bound,
num_outliers = sum(outliers, na.rm = TRUE)
))
}
# ========================================
# ANALYZE ALL NUMERIC COLUMNS FOR OUTLIERS
# ========================================
# Get numeric columns only
numeric_cols <- names(soybean)[sapply(soybean, is.numeric)]
cat("=== OUTLIER ANALYSIS FOR SOYBEAN DATASET ===\n\n")
## === OUTLIER ANALYSIS FOR SOYBEAN DATASET ===
# Check each numeric column
outlier_summary <- data.frame(
Column = character(),
Min = numeric(),
Max = numeric(),
Range = numeric(),
Unique_Values = integer(),
Num_Outliers = integer(),
Percent_Outliers = numeric(),
stringsAsFactors = FALSE
)
for(col in numeric_cols) {
values <- soybean[[col]]
outlier_info <- detect_outliers_iqr(values)
unique_vals <- length(unique(values))
value_range <- max(values, na.rm = TRUE) - min(values, na.rm = TRUE)
pct_outliers <- (outlier_info$num_outliers / length(values)) * 100
outlier_summary <- rbind(outlier_summary, data.frame(
Column = col,
Min = min(values, na.rm = TRUE),
Max = max(values, na.rm = TRUE),
Range = value_range,
Unique_Values = unique_vals,
Num_Outliers = outlier_info$num_outliers,
Percent_Outliers = round(pct_outliers, 2)
))
if(outlier_info$num_outliers > 0) {
cat(sprintf("%s:\n", col))
cat(sprintf(" Min: %.2f, Max: %.2f\n", min(values, na.rm = TRUE), max(values, na.rm = TRUE)))
cat(sprintf(" Unique values: %d\n", unique_vals))
cat(sprintf(" Outlier bounds: [%.2f, %.2f]\n",
outlier_info$lower_bound, outlier_info$upper_bound))
cat(sprintf(" Number of outliers: %d (%.2f%%)\n",
outlier_info$num_outliers, pct_outliers))
cat(sprintf(" Outlier values: %s\n\n",
paste(unique(values[outlier_info$outliers]), collapse = ", ")))
}
}
## X1.5:
## Min: 0.00, Max: 1.00
## Unique values: 2
## Outlier bounds: [1.00, 1.00]
## Number of outliers: 33 (10.78%)
## Outlier values: 0
# Print summary table
print(outlier_summary)
## Column Min Max Range Unique_Values Num_Outliers Percent_Outliers
## 1 X1.5 0 1 1 2 33 10.78
# Columns with outliers
cat("\n=== SUMMARY ===\n")
##
## === SUMMARY ===
cat(sprintf("Total numeric columns: %d\n", length(numeric_cols)))
## Total numeric columns: 1
cat(sprintf("Columns with outliers: %d\n", sum(outlier_summary$Num_Outliers > 0)))
## Columns with outliers: 1
cat(sprintf("Total outliers across all columns: %d\n", sum(outlier_summary$Num_Outliers)))
## Total outliers across all columns: 33
# ========================================
# VISUALIZE OUTLIERS: BOX PLOTS
# ========================================
# Box plots for columns with most outliers
top_outlier_cols <- outlier_summary %>%
arrange(desc(Num_Outliers)) %>%
head(6) %>%
pull(Column)
if(length(top_outlier_cols) > 0) {
for(col in top_outlier_cols) {
p <- ggplot(soybean, aes(y = .data[[col]])) +
geom_boxplot(fill = "steelblue", outlier.color = "red", outlier.size = 3) +
labs(title = paste("Box Plot with Outliers:", col),
y = col) +
theme_minimal() +
coord_flip()
print(p)
}
}
# ========================================
# OUTLIERS BY DISEASE CLASS
# ========================================
# Check if outliers are associated with specific diseases
cat("\n=== OUTLIERS BY DISEASE CLASS ===\n")
##
## === OUTLIERS BY DISEASE CLASS ===
for(col in head(top_outlier_cols, 3)) {
outlier_info <- detect_outliers_iqr(soybean[[col]])
outlier_diseases <- soybean$Disease[outlier_info$outliers]
if(length(outlier_diseases) > 0) {
cat(sprintf("\n%s outliers by disease:\n", col))
print(table(outlier_diseases))
}
}
##
## X1.5 outliers by disease:
## outlier_diseases
## anthracnose brown-stem-rot
## 9 3
## diaporthe-pod-&-stem-blight purple-seed-stain
## 6 5
## rhizoctonia-root-rot
## 10
# ========================================
# VISUALIZE: SCATTER PLOT WITH OUTLIERS HIGHLIGHTED
# ========================================
# For first two numeric columns
if(length(numeric_cols) >= 2) {
col1 <- numeric_cols[1]
col2 <- numeric_cols[2]
outliers1 <- detect_outliers_iqr(soybean[[col1]])
outliers2 <- detect_outliers_iqr(soybean[[col2]])
soybean$is_outlier <- outliers1$outliers | outliers2$outliers
ggplot(soybean, aes(x = .data[[col1]], y = .data[[col2]],
color = is_outlier, shape = is_outlier)) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "red"),
labels = c("Normal", "Outlier")) +
scale_shape_manual(values = c("FALSE" = 16, "TRUE" = 17),
labels = c("Normal", "Outlier")) +
labs(title = paste("Outlier Detection:", col1, "vs", col2),
x = col1,
y = col2,
color = "Status",
shape = "Status") +
theme_minimal()
}
# ========================================
# CHECK DATA DISTRIBUTION
# ========================================
# Since soybean features are likely categorical/ordinal (0,1,2,3,4)
# Let's check the unique values
cat("\n=== DATA TYPE ANALYSIS ===\n")
##
## === DATA TYPE ANALYSIS ===
for(col in head(numeric_cols, 5)) {
unique_vals <- sort(unique(soybean[[col]]))
cat(sprintf("%s: [%s]\n", col, paste(unique_vals, collapse = ", ")))
}
## X1.5: [0, 1]
Narrative Overview
The soybean dataset contains 306 samples across 19 disease classes with 35 categorical/ordinal features (symptom severity coded 0-4). The critical challenge is severe class imbalance - some diseases have only 1-6 samples while others have 40. Unlike the glass dataset, these features are already discrete and bounded, so traditional transformations like log scaling are inappropriate. Instead, the focus must be on addressing class imbalance through resampling and proper train-test stratification.Key Transformations NeededCritical Transformations:
Why These Work: