Primary key/Foreign Key: Country and Year
Get average temperature per month/ quarter - > Sabrina - (25/12) Hafiezah (25/12)
Cleaning & Pre- processing -> Kai (1/1/2025)
EDA -> Rafiq /Afif (08/01/2025)
Modeling -> Rafiq /Afif (08/01/2025)
Evaluation Rafiq /Afif (08/01/2025)
https://colab.research.google.com/drive/1D1AVF33qxVJ65ubFrqdhGtJf2K99nLE7?usp=sharing
The objective of this project is to leverage historical climate and agricultural data to predict crop yields for various crops across different regions. By analyzing features like average temperature, rainfall, and pesticide usage, we aim to forecast yield performance and provide actionable insights for agricultural planning. These predictions will assist farmers, policymakers, and agribusinesses in optimizing resources, mitigating risks, and improving food security.
How accurately can we predict crop yield based on rainfall, temperature, and pesticide usage?
What is the impact of average rainfall and temperature on crop yields for specific crops (e.g., Maize, Wheat)?
Which regions or countries are most sensitive to changes in climate factors (e.g., rainfall variability)?
How can pesticide usage be optimized to improve crop yields without environmental harm?
How will changes in climatic conditions (e.g., rising average temperatures) affect crop yields over time?
The datasets is the combination of two datasets which are Global Temperature Data and Crop Yield Data from year 1995 to 2020. Both data are taken from Kaggle. Below is an overview of the features in the dataset:
# Load Library
library(readr)
## Warning: package 'readr' was built under R version 4.4.2
# Load the dataset
data <- read_csv("Crop_Yield_Datasets.csv",show_col_types = FALSE)
# Display the first few rows
head(data)
## # A tibble: 6 × 8
## `Unnamed: 0` Area Item Year `hg/ha_yield` average_rain_fall_mm_pe…¹
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 Albania Maize 1990 36613 1485
## 2 1 Albania Potatoes 1990 66667 1485
## 3 2 Albania Rice, paddy 1990 23333 1485
## 4 3 Albania Sorghum 1990 12500 1485
## 5 4 Albania Soybeans 1990 7000 1485
## 6 5 Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹average_rain_fall_mm_per_year
## # ℹ 2 more variables: pesticides_tonnes <dbl>, avg_temp <dbl>
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
shape <- dim(data)
# Display the number of rows and columns
cat("Number of Rows:", shape[1], "\n")
## Number of Rows: 30242
cat("Number of Columns:", shape[2], "\n")
## Number of Columns: 8
# Check for missing values in the dataset
missing_values_summary <- colSums(is.na(data))
# Display columns with missing values and their counts
missing_values_summary[missing_values_summary > 0]
## Unnamed: 0 Area
## 2000 2000
## Item Year
## 2000 2000
## hg/ha_yield average_rain_fall_mm_per_year
## 2000 2000
## pesticides_tonnes avg_temp
## 2000 2000
# Check if there are any missing values in the dataset
any(is.na(data))
## [1] TRUE
# Remove rows with missing values
data_cleaned <- data %>%
drop_na()
# Verify the rows with missing values are removed
sum(is.na(data_cleaned)) # Should return 0
## [1] 0
# Show the shape of the data frame
shape <- dim(data_cleaned)
# Display the number of rows and columns
cat("Number of Rows:", shape[1], "\n")
## Number of Rows: 28242
cat("Number of Columns:", shape[2], "\n")
## Number of Columns: 8
colnames(data_cleaned)
## [1] "Unnamed: 0" "Area"
## [3] "Item" "Year"
## [5] "hg/ha_yield" "average_rain_fall_mm_per_year"
## [7] "pesticides_tonnes" "avg_temp"
# Load necessary libraries
library(dplyr)
library(stringr)
# Clean column names
colnames(data_cleaned) <- str_trim(colnames(data_cleaned)) # Remove leading/trailing spaces
# Rename columns
data_cleaned <- data_cleaned %>%
rename(
Country = `Area`, # Rename `Area` to `Country`
`Crop_yield(hg/ha)` = `hg/ha_yield`
) %>%
# Capitalize the first letter of all column names
rename_with(~ str_to_title(.))
# View the updated dataframe
print(data_cleaned)
## # A tibble: 28,242 × 8
## `Unnamed: 0` Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 Albania Maize 1990 36613 1485
## 2 1 Albania Potato… 1990 66667 1485
## 3 2 Albania Rice, … 1990 23333 1485
## 4 3 Albania Sorghum 1990 12500 1485
## 5 4 Albania Soybea… 1990 7000 1485
## 6 5 Albania Wheat 1990 30197 1485
## 7 6 Albania Maize 1991 29068 1485
## 8 7 Albania Potato… 1991 77818 1485
## 9 8 Albania Rice, … 1991 28538 1485
## 10 9 Albania Sorghum 1991 6667 1485
## # ℹ 28,232 more rows
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 2 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>
head(data_cleaned)
## # A tibble: 6 × 8
## `Unnamed: 0` Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 Albania Maize 1990 36613 1485
## 2 1 Albania Potatoes 1990 66667 1485
## 3 2 Albania Rice, p… 1990 23333 1485
## 4 3 Albania Sorghum 1990 12500 1485
## 5 4 Albania Soybeans 1990 7000 1485
## 6 5 Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 2 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>
cat("\nUnique values in 'Country':\n")
##
## Unique values in 'Country':
unique(data_cleaned$Country) %>% print()
## [1] "Albania" "Algeria"
## [3] "Angola" "Argentina"
## [5] "Armenia" "Australia"
## [7] "Austria" "Azerbaijan"
## [9] "Bahamas" "Bahrain"
## [11] "Bangladesh" "Belarus"
## [13] "Belgium" "Botswana"
## [15] "Brazil" "Bulgaria"
## [17] "Burkina Faso" "Burundi"
## [19] "Cameroon" "Canada"
## [21] "Central African Republic" "Chile"
## [23] "Colombia" "Croatia"
## [25] "Denmark" "Dominican Republic"
## [27] "Ecuador" "Egypt"
## [29] "El Salvador" "Eritrea"
## [31] "Estonia" "Finland"
## [33] "France" "Germany"
## [35] "Ghana" "Greece"
## [37] "Guatemala" "Guinea"
## [39] "Guyana" "Haiti"
## [41] "Honduras" "Hungary"
## [43] "India" "Indonesia"
## [45] "Iraq" "Ireland"
## [47] "Italy" "Jamaica"
## [49] "Japan" "Kazakhstan"
## [51] "Kenya" "Latvia"
## [53] "Lebanon" "Lesotho"
## [55] "Libya" "Lithuania"
## [57] "Madagascar" "Malawi"
## [59] "Malaysia" "Mali"
## [61] "Mauritania" "Mauritius"
## [63] "Mexico" "Montenegro"
## [65] "Morocco" "Mozambique"
## [67] "Namibia" "Nepal"
## [69] "Netherlands" "New Zealand"
## [71] "Nicaragua" "Niger"
## [73] "Norway" "Pakistan"
## [75] "Papua New Guinea" "Peru"
## [77] "Poland" "Portugal"
## [79] "Qatar" "Romania"
## [81] "Rwanda" "Saudi Arabia"
## [83] "Senegal" "Slovenia"
## [85] "South Africa" "Spain"
## [87] "Sri Lanka" "Sudan"
## [89] "Suriname" "Sweden"
## [91] "Switzerland" "Tajikistan"
## [93] "Thailand" "Tunisia"
## [95] "Turkey" "Uganda"
## [97] "Ukraine" "United Kingdom"
## [99] "Uruguay" "Zambia"
## [101] "Zimbabwe"
# Add a new column 'Continent' to the dataframe based on 'Region'
data_cleaned$Continent <- dplyr::case_when(
data_cleaned$Country %in% c("Albania", "Austria", "Belarus", "Belgium", "Bulgaria", "Croatia",
"Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
"Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Montenegro",
"Netherlands", "Norway", "Poland", "Portugal", "Romania", "Slovenia",
"Spain", "Sweden", "Switzerland", "Ukraine", "United Kingdom") ~ "Europe",
data_cleaned$Country %in% c("Algeria", "Angola", "Botswana", "Burkina Faso", "Burundi", "Cameroon",
"Central African Republic", "Egypt", "Eritrea", "Ghana", "Guinea",
"Kenya", "Lesotho", "Libya", "Madagascar", "Malawi", "Mali",
"Mauritania", "Mauritius", "Morocco", "Mozambique", "Namibia",
"Niger", "Rwanda", "Senegal", "South Africa", "Sudan", "Tunisia",
"Uganda", "Zambia", "Zimbabwe") ~ "Africa",
data_cleaned$Country %in% c("Argentina", "Brazil", "Chile", "Colombia", "Ecuador", "Guyana",
"Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela") ~ "South America",
data_cleaned$Country %in% c("Bahamas", "Canada", "Dominican Republic", "El Salvador", "Guatemala",
"Haiti", "Honduras", "Jamaica", "Mexico", "Nicaragua", "Panama",
"United States") ~ "North America",
data_cleaned$Country %in% c("Armenia", "Azerbaijan", "Bahrain", "Bangladesh", "India", "Indonesia",
"Iraq", "Japan", "Kazakhstan", "Lebanon", "Malaysia", "Nepal",
"Pakistan", "Qatar", "Saudi Arabia", "Sri Lanka", "Tajikistan",
"Thailand", "Turkey", "United Arab Emirates", "Vietnam") ~ "Asia",
data_cleaned$Country %in% c("Australia", "New Zealand", "Papua New Guinea") ~ "Oceania",
TRUE ~ "Other" # For countries not matched above
)
# View the updated dataframe
head(data_cleaned)
## # A tibble: 6 × 9
## `Unnamed: 0` Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 Albania Maize 1990 36613 1485
## 2 1 Albania Potatoes 1990 66667 1485
## 3 2 Albania Rice, p… 1990 23333 1485
## 4 3 Albania Sorghum 1990 12500 1485
## 5 4 Albania Soybeans 1990 7000 1485
## 6 5 Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 3 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>, Continent <chr>
# Reorder the columns as requested
data_reordered <- data_cleaned %>%
select(Continent, Country, Item, Year, `Crop_yield(Hg/Ha)`,
Average_rain_fall_mm_per_year, Pesticides_tonnes, Avg_temp)
# Print the first few rows of the reordered data to verify
head(data_reordered)
## # A tibble: 6 × 8
## Continent Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Europe Albania Maize 1990 36613 1485
## 2 Europe Albania Potatoes 1990 66667 1485
## 3 Europe Albania Rice, paddy 1990 23333 1485
## 4 Europe Albania Sorghum 1990 12500 1485
## 5 Europe Albania Soybeans 1990 7000 1485
## 6 Europe Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 2 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>
# Check shape before removing outliers
cat("Shape before cleaning:\n")
## Shape before cleaning:
cat("Rows:", nrow(data_reordered), "Columns:", ncol(data_reordered), "\n")
## Rows: 28242 Columns: 8
# Load necessary library
library(ggplot2)
# Specify the numeric columns for which you want boxplots
numeric_columns <- c("Crop_yield(Hg/Ha)", "Average_rain_fall_mm_per_year",
"Pesticides_tonnes", "Avg_temp")
# Loop through each specified column and create a boxplot
for (col in numeric_columns) {
# Create a boxplot for the current column
p <- ggplot(data_reordered, aes(y = .data[[col]])) +
geom_boxplot(fill = "blue", color = "black") +
ggtitle(paste("Boxplot of", col)) +
ylab(col) +
theme_minimal()
# Print the plot
print(p)
}
# Remove outliers in the `Avg_temp` column using the IQR method
data_reordered <- data_reordered %>%
filter(
Avg_temp >= quantile(Avg_temp, 0.25, na.rm = TRUE) - 1.5 * IQR(Avg_temp, na.rm = TRUE) &
Avg_temp <= quantile(Avg_temp, 0.75, na.rm = TRUE) + 1.5 * IQR(Avg_temp, na.rm = TRUE)
)
# Check shape before removing outliers
cat("Shape after cleaning:\n")
## Shape after cleaning:
cat("Rows:", nrow(data_reordered), "Columns:", ncol(data_reordered), "\n")
## Rows: 28208 Columns: 8
# Load necessary library
library(ggplot2)
# Specify the numeric columns for which you want boxplots
numeric_columns <- c("Avg_temp")
# Loop through each specified column and create a boxplot
for (col in numeric_columns) {
# Create a boxplot for the current column
p <- ggplot(data_reordered, aes(y = .data[[col]])) +
geom_boxplot(fill = "blue", color = "black") +
ggtitle(paste("Boxplot of", col)) +
ylab(col) +
theme_minimal()
# Print the plot
print(p)
}
EDA aims to:
library(dplyr)
library(ggplot2)
library(maps)
## Warning: package 'maps' was built under R version 4.4.2
# Aggregate total crop production for the past 3 years
data_cleaned_filtered <- data_reordered %>%
filter(Year >= max(Year) - 2) %>%
group_by(Country) %>%
summarize(Total_Production = sum(`Crop_yield(Hg/Ha)`))
# Get world map data
world_map <- map_data("world")
# Merge map and production data
map_data_merged <- world_map %>%
left_join(data_cleaned_filtered, by = c("region" = "Country"))
# Plot world map
options(repr.plot.width = 16, repr.plot.height = 10) # Example of larger size
ggplot(map_data_merged, aes(long, lat, group = group, fill = Total_Production)) +
geom_polygon(color = "white") +
scale_fill_viridis_c(option = "C") +
theme_minimal(base_size = 16) + # Increase text size
labs(
title = "World Map of Total Crop Production (Last 3 Years)",
fill = "Total Production"
) +
theme(
plot.title = element_text(size = 20), # Title size
axis.text = element_text(size = 14), # Axis text size
legend.title = element_text(size = 16), # Legend title size
legend.text = element_text(size = 12) # Legend text size
)
Insights
High Crop Production Regions: India (yellow region) and parts of South America (notably Brazil) exhibit significantly high crop production, likely due to their large agricultural areas and favorable climatic conditions.
Moderate to Low Crop Production: Several regions in Africa, Europe, and North America show moderate production levels (shades of purple). This could reflect the balance between agricultural activity and land usage for other purposes.
Minimal Production Areas: Regions with low or no color intensity (gray areas) indicate very low or no recorded crop production. This may include desert regions, high-altitude areas, or urbanized zones with limited agricultural activities.
# Boxplot of rainfall distribution
ggplot(data_reordered, aes(x = reorder(Country, Average_rain_fall_mm_per_year), y = Average_rain_fall_mm_per_year)) +
geom_boxplot(fill = "skyblue", alpha = 0.7) +
coord_flip() +
theme_minimal() +
labs(
title = "Rainfall Distribution by Country",
x = "Country",
y = "Average Rainfall (mm)"
)
Wide Variation in Rainfall: There is a significant range in average annual rainfall among countries, from close to 0 mm in arid regions to over 3,000 mm in tropical countries.
Tropical Countries at the Top: Countries like Papua New Guinea, Colombia, and Indonesia are among the highest in rainfall, reflecting their location in tropical regions with heavy annual precipitation.
Arid Regions at the Bottom: Countries like Saudi Arabia and other Middle Eastern or desert-dominated nations are at the bottom of the list, receiving very little average annual rainfall.
Optimal Rainfall for High Crop Production:Countries like India and Brazil exhibit high crop production and moderate-to-high rainfall levels. These regions strike a balance between adequate rainfall and other favorable agricultural conditions (fertile soil, temperature, and irrigation infrastructure).
While high rainfall benefits crop production, excessive rainfall (as seen in countries like Papua New Guinea and Colombia) may not always correlate with the highest crop yields due to challenges like flooding, poor soil drainage, or erosion.
Low Rainfall and Low Crop Production:Arid and semi-arid regions (e.g., Saudi Arabia) show low crop production, which is directly linked to limited rainfall. Agriculture in these regions often depends on irrigation systems and water resource management to support crops.
Moderate Rainfall with Variability in Production: Some countries with moderate rainfall (e.g., regions in Africa) show relatively low crop production despite sufficient precipitation. This discrepancy may be due to factors like lack of infrastructure, soil fertility issues, or underdeveloped agricultural practices.
# Scatter plot of pesticide usage vs crop yield
ggplot(data_reordered, aes(x = Pesticides_tonnes, y = `Crop_yield(Hg/Ha)`, color = Country)) +
geom_point(alpha = 0.7, size = 3) +
theme_minimal() +
labs(
title = "Pesticide Usage vs. Crop Yield",
x = "Pesticide Usage (tonnes)",
y = "Crop Yield (Hg/Ha)"
)
ggplot(data_reordered, aes(x = Avg_temp, y = `Crop_yield(Hg/Ha)`)) +
geom_point(alpha = 0.5) +
facet_wrap(~Item, scales = "free_y") +
labs(title = "Impact of Average Temperature on Crop Yield (by Crop)",
x = "Average Temperature (°C)",
y = "Crop Yield (Hg/Ha)") +
theme_minimal()
Insights: 1. Optimal Temperature for Each Crop:
- Cassava: Shows high yields at higher average temperatures, typically above 20°C. This suggests cassava thrives in warmer climates.
- Maize: Peak yields are observed in the mid-temperature range (15–25°C), indicating that maize performs well in moderate climates.
- Plantains and Others: Higher yields are concentrated at temperatures around 25°C, suggesting these crops prefer tropical climates.
- Potatoes: Yield appears to peak in a cooler range, around 10–20°C, indicating suitability for temperate regions.
- Rice (Paddy): Performs consistently well between 20–30°C, suggesting rice adapts well to warm and humid conditions.
- Sorghum: Optimal yield occurs in the mid-temperature range (15–25°C), similar to maize, indicating its resilience in semi-arid conditions.
- Soybeans: Yields are spread across a wide temperature range, with optimal results between 15–25°C.
- Sweet Potatoes: High yields are observed in a moderate range of 15–25°C, indicating adaptability to tropical and subtropical climates.
- Wheat: Peaks are observed in the cooler range of 10–20°C, indicating its preference for temperate climates.
- Yams: Yield appears highest at warmer temperatures around 20–30°C, consistent with its origin in tropical regions.
Temperature as a Key Factor:
# Heatmap for correlations
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.2
# Selecting numeric columns for correlation analysis
numeric_data <- data_reordered %>%
select_if(is.numeric)
# Compute correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Plot the heatmap
ggcorrplot(cor_matrix,
method = "circle",
lab = TRUE,
title = "Heatmap of Variable Correlations")
# Load necessary libraries
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.2
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.2
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.2
options(warn = -1)
# Copy the dataframe
dataModel <- data_reordered
head(dataModel)
## # A tibble: 6 × 8
## Continent Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Europe Albania Maize 1990 36613 1485
## 2 Europe Albania Potatoes 1990 66667 1485
## 3 Europe Albania Rice, paddy 1990 23333 1485
## 4 Europe Albania Sorghum 1990 12500 1485
## 5 Europe Albania Soybeans 1990 7000 1485
## 6 Europe Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 2 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>
# Convert categorical variables to factors
dataModel$Continent <- as.factor(dataModel$Continent)
dataModel$Item <- as.factor(dataModel$Item)
dataModel$Country <- as.factor(dataModel$Country) # Convert 'Country' to factor
# Create a mapping of labels to original names (before converting to numeric)
country_mapping <- data.frame(
Country_Label = seq_along(levels(dataModel$Country)), # Numeric labels
Country_Name = levels(dataModel$Country) # Original country names
)
# Replace 'Country' with numeric labels for label encoding
dataModel$Country <- as.numeric(dataModel$Country)
# View the mapping (optional)
print(country_mapping) # View the mapping
## Country_Label Country_Name
## 1 1 Albania
## 2 2 Algeria
## 3 3 Angola
## 4 4 Argentina
## 5 5 Armenia
## 6 6 Australia
## 7 7 Austria
## 8 8 Azerbaijan
## 9 9 Bahamas
## 10 10 Bahrain
## 11 11 Bangladesh
## 12 12 Belarus
## 13 13 Belgium
## 14 14 Botswana
## 15 15 Brazil
## 16 16 Bulgaria
## 17 17 Burkina Faso
## 18 18 Burundi
## 19 19 Cameroon
## 20 20 Canada
## 21 21 Central African Republic
## 22 22 Chile
## 23 23 Colombia
## 24 24 Croatia
## 25 25 Denmark
## 26 26 Dominican Republic
## 27 27 Ecuador
## 28 28 Egypt
## 29 29 El Salvador
## 30 30 Eritrea
## 31 31 Estonia
## 32 32 Finland
## 33 33 France
## 34 34 Germany
## 35 35 Ghana
## 36 36 Greece
## 37 37 Guatemala
## 38 38 Guinea
## 39 39 Guyana
## 40 40 Haiti
## 41 41 Honduras
## 42 42 Hungary
## 43 43 India
## 44 44 Indonesia
## 45 45 Iraq
## 46 46 Ireland
## 47 47 Italy
## 48 48 Jamaica
## 49 49 Japan
## 50 50 Kazakhstan
## 51 51 Kenya
## 52 52 Latvia
## 53 53 Lebanon
## 54 54 Lesotho
## 55 55 Libya
## 56 56 Lithuania
## 57 57 Madagascar
## 58 58 Malawi
## 59 59 Malaysia
## 60 60 Mali
## 61 61 Mauritania
## 62 62 Mauritius
## 63 63 Mexico
## 64 64 Montenegro
## 65 65 Morocco
## 66 66 Mozambique
## 67 67 Namibia
## 68 68 Nepal
## 69 69 Netherlands
## 70 70 New Zealand
## 71 71 Nicaragua
## 72 72 Niger
## 73 73 Norway
## 74 74 Pakistan
## 75 75 Papua New Guinea
## 76 76 Peru
## 77 77 Poland
## 78 78 Portugal
## 79 79 Qatar
## 80 80 Romania
## 81 81 Rwanda
## 82 82 Saudi Arabia
## 83 83 Senegal
## 84 84 Slovenia
## 85 85 South Africa
## 86 86 Spain
## 87 87 Sri Lanka
## 88 88 Sudan
## 89 89 Suriname
## 90 90 Sweden
## 91 91 Switzerland
## 92 92 Tajikistan
## 93 93 Thailand
## 94 94 Tunisia
## 95 95 Turkey
## 96 96 Uganda
## 97 97 Ukraine
## 98 98 United Kingdom
## 99 99 Uruguay
## 100 100 Zambia
## 101 101 Zimbabwe
head(dataModel) # View the updated dataset
## # A tibble: 6 × 8
## Continent Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <fct> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 Europe 1 Maize 1990 36613 1485
## 2 Europe 1 Potatoes 1990 66667 1485
## 3 Europe 1 Rice, paddy 1990 23333 1485
## 4 Europe 1 Sorghum 1990 12500 1485
## 5 Europe 1 Soybeans 1990 7000 1485
## 6 Europe 1 Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 2 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>
# Split the data into training and testing sets
set.seed(42)
trainIndex <- createDataPartition(dataModel$`Crop_yield(Hg/Ha)`, p = 0.8, list = FALSE)
train_data <- dataModel[trainIndex, ]
test_data <- dataModel[-trainIndex, ]
# Separate features and target variable
x_train <- train_data[, !(names(train_data) %in% c("Crop_yield(Hg/Ha)"))]
y_train <- train_data$`Crop_yield(Hg/Ha)`
x_test <- test_data[, !(names(test_data) %in% c("Crop_yield(Hg/Ha)"))]
y_test <- test_data$`Crop_yield(Hg/Ha)`
The regression model is learning the relationship between the features (inputs) and the target variable (crop yield):
Features (inputs): Variables like Avg_temp, Rainfall, Pesticides, or others you provide.
Target variable (output): Crop_yield (continuous numeric variable).
The model is essentially answering: “How does a change in temperature, rainfall, or pesticide use affect the crop yield?”
Once trained, the regression model takes the feature values (e.g., temperature, rainfall) and predicts the expected crop yield. For example:
Input:
Output:
# Suppress warnings
options(warn = -1)
# Load necessary library
library(caret)
# Training Linear Regression model
set.seed(42)
lm_model <- train(
x_train, y_train,
method = "lm",
trControl = trainControl(method = "cv", number = 5)
)
# Make predictions
lm_predictions <- predict(lm_model, x_test)
# Evaluate the model
lm_rmse <- sqrt(mean((lm_predictions - y_test)^2))
lm_r2 <- 1 - sum((y_test - lm_predictions)^2) / sum((y_test - mean(y_test))^2)
# Print evaluation results
cat("Linear Regression Results:\n")
## Linear Regression Results:
cat("RMSE:", lm_rmse, "\n")
## RMSE: 48538.57
cat("R²:", lm_r2, "\n")
## R²: 0.6798097
# Load necessary library
library(randomForest)
# Training Random Forest model
set.seed(42)
rf_model <- randomForest(
x = x_train, y = y_train,
ntree = 100
)
# Make predictions
rf_predictions <- predict(rf_model, x_test)
# Evaluate the model
rf_rmse <- sqrt(mean((rf_predictions - y_test)^2))
rf_r2 <- 1 - sum((y_test - rf_predictions)^2) / sum((y_test - mean(y_test))^2)
# Print evaluation results
cat("\nRandom Forest Results:\n")
##
## Random Forest Results:
cat("RMSE:", rf_rmse, "\n")
## RMSE: 14027.52
cat("R²:", rf_r2, "\n")
## R²: 0.9732579
# Load necessary library
library(gbm)
# Convert Country to a factor in train and test data
train_data$Country <- as.factor(train_data$Country)
test_data$Country <- as.factor(test_data$Country)
# Align factor levels in test_data with train_data
test_data$Country <- factor(test_data$Country, levels = levels(train_data$Country))
# Training Gradient Boosting model
set.seed(42)
gbm_model <- gbm(
formula = `Crop_yield(Hg/Ha)` ~ ., # Enclose column name with backticks
data = train_data,
distribution = "gaussian",
n.trees = 100,
interaction.depth = 3,
shrinkage = 0.1,
cv.folds = 5
)
# Make predictions
gbm_predictions <- predict(gbm_model, test_data, n.trees = 100)
# Evaluate the model
gbm_rmse <- sqrt(mean((gbm_predictions - y_test)^2)) # RMSE
gbm_r2 <- 1 - sum((y_test - gbm_predictions)^2) / sum((y_test - mean(y_test))^2) # R²
# Print evaluation results
cat("\nGradient Boosting Results:\n")
##
## Gradient Boosting Results:
cat("RMSE:", gbm_rmse, "\n")
## RMSE: 23369.39
cat("R²:", gbm_r2, "\n")
## R²: 0.9257786
# Load necessary library
library(xgboost)
library(dplyr) # For data manipulation
# Step 1: Ensure all categorical columns are factors
x_train <- x_train %>%
mutate(across(where(is.character), as.factor))
x_test <- x_test %>%
mutate(across(where(is.character), as.factor))
# Step 2: Convert factors to dummy variables
x_train_numeric <- as.matrix(model.matrix(~ . - 1, data = x_train))
x_test_numeric <- as.matrix(model.matrix(~ . - 1, data = x_test))
# Step 3: Prepare the XGBoost Datasets
dtrain <- xgb.DMatrix(data = x_train_numeric, label = y_train)
dtest <- xgb.DMatrix(data = x_test_numeric, label = y_test)
# Step 4: Train the XGBoost Model
set.seed(42)
xgb_model <- xgboost(
data = dtrain,
nrounds = 100, # Number of boosting rounds
objective = "reg:squarederror", # Objective for regression
max_depth = 3, # Maximum tree depth
eta = 0.1 # Learning rate
)
## [1] train-rmse:106149.948938
## [2] train-rmse:98849.975006
## [3] train-rmse:92442.033956
## [4] train-rmse:86898.818871
## [5] train-rmse:82089.909463
## [6] train-rmse:77898.984548
## [7] train-rmse:74309.790551
## [8] train-rmse:71097.291621
## [9] train-rmse:68424.833106
## [10] train-rmse:66128.448952
## [11] train-rmse:64150.065132
## [12] train-rmse:62291.788025
## [13] train-rmse:60800.941983
## [14] train-rmse:59533.643978
## [15] train-rmse:58376.977933
## [16] train-rmse:57419.761240
## [17] train-rmse:56487.627571
## [18] train-rmse:55760.850745
## [19] train-rmse:55025.936610
## [20] train-rmse:54395.283986
## [21] train-rmse:53802.837308
## [22] train-rmse:53276.669192
## [23] train-rmse:52822.890840
## [24] train-rmse:52384.386077
## [25] train-rmse:51936.188582
## [26] train-rmse:51493.693699
## [27] train-rmse:51081.591747
## [28] train-rmse:50767.257944
## [29] train-rmse:50483.739986
## [30] train-rmse:50187.872028
## [31] train-rmse:49862.101094
## [32] train-rmse:49594.054932
## [33] train-rmse:49312.323838
## [34] train-rmse:49029.518034
## [35] train-rmse:48803.870576
## [36] train-rmse:48429.877299
## [37] train-rmse:48223.303730
## [38] train-rmse:48003.604653
## [39] train-rmse:47820.727837
## [40] train-rmse:47633.363257
## [41] train-rmse:47312.004196
## [42] train-rmse:47149.246518
## [43] train-rmse:46978.321396
## [44] train-rmse:46818.530120
## [45] train-rmse:46574.912306
## [46] train-rmse:46391.967868
## [47] train-rmse:46253.887879
## [48] train-rmse:46102.170685
## [49] train-rmse:45977.851811
## [50] train-rmse:45807.368755
## [51] train-rmse:45683.083892
## [52] train-rmse:45567.364160
## [53] train-rmse:45457.176069
## [54] train-rmse:45272.889919
## [55] train-rmse:45164.772130
## [56] train-rmse:44938.687866
## [57] train-rmse:44783.850539
## [58] train-rmse:44678.099401
## [59] train-rmse:44583.575119
## [60] train-rmse:44485.909103
## [61] train-rmse:44400.893233
## [62] train-rmse:44308.658411
## [63] train-rmse:44226.277845
## [64] train-rmse:44062.405209
## [65] train-rmse:43976.427670
## [66] train-rmse:43873.952599
## [67] train-rmse:43768.337207
## [68] train-rmse:43693.059500
## [69] train-rmse:43601.100183
## [70] train-rmse:43515.763780
## [71] train-rmse:43449.003741
## [72] train-rmse:43114.971899
## [73] train-rmse:43003.638320
## [74] train-rmse:42937.032243
## [75] train-rmse:42847.558666
## [76] train-rmse:42742.445076
## [77] train-rmse:42673.881271
## [78] train-rmse:42603.058164
## [79] train-rmse:42472.129457
## [80] train-rmse:42413.606583
## [81] train-rmse:42345.769343
## [82] train-rmse:42254.316103
## [83] train-rmse:42192.525298
## [84] train-rmse:42130.460554
## [85] train-rmse:41869.105206
## [86] train-rmse:41814.615296
## [87] train-rmse:41753.133116
## [88] train-rmse:41559.610680
## [89] train-rmse:41503.328621
## [90] train-rmse:41424.843899
## [91] train-rmse:41230.301440
## [92] train-rmse:41175.646713
## [93] train-rmse:41122.861559
## [94] train-rmse:40915.542185
## [95] train-rmse:40863.072672
## [96] train-rmse:40779.595279
## [97] train-rmse:40728.115605
## [98] train-rmse:40677.523492
## [99] train-rmse:40589.277564
## [100] train-rmse:40539.292390
# Step 5: Make Predictions
xgb_predictions <- predict(xgb_model, x_test_numeric)
# Step 6: Evaluate the Model
xgb_rmse <- sqrt(mean((xgb_predictions - y_test)^2)) # Root Mean Square Error
xgb_r2 <- 1 - sum((y_test - xgb_predictions)^2) / sum((y_test - mean(y_test))^2) # R²
# Step 7: Print Evaluation Results
cat("\nXGBoost Results:\n")
##
## XGBoost Results:
cat("RMSE:", xgb_rmse, "\n")
## RMSE: 41312.06
cat("R²:", xgb_r2, "\n")
## R²: 0.7680535
# Convert all factor variables in x_train and x_test to dummy variables
x_train <- as.data.frame(model.matrix(~ . - 1, data = x_train)) # Dummy encoding
x_test <- as.data.frame(model.matrix(~ . - 1, data = x_test)) # Dummy encoding
# Train the k-NN Model
set.seed(42)
knn_model <- train(
x_train, y_train,
method = "knn",
trControl = trainControl(method = "cv", number = 5),
tuneGrid = expand.grid(k = c(3, 5, 7)) # Test multiple k values
)
# Make Predictions
knn_predictions <- predict(knn_model, x_test)
# Evaluate the Model
knn_rmse <- sqrt(mean((knn_predictions - y_test)^2)) # RMSE
knn_r2 <- 1 - sum((y_test - knn_predictions)^2) / sum((y_test - mean(y_test))^2) # R²
# Print Results
cat("\nK-Nearest Neighbors Results:\n")
##
## K-Nearest Neighbors Results:
cat("RMSE:", knn_rmse, "\n")
## RMSE: 68810.77
cat("R²:", knn_r2, "\n")
## R²: 0.3565023
# Load necessary library
library(rpart)
# Training Decision Tree model
dt_model <- rpart(
formula = `Crop_yield(Hg/Ha)` ~ .,
data = train_data,
method = "anova" # Regression tree
)
# Make predictions
dt_predictions <- predict(dt_model, test_data)
# Evaluate the model
dt_rmse <- sqrt(mean((dt_predictions - y_test)^2)) # RMSE
dt_r2 <- 1 - sum((y_test - dt_predictions)^2) / sum((y_test - mean(y_test))^2) # R²
# Print evaluation results
cat("\nDecision Tree Results:\n")
##
## Decision Tree Results:
cat("RMSE:", dt_rmse, "\n")
## RMSE: 33723.02
cat("R²:", dt_r2, "\n")
## R²: 0.8454436
# Collect results
results <- data.frame(
Model = c("Linear Regression", "Random Forest", "Gradient Boosting", "XGBoost", "KNN", "Decision Tree"),
RMSE = c(lm_rmse, rf_rmse, gbm_rmse, xgb_rmse, knn_rmse, dt_rmse),
R2 = c(lm_r2, rf_r2, gbm_r2, xgb_r2, knn_r2, dt_r2)
)
# Print all results
print(results)
## Model RMSE R2
## 1 Linear Regression 48538.57 0.6798097
## 2 Random Forest 14027.52 0.9732579
## 3 Gradient Boosting 23369.39 0.9257786
## 4 XGBoost 41312.06 0.7680535
## 5 KNN 68810.77 0.3565023
## 6 Decision Tree 33723.02 0.8454436
# Linear Regression visualization
print(paste("Linear Regression - RMSE:", lm_rmse))
## [1] "Linear Regression - RMSE: 48538.5710639287"
print(paste("Linear Regression - R²:", lm_r2))
## [1] "Linear Regression - R²: 0.679809745682794"
ggplot(data = data.frame(Actual = y_test, Predicted = lm_predictions),
aes(x = Actual, y = Predicted)) +
geom_point(color = "#9B673C", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "green", linewidth = 1) +
ggtitle("Linear Regression Evaluation") +
xlab("Actual Values") +
ylab("Predicted Values") +
theme_minimal()
# Random Forest visualization
print(paste("Random Forest - RMSE:", rf_rmse))
## [1] "Random Forest - RMSE: 14027.518366673"
print(paste("Random Forest - R²:", rf_r2))
## [1] "Random Forest - R²: 0.973257880368533"
ggplot(data = data.frame(Actual = y_test, Predicted = rf_predictions),
aes(x = Actual, y = Predicted)) +
geom_point(color = "#9B673C", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "green", linewidth = 1) +
ggtitle("Random Forest Evaluation") +
xlab("Actual Values") +
ylab("Predicted Values") +
theme_minimal()
# Gradient Boosting visualization
print(paste("Gradient Boosting - RMSE:", gbm_rmse))
## [1] "Gradient Boosting - RMSE: 23369.3903344111"
print(paste("Gradient Boosting - R²:", gbm_r2))
## [1] "Gradient Boosting - R²: 0.925778641279042"
ggplot(data = data.frame(Actual = y_test, Predicted = gbm_predictions),
aes(x = Actual, y = Predicted)) +
geom_point(color = "#9B673C", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "green", linewidth = 1) +
ggtitle("Gradient Boosting Evaluation") +
xlab("Actual Values") +
ylab("Predicted Values") +
theme_minimal()
# XGBoost visualization
print(paste("XGBoost - RMSE:", xgb_rmse))
## [1] "XGBoost - RMSE: 41312.0569453984"
print(paste("XGBoost - R²:", xgb_r2))
## [1] "XGBoost - R²: 0.768053526599436"
ggplot(data = data.frame(Actual = y_test, Predicted = xgb_predictions),
aes(x = Actual, y = Predicted)) +
geom_point(color = "#9B673C", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "green", linewidth = 1) +
ggtitle("XGBoost Evaluation") +
xlab("Actual Values") +
ylab("Predicted Values") +
theme_minimal()
# Print RMSE and R² for KNN
print(paste("KNN - RMSE:", knn_rmse))
## [1] "KNN - RMSE: 68810.7721535999"
print(paste("KNN - R²:", knn_r2))
## [1] "KNN - R²: 0.35650230172846"
# Create a scatter plot for KNN predictions
k <- ggplot(data = data.frame(Actual = y_test, Predicted = knn_predictions),
aes(x = Actual, y = Predicted)) +
geom_point(color = "#9B673C", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "green", linewidth = 1) +
ggtitle("KNN Model Evaluation") +
xlab("Actual Values") +
ylab("Predicted Values") +
theme_minimal()
# Print the plot
print(k)
# Print RMSE and R² for Decision Tree
cat("Decision Tree - RMSE:", dt_rmse, "\n")
## Decision Tree - RMSE: 33723.02
cat("Decision Tree - R²:", dt_r2, "\n")
## Decision Tree - R²: 0.8454436
# Create a scatter plot for Decision Tree predictions
library(ggplot2)
# Visualization: Actual vs. Predicted values
d <- ggplot(data = data.frame(Actual = y_test, Predicted = dt_predictions),
aes(x = Actual, y = Predicted)) +
geom_point(color = "#9B673C", size = 1) + # Scatter plot points
geom_abline(slope = 1, intercept = 0, color = "green", linewidth = 1) + # Diagonal reference line
ggtitle("Decision Tree Model Evaluation") +
xlab("Actual Values") +
ylab("Predicted Values") +
theme_minimal()
# Print the plot
print(d)
Insights
For this dataset (climate and crop yield data), the classification problem could be:
options(warn = -1)
# Copy the dataframe
dataModel2 <- data_reordered
head(dataModel2)
## # A tibble: 6 × 8
## Continent Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Europe Albania Maize 1990 36613 1485
## 2 Europe Albania Potatoes 1990 66667 1485
## 3 Europe Albania Rice, paddy 1990 23333 1485
## 4 Europe Albania Sorghum 1990 12500 1485
## 5 Europe Albania Soybeans 1990 7000 1485
## 6 Europe Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 2 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>
# Convert relevant variables to factors
dataModel2$Continent <- as.factor(dataModel2$Continent)
dataModel2$Item <- as.factor(dataModel2$Item) # Original crop type
dataModel2$Country <- as.factor(dataModel2$Country)
# Group Crop Types into Crop_Group with the new label for "Other Crops"
dataModel2$Crop_Group <- ifelse(dataModel2$Item %in% c("Cassava", "Yams", "Sweet potatoes", "Plantains and others"), "Root Crops",
ifelse(dataModel2$Item %in% c("Maize", "Rice, paddy", "Wheat", "Sorghum"), "Grains",
"Potatoes and Soybeans"))
# Convert 'Crop_Group' to a factor
dataModel2$Crop_Group <- as.factor(dataModel2$Crop_Group)
# View the grouped data (optional)
table(dataModel2$Crop_Group)
##
## Grains Potatoes and Soybeans Root Crops
## 14384 7486 6338
head(dataModel2)
## # A tibble: 6 × 9
## Continent Country Item Year `Crop_yield(Hg/Ha)` Average_rain_fall_mm…¹
## <fct> <fct> <fct> <dbl> <dbl> <dbl>
## 1 Europe Albania Maize 1990 36613 1485
## 2 Europe Albania Potatoes 1990 66667 1485
## 3 Europe Albania Rice, paddy 1990 23333 1485
## 4 Europe Albania Sorghum 1990 12500 1485
## 5 Europe Albania Soybeans 1990 7000 1485
## 6 Europe Albania Wheat 1990 30197 1485
## # ℹ abbreviated name: ¹Average_rain_fall_mm_per_year
## # ℹ 3 more variables: Pesticides_tonnes <dbl>, Avg_temp <dbl>, Crop_Group <fct>
# Set seed for reproducibility
set.seed(42)
# Split the data into training and testing sets
trainIndex <- createDataPartition(dataModel2$Crop_Group, p = 0.8, list = FALSE)
train_data <- dataModel2[trainIndex, ]
test_data <- dataModel2[-trainIndex, ]
# Separate features and target
x_train <- train_data[, c("Avg_temp", "Average_rain_fall_mm_per_year", "Pesticides_tonnes")]
y_train <- train_data$Crop_Group # Target variable
x_test <- test_data[, c("Avg_temp", "Average_rain_fall_mm_per_year", "Pesticides_tonnes")]
y_test <- test_data$Crop_Group # Target variable
library(randomForest)
# Train Random Forest Model
rf_model <- randomForest(
Crop_Group ~ Avg_temp + Average_rain_fall_mm_per_year + Pesticides_tonnes,
data = train_data,
ntree = 100
)
# Predict on test data
rf_predictions <- predict(rf_model, x_test)
# Evaluate Model
rf_confusion <- confusionMatrix(rf_predictions, y_test)
cat("Random Forest Accuracy:", rf_confusion$overall["Accuracy"], "\n")
## Random Forest Accuracy: 0.4588652
print(rf_confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Grains Potatoes and Soybeans Root Crops
## Grains 2374 1361 1083
## Potatoes and Soybeans 259 53 23
## Root Crops 243 83 161
##
## Overall Statistics
##
## Accuracy : 0.4589
## 95% CI : (0.4458, 0.472)
## No Information Rate : 0.5099
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0225
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: Grains Class: Potatoes and Soybeans
## Sensitivity 0.8255 0.035404
## Specificity 0.1158 0.931933
## Pos Pred Value 0.4927 0.158209
## Neg Pred Value 0.3893 0.727804
## Prevalence 0.5099 0.265426
## Detection Rate 0.4209 0.009397
## Detection Prevalence 0.8543 0.059397
## Balanced Accuracy 0.4706 0.483669
## Class: Root Crops
## Sensitivity 0.12707
## Specificity 0.92545
## Pos Pred Value 0.33060
## Neg Pred Value 0.78537
## Prevalence 0.22465
## Detection Rate 0.02855
## Detection Prevalence 0.08635
## Balanced Accuracy 0.52626
library(xgboost)
# Encode target variable (0-based encoding for XGBoost)
label_mapping <- data.frame(Class = levels(y_train), Label = seq_along(levels(y_train)) - 1)
y_train_encoded <- as.numeric(y_train) - 1
y_test_encoded <- as.numeric(y_test) - 1
# Prepare data for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(x_train), label = y_train_encoded)
dtest <- xgb.DMatrix(data = as.matrix(x_test), label = y_test_encoded)
# Train XGBoost Model
xgb_model <- xgboost(
data = dtrain,
max.depth = 6,
eta = 0.1,
nrounds = 100,
objective = "multi:softmax",
num_class = length(unique(y_train_encoded)),
verbose = 1
)
## [1] train-mlogloss:1.080786
## [2] train-mlogloss:1.065471
## [3] train-mlogloss:1.052269
## [4] train-mlogloss:1.040881
## [5] train-mlogloss:1.031007
## [6] train-mlogloss:1.022324
## [7] train-mlogloss:1.014748
## [8] train-mlogloss:1.008107
## [9] train-mlogloss:1.002263
## [10] train-mlogloss:0.997067
## [11] train-mlogloss:0.992400
## [12] train-mlogloss:0.988247
## [13] train-mlogloss:0.984522
## [14] train-mlogloss:0.981216
## [15] train-mlogloss:0.978232
## [16] train-mlogloss:0.975463
## [17] train-mlogloss:0.972935
## [18] train-mlogloss:0.970646
## [19] train-mlogloss:0.968617
## [20] train-mlogloss:0.966744
## [21] train-mlogloss:0.965039
## [22] train-mlogloss:0.963463
## [23] train-mlogloss:0.961944
## [24] train-mlogloss:0.960549
## [25] train-mlogloss:0.959327
## [26] train-mlogloss:0.958116
## [27] train-mlogloss:0.956993
## [28] train-mlogloss:0.956007
## [29] train-mlogloss:0.955047
## [30] train-mlogloss:0.954103
## [31] train-mlogloss:0.953250
## [32] train-mlogloss:0.952429
## [33] train-mlogloss:0.951691
## [34] train-mlogloss:0.951038
## [35] train-mlogloss:0.950411
## [36] train-mlogloss:0.949829
## [37] train-mlogloss:0.949291
## [38] train-mlogloss:0.948740
## [39] train-mlogloss:0.948246
## [40] train-mlogloss:0.947765
## [41] train-mlogloss:0.947307
## [42] train-mlogloss:0.946898
## [43] train-mlogloss:0.946450
## [44] train-mlogloss:0.946026
## [45] train-mlogloss:0.945591
## [46] train-mlogloss:0.945189
## [47] train-mlogloss:0.944824
## [48] train-mlogloss:0.944495
## [49] train-mlogloss:0.944188
## [50] train-mlogloss:0.943851
## [51] train-mlogloss:0.943568
## [52] train-mlogloss:0.943249
## [53] train-mlogloss:0.942970
## [54] train-mlogloss:0.942699
## [55] train-mlogloss:0.942447
## [56] train-mlogloss:0.942171
## [57] train-mlogloss:0.941928
## [58] train-mlogloss:0.941699
## [59] train-mlogloss:0.941470
## [60] train-mlogloss:0.941246
## [61] train-mlogloss:0.941029
## [62] train-mlogloss:0.940860
## [63] train-mlogloss:0.940657
## [64] train-mlogloss:0.940480
## [65] train-mlogloss:0.940321
## [66] train-mlogloss:0.940116
## [67] train-mlogloss:0.939998
## [68] train-mlogloss:0.939830
## [69] train-mlogloss:0.939682
## [70] train-mlogloss:0.939547
## [71] train-mlogloss:0.939403
## [72] train-mlogloss:0.939240
## [73] train-mlogloss:0.939106
## [74] train-mlogloss:0.939014
## [75] train-mlogloss:0.938873
## [76] train-mlogloss:0.938778
## [77] train-mlogloss:0.938630
## [78] train-mlogloss:0.938518
## [79] train-mlogloss:0.938379
## [80] train-mlogloss:0.938287
## [81] train-mlogloss:0.938162
## [82] train-mlogloss:0.938077
## [83] train-mlogloss:0.937945
## [84] train-mlogloss:0.937858
## [85] train-mlogloss:0.937789
## [86] train-mlogloss:0.937687
## [87] train-mlogloss:0.937619
## [88] train-mlogloss:0.937499
## [89] train-mlogloss:0.937394
## [90] train-mlogloss:0.937324
## [91] train-mlogloss:0.937233
## [92] train-mlogloss:0.937118
## [93] train-mlogloss:0.937001
## [94] train-mlogloss:0.936906
## [95] train-mlogloss:0.936779
## [96] train-mlogloss:0.936678
## [97] train-mlogloss:0.936608
## [98] train-mlogloss:0.936522
## [99] train-mlogloss:0.936456
## [100] train-mlogloss:0.936381
# Predict on test data
xgb_predictions <- predict(xgb_model, dtest)
xgb_predictions <- factor(xgb_predictions, levels = label_mapping$Label, labels = label_mapping$Class)
# Evaluate Model
xgb_confusion <- confusionMatrix(xgb_predictions, y_test)
cat("XGBoost Accuracy:", xgb_confusion$overall["Accuracy"], "\n")
## XGBoost Accuracy: 0.4803191
print(xgb_confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Grains Potatoes and Soybeans Root Crops
## Grains 2472 1369 1054
## Potatoes and Soybeans 182 48 24
## Root Crops 222 80 189
##
## Overall Statistics
##
## Accuracy : 0.4803
## 95% CI : (0.4672, 0.4935)
## No Information Rate : 0.5099
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0119
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: Grains Class: Potatoes and Soybeans
## Sensitivity 0.8595 0.032064
## Specificity 0.1234 0.950278
## Pos Pred Value 0.5050 0.188976
## Neg Pred Value 0.4577 0.730969
## Prevalence 0.5099 0.265426
## Detection Rate 0.4383 0.008511
## Detection Prevalence 0.8679 0.045035
## Balanced Accuracy 0.4914 0.491171
## Class: Root Crops
## Sensitivity 0.14917
## Specificity 0.93094
## Pos Pred Value 0.38493
## Neg Pred Value 0.79064
## Prevalence 0.22465
## Detection Rate 0.03351
## Detection Prevalence 0.08706
## Balanced Accuracy 0.54006
library(e1071)
library(caret)
# Train SVM Model
svm_model <- svm(
Crop_Group ~ Avg_temp + Average_rain_fall_mm_per_year + Pesticides_tonnes,
data = train_data,
kernel = "radial" # Radial basis kernel for non-linear decision boundaries
)
# Predict on test data
svm_predictions <- predict(svm_model, x_test)
# Evaluate Model
svm_confusion <- confusionMatrix(svm_predictions, y_test)
cat("SVM Accuracy:", svm_confusion$overall["Accuracy"], "\n")
## SVM Accuracy: 0.5106383
print(svm_confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Grains Potatoes and Soybeans Root Crops
## Grains 2706 1420 1093
## Potatoes and Soybeans 0 0 0
## Root Crops 170 77 174
##
## Overall Statistics
##
## Accuracy : 0.5106
## 95% CI : (0.4975, 0.5238)
## No Information Rate : 0.5099
## P-Value [Acc > NIR] : 0.4629
##
## Kappa : 0.043
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: Grains Class: Potatoes and Soybeans
## Sensitivity 0.94089 0.0000
## Specificity 0.09081 1.0000
## Pos Pred Value 0.51849 NaN
## Neg Pred Value 0.59620 0.7346
## Prevalence 0.50993 0.2654
## Detection Rate 0.47979 0.0000
## Detection Prevalence 0.92535 0.0000
## Balanced Accuracy 0.51585 0.5000
## Class: Root Crops
## Sensitivity 0.13733
## Specificity 0.94352
## Pos Pred Value 0.41330
## Neg Pred Value 0.79057
## Prevalence 0.22465
## Detection Rate 0.03085
## Detection Prevalence 0.07465
## Balanced Accuracy 0.54042
library(rpart)
library(caret)
# Train Decision Tree Model
dt_model <- rpart(
Crop_Group ~ Avg_temp + Average_rain_fall_mm_per_year + Pesticides_tonnes,
data = train_data,
method = "class"
)
# Predict on test data
dt_predictions <- predict(dt_model, x_test, type = "class")
# Evaluate Model
dt_confusion <- confusionMatrix(dt_predictions, y_test)
cat("Decision Tree Accuracy:", dt_confusion$overall["Accuracy"], "\n")
## Decision Tree Accuracy: 0.5099291
print(dt_confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Grains Potatoes and Soybeans Root Crops
## Grains 2876 1497 1267
## Potatoes and Soybeans 0 0 0
## Root Crops 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5099
## 95% CI : (0.4968, 0.5231)
## No Information Rate : 0.5099
## P-Value [Acc > NIR] : 0.5053
##
## Kappa : 0
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Grains Class: Potatoes and Soybeans
## Sensitivity 1.0000 0.0000
## Specificity 0.0000 1.0000
## Pos Pred Value 0.5099 NaN
## Neg Pred Value NaN 0.7346
## Prevalence 0.5099 0.2654
## Detection Rate 0.5099 0.0000
## Detection Prevalence 1.0000 0.0000
## Balanced Accuracy 0.5000 0.5000
## Class: Root Crops
## Sensitivity 0.0000
## Specificity 1.0000
## Pos Pred Value NaN
## Neg Pred Value 0.7754
## Prevalence 0.2246
## Detection Rate 0.0000
## Detection Prevalence 0.0000
## Balanced Accuracy 0.5000
# Compile Results
results <- data.frame(
Model = c("Random Forest", "XGBoost", "SVM", "Decision Tree"),
Accuracy = c(
rf_confusion$overall["Accuracy"],
xgb_confusion$overall["Accuracy"],
svm_confusion$overall["Accuracy"],
dt_confusion$overall["Accuracy"]
)
)
# Print Results
print(results)
## Model Accuracy
## 1 Random Forest 0.4588652
## 2 XGBoost 0.4803191
## 3 SVM 0.5106383
## 4 Decision Tree 0.5099291