# Load necessary libraries
library(caret) # For data preprocessing
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # For data visualization
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
library(class)
library(e1071)
# Load the dataset
# The CSV location can be overridden with the CROP_DATA_PATH environment
# variable; when it is unset, the original hard-coded location is used.
data_path <- Sys.getenv(
  "CROP_DATA_PATH",
  unset = "C:\\Users\\acer\\Downloads\\archive (1)\\Crop_recommendation.csv"
)
# Stop early with a readable message when the file is not where we expect it
if (!file.exists(data_path)) {
  stop("Crop recommendation CSV not found at: ", data_path, call. = FALSE)
}
data <- read.csv(data_path)
# 1. Overview of the dataset
str(data) # Structure of the dataset
## 'data.frame': 2200 obs. of 8 variables:
## $ N : int 90 85 60 74 78 69 69 94 89 68 ...
## $ P : int 42 58 55 35 42 37 55 53 54 58 ...
## $ K : int 43 41 44 40 42 42 38 40 38 38 ...
## $ temperature: num 20.9 21.8 23 26.5 20.1 ...
## $ humidity : num 82 80.3 82.3 80.2 81.6 ...
## $ ph : num 6.5 7.04 7.84 6.98 7.63 ...
## $ rainfall : num 203 227 264 243 263 ...
## $ label : chr "rice" "rice" "rice" "rice" ...
print(summary(data)) # Per-column summary statistics
## N P K temperature
## Min. : 0.00 Min. : 5.00 Min. : 5.00 Min. : 8.826
## 1st Qu.: 21.00 1st Qu.: 28.00 1st Qu.: 20.00 1st Qu.:22.769
## Median : 37.00 Median : 51.00 Median : 32.00 Median :25.599
## Mean : 50.55 Mean : 53.36 Mean : 48.15 Mean :25.616
## 3rd Qu.: 84.25 3rd Qu.: 68.00 3rd Qu.: 49.00 3rd Qu.:28.562
## Max. :140.00 Max. :145.00 Max. :205.00 Max. :43.675
## humidity ph rainfall label
## Min. :14.26 Min. :3.505 Min. : 20.21 Length:2200
## 1st Qu.:60.26 1st Qu.:5.972 1st Qu.: 64.55 Class :character
## Median :80.47 Median :6.425 Median : 94.87 Mode :character
## Mean :71.48 Mean :6.469 Mean :103.46
## 3rd Qu.:89.95 3rd Qu.:6.924 3rd Qu.:124.27
## Max. :99.98 Max. :9.935 Max. :298.56
# 2. Explore the target variable (the crop name stored in `label`)
print(table(data[["label"]])) # Number of rows per crop
##
## apple banana blackgram chickpea coconut coffee
## 100 100 100 100 100 100
## cotton grapes jute kidneybeans lentil maize
## 100 100 100 100 100 100
## mango mothbeans mungbean muskmelon orange papaya
## 100 100 100 100 100 100
## pigeonpeas pomegranate rice watermelon
## 100 100 100 100
# 3. Numeric Feature Distributions
numeric_features <- c("N", "P", "K", "temperature", "humidity", "ph", "rainfall")
# Plot one histogram per numeric feature
for (feature in numeric_features) {
  # .data[[feature]] looks the column up inside the plot's own data by name,
  # instead of capturing the global `data` object inside aes().
  histogram_plot <- ggplot(data, aes(x = .data[[feature]])) +
    geom_histogram(binwidth = 1, fill = "blue", color = "black") +
    labs(title = paste("Histogram of", feature), x = feature, y = "Frequency")
  # A ggplot object is only drawn when it is printed, and auto-printing does
  # not happen inside a for loop, so print it explicitly.
  print(histogram_plot)
}
# 5. Correlation Analysis (if applicable)
# Pairwise correlations between the numeric feature columns
cor_matrix <- cor(data[numeric_features])
print(cor_matrix)
## N P K temperature humidity
## N 1.00000000 -0.23145958 -0.14051184 0.02650380 0.190688379
## P -0.23145958 1.00000000 0.73623222 -0.12754113 -0.118734116
## K -0.14051184 0.73623222 1.00000000 -0.16038713 0.190858861
## temperature 0.02650380 -0.12754113 -0.16038713 1.00000000 0.205319677
## humidity 0.19068838 -0.11873412 0.19085886 0.20531968 1.000000000
## ph 0.09668285 -0.13801889 -0.16950310 -0.01779502 -0.008482539
## rainfall 0.05902022 -0.06383905 -0.05346135 -0.03008378 0.094423053
## ph rainfall
## N 0.096682846 0.05902022
## P -0.138018893 -0.06383905
## K -0.169503098 -0.05346135
## temperature -0.017795017 -0.03008378
## humidity -0.008482539 0.09442305
## ph 1.000000000 -0.10906948
## rainfall -0.109069484 1.00000000
# 6. Outlier Detection
# Use box plots to detect outliers in numeric features
for (feature in numeric_features) {
  # .data[[feature]] looks the column up inside the plot's own data by name.
  # The original fill = "" is not a valid colour name and fails at draw time,
  # so a real colour is used here.
  box_plot <- ggplot(data, aes(y = .data[[feature]])) +
    geom_boxplot(fill = "lightblue") +
    labs(title = paste("Box Plot of", feature), y = feature)
  # A ggplot object is only drawn when it is printed, and auto-printing does
  # not happen inside a for loop, so print it explicitly.
  print(box_plot)
}
# Columns to show in the scatterplot matrix; edit this selection to change
# which attributes are visualized
attributes_subset <- data[
  c("N", "P", "K", "temperature", "humidity", "ph", "rainfall")
]
# Draw the scatterplot matrix for the selected columns
pairs(attributes_subset)

# Scatter plot of N against P, coloured by `label`, with one facet per label
# and independent axis scales in every facet
ggplot(data = data, mapping = aes(x = N, y = P, colour = label)) +
  geom_point() +
  facet_wrap(~label, scales = "free")

# 7. Pairwise Scatterplots (if applicable)
# Keep only the numeric feature columns and plot every pair against each other
scatter_data <- data[numeric_features]
pairs(scatter_data)

# 8. Missing Value Analysis
# Count the NA entries in every column of the dataset
missing_values <- vapply(
  data,
  function(column) sum(is.na(column)),
  numeric(1)
)
print(missing_values)
## N P K temperature humidity ph
## 0 0 0 0 0 0
## rainfall label
## 0 0
# 1. Handling Missing Values (if any)
# Mean-impute every numeric feature in one pass: NA entries of a column are
# replaced with that column's mean computed over its non-missing values.
data[numeric_features] <- lapply(data[numeric_features], function(column) {
  column[is.na(column)] <- mean(column, na.rm = TRUE)
  column
})
# Store the crop label as a factor
data[["label"]] <- as.factor(data[["label"]])
# Working copy used by the train/test split and the models
data_df <- data
head(data_df)
## N P K temperature humidity ph rainfall label
## 1 90 42 43 20.87974 82.00274 6.502985 202.9355 rice
## 2 85 58 41 21.77046 80.31964 7.038096 226.6555 rice
## 3 60 55 44 23.00446 82.32076 7.840207 263.9642 rice
## 4 74 35 40 26.49110 80.15836 6.980401 242.8640 rice
## 5 78 42 42 20.13017 81.60487 7.628473 262.7173 rice
## 6 69 37 42 23.05805 83.37012 7.073454 251.0550 rice
# 4. Splitting the data into training and testing sets
# Expects the prepared dataset to be available as `data_df`
# Fix the RNG state so the same rows are drawn on every run
set.seed(123)
# Share of the rows that goes into the training set (75%)
train_proportion <- 0.75
# Training-set size, rounded to a whole number of rows
num_train_samples <- round(train_proportion * nrow(data_df))
# Draw the row indices of the training set at random
train_indices <- sample(seq_len(nrow(data_df)), size = num_train_samples)
# Drawn rows form the training set; every remaining row goes to the test set
data_train <- data_df[train_indices, ]
data_test <- data_df[-train_indices, ]
str(data_train)
## 'data.frame': 1650 obs. of 8 variables:
## $ N : num 36 71 40 26 8 40 119 14 21 118 ...
## $ P : num 56 60 5 32 133 132 72 41 29 88 ...
## $ K : num 20 22 29 32 195 202 55 17 12 52 ...
## $ temperature: num 25.4 26.1 28.5 30.9 20.5 ...
## $ humidity : num 49.7 59.4 97.8 49.9 81 ...
## $ ph : num 7.44 6.2 5.82 6.81 6.46 ...
## $ rainfall : num 31.9 85.8 160.4 90.1 71.3 ...
## $ label : Factor w/ 22 levels "apple","banana",..: 14 12 5 13 8 8 2 15 17 2 ...
# Create the Random Forest model (100 trees)
# NOTE(review): unlike the SVM and decision-tree models fitted later in this
# script, this formula leaves out P and K -- confirm whether that is
# intentional before comparing the reported accuracies.
model_rf <- randomForest(
  label ~ N + temperature + humidity + ph + rainfall,
  data = data_train,
  ntree = 100
)
# Make predictions on the test data
predictions_rf <- predict(model_rf, data_test)
# Cross-tabulate predicted against actual labels; the diagonal holds the
# correctly classified rows
confusion_matrix_rf <- table(predictions_rf, data_test$label)
accuracy_rf <- sum(diag(confusion_matrix_rf)) / sum(confusion_matrix_rf)
cat("RF Accuracy: ", accuracy_rf, "\n")
## RF Accuracy: 0.9727273
# Fit the SVM classifier on all seven numeric features
model_svm <- svm(
  label ~ N + P + K + temperature + humidity + ph + rainfall,
  data = data_train
)
# Predict the crop label for every row of the test set
predictions_svm <- predict(model_svm, newdata = data_test)
# Evaluate the SVM: cross-tabulate predicted against actual labels, then take
# the share of rows on the diagonal (correct predictions)
confusion_matrix_svm <- table(predictions_svm, data_test[["label"]])
accuracy_svm <- sum(diag(confusion_matrix_svm)) / sum(confusion_matrix_svm)
cat("SVM Accuracy: ", accuracy_svm, "\n")
## SVM Accuracy: 0.9818182
# Fit a decision tree model
tree_model <- rpart(
  label ~ N + P + K + temperature + humidity + ph + rainfall,
  data = data_train,
  method = "class"
)
# Make predictions
predictions <- predict(tree_model, data_test, type = "class")
# Create a confusion matrix
confusion_matrix <- confusionMatrix(predictions, data_test$label)
# Calculate metrics from the confusion matrix
accuracy <- confusion_matrix$overall["Accuracy"]
# With more than two classes `byClass` is a matrix (one row per class, one
# column per statistic), so a statistic must be selected as a column.
# Indexing it with a single name, as before, returned NA. The per-class values
# are macro-averaged into one number; na.rm = TRUE skips classes whose
# statistic is undefined (e.g. a class that was never predicted).
precision <- mean(confusion_matrix$byClass[, "Pos Pred Value"], na.rm = TRUE)
recall <- mean(confusion_matrix$byClass[, "Sensitivity"], na.rm = TRUE)
f1_score <- mean(confusion_matrix$byClass[, "F1"], na.rm = TRUE)
# Print the metrics (precision, recall and F1 are macro-averages over classes)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.9545455
cat("Precision: ", precision, "\n")
cat("Recall: ", recall, "\n")
cat("F1 Score: ", f1_score, "\n")
# Print the confusion matrix
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction apple banana blackgram chickpea coconut coffee cotton grapes jute
## apple 27 0 0 0 0 0 0 0 0
## banana 0 22 0 0 0 0 0 0 0
## blackgram 0 0 21 0 0 0 0 0 0
## chickpea 0 0 0 24 0 0 0 0 0
## coconut 0 0 0 0 23 0 0 0 0
## coffee 0 0 0 0 0 22 0 0 0
## cotton 0 0 0 0 0 0 26 0 0
## grapes 0 0 0 0 0 0 0 25 0
## jute 0 0 0 0 0 0 0 0 22
## kidneybeans 0 0 0 0 0 0 0 0 0
## lentil 0 0 0 0 0 0 0 0 0
## maize 0 0 0 0 0 0 0 0 0
## mango 0 0 0 0 0 0 0 0 0
## mothbeans 0 0 1 0 0 0 0 0 0
## mungbean 0 0 0 0 0 0 0 0 0
## muskmelon 0 0 0 0 0 0 0 0 0
## orange 0 0 0 0 0 0 0 0 0
## papaya 0 0 0 0 0 0 0 0 0
## pigeonpeas 0 0 0 0 0 0 0 0 0
## pomegranate 0 0 0 0 0 0 0 0 0
## rice 0 0 0 0 0 0 0 0 0
## watermelon 0 0 0 0 0 0 0 0 0
## Reference
## Prediction kidneybeans lentil maize mango mothbeans mungbean muskmelon
## apple 0 0 0 0 0 0 0
## banana 0 0 0 0 0 0 0
## blackgram 0 0 0 0 1 0 0
## chickpea 0 0 0 0 0 0 0
## coconut 0 0 0 0 0 0 0
## coffee 0 0 0 0 0 0 0
## cotton 0 0 1 0 0 0 0
## grapes 0 0 0 0 0 0 0
## jute 0 0 0 0 0 0 0
## kidneybeans 27 0 0 0 0 0 0
## lentil 0 29 0 0 3 0 0
## maize 0 0 23 0 0 0 0
## mango 0 0 0 25 0 0 0
## mothbeans 0 1 0 0 20 0 0
## mungbean 0 0 0 0 0 20 0
## muskmelon 0 0 0 0 0 0 25
## orange 0 0 0 0 0 0 0
## papaya 0 0 0 0 0 0 0
## pigeonpeas 0 0 0 0 0 0 0
## pomegranate 0 0 0 0 0 0 0
## rice 0 0 0 0 0 0 0
## watermelon 0 0 0 0 0 0 0
## Reference
## Prediction orange papaya pigeonpeas pomegranate rice watermelon
## apple 0 0 0 0 0 0
## banana 0 1 0 0 0 0
## blackgram 0 0 0 0 0 0
## chickpea 0 0 0 0 0 0
## coconut 0 0 0 0 0 0
## coffee 0 0 0 0 0 0
## cotton 0 0 0 0 0 0
## grapes 0 0 0 0 0 0
## jute 0 2 0 0 6 0
## kidneybeans 0 0 0 0 0 0
## lentil 0 0 0 0 0 0
## maize 0 1 0 0 0 0
## mango 0 0 0 0 0 0
## mothbeans 0 0 0 0 0 0
## mungbean 0 2 0 0 0 0
## muskmelon 0 0 0 0 0 0
## orange 28 0 0 0 0 0
## papaya 0 21 0 0 0 0
## pigeonpeas 0 0 29 0 0 0
## pomegranate 0 3 0 26 0 0
## rice 0 1 0 0 16 0
## watermelon 0 2 0 0 0 24
##
## Overall Statistics
##
## Accuracy : 0.9545
## 95% CI : (0.9336, 0.9704)
## No Information Rate : 0.06
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9524
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: apple Class: banana Class: blackgram
## Sensitivity 1.00000 1.00000 0.95455
## Specificity 1.00000 0.99811 0.99811
## Pos Pred Value 1.00000 0.95652 0.95455
## Neg Pred Value 1.00000 1.00000 0.99811
## Prevalence 0.04909 0.04000 0.04000
## Detection Rate 0.04909 0.04000 0.03818
## Detection Prevalence 0.04909 0.04182 0.04000
## Balanced Accuracy 1.00000 0.99905 0.97633
## Class: chickpea Class: coconut Class: coffee Class: cotton
## Sensitivity 1.00000 1.00000 1.00 1.00000
## Specificity 1.00000 1.00000 1.00 0.99809
## Pos Pred Value 1.00000 1.00000 1.00 0.96296
## Neg Pred Value 1.00000 1.00000 1.00 1.00000
## Prevalence 0.04364 0.04182 0.04 0.04727
## Detection Rate 0.04364 0.04182 0.04 0.04727
## Detection Prevalence 0.04364 0.04182 0.04 0.04909
## Balanced Accuracy 1.00000 1.00000 1.00 0.99905
## Class: grapes Class: jute Class: kidneybeans Class: lentil
## Sensitivity 1.00000 1.00000 1.00000 0.96667
## Specificity 1.00000 0.98485 1.00000 0.99423
## Pos Pred Value 1.00000 0.73333 1.00000 0.90625
## Neg Pred Value 1.00000 1.00000 1.00000 0.99807
## Prevalence 0.04545 0.04000 0.04909 0.05455
## Detection Rate 0.04545 0.04000 0.04909 0.05273
## Detection Prevalence 0.04545 0.05455 0.04909 0.05818
## Balanced Accuracy 1.00000 0.99242 1.00000 0.98045
## Class: maize Class: mango Class: mothbeans Class: mungbean
## Sensitivity 0.95833 1.00000 0.83333 1.00000
## Specificity 0.99810 1.00000 0.99620 0.99623
## Pos Pred Value 0.95833 1.00000 0.90909 0.90909
## Neg Pred Value 0.99810 1.00000 0.99242 1.00000
## Prevalence 0.04364 0.04545 0.04364 0.03636
## Detection Rate 0.04182 0.04545 0.03636 0.03636
## Detection Prevalence 0.04364 0.04545 0.04000 0.04000
## Balanced Accuracy 0.97822 1.00000 0.91477 0.99811
## Class: muskmelon Class: orange Class: papaya
## Sensitivity 1.00000 1.00000 0.63636
## Specificity 1.00000 1.00000 1.00000
## Pos Pred Value 1.00000 1.00000 1.00000
## Neg Pred Value 1.00000 1.00000 0.97732
## Prevalence 0.04545 0.05091 0.06000
## Detection Rate 0.04545 0.05091 0.03818
## Detection Prevalence 0.04545 0.05091 0.03818
## Balanced Accuracy 1.00000 1.00000 0.81818
## Class: pigeonpeas Class: pomegranate Class: rice
## Sensitivity 1.00000 1.00000 0.72727
## Specificity 1.00000 0.99427 0.99811
## Pos Pred Value 1.00000 0.89655 0.94118
## Neg Pred Value 1.00000 1.00000 0.98874
## Prevalence 0.05273 0.04727 0.04000
## Detection Rate 0.05273 0.04727 0.02909
## Detection Prevalence 0.05273 0.05273 0.03091
## Balanced Accuracy 1.00000 0.99714 0.86269
## Class: watermelon
## Sensitivity 1.00000
## Specificity 0.99620
## Pos Pred Value 0.92308
## Neg Pred Value 1.00000
## Prevalence 0.04364
## Detection Rate 0.04364
## Detection Prevalence 0.04727
## Balanced Accuracy 0.99810