# Project.knit

# Load necessary libraries
library(caret)  # For data preprocessing

## Loading required package: ggplot2

## Loading required package: lattice

library(dplyr)   # For data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) # For data visualization
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(rpart)
library(class)
library(e1071)

# Load the dataset
# The CSV location can be overridden with the CROP_DATA_PATH environment
# variable so the script is not tied to a single machine. When the variable
# is unset, the original hard-coded path is used unchanged.
data_path <- Sys.getenv(
  "CROP_DATA_PATH",
  unset = "C:\\Users\\acer\\Downloads\\archive (1)\\Crop_recommendation.csv"
)
# Fail early with a readable message instead of read.csv()'s generic error.
if (!file.exists(data_path)) {
  stop("Crop recommendation CSV not found: ", data_path, call. = FALSE)
}
data <- read.csv(data_path)

# 1. Overview of the dataset
str(data)   # Structure of the dataset

## 'data.frame':    2200 obs. of  8 variables:
##  $ N          : int  90 85 60 74 78 69 69 94 89 68 ...
##  $ P          : int  42 58 55 35 42 37 55 53 54 58 ...
##  $ K          : int  43 41 44 40 42 42 38 40 38 38 ...
##  $ temperature: num  20.9 21.8 23 26.5 20.1 ...
##  $ humidity   : num  82 80.3 82.3 80.2 81.6 ...
##  $ ph         : num  6.5 7.04 7.84 6.98 7.63 ...
##  $ rainfall   : num  203 227 264 243 263 ...
##  $ label      : chr  "rice" "rice" "rice" "rice" ...

summary(data) # Summary statistics

##        N                P                K           temperature    
##  Min.   :  0.00   Min.   :  5.00   Min.   :  5.00   Min.   : 8.826  
##  1st Qu.: 21.00   1st Qu.: 28.00   1st Qu.: 20.00   1st Qu.:22.769  
##  Median : 37.00   Median : 51.00   Median : 32.00   Median :25.599  
##  Mean   : 50.55   Mean   : 53.36   Mean   : 48.15   Mean   :25.616  
##  3rd Qu.: 84.25   3rd Qu.: 68.00   3rd Qu.: 49.00   3rd Qu.:28.562  
##  Max.   :140.00   Max.   :145.00   Max.   :205.00   Max.   :43.675  
##     humidity           ph           rainfall         label          
##  Min.   :14.26   Min.   :3.505   Min.   : 20.21   Length:2200       
##  1st Qu.:60.26   1st Qu.:5.972   1st Qu.: 64.55   Class :character  
##  Median :80.47   Median :6.425   Median : 94.87   Mode  :character  
##  Mean   :71.48   Mean   :6.469   Mean   :103.46                     
##  3rd Qu.:89.95   3rd Qu.:6.924   3rd Qu.:124.27                     
##  Max.   :99.98   Max.   :9.935   Max.   :298.56

# 2. Explore the target variable (Crop)
# The target is the `label` column; the top-level table() call auto-prints
# the number of rows for each distinct crop name.
table(data$label)  # Frequency of each crop

## 
##       apple      banana   blackgram    chickpea     coconut      coffee 
##         100         100         100         100         100         100 
##      cotton      grapes        jute kidneybeans      lentil       maize 
##         100         100         100         100         100         100 
##       mango   mothbeans    mungbean   muskmelon      orange      papaya 
##         100         100         100         100         100         100 
##  pigeonpeas pomegranate        rice  watermelon 
##         100         100         100         100

# 3. Numeric Feature Distributions
numeric_features <- c("N", "P", "K", "temperature", "humidity", "ph", "rainfall")

# Plot one histogram per numeric feature.
# A ggplot object is only drawn when it is printed, and auto-printing does not
# happen inside a for loop, so each plot is printed explicitly. The column is
# looked up through the `.data` pronoun instead of capturing `data[[feature]]`
# inside aes().
for (feature in numeric_features) {
  histogram <- ggplot(data, aes(x = .data[[feature]])) +
    geom_histogram(binwidth = 1, fill = "blue", color = "black") +
    labs(title = paste("Histogram of", feature), x = feature, y = "Frequency")
  print(histogram)
}

# 5. Correlation Analysis
# Pairwise Pearson correlations between the numeric feature columns.
cor_matrix <- cor(data[numeric_features], method = "pearson")
print(cor_matrix)

##                       N           P           K temperature     humidity
## N            1.00000000 -0.23145958 -0.14051184  0.02650380  0.190688379
## P           -0.23145958  1.00000000  0.73623222 -0.12754113 -0.118734116
## K           -0.14051184  0.73623222  1.00000000 -0.16038713  0.190858861
## temperature  0.02650380 -0.12754113 -0.16038713  1.00000000  0.205319677
## humidity     0.19068838 -0.11873412  0.19085886  0.20531968  1.000000000
## ph           0.09668285 -0.13801889 -0.16950310 -0.01779502 -0.008482539
## rainfall     0.05902022 -0.06383905 -0.05346135 -0.03008378  0.094423053
##                       ph    rainfall
## N            0.096682846  0.05902022
## P           -0.138018893 -0.06383905
## K           -0.169503098 -0.05346135
## temperature -0.017795017 -0.03008378
## humidity    -0.008482539  0.09442305
## ph           1.000000000 -0.10906948
## rainfall    -0.109069484  1.00000000

# 6. Outlier Detection
# Draw one box plot per numeric feature to make outliers visible.
# Each plot is printed explicitly because ggplot objects are not auto-printed
# inside a for loop. The fill is a real colour name: the previous empty string
# is not a valid colour and would fail as soon as the plot was rendered.
for (feature in numeric_features) {
  box_plot <- ggplot(data, aes(y = .data[[feature]])) +
    geom_boxplot(fill = "lightblue") +
    labs(title = paste("Box Plot of", feature), y = feature)
  print(box_plot)
}

# Columns shown in the scatterplot matrix; edit this vector to visualise a
# different set of attributes.
attributes_subset <- data[
  c("N", "P", "K", "temperature", "humidity", "ph", "rainfall")
]

# Scatterplot matrix of the selected columns
pairs(attributes_subset)

# Scatter plot of P against N, coloured by crop (`label`), with one panel per
# crop and independent axis scales in every panel.
ggplot(data, aes(N, P, colour = label)) +
  geom_point() +
  facet_wrap(vars(label), scales = "free")

# 7. Pairwise Scatterplots
# Scatterplot matrix over every column named in `numeric_features`.
scatter_data <- data[numeric_features]
pairs(scatter_data)

# 8. Missing Value Analysis
# Count the NA entries in each column of the dataset.
missing_values <- vapply(
  data,
  function(column_values) sum(is.na(column_values)),
  numeric(1)
)
print(missing_values)

##           N           P           K temperature    humidity          ph 
##           0           0           0           0           0           0 
##    rainfall       label 
##           0           0

# 1. Handling Missing Values (if any)
# Replace every NA in a numeric feature with that feature's mean.
# Assigning the (double) mean also turns the integer columns N, P and K into
# doubles, even when there is nothing to replace.
for (column in numeric_features) {
  column_mean <- mean(data[[column]], na.rm = TRUE)
  data[[column]][is.na(data[[column]])] <- column_mean
}

# Treat the crop name as a categorical target.
data[["label"]] <- factor(data[["label"]])

# Working copy used for modelling; show its first six rows.
data_df <- data
head(data_df, n = 6L)

##    N  P  K temperature humidity       ph rainfall label
## 1 90 42 43    20.87974 82.00274 6.502985 202.9355  rice
## 2 85 58 41    21.77046 80.31964 7.038096 226.6555  rice
## 3 60 55 44    23.00446 82.32076 7.840207 263.9642  rice
## 4 74 35 40    26.49110 80.15836 6.980401 242.8640  rice
## 5 78 42 42    20.13017 81.60487 7.628473 262.7173  rice
## 6 69 37 42    23.05805 83.37012 7.073454 251.0550  rice

# 4. Splitting the data into training and testing sets
# Uses the prepared data frame `data_df` (numeric features plus the factor
# `label`).

# Set a random seed for reproducibility
set.seed(123)

# Define the proportion of data for training (e.g., 75%)
train_proportion <- 0.75

# Calculate the number of samples for the training set
num_train_samples <- round(nrow(data_df) * train_proportion)

# Randomly select the row indices for the training set.
# seq_len() is used instead of 1:nrow(): it yields an empty sequence for a
# zero-row data frame, whereas 1:0 would be c(1, 0).
train_indices <- sample(seq_len(nrow(data_df)), num_train_samples)

# Create the training and testing sets: sampled rows train the models and
# all remaining rows are held out for evaluation.
data_train <- data_df[train_indices, ]
data_test <- data_df[-train_indices, ]

str(data_train)

## 'data.frame':    1650 obs. of  8 variables:
##  $ N          : num  36 71 40 26 8 40 119 14 21 118 ...
##  $ P          : num  56 60 5 32 133 132 72 41 29 88 ...
##  $ K          : num  20 22 29 32 195 202 55 17 12 52 ...
##  $ temperature: num  25.4 26.1 28.5 30.9 20.5 ...
##  $ humidity   : num  49.7 59.4 97.8 49.9 81 ...
##  $ ph         : num  7.44 6.2 5.82 6.81 6.46 ...
##  $ rainfall   : num  31.9 85.8 160.4 90.1 71.3 ...
##  $ label      : Factor w/ 22 levels "apple","banana",..: 14 12 5 13 8 8 2 15 17 2 ...

# Create the Random Forest model (100 trees).
# The stray trailing comma after `ntree = 100`, which passed an empty extra
# argument to randomForest(), has been removed.
# NOTE(review): P and K are left out of this formula although the SVM and
# decision-tree models in this script use them -- confirm this is intentional
# before comparing accuracies across models.
model_rf <- randomForest(
  label ~ N + temperature + humidity + ph + rainfall,
  data = data_train,
  ntree = 100
)

# Make predictions on the test data
predictions_rf <- predict(model_rf, data_test)

# Rows are predicted classes and columns are true classes, so the diagonal
# holds the correctly classified test rows.
confusion_matrix_rf <- table(predictions_rf, data_test$label)
accuracy_rf <- sum(diag(confusion_matrix_rf)) / sum(confusion_matrix_rf)
cat("RF Accuracy: ", accuracy_rf, "\n")

## RF Accuracy:  0.9727273

# Fit a support vector machine on all seven numeric features.
model_svm <- svm(
  label ~ N + P + K + temperature + humidity + ph + rainfall,
  data = data_train
)

# Predict the crop for every held-out row.
predictions_svm <- predict(model_svm, data_test)

# Cross-tabulate predicted against true labels; accuracy is the share of
# counts on the diagonal.
confusion_matrix_svm <- table(predictions_svm, data_test$label)
svm_correct <- sum(diag(confusion_matrix_svm))
svm_total <- sum(confusion_matrix_svm)
accuracy_svm <- svm_correct / svm_total
cat("SVM Accuracy: ", accuracy_svm, "\n")

## SVM Accuracy:  0.9818182

# Grow a classification tree on all seven numeric features.
tree_model <- rpart(
  label ~ N + P + K + temperature + humidity + ph + rainfall,
  data = data_train,
  method = "class"
)

# Predicted class label for each held-out row.
predictions <- predict(tree_model, newdata = data_test, type = "class")

# caret confusion matrix: predictions compared with the true test labels.
confusion_matrix <- confusionMatrix(
  data = predictions,
  reference = data_test$label
)

# Calculate metrics from the confusion matrix
accuracy <- confusion_matrix$overall[["Accuracy"]]

# With more than two classes caret stores `byClass` as a matrix (one row per
# class, one column per statistic). Indexing that matrix with a single name,
# e.g. byClass["Sensitivity"], yields NA, which is why precision, recall and
# F1 used to print as NA. Select the statistic's column and macro-average it
# over the classes instead. Classes with an undefined value (e.g. a class
# that was never predicted) are skipped by na.rm = TRUE. For a two-class
# problem `byClass` is a plain named vector and is read directly.
by_class <- confusion_matrix$byClass
macro_average <- function(metric) {
  if (is.matrix(by_class)) {
    mean(by_class[, metric], na.rm = TRUE)
  } else {
    by_class[[metric]]
  }
}
precision <- macro_average("Pos Pred Value")
recall <- macro_average("Sensitivity")
f1_score <- macro_average("F1")

# Print the metrics (precision, recall and F1 are macro-averages over classes)
cat("Accuracy: ", accuracy, "\n")

## Accuracy:  0.9545455

cat("Precision: ", precision, "\n")

cat("Recall: ", recall, "\n")

cat("F1 Score: ", f1_score, "\n")

# NOTE: the earlier knit printed NA for the three metrics above; re-knit the
# document to record their corrected values.

# Print the confusion matrix
# Prints the decision tree's caret confusionMatrix object: the
# prediction-by-reference table, the overall statistics and the
# per-class statistics.
print(confusion_matrix)

## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    apple banana blackgram chickpea coconut coffee cotton grapes jute
##   apple          27      0         0        0       0      0      0      0    0
##   banana          0     22         0        0       0      0      0      0    0
##   blackgram       0      0        21        0       0      0      0      0    0
##   chickpea        0      0         0       24       0      0      0      0    0
##   coconut         0      0         0        0      23      0      0      0    0
##   coffee          0      0         0        0       0     22      0      0    0
##   cotton          0      0         0        0       0      0     26      0    0
##   grapes          0      0         0        0       0      0      0     25    0
##   jute            0      0         0        0       0      0      0      0   22
##   kidneybeans     0      0         0        0       0      0      0      0    0
##   lentil          0      0         0        0       0      0      0      0    0
##   maize           0      0         0        0       0      0      0      0    0
##   mango           0      0         0        0       0      0      0      0    0
##   mothbeans       0      0         1        0       0      0      0      0    0
##   mungbean        0      0         0        0       0      0      0      0    0
##   muskmelon       0      0         0        0       0      0      0      0    0
##   orange          0      0         0        0       0      0      0      0    0
##   papaya          0      0         0        0       0      0      0      0    0
##   pigeonpeas      0      0         0        0       0      0      0      0    0
##   pomegranate     0      0         0        0       0      0      0      0    0
##   rice            0      0         0        0       0      0      0      0    0
##   watermelon      0      0         0        0       0      0      0      0    0
##              Reference
## Prediction    kidneybeans lentil maize mango mothbeans mungbean muskmelon
##   apple                 0      0     0     0         0        0         0
##   banana                0      0     0     0         0        0         0
##   blackgram             0      0     0     0         1        0         0
##   chickpea              0      0     0     0         0        0         0
##   coconut               0      0     0     0         0        0         0
##   coffee                0      0     0     0         0        0         0
##   cotton                0      0     1     0         0        0         0
##   grapes                0      0     0     0         0        0         0
##   jute                  0      0     0     0         0        0         0
##   kidneybeans          27      0     0     0         0        0         0
##   lentil                0     29     0     0         3        0         0
##   maize                 0      0    23     0         0        0         0
##   mango                 0      0     0    25         0        0         0
##   mothbeans             0      1     0     0        20        0         0
##   mungbean              0      0     0     0         0       20         0
##   muskmelon             0      0     0     0         0        0        25
##   orange                0      0     0     0         0        0         0
##   papaya                0      0     0     0         0        0         0
##   pigeonpeas            0      0     0     0         0        0         0
##   pomegranate           0      0     0     0         0        0         0
##   rice                  0      0     0     0         0        0         0
##   watermelon            0      0     0     0         0        0         0
##              Reference
## Prediction    orange papaya pigeonpeas pomegranate rice watermelon
##   apple            0      0          0           0    0          0
##   banana           0      1          0           0    0          0
##   blackgram        0      0          0           0    0          0
##   chickpea         0      0          0           0    0          0
##   coconut          0      0          0           0    0          0
##   coffee           0      0          0           0    0          0
##   cotton           0      0          0           0    0          0
##   grapes           0      0          0           0    0          0
##   jute             0      2          0           0    6          0
##   kidneybeans      0      0          0           0    0          0
##   lentil           0      0          0           0    0          0
##   maize            0      1          0           0    0          0
##   mango            0      0          0           0    0          0
##   mothbeans        0      0          0           0    0          0
##   mungbean         0      2          0           0    0          0
##   muskmelon        0      0          0           0    0          0
##   orange          28      0          0           0    0          0
##   papaya           0     21          0           0    0          0
##   pigeonpeas       0      0         29           0    0          0
##   pomegranate      0      3          0          26    0          0
##   rice             0      1          0           0   16          0
##   watermelon       0      2          0           0    0         24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9545          
##                  95% CI : (0.9336, 0.9704)
##     No Information Rate : 0.06            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9524          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: apple Class: banana Class: blackgram
## Sensitivity               1.00000       1.00000          0.95455
## Specificity               1.00000       0.99811          0.99811
## Pos Pred Value            1.00000       0.95652          0.95455
## Neg Pred Value            1.00000       1.00000          0.99811
## Prevalence                0.04909       0.04000          0.04000
## Detection Rate            0.04909       0.04000          0.03818
## Detection Prevalence      0.04909       0.04182          0.04000
## Balanced Accuracy         1.00000       0.99905          0.97633
##                      Class: chickpea Class: coconut Class: coffee Class: cotton
## Sensitivity                  1.00000        1.00000          1.00       1.00000
## Specificity                  1.00000        1.00000          1.00       0.99809
## Pos Pred Value               1.00000        1.00000          1.00       0.96296
## Neg Pred Value               1.00000        1.00000          1.00       1.00000
## Prevalence                   0.04364        0.04182          0.04       0.04727
## Detection Rate               0.04364        0.04182          0.04       0.04727
## Detection Prevalence         0.04364        0.04182          0.04       0.04909
## Balanced Accuracy            1.00000        1.00000          1.00       0.99905
##                      Class: grapes Class: jute Class: kidneybeans Class: lentil
## Sensitivity                1.00000     1.00000            1.00000       0.96667
## Specificity                1.00000     0.98485            1.00000       0.99423
## Pos Pred Value             1.00000     0.73333            1.00000       0.90625
## Neg Pred Value             1.00000     1.00000            1.00000       0.99807
## Prevalence                 0.04545     0.04000            0.04909       0.05455
## Detection Rate             0.04545     0.04000            0.04909       0.05273
## Detection Prevalence       0.04545     0.05455            0.04909       0.05818
## Balanced Accuracy          1.00000     0.99242            1.00000       0.98045
##                      Class: maize Class: mango Class: mothbeans Class: mungbean
## Sensitivity               0.95833      1.00000          0.83333         1.00000
## Specificity               0.99810      1.00000          0.99620         0.99623
## Pos Pred Value            0.95833      1.00000          0.90909         0.90909
## Neg Pred Value            0.99810      1.00000          0.99242         1.00000
## Prevalence                0.04364      0.04545          0.04364         0.03636
## Detection Rate            0.04182      0.04545          0.03636         0.03636
## Detection Prevalence      0.04364      0.04545          0.04000         0.04000
## Balanced Accuracy         0.97822      1.00000          0.91477         0.99811
##                      Class: muskmelon Class: orange Class: papaya
## Sensitivity                   1.00000       1.00000       0.63636
## Specificity                   1.00000       1.00000       1.00000
## Pos Pred Value                1.00000       1.00000       1.00000
## Neg Pred Value                1.00000       1.00000       0.97732
## Prevalence                    0.04545       0.05091       0.06000
## Detection Rate                0.04545       0.05091       0.03818
## Detection Prevalence          0.04545       0.05091       0.03818
## Balanced Accuracy             1.00000       1.00000       0.81818
##                      Class: pigeonpeas Class: pomegranate Class: rice
## Sensitivity                    1.00000            1.00000     0.72727
## Specificity                    1.00000            0.99427     0.99811
## Pos Pred Value                 1.00000            0.89655     0.94118
## Neg Pred Value                 1.00000            1.00000     0.98874
## Prevalence                     0.05273            0.04727     0.04000
## Detection Rate                 0.05273            0.04727     0.02909
## Detection Prevalence           0.05273            0.05273     0.03091
## Balanced Accuracy              1.00000            0.99714     0.86269
##                      Class: watermelon
## Sensitivity                    1.00000
## Specificity                    0.99620
## Pos Pred Value                 0.92308
## Neg Pred Value                 1.00000
## Prevalence                     0.04364
## Detection Rate                 0.04364
## Detection Prevalence           0.04727
## Balanced Accuracy              0.99810