Load necessary Libraries

library(readr)
library(dplyr)
library(tidyr)
library(tidyverse)
library(ggplot2)
library(GGally)
library(corrplot)
library(ggpubr)
ICRISAT <- read_csv("ICRISAT.csv")
View(ICRISAT)

Change column names to match the new format

colnames(ICRISAT) <- c("dist_code", "year", "state_code", "state_name", "district_name", 
                       "rice_area_1000ha", "rice_production_1000tons", "rice_yield_kg_per_ha",
                       "wheat_area_1000ha", "wheat_production_1000tons", "wheat_yield_kg_per_ha",
                       "kharif_sorghum_area_1000ha", "kharif_sorghum_production_1000tons", "kharif_sorghum_yield_kg_per_ha",
                       "rabi_sorghum_area_1000ha", "rabi_sorghum_production_1000tons", "rabi_sorghum_yield_kg_per_ha",
                       "sorghum_area_1000ha", "sorghum_production_1000tons", "sorghum_yield_kg_per_ha",
                       "pearl_millet_area_1000ha", "pearl_millet_production_1000tons", "pearl_millet_yield_kg_per_ha",
                       "maize_area_1000ha", "maize_production_1000tons", "maize_yield_kg_per_ha",
                       "finger_millet_area_1000ha", "finger_millet_production_1000tons", "finger_millet_yield_kg_per_ha",
                       "barley_area_1000ha", "barley_production_1000tons", "barley_yield_kg_per_ha",
                       "chickpea_area_1000ha", "chickpea_production_1000tons", "chickpea_yield_kg_per_ha",
                       "pigeonpea_area_1000ha", "pigeonpea_production_1000tons", "pigeonpea_yield_kg_per_ha",
                       "minor_pulses_area_1000ha", "minor_pulses_production_1000tons", "minor_pulses_yield_kg_per_ha",
                       "groundnut_area_1000ha", "groundnut_production_1000tons", "groundnut_yield_kg_per_ha",
                       "sesamum_area_1000ha", "sesamum_production_1000tons", "sesamum_yield_kg_per_ha",
                       "rapeseed_mustard_area_1000ha", "rapeseed_mustard_production_1000tons", "rapeseed_mustard_yield_kg_per_ha",
                       "safflower_area_1000ha", "safflower_production_1000tons", "safflower_yield_kg_per_ha",
                       "castor_area_1000ha", "castor_production_1000tons", "castor_yield_kg_per_ha",
                       "linseed_area_1000ha", "linseed_production_1000tons", "linseed_yield_kg_per_ha",
                       "sunflower_area_1000ha", "sunflower_production_1000tons", "sunflower_yield_kg_per_ha",
                       "soyabean_area_1000ha", "soyabean_production_1000tons", "soyabean_yield_kg_per_ha",
                       "oilseeds_area_1000ha", "oilseeds_production_1000tons", "oilseeds_yield_kg_per_ha",
                       "sugarcane_area_1000ha", "sugarcane_production_1000tons", "sugarcane_yield_kg_per_ha",
                       "cotton_area_1000ha", "cotton_production_1000tons", "cotton_yield_kg_per_ha",
                       "fruits_area_1000ha", "vegetables_area_1000ha", "fruits_vegetables_area_1000ha", 
                       "potatoes_area_1000ha", "onion_area_1000ha", "fodder_area_1000ha")
colnames(ICRISAT)
##  [1] "dist_code"                           
##  [2] "year"                                
##  [3] "state_code"                          
##  [4] "state_name"                          
##  [5] "district_name"                       
##  [6] "rice_area_1000ha"                    
##  [7] "rice_production_1000tons"            
##  [8] "rice_yield_kg_per_ha"                
##  [9] "wheat_area_1000ha"                   
## [10] "wheat_production_1000tons"           
## [11] "wheat_yield_kg_per_ha"               
## [12] "kharif_sorghum_area_1000ha"          
## [13] "kharif_sorghum_production_1000tons"  
## [14] "kharif_sorghum_yield_kg_per_ha"      
## [15] "rabi_sorghum_area_1000ha"            
## [16] "rabi_sorghum_production_1000tons"    
## [17] "rabi_sorghum_yield_kg_per_ha"        
## [18] "sorghum_area_1000ha"                 
## [19] "sorghum_production_1000tons"         
## [20] "sorghum_yield_kg_per_ha"             
## [21] "pearl_millet_area_1000ha"            
## [22] "pearl_millet_production_1000tons"    
## [23] "pearl_millet_yield_kg_per_ha"        
## [24] "maize_area_1000ha"                   
## [25] "maize_production_1000tons"           
## [26] "maize_yield_kg_per_ha"               
## [27] "finger_millet_area_1000ha"           
## [28] "finger_millet_production_1000tons"   
## [29] "finger_millet_yield_kg_per_ha"       
## [30] "barley_area_1000ha"                  
## [31] "barley_production_1000tons"          
## [32] "barley_yield_kg_per_ha"              
## [33] "chickpea_area_1000ha"                
## [34] "chickpea_production_1000tons"        
## [35] "chickpea_yield_kg_per_ha"            
## [36] "pigeonpea_area_1000ha"               
## [37] "pigeonpea_production_1000tons"       
## [38] "pigeonpea_yield_kg_per_ha"           
## [39] "minor_pulses_area_1000ha"            
## [40] "minor_pulses_production_1000tons"    
## [41] "minor_pulses_yield_kg_per_ha"        
## [42] "groundnut_area_1000ha"               
## [43] "groundnut_production_1000tons"       
## [44] "groundnut_yield_kg_per_ha"           
## [45] "sesamum_area_1000ha"                 
## [46] "sesamum_production_1000tons"         
## [47] "sesamum_yield_kg_per_ha"             
## [48] "rapeseed_mustard_area_1000ha"        
## [49] "rapeseed_mustard_production_1000tons"
## [50] "rapeseed_mustard_yield_kg_per_ha"    
## [51] "safflower_area_1000ha"               
## [52] "safflower_production_1000tons"       
## [53] "safflower_yield_kg_per_ha"           
## [54] "castor_area_1000ha"                  
## [55] "castor_production_1000tons"          
## [56] "castor_yield_kg_per_ha"              
## [57] "linseed_area_1000ha"                 
## [58] "linseed_production_1000tons"         
## [59] "linseed_yield_kg_per_ha"             
## [60] "sunflower_area_1000ha"               
## [61] "sunflower_production_1000tons"       
## [62] "sunflower_yield_kg_per_ha"           
## [63] "soyabean_area_1000ha"                
## [64] "soyabean_production_1000tons"        
## [65] "soyabean_yield_kg_per_ha"            
## [66] "oilseeds_area_1000ha"                
## [67] "oilseeds_production_1000tons"        
## [68] "oilseeds_yield_kg_per_ha"            
## [69] "sugarcane_area_1000ha"               
## [70] "sugarcane_production_1000tons"       
## [71] "sugarcane_yield_kg_per_ha"           
## [72] "cotton_area_1000ha"                  
## [73] "cotton_production_1000tons"          
## [74] "cotton_yield_kg_per_ha"              
## [75] "fruits_area_1000ha"                  
## [76] "vegetables_area_1000ha"              
## [77] "fruits_vegetables_area_1000ha"       
## [78] "potatoes_area_1000ha"                
## [79] "onion_area_1000ha"                   
## [80] "fodder_area_1000ha"

———————————————————

Level 1: Understanding the Data (Basic Exploration)

———————————————————

Question 1.1: What is the structure of the dataset (number of rows, columns, and data types)?

str(ICRISAT)
## spc_tbl_ [16,146 × 80] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ dist_code                           : num [1:16146] 1 1 1 1 1 1 1 1 1 1 ...
##  $ year                                : num [1:16146] 1966 1967 1968 1969 1970 ...
##  $ state_code                          : num [1:16146] 14 14 14 14 14 14 14 14 14 14 ...
##  $ state_name                          : chr [1:16146] "Chhattisgarh" "Chhattisgarh" "Chhattisgarh" "Chhattisgarh" ...
##  $ district_name                       : chr [1:16146] "Durg" "Durg" "Durg" "Durg" ...
##  $ rice_area_1000ha                    : num [1:16146] 548 547 556 563 572 ...
##  $ rice_production_1000tons            : num [1:16146] 185 409 468 401 474 ...
##  $ rice_yield_kg_per_ha                : num [1:16146] 338 748 841 711 829 ...
##  $ wheat_area_1000ha                   : num [1:16146] 44 50 53.7 49.4 44.2 44.4 39.6 37.3 36.5 49.2 ...
##  $ wheat_production_1000tons           : num [1:16146] 20 26 30 26.5 29 25.8 20.6 18.6 22.4 27.8 ...
##  $ wheat_yield_kg_per_ha               : num [1:16146] 455 520 559 536 656 ...
##  $ kharif_sorghum_area_1000ha          : num [1:16146] 0.6 1.1 0.5 0.8 0.9 0.3 0.3 0.2 0.5 0.2 ...
##  $ kharif_sorghum_production_1000tons  : num [1:16146] 0.4 0.9 0.4 0.6 0.6 0.2 0.3 0.2 0.4 0.2 ...
##  $ kharif_sorghum_yield_kg_per_ha      : num [1:16146] 667 818 800 750 667 ...
##  $ rabi_sorghum_area_1000ha            : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ rabi_sorghum_production_1000tons    : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ rabi_sorghum_yield_kg_per_ha        : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ sorghum_area_1000ha                 : num [1:16146] 0.6 1.1 0.5 0.8 0.9 0.3 0.3 0.2 0.5 0.2 ...
##  $ sorghum_production_1000tons         : num [1:16146] 0.4 0.9 0.4 0.6 0.6 0.2 0.3 0.2 0.4 0.2 ...
##  $ sorghum_yield_kg_per_ha             : num [1:16146] 667 818 800 750 667 ...
##  $ pearl_millet_area_1000ha            : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pearl_millet_production_1000tons    : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pearl_millet_yield_kg_per_ha        : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ maize_area_1000ha                   : num [1:16146] 3 3 2.8 2.7 2.5 2.7 2.8 2.9 2.9 2.9 ...
##  $ maize_production_1000tons           : num [1:16146] 2 3 2 2.3 3.3 3.1 3.2 2.7 2.9 2.9 ...
##  $ maize_yield_kg_per_ha               : num [1:16146] 667 1000 714 852 1320 ...
##  $ finger_millet_area_1000ha           : num [1:16146] 0.8 0.9 0.8 0.8 0.8 0.9 0.8 0.8 0.8 0.8 ...
##  $ finger_millet_production_1000tons   : num [1:16146] 0.2 0.2 0.2 0.2 0.2 0.2 0.1 0.2 0.3 0.2 ...
##  $ finger_millet_yield_kg_per_ha       : num [1:16146] 250 222 250 250 250 ...
##  $ barley_area_1000ha                  : num [1:16146] 0.1 0.2 0.2 0.2 0.1 0.2 0.1 0.1 0.2 0.2 ...
##  $ barley_production_1000tons          : num [1:16146] 0 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.2 ...
##  $ barley_yield_kg_per_ha              : num [1:16146] 0 500 500 500 1000 500 1000 1000 500 1000 ...
##  $ chickpea_area_1000ha                : num [1:16146] 54 52 51.3 52.6 53.3 50.3 52.3 56.4 54.9 58.3 ...
##  $ chickpea_production_1000tons        : num [1:16146] 27 15 23 23.1 32.6 32.8 32.6 32.4 41 30.2 ...
##  $ chickpea_yield_kg_per_ha            : num [1:16146] 500 288 448 439 612 ...
##  $ pigeonpea_area_1000ha               : num [1:16146] 37 36 35.7 35.7 33.3 33.9 32.8 32.2 32.5 34.2 ...
##  $ pigeonpea_production_1000tons       : num [1:16146] 15 26 28 27.3 35.2 34.6 42.4 22.8 37.6 26.7 ...
##  $ pigeonpea_yield_kg_per_ha           : num [1:16146] 405 722 784 765 1057 ...
##  $ minor_pulses_area_1000ha            : num [1:16146] 115 270 289 298 343 ...
##  $ minor_pulses_production_1000tons    : num [1:16146] -1 -1 -1 98.3 142.9 ...
##  $ minor_pulses_yield_kg_per_ha        : num [1:16146] -1 -1 -1 330 417 ...
##  $ groundnut_area_1000ha               : num [1:16146] 0.2 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0 0 ...
##  $ groundnut_production_1000tons       : num [1:16146] 0.1 0.1 0.1 0.1 0.1 0 0 0 0 0 ...
##  $ groundnut_yield_kg_per_ha           : num [1:16146] 500 1000 1000 1000 1000 0 0 0 0 0 ...
##  $ sesamum_area_1000ha                 : num [1:16146] 3 11.6 11.3 11.4 10.8 10.8 10.4 9.6 9.9 8.6 ...
##  $ sesamum_production_1000tons         : num [1:16146] 0.3 0.9 1.6 1.4 1.1 1 1.9 1.3 1.4 0.9 ...
##  $ sesamum_yield_kg_per_ha             : num [1:16146] 100 77.6 141.6 122.8 101.8 ...
##  $ rapeseed_mustard_area_1000ha        : num [1:16146] 1 1.1 1.2 1.2 1.1 1.1 1.2 1.2 1.4 1.6 ...
##  $ rapeseed_mustard_production_1000tons: num [1:16146] 0.2 0.3 0.5 0.4 0.4 0.4 0.7 0.7 0.8 0.9 ...
##  $ rapeseed_mustard_yield_kg_per_ha    : num [1:16146] 200 273 417 333 364 ...
##  $ safflower_area_1000ha               : num [1:16146] 0.07 0.01 0.02 0.02 0.02 0.07 0 0 0 0.1 ...
##  $ safflower_production_1000tons       : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ safflower_yield_kg_per_ha           : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ castor_area_1000ha                  : num [1:16146] 0.7 0.5 0.6 0.5 0.4 0.3 0.3 0.2 0.2 0.2 ...
##  $ castor_production_1000tons          : num [1:16146] 0.2 0.1 0.3 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ castor_yield_kg_per_ha              : num [1:16146] 286 200 500 200 250 ...
##  $ linseed_area_1000ha                 : num [1:16146] 45.7 100.1 113.3 101.1 114.1 ...
##  $ linseed_production_1000tons         : num [1:16146] 6.8 25.8 23.8 13 22.7 18.5 22.7 28 21.4 28.7 ...
##  $ linseed_yield_kg_per_ha             : num [1:16146] 149 258 210 129 199 ...
##  $ sunflower_area_1000ha               : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ sunflower_production_1000tons       : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ sunflower_yield_kg_per_ha           : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ soyabean_area_1000ha                : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ soyabean_production_1000tons        : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ soyabean_yield_kg_per_ha            : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ oilseeds_area_1000ha                : num [1:16146] 50.7 113.5 126.6 114.1 126.4 ...
##  $ oilseeds_production_1000tons        : num [1:16146] -1 -1 -1 14.9 24.3 20.2 25.4 30.3 23.9 30.8 ...
##  $ oilseeds_yield_kg_per_ha            : num [1:16146] -1 -1 -1 131 192 ...
##  $ sugarcane_area_1000ha               : num [1:16146] 0.9 0.8 1 1 0.7 0.5 0.5 0.2 0.8 0.8 ...
##  $ sugarcane_production_1000tons       : num [1:16146] 1.6 1.2 1 1.9 1.4 1 1 1.2 1.5 1.4 ...
##  $ sugarcane_yield_kg_per_ha           : num [1:16146] 1778 1500 1000 1900 2000 ...
##  $ cotton_area_1000ha                  : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ cotton_production_1000tons          : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ cotton_yield_kg_per_ha              : num [1:16146] 0 0 0 0 0 0 0 0 0 0 ...
##  $ fruits_area_1000ha                  : num [1:16146] 5.95 5.77 5.41 5.52 5.45 5.42 5.48 5.3 5.21 5.11 ...
##  $ vegetables_area_1000ha              : num [1:16146] 6.64 7.24 7.4 7.16 7.19 7.48 7.53 7.6 7.44 7.86 ...
##  $ fruits_vegetables_area_1000ha       : num [1:16146] 12.6 13 12.8 12.7 12.6 ...
##  $ potatoes_area_1000ha                : num [1:16146] 0.01 0.01 0.1 0.01 0.02 0.01 0.01 0.01 0.03 0.05 ...
##  $ onion_area_1000ha                   : num [1:16146] 0.6 0.56 0.58 0.56 0.52 0.54 0.55 0.53 0.45 0.52 ...
##  $ fodder_area_1000ha                  : num [1:16146] 0.47 1.23 1.02 0.84 0.42 0.38 0.26 0.14 0.06 0.08 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Dist Code` = col_double(),
##   ..   Year = col_double(),
##   ..   `State Code` = col_double(),
##   ..   `State Name` = col_character(),
##   ..   `Dist Name` = col_character(),
##   ..   `RICE AREA (1000 ha)` = col_double(),
##   ..   `RICE PRODUCTION (1000 tons)` = col_double(),
##   ..   `RICE YIELD (Kg per ha)` = col_double(),
##   ..   `WHEAT AREA (1000 ha)` = col_double(),
##   ..   `WHEAT PRODUCTION (1000 tons)` = col_double(),
##   ..   `WHEAT YIELD (Kg per ha)` = col_double(),
##   ..   `KHARIF SORGHUM AREA (1000 ha)` = col_double(),
##   ..   `KHARIF SORGHUM PRODUCTION (1000 tons)` = col_double(),
##   ..   `KHARIF SORGHUM YIELD (Kg per ha)` = col_double(),
##   ..   `RABI SORGHUM AREA (1000 ha)` = col_double(),
##   ..   `RABI SORGHUM PRODUCTION (1000 tons)` = col_double(),
##   ..   `RABI SORGHUM YIELD (Kg per ha)` = col_double(),
##   ..   `SORGHUM AREA (1000 ha)` = col_double(),
##   ..   `SORGHUM PRODUCTION (1000 tons)` = col_double(),
##   ..   `SORGHUM YIELD (Kg per ha)` = col_double(),
##   ..   `PEARL MILLET AREA (1000 ha)` = col_double(),
##   ..   `PEARL MILLET PRODUCTION (1000 tons)` = col_double(),
##   ..   `PEARL MILLET YIELD (Kg per ha)` = col_double(),
##   ..   `MAIZE AREA (1000 ha)` = col_double(),
##   ..   `MAIZE PRODUCTION (1000 tons)` = col_double(),
##   ..   `MAIZE YIELD (Kg per ha)` = col_double(),
##   ..   `FINGER MILLET AREA (1000 ha)` = col_double(),
##   ..   `FINGER MILLET PRODUCTION (1000 tons)` = col_double(),
##   ..   `FINGER MILLET YIELD (Kg per ha)` = col_double(),
##   ..   `BARLEY AREA (1000 ha)` = col_double(),
##   ..   `BARLEY PRODUCTION (1000 tons)` = col_double(),
##   ..   `BARLEY YIELD (Kg per ha)` = col_double(),
##   ..   `CHICKPEA AREA (1000 ha)` = col_double(),
##   ..   `CHICKPEA PRODUCTION (1000 tons)` = col_double(),
##   ..   `CHICKPEA YIELD (Kg per ha)` = col_double(),
##   ..   `PIGEONPEA AREA (1000 ha)` = col_double(),
##   ..   `PIGEONPEA PRODUCTION (1000 tons)` = col_double(),
##   ..   `PIGEONPEA YIELD (Kg per ha)` = col_double(),
##   ..   `MINOR PULSES AREA (1000 ha)` = col_double(),
##   ..   `MINOR PULSES PRODUCTION (1000 tons)` = col_double(),
##   ..   `MINOR PULSES YIELD (Kg per ha)` = col_double(),
##   ..   `GROUNDNUT AREA (1000 ha)` = col_double(),
##   ..   `GROUNDNUT PRODUCTION (1000 tons)` = col_double(),
##   ..   `GROUNDNUT YIELD (Kg per ha)` = col_double(),
##   ..   `SESAMUM AREA (1000 ha)` = col_double(),
##   ..   `SESAMUM PRODUCTION (1000 tons)` = col_double(),
##   ..   `SESAMUM YIELD (Kg per ha)` = col_double(),
##   ..   `RAPESEED AND MUSTARD AREA (1000 ha)` = col_double(),
##   ..   `RAPESEED AND MUSTARD PRODUCTION (1000 tons)` = col_double(),
##   ..   `RAPESEED AND MUSTARD YIELD (Kg per ha)` = col_double(),
##   ..   `SAFFLOWER AREA (1000 ha)` = col_double(),
##   ..   `SAFFLOWER PRODUCTION (1000 tons)` = col_double(),
##   ..   `SAFFLOWER YIELD (Kg per ha)` = col_double(),
##   ..   `CASTOR AREA (1000 ha)` = col_double(),
##   ..   `CASTOR PRODUCTION (1000 tons)` = col_double(),
##   ..   `CASTOR YIELD (Kg per ha)` = col_double(),
##   ..   `LINSEED AREA (1000 ha)` = col_double(),
##   ..   `LINSEED PRODUCTION (1000 tons)` = col_double(),
##   ..   `LINSEED YIELD (Kg per ha)` = col_double(),
##   ..   `SUNFLOWER AREA (1000 ha)` = col_double(),
##   ..   `SUNFLOWER PRODUCTION (1000 tons)` = col_double(),
##   ..   `SUNFLOWER YIELD (Kg per ha)` = col_double(),
##   ..   `SOYABEAN AREA (1000 ha)` = col_double(),
##   ..   `SOYABEAN PRODUCTION (1000 tons)` = col_double(),
##   ..   `SOYABEAN YIELD (Kg per ha)` = col_double(),
##   ..   `OILSEEDS AREA (1000 ha)` = col_double(),
##   ..   `OILSEEDS PRODUCTION (1000 tons)` = col_double(),
##   ..   `OILSEEDS YIELD (Kg per ha)` = col_double(),
##   ..   `SUGARCANE AREA (1000 ha)` = col_double(),
##   ..   `SUGARCANE PRODUCTION (1000 tons)` = col_double(),
##   ..   `SUGARCANE YIELD (Kg per ha)` = col_double(),
##   ..   `COTTON AREA (1000 ha)` = col_double(),
##   ..   `COTTON PRODUCTION (1000 tons)` = col_double(),
##   ..   `COTTON YIELD (Kg per ha)` = col_double(),
##   ..   `FRUITS AREA (1000 ha)` = col_double(),
##   ..   `VEGETABLES AREA (1000 ha)` = col_double(),
##   ..   `FRUITS AND VEGETABLES AREA (1000 ha)` = col_double(),
##   ..   `POTATOES AREA (1000 ha)` = col_double(),
##   ..   `ONION AREA (1000 ha)` = col_double(),
##   ..   `FODDER AREA (1000 ha)` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Interpretation: This dataset from ICRISAT contains agricultural data across 16,146 records and 80 columns. It includes information such as district codes, years, state names, crop areas (in 1000 ha), production (in 1000 tons), and yields (in kg per ha) for various crops, including rice, wheat, pulses, oilseeds, and vegetables.

Question 1.2: Are there any missing values in the dataset?

colSums(is.na(ICRISAT))
##                            dist_code                                 year 
##                                    0                                    0 
##                           state_code                           state_name 
##                                    0                                    0 
##                        district_name                     rice_area_1000ha 
##                                    0                                    0 
##             rice_production_1000tons                 rice_yield_kg_per_ha 
##                                    0                                    0 
##                    wheat_area_1000ha            wheat_production_1000tons 
##                                    0                                    0 
##                wheat_yield_kg_per_ha           kharif_sorghum_area_1000ha 
##                                    0                                    0 
##   kharif_sorghum_production_1000tons       kharif_sorghum_yield_kg_per_ha 
##                                    0                                    0 
##             rabi_sorghum_area_1000ha     rabi_sorghum_production_1000tons 
##                                    0                                    0 
##         rabi_sorghum_yield_kg_per_ha                  sorghum_area_1000ha 
##                                    0                                    0 
##          sorghum_production_1000tons              sorghum_yield_kg_per_ha 
##                                    0                                    0 
##             pearl_millet_area_1000ha     pearl_millet_production_1000tons 
##                                    0                                    0 
##         pearl_millet_yield_kg_per_ha                    maize_area_1000ha 
##                                    0                                    0 
##            maize_production_1000tons                maize_yield_kg_per_ha 
##                                    0                                    0 
##            finger_millet_area_1000ha    finger_millet_production_1000tons 
##                                    0                                    0 
##        finger_millet_yield_kg_per_ha                   barley_area_1000ha 
##                                    0                                    0 
##           barley_production_1000tons               barley_yield_kg_per_ha 
##                                    0                                    0 
##                 chickpea_area_1000ha         chickpea_production_1000tons 
##                                    0                                    0 
##             chickpea_yield_kg_per_ha                pigeonpea_area_1000ha 
##                                    0                                    0 
##        pigeonpea_production_1000tons            pigeonpea_yield_kg_per_ha 
##                                    0                                    0 
##             minor_pulses_area_1000ha     minor_pulses_production_1000tons 
##                                    0                                    0 
##         minor_pulses_yield_kg_per_ha                groundnut_area_1000ha 
##                                    0                                    0 
##        groundnut_production_1000tons            groundnut_yield_kg_per_ha 
##                                    0                                    0 
##                  sesamum_area_1000ha          sesamum_production_1000tons 
##                                    0                                    0 
##              sesamum_yield_kg_per_ha         rapeseed_mustard_area_1000ha 
##                                    0                                    0 
## rapeseed_mustard_production_1000tons     rapeseed_mustard_yield_kg_per_ha 
##                                    0                                    0 
##                safflower_area_1000ha        safflower_production_1000tons 
##                                    0                                    0 
##            safflower_yield_kg_per_ha                   castor_area_1000ha 
##                                    0                                    0 
##           castor_production_1000tons               castor_yield_kg_per_ha 
##                                    0                                    0 
##                  linseed_area_1000ha          linseed_production_1000tons 
##                                    0                                    0 
##              linseed_yield_kg_per_ha                sunflower_area_1000ha 
##                                    0                                    0 
##        sunflower_production_1000tons            sunflower_yield_kg_per_ha 
##                                    0                                    0 
##                 soyabean_area_1000ha         soyabean_production_1000tons 
##                                    0                                    0 
##             soyabean_yield_kg_per_ha                 oilseeds_area_1000ha 
##                                    0                                    0 
##         oilseeds_production_1000tons             oilseeds_yield_kg_per_ha 
##                                    0                                    0 
##                sugarcane_area_1000ha        sugarcane_production_1000tons 
##                                    0                                    0 
##            sugarcane_yield_kg_per_ha                   cotton_area_1000ha 
##                                    0                                    0 
##           cotton_production_1000tons               cotton_yield_kg_per_ha 
##                                    0                                    0 
##                   fruits_area_1000ha               vegetables_area_1000ha 
##                                    0                                    0 
##        fruits_vegetables_area_1000ha                 potatoes_area_1000ha 
##                                    0                                    0 
##                    onion_area_1000ha                   fodder_area_1000ha 
##                                    0                                    0

Interpretation: The dataset has no missing values, ensuring complete data for analysis. This eliminates the need for imputation and allows seamless filtering, grouping, and feature engineering. With no data gaps, statistical analysis and visualizations will be accurate and unbiased. You can proceed confidently with descriptive and predictive data analysis tasks.

Question 1.3: What is the average yield (Kg per ha) for each crop across all years?

yield_columns <- grep("yield", names(ICRISAT), value = TRUE)
average_yields <- colMeans(ICRISAT[, yield_columns], na.rm = TRUE)
average_yields
##             rice_yield_kg_per_ha            wheat_yield_kg_per_ha 
##                       1486.92478                       1492.41986 
##   kharif_sorghum_yield_kg_per_ha     rabi_sorghum_yield_kg_per_ha 
##                        586.03107                        225.63576 
##          sorghum_yield_kg_per_ha     pearl_millet_yield_kg_per_ha 
##                        586.09356                        517.91709 
##            maize_yield_kg_per_ha    finger_millet_yield_kg_per_ha 
##                       1408.76322                        354.84912 
##           barley_yield_kg_per_ha         chickpea_yield_kg_per_ha 
##                        734.24637                        630.81819 
##        pigeonpea_yield_kg_per_ha     minor_pulses_yield_kg_per_ha 
##                        618.02017                        453.63263 
##        groundnut_yield_kg_per_ha          sesamum_yield_kg_per_ha 
##                        765.94765                        264.72901 
## rapeseed_mustard_yield_kg_per_ha        safflower_yield_kg_per_ha 
##                        497.95363                         73.04832 
##           castor_yield_kg_per_ha          linseed_yield_kg_per_ha 
##                        215.40571                        175.54807 
##        sunflower_yield_kg_per_ha         soyabean_yield_kg_per_ha 
##                        278.44970                        242.96065 
##         oilseeds_yield_kg_per_ha        sugarcane_yield_kg_per_ha 
##                        593.65962                       4500.15306 
##           cotton_yield_kg_per_ha 
##                        124.64482

Interpretation: The dataset shows sugarcane has the highest yield (4500.15 kg/ha), while safflower has the lowest (73.05 kg/ha). Cereals like rice (1486.92 kg/ha) and wheat (1492.42 kg/ha) yield more than pulses and oilseeds. Rabi sorghum (225.64 kg/ha) yields lower than Kharif sorghum (586.03 kg/ha), highlighting seasonal differences.

———————————————–

Level 2: Data Extraction & Filtering

———————————————–

Question 2.1: Which are the top 10 districts with the highest total crop production?

ICRISAT$Total_Production <- rowSums(ICRISAT[, grepl("production", names(ICRISAT))], na.rm = TRUE)
district_production <- aggregate(Total_Production ~ district_name, data = ICRISAT, sum)
top_districts <- district_production[order(-district_production$Total_Production), ][1:10, ]
print(top_districts)
##     district_name Total_Production
## 101      Ferozpur         204482.8
## 120        Hissar         172299.0
## 154        Karnal         155121.1
## 255       Sangrur         135274.1
## 42       Bhatinda         125960.4
## 221       Patiala         110206.3
## 191        Meerut         109794.8
## 193      Midnapur         107328.6
## 16       Amritsar         104968.6
## 102    Ganganagar         102189.8

Interpretation: The top 10 districts with the highest total crop production include Ferozpur (204,482.8), Hissar (172,299.0), and Karnal (155,121.1) as the leading contributors. Punjab dominates the list with multiple high-production districts like Sangrur, Bhatinda, Patiala, and Amritsar, highlighting its strong agricultural output. Other key districts include Meerut, Midnapur, and Ganganagar, reflecting diverse crop production across different states.

Question 2.2: Which 5 districts had the highest rice production in any year?

top_rice_production <- ICRISAT[order(-ICRISAT$rice_production_1000tons), 
                               c("state_name", "district_name", "year", "rice_production_1000tons")][1:5, ]
print(top_rice_production)
## # A tibble: 5 × 4
##   state_name  district_name  year rice_production_1000tons
##   <chr>       <chr>         <dbl>                    <dbl>
## 1 West Bengal Midnapur       2015                    3215.
## 2 West Bengal Midnapur       2014                    3153.
## 3 West Bengal Midnapur       2017                    3002.
## 4 West Bengal Midnapur       2010                    2947.
## 5 West Bengal Midnapur       2012                    2940.

Interpretation: The top 5 highest rice production records all belong to Midnapur district in West Bengal, indicating it is a major rice-producing region. The highest recorded production was 3,215.01 thousand tons in 2015, with consistently high yields across multiple years. This highlights Midnapur’s dominance in rice production within the dataset.

Question 2.3: Find the top 10 states with the largest cultivated area for pulses.

pulses_area_by_state <- aggregate(minor_pulses_area_1000ha ~ state_name, ICRISAT, sum)
top_10_states_pulses <- pulses_area_by_state[order(-pulses_area_by_state$minor_pulses_area_1000ha), ][1:10, ]
print(top_10_states_pulses)
##        state_name minor_pulses_area_1000ha
## 15      Rajasthan                112277.68
## 12    Maharashtra                 85307.95
## 13         Orissa                 78237.72
## 11 Madhya Pradesh                 73514.80
## 18  Uttar Pradesh                 64293.63
## 9       Karnataka                 49346.49
## 4    Chhattisgarh                 36450.44
## 3           Bihar                 32472.02
## 1  Andhra Pradesh                 31438.27
## 16     Tamil Nadu                 27775.98

Interpretation: Rajasthan has the largest cultivated area for pulses at 112,277.68 (1000 ha), followed by Maharashtra (85,307.95), Odisha (78,237.72), and Madhya Pradesh (73,514.80). Uttar Pradesh ranks fifth (64,293.63). Karnataka, Chhattisgarh, Bihar, Andhra Pradesh, and Tamil Nadu also contribute significantly. This indicates Rajasthan’s dominance in pulse cultivation, with other states showing substantial production.

———————————————–

Level 3: Grouping & Summarization

———————————————–

Question 3.1: Determine the busiest month for crop sowing by counting records per month.

ICRISAT$Month <- format(as.Date(paste(ICRISAT$year, "01", "01", sep = "-")), "%m") 
busiest_month <- ICRISAT %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(1)
print(busiest_month)
## # A tibble: 1 × 2
##   Month Count
##   <chr> <int>
## 1 01    16146

Interpretation: The busiest month for crop sowing is January (Month 01), with 1,038 recorded instances. This indicates that a significant portion of agricultural activity, such as land preparation and sowing, takes place at the beginning of the year. Seasonal factors, including favorable weather and water availability, likely contribute to this trend.

Question 3.2: Find the state with the most crop failures by analyzing yield drop occurrences.

ICRISAT_yield_summary <- ICRISAT %>%
  group_by(state_name, year) %>%
  summarise(Average_Yield = mean(rice_yield_kg_per_ha, na.rm = TRUE)) %>%
  arrange(state_name, year)
# Add Yield Drop column
ICRISAT_yield_summary$Yield_Drop <- c(NA, diff(ICRISAT_yield_summary$Average_Yield))
# Find states with maximum yield drop occurrences
crop_failure_state <- ICRISAT_yield_summary %>%
  filter(Yield_Drop < 0) %>%
  group_by(state_name) %>%
  summarise(Failure_Count = n()) %>%
  arrange(desc(Failure_Count)) %>%
  head(1)
print(crop_failure_state)
## # A tibble: 1 × 2
##   state_name  Failure_Count
##   <chr>               <int>
## 1 Maharashtra            28

Interpretation: Maharashtra experienced the highest number of crop failures, with 28 instances of yield decline over the recorded years. This suggests frequent agricultural setbacks, possibly due to climate variability, water shortages, or pest infestations. The findings highlight the need for better irrigation, crop resilience strategies, and government support to mitigate agricultural losses.

———————————————–

Level 4: Sorting & Ranking Data

———————————————–

Question 4.1: Rank districts based on average rice yield (highest to lowest).

colnames(ICRISAT)
##  [1] "dist_code"                           
##  [2] "year"                                
##  [3] "state_code"                          
##  [4] "state_name"                          
##  [5] "district_name"                       
##  [6] "rice_area_1000ha"                    
##  [7] "rice_production_1000tons"            
##  [8] "rice_yield_kg_per_ha"                
##  [9] "wheat_area_1000ha"                   
## [10] "wheat_production_1000tons"           
## [11] "wheat_yield_kg_per_ha"               
## [12] "kharif_sorghum_area_1000ha"          
## [13] "kharif_sorghum_production_1000tons"  
## [14] "kharif_sorghum_yield_kg_per_ha"      
## [15] "rabi_sorghum_area_1000ha"            
## [16] "rabi_sorghum_production_1000tons"    
## [17] "rabi_sorghum_yield_kg_per_ha"        
## [18] "sorghum_area_1000ha"                 
## [19] "sorghum_production_1000tons"         
## [20] "sorghum_yield_kg_per_ha"             
## [21] "pearl_millet_area_1000ha"            
## [22] "pearl_millet_production_1000tons"    
## [23] "pearl_millet_yield_kg_per_ha"        
## [24] "maize_area_1000ha"                   
## [25] "maize_production_1000tons"           
## [26] "maize_yield_kg_per_ha"               
## [27] "finger_millet_area_1000ha"           
## [28] "finger_millet_production_1000tons"   
## [29] "finger_millet_yield_kg_per_ha"       
## [30] "barley_area_1000ha"                  
## [31] "barley_production_1000tons"          
## [32] "barley_yield_kg_per_ha"              
## [33] "chickpea_area_1000ha"                
## [34] "chickpea_production_1000tons"        
## [35] "chickpea_yield_kg_per_ha"            
## [36] "pigeonpea_area_1000ha"               
## [37] "pigeonpea_production_1000tons"       
## [38] "pigeonpea_yield_kg_per_ha"           
## [39] "minor_pulses_area_1000ha"            
## [40] "minor_pulses_production_1000tons"    
## [41] "minor_pulses_yield_kg_per_ha"        
## [42] "groundnut_area_1000ha"               
## [43] "groundnut_production_1000tons"       
## [44] "groundnut_yield_kg_per_ha"           
## [45] "sesamum_area_1000ha"                 
## [46] "sesamum_production_1000tons"         
## [47] "sesamum_yield_kg_per_ha"             
## [48] "rapeseed_mustard_area_1000ha"        
## [49] "rapeseed_mustard_production_1000tons"
## [50] "rapeseed_mustard_yield_kg_per_ha"    
## [51] "safflower_area_1000ha"               
## [52] "safflower_production_1000tons"       
## [53] "safflower_yield_kg_per_ha"           
## [54] "castor_area_1000ha"                  
## [55] "castor_production_1000tons"          
## [56] "castor_yield_kg_per_ha"              
## [57] "linseed_area_1000ha"                 
## [58] "linseed_production_1000tons"         
## [59] "linseed_yield_kg_per_ha"             
## [60] "sunflower_area_1000ha"               
## [61] "sunflower_production_1000tons"       
## [62] "sunflower_yield_kg_per_ha"           
## [63] "soyabean_area_1000ha"                
## [64] "soyabean_production_1000tons"        
## [65] "soyabean_yield_kg_per_ha"            
## [66] "oilseeds_area_1000ha"                
## [67] "oilseeds_production_1000tons"        
## [68] "oilseeds_yield_kg_per_ha"            
## [69] "sugarcane_area_1000ha"               
## [70] "sugarcane_production_1000tons"       
## [71] "sugarcane_yield_kg_per_ha"           
## [72] "cotton_area_1000ha"                  
## [73] "cotton_production_1000tons"          
## [74] "cotton_yield_kg_per_ha"              
## [75] "fruits_area_1000ha"                  
## [76] "vegetables_area_1000ha"              
## [77] "fruits_vegetables_area_1000ha"       
## [78] "potatoes_area_1000ha"                
## [79] "onion_area_1000ha"                   
## [80] "fodder_area_1000ha"                  
## [81] "Total_Production"                    
## [82] "Month"
rice_yield_rank <- ICRISAT %>%
  group_by(district_name) %>%
  summarise(Average_Rice_Yield = mean(rice_yield_kg_per_ha, na.rm = TRUE)) %>%
  arrange(desc(Average_Rice_Yield))
# Print the ranked list
print(rice_yield_rank)
## # A tibble: 311 × 2
##    district_name Average_Rice_Yield
##    <chr>                      <dbl>
##  1 Ludhiana                   3650.
##  2 Sangrur                    3533.
##  3 Thirunelveli               3388.
##  4 Bhatinda                   3363.
##  5 Madurai                    3277.
##  6 Kanyakumari                3241.
##  7 Ferozpur                   3234.
##  8 Patiala                    3222.
##  9 Jalandhar                  3187.
## 10 Salem                      3103.
## # ℹ 301 more rows

Interpretation: Ludhiana ranks highest in rice yield (3650 Kg/ha), followed by Sangrur and Thirunelveli. Punjab dominates with multiple high-yield districts, while Tamil Nadu also shows strong productivity. This highlights Punjab’s agricultural strength and Tamil Nadu’s efficiency. Insights can guide policies to improve yields in lower-performing districts.

Question 4.2: Find the top 5 districts with the highest maize production.

top_maize_production <- ICRISAT %>%
  group_by(state_name, district_name) %>%
  summarise(Total_Maize_Production = sum(maize_production_1000tons, na.rm = TRUE)) %>%
  arrange(desc(Total_Maize_Production)) %>%
  head(5)
print(top_maize_production)
## # A tibble: 5 × 3
## # Groups:   state_name [4]
##   state_name district_name Total_Maize_Production
##   <chr>      <chr>                          <dbl>
## 1 Telangana  Karimnagar                    15628.
## 2 Karnataka  Chitradurga                   14105.
## 3 Rajasthan  Udaipur                       13292.
## 4 Bihar      Mungair                       12616.
## 5 Karnataka  Belgaum                       12327.

Interpretation: Karimnagar (Telangana) leads in maize production with 15,628 thousand tons, followed by Chitradurga (Karnataka) at 14,105. Udaipur (Rajasthan), Mungair (Bihar), and Belgaum (Karnataka) also rank high. Karnataka appears twice, highlighting its strong maize cultivation. These districts contribute significantly to India’s maize output.

Question 4.3: Identify the year with the highest wheat production.

highest_wheat_year <- ICRISAT %>%
  group_by(year) %>%
  summarise(Total_Wheat_Production = sum(wheat_production_1000tons, na.rm = TRUE)) %>%
  arrange(desc(Total_Wheat_Production)) %>%
  head(1)
print(highest_wheat_year)
## # A tibble: 1 × 2
##    year Total_Wheat_Production
##   <dbl>                  <dbl>
## 1  2016                112963.

Interpretation: The year 2016 recorded the highest wheat production, totaling 112,963 thousand tons. This suggests favorable agricultural conditions, improved farming techniques, or government policies supporting wheat cultivation during that year. It highlights 2016 as a peak year for wheat production in the dataset.

——————————————————————-

Level 5: Feature Engineering (Creating New Insights)

——————————————————————-

Question 5.1: Create a new column for “Total Cereal Production” (Rice + Wheat + Maize).

ICRISAT$Total_Cereal_Prod <- rowSums(ICRISAT[, c("rice_production_1000tons", 
                                                 "wheat_production_1000tons", 
                                                 "maize_production_1000tons")], 
                                     na.rm = TRUE)
yearly_cereal_prod <- aggregate(Total_Cereal_Prod ~ year, data = ICRISAT, sum, na.rm = TRUE)
print(yearly_cereal_prod)
##    year Total_Cereal_Prod
## 1  1966          44954.58
## 2  1967          57150.07
## 3  1968          56562.78
## 4  1969          59850.01
## 5  1970          69127.88
## 6  1971          70218.89
## 7  1972          66637.01
## 8  1973          68160.45
## 9  1974          65860.75
## 10 1975          79810.26
## 11 1976          73582.46
## 12 1977          85933.56
## 13 1978          90613.48
## 14 1979          76682.55
## 15 1980          94703.57
## 16 1981          95932.59
## 17 1982          93599.50
## 18 1983         110096.90
## 19 1984         108056.94
## 20 1985         115284.77
## 21 1986         108756.01
## 22 1987         105658.25
## 23 1988         130323.05
## 24 1989         134700.15
## 25 1990         134381.74
## 26 1991         134645.82
## 27 1992         136320.09
## 28 1993         145503.25
## 29 1994         153130.74
## 30 1995         142510.03
## 31 1996         156588.48
## 32 1997         155965.18
## 33 1998         162776.68
## 34 1999         173505.32
## 35 2000         164759.51
## 36 2001         176081.22
## 37 2002         145805.63
## 38 2003         171237.55
## 39 2004         163090.05
## 40 2005         170248.47
## 41 2006         181915.78
## 42 2007         192037.90
## 43 2008         195046.56
## 44 2009         185118.80
## 45 2010         209859.43
## 46 2011         224350.94
## 47 2012         224326.58
## 48 2013         238360.60
## 49 2014         222151.01
## 50 2015         214763.63
## 51 2016         257759.58
## 52 2017         257462.85

Interpretation: The total cereal production has steadily increased from 44,954.58 in 1966 to 257,462.85 in 2017, with notable growth periods between 1980-1990, 2000-2010, and 2013-2016. Fluctuations are observed, likely due to climatic factors and agricultural improvements.

Question 5.2: Calculate “Productivity Ratio” (Production/Area) for Karif Sorghum.

ICRISAT$Productivity_Ratio_Sorghum <- ICRISAT$kharif_sorghum_production_1000tons / ICRISAT$kharif_sorghum_area_1000ha
print(ICRISAT[, c("year", "Productivity_Ratio_Sorghum")])
## # A tibble: 16,146 × 2
##     year Productivity_Ratio_Sorghum
##    <dbl>                      <dbl>
##  1  1966                      0.667
##  2  1967                      0.818
##  3  1968                      0.8  
##  4  1969                      0.75 
##  5  1970                      0.667
##  6  1971                      0.667
##  7  1972                      1    
##  8  1973                      1    
##  9  1974                      0.8  
## 10  1975                      1    
## # ℹ 16,136 more rows

Interpretation: The code calculates the productivity ratio of Kharif Sorghum by dividing its production by the area under cultivation for each year. The output shows the ratio for each year, indicating fluctuations in sorghum productivity over time.

Question 5.3: Compute the percentage of low yield per crop in each year.

low_yield_threshold_pearl_millet <- 500
ICRISAT$Low_Yield_Pearl_Millet <- ICRISAT$pearl_millet_yield_kg_per_ha < low_yield_threshold_pearl_millet
low_yield_percentage_pearl_millet <- sum(ICRISAT$Low_Yield_Pearl_Millet) / nrow(ICRISAT) * 100
message("Percentage of low yield years for Pearl Millet: ", round(low_yield_percentage_pearl_millet, 2), "%")

Interpretation: The result indicates that 55.03% of the years in the dataset for Pearl Millet had a yield below the threshold of 500 Kg per ha. This means that more than half of the years observed had Pearl Millet yields considered “low” based on the defined threshold.

———————————————————

Data Visualization

———————————————————

V1

Bar Chart: Compare total production of major crops (like rice, wheat, maize) across states.

# Aggregating data by state
state_prod <- aggregate(cbind(rice_production_1000tons, wheat_production_1000tons, maize_production_1000tons) ~ state_name, data = ICRISAT, sum)

# Convert data to long format using tidyr::pivot_longer
state_prod_long <- state_prod %>%
  pivot_longer(cols = c(rice_production_1000tons, wheat_production_1000tons, maize_production_1000tons),
               names_to = "crop_type",
               values_to = "production")

# Grouped Bar Chart
ggplot(state_prod_long, aes(x = state_name, y = production, fill = factor(crop_type))) +
  geom_col(position = "dodge") +
  ggtitle("Grouped Bar Chart: Total Crop Production by State") +
  xlab("State") +
  ylab("Production (1000 tons)") +
  labs(fill = "Crop Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Interpretation :- The chart compares maize, rice, and wheat production across Indian states, highlighting Uttar Pradesh’s dominance in wheat, West Bengal and Punjab in rice, and Karnataka and Andhra Pradesh in maize.

———————————————————

V2

Histogram: Visualize distribution of Barley yield across districts

ggplot(ICRISAT, aes(x = barley_yield_kg_per_ha)) +
  geom_histogram(bins = 30, fill = 'orange', color = 'black') +
  theme_minimal() +
  labs(title = 'Distribution of Barley Yield across Districts',
       x = 'Barley Yield (kg/ha)',
       y = 'Frequency')

Interpretation :- Most districts have low barley yields, resulting in a tall bar on the left. The distribution is right-skewed, indicating a few districts with very high yields.

———————————————————

V3

Pie Chart: Show the proportion of crop area used by different cereals in a selected state.

# Select a state (e.g., "Chhattisgarh")
state_data <- subset(ICRISAT, state_name == 'Chhattisgarh')

# Summarize area data for cereals (rice, wheat, maize)
state_cereal_area <- state_data[, c('rice_area_1000ha', 'wheat_area_1000ha', 'maize_area_1000ha')]
state_cereal_area_total <- colSums(state_cereal_area)

# Pie chart
pie(state_cereal_area_total, labels = names(state_cereal_area_total), col = rainbow(length(state_cereal_area_total)),
    main = 'Proportion of Cereal Crop Area in Chhattisgarh')

Interpretation :- The red segment dominates the chart, indicating that rice occupies the largest area, while crops like maize and wheat have relatively smaller shares.

———————————————————

V4

Pair Plot: Explore relationships among area, production, and yield for wheat.

# Subset for the pair plot
pair_data <- ICRISAT[, c('rice_area_1000ha', 'rice_production_1000tons', 'rice_yield_kg_per_ha',
                         'wheat_area_1000ha', 'wheat_production_1000tons', 'wheat_yield_kg_per_ha')]

# Pair plot
ggpairs(pair_data)

Interpretation :- The pair plot shows a strong positive correlation between area and production for both rice (0.833) and wheat (0.911). Yield has a moderate correlation with production for rice (0.530) and wheat (0.687), but weak correlation with area.

———————————————————

V5

Boxplot: Compare rice yield variability across different states.

# Select relevant columns for rice
rice_data <- ICRISAT[, c('state_name', 'rice_yield_kg_per_ha')]

# Gather the data into long format (though there's only one variable here, so no need to reshape much)
rice_data_long <- gather(rice_data, key = 'crop_type', value = 'yield', rice_yield_kg_per_ha)

# Boxplot
ggplot(rice_data_long, aes(x = state_name, y = yield, fill = crop_type)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = 'Rice Yield Variability across States', x = 'State', y = 'Rice Yield (kg/ha)')

Interpretation :- Rice yield varies widely across states, with Punjab, Haryana, and West Bengal showing higher medians. Several states like Madhya Pradesh have lower yields and many outliers, indicating significant yield fluctuations within states

———————————————————

V6

Boxplot: Visualize the wheat yield (in kg/ha) across different states.

# Select relevant data
wheat_data <- ICRISAT[, c('state_name', 'wheat_yield_kg_per_ha')]

# Boxplot for wheat yield across states
ggplot(wheat_data, aes(x = state_name, y = wheat_yield_kg_per_ha, fill = state_name)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = 'Wheat Yield (kg/ha) Across States',
       x = 'State',
       y = 'Wheat Yield (kg/ha)')

Interpretation :- Based on the boxplot, Punjab and Haryana exhibit higher median wheat yields compared to other states, while states like Bihar and Tamil Nadu tend to have lower median yields and greater variability.States like Uttrakhand & MP have most outliers.

———————————————————

V7

Line Chart: Is there a noticeable increase or decrease in the total crop production over the years from 1966 to 2017?

# Filter the dataset for years between 1966 and 2017
ICRISAT_filtered <- ICRISAT %>% 
  filter(year >= 1966 & year <= 2017)
# Sum the production for all crops for each year (you can modify for different crops if needed)
ICRISAT_filtered$total_production <- ICRISAT_filtered$rice_production_1000tons + 
  ICRISAT_filtered$wheat_production_1000tons +
  ICRISAT_filtered$barley_production_1000tons + 
  ICRISAT_filtered$sorghum_production_1000tons + 
  ICRISAT_filtered$pearl_millet_production_1000tons + 
  ICRISAT_filtered$maize_production_1000tons + 
  ICRISAT_filtered$finger_millet_production_1000tons + 
  ICRISAT_filtered$chickpea_production_1000tons + 
  ICRISAT_filtered$pigeonpea_production_1000tons + 
  ICRISAT_filtered$groundnut_production_1000tons  # Add other crops if necessary
# Summarize the total production by year
production_by_year <- ICRISAT_filtered %>%
  group_by(year) %>%
  summarize(total_production = sum(total_production))
# Plot the total production for each year from 1996 to 2017
ggplot(production_by_year, aes(x = year, y = total_production)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +  # Adds points to highlight each year
  labs(title = "Total Crop Production from 1996 to 2017", 
       x = "Year", 
       y = "Total Production (1000 tons)") +
  theme_minimal()

Interpretation :- Yes, the line chart illustrates a strong positive growth trajectory in total crop production between 1966 and 2017.

———————————————————

Advanced Engineering

———————————————————

1.1

ANOVA 1: Analyze the effect of state on wheat yield.

Null Hypothesis (H0): No significant difference in wheat yield between states.

Alternative Hypothesis (H1): Significant difference in wheat yield between at least one pair of states.

anova_wheat_yield <- aov(wheat_yield_kg_per_ha ~ state_name, data = wheat_data)
summary(anova_wheat_yield)
##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## state_name     19 1.074e+10 565029057    1119 <2e-16 ***
## Residuals   16126 8.140e+09    504761                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation :- With a very small p-value (< 0.05), reject the null hypothesis. There is a significant difference in wheat yield between states.

———————————————————

1.2

ANOVA 2: Analyze the effect of year on maize yield.

Null Hypothesis (H0): No significant difference in maize yield across years.

Alternative Hypothesis (H1): Significant difference in maize yield across at least one year.

anova_maize_yield_year <- aov(maize_yield_kg_per_ha ~ factor(year), data = ICRISAT)
summary(anova_maize_yield_year)
##                 Df    Sum Sq  Mean Sq F value Pr(>F)    
## factor(year)    51 4.084e+09 80080912   68.39 <2e-16 ***
## Residuals    16094 1.884e+10  1170869                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation :- With a very small p-value (< 0.05), reject the null hypothesis. There is a significant difference in maize yield across years.

———————————————————

1.3

Simple Linear Regression: Model Relationship Between Area and Production for Maize( Area vs. Production for Maize)

simple_regression_maize <- lm(maize_production_1000tons ~ maize_area_1000ha, data = ICRISAT)
# Summary of the model
summary(simple_regression_maize)
## 
## Call:
## lm(formula = maize_production_1000tons ~ maize_area_1000ha, data = ICRISAT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -303.13   -6.08    1.19    1.86 1188.31 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.85684    0.44409  -4.181 2.91e-05 ***
## maize_area_1000ha  1.88400    0.01104 170.598  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.86 on 16144 degrees of freedom
## Multiple R-squared:  0.6432, Adjusted R-squared:  0.6432 
## F-statistic: 2.91e+04 on 1 and 16144 DF,  p-value: < 2.2e-16
# Plot the regression line
ggplot(ICRISAT, aes(x = maize_area_1000ha, y = maize_production_1000tons)) +
  geom_point(color = 'blue') +
  geom_smooth(method = 'lm', color = 'red') +
  labs(title = 'Simple Linear Regression: Maize Production vs. Area',
       x = 'Maize Area (1000 ha)', y = 'Maize Production (1000 tons)') +
  theme_minimal()

Interpretation: The plot shows a positive linear relationship between maize area and maize production. l As the Maize Area (x-axis) increases, the Maize Production (y-axis) also increases — points trend upward. The red line (regression line) slopes upward, showing a positive relationship.

———————————————————

1.4

Multiple Linear Regression: Predict Yield Using Area, Production, and State (Predict Maize Yield)

multiple_regression_maize <- lm(maize_yield_kg_per_ha ~ maize_area_1000ha + maize_production_1000tons + state_name, data = ICRISAT)
# Summary of the model
summary(multiple_regression_maize)
## 
## Call:
## lm(formula = maize_yield_kg_per_ha ~ maize_area_1000ha + maize_production_1000tons + 
##     state_name, data = ICRISAT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6330.7  -387.8   -44.3   344.2 20296.2 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2771.1755    36.7213   75.47   <2e-16 ***
## maize_area_1000ha            -14.2577     0.3522  -40.48   <2e-16 ***
## maize_production_1000tons     10.4973     0.1483   70.77   <2e-16 ***
## state_nameAssam            -1954.4906    52.9989  -36.88   <2e-16 ***
## state_nameBihar            -1291.2931    52.7464  -24.48   <2e-16 ***
## state_nameChhattisgarh     -1491.2628    61.5959  -24.21   <2e-16 ***
## state_nameGujarat          -1719.7050    46.7365  -36.80   <2e-16 ***
## state_nameHaryana          -1786.3246    58.6336  -30.47   <2e-16 ***
## state_nameHimachal Pradesh -1005.4017    53.1183  -18.93   <2e-16 ***
## state_nameJharkhand        -1569.1894    62.5631  -25.08   <2e-16 ***
## state_nameKarnataka         -587.1615    45.9422  -12.78   <2e-16 ***
## state_nameKerala           -2726.8697    53.0948  -51.36   <2e-16 ***
## state_nameMadhya Pradesh   -1573.6552    41.9425  -37.52   <2e-16 ***
## state_nameMaharashtra      -1588.4516    43.6504  -36.39   <2e-16 ***
## state_nameOrissa           -1570.9338    49.7540  -31.57   <2e-16 ***
## state_namePunjab            -849.3102    51.8112  -16.39   <2e-16 ***
## state_nameRajasthan        -1850.4769    44.6140  -41.48   <2e-16 ***
## state_nameTamil Nadu       -1201.1451    50.4540  -23.81   <2e-16 ***
## state_nameTelangana        -1063.9191    54.8296  -19.40   <2e-16 ***
## state_nameUttar Pradesh    -1605.4503    41.2592  -38.91   <2e-16 ***
## state_nameUttarakhand      -1768.1483    56.3370  -31.39   <2e-16 ***
## state_nameWest Bengal      -1257.5091    48.1816  -26.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 871.1 on 16124 degrees of freedom
## Multiple R-squared:  0.4664, Adjusted R-squared:  0.4657 
## F-statistic:   671 on 21 and 16124 DF,  p-value: < 2.2e-16
# Plot the model results for Punjab.
state_data <- subset(ICRISAT, state_name == 'Punjab')
# Plot the regression line
ggplot(state_data, aes(x = maize_area_1000ha, y = maize_yield_kg_per_ha)) +
  geom_point(color = 'green') +
  geom_smooth(method = 'lm', color = 'purple') +
  labs(title = 'Multiple Linear Regression: Maize Yield vs. Area (Punjab)',
       x = 'Maize Area (1000 ha)', y = 'Maize Yield (kg/ha)') +
  theme_minimal()

Interpretation: The plot shows a negative linear relationship between maize area and maize yield. As the Maize Area (x-axis) increases, the Maize Yield (y-axis) slightly decreases — points trend downward. The purple line (regression line) slopes downward, showing a negative relationship.

———————————————————

1.5

Correlation: Check correlation across rice, wheat, and maize — area, production, and yield.

# Select Area, Production, Yield columns for rice, wheat, maize
cereals_data <- ICRISAT[, c("rice_area_1000ha", "rice_production_1000tons", "rice_yield_kg_per_ha",
                            "wheat_area_1000ha", "wheat_production_1000tons", "wheat_yield_kg_per_ha",
                            "maize_area_1000ha", "maize_production_1000tons", "maize_yield_kg_per_ha")]
# Calculate correlation matrix
cor_cereals <- cor(cereals_data, use = "complete.obs", method = "pearson")
# Visualize
corrplot(cor_cereals, method = "color", addCoef.col = "black", number.cex = 0.7)

Interpretation: There is a strong positive correlation between area and production for all three crops (rice, wheat, and maize). Increasing the area under cultivation significantly boosts total production for each crop. Cross-crop relationships (e.g., rice vs wheat, wheat vs maize) show weak correlations. This suggests that production trends across crops are independent and not strongly linked to each other.

———————————————————