Load the data and inspect its basic structure and summary

# Read the crop data; string columns are loaded as factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
yield.data <- fread("apy.csv", header = TRUE, stringsAsFactors = TRUE)

# Show each column's type together with a preview of its values.
str(yield.data)
## Classes 'data.table' and 'data.frame':   246091 obs. of  7 variables:
##  $ State_Name   : Factor w/ 33 levels "Andaman and Nicobar Islands",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ District_Name: Factor w/ 646 levels "24 PARAGANAS NORTH",..: 428 428 428 428 428 428 428 428 428 428 ...
##  $ Crop_Year    : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ Season       : Factor w/ 6 levels "Autumn","Kharif",..: 2 2 2 5 5 5 5 5 5 5 ...
##  $ Crop         : Factor w/ 124 levels "Apple","Arcanut (Processed)",..: 3 75 96 8 23 29 39 107 109 110 ...
##  $ Area         : num  1254 2 102 176 720 ...
##  $ Production   : num  2000 1 321 641 165 65100000 100 2 15 169 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Summary

# Per-column overview: level counts for factor columns and
# quantiles/mean (plus the NA count, where present) for numeric columns.
summary(yield.data)
##           State_Name       District_Name      Crop_Year   
##  Uttar Pradesh : 33306   BIJAPUR  :   945   Min.   :1997  
##  Madhya Pradesh: 22943   TUMKUR   :   936   1st Qu.:2002  
##  Karnataka     : 21122   BELGAUM  :   925   Median :2006  
##  Bihar         : 18885   HASSAN   :   895   Mean   :2006  
##  Assam         : 14628   BELLARY  :   887   3rd Qu.:2010  
##  Odisha        : 13575   DAVANGERE:   886   Max.   :2015  
##  (Other)       :121632   (Other)  :240617                 
##         Season                     Crop             Area        
##  Autumn    : 4949   Rice             : 15104   Min.   :      0  
##  Kharif    :95951   Maize            : 13947   1st Qu.:     80  
##  Rabi      :66987   Moong(Green Gram): 10318   Median :    582  
##  Summer    :14841   Urad             :  9850   Mean   :  12003  
##  Whole Year:57305   Sesamum          :  9046   3rd Qu.:   4392  
##  Winter    : 6058   Groundnut        :  8834   Max.   :8580100  
##                     (Other)          :178992                    
##    Production       
##  Min.   :0.000e+00  
##  1st Qu.:8.800e+01  
##  Median :7.290e+02  
##  Mean   :5.825e+05  
##  3rd Qu.:7.023e+03  
##  Max.   :1.251e+09  
##  NA's   :3730

Looking at Haryana Data Only

# Keep only the rows recorded for the state of Haryana and report how many
# rows remain.
haryana.data <- subset(yield.data, State_Name == "Haryana")
print(paste0("The number of rows: ", dim(haryana.data)[1]))
## [1] "The number of rows: 5875"

Finding the crops with the highest mean production

# Mean Production per crop within Haryana. The formula method of aggregate()
# omits rows with missing values by default (na.action = na.omit).
prod.crop <- setNames(
  aggregate(Production ~ Crop, data = haryana.data, FUN = mean),
  c("Crop", "Production")
)

# Ten crops with the largest mean Production, largest first.
top.10.prod.crop <- prod.crop %>%
  arrange(desc(Production)) %>%
  head(10)

# Store the crop names as plain strings instead of factor levels.
top.10.prod.crop$Crop <- as.character(top.10.prod.crop$Crop)
print(top.10.prod.crop)
##                 Crop Production
## 1              Wheat 500463.722
## 2          Sugarcane 435061.390
## 3               Rice 164394.333
## 4       Cotton(lint) 123817.157
## 5              Bajra  46558.966
## 6  Rapeseed &Mustard  39719.853
## 7   Other Vegetables  17481.579
## 8          Guar seed  17398.688
## 9             Potato  16766.204
## 10              Gram   8777.778
# Bar chart of the top ten crops, bars ordered by decreasing mean Production.
g <- ggplot(
  top.10.prod.crop,
  aes(x = reorder(Crop, -Production), y = Production, fill = Crop)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Crops",
    y = "Production",
    title = "Mean Production by Crop - Top 10 only"
  )
# Convert the static chart to an interactive plotly object and display it.
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Wheat

# Wheat rows for Haryana, then total Production for each crop year.
wheat.data <- filter(haryana.data, Crop == "Wheat")
wheat.prod.year <- setNames(
  aggregate(Production ~ Crop_Year, data = wheat.data, FUN = sum),
  c("Year", "Production")
)

# One bar per year; the fill colour also encodes the year.
g <- ggplot(wheat.prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Production",
    title = "Total Production by Year (Wheat)"
  )
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Sugarcane

# Sugarcane rows for Haryana, then total Production for each crop year.
sugarcane.data <- filter(haryana.data, Crop == "Sugarcane")
prod.year <- setNames(
  aggregate(Production ~ Crop_Year, data = sugarcane.data, FUN = sum),
  c("Year", "Production")
)

# One bar per year; the fill colour also encodes the year.
g <- ggplot(prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Production",
    title = "Total Production by Year (Sugarcane)"
  )
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Rice

# Rice rows for Haryana, then total Production for each crop year.
rice.data <- filter(haryana.data, Crop == "Rice")
prod.year <- setNames(
  aggregate(Production ~ Crop_Year, data = rice.data, FUN = sum),
  c("Year", "Production")
)

# One bar per year; the fill colour also encodes the year.
g <- ggplot(prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Production",
    title = "Total Production by Year (Rice)"
  )
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Cotton

# Cotton (lint) rows for Haryana, then total Production for each crop year.
cotton.data <- filter(haryana.data, Crop == "Cotton(lint)")
prod.year <- setNames(
  aggregate(Production ~ Crop_Year, data = cotton.data, FUN = sum),
  c("Year", "Production")
)

# One bar per year; the fill colour also encodes the year.
g <- ggplot(prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Production",
    title = "Total Production by Year (Cotton)"
  )
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Crop Summary

We can select Wheat, Sugarcane, Rice and Cotton as the main crops to target.


Area Analysis

# The four crops selected for the area analysis.
top4crops <- c("Wheat", "Sugarcane", "Rice", "Cotton(lint)")
top4crops.data <- filter(haryana.data, Crop %in% top4crops)

# Mean Area per crop across all of that crop's rows.
mean.area.crop <- setNames(
  aggregate(Area ~ Crop, data = top4crops.data, FUN = mean),
  c("Crop", "Area_acres")
)
mean.area.crop
##           Crop Area_acres
## 1 Cotton(lint)   30311.90
## 2         Rice   56957.36
## 3    Sugarcane    6708.99
## 4        Wheat  119037.64
# Point chart of the mean area per crop.
# Built with ggplot() like the other charts in this file: qplot() is
# deprecated, and it does not take the data frame as its first argument, so
# passing it positionally ends up as an unknown parameter and triggers a
# warning.
g <- ggplot(mean.area.crop, aes(x = Crop, y = Area_acres)) +
  geom_point() +
  labs(x = "Crop", y = "Area in Acres")
# Convert the static chart to an interactive plotly object and display it.
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Area used over the years

# Total Area per crop and crop year. Despite its name, mean.area.crop holds
# sums here (FUN = sum) and replaces any earlier object with the same name.
mean.area.crop <- setNames(
  aggregate(Area ~ Crop + Crop_Year, data = top4crops.data, FUN = sum),
  c("Crop", "Year", "Area_acres")
)

# One panel per crop, one bar per year.
g <- ggplot(mean.area.crop, aes(x = Year, y = Area_acres, fill = Year)) +
  geom_bar(stat = "identity") +
  facet_grid(. ~ Crop) +
  labs(title = "Area used by crop", x = "Year", y = "Area")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

Production over the years

# Total Production per crop and crop year. Despite its name, mean.prod.crop
# holds sums (FUN = sum), not means.
mean.prod.crop <- setNames(
  aggregate(Production ~ Crop + Crop_Year, data = top4crops.data, FUN = sum),
  c("Crop", "Year", "Production")
)

# One panel per crop, one bar per year.
g <- ggplot(mean.prod.crop, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  facet_grid(. ~ Crop) +
  labs(title = "Crop Production", x = "Year", y = "Production")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg

One interesting observation is that crop production has increased over the years, but the cultivated area has increased similarly. Even though there have been some excellent advances in agriculture in the last decade, there is no significant increase in yield.