# Read the crop records from apy.csv; the first row is the header and text
# columns are converted to factors. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
yield.data <- fread("apy.csv", header = TRUE, stringsAsFactors = TRUE)
# Show the column types and a preview of the values.
str(yield.data)
## Classes 'data.table' and 'data.frame': 246091 obs. of 7 variables:
## $ State_Name : Factor w/ 33 levels "Andaman and Nicobar Islands",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ District_Name: Factor w/ 646 levels "24 PARAGANAS NORTH",..: 428 428 428 428 428 428 428 428 428 428 ...
## $ Crop_Year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ Season : Factor w/ 6 levels "Autumn","Kharif",..: 2 2 2 5 5 5 5 5 5 5 ...
## $ Crop : Factor w/ 124 levels "Apple","Arcanut (Processed)",..: 3 75 96 8 23 29 39 107 109 110 ...
## $ Area : num 1254 2 102 176 720 ...
## $ Production : num 2000 1 321 641 165 65100000 100 2 15 169 ...
## - attr(*, ".internal.selfref")=<externalptr>
Summary
# Per-column summary of the full data set (factor counts, numeric quartiles).
summary(yield.data)
## State_Name District_Name Crop_Year
## Uttar Pradesh : 33306 BIJAPUR : 945 Min. :1997
## Madhya Pradesh: 22943 TUMKUR : 936 1st Qu.:2002
## Karnataka : 21122 BELGAUM : 925 Median :2006
## Bihar : 18885 HASSAN : 895 Mean :2006
## Assam : 14628 BELLARY : 887 3rd Qu.:2010
## Odisha : 13575 DAVANGERE: 886 Max. :2015
## (Other) :121632 (Other) :240617
## Season Crop Area
## Autumn : 4949 Rice : 15104 Min. : 0
## Kharif :95951 Maize : 13947 1st Qu.: 80
## Rabi :66987 Moong(Green Gram): 10318 Median : 582
## Summer :14841 Urad : 9850 Mean : 12003
## Whole Year:57305 Sesamum : 9046 3rd Qu.: 4392
## Winter : 6058 Groundnut : 8834 Max. :8580100
## (Other) :178992
## Production
## Min. :0.000e+00
## 1st Qu.:8.800e+01
## Median :7.290e+02
## Mean :5.825e+05
## 3rd Qu.:7.023e+03
## Max. :1.251e+09
## NA's :3730
# Keep only the rows whose State_Name is Haryana.
haryana.data <- subset(yield.data, State_Name == "Haryana")
# Report how many rows are left after the filter.
print(paste0("The number of rows: ", nrow(haryana.data)))
## [1] "The number of rows: 5875"
# Mean production per crop within Haryana (rows with missing Production are
# dropped by aggregate()'s default na.action).
prod.crop <- aggregate(Production ~ Crop, data = haryana.data, FUN = mean)
names(prod.crop) <- c("Crop", "Production")
# Sort by mean production, largest first, and keep the first ten crops.
top.10.prod.crop <- prod.crop %>%
  arrange(-Production) %>%
  head(10)
# Store the crop name as plain text rather than a factor.
top.10.prod.crop$Crop <- as.character(top.10.prod.crop$Crop)
print(top.10.prod.crop)
## Crop Production
## 1 Wheat 500463.722
## 2 Sugarcane 435061.390
## 3 Rice 164394.333
## 4 Cotton(lint) 123817.157
## 5 Bajra 46558.966
## 6 Rapeseed &Mustard 39719.853
## 7 Other Vegetables 17481.579
## 8 Guar seed 17398.688
## 9 Potato 16766.204
## 10 Gram 8777.778
# Bar chart of the ten crops with the highest mean production, ordered from
# largest to smallest; each crop gets its own fill colour.
g <- ggplot(
  top.10.prod.crop,
  aes(x = reorder(Crop, -Production), y = Production, fill = Crop)
) +
  geom_bar(stat = "identity") +
  xlab("Crops") +
  ylab("Production") +
  ggtitle("Mean Production by Crop - Top 10 only")
# Hand the chart to ggplotly() and display the result.
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
# Haryana rows for wheat only.
wheat.data <- filter(haryana.data, Crop == "Wheat")
# Total wheat production for each crop year.
wheat.prod.year <- aggregate(
  Production ~ Crop_Year,
  data = wheat.data,
  FUN = sum
)
names(wheat.prod.year) <- c("Year", "Production")
# Bar chart of the yearly totals; bars are shaded by year.
g <- ggplot(wheat.prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  xlab("Year") +
  ylab("Production") +
  ggtitle("Total Production by Year (Wheat)")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
# Haryana rows for sugarcane only.
sugarcane.data <- filter(haryana.data, Crop == "Sugarcane")
# Total sugarcane production for each crop year.
prod.year <- aggregate(
  Production ~ Crop_Year,
  data = sugarcane.data,
  FUN = sum
)
names(prod.year) <- c("Year", "Production")
# Bar chart of the yearly totals; bars are shaded by year.
g <- ggplot(prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  xlab("Year") +
  ylab("Production") +
  ggtitle("Total Production by Year (Sugarcane)")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
# Haryana rows for rice only.
rice.data <- filter(haryana.data, Crop == "Rice")
# Total rice production for each crop year.
prod.year <- aggregate(
  Production ~ Crop_Year,
  data = rice.data,
  FUN = sum
)
names(prod.year) <- c("Year", "Production")
# Bar chart of the yearly totals; bars are shaded by year.
g <- ggplot(prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  xlab("Year") +
  ylab("Production") +
  ggtitle("Total Production by Year (Rice)")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
# Haryana rows for cotton lint only.
cotton.data <- filter(haryana.data, Crop == "Cotton(lint)")
# Total cotton production for each crop year.
prod.year <- aggregate(
  Production ~ Crop_Year,
  data = cotton.data,
  FUN = sum
)
names(prod.year) <- c("Year", "Production")
# Bar chart of the yearly totals; bars are shaded by year.
g <- ggplot(prod.year, aes(x = Year, y = Production, fill = Year)) +
  geom_bar(stat = "identity") +
  xlab("Year") +
  ylab("Production") +
  ggtitle("Total Production by Year (Cotton)")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
We can select Wheat, Sugarcane, Rice and Cotton as the main crops to target.
# The four crops chosen as targets.
top4crops <- c("Wheat", "Sugarcane", "Rice", "Cotton(lint)")
# Haryana rows restricted to those four crops.
top4crops.data <- filter(haryana.data, Crop %in% top4crops)
# Mean of the Area column for each of the four crops.
mean.area.crop <- aggregate(Area ~ Crop, data = top4crops.data, FUN = mean)
names(mean.area.crop) <- c("Crop", "Area_acres")
mean.area.crop
## Crop Area_acres
## 1 Cotton(lint) 30311.90
## 2 Rice 56957.36
## 3 Sugarcane 6708.99
## 4 Wheat 119037.64
# Point chart of the mean area per crop.
# The earlier qplot() call passed the data frame as an unnamed argument, which
# qplot() does not treat as `data`; it was forwarded as an unknown parameter
# and produced an "Ignoring unknown parameters" warning. Supplying the data
# frame to ggplot() and mapping the columns by name draws the same points
# without the warning.
g <- ggplot(mean.area.crop, aes(x = Crop, y = Area_acres)) +
  geom_point() +
  xlab("Crop") +
  ylab("Area in Acres")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
Area used over the years
# Total area per crop and crop year for the four selected crops.
# NOTE: despite the "mean" in its name, this table holds sums (FUN = sum).
mean.area.crop <- aggregate(
  Area ~ Crop + Crop_Year,
  data = top4crops.data,
  FUN = sum
)
names(mean.area.crop) <- c("Crop", "Year", "Area_acres")
# One bar-chart panel per crop, laid out side by side; bars are shaded by year.
g <- ggplot(mean.area.crop, aes(x = Year, y = Area_acres, fill = Year)) +
  facet_grid(. ~ Crop) +
  geom_bar(stat = "identity") +
  ggtitle("Area used by crop") +
  xlab("Year") +
  ylab("Area")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
Production over the years
# Total production per crop and crop year for the four selected crops.
# NOTE: despite the "mean" in its name, this table holds sums (FUN = sum).
mean.prod.crop <- aggregate(
  Production ~ Crop + Crop_Year,
  data = top4crops.data,
  FUN = sum
)
names(mean.prod.crop) <- c("Crop", "Year", "Production")
# One bar-chart panel per crop, laid out side by side; bars are shaded by year.
g <- ggplot(mean.prod.crop, aes(x = Year, y = Production, fill = Year)) +
  facet_grid(. ~ Crop) +
  geom_bar(stat = "identity") +
  ggtitle("Crop Production") +
  xlab("Year") +
  ylab("Production")
gg <- ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
gg
One interesting thing I can see is that crop production has increased over the years, but the production area has increased similarly. Even though there have been some excellent advances in agriculture in the last decade, there is no significant increase in yield.