library(plyr)    # load plyr before dplyr so that dplyr's verbs are not masked
library(dplyr)
library(zoo)
library(ggplot2)
library(caroline)
library(forecast)
library(DT)
# read in the data file
data<- read.csv("Monthly_data_cmo.csv", header = TRUE)
# arranging the data frame by date
# as.Date() is the wrong tool for year-month values because it requires a day component; zoo's as.yearmon() handles them directly
data<- data[order(as.yearmon(data$date, "%Y-%m")), ] # sort rows chronologically by year and month
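To see why, a minimal standalone sketch with hypothetical date strings in the same "%Y-%m" format as the data:
d <- c("2016-01", "2014-09", "2015-12")
as.Date(d, "%Y-%m")              # all NA: as.Date() needs a day component
as.yearmon(d, "%Y-%m")           # parses cleanly as year-month values
d[order(as.yearmon(d, "%Y-%m"))] # chronological: "2014-09" "2015-12" "2016-01"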
dim(data)
## [1] 62429 11
summary(data)
## APMC Commodity Year Month
## Mumbai : 1538 Gram : 3972 Min. :2014 November : 7183
## Pune : 1513 Wheat(Husked) : 3946 1st Qu.:2015 October : 6887
## Nagpur : 1340 Sorgum(Jawar) : 3560 Median :2015 September: 6518
## Barshi : 1076 Soybean : 3545 Mean :2015 June : 4906
## Jalgaon: 1055 Pigeon Pea (Tur): 3389 3rd Qu.:2016 January : 4858
## Solapur: 984 Maize : 2422 Max. :2016 December : 4816
## (Other):54923 (Other) :41595 (Other) :27261
## arrivals_in_qtl min_price max_price modal_price
## Min. : 1 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 38 1st Qu.: 1250 1st Qu.: 1600 1st Qu.: 1450
## Median : 211 Median : 1976 Median : 2797 Median : 2425
## Mean : 6043 Mean : 2945 Mean : 3689 Mean : 3296
## 3rd Qu.: 1364 3rd Qu.: 3900 3rd Qu.: 4647 3rd Qu.: 4257
## Max. :1450254 Max. :3153038 Max. :1600090 Max. :142344
##
## date district_name state_name
## 2016-01: 2769 Pune : 6366 Maharashtra:62429
## 2016-06: 2753 Ahmadnagar: 4638
## 2016-05: 2752 Nagpur : 4527
## 2015-12: 2739 Solapur : 4524
## 2016-02: 2698 Nasik : 3620
## 2016-10: 2679 Satara : 2771
## (Other):46039 (Other) :35983
#This is what the data looks like
head(data,5)
The data contains 62429 observations on the following 11 variables:
names(data)
## [1] "APMC" "Commodity" "Year"
## [4] "Month" "arrivals_in_qtl" "min_price"
## [7] "max_price" "modal_price" "date"
## [10] "district_name" "state_name"
print(paste0("There are ", length(unique(data$Commodity)), " commodities"))
## [1] "There are 352 commodities"
Some commodity names appear in both lowercase and uppercase forms, so all names were converted to lowercase.
# convert commodity names to lowercase (tolower is vectorised, so sapply is unnecessary)
data$Commodity<- tolower(data$Commodity)
print(paste0("Now there are ", length(unique(data$Commodity)), " commodities"))
## [1] "Now there are 204 commodities"
# number of observations for each commodity
com_data <- groupBy(data, by="Commodity", aggregation = "length", clmns = c("min_price", "modal_price"))
## Warning in groupBy(data, by = "Commodity", aggregation = "length", clmns =
## c("min_price", : automatically extending your only specified aggregation to
## all 2 clmns
# count, then remove, commodities having fewer than 27 data points
n_omitted <- sum(com_data$min_price < 27)
com_data <- com_data[which(com_data$min_price>=27), ]
# subsetting the data accordingly
subset_data <- data[data$Commodity %in% rownames(com_data), ]
print(paste0("Number of commodities omitted: ", n_omitted))
## [1] "Number of commodities omitted: 69"
For a given commodity in an APMC there are at most 27 data points (monthly prices from September 2014 to November 2016). Commodities with fewer than 27 observations were omitted from the analysis, as they were too few to yield any meaningful insights. 69 commodities, including amba koy, amla, aster and banana(raw), were removed; most of them had fewer than 10 observations.
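As a cross-check on the groupBy() counts above, the same per-commodity tally can be computed with dplyr (a minimal sketch; the calls are namespaced because plyr and dplyr are both loaded):
obs_per_com <- data %>% dplyr::count(Commodity) %>% dplyr::arrange(n)
head(obs_per_com)       # the sparsest commodities come first
sum(obs_per_com$n < 27) # how many fall below the 27-observation cutoff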
# randomly select 9 commodities from the retained set
set.seed(4)
sampled_com<- rownames(com_data)[sample.int(dim(com_data)[1], size = 9)]
{par(mfrow=c(3,3))
for(i in sampled_com){
hist(data$min_price[data$Commodity==i], xlab = "min_price", main=i, col="#A7A7A7")
abline(v=mean(data$min_price[data$Commodity==i]), col = "blue", lwd=2.5, lty="dotted")
}}
{par(mfrow=c(3,3))
for(i in sampled_com){
hist(data$modal_price[data$Commodity==i], xlab = "modal_price", main=i, col="#A7A7A7", breaks = 15)
abline(v=mean(data$modal_price[data$Commodity==i]), col = "blue", lwd=2.5, lty="dotted")
}}
Some prices appear approximately normally distributed. We will test the prices of all commodities for normality. If a series is normally distributed, any value lying more than 3 standard deviations from the mean is treated as an outlier and omitted from the analysis; for non-normal series, the 1.5 × IQR rule is used instead (see the functions below). The Shapiro–Wilk test was used for testing normality, as it has the best power for a given significance level among the Shapiro–Wilk, Kolmogorov–Smirnov, Lilliefors and Anderson–Darling tests.
# function for testing normality
normality_test<- function(x){
pval <- shapiro.test(x)$p.value
if(pval<=0.05){
result<- "Not normal"
}else{
result<- "Normal"
}
return(result)
}
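A quick sanity check of the helper on simulated data (random draws, so these labels are typical rather than guaranteed):
set.seed(42)
normality_test(rnorm(27, mean = 2000, sd = 300)) # usually "Normal"
normality_test(rexp(27, rate = 1/2000))          # usually "Not normal"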
# split prices by commodity, apply the normality test to each group, then combine
non<- aggregate(cbind(min_price, modal_price, max_price) ~ Commodity, data = subset_data, FUN = normality_test)
head(non)
# function for detecting outliers in non-normal series (1.5 * IQR rule)
outliers<- function(x){
H <- 1.5*IQR(x, na.rm = TRUE)
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
return(x[x>q3+H | x<q1-H])
}
# function for detecting outliers in normal series (|z-score| > 3)
outliers_normal<- function(x){
z<- (x-mean(x))/sd(x)
return(x[abs(z)>3])
}
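A toy illustration of both rules on hypothetical prices with one planted extreme value:
set.seed(11)
x <- c(rnorm(26, mean = 1500, sd = 100), 9000) # 26 ordinary prices plus one extreme
outliers(x)        # flagged by the 1.5 * IQR rule
outliers_normal(x) # flagged by the |z| > 3 rule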
#normal
for (i in non$Commodity[non$min_price=="Normal"] ){
outlier <- outliers_normal(subset_data$min_price[subset_data$Commodity==i])
if (length(outlier)==0) next # skip and go to next iteration.
subset_data<- subset_data[-which((subset_data$min_price %in% outlier & subset_data$Commodity==i)), ]
#print(length(outlier))
}
# non normal
for (i in non$Commodity[non$min_price=="Not normal"]){
outlier <- outliers(subset_data$min_price[subset_data$Commodity==i])
if (length(outlier)==0) next # skip and go to next iteration.
subset_data <- subset_data[-which((subset_data$min_price %in% outlier & subset_data$Commodity==i)), ]
#print(length(outlier))
}
Similarly, outliers in max_price and modal_price also need to be removed. After removing the min_price outliers, subset_data has dimensions 59730 × 11.
#normal
for (i in non$Commodity[non$modal_price=="Normal"] ){
outlier <- outliers_normal(subset_data$modal_price[subset_data$Commodity==i])
if (length(outlier)==0) next # skip and go to next iteration.
subset_data<- subset_data[-which((subset_data$modal_price %in% outlier & subset_data$Commodity==i)), ]
#print(length(outlier))
}
# non normal
for (i in non$Commodity[non$modal_price=="Not normal"] ){
outlier <- outliers(subset_data$modal_price[subset_data$Commodity==i])
if (length(outlier)==0) next # skip and go to next iteration.
subset_data<- subset_data[-which((subset_data$modal_price %in% outlier & subset_data$Commodity==i)), ]
#print(length(outlier))
}
#normal
for (i in non$Commodity[non$max_price=="Normal"] ){
outlier <- outliers_normal(subset_data$max_price[subset_data$Commodity==i])
if (length(outlier)==0) next # skip and go to next iteration.
subset_data<- subset_data[-which((subset_data$max_price %in% outlier & subset_data$Commodity==i)), ]
#print(length(outlier))
}
# non normal
for (i in non$Commodity[non$max_price=="Not normal"] ){
outlier <- outliers(subset_data$max_price[subset_data$Commodity==i])
if (length(outlier)==0) next # skip and go to next iteration.
subset_data<- subset_data[-which((subset_data$max_price %in% outlier & subset_data$Commodity==i)), ]
#print(length(outlier))
}
subset_data is the processed data and will be used for further analysis.
dim(subset_data)
## [1] 58358 11
print(paste0("There are ", length(unique(subset_data$Commodity)), " commodities"))
## [1] "There are 135 commodities"
# average min, modal and max prices for all commodities
aggregate(cbind(min_price, modal_price, max_price) ~ Commodity, data = data, FUN = mean)
# pool min, modal and max prices for each commodity (list columns via as.vector)
combined_prices<- aggregate(cbind(min_price, modal_price, max_price)~Commodity, data=data, FUN=as.vector)
# Function for calculating median of all prices of a commodity
median_price<- function(x,y,z){
return(median(c(x,y,z)))
}
# Function for calculating coeff. of variation of all prices of a commodity
cvariation <- function(x,y,z){
prices <- c(x,y,z)
return(sd(prices)/mean(prices))
}
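On toy vectors (hypothetical numbers) the helpers behave as expected:
median_price(c(10, 12, 14), c(11, 13, 15), c(12, 14, 16)) # median of all nine prices: 13
cvariation(c(10, 12, 14), c(11, 13, 15), c(12, 14, 16))   # sd of the pooled prices / their mean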
#median
combined_prices$median <- apply(combined_prices[,c("min_price","modal_price","max_price")], 1, function(x) median_price(x["min_price"][[1]],x["modal_price"][[1]],x["max_price"][[1]]))
#coeff. of variation
combined_prices$coeff_variation <- apply(combined_prices[,c("min_price","modal_price","max_price")], 1, function(x) cvariation(x["min_price"][[1]],x["modal_price"][[1]],x["max_price"][[1]]))
datatable(combined_prices[order(combined_prices$median, decreasing = T),c("Commodity","median","coeff_variation")], extensions = 'AutoFill', options = list(autoFill=T))
The table above sorts commodities by their median prices. The top of the table shows the costliest commodities, their median prices, and the amount of fluctuation in those prices.
datatable(combined_prices[order(combined_prices$coeff_variation, decreasing = T),c("Commodity","median","coeff_variation")], extensions = 'AutoFill', options = list(autoFill=T))
The table above shows the commodities with the highest variation/fluctuation in their prices.
subset_data <- subset_data[order(as.yearmon(subset_data$date, "%Y-%m")), ]
dataforplot <- aggregate(cbind(min_price, modal_price, max_price)~Commodity+date, data = subset_data, FUN = mean)
#sample 10 commodities
set.seed(1267)
comm10<- sample(unique(subset_data$Commodity), size=10, replace = F)
# concatenate data for the first three commodities
list_df<- list()
for (i in 1:3){
list_df[[i]] = data.frame(dataforplot[dataforplot$Commodity==comm10[i], ])
}
concatenated_df <- ldply(list_df, rbind)
# concatenate data for the next three commodities
list_df<- list()
for (i in 4:6){
list_df[[i-3]] = data.frame(dataforplot[dataforplot$Commodity==comm10[i], ])
}
concatenated_df1 <- ldply(list_df, rbind)
{ggplot(data=concatenated_df, aes(x=date, y=modal_price, group=Commodity, colour=Commodity))+geom_line()+theme(axis.text.x = element_text(angle = 60, hjust=1))+facet_wrap(~Commodity, ncol = 1, scales="free_y")}
{ggplot(data=concatenated_df1, aes(x=date, y=min_price, group=Commodity, colour=Commodity))+geom_line()+theme(axis.text.x = element_text(angle = 60, hjust=1))+facet_wrap(~Commodity, ncol = 1, scales="free_y")}
Looking at the time series plots of a few randomly sampled commodities, no clear seasonal patterns are visible, so statistical tests need to be performed to confirm the presence of seasonality.
# takes a numeric vector, converts it into a monthly time series, and checks for a seasonal ETS component
detect_season<- function(x){
if (any(x == 0)){ # series containing zeros are treated as having missing values
return("na")
}else{
x<- ts(x, frequency = 12, start = c(2014,9))
seasonal_comp<- ets(x)$components[3] # "N" here means no seasonal component
return(seasonal_comp)
}
}
A simple approach for detecting seasonality is to fit an ETS model using ets(): if the chosen model has a seasonal component, the data is seasonal. If no seasonal component is detected, the model reports N (none) in its place. If a seasonal component is detected, a log-likelihood test can be used to check its significance. This method of detecting seasonality was suggested by Prof. Hyndman.
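A minimal illustration of this approach on a built-in series with known seasonality (AirPassengers), using the forecast package loaded above:
fit <- ets(AirPassengers)
fit$components    # error, trend and seasonal types, plus the damping flag
fit$components[3] # seasonal component; here "M" rather than "N", so the series is seasonal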
seasonality <- aggregate(cbind(min_price, modal_price, max_price)~Commodity, data=dataforplot, FUN=detect_season)
datatable(seasonality, extensions = 'AutoFill', options = list(autoFill=T))
Any series containing missing values has na recorded against it. From the table above we can conclude that the min_price, modal_price and max_price series of no commodity show a statistically significant seasonal pattern. This may be because the time series are not long enough for seasonal patterns to be detected: here we have only 27 data points (September 2014 to November 2016) per commodity, whereas non-parametric tests like the Kolmogorov–Smirnov and Wilcoxon signed-rank tests require a sample size of at least 50, and parametric tests at least 45.
But we do know that many vegetables are seasonal and their prices go down in their respective seasons. Let's look at the plots of some seasonal vegetables like Spinach, Bitter gourd and Carrot.
temp<- dataforplot[dataforplot$Commodity=="spinach",c("min_price","modal_price","max_price")]
temp<- ts(temp,frequency = 12, start=c(2014,9))
{plot.ts(temp, plot.type = "single", ylab="Prices", col="dark grey", lwd=2, main="Minimum, Modal and Maximum prices\n of Spinach", type="o", panel.first=grid(col="grey"))}
temp<- dataforplot[dataforplot$Commodity=="bitter gourd",c("min_price","modal_price","max_price")]
temp<- ts(temp,frequency = 12, start=c(2014,9))
{plot.ts(temp, plot.type = "single", ylab="Prices", col="dark grey", lwd=2, main="Minimum, Modal and Maximum prices\n of Bitter gourd", type="o", panel.first=grid(col="grey"))}
temp<- dataforplot[dataforplot$Commodity=="carrot",c("min_price","modal_price","max_price")]
temp<- ts(temp,frequency = 12, start=c(2014,9))
{plot.ts(temp, plot.type = "single", ylab="Prices", col="dark grey", lwd=2, main="Minimum, Modal and Maximum prices\n of carrot", type="o", panel.first=grid(col="grey"))}
As expected, the prices do show some seasonal variation; the statistical test above was simply unable to detect it with so little data. The prices of Spinach and Carrot dip during the winter season and peak in the off season (summer), and vice versa for Bitter gourd. Since the peaks can be expected to grow over time with inflation rather than stay constant, the appropriate seasonality type is multiplicative. So we will use the decompose() function in R to break each series into its components and obtain the seasonal indices.
m<-ts(dataforplot[dataforplot$Commodity=="spinach",c("modal_price")], frequency = 12, start = c(2014,9))
{plot.ts(decompose(m, type="multiplicative")$seasonal, plot.type = "single", ylab="Seasonal Indices", col="dark grey", lwd=2, main="Seasonal Indices (Spinach)", type="o", panel.first=grid(col="grey"))}
m<-ts(dataforplot[dataforplot$Commodity=="carrot",c("modal_price")], frequency = 12, start = c(2014,9))
{plot.ts(decompose(m, type="multiplicative")$seasonal, plot.type = "single", ylab="Seasonal Indices", col="dark grey", lwd=2, main="Seasonal Indices (Carrot)", type="o", panel.first=grid(col="grey"))}
m<-ts(dataforplot[dataforplot$Commodity=="bitter gourd",c("modal_price")], frequency = 12, start = c(2014,9))
{plot.ts(decompose(m, type="multiplicative")$seasonal, plot.type = "single", ylab="Seasonal Indices", col="dark grey", lwd=2, main="Seasonal Indices (Bitter Gourd)", type="o", panel.first=grid(col="grey"))}
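To read the indices numerically instead of off the plots, decompose() also stores one index per month in its figure element (a small sketch; since the series starts in September, figure[1] corresponds to September):
m <- ts(dataforplot[dataforplot$Commodity=="spinach", c("modal_price")], frequency = 12, start = c(2014, 9))
si <- decompose(m, type = "multiplicative")$figure
setNames(round(si, 2), month.abb[c(9:12, 1:8)]) # an index below 1 marks a cheaper-than-trend month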
Here, commodity prices were compared with their respective MSPs. The following questions can be asked: Are there commodities whose mandi prices are below or above the MSP? How often is a commodity sold below or above the MSP in a particular mandi? Are there significant differences between mandi prices and MSPs across years?
A metric called the cost ratio was devised, in which the prices of a commodity in an APMC for a year are divided by the corresponding MSP. This ratio tells us how many times costlier or cheaper the mandi prices are than the MSP. To answer the first question, we calculated the median of the ratios and ranked commodities from highest to lowest; to answer the second, we calculated the proportion of ratios greater than 1. A t-test was also performed to determine whether the prices differ significantly from the MSP.
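For intuition, both metrics on a toy example (hypothetical prices and MSP):
prices <- c(1400, 1500, 1800, 2100) # hypothetical prices of one commodity in one APMC-year
msp <- 1500                         # hypothetical MSP
ratios <- prices / msp
median(ratios)   # median ratio: 1.1, i.e. typically 10% above MSP
mean(ratios > 1) # proportion above MSP ("how often"): 0.5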
# read the MSP file and normalise commodity names to lowercase
msp_data<- read.csv("CMO_MSP_Mandi.csv", header = TRUE)
msp_data$commodity<- tolower(msp_data$commodity)
temp_df<- aggregate(cbind(min_price, modal_price, max_price)~Commodity+APMC+Year, data=subset_data, FUN=as.vector)
# function for calculating the proportion of ratios greater than 1.
ratio<- function(x, y, z, year, commodity){
MSP<- msp_data$msprice[msp_data$year==year & msp_data$commodity==commodity]
prices<- c(x,y,z)
if (length(MSP)==0){
return(NA)
}else{
ratios<- prices/MSP
return(sum(ratios>1)/length(ratios))
}
}
# function for calculating the median of ratios.
medianratio<- function(x, y, z, year, commodity){
MSP<- msp_data$msprice[msp_data$year==year & msp_data$commodity==commodity]
prices<- c(x,y,z)
if (length(MSP)==0){
return(NA)
}else{
ratios<- prices/MSP
return(median(ratios))
}
}
temp_df$howoften <- apply(temp_df[,c("min_price","Year","Commodity","modal_price","max_price")], 1, function(x) ratio(x["min_price"][[1]],x["modal_price"][[1]],x["max_price"][[1]],x["Year"],x["Commodity"]))
temp_df$median_ratio <- apply(temp_df[,c("min_price","Year","Commodity","modal_price","max_price")], 1, function(x) medianratio(x["min_price"][[1]],x["modal_price"][[1]],x["max_price"][[1]], x["Year"],x["Commodity"]))
# function to get msp for a given year and com
get_msp<- function(year, commodity){
MSP<- msp_data$msprice[msp_data$year==year & msp_data$commodity==commodity]
if (length(MSP)==0){
return(NA)
}else{
return(MSP)
}
}
# making a column of MSPs in temp_df
temp_df$minsupprice <- apply(temp_df[,c("Year","Commodity")], 1, function(x) get_msp(x["Year"],x["Commodity"]))
temp_df<- (as.data.frame(temp_df))
# ranking by median
datatable(temp_df[order(temp_df$median_ratio, decreasing = TRUE), c("Commodity","APMC","Year","minsupprice","median_ratio")], extensions = 'AutoFill', options = list(autoFill = TRUE))
The table above gives an interesting insight. The median ratio can be interpreted as follows: from row 1, rice(paddy-husked) was on average priced 5.3 times the minimum support price in APMC Mumbai in 2014, whereas in 2015 and 2016 its prices were 3.96 and 3.76 times the MSP respectively. In APMC Pune it was 4.2 times the MSP in 2014, and maize prices in Udgir were on average 3.7 times the MSP in 2016. And so on…
The same table can be sorted by median_ratio in ascending order to find the commodities and mandis where prices are lower than the MSP.
datatable(temp_df[order(temp_df$median_ratio, decreasing = FALSE), c("Commodity","APMC","Year","minsupprice","median_ratio")][10:100,], extensions = 'AutoFill', options = list(autoFill = TRUE))
This part of the table is dominated by commodities like Sun flower, Sesamum, sorgum(jawar) and paddy(unhusked); their prices are much lower than the MSPs in various APMCs.
# Ranking by proportion (of ratios > 1)
datatable(temp_df[order(temp_df$howoften, decreasing = F), c("Commodity","APMC","Year","minsupprice","howoften")][550:1000,], extensions = 'AutoFill', options = list(autoFill = TRUE))
The howoften feature in the table above was calculated as follows: the prices of a commodity in an APMC were divided by the respective MSP, and then the proportion of these ratios greater than one was calculated. This proportion, reported in the last column, tells us in what fraction of months a commodity was sold above the MSP in a given mandi and year. For example, in Nanded only 2.7% of the prices reported for sorgum(jawar) were above the MSP in 2015.
# function for testing normality
normality_test<- function(x,y,z){
prices<- c(x,y,z)
if(sd(prices)==0 || length(prices)<3){ # shapiro.test needs at least 3 non-constant values
return(NA)
}else{
pval <- shapiro.test(prices)$p.value
if(pval<=0.05){
result<- "Not normal"
}else{
result<- "Normal"
}
return(result)
}
}
temp_df$normal <- apply(temp_df[,c("min_price","modal_price","max_price")], 1, function(x) normality_test(x["min_price"][[1]],x["modal_price"][[1]],x["max_price"][[1]]))
dim(temp_df)[1]; dim(temp_df[temp_df$normal=="Normal", ])[1]
## [1] 10449
## [1] 7588
A test of normality was necessary because a one-sample t-test was employed to check whether there is a significant difference between the MSP and the prices of a commodity in an APMC for a given year, and the key assumption of the t-test is that the sample is drawn from a normal population.
Most of the price series were found to be normal (7588 out of 10449), so we can go ahead with significance testing.
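The test itself is an ordinary one-sample t-test of the pooled prices against the MSP; a minimal sketch on simulated (hypothetical) numbers:
set.seed(7)
prices <- rnorm(27, mean = 2200, sd = 150)                   # simulated, approximately normal prices
t.test(prices, mu = 2000, alternative = "two.sided")$p.value # H0: mean price equals an MSP of 2000; tiny p-value here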
# one-sample t-test: is there a significant difference between the prices and the MSP?
df_comp_msp <- list()
k <- 0 # running index into the result list, so entries are never overwritten
for (i in 1:nrow(msp_data)){
row<- msp_data[i, ]
year <- row[["year"]]
com <- row[["commodity"]]
msprice <- row[["msprice"]]
if(is.na(msprice)) next
if (sum(temp_df$Commodity==com & temp_df$Year==year)!=0){
allapmc <- unique(as.vector(temp_df$APMC[temp_df$Commodity==com & temp_df$Year==year]))
for (j in 1:length(allapmc)){
min_prices <- temp_df$min_price[temp_df$Commodity==com & temp_df$Year==year & temp_df$APMC==allapmc[j]][[1]]
modal_prices <- temp_df$modal_price[temp_df$Commodity==com & temp_df$Year==year & temp_df$APMC==allapmc[j]][[1]]
max_prices <- temp_df$max_price[temp_df$Commodity==com & temp_df$Year==year & temp_df$APMC==allapmc[j]][[1]]
prices <- c(min_prices, modal_prices, max_prices)
# skip series that are too short or constant, for which the t-test is undefined
if (length(min_prices)<2 || length(max_prices)<2 || sd(min_prices)==0 || sd(modal_prices)==0 || sd(max_prices)==0) next
pvalue <- t.test(prices, mu=msprice, alternative = "two.sided")$p.value
k <- k + 1
df_comp_msp[[k]] <- data.frame(APMC=allapmc[j], Commodity=com, Year=year, MSP=msprice, pval=pvalue)
# a list of data frames storing all results
}
}
}
result<- ldply(df_comp_msp, rbind) # then concatenate all data frames
datatable(result[result$pval<0.05, ], extensions = 'AutoFill', options = list(autoFill = TRUE))
The table above shows which commodities' prices, in which APMC and year, differ significantly from the corresponding Minimum Support Price.
To measure the variation or fluctuation in a sample of prices, the coefficient of variation (CV) was chosen. Standard deviation or variance would also be a reasonable metric, but neither is dimensionless, so the fluctuations in potato prices could not be compared with the fluctuations in, say, avocado prices. The CV, on the other hand, is dimensionless and comparable across measures that differ widely in location and scale.
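A quick illustration of why the CV, unlike the standard deviation, is comparable across scales (toy numbers):
x <- c(10, 12, 9, 11)        # per-kg prices of a cheap commodity (hypothetical)
y <- 100 * x                 # the same relative spread at 100 times the scale
sd(x); sd(y)                 # standard deviations differ by a factor of 100
sd(x)/mean(x); sd(y)/mean(y) # coefficients of variation are identical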
cvariation <- function(x,y,z){
prices <- c(x,y,z)
return(sd(prices)/mean(prices))
}
temp_df$coeff_variation <- apply(temp_df[,c("min_price","modal_price","max_price")], 1, function(x) cvariation(x["min_price"][[1]],x["modal_price"][[1]],x["max_price"][[1]]))
datatable(temp_df[order(temp_df$coeff_variation, decreasing = T), c("Commodity","APMC","Year","coeff_variation")], extensions = 'AutoFill', options = list(autoFill = TRUE))
The mandis/APMCs and commodities with the highest price fluctuations in each year are given in the table above, in descending order of CV.