Set working directory

Read Dataset

wpi <- read.csv("Wholesal_price_index.csv")

Load Libraries

library(tseries)
library(Hmisc)
library(psych)
library(ggplot2)

View the Structure of the dataset

str(wpi)
## 'data.frame':    113 obs. of  2 variables:
##  $ Monthly              : chr  "2012M04" "2012M05" "2012M06" "2012M07" ...
##  $ wholesale.price.index: num  105 105 105 106 107 ...
View(wpi)

ChecK for missing values

missing_row_index <- which(is.na(wpi$wholesale.price.index))
print(missing_row_index)
## [1] 101

Calculate the median and imput missing value

median_wpi <- median(wpi$wholesale.price.index, na.rm = TRUE)

wpi$wholesale.price.index[is.na(wpi$wholesale.price.index)] <- median_wpi

print(wpi$wholesale.price.index)
##   [1] 104.7 105.3 105.3 106.2 106.9 107.6 107.4 107.3 107.1 108.0 108.4 108.6
##  [13] 108.6 108.6 110.1 111.2 112.9 114.3 114.6 114.3 113.4 113.6 113.6 114.3
##  [25] 114.1 114.8 115.2 116.7 117.2 116.4 115.6 114.1 112.1 110.8 109.6 109.9
##  [37] 110.2 111.4 111.8 111.1 110.0 109.9 110.1 109.9 109.4 108.0 107.1 107.7
##  [49] 109.0 110.4 111.7 111.8 111.2 111.4 111.5 111.9 111.7 112.6 113.0 113.2
##  [61] 113.2 112.9 112.7 113.9 114.8 114.9 115.6 116.4 115.7 116.0 116.1 116.3
##  [73] 117.3 118.3 119.1 119.9 120.1 120.9 122.0 121.6 119.7 119.2 119.5 119.9
##  [85] 121.1 121.6 121.5 121.3 121.5 121.3 122.0 122.3 123.0 123.4 122.2 120.4
##  [97] 119.2 117.5 119.3 121.0 114.3 122.9 123.6 125.1 125.4 126.5 128.1 129.9
## [109] 132.0 132.9 133.7 134.5 135.9

Create a Time series plot

wpi_numeric <- as.numeric(wpi$wholesale.price.index)

x <- ts(wpi_numeric, start = c(2012, 4), frequency = 12, end = c(2021, 8))

ts.plot(x, main = "Wholesale Price Index", xlab = "Year", ylab = "Percentage")

Identifer the outlier using a boxplot

boxplot(x)

boxplot.stats(x)
## $stats
## [1] 104.7 110.8 114.3 120.1 133.7
## 
## $n
## [1] 113
## 
## $conf
## [1] 112.9177 115.6823
## 
## $out
## [1] 134.5 135.9

Auto-correlation Function (ACF) AND Partial Correlation Function (PCF) Plot

acf(x, lag.max = 200, type = c("correlation", "covariance", "partial"), plot = TRUE, na.action = na.fail, demean = TRUE)

pacf(x, lag.max = 50)

adf.test(x)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x
## Dickey-Fuller = -1.1302, Lag order = 4, p-value = 0.9142
## alternative hypothesis: stationary

Non stationary to stationary differences

dx <- diff(x, 1)
ts.plot(dx)

adf.test(dx)
## Warning in adf.test(dx): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  dx
## Dickey-Fuller = -4.4256, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
-   *The Dickey-Fuller statistic (-4.4256) is calculated using the data differences (dx).*
-   *The lag order, or the number of lags utilized in the test, is four.*
-   *The p-value for the test is reported as 0.01.*

Create training dataset

x1<-x[1:110]
x1
##   [1] 104.7 105.3 105.3 106.2 106.9 107.6 107.4 107.3 107.1 108.0 108.4 108.6
##  [13] 108.6 108.6 110.1 111.2 112.9 114.3 114.6 114.3 113.4 113.6 113.6 114.3
##  [25] 114.1 114.8 115.2 116.7 117.2 116.4 115.6 114.1 112.1 110.8 109.6 109.9
##  [37] 110.2 111.4 111.8 111.1 110.0 109.9 110.1 109.9 109.4 108.0 107.1 107.7
##  [49] 109.0 110.4 111.7 111.8 111.2 111.4 111.5 111.9 111.7 112.6 113.0 113.2
##  [61] 113.2 112.9 112.7 113.9 114.8 114.9 115.6 116.4 115.7 116.0 116.1 116.3
##  [73] 117.3 118.3 119.1 119.9 120.1 120.9 122.0 121.6 119.7 119.2 119.5 119.9
##  [85] 121.1 121.6 121.5 121.3 121.5 121.3 122.0 122.3 123.0 123.4 122.2 120.4
##  [97] 119.2 117.5 119.3 121.0 114.3 122.9 123.6 125.1 125.4 126.5 128.1 129.9
## [109] 132.0 132.9

ARIMA Model

result <- arima(x1, order = c(1,1,0))
tsdiag(result)

The ARIMA model aids in understanding and predicting wholesale pricing variations over time. The diagnostic plots tell us the following:

While my ARIMA model provides a fair overall picture of how wholesale prices vary over time, these plots show that it may overlook some minor subtleties. This suggests that we may need to modify our model to produce more accurate forecasts.

Testing the model

predict(result, 3)
## $pred
## Time Series:
## Start = 111 
## End = 113 
## Frequency = 1 
## [1] 132.8450 132.8483 132.8481
## 
## $se
## Time Series:
## Start = 111 
## End = 113 
## Frequency = 1 
## [1] 1.373933 1.884582 2.286675
x[111:113]
## [1] 133.7 134.5 135.9

Forecasting Wholesale Price Index

result_1 <- arima (x, order = c(1,1,0))
predict(result_1, 3)
## $pred
##           Sep      Oct      Nov
## 2021 135.8319 135.8352 135.8350
## 
## $se
##           Sep      Oct      Nov
## 2021 1.366955 1.886703 2.293286
x2 <- x [1:113]
plot (x2, main = "Wholesale Price Index", xlab = "Year", ylab = "Percentages")
forecast = predict(result_1, n.ahead = 5)
lines(114:118, forecast$pred, type = "o", col="red")