library(forecast)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tseries)
library(fpp)
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
base_data<-read.csv('D:/Freelancer_questions/kevin/1134769745_daily_with_workdays_27.csv')
colnames(base_data)<-c("Date","Temperature","Humidity","Windspeed","Count","All_workdays")
base_data<-base_data[order(as.Date(base_data$Date, format="%d/%m/%Y")),]
base_data<-base_data[order(as.Date(base_data$Date, format="%d/%m/%Y")),]
base_data$All_workdays<-as.factor(base_data$All_workdays)
head(base_data)
#ARIMA programming starts
## 75% of the sample size
smp_size <- floor(0.95 * nrow(base_data))
print(smp_size)
## [1] 1033
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(base_data)), size = smp_size)
train <- base_data[1:smp_size, ]
test <- base_data[smp_size+1:nrow(base_data), ]
test<-na.omit(test)
xreg <- cbind(as_workday=model.matrix(~train$All_workdays),
Temp=train$Temperature,
Humidity=train$Humidity,
Windspeed=train$Windspeed
)
# Remove intercept
xreg <- xreg[,-1]
# Rename columns
colnames(xreg) <- c("All_workdays","Temp","Humidity","Windspeed")
#creating the same for the test data
xreg1 <- cbind(as_workday=model.matrix(~test$All_workdays),
Temp=test$Temperature,
Humidity=test$Humidity,
Windspeed=test$Windspeed
)
# Remove intercept
xreg1 <- xreg1[,-1]
# Rename columns
colnames(xreg1) <- c("All_workdays","Temp","Humidity","Windspeed")
Count <- ts(train$Count, start=c(2016.6),frequency=365)
Inference : As the data is for each day the frequency will be 365 and the start date is 2016-7-7
modArima <- auto.arima(Count, xreg=xreg)
modArima
## Series: Count
## Regression with ARIMA(1,1,1) errors
##
## Coefficients:
## ar1 ma1 All_workdays Temp Humidity Windspeed
## 0.1945 -0.8571 36.8114 -0.8123 -1.2827 -24.4085
## s.e. 0.0380 0.0198 7.6259 2.0231 0.2886 6.3385
##
## sigma^2 estimated as 13801: log likelihood=-6380.6
## AIC=12775.2 AICc=12775.31 BIC=12809.78
Forecasted_values<-forecast(modArima,nrow(test),xreg=xreg1)
Final_forecasted_values<-Forecasted_values$mean
length(Final_forecasted_values)
## [1] 55
mean((test$Count - Final_forecasted_values)^2)
## [1] 51860.74
library(ggplot2)
forecast(modArima,nrow(test),xreg=xreg1) -> fc
autoplot(Count, series="Data") +
autolayer(fc, series="Forecast") +
autolayer(fitted(fc), series="Fitted")
decompose_data = decompose(Count, "additive")
adjust_count = Count - decompose_data$seasonal
plot(adjust_count)
### Find ARIMAX model of deseasoned data
modArima_desea <- auto.arima(adjust_count, xreg=xreg)
modArima_desea
## Series: adjust_count
## Regression with ARIMA(3,1,1) errors
##
## Coefficients:
## ar1 ar2 ar3 ma1 All_workdays Temp Humidity
## 0.2376 0.0552 -0.0288 -0.9219 33.6719 -2.7078 -0.8318
## s.e. 0.0360 0.0346 0.0343 0.0184 7.0878 1.7676 0.2657
## Windspeed
## -16.5851
## s.e. 5.7804
##
## sigma^2 estimated as 11553: log likelihood=-6288.04
## AIC=12594.09 AICc=12594.26 BIC=12638.54
Forecasted_values_des<-forecast(modArima_desea,nrow(test),xreg=xreg1)
Final_forecasted_values_des<-Forecasted_values_des$mean
library(ggplot2)
forecast(modArima_desea,nrow(test),xreg=xreg1) -> fc1
autoplot(Count, series="Data") +
autolayer(fc1, series="Forecast") +
autolayer(fitted(fc1), series="Fitted")
mean((test$Count - Final_forecasted_values_des)^2)
## [1] 36754.14
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
x<-train[order(as.Date(train$Date, format="%d/%m/%Y")),]
train_aug <- x %>%
mutate(count_lag1 = lag(Count, n = 1, order_by = Date),
count_lag2 = lag(Count, n = 2, order_by = Date),
temp_lag1 = lag(Temperature, n = 1, order_by = Date),
temp_lag2 = lag(Temperature, n = 2, order_by = Date),
Hum_lag1 = lag(Humidity, n = 1, order_by = Date),
Hum_lag2 = lag(Humidity, n = 2, order_by = Date),
Wind_lag1 = lag(Windspeed, n = 1, order_by = Date),
Wind_lag2 = lag(Windspeed, n = 2, order_by = Date)
)
x1<-test[order(as.Date(test$Date, format="%d/%m/%Y")),]
test_aug <- x1 %>%
mutate(count_lag1 = lag(Count, n = 1, order_by = Date),
count_lag2 = lag(Count, n = 2, order_by = Date),
temp_lag1 = lag(Temperature, n = 1, order_by = Date),
temp_lag2 = lag(Temperature, n = 2, order_by = Date),
Hum_lag1 = lag(Humidity, n = 1, order_by = Date),
Hum_lag2 = lag(Humidity, n = 2, order_by = Date),
Wind_lag1 = lag(Windspeed, n = 1, order_by = Date),
Wind_lag2 = lag(Windspeed, n = 2, order_by = Date)
)
my_lm <- lm(Count ~ count_lag1 + count_lag2 + temp_lag1 + temp_lag2 +Hum_lag1+Hum_lag2+Wind_lag1+Wind_lag2,data = train_aug[3:nrow(train_aug), ])
summary(my_lm)
##
## Call:
## lm(formula = Count ~ count_lag1 + count_lag2 + temp_lag1 + temp_lag2 +
## Hum_lag1 + Hum_lag2 + Wind_lag1 + Wind_lag2, data = train_aug[3:nrow(train_aug),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -614.77 -130.09 9.72 132.51 1165.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 294.09214 47.59342 6.179 9.29e-10 ***
## count_lag1 -0.24892 0.03336 -7.463 1.81e-13 ***
## count_lag2 -0.11772 0.03275 -3.595 0.00034 ***
## temp_lag1 20.62815 1.78986 11.525 < 2e-16 ***
## temp_lag2 8.43898 1.90639 4.427 1.06e-05 ***
## Hum_lag1 0.48761 0.38250 1.275 0.20267
## Hum_lag2 0.52731 0.38248 1.379 0.16829
## Wind_lag1 17.31211 8.33838 2.076 0.03813 *
## Wind_lag2 -6.19091 8.35632 -0.741 0.45895
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 193.3 on 1020 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2286, Adjusted R-squared: 0.2225
## F-statistic: 37.78 on 8 and 1020 DF, p-value: < 2.2e-16
Inference: Keeping only the significant variables where the P value is <0.05 and removing the other variables
My_lm_final <-lm(Count ~ count_lag1 + count_lag2 + temp_lag1 + temp_lag2 , data = train_aug[3:nrow(train_aug), ])
summary(My_lm_final)
##
## Call:
## lm(formula = Count ~ count_lag1 + count_lag2 + temp_lag1 + temp_lag2,
## data = train_aug[3:nrow(train_aug), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -594.78 -131.38 5.66 134.94 1135.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 378.78521 29.68109 12.762 < 2e-16 ***
## count_lag1 -0.27122 0.03221 -8.420 < 2e-16 ***
## count_lag2 -0.11323 0.03173 -3.568 0.000376 ***
## temp_lag1 21.15650 1.71722 12.320 < 2e-16 ***
## temp_lag2 8.41413 1.83701 4.580 5.21e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 193.8 on 1024 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.222, Adjusted R-squared: 0.2189
## F-statistic: 73.04 on 4 and 1024 DF, p-value: < 2.2e-16
predicted_Dynm<-predict(My_lm_final,newdata = test_aug)
test_aug$Predicted<-predicted_Dynm
test_aug_1<-na.omit(test_aug)
# Mean Square error for the dynamic regression#
mean((test_aug_1$Count - test_aug_1$Predicted)^2)
## [1] 28639.19
plot(test_aug_1$Count,test_aug_1$Predicted,
xlab="predicted",ylab="actual")
abline(a=0,b=1)