setwd("C:/Users/obunadic8159/OneDrive - ARCADIS/Desktop/Data_Analysis_11DCE_14DOX/Recent_New_Data/MW")
#Load necessary libraries
#install.packages("car")
#install.packages("corrplot")
#install.packages("MASS")
#install.packages("lmtest")
library(MASS)
library(dplyr)
library(ggplot2)
library(caret)
library(car)
library(corrplot)
library(e1071)
library(forecast)
library(lubridate)
library(lmtest)Linear Regression and Time Series Model_MW for 1,1-Dichloroethene
Importing the Data from our directory
# Load the dataset
data <- read.csv("C:/Users/obunadic8159/OneDrive - ARCADIS/Desktop/Data_Analysis_11DCE_14DOX/Recent_New_Data/MW/11DCE.csv")Visualizing the Data Type and Structure
##################Checking the Data Type and Structure ###############
head(data) SiteName Date Quarter X11DCE
1 DMW13 11/30/2001 Q4 5e-04
2 DMW13 5/29/2002 Q2 5e-04
3 DMW13 11/6/2002 Q4 5e-04
4 DMW13 9/23/2003 Q3 5e-04
5 DMW13 5/20/2004 Q2 5e-04
6 DMW13 4/27/2005 Q2 5e-04
#str(data)Convert Date to Date format and factorizing Site Name
# Convert Date to Date format and factorize SiteName
data$Date <- as.Date(data$Date, format="%m/%d/%Y")
data$SiteName <- as.factor(data$SiteName)Checking for Missing data across the data using For-Loop
#### Handling Missing Data
vnames <- colnames(data)
n <- nrow(data)
out <- NULL
for (j in 1:ncol(data)){
vname <- colnames(data)[j]
x <- as.vector(data[,j])
n1 <- sum(is.na(x), na.rm=TRUE) # NA
n2 <- sum(x=="NA", na.rm=TRUE) # "NA"
n3 <- sum(x==" ", na.rm=TRUE) # missing
nmiss <- n1 + n2 + n3
nmiss <- sum(is.na(x))
ncomplete <- n-nmiss
out <- rbind(out, c(col.num=j, v.name=vname, mode=mode(x),
n.level=length(unique(x)),
ncom=ncomplete, nmiss= nmiss, miss.prop=nmiss/n))
}
out <- as.data.frame(out)
row.names(out) <- NULL
out col.num v.name mode n.level ncom nmiss miss.prop
1 1 SiteName character 67 1247 0 0
2 2 Date numeric 173 1247 0 0
3 3 Quarter character 4 1247 0 0
4 4 X11DCE numeric 616 1247 0 0
#for (j in 1:NCOL(data)){
#print(head(colnames(data)[j]))
#print(head(table(data[,j], useNA="ifany")))
#}Checking for Outliers using Box-plot
# Check for outliers using boxplots for DCE
ggplot(data, aes(x = SiteName, y = X11DCE)) + geom_boxplot() + ggtitle("Boxplot for DCE by SiteName")Calculating and Filtering Outliers in 11DCE
# Calculate IQR and detect outliers for X11DCE
Q1_DCE <- quantile(data$X11DCE, 0.25)
Q3_DCE <- quantile(data$X11DCE, 0.75)
IQR_DCE <- Q3_DCE - Q1_DCE
outliers_DCE <- data %>% filter(X11DCE < (Q1_DCE - 1.5 * IQR_DCE) | X11DCE > (Q3_DCE + 1.5 * IQR_DCE))There are 148 observations of 1,1Dichloroethene (DCE) with outliers. Therefore, it is important to remove them before building the linear regression model and time series forecast.
# Outliers in 11DCE
#print(outliers_DCE)
#print("Outliers in X11DCE:")
print(head(outliers_DCE)) SiteName Date Quarter X11DCE
1 DMW37 2020-01-23 Q1 0.0621
2 DMW43 2023-01-17 Q1 0.0696
3 DMW43 2024-01-22 Q1 0.1050
4 MW01 1995-06-22 Q2 0.1400
5 MW01 1996-04-11 Q2 0.0688
6 MW01 1998-05-19 Q2 0.1490
Checking for skewness in 1,1-Dichloroethene
[1] "Skewness for X11DCE: 13.4298884110363"
Interpretation of Skewness Values:
Skewness for 11DCE (13.4):This is also a high positive skewness value. The distribution of 11DCE is positively skewed, with most data points clustered towards the lower end and a long tail to the right. This indicates the presence of some high values. The high skewness values suggest that the data for 11DCE are not normally distributed and are influenced by a few very large values. When working with these variables in analyses, it might be necessary to consider transformations (such as log or square root transformations) to reduce skewness and achieve a more normal distribution, which can be beneficial for certain statistical analyses and models.~
# Visualize distributions
par(mfrow=c(1, 2))
hist(data$X11DCE, main="11DCE", xlab="11DCE", col = 'red', breaks=20)Transforming data due to skewness by using log transformation.
# Transform data if skewness is high (optional, example using log transformation)
if(abs(skewness_DCE) > 1){data$X11DCE <- log1p(data$X11DCE)}Removing the Outliers for 11DCE.
# Remove outliers (optional, depending on analysis)
data <- data %>% filter(X11DCE >= (Q1_DCE - 1.5 * IQR_DCE) & X11DCE <= (Q3_DCE + 1.5 * IQR_DCE))
##################Checking the Data Type and Structure ###############
#str(data)
head(data) SiteName Date Quarter X11DCE
1 DMW13 2001-11-30 Q4 0.000499875
2 DMW13 2002-05-29 Q2 0.000499875
3 DMW13 2002-11-06 Q4 0.000499875
4 DMW13 2003-09-23 Q3 0.000499875
5 DMW13 2004-05-20 Q2 0.000499875
6 DMW13 2005-04-27 Q2 0.000499875
Convert Date to numeric for VIF calculation
data$Date_numeric <- as.numeric(data$Date)Check for multicollinearity using VIF
# Check for multicollinearity using VIF
vif_data <- lm(X11DCE ~ Date_numeric + SiteName, data = data)
vif(vif_data) GVIF Df GVIF^(1/(2*Df))
Date_numeric 2.905404 1 1.704525
SiteName 2.905404 66 1.008113
In this case, both Date_numeric and SiteName have GVIF values close to 1, indicating that there is little to no multicollinearity between these predictors in the model. This suggests that the model is not adversely affected by multicollinearity, and the estimates of the regression coefficients should be reliable.
Check correlation matrix for predictor variable 11DCE and Date
1: Perfect positive correlation (as one variable increases, the other variable also increases).
-1: Perfect negative correlation (as one variable increases, the other variable decreases).
0: No linear correlation (the variables do not have a linear relationship).
# Check correlation matrix for predictor and dependent variables
cor_matrix <- cor(data %>% select(X11DCE, Date_numeric))
print(cor_matrix) X11DCE Date_numeric
X11DCE 1.0000000 -0.2333438
Date_numeric -0.2333438 1.0000000
Implications of Correlation Matrix on the Model:
The correlation results suggest that there is almost no linear relationship between X11DCE and Date_numeric. This low correlation indicates that Date_numeric is not a strong predictor of X11DCE based on linear association alone.
# Visualize the correlation matrix
corrplot(cor_matrix, method = "circle")Removing the 5th column (Date Numeric) for Linear Regression Model
# Convert Date back to as.Date data type and removing the 5th column
data$Date <- as.Date(data$Date, format="%m/%d/%Y")
#head(data)
#colnames(data)
data= data[, -(5)]
head(data) SiteName Date Quarter X11DCE
1 DMW13 2001-11-30 Q4 0.000499875
2 DMW13 2002-05-29 Q2 0.000499875
3 DMW13 2002-11-06 Q4 0.000499875
4 DMW13 2003-09-23 Q3 0.000499875
5 DMW13 2004-05-20 Q2 0.000499875
6 DMW13 2005-04-27 Q2 0.000499875
Splitting the data into training and testing sets for the Linear Regression Modelling of 11DCE
# Split the data into training and testing sets for X11DCE
set.seed(123)
trainIndex <- createDataPartition(data$X11DCE, p = .8, list = FALSE, times = 1)
trainData <- data[trainIndex,]
testData <- data[-trainIndex,]This line uses the createDataPartition function from the caret package to create an index for splitting the data.X11DCE: The target variable, which is 1,1 Dicholorethene, is used to ensure that the split maintains the same distribution of this variable in both the training and test sets.p = .8: This specifies that 80% of the data should be used for training. The remaining 20% will be used for testing.list = FALSE: By setting this to FALSE, the function returns the indices as a vector instead of a list.times = 1: This specifies that only one partition should be created.
Using set.seed(123) ensures that every time you run this code, the training and test splits will be the same, allowing for consistent and reproducible results.
Factoring SiteName in testData to Match with the trainData (Categorical Variable)
# Ensure factor levels in test set match training set
testData$SiteName <- factor(testData$SiteName, levels = levels(trainData$SiteName))The above code ensures that the SiteName factor levels in the testData dataset match those in the trainData dataset. This step is crucial when you want to make predictions on the test data using a model trained on the training data, especially when dealing with categorical variables.
Linear Regression Model and StepWise Backward Elimination Model Selection
# Model for X11DCE using stepwise backward elimination
full_model_DCE <- lm(X11DCE ~ Date + SiteName, data = trainData)
step_model_DCE <- step(full_model_DCE, direction = "backward", trace = 0)Purpose: The goal of step-wise model selection is to improve the model by removing predictors that do not contribute significantly to the prediction of the response variable. This can lead to a more parsimonious model that is easier to interpret and may perform better on new data.
Backward Elimination: In this specific procedure, predictors are removed one by one based on their statistical significance, starting with the least significant predictor. The process continues until only predictors that contribute meaningfully to the model remain.
Summary and Interpretation of the Linear Regression Model
# Summary of the final model
summary(step_model_DCE)
Call:
lm(formula = X11DCE ~ Date + SiteName, data = trainData)
Residuals:
Min 1Q Median 3Q Max
-0.041859 -0.003597 -0.000210 0.002353 0.037380
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.132e-02 3.989e-03 10.359 < 2e-16 ***
Date -2.552e-06 2.183e-07 -11.693 < 2e-16 ***
SiteNameDMW28 2.705e-03 3.171e-03 0.853 0.393943
SiteNameDMW29 2.393e-03 3.252e-03 0.736 0.462065
SiteNameDMW30 1.950e-02 3.351e-03 5.819 8.49e-09 ***
SiteNameDMW31 2.465e-02 2.990e-03 8.244 6.64e-16 ***
SiteNameDMW32 2.957e-03 3.461e-03 0.854 0.393200
SiteNameDMW33 2.793e-03 3.349e-03 0.834 0.404547
SiteNameDMW34 2.787e-03 2.925e-03 0.953 0.341010
SiteNameDMW35 2.550e-03 3.348e-03 0.762 0.446384
SiteNameDMW36 2.755e-03 3.171e-03 0.869 0.385163
SiteNameDMW37 1.479e-02 2.988e-03 4.948 9.13e-07 ***
SiteNameDMW38 3.567e-02 3.034e-03 11.758 < 2e-16 ***
SiteNameDMW39 2.710e-03 3.254e-03 0.833 0.405195
SiteNameDMW40 2.864e-03 3.589e-03 0.798 0.425212
SiteNameDMW41 2.959e-03 3.461e-03 0.855 0.392894
SiteNameDMW42 4.400e-03 3.047e-03 1.444 0.149133
SiteNameDMW43 2.278e-02 3.102e-03 7.346 4.97e-13 ***
SiteNameDMW44 2.891e-03 3.254e-03 0.888 0.374540
SiteNameMW01 6.646e-03 3.418e-03 1.945 0.052160 .
SiteNameMW02 2.476e-02 3.118e-03 7.942 6.62e-15 ***
SiteNameMW03 -4.883e-03 2.599e-03 -1.879 0.060639 .
SiteNameMW04 1.417e-02 2.914e-03 4.861 1.40e-06 ***
SiteNameMW05 5.907e-03 4.281e-03 1.380 0.168071
SiteNameMW06 1.312e-02 2.956e-03 4.437 1.04e-05 ***
SiteNameMW07 -9.755e-03 2.976e-03 -3.278 0.001091 **
SiteNameMW08 -5.071e-03 2.629e-03 -1.929 0.054062 .
SiteNameMW09 9.012e-03 2.877e-03 3.132 0.001798 **
SiteNameMW10 -8.612e-04 2.948e-03 -0.292 0.770233
SiteNameMW11 1.442e-02 2.788e-03 5.172 2.92e-07 ***
SiteNameMW12 6.539e-03 3.291e-03 1.987 0.047287 *
SiteNameMW15 2.252e-02 3.938e-03 5.717 1.52e-08 ***
SiteNameMW15R 1.349e-02 3.364e-03 4.010 6.63e-05 ***
SiteNameMW16 7.479e-03 2.914e-03 2.566 0.010453 *
SiteNameMW17 3.267e-03 2.914e-03 1.121 0.262563
SiteNameMW18 2.161e-02 2.717e-03 7.953 6.09e-15 ***
SiteNameMW19 4.686e-04 2.868e-03 0.163 0.870239
SiteNameMW20 1.908e-02 2.873e-03 6.640 5.74e-11 ***
SiteNameMW21 3.024e-04 2.867e-03 0.105 0.916033
SiteNameMW22 7.104e-03 3.096e-03 2.294 0.022023 *
SiteNameMW23 3.324e-02 3.611e-03 9.207 < 2e-16 ***
SiteNameMW24 7.631e-03 2.884e-03 2.646 0.008309 **
SiteNameMW25 9.173e-03 2.884e-03 3.181 0.001523 **
SiteNameMW26 4.347e-02 3.464e-03 12.551 < 2e-16 ***
SiteNameMW27 1.343e-02 2.880e-03 4.664 3.63e-06 ***
SiteNameMW45 3.102e-03 3.943e-03 0.787 0.431655
SiteNameMW46 5.039e-03 3.477e-03 1.449 0.147614
SiteNameMW47 7.367e-03 3.184e-03 2.314 0.020934 *
SiteNameMW48 1.088e-02 2.847e-03 3.823 0.000142 ***
SiteNameMW49 3.086e-03 4.185e-03 0.737 0.461198
SiteNameMW50 3.462e-03 4.506e-03 0.768 0.442532
SiteNameMW51 3.953e-03 3.949e-03 1.001 0.317083
SiteNameMW52 2.320e-02 3.274e-03 7.087 2.97e-12 ***
SiteNameMW53 7.955e-03 3.260e-03 2.440 0.014880 *
SiteNameMW54 3.396e-02 3.598e-03 9.439 < 2e-16 ***
SiteNameMW55 1.064e-02 3.266e-03 3.257 0.001174 **
SiteNameMW56 3.536e-02 6.744e-03 5.243 2.02e-07 ***
SiteNameMW57 9.741e-03 4.200e-03 2.320 0.020612 *
SiteNameMW58 6.198e-03 3.619e-03 1.713 0.087155 .
SiteNameMW59 1.037e-02 4.197e-03 2.471 0.013691 *
SiteNameMW60 8.329e-03 3.974e-03 2.096 0.036384 *
SiteNameMW61 1.000e-02 4.200e-03 2.381 0.017508 *
SiteNameMW62 1.047e-02 3.619e-03 2.893 0.003916 **
SiteNameMW63 1.109e-02 3.613e-03 3.070 0.002214 **
SiteNameSMW30 6.730e-03 3.254e-03 2.068 0.038926 *
SiteNameSMW43 3.015e-02 2.985e-03 10.098 < 2e-16 ***
SiteNameTC01 8.136e-03 3.945e-03 2.062 0.039519 *
SiteNameTC10 5.973e-03 3.459e-03 1.727 0.084527 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.009111 on 814 degrees of freedom
Multiple R-squared: 0.5914, Adjusted R-squared: 0.5578
F-statistic: 17.59 on 67 and 814 DF, p-value: < 2.2e-16
Residuals: Min ( -0.041859), 1Q(-0.003597), Median(-0.000210), 3Q(0.002353) and Max(0.037380) shows the distribution of the residuals (difference between observed and predicted values). The small values of the residuals indicate that the model’s prediction are close to the observed values.
Date: The coefficient for Date is significant at 5% level (p<0.05), suggesting that there is a relationship between Date and XIIDCE.
Significant SiteName Coefficients:
SiteNameDMW30: p=8.49×10−9
SiteNameDMW31: p=6.64×10−16
SiteNameDMW37: p=9.13×10−7
SiteNameDMW38: p<2×10−16
SiteNameDMW43: p=4.97×10−13
SiteNameMW02: p=6.62×10−15
SiteNameMW04: p=1.40×10−6
SiteNameMW06: p=1.04×10−5
SiteNameMW07: p=0.001091
SiteNameMW09: p=0.001798
SiteNameMW11: p=2.92×10−7
SiteNameMW12: p=0.047287
SiteNameMW15: p=1.52×10−8
SiteNameMW15R: p=6.63×10−5
SiteNameMW16: p=0.010453
SiteNameMW18: p=6.09×10−15
SiteNameMW20: p=5.74×10−11
SiteNameMW22: p=0.022023
SiteNameMW23: p<2×10−16
SiteNameMW24: p=0.008309
SiteNameMW25: p=0.001523
SiteNameMW26: p<2×10−16
SiteNameMW27: p=3.63×10−6
SiteNameMW47: p=0.020934
SiteNameMW48: p=0.000142
SiteNameMW52: p=2.97×10−12
SiteNameMW53: p=0.014880
SiteNameMW54: p<2×10−16
SiteNameMW55: p=0.001174
SiteNameMW56: p=2.02×10−7
SiteNameMW57: p=0.020612
SiteNameMW59: p=0.013691
SiteNameMW60: p=0.036384
SiteNameMW61: p=0.017508
SiteNameMW62: p=0.003916
SiteNameMW63: p=0.002214
SiteNameSMW30: p=0.038926
SiteNameSMW43: p<2×10−16
SiteNameTC01: p=0.039519
These significant coefficients indicate that the corresponding SiteName levels have a statistically significant effect on X11DCE.
Model Fit Statistics
Residual standard error: 0.0091 on 814 degrees of freedom
Multiple R-squared (0.60): This indicates that approximately 60% of the variability in
X11DCEis explained by the model.Adjusted R-squared (0.56): This adjusts the R-squared value for the number of predictors in the model, providing a more accurate measure of model fit.
F-statistic: 17.59 on 67 and 1632 DF (p-value: < 2.2e-16): The F-statistic tests the overall significance of the model. A very low p-value (< 2.2e-16) indicates that the model is statistically significant.
Identifying High Leverage Points:
Leverage Values: These values indicate how much influence each data point has on the fitted values of the model. High leverage points are those that can potentially have a large impact on the model.
Data Points with High Leverage Points:
# Print high leverage points
#print(high_leverage_points)
print(head(high_leverage_points))396 397 399 400 401 402
313 314 315 316 317 318
print(tail(high_leverage_points))1019 1020 1021 1022 1023 1027
814 815 816 817 818 819
Removing the Data with High Leverage Points for Improved Model:
# Remove high leverage points and refit the model if necessary
trainData_cleaned <- trainData[-high_leverage_points, ]Refitting the Model after removing high leverage points:
# Refit the model without high leverage points
full_model_DCE_cleaned <- lm(X11DCE ~ Date + SiteName, data = trainData_cleaned)
step_model_DCE_cleaned <- step(full_model_DCE_cleaned, direction = "backward", trace = 0)Summary and Interpretation of the Re-fitted Linear Regression Model
# Summary of the final model without high leverage points
summary(step_model_DCE_cleaned)
Call:
lm(formula = X11DCE ~ Date + SiteName, data = trainData_cleaned)
Residuals:
Min 1Q Median 3Q Max
-0.041854 -0.003669 -0.000230 0.002492 0.037391
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.116e-02 4.019e-03 10.241 < 2e-16 ***
Date -2.543e-06 2.203e-07 -11.545 < 2e-16 ***
SiteNameDMW28 2.694e-03 3.181e-03 0.847 0.397314
SiteNameDMW29 2.383e-03 3.262e-03 0.731 0.465278
SiteNameDMW30 1.949e-02 3.361e-03 5.798 9.75e-09 ***
SiteNameDMW31 2.463e-02 2.999e-03 8.212 8.93e-16 ***
SiteNameDMW32 2.945e-03 3.472e-03 0.848 0.396567
SiteNameDMW33 2.782e-03 3.360e-03 0.828 0.407881
SiteNameDMW34 2.776e-03 2.934e-03 0.946 0.344445
SiteNameDMW35 2.540e-03 3.358e-03 0.757 0.449566
SiteNameDMW36 2.745e-03 3.181e-03 0.863 0.388538
SiteNameDMW37 1.477e-02 2.998e-03 4.926 1.02e-06 ***
SiteNameDMW38 3.566e-02 3.044e-03 11.717 < 2e-16 ***
SiteNameDMW39 2.699e-03 3.264e-03 0.827 0.408530
SiteNameDMW40 2.854e-03 3.601e-03 0.793 0.428279
SiteNameDMW41 2.947e-03 3.472e-03 0.849 0.396255
SiteNameDMW42 4.384e-03 3.057e-03 1.434 0.152006
SiteNameDMW43 2.277e-02 3.112e-03 7.319 6.21e-13 ***
SiteNameDMW44 2.880e-03 3.264e-03 0.882 0.377853
SiteNameMW01 6.678e-03 3.429e-03 1.947 0.051859 .
SiteNameMW02 2.479e-02 3.129e-03 7.925 7.85e-15 ***
SiteNameMW03 -4.860e-03 2.608e-03 -1.863 0.062778 .
SiteNameMW04 1.417e-02 2.924e-03 4.845 1.52e-06 ***
SiteNameMW06 1.315e-02 2.967e-03 4.434 1.06e-05 ***
SiteNameMW07 -9.714e-03 2.987e-03 -3.252 0.001196 **
SiteNameMW08 -5.042e-03 2.638e-03 -1.911 0.056320 .
SiteNameMW09 9.023e-03 2.887e-03 3.126 0.001839 **
SiteNameMW10 -8.190e-04 2.959e-03 -0.277 0.782000
SiteNameMW11 1.441e-02 2.797e-03 5.153 3.24e-07 ***
SiteNameMW12 6.579e-03 3.303e-03 1.992 0.046749 *
SiteNameMW15 2.252e-02 3.951e-03 5.701 1.68e-08 ***
SiteNameMW15R 1.347e-02 3.375e-03 3.992 7.17e-05 ***
SiteNameMW16 7.480e-03 2.924e-03 2.559 0.010694 *
SiteNameMW17 3.268e-03 2.923e-03 1.118 0.263995
SiteNameMW18 2.161e-02 2.726e-03 7.928 7.66e-15 ***
SiteNameMW19 4.667e-04 2.877e-03 0.162 0.871161
SiteNameMW20 1.907e-02 2.882e-03 6.616 6.84e-11 ***
SiteNameMW21 3.012e-04 2.877e-03 0.105 0.916646
SiteNameMW22 7.094e-03 3.106e-03 2.284 0.022636 *
SiteNameMW23 3.322e-02 3.622e-03 9.171 < 2e-16 ***
SiteNameMW24 7.618e-03 2.894e-03 2.632 0.008643 **
SiteNameMW25 9.159e-03 2.893e-03 3.166 0.001604 **
SiteNameMW26 4.346e-02 3.475e-03 12.508 < 2e-16 ***
SiteNameMW27 1.342e-02 2.889e-03 4.645 4.00e-06 ***
SiteNameMW45 3.090e-03 3.955e-03 0.781 0.434929
SiteNameMW46 5.021e-03 3.488e-03 1.439 0.150421
SiteNameMW47 7.350e-03 3.194e-03 2.301 0.021651 *
SiteNameMW48 1.087e-02 2.856e-03 3.805 0.000153 ***
SiteNameMW51 3.938e-03 3.961e-03 0.994 0.320519
SiteNameMW52 2.319e-02 3.285e-03 7.058 3.72e-12 ***
SiteNameMW53 7.941e-03 3.270e-03 2.428 0.015386 *
SiteNameMW54 3.395e-02 3.609e-03 9.405 < 2e-16 ***
SiteNameMW55 1.062e-02 3.277e-03 3.241 0.001241 **
SiteNameMW58 6.175e-03 3.631e-03 1.701 0.089355 .
SiteNameMW60 8.305e-03 3.987e-03 2.083 0.037575 *
SiteNameMW62 1.045e-02 3.630e-03 2.878 0.004117 **
SiteNameMW63 1.107e-02 3.625e-03 3.054 0.002335 **
SiteNameSMW30 6.719e-03 3.264e-03 2.058 0.039873 *
SiteNameSMW43 3.013e-02 2.995e-03 10.061 < 2e-16 ***
SiteNameTC01 8.122e-03 3.958e-03 2.052 0.040497 *
SiteNameTC10 5.964e-03 3.470e-03 1.719 0.086045 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.00914 on 784 degrees of freedom
Multiple R-squared: 0.5927, Adjusted R-squared: 0.5615
F-statistic: 19.01 on 60 and 784 DF, p-value: < 2.2e-16
Residuals: Min ( -0.0084612), 1Q(-0.0000823), Median(-0.0000014), 3Q(0.0000000) and Max(0.0122086) shows the distribution of the residuals (difference between observed and predicted values). The small values of the residuals indicate that the model’s prediction are close to the observed values.
Intercept: The intercept is statistically significant at the 5% level, indicating a small but significant base level of
X11DCEwhen allSiteNamefactors are zero (though in practice,SiteNamefactors wouldn’t be zero).
Significant SiteName Coefficients:
SiteNameDMW30: p=9.75×10−9
SiteNameDMW31: p=8.93×10−16
SiteNameDMW37: p=1.02×10−6
SiteNameDMW38: p<2×10−16
SiteNameDMW43: p=6.21×10−13
SiteNameMW02: p=7.85×10−15
SiteNameMW04: p=1.52×10−6
SiteNameMW06: p=1.06×10−5
SiteNameMW07: p=0.001196
SiteNameMW09: p=0.001839
SiteNameMW11: p=3.24×10−7
SiteNameMW12: p=0.046749
SiteNameMW15: p=1.68×10−8
SiteNameMW15R: p=7.17×10−5
SiteNameMW16: p=0.010694
SiteNameMW18: p=7.66×10−15
SiteNameMW20: p=6.84×10−11
SiteNameMW22: p=0.022636
SiteNameMW23: p<2×10−16
SiteNameMW24: p=0.008643
SiteNameMW25: p=0.001604
SiteNameMW26: p<2×10−16
SiteNameMW27: p=4.00×10−6
SiteNameMW47: p=0.021651
SiteNameMW48: p=0.000153
SiteNameMW52: p=3.72×10−12
SiteNameMW53: p=0.015386
SiteNameMW54: p<2×10−16
SiteNameMW55: p=0.001241
SiteNameMW60: p=0.037575
SiteNameMW62: p=0.004117
SiteNameMW63: p=0.002335
SiteNameSMW30: p=0.039873
SiteNameSMW43: p<2×10−16
SiteNameTC01: p=0.040497
These significant coefficients indicate that the corresponding SiteName levels have a statistically significant effect on X11DCE.
Model Fit Statistics
Residual standard error: 0.00914 on 784 degrees of freedom
Multiple R-squared (0.6): This indicates that approximately 60% of the variability in
X11DCEis explained by the model.Adjusted R-squared (0.56): This adjusts the R-squared value for the number of predictors in the model, providing a more accurate measure of model fit.
Conclusion:
The model explains a significant portion of the variance in X11DCE (R-squared = 60%). Several SiteName levels are significant, indicating that these levels are important for predicting X11DCE. The model has a good fit, but there might be room for improvement or further refinement to increase the explained variance.
Checking for Model Assumption:
# Check for model assumptions
par(mfrow=c(2, 2))
plot(step_model_DCE_cleaned)Durbin-Watson test:
The Durbin-Watson test is used to detect the presence of autocorrelation in the residuals of a regression analysis.
# Durbin-Watson test for autocorrelation
dwtest(step_model_DCE)
Durbin-Watson test
data: step_model_DCE
DW = 1.4138, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
DW = 1.41: The Durbin-Watson statistic value is approximately. The DW statistic ranges from 0 to 4. A value around 2 suggests no autocorrelation. Values less than 2 indicate positive autocorrelation, while values greater than 2 indicate negative autocorrelation.
P-value = <2.2e-16: The p-value is <2.2e-16. This value is used to determine the significance of the test result. Typically, a p-value less than 0.05 indicates that the null hypothesis can be rejected with a very high level of confidence.
Alternative hypothesis: true autocorrelation is greater than 0: The alternative hypothesis in this test suggests that there is positive autocorrelation in the residuals (i.e., the residuals are positively correlated).
Conclusion
The Durbin-Watson test statistic of 1.41 and the very low p-value suggest that there is significant positive autocorrelation in the residuals of the model step_model_DCE. Positive autocorrelation means that consecutive residuals are correlated with each other, which can indicate that the model might be missing some important variables or that there are patterns in the data that are not captured by the model.
Residual Plots Vs Time or Fitted Values to Visualize Autocorrelation
####Create Plots of Residuals Vs Time or Fitted Values to Visually Inspect the AutoCorrelation
plot(residuals(step_model_DCE_cleaned), type = "l", main = "Residuals over Time")acf(residuals(step_model_DCE_cleaned), main = "ACF of Residuals")If most autocorrelation coefficients fall within the confidence bands, it indicates that the residuals are roughly uncorrelated. If many coefficients fall outside the confidence bands, especially at low lags, this suggests autocorrelation in the residuals. A random pattern with most points within the confidence bands indicates that the residuals are likely independent. A pattern where many points outside the bands or showing systematic trends (e.g., gradually decreasing) indicates that residuals are autocorrelated.
Factoring SiteName in testData to Match with the Non-High Leverage trainData (Categorical Variable)
# Ensure factor levels in test set match training set
testData$SiteName <- factor(testData$SiteName , levels = levels(trainData_cleaned$SiteName))The above code ensures that the SiteName factor levels in the testData matches those in the Non-High Leverage trainData dataset. This step is crucial when you want to make predictions on the test data using a model trained on the training data, especially when dealing with categorical variables.
Removing Data Points with High Levels for Prediction of Actual Vs Predicted Values
# Filter out rows in testData that have levels not present in trainData_cleaned
testData <- testData[!testData$SiteName %in% c('MW05', 'MW49', 'MW50', 'MW57', 'MW59', 'MW61'), ]Predicting on test data
Having Trained our model to predict X11DCE based on the testdata. We shall now proceed to investigate how efficient our model is,
# Predict on test data
predictions_DCE_cleaned <- predict(step_model_DCE_cleaned, newdata = testData)Model Evaluation
# Evaluate the model
DCE_results_cleaned <- data.frame(WellName = testData$SiteName, Actual = testData$X11DCE,
Predicted = predictions_DCE_cleaned)Result of the Prediction
The correlation coefficient of 0.7145267 indicates a strong positive relationship between the actual and predicted values, suggesting that the model performs well. However, further analysis and refinement can be conducted to improve the model’s accuracy and reliability.
print(DCE_results_cleaned) WellName Actual Predicted
1 DMW13 0.0004998750 1.152282e-02
7 DMW13 0.0004998750 7.449155e-03
12 DMW13 0.0000000000 -1.247425e-03
13 DMW13 0.0001919816 -1.293196e-03
22 DMW13 0.0001919816 -3.459713e-03
25 DMW13 0.0001919816 -5.341429e-03
32 DMW28 0.0001919816 1.314263e-03
41 DMW28 0.0001919816 -7.479960e-04
52 DMW29 0.0002999550 3.068270e-04
53 DMW29 0.0001919816 9.831248e-05
59 DMW29 0.0001919816 -2.019890e-03
60 DMW30 0.0480279690 1.821621e-02
62 DMW30 0.0269340012 1.787801e-02
68 DMW30 0.0262523711 1.670830e-02
76 DMW31 0.0150856374 2.301947e-02
78 DMW31 0.0000000000 2.274738e-02
93 DMW32 0.0001919816 1.565762e-03
94 DMW32 0.0001919816 1.339447e-03
97 DMW32 0.0002999550 8.690184e-04
101 DMW32 0.0001919816 -5.149682e-05
108 DMW33 0.0001919816 1.176398e-03
112 DMW33 0.0001919816 4.999969e-04
129 DMW34 0.0001919816 2.618989e-04
131 DMW34 0.0001919816 7.612951e-06
148 DMW35 0.0001919816 -9.014157e-04
149 DMW35 0.0001919816 -1.348959e-03
151 DMW36 0.0001919816 1.443738e-03
154 DMW36 0.0001919816 1.138595e-03
157 DMW36 0.0000000000 8.487092e-04
175 DMW37 0.0285874569 1.199925e-02
177 DMW37 0.0383549539 1.132794e-02
194 DMW38 0.0467881615 3.289225e-02
196 DMW38 0.0382587121 3.239639e-02
200 DMW38 0.0000000000 3.199208e-02
201 DMW38 0.0298500220 3.177848e-02
205 DMW39 0.0002999550 1.316863e-03
211 DMW39 0.0001919816 4.065194e-04
217 DMW39 0.0001919816 -1.190397e-03
223 DMW40 0.0000000000 9.529249e-04
227 DMW40 0.0001919816 8.326682e-05
228 DMW40 0.0005618421 -1.430477e-04
230 DMW40 0.0004998750 -1.030506e-03
232 DMW41 0.0001919816 1.643945e-03
234 DMW41 0.0001919816 1.343888e-03
235 DMW41 0.0001919816 1.140459e-03
247 DMW42 0.0001919816 2.582145e-03
250 DMW42 0.0001919816 2.086287e-03
254 DMW42 0.0001919816 1.384458e-03
255 DMW42 0.0001919816 1.384458e-03
257 DMW42 0.0001919816 5.325998e-04
262 DMW42 0.0002159767 -1.888203e-03
267 DMW43 0.0045197704 2.139001e-02
269 DMW43 0.0186254641 2.096535e-02
280 DMW43 0.0319830459 1.837418e-02
282 DMW43 0.0434425578 1.646703e-02
289 DMW44 0.0001919816 8.064686e-04
293 DMW44 0.0004159135 -1.140466e-04
301 MW01 0.0344014267 2.057063e-02
304 MW01 0.0178399181 1.959418e-02
305 MW01 0.0000000000 1.503737e-02
312 MW02 0.0188217542 4.025297e-02
313 MW02 0.0440168854 3.959945e-02
319 MW02 0.0544881853 3.794405e-02
320 MW02 0.0004998750 3.747616e-02
323 MW02 0.0339182182 3.632170e-02
325 MW02 0.0449733656 3.402296e-02
326 MW02 0.0168571171 3.315330e-02
336 MW03 0.0089795628 1.189867e-02
337 MW03 0.0004998750 1.059927e-02
340 MW03 0.0029955090 9.325293e-03
342 MW03 0.0004998750 9.032864e-03
348 MW03 0.0038924147 7.573262e-03
350 MW03 0.0004998750 6.668004e-03
356 MW03 0.0004998750 2.589257e-03
365 MW03 0.0001919816 -7.630496e-03
371 MW04 0.0148886125 3.242004e-02
379 MW04 0.0054849302 1.590417e-02
380 MW04 0.0058826632 1.429708e-02
383 MW04 0.0062305498 1.236450e-02
384 MW04 0.0000000000 1.225770e-02
388 MW04 0.0062106738 1.070402e-02
392 MW04 0.0028858320 7.891614e-03
393 MW04 0.0018183458 7.072813e-03
408 MW06 0.0109399400 2.704696e-02
412 MW06 0.0305292050 2.607050e-02
419 MW06 0.0158733492 2.060335e-02
423 MW06 0.0030952049 1.709929e-02
430 MW07 0.0019980027 4.773634e-03
435 MW07 0.0069756137 3.659862e-03
438 MW07 0.0055843783 2.968204e-03
439 MW07 0.0004998750 2.719004e-03
440 MW07 0.0036931718 2.490146e-03
442 MW07 0.0004998750 1.350945e-03
446 MW07 0.0004998750 -1.354658e-03
451 MW07 0.0004998750 -5.769062e-03
459 MW08 0.0069756137 9.143306e-03
464 MW08 0.0129162253 8.108362e-03
465 MW08 0.0049875415 7.874419e-03
469 MW08 0.0050870390 6.486018e-03
471 MW08 0.0004998750 5.613817e-03
486 MW08 0.0001919816 -8.501598e-03
488 MW08 0.0001919816 -9.429742e-03
490 MW09 0.0440168854 2.217303e-02
491 MW09 0.0341115296 2.008788e-02
501 MW09 0.0043903483 1.213382e-02
503 MW09 0.0012991557 9.156130e-03
504 MW09 0.0013191296 7.643129e-03
508 MW09 0.0015887373 6.244556e-03
512 MW09 0.0003519381 4.632383e-03
519 MW10 0.0109399400 1.323903e-02
526 MW10 0.0190180058 1.138529e-02
534 MW10 0.0032945669 5.724880e-03
539 MW11 0.0535407669 2.425392e-02
540 MW11 0.0382587121 2.364364e-02
559 MW11 0.0007717022 5.352848e-03
560 MW12 0.0227394870 2.203784e-02
572 MW12 0.0027960873 1.493818e-02
573 MW12 0.0056838165 1.402784e-02
576 MW15 0.0506931143 3.175038e-02
579 MW15 0.0440168854 2.796406e-02
581 MW15 0.0314986671 2.646886e-02
583 MW15 0.0062802380 2.265457e-02
589 MW15R 0.0068067812 1.140431e-02
590 MW15R 0.0052860044 1.118309e-02
591 MW15R 0.0036632820 1.095423e-02
595 MW15R 0.0118297518 9.581084e-03
602 MW15R 0.0053556330 4.429250e-03
604 MW16 0.0017983819 1.583987e-02
612 MW16 0.0129162253 7.611171e-03
613 MW16 0.0006587830 6.100712e-03
614 MW16 0.0087119406 5.678598e-03
626 MW17 0.0065783154 1.071700e-02
631 MW17 0.0034938893 6.378881e-03
632 MW17 0.0087615057 5.005737e-03
636 MW17 0.0040716994 1.466076e-03
642 MW17 0.0002249747 -1.918683e-04
645 MW17 0.0001919816 -1.114926e-03
658 MW18 0.0171520588 1.999573e-02
660 MW18 0.0000000000 1.970839e-02
666 MW18 0.0151841353 1.861496e-02
667 MW18 0.0189198849 1.815216e-02
673 MW18 0.0231304174 1.351907e-02
674 MW18 0.0013091427 1.257566e-02
680 MW19 0.0004998750 3.577741e-03
695 MW20 0.0506931143 2.561524e-02
698 MW20 0.0506931143 2.301390e-02
699 MW20 0.0227394870 2.217984e-02
704 MW20 0.0258626595 1.726195e-02
708 MW20 0.0050671403 1.654995e-02
710 MW20 0.0058130713 1.607189e-02
717 MW20 0.0009805191 1.096583e-02
719 MW21 0.0004998750 7.165450e-03
734 MW21 0.0001919816 -2.212616e-03
739 MW21 0.0001919816 -4.096875e-03
743 MW22 0.0106431601 5.717145e-03
750 MW22 0.0072437009 4.577944e-03
754 MW22 0.0023272898 3.649800e-03
758 MW23 0.0435383020 3.182229e-02
771 MW24 0.0022374949 6.004052e-03
778 MW24 0.0024270523 4.841965e-03
784 MW24 0.0020578811 1.307390e-03
795 MW25 0.0082161548 7.090504e-03
810 MW26 0.0000000000 4.155855e-02
818 MW26 0.0502177162 3.813077e-02
819 MW26 0.0327575642 3.715177e-02
836 MW27 0.0182327683 8.090019e-03
843 MW45 0.0001919816 7.773513e-05
851 MW46 0.0001919816 2.237256e-03
852 MW46 0.0003239475 2.008399e-03
855 MW46 0.0001919816 6.352548e-04
856 MW46 0.0001919816 -3.259462e-04
874 MW47 0.0005878272 -1.711489e-03
880 MW48 0.0000000000 8.956888e-03
890 MW48 0.0010394596 6.485228e-03
914 MW51 0.0001919816 1.645047e-03
918 MW51 0.0001919816 8.881637e-05
922 MW52 0.0000000000 2.128176e-02
923 MW52 0.0287818014 2.111393e-02
924 MW52 0.0404699326 2.089525e-02
938 MW53 0.0000000000 6.037990e-03
942 MW53 0.0040816587 5.160703e-03
948 MW53 0.0033842669 3.543445e-03
955 MW54 0.0319830459 3.095286e-02
961 MW54 0.0092272973 2.686140e-02
965 MW55 0.0213700257 8.547658e-03
966 MW55 0.0243022925 8.334057e-03
970 MW55 0.0041115360 7.164342e-03
976 MW55 0.0014189928 2.518537e-03
978 MW55 0.0014988761 1.587851e-03
990 MW58 0.0017384880 3.397014e-03
995 MW58 0.0008526364 2.290870e-03
996 MW58 0.0007387271 1.774670e-03
997 MW58 0.0001919816 8.338115e-04
1010 MW60 0.0011992806 5.180561e-03
1011 MW60 0.0034540280 4.842360e-03
1030 MW62 0.0068067812 6.592619e-03
1045 MW63 0.0142973047 3.982744e-03
1052 SMW30 0.0071742037 4.909562e-03
1057 SMW30 0.0080376117 3.709333e-03
1064 SMW43 0.0457378917 2.851765e-02
1078 SMW43 0.0086028888 2.202827e-02
1080 TC01 0.0000000000 9.857517e-03
1083 TC01 0.0047288016 6.322942e-03
1084 TC01 0.0000000000 6.213599e-03
1090 TC01 0.0006078152 -9.292940e-04
1091 TC10 0.0217614918 7.696327e-03
1100 TC10 0.0055744339 2.503807e-03
print(cor(DCE_results_cleaned$Actual, DCE_results_cleaned$Predicted))[1] 0.7136918
A correlation of 0.71 suggests a strong positive linear relationship between the actual and predicted values. This means that as the actual values increase, the predicted values tend to also increase, and vice versa. While the correlation provides a measure of the strength of the linear relationship, it does not directly indicate how well the model fits the data in terms of variance explained. For this, metrics such as R-squared, RMSE (Root Mean Square Error), and MAE (Mean Absolute Error) are also important.
Visual Inspection of Actual Vs Predicted Values
## Visual Inspection ##
plot(DCE_results_cleaned$Actual, DCE_results_cleaned$Predicted, main = "Actual vs Predicted Values", xlab = "Actual", ylab = "Predicted")
abline(lm(DCE_results_cleaned$Predicted ~ DCE_results_cleaned$Actual), col = "red")Residual Analysis
Analyze the residuals (actual - predicted) to check for patterns that might indicate model deficiencies.
### Residual Analysis###
residuals <- DCE_results_cleaned$Actual - DCE_results_cleaned$Predicted
plot(residuals, main = "Residuals", ylab = "Residuals")Model Evaluation Metrics
### Model Evaluation Metrics ##
rmse <- sqrt(mean((DCE_results_cleaned$Actual - DCE_results_cleaned$Predicted)^2))
print("Root Mean Square Error:")[1] "Root Mean Square Error:"
print(rmse)[1] 0.01013567
mae <- mean(abs(DCE_results_cleaned$Actual - DCE_results_cleaned$Predicted))
print("Mean Absolute Error:")[1] "Mean Absolute Error:"
print(mae)[1] 0.006768408
r_squared <- summary(lm(DCE_results_cleaned$Predicted ~ DCE_results_cleaned$Actual))$r.squared
print("Root Square Error:")[1] "Root Square Error:"
print(r_squared)[1] 0.509356
Summary of Model Performance
RMSE (0.010): Indicates the average error magnitude in predicting the dependent variable. A very low value suggests that the model’s predictions are close to the actual values.
MAE (0.006): Shows the average absolute error in predictions. The low value reinforces the indication from RMSE that the model performs well.
R^2 (0.51): Indicates that about 51% of the variance in the dependent variable is explained by the model. While this shows a decent fit, there’s still room for improvement to capture more variance.
Time series forecasting for 1,1-Dichloroethene
Aggregating data by month for a better time series analysis
For the Time Series Forecasting,
# Time series forecasting for X11DCE
# Aggregating data by month for a better time series analysis
data_ts <- data %>%
group_by(Date, SiteName) %>%
summarize(X11DCE = mean(X11DCE), .groups = "drop")Convert to time series object
# Convert to time series object
ts_DCE <- ts(data_ts$X11DCE, start = c(year(min(data_ts$Date)), month(min(data_ts$Date))), frequency = 12)# Decompose time series
decomp_DCE <- stl(ts_DCE, s.window="periodic")# Plot decompositions
plot(decomp_DCE)Fit ARIMA model for DCE
# Fit ARIMA model
fit_DCE <- auto.arima(ts_DCE)
summary(fit_DCE)Series: ts_DCE
ARIMA(3,1,2)(2,0,0)[12]
Coefficients:
ar1 ar2 ar3 ma1 ma2 sar1 sar2
-0.3511 0.0397 0.0461 -0.6290 -0.3356 -0.0569 0.0015
s.e. 0.7434 0.0360 0.0378 0.7437 0.7218 0.0318 0.0317
sigma^2 = 0.0001804: log likelihood = 2997.39
AIC=-5978.77 AICc=-5978.63 BIC=-5939.23
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -0.0002787663 0.01337882 0.01002845 -Inf Inf 0.7810371
ACF1
Training set -0.0006419278
Forecast for DCE in the next 12 months
# Forecast for the next 12 months
forecast_DCE <- forecast(fit_DCE, h = 12)
plot(forecast_DCE)# Combine forecasts into a data frame
future_forecasts_DCE <- data.frame(
Date = seq.Date(from = max(data$Date) + 1, by = "month", length.out = 48),
DCE_Forecast = as.numeric(forecast_DCE$mean)
)# Print future forecasts
print(future_forecasts_DCE) Date DCE_Forecast
1 2024-01-31 0.004160921
2 2024-03-02 0.004049296
3 2024-03-31 0.004259004
4 2024-05-01 0.004428379
5 2024-05-31 0.004305166
6 2024-07-01 0.004409102
7 2024-07-31 0.004362960
8 2024-08-31 0.004235477
9 2024-10-01 0.004389000
10 2024-10-31 0.004379273
11 2024-12-01 0.004389946
12 2024-12-31 0.004402565
13 2025-01-31 0.004160921
14 2025-03-03 0.004049296
15 2025-03-31 0.004259004
16 2025-05-01 0.004428379
17 2025-05-31 0.004305166
18 2025-07-01 0.004409102
19 2025-07-31 0.004362960
20 2025-08-31 0.004235477
21 2025-10-01 0.004389000
22 2025-10-31 0.004379273
23 2025-12-01 0.004389946
24 2025-12-31 0.004402565
25 2026-01-31 0.004160921
26 2026-03-03 0.004049296
27 2026-03-31 0.004259004
28 2026-05-01 0.004428379
29 2026-05-31 0.004305166
30 2026-07-01 0.004409102
31 2026-07-31 0.004362960
32 2026-08-31 0.004235477
33 2026-10-01 0.004389000
34 2026-10-31 0.004379273
35 2026-12-01 0.004389946
36 2026-12-31 0.004402565
37 2027-01-31 0.004160921
38 2027-03-03 0.004049296
39 2027-03-31 0.004259004
40 2027-05-01 0.004428379
41 2027-05-31 0.004305166
42 2027-07-01 0.004409102
43 2027-07-31 0.004362960
44 2027-08-31 0.004235477
45 2027-10-01 0.004389000
46 2027-10-31 0.004379273
47 2027-12-01 0.004389946
48 2027-12-31 0.004402565
Method2: Time Series Forecast for each Site w.r.t 11DCE
# Create an empty list to store forecasts for each site
forecasts_list <- list()# Iterate over each site
for(site in unique(data_ts$SiteName)) {site_data <- data_ts %>% filter(SiteName == site)
# Check if there is enough data for decomposition
if(nrow(site_data) < 24) { # Ensure at least two full years of data for monthly series
warning(paste("Not enough data for site", site, ". Skipping decomposition and ARIMA modeling."))
next
}
# Convert to time series object
ts_DCE <- ts(site_data$X11DCE, start = c(year(min(site_data$Date)), month(min(site_data$Date))), frequency = 12)
# Check if time series has sufficient length for decomposition
if (length(ts_DCE) < 24) { # Ensure at least two full periods of data for decomposition
warning(paste("Series is too short for STL decomposition for site", site, ". Skipping decomposition and ARIMA modeling."))
next
}
# Decompose time series
decomp_DCE <- tryCatch({
stl(ts_DCE, s.window = "periodic")
}, error = function(e) {
warning(paste("Error in STL decomposition for site", site, ":", e$message))
NULL
})
if (!is.null(decomp_DCE)) {
# Plot decompositions
plot(decomp_DCE, main = paste("Decomposition for Site", site))
}
# Fit ARIMA model
fit_DCE <- auto.arima(ts_DCE)
summary(fit_DCE)
# Forecast for the next 12 months
forecast_DCE <- forecast(fit_DCE, h = 12)
plot(forecast_DCE, main = paste("Forecast for Site", site))
# Store forecast in the list
forecasts_list[[site]] <- data.frame(
Date = seq.Date(from = max(site_data$Date) + 1, by = "month", length.out = 12),
SiteName = site,
DCE_Forecast = as.numeric(forecast_DCE$mean)
)
}Warning: Not enough data for site MW06 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW10 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW01 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW02 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW05 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW12 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW11 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW15 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW16 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW17 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW19 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW21 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site TC01 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site TC10 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW22 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW23 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW24 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW25 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW27 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW28 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site SMW30 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW32 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW29 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW30 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW31 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW33 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW39 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW35 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW36 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW37 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW38 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW34 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW40 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW41 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW42 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW44 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW43 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site SMW43 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW26 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW45 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW47 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW48 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW49 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW46 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW50 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW51 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW15R . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW52 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW53 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW55 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW54 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW57 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW58 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW59 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW60 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW61 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW62 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW63 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW56 . Skipping decomposition and ARIMA
modeling.
# Combine all forecasts into a single data frame
all_forecasts <- do.call(rbind, forecasts_list)
# Print future forecasts
print(all_forecasts) Date SiteName DCE_Forecast
MW03.1 2024-01-24 MW03 0.0002383094
MW03.2 2024-02-24 MW03 0.0002383094
MW03.3 2024-03-24 MW03 0.0002383094
MW03.4 2024-04-24 MW03 0.0002383094
MW03.5 2024-05-24 MW03 0.0002383094
MW03.6 2024-06-24 MW03 0.0002383094
MW03.7 2024-07-24 MW03 0.0002383094
MW03.8 2024-08-24 MW03 0.0002383094
MW03.9 2024-09-24 MW03 0.0002383094
MW03.10 2024-10-24 MW03 0.0002383094
MW03.11 2024-11-24 MW03 0.0002383094
MW03.12 2024-12-24 MW03 0.0002383094
MW04.1 2024-01-26 MW04 0.0017983819
MW04.2 2024-02-26 MW04 0.0017983819
MW04.3 2024-03-26 MW04 0.0017983819
MW04.4 2024-04-26 MW04 0.0017983819
MW04.5 2024-05-26 MW04 0.0017983819
MW04.6 2024-06-26 MW04 0.0017983819
MW04.7 2024-07-26 MW04 0.0017983819
MW04.8 2024-08-26 MW04 0.0017983819
MW04.9 2024-09-26 MW04 0.0017983819
MW04.10 2024-10-26 MW04 0.0017983819
MW04.11 2024-11-26 MW04 0.0017983819
MW04.12 2024-12-26 MW04 0.0017983819
MW08.1 2019-01-18 MW08 0.0002122387
MW08.2 2019-02-18 MW08 0.0002122387
MW08.3 2019-03-18 MW08 0.0002122387
MW08.4 2019-04-18 MW08 0.0002122387
MW08.5 2019-05-18 MW08 0.0002122387
MW08.6 2019-06-18 MW08 0.0002122387
MW08.7 2019-07-18 MW08 0.0002122387
MW08.8 2019-08-18 MW08 0.0002122387
MW08.9 2019-09-18 MW08 0.0002122387
MW08.10 2019-10-18 MW08 0.0002122387
MW08.11 2019-11-18 MW08 0.0002122387
MW08.12 2019-12-18 MW08 0.0002122387
MW07.1 2012-06-13 MW07 0.0013575057
MW07.2 2012-07-13 MW07 0.0016377772
MW07.3 2012-08-13 MW07 0.0017293693
MW07.4 2012-09-13 MW07 0.0017593013
MW07.5 2012-10-13 MW07 0.0017690830
MW07.6 2012-11-13 MW07 0.0017722796
MW07.7 2012-12-13 MW07 0.0017733243
MW07.8 2013-01-13 MW07 0.0017736657
MW07.9 2013-02-13 MW07 0.0017737773
MW07.10 2013-03-13 MW07 0.0017738137
MW07.11 2013-04-13 MW07 0.0017738256
MW07.12 2013-05-13 MW07 0.0017738295
MW09.1 2024-01-31 MW09 -0.0018595509
MW09.2 2024-03-02 MW09 -0.0028353596
MW09.3 2024-03-31 MW09 -0.0045080197
MW09.4 2024-05-01 MW09 -0.0057917862
MW09.5 2024-05-31 MW09 -0.0072925835
MW09.6 2024-07-01 MW09 -0.0086722619
MW09.7 2024-07-31 MW09 -0.0101195334
MW09.8 2024-08-31 MW09 -0.0115290831
MW09.9 2024-10-01 MW09 -0.0129596843
MW09.10 2024-10-31 MW09 -0.0143785373
MW09.11 2024-12-01 MW09 -0.0158039466
MW09.12 2024-12-31 MW09 -0.0172256969
DMW13.1 2024-01-26 DMW13 0.0003755115
DMW13.2 2024-02-26 DMW13 0.0003755115
DMW13.3 2024-03-26 DMW13 0.0003755115
DMW13.4 2024-04-26 DMW13 0.0003755115
DMW13.5 2024-05-26 DMW13 0.0003755115
DMW13.6 2024-06-26 DMW13 0.0003755115
DMW13.7 2024-07-26 DMW13 0.0003755115
DMW13.8 2024-08-26 DMW13 0.0003755115
DMW13.9 2024-09-26 DMW13 0.0003755115
DMW13.10 2024-10-26 DMW13 0.0003755115
DMW13.11 2024-11-26 DMW13 0.0003755115
DMW13.12 2024-12-26 DMW13 0.0003755115
MW18.1 2024-01-19 MW18 0.0144983736
MW18.2 2024-02-19 MW18 0.0144983736
MW18.3 2024-03-19 MW18 0.0144983736
MW18.4 2024-04-19 MW18 0.0144983736
MW18.5 2024-05-19 MW18 0.0144983736
MW18.6 2024-06-19 MW18 0.0144983736
MW18.7 2024-07-19 MW18 0.0144983736
MW18.8 2024-08-19 MW18 0.0144983736
MW18.9 2024-09-19 MW18 0.0144983736
MW18.10 2024-10-19 MW18 0.0144983736
MW18.11 2024-11-19 MW18 0.0144983736
MW18.12 2024-12-19 MW18 0.0144983736
MW20.1 2024-01-23 MW20 0.0019386039
MW20.2 2024-02-23 MW20 0.0010024483
MW20.3 2024-03-23 MW20 0.0012446654
MW20.4 2024-04-23 MW20 0.0013994633
MW20.5 2024-05-23 MW20 0.0011857261
MW20.6 2024-06-23 MW20 0.0012988470
MW20.7 2024-07-23 MW20 0.0012880109
MW20.8 2024-08-23 MW20 0.0012545844
MW20.9 2024-09-23 MW20 0.0012853122
MW20.10 2024-10-23 MW20 0.0012732054
MW20.11 2024-11-23 MW20 0.0012714440
MW20.12 2024-12-23 MW20 0.0012773545