Linear Regression and Time Series Model_MW for 1,1-Dichloroethene

Author

Callistus Obunadike

setwd("C:/Users/obunadic8159/OneDrive - ARCADIS/Desktop/Data_Analysis_11DCE_14DOX/Recent_New_Data/MW")
#Load necessary libraries
#install.packages("car")
#install.packages("corrplot")
#install.packages("MASS")
#install.packages("lmtest")
library(MASS)
library(dplyr)
library(ggplot2)
library(caret)
library(car)
library(corrplot)
library(e1071)
library(forecast)
library(lubridate)
library(lmtest)

Importing the Data from our directory

# Load the dataset
data <- read.csv("C:/Users/obunadic8159/OneDrive - ARCADIS/Desktop/Data_Analysis_11DCE_14DOX/Recent_New_Data/MW/11DCE.csv")

Visualizing the Data Type and Structure

##################Checking the Data Type and Structure ###############
head(data)
  SiteName       Date Quarter X11DCE
1    DMW13 11/30/2001      Q4  5e-04
2    DMW13  5/29/2002      Q2  5e-04
3    DMW13  11/6/2002      Q4  5e-04
4    DMW13  9/23/2003      Q3  5e-04
5    DMW13  5/20/2004      Q2  5e-04
6    DMW13  4/27/2005      Q2  5e-04
#str(data)

Convert Date to Date format and factorizing Site Name

# Convert Date to Date format and factorize SiteName
data$Date <- as.Date(data$Date, format="%m/%d/%Y")
data$SiteName <- as.factor(data$SiteName)

Checking for Missing data across the data using For-Loop

#### Handling Missing Data
vnames <- colnames(data)
n <- nrow(data)
out <- NULL
for (j in 1:ncol(data)){
  vname <- colnames(data)[j]
  x <- as.vector(data[,j])
  n1 <- sum(is.na(x), na.rm=TRUE)  # NA
  n2 <- sum(x=="NA", na.rm=TRUE) # "NA"
  n3 <- sum(x==" ", na.rm=TRUE)  # missing
  nmiss <- n1 + n2 + n3
  nmiss <- sum(is.na(x))
  ncomplete <- n-nmiss
  out <- rbind(out, c(col.num=j, v.name=vname, mode=mode(x),
                      n.level=length(unique(x)),
                      ncom=ncomplete, nmiss= nmiss, miss.prop=nmiss/n))
}
out <- as.data.frame(out)
row.names(out) <- NULL
out
  col.num   v.name      mode n.level ncom nmiss miss.prop
1       1 SiteName character      67 1247     0         0
2       2     Date   numeric     173 1247     0         0
3       3  Quarter character       4 1247     0         0
4       4   X11DCE   numeric     616 1247     0         0
#for (j in 1:NCOL(data)){
  #print(head(colnames(data)[j]))
  #print(head(table(data[,j], useNA="ifany")))
#}

Checking for Outliers using Box-plot

# Check for outliers using boxplots for DCE
ggplot(data, aes(x = SiteName, y = X11DCE)) + geom_boxplot() + ggtitle("Boxplot for DCE by SiteName")

Calculating and Filtering Outliers in 11DCE

# Calculate IQR and detect outliers for X11DCE
Q1_DCE <- quantile(data$X11DCE, 0.25)
Q3_DCE <- quantile(data$X11DCE, 0.75)
IQR_DCE <- Q3_DCE - Q1_DCE
outliers_DCE <- data %>% filter(X11DCE < (Q1_DCE - 1.5 * IQR_DCE) | X11DCE > (Q3_DCE + 1.5 * IQR_DCE))

There are 148 observations of 1,1Dichloroethene (DCE) with outliers. Therefore, it is important to remove them before building the linear regression model and time series forecast.

# Outliers in 11DCE 
#print(outliers_DCE) 
#print("Outliers in X11DCE:")
print(head(outliers_DCE))
  SiteName       Date Quarter X11DCE
1    DMW37 2020-01-23      Q1 0.0621
2    DMW43 2023-01-17      Q1 0.0696
3    DMW43 2024-01-22      Q1 0.1050
4     MW01 1995-06-22      Q2 0.1400
5     MW01 1996-04-11      Q2 0.0688
6     MW01 1998-05-19      Q2 0.1490

Checking for skewness in 1,1-Dichloroethene

[1] "Skewness for X11DCE: 13.4298884110363"

Interpretation of Skewness Values:

Skewness for 11DCE (13.4):This is also a high positive skewness value. The distribution of 11DCE is positively skewed, with most data points clustered towards the lower end and a long tail to the right. This indicates the presence of some high values. The high skewness values suggest that the data for 11DCE are not normally distributed and are influenced by a few very large values. When working with these variables in analyses, it might be necessary to consider transformations (such as log or square root transformations) to reduce skewness and achieve a more normal distribution, which can be beneficial for certain statistical analyses and models.~

# Visualize distributions
par(mfrow=c(1, 2))
hist(data$X11DCE, main="11DCE", xlab="11DCE", col = 'red', breaks=20)

Transforming data due to skewness by using log transformation.

# Transform data if skewness is high (optional, example using log transformation)
if(abs(skewness_DCE) > 1){data$X11DCE <- log1p(data$X11DCE)}

Removing the Outliers for 11DCE.

# Remove outliers (optional, depending on analysis)
data <- data %>% filter(X11DCE >= (Q1_DCE - 1.5 * IQR_DCE) & X11DCE <= (Q3_DCE + 1.5 * IQR_DCE))


##################Checking the Data Type and Structure ###############
#str(data)
head(data)
  SiteName       Date Quarter      X11DCE
1    DMW13 2001-11-30      Q4 0.000499875
2    DMW13 2002-05-29      Q2 0.000499875
3    DMW13 2002-11-06      Q4 0.000499875
4    DMW13 2003-09-23      Q3 0.000499875
5    DMW13 2004-05-20      Q2 0.000499875
6    DMW13 2005-04-27      Q2 0.000499875

Convert Date to numeric for VIF calculation

data$Date_numeric <- as.numeric(data$Date)

Check for multicollinearity using VIF

# Check for multicollinearity using VIF
vif_data <- lm(X11DCE ~ Date_numeric + SiteName, data = data)
vif(vif_data)
                 GVIF Df GVIF^(1/(2*Df))
Date_numeric 2.905404  1        1.704525
SiteName     2.905404 66        1.008113

In this case, both Date_numeric and SiteName have GVIF values close to 1, indicating that there is little to no multicollinearity between these predictors in the model. This suggests that the model is not adversely affected by multicollinearity, and the estimates of the regression coefficients should be reliable.

Check correlation matrix for predictor variable 11DCE and Date

  • 1: Perfect positive correlation (as one variable increases, the other variable also increases).

  • -1: Perfect negative correlation (as one variable increases, the other variable decreases).

  • 0: No linear correlation (the variables do not have a linear relationship).

# Check correlation matrix for predictor and dependent variables
cor_matrix <- cor(data %>% select(X11DCE, Date_numeric))
print(cor_matrix)
                 X11DCE Date_numeric
X11DCE        1.0000000   -0.2333438
Date_numeric -0.2333438    1.0000000

Implications of Correlation Matrix on the Model:

The correlation results suggest that there is almost no linear relationship between X11DCE and Date_numeric. This low correlation indicates that Date_numeric is not a strong predictor of X11DCE based on linear association alone.

# Visualize the correlation matrix
corrplot(cor_matrix, method = "circle")

Removing the 5th column (Date Numeric) for Linear Regression Model

# Convert Date back to as.Date data type and removing the 5th column
data$Date <- as.Date(data$Date, format="%m/%d/%Y")
#head(data)
#colnames(data)
data= data[, -(5)]
head(data)
  SiteName       Date Quarter      X11DCE
1    DMW13 2001-11-30      Q4 0.000499875
2    DMW13 2002-05-29      Q2 0.000499875
3    DMW13 2002-11-06      Q4 0.000499875
4    DMW13 2003-09-23      Q3 0.000499875
5    DMW13 2004-05-20      Q2 0.000499875
6    DMW13 2005-04-27      Q2 0.000499875

Splitting the data into training and testing sets for the Linear Regression Modelling of 11DCE

# Split the data into training and testing sets for X11DCE
set.seed(123)
trainIndex <- createDataPartition(data$X11DCE, p = .8, list = FALSE, times = 1)
trainData <- data[trainIndex,]
testData  <- data[-trainIndex,]
  • This line uses the createDataPartition function from the caret package to create an index for splitting the data.

  • X11DCE: The target variable, which is 1,1 Dicholorethene, is used to ensure that the split maintains the same distribution of this variable in both the training and test sets.

  • p = .8: This specifies that 80% of the data should be used for training. The remaining 20% will be used for testing.

  • list = FALSE: By setting this to FALSE, the function returns the indices as a vector instead of a list.

  • times = 1: This specifies that only one partition should be created.

Using set.seed(123) ensures that every time you run this code, the training and test splits will be the same, allowing for consistent and reproducible results.

Factoring SiteName in testData to Match with the trainData (Categorical Variable)

# Ensure factor levels in test set match training set
testData$SiteName <- factor(testData$SiteName, levels = levels(trainData$SiteName))

The above code ensures that the SiteName factor levels in the testData dataset match those in the trainData dataset. This step is crucial when you want to make predictions on the test data using a model trained on the training data, especially when dealing with categorical variables.

Linear Regression Model and StepWise Backward Elimination Model Selection

# Model for X11DCE using stepwise backward elimination
full_model_DCE <- lm(X11DCE ~ Date + SiteName, data = trainData)
step_model_DCE <- step(full_model_DCE, direction = "backward", trace = 0)
  • Purpose: The goal of step-wise model selection is to improve the model by removing predictors that do not contribute significantly to the prediction of the response variable. This can lead to a more parsimonious model that is easier to interpret and may perform better on new data.

  • Backward Elimination: In this specific procedure, predictors are removed one by one based on their statistical significance, starting with the least significant predictor. The process continues until only predictors that contribute meaningfully to the model remain.

Summary and Interpretation of the Linear Regression Model

# Summary of the final model
summary(step_model_DCE)

Call:
lm(formula = X11DCE ~ Date + SiteName, data = trainData)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.041859 -0.003597 -0.000210  0.002353  0.037380 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    4.132e-02  3.989e-03  10.359  < 2e-16 ***
Date          -2.552e-06  2.183e-07 -11.693  < 2e-16 ***
SiteNameDMW28  2.705e-03  3.171e-03   0.853 0.393943    
SiteNameDMW29  2.393e-03  3.252e-03   0.736 0.462065    
SiteNameDMW30  1.950e-02  3.351e-03   5.819 8.49e-09 ***
SiteNameDMW31  2.465e-02  2.990e-03   8.244 6.64e-16 ***
SiteNameDMW32  2.957e-03  3.461e-03   0.854 0.393200    
SiteNameDMW33  2.793e-03  3.349e-03   0.834 0.404547    
SiteNameDMW34  2.787e-03  2.925e-03   0.953 0.341010    
SiteNameDMW35  2.550e-03  3.348e-03   0.762 0.446384    
SiteNameDMW36  2.755e-03  3.171e-03   0.869 0.385163    
SiteNameDMW37  1.479e-02  2.988e-03   4.948 9.13e-07 ***
SiteNameDMW38  3.567e-02  3.034e-03  11.758  < 2e-16 ***
SiteNameDMW39  2.710e-03  3.254e-03   0.833 0.405195    
SiteNameDMW40  2.864e-03  3.589e-03   0.798 0.425212    
SiteNameDMW41  2.959e-03  3.461e-03   0.855 0.392894    
SiteNameDMW42  4.400e-03  3.047e-03   1.444 0.149133    
SiteNameDMW43  2.278e-02  3.102e-03   7.346 4.97e-13 ***
SiteNameDMW44  2.891e-03  3.254e-03   0.888 0.374540    
SiteNameMW01   6.646e-03  3.418e-03   1.945 0.052160 .  
SiteNameMW02   2.476e-02  3.118e-03   7.942 6.62e-15 ***
SiteNameMW03  -4.883e-03  2.599e-03  -1.879 0.060639 .  
SiteNameMW04   1.417e-02  2.914e-03   4.861 1.40e-06 ***
SiteNameMW05   5.907e-03  4.281e-03   1.380 0.168071    
SiteNameMW06   1.312e-02  2.956e-03   4.437 1.04e-05 ***
SiteNameMW07  -9.755e-03  2.976e-03  -3.278 0.001091 ** 
SiteNameMW08  -5.071e-03  2.629e-03  -1.929 0.054062 .  
SiteNameMW09   9.012e-03  2.877e-03   3.132 0.001798 ** 
SiteNameMW10  -8.612e-04  2.948e-03  -0.292 0.770233    
SiteNameMW11   1.442e-02  2.788e-03   5.172 2.92e-07 ***
SiteNameMW12   6.539e-03  3.291e-03   1.987 0.047287 *  
SiteNameMW15   2.252e-02  3.938e-03   5.717 1.52e-08 ***
SiteNameMW15R  1.349e-02  3.364e-03   4.010 6.63e-05 ***
SiteNameMW16   7.479e-03  2.914e-03   2.566 0.010453 *  
SiteNameMW17   3.267e-03  2.914e-03   1.121 0.262563    
SiteNameMW18   2.161e-02  2.717e-03   7.953 6.09e-15 ***
SiteNameMW19   4.686e-04  2.868e-03   0.163 0.870239    
SiteNameMW20   1.908e-02  2.873e-03   6.640 5.74e-11 ***
SiteNameMW21   3.024e-04  2.867e-03   0.105 0.916033    
SiteNameMW22   7.104e-03  3.096e-03   2.294 0.022023 *  
SiteNameMW23   3.324e-02  3.611e-03   9.207  < 2e-16 ***
SiteNameMW24   7.631e-03  2.884e-03   2.646 0.008309 ** 
SiteNameMW25   9.173e-03  2.884e-03   3.181 0.001523 ** 
SiteNameMW26   4.347e-02  3.464e-03  12.551  < 2e-16 ***
SiteNameMW27   1.343e-02  2.880e-03   4.664 3.63e-06 ***
SiteNameMW45   3.102e-03  3.943e-03   0.787 0.431655    
SiteNameMW46   5.039e-03  3.477e-03   1.449 0.147614    
SiteNameMW47   7.367e-03  3.184e-03   2.314 0.020934 *  
SiteNameMW48   1.088e-02  2.847e-03   3.823 0.000142 ***
SiteNameMW49   3.086e-03  4.185e-03   0.737 0.461198    
SiteNameMW50   3.462e-03  4.506e-03   0.768 0.442532    
SiteNameMW51   3.953e-03  3.949e-03   1.001 0.317083    
SiteNameMW52   2.320e-02  3.274e-03   7.087 2.97e-12 ***
SiteNameMW53   7.955e-03  3.260e-03   2.440 0.014880 *  
SiteNameMW54   3.396e-02  3.598e-03   9.439  < 2e-16 ***
SiteNameMW55   1.064e-02  3.266e-03   3.257 0.001174 ** 
SiteNameMW56   3.536e-02  6.744e-03   5.243 2.02e-07 ***
SiteNameMW57   9.741e-03  4.200e-03   2.320 0.020612 *  
SiteNameMW58   6.198e-03  3.619e-03   1.713 0.087155 .  
SiteNameMW59   1.037e-02  4.197e-03   2.471 0.013691 *  
SiteNameMW60   8.329e-03  3.974e-03   2.096 0.036384 *  
SiteNameMW61   1.000e-02  4.200e-03   2.381 0.017508 *  
SiteNameMW62   1.047e-02  3.619e-03   2.893 0.003916 ** 
SiteNameMW63   1.109e-02  3.613e-03   3.070 0.002214 ** 
SiteNameSMW30  6.730e-03  3.254e-03   2.068 0.038926 *  
SiteNameSMW43  3.015e-02  2.985e-03  10.098  < 2e-16 ***
SiteNameTC01   8.136e-03  3.945e-03   2.062 0.039519 *  
SiteNameTC10   5.973e-03  3.459e-03   1.727 0.084527 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.009111 on 814 degrees of freedom
Multiple R-squared:  0.5914,    Adjusted R-squared:  0.5578 
F-statistic: 17.59 on 67 and 814 DF,  p-value: < 2.2e-16
  • Residuals: Min ( -0.041859), 1Q(-0.003597), Median(-0.000210), 3Q(0.002353) and Max(0.037380) shows the distribution of the residuals (difference between observed and predicted values). The small values of the residuals indicate that the model’s prediction are close to the observed values.

  • Date: The coefficient for Date is significant at 5% level (p<0.05), suggesting that there is a relationship between Date and XIIDCE.

Significant SiteName Coefficients:

  • SiteNameDMW30: p=8.49×10−9

  • SiteNameDMW31: p=6.64×10−16

  • SiteNameDMW37: p=9.13×10−7

  • SiteNameDMW38: p<2×10−16

  • SiteNameDMW43: p=4.97×10−13

  • SiteNameMW02: p=6.62×10−15

  • SiteNameMW04: p=1.40×10−6

  • SiteNameMW06: p=1.04×10−5

  • SiteNameMW07: p=0.001091

  • SiteNameMW09: p=0.001798

  • SiteNameMW11: p=2.92×10−7

  • SiteNameMW12: p=0.047287

  • SiteNameMW15: p=1.52×10−8

  • SiteNameMW15R: p=6.63×10−5

  • SiteNameMW16: p=0.010453

  • SiteNameMW18: p=6.09×10−15

  • SiteNameMW20: p=5.74×10−11

  • SiteNameMW22: p=0.022023

  • SiteNameMW23: p<2×10−16

  • SiteNameMW24: p=0.008309

  • SiteNameMW25: p=0.001523

  • SiteNameMW26: p<2×10−16

  • SiteNameMW27: p=3.63×10−6

  • SiteNameMW47: p=0.020934

  • SiteNameMW48: p=0.000142

  • SiteNameMW52: p=2.97×10−12

  • SiteNameMW53: p=0.014880

  • SiteNameMW54: p<2×10−16

  • SiteNameMW55: p=0.001174

  • SiteNameMW56: p=2.02×10−7

  • SiteNameMW57: p=0.020612

  • SiteNameMW59: p=0.013691

  • SiteNameMW60: p=0.036384

  • SiteNameMW61: p=0.017508

  • SiteNameMW62: p=0.003916

  • SiteNameMW63: p=0.002214

  • SiteNameSMW30: p=0.038926

  • SiteNameSMW43: p<2×10−16

  • SiteNameTC01: p=0.039519

These significant coefficients indicate that the corresponding SiteName levels have a statistically significant effect on X11DCE.

Model Fit Statistics

  • Residual standard error: 0.0091 on 814 degrees of freedom

  • Multiple R-squared (0.60): This indicates that approximately 60% of the variability in X11DCE is explained by the model.

  • Adjusted R-squared (0.56): This adjusts the R-squared value for the number of predictors in the model, providing a more accurate measure of model fit.

  • F-statistic: 17.59 on 67 and 1632 DF (p-value: < 2.2e-16): The F-statistic tests the overall significance of the model. A very low p-value (< 2.2e-16) indicates that the model is statistically significant.

Identifying High Leverage Points:

Leverage Values: These values indicate how much influence each data point has on the fitted values of the model. High leverage points are those that can potentially have a large impact on the model.

Data Points with High Leverage Points:

# Print high leverage points
#print(high_leverage_points)
print(head(high_leverage_points))
396 397 399 400 401 402 
313 314 315 316 317 318 
print(tail(high_leverage_points))
1019 1020 1021 1022 1023 1027 
 814  815  816  817  818  819 

Removing the Data with High Leverage Points for Improved Model:

# Remove high leverage points and refit the model if necessary
trainData_cleaned <- trainData[-high_leverage_points, ]

Refitting the Model after removing high leverage points:

# Refit the model without high leverage points
full_model_DCE_cleaned <- lm(X11DCE ~ Date + SiteName, data = trainData_cleaned)
step_model_DCE_cleaned <- step(full_model_DCE_cleaned, direction = "backward", trace = 0)

Summary and Interpretation of the Re-fitted Linear Regression Model

# Summary of the final model without high leverage points
summary(step_model_DCE_cleaned)

Call:
lm(formula = X11DCE ~ Date + SiteName, data = trainData_cleaned)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.041854 -0.003669 -0.000230  0.002492  0.037391 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    4.116e-02  4.019e-03  10.241  < 2e-16 ***
Date          -2.543e-06  2.203e-07 -11.545  < 2e-16 ***
SiteNameDMW28  2.694e-03  3.181e-03   0.847 0.397314    
SiteNameDMW29  2.383e-03  3.262e-03   0.731 0.465278    
SiteNameDMW30  1.949e-02  3.361e-03   5.798 9.75e-09 ***
SiteNameDMW31  2.463e-02  2.999e-03   8.212 8.93e-16 ***
SiteNameDMW32  2.945e-03  3.472e-03   0.848 0.396567    
SiteNameDMW33  2.782e-03  3.360e-03   0.828 0.407881    
SiteNameDMW34  2.776e-03  2.934e-03   0.946 0.344445    
SiteNameDMW35  2.540e-03  3.358e-03   0.757 0.449566    
SiteNameDMW36  2.745e-03  3.181e-03   0.863 0.388538    
SiteNameDMW37  1.477e-02  2.998e-03   4.926 1.02e-06 ***
SiteNameDMW38  3.566e-02  3.044e-03  11.717  < 2e-16 ***
SiteNameDMW39  2.699e-03  3.264e-03   0.827 0.408530    
SiteNameDMW40  2.854e-03  3.601e-03   0.793 0.428279    
SiteNameDMW41  2.947e-03  3.472e-03   0.849 0.396255    
SiteNameDMW42  4.384e-03  3.057e-03   1.434 0.152006    
SiteNameDMW43  2.277e-02  3.112e-03   7.319 6.21e-13 ***
SiteNameDMW44  2.880e-03  3.264e-03   0.882 0.377853    
SiteNameMW01   6.678e-03  3.429e-03   1.947 0.051859 .  
SiteNameMW02   2.479e-02  3.129e-03   7.925 7.85e-15 ***
SiteNameMW03  -4.860e-03  2.608e-03  -1.863 0.062778 .  
SiteNameMW04   1.417e-02  2.924e-03   4.845 1.52e-06 ***
SiteNameMW06   1.315e-02  2.967e-03   4.434 1.06e-05 ***
SiteNameMW07  -9.714e-03  2.987e-03  -3.252 0.001196 ** 
SiteNameMW08  -5.042e-03  2.638e-03  -1.911 0.056320 .  
SiteNameMW09   9.023e-03  2.887e-03   3.126 0.001839 ** 
SiteNameMW10  -8.190e-04  2.959e-03  -0.277 0.782000    
SiteNameMW11   1.441e-02  2.797e-03   5.153 3.24e-07 ***
SiteNameMW12   6.579e-03  3.303e-03   1.992 0.046749 *  
SiteNameMW15   2.252e-02  3.951e-03   5.701 1.68e-08 ***
SiteNameMW15R  1.347e-02  3.375e-03   3.992 7.17e-05 ***
SiteNameMW16   7.480e-03  2.924e-03   2.559 0.010694 *  
SiteNameMW17   3.268e-03  2.923e-03   1.118 0.263995    
SiteNameMW18   2.161e-02  2.726e-03   7.928 7.66e-15 ***
SiteNameMW19   4.667e-04  2.877e-03   0.162 0.871161    
SiteNameMW20   1.907e-02  2.882e-03   6.616 6.84e-11 ***
SiteNameMW21   3.012e-04  2.877e-03   0.105 0.916646    
SiteNameMW22   7.094e-03  3.106e-03   2.284 0.022636 *  
SiteNameMW23   3.322e-02  3.622e-03   9.171  < 2e-16 ***
SiteNameMW24   7.618e-03  2.894e-03   2.632 0.008643 ** 
SiteNameMW25   9.159e-03  2.893e-03   3.166 0.001604 ** 
SiteNameMW26   4.346e-02  3.475e-03  12.508  < 2e-16 ***
SiteNameMW27   1.342e-02  2.889e-03   4.645 4.00e-06 ***
SiteNameMW45   3.090e-03  3.955e-03   0.781 0.434929    
SiteNameMW46   5.021e-03  3.488e-03   1.439 0.150421    
SiteNameMW47   7.350e-03  3.194e-03   2.301 0.021651 *  
SiteNameMW48   1.087e-02  2.856e-03   3.805 0.000153 ***
SiteNameMW51   3.938e-03  3.961e-03   0.994 0.320519    
SiteNameMW52   2.319e-02  3.285e-03   7.058 3.72e-12 ***
SiteNameMW53   7.941e-03  3.270e-03   2.428 0.015386 *  
SiteNameMW54   3.395e-02  3.609e-03   9.405  < 2e-16 ***
SiteNameMW55   1.062e-02  3.277e-03   3.241 0.001241 ** 
SiteNameMW58   6.175e-03  3.631e-03   1.701 0.089355 .  
SiteNameMW60   8.305e-03  3.987e-03   2.083 0.037575 *  
SiteNameMW62   1.045e-02  3.630e-03   2.878 0.004117 ** 
SiteNameMW63   1.107e-02  3.625e-03   3.054 0.002335 ** 
SiteNameSMW30  6.719e-03  3.264e-03   2.058 0.039873 *  
SiteNameSMW43  3.013e-02  2.995e-03  10.061  < 2e-16 ***
SiteNameTC01   8.122e-03  3.958e-03   2.052 0.040497 *  
SiteNameTC10   5.964e-03  3.470e-03   1.719 0.086045 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.00914 on 784 degrees of freedom
Multiple R-squared:  0.5927,    Adjusted R-squared:  0.5615 
F-statistic: 19.01 on 60 and 784 DF,  p-value: < 2.2e-16
  • Residuals: Min ( -0.0084612), 1Q(-0.0000823), Median(-0.0000014), 3Q(0.0000000) and Max(0.0122086) shows the distribution of the residuals (difference between observed and predicted values). The small values of the residuals indicate that the model’s prediction are close to the observed values.

  • Intercept: The intercept is statistically significant at the 5% level, indicating a small but significant base level of X11DCE when all SiteName factors are zero (though in practice, SiteName factors wouldn’t be zero).

Significant SiteName Coefficients:

  • SiteNameDMW30: p=9.75×10−9

  • SiteNameDMW31: p=8.93×10−16

  • SiteNameDMW37: p=1.02×10−6

  • SiteNameDMW38: p<2×10−16

  • SiteNameDMW43: p=6.21×10−13

  • SiteNameMW02: p=7.85×10−15

  • SiteNameMW04: p=1.52×10−6

  • SiteNameMW06: p=1.06×10−5

  • SiteNameMW07: p=0.001196

  • SiteNameMW09: p=0.001839

  • SiteNameMW11: p=3.24×10−7

  • SiteNameMW12: p=0.046749

  • SiteNameMW15: p=1.68×10−8

  • SiteNameMW15R: p=7.17×10−5

  • SiteNameMW16: p=0.010694

  • SiteNameMW18: p=7.66×10−15

  • SiteNameMW20: p=6.84×10−11

  • SiteNameMW22: p=0.022636

  • SiteNameMW23: p<2×10−16

  • SiteNameMW24: p=0.008643

  • SiteNameMW25: p=0.001604

  • SiteNameMW26: p<2×10−16

  • SiteNameMW27: p=4.00×10−6

  • SiteNameMW47: p=0.021651

  • SiteNameMW48: p=0.000153

  • SiteNameMW52: p=3.72×10−12

  • SiteNameMW53: p=0.015386

  • SiteNameMW54: p<2×10−16

  • SiteNameMW55: p=0.001241

  • SiteNameMW60: p=0.037575

  • SiteNameMW62: p=0.004117

  • SiteNameMW63: p=0.002335

  • SiteNameSMW30: p=0.039873

  • SiteNameSMW43: p<2×10−16

  • SiteNameTC01: p=0.040497

These significant coefficients indicate that the corresponding SiteName levels have a statistically significant effect on X11DCE.

Model Fit Statistics

  • Residual standard error: 0.00914 on 784 degrees of freedom

  • Multiple R-squared (0.6): This indicates that approximately 60% of the variability in X11DCE is explained by the model.

  • Adjusted R-squared (0.56): This adjusts the R-squared value for the number of predictors in the model, providing a more accurate measure of model fit.

Conclusion:

The model explains a significant portion of the variance in X11DCE (R-squared = 60%). Several SiteName levels are significant, indicating that these levels are important for predicting X11DCE. The model has a good fit, but there might be room for improvement or further refinement to increase the explained variance.

Checking for Model Assumption:

# Check for model assumptions
par(mfrow=c(2, 2))
plot(step_model_DCE_cleaned)

Durbin-Watson test:

The Durbin-Watson test is used to detect the presence of autocorrelation in the residuals of a regression analysis.

# Durbin-Watson test for autocorrelation
dwtest(step_model_DCE)

    Durbin-Watson test

data:  step_model_DCE
DW = 1.4138, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
  • DW = 1.41: The Durbin-Watson statistic value is approximately. The DW statistic ranges from 0 to 4. A value around 2 suggests no autocorrelation. Values less than 2 indicate positive autocorrelation, while values greater than 2 indicate negative autocorrelation.

  • P-value = <2.2e-16: The p-value is <2.2e-16. This value is used to determine the significance of the test result. Typically, a p-value less than 0.05 indicates that the null hypothesis can be rejected with a very high level of confidence.

  • Alternative hypothesis: true autocorrelation is greater than 0: The alternative hypothesis in this test suggests that there is positive autocorrelation in the residuals (i.e., the residuals are positively correlated).

Conclusion

The Durbin-Watson test statistic of 1.41 and the very low p-value suggest that there is significant positive autocorrelation in the residuals of the model step_model_DCE. Positive autocorrelation means that consecutive residuals are correlated with each other, which can indicate that the model might be missing some important variables or that there are patterns in the data that are not captured by the model.

Residual Plots Vs Time or Fitted Values to Visualize Autocorrelation

####Create Plots of Residuals Vs Time or Fitted Values to Visually Inspect the AutoCorrelation

plot(residuals(step_model_DCE_cleaned), type = "l", main = "Residuals over Time")

acf(residuals(step_model_DCE_cleaned), main = "ACF of Residuals")

If most autocorrelation coefficients fall within the confidence bands, it indicates that the residuals are roughly uncorrelated. If many coefficients fall outside the confidence bands, especially at low lags, this suggests autocorrelation in the residuals. A random pattern with most points within the confidence bands indicates that the residuals are likely independent. A pattern where many points outside the bands or showing systematic trends (e.g., gradually decreasing) indicates that residuals are autocorrelated.

Factoring SiteName in testData to Match with the Non-High Leverage trainData (Categorical Variable)

# Ensure factor levels in test set match training set 
testData$SiteName <- factor(testData$SiteName , levels = levels(trainData_cleaned$SiteName))

The above code ensures that the SiteName factor levels in the testData matches those in the Non-High Leverage trainData dataset. This step is crucial when you want to make predictions on the test data using a model trained on the training data, especially when dealing with categorical variables.

Removing Data Points with High Levels for Prediction of Actual Vs Predicted Values

# Filter out rows in testData that have levels not present in trainData_cleaned
testData <- testData[!testData$SiteName %in% c('MW05', 'MW49', 'MW50', 'MW57', 'MW59', 'MW61'), ]

Predicting on test data

Having Trained our model to predict X11DCE based on the testdata. We shall now proceed to investigate how efficient our model is,

# Predict on test data
predictions_DCE_cleaned <- predict(step_model_DCE_cleaned, newdata = testData)

Model Evaluation

# Evaluate the model
DCE_results_cleaned <- data.frame(WellName = testData$SiteName, Actual = testData$X11DCE, 
                                                        Predicted = predictions_DCE_cleaned)

Result of the Prediction

The correlation coefficient of 0.7145267 indicates a strong positive relationship between the actual and predicted values, suggesting that the model performs well. However, further analysis and refinement can be conducted to improve the model’s accuracy and reliability.

print(DCE_results_cleaned)
     WellName       Actual     Predicted
1       DMW13 0.0004998750  1.152282e-02
7       DMW13 0.0004998750  7.449155e-03
12      DMW13 0.0000000000 -1.247425e-03
13      DMW13 0.0001919816 -1.293196e-03
22      DMW13 0.0001919816 -3.459713e-03
25      DMW13 0.0001919816 -5.341429e-03
32      DMW28 0.0001919816  1.314263e-03
41      DMW28 0.0001919816 -7.479960e-04
52      DMW29 0.0002999550  3.068270e-04
53      DMW29 0.0001919816  9.831248e-05
59      DMW29 0.0001919816 -2.019890e-03
60      DMW30 0.0480279690  1.821621e-02
62      DMW30 0.0269340012  1.787801e-02
68      DMW30 0.0262523711  1.670830e-02
76      DMW31 0.0150856374  2.301947e-02
78      DMW31 0.0000000000  2.274738e-02
93      DMW32 0.0001919816  1.565762e-03
94      DMW32 0.0001919816  1.339447e-03
97      DMW32 0.0002999550  8.690184e-04
101     DMW32 0.0001919816 -5.149682e-05
108     DMW33 0.0001919816  1.176398e-03
112     DMW33 0.0001919816  4.999969e-04
129     DMW34 0.0001919816  2.618989e-04
131     DMW34 0.0001919816  7.612951e-06
148     DMW35 0.0001919816 -9.014157e-04
149     DMW35 0.0001919816 -1.348959e-03
151     DMW36 0.0001919816  1.443738e-03
154     DMW36 0.0001919816  1.138595e-03
157     DMW36 0.0000000000  8.487092e-04
175     DMW37 0.0285874569  1.199925e-02
177     DMW37 0.0383549539  1.132794e-02
194     DMW38 0.0467881615  3.289225e-02
196     DMW38 0.0382587121  3.239639e-02
200     DMW38 0.0000000000  3.199208e-02
201     DMW38 0.0298500220  3.177848e-02
205     DMW39 0.0002999550  1.316863e-03
211     DMW39 0.0001919816  4.065194e-04
217     DMW39 0.0001919816 -1.190397e-03
223     DMW40 0.0000000000  9.529249e-04
227     DMW40 0.0001919816  8.326682e-05
228     DMW40 0.0005618421 -1.430477e-04
230     DMW40 0.0004998750 -1.030506e-03
232     DMW41 0.0001919816  1.643945e-03
234     DMW41 0.0001919816  1.343888e-03
235     DMW41 0.0001919816  1.140459e-03
247     DMW42 0.0001919816  2.582145e-03
250     DMW42 0.0001919816  2.086287e-03
254     DMW42 0.0001919816  1.384458e-03
255     DMW42 0.0001919816  1.384458e-03
257     DMW42 0.0001919816  5.325998e-04
262     DMW42 0.0002159767 -1.888203e-03
267     DMW43 0.0045197704  2.139001e-02
269     DMW43 0.0186254641  2.096535e-02
280     DMW43 0.0319830459  1.837418e-02
282     DMW43 0.0434425578  1.646703e-02
289     DMW44 0.0001919816  8.064686e-04
293     DMW44 0.0004159135 -1.140466e-04
301      MW01 0.0344014267  2.057063e-02
304      MW01 0.0178399181  1.959418e-02
305      MW01 0.0000000000  1.503737e-02
312      MW02 0.0188217542  4.025297e-02
313      MW02 0.0440168854  3.959945e-02
319      MW02 0.0544881853  3.794405e-02
320      MW02 0.0004998750  3.747616e-02
323      MW02 0.0339182182  3.632170e-02
325      MW02 0.0449733656  3.402296e-02
326      MW02 0.0168571171  3.315330e-02
336      MW03 0.0089795628  1.189867e-02
337      MW03 0.0004998750  1.059927e-02
340      MW03 0.0029955090  9.325293e-03
342      MW03 0.0004998750  9.032864e-03
348      MW03 0.0038924147  7.573262e-03
350      MW03 0.0004998750  6.668004e-03
356      MW03 0.0004998750  2.589257e-03
365      MW03 0.0001919816 -7.630496e-03
371      MW04 0.0148886125  3.242004e-02
379      MW04 0.0054849302  1.590417e-02
380      MW04 0.0058826632  1.429708e-02
383      MW04 0.0062305498  1.236450e-02
384      MW04 0.0000000000  1.225770e-02
388      MW04 0.0062106738  1.070402e-02
392      MW04 0.0028858320  7.891614e-03
393      MW04 0.0018183458  7.072813e-03
408      MW06 0.0109399400  2.704696e-02
412      MW06 0.0305292050  2.607050e-02
419      MW06 0.0158733492  2.060335e-02
423      MW06 0.0030952049  1.709929e-02
430      MW07 0.0019980027  4.773634e-03
435      MW07 0.0069756137  3.659862e-03
438      MW07 0.0055843783  2.968204e-03
439      MW07 0.0004998750  2.719004e-03
440      MW07 0.0036931718  2.490146e-03
442      MW07 0.0004998750  1.350945e-03
446      MW07 0.0004998750 -1.354658e-03
451      MW07 0.0004998750 -5.769062e-03
459      MW08 0.0069756137  9.143306e-03
464      MW08 0.0129162253  8.108362e-03
465      MW08 0.0049875415  7.874419e-03
469      MW08 0.0050870390  6.486018e-03
471      MW08 0.0004998750  5.613817e-03
486      MW08 0.0001919816 -8.501598e-03
488      MW08 0.0001919816 -9.429742e-03
490      MW09 0.0440168854  2.217303e-02
491      MW09 0.0341115296  2.008788e-02
501      MW09 0.0043903483  1.213382e-02
503      MW09 0.0012991557  9.156130e-03
504      MW09 0.0013191296  7.643129e-03
508      MW09 0.0015887373  6.244556e-03
512      MW09 0.0003519381  4.632383e-03
519      MW10 0.0109399400  1.323903e-02
526      MW10 0.0190180058  1.138529e-02
534      MW10 0.0032945669  5.724880e-03
539      MW11 0.0535407669  2.425392e-02
540      MW11 0.0382587121  2.364364e-02
559      MW11 0.0007717022  5.352848e-03
560      MW12 0.0227394870  2.203784e-02
572      MW12 0.0027960873  1.493818e-02
573      MW12 0.0056838165  1.402784e-02
576      MW15 0.0506931143  3.175038e-02
579      MW15 0.0440168854  2.796406e-02
581      MW15 0.0314986671  2.646886e-02
583      MW15 0.0062802380  2.265457e-02
589     MW15R 0.0068067812  1.140431e-02
590     MW15R 0.0052860044  1.118309e-02
591     MW15R 0.0036632820  1.095423e-02
595     MW15R 0.0118297518  9.581084e-03
602     MW15R 0.0053556330  4.429250e-03
604      MW16 0.0017983819  1.583987e-02
612      MW16 0.0129162253  7.611171e-03
613      MW16 0.0006587830  6.100712e-03
614      MW16 0.0087119406  5.678598e-03
626      MW17 0.0065783154  1.071700e-02
631      MW17 0.0034938893  6.378881e-03
632      MW17 0.0087615057  5.005737e-03
636      MW17 0.0040716994  1.466076e-03
642      MW17 0.0002249747 -1.918683e-04
645      MW17 0.0001919816 -1.114926e-03
658      MW18 0.0171520588  1.999573e-02
660      MW18 0.0000000000  1.970839e-02
666      MW18 0.0151841353  1.861496e-02
667      MW18 0.0189198849  1.815216e-02
673      MW18 0.0231304174  1.351907e-02
674      MW18 0.0013091427  1.257566e-02
680      MW19 0.0004998750  3.577741e-03
695      MW20 0.0506931143  2.561524e-02
698      MW20 0.0506931143  2.301390e-02
699      MW20 0.0227394870  2.217984e-02
704      MW20 0.0258626595  1.726195e-02
708      MW20 0.0050671403  1.654995e-02
710      MW20 0.0058130713  1.607189e-02
717      MW20 0.0009805191  1.096583e-02
719      MW21 0.0004998750  7.165450e-03
734      MW21 0.0001919816 -2.212616e-03
739      MW21 0.0001919816 -4.096875e-03
743      MW22 0.0106431601  5.717145e-03
750      MW22 0.0072437009  4.577944e-03
754      MW22 0.0023272898  3.649800e-03
758      MW23 0.0435383020  3.182229e-02
771      MW24 0.0022374949  6.004052e-03
778      MW24 0.0024270523  4.841965e-03
784      MW24 0.0020578811  1.307390e-03
795      MW25 0.0082161548  7.090504e-03
810      MW26 0.0000000000  4.155855e-02
818      MW26 0.0502177162  3.813077e-02
819      MW26 0.0327575642  3.715177e-02
836      MW27 0.0182327683  8.090019e-03
843      MW45 0.0001919816  7.773513e-05
851      MW46 0.0001919816  2.237256e-03
852      MW46 0.0003239475  2.008399e-03
855      MW46 0.0001919816  6.352548e-04
856      MW46 0.0001919816 -3.259462e-04
874      MW47 0.0005878272 -1.711489e-03
880      MW48 0.0000000000  8.956888e-03
890      MW48 0.0010394596  6.485228e-03
914      MW51 0.0001919816  1.645047e-03
918      MW51 0.0001919816  8.881637e-05
922      MW52 0.0000000000  2.128176e-02
923      MW52 0.0287818014  2.111393e-02
924      MW52 0.0404699326  2.089525e-02
938      MW53 0.0000000000  6.037990e-03
942      MW53 0.0040816587  5.160703e-03
948      MW53 0.0033842669  3.543445e-03
955      MW54 0.0319830459  3.095286e-02
961      MW54 0.0092272973  2.686140e-02
965      MW55 0.0213700257  8.547658e-03
966      MW55 0.0243022925  8.334057e-03
970      MW55 0.0041115360  7.164342e-03
976      MW55 0.0014189928  2.518537e-03
978      MW55 0.0014988761  1.587851e-03
990      MW58 0.0017384880  3.397014e-03
995      MW58 0.0008526364  2.290870e-03
996      MW58 0.0007387271  1.774670e-03
997      MW58 0.0001919816  8.338115e-04
1010     MW60 0.0011992806  5.180561e-03
1011     MW60 0.0034540280  4.842360e-03
1030     MW62 0.0068067812  6.592619e-03
1045     MW63 0.0142973047  3.982744e-03
1052    SMW30 0.0071742037  4.909562e-03
1057    SMW30 0.0080376117  3.709333e-03
1064    SMW43 0.0457378917  2.851765e-02
1078    SMW43 0.0086028888  2.202827e-02
1080     TC01 0.0000000000  9.857517e-03
1083     TC01 0.0047288016  6.322942e-03
1084     TC01 0.0000000000  6.213599e-03
1090     TC01 0.0006078152 -9.292940e-04
1091     TC10 0.0217614918  7.696327e-03
1100     TC10 0.0055744339  2.503807e-03
print(cor(DCE_results_cleaned$Actual, DCE_results_cleaned$Predicted))
[1] 0.7136918

A correlation of 0.71 suggests a strong positive linear relationship between the actual and predicted values. This means that as the actual values increase, the predicted values tend to also increase, and vice versa. While the correlation provides a measure of the strength of the linear relationship, it does not directly indicate how well the model fits the data in terms of variance explained. For this, metrics such as R-squared, RMSE (Root Mean Square Error), and MAE (Mean Absolute Error) are also important.

Visual Inspection of Actual Vs Predicted Values

## Visual Inspection ##
plot(DCE_results_cleaned$Actual, DCE_results_cleaned$Predicted, main = "Actual vs Predicted Values", xlab = "Actual", ylab = "Predicted")
abline(lm(DCE_results_cleaned$Predicted ~ DCE_results_cleaned$Actual), col = "red")

Residual Analysis

Analyze the residuals (actual - predicted) to check for patterns that might indicate model deficiencies.

### Residual Analysis###
residuals <- DCE_results_cleaned$Actual - DCE_results_cleaned$Predicted
plot(residuals, main = "Residuals", ylab = "Residuals")

Model Evaluation Metrics

### Model Evaluation Metrics  ##
rmse <- sqrt(mean((DCE_results_cleaned$Actual - DCE_results_cleaned$Predicted)^2))
print("Root Mean Square Error:")
[1] "Root Mean Square Error:"
print(rmse)
[1] 0.01013567
mae <- mean(abs(DCE_results_cleaned$Actual - DCE_results_cleaned$Predicted))
print("Mean Absolute Error:")
[1] "Mean Absolute Error:"
print(mae)
[1] 0.006768408
r_squared <- summary(lm(DCE_results_cleaned$Predicted ~ DCE_results_cleaned$Actual))$r.squared
print("Root Square Error:")
[1] "Root Square Error:"
print(r_squared)
[1] 0.509356

Summary of Model Performance

  • RMSE (0.010): Indicates the average error magnitude in predicting the dependent variable. A very low value suggests that the model’s predictions are close to the actual values.

  • MAE (0.006): Shows the average absolute error in predictions. The low value reinforces the indication from RMSE that the model performs well.

  • R^2 (0.51): Indicates that about 51% of the variance in the dependent variable is explained by the model. While this shows a decent fit, there’s still room for improvement to capture more variance.

Time series forecasting for 1,1-Dichloroethene

Aggregating data by month for a better time series analysis

For the Time Series Forecasting,

# Time series forecasting for X11DCE
# Aggregating data by month for a better time series analysis
data_ts <- data %>%
  group_by(Date, SiteName) %>%
  summarize(X11DCE = mean(X11DCE), .groups = "drop")

Convert to time series object

# Convert to time series object
ts_DCE <- ts(data_ts$X11DCE, start = c(year(min(data_ts$Date)), month(min(data_ts$Date))), frequency = 12)
# Decompose time series
decomp_DCE <- stl(ts_DCE, s.window="periodic")
# Plot decompositions
plot(decomp_DCE)

Fit ARIMA model for DCE

# Fit ARIMA model
fit_DCE <- auto.arima(ts_DCE)
summary(fit_DCE)
Series: ts_DCE 
ARIMA(3,1,2)(2,0,0)[12] 

Coefficients:
          ar1     ar2     ar3      ma1      ma2     sar1    sar2
      -0.3511  0.0397  0.0461  -0.6290  -0.3356  -0.0569  0.0015
s.e.   0.7434  0.0360  0.0378   0.7437   0.7218   0.0318  0.0317

sigma^2 = 0.0001804:  log likelihood = 2997.39
AIC=-5978.77   AICc=-5978.63   BIC=-5939.23

Training set error measures:
                        ME       RMSE        MAE  MPE MAPE      MASE
Training set -0.0002787663 0.01337882 0.01002845 -Inf  Inf 0.7810371
                      ACF1
Training set -0.0006419278

Forecast for DCE in the next 12 months

# Forecast for the next 12 months
forecast_DCE <- forecast(fit_DCE, h = 12)
plot(forecast_DCE)

# Combine forecasts into a data frame
future_forecasts_DCE <- data.frame(
  Date = seq.Date(from = max(data$Date) + 1, by = "month", length.out = 48),
  DCE_Forecast = as.numeric(forecast_DCE$mean)
)
# Print future forecasts
print(future_forecasts_DCE)
         Date DCE_Forecast
1  2024-01-31  0.004160921
2  2024-03-02  0.004049296
3  2024-03-31  0.004259004
4  2024-05-01  0.004428379
5  2024-05-31  0.004305166
6  2024-07-01  0.004409102
7  2024-07-31  0.004362960
8  2024-08-31  0.004235477
9  2024-10-01  0.004389000
10 2024-10-31  0.004379273
11 2024-12-01  0.004389946
12 2024-12-31  0.004402565
13 2025-01-31  0.004160921
14 2025-03-03  0.004049296
15 2025-03-31  0.004259004
16 2025-05-01  0.004428379
17 2025-05-31  0.004305166
18 2025-07-01  0.004409102
19 2025-07-31  0.004362960
20 2025-08-31  0.004235477
21 2025-10-01  0.004389000
22 2025-10-31  0.004379273
23 2025-12-01  0.004389946
24 2025-12-31  0.004402565
25 2026-01-31  0.004160921
26 2026-03-03  0.004049296
27 2026-03-31  0.004259004
28 2026-05-01  0.004428379
29 2026-05-31  0.004305166
30 2026-07-01  0.004409102
31 2026-07-31  0.004362960
32 2026-08-31  0.004235477
33 2026-10-01  0.004389000
34 2026-10-31  0.004379273
35 2026-12-01  0.004389946
36 2026-12-31  0.004402565
37 2027-01-31  0.004160921
38 2027-03-03  0.004049296
39 2027-03-31  0.004259004
40 2027-05-01  0.004428379
41 2027-05-31  0.004305166
42 2027-07-01  0.004409102
43 2027-07-31  0.004362960
44 2027-08-31  0.004235477
45 2027-10-01  0.004389000
46 2027-10-31  0.004379273
47 2027-12-01  0.004389946
48 2027-12-31  0.004402565

Method2: Time Series Forecast for each Site w.r.t 11DCE

# Create an empty list to store forecasts for each site
forecasts_list <- list()
# Iterate over each site
for(site in unique(data_ts$SiteName)) {site_data <- data_ts %>% filter(SiteName == site)
  
# Check if there is enough data for decomposition
if(nrow(site_data) < 24) {  # Ensure at least two full years of data for monthly series
    warning(paste("Not enough data for site", site, ". Skipping decomposition and ARIMA modeling."))
    next
  }
  
# Convert to time series object
ts_DCE <- ts(site_data$X11DCE, start = c(year(min(site_data$Date)), month(min(site_data$Date))), frequency = 12)
  
  # Check if time series has sufficient length for decomposition
  if (length(ts_DCE) < 24) {  # Ensure at least two full periods of data for decomposition
    warning(paste("Series is too short for STL decomposition for site", site, ". Skipping decomposition and ARIMA modeling."))
    next
  }
  
  # Decompose time series
  decomp_DCE <- tryCatch({
    stl(ts_DCE, s.window = "periodic")
  }, error = function(e) {
    warning(paste("Error in STL decomposition for site", site, ":", e$message))
    NULL
  })
  
  if (!is.null(decomp_DCE)) {
    # Plot decompositions
    plot(decomp_DCE, main = paste("Decomposition for Site", site))
  }
  
  # Fit ARIMA model
  fit_DCE <- auto.arima(ts_DCE)
  summary(fit_DCE)
  
  # Forecast for the next 12 months
  forecast_DCE <- forecast(fit_DCE, h = 12)
  plot(forecast_DCE, main = paste("Forecast for Site", site))
  
  # Store forecast in the list
  forecasts_list[[site]] <- data.frame(
    Date = seq.Date(from = max(site_data$Date) + 1, by = "month", length.out = 12),
    SiteName = site,
    DCE_Forecast = as.numeric(forecast_DCE$mean)
  )
}

Warning: Not enough data for site MW06 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW10 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW01 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW02 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW05 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW12 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW11 . Skipping decomposition and ARIMA
modeling.

Warning: Not enough data for site MW15 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW16 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW17 . Skipping decomposition and ARIMA
modeling.

Warning: Not enough data for site MW19 . Skipping decomposition and ARIMA
modeling.

Warning: Not enough data for site MW21 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site TC01 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site TC10 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW22 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW23 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW24 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW25 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW27 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW28 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site SMW30 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW32 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW29 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW30 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW31 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW33 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW39 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW35 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW36 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW37 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW38 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW34 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW40 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW41 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW42 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW44 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site DMW43 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site SMW43 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW26 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW45 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW47 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW48 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW49 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW46 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW50 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW51 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW15R . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW52 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW53 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW55 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW54 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW57 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW58 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW59 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW60 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW61 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW62 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW63 . Skipping decomposition and ARIMA
modeling.
Warning: Not enough data for site MW56 . Skipping decomposition and ARIMA
modeling.

# Combine all forecasts into a single data frame
all_forecasts <- do.call(rbind, forecasts_list)

# Print future forecasts
print(all_forecasts)
               Date SiteName  DCE_Forecast
MW03.1   2024-01-24     MW03  0.0002383094
MW03.2   2024-02-24     MW03  0.0002383094
MW03.3   2024-03-24     MW03  0.0002383094
MW03.4   2024-04-24     MW03  0.0002383094
MW03.5   2024-05-24     MW03  0.0002383094
MW03.6   2024-06-24     MW03  0.0002383094
MW03.7   2024-07-24     MW03  0.0002383094
MW03.8   2024-08-24     MW03  0.0002383094
MW03.9   2024-09-24     MW03  0.0002383094
MW03.10  2024-10-24     MW03  0.0002383094
MW03.11  2024-11-24     MW03  0.0002383094
MW03.12  2024-12-24     MW03  0.0002383094
MW04.1   2024-01-26     MW04  0.0017983819
MW04.2   2024-02-26     MW04  0.0017983819
MW04.3   2024-03-26     MW04  0.0017983819
MW04.4   2024-04-26     MW04  0.0017983819
MW04.5   2024-05-26     MW04  0.0017983819
MW04.6   2024-06-26     MW04  0.0017983819
MW04.7   2024-07-26     MW04  0.0017983819
MW04.8   2024-08-26     MW04  0.0017983819
MW04.9   2024-09-26     MW04  0.0017983819
MW04.10  2024-10-26     MW04  0.0017983819
MW04.11  2024-11-26     MW04  0.0017983819
MW04.12  2024-12-26     MW04  0.0017983819
MW08.1   2019-01-18     MW08  0.0002122387
MW08.2   2019-02-18     MW08  0.0002122387
MW08.3   2019-03-18     MW08  0.0002122387
MW08.4   2019-04-18     MW08  0.0002122387
MW08.5   2019-05-18     MW08  0.0002122387
MW08.6   2019-06-18     MW08  0.0002122387
MW08.7   2019-07-18     MW08  0.0002122387
MW08.8   2019-08-18     MW08  0.0002122387
MW08.9   2019-09-18     MW08  0.0002122387
MW08.10  2019-10-18     MW08  0.0002122387
MW08.11  2019-11-18     MW08  0.0002122387
MW08.12  2019-12-18     MW08  0.0002122387
MW07.1   2012-06-13     MW07  0.0013575057
MW07.2   2012-07-13     MW07  0.0016377772
MW07.3   2012-08-13     MW07  0.0017293693
MW07.4   2012-09-13     MW07  0.0017593013
MW07.5   2012-10-13     MW07  0.0017690830
MW07.6   2012-11-13     MW07  0.0017722796
MW07.7   2012-12-13     MW07  0.0017733243
MW07.8   2013-01-13     MW07  0.0017736657
MW07.9   2013-02-13     MW07  0.0017737773
MW07.10  2013-03-13     MW07  0.0017738137
MW07.11  2013-04-13     MW07  0.0017738256
MW07.12  2013-05-13     MW07  0.0017738295
MW09.1   2024-01-31     MW09 -0.0018595509
MW09.2   2024-03-02     MW09 -0.0028353596
MW09.3   2024-03-31     MW09 -0.0045080197
MW09.4   2024-05-01     MW09 -0.0057917862
MW09.5   2024-05-31     MW09 -0.0072925835
MW09.6   2024-07-01     MW09 -0.0086722619
MW09.7   2024-07-31     MW09 -0.0101195334
MW09.8   2024-08-31     MW09 -0.0115290831
MW09.9   2024-10-01     MW09 -0.0129596843
MW09.10  2024-10-31     MW09 -0.0143785373
MW09.11  2024-12-01     MW09 -0.0158039466
MW09.12  2024-12-31     MW09 -0.0172256969
DMW13.1  2024-01-26    DMW13  0.0003755115
DMW13.2  2024-02-26    DMW13  0.0003755115
DMW13.3  2024-03-26    DMW13  0.0003755115
DMW13.4  2024-04-26    DMW13  0.0003755115
DMW13.5  2024-05-26    DMW13  0.0003755115
DMW13.6  2024-06-26    DMW13  0.0003755115
DMW13.7  2024-07-26    DMW13  0.0003755115
DMW13.8  2024-08-26    DMW13  0.0003755115
DMW13.9  2024-09-26    DMW13  0.0003755115
DMW13.10 2024-10-26    DMW13  0.0003755115
DMW13.11 2024-11-26    DMW13  0.0003755115
DMW13.12 2024-12-26    DMW13  0.0003755115
MW18.1   2024-01-19     MW18  0.0144983736
MW18.2   2024-02-19     MW18  0.0144983736
MW18.3   2024-03-19     MW18  0.0144983736
MW18.4   2024-04-19     MW18  0.0144983736
MW18.5   2024-05-19     MW18  0.0144983736
MW18.6   2024-06-19     MW18  0.0144983736
MW18.7   2024-07-19     MW18  0.0144983736
MW18.8   2024-08-19     MW18  0.0144983736
MW18.9   2024-09-19     MW18  0.0144983736
MW18.10  2024-10-19     MW18  0.0144983736
MW18.11  2024-11-19     MW18  0.0144983736
MW18.12  2024-12-19     MW18  0.0144983736
MW20.1   2024-01-23     MW20  0.0019386039
MW20.2   2024-02-23     MW20  0.0010024483
MW20.3   2024-03-23     MW20  0.0012446654
MW20.4   2024-04-23     MW20  0.0013994633
MW20.5   2024-05-23     MW20  0.0011857261
MW20.6   2024-06-23     MW20  0.0012988470
MW20.7   2024-07-23     MW20  0.0012880109
MW20.8   2024-08-23     MW20  0.0012545844
MW20.9   2024-09-23     MW20  0.0012853122
MW20.10  2024-10-23     MW20  0.0012732054
MW20.11  2024-11-23     MW20  0.0012714440
MW20.12  2024-12-23     MW20  0.0012773545