Training the Data using MLR

Here we observe the result of MLR on PM25 with features: WND_dir, WND_speed, TMP_air, SLP_pressure, Flow_dakota, VMT_shield, Speed_shield.

I was also interested in seeing how the predicted values looked like overlaying the orginal values for PM25.

ORIGINAL_DATA <- read.csv("all_data_imputed_dow_median.csv")

df = ORIGINAL_DATA[,2:ncol(ORIGINAL_DATA)]

index <- floor(0.7 * nrow(df))

train <- df[1:index, ]
test  <- df[index:nrow(df), ]

mlr_model <- lm(PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure + Flow_dakota + VMT_shield + Speed_shield, data = train)
summary(mlr_model)
## 
## Call:
## lm(formula = PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure + 
##     Flow_dakota + VMT_shield + Speed_shield, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.724  -5.779  -1.823   2.772 132.187 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.692e+01  4.621e+00   7.989 1.95e-15 ***
## WND_dir      -6.618e-03  4.152e-03  -1.594   0.1110    
## WND_speed    -4.764e-01  1.829e-02 -26.048  < 2e-16 ***
## TMP_air      -2.271e-02  3.057e-02  -0.743   0.4575    
## SLP_pressure  1.125e-04  1.990e-05   5.650 1.76e-08 ***
## Flow_dakota  -2.648e-04  2.686e-05  -9.858  < 2e-16 ***
## VMT_shield    6.530e-04  5.136e-05  12.715  < 2e-16 ***
## Speed_shield -1.420e-01  6.214e-02  -2.285   0.0224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.63 on 2904 degrees of freedom
## Multiple R-squared:  0.3299, Adjusted R-squared:  0.3283 
## F-statistic: 204.3 on 7 and 2904 DF,  p-value: < 2.2e-16
predictions <- predict(mlr_model, newdata = test)

#Root mean squared error
rmse <- sqrt(mean((test$PM25 - predictions)^2))

#Mean absolute error
mae <- mean(abs(test$PM25 - predictions))


# Computing the R^2
SSE <- sum((test$PM25 - predictions)^2)
SST <- sum((test$PM25 - mean(test$PM25))^2)

R2_test <- 1 - (SSE/SST)


results <- data.frame(
  Metric = c("RMSE", "MAE", "R²"),
  Value = c(rmse, mae, R2_test)
)

#outputting results
results
##   Metric     Value
## 1   RMSE 6.6967031
## 2    MAE 5.2349590
## 3     R² 0.2317415
#plotting actual vs predicted  
plot(test$PM25, predictions,
     xlab = "Actual",
     ylab = "Predicted",
     main = "Predicted vs Actual")
abline(0, 1, col = "red", lwd = 2)

last_30per_df <- last_30 <- df$PM25[index:nrow(df)]

plot(1:nrow(df), df$PM25, col = rgb(0, 0, 1, 0.7), pch = .01)
points(index:nrow(df), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)

plot(index:nrow(df), last_30, col = rgb(0, 0, 1, 0.7), pch = .01)
points(index:nrow(df), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)

plot(index:nrow(df), last_30, col = rgb(0, 0, 1, 0.3), pch = .01)
points(index:nrow(df), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
abline(h = mean(last_30per_df), col = "blue", lwd = 4,)
abline(h = mean(predictions), col = "red", lwd = 4)

Time Series Plots

Here we look at some important features and their time series.

#Start : 2014-04-08
#End : 2025-08-27


ORIGINAL_DATA <- read.csv("all_data_imputed_dow_median.csv")

df = ORIGINAL_DATA[,2:ncol(ORIGINAL_DATA)]

TIME_SEIRES_DATA <- ts(df$PM25, start = c(2010,4), end=c(2025,8), frequency = 365)

#plot(TIME_SEIRES_DATA,main = "PM25 v. day",
#     ylab = "PM25",
#     xlab = "day")

ts_plot <- function(data, value_col) {
  TIME_SEIRES_DATA <- ts(data[[value_col]], start = c(2010,4), end=c(2025,8), frequency = 365)
  plot(TIME_SEIRES_DATA,
       xlab = "Years",
       ylab = value_col,
       main = paste(value_col, "from 2014-04-08 to 2025-08-27"))
}

#Wanted plots: PM2.5, WND_dir, WND_speed, TMP, 
# DEW_point, SLP_pressure, VMT_shield, 
# Flow_shield, Speed_shield vs. time
ts_plot(df,"PM25")

ts_plot(df,"WND_dir")

ts_plot(df,"WND_speed")

ts_plot(df,"TMP_air")

ts_plot(df,"DEW_point")

ts_plot(df,"SLP_pressure")

ts_plot(df,"VMT_shield")

ts_plot(df,"Flow_shield")

ts_plot(df,"Speed_shield")

Box Plot of PM25

ORIGINAL_DATA <- read.csv("all_data_imputed_dow_median.csv")

df = ORIGINAL_DATA[,2:ncol(ORIGINAL_DATA)]

#Creating a box plot
boxplot(df$PM25)

Normalizing the Data by computing the z-score

Here we normalize the data, and test the MLR model again.

summary(df)
##       PM25            WND_dir        WND_speed          TMP_air      
##  Min.   :  0.700   Min.   : 75.0   Min.   : 0.4348   Min.   : 1.243  
##  1st Qu.:  6.463   1st Qu.:186.6   1st Qu.:16.2143   1st Qu.:12.320  
##  Median :  9.700   Median :253.2   Median :26.2009   Median :18.952  
##  Mean   : 13.253   Mean   :237.8   Mean   :26.9025   Mean   :19.348  
##  3rd Qu.: 15.600   3rd Qu.:295.9   3rd Qu.:35.6071   3rd Qu.:26.639  
##  Max.   :167.867   Max.   :324.8   Max.   :85.2727   Max.   :39.850  
##    DEW_point       SLP_pressure       VMT_dakota       Speed_dakota  
##  Min.   :-7.361   Min.   :  997.3   Min.   :  414.5   Min.   :33.90  
##  1st Qu.: 4.856   1st Qu.: 1013.1   1st Qu.:18752.4   1st Qu.:63.20  
##  Median : 7.884   Median : 1021.1   Median :23147.3   Median :64.10  
##  Mean   : 7.596   Mean   : 7759.4   Mean   :22419.1   Mean   :64.14  
##  3rd Qu.:10.463   3rd Qu.:13218.9   3rd Qu.:26458.0   3rd Qu.:65.60  
##  Max.   :19.136   Max.   :62441.5   Max.   :31518.0   Max.   :68.10  
##   Flow_dakota      VMT_shield     Speed_shield   Flow_shield         dow       
##  Min.   :  921   Min.   : 6084   Min.   :34.4   Min.   :22125   Min.   :1.000  
##  1st Qu.:41672   1st Qu.:17318   1st Qu.:57.7   1st Qu.:62603   1st Qu.:2.000  
##  Median :51439   Median :20119   Median :60.8   Median :72857   Median :4.000  
##  Mean   :49820   Mean   :19283   Mean   :60.1   Mean   :68605   Mean   :3.999  
##  3rd Qu.:58796   3rd Qu.:21164   3rd Qu.:63.8   3rd Qu.:76692   3rd Qu.:6.000  
##  Max.   :70040   Max.   :44278   Max.   :67.7   Max.   :90388   Max.   :7.000
#Normalizing the data

df_norm <- as.data.frame(scale(df))


index <- floor(0.7 * nrow(df_norm))

train <- df_norm[1:index, ]
test  <- df_norm[index:nrow(df_norm), ]

mlr_model <- lm(PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure + Flow_dakota + VMT_shield + Speed_shield, data = train)
summary(mlr_model)
## 
## Call:
## lm(formula = PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure + 
##     Flow_dakota + VMT_shield + Speed_shield, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1831 -0.4904 -0.1547  0.2353 11.2185 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.04870    0.01729   2.816  0.00489 ** 
## WND_dir      -0.03502    0.02197  -1.594  0.11103    
## WND_speed    -0.52873    0.02030 -26.048  < 2e-16 ***
## TMP_air      -0.01545    0.02079  -0.743  0.45751    
## SLP_pressure  0.09517    0.01684   5.650 1.76e-08 ***
## Flow_dakota  -0.23589    0.02393  -9.858  < 2e-16 ***
## VMT_shield    0.22919    0.01803  12.715  < 2e-16 ***
## Speed_shield -0.05558    0.02432  -2.285  0.02236 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9024 on 2904 degrees of freedom
## Multiple R-squared:  0.3299, Adjusted R-squared:  0.3283 
## F-statistic: 204.3 on 7 and 2904 DF,  p-value: < 2.2e-16
predictions <- predict(mlr_model, newdata = test)

#Root mean squared error
rmse <- sqrt(mean((test$PM25 - predictions)^2))

#Mean absolute error
mae <- mean(abs(test$PM25 - predictions))


# Computing the R^2
SSE <- sum((test$PM25 - predictions)^2)
SST <- sum((test$PM25 - mean(test$PM25))^2)

R2_test <- 1 - (SSE/SST)


results <- data.frame(
  Metric = c("RMSE", "MAE", "R²"),
  Value = c(rmse, mae, R2_test)
)

#outputting results
results
##   Metric     Value
## 1   RMSE 0.5683375
## 2    MAE 0.4442818
## 3     R² 0.2317415
#plotting actual vs predicted  
plot(test$PM25, predictions,
     xlab = "Actual",
     ylab = "Predicted",
     main = "Normalized : Predicted vs Actual")
abline(0, 1, col = "red", lwd = 2)

last_30per_df_norm <- last_30 <- df_norm$PM25[index:nrow(df_norm)]

plot(1:nrow(df_norm), df_norm$PM25, col = rgb(0, 0, 1, 0.7), pch = .01, main = "Normalized")
points(index:nrow(df_norm), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)

plot(index:nrow(df_norm), last_30, col = rgb(0, 0, 1, 0.7), pch = .01, main = "Normalized")
points(index:nrow(df_norm), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)

plot(index:nrow(df_norm), last_30, col = rgb(0, 0, 1, 0.3), pch = .01, main = "Normalized")
points(index:nrow(df_norm), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
abline(h = mean(last_30per_df_norm), col = "blue", lwd = 4,)
abline(h = mean(predictions), col = "red", lwd = 4)