Here we observe the result of MLR on PM25 with features: WND_dir, WND_speed, TMP_air, SLP_pressure, Flow_dakota, VMT_shield, Speed_shield.
I was also interested in seeing how the predicted values looked like overlaying the orginal values for PM25.
ORIGINAL_DATA <- read.csv("all_data_imputed_dow_median.csv")
df = ORIGINAL_DATA[,2:ncol(ORIGINAL_DATA)]
index <- floor(0.7 * nrow(df))
train <- df[1:index, ]
test <- df[index:nrow(df), ]
mlr_model <- lm(PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure + Flow_dakota + VMT_shield + Speed_shield, data = train)
summary(mlr_model)
##
## Call:
## lm(formula = PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure +
## Flow_dakota + VMT_shield + Speed_shield, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.724 -5.779 -1.823 2.772 132.187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.692e+01 4.621e+00 7.989 1.95e-15 ***
## WND_dir -6.618e-03 4.152e-03 -1.594 0.1110
## WND_speed -4.764e-01 1.829e-02 -26.048 < 2e-16 ***
## TMP_air -2.271e-02 3.057e-02 -0.743 0.4575
## SLP_pressure 1.125e-04 1.990e-05 5.650 1.76e-08 ***
## Flow_dakota -2.648e-04 2.686e-05 -9.858 < 2e-16 ***
## VMT_shield 6.530e-04 5.136e-05 12.715 < 2e-16 ***
## Speed_shield -1.420e-01 6.214e-02 -2.285 0.0224 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.63 on 2904 degrees of freedom
## Multiple R-squared: 0.3299, Adjusted R-squared: 0.3283
## F-statistic: 204.3 on 7 and 2904 DF, p-value: < 2.2e-16
predictions <- predict(mlr_model, newdata = test)
#Root mean squared error
rmse <- sqrt(mean((test$PM25 - predictions)^2))
#Mean absolute error
mae <- mean(abs(test$PM25 - predictions))
# Computing the R^2
SSE <- sum((test$PM25 - predictions)^2)
SST <- sum((test$PM25 - mean(test$PM25))^2)
R2_test <- 1 - (SSE/SST)
results <- data.frame(
Metric = c("RMSE", "MAE", "R²"),
Value = c(rmse, mae, R2_test)
)
#outputting results
results
## Metric Value
## 1 RMSE 6.6967031
## 2 MAE 5.2349590
## 3 R² 0.2317415
#plotting actual vs predicted
plot(test$PM25, predictions,
xlab = "Actual",
ylab = "Predicted",
main = "Predicted vs Actual")
abline(0, 1, col = "red", lwd = 2)
last_30per_df <- last_30 <- df$PM25[index:nrow(df)]
plot(1:nrow(df), df$PM25, col = rgb(0, 0, 1, 0.7), pch = .01)
points(index:nrow(df), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
plot(index:nrow(df), last_30, col = rgb(0, 0, 1, 0.7), pch = .01)
points(index:nrow(df), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
plot(index:nrow(df), last_30, col = rgb(0, 0, 1, 0.3), pch = .01)
points(index:nrow(df), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
abline(h = mean(last_30per_df), col = "blue", lwd = 4,)
abline(h = mean(predictions), col = "red", lwd = 4)
Here we look at some important features and their time series.
#Start : 2014-04-08
#End : 2025-08-27
ORIGINAL_DATA <- read.csv("all_data_imputed_dow_median.csv")
df = ORIGINAL_DATA[,2:ncol(ORIGINAL_DATA)]
TIME_SEIRES_DATA <- ts(df$PM25, start = c(2010,4), end=c(2025,8), frequency = 365)
#plot(TIME_SEIRES_DATA,main = "PM25 v. day",
# ylab = "PM25",
# xlab = "day")
ts_plot <- function(data, value_col) {
TIME_SEIRES_DATA <- ts(data[[value_col]], start = c(2010,4), end=c(2025,8), frequency = 365)
plot(TIME_SEIRES_DATA,
xlab = "Years",
ylab = value_col,
main = paste(value_col, "from 2014-04-08 to 2025-08-27"))
}
#Wanted plots: PM2.5, WND_dir, WND_speed, TMP,
# DEW_point, SLP_pressure, VMT_shield,
# Flow_shield, Speed_shield vs. time
ts_plot(df,"PM25")
ts_plot(df,"WND_dir")
ts_plot(df,"WND_speed")
ts_plot(df,"TMP_air")
ts_plot(df,"DEW_point")
ts_plot(df,"SLP_pressure")
ts_plot(df,"VMT_shield")
ts_plot(df,"Flow_shield")
ts_plot(df,"Speed_shield")
ORIGINAL_DATA <- read.csv("all_data_imputed_dow_median.csv")
df = ORIGINAL_DATA[,2:ncol(ORIGINAL_DATA)]
#Creating a box plot
boxplot(df$PM25)
Here we normalize the data, and test the MLR model again.
summary(df)
## PM25 WND_dir WND_speed TMP_air
## Min. : 0.700 Min. : 75.0 Min. : 0.4348 Min. : 1.243
## 1st Qu.: 6.463 1st Qu.:186.6 1st Qu.:16.2143 1st Qu.:12.320
## Median : 9.700 Median :253.2 Median :26.2009 Median :18.952
## Mean : 13.253 Mean :237.8 Mean :26.9025 Mean :19.348
## 3rd Qu.: 15.600 3rd Qu.:295.9 3rd Qu.:35.6071 3rd Qu.:26.639
## Max. :167.867 Max. :324.8 Max. :85.2727 Max. :39.850
## DEW_point SLP_pressure VMT_dakota Speed_dakota
## Min. :-7.361 Min. : 997.3 Min. : 414.5 Min. :33.90
## 1st Qu.: 4.856 1st Qu.: 1013.1 1st Qu.:18752.4 1st Qu.:63.20
## Median : 7.884 Median : 1021.1 Median :23147.3 Median :64.10
## Mean : 7.596 Mean : 7759.4 Mean :22419.1 Mean :64.14
## 3rd Qu.:10.463 3rd Qu.:13218.9 3rd Qu.:26458.0 3rd Qu.:65.60
## Max. :19.136 Max. :62441.5 Max. :31518.0 Max. :68.10
## Flow_dakota VMT_shield Speed_shield Flow_shield dow
## Min. : 921 Min. : 6084 Min. :34.4 Min. :22125 Min. :1.000
## 1st Qu.:41672 1st Qu.:17318 1st Qu.:57.7 1st Qu.:62603 1st Qu.:2.000
## Median :51439 Median :20119 Median :60.8 Median :72857 Median :4.000
## Mean :49820 Mean :19283 Mean :60.1 Mean :68605 Mean :3.999
## 3rd Qu.:58796 3rd Qu.:21164 3rd Qu.:63.8 3rd Qu.:76692 3rd Qu.:6.000
## Max. :70040 Max. :44278 Max. :67.7 Max. :90388 Max. :7.000
#Normalizing the data
df_norm <- as.data.frame(scale(df))
index <- floor(0.7 * nrow(df_norm))
train <- df_norm[1:index, ]
test <- df_norm[index:nrow(df_norm), ]
mlr_model <- lm(PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure + Flow_dakota + VMT_shield + Speed_shield, data = train)
summary(mlr_model)
##
## Call:
## lm(formula = PM25 ~ WND_dir + WND_speed + TMP_air + SLP_pressure +
## Flow_dakota + VMT_shield + Speed_shield, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1831 -0.4904 -0.1547 0.2353 11.2185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04870 0.01729 2.816 0.00489 **
## WND_dir -0.03502 0.02197 -1.594 0.11103
## WND_speed -0.52873 0.02030 -26.048 < 2e-16 ***
## TMP_air -0.01545 0.02079 -0.743 0.45751
## SLP_pressure 0.09517 0.01684 5.650 1.76e-08 ***
## Flow_dakota -0.23589 0.02393 -9.858 < 2e-16 ***
## VMT_shield 0.22919 0.01803 12.715 < 2e-16 ***
## Speed_shield -0.05558 0.02432 -2.285 0.02236 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9024 on 2904 degrees of freedom
## Multiple R-squared: 0.3299, Adjusted R-squared: 0.3283
## F-statistic: 204.3 on 7 and 2904 DF, p-value: < 2.2e-16
predictions <- predict(mlr_model, newdata = test)
#Root mean squared error
rmse <- sqrt(mean((test$PM25 - predictions)^2))
#Mean absolute error
mae <- mean(abs(test$PM25 - predictions))
# Computing the R^2
SSE <- sum((test$PM25 - predictions)^2)
SST <- sum((test$PM25 - mean(test$PM25))^2)
R2_test <- 1 - (SSE/SST)
results <- data.frame(
Metric = c("RMSE", "MAE", "R²"),
Value = c(rmse, mae, R2_test)
)
#outputting results
results
## Metric Value
## 1 RMSE 0.5683375
## 2 MAE 0.4442818
## 3 R² 0.2317415
#plotting actual vs predicted
plot(test$PM25, predictions,
xlab = "Actual",
ylab = "Predicted",
main = "Normalized : Predicted vs Actual")
abline(0, 1, col = "red", lwd = 2)
last_30per_df_norm <- last_30 <- df_norm$PM25[index:nrow(df_norm)]
plot(1:nrow(df_norm), df_norm$PM25, col = rgb(0, 0, 1, 0.7), pch = .01, main = "Normalized")
points(index:nrow(df_norm), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
plot(index:nrow(df_norm), last_30, col = rgb(0, 0, 1, 0.7), pch = .01, main = "Normalized")
points(index:nrow(df_norm), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
plot(index:nrow(df_norm), last_30, col = rgb(0, 0, 1, 0.3), pch = .01, main = "Normalized")
points(index:nrow(df_norm), predictions, col = rgb(1, 0, 0, 0.3), pch = .01)
abline(h = mean(last_30per_df_norm), col = "blue", lwd = 4,)
abline(h = mean(predictions), col = "red", lwd = 4)