Data

Initial preparation

For the regression project we use a dataset measuring traffic on a highway in one-hour intervals. After loading the dataset and dropping records with zero traffic, we split it into train and test samples (70/30 ratio) and look for outliers in the train sample. A few records have a temperature of -273 degrees, which is essentially absolute zero, so these are obviously missing data. One record reports 9831.3 mm of rain, which is also an obvious outlier; the next largest values are 55.63, 44.45, 31.45, etc., so we treat those as plausible. We replace the outliers with the median.

library(readr)
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(caret)
## Loading required package: lattice
traffic <- read_csv("traffic_train.csv")
## Rows: 29701 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): weather_general, weather_detailed
## dbl  (5): clouds_coverage_pct, temperature, rain_mm, snow_mm, traffic
## dttm (1): date_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
traffic <- traffic[!traffic$traffic == 0,]
set.seed(1234)
sample <- sample.int(n = nrow(traffic), size = floor(.70*nrow(traffic)), replace = F)
TrainData <- traffic[sample,]
TestData  <- traffic[-sample, ]

# Viewing biggest values of temperature and rain
head(sort(TrainData$temperature, decreasing = TRUE))
head(sort(TrainData$rain_mm, decreasing = TRUE))
# Replacing outliers with medians
TrainData[TrainData$temperature < -50,]$temperature <- median(TrainData$temperature)
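
The temperature fix above follows the description, but the corresponding median replacement for the rain outlier is not shown in this chunk. A minimal sketch of the analogous step, assuming a cutoff of 100 mm (comfortably above the next largest observed values of about 55 mm):

# Analogous replacement for the rain outlier (hypothetical 100 mm cutoff)
TrainData[TrainData$rain_mm > 100,]$rain_mm <- median(TrainData$rain_mm)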

Distribution of variables

From the timestamp variable we extract the month and hour and, using the weekdays() function, derive the day of the week. The derived time variables appear to influence traffic, since traffic clearly differs across their values (especially hour and weekday). After this initial analysis we decided to convert the snow and rain variables into binary indicators (zero vs. non-zero). Due to its low quality and its strong overlap with weather_general, we also dropped weather_detailed. Categorical variables were converted to factors, and the continuous variables (temperature and cloud coverage) were standardized.

TrainData$hour <- as.factor(substr(TrainData$date_time,12,13))
TrainData$month <- as.factor(substr(TrainData$date_time,6,7))
TrainData$weekday <- as.factor(weekdays(as.Date(substr(TrainData$date_time,0,10))))
box_hour <- ggplot(TrainData, aes(x = traffic, y = hour)) + geom_boxplot() + theme_gray() + ggtitle('Hour')
box_month <- ggplot(TrainData, aes(x = traffic, y = month)) + geom_boxplot() + theme_gray() + ggtitle('Month')
box_weekday <- ggplot(TrainData, aes(x = traffic, y = weekday)) + geom_boxplot() + theme_gray() + ggtitle('Day of week')
box_weather <- ggplot(TrainData, aes(x = traffic, y = weather_general)) + geom_boxplot() + theme_gray() + ggtitle('Weather') + ylab("Weather")
box_temperature <- ggplot(TrainData, aes(x = traffic, y = cut_interval(temperature, length = 5))) + geom_boxplot() + theme_gray() + ggtitle('Temperature')+ ylab("Temperature")
box_rain <- ggplot(TrainData, aes(x = traffic, y = cut_interval(rain_mm, length = 2))) + geom_boxplot() + theme_gray() + ggtitle('Rain in mm')+ ylab("Rain in mm")
box_snow <- ggplot(TrainData, aes(x = traffic, y = cut_interval(snow_mm, length = 0.1))) + geom_boxplot() + theme_gray() + ggtitle('Snow in mm')+ ylab("Snow in mm")
box_clouds <- ggplot(TrainData, aes(x = traffic, y = cut_interval(clouds_coverage_pct, length = 10))) + geom_boxplot() + theme_gray() + ggtitle('Cloud Coverage') + ylab("Clouds coverage pct")

grid.arrange(box_hour,
             box_month,
             box_weekday,
             box_weather,
             box_temperature,
             box_rain,
             box_snow,
             box_clouds, ncol=3)

TrainData$weather_general <- as.factor(TrainData$weather_general)
TrainData$rain <- 0
TrainData$rain[TrainData$rain_mm>0] = 1 
TrainData$snow <- 0
TrainData$snow[TrainData$snow_mm>0] = 1
TrainData$temperature <- (TrainData$temperature - mean(TrainData$temperature))/sd(TrainData$temperature)
TrainData$clouds_coverage_pct <- (TrainData$clouds_coverage_pct - mean(TrainData$clouds_coverage_pct))/sd(TrainData$clouds_coverage_pct)
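
The decision to drop weather_detailed can be supported with a quick cross-tabulation against weather_general (output omitted here); a minimal check, since the column is still present in the training data at this point:

# Each weather_detailed level falls almost entirely under a single weather_general level
table(TrainData$weather_general, TrainData$weather_detailed)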

Correlations

Correlations measured by the Kendall tau coefficient show only weak relationships between the continuous variables. An ANOVA test was performed for each categorical variable, and the null hypothesis of no influence on traffic is rejected for every one of them.

colsNumeric <- c(4:8)
correlations <- 
  cor(TrainData[,colsNumeric],
      method = "kendall")
corrplot.mixed(correlations,
               upper = "square",
               lower = "number",
               tl.col="black", # color of labels (variable names)
               tl.pos = "lt")

# Anova test for categorical variables

categorical_vars <- 
  sapply(TrainData, is.factor) %>% 
  which() %>% names()

# Returns the ANOVA p-value for traffic explained by a single categorical variable
anova_statistic <- function(categorical_var) {
  anova_ <- aov(TrainData$traffic ~ 
                  TrainData[[categorical_var]]) 
  
  return(summary(anova_)[[1]][1, 5]) # column 5 is Pr(>F)
}

sapply(categorical_vars,
       anova_statistic) %>% 
  sort(decreasing = TRUE) -> anova_all_categorical

anova_all_categorical
##           month weather_general         weekday            hour 
##    1.247709e-20    1.357002e-51   3.012272e-247    0.000000e+00

Regression

Linear Regression

First we estimate a linear regression model. Its results suggest that most of the variables are significant (based on t-tests), so theory suggests that ridge regression should be more suitable than LASSO, which works best when only a small number of predictors matter. The exception is weather_general, most of whose levels are insignificant; however, given the very low p-value of its ANOVA test, we keep the variable for further estimation. Finally, we replace negative predictions with 0, since negative traffic makes no sense. The linear regression yields a Mean Absolute Percentage Error (MAPE) of about 0.85.

lm1 <- lm(traffic~weather_general+clouds_coverage_pct+temperature+rain+snow+hour+month+weekday, data = TrainData)
summary(lm1)
## 
## Call:
## lm(formula = traffic ~ weather_general + clouds_coverage_pct + 
##     temperature + rain + snow + hour + month + weekday, data = TrainData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4944.4  -410.5    -8.5   497.8  3153.3 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1150.573     43.465  26.471  < 2e-16 ***
## weather_generalClouds          48.271     20.847   2.315 0.020598 *  
## weather_generalDrizzle         38.052     39.863   0.955 0.339806    
## weather_generalFog             15.582     48.527   0.321 0.748141    
## weather_generalHaze           -37.661     40.152  -0.938 0.348276    
## weather_generalMist             6.455     25.982   0.248 0.803804    
## weather_generalRain            17.064     28.700   0.595 0.552157    
## weather_generalSmoke         -434.240    289.106  -1.502 0.133110    
## weather_generalSnow           -63.426     32.178  -1.971 0.048725 *  
## weather_generalSquall          38.915    471.618   0.083 0.934238    
## weather_generalThunderstorm   -21.841     50.362  -0.434 0.664524    
## clouds_coverage_pct           -33.007      9.385  -3.517 0.000437 ***
## temperature                    88.117     12.636   6.973 3.19e-12 ***
## rain                          -90.785     24.081  -3.770 0.000164 ***
## snow                         -342.117    122.829  -2.785 0.005353 ** 
## hour01                       -298.928     38.885  -7.687 1.57e-14 ***
## hour02                       -389.997     38.687 -10.081  < 2e-16 ***
## hour03                       -443.288     39.080 -11.343  < 2e-16 ***
## hour04                       -100.541     38.893  -2.585 0.009742 ** 
## hour05                       1267.550     38.729  32.729  < 2e-16 ***
## hour06                       3309.258     38.630  85.665  < 2e-16 ***
## hour07                       3912.424     38.994 100.334  < 2e-16 ***
## hour08                       3788.016     38.430  98.568  < 2e-16 ***
## hour09                       3538.415     39.305  90.024  < 2e-16 ***
## hour10                       3372.971     38.695  87.168  < 2e-16 ***
## hour11                       3629.583     39.314  92.323  < 2e-16 ***
## hour12                       3883.792     39.393  98.592  < 2e-16 ***
## hour13                       3870.899     39.531  97.920  < 2e-16 ***
## hour14                       4106.475     39.277 104.552  < 2e-16 ***
## hour15                       4375.763     39.233 111.534  < 2e-16 ***
## hour16                       4785.758     39.084 122.448  < 2e-16 ***
## hour17                       4458.153     39.786 112.052  < 2e-16 ***
## hour18                       3395.629     39.181  86.666  < 2e-16 ***
## hour19                       2402.096     39.143  61.368  < 2e-16 ***
## hour20                       1941.617     38.957  49.840  < 2e-16 ***
## hour21                       1810.297     38.998  46.420  < 2e-16 ***
## hour22                       1327.002     38.649  34.335  < 2e-16 ***
## hour23                        601.318     38.577  15.587  < 2e-16 ***
## month02                       118.358     30.558   3.873 0.000108 ***
## month03                       209.404     31.664   6.613 3.85e-11 ***
## month04                       200.303     33.422   5.993 2.09e-09 ***
## month05                        88.507     37.171   2.381 0.017269 *  
## month06                       132.766     41.974   3.163 0.001563 ** 
## month07                      -120.948     41.954  -2.883 0.003944 ** 
## month08                        47.310     42.009   1.126 0.260107    
## month09                        14.288     40.859   0.350 0.726581    
## month10                        87.684     34.387   2.550 0.010781 *  
## month11                       -55.925     30.222  -1.850 0.064265 .  
## month12                      -125.254     27.355  -4.579 4.70e-06 ***
## weekdayMonday                -315.764     21.092 -14.971  < 2e-16 ***
## weekdaySaturday              -860.548     21.309 -40.384  < 2e-16 ***
## weekdaySunday               -1309.080     21.179 -61.810  < 2e-16 ***
## weekdayThursday               -29.266     21.169  -1.383 0.166825    
## weekdayTuesday               -100.806     21.312  -4.730 2.26e-06 ***
## weekdayWednesday              -47.797     21.196  -2.255 0.024142 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 814.8 on 20724 degrees of freedom
## Multiple R-squared:  0.832,  Adjusted R-squared:  0.8316 
## F-statistic:  1901 on 54 and 20724 DF,  p-value: < 2.2e-16
# Constructing metrics to properly understand and test the data
regressionMetrics <- function(real, predicted) {
  # Mean Square Error
  MSE <- mean((real - predicted)^2)
  # Root Mean Square Error
  RMSE <- sqrt(MSE)
  # Mean Absolute Error
  MAE <- mean(abs(real - predicted))
  # Mean Absolute Percentage Error
  MAPE <- mean(abs(real - predicted)/real)
  # Median Absolute Error
  MedAE <- median(abs(real - predicted))
  # Mean Squared Logarithmic Error
  MSLE <- mean((log(1 + real) - log(1 + predicted))^2)
  # R2
  R2 <- cor(predicted, real)^2
  
  
  result <- data.frame(MSE, RMSE, MAE, MAPE, MedAE, MSLE, R2)
  return(result)
}
TestData$hour <- as.factor(substr(TestData$date_time,12,13))
TestData$month <- as.factor(substr(TestData$date_time,6,7))
TestData$weekday <- as.factor(weekdays(as.Date(substr(TestData$date_time,0,10))))
TestData$weather_general <- as.factor(TestData$weather_general)
TestData$rain <- 0
TestData$rain[TestData$rain_mm>0] = 1 
TestData$snow <- 0
TestData$snow[TestData$snow_mm>0] = 1
TestData$temperature <- (TestData$temperature - mean(TestData$temperature))/sd(TestData$temperature)
TestData$clouds_coverage_pct <- (TestData$clouds_coverage_pct - mean(TestData$clouds_coverage_pct))/sd(TestData$clouds_coverage_pct)
fitted_vals <- predict.lm(lm1, newdata = TestData)
fitted_vals[fitted_vals < 0] = 0 
regressionMetrics(real = TestData$traffic,
                  predicted = fitted_vals)
##        MSE     RMSE      MAE      MAPE    MedAE     MSLE        R2
## 1 671521.6 819.4642 588.8444 0.8526099 438.9735 1.980953 0.8314193
ggplot(data.frame(data = TestData$traffic, 
                  fitted = fitted_vals),
       aes(x = data, y = fitted)) +
  geom_point()+
  geom_abline(slope = 1, size = 1.5, color = "gold")+
  theme_gray()+
  ggtitle("Linear regression predicted vs actual values")

The chart above compares the values predicted by the regression with the actual values, with the y = x line drawn through the middle. There is a relatively large group of observations predicted as 0 whose actual values range between 0 and 2000, and a large group of underestimated predictions at higher actual values. From the perspective of minimizing MAPE, the biggest problem is observations predicted as high values whose actual values are close to 0. Therefore, for further estimation we remove observations with traffic below 100 from both samples, as they are most probably erroneous records.

# Drop observations with traffic below 100 from both samples
TestData  <- TestData[TestData$traffic >= 100, ]
TrainData <- TrainData[TrainData$traffic >= 100, ]

Ridge regression

Encouraged by the results of the linear regression, we estimate a ridge regression model with 5-fold cross-validation over a wide grid of lambda values.

# Ridge regression

# Keep the target and the predictors used in lm1 (drop date_time, weather_detailed and the raw rain_mm/snow_mm)
cols <- colnames(TrainData)[c(2,4,5,8:13)]

lambdas <- exp(log(10)*seq(-2, 9, length.out = 200))

ctrl_cv5 <- trainControl(method = "cv",
                         number = 5)
parameters_ridge <- expand.grid(alpha = 0, # ridge 
                                lambda = lambdas)
set.seed(1234)
traffic_ridge <- train(traffic ~ .,
                      data = TrainData %>% 
                        dplyr::select(all_of(cols)),
                      method = "glmnet", 
                      tuneGrid = parameters_ridge,
                      trControl = ctrl_cv5)
ridge_predited <- predict(traffic_ridge, TestData)
ridge_predited[ridge_predited <0 ] = 0
regressionMetrics(real = TestData$traffic,
                  predicted = ridge_predited)
##        MSE     RMSE      MAE      MAPE    MedAE     MSLE        R2
## 1 713155.7 844.4855 617.4686 0.4578488 450.0579 1.547379 0.8229065
ggplot(data.frame(data = TestData$traffic, 
                  fitted = ridge_predited),
       aes(x = data, y = fitted)) +
  geom_point()+
  geom_abline(slope = 1, size = 1.5, color = "gold")+
  theme_gray()+
  ggtitle("Ridge regression predicted vs actual values")

The chart above shows the results of the ridge regression, which achieves a MAPE of about 0.46. There are still observations with very low actual traffic but large predicted values, yet this amount of error is acceptable.
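
For a direct comparison with the LASSO coefficients reported below, the ridge coefficients at the selected penalty can be extracted in the same way (output omitted; the exact values depend on the cross-validation run):

predict(traffic_ridge$finalModel, # stored model
        s = traffic_ridge$bestTune$lambda, # selected lambda
        type = "coefficients")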

LASSO

Next we introduce the LASSO method to see how the model behaves when it is allowed to drop some of the variables.

parameters_lasso <- expand.grid(alpha = 1,
                                lambda = lambdas)
set.seed(1234)
traffic_lasso <-  train(traffic ~ .,
                       data = TrainData %>% 
                         dplyr::select(all_of(cols)),
                       method = "glmnet", 
                       tuneGrid = parameters_lasso,
                       trControl = ctrl_cv5)

lasso_predited <- predict(traffic_lasso, TestData)
lasso_predited[lasso_predited <0 ] = 0

predict(traffic_lasso$finalModel, # stored model
        s = traffic_lasso$bestTune$lambda, # lambda
        type = "coefficients")
## 55 x 1 sparse Matrix of class "dgCMatrix"
##                                       s1
## (Intercept)                  1233.486292
## weather_generalClouds          37.709622
## weather_generalDrizzle         14.633431
## weather_generalFog              8.587031
## weather_generalHaze           -43.488168
## weather_generalMist             .       
## weather_generalRain             .       
## weather_generalSmoke         -428.355832
## weather_generalSnow           -72.226442
## weather_generalSquall           .       
## weather_generalThunderstorm   -25.674590
## clouds_coverage_pct           -26.300896
## temperature                    91.933061
## hour01                       -376.220739
## hour02                       -468.118420
## hour03                       -519.958122
## hour04                       -178.872163
## hour05                       1181.726863
## hour06                       3223.936910
## hour07                       3825.969310
## hour08                       3702.252335
## hour09                       3460.739044
## hour10                       3290.692028
## hour11                       3547.969344
## hour12                       3796.875497
## hour13                       3808.641333
## hour14                       4019.806109
## hour15                       4294.841570
## hour16                       4716.784049
## hour17                       4383.387626
## hour18                       3314.101735
## hour19                       2316.674200
## hour20                       1859.169520
## hour21                       1725.815384
## hour22                       1248.492880
## hour23                        520.595923
## month02                       105.547674
## month03                       193.423077
## month04                       182.004900
## month05                        66.947537
## month06                       110.758321
## month07                      -100.202924
## month08                        26.828324
## month09                         .       
## month10                        71.249916
## month11                       -65.879459
## month12                      -135.397991
## weekdayMonday                -303.732108
## weekdaySaturday              -826.665980
## weekdaySunday               -1292.762572
## weekdayThursday               -15.536822
## weekdayTuesday                -87.574901
## weekdayWednesday              -34.339235
## rain                          -65.100345
## snow                         -338.280418
regressionMetrics(real = TestData$traffic,
                  predicted = lasso_predited)
##        MSE     RMSE      MAE      MAPE    MedAE     MSLE        R2
## 1 661483.6 813.3164 587.2223 0.4142896 440.9405 1.834257 0.8336715
ggplot(data.frame(data = TestData$traffic, 
                  fitted = lasso_predited),
       aes(x = data, y = fitted)) +
  geom_point()+
  geom_abline(slope = 1, size = 1.5, color = "gold")+
  theme_gray()+
  ggtitle("LASSO regression predicted vs actual values")

LASSO regression turns out to have an even smaller MAPE, around 0.41. A closer look at the coefficients reveals that several levels of weather_general and one level of month are shrunk to zero and thus removed from the model. The chart above also shows that the predictions are less dispersed than in the previous model. Let us then see the results of the elastic net, which is a hybrid of the two approaches.
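
The dropped terms can also be listed programmatically from the sparse coefficient matrix; a short sketch reusing the fitted object (lasso_coefs is a name introduced here for illustration):

lasso_coefs <- predict(traffic_lasso$finalModel,
                       s = traffic_lasso$bestTune$lambda,
                       type = "coefficients")
# terms shrunk exactly to zero (printed as "." above)
rownames(lasso_coefs)[as.matrix(lasso_coefs)[, 1] == 0]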

Elastic net

# Elastic net
parameters_elastic <- expand.grid(alpha = seq(0, 1, 0.2), 
                                  lambda = lambdas)


traffic_elastic <-  train(traffic ~ .,
                          data = TrainData %>% 
                            dplyr::select(all_of(cols)),
                          method = "glmnet", 
                          tuneGrid = parameters_elastic,
                          trControl = ctrl_cv5)
traffic_elastic$bestTune
##      alpha   lambda
## 1034     1 0.666992
elastic_predited <- predict(traffic_elastic, TestData)
elastic_predited[elastic_predited <0 ] = 0

regressionMetrics(real = TestData$traffic,
                  predicted = elastic_predited)
##        MSE     RMSE      MAE      MAPE    MedAE     MSLE        R2
## 1 661483.6 813.3164 587.2223 0.4142896 440.9405 1.834257 0.8336715
ggplot(data.frame(data = TestData$traffic, 
                  fitted = elastic_predited),
       aes(x = data, y = fitted)) +
  geom_point()+
  geom_abline(slope = 1, size = 1.5, color = "gold")+
  theme_gray()+
  ggtitle("Elastic regression predicted vs actual values")

The best hyperparameters found are alpha = 1 and lambda of about 0.67, so the LASSO specification actually turns out to be the best. Let us then use it to generate predictions for the real test dataset.
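
Before moving to the final predictions, the test-sample metrics of the three penalized specifications can be gathered into one table for a side-by-side view; a short sketch reusing the objects computed above:

rbind(ridge   = regressionMetrics(TestData$traffic, ridge_predited),
      lasso   = regressionMetrics(TestData$traffic, lasso_predited),
      elastic = regressionMetrics(TestData$traffic, elastic_predited))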

Testing final model

Let us start by applying the same transformations to the final test dataset.

traffic_test <- read_csv("traffic_test.csv")
## Rows: 10591 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): weather_general, weather_detailed
## dbl  (4): clouds_coverage_pct, temperature, rain_mm, snow_mm
## dttm (1): date_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
traffic_test$hour <- as.factor(substr(traffic_test$date_time,12,13))
traffic_test$month <- as.factor(substr(traffic_test$date_time,6,7))
traffic_test$weekday <- as.factor(weekdays(as.Date(substr(traffic_test$date_time,0,10))))
traffic_test$weather_general <- as.factor(traffic_test$weather_general)
traffic_test$rain <- 0
traffic_test$rain[traffic_test$rain_mm>0] = 1 
traffic_test$snow <- 0
traffic_test$snow[traffic_test$snow_mm>0] = 1
traffic_test$temperature <- (traffic_test$temperature - mean(traffic_test$temperature))/sd(traffic_test$temperature)
traffic_test$clouds_coverage_pct <- (traffic_test$clouds_coverage_pct - mean(traffic_test$clouds_coverage_pct))/sd(traffic_test$clouds_coverage_pct)
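
Before predicting, it is worth confirming that no factor level in the scoring data is unseen in the training data, since the model matrix construction could fail on unknown levels; a minimal check (an empty result means the levels match):

# weather_general levels present in the scoring set but absent from training
setdiff(levels(traffic_test$weather_general), levels(TrainData$weather_general))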

Predictions

final_predicted <- predict(traffic_elastic, traffic_test)
final_predicted[final_predicted <0 ] = 0
write.csv(final_predicted,"/Users/jandudzik/Desktop/Studiaa/ML1_2/Regression/outcomes.csv", row.names = TRUE)