University of Warsaw Project

Traffic for regression

Your task is to apply various ML algorithms to build a model explaining the traffic on one of the highways for one-hourly intervals based on the training sample and generate predictions for all observations from the test sample.

# Let's load the datasets
#setwd("G:\\My Drive\\Semester 2\\ML1_2021_2022\\project\\")
traffic_train <- read_csv("traffic_train.csv")

## Rows: 29701 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): weather_general, weather_detailed
## dbl  (5): clouds_coverage_pct, temperature, rain_mm, snow_mm, traffic
## dttm (1): date_time
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

traffic_test <- read_csv("traffic_test.csv")

## Rows: 10591 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): weather_general, weather_detailed
## dbl  (4): clouds_coverage_pct, temperature, rain_mm, snow_mm
## dttm (1): date_time
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

traffic_test_backup <- traffic_test

# Let's count missings in both datasets
colSums(is.na(traffic_test)) %>% 
  sort()

##           date_time     weather_general    weather_detailed clouds_coverage_pct 
##                   0                   0                   0                   0 
##         temperature             rain_mm             snow_mm 
##                   0                   0                   0

colSums(is.na(traffic_train)) %>% 
  sort()

##           date_time     weather_general    weather_detailed clouds_coverage_pct 
##                   0                   0                   0                   0 
##         temperature             rain_mm             snow_mm             traffic 
##                   0                   0                   0                   0

# there are no missings

# Let's assume that near zero traffic is impossible on the roads
# where in peak hours it can be above 6000 cars/hr remove all the traffic 
# values below 50
traffic_outliers <- which(traffic_train$traffic < 50)
traffic_train <- traffic_train[-traffic_outliers,]
rm(traffic_outliers)

# Let's split the first column of the dataset to have the year, month, date and time separately

traffic_train <- tidyr::separate(traffic_train, date_time, c("date", "hour"), sep = " ", remove=FALSE)
traffic_test <- tidyr::separate(traffic_test, date_time, c("date", "hour"), sep = " ", remove=FALSE)

traffic_train <- traffic_train %>%
  add_column(year = "",
             .after = 1)
traffic_train <- traffic_train %>%
  add_column(month = "",
             .after = 2)
traffic_train <- traffic_train %>%
  add_column(day_of_week = "",
             .after = 4)
traffic_train <- traffic_train %>%
  add_column(cos_time = "",
             .after = 6) 
traffic_train <- traffic_train %>%
  add_column(sin_time = "",
             .after = 7) 

traffic_test <- traffic_test %>%
  add_column(year = "",
             .after = 1)
traffic_test <- traffic_test %>%
  add_column(month = "",
             .after = 2)
traffic_test <- traffic_test %>%
  add_column(day_of_week = "",
             .after = 4)
traffic_test <- traffic_test %>%
  add_column(cos_time = "",
             .after = 6) 
traffic_test <- traffic_test %>%
  add_column(sin_time = "",
             .after = 7)


# Converting class of date from "Character" to "Date" and filling "year" and "month" column
traffic_train$year <- year(traffic_train$date)
traffic_train$month <- month(traffic_train$date)
traffic_train$date <- day(traffic_train$date)

traffic_test$year <- year(traffic_test$date)
traffic_test$month <- month(traffic_test$date)
traffic_test$date <- day(traffic_test$date)

# Let's also fill "day_of_week" column using lubridate package
traffic_train$day_of_week <- wday(traffic_train$date_time, label=TRUE)
traffic_test$day_of_week <- wday(traffic_test$date_time, label=TRUE)

# To be more flexible with the approaches and do some testings, 
# 2 variables from "time" were added - sin(time) and cos(time) 
# to represent time as a cyclic variable.
traffic_train$hour <- as.numeric(hour(traffic_train$date_time))
traffic_train$cos_time <- cos(traffic_train$hour)
traffic_train$sin_time <- sin(traffic_train$hour)

traffic_test$hour <- as.numeric(hour(traffic_test$date_time))
traffic_test$cos_time <- cos(traffic_test$hour)
traffic_test$sin_time <- sin(traffic_test$hour)

# Let's remove unused columns
traffic_train <- subset (traffic_train, select = -1)
traffic_test <- subset (traffic_test, select = -1)

# And check how the dataset looks like
glimpse(traffic_train)

## Rows: 29,644
## Columns: 14
## $ year                <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 20~
## $ month               <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10~
## $ date                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ day_of_week         <ord> Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, ~
## $ hour                <dbl> 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 13, 14, 15, 16, 18,~
## $ cos_time            <dbl> 1.0000000, 0.5403023, -0.4161468, -0.9899925, -0.6~
## $ sin_time            <dbl> 0.000000000, 0.841470985, 0.909297427, 0.141120008~
## $ weather_general     <chr> "Clear", "Clear", "Clear", "Clear", "Clear", "Clea~
## $ weather_detailed    <chr> "sky is clear", "sky is clear", "sky is clear", "s~
## $ clouds_coverage_pct <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20, 20, ~
## $ temperature         <dbl> 11.5, 10.3, 8.0, 7.9, 6.4, 5.5, 5.1, 5.0, 9.3, 18.~
## $ rain_mm             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ snow_mm             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ traffic             <dbl> 508, 323, 274, 372, 812, 2720, 5674, 6512, 5473, 5~

# Let's convert the variables for both datasets
traffic_train$year <- as.numeric(traffic_train$year, ordered = FALSE)
traffic_train$month <- factor(traffic_train$month, ordered = FALSE)
traffic_train$date <- factor(traffic_train$date, ordered = FALSE)
traffic_train$hour <- factor(traffic_train$hour, ordered = FALSE)
traffic_train$day_of_week <- factor(traffic_train$day_of_week, ordered = FALSE)
traffic_train$weather_general <- factor(traffic_train$weather_general, ordered=FALSE)
traffic_train$weather_detailed <- factor(traffic_train$weather_detailed, ordered=FALSE)

traffic_test$year <- as.numeric(traffic_test$year, ordered = FALSE)
traffic_test$month <- factor(traffic_test$month, ordered = FALSE)
traffic_test$date <- factor(traffic_test$date, ordered = FALSE)
traffic_test$hour <- factor(traffic_test$hour, ordered = FALSE)
traffic_test$day_of_week <- factor(traffic_test$day_of_week, ordered = FALSE)
traffic_test$weather_general <- factor(traffic_test$weather_general, ordered=FALSE)
traffic_test$weather_detailed <- factor(traffic_test$weather_detailed, ordered=FALSE)

# Let's finally check factors and numeric variables
sapply(traffic_train, is.factor)

##                year               month                date         day_of_week 
##               FALSE                TRUE                TRUE                TRUE 
##                hour            cos_time            sin_time     weather_general 
##                TRUE               FALSE               FALSE                TRUE 
##    weather_detailed clouds_coverage_pct         temperature             rain_mm 
##                TRUE               FALSE               FALSE               FALSE 
##             snow_mm             traffic 
##               FALSE               FALSE

# Let's check the distribution (histogram) of the dependent variable "traffic"
ggplot(traffic_train,
       aes(x = traffic)) +
  geom_histogram(fill = "blue",
                 bins = 100) +
  theme_bw()

# Let's check how it looks after log transformation
ggplot(traffic_train,
       aes(x = log(traffic + 1))) +
  geom_histogram(fill = "blue",
                 bins = 100) +
  theme_bw()

# Let's store numerical variables in a vector
traffic_num_vars <- 
  sapply(traffic_train, is.numeric) %>% 
  which() %>% 
  names()

# Let's check how many unique values all the numerical variables have
sapply(traffic_train[, traffic_num_vars], 
       function(x) 
         unique(x) %>% 
         length()) %>% 

  sort()

##                year             snow_mm            cos_time            sin_time 
##                   5                  12                  24                  24 
## clouds_coverage_pct             rain_mm         temperature             traffic 
##                  60                 350                 615                6469

# Let's store qualitative variables in a vector
traffic_factor_vars <- 
  sapply(traffic_train, is.factor) %>% 
  which() %>% 
  names()

# Let's see how many unique values all qualitative variables have
sapply(traffic_train[, traffic_factor_vars], 
       function(x) 
         unique(x) %>% 
         length()) %>% 
  sort()

##      day_of_week  weather_general            month             hour 
##                7               11               12               24 
##             date weather_detailed 
##               31               37

# Let's have a final look at the both vectors
traffic_num_vars

## [1] "year"                "cos_time"            "sin_time"           
## [4] "clouds_coverage_pct" "temperature"         "rain_mm"            
## [7] "snow_mm"             "traffic"

traffic_factor_vars

## [1] "month"            "date"             "day_of_week"      "hour"            
## [5] "weather_general"  "weather_detailed"

### DIVIDING DATASET###

# Let's divide training dataset into a training and testing sample
set.seed(987654321)

traffic_which_train <- createDataPartition(traffic_train$traffic,
                                          p = 0.7, 
                                          list = FALSE) 

train_sample <- traffic_train[traffic_which_train,]
test_sample <- traffic_train[-traffic_which_train,]
rm(traffic_which_train)

# Let's check the distribution of the target variable in both samples
summary(train_sample$traffic)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      60    1176    3317    3235    4921    7244

summary(test_sample$traffic)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      97    1176    3315    3233    4920    7263

# NUMERIC VARIABLES #

# Let's calculate correlations between numeric variables and "traffic"
traffic_correlations <- 
  cor(train_sample[, traffic_num_vars],
      use = "pairwise.complete.obs")
traffic_correlations

##                             year      cos_time     sin_time clouds_coverage_pct
## year                 1.000000000  0.0074522895 -0.014782813        -0.102994034
## cos_time             0.007452289  1.0000000000  0.042463927         0.000980204
## sin_time            -0.014782813  0.0424639265  1.000000000        -0.002796557
## clouds_coverage_pct -0.102994034  0.0009802040 -0.002796557         1.000000000
## temperature          0.194242023  0.0090775872 -0.005684121        -0.148972787
## rain_mm              0.024385721 -0.0018223435 -0.002759685         0.088558605
## snow_mm              0.022845113  0.0005292218 -0.002911731         0.034462913
## traffic             -0.016027111  0.0860052859 -0.038004351         0.044442204
##                      temperature       rain_mm       snow_mm       traffic
## year                 0.194242023  0.0243857213  0.0228451129 -0.0160271107
## cos_time             0.009077587 -0.0018223435  0.0005292218  0.0860052859
## sin_time            -0.005684121 -0.0027596847 -0.0029117309 -0.0380043508
## clouds_coverage_pct -0.148972787  0.0885586045  0.0344629133  0.0444422038
## temperature          1.000000000  0.0959351719 -0.0227070187  0.1366876702
## rain_mm              0.095935172  1.0000000000  0.0008793879 -0.0174253010
## snow_mm             -0.022707019  0.0008793879  1.0000000000  0.0009174909
## traffic              0.136687670 -0.0174253010  0.0009174909  1.0000000000

#Let's plot it
corrplot(traffic_correlations, method = "number")

# We DO NOT see strong correlation with numerical variables

rm(traffic_correlations)

# Let's be curious and see the relation of the target variable with the temperature
ggplot(train_sample,
       aes(x = temperature,
           y = traffic)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

# rain
ggplot(train_sample,
       aes(x = rain_mm,
           y = traffic)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

# and snow
ggplot(train_sample,
       aes(x = snow_mm,
           y = traffic)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

# Let's remove outliers with temperature < -50, rain_mm > 40mm
traffic_outliers <- which(train_sample$temperature < -50 | train_sample$rain_mm > 40)
train_sample <- train_sample[-traffic_outliers,]
rm(traffic_outliers)


# QUALITATIVE (CATEGORICAL) VARIABLES #
aov(train_sample$traffic ~ train_sample$month + train_sample$date + 
      train_sample$day_of_week + train_sample$hour + train_sample$weather_general +
      train_sample$weather_detailed) ->
  traffic_anova

summary(traffic_anova)

##                                  Df    Sum Sq   Mean Sq  F value   Pr(>F)    
## train_sample$month               11 4.682e+08 4.256e+07   65.724  < 2e-16 ***
## train_sample$date                30 2.597e+08 8.656e+06   13.366  < 2e-16 ***
## train_sample$day_of_week          6 4.155e+09 6.924e+08 1069.268  < 2e-16 ***
## train_sample$hour                23 6.343e+10 2.758e+09 4258.446  < 2e-16 ***
## train_sample$weather_general     10 2.728e+07 2.728e+06    4.213 7.22e-06 ***
## train_sample$weather_detailed    26 2.165e+07 8.328e+05    1.286     0.15    
## Residuals                     20638 1.337e+10 6.476e+05                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# F value is significant for hour, day_of_week and month

# Let's see the relation of the target variable with them

# hour
ggplot(train_sample,
       aes(x = hour,
           y = traffic)) +
  geom_boxplot(fill = "blue", outlier.color = "red") +
  theme_bw()

# hour + day_of_week
ggplot(train_sample, mapping = aes(x = hour, y = traffic)) + 
  geom_boxplot(fill = "blue", outlier.colour = "red") + 
  facet_wrap(~day_of_week)

# Let's remove outliers for traffic ~ hours and create a dataset without them
train_sample_noout <- train_sample

list_quantiles <- tapply(train_sample_noout$traffic, train_sample_noout$hour, quantile)
Q1s <- sapply(1:24, function(i) list_quantiles[[i]][2])
Q3s <- sapply(1:24, function(i) list_quantiles[[i]][4])
IQRs <- tapply(train_sample_noout$traffic, train_sample_noout$hour, IQR)
Lowers <- Q1s - 1.5*IQRs
Uppers <- Q3s + 1.5*IQRs
data_s <- split(train_sample_noout, train_sample_noout$hour)
train_sample_final <- NULL
for (i in 1:24){
  out <- subset(data_s[[i]], data_s[[i]]$traffic > Lowers[i] & data_s[[i]]$traffic < Uppers[i])
  train_sample_final <- rbind(train_sample_final, out)
}
rm(data_s, list_quantiles, out, train_sample_noout, i, IQRs, Lowers, Q1s, Q3s, Uppers)

# Let's plot "hour" again
ggplot(train_sample_final,
       aes(x = hour,
           y = traffic)) +
  geom_boxplot(fill = "blue", outlier.color = "red") +
  theme_bw()

# Looks a bit better, will test both datasets for MAPE later

# day of week
ggplot(train_sample,
       aes(x = day_of_week,
           y = traffic)) +
  geom_boxplot(fill = "blue",  outlier.color = "red") +
  theme_bw()

# weather general
ggplot(train_sample,
       aes(x = weather_general,
           y = traffic)) +
  geom_boxplot(fill = "blue",   outlier.color = "red") +
  theme_bw()

# month
ggplot(train_sample,
       aes(x = month,
           y = traffic)) +
  geom_boxplot(fill = "blue") +
  theme_bw()

## Let's check Near Zero Variance
nearZeroVar(train_sample_final,
            saveMetrics = TRUE) -> traffic_nzv_stats

traffic_nzv_stats %>% 
  rownames_to_column("variable") %>% 
  arrange(-zeroVar, -nzv, -freqRatio)

##               variable   freqRatio percentUnique zeroVar   nzv
## 1              snow_mm 1695.666667    0.05884949   FALSE  TRUE
## 2              rain_mm   39.620985    1.48594968   FALSE  TRUE
## 3     weather_detailed    2.305995    0.17654848   FALSE FALSE
## 4  clouds_coverage_pct    1.376914    0.27953509   FALSE FALSE
## 5      weather_general    1.260516    0.05394537   FALSE FALSE
## 6          temperature    1.186275    2.97680349   FALSE FALSE
## 7              traffic    1.095238   29.65033593   FALSE FALSE
## 8                 year    1.072130    0.02452062   FALSE FALSE
## 9                month    1.038278    0.05884949   FALSE FALSE
## 10         day_of_week    1.021269    0.03432887   FALSE FALSE
## 11                hour    1.015521    0.11769898   FALSE FALSE
## 12            cos_time    1.015521    0.11769898   FALSE FALSE
## 13            sin_time    1.015521    0.11769898   FALSE FALSE
## 14                date    1.004032    0.15202786   FALSE FALSE

# Snow and rain have NZV and as they are not correlated with traffic
# Probably we will not use them

# Let's kill all unnecessary variables
rm(traffic_anova, traffic_nzv_stats)


# Let's define function that will summarize all the errors and R2
regressionMetrics <- function(real, predicted) {
  MSE <- mean((real - predicted)^2) # Mean Square Error
  RMSE <- sqrt(MSE) # Root Mean Square Error
  MAE <- mean(abs(real - predicted)) # Mean Absolute Error
  MAPE <- mean(abs(real - predicted)/real) # Mean Absolute Percentage Error
  MedAE <- median(abs(real - predicted)) # Median Absolute Error
  R2 <- cor(predicted, real)^2 # R2
  
  result <- data.frame(MSE, RMSE, MAE, MAPE, MedAE, R2)
  return(result)
}


#####################################
#       LINEAR REGRESSION           #                  
#####################################

# Defining contrasts
options(contrasts = c("contr.treatment",
                      "contr.treatment"))

# Creating a model
set.seed(987654321)

traffic_lm <- lm(traffic ~ month + day_of_week + hour,
                 data = train_sample_final)

summary(traffic_lm)

## 
## Call:
## lm(formula = traffic ~ month + day_of_week + hour, data = train_sample_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4319.3  -405.1   -25.9   474.5  2438.1 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -275.55      36.26  -7.599 3.12e-14 ***
## month2           128.65      29.70   4.332 1.49e-05 ***
## month3           222.07      29.29   7.581 3.56e-14 ***
## month4           282.52      28.48   9.919  < 2e-16 ***
## month5           219.97      27.48   8.004 1.26e-15 ***
## month6           308.38      29.15  10.580  < 2e-16 ***
## month7           145.87      26.89   5.425 5.85e-08 ***
## month8           245.47      28.27   8.683  < 2e-16 ***
## month9           191.93      29.18   6.578 4.88e-11 ***
## month10          227.19      27.85   8.156 3.65e-16 ***
## month11           28.84      27.12   1.064   0.2875    
## month12         -136.58      26.69  -5.117 3.14e-07 ***
## day_of_weekMon   967.71      20.69  46.781  < 2e-16 ***
## day_of_weekTue  1183.77      20.94  56.527  < 2e-16 ***
## day_of_weekWed  1243.15      20.79  59.797  < 2e-16 ***
## day_of_weekThu  1260.95      20.84  60.514  < 2e-16 ***
## day_of_weekFri  1281.10      20.81  61.548  < 2e-16 ***
## day_of_weekSat   427.35      20.97  20.376  < 2e-16 ***
## hour1           -274.20      37.72  -7.270 3.73e-13 ***
## hour2           -428.20      38.12 -11.232  < 2e-16 ***
## hour3           -447.58      38.27 -11.694  < 2e-16 ***
## hour4            -88.75      37.36  -2.376   0.0175 *  
## hour5           1275.41      37.90  33.650  < 2e-16 ***
## hour6           3299.93      37.63  87.703  < 2e-16 ***
## hour7           4018.63      37.73 106.512  < 2e-16 ***
## hour8           3748.30      37.50  99.948  < 2e-16 ***
## hour9           3603.87      38.55  93.489  < 2e-16 ***
## hour10          3452.17      38.31  90.110  < 2e-16 ***
## hour11          3724.38      38.48  96.784  < 2e-16 ***
## hour12          3983.95      38.24 104.183  < 2e-16 ***
## hour13          3973.34      39.01 101.848  < 2e-16 ***
## hour14          4209.98      38.15 110.351  < 2e-16 ***
## hour15          4474.54      38.48 116.288  < 2e-16 ***
## hour16          4849.24      38.12 127.204  < 2e-16 ***
## hour17          4507.38      38.02 118.558  < 2e-16 ***
## hour18          3477.33      38.06  91.369  < 2e-16 ***
## hour19          2465.63      37.80  65.220  < 2e-16 ***
## hour20          2033.08      38.28  53.108  < 2e-16 ***
## hour21          1827.67      37.84  48.294  < 2e-16 ***
## hour22          1375.15      38.34  35.871  < 2e-16 ***
## hour23           599.98      37.96  15.807  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 787.6 on 20350 degrees of freedom
## Multiple R-squared:  0.844,  Adjusted R-squared:  0.8437 
## F-statistic:  2753 on 40 and 20350 DF,  p-value: < 2.2e-16

# Predicting traffic in training dataset
traffic_lm_fitted <- predict(traffic_lm)

# Checking their correlation
cor(train_sample_final$traffic,
    traffic_lm_fitted)

## [1] 0.9187008

# Let's check metrics for our model
regressionMetrics(real = train_sample_final$traffic,
                  predicted = traffic_lm_fitted)

##        MSE     RMSE      MAE      MAPE    MedAE        R2
## 1 619085.1 786.8196 572.8329 0.4064949 433.0294 0.8440111

# MAPE = 40.65%, not good but reasonable

# Checking the distribution of errors
ggplot(data.frame(error = train_sample_final$traffic - traffic_lm_fitted),
       aes(x = error)) +
  geom_histogram(fill = "blue",
                 bins = 100) +
  theme_bw()

# Let's apply model to our test sample
traffic_lm_test_fitted <- predict(traffic_lm, test_sample)

regressionMetrics(real = test_sample$traffic,
                  predicted = traffic_lm_test_fitted)

##        MSE     RMSE      MAE      MAPE   MedAE        R2
## 1 684841.6 827.5515 602.9276 0.4315104 452.745 0.8268774

# MAPE = 43.15% so let's try another methods


########################################
#                KNN                   #
########################################

modelLookup("knn")

##   model parameter      label forReg forClass probModel
## 1   knn         k #Neighbors   TRUE     TRUE      TRUE

set.seed(987654321)
ctrl_nocv <- trainControl(method = "none")

traffic_train_knn <- 
  train(traffic ~ month + day_of_week + hour + sin_time + cos_time, 
        data = train_sample_final,
        method = "knn",
        trControl = ctrl_nocv)

traffic_train_knn_fitted <- predict(traffic_train_knn)

regressionMetrics(real = train_sample_final$traffic,
                  predicted = traffic_train_knn_fitted)

##        MSE     RMSE      MAE     MAPE    MedAE        R2
## 1 180911.6 425.3371 246.3047 0.102594 138.5714 0.9544163

# MAPE = 10.25%

# Checking the distribution of errors
ggplot(data.frame(error = train_sample_final$traffic - traffic_train_knn_fitted),
       aes(x = error)) +
  geom_histogram(fill = "blue",
                 bins = 100) +
  theme_bw()

# Looks good

# Let's validate our model with the test sample
traffic_test_knn_fitted <- predict(traffic_train_knn,
                                    test_sample)

regressionMetrics(real = test_sample$traffic,
                  predicted = traffic_test_knn_fitted)

##        MSE     RMSE      MAE      MAPE    MedAE        R2
## 1 269114.4 518.7623 303.1432 0.1330975 170.3333 0.9319415

# MAPE = 13.3% - not so bad.


########################################
#                Ridge                 #
########################################

#Defining vectors with all variables and selected ones

all_vars <- names(traffic_train)

selected_vars <- all_vars[-which(all_vars %in%
                             c("year", "date", "weather_general", 
                               "weather_detailed", "snow_mm", "clouds_coverage_pct",
                               "temperature", "rain_mm"))]

train_sample_ridge <- train_sample_final

traffic_ridge <- 
  glmnet(x = as.matrix(train_sample_ridge[, all_vars]),
         y = train_sample_ridge$traffic,
         family = "gaussian",
         lambda = exp(log(10)*seq(-2, 9, length.out = 200)),
         alpha = 0)

dim(coef(traffic_ridge))

## [1]  15 200

coef(traffic_ridge)[, 1]

##         (Intercept)                year               month                date 
##        3.252828e+03       -3.325854e-05       -1.914224e-05       -1.179208e-05 
##         day_of_week                hour            cos_time            sin_time 
##        0.000000e+00        2.002342e-04        4.660377e-04       -2.143746e-04 
##     weather_general    weather_detailed clouds_coverage_pct         temperature 
##        0.000000e+00        0.000000e+00        4.594741e-06        4.297694e-05 
##             rain_mm             snow_mm             traffic 
##       -7.151565e-05        2.789380e-04        1.992174e-06

# Ridge regression

options(contrasts = c("contr.treatment",
                      "contr.treatment"))

ctrl_cv5 <- trainControl(method = "cv",
                         number = 5,
#                         returnResamp = "all"
#                         savePredictions = "all"
                         )

parameters_ridge <- expand.grid(alpha = 0,
                                lambda = seq(10, 1e3, 10))

set.seed(123456789)

traffic_ridge <- train(traffic ~ month + day_of_week + hour + sin_time + cos_time,
                      data = train_sample_ridge,
                      method = "glmnet", 
                      tuneGrid = parameters_ridge,
                      trControl = ctrl_cv5)

traffic_ridge$bestTune$lambda

## [1] 50

# Let's do the predictions
predict(traffic_ridge$finalModel,
        s = traffic_ridge$bestTune$lambda,
        type = "coefficients")

## 43 x 1 sparse Matrix of class "dgCMatrix"
##                          s1
## (Intercept)     1397.489955
## month2            85.366470
## month3           167.322636
## month4           233.162374
## month5           174.198268
## month6           264.306766
## month7           101.018424
## month8           203.041645
## month9           145.041020
## month10          184.818319
## month11          -12.239936
## month12         -177.776228
## day_of_weekMon   799.476689
## day_of_weekTue  1007.931710
## day_of_weekWed  1067.437253
## day_of_weekThu  1086.617531
## day_of_weekFri  1100.376666
## day_of_weekSat   272.040589
## hour1          -1281.434017
## hour2          -2188.650344
## hour3          -2663.101360
## hour4          -2047.213858
## hour5             21.472895
## hour6           2527.503913
## hour7           3063.735862
## hour8           2081.486344
## hour9           1335.434499
## hour10          1248.616195
## hour11          2182.594086
## hour12          3103.208058
## hour13          3140.935755
## hour14          2760.191575
## hour15          2303.878381
## hour16          2507.105320
## hour17          2720.418680
## hour18          2467.741078
## hour19          1741.133191
## hour20           857.428269
## hour21          -103.255766
## hour22          -900.631056
## hour23         -1281.940458
## sin_time           1.869623
## cos_time        -819.145824

# And check metrics
regressionMetrics(real = train_sample_final$traffic,
                  predicted = predict(traffic_ridge, 
                                      train_sample_final))

##        MSE     RMSE     MAE      MAPE    MedAE        R2
## 1 644909.6 803.0626 590.413 0.4289818 444.8925 0.8395226

# Almost the same results like in linear model

# MAPE = 42.89%

# Let's apply the model to the  test_sample and check metrics
regressionMetrics(real = test_sample$traffic,
                  predicted = predict(traffic_ridge, 
                                      test_sample))

##        MSE     RMSE      MAE     MAPE    MedAE        R2
## 1 708812.2 841.9099 620.4978 0.451831 466.7647 0.8216676

# MAPE = 45.18%


#######################==RESULTS==############################

# Mean absolute percentage (MAPE) for the algorithms used:
# Linear regression: 43.15%
# KNN: 13.3%
# Ridge_Lasso: 45.18% 

# We see that KNN algorithm performs better than others.
# As for linear regression we assume a linear relationship between X and Y
# but if the true relationship is far from linear, then the resulting model
# will provide a poor fit to the data.
# KNN is non-linear. It can detect linear or non-linear distributed data
# it tends to perform very well with a lot of situations.
# KNN predicts y by a local average.


# Let's apply KN model to the "traffic_test.csv"
traffic_final_fitted <- predict(traffic_train_knn,
                                   traffic_test)

# Add this column to the original "traffic_test" dataset
traffic_test_backup$traffic <- as.integer(traffic_final_fitted)

# And save it
write.csv(traffic_test_backup,"traffic_test_predicted.csv", row.names = TRUE)


#############################################################

Machine Learning Regression

Keith Xolani Mpala & Alexey Demchuk

6/5/2022

University of Warsaw Project

Traffic for regression