Your task is to apply various ML algorithms to build a model explaining the traffic on one of the highways for one-hourly intervals based on the training sample and generate predictions for all observations from the test sample.
# Let's load the datasets
#setwd("G:\\My Drive\\Semester 2\\ML1_2021_2022\\project\\")
traffic_train <- read_csv("traffic_train.csv")
## Rows: 29701 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): weather_general, weather_detailed
## dbl (5): clouds_coverage_pct, temperature, rain_mm, snow_mm, traffic
## dttm (1): date_time
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
traffic_test <- read_csv("traffic_test.csv")
## Rows: 10591 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): weather_general, weather_detailed
## dbl (4): clouds_coverage_pct, temperature, rain_mm, snow_mm
## dttm (1): date_time
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
traffic_test_backup <- traffic_test
# Let's count missings in both datasets
colSums(is.na(traffic_test)) %>%
sort()
## date_time weather_general weather_detailed clouds_coverage_pct
## 0 0 0 0
## temperature rain_mm snow_mm
## 0 0 0
colSums(is.na(traffic_train)) %>%
sort()
## date_time weather_general weather_detailed clouds_coverage_pct
## 0 0 0 0
## temperature rain_mm snow_mm traffic
## 0 0 0 0
# there are no missings
# Let's assume that near zero traffic is impossible on the roads
# where in peak hours it can be above 6000 cars/hr remove all the traffic
# values below 50
traffic_outliers <- which(traffic_train$traffic < 50)
traffic_train <- traffic_train[-traffic_outliers,]
rm(traffic_outliers)
# Let's split the first column of the dataset to have the year, month, date and time separately
traffic_train <- tidyr::separate(traffic_train, date_time, c("date", "hour"), sep = " ", remove=FALSE)
traffic_test <- tidyr::separate(traffic_test, date_time, c("date", "hour"), sep = " ", remove=FALSE)
traffic_train <- traffic_train %>%
add_column(year = "",
.after = 1)
traffic_train <- traffic_train %>%
add_column(month = "",
.after = 2)
traffic_train <- traffic_train %>%
add_column(day_of_week = "",
.after = 4)
traffic_train <- traffic_train %>%
add_column(cos_time = "",
.after = 6)
traffic_train <- traffic_train %>%
add_column(sin_time = "",
.after = 7)
traffic_test <- traffic_test %>%
add_column(year = "",
.after = 1)
traffic_test <- traffic_test %>%
add_column(month = "",
.after = 2)
traffic_test <- traffic_test %>%
add_column(day_of_week = "",
.after = 4)
traffic_test <- traffic_test %>%
add_column(cos_time = "",
.after = 6)
traffic_test <- traffic_test %>%
add_column(sin_time = "",
.after = 7)
# Converting class of date from "Character" to "Date" and filling "year" and "month" column
traffic_train$year <- year(traffic_train$date)
traffic_train$month <- month(traffic_train$date)
traffic_train$date <- day(traffic_train$date)
traffic_test$year <- year(traffic_test$date)
traffic_test$month <- month(traffic_test$date)
traffic_test$date <- day(traffic_test$date)
# Let's also fill "day_of_week" column using lubridate package
traffic_train$day_of_week <- wday(traffic_train$date_time, label=TRUE)
traffic_test$day_of_week <- wday(traffic_test$date_time, label=TRUE)
# To be more flexible with the approaches and do some testings,
# 2 variables from "time" were added - sin(time) and cos(time)
# to represent time as a cyclic variable.
traffic_train$hour <- as.numeric(hour(traffic_train$date_time))
traffic_train$cos_time <- cos(traffic_train$hour)
traffic_train$sin_time <- sin(traffic_train$hour)
traffic_test$hour <- as.numeric(hour(traffic_test$date_time))
traffic_test$cos_time <- cos(traffic_test$hour)
traffic_test$sin_time <- sin(traffic_test$hour)
# Let's remove unused columns
traffic_train <- subset (traffic_train, select = -1)
traffic_test <- subset (traffic_test, select = -1)
# And check how the dataset looks like
glimpse(traffic_train)
## Rows: 29,644
## Columns: 14
## $ year <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 20~
## $ month <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10~
## $ date <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ day_of_week <ord> Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, Wed, ~
## $ hour <dbl> 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 13, 14, 15, 16, 18,~
## $ cos_time <dbl> 1.0000000, 0.5403023, -0.4161468, -0.9899925, -0.6~
## $ sin_time <dbl> 0.000000000, 0.841470985, 0.909297427, 0.141120008~
## $ weather_general <chr> "Clear", "Clear", "Clear", "Clear", "Clear", "Clea~
## $ weather_detailed <chr> "sky is clear", "sky is clear", "sky is clear", "s~
## $ clouds_coverage_pct <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20, 20, ~
## $ temperature <dbl> 11.5, 10.3, 8.0, 7.9, 6.4, 5.5, 5.1, 5.0, 9.3, 18.~
## $ rain_mm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ snow_mm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ traffic <dbl> 508, 323, 274, 372, 812, 2720, 5674, 6512, 5473, 5~
# Let's convert the variables for both datasets
traffic_train$year <- as.numeric(traffic_train$year, ordered = FALSE)
traffic_train$month <- factor(traffic_train$month, ordered = FALSE)
traffic_train$date <- factor(traffic_train$date, ordered = FALSE)
traffic_train$hour <- factor(traffic_train$hour, ordered = FALSE)
traffic_train$day_of_week <- factor(traffic_train$day_of_week, ordered = FALSE)
traffic_train$weather_general <- factor(traffic_train$weather_general, ordered=FALSE)
traffic_train$weather_detailed <- factor(traffic_train$weather_detailed, ordered=FALSE)
traffic_test$year <- as.numeric(traffic_test$year, ordered = FALSE)
traffic_test$month <- factor(traffic_test$month, ordered = FALSE)
traffic_test$date <- factor(traffic_test$date, ordered = FALSE)
traffic_test$hour <- factor(traffic_test$hour, ordered = FALSE)
traffic_test$day_of_week <- factor(traffic_test$day_of_week, ordered = FALSE)
traffic_test$weather_general <- factor(traffic_test$weather_general, ordered=FALSE)
traffic_test$weather_detailed <- factor(traffic_test$weather_detailed, ordered=FALSE)
# Let's finally check factors and numeric variables
sapply(traffic_train, is.factor)
## year month date day_of_week
## FALSE TRUE TRUE TRUE
## hour cos_time sin_time weather_general
## TRUE FALSE FALSE TRUE
## weather_detailed clouds_coverage_pct temperature rain_mm
## TRUE FALSE FALSE FALSE
## snow_mm traffic
## FALSE FALSE
# Let's check the distribution (histogram) of the dependent variable "traffic"
ggplot(traffic_train,
aes(x = traffic)) +
geom_histogram(fill = "blue",
bins = 100) +
theme_bw()
# Let's check how it looks after log transformation
ggplot(traffic_train,
aes(x = log(traffic + 1))) +
geom_histogram(fill = "blue",
bins = 100) +
theme_bw()
# Let's store numerical variables in a vector
traffic_num_vars <-
sapply(traffic_train, is.numeric) %>%
which() %>%
names()
# Let's check how many unique values all the numerical variables have
sapply(traffic_train[, traffic_num_vars],
function(x)
unique(x) %>%
length()) %>%
sort()
## year snow_mm cos_time sin_time
## 5 12 24 24
## clouds_coverage_pct rain_mm temperature traffic
## 60 350 615 6469
# Let's store qualitative variables in a vector
traffic_factor_vars <-
sapply(traffic_train, is.factor) %>%
which() %>%
names()
# Let's see how many unique values all qualitative variables have
sapply(traffic_train[, traffic_factor_vars],
function(x)
unique(x) %>%
length()) %>%
sort()
## day_of_week weather_general month hour
## 7 11 12 24
## date weather_detailed
## 31 37
# Let's have a final look at the both vectors
traffic_num_vars
## [1] "year" "cos_time" "sin_time"
## [4] "clouds_coverage_pct" "temperature" "rain_mm"
## [7] "snow_mm" "traffic"
traffic_factor_vars
## [1] "month" "date" "day_of_week" "hour"
## [5] "weather_general" "weather_detailed"
### DIVIDING DATASET###
# Let's divide training dataset into a training and testing sample
set.seed(987654321)
traffic_which_train <- createDataPartition(traffic_train$traffic,
p = 0.7,
list = FALSE)
train_sample <- traffic_train[traffic_which_train,]
test_sample <- traffic_train[-traffic_which_train,]
rm(traffic_which_train)
# Let's check the distribution of the target variable in both samples
summary(train_sample$traffic)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60 1176 3317 3235 4921 7244
summary(test_sample$traffic)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 97 1176 3315 3233 4920 7263
# NUMERIC VARIABLES #
# Let's calculate correlations between numeric variables and "traffic"
traffic_correlations <-
cor(train_sample[, traffic_num_vars],
use = "pairwise.complete.obs")
traffic_correlations
## year cos_time sin_time clouds_coverage_pct
## year 1.000000000 0.0074522895 -0.014782813 -0.102994034
## cos_time 0.007452289 1.0000000000 0.042463927 0.000980204
## sin_time -0.014782813 0.0424639265 1.000000000 -0.002796557
## clouds_coverage_pct -0.102994034 0.0009802040 -0.002796557 1.000000000
## temperature 0.194242023 0.0090775872 -0.005684121 -0.148972787
## rain_mm 0.024385721 -0.0018223435 -0.002759685 0.088558605
## snow_mm 0.022845113 0.0005292218 -0.002911731 0.034462913
## traffic -0.016027111 0.0860052859 -0.038004351 0.044442204
## temperature rain_mm snow_mm traffic
## year 0.194242023 0.0243857213 0.0228451129 -0.0160271107
## cos_time 0.009077587 -0.0018223435 0.0005292218 0.0860052859
## sin_time -0.005684121 -0.0027596847 -0.0029117309 -0.0380043508
## clouds_coverage_pct -0.148972787 0.0885586045 0.0344629133 0.0444422038
## temperature 1.000000000 0.0959351719 -0.0227070187 0.1366876702
## rain_mm 0.095935172 1.0000000000 0.0008793879 -0.0174253010
## snow_mm -0.022707019 0.0008793879 1.0000000000 0.0009174909
## traffic 0.136687670 -0.0174253010 0.0009174909 1.0000000000
#Let's plot it
corrplot(traffic_correlations, method = "number")
# We DO NOT see strong correlation with numerical variables
rm(traffic_correlations)
# Let's be curious and see the relation of the target variable with the temperature
ggplot(train_sample,
aes(x = temperature,
y = traffic)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# rain
ggplot(train_sample,
aes(x = rain_mm,
y = traffic)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# and snow
ggplot(train_sample,
aes(x = snow_mm,
y = traffic)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Let's remove outliers with temperature < -50, rain_mm > 40mm
traffic_outliers <- which(train_sample$temperature < -50 | train_sample$rain_mm > 40)
train_sample <- train_sample[-traffic_outliers,]
rm(traffic_outliers)
# QUALITATIVE (CATEGORICAL) VARIABLES #
aov(train_sample$traffic ~ train_sample$month + train_sample$date +
train_sample$day_of_week + train_sample$hour + train_sample$weather_general +
train_sample$weather_detailed) ->
traffic_anova
summary(traffic_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## train_sample$month 11 4.682e+08 4.256e+07 65.724 < 2e-16 ***
## train_sample$date 30 2.597e+08 8.656e+06 13.366 < 2e-16 ***
## train_sample$day_of_week 6 4.155e+09 6.924e+08 1069.268 < 2e-16 ***
## train_sample$hour 23 6.343e+10 2.758e+09 4258.446 < 2e-16 ***
## train_sample$weather_general 10 2.728e+07 2.728e+06 4.213 7.22e-06 ***
## train_sample$weather_detailed 26 2.165e+07 8.328e+05 1.286 0.15
## Residuals 20638 1.337e+10 6.476e+05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F value is significant for hour, day_of_week and month
# Let's see the relation of the target variable with them
# hour
ggplot(train_sample,
aes(x = hour,
y = traffic)) +
geom_boxplot(fill = "blue", outlier.color = "red") +
theme_bw()
# hour + day_of_week
ggplot(train_sample, mapping = aes(x = hour, y = traffic)) +
geom_boxplot(fill = "blue", outlier.colour = "red") +
facet_wrap(~day_of_week)
# Let's remove outliers for traffic ~ hours and create a dataset without them
train_sample_noout <- train_sample
list_quantiles <- tapply(train_sample_noout$traffic, train_sample_noout$hour, quantile)
Q1s <- sapply(1:24, function(i) list_quantiles[[i]][2])
Q3s <- sapply(1:24, function(i) list_quantiles[[i]][4])
IQRs <- tapply(train_sample_noout$traffic, train_sample_noout$hour, IQR)
Lowers <- Q1s - 1.5*IQRs
Uppers <- Q3s + 1.5*IQRs
data_s <- split(train_sample_noout, train_sample_noout$hour)
train_sample_final <- NULL
for (i in 1:24){
out <- subset(data_s[[i]], data_s[[i]]$traffic > Lowers[i] & data_s[[i]]$traffic < Uppers[i])
train_sample_final <- rbind(train_sample_final, out)
}
rm(data_s, list_quantiles, out, train_sample_noout, i, IQRs, Lowers, Q1s, Q3s, Uppers)
# Let's plot "hour" again
ggplot(train_sample_final,
aes(x = hour,
y = traffic)) +
geom_boxplot(fill = "blue", outlier.color = "red") +
theme_bw()
# Looks a bit better, will test both datasets for MAPE later
# day of week
ggplot(train_sample,
aes(x = day_of_week,
y = traffic)) +
geom_boxplot(fill = "blue", outlier.color = "red") +
theme_bw()
# weather general
ggplot(train_sample,
aes(x = weather_general,
y = traffic)) +
geom_boxplot(fill = "blue", outlier.color = "red") +
theme_bw()
# month
ggplot(train_sample,
aes(x = month,
y = traffic)) +
geom_boxplot(fill = "blue") +
theme_bw()
## Let's check Near Zero Variance
nearZeroVar(train_sample_final,
saveMetrics = TRUE) -> traffic_nzv_stats
traffic_nzv_stats %>%
rownames_to_column("variable") %>%
arrange(-zeroVar, -nzv, -freqRatio)
## variable freqRatio percentUnique zeroVar nzv
## 1 snow_mm 1695.666667 0.05884949 FALSE TRUE
## 2 rain_mm 39.620985 1.48594968 FALSE TRUE
## 3 weather_detailed 2.305995 0.17654848 FALSE FALSE
## 4 clouds_coverage_pct 1.376914 0.27953509 FALSE FALSE
## 5 weather_general 1.260516 0.05394537 FALSE FALSE
## 6 temperature 1.186275 2.97680349 FALSE FALSE
## 7 traffic 1.095238 29.65033593 FALSE FALSE
## 8 year 1.072130 0.02452062 FALSE FALSE
## 9 month 1.038278 0.05884949 FALSE FALSE
## 10 day_of_week 1.021269 0.03432887 FALSE FALSE
## 11 hour 1.015521 0.11769898 FALSE FALSE
## 12 cos_time 1.015521 0.11769898 FALSE FALSE
## 13 sin_time 1.015521 0.11769898 FALSE FALSE
## 14 date 1.004032 0.15202786 FALSE FALSE
# Snow and rain have NZV and as they are not correlated with traffic
# Probably we will not use them
# Let's kill all unnecessary variables
rm(traffic_anova, traffic_nzv_stats)
# Let's define function that will summarize all the errors and R2
regressionMetrics <- function(real, predicted) {
MSE <- mean((real - predicted)^2) # Mean Square Error
RMSE <- sqrt(MSE) # Root Mean Square Error
MAE <- mean(abs(real - predicted)) # Mean Absolute Error
MAPE <- mean(abs(real - predicted)/real) # Mean Absolute Percentage Error
MedAE <- median(abs(real - predicted)) # Median Absolute Error
R2 <- cor(predicted, real)^2 # R2
result <- data.frame(MSE, RMSE, MAE, MAPE, MedAE, R2)
return(result)
}
#####################################
# LINEAR REGRESSION #
#####################################
# Defining contrasts
options(contrasts = c("contr.treatment",
"contr.treatment"))
# Creating a model
set.seed(987654321)
traffic_lm <- lm(traffic ~ month + day_of_week + hour,
data = train_sample_final)
summary(traffic_lm)
##
## Call:
## lm(formula = traffic ~ month + day_of_week + hour, data = train_sample_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4319.3 -405.1 -25.9 474.5 2438.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -275.55 36.26 -7.599 3.12e-14 ***
## month2 128.65 29.70 4.332 1.49e-05 ***
## month3 222.07 29.29 7.581 3.56e-14 ***
## month4 282.52 28.48 9.919 < 2e-16 ***
## month5 219.97 27.48 8.004 1.26e-15 ***
## month6 308.38 29.15 10.580 < 2e-16 ***
## month7 145.87 26.89 5.425 5.85e-08 ***
## month8 245.47 28.27 8.683 < 2e-16 ***
## month9 191.93 29.18 6.578 4.88e-11 ***
## month10 227.19 27.85 8.156 3.65e-16 ***
## month11 28.84 27.12 1.064 0.2875
## month12 -136.58 26.69 -5.117 3.14e-07 ***
## day_of_weekMon 967.71 20.69 46.781 < 2e-16 ***
## day_of_weekTue 1183.77 20.94 56.527 < 2e-16 ***
## day_of_weekWed 1243.15 20.79 59.797 < 2e-16 ***
## day_of_weekThu 1260.95 20.84 60.514 < 2e-16 ***
## day_of_weekFri 1281.10 20.81 61.548 < 2e-16 ***
## day_of_weekSat 427.35 20.97 20.376 < 2e-16 ***
## hour1 -274.20 37.72 -7.270 3.73e-13 ***
## hour2 -428.20 38.12 -11.232 < 2e-16 ***
## hour3 -447.58 38.27 -11.694 < 2e-16 ***
## hour4 -88.75 37.36 -2.376 0.0175 *
## hour5 1275.41 37.90 33.650 < 2e-16 ***
## hour6 3299.93 37.63 87.703 < 2e-16 ***
## hour7 4018.63 37.73 106.512 < 2e-16 ***
## hour8 3748.30 37.50 99.948 < 2e-16 ***
## hour9 3603.87 38.55 93.489 < 2e-16 ***
## hour10 3452.17 38.31 90.110 < 2e-16 ***
## hour11 3724.38 38.48 96.784 < 2e-16 ***
## hour12 3983.95 38.24 104.183 < 2e-16 ***
## hour13 3973.34 39.01 101.848 < 2e-16 ***
## hour14 4209.98 38.15 110.351 < 2e-16 ***
## hour15 4474.54 38.48 116.288 < 2e-16 ***
## hour16 4849.24 38.12 127.204 < 2e-16 ***
## hour17 4507.38 38.02 118.558 < 2e-16 ***
## hour18 3477.33 38.06 91.369 < 2e-16 ***
## hour19 2465.63 37.80 65.220 < 2e-16 ***
## hour20 2033.08 38.28 53.108 < 2e-16 ***
## hour21 1827.67 37.84 48.294 < 2e-16 ***
## hour22 1375.15 38.34 35.871 < 2e-16 ***
## hour23 599.98 37.96 15.807 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 787.6 on 20350 degrees of freedom
## Multiple R-squared: 0.844, Adjusted R-squared: 0.8437
## F-statistic: 2753 on 40 and 20350 DF, p-value: < 2.2e-16
# Predicting traffic in training dataset
traffic_lm_fitted <- predict(traffic_lm)
# Checking their correlation
cor(train_sample_final$traffic,
traffic_lm_fitted)
## [1] 0.9187008
# Let's check metrics for our model
regressionMetrics(real = train_sample_final$traffic,
predicted = traffic_lm_fitted)
## MSE RMSE MAE MAPE MedAE R2
## 1 619085.1 786.8196 572.8329 0.4064949 433.0294 0.8440111
# MAPE = 40.65%, not good but reasonable
# Checking the distribution of errors
ggplot(data.frame(error = train_sample_final$traffic - traffic_lm_fitted),
aes(x = error)) +
geom_histogram(fill = "blue",
bins = 100) +
theme_bw()
# Let's apply model to our test sample
traffic_lm_test_fitted <- predict(traffic_lm, test_sample)
regressionMetrics(real = test_sample$traffic,
predicted = traffic_lm_test_fitted)
## MSE RMSE MAE MAPE MedAE R2
## 1 684841.6 827.5515 602.9276 0.4315104 452.745 0.8268774
# MAPE = 43.15% so let's try another methods
########################################
# KNN #
########################################
modelLookup("knn")
## model parameter label forReg forClass probModel
## 1 knn k #Neighbors TRUE TRUE TRUE
set.seed(987654321)
ctrl_nocv <- trainControl(method = "none")
traffic_train_knn <-
train(traffic ~ month + day_of_week + hour + sin_time + cos_time,
data = train_sample_final,
method = "knn",
trControl = ctrl_nocv)
traffic_train_knn_fitted <- predict(traffic_train_knn)
regressionMetrics(real = train_sample_final$traffic,
predicted = traffic_train_knn_fitted)
## MSE RMSE MAE MAPE MedAE R2
## 1 180911.6 425.3371 246.3047 0.102594 138.5714 0.9544163
# MAPE = 10.25%
# Checking the distribution of errors
ggplot(data.frame(error = train_sample_final$traffic - traffic_train_knn_fitted),
aes(x = error)) +
geom_histogram(fill = "blue",
bins = 100) +
theme_bw()
# Looks good
# Let's validate our model with the test sample
traffic_test_knn_fitted <- predict(traffic_train_knn,
test_sample)
regressionMetrics(real = test_sample$traffic,
predicted = traffic_test_knn_fitted)
## MSE RMSE MAE MAPE MedAE R2
## 1 269114.4 518.7623 303.1432 0.1330975 170.3333 0.9319415
# MAPE = 13.3% - not so bad.
########################################
# Ridge #
########################################
#Defining vectors with all variables and selected ones
all_vars <- names(traffic_train)
selected_vars <- all_vars[-which(all_vars %in%
c("year", "date", "weather_general",
"weather_detailed", "snow_mm", "clouds_coverage_pct",
"temperature", "rain_mm"))]
train_sample_ridge <- train_sample_final
traffic_ridge <-
glmnet(x = as.matrix(train_sample_ridge[, all_vars]),
y = train_sample_ridge$traffic,
family = "gaussian",
lambda = exp(log(10)*seq(-2, 9, length.out = 200)),
alpha = 0)
dim(coef(traffic_ridge))
## [1] 15 200
coef(traffic_ridge)[, 1]
## (Intercept) year month date
## 3.252828e+03 -3.325854e-05 -1.914224e-05 -1.179208e-05
## day_of_week hour cos_time sin_time
## 0.000000e+00 2.002342e-04 4.660377e-04 -2.143746e-04
## weather_general weather_detailed clouds_coverage_pct temperature
## 0.000000e+00 0.000000e+00 4.594741e-06 4.297694e-05
## rain_mm snow_mm traffic
## -7.151565e-05 2.789380e-04 1.992174e-06
# Ridge regression
options(contrasts = c("contr.treatment",
"contr.treatment"))
ctrl_cv5 <- trainControl(method = "cv",
number = 5,
# returnResamp = "all"
# savePredictions = "all"
)
parameters_ridge <- expand.grid(alpha = 0,
lambda = seq(10, 1e3, 10))
set.seed(123456789)
traffic_ridge <- train(traffic ~ month + day_of_week + hour + sin_time + cos_time,
data = train_sample_ridge,
method = "glmnet",
tuneGrid = parameters_ridge,
trControl = ctrl_cv5)
traffic_ridge$bestTune$lambda
## [1] 50
# Let's do the predictions
predict(traffic_ridge$finalModel,
s = traffic_ridge$bestTune$lambda,
type = "coefficients")
## 43 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1397.489955
## month2 85.366470
## month3 167.322636
## month4 233.162374
## month5 174.198268
## month6 264.306766
## month7 101.018424
## month8 203.041645
## month9 145.041020
## month10 184.818319
## month11 -12.239936
## month12 -177.776228
## day_of_weekMon 799.476689
## day_of_weekTue 1007.931710
## day_of_weekWed 1067.437253
## day_of_weekThu 1086.617531
## day_of_weekFri 1100.376666
## day_of_weekSat 272.040589
## hour1 -1281.434017
## hour2 -2188.650344
## hour3 -2663.101360
## hour4 -2047.213858
## hour5 21.472895
## hour6 2527.503913
## hour7 3063.735862
## hour8 2081.486344
## hour9 1335.434499
## hour10 1248.616195
## hour11 2182.594086
## hour12 3103.208058
## hour13 3140.935755
## hour14 2760.191575
## hour15 2303.878381
## hour16 2507.105320
## hour17 2720.418680
## hour18 2467.741078
## hour19 1741.133191
## hour20 857.428269
## hour21 -103.255766
## hour22 -900.631056
## hour23 -1281.940458
## sin_time 1.869623
## cos_time -819.145824
# And check metrics
regressionMetrics(real = train_sample_final$traffic,
predicted = predict(traffic_ridge,
train_sample_final))
## MSE RMSE MAE MAPE MedAE R2
## 1 644909.6 803.0626 590.413 0.4289818 444.8925 0.8395226
# Almost the same results like in linear model
# MAPE = 42.89%
# Let's apply the model to the test_sample and check metrics
regressionMetrics(real = test_sample$traffic,
predicted = predict(traffic_ridge,
test_sample))
## MSE RMSE MAE MAPE MedAE R2
## 1 708812.2 841.9099 620.4978 0.451831 466.7647 0.8216676
# MAPE = 45.18%
#######################==RESULTS==############################
# Mean absolute percentage (MAPE) for the algorithms used:
# Linear regression: 43.15%
# KNN: 13.3%
# Ridge_Lasso: 45.18%
# We see that KNN algorithm performs better than others.
# As for linear regression we assume a linear relationship between X and Y
# but if the true relationship is far from linear, then the resulting model
# will provide a poor fit to the data.
# KNN is non-linear. It can detect linear or non-linear distributed data
# it tends to perform very well with a lot of situations.
# KNN predicts y by a local average.
# Let's apply KN model to the "traffic_test.csv"
traffic_final_fitted <- predict(traffic_train_knn,
traffic_test)
# Add this column to the original "traffic_test" dataset
traffic_test_backup$traffic <- as.integer(traffic_final_fitted)
# And save it
write.csv(traffic_test_backup,"traffic_test_predicted.csv", row.names = TRUE)
#############################################################