Post - Metro_Interstate_Traffic_Volume_dataset

library(mlbench)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(klaR)
## Loading required package: MASS
library(ElemStatLearn)
library(naivebayes)
## naivebayes 0.9.6 loaded
library(e1071)
library(useful)
library(ipred)
library(ada)
## Loading required package: rpart
library(corrplot)
## corrplot 0.84 loaded
library(stats)
library(svmpath)
## Loaded svmpath 0.955
## 
## Attaching package: 'svmpath'
## The following object is masked from 'package:MASS':
## 
##     enlist
library(gmodels)
library(vcd)
## Loading required package: grid
library(irr)
## Loading required package: lpSolve
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
require(useful)
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(adabag)
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
## 
## Attaching package: 'adabag'
## The following object is masked from 'package:ipred':
## 
##     bagging
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:naivebayes':
## 
##     tables
library(tidyr)
library(pandocfilters)
## 
## Info message:
##  Couldn't find 'pandoc'!
## 
## Attaching package: 'pandocfilters'
## The following object is masked from 'package:MASS':
## 
##     Null
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:methods':
## 
##     Math
library(class)
require(RODBC)
## Loading required package: RODBC
# Read the traffic-volume sample from a CSV file in the working directory.
# `stringsAsFactors = TRUE` keeps `weather.main` a factor on R >= 4.0 as well,
# which is how the rest of the script treats it (class label for caret).
# The former `data(metro5)` call was dropped: `metro5` is not a packaged
# dataset, so the call only produced a "data set not found" warning.
metro5 <- read.csv(
  "Metro Interstate Traffic Volume(1).csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Show the structure of the loaded data: column names, types and first values.
str(metro5)
## 'data.frame':    10000 obs. of  5 variables:
##  $ temp          : num  288 289 290 290 291 ...
##  $ rain.1h       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ clouds.all    : int  40 75 90 90 75 1 1 1 20 20 ...
##  $ weather.main  : Factor w/ 10 levels "Clear","Clouds",..: 2 2 2 2 2 1 1 1 2 2 ...
##  $ traffic.volume: int  5545 4516 4767 5026 4918 5181 5584 6015 5791 4770 ...
# Print the first rows of the data as a quick sanity check.
head(metro5)
##     temp rain.1h clouds.all weather.main traffic.volume
## 1 288.28       0         40       Clouds           5545
## 2 289.36       0         75       Clouds           4516
## 3 289.58       0         90       Clouds           4767
## 4 290.13       0         90       Clouds           5026
## 5 291.14       0         75       Clouds           4918
## 6 291.72       0          1        Clear           5181
# List the column names.
names(metro5)
## [1] "temp"           "rain.1h"        "clouds.all"     "weather.main"  
## [5] "traffic.volume"
# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(metro5)
##       temp          rain.1h          clouds.all      weather.main 
##  Min.   :244.8   Min.   : 0.0000   Min.   :  0.00   Clouds :3796  
##  1st Qu.:271.3   1st Qu.: 0.0000   1st Qu.:  5.00   Clear  :2404  
##  Median :278.5   Median : 0.0000   Median : 75.00   Mist   :1319  
##  Mean   :279.6   Mean   : 0.1175   Mean   : 55.03   Rain   : 964  
##  3rd Qu.:289.2   3rd Qu.: 0.0000   3rd Qu.: 90.00   Snow   : 653  
##  Max.   :308.2   Max.   :55.6300   Max.   :100.00   Drizzle: 311  
##                                                     (Other): 553  
##  traffic.volume
##  Min.   : 125  
##  1st Qu.:1213  
##  Median :3374  
##  Mean   :3302  
##  3rd Qu.:5035  
##  Max.   :7217  
## 
# Split the rows 80/20 into training and test sets, sampling on the
# `weather.main` label. The seed is fixed first so the split is reproducible,
# using the same seed value as the other random steps in this script.
set.seed(188)
train_index <- createDataPartition(metro5$weather.main, p = 0.8, list = FALSE)
metro_train <- metro5[train_index, ]
metro_test <- metro5[-train_index, ]
# Class counts in the training portion.
table(metro_train$weather.main)
## 
##        Clear       Clouds      Drizzle          Fog         Haze 
##         1924         3037          249          116          223 
##         Mist         Rain         Snow       Squall Thunderstorm 
##         1056          772          523            3          102
# Class counts in the test portion, for comparison with the training counts.
table(metro_test$weather.main)
## 
##        Clear       Clouds      Drizzle          Fog         Haze 
##          480          759           62           29           55 
##         Mist         Rain         Snow       Squall Thunderstorm 
##          263          192          130            0           25

Run algorithms using 4-fold cross-validation, repeated 3 times (12 resamples):

# Metric used to select and compare models.
metric <- "Accuracy"
# Resampling plan shared by the models: 4 folds, repeated 3 times.
# The object has the same name as caret's function, so the function is
# referenced with an explicit namespace prefix.
trainControl <- caret::trainControl(
  method = "repeatedcv",
  number = 4,
  repeats = 3
)

Section 3: KNN and SVM Methods:

Let’s now look at a Baseline of performance and look at KNN and SVM:

# Fit the two baseline classifiers for `weather.main` with the same seed,
# the same pre-processing (centering, scaling, Box-Cox) and the same
# resampling plan, so their resampling results are directly comparable.
# NOTE(review): both models are trained on all of `metro5`, not on
# `metro_train`, so the hold-out rows are seen during training — confirm
# this is intended.
set.seed(188)
fit.knn <- train(
  weather.main ~ .,
  data = metro5,
  method = "knn",
  metric = metric,
  preProcess = c("center", "scale", "BoxCox"),
  trControl = trainControl
)

set.seed(188)
fit.svm <- train(
  weather.main ~ .,
  data = metro5,
  method = "svmRadial",
  metric = metric,
  preProcess = c("center", "scale", "BoxCox"),
  trControl = trainControl
)

Let’s compare the Algorithms and get a quick idea of what’s going on:

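We keep in mind that (1) Accuracy is perfect at 1, worst at 0; and, (2) Kappa is perfect at 1, with 0 meaning agreement no better than chance (RMSE and R-squared apply to regression models, not to this classification task)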

# Collect the resampling results of both fitted models into one object and
# print summary statistics of each metric across the resamples.
# NOTE(review): the seed reset looks unnecessary for this step — confirm that
# resamples() draws no random numbers.
set.seed(188)
results <- resamples(list(KNN = fit.knn, SVM = fit.svm))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: KNN, SVM 
## Number of resamples: 12 
## 
## Accuracy 
##          Min. 1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## KNN 0.5971223  0.6076 0.6119549 0.6116010 0.6159918 0.6210484    0
## SVM 0.6266986  0.6311 0.6325471 0.6327348 0.6356271 0.6375651    0
## 
## Kappa 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## KNN 0.4526133 0.4630359 0.4700534 0.4692746 0.4755751 0.4800476    0
## SVM 0.4393665 0.4466494 0.4489064 0.4492613 0.4537959 0.4574281    0
# Draw a dot plot of the collected resampling results for the two models.
dotplot(results)

### COMMENT: The performance of each model is not good; the svm model has a slightly higher mean Accuracy than the knn model with this data (0.633 vs. 0.612), although the knn model has a slightly higher mean Kappa (0.469 vs. 0.449).
### Let’s look at the svm model with this data and prepare parameters for the data transform:

# Estimate Box-Cox transformation parameters on the complete rows of the
# data, then apply them to those same rows.
set.seed(188)
datasetNoMissing <- metro5[complete.cases(metro5), ]
# First five columns of the complete-case data.
x <- datasetNoMissing[, 1:5]
preprocessParams <- preProcess(x, method = "BoxCox")
x <- predict(object = preprocessParams, newdata = x)

Prepare the validation dataset for making a prediction:

Validation Dataset:

Split out validation dataset

create a list of 80% of the rows in the original dataset we can use for training:

# Draw the row indices of the 80% portion, sampling on the `weather.main`
# label; the seed makes the draw repeatable.
set.seed(188)
validationIndex <- createDataPartition(
  y = metro5$weather.main,
  p = 0.80,
  list = FALSE
)

Select 20% of the data for validation:

# Rows NOT selected by `validationIndex` form the validation set.
validation <- metro5[-validationIndex, ]

Use the remaining 80% of the data for training and testing the models:

# Select the training rows. The trailing comma makes this a row subset that
# returns a data frame; without it, the one-column index matrix is used as a
# matrix index and the result collapses to a plain character vector.
dataset <- metro5[validationIndex, ]

dimensions of dataset:

# Print the dimensions of `dataset`.
dim(dataset)
## NULL

Let’s look at some data to see what we are working with:

# Show the first 10 entries of `dataset`.
head(dataset, n=10)
##  [1] "288.280" "289.360" "290.130" "291.140" "291.720" "293.170" "293.860"
##  [8] "294.140" "293.100" "290.970"

Remove missing values:

# Keep only the validation rows that have no missing value in any column.
# NOTE(review): the seed reset appears unnecessary — nothing random is
# visible in this step.
set.seed(188)
validation <- validation[complete.cases(validation), ]

Convert to numeric; and, convert to factor:

# Make the validation column types explicit: every column except the class
# label becomes numeric, and the label becomes a factor.
# The label column `weather.main` must not go through as.numeric(): its values
# are text, so the coercion would replace every label with NA.
set.seed(188)
numeric_cols <- setdiff(names(validation), "weather.main")
for (col in numeric_cols) {
  validation[[col]] <- as.numeric(as.character(validation[[col]]))
}
metro5$weather.main <- as.factor(metro5$weather.main)
# Reuse the full data's levels so the validation labels line up with the
# classes the models were trained on, even if a class is absent here.
validation$weather.main <- factor(
  validation$weather.main,
  levels = levels(metro5$weather.main)
)

Transform the validation dataset:

# Apply the previously estimated pre-processing parameters to the first five
# columns of the validation set.
set.seed(188)
validationX <- predict(
  object = preprocessParams,
  newdata = validation[, 1:5]
)

Make Predictions: