library(mlbench)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(klaR)
## Loading required package: MASS
library(ElemStatLearn)
library(naivebayes)
## naivebayes 0.9.6 loaded
library(e1071)
library(useful)
library(ipred)
library(ada)
## Loading required package: rpart
library(corrplot)
## corrplot 0.84 loaded
library(stats)
library(svmpath)
## Loaded svmpath 0.955
##
## Attaching package: 'svmpath'
## The following object is masked from 'package:MASS':
##
## enlist
library(gmodels)
library(vcd)
## Loading required package: grid
library(irr)
## Loading required package: lpSolve
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
require(useful)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(adabag)
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
##
## Attaching package: 'adabag'
## The following object is masked from 'package:ipred':
##
## bagging
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:naivebayes':
##
## tables
library(tidyr)
library(pandocfilters)
##
## Info message:
## Couldn't find 'pandoc'!
##
## Attaching package: 'pandocfilters'
## The following object is masked from 'package:MASS':
##
## Null
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:methods':
##
## Math
library(class)
require(RODBC)
## Loading required package: RODBC
# Load the traffic-volume extract: three weather measurements, a categorical
# weather label (weather.main) and the hourly traffic volume.
# stringsAsFactors = TRUE keeps weather.main a factor on R >= 4.0 as well,
# matching the str() output recorded below (older R did this by default).
metro5 <- read.csv(
  "Metro Interstate Traffic Volume(1).csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# The former data(metro5) call was dropped: data() only loads data sets that
# ship with packages, so it did nothing here except warn
# "data set 'metro5' not found".

# Structure, first rows, column names and per-column summary of the raw data.
str(metro5)
## 'data.frame': 10000 obs. of 5 variables:
## $ temp : num 288 289 290 290 291 ...
## $ rain.1h : num 0 0 0 0 0 0 0 0 0 0 ...
## $ clouds.all : int 40 75 90 90 75 1 1 1 20 20 ...
## $ weather.main : Factor w/ 10 levels "Clear","Clouds",..: 2 2 2 2 2 1 1 1 2 2 ...
## $ traffic.volume: int 5545 4516 4767 5026 4918 5181 5584 6015 5791 4770 ...
head(metro5)
## temp rain.1h clouds.all weather.main traffic.volume
## 1 288.28 0 40 Clouds 5545
## 2 289.36 0 75 Clouds 4516
## 3 289.58 0 90 Clouds 4767
## 4 290.13 0 90 Clouds 5026
## 5 291.14 0 75 Clouds 4918
## 6 291.72 0 1 Clear 5181
names(metro5)
## [1] "temp" "rain.1h" "clouds.all" "weather.main"
## [5] "traffic.volume"
summary(metro5)
## temp rain.1h clouds.all weather.main
## Min. :244.8 Min. : 0.0000 Min. : 0.00 Clouds :3796
## 1st Qu.:271.3 1st Qu.: 0.0000 1st Qu.: 5.00 Clear :2404
## Median :278.5 Median : 0.0000 Median : 75.00 Mist :1319
## Mean :279.6 Mean : 0.1175 Mean : 55.03 Rain : 964
## 3rd Qu.:289.2 3rd Qu.: 0.0000 3rd Qu.: 90.00 Snow : 653
## Max. :308.2 Max. :55.6300 Max. :100.00 Drizzle: 311
## (Other): 553
## traffic.volume
## Min. : 125
## 1st Qu.:1213
## Median :3374
## Mean :3302
## 3rd Qu.:5035
## Max. :7217
##
# Stratified 80/20 split on the class label weather.main.
# Seed the RNG first: createDataPartition() samples rows at random, so without
# a seed every run of the script produced a different train/test split.
set.seed(188)
train_index <- createDataPartition(metro5$weather.main, p = 0.8, list = FALSE)
metro_train <- metro5[train_index, ]
metro_test <- metro5[-train_index, ]

# Class counts per partition. The tables below were recorded from the original
# (unseeded) run; rerun to confirm they are unchanged under the seed.
# Note that "Squall" has only 3 rows in total, all of which land in the
# training set, so the test set contains no Squall cases.
table(metro_train$weather.main)
##
## Clear Clouds Drizzle Fog Haze
## 1924 3037 249 116 223
## Mist Rain Snow Squall Thunderstorm
## 1056 772 523 3 102
table(metro_test$weather.main)
##
## Clear Clouds Drizzle Fog Haze
## 480 759 62 29 55
## Mist Rain Snow Squall Thunderstorm
## 263 192 130 0 25
# Resampling scheme shared by both models: 4-fold cross-validation repeated
# 3 times, i.e. 12 resamples per model.
# NOTE(review): this variable shadows the caret function of the same name. The
# name is kept because later parts of the script may refer to it; the function
# is called through its namespace to make the distinction explicit.
trainControl <- caret::trainControl(
  method = "repeatedcv",
  number = 4,
  repeats = 3
)
metric <- "Accuracy"

# The seed is reset before each train() call so that both models are resampled
# from the same RNG state and their results can be compared by resamples().
# `preProcess` is spelled out in full; the previous `preProc` only worked
# through partial argument matching.
set.seed(188)
fit.knn <- train(
  weather.main ~ .,
  data = metro5,
  method = "knn",
  metric = metric,
  preProcess = c("center", "scale", "BoxCox"),
  trControl = trainControl
)
set.seed(188)
fit.svm <- train(
  weather.main ~ .,
  data = metro5,
  method = "svmRadial",
  metric = metric,
  preProcess = c("center", "scale", "BoxCox"),
  trControl = trainControl
)

# Collect the resampling results of both models. No seed is needed here:
# resamples() only gathers results that train() has already computed.
results <- resamples(list(KNN = fit.knn, SVM = fit.svm))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: KNN, SVM
## Number of resamples: 12
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 0.5971223 0.6076 0.6119549 0.6116010 0.6159918 0.6210484 0
## SVM 0.6266986 0.6311 0.6325471 0.6327348 0.6356271 0.6375651 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 0.4526133 0.4630359 0.4700534 0.4692746 0.4755751 0.4800476 0
## SVM 0.4393665 0.4466494 0.4489064 0.4492613 0.4537959 0.4574281 0
dotplot(results)
### COMMENT: Neither model performs well. The SVM model has a slightly higher mean accuracy than the KNN model on this data (0.633 vs 0.612), although KNN has the higher mean Kappa (0.469 vs 0.449).
### Let's look at the SVM model with this data and prepare the parameters for the data transform:
set.seed(188)
# Drop every row that has a missing value, then take the five columns of the
# data set as the input to the transform.
datasetNoMissing <- metro5[complete.cases(metro5), , drop = FALSE]
x <- datasetNoMissing[, seq_len(5)]
# Estimate the Box-Cox parameters on x and apply the fitted transform back to
# x, overwriting it with the transformed version.
preprocessParams <- preProcess(x, method = "BoxCox")
x <- predict(preprocessParams, newdata = x)
# Stratified 80/20 split on weather.main: the sampled 80% of rows become
# `dataset`, the remaining 20% are held out as `validation`.
set.seed(188)
validationIndex <- createDataPartition(
  metro5$weather.main,
  p = 0.80,
  list = FALSE
)
validation <- metro5[-validationIndex, ]
# The row subset needs the trailing comma. Without it the index matrix was
# applied to the data frame as a whole, so `dataset` became a plain character
# vector of temp values instead of a data frame of the selected rows.
dataset <- metro5[validationIndex, ]
dim(dataset)
head(dataset, n = 10)
# The output previously recorded here (`NULL` from dim() and a character
# vector from head()) was a symptom of the missing comma and has been removed;
# rerun the script to regenerate it.
# Prepare the held-out validation rows and apply the Box-Cox transform that
# was estimated earlier (preprocessParams).
validation <- validation[complete.cases(validation), ]

# Coerce only the three weather measurements to numeric. The previous loop ran
# over columns 1:4, which also pushed the class label weather.main (column 4)
# through as.numeric(as.character(...)); that replaced every label with NA and
# raised "NAs introduced by coercion".
measure_cols <- c("temp", "rain.1h", "clouds.all")
validation[measure_cols] <- lapply(
  validation[measure_cols],
  function(v) as.numeric(as.character(v))
)

metro5$weather.main <- as.factor(metro5$weather.main)

# Nothing above draws random numbers, so the two earlier set.seed() calls in
# this section were dropped. This last one is kept so the RNG state handed to
# the code that follows is the same as before.
set.seed(188)
validationX <- predict(preprocessParams, validation[, 1:5])