# Predict how many cylinders a vehicle has based on about 2 dozen variables
library(caret); library(RCurl); library(nnet) #neural network multinomial modeling
## Warning: package 'caret' was built under R version 3.1.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.2
## Loading required package: ggplot2
## Warning: package 'RCurl' was built under R version 3.1.2
## Loading required package: bitops
## Warning: package 'nnet' was built under R version 3.1.2
#data source 'https://raw.githubusercontent.com/hadley/fueleconomy/master/data-raw/vehicles.csv'
setwd("/Users/SKSN/Documents/NYDSA/DATA SET/DemoWork")
data <- read.csv("vehicles_hadley_fueleconomy.csv") # data from hard disk
dim(data)
## [1] 34631 74
data <- data[,1:24] # taking subset of data for simplicity
str(data)
## 'data.frame': 34631 obs. of 24 variables:
## $ barrels08 : Factor w/ 125 levels "0.059892","0.06634",..: 69 90 56 90 73 67 61 63 58 61 ...
## $ barrelsA08 : Factor w/ 30 levels "0.0","0.080848",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ charge120 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ charge240 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ city08 : int 19 9 23 10 17 21 22 23 23 23 ...
## $ city08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityA08 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cityA08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityCD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityUF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ co2 : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ co2A : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ co2TailpipeAGpm: Factor w/ 137 levels "0.0","273.0",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ co2TailpipeGpm : Factor w/ 506 levels "0.0","110.0",..: 253 499 146 499 303 230 176 193 160 176 ...
## $ comb08 : int 21 11 27 11 19 22 25 24 26 25 ...
## $ comb08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combA08 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ combA08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combinedCD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combinedUF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cylinders : int 4 12 4 8 4 4 4 4 4 4 ...
## $ displ : num 2 4.9 2.2 5.2 2.2 1.8 1.8 1.6 1.6 1.8 ...
data <- data.frame(lapply(data, as.character), stringsAsFactors=FALSE)
data <- data.frame(lapply(data, as.numeric))
data[is.na(data)] <- 0
data$cylinders <- as.factor(data$cylinders)
str(data)
## 'data.frame': 34631 obs. of 24 variables:
## $ barrels08 : num 15.7 30 12.2 30 17.3 ...
## $ barrelsA08 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charge120 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charge240 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ city08 : num 19 9 23 10 17 21 22 23 23 23 ...
## $ city08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityA08 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityA08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityCD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cityUF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ co2 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ co2A : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ co2TailpipeAGpm: num 0 0 0 0 0 0 0 0 0 0 ...
## $ co2TailpipeGpm : num 423 808 329 808 468 ...
## $ comb08 : num 21 11 27 11 19 22 25 24 26 25 ...
## $ comb08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combA08 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combA08U : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combinedCD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ combinedUF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cylinders : Factor w/ 10 levels "0","2","3","4",..: 4 9 4 7 4 4 4 4 4 4 ...
## $ displ : num 2 4.9 2.2 5.2 2.2 1.8 1.8 1.6 1.6 1.8 ...
table(data$cylinders)
##
## 0 2 3 4 5 6 8 10 12 16
## 66 51 182 13133 757 12101 7715 138 481 7
set.seed(111) #data partitioning
split <- createDataPartition(data$cylinders, p=.7, list=FALSE)
Train <- data[split,]; Test <- data[-split, ]
#1 Model: multinom without cross-validation
cylModel <- multinom(cylinders~., data=Train, maxit=500, trace=T)
## # weights: 250 (216 variable)
## initial value 55828.478165
## iter 10 value 30334.667642
## iter 20 value 28163.198958
## iter 30 value 25964.035156
## iter 40 value 24793.330698
## iter 50 value 24200.318611
## iter 60 value 23757.051026
## iter 70 value 22966.047716
## iter 80 value 21760.527395
## iter 90 value 19701.327432
## iter 100 value 17246.778708
## iter 110 value 14272.117841
## iter 120 value 12501.427997
## iter 130 value 10024.833987
## iter 140 value 8653.358621
## iter 150 value 7723.473248
## iter 160 value 7418.364005
## iter 170 value 7315.161713
## iter 180 value 7239.334729
## iter 190 value 7188.314943
## iter 200 value 7166.877560
## iter 210 value 7150.005212
## iter 220 value 7129.801574
## iter 230 value 7115.570803
## iter 240 value 7108.756322
## iter 250 value 7102.928903
## iter 260 value 7098.712705
## iter 270 value 7095.902297
## iter 280 value 7093.735873
## iter 290 value 7091.945468
## iter 300 value 7090.294153
## iter 310 value 7088.229287
## iter 320 value 7087.179647
## iter 330 value 7086.340349
## iter 340 value 7084.957010
## iter 350 value 7083.860035
## iter 360 value 7082.419159
## iter 370 value 7081.424061
## iter 380 value 7080.914717
## iter 390 value 7080.543250
## iter 400 value 7080.246201
## iter 410 value 7079.954150
## iter 420 value 7079.665257
## iter 430 value 7079.548107
## final value 7079.502748
## converged
# Prediction on test dataset
Predicts <- predict(cylModel, type="class", newdata=Test); head(Predicts)
## [1] 4 4 4 8 4 8
## Levels: 0 2 3 4 5 6 8 10 12 16
(SingleRunResult <- postResample(Test$cylinders,Predicts)) # accuracy by postResample function
## Accuracy Kappa
## 0.9031295 0.8560092
## Top influential variables
topVari <- varImp(cylModel)
topVari$Variables <- row.names(topVari)
topVari <- topVari[order(-topVari$Overall),]; head(topVari)
## Overall Variables
## cityUF 816.4296 cityUF
## combinedUF 788.1985 combinedUF
## cityA08U 475.0981 cityA08U
## charge240 465.2369 charge240
## combA08U 286.4455 combA08U
## cityA08 269.8205 cityA08
#2 Model: multinom with Cross-Validation in a for loop
totalAccuracy <- c(); i <- 10
(divider <- floor(nrow(data) / (i + 1)))
## [1] 3148
for (i in seq(1:i)) {
# assign chunk to data test
Idx <- c((i * divider):(i * divider + divider))
Train <- data[-Idx, ]; Test <- data[Idx, ]
cylModel50 <- multinom(cylinders~., data=Train, maxit=1000, trace=F) # maxit of 1000 improves results
Predicts <- predict(cylModel, newdata=Test, type="class")
accuracys <- postResample(Test$cylinders, Predicts)[[1]]
print(paste('Current Accuracy:',accuracys,'for Cross Validation:',i))
totalAccuracy <- c(totalAccuracy, accuracys)
}
## [1] "Current Accuracy: 0.892029215624008 for Cross Validation: 1"
## [1] "Current Accuracy: 0.88504287075262 for Cross Validation: 2"
## [1] "Current Accuracy: 0.862813591616386 for Cross Validation: 3"
## [1] "Current Accuracy: 0.877738964750715 for Cross Validation: 4"
## [1] "Current Accuracy: 0.901238488409019 for Cross Validation: 5"
## [1] "Current Accuracy: 0.912353127977136 for Cross Validation: 6"
## [1] "Current Accuracy: 0.932359479199746 for Cross Validation: 7"
## [1] "Current Accuracy: 0.940298507462687 for Cross Validation: 8"
## [1] "Current Accuracy: 0.924420450936805 for Cross Validation: 9"
## [1] "Current Accuracy: 0.927913623372499 for Cross Validation: 10"
(crossValidatedResult <- mean(totalAccuracy))
## [1] 0.9056208
SingleRunResult # !calculated earlier
## Accuracy Kappa
## 0.9031295 0.8560092