#TECHNICAL REPORT
Exporting and loading
# Show the working directory that the relative CSV paths are resolved against
print(getwd())
## [1] "C:/Users/S/Documents/Ubiqum/dataanalyticsII_task2"
# Load both survey extracts from the working directory: the complete
# responses (used for training) and the incomplete ones (to be predicted)
Survey <- read.csv(file = "CompleteResponses.csv", header = TRUE)
IncompleteSurvey <- read.csv(file = "SurveyIncomplete.csv", header = TRUE)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Metrics)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(caTools)
library(mlbench)
library(C50)
## Warning: package 'C50' was built under R version 3.6.1
library(inum)
## Warning: package 'inum' was built under R version 3.6.1
library(doSNOW)
## Warning: package 'doSNOW' was built under R version 3.6.1
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: snow
library(mgsub)
## Warning: package 'mgsub' was built under R version 3.6.1
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.1
library(textclean)
## Warning: package 'textclean' was built under R version 3.6.1
##
## Attaching package: 'textclean'
## The following object is masked from 'package:mgsub':
##
## mgsub
library(tictoc)
Data Exploration
# Peek at the first six rows of the complete responses
head(x = Survey, n = 6L)
## salary age elevel car zipcode credit brand
## 1 119806.54 45 0 14 4 442037.71 0
## 2 106880.48 63 1 11 6 45007.18 1
## 3 78020.75 23 0 15 2 48795.32 0
## 4 63689.94 51 3 6 5 40888.88 1
## 5 50873.62 20 3 14 4 352951.50 0
## 6 130812.74 56 3 14 3 135943.02 1
# Per-column five-number summary and mean
summary(object = Survey)
## salary age elevel car
## Min. : 20000 Min. :20.00 Min. :0.000 Min. : 1.00
## 1st Qu.: 52082 1st Qu.:35.00 1st Qu.:1.000 1st Qu.: 6.00
## Median : 84950 Median :50.00 Median :2.000 Median :11.00
## Mean : 84871 Mean :49.78 Mean :1.983 Mean :10.52
## 3rd Qu.:117162 3rd Qu.:65.00 3rd Qu.:3.000 3rd Qu.:15.75
## Max. :150000 Max. :80.00 Max. :4.000 Max. :20.00
## zipcode credit brand
## Min. :0.000 Min. : 0 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:120807 1st Qu.:0.0000
## Median :4.000 Median :250607 Median :1.0000
## Mean :4.041 Mean :249176 Mean :0.6217
## 3rd Qu.:6.000 3rd Qu.:374640 3rd Qu.:1.0000
## Max. :8.000 Max. :500000 Max. :1.0000
# Column types and dimensions
str(object = Survey)
## 'data.frame': 9898 obs. of 7 variables:
## $ salary : num 119807 106880 78021 63690 50874 ...
## $ age : int 45 63 23 51 20 56 24 62 29 41 ...
## $ elevel : int 0 1 0 3 3 3 4 3 4 1 ...
## $ car : int 14 11 15 6 14 14 8 3 17 5 ...
## $ zipcode: int 4 6 2 5 4 3 5 0 0 4 ...
## $ credit : num 442038 45007 48795 40889 352951 ...
## $ brand : int 0 1 0 1 0 1 1 1 0 1 ...
# Count fully duplicated rows (0 means every response is unique)
nrow(Survey[duplicated(Survey), ])
## [1] 0
# Checking for outliers.
# The columns span very different ranges (credit reaches 500000 while elevel
# tops out at 4), so they are standardised before plotting; on a shared raw
# axis the small-range columns collapse to a line and hide any outliers.
outlier_cols <- c("age", "salary", "elevel", "car", "credit", "zipcode")
boxplot(scale(Survey[, outlier_cols]))
# Number of observations beyond the 1.5 * IQR whiskers, per column
vapply(
  Survey[, outlier_cols],
  function(values) length(boxplot.stats(values)$out),
  integer(1)
)
# There are none: every count is expected to be 0
Data Exploration (Continuation) + Preprocessing
# Salary histogram: the variable is evenly distributed
ggplot(data = Survey, mapping = aes(x = salary)) +
  geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue")
# Zipcode (like car) shows the same even spread, which suggests the data is a
# stratified random sample
ggplot(data = Survey, mapping = aes(x = zipcode)) +
  geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue")
# Translate the numeric car codes into brand names (summary(Survey) shows the
# codes ranging from 1 to 20).
# A positional lookup is used instead of text substitution: codes such as "1"
# are substrings of "10"-"19", so a pattern-replace approach only works when
# the longer patterns happen to be replaced first. Indexing has no such
# ordering dependency, and NA codes stay NA.
car_labels <- c(
  "BMW", "Buick", "Cadillac", "Chevrolet", "Chrysler", "Dodge", "Ford",
  "Honda", "Hyundai", "Jeep", "Kia", "Lincoln", "Mazda", "Mercedes Benz",
  "Mitsubishi", "Nissan", "Ram", "Subaru", "Toyota", "None of the above"
)
Survey$car <- car_labels[Survey$car]
# Car brand is, unhelpfully, evenly spread as well.
# geom_bar() counts rows per category and has no `bins` parameter, so none is
# passed (passing one only triggers an "unknown parameters" warning).
ggplot(Survey, aes(x = car)) +
  geom_bar(color = "darkblue", fill = "lightblue")
# Recode the dependent variable: 0 -> "Acer", any other value -> "Sony".
# ifelse() returns a character vector, so the factor conversion has to wrap it
# (converting before ifelse() is lost). Both data sets get the same fixed level
# order so that predictions and references are always comparable.
brand_levels <- c("Acer", "Sony")
Survey$brand <- factor(
  ifelse(Survey$brand == 0, "Acer", "Sony"),
  levels = brand_levels
)
IncompleteSurvey$brand <- factor(
  ifelse(IncompleteSurvey$brand == 0, "Acer", "Sony"),
  levels = brand_levels
)
#Brand difference. 62% of customers surveyed prefer Sony.
ggplot(data = Survey, mapping = aes(x = brand, fill = brand)) +
  geom_bar()
# Salary histograms of both brands drawn on top of each other
ggplot(data = Survey, mapping = aes(x = salary, col = brand, fill = brand)) +
  geom_histogram(position = "identity", alpha = 0.5, color = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Salary box plot per preferred brand, with each group's median value printed
# as a text label
# NOTE(review): `fun.y` and `..y..` are deprecated in newer ggplot2 releases
# (replaced by `fun` and `after_stat(y)`); kept as-is for the version used here
ggplot(data = Survey, mapping = aes(x = brand, y = salary, fill = brand)) +
  geom_boxplot() +
  stat_summary(
    fun.y = median,
    geom = "text",
    colour = "black",
    vjust = -0.7,
    mapping = aes(label = round(..y.., digits = 1))
  )
# Salary against age, coloured by brand, with a smoothed trend line per brand
ggplot(data = Survey, mapping = aes(x = age, y = salary, col = brand)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Preprocessing
# After the first model, the predictors elevel, car, zipcode and credit were
# found to have no correlation (<0.1%) with the dependent variable, so they
# are removed.
# Columns are dropped by name rather than by position so the step stays
# correct even if the column order of either file differs.
unused_predictors <- c("elevel", "car", "zipcode", "credit")
keep_complete <- !(names(Survey) %in% unused_predictors)
keep_incomplete <- !(names(IncompleteSurvey) %in% unused_predictors)
Survey <- Survey[, keep_complete, drop = FALSE]
IncompleteSurvey <- IncompleteSurvey[, keep_incomplete, drop = FALSE]
Feature Engineering
# Standardise the columns of interest (zero mean, unit standard deviation).
# The centre and spread are estimated once, on the complete survey, and the
# same values are applied to the incomplete survey: a model trained on
# Survey-scaled values must see new data on that identical scale, otherwise
# its split thresholds refer to different raw salaries/ages.
# Plain arithmetic is used instead of assigning scale() output so the columns
# remain ordinary numeric vectors rather than one-column matrices.
scaled_cols <- c("salary", "age")
scale_center <- vapply(Survey[scaled_cols], mean, numeric(1), na.rm = TRUE)
scale_spread <- vapply(Survey[scaled_cols], sd, numeric(1), na.rm = TRUE)
for (col in scaled_cols) {
  Survey[[col]] <-
    (Survey[[col]] - scale_center[[col]]) / scale_spread[[col]]
  IncompleteSurvey[[col]] <-
    (IncompleteSurvey[[col]] - scale_center[[col]]) / scale_spread[[col]]
}
Creating training and test sets
# Fixed seed so the stratified 75/25 split is reproducible
set.seed(456)
inTraining <- createDataPartition(
  y = Survey$brand,
  times = 1,
  p = 0.75,
  list = FALSE
)
# Rows selected by the partition train the models; the rest are held out
Training <- Survey[inTraining, ]
Testing <- Survey[-inTraining, ]
# Compare the share of each brand label in the full data and in both splits;
# the near-identical proportions confirm that the 'caret' partitioning keeps
# the same class distribution
print(prop.table(table(Survey$brand)))
##
## Acer Sony
## 0.3782582 0.6217418
print(prop.table(table(Training$brand)))
##
## Acer Sony
## 0.3782328 0.6217672
print(prop.table(table(Testing$brand)))
##
## Acer Sony
## 0.3783347 0.6216653
Model
# 10-fold cross-validation settings, reused by every train() call in the file
fitControl <- trainControl(method = "cv", number = 10)
# Fit and time a C5.0 model (note: despite the `pls` prefix of the object
# name, the method is C5.0)
tic()
plsFit1 <- train(
  brand ~ .,
  data = Training,
  method = "C5.0",
  trControl = fitControl,
  tuneLength = 2
)
toc()
## 15.95 sec elapsed
# Resampling results of the C5.0 fit across the tuning grid
print(plsFit1)
## C5.0
##
## 7424 samples
## 2 predictor
## 2 classes: 'Acer', 'Sony'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6682, 6682, 6682, 6681, 6681, 6681, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8207211 0.6426651
## rules FALSE 10 0.9197191 0.8284152
## rules TRUE 1 0.8207211 0.6426651
## rules TRUE 10 0.9197191 0.8284152
## tree FALSE 1 0.8203168 0.6419143
## tree FALSE 10 0.9191797 0.8278861
## tree TRUE 1 0.8203168 0.6419143
## tree TRUE 10 0.9191797 0.8278861
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = rules
## and winnow = TRUE.
# Rule sets of every boosting trial plus the evaluation on the training data
print(summary(plsFit1))
##
## Call:
## (function (x, y, trials = 1, rules = FALSE, weights = NULL, control
## fuzzyThreshold = FALSE, sample = 0, earlyStopping = TRUE, label
## = "outcome", seed = 908L))
##
##
## C5.0 [Release 2.07 GPL Edition] Thu Jul 11 10:15:51 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 7424 cases (3 attributes) from undefined.data
##
## No attributes winnowed
##
## ----- Trial 0: -----
##
## Rules:
##
## Rule 0/1: (446/52, lift 2.3)
## salary > 0.4062575
## salary <= 1.037807
## age > -0.6693671
## age <= 0.5240395
## -> class Acer [0.882]
##
## Rule 0/2: (375/52, lift 2.3)
## salary > -1.589961
## salary <= -1.042109
## age > 0.5240395
## -> class Acer [0.859]
##
## Rule 0/3: (3122/1163, lift 1.7)
## salary > -1.042109
## salary <= 0.4062575
## -> class Acer [0.627]
##
## Rule 0/4: (997/16, lift 1.6)
## salary <= -1.042109
## age <= 0.5240395
## -> class Sony [0.983]
##
## Rule 0/5: (994/20, lift 1.6)
## salary > 0.4062575
## age > 0.5240395
## -> class Sony [0.979]
##
## Rule 0/6: (1490/33, lift 1.6)
## salary > 1.037807
## -> class Sony [0.977]
##
## Rule 0/7: (886/34, lift 1.5)
## salary > 0.4062575
## age <= -0.6693671
## -> class Sony [0.961]
##
## Rule 0/8: (546/40, lift 1.5)
## salary <= -0.8688307
## age <= -0.726196
## -> class Sony [0.925]
##
## Rule 0/9: (314/30, lift 1.5)
## salary <= -1.589961
## -> class Sony [0.902]
##
## Default class: Sony
##
## ----- Trial 1: -----
##
## Rules:
##
## Rule 1/1: (422.6/61.3, lift 2.6)
## salary > -1.003016
## salary <= -0.2386878
## age > 0.6376973
## -> class Acer [0.853]
##
## Rule 1/2: (430.6/87.8, lift 2.4)
## salary > -1.003016
## salary <= -0.3290423
## age <= -0.6693671
## -> class Acer [0.795]
##
## Rule 1/3: (1168.8/284.5, lift 2.3)
## salary > -0.3290423
## salary <= 0.5037088
## age <= 0.6376973
## -> class Acer [0.756]
##
## Rule 1/4: (1218.6/381.4, lift 2.1)
## salary > -0.3290423
## salary <= 1.162528
## age > -0.726196
## age <= 0.6376973
## -> class Acer [0.687]
##
## Rule 1/5: (554.8/200.3, lift 1.9)
## salary <= -1.003016
## age > 0.4103817
## -> class Acer [0.638]
##
## Rule 1/6: (988/7.4, lift 1.5)
## salary > 1.162528
## -> class Sony [0.991]
##
## Rule 1/7: (633.2/7.4, lift 1.5)
## salary > 0.5037088
## age <= -0.726196
## -> class Sony [0.987]
##
## Rule 1/8: (1554.6/25.2, lift 1.5)
## salary > -0.2386878
## age > 0.6376973
## -> class Sony [0.983]
##
## Rule 1/9: (846.7/21.3, lift 1.5)
## salary <= -1.003016
## age <= 0.4103817
## -> class Sony [0.974]
##
## Rule 1/10: (1448.6/122, lift 1.4)
## salary <= -0.3290423
## age > -0.6693671
## age <= 0.6376973
## -> class Sony [0.915]
##
## Default class: Sony
##
## ----- Trial 2: -----
##
## Rules:
##
## Rule 2/1: (131.6, lift 3.2)
## salary > -0.119302
## salary <= 0.2652091
## age > -0.5557094
## age <= 0.3535528
## -> class Acer [0.993]
##
## Rule 2/2: (592.9/71.4, lift 2.8)
## salary > -0.7922122
## salary <= 0.2652091
## age <= -0.5557094
## -> class Acer [0.878]
##
## Rule 2/3: (862.4/182.3, lift 2.6)
## salary > -1.441603
## salary <= -0.320527
## age > 0.3535528
## -> class Acer [0.788]
##
## Rule 2/4: (531.8/197.2, lift 2.0)
## salary > 0.2652091
## salary <= 1.005452
## age > -0.5557094
## age <= 0.6945261
## -> class Acer [0.629]
##
## Rule 2/5: (1229.6/68.2, lift 1.4)
## salary > 1.005452
## -> class Sony [0.944]
##
## Rule 2/6: (1087.1/78.4, lift 1.3)
## salary <= -0.119302
## age > -0.5557094
## age <= 0.3535528
## -> class Sony [0.927]
##
## Rule 2/7: (1875.6/201.5, lift 1.3)
## salary > -0.320527
## age > 0.3535528
## -> class Sony [0.892]
##
## Rule 2/8: (1012.3/123.4, lift 1.3)
## salary > 0.2652091
## age <= -0.5557094
## -> class Sony [0.877]
##
## Rule 2/9: (699.5/111.9, lift 1.2)
## salary <= -1.441603
## -> class Sony [0.839]
##
## Rule 2/10: (692.1/129.9, lift 1.2)
## salary <= -0.7922122
## age <= -0.5557094
## -> class Sony [0.811]
##
## Default class: Sony
##
## ----- Trial 3: -----
##
## Rules:
##
## Rule 3/1: (1097/323.8, lift 1.7)
## salary > -1.072638
## salary <= 0.5414523
## age <= -0.7830248
## -> class Acer [0.704]
##
## Rule 3/2: (1153.2/378.4, lift 1.6)
## salary > -0.1692976
## salary <= 0.9518045
## age > -0.7830248
## age <= 0.6945261
## -> class Acer [0.672]
##
## Rule 3/3: (1382.7/462, lift 1.6)
## salary > -1.712772
## salary <= -0.1692976
## age > 0.5240395
## -> class Acer [0.666]
##
## Rule 3/4: (358.9, lift 1.7)
## salary > 0.5414523
## age <= -0.7830248
## -> class Sony [0.997]
##
## Rule 3/5: (201.7, lift 1.7)
## salary <= -1.072638
## age <= -0.7830248
## -> class Sony [0.995]
##
## Rule 3/6: (102.7, lift 1.7)
## salary <= -1.712772
## -> class Sony [0.990]
##
## Rule 3/7: (878/8.7, lift 1.7)
## salary > -0.1692976
## age > 0.6945261
## -> class Sony [0.989]
##
## Rule 3/8: (1148.5/169.3, lift 1.4)
## salary > 0.9518045
## -> class Sony [0.852]
##
## Rule 3/9: (1614.6/403.7, lift 1.3)
## salary <= -0.1692976
## age > -0.7830248
## age <= 0.5240395
## -> class Sony [0.750]
##
## Default class: Sony
##
## ----- Trial 4: -----
##
## Rules:
##
## Rule 4/1: (950.1/255.2, lift 1.8)
## salary > -0.9038483
## salary <= 0.3727877
## age <= -0.5557094
## -> class Acer [0.731]
##
## Rule 4/2: (1298.3/388.2, lift 1.8)
## salary > -0.3968317
## salary <= 0.3727877
## age <= 0.5808684
## -> class Acer [0.701]
##
## Rule 4/3: (726.7/271.3, lift 1.6)
## salary > 0.3727877
## salary <= 1.215824
## age > -0.4988804
## age <= 0.5808684
## -> class Acer [0.626]
##
## Rule 4/4: (551.1/239.6, lift 1.4)
## salary > -0.9038483
## salary <= -0.1692976
## age > 0.5808684
## -> class Acer [0.565]
##
## Rule 4/5: (710.8/332.6, lift 1.3)
## salary > -1.712772
## salary <= -0.9038483
## age > 0.5808684
## -> class Acer [0.532]
##
## Rule 4/6: (447, lift 1.7)
## salary > 1.215824
## -> class Sony [0.998]
##
## Rule 4/7: (84.9, lift 1.6)
## salary <= -1.712772
## -> class Sony [0.988]
##
## Rule 4/8: (988.1/62.3, lift 1.6)
## salary > -0.1692976
## age > 0.5808684
## -> class Sony [0.936]
##
## Rule 4/9: (958.2/179.3, lift 1.4)
## salary <= -0.3968317
## age > -0.5557094
## age <= 0.5808684
## -> class Sony [0.812]
##
## Rule 4/10: (1054.2/214.8, lift 1.3)
## salary <= -0.9038483
## age <= 0.5808684
## -> class Sony [0.796]
##
## Rule 4/11: (966.8/197.5, lift 1.3)
## salary > 0.3727877
## age <= -0.4988804
## -> class Sony [0.795]
##
## Default class: Sony
##
## ----- Trial 5: -----
##
## Rules:
##
## Rule 5/1: (305.4/52.5, lift 2.1)
## salary > -0.5426162
## salary <= 0.330084
## age <= -0.726196
## -> class Acer [0.826]
##
## Rule 5/2: (650.9/186.7, lift 1.9)
## salary > -0.2194015
## salary <= 0.330084
## age <= 0.5240395
## -> class Acer [0.713]
##
## Rule 5/3: (682.8/257.5, lift 1.6)
## salary > -1.54434
## salary <= -0.4298495
## age > 0.5240395
## -> class Acer [0.623]
##
## Rule 5/4: (692.3/269.8, lift 1.6)
## salary > 0.330084
## salary <= 1.107454
## age > -0.6693671
## age <= 0.4103817
## -> class Acer [0.610]
##
## Rule 5/5: (647.6/53.9, lift 1.5)
## salary > 1.107454
## -> class Sony [0.915]
##
## Rule 5/6: (7049.2/2853.5, lift 1.0)
## salary <= 1.215824
## -> class Sony [0.595]
##
## Default class: Sony
##
## ----- Trial 6: -----
##
## Rules:
##
## Rule 6/1: (175.9/20.1, lift 2.0)
## salary > 0.2872152
## salary <= 0.9994338
## age > -0.3852227
## age <= 0.4103817
## -> class Acer [0.881]
##
## Rule 6/2: (227.9/26.5, lift 2.0)
## salary > -0.13866
## salary <= 0.2872152
## age <= 0.4103817
## -> class Acer [0.880]
##
## Rule 6/3: (1052.2/301.1, lift 1.6)
## salary > -0.9989635
## salary <= -0.13866
## age <= -0.5557094
## -> class Acer [0.713]
##
## Rule 6/4: (659.3/269.2, lift 1.3)
## salary > -0.4159816
## salary <= -0.13866
## age <= 0.4103817
## -> class Acer [0.591]
##
## Rule 6/5: (977.9/400.6, lift 1.3)
## salary > -0.9989635
## salary <= -0.13866
## age > 0.4103817
## -> class Acer [0.590]
##
## Rule 6/6: (273.5/114, lift 1.3)
## salary > -1.433198
## salary <= -0.9989635
## age > 0.4103817
## -> class Acer [0.583]
##
## Rule 6/7: (435.2/26.2, lift 1.7)
## salary <= -0.4159816
## age > -0.5557094
## age <= 0.4103817
## -> class Sony [0.938]
##
## Rule 6/8: (424.1/60.9, lift 1.5)
## salary <= -0.9989635
## age <= 0.4103817
## -> class Sony [0.855]
##
## Rule 6/9: (1199.1/271.5, lift 1.4)
## salary > -0.13866
## age > 0.4103817
## -> class Sony [0.773]
##
## Rule 6/10: (796.9/238.8, lift 1.3)
## salary <= -1.433198
## -> class Sony [0.700]
##
## Rule 6/11: (2518.3/890.7, lift 1.2)
## salary > 0.2872152
## -> class Sony [0.646]
##
## Default class: Sony
##
## ----- Trial 7: -----
##
## Rules:
##
## Rule 7/1: (130.8, lift 2.2)
## salary > -0.873679
## salary <= -0.320527
## age > 0.6376973
## -> class Acer [0.992]
##
## Rule 7/2: (1023.1/363, lift 1.4)
## salary > -0.2754662
## salary <= 0.9994338
## age > -0.3852227
## age <= 0.6376973
## -> class Acer [0.645]
##
## Rule 7/3: (2187.3/814.4, lift 1.4)
## salary > -0.873679
## salary <= 1.107454
## age <= -0.3852227
## -> class Acer [0.628]
##
## Rule 7/4: (810.8/342.7, lift 1.3)
## salary > -1.712772
## salary <= -0.873679
## age > 0.5808684
## -> class Acer [0.577]
##
## Rule 7/5: (448.5, lift 1.9)
## salary > 1.107454
## -> class Sony [0.998]
##
## Rule 7/6: (53.8, lift 1.8)
## salary <= -1.712772
## -> class Sony [0.982]
##
## Rule 7/7: (535.3/88.1, lift 1.5)
## salary > 0.9994338
## age > -0.3852227
## -> class Sony [0.834]
##
## Rule 7/8: (926.6/156.9, lift 1.5)
## salary > -0.320527
## age > 0.6376973
## -> class Sony [0.830]
##
## Rule 7/9: (994.3/247.1, lift 1.4)
## salary <= -0.873679
## age <= 0.5808684
## -> class Sony [0.751]
##
## Rule 7/10: (1150.8/337.8, lift 1.3)
## salary <= -0.2754662
## age > -0.3852227
## age <= 0.6376973
## -> class Sony [0.706]
##
## Default class: Sony
##
## ----- Trial 8: -----
##
## Rules:
##
## Rule 8/1: (257.8/21.9, lift 2.2)
## salary > -0.2187837
## salary <= 0.5012654
## age > -0.6693671
## age <= 0.3535528
## -> class Acer [0.912]
##
## Rule 8/2: (990.8/281.2, lift 1.7)
## salary > -1.589961
## salary <= -0.1692976
## age > 0.5808684
## -> class Acer [0.716]
##
## Rule 8/3: (1337.4/403, lift 1.7)
## salary > -0.9989635
## salary <= 0.5012654
## age <= -0.6693671
## -> class Acer [0.698]
##
## Rule 8/4: (699.2/260.4, lift 1.5)
## salary > 0.5012654
## salary <= 1.037807
## age > -0.726196
## age <= 0.5808684
## -> class Acer [0.627]
##
## Rule 8/5: (626, lift 1.8)
## salary > 0.5012654
## age <= -0.726196
## -> class Sony [0.998]
##
## Rule 8/6: (150.7, lift 1.8)
## salary <= -0.9989635
## age <= -0.6693671
## -> class Sony [0.993]
##
## Rule 8/7: (622/16, lift 1.8)
## salary > 1.037807
## -> class Sony [0.973]
##
## Rule 8/8: (5043.8/2176.9, lift 1.0)
## age > -0.6693671
## -> class Sony [0.568]
##
## Default class: Sony
##
## ----- Trial 9: -----
##
## Rules:
##
## Rule 9/1: (260.5, lift 2.4)
## salary > -1.587265
## salary <= -0.873679
## age > 0.6376973
## -> class Acer [0.996]
##
## Rule 9/2: (794.1/74.5, lift 2.2)
## salary > -0.873679
## salary <= 0.3718849
## age <= -0.5557094
## -> class Acer [0.905]
##
## Rule 9/3: (529.9/109.2, lift 1.9)
## salary > 0.4062575
## salary <= 1.037807
## age > -0.6693671
## age <= 0.5240395
## -> class Acer [0.793]
##
## Rule 9/4: (1472.5/344, lift 1.9)
## salary > -0.3342096
## salary <= 0.4062575
## age <= 0.5808684
## -> class Acer [0.766]
##
## Rule 9/5: (936.6, lift 1.9)
## salary > 0.4062575
## age <= -0.6693671
## -> class Sony [0.999]
##
## Rule 9/6: (539, lift 1.9)
## salary > 1.037807
## -> class Sony [0.998]
##
## Rule 9/7: (400/32.5, lift 1.7)
## salary > 0.4062575
## age > 0.5240395
## -> class Sony [0.917]
##
## Rule 9/8: (928.3/206.7, lift 1.4)
## salary > -0.3342096
## age > 0.5808684
## -> class Sony [0.777]
##
## Rule 9/9: (1085.3/258.5, lift 1.4)
## salary <= -0.873679
## age <= 0.6376973
## -> class Sony [0.761]
##
## Rule 9/10: (1587.5/684.4, lift 1.1)
## salary <= -0.3342096
## age > -0.5557094
## -> class Sony [0.569]
##
## Default class: Sony
##
##
## Evaluation on training data (7424 cases):
##
## Trial Rules
## ----- ----------------
## No Errors
##
## 0 9 1366(18.4%)
## 1 10 727( 9.8%)
## 2 10 829(11.2%)
## 3 9 789(10.6%)
## 4 11 678( 9.1%)
## 5 6 838(11.3%)
## 6 11 807(10.9%)
## 7 10 1050(14.1%)
## 8 8 633( 8.5%)
## 9 10 888(12.0%)
## boost 525( 7.1%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 2537 271 (a): class Acer
## 254 4362 (b): class Sony
##
##
## Attribute usage:
##
## 100.00% salary
## 100.00% age
##
##
## Time: 0.3 secs
# Fit a gradient boosting model (gbm) using the shared `fitControl`
# resampling settings.
# `verbose = FALSE` is forwarded by train() to gbm(); without it gbm prints a
# deviance trace for every fit of every resample, which buries the report in
# hundreds of lines of iteration output.
plsFit2 <- train(
  brand ~ .,
  data = Training,
  method = "gbm",
  trControl = fitControl,
  tuneLength = 2,
  verbose = FALSE
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3012 nan 0.1000 0.0128
## 2 1.2791 nan 0.1000 0.0105
## 3 1.2608 nan 0.1000 0.0090
## 4 1.2456 nan 0.1000 0.0075
## 5 1.2327 nan 0.1000 0.0063
## 6 1.2215 nan 0.1000 0.0055
## 7 1.2105 nan 0.1000 0.0052
## 8 1.2023 nan 0.1000 0.0039
## 9 1.1953 nan 0.1000 0.0035
## 10 1.1888 nan 0.1000 0.0026
## 20 1.1301 nan 0.1000 0.0017
## 40 1.0729 nan 0.1000 0.0007
## 60 1.0497 nan 0.1000 0.0002
## 80 1.0381 nan 0.1000 -0.0000
## 100 1.0321 nan 0.1000 0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2897 nan 0.1000 0.0176
## 2 1.2602 nan 0.1000 0.0150
## 3 1.2350 nan 0.1000 0.0128
## 4 1.2100 nan 0.1000 0.0125
## 5 1.1915 nan 0.1000 0.0088
## 6 1.1724 nan 0.1000 0.0093
## 7 1.1557 nan 0.1000 0.0075
## 8 1.1427 nan 0.1000 0.0060
## 9 1.1306 nan 0.1000 0.0056
## 10 1.1211 nan 0.1000 0.0044
## 20 1.0467 nan 0.1000 0.0027
## 40 0.8598 nan 0.1000 0.0059
## 60 0.7304 nan 0.1000 0.0005
## 80 0.6560 nan 0.1000 0.0026
## 100 0.5780 nan 0.1000 0.0011
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3009 nan 0.1000 0.0130
## 2 1.2786 nan 0.1000 0.0112
## 3 1.2597 nan 0.1000 0.0090
## 4 1.2428 nan 0.1000 0.0078
## 5 1.2295 nan 0.1000 0.0066
## 6 1.2186 nan 0.1000 0.0052
## 7 1.2069 nan 0.1000 0.0058
## 8 1.1970 nan 0.1000 0.0051
## 9 1.1896 nan 0.1000 0.0031
## 10 1.1804 nan 0.1000 0.0044
## 20 1.1241 nan 0.1000 0.0016
## 40 1.0667 nan 0.1000 0.0006
## 60 1.0396 nan 0.1000 0.0008
## 80 1.0282 nan 0.1000 -0.0000
## 100 1.0234 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2899 nan 0.1000 0.0176
## 2 1.2586 nan 0.1000 0.0152
## 3 1.2300 nan 0.1000 0.0148
## 4 1.2047 nan 0.1000 0.0119
## 5 1.1856 nan 0.1000 0.0099
## 6 1.1690 nan 0.1000 0.0086
## 7 1.1530 nan 0.1000 0.0077
## 8 1.1392 nan 0.1000 0.0064
## 9 1.1262 nan 0.1000 0.0062
## 10 1.1162 nan 0.1000 0.0047
## 20 1.0388 nan 0.1000 0.0029
## 40 0.9006 nan 0.1000 0.0082
## 60 0.7246 nan 0.1000 0.0028
## 80 0.6567 nan 0.1000 0.0014
## 100 0.5930 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3004 nan 0.1000 0.0127
## 2 1.2790 nan 0.1000 0.0108
## 3 1.2614 nan 0.1000 0.0085
## 4 1.2465 nan 0.1000 0.0073
## 5 1.2322 nan 0.1000 0.0070
## 6 1.2195 nan 0.1000 0.0066
## 7 1.2090 nan 0.1000 0.0052
## 8 1.1993 nan 0.1000 0.0047
## 9 1.1919 nan 0.1000 0.0036
## 10 1.1840 nan 0.1000 0.0039
## 20 1.1243 nan 0.1000 0.0031
## 40 1.0672 nan 0.1000 0.0007
## 60 1.0419 nan 0.1000 0.0001
## 80 1.0285 nan 0.1000 0.0000
## 100 1.0223 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2887 nan 0.1000 0.0180
## 2 1.2583 nan 0.1000 0.0157
## 3 1.2289 nan 0.1000 0.0141
## 4 1.2049 nan 0.1000 0.0112
## 5 1.1857 nan 0.1000 0.0093
## 6 1.1664 nan 0.1000 0.0093
## 7 1.1495 nan 0.1000 0.0079
## 8 1.1368 nan 0.1000 0.0064
## 9 1.1251 nan 0.1000 0.0055
## 10 1.1131 nan 0.1000 0.0052
## 20 1.0385 nan 0.1000 0.0033
## 40 0.8745 nan 0.1000 0.0003
## 60 0.7462 nan 0.1000 0.0003
## 80 0.6470 nan 0.1000 0.0037
## 100 0.5824 nan 0.1000 0.0027
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3012 nan 0.1000 0.0130
## 2 1.2791 nan 0.1000 0.0105
## 3 1.2608 nan 0.1000 0.0086
## 4 1.2441 nan 0.1000 0.0082
## 5 1.2299 nan 0.1000 0.0070
## 6 1.2194 nan 0.1000 0.0053
## 7 1.2096 nan 0.1000 0.0047
## 8 1.2026 nan 0.1000 0.0031
## 9 1.1926 nan 0.1000 0.0051
## 10 1.1866 nan 0.1000 0.0028
## 20 1.1256 nan 0.1000 0.0030
## 40 1.0694 nan 0.1000 0.0007
## 60 1.0441 nan 0.1000 0.0008
## 80 1.0322 nan 0.1000 -0.0001
## 100 1.0270 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2888 nan 0.1000 0.0182
## 2 1.2589 nan 0.1000 0.0147
## 3 1.2293 nan 0.1000 0.0145
## 4 1.2056 nan 0.1000 0.0111
## 5 1.1864 nan 0.1000 0.0092
## 6 1.1686 nan 0.1000 0.0080
## 7 1.1528 nan 0.1000 0.0076
## 8 1.1395 nan 0.1000 0.0063
## 9 1.1266 nan 0.1000 0.0061
## 10 1.1166 nan 0.1000 0.0048
## 20 1.0411 nan 0.1000 0.0016
## 40 0.8599 nan 0.1000 0.0085
## 60 0.7388 nan 0.1000 0.0007
## 80 0.6585 nan 0.1000 0.0028
## 100 0.5994 nan 0.1000 0.0029
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3002 nan 0.1000 0.0134
## 2 1.2788 nan 0.1000 0.0096
## 3 1.2597 nan 0.1000 0.0093
## 4 1.2432 nan 0.1000 0.0082
## 5 1.2292 nan 0.1000 0.0070
## 6 1.2177 nan 0.1000 0.0055
## 7 1.2068 nan 0.1000 0.0054
## 8 1.1988 nan 0.1000 0.0040
## 9 1.1913 nan 0.1000 0.0032
## 10 1.1842 nan 0.1000 0.0035
## 20 1.1289 nan 0.1000 0.0018
## 40 1.0706 nan 0.1000 0.0006
## 60 1.0481 nan 0.1000 0.0001
## 80 1.0377 nan 0.1000 -0.0001
## 100 1.0322 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2898 nan 0.1000 0.0182
## 2 1.2595 nan 0.1000 0.0150
## 3 1.2301 nan 0.1000 0.0140
## 4 1.2074 nan 0.1000 0.0110
## 5 1.1872 nan 0.1000 0.0092
## 6 1.1699 nan 0.1000 0.0087
## 7 1.1552 nan 0.1000 0.0069
## 8 1.1425 nan 0.1000 0.0061
## 9 1.1307 nan 0.1000 0.0057
## 10 1.1216 nan 0.1000 0.0045
## 20 1.0481 nan 0.1000 0.0030
## 40 0.8921 nan 0.1000 0.0126
## 60 0.7317 nan 0.1000 0.0040
## 80 0.6629 nan 0.1000 0.0012
## 100 0.5800 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3011 nan 0.1000 0.0127
## 2 1.2795 nan 0.1000 0.0105
## 3 1.2614 nan 0.1000 0.0085
## 4 1.2468 nan 0.1000 0.0069
## 5 1.2323 nan 0.1000 0.0069
## 6 1.2204 nan 0.1000 0.0061
## 7 1.2091 nan 0.1000 0.0053
## 8 1.2015 nan 0.1000 0.0032
## 9 1.1933 nan 0.1000 0.0042
## 10 1.1843 nan 0.1000 0.0044
## 20 1.1279 nan 0.1000 0.0016
## 40 1.0684 nan 0.1000 0.0004
## 60 1.0436 nan 0.1000 0.0000
## 80 1.0327 nan 0.1000 0.0000
## 100 1.0262 nan 0.1000 0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2898 nan 0.1000 0.0182
## 2 1.2602 nan 0.1000 0.0150
## 3 1.2308 nan 0.1000 0.0145
## 4 1.2088 nan 0.1000 0.0112
## 5 1.1897 nan 0.1000 0.0092
## 6 1.1708 nan 0.1000 0.0095
## 7 1.1560 nan 0.1000 0.0078
## 8 1.1427 nan 0.1000 0.0064
## 9 1.1313 nan 0.1000 0.0056
## 10 1.1191 nan 0.1000 0.0058
## 20 1.0406 nan 0.1000 0.0027
## 40 0.9172 nan 0.1000 0.0081
## 60 0.7486 nan 0.1000 0.0003
## 80 0.6529 nan 0.1000 0.0045
## 100 0.5919 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3008 nan 0.1000 0.0128
## 2 1.2794 nan 0.1000 0.0110
## 3 1.2616 nan 0.1000 0.0088
## 4 1.2460 nan 0.1000 0.0076
## 5 1.2317 nan 0.1000 0.0072
## 6 1.2194 nan 0.1000 0.0058
## 7 1.2090 nan 0.1000 0.0046
## 8 1.2018 nan 0.1000 0.0031
## 9 1.1917 nan 0.1000 0.0048
## 10 1.1837 nan 0.1000 0.0034
## 20 1.1267 nan 0.1000 0.0032
## 40 1.0703 nan 0.1000 0.0004
## 60 1.0454 nan 0.1000 0.0009
## 80 1.0330 nan 0.1000 0.0000
## 100 1.0277 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2892 nan 0.1000 0.0183
## 2 1.2583 nan 0.1000 0.0152
## 3 1.2323 nan 0.1000 0.0129
## 4 1.2062 nan 0.1000 0.0123
## 5 1.1852 nan 0.1000 0.0100
## 6 1.1681 nan 0.1000 0.0083
## 7 1.1525 nan 0.1000 0.0073
## 8 1.1393 nan 0.1000 0.0063
## 9 1.1281 nan 0.1000 0.0054
## 10 1.1182 nan 0.1000 0.0042
## 20 1.0464 nan 0.1000 0.0018
## 40 0.9246 nan 0.1000 0.0049
## 60 0.7896 nan 0.1000 0.0006
## 80 0.6787 nan 0.1000 0.0031
## 100 0.6164 nan 0.1000 0.0019
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2993 nan 0.1000 0.0134
## 2 1.2775 nan 0.1000 0.0109
## 3 1.2599 nan 0.1000 0.0084
## 4 1.2435 nan 0.1000 0.0080
## 5 1.2295 nan 0.1000 0.0070
## 6 1.2169 nan 0.1000 0.0060
## 7 1.2069 nan 0.1000 0.0049
## 8 1.1984 nan 0.1000 0.0041
## 9 1.1912 nan 0.1000 0.0031
## 10 1.1835 nan 0.1000 0.0035
## 20 1.1286 nan 0.1000 0.0013
## 40 1.0689 nan 0.1000 0.0005
## 60 1.0434 nan 0.1000 0.0005
## 80 1.0313 nan 0.1000 0.0002
## 100 1.0255 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2882 nan 0.1000 0.0181
## 2 1.2584 nan 0.1000 0.0150
## 3 1.2330 nan 0.1000 0.0128
## 4 1.2087 nan 0.1000 0.0118
## 5 1.1869 nan 0.1000 0.0105
## 6 1.1699 nan 0.1000 0.0083
## 7 1.1552 nan 0.1000 0.0071
## 8 1.1413 nan 0.1000 0.0065
## 9 1.1301 nan 0.1000 0.0055
## 10 1.1195 nan 0.1000 0.0050
## 20 1.0397 nan 0.1000 0.0021
## 40 0.9192 nan 0.1000 0.0007
## 60 0.7426 nan 0.1000 0.0037
## 80 0.6851 nan 0.1000 0.0001
## 100 0.6048 nan 0.1000 0.0037
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2998 nan 0.1000 0.0131
## 2 1.2777 nan 0.1000 0.0110
## 3 1.2603 nan 0.1000 0.0083
## 4 1.2439 nan 0.1000 0.0084
## 5 1.2299 nan 0.1000 0.0070
## 6 1.2182 nan 0.1000 0.0056
## 7 1.2068 nan 0.1000 0.0052
## 8 1.1970 nan 0.1000 0.0046
## 9 1.1895 nan 0.1000 0.0035
## 10 1.1819 nan 0.1000 0.0037
## 20 1.1238 nan 0.1000 0.0031
## 40 1.0651 nan 0.1000 0.0005
## 60 1.0386 nan 0.1000 -0.0002
## 80 1.0283 nan 0.1000 0.0001
## 100 1.0208 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2907 nan 0.1000 0.0187
## 2 1.2561 nan 0.1000 0.0167
## 3 1.2290 nan 0.1000 0.0136
## 4 1.2062 nan 0.1000 0.0114
## 5 1.1861 nan 0.1000 0.0101
## 6 1.1681 nan 0.1000 0.0085
## 7 1.1530 nan 0.1000 0.0070
## 8 1.1381 nan 0.1000 0.0071
## 9 1.1259 nan 0.1000 0.0059
## 10 1.1149 nan 0.1000 0.0050
## 20 1.0404 nan 0.1000 0.0030
## 40 0.9058 nan 0.1000 0.0008
## 60 0.7448 nan 0.1000 0.0036
## 80 0.6638 nan 0.1000 0.0021
## 100 0.5954 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3004 nan 0.1000 0.0130
## 2 1.2780 nan 0.1000 0.0109
## 3 1.2593 nan 0.1000 0.0085
## 4 1.2440 nan 0.1000 0.0072
## 5 1.2301 nan 0.1000 0.0072
## 6 1.2175 nan 0.1000 0.0063
## 7 1.2067 nan 0.1000 0.0051
## 8 1.1992 nan 0.1000 0.0033
## 9 1.1899 nan 0.1000 0.0044
## 10 1.1836 nan 0.1000 0.0029
## 20 1.1223 nan 0.1000 0.0027
## 40 1.0672 nan 0.1000 0.0007
## 60 1.0418 nan 0.1000 0.0002
## 80 1.0280 nan 0.1000 0.0001
## 100 1.0224 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2890 nan 0.1000 0.0185
## 2 1.2577 nan 0.1000 0.0156
## 3 1.2280 nan 0.1000 0.0144
## 4 1.2050 nan 0.1000 0.0113
## 5 1.1846 nan 0.1000 0.0098
## 6 1.1658 nan 0.1000 0.0092
## 7 1.1504 nan 0.1000 0.0073
## 8 1.1385 nan 0.1000 0.0061
## 9 1.1242 nan 0.1000 0.0072
## 10 1.1124 nan 0.1000 0.0056
## 20 1.0362 nan 0.1000 0.0028
## 40 0.8543 nan 0.1000 0.0094
## 60 0.7273 nan 0.1000 0.0045
## 80 0.6557 nan 0.1000 0.0004
## 100 0.5815 nan 0.1000 0.0024
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2899 nan 0.1000 0.0186
## 2 1.2602 nan 0.1000 0.0145
## 3 1.2298 nan 0.1000 0.0144
## 4 1.2074 nan 0.1000 0.0111
## 5 1.1848 nan 0.1000 0.0100
## 6 1.1678 nan 0.1000 0.0081
## 7 1.1531 nan 0.1000 0.0077
## 8 1.1407 nan 0.1000 0.0060
## 9 1.1284 nan 0.1000 0.0055
## 10 1.1178 nan 0.1000 0.0049
## 20 1.0380 nan 0.1000 0.0028
## 40 0.8939 nan 0.1000 0.0108
## 60 0.7363 nan 0.1000 0.0020
## 80 0.6517 nan 0.1000 0.0014
## 100 0.5788 nan 0.1000 0.0003
# Fit a random forest using the shared `fitControl` resampling settings
# NOTE(review): caret reports only one unique candidate in the default grid
# for this fit, so `tuneLength = 5` appears to have no effect here - confirm
# and consider an explicit `tuneGrid`
plsFit3 <- train(
  brand ~ .,
  data = Training,
  method = "rf",
  trControl = fitControl,
  tuneLength = 5
)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
#The C5.0 model (plsFit1) was chosen for its accuracy of 93% and Kappa of 85% on the test set.
#It also takes considerably less time than the other models.
#Lastly, GBM and RF offer broadly similar results: on the incomplete survey the predicted Acer counts are 1894 (C5.0), 2107 (GBM) and 1910 (RF), so it is hard to say which predictions will be closer to reality.
# Predict the missing brand preference with each of the three fitted models
predictions <- predict(plsFit1, newdata = IncompleteSurvey)
predictions2 <- predict(plsFit2, newdata = IncompleteSurvey)
predictions3 <- predict(plsFit3, newdata = IncompleteSurvey)
# Bar plot of the C5.0 predictions, then predicted counts per brand and model
plot(x = predictions)
print(summary(predictions))
## Acer Sony
## 1894 3106
print(summary(predictions2))
## Acer Sony
## 2107 2893
print(summary(predictions3))
## Acer Sony
## 1910 3090
#Final predictions. Problem: the distribution percentage of brand is alarmingly similar to the dataset. This is probably due to the sample being stratified.
# Score the held-out rows with the C5.0 model; prediction happens before the
# new column is attached so the model only sees the original predictors
predictions_testing <- predict(object = plsFit1, newdata = Testing)
Testing$predictions <- as.factor(predictions_testing)
Testing$brand <- as.factor(Testing$brand)
# Accuracy and Kappa of the predictions against the true labels
postResample(pred = Testing$predictions, obs = Testing$brand)
## Accuracy Kappa
## 0.9288601 0.8493955
#Accuracy and Kappa
# Cross-tabulate predicted against actual brand on the held-out rows
KungFuSionMatrix <- confusionMatrix(
  data = Testing$predictions,
  reference = Testing$brand
)
KungFuSionMatrix[["table"]]
## Reference
## Prediction Acer Sony
## Acer 858 98
## Sony 78 1440
#2298 out of 2474 have been correctly identified