#TECHNICAL REPORT

Exporting and loading

# Show the working directory; the relative CSV paths below are resolved against it.
getwd()
## [1] "C:/Users/S/Documents/Ubiqum/dataanalyticsII_task2"
# Read both survey files with read.csv(); the first row of each is the header.
Survey <- read.csv(file = "CompleteResponses.csv", header = TRUE)
IncompleteSurvey <- read.csv(file = "SurveyIncomplete.csv", header = TRUE)

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Metrics)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(caTools)
library(mlbench)
library(C50)
## Warning: package 'C50' was built under R version 3.6.1
library(inum)
## Warning: package 'inum' was built under R version 3.6.1
library(doSNOW)
## Warning: package 'doSNOW' was built under R version 3.6.1
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: snow
library(mgsub)
## Warning: package 'mgsub' was built under R version 3.6.1
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.1
library(textclean)
## Warning: package 'textclean' was built under R version 3.6.1
## 
## Attaching package: 'textclean'
## The following object is masked from 'package:mgsub':
## 
##     mgsub
library(tictoc)

Data Exploration

# Preview the first six rows of the survey
head(x = Survey, n = 6L)
##      salary age elevel car zipcode    credit brand
## 1 119806.54  45      0  14       4 442037.71     0
## 2 106880.48  63      1  11       6  45007.18     1
## 3  78020.75  23      0  15       2  48795.32     0
## 4  63689.94  51      3   6       5  40888.88     1
## 5  50873.62  20      3  14       4 352951.50     0
## 6 130812.74  56      3  14       3 135943.02     1
# Per-column summary statistics (min, quartiles, mean, max)
summary(object = Survey)
##      salary            age            elevel           car       
##  Min.   : 20000   Min.   :20.00   Min.   :0.000   Min.   : 1.00  
##  1st Qu.: 52082   1st Qu.:35.00   1st Qu.:1.000   1st Qu.: 6.00  
##  Median : 84950   Median :50.00   Median :2.000   Median :11.00  
##  Mean   : 84871   Mean   :49.78   Mean   :1.983   Mean   :10.52  
##  3rd Qu.:117162   3rd Qu.:65.00   3rd Qu.:3.000   3rd Qu.:15.75  
##  Max.   :150000   Max.   :80.00   Max.   :4.000   Max.   :20.00  
##     zipcode          credit           brand       
##  Min.   :0.000   Min.   :     0   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:120807   1st Qu.:0.0000  
##  Median :4.000   Median :250607   Median :1.0000  
##  Mean   :4.041   Mean   :249176   Mean   :0.6217  
##  3rd Qu.:6.000   3rd Qu.:374640   3rd Qu.:1.0000  
##  Max.   :8.000   Max.   :500000   Max.   :1.0000
# Column types and dimensions of the survey
str(object = Survey)
## 'data.frame':    9898 obs. of  7 variables:
##  $ salary : num  119807 106880 78021 63690 50874 ...
##  $ age    : int  45 63 23 51 20 56 24 62 29 41 ...
##  $ elevel : int  0 1 0 3 3 3 4 3 4 1 ...
##  $ car    : int  14 11 15 6 14 14 8 3 17 5 ...
##  $ zipcode: int  4 6 2 5 4 3 5 0 0 4 ...
##  $ credit : num  442038 45007 48795 40889 352951 ...
##  $ brand  : int  0 1 0 1 0 1 1 1 0 1 ...
# Count repeated rows: total rows minus the number of distinct rows
nrow(Survey) - nrow(unique(Survey))
## [1] 0
# Checking for outliers.
# The columns live on very different scales (elevel runs 0-4, credit up to
# 500000), so each one is standardised before plotting. On a shared raw axis the
# small-range boxes collapse onto the baseline and cannot be inspected.
# Standardising is a linear rescale, so the points flagged beyond the whiskers
# are the same ones that would be flagged on the raw values.
outlier_cols <- c("age", "salary", "elevel", "car", "credit", "zipcode")
boxplot(scale(Survey[, outlier_cols]))

# There are none

Data Exploration (Continuation) + Preprocessing

# Salary: the histogram shows an even spread across the whole range
ggplot(data = Survey, mapping = aes(x = salary)) +
  geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue")

# Zipcode (and likewise car) is just as flat, which tells us the data is a
# stratified random sample
ggplot(data = Survey, mapping = aes(x = zipcode)) +
  geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue")

# Decode the integer car code (1-20) into the make it stands for.
# The code is used directly as an index into the label vector, so the result
# does not depend on which package's mgsub() is on the search path, nor on
# multi-digit codes such as 10-19 being substituted before the single digit 1.
# Codes outside 1-20 (or NA) become NA instead of being left half-substituted.
car_labels <- c(
  "BMW", "Buick", "Cadillac", "Chevrolet", "Chrysler",
  "Dodge", "Ford", "Honda", "Hyundai", "Jeep",
  "Kia", "Lincoln", "Mazda", "Mercedes Benz", "Mitsubishi",
  "Nissan", "Ram", "Subaru", "Toyota", "None of the above"
)
Survey$car <- car_labels[Survey$car]

# Car make is, unhelpfully, evenly distributed.
# geom_bar() counts rows per category and has no `bins` argument, so none is
# passed (supplying one only triggers an "unknown parameters" warning).
ggplot(Survey, aes(x = car)) +
  geom_bar(color = "darkblue", fill = "lightblue")

# Relabel the dependent variable: brand code 0 becomes "Acer", any other code
# becomes "Sony". The factor is built AFTER relabelling, because ifelse()
# returns a plain character vector and would discard an earlier factor
# conversion. Fixing the level order keeps both data sets on identical levels
# even if one of them happens to contain only a single brand.
brand_levels <- c("Acer", "Sony")
Survey$brand <- factor(
  ifelse(Survey$brand == 0, "Acer", "Sony"),
  levels = brand_levels
)
IncompleteSurvey$brand <- factor(
  ifelse(IncompleteSurvey$brand == 0, "Acer", "Sony"),
  levels = brand_levels
)

# Brand split: about 62% of the customers surveyed prefer Sony, 38% Acer.
ggplot(data = Survey, mapping = aes(x = brand, fill = brand)) +
  geom_bar()

# Salary histograms of the two brands drawn on top of each other
# (position = "identity" overlays instead of stacking; alpha keeps both visible)
ggplot(data = Survey, mapping = aes(x = salary, colour = brand, fill = brand)) +
  geom_histogram(position = "identity", alpha = 0.5, colour = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Salary boxplot per preferred brand, with the median printed just above its line
# NOTE(review): `fun.y` and `..y..` are kept as written; newer ggplot2 releases
# may report them as deprecated - confirm against the installed version.
ggplot(data = Survey, mapping = aes(x = brand, y = salary, fill = brand)) +
  geom_boxplot() +
  stat_summary(
    mapping = aes(label = round(..y.., digits = 1)),
    geom = "text",
    fun.y = median,
    colour = "black",
    vjust = -0.7
  )

# Age against salary, coloured by brand, with a smoothed trend line per brand
ggplot(data = Survey, mapping = aes(x = age, y = salary, colour = brand)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Preprocessing

# After the first model, the author found that elevel, car, zipcode and credit
# had no meaningful relationship (<0.1%) with the dependent variable, so they
# are removed from both data sets.
# The columns are dropped by name rather than by position, so the result stays
# correct even if the two CSV files do not share the same column order.
dropped_cols <- c("elevel", "car", "zipcode", "credit")
Survey <- Survey[!(names(Survey) %in% dropped_cols)]
IncompleteSurvey <- IncompleteSurvey[!(names(IncompleteSurvey) %in% dropped_cols)]

Feature Engineering

# Standardise the two predictors (z-scores).
# The center and scale are taken from Survey, the data the model is trained on,
# and the SAME values are applied to IncompleteSurvey. Standardising each data
# set with its own mean/sd would put the two on different scales, so thresholds
# learned on Survey would not carry over to IncompleteSurvey.
# as.vector() drops the one-column matrix shape and the attributes that scale()
# returns, so the columns are stored as plain numeric vectors.
predictor_cols <- c("salary", "age")
for (col_name in predictor_cols) {
  scaled_col <- scale(Survey[[col_name]])
  IncompleteSurvey[[col_name]] <- as.vector(scale(
    IncompleteSurvey[[col_name]],
    center = attr(scaled_col, "scaled:center"),
    scale = attr(scaled_col, "scaled:scale")
  ))
  Survey[[col_name]] <- as.vector(scaled_col)
}

Creating training and test sets

# Fix the RNG state so the partition below can be reproduced
set.seed(seed = 456)
# Row indices for a single 75% partition drawn on the brand label;
# the remaining rows become the test set
inTraining <- createDataPartition(
  y = Survey$brand,
  times = 1,
  p = 0.75,
  list = FALSE
)
Training <- Survey[inTraining, ]
Testing <- Survey[-inTraining, ]

# Share of each brand label in the full, training and test sets.
# The three splits show near-identical proportions, confirming that the
# 'caret' partition preserved the label distribution.
prop.table(x = table(Survey[["brand"]]))
## 
##      Acer      Sony 
## 0.3782582 0.6217418
prop.table(x = table(Training[["brand"]]))
## 
##      Acer      Sony 
## 0.3782328 0.6217672
prop.table(x = table(Testing[["brand"]]))
## 
##      Acer      Sony 
## 0.3783347 0.6216653

Model

# Resampling scheme shared by the models: 10-fold cross-validation
fitControl <- trainControl(method = "cv", number = 10)

# Fit C5.0 on the training set with brand as the outcome and every remaining
# column as a predictor; tic()/toc() report the wall-clock time of the fit
tic()
plsFit1 <- train(
  brand ~ .,
  data = Training,
  method = "C5.0",
  trControl = fitControl,
  tuneLength = 2
)
toc()
## 15.95 sec elapsed
print(plsFit1)
## C5.0 
## 
## 7424 samples
##    2 predictor
##    2 classes: 'Acer', 'Sony' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 6682, 6682, 6682, 6681, 6681, 6681, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE    1      0.8207211  0.6426651
##   rules  FALSE   10      0.9197191  0.8284152
##   rules   TRUE    1      0.8207211  0.6426651
##   rules   TRUE   10      0.9197191  0.8284152
##   tree   FALSE    1      0.8203168  0.6419143
##   tree   FALSE   10      0.9191797  0.8278861
##   tree    TRUE    1      0.8203168  0.6419143
##   tree    TRUE   10      0.9191797  0.8278861
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = rules
##  and winnow = TRUE.
# Rule sets, training error and attribute usage of the fitted C5.0 model
summary(object = plsFit1)
## 
## Call:
## (function (x, y, trials = 1, rules = FALSE, weights = NULL, control
##  fuzzyThreshold = FALSE, sample = 0, earlyStopping = TRUE, label
##  = "outcome",     seed = 908L))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Jul 11 10:15:51 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 7424 cases (3 attributes) from undefined.data
## 
## No attributes winnowed
## 
## -----  Trial 0:  -----
## 
## Rules:
## 
## Rule 0/1: (446/52, lift 2.3)
##  salary > 0.4062575
##  salary <= 1.037807
##  age > -0.6693671
##  age <= 0.5240395
##  ->  class Acer  [0.882]
## 
## Rule 0/2: (375/52, lift 2.3)
##  salary > -1.589961
##  salary <= -1.042109
##  age > 0.5240395
##  ->  class Acer  [0.859]
## 
## Rule 0/3: (3122/1163, lift 1.7)
##  salary > -1.042109
##  salary <= 0.4062575
##  ->  class Acer  [0.627]
## 
## Rule 0/4: (997/16, lift 1.6)
##  salary <= -1.042109
##  age <= 0.5240395
##  ->  class Sony  [0.983]
## 
## Rule 0/5: (994/20, lift 1.6)
##  salary > 0.4062575
##  age > 0.5240395
##  ->  class Sony  [0.979]
## 
## Rule 0/6: (1490/33, lift 1.6)
##  salary > 1.037807
##  ->  class Sony  [0.977]
## 
## Rule 0/7: (886/34, lift 1.5)
##  salary > 0.4062575
##  age <= -0.6693671
##  ->  class Sony  [0.961]
## 
## Rule 0/8: (546/40, lift 1.5)
##  salary <= -0.8688307
##  age <= -0.726196
##  ->  class Sony  [0.925]
## 
## Rule 0/9: (314/30, lift 1.5)
##  salary <= -1.589961
##  ->  class Sony  [0.902]
## 
## Default class: Sony
## 
## -----  Trial 1:  -----
## 
## Rules:
## 
## Rule 1/1: (422.6/61.3, lift 2.6)
##  salary > -1.003016
##  salary <= -0.2386878
##  age > 0.6376973
##  ->  class Acer  [0.853]
## 
## Rule 1/2: (430.6/87.8, lift 2.4)
##  salary > -1.003016
##  salary <= -0.3290423
##  age <= -0.6693671
##  ->  class Acer  [0.795]
## 
## Rule 1/3: (1168.8/284.5, lift 2.3)
##  salary > -0.3290423
##  salary <= 0.5037088
##  age <= 0.6376973
##  ->  class Acer  [0.756]
## 
## Rule 1/4: (1218.6/381.4, lift 2.1)
##  salary > -0.3290423
##  salary <= 1.162528
##  age > -0.726196
##  age <= 0.6376973
##  ->  class Acer  [0.687]
## 
## Rule 1/5: (554.8/200.3, lift 1.9)
##  salary <= -1.003016
##  age > 0.4103817
##  ->  class Acer  [0.638]
## 
## Rule 1/6: (988/7.4, lift 1.5)
##  salary > 1.162528
##  ->  class Sony  [0.991]
## 
## Rule 1/7: (633.2/7.4, lift 1.5)
##  salary > 0.5037088
##  age <= -0.726196
##  ->  class Sony  [0.987]
## 
## Rule 1/8: (1554.6/25.2, lift 1.5)
##  salary > -0.2386878
##  age > 0.6376973
##  ->  class Sony  [0.983]
## 
## Rule 1/9: (846.7/21.3, lift 1.5)
##  salary <= -1.003016
##  age <= 0.4103817
##  ->  class Sony  [0.974]
## 
## Rule 1/10: (1448.6/122, lift 1.4)
##  salary <= -0.3290423
##  age > -0.6693671
##  age <= 0.6376973
##  ->  class Sony  [0.915]
## 
## Default class: Sony
## 
## -----  Trial 2:  -----
## 
## Rules:
## 
## Rule 2/1: (131.6, lift 3.2)
##  salary > -0.119302
##  salary <= 0.2652091
##  age > -0.5557094
##  age <= 0.3535528
##  ->  class Acer  [0.993]
## 
## Rule 2/2: (592.9/71.4, lift 2.8)
##  salary > -0.7922122
##  salary <= 0.2652091
##  age <= -0.5557094
##  ->  class Acer  [0.878]
## 
## Rule 2/3: (862.4/182.3, lift 2.6)
##  salary > -1.441603
##  salary <= -0.320527
##  age > 0.3535528
##  ->  class Acer  [0.788]
## 
## Rule 2/4: (531.8/197.2, lift 2.0)
##  salary > 0.2652091
##  salary <= 1.005452
##  age > -0.5557094
##  age <= 0.6945261
##  ->  class Acer  [0.629]
## 
## Rule 2/5: (1229.6/68.2, lift 1.4)
##  salary > 1.005452
##  ->  class Sony  [0.944]
## 
## Rule 2/6: (1087.1/78.4, lift 1.3)
##  salary <= -0.119302
##  age > -0.5557094
##  age <= 0.3535528
##  ->  class Sony  [0.927]
## 
## Rule 2/7: (1875.6/201.5, lift 1.3)
##  salary > -0.320527
##  age > 0.3535528
##  ->  class Sony  [0.892]
## 
## Rule 2/8: (1012.3/123.4, lift 1.3)
##  salary > 0.2652091
##  age <= -0.5557094
##  ->  class Sony  [0.877]
## 
## Rule 2/9: (699.5/111.9, lift 1.2)
##  salary <= -1.441603
##  ->  class Sony  [0.839]
## 
## Rule 2/10: (692.1/129.9, lift 1.2)
##  salary <= -0.7922122
##  age <= -0.5557094
##  ->  class Sony  [0.811]
## 
## Default class: Sony
## 
## -----  Trial 3:  -----
## 
## Rules:
## 
## Rule 3/1: (1097/323.8, lift 1.7)
##  salary > -1.072638
##  salary <= 0.5414523
##  age <= -0.7830248
##  ->  class Acer  [0.704]
## 
## Rule 3/2: (1153.2/378.4, lift 1.6)
##  salary > -0.1692976
##  salary <= 0.9518045
##  age > -0.7830248
##  age <= 0.6945261
##  ->  class Acer  [0.672]
## 
## Rule 3/3: (1382.7/462, lift 1.6)
##  salary > -1.712772
##  salary <= -0.1692976
##  age > 0.5240395
##  ->  class Acer  [0.666]
## 
## Rule 3/4: (358.9, lift 1.7)
##  salary > 0.5414523
##  age <= -0.7830248
##  ->  class Sony  [0.997]
## 
## Rule 3/5: (201.7, lift 1.7)
##  salary <= -1.072638
##  age <= -0.7830248
##  ->  class Sony  [0.995]
## 
## Rule 3/6: (102.7, lift 1.7)
##  salary <= -1.712772
##  ->  class Sony  [0.990]
## 
## Rule 3/7: (878/8.7, lift 1.7)
##  salary > -0.1692976
##  age > 0.6945261
##  ->  class Sony  [0.989]
## 
## Rule 3/8: (1148.5/169.3, lift 1.4)
##  salary > 0.9518045
##  ->  class Sony  [0.852]
## 
## Rule 3/9: (1614.6/403.7, lift 1.3)
##  salary <= -0.1692976
##  age > -0.7830248
##  age <= 0.5240395
##  ->  class Sony  [0.750]
## 
## Default class: Sony
## 
## -----  Trial 4:  -----
## 
## Rules:
## 
## Rule 4/1: (950.1/255.2, lift 1.8)
##  salary > -0.9038483
##  salary <= 0.3727877
##  age <= -0.5557094
##  ->  class Acer  [0.731]
## 
## Rule 4/2: (1298.3/388.2, lift 1.8)
##  salary > -0.3968317
##  salary <= 0.3727877
##  age <= 0.5808684
##  ->  class Acer  [0.701]
## 
## Rule 4/3: (726.7/271.3, lift 1.6)
##  salary > 0.3727877
##  salary <= 1.215824
##  age > -0.4988804
##  age <= 0.5808684
##  ->  class Acer  [0.626]
## 
## Rule 4/4: (551.1/239.6, lift 1.4)
##  salary > -0.9038483
##  salary <= -0.1692976
##  age > 0.5808684
##  ->  class Acer  [0.565]
## 
## Rule 4/5: (710.8/332.6, lift 1.3)
##  salary > -1.712772
##  salary <= -0.9038483
##  age > 0.5808684
##  ->  class Acer  [0.532]
## 
## Rule 4/6: (447, lift 1.7)
##  salary > 1.215824
##  ->  class Sony  [0.998]
## 
## Rule 4/7: (84.9, lift 1.6)
##  salary <= -1.712772
##  ->  class Sony  [0.988]
## 
## Rule 4/8: (988.1/62.3, lift 1.6)
##  salary > -0.1692976
##  age > 0.5808684
##  ->  class Sony  [0.936]
## 
## Rule 4/9: (958.2/179.3, lift 1.4)
##  salary <= -0.3968317
##  age > -0.5557094
##  age <= 0.5808684
##  ->  class Sony  [0.812]
## 
## Rule 4/10: (1054.2/214.8, lift 1.3)
##  salary <= -0.9038483
##  age <= 0.5808684
##  ->  class Sony  [0.796]
## 
## Rule 4/11: (966.8/197.5, lift 1.3)
##  salary > 0.3727877
##  age <= -0.4988804
##  ->  class Sony  [0.795]
## 
## Default class: Sony
## 
## -----  Trial 5:  -----
## 
## Rules:
## 
## Rule 5/1: (305.4/52.5, lift 2.1)
##  salary > -0.5426162
##  salary <= 0.330084
##  age <= -0.726196
##  ->  class Acer  [0.826]
## 
## Rule 5/2: (650.9/186.7, lift 1.9)
##  salary > -0.2194015
##  salary <= 0.330084
##  age <= 0.5240395
##  ->  class Acer  [0.713]
## 
## Rule 5/3: (682.8/257.5, lift 1.6)
##  salary > -1.54434
##  salary <= -0.4298495
##  age > 0.5240395
##  ->  class Acer  [0.623]
## 
## Rule 5/4: (692.3/269.8, lift 1.6)
##  salary > 0.330084
##  salary <= 1.107454
##  age > -0.6693671
##  age <= 0.4103817
##  ->  class Acer  [0.610]
## 
## Rule 5/5: (647.6/53.9, lift 1.5)
##  salary > 1.107454
##  ->  class Sony  [0.915]
## 
## Rule 5/6: (7049.2/2853.5, lift 1.0)
##  salary <= 1.215824
##  ->  class Sony  [0.595]
## 
## Default class: Sony
## 
## -----  Trial 6:  -----
## 
## Rules:
## 
## Rule 6/1: (175.9/20.1, lift 2.0)
##  salary > 0.2872152
##  salary <= 0.9994338
##  age > -0.3852227
##  age <= 0.4103817
##  ->  class Acer  [0.881]
## 
## Rule 6/2: (227.9/26.5, lift 2.0)
##  salary > -0.13866
##  salary <= 0.2872152
##  age <= 0.4103817
##  ->  class Acer  [0.880]
## 
## Rule 6/3: (1052.2/301.1, lift 1.6)
##  salary > -0.9989635
##  salary <= -0.13866
##  age <= -0.5557094
##  ->  class Acer  [0.713]
## 
## Rule 6/4: (659.3/269.2, lift 1.3)
##  salary > -0.4159816
##  salary <= -0.13866
##  age <= 0.4103817
##  ->  class Acer  [0.591]
## 
## Rule 6/5: (977.9/400.6, lift 1.3)
##  salary > -0.9989635
##  salary <= -0.13866
##  age > 0.4103817
##  ->  class Acer  [0.590]
## 
## Rule 6/6: (273.5/114, lift 1.3)
##  salary > -1.433198
##  salary <= -0.9989635
##  age > 0.4103817
##  ->  class Acer  [0.583]
## 
## Rule 6/7: (435.2/26.2, lift 1.7)
##  salary <= -0.4159816
##  age > -0.5557094
##  age <= 0.4103817
##  ->  class Sony  [0.938]
## 
## Rule 6/8: (424.1/60.9, lift 1.5)
##  salary <= -0.9989635
##  age <= 0.4103817
##  ->  class Sony  [0.855]
## 
## Rule 6/9: (1199.1/271.5, lift 1.4)
##  salary > -0.13866
##  age > 0.4103817
##  ->  class Sony  [0.773]
## 
## Rule 6/10: (796.9/238.8, lift 1.3)
##  salary <= -1.433198
##  ->  class Sony  [0.700]
## 
## Rule 6/11: (2518.3/890.7, lift 1.2)
##  salary > 0.2872152
##  ->  class Sony  [0.646]
## 
## Default class: Sony
## 
## -----  Trial 7:  -----
## 
## Rules:
## 
## Rule 7/1: (130.8, lift 2.2)
##  salary > -0.873679
##  salary <= -0.320527
##  age > 0.6376973
##  ->  class Acer  [0.992]
## 
## Rule 7/2: (1023.1/363, lift 1.4)
##  salary > -0.2754662
##  salary <= 0.9994338
##  age > -0.3852227
##  age <= 0.6376973
##  ->  class Acer  [0.645]
## 
## Rule 7/3: (2187.3/814.4, lift 1.4)
##  salary > -0.873679
##  salary <= 1.107454
##  age <= -0.3852227
##  ->  class Acer  [0.628]
## 
## Rule 7/4: (810.8/342.7, lift 1.3)
##  salary > -1.712772
##  salary <= -0.873679
##  age > 0.5808684
##  ->  class Acer  [0.577]
## 
## Rule 7/5: (448.5, lift 1.9)
##  salary > 1.107454
##  ->  class Sony  [0.998]
## 
## Rule 7/6: (53.8, lift 1.8)
##  salary <= -1.712772
##  ->  class Sony  [0.982]
## 
## Rule 7/7: (535.3/88.1, lift 1.5)
##  salary > 0.9994338
##  age > -0.3852227
##  ->  class Sony  [0.834]
## 
## Rule 7/8: (926.6/156.9, lift 1.5)
##  salary > -0.320527
##  age > 0.6376973
##  ->  class Sony  [0.830]
## 
## Rule 7/9: (994.3/247.1, lift 1.4)
##  salary <= -0.873679
##  age <= 0.5808684
##  ->  class Sony  [0.751]
## 
## Rule 7/10: (1150.8/337.8, lift 1.3)
##  salary <= -0.2754662
##  age > -0.3852227
##  age <= 0.6376973
##  ->  class Sony  [0.706]
## 
## Default class: Sony
## 
## -----  Trial 8:  -----
## 
## Rules:
## 
## Rule 8/1: (257.8/21.9, lift 2.2)
##  salary > -0.2187837
##  salary <= 0.5012654
##  age > -0.6693671
##  age <= 0.3535528
##  ->  class Acer  [0.912]
## 
## Rule 8/2: (990.8/281.2, lift 1.7)
##  salary > -1.589961
##  salary <= -0.1692976
##  age > 0.5808684
##  ->  class Acer  [0.716]
## 
## Rule 8/3: (1337.4/403, lift 1.7)
##  salary > -0.9989635
##  salary <= 0.5012654
##  age <= -0.6693671
##  ->  class Acer  [0.698]
## 
## Rule 8/4: (699.2/260.4, lift 1.5)
##  salary > 0.5012654
##  salary <= 1.037807
##  age > -0.726196
##  age <= 0.5808684
##  ->  class Acer  [0.627]
## 
## Rule 8/5: (626, lift 1.8)
##  salary > 0.5012654
##  age <= -0.726196
##  ->  class Sony  [0.998]
## 
## Rule 8/6: (150.7, lift 1.8)
##  salary <= -0.9989635
##  age <= -0.6693671
##  ->  class Sony  [0.993]
## 
## Rule 8/7: (622/16, lift 1.8)
##  salary > 1.037807
##  ->  class Sony  [0.973]
## 
## Rule 8/8: (5043.8/2176.9, lift 1.0)
##  age > -0.6693671
##  ->  class Sony  [0.568]
## 
## Default class: Sony
## 
## -----  Trial 9:  -----
## 
## Rules:
## 
## Rule 9/1: (260.5, lift 2.4)
##  salary > -1.587265
##  salary <= -0.873679
##  age > 0.6376973
##  ->  class Acer  [0.996]
## 
## Rule 9/2: (794.1/74.5, lift 2.2)
##  salary > -0.873679
##  salary <= 0.3718849
##  age <= -0.5557094
##  ->  class Acer  [0.905]
## 
## Rule 9/3: (529.9/109.2, lift 1.9)
##  salary > 0.4062575
##  salary <= 1.037807
##  age > -0.6693671
##  age <= 0.5240395
##  ->  class Acer  [0.793]
## 
## Rule 9/4: (1472.5/344, lift 1.9)
##  salary > -0.3342096
##  salary <= 0.4062575
##  age <= 0.5808684
##  ->  class Acer  [0.766]
## 
## Rule 9/5: (936.6, lift 1.9)
##  salary > 0.4062575
##  age <= -0.6693671
##  ->  class Sony  [0.999]
## 
## Rule 9/6: (539, lift 1.9)
##  salary > 1.037807
##  ->  class Sony  [0.998]
## 
## Rule 9/7: (400/32.5, lift 1.7)
##  salary > 0.4062575
##  age > 0.5240395
##  ->  class Sony  [0.917]
## 
## Rule 9/8: (928.3/206.7, lift 1.4)
##  salary > -0.3342096
##  age > 0.5808684
##  ->  class Sony  [0.777]
## 
## Rule 9/9: (1085.3/258.5, lift 1.4)
##  salary <= -0.873679
##  age <= 0.6376973
##  ->  class Sony  [0.761]
## 
## Rule 9/10: (1587.5/684.4, lift 1.1)
##  salary <= -0.3342096
##  age > -0.5557094
##  ->  class Sony  [0.569]
## 
## Default class: Sony
## 
## 
## Evaluation on training data (7424 cases):
## 
## Trial            Rules     
## -----      ----------------
##      No      Errors
## 
##    0      9 1366(18.4%)
##    1     10  727( 9.8%)
##    2     10  829(11.2%)
##    3      9  789(10.6%)
##    4     11  678( 9.1%)
##    5      6  838(11.3%)
##    6     11  807(10.9%)
##    7     10 1050(14.1%)
##    8      8  633( 8.5%)
##    9     10  888(12.0%)
## boost            525( 7.1%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    2537   271    (a): class Acer
##     254  4362    (b): class Sony
## 
## 
##  Attribute usage:
## 
##  100.00% salary
##  100.00% age
## 
## 
## Time: 0.3 secs
# Fit a gradient boosting machine with the same formula, training data and
# resampling scheme as the C5.0 model.
# verbose = FALSE is forwarded by train() to the underlying gbm fit; without it
# every resample and tuning combination prints its full per-iteration deviance
# table, burying the report in thousands of trace lines.
plsFit2 <- train(brand ~ ., data = Training,
                 method = "gbm", trControl = fitControl,
                 tuneLength = 2,
                 verbose = FALSE)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3012             nan     0.1000    0.0128
##      2        1.2791             nan     0.1000    0.0105
##      3        1.2608             nan     0.1000    0.0090
##      4        1.2456             nan     0.1000    0.0075
##      5        1.2327             nan     0.1000    0.0063
##      6        1.2215             nan     0.1000    0.0055
##      7        1.2105             nan     0.1000    0.0052
##      8        1.2023             nan     0.1000    0.0039
##      9        1.1953             nan     0.1000    0.0035
##     10        1.1888             nan     0.1000    0.0026
##     20        1.1301             nan     0.1000    0.0017
##     40        1.0729             nan     0.1000    0.0007
##     60        1.0497             nan     0.1000    0.0002
##     80        1.0381             nan     0.1000   -0.0000
##    100        1.0321             nan     0.1000    0.0003
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2897             nan     0.1000    0.0176
##      2        1.2602             nan     0.1000    0.0150
##      3        1.2350             nan     0.1000    0.0128
##      4        1.2100             nan     0.1000    0.0125
##      5        1.1915             nan     0.1000    0.0088
##      6        1.1724             nan     0.1000    0.0093
##      7        1.1557             nan     0.1000    0.0075
##      8        1.1427             nan     0.1000    0.0060
##      9        1.1306             nan     0.1000    0.0056
##     10        1.1211             nan     0.1000    0.0044
##     20        1.0467             nan     0.1000    0.0027
##     40        0.8598             nan     0.1000    0.0059
##     60        0.7304             nan     0.1000    0.0005
##     80        0.6560             nan     0.1000    0.0026
##    100        0.5780             nan     0.1000    0.0011
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3009             nan     0.1000    0.0130
##      2        1.2786             nan     0.1000    0.0112
##      3        1.2597             nan     0.1000    0.0090
##      4        1.2428             nan     0.1000    0.0078
##      5        1.2295             nan     0.1000    0.0066
##      6        1.2186             nan     0.1000    0.0052
##      7        1.2069             nan     0.1000    0.0058
##      8        1.1970             nan     0.1000    0.0051
##      9        1.1896             nan     0.1000    0.0031
##     10        1.1804             nan     0.1000    0.0044
##     20        1.1241             nan     0.1000    0.0016
##     40        1.0667             nan     0.1000    0.0006
##     60        1.0396             nan     0.1000    0.0008
##     80        1.0282             nan     0.1000   -0.0000
##    100        1.0234             nan     0.1000   -0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2899             nan     0.1000    0.0176
##      2        1.2586             nan     0.1000    0.0152
##      3        1.2300             nan     0.1000    0.0148
##      4        1.2047             nan     0.1000    0.0119
##      5        1.1856             nan     0.1000    0.0099
##      6        1.1690             nan     0.1000    0.0086
##      7        1.1530             nan     0.1000    0.0077
##      8        1.1392             nan     0.1000    0.0064
##      9        1.1262             nan     0.1000    0.0062
##     10        1.1162             nan     0.1000    0.0047
##     20        1.0388             nan     0.1000    0.0029
##     40        0.9006             nan     0.1000    0.0082
##     60        0.7246             nan     0.1000    0.0028
##     80        0.6567             nan     0.1000    0.0014
##    100        0.5930             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3004             nan     0.1000    0.0127
##      2        1.2790             nan     0.1000    0.0108
##      3        1.2614             nan     0.1000    0.0085
##      4        1.2465             nan     0.1000    0.0073
##      5        1.2322             nan     0.1000    0.0070
##      6        1.2195             nan     0.1000    0.0066
##      7        1.2090             nan     0.1000    0.0052
##      8        1.1993             nan     0.1000    0.0047
##      9        1.1919             nan     0.1000    0.0036
##     10        1.1840             nan     0.1000    0.0039
##     20        1.1243             nan     0.1000    0.0031
##     40        1.0672             nan     0.1000    0.0007
##     60        1.0419             nan     0.1000    0.0001
##     80        1.0285             nan     0.1000    0.0000
##    100        1.0223             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2887             nan     0.1000    0.0180
##      2        1.2583             nan     0.1000    0.0157
##      3        1.2289             nan     0.1000    0.0141
##      4        1.2049             nan     0.1000    0.0112
##      5        1.1857             nan     0.1000    0.0093
##      6        1.1664             nan     0.1000    0.0093
##      7        1.1495             nan     0.1000    0.0079
##      8        1.1368             nan     0.1000    0.0064
##      9        1.1251             nan     0.1000    0.0055
##     10        1.1131             nan     0.1000    0.0052
##     20        1.0385             nan     0.1000    0.0033
##     40        0.8745             nan     0.1000    0.0003
##     60        0.7462             nan     0.1000    0.0003
##     80        0.6470             nan     0.1000    0.0037
##    100        0.5824             nan     0.1000    0.0027
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3012             nan     0.1000    0.0130
##      2        1.2791             nan     0.1000    0.0105
##      3        1.2608             nan     0.1000    0.0086
##      4        1.2441             nan     0.1000    0.0082
##      5        1.2299             nan     0.1000    0.0070
##      6        1.2194             nan     0.1000    0.0053
##      7        1.2096             nan     0.1000    0.0047
##      8        1.2026             nan     0.1000    0.0031
##      9        1.1926             nan     0.1000    0.0051
##     10        1.1866             nan     0.1000    0.0028
##     20        1.1256             nan     0.1000    0.0030
##     40        1.0694             nan     0.1000    0.0007
##     60        1.0441             nan     0.1000    0.0008
##     80        1.0322             nan     0.1000   -0.0001
##    100        1.0270             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2888             nan     0.1000    0.0182
##      2        1.2589             nan     0.1000    0.0147
##      3        1.2293             nan     0.1000    0.0145
##      4        1.2056             nan     0.1000    0.0111
##      5        1.1864             nan     0.1000    0.0092
##      6        1.1686             nan     0.1000    0.0080
##      7        1.1528             nan     0.1000    0.0076
##      8        1.1395             nan     0.1000    0.0063
##      9        1.1266             nan     0.1000    0.0061
##     10        1.1166             nan     0.1000    0.0048
##     20        1.0411             nan     0.1000    0.0016
##     40        0.8599             nan     0.1000    0.0085
##     60        0.7388             nan     0.1000    0.0007
##     80        0.6585             nan     0.1000    0.0028
##    100        0.5994             nan     0.1000    0.0029
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3002             nan     0.1000    0.0134
##      2        1.2788             nan     0.1000    0.0096
##      3        1.2597             nan     0.1000    0.0093
##      4        1.2432             nan     0.1000    0.0082
##      5        1.2292             nan     0.1000    0.0070
##      6        1.2177             nan     0.1000    0.0055
##      7        1.2068             nan     0.1000    0.0054
##      8        1.1988             nan     0.1000    0.0040
##      9        1.1913             nan     0.1000    0.0032
##     10        1.1842             nan     0.1000    0.0035
##     20        1.1289             nan     0.1000    0.0018
##     40        1.0706             nan     0.1000    0.0006
##     60        1.0481             nan     0.1000    0.0001
##     80        1.0377             nan     0.1000   -0.0001
##    100        1.0322             nan     0.1000   -0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2898             nan     0.1000    0.0182
##      2        1.2595             nan     0.1000    0.0150
##      3        1.2301             nan     0.1000    0.0140
##      4        1.2074             nan     0.1000    0.0110
##      5        1.1872             nan     0.1000    0.0092
##      6        1.1699             nan     0.1000    0.0087
##      7        1.1552             nan     0.1000    0.0069
##      8        1.1425             nan     0.1000    0.0061
##      9        1.1307             nan     0.1000    0.0057
##     10        1.1216             nan     0.1000    0.0045
##     20        1.0481             nan     0.1000    0.0030
##     40        0.8921             nan     0.1000    0.0126
##     60        0.7317             nan     0.1000    0.0040
##     80        0.6629             nan     0.1000    0.0012
##    100        0.5800             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3011             nan     0.1000    0.0127
##      2        1.2795             nan     0.1000    0.0105
##      3        1.2614             nan     0.1000    0.0085
##      4        1.2468             nan     0.1000    0.0069
##      5        1.2323             nan     0.1000    0.0069
##      6        1.2204             nan     0.1000    0.0061
##      7        1.2091             nan     0.1000    0.0053
##      8        1.2015             nan     0.1000    0.0032
##      9        1.1933             nan     0.1000    0.0042
##     10        1.1843             nan     0.1000    0.0044
##     20        1.1279             nan     0.1000    0.0016
##     40        1.0684             nan     0.1000    0.0004
##     60        1.0436             nan     0.1000    0.0000
##     80        1.0327             nan     0.1000    0.0000
##    100        1.0262             nan     0.1000    0.0003
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2898             nan     0.1000    0.0182
##      2        1.2602             nan     0.1000    0.0150
##      3        1.2308             nan     0.1000    0.0145
##      4        1.2088             nan     0.1000    0.0112
##      5        1.1897             nan     0.1000    0.0092
##      6        1.1708             nan     0.1000    0.0095
##      7        1.1560             nan     0.1000    0.0078
##      8        1.1427             nan     0.1000    0.0064
##      9        1.1313             nan     0.1000    0.0056
##     10        1.1191             nan     0.1000    0.0058
##     20        1.0406             nan     0.1000    0.0027
##     40        0.9172             nan     0.1000    0.0081
##     60        0.7486             nan     0.1000    0.0003
##     80        0.6529             nan     0.1000    0.0045
##    100        0.5919             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3008             nan     0.1000    0.0128
##      2        1.2794             nan     0.1000    0.0110
##      3        1.2616             nan     0.1000    0.0088
##      4        1.2460             nan     0.1000    0.0076
##      5        1.2317             nan     0.1000    0.0072
##      6        1.2194             nan     0.1000    0.0058
##      7        1.2090             nan     0.1000    0.0046
##      8        1.2018             nan     0.1000    0.0031
##      9        1.1917             nan     0.1000    0.0048
##     10        1.1837             nan     0.1000    0.0034
##     20        1.1267             nan     0.1000    0.0032
##     40        1.0703             nan     0.1000    0.0004
##     60        1.0454             nan     0.1000    0.0009
##     80        1.0330             nan     0.1000    0.0000
##    100        1.0277             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2892             nan     0.1000    0.0183
##      2        1.2583             nan     0.1000    0.0152
##      3        1.2323             nan     0.1000    0.0129
##      4        1.2062             nan     0.1000    0.0123
##      5        1.1852             nan     0.1000    0.0100
##      6        1.1681             nan     0.1000    0.0083
##      7        1.1525             nan     0.1000    0.0073
##      8        1.1393             nan     0.1000    0.0063
##      9        1.1281             nan     0.1000    0.0054
##     10        1.1182             nan     0.1000    0.0042
##     20        1.0464             nan     0.1000    0.0018
##     40        0.9246             nan     0.1000    0.0049
##     60        0.7896             nan     0.1000    0.0006
##     80        0.6787             nan     0.1000    0.0031
##    100        0.6164             nan     0.1000    0.0019
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2993             nan     0.1000    0.0134
##      2        1.2775             nan     0.1000    0.0109
##      3        1.2599             nan     0.1000    0.0084
##      4        1.2435             nan     0.1000    0.0080
##      5        1.2295             nan     0.1000    0.0070
##      6        1.2169             nan     0.1000    0.0060
##      7        1.2069             nan     0.1000    0.0049
##      8        1.1984             nan     0.1000    0.0041
##      9        1.1912             nan     0.1000    0.0031
##     10        1.1835             nan     0.1000    0.0035
##     20        1.1286             nan     0.1000    0.0013
##     40        1.0689             nan     0.1000    0.0005
##     60        1.0434             nan     0.1000    0.0005
##     80        1.0313             nan     0.1000    0.0002
##    100        1.0255             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2882             nan     0.1000    0.0181
##      2        1.2584             nan     0.1000    0.0150
##      3        1.2330             nan     0.1000    0.0128
##      4        1.2087             nan     0.1000    0.0118
##      5        1.1869             nan     0.1000    0.0105
##      6        1.1699             nan     0.1000    0.0083
##      7        1.1552             nan     0.1000    0.0071
##      8        1.1413             nan     0.1000    0.0065
##      9        1.1301             nan     0.1000    0.0055
##     10        1.1195             nan     0.1000    0.0050
##     20        1.0397             nan     0.1000    0.0021
##     40        0.9192             nan     0.1000    0.0007
##     60        0.7426             nan     0.1000    0.0037
##     80        0.6851             nan     0.1000    0.0001
##    100        0.6048             nan     0.1000    0.0037
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2998             nan     0.1000    0.0131
##      2        1.2777             nan     0.1000    0.0110
##      3        1.2603             nan     0.1000    0.0083
##      4        1.2439             nan     0.1000    0.0084
##      5        1.2299             nan     0.1000    0.0070
##      6        1.2182             nan     0.1000    0.0056
##      7        1.2068             nan     0.1000    0.0052
##      8        1.1970             nan     0.1000    0.0046
##      9        1.1895             nan     0.1000    0.0035
##     10        1.1819             nan     0.1000    0.0037
##     20        1.1238             nan     0.1000    0.0031
##     40        1.0651             nan     0.1000    0.0005
##     60        1.0386             nan     0.1000   -0.0002
##     80        1.0283             nan     0.1000    0.0001
##    100        1.0208             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2907             nan     0.1000    0.0187
##      2        1.2561             nan     0.1000    0.0167
##      3        1.2290             nan     0.1000    0.0136
##      4        1.2062             nan     0.1000    0.0114
##      5        1.1861             nan     0.1000    0.0101
##      6        1.1681             nan     0.1000    0.0085
##      7        1.1530             nan     0.1000    0.0070
##      8        1.1381             nan     0.1000    0.0071
##      9        1.1259             nan     0.1000    0.0059
##     10        1.1149             nan     0.1000    0.0050
##     20        1.0404             nan     0.1000    0.0030
##     40        0.9058             nan     0.1000    0.0008
##     60        0.7448             nan     0.1000    0.0036
##     80        0.6638             nan     0.1000    0.0021
##    100        0.5954             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3004             nan     0.1000    0.0130
##      2        1.2780             nan     0.1000    0.0109
##      3        1.2593             nan     0.1000    0.0085
##      4        1.2440             nan     0.1000    0.0072
##      5        1.2301             nan     0.1000    0.0072
##      6        1.2175             nan     0.1000    0.0063
##      7        1.2067             nan     0.1000    0.0051
##      8        1.1992             nan     0.1000    0.0033
##      9        1.1899             nan     0.1000    0.0044
##     10        1.1836             nan     0.1000    0.0029
##     20        1.1223             nan     0.1000    0.0027
##     40        1.0672             nan     0.1000    0.0007
##     60        1.0418             nan     0.1000    0.0002
##     80        1.0280             nan     0.1000    0.0001
##    100        1.0224             nan     0.1000   -0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2890             nan     0.1000    0.0185
##      2        1.2577             nan     0.1000    0.0156
##      3        1.2280             nan     0.1000    0.0144
##      4        1.2050             nan     0.1000    0.0113
##      5        1.1846             nan     0.1000    0.0098
##      6        1.1658             nan     0.1000    0.0092
##      7        1.1504             nan     0.1000    0.0073
##      8        1.1385             nan     0.1000    0.0061
##      9        1.1242             nan     0.1000    0.0072
##     10        1.1124             nan     0.1000    0.0056
##     20        1.0362             nan     0.1000    0.0028
##     40        0.8543             nan     0.1000    0.0094
##     60        0.7273             nan     0.1000    0.0045
##     80        0.6557             nan     0.1000    0.0004
##    100        0.5815             nan     0.1000    0.0024
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2899             nan     0.1000    0.0186
##      2        1.2602             nan     0.1000    0.0145
##      3        1.2298             nan     0.1000    0.0144
##      4        1.2074             nan     0.1000    0.0111
##      5        1.1848             nan     0.1000    0.0100
##      6        1.1678             nan     0.1000    0.0081
##      7        1.1531             nan     0.1000    0.0077
##      8        1.1407             nan     0.1000    0.0060
##      9        1.1284             nan     0.1000    0.0055
##     10        1.1178             nan     0.1000    0.0049
##     20        1.0380             nan     0.1000    0.0028
##     40        0.8939             nan     0.1000    0.0108
##     60        0.7363             nan     0.1000    0.0020
##     80        0.6517             nan     0.1000    0.0014
##    100        0.5788             nan     0.1000    0.0003
# Third candidate model: a random forest ("rf") predicting brand from every
# other column of Training, resampled according to fitControl.
plsFit3 <- train(
  brand ~ .,
  data = Training,
  method = "rf",
  trControl = fitControl,
  tuneLength = 5
)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
#This model was chosen for its accuracy of 93% and Kappa of 85%.
#It also takes considerably less time than other models.
#Lastly, GBT and RF offer similar results: their final results differ by only 50. It is hard to say which set of predictions will be closer to reality.


# Score the incomplete survey with each of the three fitted models.
predictions  <- predict(plsFit1, newdata = IncompleteSurvey)
predictions2 <- predict(plsFit2, newdata = IncompleteSurvey)
predictions3 <- predict(plsFit3, newdata = IncompleteSurvey)

# Plot the plsFit1 predictions for the incomplete survey.
plot(predictions)

# Predicted brand counts (Acer vs. Sony) from each model, in the order
# plsFit1, plsFit2, plsFit3.
summary(predictions)
## Acer Sony 
## 1894 3106
summary(predictions2)
## Acer Sony 
## 2107 2893
summary(predictions3)
## Acer Sony 
## 1910 3090
#Final predictions. Caveat: the predicted brand proportions are strikingly similar to those in the dataset. This is probably because the sample was stratified.

# Score the Testing data with plsFit1 and keep the result next to the
# observed brand so the two can be compared.
predictions_testing <- predict(plsFit1, newdata = Testing)
Testing[["predictions"]] <- as.factor(predictions_testing)

# Both columns are coerced to factors before they are compared.
Testing[["brand"]] <- as.factor(Testing[["brand"]])

# Accuracy and Kappa of the predictions against the observed brand.
postResample(pred = Testing[["predictions"]], obs = Testing[["brand"]])
##  Accuracy     Kappa 
## 0.9288601 0.8493955
#Accuracy and Kappa

# Cross-tabulate predicted brand against observed brand on the Testing data,
# then print only the count table.
KungFuSionMatrix <- confusionMatrix(
  data = Testing$predictions,
  reference = Testing$brand
)
KungFuSionMatrix[["table"]]
##           Reference
## Prediction Acer Sony
##       Acer  858   98
##       Sony   78 1440
#2298 out of 2474 test observations (858 Acer + 1440 Sony) have been correctly identified