We explored the dataset and removed vjet, which is NaN throughout this event file.

library(dplyr)
data <- read.csv("./dataSaoPaulo2018-master/data/event-0005.csv")
head(data)
##   jet constituent       eta       phi        px        py        pz
## 1   0           0 -0.456679 -1.769878 -0.178798 -0.886218 -0.427374
## 2   0           1 -0.451003 -1.770757 -0.180011 -0.888201 -0.422723
## 3   0           2 -0.455031 -1.657863 -0.078671 -0.901284 -0.426026
## 4   0           3 -0.452284 -1.651613 -0.073121 -0.902811 -0.423775
## 5   0           4 -0.446974 -1.658059 -0.079116 -0.904343 -0.419409
## 6   0           5 -0.452834 -1.767828 -0.177271 -0.888036 -0.424225
##           p vjet
## 1 788.65130  NaN
## 2 234.61834  NaN
## 3 122.20752  NaN
## 4  99.43224  NaN
## 5  91.63662  NaN
## 6  54.52448  NaN
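
A quick sanity check before dropping the column (a small sketch; assumes vjet is NaN throughout, as the head() output suggests):

all(is.nan(data$vjet))  # TRUE if every entry is NaN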
data <- data %>% select(-vjet)

We plotted the sorted values of each variable to check for outliers…

par(mfrow = c(3,3), mar = c(2,1,2,1))
for (i in 1:ncol(data)) {
  plot(sort(data[,i]), main = names(data)[i])
}
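
An equivalent single-panel view (a sketch, not part of the original workflow) uses boxplots of the scaled variables:

par(mfrow = c(1,1))
boxplot(scale(data), las = 2)  # outliers appear as points beyond the whiskers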

… and then deleted p, constituent, and the response variable, jet.

data_selected <- data %>% select(-jet, -p, -constituent)
names(data_selected)
## [1] "eta" "phi" "px"  "py"  "pz"

We ran a PCA to explore the overall relationships among the remaining variables.

library(vegan)
par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
pca <- rda(scale(data_selected)) 
biplot(pca, scaling = -1,
       xlab = paste0("PC1 (", round(100 * eigenvals(pca)[1] / sum(eigenvals(pca)), 2), "%)"),
       ylab = paste0("PC2 (", round(100 * eigenvals(pca)[2] / sum(eigenvals(pca)), 2), "%)"))
text(pca, display = "species", scaling = -1, font = 2)
points(pca, display = "sites", scaling = -1, pch = 19, col = "dodgerblue4")
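
The variance proportions used in the axis labels can also be printed directly (a quick check with vegan's eigenvals()):

round(eigenvals(pca) / sum(eigenvals(pca)), 4)  # proportion of variance per PC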

We then ran a k-means clustering and inspected the clusters in a pairs() plot.

#GGally::ggpairs(data)
kMean_all <- kmeans(data_selected, centers = 3)  # three clusters, one per jet
names(kMean_all)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
pairs(data_selected, col = kMean_all$cluster)

The clustering on the full dataset makes no mistakes: each k-means cluster maps onto exactly one jet (the cluster numbers themselves are arbitrary).

plot(kMean_all$cluster, data$jet)

table(kMean_all$cluster, data$jet)
##    
##      0  1  2
##   1  0  0 68
##   2  0 91  0
##   3 46  0  0
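
We fixed centers = 3 because the event contains three jets; an elbow plot (a quick sketch, not in the original analysis) would show whether k = 3 is also where the total within-cluster sum of squares levels off:

# Total within-cluster sum of squares for k = 1..8
wss <- sapply(1:8, function(k) kmeans(data_selected, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")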

Let’s see if splitting the data into a train and a test set gives the same result.

set.seed(123)
tr <- sample(nrow(data_selected), size = 140)  # indices of the training rows

train <- data_selected[tr, ]   # training predictors
jet_train <- data$jet[tr]      # training labels

test <- data_selected[-tr, ]   # test predictors
jet_test <- data$jet[-tr]      # test labels

The training set looks the same and is also classified perfectly.

kMeantr <- kmeans(train, centers = 3)
pairs(train, col = kMeantr$cluster)

table(kMeantr$cluster, jet_train)
##    jet_train
##      0  1  2
##   1 29  0  0
##   2  0  0 46
##   3  0 65  0

So far we used k-means, which is an unsupervised approach. Let’s see how a supervised approach performs.

Classification trees

library(party)
train_ctree <- ctree(formula = as.factor(jet_train) ~ ., data = train)  # the response is picked up from the global environment
plot(train_ctree)

Let’s plot the two splitting variables and their critical values from the tree:

par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$phi, train$pz, col = as.factor(jet_train), pch = 19)
abline(v = -1.317)  # phi split from the tree
abline(h = -0.091)  # pz split from the tree

With the training data, the tree also achieves a 100% correct classification rate:

resp <- predict(train_ctree)  # fitted responses on the training data
real <- jet_train
table(resp, real)
##     real
## resp  0  1  2
##    0 29  0  0
##    1  0 65  0
##    2  0  0 46

We first check graphically whether the prediction is likely to carry over, by overlaying the TEST data on the training plot:

par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$phi, train$pz, col = as.factor(jet_train), pch = 21)
abline(v = -1.317)
abline(h = -0.114)

points(test$phi, test$pz, col = as.factor(jet_test), pch = 19)
legend("topright", c("train", "test"), pch = c(21, 19))

We then predict on the test set:

predict_to_test <- predict(train_ctree, newdata = test, type = "response") 
table(predict_to_test, jet_test)
##                jet_test
## predict_to_test  0  1  2
##               0 17  0  0
##               1  0 26  0
##               2  0  0 22

Is this too good? Let’s deliberately make an ugly model, using only px and eta.

names(train)
## [1] "eta" "phi" "px"  "py"  "pz"
train_ctree_ugly <- ctree(formula = as.factor(jet_train) ~ px + eta, data = train)
plot(train_ctree_ugly)

par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$eta, train$px, col = as.factor(jet_train), pch = 21)
abline(v = c(0.211, -0.547))  # eta splits from the tree
abline(h = -0.208)            # px split from the tree
points(test$eta, test$px, col = as.factor(jet_test), pch = 19)
legend("topright", c("train", "test"), pch = c(21, 19))

resp_ugly <- predict(train_ctree_ugly)  # fitted responses on the training data
real <- jet_train
table(resp_ugly, real)
##          real
## resp_ugly  0  1  2
##         0 20  1  3
##         1  0 64  0
##         2  9  0 43
predict_to_test_ugly <- predict(train_ctree_ugly, newdata = test, type = "response") 
table(predict_to_test_ugly, jet_test)
##                     jet_test
## predict_to_test_ugly  0  1  2
##                    0 11  0  1
##                    1  1 26  0
##                    2  5  0 21

Finally, we evaluate the performance of the models numerically.

# create the confusion matrix (predictions in rows, actual classes in columns)
cm <- as.matrix(table(Predicted = predict_to_test_ugly, Actual = jet_test))
cm
##          Actual
## Predicted  0  1  2
##         0 11  0  1
##         1  1 26  0
##         2  5  0 21
n <- sum(cm)                  # number of instances
nc <- nrow(cm)                # number of classes
diag <- diag(cm)              # correctly classified instances per class
rowsums <- apply(cm, 1, sum)  # number of predictions per class
colsums <- apply(cm, 2, sum)  # number of actual instances per class
p <- rowsums / n              # distribution over the predicted classes
q <- colsums / n              # distribution over the actual classes

accuracy <- sum(diag) / n
recall <- diag / colsums      # per-class recall: correct / actual instances

accuracy
## [1] 0.8923077
recall
##         0         1         2 
## 0.6470588 1.0000000 0.9545455
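
With predictions in rows, per-class precision uses the row sums instead; a minimal sketch (not in the original write-up), together with the F1 score:

precision <- diag / rowsums                          # correct / predicted instances per class
f1 <- 2 * precision * recall / (precision + recall)  # harmonic mean of precision and recall
precision
f1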

We also fit an SVM model with a linear kernel.

library(e1071)
train <- cbind(jet_train, train)  # add the response to the training data frame
svmfit <- svm(as.factor(jet_train) ~ ., data = train, kernel = "linear")

par(mfrow = c(2, 2))  # 2 x 2 panel layout for the four pairwise views
plot(svmfit, data = train, formula = phi ~ pz)

plot(svmfit, data = train, formula = eta ~ pz)

plot(svmfit, data = train, formula = px ~ pz)

plot(svmfit, data = train, formula = py ~ pz)

Predictions on the test set:

pred <- predict(svmfit, test)
table(pred, jet_test)
##     jet_test
## pred  0  1  2
##    0 17  0  0
##    1  0 26  0
##    2  0  0 22
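
The default cost = 1 already classifies the test set perfectly. If it did not, e1071's tune() could cross-validate over the cost parameter (a sketch under that assumption; the range of cost values is illustrative):

set.seed(123)
tuned <- tune(svm, as.factor(jet_train) ~ ., data = train,
              kernel = "linear",
              ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(tuned)  # cross-validated error for each cost value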

We also fit an SVM with the two known “bad” variables, eta and px.

svmfit3 <- svm(as.factor(jet_train) ~ eta + px, data = train, kernel = "linear")
plot(svmfit3, data = train, formula = eta ~ px)

pred3 <- predict(svmfit3, test)
table(pred3, jet_test)
##      jet_test
## pred3  0  1  2
##     0  9  0  1
##     1  3 26  0
##     2  5  0 21

And an SVM with the two known “good” variables, phi and pz.

svmfit2 <- svm(as.factor(jet_train) ~ phi + pz, data = train, kernel = "linear")
plot(svmfit2, data = train, formula = phi ~ pz)

pred2 <- predict(svmfit2, test)
table(pred2, jet_test)
##      jet_test
## pred2  0  1  2
##     0 17  0  0
##     1  0 26  0
##     2  0  0 22
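
To wrap up, the test-set accuracy of every model can be collected in one place (a small sketch; it compares the predicted factor levels against the numeric labels, which R coerces to characters):

c(tree      = mean(predict_to_test == jet_test),
  tree_ugly = mean(predict_to_test_ugly == jet_test),
  svm_full  = mean(pred == jet_test),
  svm_bad   = mean(pred3 == jet_test),
  svm_good  = mean(pred2 == jet_test))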