We explored the dataset and removed vjet, which is NaN throughout this event file.

library(dplyr)
data <- read.csv("./dataSaoPaulo2018-master/data/event-0005.csv")
head(data)
##   jet constituent       eta       phi        px        py        pz
## 1   0           0 -0.456679 -1.769878 -0.178798 -0.886218 -0.427374
## 2   0           1 -0.451003 -1.770757 -0.180011 -0.888201 -0.422723
## 3   0           2 -0.455031 -1.657863 -0.078671 -0.901284 -0.426026
## 4   0           3 -0.452284 -1.651613 -0.073121 -0.902811 -0.423775
## 5   0           4 -0.446974 -1.658059 -0.079116 -0.904343 -0.419409
## 6   0           5 -0.452834 -1.767828 -0.177271 -0.888036 -0.424225
##           p vjet
## 1 788.65130  NaN
## 2 234.61834  NaN
## 3 122.20752  NaN
## 4  99.43224  NaN
## 5  91.63662  NaN
## 6  54.52448  NaN
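
A quick sanity check before dropping the column (a small sketch; assumes vjet is NaN throughout, as the head() output suggests):

all(is.nan(data$vjet))  # TRUE if every entry is NaN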
data <- data %>% select(-vjet)

We plotted the sorted values of each variable to check for outliers…

par(mfrow = c(3,3), mar = c(2,1,2,1))
for (i in 1:ncol(data)) {
  plot(sort(data[,i]), main = names(data)[i])
}
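
An equivalent single-panel view (a sketch, not part of the original workflow) uses boxplots of the scaled variables:

par(mfrow = c(1,1))
boxplot(scale(data), las = 2)  # outliers appear as points beyond the whiskers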

… and then deleted p, constituent, and the response variable, jet.

data_selected <- data %>% select(-jet, -p, -constituent)
names(data_selected)
## [1] "eta" "phi" "px"  "py"  "pz"

We ran a PCA to explore the overall relationships among the remaining variables.

library(vegan)
par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
pca <- rda(scale(data_selected)) 
biplot(pca, scaling = -1,
       xlab = paste0("PC1 (", round(100 * eigenvals(pca)[1] / sum(eigenvals(pca)), 2), "%)"),
       ylab = paste0("PC2 (", round(100 * eigenvals(pca)[2] / sum(eigenvals(pca)), 2), "%)"))
text(pca, display = "species", scaling = -1, font = 2)
points(pca, display = "sites", scaling = -1, pch = 19, col = "dodgerblue4")
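
The variance proportions used in the axis labels can also be printed directly (a quick check with vegan's eigenvals()):

round(eigenvals(pca) / sum(eigenvals(pca)), 4)  # proportion of variance per PC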

We then ran a k-means clustering and inspected the clusters in a pairs() plot.

#GGally::ggpairs(data)
kMean_all <- kmeans(data_selected, centers = 3)  # three clusters, one per jet
names(kMean_all)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
pairs(data_selected, col = kMean_all$cluster)

The clustering on the full dataset makes no mistakes: each k-means cluster maps onto exactly one jet (the cluster numbers themselves are arbitrary).

plot(kMean_all$cluster, data$jet)

table(kMean_all$cluster, data$jet)
##    
##      0  1  2
##   1  0  0 68
##   2  0 91  0
##   3 46  0  0
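
We fixed centers = 3 because the event contains three jets; an elbow plot (a quick sketch, not in the original analysis) would show whether k = 3 is also where the total within-cluster sum of squares levels off:

# Total within-cluster sum of squares for k = 1..8
wss <- sapply(1:8, function(k) kmeans(data_selected, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")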

Let’s see if splitting the data into a train and a test set gives the same result.

set.seed(123)
tr <- sample(nrow(data_selected), size = 140)  # indices of the training rows

train <- data_selected[tr, ]   # training predictors
jet_train <- data$jet[tr]      # training labels

test <- data_selected[-tr, ]   # test predictors
jet_test <- data$jet[-tr]      # test labels

The training set looks the same and is also classified perfectly.

kMeantr <- kmeans(train, centers = 3)
pairs(train, col = kMeantr$cluster)

table(kMeantr$cluster, jet_train)
##    jet_train
##      0  1  2
##   1 29  0  0
##   2  0  0 46
##   3  0 65  0

So far we used k-means, which is an unsupervised approach. Let’s see how a supervised approach performs.

Classification trees

library(party)
train_ctree <- ctree(formula = as.factor(jet_train) ~ ., data = train)  # the response is picked up from the global environment
plot(train_ctree)

Let’s plot the two splitting variables and their critical values from the tree:

par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$phi, train$pz, col = as.factor(jet_train), pch = 19)
abline(v = -1.317)  # phi split from the tree
abline(h = -0.091)  # pz split from the tree

With the training data, the tree also achieves a 100% correct classification rate:

resp <- predict(train_ctree)  # fitted responses on the training data
real <- jet_train
table(resp, real)
##     real
## resp  0  1  2
##    0 29  0  0
##    1  0 65  0
##    2  0  0 46

We first check graphically whether the prediction is likely to carry over, by overlaying the TEST data on the training plot:

par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$phi, train$pz, col = as.factor(jet_train), pch = 21)
abline(v = -1.317)
abline(h = -0.114)

points(test$phi, test$pz, col = as.factor(jet_test), pch = 19)
legend("topright", c("train", "test"), pch = c(21, 19))

We then predict on the test set:

predict_to_test <- predict(train_ctree, newdata = test, type = "response") 
table(predict_to_test, jet_test)
##                jet_test
## predict_to_test  0  1  2
##               0 17  0  0
##               1  0 26  0
##               2  0  0 22

Is this too good? Let’s deliberately make an ugly model, using only px and eta.

names(train)
## [1] "eta" "phi" "px"  "py"  "pz"
train_ctree_ugly <- ctree(formula = as.factor(jet_train) ~ px + eta, data = train)
plot(train_ctree_ugly)

par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$eta, train$px, col = as.factor(jet_train), pch = 21)
abline(v = c(0.211, -0.547))  # eta splits from the tree
abline(h = -0.208)            # px split from the tree
points(test$eta, test$px, col = as.factor(jet_test), pch = 19)
legend("topright", c("train", "test"), pch = c(21, 19))

resp_ugly <- predict(train_ctree_ugly)  # fitted responses on the training data
real <- jet_train
table(resp_ugly, real)
##          real
## resp_ugly  0  1  2
##         0 20  1  3
##         1  0 64  0
##         2  9  0 43
predict_to_test_ugly <- predict(train_ctree_ugly, newdata = test, type = "response") 
table(predict_to_test_ugly, jet_test)
##                     jet_test
## predict_to_test_ugly  0  1  2
##                    0 11  0  1
##                    1  1 26  0
##                    2  5  0 21

Finally, we evaluate the performance of the models numerically.

# create the confusion matrix (predictions in rows, actual classes in columns)
cm <- as.matrix(table(Predicted = predict_to_test_ugly, Actual = jet_test))
cm
##          Actual
## Predicted  0  1  2
##         0 11  0  1
##         1  1 26  0
##         2  5  0 21
n <- sum(cm)                  # number of instances
nc <- nrow(cm)                # number of classes
diag <- diag(cm)              # correctly classified instances per class
rowsums <- apply(cm, 1, sum)  # number of predictions per class
colsums <- apply(cm, 2, sum)  # number of actual instances per class
p <- rowsums / n              # distribution over the predicted classes
q <- colsums / n              # distribution over the actual classes

accuracy <- sum(diag) / n
recall <- diag / colsums      # per-class recall: correct / actual instances

accuracy
## [1] 0.8923077
recall
##         0         1         2 
## 0.6470588 1.0000000 0.9545455
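
With predictions in rows, per-class precision uses the row sums instead; a minimal sketch (not in the original write-up), together with the F1 score:

precision <- diag / rowsums                          # correct / predicted instances per class
f1 <- 2 * precision * recall / (precision + recall)  # harmonic mean of precision and recall
precision
f1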

We also fit an SVM model with a linear kernel.

library(e1071)
train <- cbind(jet_train, train)  # add the response to the training data frame
svmfit <- svm(as.factor(jet_train) ~ ., data = train, kernel = "linear")

par(mfrow = c(2, 2))  # 2 x 2 panel layout for the four pairwise views
plot(svmfit, data = train, formula = phi ~ pz)

plot(svmfit, data = train, formula = eta ~ pz)

plot(svmfit, data = train, formula = px ~ pz)

plot(svmfit, data = train, formula = py ~ pz)

Predictions on the test set:

pred <- predict(svmfit, test)
table(pred, jet_test)
##     jet_test
## pred  0  1  2
##    0 17  0  0
##    1  0 26  0
##    2  0  0 22
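
The default cost = 1 already classifies the test set perfectly. If it did not, e1071's tune() could cross-validate over the cost parameter (a sketch under that assumption; the range of cost values is illustrative):

set.seed(123)
tuned <- tune(svm, as.factor(jet_train) ~ ., data = train,
              kernel = "linear",
              ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(tuned)  # cross-validated error for each cost value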

We also fit an SVM with the two known “bad” variables, eta and px.

svmfit3 <- svm(as.factor(jet_train) ~ eta + px, data = train, kernel = "linear")
plot(svmfit3, data = train, formula = eta ~ px)

pred3 <- predict(svmfit3, test)
table(pred3, jet_test)
##      jet_test
## pred3  0  1  2
##     0  9  0  1
##     1  3 26  0
##     2  5  0 21

And an SVM with the two known “good” variables, phi and pz.

svmfit2 <- svm(as.factor(jet_train) ~ phi + pz, data = train, kernel = "linear")
plot(svmfit2, data = train, formula = phi ~ pz)

pred2 <- predict(svmfit2, test)
table(pred2, jet_test)
##      jet_test
## pred2  0  1  2
##     0 17  0  0
##     1  0 26  0
##     2  0  0 22
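
To wrap up, the test-set accuracy of every model can be collected in one place (a small sketch; it compares the predicted factor levels against the numeric labels, which R coerces to characters):

c(tree      = mean(predict_to_test == jet_test),
  tree_ugly = mean(predict_to_test_ugly == jet_test),
  svm_full  = mean(pred == jet_test),
  svm_bad   = mean(pred3 == jet_test),
  svm_good  = mean(pred2 == jet_test))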