library(dplyr)
data <- read.csv("./dataSaoPaulo2018-master/data/event-0005.csv")
head(data)
## jet constituent eta phi px py pz
## 1 0 0 -0.456679 -1.769878 -0.178798 -0.886218 -0.427374
## 2 0 1 -0.451003 -1.770757 -0.180011 -0.888201 -0.422723
## 3 0 2 -0.455031 -1.657863 -0.078671 -0.901284 -0.426026
## 4 0 3 -0.452284 -1.651613 -0.073121 -0.902811 -0.423775
## 5 0 4 -0.446974 -1.658059 -0.079116 -0.904343 -0.419409
## 6 0 5 -0.452834 -1.767828 -0.177271 -0.888036 -0.424225
## p vjet
## 1 788.65130 NaN
## 2 234.61834 NaN
## 3 122.20752 NaN
## 4 99.43224 NaN
## 5 91.63662 NaN
## 6 54.52448 NaN
data <- data %>% select(-vjet)
par(mfrow = c(3,3), mar = c(2,1,2,1))
# plot the sorted values of each column to eyeball ranges and outliers
for (i in 1:ncol(data)){
  plot(sort(data[,i]), main = names(data)[i])
}
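Another quick way to look at each column is a grid of histograms; the sketch below reuses the same 3 x 3 layout and is not part of the original output:
par(mfrow = c(3,3), mar = c(2,1,2,1))
for (i in 1:ncol(data)){
  hist(data[,i], main = names(data)[i], xlab = "")
}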
data_selected <- data %>% select(-jet, -p, -constituent)
names(data_selected)
## [1] "eta" "phi" "px" "py" "pz"
library(vegan)
par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
pca <- rda(scale(data_selected))
biplot(pca, scaling = -1,
       xlab = paste(round(eigenvals(pca)[1]/sum(eigenvals(pca)), 4)*100, "%"),
       ylab = paste(round(eigenvals(pca)[2]/sum(eigenvals(pca)), 4)*100, "%"))
text(pca, display = "species", scaling = -1, font = 2)
points(pca, display = "sites", scaling = -1, pch = 21, bg = "dodgerblue4") # reuse the fitted pca; pch 21 so bg fills the points
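To see how much of the total variance the first two axes actually capture, the eigenvalues of the same pca object can be plotted directly (a sketch, not part of the original output):
ev <- eigenvals(pca)
barplot(ev / sum(ev), ylab = "proportion of variance explained", main = "PCA screeplot")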
# pairs plot: GGally::ggpairs(data)
kMean_all <- kmeans(data_selected, centers = 3)
names(kMean_all)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
pairs(data_selected, col = kMean_all$cluster)
plot(kMean_all$cluster, data$jet)
table(kMean_all$cluster, data$jet)
##
## 0 1 2
## 1 0 0 68
## 2 0 91 0
## 3 46 0 0
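The choice of centers = 3 is guided here by knowing there are three jets; without that knowledge, a common heuristic is to compare the total within-cluster sum of squares across a range of k (a sketch; the 1:6 range and nstart = 10 are assumptions, not from the original analysis):
set.seed(123)
wss <- sapply(1:6, function(k) kmeans(data_selected, centers = k, nstart = 10)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "number of clusters k", ylab = "total within-cluster SS")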
set.seed(123)
tr <- sample(nrow(data_selected), size = 140)
train <- data_selected[tr,]
jet_train <- data$jet[tr]
test <- data_selected[-tr, ]
jet_test <- data$jet[-tr]
kMeantr <- kmeans(train, centers = 3)
pairs(train, col = kMeantr$cluster)
table(kMeantr$cluster, jet_train)
## jet_train
## 0 1 2
## 1 29 0 0
## 2 0 0 46
## 3 0 65 0
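K-means cluster labels (1, 2, 3) are arbitrary, so they need not match the jet ids (0, 1, 2); one way to compare them directly is to relabel each cluster with its majority jet (a sketch, not from the original):
tab <- table(kMeantr$cluster, jet_train)
cluster_to_jet <- colnames(tab)[apply(tab, 1, which.max)] # majority jet in each cluster
mean(cluster_to_jet[kMeantr$cluster] == as.character(jet_train)) # agreement after relabelling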
So far we have used k-means, which is an unsupervised approach. Let's see how a supervised approach works.
library(party)
train_ctree <- ctree(formula = as.factor(jet_train) ~ ., data = train)
plot(train_ctree)
par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$phi, train$pz, col = as.factor(jet_train), pch = 19)
abline(v = -1.317)
abline(h = -0.091)
resp <- train_ctree@predict_response()
real <- jet_train
table(resp, real)
## real
## resp 0 1 2
## 0 29 0 0
## 1 0 65 0
## 2 0 0 46
par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$phi, train$pz, col = as.factor(jet_train), pch = 21)
abline(v = -1.317)
abline(h = -0.114)
points(test$phi, test$pz, col = as.factor(jet_test), pch = 19)
legend("topright", c("train", "test"), pch = c(21, 19))
predict_to_test <- predict(train_ctree, newdata = test, type = "response")
table(predict_to_test, jet_test)
## jet_test
## predict_to_test 0 1 2
## 0 17 0 0
## 1 0 26 0
## 2 0 0 22
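The test-set accuracy of the tree can also be computed directly from these predictions (a sketch, not part of the original output):
mean(as.character(predict_to_test) == as.character(jet_test))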
library(party)
names(train)
## [1] "eta" "phi" "px" "py" "pz"
train_ctree_ugly <- ctree(formula = as.factor(jet_train) ~ px + eta, data = train)
plot(train_ctree_ugly)
par(mfrow = c(1,1), mar = c(5, 4, 1, 1))
plot(train$eta, train$px, col = as.factor(jet_train), pch = 21)
abline(v = c(0.211, -0.547))
abline(h = -0.208)
points(test$eta, test$px, col = as.factor(jet_test), pch = 19)
legend("topright", c("train", "test"), pch = c(21, 19))
resp_ugly <- train_ctree_ugly@predict_response()
real <- jet_train
table(resp_ugly, real)
## real
## resp_ugly 0 1 2
## 0 20 1 3
## 1 0 64 0
## 2 9 0 43
predict_to_test_ugly <- predict(train_ctree_ugly, newdata = test, type = "response")
table(predict_to_test_ugly, jet_test)
## jet_test
## predict_to_test_ugly 0 1 2
## 0 11 0 1
## 1 1 26 0
## 2 5 0 21
# create the confusion matrix
cm <- as.matrix(table(Predicted = predict_to_test_ugly, Actual = jet_test))
cm
## Actual
## Predicted 0 1 2
## 0 11 0 1
## 1 1 26 0
## 2 5 0 21
n <- sum(cm) # number of instances
nc <- nrow(cm) # number of classes
diag <- diag(cm) # number of correctly classified instances per class
rowsums <- apply(cm, 1, sum) # number of predictions per class
colsums <- apply(cm, 2, sum) # number of actual instances per class
p <- rowsums / n # distribution of instances over the predicted classes
q <- colsums / n # distribution of instances over the actual classes
accuracy <- sum(diag) / n
recall <- diag / colsums # with cm oriented Predicted x Actual, diag / colsums gives per-class recall
accuracy
## [1] 0.8923077
recall
## 0 1 2
## 0.6470588 1.0000000 0.9545455
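With the matrix oriented Predicted x Actual, per-class precision (and from it an F1 score) would use the row sums instead (a sketch, not part of the original output):
precision <- diag / rowsums # TP / all predictions of that class
f1 <- 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)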
library(e1071)
train <- cbind(jet_train, train)
svmfit <- svm(as.factor(jet_train) ~ ., data = train, kernel = "linear")
par(mfrow = c(2, 2)) ## 2 x 2 panel layout for the four pairwise SVM plots
plot(svmfit, data = train, formula = phi ~ pz)
plot(svmfit, data = train, formula = eta ~ pz)
plot(svmfit, data = train, formula = px ~ pz)
plot(svmfit, data = train, formula = py ~ pz)
### predictions
pred <- predict(svmfit, test)
table(pred, jet_test)
## jet_test
## pred 0 1 2
## 0 17 0 0
## 1 0 26 0
## 2 0 0 22
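The linear kernel already separates this test set perfectly, so there is little to gain here, but on harder data one would typically tune the cost and kernel parameters; a sketch using e1071::tune (the radial kernel and the parameter grid are assumptions, not from the original analysis):
set.seed(123)
tuned <- tune(svm, as.factor(jet_train) ~ ., data = train,
              kernel = "radial",
              ranges = list(cost = c(0.1, 1, 10), gamma = c(0.1, 1)))
summary(tuned)
table(predict(tuned$best.model, test), jet_test)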
svmfit3 <- svm(as.factor(jet_train) ~ eta + px, data = train, kernel = "linear")
plot(svmfit3, data = train, formula = eta ~ px)
pred3 <- predict(svmfit3, test)
table(pred3, jet_test)
## jet_test
## pred3 0 1 2
## 0 9 0 1
## 1 3 26 0
## 2 5 0 21
svmfit2 <- svm(as.factor(jet_train) ~ phi + pz, data = train, kernel = "linear")
plot(svmfit2, data = train, formula = phi ~ pz)
pred2 <- predict(svmfit2, test)
table(pred2, jet_test)
## jet_test
## pred2 0 1 2
## 0 17 0 0
## 1 0 26 0
## 2 0 0 22
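To summarise the three SVM fits on the test set in one place, the accuracies can be computed straight from the predictions (a sketch, not part of the original output):
sapply(list(all_features = pred, eta_px = pred3, phi_pz = pred2),
       function(p) mean(as.character(p) == as.character(jet_test)))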