# Input files and the column names applied to both of them when they are read.
train.filename <- "train.1"
test.filename <- "test.1"
# The first column is the response; the remaining four are the predictors.
variables <- c(
  "target",
  "v1", "v2", "v3", "v4"
)
# functions
# Overall accuracy of a confusion matrix.
#
# cm: confusion matrix (table, matrix or data frame of counts) whose rows and
#     columns list the classes in the same order, so that the diagonal holds
#     the correctly classified counts.
# Returns the fraction of all observations that lie on the diagonal. The
# result is NaN when `cm` contains no observations (0 / 0).
accuracy <- function(cm) {
  cm <- as.matrix(cm)
  # Diagonal cells are the correct predictions; every other cell is an error,
  # so "correct + wrong" is simply the grand total of the matrix.
  sum(diag(cm)) / sum(cm)
}
# Precision of one class.
#
# cm:    confusion matrix; row `class` is summed as the denominator (the
#        script builds its tables with predictions on the rows).
# class: index (or name) of the class within `cm`.
# Returns the diagonal cell of `class` divided by the total of its row.
precision <- function(cm, class) {
  row_total <- sum(cm[class, ])
  cm[class, class] / row_total
}
# Recall of one class.
#
# cm:    confusion matrix; column `class` is summed as the denominator (the
#        script builds its tables with actual labels on the columns).
# class: index (or name) of the class within `cm`.
# Returns the diagonal cell of `class` divided by the total of its column.
recall <- function(cm, class) {
  column_total <- sum(cm[, class])
  cm[class, class] / column_total
}
# F-measure (harmonic mean of precision and recall) of one class.
#
# cm:    confusion matrix, as accepted by recall() and precision().
# class: index (or name) of the class within `cm`.
# Returns 2 * r * p / (r + p). When recall and precision are both exactly
# zero (the class has no correct predictions) the result is 0 instead of the
# NaN that 0 / 0 would produce. If either input is itself NaN, NaN is
# propagated unchanged.
fmeasure <- function(cm, class) {
  r <- recall(cm, class)
  p <- precision(cm, class)
  # Both ratios are non-negative, so their sum is exactly 0 only when both
  # are 0; isTRUE() keeps NA/NaN inputs out of this branch.
  if (isTRUE(r + p == 0)) {
    return(0)
  }
  2 * r * p / (r + p)
}
# Per-class evaluation of a confusion matrix.
#
# cm: confusion matrix, as accepted by recall(), precision() and fmeasure().
# Returns a data frame with one row per row of `cm` and the columns `recall`,
# `precision` and `f`; row names are copied from `cm`.
evaluate <- function(cm) {
  # seq_len() yields an empty sequence for a 0-row matrix, where 1:nrow(cm)
  # would iterate over c(1, 0).
  classes <- seq_len(nrow(cm))
  # Build each column in one pass instead of growing the data frame with
  # rbind() inside a loop.
  per_class <- function(metric) {
    vapply(classes, function(i) metric(cm, i), numeric(1))
  }
  result <- data.frame(
    recall = per_class(recall),
    precision = per_class(precision),
    f = per_class(fmeasure)
  )
  rownames(result) <- rownames(cm)
  result
}
# Load the training set, naming its columns after `variables`.
train <- read.table(train.filename, col.names = variables)
# Binarise the target at 0.5. Both levels are declared explicitly so the
# factor always carries FALSE and TRUE, even if one class happens to be absent.
train$target <- factor(train$target > 0.5, levels = c(FALSE, TRUE))
summary(train)
## target v1 v2 v3 v4
## FALSE:1089 Min. : 0.0 Min. : -4.6 Min. :-0.7524 Min. : 8.16
## TRUE :2000 1st Qu.: 16.5 1st Qu.: 35.5 1st Qu.:-0.1567 1st Qu.: 94.21
## Median : 23.5 Median : 86.8 Median : 0.1263 Median :122.51
## Mean : 32.3 Mean :113.3 Mean : 0.0686 Mean :115.67
## 3rd Qu.: 37.9 3rd Qu.:164.4 3rd Qu.: 0.2460 3rd Qu.:145.35
## Max. :297.1 Max. :581.1 Max. : 0.7171 Max. :180.00
library(GGally)
# Pairwise plot of all columns, coloured by class.
# NOTE(review): `colour`/`alpha` are passed through `...`; this looks like the
# pre-1.0 GGally interface (newer releases take `mapping = aes(...)`) --
# confirm against the installed GGally version.
ggpairs(train, colour = "target", alpha = 0.3)
# Load the held-out test set, naming its columns after `variables`.
test <- read.table(test.filename, col.names = variables)
# Binarise the target at 0.5. Both levels are declared explicitly so that
# confusion tables built from this column always have a FALSE and a TRUE
# entry, even if the test set contains a single class.
test$target <- factor(test$target > 0.5, levels = c(FALSE, TRUE))
summary(test)
## target v1 v2 v3 v4
## FALSE:2000 Min. : 0.0 Min. : -7.7 Min. :-0.8016 Min. : 5.65
## TRUE :2000 1st Qu.: 13.2 1st Qu.: 26.3 1st Qu.:-0.1596 1st Qu.: 75.58
## Median : 19.8 Median : 54.4 Median : 0.1402 Median :117.30
## Mean : 28.1 Mean : 94.6 Mean : 0.0778 Mean :108.87
## 3rd Qu.: 32.3 3rd Qu.:139.5 3rd Qu.: 0.2725 3rd Qu.:142.19
## Max. :244.5 Max. :546.0 Max. : 0.7261 Max. :180.00
library(GGally)
# Pairwise plot of all columns, coloured by class.
# NOTE(review): `colour`/`alpha` are passed through `...`; this looks like the
# pre-1.0 GGally interface (newer releases take `mapping = aes(...)`) --
# confirm against the installed GGally version.
ggpairs(test, colour = "target", alpha = 0.3)
# Logistic regression of the target on every other column of `train`,
# followed by stepwise selection in both directions.
formula <- target ~ .
train.glm <- glm(formula, data = train, family = binomial(link = "logit"))
train.glm <- step(train.glm, direction = "both")
## Start: AIC=781.6
## target ~ v1 + v2 + v3 + v4
##
## Df Deviance AIC
## <none> 772 782
## - v3 1 786 794
## - v4 1 890 898
## - v1 1 917 925
## - v2 1 1949 1957
summary(train.glm)
##
## Call:
## glm(formula = target ~ v1 + v2 + v3 + v4, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.051 -0.130 0.000 0.038 2.899
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.56382 0.50820 -18.82 < 2e-16 ***
## v1 0.06697 0.00687 9.76 < 2e-16 ***
## v2 0.09581 0.00510 18.79 < 2e-16 ***
## v3 -1.29062 0.34017 -3.79 0.00015 ***
## v4 0.02893 0.00293 9.86 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4009.56 on 3088 degrees of freedom
## Residual deviance: 771.63 on 3084 degrees of freedom
## AIC: 781.6
##
## Number of Fisher Scoring iterations: 9
##
# Classify the test set by thresholding the predicted response at 0.5.
# Both levels are declared explicitly: if every prediction fell on one side
# of the threshold, as.factor() would drop the other level and the confusion
# table below would no longer be square.
test$predict.glm <- factor(
  predict(train.glm, newdata = test, type = "response") > 0.5,
  levels = c(FALSE, TRUE)
)
summary(test$predict.glm)
## FALSE TRUE
## 1996 2004
# Confusion table: predictions on the rows, actual labels on the columns.
test.cm.glm <- table(predicted = test$predict.glm, actual = test$target)
print(test.cm.glm)
## actual
## predicted FALSE TRUE
## FALSE 1908 88
## TRUE 92 1912
accuracy(test.cm.glm)
## [1] 0.955
evaluate(test.cm.glm)
## recall precision f
## FALSE 0.954 0.9559 0.955
## TRUE 0.956 0.9541 0.955
library(GGally)
# Flag each test row as correctly or incorrectly classified. The labels are
# compared as character because `==` on two factors stops with an error when
# their level sets differ.
test$predict.glm.correct <- as.factor(
  as.character(test$predict.glm) == as.character(test$target)
)
# NOTE(review): `colour`/`alpha` are passed through `...`; this looks like the
# pre-1.0 GGally interface (newer releases take `mapping = aes(...)`) --
# confirm against the installed GGally version.
ggpairs(test, columns = variables, alpha = 0.4, colour = "predict.glm.correct")
library(kernlab)
# Support vector machine on the same formula, with ksvm()'s default settings
# (the printed model below reports an RBF kernel and C = 1).
# NOTE(review): the automatic sigma estimation reported below (sigest) appears
# to subsample the data, so sigma and the downstream numbers may differ
# between runs unless a seed is set first -- confirm and consider set.seed().
formula <- target ~ .
train.svm <- ksvm(formula, data = train)
print(train.svm)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.409305645588518
##
## Number of Support Vectors : 413
##
## Objective Function Value : -302.4
## Training error : 0.027193
# TODO: tune hyper parameters
test$predict.svm <- predict(train.svm, newdata = test)
summary(test$predict.svm)
## FALSE TRUE
## 1990 2010
# Confusion table: predictions on the rows, actual labels on the columns.
test.cm.svm <- table(predicted = test$predict.svm, actual = test$target)
print(test.cm.svm)
## actual
## predicted FALSE TRUE
## FALSE 1935 55
## TRUE 65 1945
accuracy(test.cm.svm)
## [1] 0.97
evaluate(test.cm.svm)
## recall precision f
## FALSE 0.9675 0.9724 0.9699
## TRUE 0.9725 0.9677 0.9701
library(GGally)
# Flag each test row as correctly or incorrectly classified. The labels are
# compared as character because `==` on two factors stops with an error when
# their level sets differ.
test$predict.svm.correct <- as.factor(
  as.character(test$target) == as.character(test$predict.svm)
)
# NOTE(review): `colour`/`alpha` are passed through `...`; this looks like the
# pre-1.0 GGally interface (newer releases take `mapping = aes(...)`) --
# confirm against the installed GGally version.
ggpairs(test, columns = variables, alpha = 0.4, colour = "predict.svm.correct")