Astroparticle (http://www.csie.ntu.edu.tw/~cjlin/papers/guide/data/)

Prepare

# Input files of the Astroparticle data set and the column names assigned to
# them when they are read.
train.filename <- "train.1"
test.filename <- "test.1"
variables <- c("target", paste0("v", 1:4))

# functions
# Overall accuracy of a confusion matrix: the share of all counts that lie on
# the main diagonal (predicted class equals actual class).
#
# cm: confusion matrix or two-way table of counts whose rows and columns list
#     the classes in the same order.
# Returns a single number; NaN when cm holds no observations at all.
accuracy <- function(cm) {
    # diag()/sum() replace the former 1:nrow(cm) loop, which indexed out of
    # bounds when cm had zero rows.
    sum(diag(cm))/sum(cm)
}

# Precision for one class: the diagonal count of that class divided by the
# total of the class's row (rows hold the predictions in the tables built in
# this file).
#
# cm:    confusion matrix / two-way table of counts.
# class: index (or dimname) of the class to score.
precision <- function(cm, class) {
    hits <- cm[class, class]
    row.total <- sum(cm[class, ])
    hits/row.total
}

# Recall for one class: the diagonal count of that class divided by the total
# of the class's column (columns hold the actual labels in the tables built in
# this file).
#
# cm:    confusion matrix / two-way table of counts.
# class: index (or dimname) of the class to score.
recall <- function(cm, class) {
    hits <- cm[class, class]
    column.total <- sum(cm[, class])
    hits/column.total
}

# F-measure for one class: the harmonic mean of recall() and precision() for
# that class.
#
# cm:    confusion matrix / two-way table of counts.
# class: index (or dimname) of the class to score.
fmeasure <- function(cm, class) {
    prec <- precision(cm, class)
    rec <- recall(cm, class)
    harmonic <- 2 * rec * prec/(rec + prec)
    harmonic
}

# Per-class quality metrics for a confusion matrix.
#
# cm: confusion matrix / two-way table of counts with one row per class.
# Returns a data.frame with one row per class (row names taken from
# rownames(cm)) and the columns `recall`, `precision` and `f`. A zero-row cm
# yields a zero-row data.frame.
evaluate <- function(cm) {
    # seq_len() keeps the index empty for a zero-row matrix, where 1:nrow(cm)
    # would have produced c(1, 0).
    classes <- seq_len(nrow(cm))
    # Apply one metric to every class and collect a plain numeric vector, so
    # the result is built in one go instead of rbind()-ing row by row.
    per.class <- function(metric) {
        vapply(classes, function(i) metric(cm, i), numeric(1))
    }
    data.frame(recall = per.class(recall), precision = per.class(precision),
        f = per.class(fmeasure), row.names = rownames(cm))
}

Training Set

# Read the training data, then recode the numeric target as a factor that is
# TRUE where the raw value exceeds 0.5.
train <- read.table(train.filename, col.names = variables)
train[["target"]] <- factor(train[["target"]] > 0.5)
summary(train)

##    target           v1              v2              v3                v4        
##  FALSE:1089   Min.   :  0.0   Min.   : -4.6   Min.   :-0.7524   Min.   :  8.16  
##  TRUE :2000   1st Qu.: 16.5   1st Qu.: 35.5   1st Qu.:-0.1567   1st Qu.: 94.21  
##               Median : 23.5   Median : 86.8   Median : 0.1263   Median :122.51  
##               Mean   : 32.3   Mean   :113.3   Mean   : 0.0686   Mean   :115.67  
##               3rd Qu.: 37.9   3rd Qu.:164.4   3rd Qu.: 0.2460   3rd Qu.:145.35  
##               Max.   :297.1   Max.   :581.1   Max.   : 0.7171   Max.   :180.00

library(GGally)
# Pairs plot of the training data, coloured by the target class.
# NOTE(review): `colour` and `alpha` are handed directly to ggpairs(); newer
# GGally releases appear to expect aesthetics via `mapping = aes(...)` --
# confirm against the installed GGally version.
ggpairs(train, colour = "target", alpha = 0.3)

plot of chunk unnamed-chunk-5

Test Set

# Read the test data, then recode the numeric target as a factor that is TRUE
# where the raw value exceeds 0.5 (same recoding as the training set).
test <- read.table(test.filename, col.names = variables)
test[["target"]] <- factor(test[["target"]] > 0.5)
summary(test)

##    target           v1              v2              v3                v4        
##  FALSE:2000   Min.   :  0.0   Min.   : -7.7   Min.   :-0.8016   Min.   :  5.65  
##  TRUE :2000   1st Qu.: 13.2   1st Qu.: 26.3   1st Qu.:-0.1596   1st Qu.: 75.58  
##               Median : 19.8   Median : 54.4   Median : 0.1402   Median :117.30  
##               Mean   : 28.1   Mean   : 94.6   Mean   : 0.0778   Mean   :108.87  
##               3rd Qu.: 32.3   3rd Qu.:139.5   3rd Qu.: 0.2725   3rd Qu.:142.19  
##               Max.   :244.5   Max.   :546.0   Max.   : 0.7261   Max.   :180.00

library(GGally)
# Pairs plot of the test data, coloured by the target class.
# NOTE(review): `colour` and `alpha` are handed directly to ggpairs(); newer
# GGally releases appear to expect aesthetics via `mapping = aes(...)` --
# confirm against the installed GGally version.
ggpairs(test, colour = "target", alpha = 0.3)

plot of chunk unnamed-chunk-7

Logistic Regression

Learn

# Model the target against every other column of the training data.
# NOTE(review): the variable name `formula` masks stats::formula() for value
# lookups in this session; it is reused by the SVM section further down.
formula <- target ~ .
# Logistic regression (binomial family, logit link) on the full formula.
train.glm <- glm(formula, train, family = binomial(link = "logit"))
# Stepwise selection in both directions; the trace printed below shows the
# AIC of the current model and of each single-term deletion.
train.glm <- step(train.glm, direction = "both")

## Start:  AIC=781.6
## target ~ v1 + v2 + v3 + v4
##

##        Df Deviance  AIC
## <none>         772  782
## - v3    1      786  794
## - v4    1      890  898
## - v1    1      917  925
## - v2    1     1949 1957

# Coefficient table and deviance summary of the model kept by step().
summary(train.glm)

## 
## Call:
## glm(formula = target ~ v1 + v2 + v3 + v4, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.051  -0.130   0.000   0.038   2.899  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.56382    0.50820  -18.82  < 2e-16 ***
## v1           0.06697    0.00687    9.76  < 2e-16 ***
## v2           0.09581    0.00510   18.79  < 2e-16 ***
## v3          -1.29062    0.34017   -3.79  0.00015 ***
## v4           0.02893    0.00293    9.86  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4009.56  on 3088  degrees of freedom
## Residual deviance:  771.63  on 3084  degrees of freedom
## AIC: 781.6
## 
## Number of Fisher Scoring iterations: 9
##

Predict

# Classify the test set with the fitted logistic model: a predicted
# probability above 0.5 counts as TRUE.
# Both levels are declared explicitly so the factor keeps a FALSE and a TRUE
# level even if every prediction lands on one side of the threshold; otherwise
# the confusion matrix would not be square and comparing this factor with
# test$target would fail on mismatched level sets.
test$predict.glm <- factor(predict(train.glm, test, type = "response") > 0.5,
    levels = c(FALSE, TRUE))
summary(test$predict.glm)

## FALSE  TRUE 
##  1996  2004

Evaluate

Confusion Matrix

# Cross-tabulate the logistic predictions (rows) against the true labels
# (columns), keep the table for the metric functions, and print it.
test.cm.glm <- table(predicted = test$predict.glm, actual = test$target)
print(test.cm.glm)

##          actual
## predicted FALSE TRUE
##     FALSE  1908   88
##     TRUE     92 1912

Detail

# Share of test rows the logistic model classified correctly.
accuracy(test.cm.glm)

## [1] 0.955

# Recall, precision and F-measure per class for the logistic model.
evaluate(test.cm.glm)

##       recall precision     f
## FALSE  0.954    0.9559 0.955
## TRUE   0.956    0.9541 0.955

Error Plot

library(GGally)
# Flag every test row by whether the logistic prediction matched the label,
# then colour the pairs plot of the original columns by that flag.
test$predict.glm.correct <- factor(test$predict.glm == test$target)
ggpairs(test, columns = variables, alpha = 0.4, colour = "predict.glm.correct")

plot of chunk unnamed-chunk-12

Support Vector Machine (ksvm)

Learn

library(kernlab)
# Same model formula as the logistic fit: target against every other column.
formula <- target ~ .
# Fit with ksvm()'s defaults and print the model; the output below reports a
# C-svc classifier (C = 1) with a Gaussian RBF kernel.
# NOTE(review): sigma comes from ksvm's automatic estimation (sigest), which is
# presumably sample-based -- consider a set.seed() call beforehand if the run
# must be reproducible; confirm in the kernlab documentation.
print(train.svm <- ksvm(formula, train))

## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.409305645588518 
## 
## Number of Support Vectors : 413 
## 
## Objective Function Value : -302.4 
## Training error : 0.027193

# TODO: tune hyper parameters

Predict

# Class predictions of the SVM for the test set, stored next to the data.
test$predict.svm <- predict(train.svm, test)
# Count of predictions per class.
summary(test$predict.svm)

## FALSE  TRUE 
##  1990  2010

Evaluate

Confusion Matrix

# Cross-tabulate the SVM predictions (rows) against the true labels (columns),
# keep the table for the metric functions, and print it.
test.cm.svm <- table(predicted = test$predict.svm, actual = test$target)
print(test.cm.svm)

##          actual
## predicted FALSE TRUE
##     FALSE  1935   55
##     TRUE     65 1945

Detail

# Share of test rows the SVM classified correctly.
accuracy(test.cm.svm)

## [1] 0.97

# Recall, precision and F-measure per class for the SVM.
evaluate(test.cm.svm)

##       recall precision      f
## FALSE 0.9675    0.9724 0.9699
## TRUE  0.9725    0.9677 0.9701

Error Plot

library(GGally)
# Flag every test row by whether the SVM prediction matched the label, then
# colour the pairs plot of the original columns by that flag.
test$predict.svm.correct <- factor(test$target == test$predict.svm)
ggpairs(test, columns = variables, alpha = 0.4, colour = "predict.svm.correct")

plot of chunk unnamed-chunk-17