2.2 精度が低いので反省する

どうやら、偉い人がやれば94～98%くらいの正答率が出るようです。
いくら「なんちゃって」といっても低いので、考え直します。

コードの再掲

# read data
setwd("C:/Users/k-harada/Desktop/kdd/digit")

traindata <- read.csv("ORG/train.csv")
testdata <- read.csv("ORG/test.csv")
train_label <- traindata$label[1:28000]
valid_label <- traindata$label[28001:42000]

# do not resize here
train_mat <- as.matrix(traindata[1:28000, -1])

meanMat <- matrix(0, nrow = 28 * 28, ncol = 49)
for (i in seq(7)) {
    for (j in seq(7)) {
        tempmat <- matrix(0, nrow = 28, ncol = 28)
        tempmat[((4 * i - 3):(4 * i)), ((4 * j - 3):(4 * j))] <- 1/16
        meanMat[, 7 * (j - 1) + i] <- as.numeric(tempmat)
    }
}

answer_mat <- matrix(0, nrow = 28000, ncol = 10)
for (i in seq(10)) {
    answer_mat[train_label == (i - 1), i] <- 1
}

# add layer

# learn weights
W1 <- t(meanMat)
intercept1 <- rep(-127, length = 49)

W2 <- matrix(0, nrow = 10, ncol = 49)
intercept2 <- rep(0, length = 10)

# learn rate
eta1 <- 0.001
eta2 <- 0.001

for (i in seq(28000)) {
    # feed forward
    output1 <- 1/(1 + exp(-1 * (W1 %*% train_mat[i, ] + intercept1)))
    output2 <- 1/(1 + exp(-1 * (W2 %*% output1 + intercept2)))
    # back propagation
    W2 <- W2 + eta2 * (answer_mat[i, ] - output2) %*% t(output1)
    intercept2 <- intercept2 + eta2 * (answer_mat[i, ] - output2)
    W1 <- W1 + eta1 * ((output1 * (1 - output1)) * (t(W2) %*% (answer_mat[i, 
        ] - output2))) %*% t(train_mat[i, ])
    intercept1 <- intercept1 + eta1 * (output1 * (1 - output1)) * (t(W2) %*% 
        (answer_mat[i, ] - output2))
}

出力の計算

output_mat1 <- 1/(1 + exp(-1 * (train_mat %*% t(W1) + matrix(1, nrow = 28000, 
    ncol = 1) %*% matrix(intercept1, ncol = 49))))
output_mat2 <- 1/(1 + exp(-1 * (output_mat1 %*% t(W2) + matrix(1, nrow = 28000, 
    ncol = 1) %*% matrix(intercept2, ncol = 10))))
trainres <- max.col(as.matrix(output_mat2)) - 1

Confusion Matrix

table(trainres, train_label)

##         train_label
## trainres    0    1    2    3    4    5    6    7    8    9
##        0 2345    9  265  232   22  934  130   12  221   32
##        1    4 3022  236   71   68  238   81   86  287   48
##        2    6   17 1179   21    2   37    7    5   23    1
##        3   25   27  362 2294    4  544    5   26  548   27
##        4    9    2   90   13 2186  187   21   48  222 1015
##        5    0    0    1   18    4  190    0    1   24    6
##        6  284   14  582   29  195  130 2535   13  114   30
##        7   11   12   63  130  267   68    0 2706  222 1428
##        8   37    4   43  100    1  179    0   20 1019   40
##        9    0    0    3    1    7   14    0    7   14  138

Categorization Accuracy

mean(trainres == train_label)

## [1] 0.6291

反省のための確認タイム

さて、ここで反省します。
入力層から中間層の間に問題があるとみてよいでしょうから、そこを見てみます。

intercept1[1:10]

##  [1] -127 -127 -127 -127 -127 -127 -127 -127 -127 -127

image(matrix(W1[1, ], nrow = 28))

plot of chunk unnamed-chunk-6

・・・まるで学習していない。。。

反省

結論から言うと、初期値が悪いようです。

Back Propagationの式をよく見ると(output1*(1-output1))という項があります。
今の初期値ではoutput1には\( 1/1+\exp(127) \)とか\( 1/1+\exp(-127) \)が入って、この値はすごく0になります。つまり動きません。
これはシグモイド関数（ってか有界な関数全般）の弱点なので、ケアしてあげる必要があるようです。

2.3 やり直し

ということで、やり直します。
やることはデータを255で割って範囲を小さくすることと、学習パラメータをちょっと大きくすることと、あと検証データの精度も計って、ループを増やしました。
けっこういっぱいしれっとやったね！

# EDIT read data
setwd("C:/Users/k-harada/Desktop/kdd/digit")
traindata <- read.csv("ORG/train.csv")
testdata <- read.csv("ORG/test.csv")
train_label <- traindata$label[1:28000]
valid_label <- traindata$label[28001:42000]

# do not resize her
train_mat <- as.matrix(traindata[1:28000, -1])/255
valid_mat <- as.matrix(traindata[28001:42000, -1])/255


meanMat <- matrix(0, nrow = 28 * 28, ncol = 49)
for (i in seq(7)) {
    for (j in seq(7)) {
        tempmat <- matrix(0, nrow = 28, ncol = 28)
        tempmat[((4 * i - 3):(4 * i)), ((4 * j - 3):(4 * j))] <- 1/16
        meanMat[, 7 * (j - 1) + i] <- as.numeric(tempmat)
    }
}

answer_mat <- matrix(0, nrow = 28000, ncol = 10)
for (i in seq(10)) {
    answer_mat[train_label == (i - 1), i] <- 1
}
# three layers

# learn weights
W1 <- t(meanMat)
intercept1 <- rep(-0.5, length = 49)

W2 <- matrix(0, nrow = 10, ncol = 49)
intercept2 <- rep(0, length = 10)

# learn rate
eta1 <- 0.01
eta2 <- 0.01

nloop <- 5
train_err <- rep(0, length = nloop)
valid_err <- rep(0, length = nloop)
for (loop in seq(nloop)) {
    for (i in seq(28000)) {
        # feed forward
        output1 <- 1/(1 + exp(-1 * (W1 %*% train_mat[i, ] + intercept1)))
        output2 <- 1/(1 + exp(-1 * (W2 %*% output1 + intercept2)))
        # back propagation
        W2 <- W2 + eta2 * (answer_mat[i, ] - output2) %*% t(output1)
        intercept2 <- intercept2 + eta2 * (answer_mat[i, ] - output2)
        W1 <- W1 + eta1 * ((output1 * (1 - output1)) * (t(W2) %*% (answer_mat[i, 
            ] - output2))) %*% t(train_mat[i, ])
        intercept1 <- intercept1 + eta1 * (output1 * (1 - output1)) * (t(W2) %*% 
            (answer_mat[i, ] - output2))
    }

    output_mat1 <- 1/(1 + exp(-1 * (train_mat %*% t(W1) + matrix(1, nrow = 28000, 
        ncol = 1) %*% matrix(intercept1, ncol = 49))))
    output_mat2 <- 1/(1 + exp(-1 * (output_mat1 %*% t(W2) + matrix(1, nrow = 28000, 
        ncol = 1) %*% matrix(intercept2, ncol = 10))))

    output_mat1_v <- 1/(1 + exp(-1 * (valid_mat %*% t(W1) + matrix(1, nrow = 14000, 
        ncol = 1) %*% matrix(intercept1, ncol = 49))))
    output_mat2_v <- 1/(1 + exp(-1 * (output_mat1_v %*% t(W2) + matrix(1, nrow = 14000, 
        ncol = 1) %*% matrix(intercept2, ncol = 10))))


    trainres <- max.col(as.matrix(output_mat2)) - 1
    validres <- max.col(as.matrix(output_mat2_v)) - 1

    train_err[loop] <- mean(trainres == train_label)
    valid_err[loop] <- mean(validres == valid_label)

    print(paste("LOOP=", loop, "now"))
}

## [1] "LOOP= 1 now"
## [1] "LOOP= 2 now"
## [1] "LOOP= 3 now"
## [1] "LOOP= 4 now"
## [1] "LOOP= 5 now"

table(trainres, train_label)

##         train_label
## trainres    0    1    2    3    4    5    6    7    8    9
##        0 2658    0    9    2    3   22   17    9    8   14
##        1    0 3032   12    9   11    5    2   19   16    7
##        2    5    8 2656   38   10    8    2   40   10    1
##        3    5   13   26 2725    1   66    0   21   31   41
##        4    5    2   20    2 2623   12    6   17    3   53
##        5    6    7    1   39    0 2313   15    2   12   11
##        6   15    3   22    8   16   23 2716    2    8    1
##        7    0    9   26   13    5    2    1 2775    4   23
##        8   21   31   49   62   17   54   20    8 2593   35
##        9    6    2    3   11   70   16    0   31    9 2579

table(validres, valid_label)

##         valid_label
## validres    0    1    2    3    4    5    6    7    8    9
##        0 1376    0   10    3    2   13   15    6    1   13
##        1    0 1540    0    4    7    4    2    6   19    7
##        2    2    4 1266   23    3    6    5   24   10    2
##        3    3    6   17 1338    1   48    0    3    9   23
##        4    4    2    9    0 1239   12    5   14    3   32
##        5    4    0    4   25    1 1136    7    2   10    6
##        6    8    1   13    5   10   11 1311    0    4    1
##        7    0    2   11   10    0    1    0 1405    4   24
##        8   14   22   21   31   11   31   13    4 1303   21
##        9    0    0    2    3   42   12    0   13    6 1294

# plot
plot(train_err, type = "l", col = 1, ylim = c(0.85, 1), xlab = "", ylab = "")
par(new = T)
plot(valid_err, type = "l", col = 2, ylim = c(0.85, 1), xlab = "LOOP", ylab = "Accuracy")

plot of chunk unnamed-chunk-10

ちなみにループ増やすと検証でも96%～97%くらいまで到達しました。