This is a practice of k-nearest neighbours.
x’ <- x value of new test datapoint
train <- training data (X,Y)
distance <- train[order((train$X-x’))] #calculate distance and sort
cfreq <- count(distance[1:k], vars= ‘Y’) #count frequency of each label
y’ <- cfreq[cfreq$freq == max]$Y #the most frequent label is our prediction
#install.packages(RnavGraphImageData)
library(RnavGraphImageData)
data(digits)
library(plyr)
library(magrittr)
plot_digit <- function(x){
m <- matrix(x, nrow=16, ncol=16, byrow = TRUE)
m <- apply(m, 1, rev)
m <- t(m)
image(m, col = gray(0:255/255))
}
apply(digits[, seq(1, 5500, 1100)], 2, plot_digit)
## NULL
plot_digit(digits[, 5502])
plot_digit(digits[, 6603])
apply(digits[, seq(7701, 11000, 1100)], 2, plot_digit)
## NULL
get_digits <- function(select_digs, size){
index <- numeric()
select_digs[select_digs==0] <- 10
start <- (select_digs-1)*1100+1
for(ii in 1:length(start)){
index <- c(index, seq(start[ii], length.out = size))
}
return(digits[,index])
}
select_digs <- c(0,8)
size <- 100
my_train <- get_digits(select_digs, size)
euc_dist <- function(a, b) (a-b)^2 %>% sum %>% sqrt
new_digit <- my_train[1]
distances <- laply(my_train, euc_dist, new_digit)
head(distances)
## [1] 0.000 1805.859 1456.509 1547.326 2067.671 1897.901
get_knn <- function(k, my_train, new_digit){
distances <- laply(my_train, euc_dist, new_digit)
indices <- sort(distances, index.return = TRUE)$ix
return(indices[1:k])
}
my_labels <- rep(c(0, 8), each= 100)
get_label <- function(kindices){
labels <- my_labels[kindices]
freq <- table(labels)
return(names(which.max(freq)))
}
my_knn <- function(k, my_train, my_labels, test_ip){
kindices <- get_knn(k, my_train, test_ip)
return(get_label(kindices))
}
k <- 5
prediction <- laply(my_train, my_knn, my_train = my_train, k = k, my_labels = my_labels)
paste("my_knn got wrong ", sum(prediction[1:100] != my_labels[1:100]), " 0's.")
## [1] "my_knn got wrong 0 0's."
paste("my_knn got wrong ", sum(prediction[101:200] != my_labels[101:200]), " 8's.")
## [1] "my_knn got wrong 0 8's."
my_test <- digits[,c(10000:10099, 7900:7999)]
test_labels <- rep(c(0, 8), each =100)
prediction <- laply(my_test, my_knn, my_train = my_train, k = k, my_labels = my_labels)
paste("my_knn got wrong ", sum(prediction[1:100] != test_labels[1:100]), " 0's.")
## [1] "my_knn got wrong 1 0's."
paste("my_knn got wrong ", sum(prediction[101:200] != test_labels[101:200]), " 8's.")
## [1] "my_knn got wrong 4 8's."
Apply my_knn to one hundred 5’s.
my_test5 <- digits[,4900:4999]
prediction <- laply(my_test5, my_knn, my_train = my_train, k = k, my_labels = my_labels)
table(prediction)
## prediction
## 0 8
## 51 49