1 Goal


The goal of this tutorial is to see how different algorithms behave on a 2D canvas. As an example we use the Japanese flag, because it is symmetrical. First we try to predict a missing part of the flag from the rest of it; then we use a standard random train-test split to learn how each algorithm "sees" the whole flag.


2 Create the Japanese flag


# First we load the libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart)
library(caret)
## Loading required package: lattice
library(e1071)



# Fix the RNG seed so the simulated flag -- and every split, model and
# accuracy figure derived from it -- is reproducible between runs.
set.seed(42)

# Draw 50k points uniformly at random inside the 3 x 2 rectangle of the flag
n_points <- 50000
x <- runif(n_points, min = 0, max = 3)
y <- runif(n_points, min = 0, max = 2)

# Flag colour palette. The values are named so that ggplot2 matches each
# colour to its label by name rather than by position, which keeps the
# colours right even when a plotted subset contains a single class.
japanPalette <- c(red = "red", white = "white")

# Flag dataframe: one row per simulated point
japan_flag <- data.frame(x = x, y = y)

# Now we add the colour: points inside the disc centred at (1.5, 1) with
# squared radius 3/10 are red, everything else is white
japan_flag <- mutate(
  japan_flag,
  flag_colour = ifelse((x - 1.5)^2 + (y - 1)^2 > 3 / 10, "white", "red")
)
ggplot(japan_flag) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  scale_colour_manual(values = japanPalette)


3 Predicting half of the flag


# We train on the left part of the flag (x < 1.8, i.e. the left 60% of its
# width) and predict the remaining right-hand strip.
# A logical mask built from the data frame's own column is used instead of
# `-which(...)`: negating an empty index would silently select zero rows,
# and the mask does not depend on a free-standing `x` vector staying in sync.
in_train <- japan_flag$x < 1.8
trainSet <- japan_flag[in_train, ]
testSet <- japan_flag[!in_train, ]

# TrainSet plot. We see that it contains the left part of the flag
ggplot(trainSet) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  xlim(0, 3) +
  scale_colour_manual(values = japanPalette)

# TestSet plot. It contains the right part of the flag
ggplot(testSet) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  xlim(0, 3) +
  scale_colour_manual(values = japanPalette)

# Decision tree prediction.
# rpart() fits a single classification tree, not a random forest, so the
# comments and plot title say so. The object names (rf_model, testSetrf) are
# kept unchanged.
rf_model <- rpart(flag_colour ~ .,
                  data = trainSet,
                  control = rpart.control(maxdepth = 25, cp = 0.0001))

# Class probabilities for the test strip, one column per colour
pred <- as.data.frame(predict(rf_model, newdata = testSet, type = "prob"))
# Turn the probabilities into a colour label. Columns are addressed by name
# so the result does not depend on their order; ties go to "white".
pred$flag_colour <- ifelse(pred[["red"]] > pred[["white"]], "red", "white")

# Now we plot the decision tree prediction: copy the test set and replace
# the true colours with the predicted ones
testSetrf <- testSet
testSetrf$flag_colour <- pred$flag_colour

# Now we join the two datasets (predicted right part + true left part)
Full_data <- rbind(testSetrf, trainSet)
ggplot(Full_data) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("Decision tree prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)

# k-nearest-neighbours prediction, using the 5 closest training points
knn_model <- knn3(flag_colour ~ ., data = trainSet, k = 5)

# Per-class scores for the test strip; the first column is compared against
# the second to decide between "red" and "white"
knn_pred <- as.data.frame(predict(knn_model, newdata = testSet))
knn_pred[["flag_colour"]] <- ifelse(
  knn_pred[, 1] > knn_pred[, 2],
  "red",
  "white"
)

# Copy of the test set carrying the predicted colours instead of the true ones
testSetknn <- testSet
testSetknn[["flag_colour"]] <- knn_pred[["flag_colour"]]

# Stack the predicted right part on top of the true left part and draw it
Full_data <- rbind(testSetknn, trainSet)
ggplot(Full_data) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("Knn Prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)

# SVM prediction.
# The classification type is spelled out in full rather than relying on
# partial matching of "C".
svm_model <- svm(formula = flag_colour ~ ., data = trainSet,
                 type = "C-classification", fitted = TRUE)

# Predicted colour labels for the test strip, stored as character
svm_pred <- data.frame(
  flag_colour = as.character(predict(svm_model, newdata = testSet)),
  stringsAsFactors = FALSE
)

# Now we plot the SVM prediction. As for the other models, the predictions
# go into a copy of the test set so that the true labels in `testSet` are
# not overwritten.
testSetsvm <- testSet
testSetsvm$flag_colour <- svm_pred[, 1]

# Now we join the two datasets (predicted right part + true left part)
Full_data <- rbind(testSetsvm, trainSet)
ggplot(Full_data) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("SVM prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)

# Now we turn the values to numerical to use a linear model.
# as.factor() orders the levels alphabetically, so red becomes 1 and
# white becomes 2.
japan_flag_lm <- japan_flag
japan_flag_lm$flag_colour <- as.numeric(as.factor(japan_flag_lm$flag_colour))
head(japan_flag_lm)
##          x         y flag_colour
## 1 2.011354 0.4786877           2
## 2 2.172299 1.1496088           2
## 3 2.100427 0.6012050           2
## 4 2.960621 0.0686992           2
## 5 1.341001 0.3019827           2
## 6 2.537556 1.3033955           2
# Same left/right split as before (x < 1.8 trains, the rest is predicted).
# A logical mask on the data frame's own column replaces `-which(...)`,
# which would select zero rows if no point matched the condition.
in_train_lm <- japan_flag_lm$x < 1.8
trainSet <- japan_flag_lm[in_train_lm, ]
testSet <- japan_flag_lm[!in_train_lm, ]

# We create the linear model on the numeric colour code (1 = red, 2 = white)
model_lm <- lm(flag_colour ~ ., data = trainSet)

# Predict the test strip and snap each fitted value to the nearest class code.
# A linear model can extrapolate below 0.5 or above 2.5, which would round to
# codes that are not a colour, so the rounded values are clamped to [1, 2].
lm_scores <- predict(model_lm, newdata = testSet)
lm_codes <- pmin(pmax(round(lm_scores), 1), 2)

# The factor levels are fixed to c(1, 2) explicitly: the relabelling to
# c("red", "white") further down is positional, so it must not depend on
# which codes happen to occur in the predictions.
prediction_lm <- data.frame(
  flag_colour = factor(lm_codes, levels = c(1, 2)),
  row.names = names(lm_scores)
)
head(prediction_lm)
##    flag_colour
## 1            1
## 2            1
## 3            1
## 4            1
## 6            1
## 11           1
# Copy of the test set carrying the predicted codes; the training labels get
# the same explicit levels so both parts combine into one consistent factor
testSet_lm <- testSet
testSet_lm$flag_colour <- prediction_lm$flag_colour
trainSet$flag_colour <- factor(trainSet$flag_colour, levels = c(1, 2))

# Now we join the two datasets and translate the codes back to colour names
Full_data_lm <- rbind(testSet_lm, trainSet)
levels(Full_data_lm$flag_colour) <- c("red", "white")

ggplot(Full_data_lm) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("LM prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)


4 How algorithms see the Japanese flag


# Random split this time, so that both sets cover the whole flag:
# 20% of the points are used for training, the remaining 80% for testing
train_index <- createDataPartition(
  japan_flag$flag_colour,
  p = 0.2,
  list = FALSE
)
trainSet <- japan_flag[train_index, ]
testSet <- japan_flag[-train_index, ]

# Training points
ggplot(trainSet) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  xlim(0, 3) +
  scale_colour_manual(values = japanPalette)

# Test points
ggplot(testSet) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  xlim(0, 3) +
  scale_colour_manual(values = japanPalette)

# Decision tree model.
# rpart() fits a single classification tree, not a random forest, so the
# comments, printed label and plot title say so. The object names
# (rf_model, testSetrf) are kept unchanged.
rf_model <- rpart(flag_colour ~ .,
                  data = trainSet,
                  control = rpart.control(maxdepth = 7, cp = 0.0001))

# Class probabilities for the test set, one column per colour
pred <- as.data.frame(predict(rf_model, newdata = testSet, type = "prob"))
# Turn the probabilities into a colour label. Columns are addressed by name
# so the result does not depend on their order; ties go to "white".
pred$flag_colour <- ifelse(pred[["red"]] > pred[["white"]], "red", "white")

print("Decision tree accuracy is")
## [1] "Decision tree accuracy is"
# Accuracy in percent: 100 minus the share of misclassified test points
100 - round(mean(pred$flag_colour != testSet$flag_colour) * 100, 2)
## [1] 98.33
# Now we plot the prediction: copy the test set and replace the true colours
# with the predicted ones
testSetrf <- testSet
testSetrf$flag_colour <- pred$flag_colour

# Only the predicted test points are drawn here (the training points are
# left out on purpose)
#Full_data <- rbind(testSetrf, trainSet)
Full_data <- testSetrf

ggplot(Full_data) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("Decision tree prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)

# k-nearest-neighbours model, using the 10 closest training points
knn_model <- knn3(flag_colour ~ ., data = trainSet, k = 10)

# Per-class scores for the test set; the first column is compared against
# the second to decide between "red" and "white"
knn_pred <- as.data.frame(predict(knn_model, newdata = testSet))
knn_pred[["flag_colour"]] <- ifelse(
  knn_pred[, 1] > knn_pred[, 2],
  "red",
  "white"
)

# Copy of the test set carrying the predicted colours instead of the true ones
testSetknn <- testSet
testSetknn[["flag_colour"]] <- knn_pred[["flag_colour"]]


print("Knn accuracy is")
## [1] "Knn accuracy is"
100 - round(length(which(knn_pred[["flag_colour"]] != testSet[["flag_colour"]])) / length(testSet[["flag_colour"]]) * 100, 2)
## [1] 99.54
# Only the predicted test points are drawn (training points are left out)
#Full_data <- rbind(testSetknn, trainSet)
Full_data_knn <- testSetknn
ggplot(Full_data_knn) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("Knn Prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)

# Now with svm.
# The classification type is spelled out in full rather than relying on
# partial matching of "C".
svm_model <- svm(formula = flag_colour ~ ., data = trainSet,
                 type = "C-classification", fitted = TRUE)

# Predicted colour labels for the test set. The column is named explicitly so
# that `svm_pred$flag_colour` exists for the accuracy computation below.
svm_pred <- data.frame(
  flag_colour = as.character(predict(svm_model, newdata = testSet)),
  stringsAsFactors = FALSE
)

# Now we plot the prediction: copy the test set and replace the true colours
# with the predicted ones
testSetsvm <- testSet
testSetsvm$flag_colour <- svm_pred$flag_colour

print("Svm accuracy is")
## [1] "Svm accuracy is"
# Accuracy in percent: 100 minus the share of misclassified test points.
# NOTE(review): the recorded output below predates this fix. The previous
# expression read a non-existent column (`svm_pred$flag_colour` was NULL), so
# it reported 100 whatever the model predicted; re-run to refresh the value.
100 - round(mean(svm_pred$flag_colour != testSet$flag_colour) * 100, 2)
## [1] 100
# Only the predicted test points are drawn (training points are left out)
#Full_data <- rbind(testSet, trainSet)
Full_data <- testSetsvm

ggplot(Full_data) +
  geom_point(aes(x = x, y = y, color = flag_colour), size = 0.1) +
  coord_fixed(ratio = 1) +
  ggtitle("SVM prediction") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_manual(values = japanPalette)


5 Conclusion


In this tutorial we have learnt how different algorithms see the Japanese flag on a 2D canvas. We have predicted a missing piece of the flag, as well as the full flag, using a train-test separation of the dataset.