Set Up

Clean the Environment

rm(list = ls())

Load Package

packages <- c("stargazer", 
              "ggplot2",
              "visdat",
              "class",
              "caret",
              "stringdist",
              "proxy"

              )
 for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## 载入需要的程辑包:lattice
## 
## 载入程辑包:'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix

prepare dataset

DF <- read.csv("D:/Study/AI1/Final Project/Traning.csv")

Visualize the data

vis_dat(DF)

Because the dataset is too big and contains too many variables, I have to drop 80% of my data and some variables to run the model efficiently without consuming all the memory in my computer.

DF <- DF[sample(nrow(DF), size = round(nrow(DF) * (1 - .8))), ]

Split my dataset into 80/20

set.seed(123)

# Generate a vector of indices for the training set

train_index <- sample(x = nrow(DF),  size = round(0.8 * nrow(DF) ) )

# Create the training and testing sets

train <- DF[train_index, ]

test  <- DF[-train_index, ]  

Research Question

I want to know which distance function preforms the best when predicting the crash.

Model Building

Euclidean Distance

euc_distance <- function(a, b) sqrt(sum((a - b)^2))

EDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) euc_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

ED <- EDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)

tableED <- table(ED, test$Was.Car.in.a.crash)

tableED
##    
## ED    0   1
##   0 183  46
##   1   4  25
EDA <- (tableED[1,1] + tableED[2,2]) / 258

Manhattan Distance

manha_distance <- function(a, b) {
  sum(abs(a - b))
}

ManDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) manha_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

ManD <- ManDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)

tableManD <- table(ManD, test$Was.Car.in.a.crash)
tableManD
##     
## ManD   0   1
##    0 187  43
##    1   0  28
ManDA <- (tableManD[1,1] + tableManD[2,2]) / 258

Hamming Distance

hamming_distance <- function(a, b) {
  sum(a != b)
}

HDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) hamming_distance (x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

HD <- HDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)

tableHD <- table(HD, test$Was.Car.in.a.crash)
tableHD
##    
## HD    0   1
##   0 187   2
##   1   0  69
HDA <- (tableHD[1,1] + tableHD[2,2]) / 258

Jaccard Distance

jaccard_distance <- function(a, b) {
  return(1 - sum(a == b) / length(a))
}

JDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) jaccard_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

JD <- JDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)

tableJD <- table(JD, test$Was.Car.in.a.crash)
tableJD
##    
## JD    0   1
##   0 187   2
##   1   0  69
JDA <- (tableJD[1,1] + tableJD[2,2]) / 258

Cosine Distance

cosine_distance <- function(a, b) {
  return(1 - sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))))
}

CDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) cosine_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

CD <- CDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)

tableCD <- table(CD, test$Was.Car.in.a.crash)
tableCD
##    
## CD    0   1
##   0 187  16
##   1   0  55
CDA <- (tableCD[1,1] + tableCD[2,2]) / 258

Mahalanobis Distance

mahalanobisKNN <- function(train_data, train_labels, test_data, k) {
  
  cov_mat <- cov(train_data)
  
  inv_cov_mat <- solve(cov_mat)
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) mahalanobis(new_input, x, inv_cov_mat))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

predicted_labels <- mahalanobisKNN(train, train$Was.Car.in.a.crash, test, k = 18)

MahaA <- sum(predicted_labels == test$Was.Car.in.a.crash) / length(test$Was.Car.in.a.crash)

MahaA
## [1] 0.7015504

Compare the Accuarcy

ADF <- data.frame(Method = c("Mahalanobis", "Manhattan", "Cosine", "Euclidean", "Hamming", "Jarrard"),
                   Accuracy = c(MahaA, ManDA, CDA, EDA, HDA, JDA))

Accuracy_Plot <- ggplot(ADF, aes(x = Method, y = Accuracy)) +
  geom_histogram(stat = "identity", fill = "lightblue", color = "black") +
  ggtitle("Accuracy for Each KNN") +
  xlab("Method") +
  ylab("Accuracy")
## Warning in geom_histogram(stat = "identity", fill = "lightblue", color =
## "black"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
Accuracy_Plot