Set Up

Clean the Environment

rm(list = ls())

Load Package

packages <- c("stargazer", 
              "ggplot2",
              "visdat",
              "class",
              "caret"
              )



 for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## 载入需要的程辑包:lattice

prepare dataset

DF <- read.csv("D:/Study/AI1/Final Project/DF2.csv")

Visualize the data

vis_dat(DF)

Because the dataset is too big and contains too many variables, I have to drop 80% of my data and some variables to run the model efficiently without consuming all the memory in my computer.

DF <- DF[sample(nrow(DF), size = round(nrow(DF) * (1 - .8))), ]

Split my dataset into 80/20

set.seed(123)

# Generate a vector of indices for the training set

train_index <- sample(x = nrow(DF),  size = round(0.8 * nrow(DF) ) )

# Create the training and testing sets

train <- DF[train_index, ]

test  <- DF[-train_index, ]  

Research Question

I want to know which distance function preforms the best when predicting the crash.

Model Building

Get Best using Package

trControl <- trainControl(method = "cv", number = 10)
k_grid <- expand.grid(k = seq(from = 1, to = 50, by = 2))

# Fit the model for each value of k
knn_fit <- train(Was.Car.in.a.crash ~ ., data = train, method = "knn", tuneGrid = k_grid, trControl = trControl, preProcess = c("center", "scale"), metric = "RMSE")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Extract the best k value
k <- knn_fit$bestTune$k

Euclidean Distance

euc_distance <- function(a, b) sqrt(sum((a - b)^2))

EDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) euc_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

ED <- EDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)

tableED <- table(ED, test$Was.Car.in.a.crash)

tableED
##    
## ED    0   1
##   0 178  79
##   1   0   1
EDA <- (tableED[1,1] + tableED[2,2]) / 258

Manhattan Distance

manha_distance <- function(a, b) {
  sum(abs(a - b))
}

ManDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) manha_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

ManD <- ManDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)

tableManD <- table(ManD, test$Was.Car.in.a.crash)
tableManD
##     
## ManD   0   1
##    0 178  77
##    1   0   3
ManDA <- (tableManD[1,1] + tableManD[2,2]) / 258

Hamming Distance

hamming_distance <- function(a, b) {
  sum(a != b)
}

HDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) hamming_distance (x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

HD <- HDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)

tableHD <- table(HD, test$Was.Car.in.a.crash)
tableHD
##    
## HD    0   1
##   0 178  26
##   1   0  54
HDA <- (tableHD[1,1] + tableHD[2,2]) / 258

Jaccard Distance

jaccard_distance <- function(a, b) {
  return(1 - sum(a == b) / length(a))
}

JDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) jaccard_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

JD <- JDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)

tableJD <- table(JD, test$Was.Car.in.a.crash)
tableJD
##    
## JD    0   1
##   0 178  26
##   1   0  54
JDA <- (tableJD[1,1] + tableJD[2,2]) / 258

Cosine Distance

cosine_distance <- function(a, b) {
  return(1 - sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))))
}

CDKNN <- function(train_data, train_labels, test_data, k) {
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) cosine_distance(x, new_input))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

CD <- CDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)

tableCD <- table(CD, test$Was.Car.in.a.crash)
tableCD
##    
## CD    0   1
##   0 177  80
##   1   1   0
CDA <- (tableCD[1,1] + tableCD[2,2]) / 258

Mahalanobis Distance

mahalanobisKNN <- function(train_data, train_labels, test_data, k) {
  
  cov_mat <- cov(train_data)
  
  inv_cov_mat <- solve(cov_mat)
  
  predictions <- vector("character", nrow(test_data))
  
  for (i in 1:nrow(test_data)) {
    
    new_input <- test_data[i, ]
    
    distances <- apply(train_data, 1, function(x) mahalanobis(new_input, x, inv_cov_mat))
    
    DL_table <- data.frame(dist = distances, label = train_labels)
    
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    
    neighbors <- head(sorted_DL_table, k)
    
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  
  return(predictions)
}

predicted_labels <- mahalanobisKNN(train, train$Was.Car.in.a.crash, test, k = k)

MahaA <- sum(predicted_labels == test$Was.Car.in.a.crash) / length(test$Was.Car.in.a.crash)

MahaA
## [1] 0.6860465

Compare the Accuarcy

ADF <- data.frame(Method = c("MahaA", "ManDA", "CDA", "EDA", "HDA", "JDA"),
                   Accuracy = c(MahaA, ManDA, CDA, EDA, HDA, JDA))

Accuracy_Plot <- ggplot(ADF, aes(x = Method, y = Accuracy)) +
  geom_histogram(stat = "identity", fill = "lightblue", color = "black") +
  ggtitle("Accuracy for Each KNN") +
  xlab("Method") +
  ylab("Accuracy")
## Warning in geom_histogram(stat = "identity", fill = "lightblue", color =
## "black"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
Accuracy_Plot

### Conclusion

We can see that Both Hamming Distance and Jaccrad Distacne function perform really well in my model and this is because they are really good dealing binary data. In my dataset, there are only two conditions so it is perfect for these distance function.