rm(list = ls())
packages <- c("stargazer",
              "ggplot2",
              "visdat",
              "class",
              "caret",
              "stringdist",
              "proxy")
# Install any packages that are not yet available, then load each one
for (i in seq_along(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i],
                     repos = "http://cran.rstudio.com/",
                     dependencies = TRUE)
  }
  library(packages[i], character.only = TRUE)
}
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Loading required package: lattice
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
# Load the raw data and visualize variable types and missingness
DF <- read.csv("D:/Study/AI1/Final Project/Traning.csv")
vis_dat(DF)
Because the dataset is large and contains many variables, I keep only a random 20% of the rows (and drop some variables) so the models run efficiently without exhausting my computer's memory.
# Keep a random 20% subsample of the rows
DF <- DF[sample(nrow(DF), size = round(nrow(DF) * (1 - .8))), ]
set.seed(123)
# Generate a vector of indices for the training set (80% of the subsample)
train_index <- sample(x = nrow(DF), size = round(0.8 * nrow(DF)))
# Create the training and testing sets
train <- DF[train_index, ]
test <- DF[-train_index, ]
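Euclidean and Manhattan distances are sensitive to feature scale, so in practice it often helps to standardize the numeric predictors before computing distances. A minimal sketch, assuming every column other than Was.Car.in.a.crash is a numeric predictor (the results below use the unscaled data):
# Optional preprocessing (not used in the results below): standardize the
# numeric predictors so no single feature dominates the distance calculations
feature_cols <- setdiff(names(train), "Was.Car.in.a.crash")
train_scaled <- train
test_scaled  <- test
for (col in feature_cols) {
  mu  <- mean(train[[col]], na.rm = TRUE)
  sdv <- sd(train[[col]], na.rm = TRUE)
  if (!is.na(sdv) && sdv > 0) {
    train_scaled[[col]] <- (train[[col]] - mu) / sdv
    test_scaled[[col]]  <- (test[[col]] - mu) / sdv
  }
}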
I want to know which distance function performs best when predicting whether a car was in a crash.
# Euclidean distance between two numeric vectors
euc_distance <- function(a, b) sqrt(sum((a - b)^2))
# KNN with Euclidean distance: for each test row, compute the distance to every
# training row, keep the k nearest, and predict the majority label among them
EDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) euc_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
ED <- EDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)
tableED <- table(ED, test$Was.Car.in.a.crash)
tableED
##
## ED 0 1
## 0 183 46
## 1 4 25
EDA <- (tableED[1, 1] + tableED[2, 2]) / nrow(test)
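As a cross-check on the hand-rolled implementation, class::knn() (loaded above) also uses Euclidean distance. A sketch, assuming the predictor columns are numeric; note that it uses only the predictors, whereas the calls in this report pass the full data frame, label column included, to the distance functions:
# Cross-check with the built-in Euclidean KNN from the class package
feature_cols <- setdiff(names(train), "Was.Car.in.a.crash")
knn_pred <- knn(train = train[, feature_cols],
                test  = test[, feature_cols],
                cl    = factor(train$Was.Car.in.a.crash),
                k     = 18)
mean(as.character(knn_pred) == as.character(test$Was.Car.in.a.crash))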
# Manhattan (L1) distance between two numeric vectors
manha_distance <- function(a, b) {
  sum(abs(a - b))
}
# Same majority-vote KNN as above, using Manhattan distance
ManDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) manha_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
ManD <- ManDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)
tableManD <- table(ManD, test$Was.Car.in.a.crash)
tableManD
##
## ManD 0 1
## 0 187 43
## 1 0 28
ManDA <- (tableManD[1, 1] + tableManD[2, 2]) / nrow(test)
# Hamming distance: the number of positions where the two vectors differ
hamming_distance <- function(a, b) {
  sum(a != b)
}
# Same majority-vote KNN, using Hamming distance
HDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) hamming_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
HD <- HDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)
tableHD <- table(HD, test$Was.Car.in.a.crash)
tableHD
##
## HD 0 1
## 0 187 2
## 1 0 69
HDA <- (tableHD[1, 1] + tableHD[2, 2]) / nrow(test)
# "Jaccard" distance as implemented here: the share of positions where the two
# vectors disagree (simple matching, i.e. the Hamming distance divided by the
# number of columns)
jaccard_distance <- function(a, b) {
  return(1 - sum(a == b) / length(a))
}
# Same majority-vote KNN, using this distance
JDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) jaccard_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
JD <- JDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)
tableJD <- table(JD, test$Was.Car.in.a.crash)
tableJD
##
## JD 0 1
## 0 187 2
## 1 0 69
JDA <- (tableJD[1, 1] + tableJD[2, 2]) / nrow(test)
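The Jaccard results are identical to the Hamming results, and that is expected: as implemented above, jaccard_distance() is just hamming_distance() divided by the (constant) number of columns, so it ranks neighbors in exactly the same order. A quick check with hypothetical toy vectors:
# Toy illustration: the two distances differ only by a constant factor
toy_a <- c(1, 0, 1, 1)
toy_b <- c(1, 1, 0, 1)
jaccard_distance(toy_a, toy_b)                 # 0.5
hamming_distance(toy_a, toy_b) / length(toy_a) # 0.5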
# Cosine distance: 1 minus the cosine similarity of the two vectors
cosine_distance <- function(a, b) {
  return(1 - sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))))
}
# Same majority-vote KNN, using cosine distance
CDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) cosine_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
CD <- CDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, 18)
tableCD <- table(CD, test$Was.Car.in.a.crash)
tableCD
##
## CD 0 1
## 0 187 16
## 1 0 55
CDA <- (tableCD[1, 1] + tableCD[2, 2]) / nrow(test)
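The five classifiers above differ only in the distance they use. A single generic helper with the distance function passed in as an argument could replace them; a sketch (not how the results above were produced, but it implements the same majority-vote logic):
# Generic majority-vote KNN: the distance function is a parameter
genericKNN <- function(train_data, train_labels, test_data, k, dist_fun) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) dist_fun(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    neighbors <- head(DL_table[order(DL_table$dist), ], k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  predictions
}
# Example: genericKNN(train, train$Was.Car.in.a.crash, test, 18, euc_distance)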
# KNN with Mahalanobis distance, which accounts for the covariance between
# features; the inverse covariance matrix is precomputed once from the training data
mahalanobisKNN <- function(train_data, train_labels, test_data, k) {
  cov_mat <- cov(train_data)
  inv_cov_mat <- solve(cov_mat)
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    # inverted = TRUE tells mahalanobis() that inv_cov_mat is already the
    # inverse of the covariance matrix
    distances <- apply(train_data, 1,
                       function(x) mahalanobis(new_input, x, inv_cov_mat, inverted = TRUE))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
predicted_labels <- mahalanobisKNN(train, train$Was.Car.in.a.crash, test, k = 18)
MahaA <- sum(predicted_labels == test$Was.Car.in.a.crash) / length(test$Was.Car.in.a.crash)
MahaA
## [1] 0.7015504
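As an added check, the Mahalanobis predictions can be cross-tabulated against the true labels in the same way as the other methods:
# Confusion matrix for the Mahalanobis KNN
tableMaha <- table(predicted_labels, test$Was.Car.in.a.crash)
tableMaha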
# Collect the accuracy of each method for comparison
ADF <- data.frame(Method = c("Mahalanobis", "Manhattan", "Cosine", "Euclidean", "Hamming", "Jaccard"),
                  Accuracy = c(MahaA, ManDA, CDA, EDA, HDA, JDA))
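Since stargazer is loaded at the top but not otherwise used, the same accuracies could also be printed as a plain-text table:
# Print the accuracy comparison as a formatted text table
stargazer(ADF, summary = FALSE, type = "text")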
Accuracy_Plot <- ggplot(ADF, aes(x = Method, y = Accuracy)) +
  geom_col(fill = "lightblue", color = "black") +
  ggtitle("Accuracy for Each KNN") +
  xlab("Method") +
  ylab("Accuracy")
Accuracy_Plot
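Accuracy alone can be misleading when the classes are imbalanced (crashes are the minority class here), so caret's confusionMatrix() is a useful complement because it also reports sensitivity and specificity; a sketch using the Hamming-distance predictions:
# Richer evaluation of one classifier (Hamming-distance KNN): sensitivity,
# specificity, and kappa in addition to raw accuracy
confusionMatrix(data = factor(HD), reference = factor(test$Was.Car.in.a.crash))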