rm(list = ls())
packages <- c("stargazer",
              "ggplot2",
              "visdat",
              "class",
              "caret")
# Install any missing packages, then load all of them
for (i in seq_along(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i],
                     repos = "http://cran.rstudio.com/",
                     dependencies = TRUE)
  }
  library(packages[i], character.only = TRUE)
}
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Loading required package: lattice
DF <- read.csv("D:/Study/AI1/Final Project/DF2.csv")
vis_dat(DF)
Because the dataset is very large and contains many variables, I drop 80% of the observations (and some variables) so the models run efficiently without consuming all of my computer's memory.
# Set the seed before any sampling so the subsample and the split are reproducible
set.seed(123)
# Keep a random 20% of the rows
DF <- DF[sample(nrow(DF), size = round(nrow(DF) * (1 - 0.8))), ]
# Generate a vector of indices for the training set
train_index <- sample(x = nrow(DF), size = round(0.8 * nrow(DF) ) )
# Create the training and testing sets
train <- DF[train_index, ]
test <- DF[-train_index, ]
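Because the crash outcome is imbalanced (far more non-crashes than crashes, as the confusion tables below show), a stratified split would preserve the class ratio in both sets. A minimal sketch using caret's createDataPartition (the names strat_index, train_strat, and test_strat are illustrative; all results below use the simple random split above):
# Sketch: stratified 80/20 split that keeps the crash / no-crash ratio
strat_index <- createDataPartition(factor(DF$Was.Car.in.a.crash), p = 0.8, list = FALSE)
train_strat <- DF[strat_index, ]
test_strat <- DF[-strat_index, ]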
I want to know which distance function performs best when predicting whether a car was in a crash.
trControl <- trainControl(method = "cv", number = 10)
k_grid <- expand.grid(k = seq(from = 1, to = 50, by = 2))
# Fit the model for each value of k
knn_fit <- train(Was.Car.in.a.crash ~ ., data = train,
                 method = "knn",
                 tuneGrid = k_grid,
                 trControl = trControl,
                 preProcess = c("center", "scale"),
                 metric = "RMSE")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
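The warning means caret treated the 0/1 outcome as a regression target and tuned k on RMSE. A minimal sketch of the classification version, converting the outcome to a two-level factor and tuning on Accuracy (the names train_cls and knn_cls_fit are illustrative, and this refit is not used for the results below):
# Sketch: tune k as a classification problem instead of a regression
train_cls <- train
train_cls$Was.Car.in.a.crash <- factor(train_cls$Was.Car.in.a.crash)
knn_cls_fit <- train(Was.Car.in.a.crash ~ ., data = train_cls,
                     method = "knn",
                     tuneGrid = k_grid,
                     trControl = trControl,
                     preProcess = c("center", "scale"),
                     metric = "Accuracy")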
# Extract the best k value
k <- knn_fit$bestTune$k
# Euclidean distance between two numeric vectors
euc_distance <- function(a, b) sqrt(sum((a - b)^2))
# KNN with Euclidean distance: for each test row, find the k closest
# training rows and predict the majority label among those neighbors
EDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) euc_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
ED <- EDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)
tableED <- table(ED, test$Was.Car.in.a.crash)
tableED
##
## ED 0 1
## 0 178 79
## 1 0 1
# Accuracy = (true negatives + true positives) / total test observations
EDA <- (tableED[1, 1] + tableED[2, 2]) / sum(tableED)
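The same accuracy formula repeats for every distance below; equivalently, accuracy is the sum of the confusion table's diagonal over its grand total. A tiny helper (the name conf_accuracy is illustrative) expresses the pattern once:
# Overall accuracy from a confusion table: diagonal (correct) over total
conf_accuracy <- function(tab) sum(diag(tab)) / sum(tab)
conf_accuracy(tableED)  # identical to EDA above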
# Manhattan (L1) distance: sum of absolute coordinate differences
manha_distance <- function(a, b) {
  sum(abs(a - b))
}
# KNN with Manhattan distance (same structure as EDKNN)
ManDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) manha_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
ManD <- ManDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)
tableManD <- table(ManD, test$Was.Car.in.a.crash)
tableManD
##
## ManD 0 1
## 0 178 77
## 1 0 3
ManDA <- (tableManD[1, 1] + tableManD[2, 2]) / sum(tableManD)
# Hamming distance: number of positions where the two vectors differ
hamming_distance <- function(a, b) {
  sum(a != b)
}
# KNN with Hamming distance (same structure as EDKNN)
HDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) hamming_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
HD <- HDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)
tableHD <- table(HD, test$Was.Car.in.a.crash)
tableHD
##
## HD 0 1
## 0 178 26
## 1 0 54
HDA <- (tableHD[1, 1] + tableHD[2, 2]) / sum(tableHD)
# Proportion of mismatched positions (see the note below on naming)
jaccard_distance <- function(a, b) {
  return(1 - sum(a == b) / length(a))
}
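A note on naming: as written, jaccard_distance is the simple matching distance, i.e., the proportion of mismatched positions, which is just the Hamming distance divided by the vector length. The set-based Jaccard distance for binary vectors instead ignores positions where both entries are 0; a sketch of that variant (the name jaccard_set_distance is illustrative), in case it was the intended measure:
# Set-based Jaccard distance for 0/1 vectors: 1 - |intersection| / |union|
jaccard_set_distance <- function(a, b) {
  union_size <- sum(a == 1 | b == 1)
  if (union_size == 0) return(0)  # convention: two all-zero vectors are identical
  1 - sum(a == 1 & b == 1) / union_size
}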
# KNN with the (simple matching) Jaccard distance above
JDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) jaccard_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
JD <- JDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)
tableJD <- table(JD, test$Was.Car.in.a.crash)
tableJD
##
## JD 0 1
## 0 178 26
## 1 0 54
JDA <- (tableJD[1, 1] + tableJD[2, 2]) / sum(tableJD)
# Cosine distance: 1 minus the cosine similarity of the two vectors
cosine_distance <- function(a, b) {
  return(1 - sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))))
}
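One caveat: cosine_distance returns NaN when either vector is all zeros, because the denominator is then 0. A guarded variant (a sketch; treating a zero-norm vector as maximally distant is an assumed convention, and the name cosine_distance_safe is illustrative):
# Cosine distance with a guard against zero-norm vectors
cosine_distance_safe <- function(a, b) {
  denom <- sqrt(sum(a^2)) * sqrt(sum(b^2))
  if (denom == 0) return(1)  # assumed convention: maximally distant
  1 - sum(a * b) / denom
}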
# KNN with cosine distance (same structure as EDKNN)
CDKNN <- function(train_data, train_labels, test_data, k) {
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    distances <- apply(train_data, 1, function(x) cosine_distance(x, new_input))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
CD <- CDKNN(train_data = train, train_labels = train$Was.Car.in.a.crash, test_data = test, k)
tableCD <- table(CD, test$Was.Car.in.a.crash)
tableCD
##
## CD 0 1
## 0 177 80
## 1 1 0
CDA <- (tableCD[1, 1] + tableCD[2, 2]) / sum(tableCD)
# KNN with Mahalanobis distance, which accounts for the covariance
# structure of the features
mahalanobisKNN <- function(train_data, train_labels, test_data, k) {
  # Precompute the inverse covariance matrix once; reused for every test row
  cov_mat <- cov(train_data)
  inv_cov_mat <- solve(cov_mat)
  predictions <- vector("character", nrow(test_data))
  for (i in 1:nrow(test_data)) {
    new_input <- test_data[i, ]
    # inverted = TRUE tells mahalanobis() that inv_cov_mat is already the
    # inverse; without it, the matrix would be inverted a second time
    distances <- apply(train_data, 1,
                       function(x) mahalanobis(new_input, x, inv_cov_mat, inverted = TRUE))
    DL_table <- data.frame(dist = distances, label = train_labels)
    sorted_DL_table <- DL_table[order(DL_table$dist), ]
    neighbors <- head(sorted_DL_table, k)
    predictions[i] <- names(which.max(table(neighbors$label)))
  }
  return(predictions)
}
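solve(cov_mat) will fail if the covariance matrix is singular, for example when some columns are collinear or constant. One common workaround (a sketch, not used for the result below) is the Moore-Penrose pseudoinverse from the MASS package, as a drop-in replacement for the solve() line inside mahalanobisKNN:
# Pseudoinverse fallback for a singular covariance matrix (MASS ships with R)
inv_cov_mat <- MASS::ginv(cov_mat)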
predicted_labels <- mahalanobisKNN(train, train$Was.Car.in.a.crash, test, k = k)
MahaA <- sum(predicted_labels == test$Was.Car.in.a.crash) / length(test$Was.Car.in.a.crash)
MahaA
## [1] 0.6860465
ADF <- data.frame(Method = c("Mahalanobis", "Manhattan", "Cosine", "Euclidean", "Hamming", "Jaccard"),
                  Accuracy = c(MahaA, ManDA, CDA, EDA, HDA, JDA))
# geom_col draws bars at precomputed heights; geom_histogram(stat = "identity")
# works but triggers an "unknown parameters" warning
Accuracy_Plot <- ggplot(ADF, aes(x = Method, y = Accuracy)) +
  geom_col(fill = "lightblue", color = "black") +
  ggtitle("Accuracy for Each KNN") +
  xlab("Method") +
  ylab("Accuracy")
Accuracy_Plot
### Conclusion
We can see that both the Hamming and Jaccard distance functions perform far better than the others in my model (about 0.90 accuracy versus roughly 0.69 to 0.70), and this is because they are well suited to binary data. Since the variables in my dataset take only two values, these distance functions are a natural fit.
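In fact, with the definitions used above the two are directly linked: jaccard_distance equals hamming_distance divided by the vector length, so they rank neighbors identically and produce the same confusion table. A quick check on a toy pair of binary vectors:
# The coded Jaccard distance is the Hamming distance scaled by 1 / length(a),
# so the k nearest neighbors under the two distances are always the same
a <- c(1, 0, 1, 1, 0)
b <- c(1, 1, 0, 1, 0)
hamming_distance(a, b)  # 2 mismatched positions
jaccard_distance(a, b)  # 2 / 5 = 0.4
all.equal(jaccard_distance(a, b), hamming_distance(a, b) / length(a))  # TRUE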