## 'data.frame': 946 obs. of 15 variables:
## $ Model.Year : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ Make : chr "Acura" "Acura" "Acura" "Acura" ...
## $ Model : chr "ILX" "MDX SH-AWD" "RDX SH-AWD" "RDX SH-AWD A-SPEC" ...
## $ Vehicle.Class : chr "Compact" "SUV: Small" "SUV: Small" "SUV: Small" ...
## $ Engine.Size.L. : num 2.4 3.5 2 2 2 2 3 3 2 2 ...
## $ Cylinders : int 4 6 4 4 4 4 6 6 4 4 ...
## $ Transmission : chr "AM8" "AS10" "AS10" "AS10" ...
## $ Fuel.Type : chr "Z" "Z" "Z" "Z" ...
## $ Fuel.Consumption..City..L.100.km.: num 9.9 12.6 11 11.3 11.2 11.3 12.3 12.3 10 10.5 ...
## $ Fuel.Consumption.Hwy..L.100.km.. : num 7 9.4 8.6 9.1 8 8.1 9.4 9.8 7.2 7.7 ...
## $ Fuel.Consumption.Comb..L.100.km..: num 8.6 11.2 9.9 10.3 9.8 9.8 11 11.2 8.7 9.2 ...
## $ Fuel.Consumption.Comb..mpg.. : int 33 25 29 27 29 29 26 25 32 31 ...
## $ CO2.Emissions.g.km. : int 200 263 232 242 230 231 256 261 205 217 ...
## $ CO2.Rating : int 6 4 5 5 5 5 5 4 6 5 ...
## $ Smog.Rating : int 3 5 6 6 7 7 5 5 3 3 ...
##
## Compact Full-size Mid-size
## 69 64 117
## Minicompact Minivan Pickup truck: Small
## 48 7 20
## Pickup truck: Standard Special purpose vehicle Station wagon: Mid-size
## 113 12 8
## Station wagon: Small Subcompact SUV: Small
## 19 80 197
## SUV: Standard Two-seater
## 141 51
##
## 1.2 1.3 1.4 1.5 1.6 1.8 2 2.3 2.4 2.5 2.7 2.8 2.9 3 3.2 3.3 3.4 3.5 3.6 3.7
## 4 8 6 32 23 9 210 26 21 72 18 5 17 134 3 7 7 50 47 4
## 3.8 4 4.4 4.6 5 5.2 5.3 5.6 5.7 6 6.2 6.4 6.5 6.6 6.7 8
## 8 40 25 1 32 11 49 2 15 4 33 9 3 1 7 3
##
## D E X Z
## 28 14 446 458
##
## A10 A6 A7 A8 A9 AM6 AM7 AM8 AS10 AS5 AS6 AS7 AS8 AS9 AV AV1
## 72 24 1 100 65 10 70 52 87 2 39 7 212 19 43 4
## AV10 AV6 AV7 AV8 M5 M6 M7
## 8 8 10 23 5 71 14
## 'data.frame': 946 obs. of 15 variables:
## $ Model.Year : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ Make : Factor w/ 39 levels "Acura","Alfa Romeo",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ Model : chr "ILX" "MDX SH-AWD" "RDX SH-AWD" "RDX SH-AWD A-SPEC" ...
## $ Vehicle.Class : Factor w/ 14 levels "Compact","Full-size",..: 1 12 12 12 1 1 1 1 3 3 ...
## $ Engine.Size.L. : Factor w/ 36 levels "1.2","1.3","1.4",..: 9 18 7 7 7 7 14 14 7 7 ...
## $ Cylinders : int 4 6 4 4 4 4 6 6 4 4 ...
## $ Transmission : chr "AM8" "AS10" "AS10" "AS10" ...
## $ Fuel.Type : Factor w/ 4 levels "D","E","X","Z": 4 4 4 4 4 4 4 4 4 4 ...
## $ Fuel.Consumption..City..L.100.km.: num 9.9 12.6 11 11.3 11.2 11.3 12.3 12.3 10 10.5 ...
## $ Fuel.Consumption.Hwy..L.100.km.. : num 7 9.4 8.6 9.1 8 8.1 9.4 9.8 7.2 7.7 ...
## $ Fuel.Consumption.Comb..L.100.km..: num 8.6 11.2 9.9 10.3 9.8 9.8 11 11.2 8.7 9.2 ...
## $ Fuel.Consumption.Comb..mpg.. : int 33 25 29 27 29 29 26 25 32 31 ...
## $ CO2.Emissions.g.km. : int 200 263 232 242 230 231 256 261 205 217 ...
## $ CO2.Rating : int 6 4 5 5 5 5 5 4 6 5 ...
## $ Smog.Rating : int 3 5 6 6 7 7 5 5 3 3 ...
# Drop the variables this analysis does not focus on. Columns are removed by
# name rather than by position, so the selection keeps working if the column
# order of `car` ever changes.
dropped_vars <- c(
  "Model.Year", "Make", "Model", "Engine.Size.L.", "Transmission"
)
car1 <- car[, !(names(car) %in% dropped_vars), drop = FALSE]
str(car1)
## 'data.frame': 946 obs. of 10 variables:
## $ Vehicle.Class : Factor w/ 14 levels "Compact","Full-size",..: 1 12 12 12 1 1 1 1 3 3 ...
## $ Cylinders : int 4 6 4 4 4 4 6 6 4 4 ...
## $ Fuel.Type : Factor w/ 4 levels "D","E","X","Z": 4 4 4 4 4 4 4 4 4 4 ...
## $ Fuel.Consumption..City..L.100.km.: num 9.9 12.6 11 11.3 11.2 11.3 12.3 12.3 10 10.5 ...
## $ Fuel.Consumption.Hwy..L.100.km.. : num 7 9.4 8.6 9.1 8 8.1 9.4 9.8 7.2 7.7 ...
## $ Fuel.Consumption.Comb..L.100.km..: num 8.6 11.2 9.9 10.3 9.8 9.8 11 11.2 8.7 9.2 ...
## $ Fuel.Consumption.Comb..mpg.. : int 33 25 29 27 29 29 26 25 32 31 ...
## $ CO2.Emissions.g.km. : int 200 263 232 242 230 231 256 261 205 217 ...
## $ CO2.Rating : int 6 4 5 5 5 5 5 4 6 5 ...
## $ Smog.Rating : int 3 5 6 6 7 7 5 5 3 3 ...
##
## 1 2 3 4 5 6 7 8 9 10
## 7 48 189 228 266 113 71 10 13 1
## [1] 1 3 5 5 10
# Histogram of the CO2 rating.
ggplot(car1, aes(x = CO2.Rating)) +
  geom_histogram(bins = 30)
# Tile plot of CO2 rating by vehicle class and fuel type; the x-axis labels are
# rotated 90 degrees.
ggplot(car1, aes(x = Vehicle.Class, y = Fuel.Type, fill = CO2.Rating)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90))
# My goal is to predict CO2 emissions from Vehicle Class and potentially
# Engine Size, but I can find out which variables actually work better.
# Start by checking for missing data.
md.pattern(car1, rotate.names = TRUE) # all good
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Vehicle.Class Cylinders Fuel.Type Fuel.Consumption..City..L.100.km.
## 946 1 1 1 1
## 0 0 0 0
## Fuel.Consumption.Hwy..L.100.km.. Fuel.Consumption.Comb..L.100.km..
## 946 1 1
## 0 0
## Fuel.Consumption.Comb..mpg.. CO2.Emissions.g.km. CO2.Rating Smog.Rating
## 946 1 1 1 1 0
## 0 0 0 0 0
# Min-max rescale a numeric vector to the [0, 1] range.
#
# x: numeric vector. Missing values are ignored when finding the range and
#    stay NA in the result.
#
# Returns a numeric vector the same length as x. When x has no spread (all
# non-missing values are equal) every non-missing value maps to 0 instead of
# the NaN that 0 / 0 would produce.
normalize <- function(x) {
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  if (!is.finite(span) || span == 0) {
    # Degenerate range: nothing to scale by.
    flat <- rep(0, length(x))
    flat[is.na(x)] <- NA
    return(flat)
  }
  (x - rng[1]) / span
}
# Names of the numeric columns of car1.
car_numb <- names(Filter(is.numeric, car1))
# Rescale each numeric column with normalize(); other columns are untouched.
car1[car_numb] <- Map(normalize, car1[car_numb])
str(car1)
## 'data.frame': 946 obs. of 10 variables:
## $ Vehicle.Class : Factor w/ 14 levels "Compact","Full-size",..: 1 12 12 12 1 1 1 1 3 3 ...
## $ Cylinders : num 0.0769 0.2308 0.0769 0.0769 0.0769 ...
## $ Fuel.Type : Factor w/ 4 levels "D","E","X","Z": 4 4 4 4 4 4 4 4 4 4 ...
## $ Fuel.Consumption..City..L.100.km.: num 0.224 0.327 0.266 0.278 0.274 ...
## $ Fuel.Consumption.Hwy..L.100.km.. : num 0.182 0.324 0.276 0.306 0.241 ...
## $ Fuel.Consumption.Comb..L.100.km..: num 0.208 0.326 0.267 0.285 0.262 ...
## $ Fuel.Consumption.Comb..mpg.. : num 0.367 0.233 0.3 0.267 0.3 ...
## $ CO2.Emissions.g.km. : num 0.206 0.329 0.268 0.288 0.265 ...
## $ CO2.Rating : num 0.556 0.333 0.444 0.444 0.444 ...
## $ Smog.Rating : num 0.333 0.667 0.833 0.833 1 ...
fivenum(car1$CO2.Rating)
## [1] 0.0000000 0.2222222 0.4444444 0.4444444 1.0000000
# Split the normalised rating into two classes just below its median
# (0.4444444): values in [0, 0.444] become class 0 and values in (0.444, 1]
# become class 1. include.lowest = TRUE is needed because cut() intervals are
# open on the left by default, which would turn every rating of exactly 0 into
# NA (and, with sparsifyNAs = TRUE below, into a row that is 0 in both dummy
# columns).
car1$CO2.Rating <- cut(
  car1$CO2.Rating,
  breaks = c(0, .444, 1),
  labels = c(0, 1),
  include.lowest = TRUE
)
# One-hot encode the factor columns (Vehicle.Class, Fuel.Type, CO2.Rating).
car_1h <- one_hot(
  as.data.table(car1),
  cols = "auto",
  sparsifyNAs = TRUE,
  naCols = FALSE,
  dropCols = TRUE,
  dropUnusedLevels = TRUE
)
str(car_1h)
## Classes 'data.table' and 'data.frame': 946 obs. of 27 variables:
## $ Vehicle.Class_Compact : int 1 0 0 0 1 1 1 1 0 0 ...
## $ Vehicle.Class_Full-size : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Mid-size : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Vehicle.Class_Minicompact : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Minivan : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Pickup truck: Small : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Pickup truck: Standard : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Special purpose vehicle: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Station wagon: Mid-size: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Station wagon: Small : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Subcompact : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_SUV: Small : int 0 1 1 1 0 0 0 0 0 0 ...
## $ Vehicle.Class_SUV: Standard : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Two-seater : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cylinders : num 0.0769 0.2308 0.0769 0.0769 0.0769 ...
## $ Fuel.Type_D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel.Type_E : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel.Type_X : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel.Type_Z : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Fuel.Consumption..City..L.100.km. : num 0.224 0.327 0.266 0.278 0.274 ...
## $ Fuel.Consumption.Hwy..L.100.km.. : num 0.182 0.324 0.276 0.306 0.241 ...
## $ Fuel.Consumption.Comb..L.100.km.. : num 0.208 0.326 0.267 0.285 0.262 ...
## $ Fuel.Consumption.Comb..mpg.. : num 0.367 0.233 0.3 0.267 0.3 ...
## $ CO2.Emissions.g.km. : num 0.206 0.329 0.268 0.288 0.265 ...
## $ CO2.Rating_0 : int 0 1 0 0 0 0 0 1 0 0 ...
## $ CO2.Rating_1 : int 1 0 1 1 1 1 1 0 1 1 ...
## $ Smog.Rating : num 0.333 0.667 0.833 0.833 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Keep one 0/1 target column. CO2.Rating_1 is dropped by name so the right
# column is removed even if the one-hot layout changes.
car_1h <- car_1h[, setdiff(names(car_1h), "CO2.Rating_1"), with = FALSE]
# Share of rows with CO2.Rating_0 == 1 (about 50%). The previous expression
# indexed CO2.Rating_1 after it had been dropped, which is what produced NA.
mean(car_1h$CO2.Rating_0 == 1)
## [1] NA
# Fix the RNG state so the train/tune/test split is reproducible.
set.seed(123)
# 70% of the rows go to training. The target is passed as a factor so that
# createDataPartition() samples within each class; `groups` only applies to a
# numeric outcome and is therefore omitted.
car_part_index <- createDataPartition(
  factor(car_1h$CO2.Rating_0),
  times = 1,
  p = .7,
  list = FALSE
)
train <- car_1h[car_part_index, ]
tune_and_test <- car_1h[-car_part_index, ]
# Split the held-out rows in half, again stratified by class, to get the
# tuning and test sets.
tune_and_test_index <- createDataPartition(
  factor(tune_and_test$CO2.Rating_0),
  times = 1,
  p = .5,
  list = FALSE
)
tune <- tune_and_test[tune_and_test_index, ]
test <- tune_and_test[-tune_and_test_index, ]
str(train)
## Classes 'data.table' and 'data.frame': 663 obs. of 26 variables:
## $ Vehicle.Class_Compact : int 0 0 1 1 1 0 0 0 0 0 ...
## $ Vehicle.Class_Full-size : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Mid-size : int 0 0 0 0 0 1 1 0 0 0 ...
## $ Vehicle.Class_Minicompact : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Vehicle.Class_Minivan : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Pickup truck: Small : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Pickup truck: Standard : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Special purpose vehicle: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Station wagon: Mid-size: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Station wagon: Small : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Subcompact : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_SUV: Small : int 1 1 0 0 0 0 0 1 0 0 ...
## $ Vehicle.Class_SUV: Standard : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Vehicle.Class_Two-seater : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cylinders : num 0.2308 0.0769 0.0769 0.0769 0.2308 ...
## $ Fuel.Type_D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel.Type_E : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel.Type_X : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel.Type_Z : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Fuel.Consumption..City..L.100.km. : num 0.327 0.266 0.274 0.278 0.316 ...
## $ Fuel.Consumption.Hwy..L.100.km.. : num 0.324 0.276 0.241 0.247 0.347 ...
## $ Fuel.Consumption.Comb..L.100.km.. : num 0.326 0.267 0.262 0.262 0.326 ...
## $ Fuel.Consumption.Comb..mpg.. : num 0.233 0.3 0.3 0.3 0.233 ...
## $ CO2.Emissions.g.km. : num 0.329 0.268 0.265 0.267 0.325 ...
## $ CO2.Rating_0 : int 1 0 0 0 1 0 0 0 1 1 ...
## $ Smog.Rating : num 0.667 0.833 1 1 0.667 ...
## - attr(*, ".internal.selfref")=<externalptr>
dim(tune)
## [1] 142 26
dim(test)
## [1] 141 26
set.seed(123)
# The target column must not be part of the feature matrices: with
# CO2.Rating_0 among the predictors the nearest neighbours are found using the
# answer itself, which is what produced the perfect scores.
# NOTE(review): CO2.Emissions.g.km. and the fuel-consumption columns are
# probably near-direct proxies for the CO2 rating; confirm whether they should
# stay as predictors.
feature_cols <- setdiff(names(train), "CO2.Rating_0")
car_3nn <- knn(
  train = as.data.frame(train)[feature_cols],
  test = as.data.frame(tune)[feature_cols],
  cl = train$CO2.Rating_0,
  k = 3,
  use.all = TRUE,
  prob = TRUE
)
str(car_3nn)
## Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 1 2 1 ...
## - attr(*, "prob")= num [1:142] 1 1 1 1 1 ...
# Number of tuning rows predicted in each class.
table(car_3nn)
## car_3nn
## 0 1
## 66 76
# Cross-tabulation of the predictions against the true tuning labels.
car_kNN_res <- table(car_3nn, tune$CO2.Rating_0)
# Classification metrics with class "1" treated as the positive class.
confusionMatrix(
  data = as.factor(car_3nn),
  reference = as.factor(tune$CO2.Rating_0),
  positive = "1",
  dnn = c("Prediction", "Actual"),
  mode = "sens_spec"
)
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 66 0
## 1 0 76
##
## Accuracy : 1
## 95% CI : (0.9744, 1)
## No Information Rate : 0.5352
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5352
## Detection Rate : 0.5352
## Detection Prevalence : 0.5352
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
# Accuracy is 100% on the tuning set: all 142 cars were assigned their true class, which is suspiciously perfect
# Specificity is also 100%: all 66 cars that are actually class 0 were predicted as class 0
# Fit a k-nearest-neighbour classifier for one value of k and score it on a
# validation set.
#
# k:           number of neighbours passed to knn().
# train_set:   predictor columns of the training data.
# val_set:     predictor columns of the validation data.
# train_class: true classes of the training rows.
# val_class:   true classes of the validation rows.
#
# Returns a 1 x 2 matrix with columns `k` and `accuracy`.
# Side effect: resets the global RNG seed to 1 on every call, presumably so
# that knn()'s tie handling is repeatable across values of k.
chooseK <- function(k, train_set, val_set, train_class, val_class) {
  set.seed(1)
  class_knn <- knn(
    train = train_set,
    test = val_set,
    cl = train_class,
    k = k,
    use.all = TRUE
  )
  # Compare labels element by element. Reading the diagonal of
  # table(class_knn, val_class) is only right when the table is square with
  # matching level order, which fails when val_class lacks one of the classes.
  hits <- as.character(class_knn) == as.character(val_class)
  accu <- mean(hits, na.rm = TRUE)
  cbind(k = k, accuracy = accu)
}
# Score odd values of k from 1 to 21 on the tuning set. The target column is
# removed from both feature sets so the classifier cannot read the answer.
k_search_features <- setdiff(names(train), "CO2.Rating_0")
knn_different_k <- vapply(
  seq(1, 21, by = 2),
  function(x) {
    as.numeric(chooseK(
      x,
      train_set = as.data.frame(train)[k_search_features],
      val_set = as.data.frame(tune)[k_search_features],
      train_class = train$CO2.Rating_0,
      val_class = tune$CO2.Rating_0
    ))
  },
  numeric(2)
)
# Row 1 holds k and row 2 the matching accuracy.
knn_different_k <- tibble(
  k = knn_different_k[1, ],
  accuracy = knn_different_k[2, ]
)
# plot accuracy vs. k
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3)
# In the original run the model was most accurate when k was 3 or 5, so k = 3
# was kept; re-check this against the plot after re-running.
car_prob <- tibble(attr(car_3nn, "prob"))
# k_prob is the vote share knn() reported for the predicted class.
final_model <- tibble(
  k_prob = attr(car_3nn, "prob"),
  pred = car_3nn,
  target = as.factor(tune$CO2.Rating_0)
)
# view(final_model)
# Convert the winning-class vote share into the likelihood of the positive
# class: for rows predicted as 0 that is the complement of k_prob.
final_model$pos_prec <- ifelse(
  final_model$pred == 0,
  1 - final_model$k_prob,
  final_model$k_prob
)
# view(final_model)
densityplot(final_model$pos_prec)
confusionMatrix(
  final_model$pred,
  final_model$target,
  positive = "1",
  dnn = c("Prediction", "Actual"),
  mode = "sens_spec"
)
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 66 0
## 1 0 76
##
## Accuracy : 1
## 95% CI : (0.9744, 1)
## No Information Rate : 0.5352
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5352
## Detection Rate : 0.5352
## Detection Prevalence : 0.5352
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
# once again there is an error here, as my accuracy, specificity and sensitivity scores are all 100%
# Re-classify predicted probabilities at a custom threshold and report the
# resulting confusion matrix.
#
# x: predicted probabilities of the positive class.
# y: threshold; a row is labelled 1 when its probability is strictly greater
#    than y, otherwise 0.
# z: factor of true outcomes for the same rows.
#
# Returns the object produced by confusionMatrix() with class "1" as the
# positive class and mode = "everything".
adjust_thres <- function(x, y, z) {
  # Fix both levels up front: if every probability falls on one side of the
  # threshold, as.factor() would build a one-level factor that no longer
  # matches the levels of the reference.
  thres <- factor(ifelse(x > y, 1, 0), levels = c(0, 1))
  confusionMatrix(
    thres,
    z,
    positive = "1",
    dnn = c("Prediction", "Actual"),
    mode = "everything"
  )
}
# Confusion matrix with the positive-class threshold at 0.30.
adjust_thres(
  x = final_model$pos_prec,
  y = .30,
  z = as.factor(final_model$target)
)
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 65 0
## 1 1 76
##
## Accuracy : 0.993
## 95% CI : (0.9614, 0.9998)
## No Information Rate : 0.5352
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9858
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9848
## Pos Pred Value : 0.9870
## Neg Pred Value : 1.0000
## Precision : 0.9870
## Recall : 1.0000
## F1 : 0.9935
## Prevalence : 0.5352
## Detection Rate : 0.5352
## Detection Prevalence : 0.5423
## Balanced Accuracy : 0.9924
##
## 'Positive' Class : 1
##
# Playing around with the threshold: try 0.50.
adjust_thres(
  x = final_model$pos_prec,
  y = .50,
  z = as.factor(final_model$target)
)
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 66 0
## 1 0 76
##
## Accuracy : 1
## 95% CI : (0.9744, 1)
## No Information Rate : 0.5352
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Precision : 1.0000
## Recall : 1.0000
## F1 : 1.0000
## Prevalence : 0.5352
## Detection Rate : 0.5352
## Detection Prevalence : 0.5352
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
# Nothing changed :/ (0.50 gives the same matrix as the unthresholded
# predictions).
# Adjusting it more: try 0.20.
adjust_thres(
  x = final_model$pos_prec,
  y = .20,
  z = as.factor(final_model$target)
)
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 65 0
## 1 1 76
##
## Accuracy : 0.993
## 95% CI : (0.9614, 0.9998)
## No Information Rate : 0.5352
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9858
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9848
## Pos Pred Value : 0.9870
## Neg Pred Value : 1.0000
## Precision : 0.9870
## Recall : 1.0000
## F1 : 0.9935
## Prevalence : 0.5352
## Detection Rate : 0.5352
## Detection Prevalence : 0.5423
## Balanced Accuracy : 0.9924
##
## 'Positive' Class : 1
##
# None of these are making a difference (0.20 matches 0.30); try 0.10 too.
adjust_thres(
  x = final_model$pos_prec,
  y = .1,
  z = as.factor(final_model$target)
)
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 65 0
## 1 1 76
##
## Accuracy : 0.993
## 95% CI : (0.9614, 0.9998)
## No Information Rate : 0.5352
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9858
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9848
## Pos Pred Value : 0.9870
## Neg Pred Value : 1.0000
## Precision : 0.9870
## Recall : 1.0000
## F1 : 0.9935
## Prevalence : 0.5352
## Detection Rate : 0.5352
## Detection Prevalence : 0.5423
## Balanced Accuracy : 0.9924
##
## 'Positive' Class : 1
##
# yeah these are all the same thing