# Data:   https://archive.ics.uci.edu/ml/datasets/auto+mpg
# Method: https://www.rpubs.com/harshaash/KNN_imputation
# Download the ".data" file from the data link; despite the extension it is a plain-text file.
# Column names, taken from the dataset's "auto-mpg.names" description file.
catalog <- c(
  "mpg", "cylinders", "displacement",
  "horsepower", "weight", "acceleration",
  "model_year", "origin", "car_name"
)
# Machine-specific location of the downloaded file; edit to match your setup.
data_path <- "C:/Users/aungk/OneDrive/R programming/Github/PSM/auto-mpg.data"
# The column names are supplied at read time (header = FALSE: the first line
# is treated as data, not as names).
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0 as well;
# otherwise a later as.numeric() on horsepower would return factor level
# codes instead of the recorded values.
auto.mpg <- read.table(
  data_path,
  header = FALSE,
  col.names = catalog,
  quote = "\"",
  comment.char = "",
  stringsAsFactors = FALSE
)
# Work on a copy so the raw import stays available for comparison.
cars_info <- auto.mpg
summary(cars_info) # horsepower is read as character -> needs converting to numeric
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Length:398
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.2 Class :character
## Median :23.00 Median :4.000 Median :148.5 Mode :character
## Mean :23.51 Mean :5.455 Mean :193.4
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0
## Max. :46.60 Max. :8.000 Max. :455.0
## weight acceleration model_year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2224 1st Qu.:13.82 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2970 Mean :15.57 Mean :76.01 Mean :1.573
## 3rd Qu.:3608 3rd Qu.:17.18 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
## car_name
## Length:398
## Class :character
## Mode :character
##
##
##
# The UCI file marks missing horsepower readings with "?". Turn those into NA
# explicitly before converting, so the conversion itself is silent and any
# coercion warning raised here points at some other, unexpected value.
# as.character() comes first: if the column had been read as a factor,
# as.numeric() would return level codes instead of the recorded values.
cars_info$horsepower <- as.numeric(
  replace(
    as.character(cars_info$horsepower),
    cars_info$horsepower %in% "?",
    NA_character_
  )
)
summary(cars_info) # six missing values found
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.2 1st Qu.: 75.0 1st Qu.:2224
## Median :23.00 Median :4.000 Median :148.5 Median : 93.5 Median :2804
## Mean :23.51 Mean :5.455 Mean :193.4 Mean :104.5 Mean :2970
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0 3rd Qu.:126.0 3rd Qu.:3608
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
## NA's :6
## acceleration model_year origin car_name
## Min. : 8.00 Min. :70.00 Min. :1.000 Length:398
## 1st Qu.:13.82 1st Qu.:73.00 1st Qu.:1.000 Class :character
## Median :15.50 Median :76.00 Median :1.000 Mode :character
## Mean :15.57 Mean :76.01 Mean :1.573
## 3rd Qu.:17.18 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :24.80 Max. :82.00 Max. :3.000
##
# check out missing values (display as .html table)
library(DT)
# dplyr::filter is namespace-qualified (as dplyr::select is elsewhere in this
# script) so the call cannot fall through to stats::filter when dplyr is not
# attached.
ms <- dplyr::filter(cars_info, is.na(horsepower))
datatable(ms)
# row numbers of the missing values, kept to recheck after imputation
ind <- which(is.na(cars_info$horsepower))
# check grouping: mpg against acceleration, points shaded by horsepower
ggplot(
  data = cars_info,
  mapping = aes(x = mpg, y = acceleration, colour = horsepower)
) +
  geom_point(show.legend = TRUE) +
  scale_colour_gradient(
    low = "green",
    high = "red",
    na.value = "blue", # rows with missing horsepower
    guide = "legend"
  ) +
  labs(
    title = "Auto MPG",
    x = "Mpg",
    y = "Acceleration",
    colour = "Horsepower"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
# mpg and acceleration both appear to vary with horsepower
# points with missing horsepower are drawn in blue
# KNN imputation
library(caret)
# NOTE(review): RANN is believed to be the nearest-neighbour backend that
# caret's "knnImpute" relies on -- confirm against the caret documentation.
library(RANN)
# Columns used to find neighbours; model_year and car_name are left out.
# Per the caret documentation, "knnImpute" also centres and scales these
# columns, so the values returned by predict() are on a standardised scale
# until they are transformed back with the stored mean and std.
preProcValues <- preProcess(
  dplyr::select(
    cars_info,
    mpg, cylinders, displacement, weight, acceleration, origin, horsepower
  ),
  method = c("knnImpute"),
  k = 20,
  knnSummary = mean
)
impute_cars_info <- predict(preProcValues, cars_info, na.action = na.pass)
# confirm that no missing horsepower values remain (expect zero rows)
ms2 <- dplyr::filter(impute_cars_info, is.na(horsepower))
ms2
## [1] mpg cylinders displacement horsepower weight
## [6] acceleration model_year origin car_name
## <0 rows> (or 0-length row.names)
# 0 result, OK
# reconstruct the data: undo the centring/scaling applied by preProcess
# lookup table of the per-column mean and sd stored in preProcValues
procNames <- data.frame(
  col = names(preProcValues$mean),
  mean = preProcValues$mean,
  sd = preProcValues$std,
  stringsAsFactors = FALSE
)
for (col_name in procNames$col) {
  # original scale = standardised value * sd + mean
  impute_cars_info[[col_name]] <-
    impute_cars_info[[col_name]] * preProcValues$std[[col_name]] +
    preProcValues$mean[[col_name]]
}
# recheck the imputed data: the rows that originally had missing horsepower
imputed <- dplyr::slice(impute_cars_info, ind)
imputed # check against original html table
## mpg cylinders displacement horsepower weight acceleration model_year origin
## 1 25.0 4 98 78.95 2046 19.0 71 1
## 2 21.0 6 200 93.60 2875 17.0 74 1
## 3 40.9 4 85 67.75 1835 17.3 80 2
## 4 23.6 4 140 88.85 2905 14.3 80 1
## 5 34.5 4 100 76.70 2320 15.8 81 2
## 6 23.0 4 151 83.25 3035 20.5 82 1
## car_name
## 1 ford pinto
## 2 ford maverick
## 3 renault lecar deluxe
## 4 ford mustang cobra
## 5 renault 18i
## 6 amc concord dl