# Links:
#   data:   https://archive.ics.uci.edu/ml/datasets/auto+mpg
#   method: https://www.rpubs.com/harshaash/KNN_imputation

# Download the dataset from the data link above. The ".data" file is a plain-text file.

# Load the raw Auto MPG table from the local copy of the ".data" file.
# Column names are assigned separately after the read.
auto.mpg <- read.table(
  file = "C:/Users/aungk/OneDrive/R programming/Github/PSM/auto-mpg.data",
  quote = "\"",      # only double quotes delimit strings (no single-quote quoting)
  comment.char = ""  # comment handling is switched off while reading
)

# Column names, in file order; they can be taken from the names file that
# accompanies the dataset.
catalog <- c(
  "mpg", "cylinders", "displacement", "horsepower", "weight",
  "acceleration", "model_year", "origin", "car_name"
)
colnames(auto.mpg) <- catalog

# Work on a copy so the raw table stays available unchanged.
cars_info <- auto.mpg
summary(cars_info) # horsepower comes in as character -> needs converting to numeric
##       mpg          cylinders      displacement    horsepower       
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Length:398        
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.2   Class :character  
##  Median :23.00   Median :4.000   Median :148.5   Mode  :character  
##  Mean   :23.51   Mean   :5.455   Mean   :193.4                     
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0                     
##  Max.   :46.60   Max.   :8.000   Max.   :455.0                     
##      weight      acceleration     model_year        origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2224   1st Qu.:13.82   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2970   Mean   :15.57   Mean   :76.01   Mean   :1.573  
##  3rd Qu.:3608   3rd Qu.:17.18   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##    car_name        
##  Length:398        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Convert horsepower from character to numeric; entries that are not valid
# numbers become NA (hence the coercion warning below).
cars_info[["horsepower"]] <- as.numeric(cars_info[["horsepower"]])
## Warning: NAs introduced by coercion
summary(cars_info) # the summary now reports six NA's in horsepower
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.2   1st Qu.: 75.0   1st Qu.:2224  
##  Median :23.00   Median :4.000   Median :148.5   Median : 93.5   Median :2804  
##  Mean   :23.51   Mean   :5.455   Mean   :193.4   Mean   :104.5   Mean   :2970  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0   3rd Qu.:126.0   3rd Qu.:3608  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                  NA's   :6                     
##   acceleration     model_year        origin        car_name        
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   Length:398        
##  1st Qu.:13.82   1st Qu.:73.00   1st Qu.:1.000   Class :character  
##  Median :15.50   Median :76.00   Median :1.000   Mode  :character  
##  Mean   :15.57   Mean   :76.01   Mean   :1.573                     
##  3rd Qu.:17.18   3rd Qu.:79.00   3rd Qu.:2.000                     
##  Max.   :24.80   Max.   :82.00   Max.   :3.000                     
## 
# Inspect the rows whose horsepower is missing (rendered as an HTML table).
# dplyr supplies `filter()` and `%>%`; it must be attached before first use,
# because in a fresh session `filter()` would otherwise resolve to
# stats::filter() and the pipe would not be defined yet.
library(dplyr)
library(DT)
ms <- cars_info %>% filter(is.na(horsepower))
datatable(ms)
# Keep the row positions of the missing values so the same rows can be
# re-checked after imputation.
ind <- which(is.na(cars_info$horsepower))
# Check grouping: scatter of mpg against acceleration, coloured by horsepower.
# ggplot2 is attached here because no earlier line of the script loads it.
library(ggplot2)
ggplot(cars_info, aes(x = mpg, y = acceleration, color = horsepower)) +
  geom_point(show.legend = TRUE) +
  labs(
    x = "Mpg", y = "Acceleration", title = "Auto MPG",
    color = "Horsepower"
  ) +
  # Points with missing horsepower are drawn in blue via na.value.
  scale_color_gradient(
    low = "green", high = "red",
    na.value = "blue", guide = "legend"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

 # mpg and acceleration seem to be related to horsepower
 # missing values are denoted blue
# KNN imputation
# caret provides preProcess()/predict(); RANN is attached as in the original
# script (presumably for the nearest-neighbour search -- confirm against the
# installed caret version).
library(caret)
library(RANN)

# Fit the imputation model on a chosen subset of columns; model_year and
# car_name are left out of the model.
preProcValues <- preProcess(
  dplyr::select(
    cars_info,
    mpg, cylinders, displacement, weight, acceleration, origin, horsepower
  ),
  method = "knnImpute",
  k = 20,
  knnSummary = mean
)

# Apply the model to the full table. The modelled columns come back
# standardised, which is why the script rescales them afterwards with
# preProcValues$mean and preProcValues$std.
impute_cars_info <- predict(preProcValues, cars_info, na.action = na.pass)
# Any rows still missing horsepower after imputation? (expected: none)
ms2 <- dplyr::filter(impute_cars_info, is.na(horsepower))
ms2
## [1] mpg          cylinders    displacement horsepower   weight      
## [6] acceleration model_year   origin       car_name    
## <0 rows> (or 0-length row.names)
# 0 result, OK

# Reconstruct the data on its original scale: every column that the
# preprocessing model centred and scaled is mapped back with x * sd + mean.
procNames <- data.frame(
  col = names(preProcValues$mean),
  mean = preProcValues$mean,
  sd = preProcValues$std
)
# as.character() guards against `col` being a factor on older R versions.
scaled_cols <- as.character(procNames$col)
impute_cars_info[scaled_cols] <- Map(
  function(values, sd_i, mean_i) values * sd_i + mean_i,
  impute_cars_info[scaled_cols],
  preProcValues$std[scaled_cols],   # looked up by column name, as before
  preProcValues$mean[scaled_cols]
)
# Re-check the imputed data: pull the rows that originally had missing
# horsepower (positions stored in `ind`).
imputed <- dplyr::slice(impute_cars_info, ind)
imputed # compare against the original HTML table
##    mpg cylinders displacement horsepower weight acceleration model_year origin
## 1 25.0         4           98      78.95   2046         19.0         71      1
## 2 21.0         6          200      93.60   2875         17.0         74      1
## 3 40.9         4           85      67.75   1835         17.3         80      2
## 4 23.6         4          140      88.85   2905         14.3         80      1
## 5 34.5         4          100      76.70   2320         15.8         81      2
## 6 23.0         4          151      83.25   3035         20.5         82      1
##               car_name
## 1           ford pinto
## 2        ford maverick
## 3 renault lecar deluxe
## 4   ford mustang cobra
## 5          renault 18i
## 6       amc concord dl