library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("mlbench")
library(mlbench)
# Load the Glass data set from mlbench and show its column types and the first
# few values of each column.
data("Glass", package = "mlbench")
dplyr::glimpse(Glass)
## Rows: 214
## Columns: 10
## $ RI   <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na   <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg   <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al   <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si   <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K    <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca   <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Frequency of each level of the class label `Type`.
table(Glass[["Type"]])
## 
##  1  2  3  5  6  7 
## 70 76 17 13  9 29
# Exploratory plots for the Glass data.
# Scatterplot matrix of all ten columns (the factor `Type` included).
pairs(Glass)

# Side-by-side boxplots of Mg and Al by glass type. par() returns the settings
# it replaces, so they are restored afterwards rather than assuming the layout
# was 1x1 beforehand. The formula + `data` form labels the axes "Mg"/"Al" and
# "Type" instead of "Glass$Mg" / "Glass$Type".
old_par <- par(mfrow = c(1, 2))
boxplot(Mg ~ Type, data = Glass)
boxplot(Al ~ Type, data = Glass)
par(old_par)

# install.packages("corrplot")
library(corrplot)
## corrplot 0.92 loaded
# Correlation plot of the numeric columns. `Type` is a factor, so it is
# excluded (by name, not position) before calling cor(); order = "hclust"
# asks corrplot to arrange the variables by hierarchical clustering.
corrplot(cor(Glass[, names(Glass) != "Type"]), order = "hclust")
# Mean of each of the first four iris columns. vapply() walks the data frame
# column by column (no data.frame -> matrix coercion, unlike apply()) and
# enforces exactly one numeric value per column.
vapply(iris[1:4], mean, numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333
# install.packages("e1071")
library(e1071)
# Skewness of every numeric Glass column. The factor `Type` is dropped by name
# rather than by position, and vapply() checks that skewness() yields a single
# number per column instead of coercing the data frame to a matrix.
vapply(Glass[, names(Glass) != "Type"], skewness, numeric(1))
##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107
# Reproducibly set `crim` to NA in 10 randomly chosen rows of BostonHousing.
set.seed(777)
# Naming the package makes the load independent of whether mlbench is attached.
data("BostonHousing", package = "mlbench")
# seq_len() is the safe form of 1:nrow() (it yields an empty sequence for 0 rows).
BostonHousing[sample(seq_len(nrow(BostonHousing)), 10), "crim"] <- NA
# NOTE(review): the result of this draw is not used by any visible code, but
# it advances the RNG stream, so deleting it would change every later random
# result obtained under this seed. Confirm intent before removing.
sample(1:46, 6)
## [1]  9 16  4 44 26 23
# Count of missing values per column; only `crim` has any (10).
colSums(is.na(BostonHousing))
##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax 
##      10       0       0       0       0       0       0       0       0       0 
## ptratio       b   lstat    medv 
##       0       0       0       0
# Response is `medv`; predictors are the first five columns, selected by name.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "chas", "nox")]
# Random forest fitted through caret. Missing predictor values are filled in
# with the column median as a pre-processing step.
model <- caret::train(
  x = X,
  y = Y,
  method = "rf",
  preProcess = "medianImpute"
)
## Loading required package: ggplot2
## Loading required package: lattice

# Show the resampling summary and the selected tuning parameter.
model
## Random Forest 
## 
## 506 samples
##   5 predictor
## 
## Pre-processing: median imputation (4), ignore (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   2     6.167868  0.5551334  4.151774
##   3     6.251547  0.5468529  4.184168
##   5     6.500694  0.5174786  4.336459
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Second experiment: reload a fresh copy of the data and mark every `crim`
# value above 0.5 as missing.
set.seed(777)
data("BostonHousing")
BostonHousing$crim[BostonHousing$crim > 0.5] <- NA
# Predictive model: separate the response from the explanatory variables.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "nox")]

# GLM fitted through caret with median imputation; print the smallest
# resampled RMSE.
model_median <- caret::train(
  x = X,
  y = Y,
  method = "glm",
  preProcess = "medianImpute"
)
print(min(model_median$results[["RMSE"]]))
## [1] 7.870132
# Disabled alternative: the same model with k-nearest-neighbour imputation.
# model_knn <- caret::train(x = X, y = Y, method = "glm", preProcess = "knnImpute")
# print(min(model_knn$results$RMSE))