library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Glass data set (after attaching mlbench) and take a first look.
# install.packages("mlbench")
library(mlbench)
data("Glass")
# Compact column-by-column overview of the data frame.
dplyr::glimpse(Glass)
## Rows: 214
## Columns: 10
## $ RI <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Number of rows per level of the Type factor.
table(Glass[["Type"]])
##
## 1 2 3 5 6 7
## 70 76 17 13 9 29
# Pairwise scatterplots of every column against every other column.
pairs(x = Glass)

# Draw the distributions of Mg and Al per glass Type side by side.
# par() returns the previous settings, so they can be restored exactly
# afterwards instead of assuming the layout was 1 x 1 before.
old_par <- par(mfrow = c(1, 2))
# Formula + data = ... keeps the axis labels as plain column names
# ("Type", "Mg", "Al") rather than the expressions "Glass$Type" etc.
boxplot(Mg ~ Type, data = Glass)
boxplot(Al ~ Type, data = Glass)

# Put the graphics layout back to what it was before this section.
par(old_par)
# Correlation plot of the Glass columns other than Type (column 10), with
# variables ordered by hierarchical clustering.
# install.packages("corrplot")
library(corrplot)
## corrplot 0.92 loaded
corrplot(
  corr = cor(Glass[, names(Glass) != "Type"]),
  order = "hclust"
)
# Mean of each of the first four iris columns, computed column by column.
vapply(iris[1:4], mean, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
# install.packages("e1071")
library(e1071)
# Skewness (e1071::skewness) of every Glass column except column 10 (Type).
vapply(Glass[, -10], skewness, numeric(1))
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
# Reproducibly set ten randomly chosen crim values in BostonHousing to NA,
# then count the missing values in each column.
set.seed(777)
library(mlbench)
data("BostonHousing")
BostonHousing$crim[sample.int(nrow(BostonHousing), 10)] <- NA
# Additional draw of 6 distinct values from 1..46; note that it also
# advances the random number generator state.
sample.int(46, 6)
## [1] 9 16 4 44 26 23
colSums(is.na(BostonHousing))
## crim zn indus chas nox rm age dis rad tax
## 10 0 0 0 0 0 0 0 0 0
## ptratio b lstat medv
## 0 0 0 0
# Fit a random forest ("rf") for medv on the first five BostonHousing
# columns, with median imputation requested as the pre-processing step.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "chas", "nox")]
model <- caret::train(
  x = X,
  y = Y,
  method = "rf",
  preProcess = "medianImpute"
)
## Loading required package: ggplot2
## Loading required package: lattice

# Show the resampling summary of the fitted model.
print(model)
## Random Forest
##
## 506 samples
## 5 predictor
##
## Pre-processing: median imputation (4), ignore (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 6.167868 0.5551334 4.151774
## 3 6.251547 0.5468529 4.184168
## 5 6.500694 0.5174786 4.336459
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Start again from a fresh copy of BostonHousing and set every crim value
# above 0.5 to NA.
set.seed(777)
data("BostonHousing")
BostonHousing$crim[BostonHousing$crim > 0.5] <- NA
# Prediction model: separate the explanatory variables and the response.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "nox")]
model_median <- caret::train(
  x = X,
  y = Y,
  method = "glm",
  preProcess = "medianImpute"
)
# Smallest resampled RMSE reported for the fitted model.
print(min(model_median[["results"]][["RMSE"]]))
## [1] 7.870132
# Same model with k-nearest-neighbour imputation (left disabled):
#model_knn <- caret::train(x = X, y = Y, method = "glm", preProcess = "knnImpute")
#print(min(model_knn$results$RMSE))