library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(mlbench)
# Load the Glass data set shipped with mlbench and preview its columns.
data("Glass", package = "mlbench")
dplyr::glimpse(Glass)
## Rows: 214
## Columns: 10
## $ RI <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Count the observations in each glass type.
table(Glass[["Type"]])
##
## 1 2 3 5 6 7
## 70 76 17 13 9 29
# Scatterplot matrix of every column, including the Type factor.
pairs(Glass)

# Pairwise correlations among the nine numeric columns (Type is dropped).
cor(Glass[, names(Glass) != "Type"])
## RI Na Mg Al Si K
## RI 1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790 1.00000000 -0.273731961 0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196 1.000000000 -0.48179851 -0.16592672 0.005395667
## Al -0.4073260341 0.15679367 -0.481798509 1.00000000 -0.00552372 0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372 1.00000000 -0.193330854
## K -0.2898327111 -0.26608650 0.005395667 0.32595845 -0.19333085 1.000000000
## Ca 0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189 0.32660288 -0.492262118 0.47940390 -0.10215131 -0.042618059
## Fe 0.1430096093 -0.24134641 0.083059529 -0.07440215 -0.09420073 -0.007719049
## Ca Ba Fe
## RI 0.8104027 -0.0003860189 0.143009609
## Na -0.2754425 0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178 0.083059529
## Al -0.2595920 0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K -0.3178362 -0.0426180594 -0.007719049
## Ca 1.0000000 -0.1128409671 0.124968219
## Ba -0.1128410 1.0000000000 -0.058691755
## Fe 0.1249682 -0.0586917554 1.000000000
# Draw two plots side by side on one device. par() returns the previous
# settings, which are kept so the layout can be restored afterwards.
old_par <- par(mfrow = c(1, 2))
boxplot(Glass$Mg ~ Glass$Type)
boxplot(Glass$Al ~ Glass$Type)

# Restore whatever layout was active before, rather than assuming it was 1 x 1.
par(old_par)
library(corrplot)
## corrplot 0.92 loaded
# Correlation plot of the numeric columns: circle size and colour encode each
# coefficient, and the variables are ordered by hierarchical clustering.
corrplot(
  cor(Glass[, names(Glass) != "Type"]),
  order = 'hclust'
)

library(e1071)
# Mean of each of the first four iris columns.
# apply() MARGIN: 1 = apply over rows, 2 = apply over columns.
apply(iris[1:4], MARGIN = 2, FUN = mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
# Skewness of each numeric column: a positive value indicates a longer right
# tail, a negative value a longer left tail.
apply(Glass[, names(Glass) != "Type"], MARGIN = 2, FUN = skewness)
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
# Estimate a preprocessing recipe on the numeric columns. The three methods are
# requested together:
#   BoxCox - power transform that makes skewed variables more symmetric
#   center - subtract each column's mean
#   scale  - divide each column by its standard deviation
# center + scale together standardise the data; they do not map it to [0, 1].
trans <- preProcess(
  Glass[, names(Glass) != "Type"],
  method = c("BoxCox", "center", "scale")
)
trans
## Created from 214 samples and 9 variables
##
## Pre-processing:
## - Box-Cox transformation (5)
## - centered (9)
## - ignored (0)
## - scaled (9)
##
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, 0.5, 2, -1.1
# Apply the fitted preprocessing to the same columns and preview the first
# five transformed variables.
transformed <- predict(trans, Glass[, names(Glass) != "Type"])
head(transformed[, seq_len(5)])
## RI Na Mg Al Si
## 1 0.8756898 0.3133883 1.2517037 -0.65520274 -1.12729016
## 2 -0.2471367 0.6129977 0.6346799 -0.08726137 0.09719851
## 3 -0.7216425 0.1798164 0.6000157 0.27454124 0.43512776
## 4 -0.2305698 -0.2150217 0.6970756 -0.23439154 -0.05836211
## 5 -0.3101056 -0.1402661 0.6485456 -0.34194384 0.55238422
## 6 -0.7947626 -0.7480171 0.6416128 0.42852325 0.40909039
# Same pipeline with a PCA step appended, so the result holds principal
# component scores (PC1, PC2, ...). Note that `transformed` is overwritten.
transl <- preProcess(
  Glass[, names(Glass) != "Type"],
  method = c("BoxCox", "center", "scale", "pca")
)
transformed <- predict(transl, Glass[, names(Glass) != "Type"])
head(transformed[, seq_len(5)])
## PC1 PC2 PC3 PC4 PC5
## 1 -1.2126444 -0.3942139 -0.1730756 -1.7193852 0.1913387
## 2 0.6179073 -0.7020476 -0.5507034 -0.8575350 0.1566312
## 3 0.9907027 -0.8876886 -0.6452946 -0.3027716 0.1363025
## 4 0.1510212 -0.9042336 -0.1622361 -0.4521567 0.4291846
## 5 0.3582849 -1.0160965 -0.5763959 -0.1667831 0.3634192
## 6 0.3408017 -1.3565637 0.7451275 1.0568333 -1.7762845
# Fix the RNG seed so the random draws below are reproducible.
set.seed(777)
data("BostonHousing")
# Blank out crim in 10 randomly chosen rows to simulate missing values.
BostonHousing[sample(seq_len(nrow(BostonHousing)), 10), "crim"] <- NA
# Unrelated draw of 6 numbers from 1..46. It is kept because it advances the
# RNG stream: the model fit further down is not re-seeded, so dropping this
# call would change its resampling results.
sample(1:46, 6)
## [1] 9 16 4 44 26 23
# Number of missing values in each column.
colSums(is.na(BostonHousing))
## crim zn indus chas nox rm age dis rad tax
## 10 0 0 0 0 0 0 0 0 0
## ptratio b lstat medv
## 0 0 0 0
# Fit a random forest for medv on the first five columns, filling missing
# numeric values with the column median before training.
# NOTE(review): the printed summary reports one predictor ignored by the
# imputation step - presumably the factor column chas; confirm.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "chas", "nox")]
model <- caret::train(
  x = X,
  y = Y,
  method = "rf",
  preProcess = "medianImpute"
)
model
## Random Forest
##
## 506 samples
## 5 predictor
##
## Pre-processing: median imputation (4), ignore (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 6.167868 0.5551334 4.151774
## 3 6.251547 0.5468529 4.184168
## 5 6.500694 0.5174786 4.336459
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Second experiment: reload the data, then make crim missing wherever it
# exceeds 0.5, so that missingness depends on the value itself.
set.seed(777)
data("BostonHousing")
BostonHousing$crim[BostonHousing$crim > 0.5] <- NA
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "nox")]
# Fit a glm on the four numeric predictors with median imputation.
model_median <- caret::train(
  x = X,
  y = Y,
  method = "glm",
  preProcess = "medianImpute"
)
# Report the lowest resampled RMSE of the median-imputed fit.
print(min(model_median[["results"]][["RMSE"]]))
## [1] 7.870132