lecture

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(e1071)

library(mlbench)
# Load the Glass data set (made available by mlbench) and show a compact
# overview of its columns: name, type, and the first few values.
data("Glass")
glimpse(Glass)

## Rows: 214
## Columns: 10
## $ RI   <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na   <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg   <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al   <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si   <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K    <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca   <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

# Frequency of each glass type (the class label).
table(Glass$Type)

## 
##  1  2  3  5  6  7 
## 70 76 17 13  9 29

# Scatterplot matrix of every pair of columns in Glass.
pairs(Glass)

# Correlation matrix of the nine numeric predictors
# (column 10, the Type factor, is dropped).
cor(Glass[, -10])

##               RI          Na           Mg          Al          Si            K
## RI  1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790  1.00000000 -0.273731961  0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196  1.000000000 -0.48179851 -0.16592672  0.005395667
## Al -0.4073260341  0.15679367 -0.481798509  1.00000000 -0.00552372  0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372  1.00000000 -0.193330854
## K  -0.2898327111 -0.26608650  0.005395667  0.32595845 -0.19333085  1.000000000
## Ca  0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189  0.32660288 -0.492262118  0.47940390 -0.10215131 -0.042618059
## Fe  0.1430096093 -0.24134641  0.083059529 -0.07440215 -0.09420073 -0.007719049
##            Ca            Ba           Fe
## RI  0.8104027 -0.0003860189  0.143009609
## Na -0.2754425  0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178  0.083059529
## Al -0.2595920  0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K  -0.3178362 -0.0426180594 -0.007719049
## Ca  1.0000000 -0.1128409671  0.124968219
## Ba -0.1128410  1.0000000000 -0.058691755
## Fe  0.1249682 -0.0586917554  1.000000000

# Put two plots on one screen: a 1-row by 2-column layout.
par(mfrow = c(1, 2))
boxplot(Glass$Mg ~ Glass$Type)  # Mg by glass type
boxplot(Glass$Al ~ Glass$Type)  # Al by glass type

# Switch back to one plot per screen.
par(mfrow = c(1, 1))
library(corrplot)

## corrplot 0.92 loaded

# Correlation plot of the numeric predictors: circle size and colour show each
# correlation coefficient; order = "hclust" reorders the variables by
# hierarchical clustering.
corrplot(cor(Glass[, -10]), order = "hclust")

library(e1071)
# Mean of each of the four numeric iris measurements.
# MARGIN = 1 applies FUN over rows; MARGIN = 2 applies FUN over columns.
apply(iris[1:4], MARGIN = 2, FUN = mean)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333

# Skewness of each numeric predictor (column-wise):
# positive = right-tailed distribution, negative = left-tailed distribution.
apply(Glass[, -10], MARGIN = 2, FUN = skewness)

##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107

# Estimate a preprocessing recipe from the nine predictors (column 10, Type,
# is excluded). The lecture applies all three steps together:
#   "BoxCox" - power transformation that reduces skewness, pulling an
#              asymmetric distribution closer to normal (caret only applies
#              it to strictly positive columns)
#   "center" - subtracts each column's mean
#   "scale"  - divides each column by its standard deviation
# Centering plus scaling standardizes each column to mean 0 and sd 1; it does
# NOT rescale values into [0, 1] (that would be method = "range").
trans <- preProcess(Glass[, -10], method = c("BoxCox", "center", "scale"))
trans

## Created from 214 samples and 9 variables
## 
## Pre-processing:
##   - Box-Cox transformation (5)
##   - centered (9)
##   - ignored (0)
##   - scaled (9)
## 
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, 0.5, 2, -1.1

# Apply the fitted preprocessing to the predictors, then preview the first
# five transformed columns.
transformed <- predict(trans, newdata = Glass[, -10])
head(transformed[, seq_len(5)])

##           RI         Na        Mg          Al          Si
## 1  0.8756898  0.3133883 1.2517037 -0.65520274 -1.12729016
## 2 -0.2471367  0.6129977 0.6346799 -0.08726137  0.09719851
## 3 -0.7216425  0.1798164 0.6000157  0.27454124  0.43512776
## 4 -0.2305698 -0.2150217 0.6970756 -0.23439154 -0.05836211
## 5 -0.3101056 -0.1402661 0.6485456 -0.34194384  0.55238422
## 6 -0.7947626 -0.7480171 0.6416128  0.42852325  0.40909039

# Same pipeline with a final PCA step, so the result holds principal-component
# scores (PC1, PC2, ...) rather than the original variables.
# Note: `transformed` is reassigned here.
transl <- preProcess(
  Glass[, -10],
  method = c("BoxCox", "center", "scale", "pca")
)
transformed <- predict(transl, newdata = Glass[, -10])
head(transformed[, seq_len(5)])

##          PC1        PC2        PC3        PC4        PC5
## 1 -1.2126444 -0.3942139 -0.1730756 -1.7193852  0.1913387
## 2  0.6179073 -0.7020476 -0.5507034 -0.8575350  0.1566312
## 3  0.9907027 -0.8876886 -0.6452946 -0.3027716  0.1363025
## 4  0.1510212 -0.9042336 -0.1622361 -0.4521567  0.4291846
## 5  0.3582849 -1.0160965 -0.5763959 -0.1667831  0.3634192
## 6  0.3408017 -1.3565637  0.7451275  1.0568333 -1.7762845

# Fix the random seed so the random draws below give the same result on
# every run.
set.seed(777)
data("BostonHousing")

# Set `crim` to NA in 10 randomly chosen rows to simulate missing data.
BostonHousing[sample(seq_len(nrow(BostonHousing)), 10), "crim"] <- NA

# Demo draw: 6 numbers sampled without replacement from 1..46.
sample(1:46, 6)

## [1]  9 16  4 44 26 23

# Number of missing values in each column.
colSums(is.na(BostonHousing))

##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax 
##      10       0       0       0       0       0       0       0       0       0 
## ptratio       b   lstat    medv 
##       0       0       0       0

# Fit a random forest predicting `medv` from the first five columns
# (crim, zn, indus, chas, nox). Missing predictor values are replaced by the
# column median as a preprocessing step inside train().
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, seq_len(5)]
model <- caret::train(
  x = X,
  y = Y,
  method = "rf",
  preProcess = "medianImpute"
)
model

## Random Forest 
## 
## 506 samples
##   5 predictor
## 
## Pre-processing: median imputation (4), ignore (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   2     6.167868  0.5551334  4.151774
##   3     6.251547  0.5468529  4.184168
##   5     6.500694  0.5174786  4.336459
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.

# Start again from a fresh copy of the data with the same seed.
set.seed(777)
data("BostonHousing")

# This time the missingness depends on the value itself: every `crim`
# above 0.5 is set to NA.
BostonHousing$crim[BostonHousing$crim > 0.5] <- NA

# Fit a glm on crim, zn, indus and nox (columns 1-3 and 5), filling missing
# values with the column median, and report the smallest resampled RMSE.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c("crim", "zn", "indus", "nox")]
model_median <- caret::train(
  x = X,
  y = Y,
  method = "glm",
  preProcess = "medianImpute"
)
print(min(model_median$results$RMSE))

## [1] 7.870132

lecture_07

2023-01-10