#(2) Training vs Validation vs Test Split

# install.packages("caret") # kept commented out: running it inside R Markdown raises an error
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(iris)

nrow(iris) # number of rows
## [1] 150

# Fix the RNG seed so the random 60% partition below is identical on every run;
# without it the train/test membership changes each time the script executes.
# NOTE(review): the index listing recorded below predates this seed, so a fresh
# run may print different row numbers.
set.seed(123)
idx <- createDataPartition(iris$Species, p = 0.6, list = FALSE)

# list = FALSE yields a one-column matrix of row indices (see printed output).
idx
##       Resample1
##  [1,]         1
##  [2,]         2
##  [3,]         4
##  [4,]         6
##  [5,]         8
##  [6,]        11
##  [7,]        13
##  [8,]        14
##  [9,]        15
## [10,]        16
## [11,]        17
## [12,]        18
## [13,]        19
## [14,]        20
## [15,]        22
## [16,]        24
## [17,]        26
## [18,]        32
## [19,]        33
## [20,]        34
## [21,]        38
## [22,]        41
## [23,]        42
## [24,]        43
## [25,]        44
## [26,]        45
## [27,]        46
## [28,]        48
## [29,]        49
## [30,]        50
## [31,]        51
## [32,]        52
## [33,]        53
## [34,]        55
## [35,]        56
## [36,]        57
## [37,]        58
## [38,]        59
## [39,]        60
## [40,]        61
## [41,]        62
## [42,]        64
## [43,]        69
## [44,]        70
## [45,]        73
## [46,]        74
## [47,]        78
## [48,]        79
## [49,]        80
## [50,]        81
## [51,]        83
## [52,]        85
## [53,]        86
## [54,]        87
## [55,]        91
## [56,]        92
## [57,]        93
## [58,]        94
## [59,]        97
## [60,]        98
## [61,]       101
## [62,]       102
## [63,]       104
## [64,]       105
## [65,]       106
## [66,]       107
## [67,]       108
## [68,]       111
## [69,]       114
## [70,]       117
## [71,]       118
## [72,]       119
## [73,]       120
## [74,]       121
## [75,]       122
## [76,]       124
## [77,]       125
## [78,]       130
## [79,]       131
## [80,]       133
## [81,]       134
## [82,]       135
## [83,]       137
## [84,]       141
## [85,]       142
## [86,]       143
## [87,]       146
## [88,]       147
## [89,]       148
## [90,]       149
# Negative indexing drops the partition rows, leaving the held-out set;
# the partition rows themselves become the training set.
test <- iris[-idx, ]
train <- iris[idx, ]

nrow(train) # 60% used for training
## [1] 90
nrow(test) # 40% used for evaluation
## [1] 60
round(0.7811159, digits = 2) # round() rounds to the given number of decimal places
## [1] 0.78
# Chapter 5: data preprocessing

# install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# install.packages("ggplot2")
library(ggplot2)
# Load the diamonds dataset and preview its first six rows.
data("diamonds")
head(diamonds) # pipe the data into dim instead to inspect its dimensions
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# 2. rename(): change variable (column) names

# Arguments are new_name = old_name pairs: clarity becomes c, price becomes p.
diamonds1 <- rename(diamonds, c = clarity, p = price)
diamonds1 %>% head(3)
## # A tibble: 3 × 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31
# 3. count(): frequency analysis

# Tally how many rows fall into each level of cut.
diamonds %>% count(cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
# Column-wise overview: one line per column with its type and leading values.
glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# 4. select(): extract the desired columns from a dataset
# (1) Keep only the columns that are needed

library(dplyr)
library(ggplot2)
# Reduce diamonds to its carat and price columns.
df1 <- select(diamonds, carat, price)
df1 %>% head(3)
## # A tibble: 3 × 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327
# 6. filter(): extract the rows that satisfy a condition
# (1) Rows whose value equals the comparison value

library(dplyr)
library(ggplot2)
# Keep only the diamonds whose cut is "Good" and show the first three.
diamonds %>%
  filter(cut == "Good") %>%
  head(3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
# 7. mutate(): create derived variables

library(dplyr)
library(ggplot2)
# Ratio is price per carat; Double reuses the Ratio column created in the
# same mutate() call.
diamonds %>%
  mutate(Ratio = price / carat, Double = Ratio * 2) %>%
  head(3)
## # A tibble: 3 × 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.
diamonds %>% head(3)
## # A tibble: 3 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
# The mutate() pipeline and the plain head() call print different column sets
# because the mutate() result was only displayed, never assigned: diamonds
# itself is unchanged. Assigning the result to an object and printing that
# object would show the new columns as well.