# (2) Training vs Validation vs Test Split
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the built-in iris data set into the workspace.
data("iris")

# Number of rows
nrow(iris)
## [1] 150
# Fix the RNG seed so the random partition below is identical on every run.
# The index listing recorded after this block was captured from an unseeded
# run, so re-running the script will print different row numbers.
set.seed(1234)

# Draw 60% of the rows of iris as training indices, using Species as the
# outcome so the sample is taken within each species. list = FALSE returns
# the indices as a one-column matrix rather than a list.
idx <- createDataPartition(iris$Species, p = 0.6, list = FALSE)

idx
##       Resample1
##  [1,]         1
##  [2,]         2
##  [3,]         3
##  [4,]         4
##  [5,]         7
##  [6,]         8
##  [7,]         9
##  [8,]        13
##  [9,]        15
## [10,]        16
## [11,]        17
## [12,]        20
## [13,]        23
## [14,]        26
## [15,]        28
## [16,]        31
## [17,]        32
## [18,]        34
## [19,]        35
## [20,]        36
## [21,]        38
## [22,]        39
## [23,]        40
## [24,]        41
## [25,]        42
## [26,]        43
## [27,]        44
## [28,]        48
## [29,]        49
## [30,]        50
## [31,]        52
## [32,]        54
## [33,]        55
## [34,]        56
## [35,]        57
## [36,]        58
## [37,]        61
## [38,]        62
## [39,]        63
## [40,]        64
## [41,]        67
## [42,]        68
## [43,]        69
## [44,]        70
## [45,]        71
## [46,]        72
## [47,]        77
## [48,]        80
## [49,]        81
## [50,]        82
## [51,]        84
## [52,]        86
## [53,]        87
## [54,]        88
## [55,]        90
## [56,]        91
## [57,]        92
## [58,]        96
## [59,]        98
## [60,]       100
## [61,]       102
## [62,]       103
## [63,]       104
## [64,]       105
## [65,]       106
## [66,]       107
## [67,]       110
## [68,]       111
## [69,]       112
## [70,]       114
## [71,]       115
## [72,]       117
## [73,]       118
## [74,]       120
## [75,]       125
## [76,]       126
## [77,]       127
## [78,]       128
## [79,]       129
## [80,]       131
## [81,]       136
## [82,]       137
## [83,]       139
## [84,]       140
## [85,]       141
## [86,]       142
## [87,]       145
## [88,]       146
## [89,]       148
## [90,]       150
# Rows selected by idx form the training set; negative indexing drops those
# same rows, so everything that remains forms the test set.
train <- iris[idx, ]
test <- iris[-idx, ]

# Row counts of the two subsets.
nrow(train)
## [1] 90
nrow(test)
## [1] 60
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the diamonds data set (loaded right after ggplot2 is attached).
data("diamonds")
# Preview the first rows. The original line carried a commented-out
# `%>% dim` step, kept here as a note for checking dimensions.
head(diamonds)
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# Copy of diamonds with two columns given shorter names:
# clarity becomes c and price becomes p.
diamonds1 <- rename(diamonds, c = clarity, p = price)
diamonds1 %>% head(3)
## # A tibble: 3 × 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31
# Tally the number of rows in each cut category.
diamonds %>% count(cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
# Keep only the carat and price columns, then show the first three rows.
df1 <- select(diamonds, carat, price)
df1 %>% head(3)
## # A tibble: 3 × 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327
# First three rows whose cut is "Good".
diamonds %>%
  filter(cut == "Good") %>%
  head(3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
# Add Ratio (price divided by carat) and Double (twice Ratio). Double is
# defined in the same mutate() call and refers to the Ratio column created
# just before it. Only the first three rows are shown.
diamonds %>%
  mutate(
    Ratio = price / carat,
    Double = Ratio * 2
  ) %>%
  head(3)
## # A tibble: 3 × 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.