# (2) Training vs Validation vs Test Split
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the built-in iris data set and check how many rows it has.
data("iris")
nrow(iris)  # number of rows
## [1] 150
# Draw row indices for the training set: p = 0.6 keeps 60% of the rows, and
# list = FALSE returns the indices as a one-column matrix instead of a list.
# NOTE(review): no set.seed() call precedes this, so the indices printed
# below are from one particular run and will differ on re-execution.
idx <- createDataPartition(y = iris$Species, p = 0.6, list = FALSE)
print(idx)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 7
## [6,] 8
## [7,] 9
## [8,] 13
## [9,] 15
## [10,] 16
## [11,] 17
## [12,] 20
## [13,] 23
## [14,] 26
## [15,] 28
## [16,] 31
## [17,] 32
## [18,] 34
## [19,] 35
## [20,] 36
## [21,] 38
## [22,] 39
## [23,] 40
## [24,] 41
## [25,] 42
## [26,] 43
## [27,] 44
## [28,] 48
## [29,] 49
## [30,] 50
## [31,] 52
## [32,] 54
## [33,] 55
## [34,] 56
## [35,] 57
## [36,] 58
## [37,] 61
## [38,] 62
## [39,] 63
## [40,] 64
## [41,] 67
## [42,] 68
## [43,] 69
## [44,] 70
## [45,] 71
## [46,] 72
## [47,] 77
## [48,] 80
## [49,] 81
## [50,] 82
## [51,] 84
## [52,] 86
## [53,] 87
## [54,] 88
## [55,] 90
## [56,] 91
## [57,] 92
## [58,] 96
## [59,] 98
## [60,] 100
## [61,] 102
## [62,] 103
## [63,] 104
## [64,] 105
## [65,] 106
## [66,] 107
## [67,] 110
## [68,] 111
## [69,] 112
## [70,] 114
## [71,] 115
## [72,] 117
## [73,] 118
## [74,] 120
## [75,] 125
## [76,] 126
## [77,] 127
## [78,] 128
## [79,] 129
## [80,] 131
## [81,] 136
## [82,] 137
## [83,] 139
## [84,] 140
## [85,] 141
## [86,] 142
## [87,] 145
## [88,] 146
## [89,] 148
## [90,] 150
# Rows selected by idx form the training set; every remaining row is held
# out as the test set (negative indexing drops the idx rows).
train <- iris[idx, ]
test <- iris[-idx, ]
# Sanity-check the sizes of the two partitions.
nrow(train)
## [1] 90
nrow(test)
## [1] 60
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the diamonds data set and preview its first six rows.
data(diamonds)
head(diamonds)  # pipe the result on into dim() to inspect its shape instead
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# rename() takes new_name = old_name pairs: clarity becomes c, price becomes p.
diamonds1 <- rename(diamonds, c = clarity, p = price)
diamonds1 %>% head(3)
## # A tibble: 3 × 10
## carat cut color c depth table p x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
# Tally the number of rows for each level of cut.
diamonds %>% count(cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
# Keep only the carat and price columns, then preview the first three rows.
df1 <- select(diamonds, carat, price)
df1 %>% head(3)
## # A tibble: 3 × 2
## carat price
## <dbl> <int>
## 1 0.23 326
## 2 0.21 326
## 3 0.23 327
# Restrict to diamonds whose cut is "Good" and show the first three matches.
diamonds %>%
  filter(cut == "Good") %>%
  head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 2 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
# Add two derived columns: Ratio (price per carat) and Double (twice Ratio).
# mutate() evaluates its arguments in order, so Double can refer to Ratio.
diamonds %>%
  mutate(
    Ratio = price / carat,
    Double = Ratio * 2
  ) %>%
  head(3)
## # A tibble: 3 × 12
## carat cut color clarity depth table price x y z Ratio Double
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 1417. 2835.
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1552. 3105.
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 1422. 2843.