# (2) Training vs. Validation vs. Test Split


library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the built-in iris data set into the workspace.
data(iris)

# How many rows does iris have?
nrow(iris)
## [1] 150

# Fix the RNG seed so the random partition is identical on every run.
# NOTE(review): the index listing recorded after this chunk was captured
# from an earlier run and may not match the seeded result.
set.seed(123)

# Draw a 60% sample of row indices, balanced across the Species classes.
# list = FALSE returns the indices as a one-column matrix rather than a list.
# (The partition is drawn once; a repeated identical call would only
# discard the first result and advance the RNG.)
idx <- createDataPartition(iris$Species, p = 0.6, list = FALSE)

# Print the selected row indices.
idx
##       Resample1
##  [1,]         1
##  [2,]         2
##  [3,]         3
##  [4,]         5
##  [5,]         8
##  [6,]         9
##  [7,]        10
##  [8,]        12
##  [9,]        14
## [10,]        15
## [11,]        16
## [12,]        19
## [13,]        20
## [14,]        21
## [15,]        22
## [16,]        23
## [17,]        24
## [18,]        26
## [19,]        28
## [20,]        29
## [21,]        30
## [22,]        31
## [23,]        32
## [24,]        34
## [25,]        38
## [26,]        40
## [27,]        42
## [28,]        45
## [29,]        46
## [30,]        49
## [31,]        52
## [32,]        54
## [33,]        58
## [34,]        59
## [35,]        60
## [36,]        64
## [37,]        66
## [38,]        67
## [39,]        69
## [40,]        70
## [41,]        71
## [42,]        72
## [43,]        74
## [44,]        75
## [45,]        77
## [46,]        78
## [47,]        79
## [48,]        80
## [49,]        81
## [50,]        82
## [51,]        83
## [52,]        85
## [53,]        86
## [54,]        87
## [55,]        88
## [56,]        90
## [57,]        91
## [58,]        93
## [59,]        99
## [60,]       100
## [61,]       101
## [62,]       107
## [63,]       108
## [64,]       109
## [65,]       110
## [66,]       111
## [67,]       112
## [68,]       113
## [69,]       115
## [70,]       118
## [71,]       119
## [72,]       120
## [73,]       121
## [74,]       122
## [75,]       123
## [76,]       124
## [77,]       126
## [78,]       127
## [79,]       129
## [80,]       130
## [81,]       132
## [82,]       133
## [83,]       134
## [84,]       135
## [85,]       137
## [86,]       140
## [87,]       142
## [88,]       143
## [89,]       144
## [90,]       145
# Rows picked by the partition index form the training set; all remaining
# rows (negative indexing) form the test set.
train <- iris[idx, , drop = FALSE]
test <- iris[-idx, , drop = FALSE]



# round(): round a number to the given number of decimal places
round(0.7811159, digits = 2)
## [1] 0.78
#데이터전처리

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the diamonds data set and preview its first rows.
data("diamonds")
head(diamonds)
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
library(dplyr)
library(ggplot2)
# Reload diamonds, then shorten two column names: clarity -> c, price -> p.
data("diamonds")
diamonds1 <- rename(diamonds, c = clarity, p = price)

# Show the first three rows of the renamed table.
diamonds1 %>% head(n = 3)
## # A tibble: 3 × 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31
# Tally the number of rows for each level of cut.
diamonds %>% count(cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
# Compact column-wise overview: roughly 50,000 rows and 10 variables.
diamonds %>% glimpse()
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
library(dplyr)
library(ggplot2)
# Keep only the carat and price columns.
df1 <- select(diamonds, carat, price)

# Show the first three rows of the reduced table.
df1 %>% head(n = 3)
## # A tibble: 3 × 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327
# First three rows whose cut is exactly "Good".
head(filter(diamonds, cut == "Good"), n = 3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
# Add Ratio (price per carat) and Double (twice Ratio; mutate() lets a new
# column refer to one created earlier in the same call), then show the
# first three rows.
diamonds %>%
  mutate(
    Ratio = price / carat,
    Double = Ratio * 2
  ) %>%
  head(n = 3)
## # A tibble: 3 × 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.