#(2) Training vs Validation vs Test Split
# install.packages("caret") # Rmarkdown시 사용하면 오류 출력
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Bring the built-in iris data set into the workspace and check its size.
data("iris")
nrow(iris) # number of rows
## [1] 150
# Draw the row indices for a 60% training partition, sampled within Species.
# createDataPartition() samples randomly, so the seed is fixed first to make
# the split (and everything derived from idx) reproducible between runs.
# NOTE: the index listing pasted below came from an earlier, unseeded run, so
# the exact row numbers printed here will differ from it.
set.seed(123)
idx <- createDataPartition(iris$Species, p = 0.6, list = FALSE)
idx # one-column matrix of selected row numbers
## Resample1
## [1,] 1
## [2,] 2
## [3,] 4
## [4,] 6
## [5,] 8
## [6,] 11
## [7,] 13
## [8,] 14
## [9,] 15
## [10,] 16
## [11,] 17
## [12,] 18
## [13,] 19
## [14,] 20
## [15,] 22
## [16,] 24
## [17,] 26
## [18,] 32
## [19,] 33
## [20,] 34
## [21,] 38
## [22,] 41
## [23,] 42
## [24,] 43
## [25,] 44
## [26,] 45
## [27,] 46
## [28,] 48
## [29,] 49
## [30,] 50
## [31,] 51
## [32,] 52
## [33,] 53
## [34,] 55
## [35,] 56
## [36,] 57
## [37,] 58
## [38,] 59
## [39,] 60
## [40,] 61
## [41,] 62
## [42,] 64
## [43,] 69
## [44,] 70
## [45,] 73
## [46,] 74
## [47,] 78
## [48,] 79
## [49,] 80
## [50,] 81
## [51,] 83
## [52,] 85
## [53,] 86
## [54,] 87
## [55,] 91
## [56,] 92
## [57,] 93
## [58,] 94
## [59,] 97
## [60,] 98
## [61,] 101
## [62,] 102
## [63,] 104
## [64,] 105
## [65,] 106
## [66,] 107
## [67,] 108
## [68,] 111
## [69,] 114
## [70,] 117
## [71,] 118
## [72,] 119
## [73,] 120
## [74,] 121
## [75,] 122
## [76,] 124
## [77,] 125
## [78,] 130
## [79,] 131
## [80,] 133
## [81,] 134
## [82,] 135
## [83,] 137
## [84,] 141
## [85,] 142
## [86,] 143
## [87,] 146
## [88,] 147
## [89,] 148
## [90,] 149
# Rows listed in idx become the training set; all remaining rows are held out.
train <- iris[idx, ]
test <- iris[-idx, ]
nrow(train) # 60% of the rows, used for training
## [1] 90
nrow(test) # 40% of the rows, used for evaluation
## [1] 60
# round(): rounds a number to the requested number of decimal places.
round(0.7811159, digits = 2)
## [1] 0.78
# 5장 데이터 전처리
# install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# install.packages("ggplot2")
library(ggplot2)
# Load the diamonds data set and preview its first rows.
data("diamonds")
head(diamonds) # (optional) %>% dim
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# 2. rename(): change variable (column) names
# clarity becomes c and price becomes p; the result is stored in diamonds1.
diamonds1 <- rename(diamonds, c = clarity, p = price)
diamonds1 %>% head(3)
## # A tibble: 3 × 10
## carat cut color c depth table p x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
# 3. count(): frequency analysis
# Tally how many rows fall into each level of cut.
diamonds %>% count(cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
# Compact overview: row/column counts plus each column's type and first values.
diamonds %>% glimpse()
## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# 4. select(): extract the columns you want from a data set
# (1) Keep only the required columns
library(dplyr)
library(ggplot2)
# Retain just carat and price, then preview the first three rows.
df1 <- select(diamonds, carat, price)
df1 %>% head(3)
## # A tibble: 3 × 2
## carat price
## <dbl> <int>
## 1 0.23 326
## 2 0.21 326
## 3 0.23 327
# 6. filter(): extract the rows that satisfy a condition
# (1) Rows whose value equals the comparison value
library(dplyr)
library(ggplot2)
# Keep only diamonds whose cut is "Good" and show the first three of them.
diamonds %>%
  filter(cut == "Good") %>%
  head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 2 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
# 7. mutate(): create derived variables
library(dplyr)
library(ggplot2)
# Ratio is price per carat; Double reuses the freshly created Ratio column.
# The result is only printed, not assigned, so diamonds itself is unchanged.
diamonds %>%
  mutate(Ratio = price / carat, Double = Ratio * 2) %>%
  head(3)
## # A tibble: 3 × 12
## carat cut color clarity depth table price x y z Ratio Double
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 1417. 2835.
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1552. 3105.
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 1422. 2843.
# Preview the first three rows of diamonds itself.
diamonds %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
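# The mutate() call above prints 12 columns while head(diamonds, 3) prints 10
# because the mutate() result was only displayed, never assigned to an object.
# Assigning it (e.g. diamonds <- diamonds %>% mutate(...)) would make both
# outputs show the same columns.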