#install.packages("caret")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the built-in iris data set.
data(iris)
# Number of rows
nrow(iris)
## [1] 150
# Fix the RNG seed so the random split below is identical on every run;
# without it the sampled indices (and everything derived from them) change
# each time the script is executed.
set.seed(123)
# Randomly sample 60% of the rows, drawn within each Species level;
# list = FALSE returns the indices as a one-column matrix rather than a list.
idx <- createDataPartition(iris$Species, p = 0.6, list = FALSE)
idx
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 6
## [7,] 8
## [8,] 9
## [9,] 10
## [10,] 11
## [11,] 12
## [12,] 14
## [13,] 17
## [14,] 18
## [15,] 19
## [16,] 23
## [17,] 25
## [18,] 26
## [19,] 30
## [20,] 31
## [21,] 33
## [22,] 34
## [23,] 36
## [24,] 37
## [25,] 41
## [26,] 42
## [27,] 43
## [28,] 45
## [29,] 46
## [30,] 48
## [31,] 53
## [32,] 55
## [33,] 56
## [34,] 57
## [35,] 58
## [36,] 59
## [37,] 60
## [38,] 63
## [39,] 64
## [40,] 65
## [41,] 68
## [42,] 70
## [43,] 71
## [44,] 74
## [45,] 75
## [46,] 76
## [47,] 80
## [48,] 82
## [49,] 83
## [50,] 85
## [51,] 86
## [52,] 89
## [53,] 90
## [54,] 91
## [55,] 92
## [56,] 93
## [57,] 95
## [58,] 97
## [59,] 98
## [60,] 99
## [61,] 102
## [62,] 103
## [63,] 104
## [64,] 105
## [65,] 106
## [66,] 109
## [67,] 110
## [68,] 112
## [69,] 114
## [70,] 117
## [71,] 120
## [72,] 121
## [73,] 123
## [74,] 124
## [75,] 126
## [76,] 127
## [77,] 128
## [78,] 130
## [79,] 132
## [80,] 133
## [81,] 135
## [82,] 137
## [83,] 139
## [84,] 140
## [85,] 141
## [86,] 143
## [87,] 145
## [88,] 146
## [89,] 148
## [90,] 150
# Training set: the rows whose positions were sampled into idx.
train <- iris[idx, ]
# Test set: negative indexing keeps every row that is NOT in idx.
test <- iris[-idx, ]
# 60% of the rows are used for training
nrow(train)
## [1] 90
# 40% of the rows are used for evaluation
nrow(test)
## [1] 60
# Round to two decimal places.
round(0.7811159, digits = 2)
## [1] 0.78
# 데이터 전처리
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("diamonds")
# Pipe demo (Ctrl+Shift+M is the RStudio shortcut that inserts %>%):
# take the first six rows, then report their dimensions.
diamonds %>%
  head() %>%
  dim()
## [1] 6 10
library(dplyr)
library(ggplot2)
data("diamonds")
# Copy of diamonds with two columns renamed: clarity -> c, price -> p.
diamonds1 <- rename(diamonds, c = clarity, p = price)
head(diamonds1, n = 3)
## # A tibble: 3 × 10
## carat cut color c depth table p x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
# Number of rows per cut level, returned as a tibble.
diamonds %>% count(cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
# Rows and columns of the full data set.
dim(diamonds)
## [1] 53940 10
# The same frequencies computed with base R.
table(diamonds[["cut"]])
##
## Fair Good Very Good Premium Ideal
## 1610 4906 12082 13791 21551
# Keep only the columns of interest (carat and price).
df1 <- select(diamonds, carat, price)
head(df1, n = 3)
## # A tibble: 3 × 2
## carat price
## <dbl> <int>
## 1 0.23 326
## 2 0.21 326
## 3 0.23 327
# Keep only the rows whose cut equals "Good" and show the first three.
diamonds %>%
  filter(cut == "Good") %>%
  head(n = 3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 2 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
# Create derived variables: Ratio (price per carat) and Double (twice Ratio).
# Double can refer to Ratio because mutate() evaluates its arguments in order.
diamonds %>%
  mutate(
    Ratio = price / carat,
    Double = Ratio * 2
  ) %>%
  head(n = 3)
## # A tibble: 3 × 12
## carat cut color clarity depth table price x y z Ratio Double
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 1417. 2835.
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1552. 3105.
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 1422. 2843.
# The derived columns above were only displayed, not kept; the result has to
# be assigned to a variable to be stored. Note that df3 holds only the first
# three rows because head() is applied before the assignment.
df3 <- head(
  mutate(diamonds, Ratio = price / carat, Double = Ratio * 2),
  n = 3
)
df3
## # A tibble: 3 × 12
## carat cut color clarity depth table price x y z Ratio Double
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 1417. 2835.
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1552. 3105.
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 1422. 2843.