# 20250520

#install.packages("caret")

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Load the built-in iris data set.
data(iris)

# Number of rows in iris.
nrow(iris)

## [1] 150

# Fix the RNG seed so the random split below is identical on every run.
# NOTE(review): the index listing recorded after this block was captured from
# an unseeded run, so a seeded run will most likely print different rows.
set.seed(123)

# Randomly draw 60% of the rows. createDataPartition() samples within each
# level of iris$Species; with list = FALSE the row indices come back as a
# one-column matrix rather than a list.
idx <- createDataPartition(iris$Species, p = 0.6, list = FALSE)

idx

##       Resample1
##  [1,]         1
##  [2,]         2
##  [3,]         3
##  [4,]         4
##  [5,]         5
##  [6,]         6
##  [7,]         8
##  [8,]         9
##  [9,]        10
## [10,]        11
## [11,]        12
## [12,]        14
## [13,]        17
## [14,]        18
## [15,]        19
## [16,]        23
## [17,]        25
## [18,]        26
## [19,]        30
## [20,]        31
## [21,]        33
## [22,]        34
## [23,]        36
## [24,]        37
## [25,]        41
## [26,]        42
## [27,]        43
## [28,]        45
## [29,]        46
## [30,]        48
## [31,]        53
## [32,]        55
## [33,]        56
## [34,]        57
## [35,]        58
## [36,]        59
## [37,]        60
## [38,]        63
## [39,]        64
## [40,]        65
## [41,]        68
## [42,]        70
## [43,]        71
## [44,]        74
## [45,]        75
## [46,]        76
## [47,]        80
## [48,]        82
## [49,]        83
## [50,]        85
## [51,]        86
## [52,]        89
## [53,]        90
## [54,]        91
## [55,]        92
## [56,]        93
## [57,]        95
## [58,]        97
## [59,]        98
## [60,]        99
## [61,]       102
## [62,]       103
## [63,]       104
## [64,]       105
## [65,]       106
## [66,]       109
## [67,]       110
## [68,]       112
## [69,]       114
## [70,]       117
## [71,]       120
## [72,]       121
## [73,]       123
## [74,]       124
## [75,]       126
## [76,]       127
## [77,]       128
## [78,]       130
## [79,]       132
## [80,]       133
## [81,]       135
## [82,]       137
## [83,]       139
## [84,]       140
## [85,]       141
## [86,]       143
## [87,]       145
## [88,]       146
## [89,]       148
## [90,]       150

# Rows selected by idx form the training set.
train <- iris[idx, ]
# A negative index drops those rows, leaving everything not used for training.
test <- iris[-idx, ]

# Size of the training set (the 60% share).
nrow(train)

## [1] 90

# Size of the evaluation set (the remaining 40%).
nrow(test)

## [1] 60

# Round a value to two decimal places.
round(0.7811159, digits = 2)

## [1] 0.78

# 데이터 전처리 
#install.packages("dplyr")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
data("diamonds")

# Ctrl+Shift+M: presumably the editor shortcut for inserting the %>% pipe.
# Take the leading rows of diamonds and report their dimensions.
diamonds %>%
  head() %>%
  dim()

## [1]  6 10

library(dplyr)
library(ggplot2)
data("diamonds")

# Rename clarity to c and price to p, keeping the result in diamonds1.
diamonds1 <- rename(diamonds, c = clarity, p = price)

# Show the first three rows of the renamed data.
head(diamonds1, n = 3)

## # A tibble: 3 × 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31

# Number of rows for each level of cut.
diamonds %>% count(cut)

## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551

# Rows and columns of the whole data set.
dim(diamonds)

## [1] 53940    10

# The same per-cut tally using base R's table().
table(diamonds$cut)

## 
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551

# Keep only the columns of interest (carat and price).
df1 <- select(diamonds, carat, price)

# Show the first three rows of the selection.
head(df1, n = 3)

## # A tibble: 3 × 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327

# Keep the rows whose cut equals "Good" and show the first three of them.
diamonds %>%
  filter(cut == "Good") %>%
  head(n = 3)

## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73

# Create derived variables: Ratio is price divided by carat, and Double is
# twice Ratio. The result is only printed here, not stored.
diamonds %>%
  mutate(Ratio = price / carat, Double = Ratio * 2) %>%
  head(n = 3)

## # A tibble: 3 × 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.

# Derived variables are only displayed temporarily, so the result has to be
# assigned to a variable to keep it. Only the first three rows are stored.
df3 <- diamonds %>%
  mutate(Ratio = price / carat, Double = Ratio * 2) %>%
  head(n = 3)

df3

## # A tibble: 3 × 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.

# 20250520

# 김선주

# 2025-05-20