# List the objects in the current workspace (empty in a fresh session).
# The workspace is deliberately not wiped with rm(list = ls()): that call
# deletes the objects of whoever sources this script, yet leaves attached
# packages and options untouched, so it does not give a clean session anyway.
# Restart R instead when a clean slate is needed.
ls()
## character(0)
library(modeldata)
## Warning: 패키지 'modeldata'는 R 버전 4.1.3에서 작성되었습니다
# Load the `biomass` data set shipped with modeldata and preview its first
# six rows.
data("biomass", package = "modeldata")
head(biomass, n = 6L)
##                   sample  dataset carbon hydrogen oxygen nitrogen sulfur    HHV
## 1           Akhrot Shell Training  49.81     5.64  42.94     0.41   0.00 20.008
## 2 Alabama Oak Wood Waste Training  49.50     5.70  41.30     0.20   0.00 19.228
## 3                  Alder Training  47.82     5.80  46.25     0.11   0.02 18.299
## 4                Alfalfa Training  45.10     4.97  35.60     3.30   0.16 18.151
## 5     Alfalfa Seed Straw Training  46.76     5.40  40.72     1.00   0.02 18.450
## 6         Alfalfa Stalks Training  45.40     5.75  40.20     2.04   0.10 18.465
# Preview the last six rows; the `dataset` column flags each sample as
# "Training" or "Testing".
tail(biomass, n = 6L)
##                 sample  dataset carbon hydrogen oxygen nitrogen sulfur    HHV
## 531 Wood Sawdust Waste Training  45.97     5.13  48.53     0.12   0.24 18.207
## 532               Wood  Testing  47.80     5.80  39.00     1.40   0.10 17.205
## 533           Wood/Pit Training  48.62     5.78  39.73     0.65   0.06 19.660
## 534   Wood/Wheat Straw Training  47.48     5.81  37.92     0.35   0.12 18.800
## 535      Wyoming Elkol  Testing  71.50     5.30  16.90     1.20   0.90 29.570
## 536         Yard Waste Training  41.54     4.79  31.91     0.85   0.24 16.300
# Compact overview of the column types and the leading values of each column.
str(biomass)
## 'data.frame':    536 obs. of  8 variables:
##  $ sample  : chr  "Akhrot Shell" "Alabama Oak Wood Waste" "Alder" "Alfalfa" ...
##  $ dataset : chr  "Training" "Training" "Training" "Training" ...
##  $ carbon  : num  49.8 49.5 47.8 45.1 46.8 ...
##  $ hydrogen: num  5.64 5.7 5.8 4.97 5.4 5.75 5.99 5.7 5.5 5.9 ...
##  $ oxygen  : num  42.9 41.3 46.2 35.6 40.7 ...
##  $ nitrogen: num  0.41 0.2 0.11 3.3 1 2.04 2.68 1.7 0.8 1.2 ...
##  $ sulfur  : num  0 0 0.02 0.16 0.02 0.1 0.2 0.2 0 0.1 ...
##  $ HHV     : num  20 19.2 18.3 18.2 18.4 ...
# Split the samples into training and test sets using the labels stored in
# the `dataset` column; the original row names are kept.
biomass_tr <- subset(biomass, dataset == "Training")
biomass_te <- subset(biomass, dataset == "Testing")

# Quick look at the first training rows.
head(biomass_tr, n = 6L)
##                   sample  dataset carbon hydrogen oxygen nitrogen sulfur    HHV
## 1           Akhrot Shell Training  49.81     5.64  42.94     0.41   0.00 20.008
## 2 Alabama Oak Wood Waste Training  49.50     5.70  41.30     0.20   0.00 19.228
## 3                  Alder Training  47.82     5.80  46.25     0.11   0.02 18.299
## 4                Alfalfa Training  45.10     4.97  35.60     3.30   0.16 18.151
## 5     Alfalfa Seed Straw Training  46.76     5.40  40.72     1.00   0.02 18.450
## 6         Alfalfa Stalks Training  45.40     5.75  40.20     2.04   0.10 18.465
library(ggplot2)
# Use the black-and-white theme for every plot drawn from here on.
theme_set(theme_bw())

# Histogram of the training-set carbon values in bins of width 5; the dashed
# vertical line marks the carbon value of the first test-set sample.
ggplot(data = biomass_tr, mapping = aes(x = carbon)) +
  geom_histogram(binwidth = 5, colour = "blue", fill = "blue", alpha = 0.5) +
  geom_vline(xintercept = biomass_te$carbon[1], linetype = 2)

library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: dplyr
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
# Declare the modelling roles: HHV is the outcome and the five element
# columns are the predictors. No preprocessing steps are attached yet.
rec <- recipe(
  HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
  data = biomass_tr
)
# head() lists the leading elements of the recipe object.
head(rec, n = 6L)
## $var_info
## # A tibble: 6 x 4
##   variable type    role      source  
##   <chr>    <chr>   <chr>     <chr>   
## 1 carbon   numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen   numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur   numeric predictor original
## 6 HHV      numeric outcome   original
## 
## $term_info
## # A tibble: 6 x 4
##   variable type    role      source  
##   <chr>    <chr>   <chr>     <chr>   
## 1 carbon   numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen   numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur   numeric predictor original
## 6 HHV      numeric outcome   original
## 
## $steps
## NULL
## 
## $template
## # A tibble: 456 x 6
##    carbon hydrogen oxygen nitrogen sulfur   HHV
##     <dbl>    <dbl>  <dbl>    <dbl>  <dbl> <dbl>
##  1   49.8     5.64   42.9     0.41   0     20.0
##  2   49.5     5.7    41.3     0.2    0     19.2
##  3   47.8     5.8    46.2     0.11   0.02  18.3
##  4   45.1     4.97   35.6     3.3    0.16  18.2
##  5   46.8     5.4    40.7     1      0.02  18.4
##  6   45.4     5.75   40.2     2.04   0.1   18.5
##  7   47.2     5.99   38.2     2.68   0.2   18.7
##  8   45.7     5.7    39.7     1.7    0.2   18.3
##  9   48.8     5.5    40.9     0.8    0     18.6
## 10   47.1     5.9    40       1.2    0.1   18.9
## # ... with 446 more rows
## 
## $levels
## NULL
## 
## $retained
## [1] NA
# Add a normalization step to the recipe: once trained, it centres and scales
# carbon and hydrogen to mean zero and standard deviation one.
norm_trans <- step_normalize(rec, carbon, hydrogen)
# Inspect the untrained specification (the step's means and sds are still NULL).
str(norm_trans)
## List of 6
##  $ var_info : tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
##   ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
##   ..$ type    : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
##   ..$ role    : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
##   ..$ source  : chr [1:6] "original" "original" "original" "original" ...
##  $ term_info: tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
##   ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
##   ..$ type    : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
##   ..$ role    : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
##   ..$ source  : chr [1:6] "original" "original" "original" "original" ...
##  $ steps    :List of 1
##   ..$ :List of 8
##   .. ..$ terms  :List of 2
##   .. .. ..$ : language ~carbon
##   .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8> 
##   .. .. ..$ : language ~hydrogen
##   .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8> 
##   .. .. ..- attr(*, "class")= chr [1:2] "quosures" "list"
##   .. ..$ role   : logi NA
##   .. ..$ trained: logi FALSE
##   .. ..$ means  : NULL
##   .. ..$ sds    : NULL
##   .. ..$ na_rm  : logi TRUE
##   .. ..$ skip   : logi FALSE
##   .. ..$ id     : chr "normalize_65Owz"
##   .. ..- attr(*, "class")= chr [1:2] "step_normalize" "step"
##  $ template : tibble [456 x 6] (S3: tbl_df/tbl/data.frame)
##   ..$ carbon  : num [1:456] 49.8 49.5 47.8 45.1 46.8 ...
##   ..$ hydrogen: num [1:456] 5.64 5.7 5.8 4.97 5.4 5.75 5.99 5.7 5.5 5.9 ...
##   ..$ oxygen  : num [1:456] 42.9 41.3 46.2 35.6 40.7 ...
##   ..$ nitrogen: num [1:456] 0.41 0.2 0.11 3.3 1 2.04 2.68 1.7 0.8 1.2 ...
##   ..$ sulfur  : num [1:456] 0 0 0.02 0.16 0.02 0.1 0.2 0.2 0 0.1 ...
##   ..$ HHV     : num [1:456] 20 19.2 18.3 18.2 18.4 ...
##  $ levels   : NULL
##  $ retained : logi NA
##  - attr(*, "class")= chr "recipe"
# Train the recipe on the training set, estimating each step's statistics.
norm_obj <- norm_trans %>%
  prep(training = biomass_tr)
# After prep() the step is marked as trained and carries the means and sds.
str(norm_obj)
## List of 8
##  $ var_info      : tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
##   ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
##   ..$ type    : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
##   ..$ role    : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
##   ..$ source  : chr [1:6] "original" "original" "original" "original" ...
##  $ term_info     : tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
##   ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
##   ..$ type    : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
##   ..$ role    : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
##   ..$ source  : chr [1:6] "original" "original" "original" "original" ...
##  $ steps         :List of 1
##   ..$ :List of 8
##   .. ..$ terms  :List of 2
##   .. .. ..$ : language ~carbon
##   .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8> 
##   .. .. ..$ : language ~hydrogen
##   .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8> 
##   .. .. ..- attr(*, "class")= chr [1:2] "quosures" "list"
##   .. ..$ role   : logi NA
##   .. ..$ trained: logi TRUE
##   .. ..$ means  : Named num [1:2] 48.35 5.46
##   .. .. ..- attr(*, "names")= chr [1:2] "carbon" "hydrogen"
##   .. ..$ sds    : Named num [1:2] 10.4 1.2
##   .. .. ..- attr(*, "names")= chr [1:2] "carbon" "hydrogen"
##   .. ..$ na_rm  : logi TRUE
##   .. ..$ skip   : logi FALSE
##   .. ..$ id     : chr "normalize_65Owz"
##   .. ..- attr(*, "class")= chr [1:2] "step_normalize" "step"
##  $ template      : tibble [456 x 6] (S3: tbl_df/tbl/data.frame)
##   ..$ carbon  : num [1:456] 0.1399 0.1101 -0.0513 -0.3126 -0.1532 ...
##   ..$ hydrogen: num [1:456] 0.1512 0.2013 0.2848 -0.4086 -0.0494 ...
##   ..$ oxygen  : num [1:456] 42.9 41.3 46.2 35.6 40.7 ...
##   ..$ nitrogen: num [1:456] 0.41 0.2 0.11 3.3 1 2.04 2.68 1.7 0.8 1.2 ...
##   ..$ sulfur  : num [1:456] 0 0 0.02 0.16 0.02 0.1 0.2 0.2 0 0.1 ...
##   ..$ HHV     : num [1:456] 20 19.2 18.3 18.2 18.4 ...
##  $ retained      : logi TRUE
##  $ tr_info       :'data.frame':  1 obs. of  2 variables:
##   ..$ nrows    : int 456
##   ..$ ncomplete: int 456
##  $ orig_lvls     :List of 6
##   ..$ carbon  :List of 2
##   .. ..$ values : logi NA
##   .. ..$ ordered: logi NA
##   ..$ hydrogen:List of 2
##   .. ..$ values : logi NA
##   .. ..$ ordered: logi NA
##   ..$ oxygen  :List of 2
##   .. ..$ values : logi NA
##   .. ..$ ordered: logi NA
##   ..$ nitrogen:List of 2
##   .. ..$ values : logi NA
##   .. ..$ ordered: logi NA
##   ..$ sulfur  :List of 2
##   .. ..$ values : logi NA
##   .. ..$ ordered: logi NA
##   ..$ HHV     :List of 2
##   .. ..$ values : logi NA
##   .. ..$ ordered: logi NA
##  $ last_term_info: grouped_df [6 x 6] (S3: grouped_df/tbl_df/tbl/data.frame)
##   ..$ variable: chr [1:6] "carbon" "HHV" "hydrogen" "nitrogen" ...
##   ..$ type    : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
##   ..$ role    :List of 6
##   .. ..$ : chr "predictor"
##   .. ..$ : chr "outcome"
##   .. ..$ : chr "predictor"
##   .. ..$ : chr "predictor"
##   .. ..$ : chr "predictor"
##   .. ..$ : chr "predictor"
##   ..$ source  : chr [1:6] "original" "original" "original" "original" ...
##   ..$ number  : num [1:6] 1 1 1 1 1 1
##   ..$ skip    : logi [1:6] FALSE FALSE FALSE FALSE FALSE FALSE
##   ..- attr(*, "groups")= tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
##   .. ..$ variable: chr [1:6] "carbon" "HHV" "hydrogen" "nitrogen" ...
##   .. ..$ .rows   : list<int> [1:6] 
##   .. .. ..$ : int 1
##   .. .. ..$ : int 2
##   .. .. ..$ : int 3
##   .. .. ..$ : int 4
##   .. .. ..$ : int 5
##   .. .. ..$ : int 6
##   .. .. ..@ ptype: int(0) 
##   .. ..- attr(*, ".drop")= logi TRUE
##  - attr(*, "class")= chr "recipe"
# Print the first six of the eight elements of the trained recipe, including
# the fitted step and the processed training template.
head(norm_obj, n = 6L)
## $var_info
## # A tibble: 6 x 4
##   variable type    role      source  
##   <chr>    <chr>   <chr>     <chr>   
## 1 carbon   numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen   numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur   numeric predictor original
## 6 HHV      numeric outcome   original
## 
## $term_info
## # A tibble: 6 x 4
##   variable type    role      source  
##   <chr>    <chr>   <chr>     <chr>   
## 1 carbon   numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen   numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur   numeric predictor original
## 6 HHV      numeric outcome   original
## 
## $steps
## $steps[[1]]
## $terms
## <list_of<quosure>>
## 
## [[1]]
## <quosure>
## expr: ^carbon
## env:  0x0000000027b724f8
## 
## [[2]]
## <quosure>
## expr: ^hydrogen
## env:  0x0000000027b724f8
## 
## 
## $role
## [1] NA
## 
## $trained
## [1] TRUE
## 
## $means
##    carbon  hydrogen 
## 48.354145  5.459079 
## 
## $sds
##   carbon hydrogen 
## 10.40829  1.19688 
## 
## $na_rm
## [1] TRUE
## 
## $skip
## [1] FALSE
## 
## $id
## [1] "normalize_65Owz"
## 
## attr(,"class")
## [1] "step_normalize" "step"          
## 
## 
## $template
## # A tibble: 456 x 6
##     carbon hydrogen oxygen nitrogen sulfur   HHV
##      <dbl>    <dbl>  <dbl>    <dbl>  <dbl> <dbl>
##  1  0.140    0.151    42.9     0.41   0     20.0
##  2  0.110    0.201    41.3     0.2    0     19.2
##  3 -0.0513   0.285    46.2     0.11   0.02  18.3
##  4 -0.313   -0.409    35.6     3.3    0.16  18.2
##  5 -0.153   -0.0494   40.7     1      0.02  18.4
##  6 -0.284    0.243    40.2     2.04   0.1   18.5
##  7 -0.114    0.444    38.2     2.68   0.2   18.7
##  8 -0.255    0.201    39.7     1.7    0.2   18.3
##  9  0.0428   0.0342   40.9     0.8    0     18.6
## 10 -0.120    0.368    40       1.2    0.1   18.9
## # ... with 446 more rows
## 
## $retained
## [1] TRUE
## 
## $tr_info
##   nrows ncomplete
## 1   456       456
# Apply the trained recipe to the held-out test samples; carbon and hydrogen
# are scaled with the statistics estimated from the training set.
transformed_te <- norm_obj %>%
  bake(new_data = biomass_te)
head(transformed_te, n = 6L)
## # A tibble: 6 x 6
##    carbon hydrogen oxygen nitrogen sulfur   HHV
##     <dbl>    <dbl>  <dbl>    <dbl>  <dbl> <dbl>
## 1 -0.193    0.176    47.2     0.3    0.22  18.3
## 2 -0.490    0.0342   48.1     2.85   0.34  17.6
## 3 -0.543    0.0342   49.1     2.4    0.3   17.2
## 4 -0.188    0.535    37.3     1.8    0.5   18.9
## 5  0.0390   0.719    42.8     0.2    0     20.5
## 6 -0.390    0.0342   41.7     0.7    0.2   18.5
# Show the untransformed test rows for the same columns, for side-by-side
# comparison with the baked values. head() caps the preview at ten rows without
# padding with all-NA rows when the test set is shorter, which 1:10 would do.
head(biomass_te[names(transformed_te)], n = 10L)
##    carbon hydrogen oxygen nitrogen sulfur    HHV
## 15  46.35     5.67  47.20     0.30   0.22 18.275
## 20  43.25     5.50  48.06     2.85   0.34 17.560
## 26  42.70     5.50  49.10     2.40   0.30 17.173
## 31  46.40     6.10  37.30     1.80   0.50 18.851
## 36  48.76     6.32  42.77     0.20   0.00 20.547
## 41  44.30     5.50  41.70     0.70   0.20 18.467
## 46  38.94     5.23  54.13     1.19   0.51 15.095
## 51  42.10     4.66  33.80     0.95   0.20 16.240
## 55  29.20     4.40  31.10     0.14   4.90 11.147
## 65  27.80     3.77  23.69     4.63   1.05 10.750
# Print the baked test set; it is a tibble, so only the first rows are shown.
transformed_te
## # A tibble: 80 x 6
##     carbon hydrogen oxygen nitrogen sulfur   HHV
##      <dbl>    <dbl>  <dbl>    <dbl>  <dbl> <dbl>
##  1 -0.193    0.176    47.2     0.3    0.22  18.3
##  2 -0.490    0.0342   48.1     2.85   0.34  17.6
##  3 -0.543    0.0342   49.1     2.4    0.3   17.2
##  4 -0.188    0.535    37.3     1.8    0.5   18.9
##  5  0.0390   0.719    42.8     0.2    0     20.5
##  6 -0.390    0.0342   41.7     0.7    0.2   18.5
##  7 -0.904   -0.191    54.1     1.19   0.51  15.1
##  8 -0.601   -0.668    33.8     0.95   0.2   16.2
##  9 -1.84    -0.885    31.1     0.14   4.9   11.1
## 10 -1.97    -1.41     23.7     4.63   1.05  10.8
## # ... with 70 more rows
# Before prep() the step has no estimates, so statistic and value are NA.
norm_trans %>% tidy(number = 1)
## # A tibble: 2 x 4
##   terms    statistic value id             
##   <chr>    <chr>     <dbl> <chr>          
## 1 carbon   <NA>         NA normalize_65Owz
## 2 hydrogen <NA>         NA normalize_65Owz
# After prep() the same call reports the learned means and sds.
norm_obj %>% tidy(number = 1)
## # A tibble: 4 x 4
##   terms    statistic value id             
##   <chr>    <chr>     <dbl> <chr>          
## 1 carbon   mean      48.4  normalize_65Owz
## 2 hydrogen mean       5.46 normalize_65Owz
## 3 carbon   sd        10.4  normalize_65Owz
## 4 hydrogen sd         1.20 normalize_65Owz
# Retain the raw predictors alongside their normalized versions:
# step_mutate_at() first copies every numeric predictor into an `*_orig`
# column, then step_normalize() skips those copies and the outcome.
norm_keep_orig <- rec %>%
  step_mutate_at(all_numeric_predictors(), fn = list(orig = ~.)) %>%
  step_normalize(-contains("orig"), -all_outcomes())

# Train on the training set, apply to the test set, and print the result.
keep_orig_obj <- norm_keep_orig %>%
  prep(training = biomass_tr)
keep_orig_te <- keep_orig_obj %>%
  bake(new_data = biomass_te)
keep_orig_te
## # A tibble: 80 x 11
##     carbon hydrogen oxygen nitrogen   sulfur   HHV carbon_orig hydrogen_orig
##      <dbl>    <dbl>  <dbl>    <dbl>    <dbl> <dbl>       <dbl>         <dbl>
##  1 -0.193    0.176   0.801  -0.643   0.00755  18.3        46.4          5.67
##  2 -0.490    0.0342  0.881   1.47    0.281    17.6        43.2          5.5 
##  3 -0.543    0.0342  0.977   1.10    0.190    17.2        42.7          5.5 
##  4 -0.188    0.535  -0.113   0.602   0.646    18.9        46.4          6.1 
##  5  0.0390   0.719   0.392  -0.726  -0.494    20.5        48.8          6.32
##  6 -0.390    0.0342  0.293  -0.311  -0.0380   18.5        44.3          5.5 
##  7 -0.904   -0.191   1.44    0.0958  0.668    15.1        38.9          5.23
##  8 -0.601   -0.668  -0.436  -0.103  -0.0380   16.2        42.1          4.66
##  9 -1.84    -0.885  -0.686  -0.776  10.7      11.1        29.2          4.4 
## 10 -1.97    -1.41   -1.37    2.95    1.90     10.8        27.8          3.77
## # ... with 70 more rows, and 3 more variables: oxygen_orig <dbl>,
## #   nitrogen_orig <dbl>, sulfur_orig <dbl>