# Show which objects already exist in the global environment.
# The workspace is deliberately NOT wiped with `rm(list = ls())`: that call
# silently deletes the caller's objects when the script is sourced into a live
# session, yet it neither detaches packages nor resets options, so it does not
# provide a genuinely fresh session. Restart R to get a clean state instead.
ls()
## character(0)
library(modeldata)
## Warning: 패키지 'modeldata'는 R 버전 4.1.3에서 작성되었습니다
# Load the `biomass` data set from the modeldata package into the workspace
# and preview its first six rows.
data("biomass", package = "modeldata")
head(biomass, n = 6L)
## sample dataset carbon hydrogen oxygen nitrogen sulfur HHV
## 1 Akhrot Shell Training 49.81 5.64 42.94 0.41 0.00 20.008
## 2 Alabama Oak Wood Waste Training 49.50 5.70 41.30 0.20 0.00 19.228
## 3 Alder Training 47.82 5.80 46.25 0.11 0.02 18.299
## 4 Alfalfa Training 45.10 4.97 35.60 3.30 0.16 18.151
## 5 Alfalfa Seed Straw Training 46.76 5.40 40.72 1.00 0.02 18.450
## 6 Alfalfa Stalks Training 45.40 5.75 40.20 2.04 0.10 18.465
# Preview the last six rows of the data set.
tail(biomass, n = 6L)
## sample dataset carbon hydrogen oxygen nitrogen sulfur HHV
## 531 Wood Sawdust Waste Training 45.97 5.13 48.53 0.12 0.24 18.207
## 532 Wood Testing 47.80 5.80 39.00 1.40 0.10 17.205
## 533 Wood/Pit Training 48.62 5.78 39.73 0.65 0.06 19.660
## 534 Wood/Wheat Straw Training 47.48 5.81 37.92 0.35 0.12 18.800
## 535 Wyoming Elkol Testing 71.50 5.30 16.90 1.20 0.90 29.570
## 536 Yard Waste Training 41.54 4.79 31.91 0.85 0.24 16.300
# Compact overview of the column names and types.
utils::str(biomass)
## 'data.frame': 536 obs. of 8 variables:
## $ sample : chr "Akhrot Shell" "Alabama Oak Wood Waste" "Alder" "Alfalfa" ...
## $ dataset : chr "Training" "Training" "Training" "Training" ...
## $ carbon : num 49.8 49.5 47.8 45.1 46.8 ...
## $ hydrogen: num 5.64 5.7 5.8 4.97 5.4 5.75 5.99 5.7 5.5 5.9 ...
## $ oxygen : num 42.9 41.3 46.2 35.6 40.7 ...
## $ nitrogen: num 0.41 0.2 0.11 3.3 1 2.04 2.68 1.7 0.8 1.2 ...
## $ sulfur : num 0 0 0.02 0.16 0.02 0.1 0.2 0.2 0 0.1 ...
## $ HHV : num 20 19.2 18.3 18.2 18.4 ...
# Partition the rows by the `dataset` label into a training frame and a
# testing frame, then preview the training portion.
biomass_tr <- biomass[biomass[["dataset"]] == "Training", , drop = FALSE]
biomass_te <- biomass[biomass[["dataset"]] == "Testing", , drop = FALSE]
head(biomass_tr, n = 6L)
## sample dataset carbon hydrogen oxygen nitrogen sulfur HHV
## 1 Akhrot Shell Training 49.81 5.64 42.94 0.41 0.00 20.008
## 2 Alabama Oak Wood Waste Training 49.50 5.70 41.30 0.20 0.00 19.228
## 3 Alder Training 47.82 5.80 46.25 0.11 0.02 18.299
## 4 Alfalfa Training 45.10 4.97 35.60 3.30 0.16 18.151
## 5 Alfalfa Seed Straw Training 46.76 5.40 40.72 1.00 0.02 18.450
## 6 Alfalfa Stalks Training 45.40 5.75 40.20 2.04 0.10 18.465
library(ggplot2)
# Use the black-and-white theme for the plots that follow.
theme_set(theme_bw())

# Histogram of carbon in the training rows (bin width 5). The dashed vertical
# line marks the carbon value of the first testing row.
ggplot(data = biomass_tr, mapping = aes(x = carbon)) +
  geom_histogram(binwidth = 5, colour = "blue", fill = "blue", alpha = 0.5) +
  geom_vline(xintercept = biomass_te$carbon[1], linetype = 2)

library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: dplyr
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
# Declare the recipe: HHV is the outcome and the five element measurements
# are the predictors; the training rows are supplied as `data`.
rec <- recipe(
  HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
  data = biomass_tr
)
# Show the first six components of the recipe object.
head(rec, n = 6L)
## $var_info
## # A tibble: 6 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 carbon numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur numeric predictor original
## 6 HHV numeric outcome original
##
## $term_info
## # A tibble: 6 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 carbon numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur numeric predictor original
## 6 HHV numeric outcome original
##
## $steps
## NULL
##
## $template
## # A tibble: 456 x 6
## carbon hydrogen oxygen nitrogen sulfur HHV
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 49.8 5.64 42.9 0.41 0 20.0
## 2 49.5 5.7 41.3 0.2 0 19.2
## 3 47.8 5.8 46.2 0.11 0.02 18.3
## 4 45.1 4.97 35.6 3.3 0.16 18.2
## 5 46.8 5.4 40.7 1 0.02 18.4
## 6 45.4 5.75 40.2 2.04 0.1 18.5
## 7 47.2 5.99 38.2 2.68 0.2 18.7
## 8 45.7 5.7 39.7 1.7 0.2 18.3
## 9 48.8 5.5 40.9 0.8 0 18.6
## 10 47.1 5.9 40 1.2 0.1 18.9
## # ... with 446 more rows
##
## $levels
## NULL
##
## $retained
## [1] NA
# Specify a recipe step that normalizes the numeric columns `carbon` and
# `hydrogen` so that each has a mean of zero and a standard deviation of one.
# Nothing is estimated at this point; inspect the untrained specification.
norm_trans <- step_normalize(rec, carbon, hydrogen)
str(norm_trans)
## List of 6
## $ var_info : tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
## ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
## ..$ type : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
## ..$ role : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
## ..$ source : chr [1:6] "original" "original" "original" "original" ...
## $ term_info: tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
## ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
## ..$ type : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
## ..$ role : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
## ..$ source : chr [1:6] "original" "original" "original" "original" ...
## $ steps :List of 1
## ..$ :List of 8
## .. ..$ terms :List of 2
## .. .. ..$ : language ~carbon
## .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8>
## .. .. ..$ : language ~hydrogen
## .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8>
## .. .. ..- attr(*, "class")= chr [1:2] "quosures" "list"
## .. ..$ role : logi NA
## .. ..$ trained: logi FALSE
## .. ..$ means : NULL
## .. ..$ sds : NULL
## .. ..$ na_rm : logi TRUE
## .. ..$ skip : logi FALSE
## .. ..$ id : chr "normalize_65Owz"
## .. ..- attr(*, "class")= chr [1:2] "step_normalize" "step"
## $ template : tibble [456 x 6] (S3: tbl_df/tbl/data.frame)
## ..$ carbon : num [1:456] 49.8 49.5 47.8 45.1 46.8 ...
## ..$ hydrogen: num [1:456] 5.64 5.7 5.8 4.97 5.4 5.75 5.99 5.7 5.5 5.9 ...
## ..$ oxygen : num [1:456] 42.9 41.3 46.2 35.6 40.7 ...
## ..$ nitrogen: num [1:456] 0.41 0.2 0.11 3.3 1 2.04 2.68 1.7 0.8 1.2 ...
## ..$ sulfur : num [1:456] 0 0 0.02 0.16 0.02 0.1 0.2 0.2 0 0.1 ...
## ..$ HHV : num [1:456] 20 19.2 18.3 18.2 18.4 ...
## $ levels : NULL
## $ retained : logi NA
## - attr(*, "class")= chr "recipe"
# Train the recipe on the training rows (this estimates the means and
# standard deviations used by the normalization step) and inspect the result.
norm_obj <- norm_trans %>%
  prep(training = biomass_tr)
str(norm_obj)
## List of 8
## $ var_info : tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
## ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
## ..$ type : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
## ..$ role : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
## ..$ source : chr [1:6] "original" "original" "original" "original" ...
## $ term_info : tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
## ..$ variable: chr [1:6] "carbon" "hydrogen" "oxygen" "nitrogen" ...
## ..$ type : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
## ..$ role : chr [1:6] "predictor" "predictor" "predictor" "predictor" ...
## ..$ source : chr [1:6] "original" "original" "original" "original" ...
## $ steps :List of 1
## ..$ :List of 8
## .. ..$ terms :List of 2
## .. .. ..$ : language ~carbon
## .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8>
## .. .. ..$ : language ~hydrogen
## .. .. .. ..- attr(*, ".Environment")=<environment: 0x0000000027b724f8>
## .. .. ..- attr(*, "class")= chr [1:2] "quosures" "list"
## .. ..$ role : logi NA
## .. ..$ trained: logi TRUE
## .. ..$ means : Named num [1:2] 48.35 5.46
## .. .. ..- attr(*, "names")= chr [1:2] "carbon" "hydrogen"
## .. ..$ sds : Named num [1:2] 10.4 1.2
## .. .. ..- attr(*, "names")= chr [1:2] "carbon" "hydrogen"
## .. ..$ na_rm : logi TRUE
## .. ..$ skip : logi FALSE
## .. ..$ id : chr "normalize_65Owz"
## .. ..- attr(*, "class")= chr [1:2] "step_normalize" "step"
## $ template : tibble [456 x 6] (S3: tbl_df/tbl/data.frame)
## ..$ carbon : num [1:456] 0.1399 0.1101 -0.0513 -0.3126 -0.1532 ...
## ..$ hydrogen: num [1:456] 0.1512 0.2013 0.2848 -0.4086 -0.0494 ...
## ..$ oxygen : num [1:456] 42.9 41.3 46.2 35.6 40.7 ...
## ..$ nitrogen: num [1:456] 0.41 0.2 0.11 3.3 1 2.04 2.68 1.7 0.8 1.2 ...
## ..$ sulfur : num [1:456] 0 0 0.02 0.16 0.02 0.1 0.2 0.2 0 0.1 ...
## ..$ HHV : num [1:456] 20 19.2 18.3 18.2 18.4 ...
## $ retained : logi TRUE
## $ tr_info :'data.frame': 1 obs. of 2 variables:
## ..$ nrows : int 456
## ..$ ncomplete: int 456
## $ orig_lvls :List of 6
## ..$ carbon :List of 2
## .. ..$ values : logi NA
## .. ..$ ordered: logi NA
## ..$ hydrogen:List of 2
## .. ..$ values : logi NA
## .. ..$ ordered: logi NA
## ..$ oxygen :List of 2
## .. ..$ values : logi NA
## .. ..$ ordered: logi NA
## ..$ nitrogen:List of 2
## .. ..$ values : logi NA
## .. ..$ ordered: logi NA
## ..$ sulfur :List of 2
## .. ..$ values : logi NA
## .. ..$ ordered: logi NA
## ..$ HHV :List of 2
## .. ..$ values : logi NA
## .. ..$ ordered: logi NA
## $ last_term_info: grouped_df [6 x 6] (S3: grouped_df/tbl_df/tbl/data.frame)
## ..$ variable: chr [1:6] "carbon" "HHV" "hydrogen" "nitrogen" ...
## ..$ type : chr [1:6] "numeric" "numeric" "numeric" "numeric" ...
## ..$ role :List of 6
## .. ..$ : chr "predictor"
## .. ..$ : chr "outcome"
## .. ..$ : chr "predictor"
## .. ..$ : chr "predictor"
## .. ..$ : chr "predictor"
## .. ..$ : chr "predictor"
## ..$ source : chr [1:6] "original" "original" "original" "original" ...
## ..$ number : num [1:6] 1 1 1 1 1 1
## ..$ skip : logi [1:6] FALSE FALSE FALSE FALSE FALSE FALSE
## ..- attr(*, "groups")= tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
## .. ..$ variable: chr [1:6] "carbon" "HHV" "hydrogen" "nitrogen" ...
## .. ..$ .rows : list<int> [1:6]
## .. .. ..$ : int 1
## .. .. ..$ : int 2
## .. .. ..$ : int 3
## .. .. ..$ : int 4
## .. .. ..$ : int 5
## .. .. ..$ : int 6
## .. .. ..@ ptype: int(0)
## .. ..- attr(*, ".drop")= logi TRUE
## - attr(*, "class")= chr "recipe"
# Show the first six components of the trained recipe object.
head(norm_obj, n = 6L)
## $var_info
## # A tibble: 6 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 carbon numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur numeric predictor original
## 6 HHV numeric outcome original
##
## $term_info
## # A tibble: 6 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 carbon numeric predictor original
## 2 hydrogen numeric predictor original
## 3 oxygen numeric predictor original
## 4 nitrogen numeric predictor original
## 5 sulfur numeric predictor original
## 6 HHV numeric outcome original
##
## $steps
## $steps[[1]]
## $terms
## <list_of<quosure>>
##
## [[1]]
## <quosure>
## expr: ^carbon
## env: 0x0000000027b724f8
##
## [[2]]
## <quosure>
## expr: ^hydrogen
## env: 0x0000000027b724f8
##
##
## $role
## [1] NA
##
## $trained
## [1] TRUE
##
## $means
## carbon hydrogen
## 48.354145 5.459079
##
## $sds
## carbon hydrogen
## 10.40829 1.19688
##
## $na_rm
## [1] TRUE
##
## $skip
## [1] FALSE
##
## $id
## [1] "normalize_65Owz"
##
## attr(,"class")
## [1] "step_normalize" "step"
##
##
## $template
## # A tibble: 456 x 6
## carbon hydrogen oxygen nitrogen sulfur HHV
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.140 0.151 42.9 0.41 0 20.0
## 2 0.110 0.201 41.3 0.2 0 19.2
## 3 -0.0513 0.285 46.2 0.11 0.02 18.3
## 4 -0.313 -0.409 35.6 3.3 0.16 18.2
## 5 -0.153 -0.0494 40.7 1 0.02 18.4
## 6 -0.284 0.243 40.2 2.04 0.1 18.5
## 7 -0.114 0.444 38.2 2.68 0.2 18.7
## 8 -0.255 0.201 39.7 1.7 0.2 18.3
## 9 0.0428 0.0342 40.9 0.8 0 18.6
## 10 -0.120 0.368 40 1.2 0.1 18.9
## # ... with 446 more rows
##
## $retained
## [1] TRUE
##
## $tr_info
## nrows ncomplete
## 1 456 456
# Apply the trained recipe to the testing rows and preview the result.
transformed_te <- bake(norm_obj, new_data = biomass_te)
head(transformed_te, n = 6L)
## # A tibble: 6 x 6
## carbon hydrogen oxygen nitrogen sulfur HHV
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.193 0.176 47.2 0.3 0.22 18.3
## 2 -0.490 0.0342 48.1 2.85 0.34 17.6
## 3 -0.543 0.0342 49.1 2.4 0.3 17.2
## 4 -0.188 0.535 37.3 1.8 0.5 18.9
## 5 0.0390 0.719 42.8 0.2 0 20.5
## 6 -0.390 0.0342 41.7 0.7 0.2 18.5
# First ten untransformed testing rows, restricted to the baked columns, for
# side-by-side comparison with `transformed_te`.
biomass_te[seq_len(10), colnames(transformed_te)]
## carbon hydrogen oxygen nitrogen sulfur HHV
## 15 46.35 5.67 47.20 0.30 0.22 18.275
## 20 43.25 5.50 48.06 2.85 0.34 17.560
## 26 42.70 5.50 49.10 2.40 0.30 17.173
## 31 46.40 6.10 37.30 1.80 0.50 18.851
## 36 48.76 6.32 42.77 0.20 0.00 20.547
## 41 44.30 5.50 41.70 0.70 0.20 18.467
## 46 38.94 5.23 54.13 1.19 0.51 15.095
## 51 42.10 4.66 33.80 0.95 0.20 16.240
## 55 29.20 4.40 31.10 0.14 4.90 11.147
## 65 27.80 3.77 23.69 4.63 1.05 10.750
# Print the baked testing set.
print(transformed_te)
## # A tibble: 80 x 6
## carbon hydrogen oxygen nitrogen sulfur HHV
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.193 0.176 47.2 0.3 0.22 18.3
## 2 -0.490 0.0342 48.1 2.85 0.34 17.6
## 3 -0.543 0.0342 49.1 2.4 0.3 17.2
## 4 -0.188 0.535 37.3 1.8 0.5 18.9
## 5 0.0390 0.719 42.8 0.2 0 20.5
## 6 -0.390 0.0342 41.7 0.7 0.2 18.5
## 7 -0.904 -0.191 54.1 1.19 0.51 15.1
## 8 -0.601 -0.668 33.8 0.95 0.2 16.2
## 9 -1.84 -0.885 31.1 0.14 4.9 11.1
## 10 -1.97 -1.41 23.7 4.63 1.05 10.8
## # ... with 70 more rows
# Tidy summary of step 1 before training: the statistics are still NA.
norm_trans %>%
  tidy(number = 1)
## # A tibble: 2 x 4
## terms statistic value id
## <chr> <chr> <dbl> <chr>
## 1 carbon <NA> NA normalize_65Owz
## 2 hydrogen <NA> NA normalize_65Owz
# Tidy summary of step 1 after training: the estimated mean and sd per column.
norm_obj %>%
  tidy(number = 1)
## # A tibble: 4 x 4
## terms statistic value id
## <chr> <chr> <dbl> <chr>
## 1 carbon mean 48.4 normalize_65Owz
## 2 hydrogen mean 5.46 normalize_65Owz
## 3 carbon sd 10.4 normalize_65Owz
## 4 hydrogen sd 1.20 normalize_65Owz
# To keep the original variables in the output, first copy every numeric
# predictor into a `*_orig` column with `step_mutate_at`, then normalize
# everything except those copies and the outcome.
norm_keep_orig <- rec %>%
  step_mutate_at(all_numeric_predictors(), fn = list(orig = ~.)) %>%
  step_normalize(-contains("orig"), -all_outcomes())

# Train on the training rows, apply to the testing rows, and print the result.
keep_orig_obj <- norm_keep_orig %>%
  prep(training = biomass_tr)
keep_orig_te <- keep_orig_obj %>%
  bake(new_data = biomass_te)
print(keep_orig_te)
## # A tibble: 80 x 11
## carbon hydrogen oxygen nitrogen sulfur HHV carbon_orig hydrogen_orig
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.193 0.176 0.801 -0.643 0.00755 18.3 46.4 5.67
## 2 -0.490 0.0342 0.881 1.47 0.281 17.6 43.2 5.5
## 3 -0.543 0.0342 0.977 1.10 0.190 17.2 42.7 5.5
## 4 -0.188 0.535 -0.113 0.602 0.646 18.9 46.4 6.1
## 5 0.0390 0.719 0.392 -0.726 -0.494 20.5 48.8 6.32
## 6 -0.390 0.0342 0.293 -0.311 -0.0380 18.5 44.3 5.5
## 7 -0.904 -0.191 1.44 0.0958 0.668 15.1 38.9 5.23
## 8 -0.601 -0.668 -0.436 -0.103 -0.0380 16.2 42.1 4.66
## 9 -1.84 -0.885 -0.686 -0.776 10.7 11.1 29.2 4.4
## 10 -1.97 -1.41 -1.37 2.95 1.90 10.8 27.8 3.77
## # ... with 70 more rows, and 3 more variables: oxygen_orig <dbl>,
## # nitrogen_orig <dbl>, sulfur_orig <dbl>