Các packages: dplyr, sjmisc

Nhu cầu mã hóa lại (recode) giá trị của các biến số rất thường gặp khi phân tích dữ liệu, ví dụ như đổi “nam” thành 1 và “nữ” thành 0, hoặc chia Age thành các Age group.

Những cách recoding values đơn giản thường được thực hiện thông qua các packages như dplyr, sjmisc.

library(dplyr)
library(sjmisc)

Biến số categorical

Sử dụng function recode() của package dplyr:

  
# Example data: one row per player. Without this definition `df` resolves to
# the function stats::df() and mutate() fails.
df <- data.frame(
  player = c("A", "B", "C", "D"),
  points = c(24, 29, 13, 15),
  result = c("Win", "Loss", "Win", "Loss")
)

# Recode player: "A" and "B" become "Non-mgmt", every other value "Mgmt".
df %>%
  mutate(player2 = recode(player,
    "A" = "Non-mgmt",
    "B" = "Non-mgmt",
    .default = "Mgmt"
  )) %>%
  head()
NA

Cách khác: dùng .default để gán giá trị cho tất cả các trường hợp còn lại

  
# Collapse players "A" and "B" into "Non-mgmt"; every other player falls
# through to the .default label "Mgmt". Show the first rows of the result.
df %>%
  mutate(
    player2 = recode(
      player,
      A = "Non-mgmt",
      B = "Non-mgmt",
      .default = "Mgmt"
    )
  ) %>%
  head()

Biến số numeric

Biến số liên tục như Age, weight … thường được chia nhóm theo các mốc giá trị cố định như mean, median, percentile 75th hoặc nhiều mốc giá trị từ min đến max.

# Load the built-in iris data set and print the summary statistics
# (min, quartiles, median, mean, max) of Sepal.Length.
data(iris)
summary(iris[["Sepal.Length"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.300   5.100   5.800   5.843   6.400   7.900 

Chúng ta dùng function cut() với tham số breaks để chia các khoảng giá trị.

# frq() comes from sjmisc (already loaded); Rmisc is not needed here, and
# attaching it would pull in plyr, which masks dplyr verbs such as mutate().
#
# cut() builds right-closed, left-open intervals by default, so the minimum
# value 4.3 falls outside (4.3, 5] and is recoded to NA. Add
# include.lowest = TRUE to keep it in the first interval.
iris$Sepal.Length.group <- cut(iris$Sepal.Length, breaks = c(4.3, 5, 6, 7.9))
frq(iris$Sepal.Length.group)

x <categorical>
# total N=150  valid N=149  mean=2.20  sd=0.76

Value   |  N | Raw % | Valid % | Cum. %
---------------------------------------
(4.3,5] | 31 | 20.67 |   20.81 |  20.81
(5,6]   | 57 | 38.00 |   38.26 |  59.06
(6,7.9] | 61 | 40.67 |   40.94 | 100.00
<NA>    |  1 |  0.67 |    <NA> |   <NA>

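Từ một biến số numeric Sepal.Length có range là 4.3 - 7.9, chúng ta đã chia các khoảng để trở thành biến số categorical Sepal.Length.group:

(4.3, 5]

(5, 6]

(6, 7.9]

Lưu ý: theo mặc định cut() tạo các khoảng mở bên trái, nên giá trị nhỏ nhất 4.3 không thuộc khoảng nào và trở thành NA (đó là 1 giá trị NA trong bảng tần số ở trên). Thêm include.lowest = TRUE vào cut() nếu muốn đưa giá trị này vào khoảng đầu tiên.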

Khi muốn chia các khoảng đều nhau chúng ta viết như sau:

# Cut Sepal.Length into 3 equal-width, right-closed intervals and tabulate
# how many observations fall into each one.
iris[["Sepal.Length.group"]] <- cut(
  iris[["Sepal.Length"]],
  breaks = 3,
  right = TRUE
)
frq(iris[["Sepal.Length.group"]])

x <categorical>
# total N=150  valid N=150  mean=1.74  sd=0.68

Value     |  N | Raw % | Valid % | Cum. %
-----------------------------------------
(4.3,5.5] | 59 | 39.33 |   39.33 |  39.33
(5.5,6.7] | 71 | 47.33 |   47.33 |  86.67
(6.7,7.9] | 20 | 13.33 |   13.33 | 100.00
<NA>      |  0 |  0.00 |    <NA> |   <NA>

Nếu muốn chia thành 2 khoảng với mốc là mean, ta viết code như sau:

# Split Sepal.Length into two groups at its mean: 0 = below the mean,
# 1 = at or above it. Columns are referenced by bare name inside mutate()
# rather than through iris$..., and iris2 keeps the recoded data frame
# instead of the frequency table.
iris2 <- iris %>%
  mutate(Sepal.Length.gr = ifelse(Sepal.Length < mean(Sepal.Length), 0, 1))

# Frequency table of the new grouping variable.
frq(iris2, Sepal.Length.gr)

Sepal.Length.gr <numeric>
# total N=150  valid N=150  mean=0.47  sd=0.50

Value |  N | Raw % | Valid % | Cum. %
-------------------------------------
    0 | 80 | 53.33 |   53.33 |  53.33
    1 | 70 | 46.67 |   46.67 | 100.00
 <NA> |  0 |  0.00 |    <NA> |   <NA>

Lấy quantile 75th (percentile thứ 75) làm mốc để chia 2 nhóm:

# Split Sepal.Length into two groups at its 75th percentile: 0 = below the
# cut-off, 1 = at or above it. Columns are referenced by bare name inside
# mutate() rather than through iris$..., and iris2 keeps the recoded data
# frame instead of the frequency table.
iris2 <- iris %>%
  mutate(
    Sepal.Length.gr75 = ifelse(
      Sepal.Length < quantile(Sepal.Length, 0.75), 0, 1
    )
  )

# Frequency table of the new grouping variable.
frq(iris2, Sepal.Length.gr75)

Sepal.Length.gr75 <numeric>
# total N=150  valid N=150  mean=0.28  sd=0.45

Value |   N | Raw % | Valid % | Cum. %
--------------------------------------
    0 | 108 | 72.00 |   72.00 |     72
    1 |  42 | 28.00 |   28.00 |    100
 <NA> |   0 |  0.00 |    <NA> |   <NA>

Function case_when

# Reload iris and bin Sepal.Length into three numeric codes:
# 1 = below 5.5, 2 = from 5.5 up to (not including) 6, 3 = 6 and above.
# Show the last rows of the result.
data(iris)
iris %>%
  mutate(
    Sepal.Length.group = case_when(
      Sepal.Length < 5.5 ~ 1,
      Sepal.Length < 6 ~ 2, # only reached when Sepal.Length >= 5.5
      Sepal.Length >= 6 ~ 3
    )
  ) %>%
  tail()
LS0tDQp0aXRsZTogIlJlY29kaW5nIHZhbHVlcyBpbiBSIg0KYXV0aG9yOiAiVGhpZXUgTmd1eWVuIg0KZGF0ZTogIjEwLzMvMjAyMSINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICBkZl9wcmludDogcGFnZWQNCmVkaXRvcl9vcHRpb25zOg0KICBjaHVua19vdXRwdXRfdHlwZTogaW5saW5lDQotLS0NCg0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpDQpgYGANCg0KIyMgQ8OhYyBwYWNrYWdlczogZHBseXIsIHNqbWlzYw0KDQpOaHUgY+G6p3UgxJHhurd0IGzhuqFpIGdpw6EgdHLhu4sgY2hvIGPDoWMgYmnhur9uIHPhu5EgbMOgIHRoxrDhu51uZyBn4bq3cCBraGkgcGjDom4gdMOtY2ggZOG7ryBsaeG7h3UuIFZEIG5oxrAgIm5hbSIgdGjDoG5oIDEsICJu4buvIiB0aMOgbmggMC4gQWdlIMSRxrDhu6NjIGNoaWEgdGjDoG5oIEFnZSBncm91cC4NCg0KTmjhu69uZyBjw6FjaCByZWNvZGluZyB2YWx1ZXMgxJHGoW4gZ2nhuqNuLCB0aMaw4budbmcgxJHGsOG7o2Mgc+G7rSBk4bulbmcgdGjDtG5nIHF1YSBjw6FjIHBhY2thZ2VzIG5oxrAgZHBseXIsIFJtaXNjDQoNCmBgYHtyIH0NCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHNqbWlzYykNCmBgYA0KDQojIyBCaeG6v24gc+G7kSBjYXRlZ29yaWNhbA0KDQpz4butIGThu6VuZyBmdW5jdGlvbiByZWNvZGUoKQ0KDQpgYGB7ciBwcmVzc3VyZX0NCiNjcmVhdGUgZGF0YWZyYW1lIA0KZGYgPC0gZGF0YS5mcmFtZShwbGF5ZXIgPSBjKCdBJywgJ0InLCAnQycsICdEJyksDQogICAgICAgICAgICAgICAgIHBvaW50cyA9IGMoMjQsIDI5LCAxMywgMTUpLA0KICAgICAgICAgICAgICAgICByZXN1bHQgPSBjKCdXaW4nLCAnTG9zcycsICdXaW4nLCAnTG9zcycpKQ0KDQojdmlldyBkYXRhZnJhbWUgDQpkZg0KDQojY2hhbmdlICdXaW4nIGFuZCAnTG9zcycgdG8gJzEnIGFuZCAnMCcNCmRmICU+JSBtdXRhdGUocmVzdWx0Mj1yZWNvZGUocmVzdWx0LCAnV2luJz0nMScsICdMb3NzJz0nMCcpKQ0KYGBgDQoNCkPDoWNoIGtow6FjDQoNCmBgYHtyIH0NCiAgDQpkZiAlPiUgbXV0YXRlKHBsYXllcjIgPSByZWNvZGUocGxheWVyLCANCiAgIkEiID0gIk5vbi1tZ210IiwNCiAgIkIiID0gIk5vbi1tZ210IiwNCiAgLmRlZmF1bHQgPSAiTWdtdCIpKSAgJT4lIA0KICBoZWFkKCkNCg0KYGBgDQoNCiMjIEJp4bq/biBz4buRIG51bWVyaWMgDQoNCkJp4bq/biBz4buRIGxpw6puIHThu6VjIG5oxrAgQWdlLCB3ZWlnaHQgLi4uIHRoxrDhu51uZyDEkcaw4bujYyBjaGlhIG5ow7NtIHRoZW8gY8OhYyBt4buRYyBnacOhIHRy4buLIGPhu5EgxJHhu4tuaCBuaMawIG1lYW4sIG1lZGlhbiwgcGVyY2VudGlsZSA3NXRoIGhv4bq3YyBuaGnhu4F1IG3hu5FjIGdpw6EgdHLhu4sgdOG7qyBtaW4gxJHhur9uIG1heC4NCg0KYGBge3IgfQ0KZGF0YSgiaXJpcyIpDQpzdW1tYXJ5KGlyaXMkU2VwYWwuTGVuZ3RoKQ0KYGBgDQoNCkNow7puZyB0YSBkw7luZyBm
dW5jdGlvbiBjdXQoKSB24bubaSBicmVha3MgPSDEkeG7gyBjaGlhIGPDoWMga2hv4bqjbmcgZ2nDoSB0cuG7iy4NCg0KYGBge3IgfQ0KDQppcmlzJFNlcGFsLkxlbmd0aC5ncm91cCA9IGN1dChpcmlzJFNlcGFsLkxlbmd0aCwgYnJlYWtzID0gYyg0LjMsIDUsIDYsIDcuOSkgLCByaWdodCA9IFRSVUUpDQpmcnEoaXJpcyRTZXBhbC5MZW5ndGguZ3JvdXApDQpgYGANCg0KVOG7qyBt4buZdCBiaeG6v24gc+G7kSBudW1lcmljIFNlcGFsLkxlbmd0aCBjw7MgcmFuZ2UgbMOgIDQuMyAtIDcuOSwgY2jDum5nIHRhIMSRw6MgY2hpYSBjw6FjIGtob+G6o25nIMSR4buDIHRy4bufIHRow6BuaCBiaeG6v24gc+G7kSBjYXRlZ29yaWNhbCBTZXBhbC5MZW5ndGguZ3JvdXA6DQoNCig0LjMgLTVdDQoNCig1LTZdDQoNCig2LTcuOV0NCg0KS2hpIG114buRbiBjaGlhIGPDoWMga2hv4bqjbmcgxJHhu4F1IG5oYXUgY2jDum5nIHRhIHZp4bq/dCBuaMawIHNhdToNCg0KYGBge3IgfQ0KaXJpcyRTZXBhbC5MZW5ndGguZ3JvdXAgPSBjdXQoaXJpcyRTZXBhbC5MZW5ndGgsIDMgLCByaWdodCA9IFRSVUUpDQpmcnEoaXJpcyRTZXBhbC5MZW5ndGguZ3JvdXApDQoNCmBgYA0KDQpu4bq/dSBtdeG7kW4gY2hpYSAyIGtob+G6o25nIHbhu5tpIG3hu5FjIGzDoCBtZWFuIHRhIHZp4bq/dCBjb2RlIG5oxrAgc2F1Lg0KDQpgYGB7ciB9DQppcmlzMiA8LSBpcmlzICU+JSBtdXRhdGUoU2VwYWwuTGVuZ3RoLmdyID0gaWZlbHNlKGlyaXMkU2VwYWwuTGVuZ3RoIDwgbWVhbihpcmlzJFNlcGFsLkxlbmd0aCksIDAsIDEpKSAlPiUNCiAgZnJxKFNlcGFsLkxlbmd0aC5ncikgJT4lIHByaW50KCkNCmBgYA0KDQpM4bqleSBt4buRYyBxdWFudGlsZSA3NXRoIGzDoG0gbeG7kWMgxJHhu4MgY2hpYSAyIG5ow7NtDQoNCmBgYHtyIH0NCmlyaXMyIDwtIGlyaXMgJT4lIG11dGF0ZShTZXBhbC5MZW5ndGguZ3I3NSA9IGlmZWxzZShpcmlzJFNlcGFsLkxlbmd0aCA8IHF1YW50aWxlKChpcmlzJFNlcGFsLkxlbmd0aCksIDAuNzUpLCAwLCAxKSkgJT4lDQogIGZycShTZXBhbC5MZW5ndGguZ3I3NSkgJT4lIHByaW50KCkNCmBgYA0KDQojIyBGdW5jdGlvbiBjYXNlX3doZW4NCg0KYGBge3J9DQpkYXRhKCJpcmlzIikNCmlyaXMgJT4lIG11dGF0ZShTZXBhbC5MZW5ndGguZ3JvdXAgPSBjYXNlX3doZW4oDQogIFNlcGFsLkxlbmd0aCA8IDUuNSAgICAgICAgICAgICAgICAgICAgIH4gMSwNCiAgU2VwYWwuTGVuZ3RoID49IDUuNSAmIFNlcGFsLkxlbmd0aCA8IDYgfiAyLA0KICBTZXBhbC5MZW5ndGggPj0gNiAgICAgICAgICAgICAgICAgICAgICAgfiAzDQopKSAlPiUgdGFpbCgpDQpgYGANCg0K