Các packages: dplyr, sjmisc
Nhu cầu đặt lại giá trị cho các biến số là thường gặp khi phân tích dữ liệu. VD như “nam” thành 1, “nữ” thành 0. Age được chia thành Age group.
Những cách recoding values đơn giản thường được thực hiện thông qua các packages như dplyr, sjmisc.
library(dplyr)
library(sjmisc)
Biến số categorical
sử dụng function recode()
# Example data: four players with their points and match result.
# `df` has to be created before it is piped; otherwise the name resolves to
# stats::df() (a function) and mutate() fails.
df <- data.frame(
  player = c("A", "B", "C", "D"),
  points = c(24, 29, 13, 15),
  result = c("Win", "Loss", "Win", "Loss")
)

# Recode players "A" and "B" to "Non-mgmt"; any other value falls through to
# .default and becomes "Mgmt". The result is stored in a new column, player2.
df %>%
  mutate(player2 = recode(player,
    "A" = "Non-mgmt",
    "B" = "Non-mgmt",
    .default = "Mgmt"
  )) %>%
  head()
NA
Cách khác
# Same recoding as above, written with the old -> new mapping as a named
# character vector that is spliced into recode() with `!!!`.
df %>%
  mutate(
    player2 = recode(
      player,
      !!!c(A = "Non-mgmt", B = "Non-mgmt"),
      .default = "Mgmt"
    )
  ) %>%
  head()
Biến số numeric
Biến số liên tục như Age, weight … thường được chia nhóm theo các mốc giá trị cố định như mean, median, percentile 75th hoặc nhiều mốc giá trị từ min đến max.
# Load the built-in iris data set and print the summary statistics of sepal
# length (min, quartiles, median, mean, max) used to choose cut points.
data(iris)
summary(iris[["Sepal.Length"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.300 5.100 5.800 5.843 6.400 7.900
Chúng ta dùng function cut() với breaks = để chia các khoảng giá trị.
# NOTE(review): frq() appears to come from sjmisc (loaded earlier); Rmisc does
# not look necessary here and attaches plyr, which can mask dplyr verbs --
# confirm before removing.
library(Rmisc)

# Bin Sepal.Length at fixed cut points. cut() builds intervals that are open
# on the left and closed on the right by default, so an observation equal to
# the first break (4.3) falls in no interval and becomes NA; passing
# include.lowest = TRUE would keep it in the first bin.
iris[["Sepal.Length.group"]] <- cut(
  iris[["Sepal.Length"]],
  breaks = c(4.3, 5, 6, 7.9)
)

# Frequency table of the new grouping variable.
frq(iris[["Sepal.Length.group"]])
x <categorical>
# total N=150 valid N=149 mean=2.20 sd=0.76
Value | N | Raw % | Valid % | Cum. %
---------------------------------------
(4.3,5] | 31 | 20.67 | 20.81 | 20.81
(5,6] | 57 | 38.00 | 38.26 | 59.06
(6,7.9] | 61 | 40.67 | 40.94 | 100.00
<NA> | 1 | 0.67 | <NA> | <NA>
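Từ một biến số numeric Sepal.Length có range là 4.3 - 7.9, chúng ta đã chia các khoảng để trở thành biến số categorical Sepal.Length.group:
(4.3, 5]
(5, 6]
(6, 7.9]
Lưu ý: các khoảng mặc định mở bên trái, nên giá trị nhỏ nhất 4.3 không thuộc khoảng nào và bị chuyển thành NA (1 quan sát <NA> trong bảng trên). Thêm include.lowest = TRUE vào cut() nếu muốn giữ giá trị này trong khoảng đầu tiên.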
Khi muốn chia các khoảng đều nhau chúng ta viết như sau:
# A single number as `breaks` asks cut() for that many equal-width intervals
# over the range of the data; right = TRUE makes each interval closed on the
# right.
iris[["Sepal.Length.group"]] <- cut(
  iris[["Sepal.Length"]],
  breaks = 3,
  right = TRUE
)

# Frequency table of the three equal-width groups.
frq(iris[["Sepal.Length.group"]])
x <categorical>
# total N=150 valid N=150 mean=1.74 sd=0.68
Value | N | Raw % | Valid % | Cum. %
-----------------------------------------
(4.3,5.5] | 59 | 39.33 | 39.33 | 39.33
(5.5,6.7] | 71 | 47.33 | 47.33 | 86.67
(6.7,7.9] | 20 | 13.33 | 13.33 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
Nếu muốn chia 2 khoảng với mốc là mean, ta viết code như sau:
# Split Sepal.Length into two groups at its mean:
# 0 = below the mean, 1 = at or above the mean.
# The recoded data frame is stored in iris2 and tabulated in a separate step;
# assigning the whole pipeline ending in frq() %>% print() would store the
# frequency table in iris2 instead of the data.
# Inside mutate() the column is referenced directly rather than via iris$, so
# the expression always uses the data being piped in.
iris2 <- iris %>%
  mutate(Sepal.Length.gr = ifelse(Sepal.Length < mean(Sepal.Length), 0, 1))

# Frequency table of the new 0/1 variable.
print(frq(iris2, Sepal.Length.gr))
Sepal.Length.gr <numeric>
# total N=150 valid N=150 mean=0.47 sd=0.50
Value | N | Raw % | Valid % | Cum. %
-------------------------------------
0 | 80 | 53.33 | 53.33 | 53.33
1 | 70 | 46.67 | 46.67 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
Lấy quantile 75th (percentile thứ 75) làm mốc để chia 2 nhóm:
# Split Sepal.Length into two groups at its 75th percentile:
# 0 = below the 75th percentile, 1 = at or above it.
# The recoded data frame is stored in iris2 and tabulated in a separate step;
# assigning the whole pipeline ending in frq() %>% print() would store the
# frequency table in iris2 instead of the data.
# Inside mutate() the column is referenced directly rather than via iris$, so
# the expression always uses the data being piped in.
iris2 <- iris %>%
  mutate(
    Sepal.Length.gr75 = ifelse(
      Sepal.Length < quantile(Sepal.Length, probs = 0.75),
      0,
      1
    )
  )

# Frequency table of the new 0/1 variable.
print(frq(iris2, Sepal.Length.gr75))
Sepal.Length.gr75 <numeric>
# total N=150 valid N=150 mean=0.28 sd=0.45
Value | N | Raw % | Valid % | Cum. %
--------------------------------------
0 | 108 | 72.00 | 72.00 | 72
1 | 42 | 28.00 | 28.00 | 100
<NA> | 0 | 0.00 | <NA> | <NA>
Function case_when
# Reload iris, then assign each row to one of three sepal-length groups with
# case_when(): 1 = below 5.5, 2 = from 5.5 up to (not including) 6,
# 3 = 6 and above. Conditions are checked top to bottom and the first match
# wins, so the middle band only needs its lower bound once ">= 6" has been
# handled. Show the last rows of the result.
data(iris)
iris %>%
  mutate(
    Sepal.Length.group = case_when(
      Sepal.Length >= 6 ~ 3,
      Sepal.Length >= 5.5 ~ 2,
      Sepal.Length < 5.5 ~ 1
    )
  ) %>%
  tail(n = 6L)
LS0tDQp0aXRsZTogIlJlY29kaW5nIHZhbHVlcyBpbiBSIg0KYXV0aG9yOiAiVGhpZXUgTmd1eWVuIg0KZGF0ZTogIjEwLzMvMjAyMSINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICBkZl9wcmludDogcGFnZWQNCmVkaXRvcl9vcHRpb25zOg0KICBjaHVua19vdXRwdXRfdHlwZTogaW5saW5lDQotLS0NCg0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpDQpgYGANCg0KIyMgQ8OhYyBwYWNrYWdlczogZHBseXIsIHNqbWlzYw0KDQpOaHUgY+G6p3UgxJHhurd0IGzhuqFpIGdpw6EgdHLhu4sgY2hvIGPDoWMgYmnhur9uIHPhu5EgbMOgIHRoxrDhu51uZyBn4bq3cCBraGkgcGjDom4gdMOtY2ggZOG7ryBsaeG7h3UuIFZEIG5oxrAgIm5hbSIgdGjDoG5oIDEsICJu4buvIiB0aMOgbmggMC4gQWdlIMSRxrDhu6NjIGNoaWEgdGjDoG5oIEFnZSBncm91cC4NCg0KTmjhu69uZyBjw6FjaCByZWNvZGluZyB2YWx1ZXMgxJHGoW4gZ2nhuqNuLCB0aMaw4budbmcgxJHGsOG7o2Mgc+G7rSBk4bulbmcgdGjDtG5nIHF1YSBjw6FjIHBhY2thZ2VzIG5oxrAgZHBseXIsIFJtaXNjDQoNCmBgYHtyIH0NCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHNqbWlzYykNCmBgYA0KDQojIyBCaeG6v24gc+G7kSBjYXRlZ29yaWNhbA0KDQpz4butIGThu6VuZyBmdW5jdGlvbiByZWNvZGUoKQ0KDQpgYGB7ciBwcmVzc3VyZX0NCiNjcmVhdGUgZGF0YWZyYW1lIA0KZGYgPC0gZGF0YS5mcmFtZShwbGF5ZXIgPSBjKCdBJywgJ0InLCAnQycsICdEJyksDQogICAgICAgICAgICAgICAgIHBvaW50cyA9IGMoMjQsIDI5LCAxMywgMTUpLA0KICAgICAgICAgICAgICAgICByZXN1bHQgPSBjKCdXaW4nLCAnTG9zcycsICdXaW4nLCAnTG9zcycpKQ0KDQojdmlldyBkYXRhZnJhbWUgDQpkZg0KDQojY2hhbmdlICdXaW4nIGFuZCAnTG9zcycgdG8gJzEnIGFuZCAnMCcNCmRmICU+JSBtdXRhdGUocmVzdWx0Mj1yZWNvZGUocmVzdWx0LCAnV2luJz0nMScsICdMb3NzJz0nMCcpKQ0KYGBgDQoNCkPDoWNoIGtow6FjDQoNCmBgYHtyIH0NCiAgDQpkZiAlPiUgbXV0YXRlKHBsYXllcjIgPSByZWNvZGUocGxheWVyLCANCiAgIkEiID0gIk5vbi1tZ210IiwNCiAgIkIiID0gIk5vbi1tZ210IiwNCiAgLmRlZmF1bHQgPSAiTWdtdCIpKSAgJT4lIA0KICBoZWFkKCkNCg0KYGBgDQoNCiMjIEJp4bq/biBz4buRIG51bWVyaWMgDQoNCkJp4bq/biBz4buRIGxpw6puIHThu6VjIG5oxrAgQWdlLCB3ZWlnaHQgLi4uIHRoxrDhu51uZyDEkcaw4bujYyBjaGlhIG5ow7NtIHRoZW8gY8OhYyBt4buRYyBnacOhIHRy4buLIGPhu5EgxJHhu4tuaCBuaMawIG1lYW4sIG1lZGlhbiwgcGVyY2VudGlsZSA3NXRoIGhv4bq3YyBuaGnhu4F1IG3hu5FjIGdpw6EgdHLhu4sgdOG7qyBtaW4gxJHhur9uIG1heC4NCg0KYGBge3IgfQ0KZGF0YSgiaXJpcyIpDQpzdW1tYXJ5KGlyaXMkU2VwYWwuTGVuZ3RoKQ0KYGBgDQoNCkNow7puZyB0YSBkw7luZyBm
dW5jdGlvbiBjdXQoKSB24bubaSBicmVha3MgPSDEkeG7gyBjaGlhIGPDoWMga2hv4bqjbmcgZ2nDoSB0cuG7iy4NCg0KYGBge3IgfQ0KDQppcmlzJFNlcGFsLkxlbmd0aC5ncm91cCA9IGN1dChpcmlzJFNlcGFsLkxlbmd0aCwgYnJlYWtzID0gYyg0LjMsIDUsIDYsIDcuOSkgLCByaWdodCA9IFRSVUUpDQpmcnEoaXJpcyRTZXBhbC5MZW5ndGguZ3JvdXApDQpgYGANCg0KVOG7qyBt4buZdCBiaeG6v24gc+G7kSBudW1lcmljIFNlcGFsLkxlbmd0aCBjw7MgcmFuZ2UgbMOgIDQuMyAtIDcuOSwgY2jDum5nIHRhIMSRw6MgY2hpYSBjw6FjIGtob+G6o25nIMSR4buDIHRy4bufIHRow6BuaCBiaeG6v24gc+G7kSBjYXRlZ29yaWNhbCBTZXBhbC5MZW5ndGguZ3JvdXA6DQoNCig0LjMgLTVdDQoNCig1LTZdDQoNCig2LTcuOV0NCg0KS2hpIG114buRbiBjaGlhIGPDoWMga2hv4bqjbmcgxJHhu4F1IG5oYXUgY2jDum5nIHRhIHZp4bq/dCBuaMawIHNhdToNCg0KYGBge3IgfQ0KaXJpcyRTZXBhbC5MZW5ndGguZ3JvdXAgPSBjdXQoaXJpcyRTZXBhbC5MZW5ndGgsIDMgLCByaWdodCA9IFRSVUUpDQpmcnEoaXJpcyRTZXBhbC5MZW5ndGguZ3JvdXApDQoNCmBgYA0KDQpu4bq/dSBtdeG7kW4gY2hpYSAyIGtob+G6o25nIHbhu5tpIG3hu5FjIGzDoCBtZWFuIHRhIHZp4bq/dCBjb2RlIG5oxrAgc2F1Lg0KDQpgYGB7ciB9DQppcmlzMiA8LSBpcmlzICU+JSBtdXRhdGUoU2VwYWwuTGVuZ3RoLmdyID0gaWZlbHNlKGlyaXMkU2VwYWwuTGVuZ3RoIDwgbWVhbihpcmlzJFNlcGFsLkxlbmd0aCksIDAsIDEpKSAlPiUNCiAgZnJxKFNlcGFsLkxlbmd0aC5ncikgJT4lIHByaW50KCkNCmBgYA0KDQpM4bqleSBt4buRYyBxdWFudGlsZSA3NXRoIGzDoG0gbeG7kWMgxJHhu4MgY2hpYSAyIG5ow7NtDQoNCmBgYHtyIH0NCmlyaXMyIDwtIGlyaXMgJT4lIG11dGF0ZShTZXBhbC5MZW5ndGguZ3I3NSA9IGlmZWxzZShpcmlzJFNlcGFsLkxlbmd0aCA8IHF1YW50aWxlKChpcmlzJFNlcGFsLkxlbmd0aCksIDAuNzUpLCAwLCAxKSkgJT4lDQogIGZycShTZXBhbC5MZW5ndGguZ3I3NSkgJT4lIHByaW50KCkNCmBgYA0KDQojIyBGdW5jdGlvbiBjYXNlX3doZW4NCg0KYGBge3J9DQpkYXRhKCJpcmlzIikNCmlyaXMgJT4lIG11dGF0ZShTZXBhbC5MZW5ndGguZ3JvdXAgPSBjYXNlX3doZW4oDQogIFNlcGFsLkxlbmd0aCA8IDUuNSAgICAgICAgICAgICAgICAgICAgIH4gMSwNCiAgU2VwYWwuTGVuZ3RoID49IDUuNSAmIFNlcGFsLkxlbmd0aCA8IDYgfiAyLA0KICBTZXBhbC5MZW5ndGggPj0gNiAgICAgICAgICAgICAgICAgICAgICAgfiAzDQopKSAlPiUgdGFpbCgpDQpgYGANCg0K