How can I create a new vector from combinations of HBsAg, HBcAb, and HBsAb vectors smartly in R?
HBsAg HBcAb and HBsAb are all vectors within a data frame. These may have NAs.
## HBsAg HBcAb HBsAb hbv.status
## pos any any = Active
## neg pos any = Past
## neg neg pos = Vaccined
## neg neg neg = Negative
## any is defined as NA, neg, or pos.
* Mock data step*;
data hbvdata;
set dir.hbvdata;
if HBsAg = "pos" then hbvstat = "positive";
else if HBsAg = "neg" & HBcAb = "pos" then hbvstat = "past";
else if HBsAg = "neg" & HBcAb = "neg" & HBsAb = "pos" then hbvstat = "vaccined";
else if HBsAg = "neg" & HBcAb = "neg" & HBsAb = "neg" then hbvstat = "negative";
else hbvstat = .;
run;
I tried both conditional indexing and nested ifelse(). ifelse seems faster.
Addition on 2012-10-01: The ura-R-jp-Wiki (critique of programming site (written in Japanese); http://blog.goo.ne.jp/r-de-r/e/e241af106721d5c2e8ec943865617b4c) suggested use of comprehensive demonstration data for this kind of task. I really appreciate it!
More information on expand.grid() is available in Data Manipulation with R (pages 27-29).
http://books.google.com/books?id=grfuq1twFe4C&printsec=frontcover&source=gbs_ge_summary_r&cad=0#v=onepage&q=expand.grid&f=false
I was able to confirm these two conversions actually agree. The pos-pos-pos pattern may not valid medically, but is necessary for testing the code.
## Create comprehensive demo data
HBsAg <- factor(c("neg", "pos", NA))
HBcAb <- factor(c("neg", "pos", NA))
HBsAb <- factor(c("neg", "pos", NA))
hbv.data <- expand.grid(HBsAb = HBsAb, HBcAb = HBcAb, HBsAg = HBsAg)
hbv.data <- hbv.data[,3:1]
hbv.data2 <- hbv.data # for later use
## No ifelse version
hbv.data <- within(hbv.data, {
## Included is.na() condition to achieve "else if" in the SAS code. But I'm not sure if it does.
## Create a HBV status vector and give a value
hbv.status <- NA
hbv.status[is.na(hbv.status) & HBsAg == "pos"] <- "positive"
hbv.status[is.na(hbv.status) & HBsAg == "neg" & HBcAb == "pos"] <- "past"
hbv.status[is.na(hbv.status) & HBsAg == "neg" & HBcAb == "neg" & HBsAb == "pos"] <- "vaccined"
hbv.status[is.na(hbv.status) & HBsAg == "neg" & HBcAb == "neg" & HBsAb == "neg"] <- "negative"
})
## Nested ifelse version (emacs literally nests the structure...)
hbv.data <- within(hbv.data, {
hbv.status.nest <-
ifelse( HBsAg == "pos", "positive",
ifelse( HBsAg == "neg" & HBcAb == "pos", "past",
ifelse( HBsAg == "neg" & HBcAb == "neg" & HBsAb == "pos", "vaccined",
ifelse( HBsAg == "neg" & HBcAb == "neg" & HBsAb == "neg", "negative",
no = NA
)
)
)
)
})
hbv.data
HBsAg HBcAb HBsAb hbv.status hbv.status.nest
1 neg neg neg negative negative
2 neg neg pos vaccined vaccined
3 neg neg <NA> <NA> <NA>
4 neg pos neg past past
5 neg pos pos past past
6 neg pos <NA> past past
7 neg <NA> neg <NA> <NA>
8 neg <NA> pos <NA> <NA>
9 neg <NA> <NA> <NA> <NA>
10 pos neg neg positive positive
11 pos neg pos positive positive
12 pos neg <NA> positive positive
13 pos pos neg positive positive
14 pos pos pos positive positive
15 pos pos <NA> positive positive
16 pos <NA> neg positive positive
17 pos <NA> pos positive positive
18 pos <NA> <NA> positive positive
19 <NA> neg neg <NA> <NA>
20 <NA> neg pos <NA> <NA>
21 <NA> neg <NA> <NA> <NA>
22 <NA> pos neg <NA> <NA>
23 <NA> pos pos <NA> <NA>
24 <NA> pos <NA> <NA> <NA>
25 <NA> <NA> neg <NA> <NA>
26 <NA> <NA> pos <NA> <NA>
27 <NA> <NA> <NA> <NA> <NA>
## hbv.data3 <- do.call(rbind, lapply(1:5000, function(x) hbv.data2)) # This may also be inefficient.
hbv.data3 <- hbv.data2[rep(seq_len(nrow(hbv.data2)), 5000), ] # This is much faster
nrow(hbv.data3)
[1] 135000
no.nest.one <- function(df) {
within(df, {
## Included is.na() condition to achieve "else if" in the SAS code. But I'm not sure if it does.
## Create a HBV status vector and give a value
hbv.status <- NA
hbv.status[is.na(hbv.status) & HBsAg == "pos"] <- "positive"
hbv.status[is.na(hbv.status) & HBsAg == "neg" & HBcAb == "pos"] <- "past"
hbv.status[is.na(hbv.status) & HBsAg == "neg" & HBcAb == "neg" & HBsAb == "pos"] <- "vaccined"
hbv.status[is.na(hbv.status) & HBsAg == "neg" & HBcAb == "neg" & HBsAb == "neg"] <- "negative"
})
}
no.nest.two <- function(df) {
within(df, {
## Without is.na() for simplicity
## Create a HBV status vector and give a value
hbv.status <- NA
hbv.status[HBsAg == "pos"] <- "positive"
hbv.status[HBsAg == "neg" & HBcAb == "pos"] <- "past"
hbv.status[HBsAg == "neg" & HBcAb == "neg" & HBsAb == "pos"] <- "vaccined"
hbv.status[HBsAg == "neg" & HBcAb == "neg" & HBsAb == "neg"] <- "negative"
})
}
nest.one <- function(df) {
within(df, {
hbv.status. <-
ifelse( HBsAg == "pos", "positive",
ifelse( HBsAg == "neg" & HBcAb == "pos", "past",
ifelse( HBsAg == "neg" & HBcAb == "neg" & HBsAb == "pos", "vaccined",
ifelse( HBsAg == "neg" & HBcAb == "neg" & HBsAb == "neg", "negative",
no = NA
)
)
)
)
})
}
no.nest.two is slightly faster than no.nest.one that has a is.na() condition, although the result was sometimes inconsistent. R appears to test the second etc conditions regardless of the TRUE/FALSE status of the first condition. Thus, trying to simulate else if by adding is.na() does not make it faster.
However, inclusion of is.na() in the condition can prevent accidental overwritting of previsouly matched results, so it may be preferable to no is.na() solution in terms of safety.
system.time(junk <- no.nest.one(hbv.data3))
user system elapsed
0.254 0.004 0.257
rm(junk)
system.time(junk <- no.nest.two(hbv.data3))
user system elapsed
0.124 0.007 0.132
rm(junk)
system.time(junk <- nest.one(hbv.data3))
user system elapsed
0.336 0.013 0.349
rm(junk)