## Build the example data frame: V1 has two missing values, V2 is complete
dat <- data.frame(
  V1 = c(0L, 1L, NA, 3L, NA, 5L),
  V2 = c(10L, 20L, 30L, 40L, 50L, 60L)
)
## Display it
dat
V1 V2
1 0 10
2 1 20
3 NA 30
4 3 40
5 NA 50
6 5 60
The == and != operators cannot test equality or inequality against NA, which by definition does not have a value!
The index vector should not have NA in it!
Use is.na() to create a correct index vector
is.element(), or its equivalent the %in% operator, can handle NA and non-NA values at the same time
In the examples below, I will try to show the rows where (V1 is not 0) AND (V1 is not NA), i.e., rows where V1 is a non-zero integer
== returns NA when one side of the comparison is NA, and indexing with NAs will cause a disaster.
## (intention) Choose rows where V1 is not 0
## dat$V1 == 0 is NA wherever V1 is NA, and negating NA still gives NA
indexVector <- !(dat$V1 == 0)
## indexVector contains NA (WRONG)
## Show the index vector side by side with the data it was computed from
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 NA
4 3 40 TRUE
5 NA 50 NA
6 5 60 TRUE
## WRONG: every NA in the logical index produces an all-NA row in the result
dat[indexVector,]
V1 V2
2 1 20
NA NA NA
4 3 40
NA.1 NA NA
6 5 60
== cannot assess equality to NA, which does not have a value by definition
## (intention) Choose rows where V1 is NOT (0 OR NA)
## dat$V1 == NA is NA for every row, so the OR is TRUE only where V1 is 0
## and NA everywhere else
indexVector <- !(dat$V1 == 0 | dat$V1 == NA)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 NA
3 NA 30 NA
4 3 40 NA
5 NA 50 NA
6 5 60 NA
## WRONG: five of the six index values are NA, so five all-NA rows come back
dat[indexVector,]
V1 V2
NA NA NA
NA.1 NA NA
NA.2 NA NA
NA.3 NA NA
NA.4 NA NA
Use is.na() instead
## (intention) Choose rows where V1 is NOT (0 OR NA)
## is.na() only ever returns TRUE or FALSE, and NA | TRUE is TRUE,
## so no NA survives the OR
indexVector <- !(dat$V1 == 0 | is.na(dat$V1))
## indexVector does not contain NA (CORRECT)
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 FALSE
4 3 40 TRUE
5 NA 50 FALSE
6 5 60 TRUE
## CORRECT: only the rows with a non-zero, non-missing V1 are returned
dat[indexVector,]
V1 V2
2 1 20
4 3 40
6 5 60
!= returns NA when one side of the comparison is NA, and indexing with NAs will cause a disaster.
## (intention) Choose rows where V1 is NOT 0
## dat$V1 != 0 is NA wherever V1 is NA
indexVector <- dat$V1 != 0
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 NA
4 3 40 TRUE
5 NA 50 NA
6 5 60 TRUE
## WRONG: indexing with NA's messes it up (each NA yields an all-NA row)
dat[indexVector,]
V1 V2
2 1 20
NA NA NA
4 3 40
NA.1 NA NA
6 5 60
!= cannot assess inequality to NA, which does not have a value by definition
## (intention) Choose rows where V1 is (NOT 0) AND (NOT NA)
## dat$V1 != NA is NA for every row, so the AND is FALSE only where V1 is 0
## and NA everywhere else
indexVector <- (dat$V1 != 0 & dat$V1 != NA)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 NA
3 NA 30 NA
4 3 40 NA
5 NA 50 NA
6 5 60 NA
## WRONG: indexing with NA's messes it up (five all-NA rows come back)
dat[indexVector,]
V1 V2
NA NA NA
NA.1 NA NA
NA.2 NA NA
NA.3 NA NA
NA.4 NA NA
Use is.na() instead
## (intention) Choose rows where V1 is (NOT 0) AND (NOT NA)
## !is.na() is FALSE exactly where V1 is NA, and NA & FALSE is FALSE,
## so no NA survives the AND
indexVector <- (dat$V1 != 0 & !is.na(dat$V1))
## indexVector does not contain NA (CORRECT)
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 FALSE
4 3 40 TRUE
5 NA 50 FALSE
6 5 60 TRUE
## CORRECT: the index has no NA, so only the intended rows are returned
dat[indexVector,]
V1 V2
2 1 20
4 3 40
6 5 60
is.element() is more flexible than == and !=: it can test against 0 and NA in a single call and never returns NA
## (intention) Choose rows where V1 is NOT (0 OR NA)
## is.element() matches NA against the NA in the set and returns TRUE/FALSE only
indexVector <- !is.element(dat$V1, c(0,NA))
## indexVector does not contain NA
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 FALSE
4 3 40 TRUE
5 NA 50 FALSE
6 5 60 TRUE
## CORRECT: rows where V1 is 0 or NA are excluded, and no all-NA rows appear
dat[indexVector,]
V1 V2
2 1 20
4 3 40
6 5 60
## (intention) Choose rows where V1 is NOT (0 OR NA)
## Same test written with the %in% operator; it also returns TRUE/FALSE only
indexVector <- !(dat$V1 %in% c(0,NA))
## indexVector does not contain NA
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 FALSE
4 3 40 TRUE
5 NA 50 FALSE
6 5 60 TRUE
## CORRECT: rows where V1 is 0 or NA are excluded, and no all-NA rows appear
dat[indexVector,]
V1 V2
2 1 20
4 3 40
6 5 60
Actually, subset() drops the rows for which the condition evaluates to NA, so it is more robust. But I believe that not creating an index vector containing NA in the first place is the best way.
## (intention) Choose rows where V1 is not 0
## dat$V1 == 0 is NA wherever V1 is NA, and negating NA still gives NA
indexVector <- !(dat$V1 == 0)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
V1 V2 indexVector
1 0 10 FALSE
2 1 20 TRUE
3 NA 30 NA
4 3 40 TRUE
5 NA 50 NA
6 5 60 TRUE
## Same condition handed to subset(): this works, because subset() keeps only
## the rows where the condition is TRUE and drops the ones where it is NA
subset(dat, subset = !(V1 == 0))
V1 V2
2 1 20
4 3 40
6 5 60