Handling of NA in == and is.element()

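== returns NA whenever either side of the comparison is NA. is.element() (like %in%) never returns NA: it treats NA as an ordinary value, so NA is not an element of a set that lacks NA, such as c(0,1,2), but it is an element of a set that contains NA, such as c(0,NA).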

## Create data
## Two integer columns with the default names V1 and V2;
## V1 is missing (NA) in rows 3 and 5.
dat <- data.frame(
  V1 = c(0L, 1L, NA, 3L, NA, 5L),
  V2 = c(10L, 20L, 30L, 40L, 50L, 60L)
)

## Show
dat
  V1 V2
1  0 10
2  1 20
3 NA 30
4  3 40
5 NA 50
6  5 60

Basic rules

Examples

In the examples below, I will try to show the rows where (V1 is not 0) AND (V1 is not NA), i.e., rows where V1 is a non-zero integer.

== method alone cannot handle NA

It returns NA for any comparison with NA, and indexing a data frame with a logical vector that contains NA produces a row filled with NA for every NA in the index.

## (intention) Choose rows where V1 is not 0
## NA == 0 evaluates to NA (not FALSE), and !NA is still NA,
## so rows 3 and 5 get NA in the index instead of TRUE/FALSE.
indexVector <- !(dat$V1 == 0)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30          NA
4  3 40        TRUE
5 NA 50          NA
6  5 60        TRUE
## WRONG: every NA in a logical row index yields a row filled with NA
dat[indexVector,]
     V1 V2
2     1 20
NA   NA NA
4     3 40
NA.1 NA NA
6     5 60

== cannot assess equality to NA, which does not have a value by definition

## (intention) Choose rows where V1 is NOT (0 OR NA)
## dat$V1 == NA is NA for every row, so only the row where dat$V1 == 0 is TRUE
## gets a definite value (TRUE | NA is TRUE); FALSE | NA and NA | NA stay NA.
indexVector <- !(dat$V1 == 0 | dat$V1 == NA)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20          NA
3 NA 30          NA
4  3 40          NA
5 NA 50          NA
6  5 60          NA
## WRONG: every NA in a logical row index yields a row filled with NA
dat[indexVector,]
     V1 V2
NA   NA NA
NA.1 NA NA
NA.2 NA NA
NA.3 NA NA
NA.4 NA NA

Use is.na() instead

## (intention) Choose rows where V1 is NOT (0 OR NA)
## is.na() returns only TRUE or FALSE. For the NA rows, NA | TRUE is TRUE,
## so the negated index is FALSE there rather than NA.
indexVector <- !(dat$V1 == 0 | is.na(dat$V1))
## indexVector does not contain NA (CORRECT)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30       FALSE
4  3 40        TRUE
5 NA 50       FALSE
6  5 60        TRUE
## CORRECT
dat[indexVector,]
  V1 V2
2  1 20
4  3 40
6  5 60

!= method alone cannot handle NA

Like ==, it returns NA for any comparison with NA, and indexing a data frame with a logical vector that contains NA produces a row filled with NA for every NA in the index.

## (intention) Choose rows where V1 is NOT 0
## NA != 0 evaluates to NA (not TRUE), so rows 3 and 5 get NA in the index.
indexVector <- dat$V1 != 0
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30          NA
4  3 40        TRUE
5 NA 50          NA
6  5 60        TRUE
## indexing with NA's messes it up
dat[indexVector,]
     V1 V2
2     1 20
NA   NA NA
4     3 40
NA.1 NA NA
6     5 60

!= cannot assess inequality to NA, which does not have a value by definition

## (intention) Choose rows where V1 is (NOT 0) AND (NOT NA)
## dat$V1 != NA is NA for every row. FALSE & NA is FALSE (row 1),
## while TRUE & NA and NA & NA stay NA (all other rows).
indexVector <- (dat$V1 != 0 & dat$V1 != NA)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20          NA
3 NA 30          NA
4  3 40          NA
5 NA 50          NA
6  5 60          NA
## indexing with NA's messes it up
dat[indexVector,]
     V1 V2
NA   NA NA
NA.1 NA NA
NA.2 NA NA
NA.3 NA NA
NA.4 NA NA

Use is.na() instead

## (intention) Choose rows where V1 is (NOT 0) AND (NOT NA)
## !is.na() is FALSE for the NA rows, and NA & FALSE is FALSE,
## so those rows get FALSE in the index rather than NA.
indexVector <- (dat$V1 != 0 & !is.na(dat$V1))
## indexVector does not contain NA (CORRECT)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30       FALSE
4  3 40        TRUE
5 NA 50       FALSE
6  5 60        TRUE
## CORRECT
dat[indexVector,]
  V1 V2
2  1 20
4  3 40
6  5 60

is.element() method

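is.element() is more flexible than ==: it tests membership in a whole set of values at once, and it always returns TRUE or FALSE (never NA), even when the data or the set contains NA.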

## (intention) Choose rows where V1 is NOT (0 OR NA)
## is.element() tests set membership and returns only TRUE or FALSE.
## Because NA is listed in the set, NA values in V1 count as members,
## so the negated index is FALSE for them.
indexVector <- !is.element(dat$V1, c(0,NA))
## indexVector does not contain NA
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30       FALSE
4  3 40        TRUE
5 NA 50       FALSE
6  5 60        TRUE
## CORRECT
dat[indexVector,]
  V1 V2
2  1 20
4  3 40
6  5 60

%in% method (equivalent to is.element)

## (intention) Choose rows where V1 is NOT (0 OR NA)
## %in% tests set membership and returns only TRUE or FALSE.
## Because NA is listed in the set, NA values in V1 count as members,
## so the negated index is FALSE for them.
indexVector <- !(dat$V1 %in% c(0,NA))
## indexVector does not contain NA
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30       FALSE
4  3 40        TRUE
5 NA 50       FALSE
6  5 60        TRUE
## CORRECT
dat[indexVector,]
  V1 V2
2  1 20
4  3 40
6  5 60

subset() is robust

Actually, subset() drops the rows for which the condition evaluates to NA, so it is more robust. Still, I believe the best approach is to avoid creating an index vector that contains NA in the first place.

## (intention) Choose rows where V1 is not 0
## NA == 0 evaluates to NA, and !NA is still NA, so rows 3 and 5 get NA.
indexVector <- !(dat$V1 == 0)
## indexVector contains NA (WRONG)
cbind(dat, indexVector)
  V1 V2 indexVector
1  0 10       FALSE
2  1 20        TRUE
3 NA 30          NA
4  3 40        TRUE
5 NA 50          NA
6  5 60        TRUE
## Indexing by the same thing: This works!
## !V1 == 0 parses as !(V1 == 0) because ! binds less tightly than ==,
## so the condition is the same as indexVector above. subset() treats
## NA in the condition as FALSE, so the NA rows are dropped.
subset(dat, !V1 == 0)
  V1 V2
2  1 20
4  3 40
6  5 60