Working with Missing Values

Some Basic Code

myvec1 <- c(11, 13, 5, 6, NA, 9)
# The mean of a vector that contains NA is itself NA
mean(myvec1)

## [1] NA

# Drop the missing values before averaging
mean(myvec1, na.rm = TRUE)

## [1] 8.8

# Flag each element as missing (TRUE) or present (FALSE)
is.na(myvec1)

## [1] FALSE FALSE FALSE FALSE  TRUE FALSE

# Positions of the missing elements within the vector
which(is.na(myvec1))

## [1] 5

# Count the missing elements (every TRUE counts as 1)
sum(is.na(myvec1))

## [1] 1

Making missing values

# A vector in which bad readings were recorded with the sentinel -9999
datavec <- c(2, -9999, 100, 3, -9999, 5)
# Swap every -9999 for NA
datavec <- replace(datavec, datavec == -9999, NA)

Not a number

Another type of missing value is the result of calculations that went wrong, for example:

# A character vector; most entries look like numbers, "abc" does not
myvec <- c("101", "289", "12.3", "abc", "99")
# Converting to numeric turns unparseable entries into NA, with a warning
as.numeric(myvec)

## Warning: NAs introduced by coercion

## [1] 101.0 289.0  12.3    NA  99.0

# The logarithm of a negative number is undefined, so the result is NaN
log(-1)

## Warning in log(-1): NaNs produced

## [1] NaN

# Division by zero is rarely meaningful, yet R returns Inf rather than NA
1000 / 0

## [1] Inf

Missing values in dataframes

# Read the data from att.csv (the path is relative to the working directory)
pupae <- read.csv("att.csv")
# Assign NA to every cell, in any column, whose value is the empty string ""
pupae[pupae == ""] <- NA 
# NOTE(review): read.csv(..., na.strings = c("", "NA")) would presumably do the
# same recoding at import time -- confirm before switching
# Look at a summary to see which variables have missing values
summary(pupae)

##   pick         income        moves          age        education  
##  ATT:504   15-25  :185   0      :597   25-34  :214   HS     :361  
##  OCC:496   25-35  :171   1      :221   35-44  :203   Coll   :187  
##            7.5-15 :114   2      : 88   65+    :184   <HS    :153  
##            35-45  :107   3      : 38   55-64  :153   BA     :150  
##            <7.5   : 96   4      : 16   45-54  :152   >BA    : 60  
##            (Other):112   (Other): 23   (Other): 61   (Other): 54  
##            NA's   :215   NA's   : 17   NA's   : 33   NA's   : 35  
##    employment      usage         nonpub    reachout     card    
##  F      :548   Min.   :  0.00       :  0       :  0       :  0  
##  R      :215   1st Qu.:  1.00   NO  :808   NO  :919   NO  :702  
##  H      : 93   Median :  6.00   YES :188   YES : 62   YES :281  
##  P      : 67   Mean   : 16.34   NA's:  4   NA's: 19   NA's: 17  
##  U      : 26   3rd Qu.: 23.00                                   
##  (Other): 25   Max.   :291.00                                   
##  NA's   : 26

# Notice the NA's (missing values) that the summary reports for most variables.
# Option 1: keep only the rows where reachout, for example, is not missing
pupae_subs1 <- subset(pupae, !is.na(reachout))
# Option 2: keep only the rows where reachout AND card are both present
pupae_subs2 <- subset(pupae, complete.cases(reachout, card))
# Strictest option: drop every row in which ANY variable has a missing value
pupae_nona <- pupae[complete.cases(pupae), ]

Subsetting when there are missing values

# A small data frame whose column b has two missing values
dfr <- data.frame(a = 1:4, b = c(4, NA, 6, NA))
# subset() discards the rows for which the condition evaluates to NA
subset(dfr, b > 4, select = b)

##   b
## 3 6

# Square-bracket indexing keeps them: each NA in the index yields an NA
dfr[dfr$b > 4, "b"]

## [1] NA  6 NA

# ... whereas which() returns only the TRUE positions, so the NAs drop out
dfr[which(dfr$b > 4), "b"]

## [1] 6