# A numeric vector in which one element is missing (NA)
myvec1 <- c(11, 13, 5, 6, NA, 9)
# The mean of a vector that contains NA is itself NA
mean(myvec1)
## [1] NA
# Drop the missing values before averaging
mean(myvec1, na.rm = TRUE)
## [1] 8.8
# Element-wise test for missingness (TRUE where the value is NA)
is.na(myvec1)
## [1] FALSE FALSE FALSE FALSE TRUE FALSE
# Positions of the missing elements
which(is.na(myvec1))
## [1] 5
# Number of missing elements (each TRUE counts as 1)
sum(is.na(myvec1))
## [1] 1
# A vector in which bad values were coded as -9999
datavec <- c(2, -9999, 100, 3, -9999, 5)
# Recode every -9999 as NA
datavec[datavec == -9999] <- NA
# Another type of missing value is the result of calculations that went wrong, for example:
# A character vector; most elements look like numbers, "abc" does not
myvec <- c("101", "289", "12.3", "abc", "99")
# Converting to numeric turns unparseable elements into NA and emits a warning
as.numeric(myvec)
## Warning: NAs introduced by coercion
## [1] 101.0 289.0 12.3 NA 99.0
# The logarithm of a negative number is NaN ("not a number"), with a warning
log(-1)
## Warning in log(-1): NaNs produced
## [1] NaN
# Dividing by zero is not usually meaningful, but R returns Inf rather than
# a missing value
1000 / 0
## [1] Inf
# Read the data from a CSV file in the working directory
pupae <- read.csv(file = "att.csv")
# Recode empty strings ("") as NA in every column
pupae[pupae == ""] <- NA
# Summarise each column; an "NA's" row reports how many values are missing
summary(pupae)
## pick income moves age education
## ATT:504 15-25 :185 0 :597 25-34 :214 HS :361
## OCC:496 25-35 :171 1 :221 35-44 :203 Coll :187
## 7.5-15 :114 2 : 88 65+ :184 <HS :153
## 35-45 :107 3 : 38 55-64 :153 BA :150
## <7.5 : 96 4 : 16 45-54 :152 >BA : 60
## (Other):112 (Other): 23 (Other): 61 (Other): 54
## NA's :215 NA's : 17 NA's : 33 NA's : 35
## employment usage nonpub reachout card
## F :548 Min. : 0.00 : 0 : 0 : 0
## R :215 1st Qu.: 1.00 NO :808 NO :919 NO :702
## H : 93 Median : 6.00 YES :188 YES : 62 YES :281
## P : 67 Mean : 16.34 NA's: 4 NA's: 19 NA's: 17
## U : 26 3rd Qu.: 23.00
## (Other): 25 Max. :291.00
## NA's : 26
# Notice there are NA's (missing values) for most variables.
# Option 1: keep only the rows where one variable (here: reachout) is present
pupae_subs1 <- subset(pupae, subset = !is.na(reachout))
# Option 2: keep only the rows where reachout AND card are both present
pupae_subs2 <- subset(pupae, subset = !is.na(reachout) & !is.na(card))
# A more rigorous subset: drop every row in which ANY variable is missing
pupae_nona <- pupae[complete.cases(pupae), ]
# A small data frame whose column b contains two missing values
dfr <- data.frame(a = 1:4, b = c(4, NA, 6, NA))
# subset() drops rows where the condition evaluates to NA
subset(dfr, b > 4, select = b)
##   b
## 3 6
# Logical indexing with square brackets keeps them (an NA index yields NA)
dfr[dfr$b > 4, "b"]
## [1] NA 6 NA
# ... but which() returns only the TRUE positions, so the NAs are dropped
dfr[which(dfr$b > 4), "b"]
## [1] 6