Part 1

Write a function named 'pollutantmean' that calculates the mean of a pollutant (sulfate or nitrate) across a specified list of monitors. The function 'pollutantmean' takes three arguments: 'directory', 'pollutant', and 'id'. Given a vector of monitor ID numbers, 'pollutantmean' reads those monitors' particulate matter data from the directory specified in the 'directory' argument and returns the mean of the pollutant across all of the monitors, ignoring any missing values coded as NA. A prototype of the function is as follows:


pollutantmean <- function(directory, pollutant, id = 1:332) {
    ## Mean of one pollutant pooled over the readings of several monitors.
    ##
    ## 'directory' is a character vector of length 1 indicating the location of
    ## the CSV files
    ##
    ## 'pollutant' is a character vector of length 1 indicating the name of the
    ## pollutant for which we will calculate the mean; either 'sulfate' or
    ## 'nitrate'.
    ##
    ## 'id' is an integer vector indicating the monitor ID numbers to be used
    ##
    ## Return the mean of the pollutant across all monitors listed in the 'id'
    ## vector (ignoring NA values). The readings of all monitors are pooled
    ## before averaging, so this is not a mean of per-monitor means. If no
    ## non-missing reading exists the result is NaN.
    ##
    ## Stops with an error when a file has no column named by 'pollutant'.

    ## One slot per monitor, filled in place, instead of re-copying a growing
    ## vector with c() on every iteration.
    readings <- vector("list", length(id))
    for (k in seq_along(id)) {
        ## Files are named by the zero-padded monitor ID, e.g. 23 -> "023.csv".
        csv_name <- paste0(formatC(id[k], width = 3, flag = "0"), ".csv")
        monitor <- read.csv(file.path(directory, csv_name))

        values <- monitor[[pollutant]]
        if (is.null(values)) {
            ## '[[' returns NULL for an unknown column; without this check a
            ## misspelled pollutant would silently produce NaN.
            stop("pollutant column '", pollutant, "' not found in ", csv_name,
                call. = FALSE)
        }
        readings[[k]] <- values
    }

    ## unlist() of an empty list is NULL; as.numeric() keeps the empty-'id'
    ## result at NaN rather than a warning from mean().
    mean(as.numeric(unlist(readings)), na.rm = TRUE)
}
pollutantmean("specdata", "nitrate", 23)

## [1] 1.281

# 1.2808
pollutantmean("specdata", "nitrate", 70:72)

## [1] 1.706

# 1.706
pollutantmean("specdata", "sulfate", 1:10)

## [1] 4.064

# 4.064

Alternative


pollutantmean <- function(directory, pollutant, id = 1:332) {
    ## Functional variant: mean of one pollutant pooled over several monitors.
    ##
    ## 'directory' is the folder holding the monitor CSV files, 'pollutant' is
    ## the name of the column to average and 'id' is the vector of monitor ID
    ## numbers to read.
    ##
    ## Return the mean of all non-NA readings of 'pollutant' taken together
    ## across the requested monitors; NaN if there are none. Stops with an
    ## error when a file has no column named by 'pollutant'.
    read_pollutant <- function(i) {
        ## Files are named by the zero-padded monitor ID, e.g. 23 -> "023.csv".
        csv_name <- paste0(formatC(i, width = 3, flag = "0"), ".csv")
        values <- read.csv(file.path(directory, csv_name))[[pollutant]]
        if (is.null(values)) {
            ## '[[' returns NULL for an unknown column; fail loudly instead of
            ## letting the NULL vanish inside unlist().
            stop("pollutant column '", pollutant, "' not found in ", csv_name,
                call. = FALSE)
        }
        values
    }

    pooled <- unlist(lapply(id, read_pollutant))

    ## For an empty 'id' unlist() gives NULL, on which mean() warns and
    ## returns NA; as.numeric() turns it into numeric(0) so the result is NaN.
    mean(as.numeric(pooled), na.rm = TRUE)
}
pollutantmean("specdata", "nitrate", 23)

## [1] 1.281

# 1.2808
pollutantmean("specdata", "nitrate", 70:72)

## [1] 1.706

# 1.706
pollutantmean("specdata", "sulfate", 1:10)

## [1] 4.064

# 4.064

Part 2

Write a function that reads a directory full of files and reports the number of completely observed cases in each data file. The function should return a data frame where the first column is the name of the file and the second column is the number of complete cases. A prototype of this function follows

complete <- function(directory, id = 1:332) {
    ## Count the completely observed rows in each requested monitor file.
    ##
    ## 'directory' is a character vector of length 1 indicating the location of
    ## the CSV files
    ##
    ## 'id' is an integer vector indicating the monitor ID numbers to be used
    ##
    ## Return a data frame with one row per element of 'id', in the order
    ## given, and two columns: 'id', the monitor ID number, and 'nobs', the
    ## number of rows of that monitor's file with no NA in any column.

    ## Preallocate the result rather than growing it with c() in the loop.
    nobs <- numeric(length(id))
    for (k in seq_along(id)) {
        ## Files are named by the zero-padded monitor ID, e.g. 8 -> "008.csv".
        csv_name <- paste0(formatC(id[k], width = 3, flag = "0"), ".csv")
        monitor <- read.csv(file.path(directory, csv_name))

        ## complete.cases() is TRUE for rows without any missing value.
        nobs[k] <- sum(complete.cases(monitor))
    }
    data.frame(id = id, nobs = nobs)
}
complete("specdata", c(2, 4, 8, 10, 12))

##   id nobs
## 1  2 1041
## 2  4  474
## 3  8  192
## 4 10  148
## 5 12   96

## id nobs 1 2 1041 2 4 474 3 8 192 4 10 148 5 12 96
complete("specdata", 30:25)

##   id nobs
## 1 30  932
## 2 29  711
## 3 28  475
## 4 27  338
## 5 26  586
## 6 25  463

## id nobs 1 30 932 2 29 711 3 28 475 4 27 338 5 26 586 6 25 463
complete("specdata", 3)

##   id nobs
## 1  3  243

## id nobs 1 3 243

Alternative


complete <- function(directory, id = 1:332) {
    ## Functional variant: count the complete rows in each monitor file.
    ##
    ## 'directory' is the folder holding the monitor CSV files and 'id' is the
    ## vector of monitor ID numbers to inspect.
    ##
    ## Return a data frame with one row per element of 'id', in the order
    ## given: column 'id' is the monitor ID and column 'nobs' the number of
    ## rows in that monitor's file that contain no NA.
    count_complete <- function(i) {
        ## Files are named by the zero-padded monitor ID, e.g. 8 -> "008.csv".
        csv_name <- paste0(formatC(i, width = 3, flag = "0"), ".csv")
        monitor <- read.csv(file.path(directory, csv_name))
        sum(complete.cases(monitor))
    }

    ## vapply() pins the result to an integer vector; sapply() would return
    ## an empty list for an empty 'id' and break the data frame below.
    nobs <- vapply(id, count_complete, integer(1))
    data.frame(id = id, nobs = nobs)
}
complete("specdata", c(2, 4, 8, 10, 12))

##   id nobs
## 1  2 1041
## 2  4  474
## 3  8  192
## 4 10  148
## 5 12   96

## id nobs 1 2 1041 2 4 474 3 8 192 4 10 148 5 12 96
complete("specdata", 30:25)

##   id nobs
## 1 30  932
## 2 29  711
## 3 28  475
## 4 27  338
## 5 26  586
## 6 25  463

## id nobs 1 30 932 2 29 711 3 28 475 4 27 338 5 26 586 6 25 463
complete("specdata", 3)

##   id nobs
## 1  3  243

## id nobs 1 3 243

Part 3

Write a function that takes a directory of data files and a threshold for complete cases and calculates the correlation between sulfate and nitrate for monitor locations where the number of completely observed cases (on all variables) is greater than the threshold. The function should return a vector of correlations for the monitors that meet the threshold requirement. If no monitors meet the threshold requirement, then the function should return a numeric vector of length 0. A prototype of this function follows

corr <- function(directory, threshold = 0) {
    ## Sulfate/nitrate correlation for every sufficiently observed monitor.
    ##
    ## 'directory' is a character vector of length 1 indicating the location of
    ## the CSV files
    ##
    ## 'threshold' is a numeric vector of length 1 indicating the number of
    ## completely observed observations (on all variables) required to compute
    ## the correlation between nitrate and sulfate; the default is 0
    ##
    ## Return a numeric vector of correlations, one per monitor whose number
    ## of complete cases is strictly greater than 'threshold', in the order
    ## complete() reports the monitors. If no monitor qualifies the result is
    ## a numeric vector of length 0.

    ## complete() is called with its default 'id', so it decides which
    ## monitors are considered at all.
    counts <- complete(directory)
    ids <- counts$id[counts$nobs > threshold]

    monitor_cor <- function(i) {
        ## Files are named by the zero-padded monitor ID, e.g. 8 -> "008.csv".
        csv_name <- paste0(formatC(i, width = 3, flag = "0"), ".csv")
        monitor <- read.csv(file.path(directory, csv_name))

        ## Correlate only rows in which every column is observed.
        observed <- monitor[complete.cases(monitor), ]
        cor(observed$sulfate, observed$nitrate)
    }

    ## vapply() builds the result in one pass (no c() growth in a loop) and
    ## yields numeric(0) when 'ids' is empty.
    vapply(ids, monitor_cor, numeric(1))
}
cr <- corr("specdata", 150)
head(cr)

## [1] -0.01896 -0.14051 -0.04390 -0.06816 -0.12351 -0.07589

## [1] -0.01896 -0.14051 -0.04390 -0.06816 -0.12351 -0.07589
summary(cr)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.2110 -0.0500  0.0946  0.1250  0.2680  0.7630

## Min. 1st Qu.  Median Mean 3rd Qu.  Max.  -0.2110 -0.0500 0.0946 0.1250
## 0.2680 0.7630