Data

The zip file containing the data can be downloaded here: specdata.zip. The zip file contains 332 comma-separated-value (CSV) files containing pollution monitoring data for fine particulate matter (PM) air pollution at 332 locations in the United States. Each file contains data from a single monitor and the ID number for each monitor is contained in the file name. For example, data for monitor 200 is contained in the file “200.csv”. Each file contains three variables. Date: the date of the observation in (year-month-day) format, sulfate: the level of sulfate PM in the air on that date (measured in micrograms per cubic meter), and nitrate: the level of nitrate PM in the air on that date (measured in micrograms per cubic meter).

Part-1

Write a function named ‘pollutantmean’ that calculates the mean of a pollutant (sulfate or nitrate) across a specified list of monitors. The function ‘pollutantmean’ takes three arguments: ‘directory’, ‘pollutant’, and ‘id’. Given a vector of monitor ID numbers, ‘pollutantmean’ reads those monitors’ particulate matter data from the directory specified in the ‘directory’ argument and returns the mean of the pollutant across all of the monitors, ignoring any missing values coded as NA.

## Mean of one pollutant across a set of monitors.
##
## Arguments:
##   directory  path of the folder holding the monitor files ("001.csv",
##              "002.csv", ...); may be relative to the working directory
##              or absolute.
##   pollutant  name of the column to average: "sulfate" or "nitrate".
##   id         vector of monitor ID numbers to include (default: 1:332).
##
## Returns a single number: the mean of every non-missing value of
## `pollutant` found in the selected files (NaN when there is none).
##
## The working directory is never changed; files are addressed by full path.
pollutantmean <- function(directory, pollutant, id = 1:332) {
  # Reject unknown pollutant names instead of silently averaging nitrate.
  if (!is.character(pollutant) || length(pollutant) != 1L ||
      !pollutant %in% c("sulfate", "nitrate")) {
    stop("'pollutant' must be either \"sulfate\" or \"nitrate\"",
         call. = FALSE)
  }

  observed <- lapply(id, function(monitor_id) {
    # File names are the monitor ID zero-padded to three digits (7 -> 007.csv).
    path <- file.path(directory, sprintf("%03d.csv", monitor_id))
    monitor <- read.csv(path,
                        header = TRUE,
                        na.strings = c("NA", "NaN", " "))

    # Drop only the missing values of the requested pollutant. Removing
    # whole incomplete rows would also discard valid readings taken on days
    # when just the other pollutant is missing, which biases the mean.
    values <- monitor[[pollutant]]
    values[!is.na(values)]
  })

  # as.numeric() turns the NULL produced by an empty `id` into numeric(0),
  # so the result is NaN rather than an error.
  mean(as.numeric(unlist(observed)))
}

Let’s put this function to work:

## Example calls. "specdata" is given as a relative path, so these must be run
## from the folder that contains the unzipped "specdata" directory.
## Each "## [1] ..." line is the console output recorded by the author.
pollutantmean("specdata", "sulfate", 1:10)
## [1] 4.064128
pollutantmean("specdata", "nitrate", 70:72)
## [1] 1.732979
pollutantmean("specdata", "nitrate", 23)
## [1] 1.280833

These answers match the expected outputs

Part-2

Write a function that reads a directory full of files and reports the number of completely observed cases in each data file. The function should return a data frame where the first column is the name of the file and the second column is the number of complete cases.

## Count the completely observed rows in each monitor file.
##
## Arguments:
##   directory  path of the folder holding the monitor files ("001.csv",
##              "002.csv", ...); may be relative to the working directory
##              or absolute.
##   id         vector of monitor ID numbers to report on (default: 1:332).
##
## Returns a data frame with one row per element of `id`, in the order given,
## and two columns: `id` (the monitor ID) and `nobs` (the number of rows in
## that monitor's file with no missing value in any column). An empty `id`
## yields a data frame with zero rows.
##
## The working directory is never changed; files are addressed by full path.
complete <- function(directory, id = 1:332) {
  nobs <- vapply(id, function(monitor_id) {
    # File names are the monitor ID zero-padded to three digits (7 -> 007.csv).
    path <- file.path(directory, sprintf("%03d.csv", monitor_id))
    monitor <- read.csv(path,
                        header = TRUE,
                        na.strings = c("NA", "NaN", " "))

    # A row is complete when none of its columns is NA.
    sum(complete.cases(monitor))
  }, integer(1), USE.NAMES = FALSE)

  # Build the result in one step rather than growing it row by row;
  # unname() keeps a named `id` vector from leaking into the row names.
  data.frame(id = unname(id), nobs = nobs)
}

We can now do this

## Example calls. "specdata" is given as a relative path, so these must be run
## from the folder that contains the unzipped "specdata" directory.
## The "##" lines below each call are the console output recorded by the
## author; rows appear in the same order as the requested ids (see 30:25).
complete("specdata", 1)
##   id nobs
## 1  1  117
complete("specdata", c(2, 4, 8, 10, 12))
##   id nobs
## 1  2 1041
## 2  4  474
## 3  8  192
## 4 10  148
## 5 12   96
complete("specdata", 30:25)
##   id nobs
## 1 30  932
## 2 29  711
## 3 28  475
## 4 27  338
## 5 26  586
## 6 25  463
complete("specdata", 3)
##   id nobs
## 1  3  243

These answers match the expected outputs

Part-3

Write a function that takes a directory of data files and a threshold for complete cases and calculates the correlation between sulfate and nitrate for monitor locations where the number of completely observed cases (on all variables) is greater than the threshold. The function should return a vector of correlations for the monitors that meet the threshold requirement. If no monitors meet the threshold requirement, then the function should return a numeric vector of length 0.

## Correlation between sulfate and nitrate for well-observed monitors.
##
## Arguments:
##   directory  path of the folder holding the monitor files ("001.csv",
##              "002.csv", ...); may be relative to the working directory
##              or absolute.
##   threshold  a monitor is included only when its number of completely
##              observed rows is strictly greater than this (default: 0).
##   id         vector of monitor ID numbers to consider (default: 1:332,
##              i.e. every monitor in the data set).
##
## Returns a numeric vector holding, for each qualifying monitor in `id`
## order, the correlation between its sulfate and nitrate columns computed
## over complete rows only. When no monitor qualifies the result is a
## numeric vector of length 0 (never NULL).
##
## The working directory is never changed; files are addressed by full path.
corr <- function(directory, threshold = 0, id = 1:332) {
  correlations <- lapply(id, function(monitor_id) {
    # File names are the monitor ID zero-padded to three digits (7 -> 007.csv).
    path <- file.path(directory, sprintf("%03d.csv", monitor_id))
    monitor <- read.csv(path,
                        header = TRUE,
                        na.strings = c("NA", "NaN", " "))

    # Keep only rows with no missing value in any column.
    monitor <- monitor[complete.cases(monitor), , drop = FALSE]

    if (nrow(monitor) > threshold) {
      cor(monitor$sulfate, monitor$nitrate)
    } else {
      # Contributes nothing to the result once the list is flattened.
      numeric(0)
    }
  })

  # as.numeric() guarantees numeric(0) instead of NULL for an empty result.
  as.numeric(unlist(correlations))
}

We can test this function with the following queries

## Example calls. "specdata" is given as a relative path, so these must be run
## from the folder that contains the unzipped "specdata" directory.
## The "##" lines below each call are the console output recorded by the
## author. With a threshold of 5000 the recorded result has length 0, i.e. no
## monitor had more than 5000 complete rows.
cr <- corr("specdata", 150)
head(cr); summary(cr)
## [1] -0.01895754 -0.14051254 -0.04389737 -0.06815956 -0.12350667 -0.07588814
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.21060 -0.04999  0.09463  0.12530  0.26840  0.76310
cr <- corr("specdata", 400)
head(cr); summary(cr)
## [1] -0.01895754 -0.04389737 -0.06815956 -0.07588814  0.76312884 -0.15782860
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.17620 -0.03109  0.10020  0.13970  0.26850  0.76310
cr <- corr("specdata", 5000)
head(cr); summary(cr) ; length(cr)
## NULL
## Length  Class   Mode 
##      0   NULL   NULL
## [1] 0
cr <- corr("specdata") # default threshold value  is ZERO
head(cr); summary(cr) ; length(cr)
## [1] -0.22255256 -0.01895754 -0.14051254 -0.04389737 -0.06815956 -0.12350667
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.00000 -0.05282  0.10720  0.13680  0.27830  1.00000
## [1] 323

These answers match the expected outputs


Ahmed TADDE.