1 Load libraries

# load the libraries at the beginning of rmd file
library(dplyr)
# library(stringr)

2 fn_read_data_file

  • Note: @export tag “puts the function name in the package NAMESPACE file which means it can be accessed by users after they run library(<packagename>)”.
    • Reference here.
    • Usually, helper functions are not exported
  • Note: the difference between @import and @importFrom: @import imports every function in the package into the namespace (which can lead to name conflicts in the namespace); @importFrom imports only the specific functions named from the package.
#' @title Read Data File to Summarize
#' @description Reads a tab-delimited file that has a header row and computes
#'   summary statistics (row count, mean, median and variance) for one column.
#' @param filePath A single string giving the full file path to the data.
#' @param column The unquoted name of the column to summarise.
#' @param na.rm Logical. If `TRUE`, missing values are dropped before the
#'   mean, median and variance are computed. Defaults to `FALSE`, in which
#'   case any `NA` in the column makes those statistics `NA`. `N` always
#'   counts every row of the file, whatever the value of `na.rm`.
#' @return A one-row data frame with the columns `names` (base name of the
#'   file), `N`, `mean`, `median` and `var`.
#' @export
#' @importFrom dplyr summarise n
#' @importFrom stats median var
#' @importFrom utils read.table
#' @examples
#' \dontrun{
#' fn_read_data_file(filePath = ls_files[i], column = trait)
#' }
fn_read_data_file = function(filePath, column, na.rm = FALSE){

  # fail early with a clear message instead of a cryptic read.table/basename error
  stopifnot(is.character(filePath), length(filePath) == 1L)

  # load the data to a temp object
  tmp_df = read.table(filePath, sep = "\t", header = TRUE)

  # get the summary data; dplyr calls are namespace-qualified so the function
  # works without dplyr (or the magrittr pipe) being attached
  dfRow_summary_results = dplyr::summarise(
    tmp_df,
    # basename() drops everything up to and including the last path separator
    names = basename(filePath),
    N = dplyr::n(),
    # {{ column }} forwards the caller's unquoted column name into the data mask;
    # !!na.rm injects the argument's value so a data column that happens to be
    # called `na.rm` cannot shadow it
    mean = mean({{ column }}, na.rm = !!na.rm),
    median = median({{ column }}, na.rm = !!na.rm),
    var = var({{ column }}, na.rm = !!na.rm)
  ) # a single row, in data frame format

  return(dfRow_summary_results)
}

3 Get the summary table

# directory that is scanned for the data files
path_to_dir = "~/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data"

# list each entry in that directory with the directory path prepended,
# then display the resulting character vector
ls_files = list.files(path_to_dir, full.names = TRUE)
print(ls_files)
## [1] "/Users/vincent/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data/dataset1.txt"
## [2] "/Users/vincent/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data/dataset2.txt"
## [3] "/Users/vincent/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data/dataset3.txt"
## [4] "/Users/vincent/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data/dataset4.txt"
## [5] "/Users/vincent/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data/dataset5.txt"
## [6] "/Users/vincent/Desktop/HUGEN2071/07_R_functions_debug_BuildPackage/data/dataset6.txt"
# Build the summary table: one row of summary statistics per data file.

# an empty file list would otherwise produce a confusing downstream error
if (length(ls_files) == 0) {
  stop("No data files found: `ls_files` is empty.")
}

# fn_read_data_file() returns a one-row data frame for each file;
# collect those rows in a list instead of assigning into a pre-filled table
ls_summaries = lapply(ls_files, function(one_file) {
  fn_read_data_file(filePath = one_file, column = trait)
})

# stack the one-row results; column types come from the results themselves
df_summary = dplyr::bind_rows(ls_summaries)
df_summary