Title: Produce_Single_AA_Percent_Composition_Gamma.rmd

Summary: This program produces a test-harness file. It takes each protein sequence from a file of 500 randomly sampled polypeptides drawn from lists of oxygen-binding proteins, and converts the linear polypeptide sequence to a percent amino acid composition.

Libraries:

knitr::opts_chunk$set(echo = TRUE)

Libraries = c("readr", "stringr", "dplyr")

# Install if not present
for(p in Libraries){
    if(!require(p, character.only = TRUE)) {install.packages(p)}
    library(p, character.only = TRUE)
}

Import polypeptide strings line by line:

#' Read a file of polypeptide sequences and process it line by line.
#'
#' Every non-blank line of `FILE_2_RUN` is passed to
#' `linear_pp_to_percent_aa()` together with its 1-based position among the
#' non-blank lines.
#'
#' @param FILE_2_RUN Path to a text file holding one polypeptide sequence per
#'   line.
#' @return `NULL`, invisibly. Called for the side effects of
#'   `linear_pp_to_percent_aa()`.
import_polypeptide_line <- function(FILE_2_RUN) {
    # readLines() opens and closes the file itself, so no connection is left
    # open, and no external `wc`/`awk` call is needed to count lines. It also
    # returns a final line that lacks a trailing newline, which `wc -l` would
    # not have counted.
    sequences <- trimws(readLines(FILE_2_RUN, warn = FALSE))
    # Blank lines carry no sequence; dropping them keeps the running index
    # aligned with the sequences actually processed.
    sequences <- sequences[nzchar(sequences)]
    # seq_along() yields an empty loop for an empty file (1:0 would not).
    for (i in seq_along(sequences)) {
        linear_pp_to_percent_aa(sequences[[i]], i) # Process polypeptides to paa.
    }
    invisible(NULL)
}

#import_polypeptide_line(file) ## It Works

Initialize file ‘test_harness_paa.csv’:

#' Create 'test_harness_paa.csv' in the working directory with a header row.
#'
#' The header is "Class", "id", "TotalAA" followed by the 20 one-letter amino
#' acid codes. write.table() is called with its default `append = FALSE`, so
#' an existing file of the same name is replaced.
#'
#' @return The value of `write.table()`; called for its side effect.
initialize_file <- function() {
    amino_acid_columns <- c("G", "P", "A", "V", "L",
                            "I", "M", "C", "F", "Y",
                            "W", "H", "K", "R", "Q",
                            "N", "E", "D", "S", "T")
    # A one-row matrix makes write.table() emit the titles as a single line.
    header_row <- matrix(c("Class", "id", "TotalAA", amino_acid_columns),
                         nrow = 1)
    output_path <- paste0("test_harness", "_paa.csv")
    write.table(header_row,
                output_path,
                sep = ",",
                eol = "\n",
                row.names = FALSE,
                col.names = FALSE)
}

#initialize_file()

Produce % AA Composition from linear polypeptides:

#' Append one amino-acid-composition row to 'test_harness_paa.csv'.
#'
#' The row has 23 comma-separated fields: the protein class, an id of the form
#' "P<i>", the sequence length, and for each of the 20 amino acids the count
#' of that residue divided by the sequence length.
#'
#' @param line_2_process A single character string holding the linear
#'   polypeptide sequence in upper-case one-letter codes.
#' @param i Position of the sequence in the input. It forms the id and selects
#'   the class in blocks of 500: 1-500 "Ctrl", 501-1000 "Ery", 1001-1500
#'   "Hcy", 1501-2000 "Hgb", 2001-2500 "Hhe", 2501-3000 "Lgb", 3001 and above
#'   "Mgb".
#' @return The value of `write()`; called for its side effect of appending to
#'   'test_harness_paa.csv' in the working directory.
linear_pp_to_percent_aa <- function(line_2_process, i) {
    # Fail early with a clear message instead of an obscure
    # "replacement has length zero" error or a silent truncation to the
    # first element.
    if (!is.character(line_2_process) || length(line_2_process) != 1L ||
        is.na(line_2_process)) {
        stop("'line_2_process' must be a single, non-missing character string",
             call. = FALSE)
    }

    code_key <- c("Ctrl", "Ery", "Hcy", "Hgb", "Hhe", "Lgb", "Mgb")
    amino_acids <- c("G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
                     "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T")

    # First column is protein 'Class': a new class starts at 501, 1001, ...,
    # 3001. findInterval() returns how many of those starts `i` has reached.
    class_starts <- seq(from = 501, by = 500, length.out = length(code_key) - 1L)
    protein_class <- code_key[findInterval(i, class_starts) + 1L]

    # Second column is 'id'
    protein_id <- paste0("P", i)

    # Third column is 'TotalAA'
    total_aa <- nchar(line_2_process, keepNA = FALSE)

    # Columns 4:23 - fraction of each amino acid in the sequence
    residues <- strsplit(line_2_process, "")[[1]]
    aa_counts <- vapply(amino_acids,
                        function(aa) sum(residues == aa),
                        numeric(1),
                        USE.NAMES = FALSE)
    if (total_aa > 0) {
        aa_fractions <- aa_counts / total_aa
    } else {
        # Avoid 0 / 0 = NaN entries in the CSV for an empty sequence.
        warning("empty sequence for id ", protein_id,
                "; composition written as zeros", call. = FALSE)
        aa_fractions <- aa_counts
    }

    # c() coerces the numbers to character, matching the one-row layout of
    # the header written by initialize_file().
    aa_vector <- c(protein_class, protein_id, total_aa, aa_fractions)
    write(x = aa_vector,
          file = "test_harness_paa.csv",
          append = TRUE,
          ncolumns = 23,
          sep = ",")
}

test_harness_paa.csv

MAIN:

# Driver: build 'test_harness_paa.csv' from the sequence list and time the run.
start_time <- Sys.time() # Start timer

# Input file name; it is a relative path, so it is resolved against the
# working directory set below.
FILE_2_RUN <- "7_class_500_test_harness_2019-03-22_12-33-09-AM.lst"

# NOTE(review): hard-coded, user-specific directory. Both the input file and
# the output CSV are read from / written to this location.
setwd("~/Dropbox/Oxy-RF/2B_Test_Harness/Test-Harness-2-PAA")

# Write the header row first; import_polypeptide_line() then hands each line
# to linear_pp_to_percent_aa(), which appends a row to the same CSV.
initialize_file()
import_polypeptide_line(FILE_2_RUN)

end_time <- Sys.time()   # End timer
end_time - start_time    # Display time
## Time difference of 3.530343 secs

Machine Settings:

# Report elements 1-3 and 5 of Sys.info(): sysname, release, version, machine.
Sys.info()[c(1:3,5)]
##                                               sysname 
##                                               "Linux" 
##                                               release 
##                                   "4.15.0-46-generic" 
##                                               version 
## "#49~16.04.1-Ubuntu SMP Tue Feb 12 17:45:24 UTC 2019" 
##                                               machine 
##                                              "x86_64"
# Report the R version, platform, locale and attached/loaded packages.
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_0.7.8   stringr_1.3.1 readr_1.3.1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0       assertthat_0.2.0 crayon_1.3.4     digest_0.6.18   
##  [5] R6_2.3.0         magrittr_1.5     evaluate_0.12    pillar_1.3.1    
##  [9] rlang_0.3.0.1    stringi_1.2.4    bindrcpp_0.2.2   rmarkdown_1.11  
## [13] tools_3.4.4      glue_1.3.0       purrr_0.2.5      hms_0.4.2       
## [17] xfun_0.4         yaml_2.2.0       compiler_3.4.4   pkgconfig_2.0.2 
## [21] htmltools_0.3.6  tidyselect_0.2.5 bindr_0.1.1      knitr_1.21      
## [25] tibble_1.4.2

EOF