Introduction

text

This work can be found on GitHub and on RPubs.

Protein classes:

Produce AA matrix:

Column 1 = Protein Class (0:6)
Column 2 = Total AA per protein
Columns 3:22; (G, P, A, V, L, I, M, C, F, Y, W, H, K, R, Q, N, E, D, S, T)

suppressMessages(library(seqinr))
suppressMessages(library(stringr))
suppressMessages(library(readr))
suppressMessages(library(randomForest))
suppressMessages(library(dplyr))

# devtools::install_github("MI2DataLab/randomForestExplainer")
suppressMessages(library(randomForestExplainer))

Read FASTA files and store sequences ONLY in .txt files.

fasta_2_sequences <- function() {
    setwd("~/Dropbox/git_projects/random_forest_dk_project/uniprot_1")
    o2_binders = list.files(pattern = ".fasta$")
    # o2_binders
    # length(o2_binders)
    protein_class = c("erythrocruorin", "hemerythrin", "hemocyanin",
                      "leghemoglobin", "myoglobin", "Homosapiens", 
                      "hemoglobin")
    # for loop to read sequences using seqinr:read.fasta
    for (i in seq(length(o2_binders))) {
    prot_seq <- read.fasta(file = o2_binders[i], seqtype = "AA", 
                           seqonly = TRUE, as.string = TRUE)
    # prot_seq_is.vector = is.vector(prot_seq)
    file_name <- paste("protein_class", protein_class[i], ".txt", sep = "")
    write.table(x = prot_seq, file = file_name, sep  = "\n",
                row.names = FALSE, col.names = col_titles)
    }
}

Function: percent_aa; takes a string of single letter amino acid abrevations and returns a vector of length (nrow = 1, ncol = 22) of %AA.

cell(1,1) is the protein class, (0:6),
** protein class(0) = null sett of human proteins, see above
cell(1,2) is total number of characters in string
cell(1,3:22) is percent AA

percent_aa <- function(df, protein_class) {
    col_titles = t(c("Class", "TotalAA", "G", "P", "A", "V", "L", "I", "M", "C",
                   "F", "Y", "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T"))
    file_name <- paste("protein_class", protein_class, ".csv", sep = "")
    write.table(matrix(0, ncol = 22), file_name,  sep = ",", 
    col.names = col_titles, row.names = FALSE, eol = "\n")
    ############################################
    aa_nums = as.vector(matrix(0, ncol = 22))
    for (row in seq(length(df))) {
        # First column is protein_class
        aa_nums[1] = protein_class
        # Second column is total number of amino acids
        total_aa = nchar(df[1,row], keepNA = FALSE)
        aa_nums[2] = total_aa
        # Column 3:22 - Calculate percent AA
        aa_nums[3] = str_count(df[1,row], "G") / total_aa
        aa_nums[4] = str_count(df[1,row], "P") / total_aa
        aa_nums[5] = str_count(df[1,row], "A") / total_aa
        aa_nums[6] = str_count(df[1,row], "V") / total_aa
        aa_nums[7] = str_count(df[1,row], "L") / total_aa
        aa_nums[8] = str_count(df[1,row], "I") / total_aa
        aa_nums[9] = str_count(df[1,row], "M") / total_aa
        aa_nums[10] = str_count(df[1,row], "C") / total_aa
        aa_nums[11] = str_count(df[1,row], "F") / total_aa
        aa_nums[12] = str_count(df[1,row], "Y") / total_aa
        aa_nums[13] = str_count(df[1,row], "W") / total_aa
        aa_nums[14] = str_count(df[1,row], "H") / total_aa
        aa_nums[15] = str_count(df[1,row], "K") / total_aa
        aa_nums[16] = str_count(df[1,row], "R") / total_aa
        aa_nums[17] = str_count(df[1,row], "Q") / total_aa
        aa_nums[18] = str_count(df[1,row], "N") / total_aa
        aa_nums[19] = str_count(df[1,row], "E") / total_aa
        aa_nums[20] = str_count(df[1,row], "D") / total_aa
        aa_nums[21] = str_count(df[1,row], "S") / total_aa
        aa_nums[22] = str_count(df[1,row], "T") / total_aa
        aa_nums = t(aa_nums)
        # Write/append vector of AA values
        write(aa_nums, file = file_name, append = TRUE, ncolumns = 22, sep = ",")
    }
}

read_seq_txt <- function() {
    uniprot_dir = "~/Dropbox/git_projects/random_forest_dk_project/uniprot_1"
    setwd(uniprot_dir)
    protein_class = 0:6
    o2_seqs = list.files(pattern = ".txt$")
    for (i in seq(length(o2_seqs))) {
        df = read.csv(file = o2_seqs[i], header = FALSE)
        df %>% mutate_if(is.factor, as.character) -> df
        df = t(df)
        percent_aa(df, protein_class[i])
    }
}

#files <- list.files(pattern = "\\.csv$")

# Concatenate all protein files.csv to all_classes.csv
# cat protein_class*.csv > all_classes.csv


# wc -l all*
# 20786 all_classes.csv

library(readr)
FILE_PATH = "~/Dropbox/git_projects/random_forest_dk_project/uniprot_1/all_classes.csv"
all_classes <- read_csv(FILE_PATH)

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )

## See spec(...) for full column specifications.

# View(all_classes)
# names(all_classes)

table(is.na(all_classes$TotalAA))

## 
## FALSE 
## 20785

# max = max(all_classes, na.rm=TRUE)
# min = min(all_classes, na.rm=TRUE)
# mean = mean(all_classes, na.rm = TRUE)
# 
# is_numeric = is.numeric(all_classes)
# 
# median = median(all_classes, na.rm = TRUE)
# mode = mode(all_classes, na.rm = TRUE)
# 
# # Initial investigation of "all_classes" dataset.
# barplot(table(all_classes$TotalAA))

The data set ‘all_classes’ has the following structure:

str(all_classes)

## Classes 'tbl_df', 'tbl' and 'data.frame':    20785 obs. of  22 variables:
##  $ Class  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TotalAA: int  393 2016 2871 1210 770 781 2351 2527 920 34350 ...
##  $ G      : num  0.0585 0.058 0.1069 0.0702 0.0494 ...
##  $ P      : num  0.1145 0.0526 0.061 0.062 0.0455 ...
##  $ A      : num  0.0611 0.0675 0.0334 0.0595 0.0818 ...
##  $ V      : num  0.0458 0.059 0.0376 0.0579 0.0844 ...
##  $ L      : num  0.0814 0.1126 0.0491 0.0917 0.0727 ...
##  $ I      : num  0.0204 0.0605 0.0519 0.057 0.0312 ...
##  $ M      : num  0.0305 0.0342 0.0181 0.0207 0.0312 ...
##  $ C      : num  0.0254 0.0208 0.1261 0.0496 0.0234 ...
##  $ F      : num  0.028 0.0615 0.0293 0.0298 0.0273 ...
##  $ Y      : num  0.0229 0.0268 0.0324 0.0298 0.026 ...
##  $ W      : num  0.01018 0.01637 0.00453 0.01074 0.01169 ...
##  $ H      : num  0.0305 0.0134 0.0167 0.0256 0.0325 ...
##  $ K      : num  0.0509 0.0412 0.0387 0.0545 0.0532 ...
##  $ R      : num  0.0662 0.0516 0.0456 0.0496 0.0481 ...
##  $ Q      : num  0.0382 0.0352 0.0348 0.0405 0.0468 ...
##  $ N      : num  0.0356 0.0382 0.0655 0.0545 0.0403 ...
##  $ E      : num  0.0763 0.067 0.07 0.0636 0.1195 ...
##  $ D      : num  0.0509 0.0466 0.0603 0.0504 0.0649 ...
##  $ S      : num  0.0967 0.0784 0.0603 0.0694 0.0455 ...
##  $ T      : num  0.056 0.0585 0.0578 0.0529 0.0649 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 22
##   .. ..$ Class  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ TotalAA: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ G      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ P      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ A      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ V      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ L      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ I      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ M      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ C      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ F      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Y      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ W      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ H      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ K      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ R      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Q      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ N      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ E      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ D      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ S      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ T      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Training the Random Forest

We will use all the Predictors in the dataset.

Before we build our model, let’s separate our data into testing and training sets. This will place 75% of the observations of the original dataset into ‘train’ and the remaining 25% of the observations into ‘test.’

Total number of samples from “all_classes.csv” = 20786 observations 75% of 20786 = 15589.5 Therefore, training sample = 15590 observations

Seperating Training and Test Sets

set.seed(123)
train = sample(1:nrow(all_classes), 15590)

# Start the clock!
ptm <- proc.time()

all_classes.rf = randomForest(Class ~ . , data = all_classes , subset = train)
all_classes.rf

## 
## Call:
##  randomForest(formula = Class ~ ., data = all_classes, subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##           Mean of squared residuals: 0.05967919
##                     % Var explained: 89.22

cat("\n")

# Stop the clock
cat ("Run time:" , (proc.time() - ptm), "sec")

## Run time: 52.936 0.06 52.998 0 0 sec

Plotting the Error vs Number of Trees Graph.

plot(all_classes.rf, main = "MS Error of Resisduals Vs Number of Trees Produced",
     col = "dark red")

This plot shows the MS Error Vs. the Number of Trees. We can notice that the Error drops as the program adds more trees then averages them. At 400 trees the MSE is at a plateau.

===================================

Now we can compare the Out of Bag Sample Errors and Error on Test set.

The above Random Forest model chose Randomly 7 variables to be considered at each split. We could now try all possible 20 predictors which can be found at each split.

mtry
Number of variables randomly sampled as candidates at each split. Note that the default values are classification (sqrt(p) where p is number of variables in x) and regression (p/3)
===================================

# Start the clock!
ptm <- proc.time()

oob.err = double(20)
test.err = double(20)

#mtry is no of Variables randomly chosen at each split
for(mtry in 1:20) {
    rf = randomForest(Class ~ . ,
                      data = all_classes,
                      subset = train,
                      mtry = mtry,
                      ntree = 400,
                      localImp = TRUE,
                      importance = TRUE)
    
    oob.err[mtry] = rf$mse[400] #Error of all Trees fitted
    
    pred <- predict(rf, all_classes[-train, ]) 
    #Predictions on Test Set for each Tree
    
    test.err[mtry] = with(all_classes[-train, ], mean((Class - pred) ^ 2)) 
    #Mean Squared Test Error
    
    cat(mtry, " ") 
    #print the output to the console
}

## 1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20

cat ("\n")

# Stop the clock
cat ("Run time:" , (proc.time() - ptm)/60, "minutes")

## Run time: 25.2024 0.0196 25.22393 0 0 minutes

# Test Error
test.err

##  [1] 0.08392382 0.06694464 0.06070022 0.05597037 0.05616387 0.05528229
##  [7] 0.05438539 0.05462076 0.05463020 0.05686442 0.05821712 0.05881789
## [13] 0.06009367 0.06101072 0.06405407 0.06421689 0.06636766 0.06791265
## [19] 0.07063672 0.07372990

oob.err

##  [1] 0.07933952 0.06725685 0.06305201 0.06064184 0.06125738 0.05985970
##  [7] 0.05886961 0.06191989 0.06272479 0.06381689 0.06505609 0.06366956
## [13] 0.06652310 0.06735589 0.07031350 0.07048365 0.07245007 0.07330003
## [19] 0.07513694 0.07805369

What happens is that we are growing 400 trees for 20 times i.e. for all 20 predictors.

Plotting both Test Error and Out of Bag Error

matplot(1:mtry, cbind(oob.err,test.err), 
        pch = 19, col = c("red","blue"), type = "b", 
        ylab = "Mean Squared Error",
        xlab = "Number Of Predictors Considered At Each Split")
legend("topright", legend = c("Out of Bag Error", "Test Error"),
       pch = 19, col = c("red", "blue"))

Classification of Oxygen Binding Proteins Using Random Forest Machine Learning

Matthew Curcio

August 16, 2017