text
This work can be found on GitHub and on RPubs.
Column 1 = Protein Class (0:6)
Column 2 = Total AA per protein
Columns 3:22; (G, P, A, V, L, I, M, C, F, Y, W, H, K, R, Q, N, E, D, S, T)
suppressMessages(library(seqinr))
suppressMessages(library(stringr))
suppressMessages(library(readr))
suppressMessages(library(randomForest))
suppressMessages(library(dplyr))
# devtools::install_github("MI2DataLab/randomForestExplainer")
suppressMessages(library(randomForestExplainer))
fasta_2_sequences <- function() {
setwd("~/Dropbox/git_projects/random_forest_dk_project/uniprot_1")
o2_binders = list.files(pattern = ".fasta$")
# o2_binders
# length(o2_binders)
protein_class = c("erythrocruorin", "hemerythrin", "hemocyanin",
"leghemoglobin", "myoglobin", "Homosapiens",
"hemoglobin")
# for loop to read sequences using seqinr:read.fasta
for (i in seq(length(o2_binders))) {
prot_seq <- read.fasta(file = o2_binders[i], seqtype = "AA",
seqonly = TRUE, as.string = TRUE)
# prot_seq_is.vector = is.vector(prot_seq)
file_name <- paste("protein_class", protein_class[i], ".txt", sep = "")
write.table(x = prot_seq, file = file_name, sep = "\n",
row.names = FALSE, col.names = col_titles)
}
}
percent_aa <- function(df, protein_class) {
col_titles = t(c("Class", "TotalAA", "G", "P", "A", "V", "L", "I", "M", "C",
"F", "Y", "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T"))
file_name <- paste("protein_class", protein_class, ".csv", sep = "")
write.table(matrix(0, ncol = 22), file_name, sep = ",",
col.names = col_titles, row.names = FALSE, eol = "\n")
############################################
aa_nums = as.vector(matrix(0, ncol = 22))
for (row in seq(length(df))) {
# First column is protein_class
aa_nums[1] = protein_class
# Second column is total number of amino acids
total_aa = nchar(df[1,row], keepNA = FALSE)
aa_nums[2] = total_aa
# Column 3:22 - Calculate percent AA
aa_nums[3] = str_count(df[1,row], "G") / total_aa
aa_nums[4] = str_count(df[1,row], "P") / total_aa
aa_nums[5] = str_count(df[1,row], "A") / total_aa
aa_nums[6] = str_count(df[1,row], "V") / total_aa
aa_nums[7] = str_count(df[1,row], "L") / total_aa
aa_nums[8] = str_count(df[1,row], "I") / total_aa
aa_nums[9] = str_count(df[1,row], "M") / total_aa
aa_nums[10] = str_count(df[1,row], "C") / total_aa
aa_nums[11] = str_count(df[1,row], "F") / total_aa
aa_nums[12] = str_count(df[1,row], "Y") / total_aa
aa_nums[13] = str_count(df[1,row], "W") / total_aa
aa_nums[14] = str_count(df[1,row], "H") / total_aa
aa_nums[15] = str_count(df[1,row], "K") / total_aa
aa_nums[16] = str_count(df[1,row], "R") / total_aa
aa_nums[17] = str_count(df[1,row], "Q") / total_aa
aa_nums[18] = str_count(df[1,row], "N") / total_aa
aa_nums[19] = str_count(df[1,row], "E") / total_aa
aa_nums[20] = str_count(df[1,row], "D") / total_aa
aa_nums[21] = str_count(df[1,row], "S") / total_aa
aa_nums[22] = str_count(df[1,row], "T") / total_aa
aa_nums = t(aa_nums)
# Write/append vector of AA values
write(aa_nums, file = file_name, append = TRUE, ncolumns = 22, sep = ",")
}
}
read_seq_txt <- function() {
uniprot_dir = "~/Dropbox/git_projects/random_forest_dk_project/uniprot_1"
setwd(uniprot_dir)
protein_class = 0:6
o2_seqs = list.files(pattern = ".txt$")
for (i in seq(length(o2_seqs))) {
df = read.csv(file = o2_seqs[i], header = FALSE)
df %>% mutate_if(is.factor, as.character) -> df
df = t(df)
percent_aa(df, protein_class[i])
}
}
#files <- list.files(pattern = "\\.csv$")
# Concatenate all protein files.csv to all_classes.csv
# cat protein_class*.csv > all_classes.csv
# wc -l all*
# 20786 all_classes.csv
library(readr)
FILE_PATH = "~/Dropbox/git_projects/random_forest_dk_project/uniprot_1/all_classes.csv"
all_classes <- read_csv(FILE_PATH)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Class = col_integer(),
## TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# View(all_classes)
# names(all_classes)
table(is.na(all_classes$TotalAA))
##
## FALSE
## 20785
# max = max(all_classes, na.rm=TRUE)
# min = min(all_classes, na.rm=TRUE)
# mean = mean(all_classes, na.rm = TRUE)
#
# is_numeric = is.numeric(all_classes)
#
# median = median(all_classes, na.rm = TRUE)
# mode = mode(all_classes, na.rm = TRUE)
#
# # Initial investigation of "all_classes" dataset.
# barplot(table(all_classes$TotalAA))
The data set ‘all_classes’ has the following structure:
str(all_classes)
## Classes 'tbl_df', 'tbl' and 'data.frame': 20785 obs. of 22 variables:
## $ Class : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TotalAA: int 393 2016 2871 1210 770 781 2351 2527 920 34350 ...
## $ G : num 0.0585 0.058 0.1069 0.0702 0.0494 ...
## $ P : num 0.1145 0.0526 0.061 0.062 0.0455 ...
## $ A : num 0.0611 0.0675 0.0334 0.0595 0.0818 ...
## $ V : num 0.0458 0.059 0.0376 0.0579 0.0844 ...
## $ L : num 0.0814 0.1126 0.0491 0.0917 0.0727 ...
## $ I : num 0.0204 0.0605 0.0519 0.057 0.0312 ...
## $ M : num 0.0305 0.0342 0.0181 0.0207 0.0312 ...
## $ C : num 0.0254 0.0208 0.1261 0.0496 0.0234 ...
## $ F : num 0.028 0.0615 0.0293 0.0298 0.0273 ...
## $ Y : num 0.0229 0.0268 0.0324 0.0298 0.026 ...
## $ W : num 0.01018 0.01637 0.00453 0.01074 0.01169 ...
## $ H : num 0.0305 0.0134 0.0167 0.0256 0.0325 ...
## $ K : num 0.0509 0.0412 0.0387 0.0545 0.0532 ...
## $ R : num 0.0662 0.0516 0.0456 0.0496 0.0481 ...
## $ Q : num 0.0382 0.0352 0.0348 0.0405 0.0468 ...
## $ N : num 0.0356 0.0382 0.0655 0.0545 0.0403 ...
## $ E : num 0.0763 0.067 0.07 0.0636 0.1195 ...
## $ D : num 0.0509 0.0466 0.0603 0.0504 0.0649 ...
## $ S : num 0.0967 0.0784 0.0603 0.0694 0.0455 ...
## $ T : num 0.056 0.0585 0.0578 0.0529 0.0649 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 22
## .. ..$ Class : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ TotalAA: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ G : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ P : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ A : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ V : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ L : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ I : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ M : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ C : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ F : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Y : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ W : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ H : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ K : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ R : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Q : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ N : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ E : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ D : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ S : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ T : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
We will use all the Predictors in the dataset.
Before we build our model, let’s separate our data into testing and training sets. This will place 75% of the observations of the original dataset into ‘train’ and the remaining 25% of the observations into ‘test.’
Total number of samples from “all_classes.csv” = 20786 observations 75% of 20786 = 15589.5 Therefore, training sample = 15590 observations
set.seed(123)
train = sample(1:nrow(all_classes), 15590)
# Start the clock!
ptm <- proc.time()
all_classes.rf = randomForest(Class ~ . , data = all_classes , subset = train)
all_classes.rf
##
## Call:
## randomForest(formula = Class ~ ., data = all_classes, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 0.05967919
## % Var explained: 89.22
cat("\n")
# Stop the clock
cat ("Run time:" , (proc.time() - ptm), "sec")
## Run time: 52.936 0.06 52.998 0 0 sec
plot(all_classes.rf, main = "MS Error of Resisduals Vs Number of Trees Produced",
col = "dark red")
This plot shows the MS Error Vs. the Number of Trees. We can notice that the Error drops as the program adds more trees then averages them. At 400 trees the MSE is at a plateau.
===================================
Now we can compare the Out of Bag Sample Errors and Error on Test set.
The above Random Forest model chose Randomly 7 variables to be considered at each split. We could now try all possible 20 predictors which can be found at each split.
mtry
Number of variables randomly sampled as candidates at each split. Note that the default values are classification (sqrt(p) where p is number of variables in x) and regression (p/3)
===================================
# Start the clock!
ptm <- proc.time()
oob.err = double(20)
test.err = double(20)
#mtry is no of Variables randomly chosen at each split
for(mtry in 1:20) {
rf = randomForest(Class ~ . ,
data = all_classes,
subset = train,
mtry = mtry,
ntree = 400,
localImp = TRUE,
importance = TRUE)
oob.err[mtry] = rf$mse[400] #Error of all Trees fitted
pred <- predict(rf, all_classes[-train, ])
#Predictions on Test Set for each Tree
test.err[mtry] = with(all_classes[-train, ], mean((Class - pred) ^ 2))
#Mean Squared Test Error
cat(mtry, " ")
#print the output to the console
}
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
cat ("\n")
# Stop the clock
cat ("Run time:" , (proc.time() - ptm)/60, "minutes")
## Run time: 25.2024 0.0196 25.22393 0 0 minutes
# Test Error
test.err
## [1] 0.08392382 0.06694464 0.06070022 0.05597037 0.05616387 0.05528229
## [7] 0.05438539 0.05462076 0.05463020 0.05686442 0.05821712 0.05881789
## [13] 0.06009367 0.06101072 0.06405407 0.06421689 0.06636766 0.06791265
## [19] 0.07063672 0.07372990
oob.err
## [1] 0.07933952 0.06725685 0.06305201 0.06064184 0.06125738 0.05985970
## [7] 0.05886961 0.06191989 0.06272479 0.06381689 0.06505609 0.06366956
## [13] 0.06652310 0.06735589 0.07031350 0.07048365 0.07245007 0.07330003
## [19] 0.07513694 0.07805369
What happens is that we are growing 400 trees for 20 times i.e. for all 20 predictors.
Plotting both Test Error and Out of Bag Error
matplot(1:mtry, cbind(oob.err,test.err),
pch = 19, col = c("red","blue"), type = "b",
ylab = "Mean Squared Error",
xlab = "Number Of Predictors Considered At Each Split")
legend("topright", legend = c("Out of Bag Error", "Test Error"),
pch = 19, col = c("red", "blue"))