library(tidyverse)
library(readr)
# Reading the fasta file from internet
dna.data <- readLines("https://arken.nmbu.no/~jonvi/stin300/data/Neuraminidase.fasta")
# Now we want to locate all the lines with a header, which means a different sequence.
seq.idx <- which(str_starts(string = dna.data, pattern = ">"))
# And put them in a tibble; the sequences will be stored in the table as follows...
dna.table <- tibble(Header = dna.data[seq.idx], Sequence = "")
# Then we make a cycle that will take everything (all lines) in between headers.
N = nrow(dna.table)
seq.idx <- c(seq.idx, length(dna.data) + 1) # the index of the sequences should be 'updated' with the 'location' of the last element in the 'source' (the .txt file) plus one, cause in the last round of the cycle, we need to define an 'l2'
for(row in 1:(N)){
# first we define line 1 as the starting point (of the sequence) after which lines will stored as sequence and l2 as the last sequence line.
l1 <- seq.idx[row] + 1
l2 <- seq.idx[row + 1] - 1
temp.seq <- dna.data[l1:l2]
dna.table$Sequence[row] <- str_c(temp.seq, collapse = "")
}
## Now we want to detect and extract the HxNy patterns
# First we create a vector that shows all the 'Headers' that have the HxNy serotype (it returns TRUE when there is serotype in the form we are looking for..)
vector.sero <- str_detect(string = dna.table$Header, pattern = "H[0-9]{1,2}N[0-9]{1,2}")
# Now, we want to index them...
idx.sero <- which(vector.sero)
# Now we modify the table and get rid of the 'Headers' without a serotype..
dna.table %>%
slice(idx.sero) ->new.dna.table
# Now we want to create the column 'Serotype' in the 'new dna table' and put there the serotype
new.dna.table %>%
mutate(Serotype = "", H.type = "", N.type = "") -> new.dna.table
n <- nrow(new.dna.table)
for(i in 1:n){
serotype <- str_extract(string = new.dna.table$Header[i], pattern = "H[0-9]{1,2}N[0-9]{1,2}")
new.dna.table$Serotype[i] <-serotype
H <- str_extract(string = serotype, pattern = "H[0-9]{1,2}")
new.dna.table$H.type[i] <- H
N <- str_extract(string = serotype, pattern = "N[0-9]{1,2}")
new.dna.table$N.type[i] <- N
}
# Now, we are making the plot of of how frequent each N-type
new.dna.table %>%
group_by(N.type) %>%
summarise(N.type.frq = n()) %>%
arrange(N.type.frq) -> Nplot
g <- ggplot(data = new.dna.table) + geom_histogram(mapping = aes(x = N.type, fill = N.type), stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
print(g)
# From this histogram we can observe that there are 9 N-types.
# From Day 6, this is what I coded. This basically will give me a vector containing all the L-mers based on a given L.
dna <- new.dna.table$Sequence[47]
L <- 2
N <- str_length(dna)
lmers.dna <- str_sub(dna, start = 1:(N-L+1), end = L:N)
# The following function returns all possible permutations -L-mers- given a certain L.
elmers <- function(L, iter = 1, alphabet = c("A","C","G","T")){
require(stringr)
if(iter < L){
w <- elmers(L, iter + 1, alphabet)
w <- str_c(rep(w, each = length(alphabet)),
rep(alphabet, times = length(alphabet)))
} else {
w <- alphabet
}
return(sort(w))
}
l.mers <- elmers(L = 2)
## Now we shall make a function that first:
# 1. Creates a tibble with a number of rows equal to the possible permutations, based on the L given as argument. IT USES THE OUTPUT OF THE elmers() function (so this function is called first)
# 2. Now we split up the DNA.sequence into L-mers (based on the specified L)
# 3. Find and count how many time we have each L-mer in the sequence; if it is not found will retur a zero.
# Then we do the final step which would be generating a table with all possible L-mers and the counts in the sequence we want to apply the function.
tbl <- tibble(L_mer = as.character(1:length(l.mers)), Count = 1:length(l.mers))
for(i in 1:length(l.mers)){
i.elmer <- length(which(lmers.dna %in% l.mers[i]))
tbl$L_mer[i] <- l.mers[i]
tbl$Count[i] <- i.elmer
}
# So... Finally the function!!:
dna_counts <- function(DNAsequence, L){
N <- str_length(DNAsequence)
lmers.dna <- str_sub(DNAsequence, start = 1:(N-L+1), end = L:N)
l.mers <- elmers(L = 2) # Create the permutations list, calling on the 'elmers' function
tbl <- tibble(L_mer = as.character(1:length(l.mers)), Count = 1:length(l.mers)) # create a # tibble to store the data.
for(i in 1:length(l.mers)){
i.elmer <- length(which(lmers.dna %in% l.mers[i]))
tbl$L_mer[i] <- l.mers[i]
tbl$Count[i] <- i.elmer
}
cols <- tbl$L_mer # Gets a vector with the names of the columns from the table generated in the function (the 16 L-mers).
tbl <- t(as.matrix(tbl[,2])) # transforms the generated table (in the function) and returs a one-row-matrix
colnames(tbl) <- cols # Asigns a name to the matrix' columns (the 16 L-mers)
rownames(tbl) <- NULL # Resets the row names (to erase that anoying "Count"...)
all <- sum(tbl[1,])
for(i in 1:length(tbl)){
new.value <- (tbl[1,i]) / all
tbl[1,i] <- new.value
}
return(tbl)
}
test <- dna_counts(new.dna.table$Sequence[475], L = 2)
#
# # Experimenting with matices, to get the last part of "Part 2"...
#
# mat <- matrix(nrow = 1, ncol = 16) # just an example of creating a matrix...
#
# mat2 <- rbind(tbl, mat) # binding together two matrices, by row.
# First we create a matrix with the proper size...
mat <- matrix(nrow = nrow(new.dna.table), ncol = 16)
l.mers <- elmers(L = 2) # Just copying the names of the L-mers to..
colnames(mat) <- l.mers # Name the columns of the matrix
for(i in 1:nrow(new.dna.table)){
temp <- dna_counts(new.dna.table$Sequence[i], L = 2)
mat[i,] <- temp
}
as_tibble(mat)
## # A tibble: 5,986 x 16
## AA AC AG AT CA CC CG CT GA GC GG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.101 0.0517 0.0822 0.0885 0.0864 0.0418 0.0127 0.0390 0.0800 0.0347 0.0772
## 2 0.0857 0.0538 0.0772 0.0829 0.0765 0.0397 0.0212 0.0439 0.0822 0.0390 0.0885
## 3 0.102 0.0581 0.0758 0.0914 0.0814 0.0368 0.0212 0.0404 0.0822 0.0326 0.0722
## 4 0.0873 0.0525 0.0703 0.0823 0.0802 0.0390 0.0170 0.0525 0.0731 0.0426 0.0802
## 5 0.101 0.0737 0.0793 0.0843 0.0864 0.0482 0.0205 0.0545 0.0779 0.0439 0.0659
## 6 0.0967 0.0477 0.0768 0.0875 0.0761 0.0391 0.0213 0.0363 0.0832 0.0349 0.0804
## 7 0.143 0.0646 0.0703 0.0972 0.0852 0.0454 0.00923 0.0433 0.0696 0.0334 0.0525
## 8 0.104 0.0681 0.0653 0.0880 0.0830 0.0497 0.0220 0.0426 0.0781 0.0326 0.0667
## 9 0.104 0.0681 0.0653 0.0873 0.0830 0.0497 0.0220 0.0426 0.0774 0.0326 0.0667
## 10 0.0951 0.0518 0.0724 0.0887 0.0866 0.0390 0.0156 0.0468 0.0696 0.0454 0.0653
## # … with 5,976 more rows, and 5 more variables: GT <dbl>, TA <dbl>, TC <dbl>,
## # TG <dbl>, TT <dbl>
tbl.part2 <- cbind(new.dna.table, mat) # We bind together the tables. Seems like cbind is a better function, bind.cols was giving an error (saying that argument 2 had no names..)
# tbl.part2 is the outcome of part 2.
The last part is to extract the subset containing only the cases for N-types N1, N3 and N6
tbl.part2 %>%
filter(N.type == "N1") -> t1
tbl.part2 %>%
filter(N.type == "N3") -> t2
tbl.part2 %>%
filter(N.type == "N6") -> t3
tbl.3Ncases <- rbind(t1, t2, t3)
# Here we have a table with only the N1, N2 nd N3 cases.
# Now we create de X.train and the 'grouping'
tbl.3Ncases %>%
select(-(1:5)) %>%
as.matrix() -> X.train # to train the model
Y.train <- factor(tbl.3Ncases$N.type)
# Now we shall create the model:
fit.lda <- MASS::lda(x = X.train, grouping = Y.train)
## Warning in lda.default(x, grouping, ...): variables are collinear
print(fit.lda)
## Call:
## lda(X.train, grouping = Y.train)
##
## Prior probabilities of groups:
## N1 N3 N6
## 0.3788136 0.3177966 0.3033898
##
## Group means:
## AA AC AG AT CA CC CG
## N1 0.09917801 0.05036713 0.06939983 0.08990678 0.07900486 0.03873179 0.01609232
## N3 0.10136852 0.06989958 0.07043283 0.08364254 0.08874924 0.04637195 0.01731957
## N6 0.11237964 0.06011365 0.07798339 0.09074060 0.09601064 0.04513204 0.01896308
## CT GA GC GG GT TA TC
## N1 0.04882048 0.07204038 0.04045343 0.07266370 0.05761099 0.05813660 0.05311164
## N3 0.04804692 0.07602062 0.03680073 0.06592284 0.05361676 0.05918122 0.04741459
## N6 0.04283306 0.07628629 0.04427824 0.07192100 0.04531908 0.05584114 0.05341887
## TG TT
## N1 0.08508763 0.06939443
## N3 0.07871105 0.05650104
## N6 0.06963174 0.03914753
##
## Coefficients of linear discriminants:
## LD1 LD2
## AA 112.07237 119.94829
## AC 206.76032 309.31401
## AG 1772.76536 1247.37233
## AT 101.83185 506.33954
## CA -469.56329 -972.04356
## CC -39.27346 -94.99039
## CG 1068.31366 367.85576
## CT -390.63877 84.79706
## GA -1692.79203 -1122.58583
## GC -1267.61206 -654.53266
## GG 160.75057 101.65963
## GT -1553.17574 -682.19271
## TA -418.75576 -852.08531
## TC 74.31625 152.31308
## TG 1219.61446 349.32595
## TT -291.93432 35.59190
##
## Proportion of trace:
## LD1 LD2
## 0.7021 0.2979
# And then we need to make a matricial multiplication for making the plot.
newcoord <- X.train %*% fit.lda$scaling
tibble(N.type = tbl.3Ncases$N.type) %>%
cbind(as_tibble(newcoord)) -> g
summary(fit.lda)
## Length Class Mode
## prior 3 -none- numeric
## counts 3 -none- numeric
## means 48 -none- numeric
## scaling 32 -none- numeric
## lev 3 -none- character
## svd 2 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
lda.plot <- ggplot(g) + geom_point(aes(x = LD1, y = LD2, color = N.type))
print(lda.plot)
# # We just take the first row (L-mers columns) as a Test data (for the distances function)
# tbl.part2 %>%
# slice(1) %>%
# select(-(1:5)) %>%
# as.matrix() -> X.test
PART 3, STARTS HERE ACTUALLY, creating the test and train entities.
# Here below we can take a random sample of 'k' observations as a Test data.
N <- nrow(tbl.part2)
set.seed(20200123)
test.cases.idx <- sample(1:N, size = 20)
X.test <- tbl.part2[test.cases.idx, ]
X.test %>%
select(-(1:5)) %>%
as.matrix() -> X.test # This X.test is a matrix of 20 randomly selected rows from tbl.part2 (we start with few, to test the further functions, namely KNN)
set.seed(20200124)
train.cases.idx <- sample(1:nrow(tbl.part2), size = 80) # Following Lars' advice, we select a slightly bigger set of data as X.train
X.train <- tbl.part2[train.cases.idx, ]
X.train %>%
select(-(1:5)) %>%
as.matrix() -> X.train
# THIS FOLLOWING COMMENTS INCLUDE MY PRACTICING TO WRITE THE DISTANCES CALCULATION FUNCTION, HENCE LEFT THEM AS COMMENTS.
# Let's calculate the distance 'd' as I have understood...
## Following the 'hints' of the assignment:
# replace X.test[1,] with p (a vector, a row of counts)
# replace X.train with q (a matrix, several rows of counts)
# Use division instead of subtraction
# There is no power involved, but instead a log2()
# So we change the name of the variables just to avoid confusion...
# p <- X.test[1,]
# Q <- X.train
#
# d <- rowSums(abs(log2(t(p / t(Q)))))
# length(d)
# So far, it seems like it does return a vector of distances, length equal to the number of rows in 'q' (2000 rows).
# So, we can make a function:
HERE WE CREATE THE DISTANCES FUNCTION
distances <- function(p.local, Q.local){ # X_test_matrix should be a matrix of one row and 'n' columns. X_train_matrix can be a matrix of any number of rows and 'n' columns as well.
d.local <- rowSums(abs(log2(t(p.local / t(Q.local)))))
return(d.local)
}
# Here below, we create the Y.train
set.seed(20200124) # Same seed so the random selection will 'correspond'.
Y.train.cases.idx <- sample(1:nrow(tbl.part2), size = 80) # Following Lars' advice, we select a slightly bigger set of data as X.train. # We could have actually used the same 'train.cases.idx', but anyways... is the same
Y.train <- tbl.part2[Y.train.cases.idx, ]
Y.train %>%
select(5) %>%
as.matrix() -> Y.train
source("MostFreq_Finder_V2.R") # FUNCTION Coded by myself but in the end did not use it because it #does not work for all K cases.
Now, we define here the KNN function.
my.KNN <- function(y.train.local, X.train.local, X.test.local, K.local){
# First we need to create the matrix of responses (Y.test)
N.local <- nrow(X.test.local)
y.test.local <- matrix(1:N.local, ncol = 1)
for(i in 1:N.local){
d <- distances(p.local = X.test.local[i,], Q.local = X.train.local)
idx <- order(d) # an indexing vector with the nearest neighbors
# Then I need to NOT compute an average, but to see the most frequent value (N-type)
neighbors <- table(y.train.local[idx[1:K.local]])
y.test.local[i] <- neighbors %>% which.max %>% names %>% sample(size = 1) # REMARK: it seems #like in case of two "max" elements, the which.max will always return the "first max value", which #in this case would be the most frequent one in the table() function output.
# So I coded the function mostFreq(). However, for unknown reasons to me, the function does #not work for any 'k' value I enter... so I left it as which.max %>% names %>% sample(size = 1)
}
return(y.test.local)
}
We test it and check how it performed with the entities created above.
KNN.test <- my.KNN(y.train.local = Y.train, X.train.local = X.train, X.test.local = X.test, K.local = 1)
print(KNN.test)
## [,1]
## [1,] "N1"
## [2,] "N2"
## [3,] "N9"
## [4,] "N6"
## [5,] "N2"
## [6,] "N3"
## [7,] "N7"
## [8,] "N4"
## [9,] "N4"
## [10,] "N1"
## [11,] "N2"
## [12,] "N1"
## [13,] "N1"
## [14,] "N2"
## [15,] "N7"
## [16,] "N8"
## [17,] "N3"
## [18,] "N1"
## [19,] "N3"
## [20,] "N4"
tbl.part2 %>%
slice(test.cases.idx) %>%
select(5) -> real
# To check how the model performed with the test
print(tibble(Real = real, Predicted = KNN.test))
## # A tibble: 20 x 2
## Real$N.type Predicted[,1]
## <chr> <chr>
## 1 N1 N1
## 2 N2 N2
## 3 N9 N9
## 4 N6 N6
## 5 N2 N2
## 6 N3 N3
## 7 N7 N7
## 8 N4 N4
## 9 N5 N4
## 10 N1 N1
## 11 N2 N2
## 12 N8 N1
## 13 N1 N1
## 14 N2 N2
## 15 N7 N7
## 16 N8 N8
## 17 N3 N3
## 18 N1 N1
## 19 N3 N3
## 20 N4 N4
Looks good! :)
First the DATA SPLITTING:
# First we take our data set 'tbl.part2'
C <- 100
set.seed(20200125)
tbl.part2 %>%
# Shuffle rows in random order:
slice(sample(1:n())) %>%
arrange(N.type) %>%
mutate(Segment = rep(1:C, length.out = n())) -> tbl.part2
# Double checking if we have a fair distribution of all N.types:
tbl.part2 %>%
group_by(Segment, N.type) %>%
summarise(ntypes = n()) %>%
spread(N.type, ntypes) %>% view # Looks good....
# Neighborhood sizes that will evaluate
K.values <- c(1, 3, 5)
# initiliaze as character so it can receive "N1", "N2",..., later
Y.test <- matrix("", nrow = nrow(tbl.part2), ncol = length(K.values))
colnames(Y.test) <- str_c("K.", K.values)
# Put the response and predictors in separate tibbles:
tbl.part2 %>%
select(N.type) -> tbl.y
tbl.part2 %>%
select(6:21) -> tbl.X
# Code taken from lectures and then modified.
for(i in 1:C){
# Which L-mers (i.e. their index numbers) are in this segment
tst.rows <- which(tbl.part2$Segment == i)
# Use these L-mers as our test set in this iteration of the "i" loop
X.test <- as.matrix(tbl.X[tst.rows,])
# Use the other L-mers as training.
X.train <- as.matrix(tbl.X[-tst.rows,])
y.train <- as_vector(tbl.y[-tst.rows,])
# Loop for candidate models, in this case "neighborhood size"
for(j in 1:ncol(Y.test)){
# compute predictions for each set of L-mers in the test set
# for this neighborhood size, K.values[j] and sotre them for later
prediction <- my.KNN(y.train.local = y.train,
X.train.local = X.train,
X.test.local = X.test,
K.local = K.values[j])
# Cannot put a factor directly into my character matrix!, must coerce with character() first
Y.test[tst.rows, j] <- as.character(prediction)
}
}
#print(Y.test)
Bind the results, first converted to tibble to the original table
tbl.part2 %>%
bind_cols(as_tibble(Y.test)) -> tbl.part2
Now we shall calculate the classification error rate (CER). The CER is just 1 minus the accuracy… So, calculating accuracy for each KNN model:
# For K = 1
accuracy.k1 <- sum(tbl.part2$N.type == tbl.part2$K.1) / nrow(tbl.part2)
cer.k1 <- 1-accuracy.k1
# For K = 3
accuracy.k3 <- sum(tbl.part2$N.type == tbl.part2$K.3) / nrow(tbl.part2)
cer.k3 <- 1-accuracy.k3
# For K = 5
accuracy.k5 <- sum(tbl.part2$N.type == tbl.part2$K.5) / nrow(tbl.part2)
cer.k5 <- 1-accuracy.k5
min(cer.k1, cer.k3, cer.k5)
## [1] 0.001503508
# Just summarizing the accuracies:
accuracies <- c(accuracy.k1, accuracy.k3, accuracy.k5)
cers <- c(cer.k1, cer.k3, cer.k5)
Accu.table <- matrix(0, nrow = 3, ncol = 2)
rownames(Accu.table) <- c("K1", "K3", "K5")
colnames(Accu.table) <- c("Accuracy", "CER")
Accu.table[,1] <- accuracies
Accu.table[,2] <- cers
Accu.table <- as.data.frame(Accu.table)
Accu.table %>%
mutate(KNN.pred = c("K1", "K3", "K5")) -> Accu.table
Accu.table %>%
mutate(Accuracy = Accuracy*100) -> Accu.table
print(Accu.table)
## Accuracy CER KNN.pred
## 1 99.79953 0.002004678 K1
## 2 99.84965 0.001503508 K3
## 3 99.74942 0.002505847 K5
iiddxx <- which.max(Accu.table$Accuracy)
cat("The most accurate model is when the number of neighbors(K) is: ", Accu.table$KNN.pred[iiddxx])
## The most accurate model is when the number of neighbors(K) is: K3
## That would be the 'global' accuracies for each model, however, we would like to check the accuracy for the prediction of each N.type.
# First we create simple tables summarizing all the N.types predicted by each model and the real ones:
matrix.final <- matrix(ncol = 7, nrow = 9)
rows.names <- c("N1", "N2", "N3", "N4", "N5", "N6", "N7", "N8", "N9")
rownames(matrix.final) <- rows.names
colums.names <- c("Freq.pred.K1", "Delta.K1", "Freq.pred.K3", "Delta.K3", "Freq.pred.K5", "Delta.K5", "Freq.N.type.Real")
colnames(matrix.final) <- colums.names
table.k1 <- table(tbl.part2$K.1)
table.k1 %>%
as.data.frame() -> table.k1
table.k3 <- table(tbl.part2$K.3)
table.k3 %>%
as.data.frame() -> table.k3
table.k5 <- table(tbl.part2$K.5)
table.k5 %>%
as.data.frame() -> table.k5
table.real <- table(tbl.part2$N.type)
table.real %>%
as.data.frame() -> table.real
matrix.final[,1] <- table.k1$Freq
matrix.final[,3] <- table.k3$Freq
matrix.final[,5] <- table.k5$Freq
matrix.final[,7] <- table.real$Freq
for(i in 1:nrow(matrix.final)){
delta.k1 <- abs(matrix.final[i,7] - matrix.final[i,1])
matrix.final[i,2] <- delta.k1
delta.k3 <- abs(matrix.final[i,7] - matrix.final[i,3])
matrix.final[i,4] <- delta.k3
delta.k5 <- abs(matrix.final[i,7] - matrix.final[i,5])
matrix.final[i,6] <- delta.k5
}
matrix.final %>%
as.data.frame() -> matrix.final
matrix.final %>%
mutate(N.types = rows.names) -> matrix.final
# Now let's try to plot the results
final.plot <- ggplot(data = matrix.final) + geom_point(aes(x = N.types, y = Delta.K1), shape = 2, size = 4, show.legend = T) + geom_point(aes(x = N.types, y = Delta.K3), shape = 4, size = 4, show.legend = T) + geom_point(aes(x = N.types, y = Delta.K5), shape = 1, size = 3, show.legend = T) + labs(y = "Number of prediction differences (absolute)")
print(final.plot)
I could not find a way to make a nice legend indicating which symbols correspond to which K… :( So:
- TRIANGLE is K1
- X is K3
- CIRCLE is K5