library(tidyverse)
library(readr)

Project STIN300 2020 – Classification of influenza virus

Part 1.

# Reading the fasta file from internet

dna.data <- readLines("https://arken.nmbu.no/~jonvi/stin300/data/Neuraminidase.fasta")

# Now we want to locate all the lines with a header, which means a different sequence.

seq.idx <- which(str_starts(string = dna.data, pattern = ">"))

# And put them in a tibble; the sequences will be stored in the table as follows...

dna.table <- tibble(Header = dna.data[seq.idx], Sequence = "")

# Then we make a cycle that will take everything (all lines) in between headers.

N = nrow(dna.table)
seq.idx <- c(seq.idx, length(dna.data) + 1) # the index of the sequences should be 'updated' with the 'location' of the last element in the 'source' (the .txt file) plus one, cause in the last round of the cycle, we need to define an 'l2'

for(row in 1:(N)){
  # first we define line 1 as the starting point (of the sequence) after which lines will stored as sequence and l2 as the last sequence line.
  l1 <- seq.idx[row] + 1
  l2 <- seq.idx[row + 1] - 1
  temp.seq <- dna.data[l1:l2]
  dna.table$Sequence[row] <- str_c(temp.seq, collapse = "")
}

## Now we want to detect and extract the HxNy patterns

# First we create a vector that shows all the 'Headers' that have the HxNy serotype  (it returns TRUE when there is serotype in the form we are looking for..)
vector.sero <- str_detect(string = dna.table$Header, pattern = "H[0-9]{1,2}N[0-9]{1,2}")

# Now, we want to index them...

idx.sero <- which(vector.sero)

# Now we modify the table and get rid of the 'Headers' without a serotype..

dna.table %>% 
  slice(idx.sero) ->new.dna.table

# Now we want to create the column 'Serotype' in the 'new dna table' and put there the serotype

new.dna.table %>% 
  mutate(Serotype = "", H.type = "", N.type = "") -> new.dna.table
n <- nrow(new.dna.table)

for(i in 1:n){
  serotype <- str_extract(string = new.dna.table$Header[i], pattern = "H[0-9]{1,2}N[0-9]{1,2}")
  new.dna.table$Serotype[i] <-serotype
  H <- str_extract(string = serotype, pattern = "H[0-9]{1,2}")
  new.dna.table$H.type[i] <- H
  N <- str_extract(string = serotype, pattern = "N[0-9]{1,2}")
  new.dna.table$N.type[i] <- N
}

# Now, we are making the plot of of how frequent each N-type 

new.dna.table %>% 
  group_by(N.type) %>% 
  summarise(N.type.frq = n()) %>% 
  arrange(N.type.frq) -> Nplot
 
g <- ggplot(data = new.dna.table) + geom_histogram(mapping = aes(x = N.type, fill = N.type), stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
print(g)

# From this histogram we can observe that there are 9 N-types. 

Part 2.

# From Day 6, this is what I coded. This basically will give me a vector containing all the L-mers based on a given L.

dna <- new.dna.table$Sequence[47]
L <- 2
N <- str_length(dna)

lmers.dna <- str_sub(dna, start = 1:(N-L+1), end = L:N)

# The following function returns all possible permutations -L-mers- given a certain L.

elmers <- function(L, iter = 1, alphabet = c("A","C","G","T")){
  require(stringr)
  if(iter < L){
    w <- elmers(L, iter + 1, alphabet)
    w <- str_c(rep(w, each = length(alphabet)),
               rep(alphabet, times = length(alphabet)))
  } else {
    w <- alphabet
  }
  return(sort(w))
}

l.mers <- elmers(L = 2)

## Now we shall make a function that first:

# 1. Creates a tibble with a number of rows equal to the possible permutations, based on the L given as argument. IT USES THE OUTPUT OF THE elmers() function (so this function is called first)

# 2. Now we split up the DNA.sequence into L-mers (based on the specified L)

# 3. Find and count how many time we have each L-mer in the sequence; if it is not found will retur a zero.

Now bundling everything together:

  # Then we do the final step which would be generating a table with all possible L-mers and the counts in the sequence we want to apply the function.
tbl <- tibble(L_mer = as.character(1:length(l.mers)), Count = 1:length(l.mers))
  
for(i in 1:length(l.mers)){
    i.elmer <- length(which(lmers.dna %in% l.mers[i]))
    tbl$L_mer[i] <- l.mers[i]
    tbl$Count[i] <- i.elmer
}  

# So... Finally the function!!:

dna_counts <- function(DNAsequence, L){
  N <- str_length(DNAsequence)
  lmers.dna <- str_sub(DNAsequence, start = 1:(N-L+1), end = L:N)
  
  l.mers <- elmers(L = 2)  # Create the permutations list, calling on the 'elmers' function
  
  tbl <- tibble(L_mer = as.character(1:length(l.mers)), Count = 1:length(l.mers)) # create a      # tibble to store the data.
  for(i in 1:length(l.mers)){
    i.elmer <- length(which(lmers.dna %in% l.mers[i]))
    tbl$L_mer[i] <- l.mers[i]
    tbl$Count[i] <- i.elmer
  }
  cols <- tbl$L_mer # Gets a vector with the names of the columns from the table generated in the function (the 16 L-mers).
  
  tbl <- t(as.matrix(tbl[,2])) # transforms the generated table (in the function) and returs a one-row-matrix
  colnames(tbl) <- cols # Asigns a name to the matrix' columns (the 16 L-mers)
  rownames(tbl) <- NULL  # Resets the row names (to erase that anoying "Count"...)
  
  all <- sum(tbl[1,])
  for(i in 1:length(tbl)){
    new.value <- (tbl[1,i]) / all
    tbl[1,i] <- new.value 
  }
  
  return(tbl)
}


test <- dna_counts(new.dna.table$Sequence[475], L = 2)


# 
# # Experimenting with matices, to get the last part of "Part 2"...
# 
# mat <- matrix(nrow = 1, ncol = 16) # just an example of creating a matrix...
# 
# mat2 <- rbind(tbl, mat) # binding together two matrices, by row.

Now… we need to use the dna_counts() function to create a big matrix applying the function to each line or sequence in the table created in the first part.

# First we create a matrix with the proper size...
mat <- matrix(nrow = nrow(new.dna.table), ncol = 16)

l.mers <- elmers(L = 2) # Just copying the names of the L-mers to..
colnames(mat) <- l.mers # Name the columns of the matrix

for(i in 1:nrow(new.dna.table)){
  temp <- dna_counts(new.dna.table$Sequence[i], L = 2)
  mat[i,] <- temp
}

as_tibble(mat)
## # A tibble: 5,986 x 16
##        AA     AC     AG     AT     CA     CC      CG     CT     GA     GC     GG
##     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 0.101  0.0517 0.0822 0.0885 0.0864 0.0418 0.0127  0.0390 0.0800 0.0347 0.0772
##  2 0.0857 0.0538 0.0772 0.0829 0.0765 0.0397 0.0212  0.0439 0.0822 0.0390 0.0885
##  3 0.102  0.0581 0.0758 0.0914 0.0814 0.0368 0.0212  0.0404 0.0822 0.0326 0.0722
##  4 0.0873 0.0525 0.0703 0.0823 0.0802 0.0390 0.0170  0.0525 0.0731 0.0426 0.0802
##  5 0.101  0.0737 0.0793 0.0843 0.0864 0.0482 0.0205  0.0545 0.0779 0.0439 0.0659
##  6 0.0967 0.0477 0.0768 0.0875 0.0761 0.0391 0.0213  0.0363 0.0832 0.0349 0.0804
##  7 0.143  0.0646 0.0703 0.0972 0.0852 0.0454 0.00923 0.0433 0.0696 0.0334 0.0525
##  8 0.104  0.0681 0.0653 0.0880 0.0830 0.0497 0.0220  0.0426 0.0781 0.0326 0.0667
##  9 0.104  0.0681 0.0653 0.0873 0.0830 0.0497 0.0220  0.0426 0.0774 0.0326 0.0667
## 10 0.0951 0.0518 0.0724 0.0887 0.0866 0.0390 0.0156  0.0468 0.0696 0.0454 0.0653
## # … with 5,976 more rows, and 5 more variables: GT <dbl>, TA <dbl>, TC <dbl>,
## #   TG <dbl>, TT <dbl>
tbl.part2 <- cbind(new.dna.table, mat)   # We bind together the tables. Seems like cbind is a better function, bind.cols was giving an error (saying that argument 2 had no names..)

# tbl.part2 is the outcome of part 2.

The last part is to extract the subset containing only the cases for N-types N1, N3 and N6

tbl.part2 %>% 
  filter(N.type == "N1") -> t1
tbl.part2 %>% 
  filter(N.type == "N3") -> t2
tbl.part2 %>% 
  filter(N.type == "N6") -> t3
  
tbl.3Ncases <- rbind(t1, t2, t3)

# Here we have a table with only the N1, N2 nd N3 cases.

# Now we create de X.train and the 'grouping'

tbl.3Ncases %>% 
  select(-(1:5)) %>% 
  as.matrix() -> X.train   # to train the model

Y.train <- factor(tbl.3Ncases$N.type)

# Now we shall create the model:

fit.lda <- MASS::lda(x = X.train, grouping = Y.train)
## Warning in lda.default(x, grouping, ...): variables are collinear
print(fit.lda)
## Call:
## lda(X.train, grouping = Y.train)
## 
## Prior probabilities of groups:
##        N1        N3        N6 
## 0.3788136 0.3177966 0.3033898 
## 
## Group means:
##            AA         AC         AG         AT         CA         CC         CG
## N1 0.09917801 0.05036713 0.06939983 0.08990678 0.07900486 0.03873179 0.01609232
## N3 0.10136852 0.06989958 0.07043283 0.08364254 0.08874924 0.04637195 0.01731957
## N6 0.11237964 0.06011365 0.07798339 0.09074060 0.09601064 0.04513204 0.01896308
##            CT         GA         GC         GG         GT         TA         TC
## N1 0.04882048 0.07204038 0.04045343 0.07266370 0.05761099 0.05813660 0.05311164
## N3 0.04804692 0.07602062 0.03680073 0.06592284 0.05361676 0.05918122 0.04741459
## N6 0.04283306 0.07628629 0.04427824 0.07192100 0.04531908 0.05584114 0.05341887
##            TG         TT
## N1 0.08508763 0.06939443
## N3 0.07871105 0.05650104
## N6 0.06963174 0.03914753
## 
## Coefficients of linear discriminants:
##            LD1         LD2
## AA   112.07237   119.94829
## AC   206.76032   309.31401
## AG  1772.76536  1247.37233
## AT   101.83185   506.33954
## CA  -469.56329  -972.04356
## CC   -39.27346   -94.99039
## CG  1068.31366   367.85576
## CT  -390.63877    84.79706
## GA -1692.79203 -1122.58583
## GC -1267.61206  -654.53266
## GG   160.75057   101.65963
## GT -1553.17574  -682.19271
## TA  -418.75576  -852.08531
## TC    74.31625   152.31308
## TG  1219.61446   349.32595
## TT  -291.93432    35.59190
## 
## Proportion of trace:
##    LD1    LD2 
## 0.7021 0.2979
# And then we need to make a matricial multiplication for making the plot.

newcoord <- X.train %*% fit.lda$scaling

tibble(N.type = tbl.3Ncases$N.type) %>% 
  cbind(as_tibble(newcoord)) -> g
summary(fit.lda)
##         Length Class  Mode     
## prior    3     -none- numeric  
## counts   3     -none- numeric  
## means   48     -none- numeric  
## scaling 32     -none- numeric  
## lev      3     -none- character
## svd      2     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call
lda.plot <- ggplot(g) + geom_point(aes(x = LD1, y = LD2, color = N.type))

print(lda.plot)

Part 3.

# # We just take the first row  (L-mers columns) as a Test data (for the distances function)
# tbl.part2 %>% 
#   slice(1) %>% 
#   select(-(1:5)) %>%  
#   as.matrix() -> X.test

PART 3, STARTS HERE ACTUALLY, creating the test and train entities.

# Here below we can take a random sample of 'k' observations as a Test data.
N <- nrow(tbl.part2)
set.seed(20200123)
test.cases.idx <- sample(1:N, size = 20)
X.test <- tbl.part2[test.cases.idx, ]
X.test %>% 
  select(-(1:5)) %>%  
  as.matrix() -> X.test  # This X.test is a matrix of 20 randomly selected rows from tbl.part2 (we start with few, to test the further functions, namely KNN)


set.seed(20200124)
train.cases.idx <- sample(1:nrow(tbl.part2), size = 80) # Following Lars' advice, we select a slightly bigger set of data as X.train
X.train <- tbl.part2[train.cases.idx, ]
X.train %>%
  select(-(1:5)) %>% 
  as.matrix() -> X.train

# THIS FOLLOWING COMMENTS INCLUDE MY PRACTICING TO WRITE THE DISTANCES CALCULATION FUNCTION, HENCE LEFT THEM AS COMMENTS.
# Let's calculate the distance 'd' as I have understood...

## Following the 'hints' of the assignment:

# replace X.test[1,] with p (a vector, a row of counts)
# replace X.train with q (a matrix, several rows of counts)
# Use division instead of subtraction
# There is no power involved, but instead a log2()

# So we change the name of the variables just to avoid confusion...

# p <- X.test[1,]
# Q <- X.train
# 
# d <- rowSums(abs(log2(t(p / t(Q)))))
# length(d)

# So far, it seems like it does return a vector of distances, length equal to the number of rows in 'q'  (2000 rows).

# So, we can make a function:

HERE WE CREATE THE DISTANCES FUNCTION

distances <- function(p.local, Q.local){  # X_test_matrix should be a matrix of one row and 'n' columns. X_train_matrix can be a matrix of any number of rows and 'n' columns as well.
  d.local <- rowSums(abs(log2(t(p.local / t(Q.local)))))
  return(d.local)
}



# Here below, we create the Y.train

set.seed(20200124) # Same seed so the random selection will 'correspond'.
Y.train.cases.idx <- sample(1:nrow(tbl.part2), size = 80) # Following Lars' advice, we select a slightly bigger set of data as X.train.  # We could have actually used the same 'train.cases.idx', but anyways... is the same
Y.train <- tbl.part2[Y.train.cases.idx, ]
Y.train %>%
  select(5) %>% 
  as.matrix() -> Y.train

source("MostFreq_Finder_V2.R") # FUNCTION Coded by myself but in the end did not use it because it #does not work for all K cases.

Now, we define here the KNN function.

my.KNN <- function(y.train.local, X.train.local, X.test.local, K.local){
  # First we need to create the matrix of responses (Y.test)
  N.local <- nrow(X.test.local)
  y.test.local <- matrix(1:N.local, ncol = 1)

  for(i in 1:N.local){
    d <- distances(p.local = X.test.local[i,], Q.local = X.train.local)
    idx <- order(d)  # an indexing vector with the nearest neighbors
    # Then I need to NOT compute an average, but to see the most frequent value (N-type) 
    neighbors <- table(y.train.local[idx[1:K.local]])
    y.test.local[i] <- neighbors %>% which.max %>% names %>% sample(size = 1)  # REMARK: it seems #like in case of two "max" elements, the which.max will always return the "first max value", which #in this case would be the most frequent one in the table() function output.  
    # So I coded the function mostFreq(). However, for unknown reasons to me, the function does #not work for any 'k' value I enter... so I left it as which.max %>% names %>% sample(size = 1)
  }
  return(y.test.local)
}

We test it and check how it performed with the entities created above.

KNN.test <- my.KNN(y.train.local = Y.train, X.train.local = X.train, X.test.local = X.test, K.local = 1)
print(KNN.test)
##       [,1]
##  [1,] "N1"
##  [2,] "N2"
##  [3,] "N9"
##  [4,] "N6"
##  [5,] "N2"
##  [6,] "N3"
##  [7,] "N7"
##  [8,] "N4"
##  [9,] "N4"
## [10,] "N1"
## [11,] "N2"
## [12,] "N1"
## [13,] "N1"
## [14,] "N2"
## [15,] "N7"
## [16,] "N8"
## [17,] "N3"
## [18,] "N1"
## [19,] "N3"
## [20,] "N4"
tbl.part2 %>%
  slice(test.cases.idx) %>% 
  select(5) -> real

# To check how the model performed with the test
print(tibble(Real = real, Predicted = KNN.test))
## # A tibble: 20 x 2
##    Real$N.type Predicted[,1]
##    <chr>       <chr>        
##  1 N1          N1           
##  2 N2          N2           
##  3 N9          N9           
##  4 N6          N6           
##  5 N2          N2           
##  6 N3          N3           
##  7 N7          N7           
##  8 N4          N4           
##  9 N5          N4           
## 10 N1          N1           
## 11 N2          N2           
## 12 N8          N1           
## 13 N1          N1           
## 14 N2          N2           
## 15 N7          N7           
## 16 N8          N8           
## 17 N3          N3           
## 18 N1          N1           
## 19 N3          N3           
## 20 N4          N4

Looks good! :)

Now we want to classify accuracy with a 100-fold cross-validation and visualize the results.

First the DATA SPLITTING:

# First we take our data set 'tbl.part2'

C <- 100

set.seed(20200125)
tbl.part2 %>% 
  # Shuffle rows in random order:
  slice(sample(1:n())) %>% 
  arrange(N.type) %>%
  mutate(Segment = rep(1:C, length.out = n())) -> tbl.part2 

# Double checking if we have a fair distribution of all N.types:
tbl.part2 %>%
  group_by(Segment, N.type) %>% 
  summarise(ntypes = n()) %>% 
  spread(N.type, ntypes) %>% view  # Looks good....

# Neighborhood sizes that will evaluate
K.values <- c(1, 3, 5)

# initiliaze as character so it can receive "N1", "N2",..., later 
Y.test <- matrix("", nrow = nrow(tbl.part2), ncol = length(K.values))

colnames(Y.test) <- str_c("K.", K.values)

# Put the response and predictors in separate tibbles:

tbl.part2 %>% 
  select(N.type) -> tbl.y

tbl.part2 %>% 
  select(6:21) -> tbl.X

Finally, the cross-validation loop…

# Code taken from lectures and then modified.

for(i in 1:C){
    # Which L-mers (i.e. their index numbers) are in this segment
    tst.rows <- which(tbl.part2$Segment == i)  
    #  Use these L-mers as our test set in this iteration of the "i" loop
    X.test <- as.matrix(tbl.X[tst.rows,])   
    # Use the other L-mers as training.
    X.train <- as.matrix(tbl.X[-tst.rows,])   
    
    y.train <- as_vector(tbl.y[-tst.rows,])   
    
    # Loop for candidate models, in this case "neighborhood size"
    
    for(j in 1:ncol(Y.test)){        
      # compute predictions for each set of L-mers in the test set
      # for this neighborhood size, K.values[j] and sotre them for later 
      prediction <- my.KNN(y.train.local = y.train,
                           X.train.local = X.train,
                           X.test.local = X.test,
                           K.local = K.values[j])
      
      # Cannot put a factor directly into my character matrix!, must coerce with character() first
      Y.test[tst.rows, j] <- as.character(prediction)
      
    }
}

#print(Y.test)

Now, wrapping up the results of the data:

Bind the results, first converted to tibble to the original table

tbl.part2 %>% 
  bind_cols(as_tibble(Y.test)) -> tbl.part2

Now we shall calculate the classification error rate (CER). The CER is just 1 minus the accuracy… So, calculating accuracy for each KNN model:

# For K = 1

accuracy.k1 <- sum(tbl.part2$N.type == tbl.part2$K.1) / nrow(tbl.part2)
cer.k1 <- 1-accuracy.k1

# For K = 3

accuracy.k3 <- sum(tbl.part2$N.type == tbl.part2$K.3) / nrow(tbl.part2)
cer.k3 <- 1-accuracy.k3

# For K = 5

accuracy.k5 <- sum(tbl.part2$N.type == tbl.part2$K.5) / nrow(tbl.part2)
cer.k5 <- 1-accuracy.k5

min(cer.k1, cer.k3, cer.k5)
## [1] 0.001503508
# Just summarizing the accuracies:

accuracies <- c(accuracy.k1, accuracy.k3, accuracy.k5)
cers <- c(cer.k1, cer.k3, cer.k5)

Accu.table <- matrix(0, nrow = 3, ncol = 2)

rownames(Accu.table) <- c("K1", "K3", "K5")
colnames(Accu.table) <- c("Accuracy", "CER")

Accu.table[,1] <- accuracies
Accu.table[,2] <- cers

Accu.table <- as.data.frame(Accu.table)
Accu.table %>% 
  mutate(KNN.pred = c("K1", "K3", "K5")) -> Accu.table

Accu.table %>% 
  mutate(Accuracy = Accuracy*100) -> Accu.table

print(Accu.table)
##   Accuracy         CER KNN.pred
## 1 99.79953 0.002004678       K1
## 2 99.84965 0.001503508       K3
## 3 99.74942 0.002505847       K5
iiddxx <- which.max(Accu.table$Accuracy)

cat("The most accurate model is when the number of neighbors(K) is: ", Accu.table$KNN.pred[iiddxx])
## The most accurate model is when the number of neighbors(K) is:  K3
## That would be the 'global' accuracies for each model, however, we would like to check the accuracy for the prediction of each N.type.

# First we create simple tables summarizing all the N.types predicted by each model and the real ones:
matrix.final <- matrix(ncol = 7, nrow = 9)
rows.names <- c("N1", "N2", "N3", "N4", "N5", "N6", "N7", "N8", "N9")
rownames(matrix.final) <- rows.names

colums.names <- c("Freq.pred.K1", "Delta.K1", "Freq.pred.K3", "Delta.K3", "Freq.pred.K5", "Delta.K5", "Freq.N.type.Real")
colnames(matrix.final) <- colums.names

table.k1 <- table(tbl.part2$K.1)
table.k1 %>%
  as.data.frame() -> table.k1

table.k3 <- table(tbl.part2$K.3)
table.k3 %>%
  as.data.frame() -> table.k3

table.k5 <- table(tbl.part2$K.5)
table.k5 %>%
  as.data.frame() -> table.k5

table.real <- table(tbl.part2$N.type)
table.real %>%
  as.data.frame() -> table.real

matrix.final[,1] <- table.k1$Freq
matrix.final[,3] <- table.k3$Freq
matrix.final[,5] <- table.k5$Freq
matrix.final[,7] <- table.real$Freq

for(i in 1:nrow(matrix.final)){
  delta.k1 <- abs(matrix.final[i,7] - matrix.final[i,1])
  matrix.final[i,2] <- delta.k1
  
  delta.k3 <- abs(matrix.final[i,7] - matrix.final[i,3])
  matrix.final[i,4] <- delta.k3
  
  delta.k5 <- abs(matrix.final[i,7] - matrix.final[i,5])
  matrix.final[i,6] <- delta.k5
}

matrix.final %>% 
  as.data.frame() -> matrix.final
matrix.final %>% 
  mutate(N.types = rows.names) -> matrix.final
  
# Now let's try to plot the results

final.plot <- ggplot(data = matrix.final) + geom_point(aes(x = N.types, y = Delta.K1), shape = 2, size = 4, show.legend = T) + geom_point(aes(x = N.types, y = Delta.K3), shape = 4, size = 4, show.legend = T) + geom_point(aes(x = N.types, y = Delta.K5), shape = 1, size = 3, show.legend = T) + labs(y = "Number of prediction differences (absolute)")
print(final.plot)


I could not find a way to make a nice legend indicating which symbols correspond to which K… :( So:
- TRIANGLE is K1
- X is K3
- CIRCLE is K5