First, we load the required packages and read the data. Below is a sample of the data:

# Packages this analysis depends on.
list.of.packages <- c("tidyverse", "readxl", "Rtsne", "M3C",
                      "arules", "arulesViz", "ggdendro")

# Install whichever of them are not yet present in the library paths.
# NOTE(review): M3C appears to be distributed through Bioconductor rather than
# CRAN, so install.packages() may be unable to install it -- confirm.
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages) > 0) install.packages(new.packages)

# Attach every package. require() merely returns FALSE (with a warning) when a
# package cannot be loaded, so the result is checked explicitly and the script
# stops here instead of failing later with an unrelated "could not find
# function" error.
loaded <- lapply(list.of.packages, require, character.only = TRUE)
failed <- list.of.packages[!unlist(loaded)]
if (length(failed) > 0) {
  stop("Could not load package(s): ", paste(failed, collapse = ", "),
       call. = FALSE)
}
# Print the per-package load status (one TRUE per package).
loaded
## Loading required package: tidyverse
## -- Attaching packages ------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: readxl
## Loading required package: Rtsne
## Loading required package: M3C
## Loading required package: arules
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: arulesViz
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
## Loading required package: ggdendro
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
## 
## [[5]]
## [1] TRUE
## 
## [[6]]
## [1] TRUE
## 
## [[7]]
## [1] TRUE
# Alternative kept for reference: read the sheet "CpG_Compare_CoV" from the
# Excel workbook and drop the listed columns.
# X <- read_excel("Betacoronavirus_CpG.xlsx",
#                 sheet = "CpG_Compare_CoV") %>%
#   select(-c("A", "C", "G", "T", "SeqLen",
#             "...11", "Sum", "...45", "GC%",
#             "SeqName...50"))

# Read the data from the CSV file in the working directory.
X <- read_csv(file = "Betacoronavirus_CpG.csv")
## Parsed with column specification:
## cols(
##   SeqName = col_character(),
##   AA = col_double(),
##   AC = col_double(),
##   AG = col_double(),
##   AT = col_double(),
##   CA = col_double(),
##   CC = col_double(),
##   CG = col_double(),
##   CT = col_double(),
##   GA = col_double(),
##   GC = col_double(),
##   GG = col_double(),
##   GT = col_double(),
##   TA = col_double(),
##   TC = col_double(),
##   TG = col_double(),
##   TT = col_double()
## )
# Preview the first six rows of the data.
head(X, n = 6L)

Variables and their types are

# Report the class of every column, named by column.
X %>% sapply(class)
##     SeqName          AA          AC          AG          AT          CA 
## "character"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##          CC          CG          CT          GA          GC          GG 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##          GT          TA          TC          TG          TT 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"

Dimensions of the data are

# Number of rows, then number of columns.
c(nrow(X), ncol(X))
## [1] 927  17

Some viruses that we have:

# Draw ten sequence names at random (without replacement); the RNG is not
# seeded here, so the selection differs between runs.
sample(X[["SeqName"]], size = 10)
##  [1] "KT368854_MERS-related_coronavirus"                  
##  [2] "KJ473815_BtRs-BetaCoV/GX2013"                       
##  [3] "KP198611_Human_coronavirus_OC43"                    
##  [4] "AY545919_SARS-related_coronavirus_Melogale_moschata"
##  [5] "KU851860_MERS-related_coronavirus"                  
##  [6] "MK679660_Hedgehog_coronavirus_1"                    
##  [7] "MK062182_SARS_coronavirus_Urbani"                   
##  [8] "MH259486_MERS-related_coronavirus"                  
##  [9] "DQ084199_Bat_SARS_coronavirus_HKU3-2"               
## [10] "MH013216_MERS-related_coronavirus"

t-SNE

We will need some labels. Our labels are just substrings of the virus name from character 10 to character 13. Below is the frequency table of all labels:

# Label each sequence by characters 10 through 13 of its name.
# NOTE(review): the fixed positions presumably assume a 9-character accession
# prefix; names with a different prefix length yield shifted labels (e.g.
# "_Bat" next to "Bat_" in the table) -- confirm whether that is acceptable.
lab <- substr(X$SeqName, start = 10, stop = 13)

# Frequency of every label.
table(lab)
## lab
## _Bat _Bet _Bov _Hum _MER _Mur _Rab _Rat _Rou _SAR _Tyl _Wuh at/Y Bat_ Beta Bovi 
##    1    3    1    2    1    2    1    1    1    1    1    1    1   51   15   20 
## BtPa BtRf BtRs BtTp BtVs Calf Cani Cive Coro Drom Equi Erin Gira Hedg Huma MERS 
##    1    3    3    1    1    1    2    3    6    6    4    2    2    1  186  405 
## Muri Pipi Porc Rabb Rhin Rous Sabl Samb SARS Tylo Wate Whit Wuha Yak_ 
##   24    1   10    3    1    1    1    2  144    1    3    1    4    1

Some labels are rare. We will rename all labels that occur 10 times or fewer to “others”. Below are the rare labels, followed by the new frequency table of labels:

# Collect the labels whose frequency is at most 10 (the threshold is
# inclusive: a label occurring exactly 10 times counts as rare).
rare_lab <- local({
  counts <- table(lab)
  names(counts)[as.vector(counts <= 10)]
})
rare_lab
##  [1] "_Bat" "_Bet" "_Bov" "_Hum" "_MER" "_Mur" "_Rab" "_Rat" "_Rou" "_SAR"
## [11] "_Tyl" "_Wuh" "at/Y" "BtPa" "BtRf" "BtRs" "BtTp" "BtVs" "Calf" "Cani"
## [21] "Cive" "Coro" "Drom" "Equi" "Erin" "Gira" "Hedg" "Pipi" "Porc" "Rabb"
## [31] "Rhin" "Rous" "Sabl" "Samb" "Tylo" "Wate" "Whit" "Wuha" "Yak_"
# Collapse every rare label into a single catch-all category, then
# re-tabulate the labels.
lab <- replace(lab, lab %in% rare_lab, "others")
table(lab)
## lab
##   Bat_   Beta   Bovi   Huma   MERS   Muri others   SARS 
##     51     15     20    186    405     24     82    144

t-SNE is a nonlinear method of dimensionality reduction.

# Uniform noise in the range -0.05 to 0.05, one value per cell of the
# transposed data (rows = all columns of X except the first, columns = rows
# of X).
# NOTE(review): presumably the jitter is there to break exact ties between
# identical sequences before running t-SNE -- confirm.
eps <- 0.1 * runif(nrow(X) * (ncol(X) - 1)) - 0.05
dim(eps) <- c(ncol(X) - 1, nrow(X))

# Run t-SNE on the jittered, transposed data (first column of X dropped),
# using `lab` as the labels.
tsne(t(X[, -1]) + eps, labels = lab)
## ***t-SNE wrapper function***
## running...
## done.