First, we load the required packages and read the data. A sample of the first rows is shown below.
# Packages this analysis depends on.
# NOTE(review): M3C appears to be distributed through Bioconductor rather than
# CRAN, so install.packages() may not be able to fetch it -- confirm.
list.of.packages <- c("tidyverse", "readxl", "Rtsne", "M3C",
                      "arules", "arulesViz", "ggdendro")
# Install whatever is not yet present in the local library.
new.packages <- list.of.packages[
  !(list.of.packages %in% installed.packages()[, "Package"])
]
if (length(new.packages) > 0) install.packages(new.packages)
# require() only returns FALSE when a package cannot be attached, so keep the
# per-package results and stop right here instead of failing further down
# with an unrelated "could not find function" error.
loaded <- lapply(list.of.packages, require, character.only = TRUE)
failed <- list.of.packages[!unlist(loaded)]
if (length(failed) > 0) {
  stop("Could not load package(s): ", paste(failed, collapse = ", "),
       call. = FALSE)
}
# Show the load status of each package (one TRUE per entry on success).
loaded
## Loading required package: tidyverse
## -- Attaching packages ------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 3.0.0 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: readxl
## Loading required package: Rtsne
## Loading required package: M3C
## Loading required package: arules
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: arulesViz
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
## Loading required package: ggdendro
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] TRUE
##
## [[6]]
## [1] TRUE
##
## [[7]]
## [1] TRUE
# Earlier import straight from the Excel workbook, kept for reference. It
# removed the listed helper columns after reading the "CpG_Compare_CoV" sheet.
# X <- read_excel("Betacoronavirus_CpG.xlsx",
# sheet = "CpG_Compare_CoV") %>%
# select(-c("A", "C", "G", "T", "SeqLen",
# "...11", "Sum", "...45", "GC%",
# "SeqName...50"))
# Current import: read the CSV file; readr guesses the column types and
# reports the specification it used.
X <- read_csv(file = "Betacoronavirus_CpG.csv")
## Parsed with column specification:
## cols(
## SeqName = col_character(),
## AA = col_double(),
## AC = col_double(),
## AG = col_double(),
## AT = col_double(),
## CA = col_double(),
## CC = col_double(),
## CG = col_double(),
## CT = col_double(),
## GA = col_double(),
## GC = col_double(),
## GG = col_double(),
## GT = col_double(),
## TA = col_double(),
## TC = col_double(),
## TG = col_double(),
## TT = col_double()
## )
# Preview the first six rows of the data.
head(X, n = 6L)
The variables and their types are:
# Class of every column as a named character vector. vapply() pins the result
# type to one string per column, so an unexpected multi-class column raises an
# error instead of silently turning the result into a list.
vapply(X, class, character(1))
## SeqName AA AC AG AT CA
## "character" "numeric" "numeric" "numeric" "numeric" "numeric"
## CC CG CT GA GC GG
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## GT TA TC TG TT
## "numeric" "numeric" "numeric" "numeric" "numeric"
The dimensions of the data (rows, columns) are:
# Number of rows followed by number of columns.
c(nrow(X), ncol(X))
## [1] 927 17
Some viruses that we have:
# Draw ten sequence names at random (without replacement) as a quick look at
# the data; no seed is set, so the selection differs between runs.
sample(X[["SeqName"]], size = 10)
## [1] "KT368854_MERS-related_coronavirus"
## [2] "KJ473815_BtRs-BetaCoV/GX2013"
## [3] "KP198611_Human_coronavirus_OC43"
## [4] "AY545919_SARS-related_coronavirus_Melogale_moschata"
## [5] "KU851860_MERS-related_coronavirus"
## [6] "MK679660_Hedgehog_coronavirus_1"
## [7] "MK062182_SARS_coronavirus_Urbani"
## [8] "MH259486_MERS-related_coronavirus"
## [9] "DQ084199_Bat_SARS_coronavirus_HKU3-2"
## [10] "MH013216_MERS-related_coronavirus"
We will need some labels. Our labels are just substrings of the virus name, from character 10 to character 13. Below is the frequency table of all labels:
# Label each sequence with characters 10 to 13 of its name. The offset is
# fixed, so names whose leading accession is not eight characters long produce
# shifted labels such as "_Bat" or "at/Y" in the table below.
lab <- substr(X$SeqName, start = 10, stop = 13)
# Frequency of every label.
table(lab)
## lab
## _Bat _Bet _Bov _Hum _MER _Mur _Rab _Rat _Rou _SAR _Tyl _Wuh at/Y Bat_ Beta Bovi
## 1 3 1 2 1 2 1 1 1 1 1 1 1 51 15 20
## BtPa BtRf BtRs BtTp BtVs Calf Cani Cive Coro Drom Equi Erin Gira Hedg Huma MERS
## 1 3 3 1 1 1 2 3 6 6 4 2 2 1 186 405
## Muri Pipi Porc Rabb Rhin Rous Sabl Samb SARS Tylo Wate Whit Wuha Yak_
## 24 1 10 3 1 1 1 2 144 1 3 1 4 1
Some labels are rare. We will rename all labels that occur 10 times or fewer to “others”. Below are the rare labels, followed by the new frequency table of labels:
# Labels that occur at most 10 times (the threshold is inclusive).
lab_counts <- table(lab)
rare_lab <- names(lab_counts)[as.vector(lab_counts <= 10)]
rare_lab
## [1] "_Bat" "_Bet" "_Bov" "_Hum" "_MER" "_Mur" "_Rab" "_Rat" "_Rou" "_SAR"
## [11] "_Tyl" "_Wuh" "at/Y" "BtPa" "BtRf" "BtRs" "BtTp" "BtVs" "Calf" "Cani"
## [21] "Cive" "Coro" "Drom" "Equi" "Erin" "Gira" "Hedg" "Pipi" "Porc" "Rabb"
## [31] "Rhin" "Rous" "Sabl" "Samb" "Tylo" "Wate" "Whit" "Wuha" "Yak_"
# Collapse every rare label into the single category "others", then show the
# updated label frequencies.
lab <- replace(lab, lab %in% rare_lab, "others")
table(lab)
## lab
## Bat_ Beta Bovi Huma MERS Muri others SARS
## 51 15 20 186 405 24 82 144
t-SNE is a nonlinear method of dimensionality reduction. Before running it, we add a small amount of uniform noise (between -0.05 and 0.05) to the data.
# Uniform noise in [-0.05, 0.05), one value per numeric cell, laid out with
# one row per numeric column of X so it matches the transposed data.
# NOTE(review): presumably the jitter breaks exact duplicate rows, which the
# underlying t-SNE implementation may reject -- confirm.
n_features <- ncol(X) - 1
n_samples <- nrow(X)
eps <- matrix(0.1 * runif(n_samples * n_features) - 0.05,
              nrow = n_features, ncol = n_samples)
# Drop the name column, transpose so that each sequence becomes a column, add
# the noise, and plot the embedding with points labelled by `lab`.
tsne(t(X[, -1]) + eps, labels = lab)
## ***t-SNE wrapper function***
## running...
## done.