Создаем вектор:
v1 <- c(1, 2, 3, 1, 2, 9)
v1
## [1] 1 2 3 1 2 9
v2 <- 7:12
v2
## [1] 7 8 9 10 11 12
v3 <- c(v1, v2)
v3
## [1] 1 2 3 1 2 9 7 8 9 10 11 12
v6 <- rep(x = c(1, 2), each = 4)
v6
## [1] 1 1 1 1 2 2 2 2
v7 <- rep(x = c(1, 2))
v7
## [1] 1 2
v8 <- rep(x = c(1, 2), times = 3)
v8
## [1] 1 2 1 2 1 2
v9 <- sample(c(1, 3, 5, 6), 10, replace = T)
v9
## [1] 5 3 1 1 1 6 6 6 6 5
v11 <- sample(10)
v11
## [1] 4 1 7 10 3 5 9 8 6 2
v10 <- seq(from = 1, to = 10, by = 2)
v10
## [1] 1 3 5 7 9
Сумма векторов
v4 <- v1 + v2
v4
## [1] 8 10 12 11 13 21
Произведение векторов
v5 <- v1 * v2
v5
## [1] 7 16 27 10 22 108
Скалярное произведение двух векторов (одинаковой длины!)
v1 %*% v2
## [,1]
## [1,] 190
Вхождение v1 в v2
v4 <- v1 %in% v2
v4
## [1] FALSE FALSE FALSE FALSE FALSE TRUE
Получение элементов вектора по индексу
v1[2]
## [1] 2
v1[3:5]
## [1] 3 1 2
v1[-1]
## [1] 2 3 1 2 9
Сумма всех координат вектора v1
sum(v1)
## [1] 18
Среднее значение координат вектора v1
mean(v1)
## [1] 3
Медиана координат вектора v1
median(v1)
## [1] 2
Максимальное значение координат вектора v1
max(v1)
## [1] 9
Минимальное значение координат вектора v1
min(v1)
## [1] 1
Количество позиций в векторе
length(v1)
## [1] 6
Округление числа
round(2.45865, digits = 3)
## [1] 2.459
Создание матрицы
m1 <- matrix(1:8, ncol = 2)
m1
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
m2 <- matrix(v1, nrow = 2)
m2
## [,1] [,2] [,3]
## [1,] 1 3 2
## [2,] 2 1 9
Произведение матриц. Количество строк одной матрицы должно быть равно количеству столбцов другой.
m3 <- m1 %*% m2
m3
## [,1] [,2] [,3]
## [1,] 11 8 47
## [2,] 14 12 58
## [3,] 17 16 69
## [4,] 20 20 80
Вывести размер матрицы
dim(m3)
## [1] 4 3
Вывести количество строк в матрице
nrow(m3)
## [1] 4
Вывести количество столбцов в матрице
ncol(m1)
## [1] 2
Задать имена столбцов и строк в матрице
rownames(m3) <- c("Bio1", "Bio2", "Bio3", "Bio4")
colnames(m3) <- c("Col1", "Col2", "Col3")
m3
## Col1 Col2 Col3
## Bio1 11 8 47
## Bio2 14 12 58
## Bio3 17 16 69
## Bio4 20 20 80
Вывести названия строк и столбцов матрицы
rownames(m3)
## [1] "Bio1" "Bio2" "Bio3" "Bio4"
colnames(m3)
## [1] "Col1" "Col2" "Col3"
Получение элементов матрицы по индексам и именам строк и столбцов
m3[1, 2]
## [1] 8
m3[, 2]
## Bio1 Bio2 Bio3 Bio4
## 8 12 16 20
m3[1, ]
## Col1 Col2 Col3
## 11 8 47
m3["Bio1", "Col3"]
## [1] 47
Создать список
l1 <- list()
Добавить элемент в список
l1[[1]] <- 1:10
l1
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
l1[["matrix"]] <- matrix(1:10, nrow = 2)
l1
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
l1[[3]] <- "test"
l1
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
##
## [[3]]
## [1] "test"
Создаем data frame
df1 <- data.frame(v11 = c(1, 2, 3, 4, 5), v12 = c("T1", "T2", "T3", "T4", "T5"))
df1
## v11 v12
## 1 1 T1
## 2 2 T2
## 3 3 T3
## 4 4 T4
## 5 5 T5
Загрузить таблицу
test <- read.table("annotation.ptt", header = T, sep = "\t", quote = "")
Узнать размер таблицы (кол-во строк х кол-во столбцов)
dim(test)
## [1] 763 10
Вывести шапку таблицы
head(test)
## Start End Strand Length PID Gene Synonym Code COG
## 1 1893 2681 + 262 294660181 parA MGA_0619 - COG1192D
## 2 2671 3147 + 158 294660182 - MGA_0621 - -
## 3 3163 4548 + 461 31544207 dnaA MGA_0622 - COG0593L
## 4 4939 5133 + 64 294660183 - MGA_1322d - COG1132V
## 5 5160 6077 + 305 294660184 - MGA_0625a - COG1132V
## 6 6393 8294 + 633 294660185 - MGA_0626 - COG1132V
## Product
## 1 ParA/Soj family protein
## 2 hypothetical protein
## 3 chromosomal replication initiator protein DnaA
## 4 ABC transporter
## 5 multidrug/protein/lipid ABC transporter
## 6 multidrug/protein/lipid ABC transporter
Вывести 2 верхние строки таблицы
head(test, 2)
## Start End Strand Length PID Gene Synonym Code COG
## 1 1893 2681 + 262 294660181 parA MGA_0619 - COG1192D
## 2 2671 3147 + 158 294660182 - MGA_0621 - -
## Product
## 1 ParA/Soj family protein
## 2 hypothetical protein
Вывести 2 последние строки таблицы
tail(test, 2)
## Start End Strand Length PID Gene Synonym Code COG
## 762 1010633 1011601 - 322 294660636 dnaJ_7 MGA_0617 - COG0484O
## 763 1011604 1012779 - 391 294660637 dnaN MGA_0618 - COG0592L
## Product
## 762 DnaJ-like molecular chaperone
## 763 DNA polymerase III beta subunit DnaN
Статистика по колонкам таблицы
summary(test)
## Start End Strand Length
## Min. : 1893 Min. : 2681 -:362 Min. : 37
## 1st Qu.: 243677 1st Qu.: 245410 +:401 1st Qu.: 188
## Median : 517500 Median : 519914 Median : 314
## Mean : 505601 Mean : 506773 Mean : 390
## 3rd Qu.: 756280 3rd Qu.: 757211 3rd Qu.: 539
## Max. :1011604 Max. :1012779 Max. :1969
##
## PID Gene Synonym Code COG
## Min. :3.15e+07 - :351 MGA_0001: 1 -:763 - :274
## 1st Qu.:3.15e+07 cbiO : 2 MGA_0002: 1 COG0484O: 8
## Median :2.95e+08 vlhA.3.0.1: 2 MGA_0004: 1 COG1132V: 6
## Mean :1.92e+08 aceF : 1 MGA_0005: 1 COG3328L: 6
## 3rd Qu.:2.95e+08 ach1 : 1 MGA_0008: 1 COG0561R: 5
## Max. :2.95e+08 ackA : 1 MGA_0009: 1 COG1404O: 5
## (Other) :405 (Other) :757 (Other) :459
## Product
## hypothetical protein :220
## DnaJ-like molecular chaperone : 7
## putative transposase domain-containing protein: 6
## HAD superfamily hydrolase Cof : 5
## putative transposase : 5
## ABC transporter permease : 4
## (Other) :516
Структура таблицы
str(test)
## 'data.frame': 763 obs. of 10 variables:
## $ Start : int 1893 2671 3163 4939 5160 6393 8389 8574 8929 10080 ...
## $ End : int 2681 3147 4548 5133 6077 8294 8538 8924 10080 10886 ...
## $ Strand : Factor w/ 2 levels "-","+": 2 2 2 2 2 2 2 2 2 2 ...
## $ Length : int 262 158 461 64 305 633 49 116 383 268 ...
## $ PID : int 294660181 294660182 31544207 294660183 294660184 294660185 31544211 294660186 31544213 294660187 ...
## $ Gene : Factor w/ 411 levels "-","aceF","ach1",..: 190 1 56 1 1 1 244 240 411 151 ...
## $ Synonym: Factor w/ 763 levels "MGA_0001","MGA_0002",..: 349 350 351 731 352 353 732 354 355 356 ...
## $ Code : Factor w/ 1 level "-": 1 1 1 1 1 1 1 1 1 1 ...
## $ COG : Factor w/ 411 levels "-","COG0006E",..: 319 1 240 306 306 306 112 241 269 13 ...
## $ Product: Factor w/ 483 levels "1-acyl-sn-glycerol-3-phosphate acyltransferase",..: 261 220 105 65 250 250 55 356 315 137 ...
Вывести с третьего по пятое значения столбца Length таблицы test
test$Length[1:5]
## [1] 262 158 461 64 305
Вывести начало столбца Length таблицы test
head(test$Length)
## [1] 262 158 461 64 305 633
Вывести последние значения столбца Length таблицы test
tail(test$Length)
## [1] 210 420 846 648 322 391
Вывести элемент первой строки и второго столбца таблицы test
test[1, 2]
## [1] 2681
Вывести название строк и столбцов таблицы test
colnames(test)
## [1] "Start" "End" "Strand" "Length" "PID" "Gene" "Synonym"
## [8] "Code" "COG" "Product"
rownames(test)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11"
## [12] "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22"
## [23] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33"
## [34] "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44"
## [45] "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55"
## [56] "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66"
## [67] "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77"
## [78] "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88"
## [89] "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99"
## [100] "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110"
## [111] "111" "112" "113" "114" "115" "116" "117" "118" "119" "120" "121"
## [122] "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143"
## [144] "144" "145" "146" "147" "148" "149" "150" "151" "152" "153" "154"
## [155] "155" "156" "157" "158" "159" "160" "161" "162" "163" "164" "165"
## [166] "166" "167" "168" "169" "170" "171" "172" "173" "174" "175" "176"
## [177] "177" "178" "179" "180" "181" "182" "183" "184" "185" "186" "187"
## [188] "188" "189" "190" "191" "192" "193" "194" "195" "196" "197" "198"
## [199] "199" "200" "201" "202" "203" "204" "205" "206" "207" "208" "209"
## [210] "210" "211" "212" "213" "214" "215" "216" "217" "218" "219" "220"
## [221] "221" "222" "223" "224" "225" "226" "227" "228" "229" "230" "231"
## [232] "232" "233" "234" "235" "236" "237" "238" "239" "240" "241" "242"
## [243] "243" "244" "245" "246" "247" "248" "249" "250" "251" "252" "253"
## [254] "254" "255" "256" "257" "258" "259" "260" "261" "262" "263" "264"
## [265] "265" "266" "267" "268" "269" "270" "271" "272" "273" "274" "275"
## [276] "276" "277" "278" "279" "280" "281" "282" "283" "284" "285" "286"
## [287] "287" "288" "289" "290" "291" "292" "293" "294" "295" "296" "297"
## [298] "298" "299" "300" "301" "302" "303" "304" "305" "306" "307" "308"
## [309] "309" "310" "311" "312" "313" "314" "315" "316" "317" "318" "319"
## [320] "320" "321" "322" "323" "324" "325" "326" "327" "328" "329" "330"
## [331] "331" "332" "333" "334" "335" "336" "337" "338" "339" "340" "341"
## [342] "342" "343" "344" "345" "346" "347" "348" "349" "350" "351" "352"
## [353] "353" "354" "355" "356" "357" "358" "359" "360" "361" "362" "363"
## [364] "364" "365" "366" "367" "368" "369" "370" "371" "372" "373" "374"
## [375] "375" "376" "377" "378" "379" "380" "381" "382" "383" "384" "385"
## [386] "386" "387" "388" "389" "390" "391" "392" "393" "394" "395" "396"
## [397] "397" "398" "399" "400" "401" "402" "403" "404" "405" "406" "407"
## [408] "408" "409" "410" "411" "412" "413" "414" "415" "416" "417" "418"
## [419] "419" "420" "421" "422" "423" "424" "425" "426" "427" "428" "429"
## [430] "430" "431" "432" "433" "434" "435" "436" "437" "438" "439" "440"
## [441] "441" "442" "443" "444" "445" "446" "447" "448" "449" "450" "451"
## [452] "452" "453" "454" "455" "456" "457" "458" "459" "460" "461" "462"
## [463] "463" "464" "465" "466" "467" "468" "469" "470" "471" "472" "473"
## [474] "474" "475" "476" "477" "478" "479" "480" "481" "482" "483" "484"
## [485] "485" "486" "487" "488" "489" "490" "491" "492" "493" "494" "495"
## [496] "496" "497" "498" "499" "500" "501" "502" "503" "504" "505" "506"
## [507] "507" "508" "509" "510" "511" "512" "513" "514" "515" "516" "517"
## [518] "518" "519" "520" "521" "522" "523" "524" "525" "526" "527" "528"
## [529] "529" "530" "531" "532" "533" "534" "535" "536" "537" "538" "539"
## [540] "540" "541" "542" "543" "544" "545" "546" "547" "548" "549" "550"
## [551] "551" "552" "553" "554" "555" "556" "557" "558" "559" "560" "561"
## [562] "562" "563" "564" "565" "566" "567" "568" "569" "570" "571" "572"
## [573] "573" "574" "575" "576" "577" "578" "579" "580" "581" "582" "583"
## [584] "584" "585" "586" "587" "588" "589" "590" "591" "592" "593" "594"
## [595] "595" "596" "597" "598" "599" "600" "601" "602" "603" "604" "605"
## [606] "606" "607" "608" "609" "610" "611" "612" "613" "614" "615" "616"
## [617] "617" "618" "619" "620" "621" "622" "623" "624" "625" "626" "627"
## [628] "628" "629" "630" "631" "632" "633" "634" "635" "636" "637" "638"
## [639] "639" "640" "641" "642" "643" "644" "645" "646" "647" "648" "649"
## [650] "650" "651" "652" "653" "654" "655" "656" "657" "658" "659" "660"
## [661] "661" "662" "663" "664" "665" "666" "667" "668" "669" "670" "671"
## [672] "672" "673" "674" "675" "676" "677" "678" "679" "680" "681" "682"
## [683] "683" "684" "685" "686" "687" "688" "689" "690" "691" "692" "693"
## [694] "694" "695" "696" "697" "698" "699" "700" "701" "702" "703" "704"
## [705] "705" "706" "707" "708" "709" "710" "711" "712" "713" "714" "715"
## [716] "716" "717" "718" "719" "720" "721" "722" "723" "724" "725" "726"
## [727] "727" "728" "729" "730" "731" "732" "733" "734" "735" "736" "737"
## [738] "738" "739" "740" "741" "742" "743" "744" "745" "746" "747" "748"
## [749] "749" "750" "751" "752" "753" "754" "755" "756" "757" "758" "759"
## [760] "760" "761" "762" "763"
Вывести элемент третьей строки столбца End
test[3, "End"]
## [1] 4548
Вывести с третье по пятую строки всех столбцов
test[3:5, ]
## Start End Strand Length PID Gene Synonym Code COG
## 3 3163 4548 + 461 31544207 dnaA MGA_0622 - COG0593L
## 4 4939 5133 + 64 294660183 - MGA_1322d - COG1132V
## 5 5160 6077 + 305 294660184 - MGA_0625a - COG1132V
## Product
## 3 chromosomal replication initiator protein DnaA
## 4 ABC transporter
## 5 multidrug/protein/lipid ABC transporter
Посчитать среднее значение столбца Length
mean(test$Length)
## [1] 389.8
Добавить новый столбец к таблице
test$new <- 0
head(test)
## Start End Strand Length PID Gene Synonym Code COG
## 1 1893 2681 + 262 294660181 parA MGA_0619 - COG1192D
## 2 2671 3147 + 158 294660182 - MGA_0621 - -
## 3 3163 4548 + 461 31544207 dnaA MGA_0622 - COG0593L
## 4 4939 5133 + 64 294660183 - MGA_1322d - COG1132V
## 5 5160 6077 + 305 294660184 - MGA_0625a - COG1132V
## 6 6393 8294 + 633 294660185 - MGA_0626 - COG1132V
## Product new
## 1 ParA/Soj family protein 0
## 2 hypothetical protein 0
## 3 chromosomal replication initiator protein DnaA 0
## 4 ABC transporter 0
## 5 multidrug/protein/lipid ABC transporter 0
## 6 multidrug/protein/lipid ABC transporter 0
Записать в новый столбец значения разницы двух столбцов
test$new <- test$End - test$Start
head(test)
## Start End Strand Length PID Gene Synonym Code COG
## 1 1893 2681 + 262 294660181 parA MGA_0619 - COG1192D
## 2 2671 3147 + 158 294660182 - MGA_0621 - -
## 3 3163 4548 + 461 31544207 dnaA MGA_0622 - COG0593L
## 4 4939 5133 + 64 294660183 - MGA_1322d - COG1132V
## 5 5160 6077 + 305 294660184 - MGA_0625a - COG1132V
## 6 6393 8294 + 633 294660185 - MGA_0626 - COG1132V
## Product new
## 1 ParA/Soj family protein 788
## 2 hypothetical protein 476
## 3 chromosomal replication initiator protein DnaA 1385
## 4 ABC transporter 194
## 5 multidrug/protein/lipid ABC transporter 917
## 6 multidrug/protein/lipid ABC transporter 1901
Удалить столбец new
test$new <- NULL
head(test)
## Start End Strand Length PID Gene Synonym Code COG
## 1 1893 2681 + 262 294660181 parA MGA_0619 - COG1192D
## 2 2671 3147 + 158 294660182 - MGA_0621 - -
## 3 3163 4548 + 461 31544207 dnaA MGA_0622 - COG0593L
## 4 4939 5133 + 64 294660183 - MGA_1322d - COG1132V
## 5 5160 6077 + 305 294660184 - MGA_0625a - COG1132V
## 6 6393 8294 + 633 294660185 - MGA_0626 - COG1132V
## Product
## 1 ParA/Soj family protein
## 2 hypothetical protein
## 3 chromosomal replication initiator protein DnaA
## 4 ABC transporter
## 5 multidrug/protein/lipid ABC transporter
## 6 multidrug/protein/lipid ABC transporter
Выбор из таблицы с условием
a <- test[test$Strand == "+" & test$Length > 1300, ]
a
## Start End Strand Length PID Gene Synonym Code COG
## 21 24273 28358 + 1361 31544225 - MGA_0654 - -
## 94 110558 115042 + 1494 294660232 polC MGA_0791 - COG2176L
## 228 303943 308115 + 1390 294660312 rpoB MGA_1000 - COG0085K
## 265 364332 369059 + 1575 294660333 - MGA_1079 - -
## 337 448671 454466 + 1931 31544525 hlp2 MGA_1203 - COG1196D
## Product
## 21 ABC transporter permease
## 94 DNA polymerase III subunit alpha polC/Gram positive-type
## 228 DNA-directed RNA polymerase subunit beta
## 265 hypothetical protein
## 337 cytadherence-associated protein Hlp2
Задать имена колонок
m <- matrix(sample(c(1, 0), replace = T, size = 2000), ncol = 20)
head(m)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 1 1 0 0 1 0 1 0 1 1 0 0
## [2,] 1 0 1 1 0 0 0 0 0 0 0 1 1
## [3,] 0 0 0 0 0 0 0 1 0 0 0 1 1
## [4,] 0 1 1 0 1 1 1 1 0 0 1 0 1
## [5,] 1 1 0 1 1 0 0 0 1 1 0 0 1
## [6,] 0 0 1 1 0 0 0 0 1 0 1 1 0
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 0 0 1 1 0 1 0
## [2,] 0 1 1 1 1 1 0
## [3,] 0 0 1 1 1 1 0
## [4,] 1 0 1 0 1 1 0
## [5,] 1 0 1 1 1 1 1
## [6,] 0 0 0 1 0 1 0
md <- as.data.frame(m)
colnames(md) <- paste("SAMPLE", 1:20, sep = "_")
head(md)
## SAMPLE_1 SAMPLE_2 SAMPLE_3 SAMPLE_4 SAMPLE_5 SAMPLE_6 SAMPLE_7 SAMPLE_8
## 1 1 1 1 0 0 1 0 1
## 2 1 0 1 1 0 0 0 0
## 3 0 0 0 0 0 0 0 1
## 4 0 1 1 0 1 1 1 1
## 5 1 1 0 1 1 0 0 0
## 6 0 0 1 1 0 0 0 0
## SAMPLE_9 SAMPLE_10 SAMPLE_11 SAMPLE_12 SAMPLE_13 SAMPLE_14 SAMPLE_15
## 1 0 1 1 0 0 0 0
## 2 0 0 0 1 1 0 1
## 3 0 0 0 1 1 0 0
## 4 0 0 1 0 1 1 0
## 5 1 1 0 0 1 1 0
## 6 1 0 1 1 0 0 0
## SAMPLE_16 SAMPLE_17 SAMPLE_18 SAMPLE_19 SAMPLE_20
## 1 1 1 0 1 0
## 2 1 1 1 1 0
## 3 1 1 1 1 0
## 4 1 0 1 1 0
## 5 1 1 1 1 1
## 6 0 1 0 1 0
Функция aggregate. ищем среднее значение по столбцу Length для строк, имеющих одно значение столбца Strand
agg <- aggregate(x = test$Length, by = list(test$Strand), FUN = mean)
agg
## Group.1 x
## 1 - 377.2
## 2 + 401.2
Задаем имена полей
colnames(agg) <- c("Strand", "MeanLength")
agg
## Strand MeanLength
## 1 - 377.2
## 2 + 401.2
Назвать только 2 столбец таблицы
colnames(agg)[2] <- "NewName"
agg
## Strand NewName
## 1 - 377.2
## 2 + 401.2
Соединить 2 таблицы
test.agg <- merge(test, agg, by.x = "Strand", by.y = "Strand")
head(test.agg)
## Strand Start End Length PID Gene Synonym Code COG
## 1 - 193268 195040 590 31544361 metG_2 MGA_0893 - COG0143J
## 2 - 384996 386495 499 31544468 - MGA_1107 - COG1322S
## 3 - 180028 180384 118 31544347 himA/hup_1 MGA_0869 - COG0776L
## 4 - 149313 150758 481 31544321 - MGA_0829a - -
## 5 - 57481 59649 722 31544247 nrdA MGA_0695 - COG0209F
## 6 - 197100 197741 213 294660276 - MGA_0901 - -
## Product
## 1 methionyl-tRNA synthetase
## 2 hypothetical protein
## 3 histone-like DNA-binding superfamily protein HimA/HU/Integration host factor
## 4 hypothetical protein
## 5 ribonucleotide-diphosphate reductase subunit alpha
## 6 hypothetical protein
## NewName
## 1 377.2
## 2 377.2
## 3 377.2
## 4 377.2
## 5 377.2
## 6 377.2
Apply - матрицы Создаем случайную матрицу
m <- matrix(sample(1:100, 100), ncol = 10)
m
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 19 94 23 4 74 60 61 5 90 57
## [2,] 67 38 65 75 3 18 59 52 63 72
## [3,] 1 9 36 6 10 89 7 15 81 96
## [4,] 28 55 58 78 84 32 27 86 43 79
## [5,] 47 14 50 29 11 44 42 31 24 93
## [6,] 99 22 2 83 30 51 21 69 100 66
## [7,] 49 73 98 85 80 68 39 56 91 71
## [8,] 97 87 54 46 48 26 34 16 62 64
## [9,] 82 95 92 17 40 37 88 70 20 8
## [10,] 76 13 77 33 41 25 45 35 53 12
Находим среднее по строкам
apply(X = m, MARGIN = 1, FUN = mean)
## [1] 48.7 51.2 35.0 57.0 38.5 54.3 71.0 53.4 54.9 41.0
Находим среднее по столбцам
apply(m, 2, mean)
## [1] 56.5 50.0 55.5 45.6 42.1 45.0 42.3 43.5 62.7 61.8
Lapply - списки Создаем список из 3 векторов. Находим максимальное значение для каждого из векторов списка
l <- list()
l[[1]] <- 1:10
l[[2]] <- seq(1, 20, 2)
l[[3]] <- sample(1:30, size = 10, replace = F)
l
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
##
## [[2]]
## [1] 1 3 5 7 9 11 13 15 17 19
##
## [[3]]
## [1] 28 20 7 27 25 19 18 11 3 24
lapply(X = l, FUN = max)
## [[1]]
## [1] 10
##
## [[2]]
## [1] 19
##
## [[3]]
## [1] 28
Tapply - таблицы Считаем среднее значений столбца Length для строк, сргуппированных по значениям столбца Strand
tapply(X = test$Length, INDEX = test$Strand, FUN = mean)
## - +
## 377.2 401.2
Выбор уникальных значений из столбца таблицы
unique(test$Strand)
## [1] + -
## Levels: - +
t.cover <- read.table("metagenomic_coverage_distributions.txt", header = T,
sep = " ")
t.groups <- read.table("sample_groups.txt", header = T, sep = " ")
t.all <- merge(t.cover, t.groups, by.x = "names", by.y = "names")
Гистограмма
hist(t.all$length, breaks = 30, main = "Histogram", col = "lightgreen", xlab = "Length",
ylab = "Freq")
Точки
plot(x = t.all$length, y = t.all$cover, col = t.all$country, cex = 0.6, main = "Coverage by length",
xlab = "Length", ylab = "Coverage", pch = 16)
legend(1e+06, 5e+06, levels(t.all$country), col = 1:3, pch = 16)
Боксплот
boxplot(data = t.all, cover ~ country, col = "lightblue", main = "Coverage by country",
xlab = "Country", ylab = "Coverage")
График плотности
plot(density(t.all$length), col = "red", main = "Length density", xlab = "Length")
Барплот
barplot(c(1:30), col = heat.colors(30))
m1 <- matrix(sample(30), ncol = 5)
colnames(m1) <- c("M1", "M2", "M3", "M4", "M5")
m1
## M1 M2 M3 M4 M5
## [1,] 15 8 30 16 7
## [2,] 29 11 10 20 28
## [3,] 25 6 2 13 9
## [4,] 21 14 3 5 4
## [5,] 17 24 23 12 22
## [6,] 26 19 1 27 18
barplot(m1, col = rainbow(6), beside = TRUE)
Несколько линий создаем матрицу со случайными данными
m2 <- matrix(c(1:5, sample(20)), nrow = 5)
colnames(m2) <- c("x", "y1", "y2", "y3", "y4")
m2
## x y1 y2 y3 y4
## [1,] 1 10 13 16 19
## [2,] 2 14 15 8 11
## [3,] 3 2 12 9 4
## [4,] 4 3 18 17 1
## [5,] 5 6 20 7 5
Рисуем первую линию. Задаем границы графика Добавляем новые линии других цветов и типов
plot(x = m2[, 1], y = m2[, 2], type = "l", col = "red", ylim = c(min(m2), max(m2)),
lty = 1, xlab = "x", ylab = "y")
lines(x = m2[, 1], y = m2[, 3], type = "l", col = "blue", lty = 2)
lines(x = m2[, 1], y = m2[, 4], type = "l", col = "green", lty = 3, lwd = 3)
lines(x = m2[, 1], y = m2[, 5], type = "l", col = "orange", lty = 4)
lines(x = m2[, 1], y = m2[, 5], type = "l", col = "orange", lty = 4, lwd = 2)
Несколько графиков на одном листе
layout(mat = matrix(1:4, ncol = 2))
plot(sample(5))
plot(sample(5))
plot(sample(5))
plot(sample(5))
layout(mat = matrix(1:8, ncol = 4))
for (i in 1:8) {
plot(sample(100), col = i, pch = 16)
}
If
x <- 4
if (x == 4) print("yes") else print("no")
## [1] "yes"
if (x < 2) {
print("x <= 2")
} else if (x <= 3) {
print("x <= 3")
} else {
print("x > 3")
}
## [1] "x > 3"
Switch
switch(3, "A", "B", "C", "D", "E")
## [1] "C"
switch("T", A = print("A"), T = {
m <- 1
m <- m + 2
}, C = print("C"))
print(m)
## [1] 3
switch("T", A = print("A"), B = print("B"), print("not eq"))
## [1] "not eq"
Repeat
i <- 1
repeat {
i <- i + 1
if (i >= 5)
break
}
print(i)
## [1] 5
While
i <- 1
while (i <= 5) {
i <- i + 1
}
print(i)
## [1] 6
For
for (i in 1:5) {
print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
for (i in c("A", "B", "C", "D")) {
print(i)
}
## [1] "A"
## [1] "B"
## [1] "C"
## [1] "D"
myFunc <- function(x) {
x <- x + 2
return(x)
}
myFunc(2)
## [1] 4
x <- 2
myFunc(x)
## [1] 4
x
## [1] 2
myFunc2 <- function(x, y) {
return(x + y)
}
myFunc2(x = 3, y = 5)
## [1] 8
myFunc2(3, 5)
## [1] 8
plotVec <- function(vec = rep(1, 5)) {
plot(vec)
}
plotVec(vec = c(1, 2, 3, 4, 5))
plotVec()
m <- matrix(1:10, nrow = 2, byrow = T)
m
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 4 5
## [2,] 6 7 8 9 10
apply(m, 1, FUN = function(x) {
x + 2
})
## [,1] [,2]
## [1,] 3 8
## [2,] 4 9
## [3,] 5 10
## [4,] 6 11
## [5,] 7 12
apply(m, 2, FUN = function(x) {
x[1]
})
## [1] 1 2 3 4 5
library(reshape)
## Warning: package 'reshape' was built under R version 3.0.1
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.0.1
## Attaching package: 'reshape'
## Следующий объект скрыт from 'package:plyr':
##
## rename, round_any
df1 <- data.frame(NAME = c("A", "A", "A", "B", "B", "B", "C", "C", "C"), VALUE = 1:9,
SAMPLE = paste("S", rep(1:3, time = 3), sep = ""))
df1
## NAME VALUE SAMPLE
## 1 A 1 S1
## 2 A 2 S2
## 3 A 3 S3
## 4 B 4 S1
## 5 B 5 S2
## 6 B 6 S3
## 7 C 7 S1
## 8 C 8 S2
## 9 C 9 S3
df1.c <- cast(df1, NAME ~ SAMPLE, value = "VALUE")
df1.c
## NAME S1 S2 S3
## 1 A 1 2 3
## 2 B 4 5 6
## 3 C 7 8 9
df1.m <- melt(df1.c, id.vars = "NAME")
colnames(df1.m) <- c("NAME", "VALUE", "SAMPLE")
df1.m
## NAME VALUE SAMPLE
## S1 A 1 S1
## S1.1 B 4 S1
## S1.2 C 7 S1
## S2 A 2 S2
## S2.1 B 5 S2
## S2.2 C 8 S2
## S3 A 3 S3
## S3.1 B 6 S3
## S3.2 C 9 S3
snps <- read.table("tuber_snps.txt", header = T, sep = "\t")
head(snps)
## NUMB POS REF SNP SAMPLE
## 1 1 1849 C A KS_1180C
## 2 2 1977 A G KS_1180C
## 3 3 3446 C T KS_1180C
## 4 4 4013 T C KS_1180C
## 5 5 7362 G C KS_1180C
## 6 6 7585 G C KS_1180C
snps$S <- 1
head(snps)
## NUMB POS REF SNP SAMPLE S
## 1 1 1849 C A KS_1180C 1
## 2 2 1977 A G KS_1180C 1
## 3 3 3446 C T KS_1180C 1
## 4 4 4013 T C KS_1180C 1
## 5 5 7362 G C KS_1180C 1
## 6 6 7585 G C KS_1180C 1
Разворачиваем таблицу, заполняя отсутствующие значения - 0
snps.cast <- cast(snps, POS ~ SAMPLE, value = "S", fill = 0)
head(snps.cast)
## POS KS_1180C KS_751B KS_855B KS_DY_131 KS_DY_135 KS_DY_167 KS_DY_195
## 1 782 0 0 0 0 0 0 0
## 2 1849 1 1 1 0 0 0 0
## 3 1894 0 0 0 0 0 0 0
## 4 1977 1 1 1 1 1 1 1
## 5 2140 0 0 0 0 0 0 0
## 6 2222 0 0 0 1 0 0 1
## KS_DY_20 KS_DY_21 KS_DY_22 KS_DY_26 KS_DY_28 KS_DY_8 KS_GT_281 KS_GT_333
## 1 0 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1
## 3 1 0 0 0 0 0 0 0
## 4 1 1 1 1 1 0 1 1
## 5 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0
## KS_GT_345 KS_GT_411 KS_GT_649 KS_N0001 KS_N0002
## 1 0 0 0 0 0
## 2 1 1 1 0 1
## 3 0 0 0 0 0
## 4 1 1 1 1 1
## 5 0 0 0 0 0
## 6 0 0 0 0 0
Делаем из таблицы матрицу
snps.m <- as.matrix(snps.cast[, 2:ncol(snps.cast)])
colnames(snps.m) <- colnames(snps.cast)[2:ncol(snps.cast)]
head(snps.m)
## KS_1180C KS_751B KS_855B KS_DY_131 KS_DY_135 KS_DY_167 KS_DY_195
## [1,] 0 0 0 0 0 0 0
## [2,] 1 1 1 0 0 0 0
## [3,] 0 0 0 0 0 0 0
## [4,] 1 1 1 1 1 1 1
## [5,] 0 0 0 0 0 0 0
## [6,] 0 0 0 1 0 0 1
## KS_DY_20 KS_DY_21 KS_DY_22 KS_DY_26 KS_DY_28 KS_DY_8 KS_GT_281
## [1,] 0 1 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0
## [3,] 1 0 0 0 0 0 0
## [4,] 1 1 1 1 1 0 1
## [5,] 0 0 0 0 1 0 0
## [6,] 0 0 0 0 0 0 0
## KS_GT_333 KS_GT_345 KS_GT_411 KS_GT_649 KS_N0001 KS_N0002
## [1,] 0 0 0 0 0 0
## [2,] 1 1 1 1 0 1
## [3,] 0 0 0 0 0 0
## [4,] 1 1 1 1 1 1
## [5,] 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0
Транспонируем матрицу
snps.m <- t(snps.m)
head(snps.m[, 1:5])
## [,1] [,2] [,3] [,4] [,5]
## KS_1180C 0 1 0 1 0
## KS_751B 0 1 0 1 0
## KS_855B 0 1 0 1 0
## KS_DY_131 0 0 0 1 0
## KS_DY_135 0 0 0 1 0
## KS_DY_167 0 0 0 1 0
Расчитываем евклидово расстояние между образцами
snps.dist <- dist(snps.m)
Создаем дендрограмму
snps.hc <- hclust(snps.dist)
plot(hclust(snps.dist))
Дерево по матрице расстояний
library(phangorn)
## Warning: package 'phangorn' was built under R version 3.0.1
## Loading required package: ape
## Warning: package 'ape' was built under R version 3.0.1
## Loading required package: rgl
## Warning: package 'rgl' was built under R version 3.0.1
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 3.0.1
## Attaching package: 'igraph'
## Следующий объект скрыт from 'package:ape':
##
## as.igraph, edges
## Loading required package: Matrix
## Loading required package: lattice
## Attaching package: 'Matrix'
## Следующий объект скрыт from 'package:reshape':
##
## expand
## Warning: replacing previous import 'as.igraph' when loading 'igraph'
## Warning: replacing previous import 'edges' when loading 'igraph'
snps.NJ <- NJ(snps.dist)
plot(snps.NJ)
РСА анализ
snps.pr <- prcomp(snps.dist)
plot(x = snps.pr$x[, 1], y = snps.pr$x[, 2])
text(x = snps.pr$x[, 1], y = snps.pr$x[, 2], labels = rownames(snps.pr$x))
Heatmap
library(gplots)
## Warning: package 'gplots' was built under R version 3.0.1
## Loading required package: gtools
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## ```
```r
heatmap.2(snps.m[,1:10],tracecol=NA,margins=c(10,10),col=c('#FF3300','#33FF00'))
Диаграммы Венна
library(VennDiagram)
## Attaching package: 'VennDiagram'
## Следующий объект скрыт from 'package:ape':
##
## rotate
us.names <- unique(snps$SAMPLE)
vl <- list()
for (i in 1:5) {
vl[[as.character(us.names[i])]] <- snps$POS[snps$SAMPLE == us.names[i]]
}
venn.diagram(x = vl, file = "venn.tiff", col = "transparent", fill = rainbow(5),
sub.cex = 0.2)
## [1] 1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.0.1
library(reshape)
ma <- matrix(sample(30), ncol = 5)
df <- as.data.frame(ma)
df
## V1 V2 V3 V4 V5
## 1 25 3 26 20 12
## 2 13 27 10 11 8
## 3 19 22 5 7 1
## 4 4 2 18 15 17
## 5 9 24 16 14 23
## 6 6 29 28 21 30
df <- melt(data = df, id.vars = "V1")
df
## V1 variable value
## 1 25 V2 3
## 2 13 V2 27
## 3 19 V2 22
## 4 4 V2 2
## 5 9 V2 24
## 6 6 V2 29
## 7 25 V3 26
## 8 13 V3 10
## 9 19 V3 5
## 10 4 V3 18
## 11 9 V3 16
## 12 6 V3 28
## 13 25 V4 20
## 14 13 V4 11
## 15 19 V4 7
## 16 4 V4 15
## 17 9 V4 14
## 18 6 V4 21
## 19 25 V5 12
## 20 13 V5 8
## 21 19 V5 1
## 22 4 V5 17
## 23 9 V5 23
## 24 6 V5 30
ggplot(df, aes(x = V1, y = value, color = variable)) + geom_line()
cover <- read.table("metagenomic_coverage_distributions.txt", header = T, sep = " ",
quote = "\"")
head(cover)
## numb names cover nts redund uncov length setnums frac
## 1 1 MH0002_090117 893069 2001825 0.6228 2321349 3214418 0 0.2778
## 2 2 MH0003_090107 2084687 9524024 2.9629 1129731 3214418 0 0.6485
## 3 3 MH0004_081123 1132424 2289496 0.7123 2081994 3214418 0 0.3523
## 4 4 MH0005_081120 1225936 2159212 0.6717 1988482 3214418 0 0.3814
## 5 5 MH0006_081120 1901123 4666464 1.4517 1313295 3214418 0 0.5914
## 6 6 MH0007_081123 1733793 3473712 1.0807 1480625 3214418 0 0.5394
## mean
## 1 2.242
## 2 4.569
## 3 2.022
## 4 1.761
## 5 2.455
## 6 2.004
sample <- read.table("sample_groups.txt", header = T, sep = " ", quote = "\"")
head(sample)
## numb names short_names groups country
## 1 98 MH0002_090117 MH0002 MH0 EUR
## 2 99 MH0003_090107 MH0003 MH0 EUR
## 3 100 MH0004_081123 MH0004 MH0 EUR
## 4 101 MH0005_081120 MH0005 MH0 EUR
## 5 102 MH0006_081120 MH0006 MH0 EUR
## 6 103 MH0007_081123 MH0007 MH0 EUR
mer <- merge(cover, sample, by.x = "names", by.y = "names")
head(mer)
## names numb.x cover nts redund uncov length setnums
## 1 MH0002_090117 5504 7836 23550 0.007961 2950280 2958116 18
## 2 MH0002_090117 15841 748911 1248525 0.397445 2392470 3141381 57
## 3 MH0002_090117 17317 415754 568574 0.086713 6141234 6556988 68
## 4 MH0002_090117 10012 260338 468825 0.143367 3009771 3270109 33
## 5 MH0002_090117 6470 6440 12675 0.003613 3501433 3507873 21
## 6 MH0002_090117 4538 328107 1600348 0.417116 3508587 3836694 15
## frac mean numb.y short_names groups country
## 1 0.002649 3.005 98 MH0002 MH0 EUR
## 2 0.238402 1.667 98 MH0002 MH0 EUR
## 3 0.063406 1.368 98 MH0002 MH0 EUR
## 4 0.079611 1.801 98 MH0002 MH0 EUR
## 5 0.001836 1.968 98 MH0002 MH0 EUR
## 6 0.085518 4.878 98 MH0002 MH0 EUR
ggplot(mer, aes(x = length)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.
ggplot(mer, aes(x = length, color = country)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.
ggplot(mer, aes(x = length, fill = country)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.
ggplot(mer, aes(x = length, y = cover, color = country)) + geom_point()
ggplot(mer, aes(x = length, y = cover, color = country)) + geom_boxplot()
ggplot(mer, aes(x = length, y = cover, fill = country)) + geom_boxplot()