- Use the two data sets expr.csv and biol.csv from the class webpage.
- Use read.table(), with appropriate values for the arguments header and sep, to load the data into objects expr and biol.
# Read expr.csv
expr <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/expr.csv", header=TRUE, sep=',')
# Read biol.csv
biol <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/biol.csv", header=TRUE, sep=',')
# Show head data of expr
head(expr)
## sample gene1 gene2 gene3 gene4 gene5 gene6
## 1 6021 1.33 3.23 2.5 3.18 1.79 2.23
## 2 8092 -0.62 1.95 0.89 -0.09 -0.05 0.60
## 3 5240 1.53 3.18 1.7 1.26 1.31 4.64
## 4 9815 0.69 3.39 3.63 1.16 3.06 1.51
## 5 5396 -99 -99 -99 6.14 -99 3.38
## 6 7193 1.16 4.56 4.27 3.59 4.15 0.95
# Show head data of biol
head(biol)
## sample f1 f2
## 1 Y6021 2 S
## 2 Y8092 2 S
## 3 X5240 1 T
## 4 A9815 1 U
## 5 Y5396 2 T
## 6 X7193 . T
- Re-load the data using read.table with the argument na.strings set so that these values become NA
# Read expr.csv
expr <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/expr.csv", header=TRUE, sep=",", na.strings=c("-99", "."))
# Read biol.csv
biol <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/biol.csv", header=TRUE, sep=",", na.strings=c("-99", "."))
head(expr)
## sample gene1 gene2 gene3 gene4 gene5 gene6
## 1 6021 1.33 3.23 2.50 3.18 1.79 2.23
## 2 8092 -0.62 1.95 0.89 -0.09 -0.05 0.60
## 3 5240 1.53 3.18 1.70 1.26 1.31 4.64
## 4 9815 0.69 3.39 3.63 1.16 3.06 1.51
## 5 5396 NA NA NA 6.14 NA 3.38
## 6 7193 1.16 4.56 4.27 3.59 4.15 0.95
head(biol)
## sample f1 f2
## 1 Y6021 2 S
## 2 Y8092 2 S
## 3 X5240 1 T
## 4 A9815 1 U
## 5 Y5396 2 T
## 6 X7193 NA T
# Jumlah NA per kolom
colSums(is.na(expr))
## sample gene1 gene2 gene3 gene4 gene5 gene6
## 0 5 4 7 2 6 2
colSums(is.na(biol))
## sample f1 f2
## 0 3 0
- Are the datasets expr and biol data frames, lists or matrices?
# Cek expr
is.data.frame(expr)
## [1] TRUE
is.list(expr)
## [1] TRUE
is.matrix(expr)
## [1] FALSE
# Cek biol
is.data.frame(biol)
## [1] TRUE
is.list(biol)
## [1] TRUE
is.matrix(biol)
## [1] FALSE
- Use lapply or sapply with FUN, is.factor and is.numeric to see what the different columns of expr and biol were turned into.
# Cek tipe kolom dalam data expr
sapply(expr, function(x) c(is.factor=is.factor(x), is.numeric=is.numeric(x)))
## sample gene1 gene2 gene3 gene4 gene5 gene6
## is.factor FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## is.numeric TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Cek tipe kolom dalam data biol
sapply(biol, function(x) c(is.factor=is.factor(x), is.numeric=is.numeric(x)))
## sample f1 f2
## is.factor FALSE FALSE FALSE
## is.numeric FALSE TRUE FALSE
- Play around with indexing data frames.
names(expr); dimnames(expr); names(biol)
## [1] "sample" "gene1" "gene2" "gene3" "gene4" "gene5" "gene6"
## [[1]]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [91] "91" "92"
##
## [[2]]
## [1] "sample" "gene1" "gene2" "gene3" "gene4" "gene5" "gene6"
## [1] "sample" "f1" "f2"
biol[,-1] # semua kolom kecuali kolom pertama
## f1 f2
## 1 2 S
## 2 2 S
## 3 1 T
## 4 1 U
## 5 2 T
## 6 NA T
## 7 1 T
## 8 2 S
## 9 2 T
## 10 2 U
## 11 1 S
## 12 2 U
## 13 1 U
## 14 2 U
## 15 1 NA
## 16 1 U
## 17 NA S
## 18 2 U
## 19 2 T
## 20 1 U
## 21 1 T
## 22 1 S
## 23 NA NA
## 24 2 T
## 25 1 U
## 26 1 U
## 27 2 T
## 28 2 U
## 29 1 U
## 30 1 U
## 31 1 U
## 32 1 U
## 33 1 U
## 34 1 U
## 35 2 S
## 36 2 U
## 37 2 T
## 38 1 S
## 39 2 T
## 40 1 U
## 41 2 S
## 42 1 T
## 43 2 T
## 44 2 S
## 45 1 S
## 46 1 U
## 47 2 U
## 48 2 U
## 49 2 U
## 50 1 U
## 51 2 S
## 52 2 S
## 53 1 T
## 54 2 T
## 55 2 S
## 56 2 S
## 57 2 T
## 58 1 U
## 59 2 S
## 60 2 S
## 61 1 U
## 62 2 T
## 63 2 T
## 64 1 T
## 65 2 U
## 66 1 U
## 67 1 T
## 68 2 S
## 69 2 T
## 70 2 T
## 71 1 T
## 72 2 T
## 73 2 S
## 74 1 S
## 75 2 T
## 76 2 T
## 77 2 U
## 78 1 U
## 79 2 T
## 80 2 S
## 81 1 S
## 82 2 T
## 83 2 S
## 84 2 S
## 85 1 T
## 86 1 T
## 87 2 U
## 88 1 T
## 89 1 T
## 90 2 T
## 91 1 S
## 92 1 S
biol[1:20, ] # 20 baris pertama
## sample f1 f2
## 1 Y6021 2 S
## 2 Y8092 2 S
## 3 X5240 1 T
## 4 A9815 1 U
## 5 Y5396 2 T
## 6 X7193 NA T
## 7 Y8629 1 T
## 8 X7297 2 S
## 9 A8013 2 T
## 10 A7757 2 U
## 11 X5603 1 S
## 12 A8078 2 U
## 13 Y7953 1 U
## 14 Y6276 2 U
## 15 A5351 1 NA
## 16 X7634 1 U
## 17 A5515 NA S
## 18 X9036 2 U
## 19 A8520 2 T
## 20 A6930 1 U
biol[["f1"]] # akses kolom 'f1' (sama seperti biol$f1)
## [1] 2 2 1 1 2 NA 1 2 2 2 1 2 1 2 1 1 NA 2 2 1 1 1 NA 2 1
## [26] 1 2 2 1 1 1 1 1 1 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 1
## [51] 2 2 1 2 2 2 2 1 2 2 1 2 2 1 2 1 1 2 2 2 1 2 2 1 2
## [76] 2 2 1 2 2 1 2 2 2 1 1 2 1 1 2 1 1
biol[,"f2"] # akses kolom 'f2' pakai nama kolom
## [1] "S" "S" "T" "U" "T" "T" "T" "S" "T" "U" "S" "U" "U" "U" "NA"
## [16] "U" "S" "U" "T" "U" "T" "S" "NA" "T" "U" "U" "T" "U" "U" "U"
## [31] "U" "U" "U" "U" "S" "U" "T" "S" "T" "U" "S" "T" "T" "S" "S"
## [46] "U" "U" "U" "U" "U" "S" "S" "T" "T" "S" "S" "T" "U" "S" "S"
## [61] "U" "T" "T" "T" "U" "U" "T" "S" "T" "T" "T" "T" "S" "S" "T"
## [76] "T" "U" "U" "T" "S" "S" "T" "S" "S" "T" "T" "U" "T" "T" "T"
## [91] "S" "S"
expr[1:10, -c(3,5,6)] # ambil baris 1–10, dan hapus kolom ke-3, 5, 6
## sample gene1 gene3 gene6
## 1 6021 1.33 2.50 2.23
## 2 8092 -0.62 0.89 0.60
## 3 5240 1.53 1.70 4.64
## 4 9815 0.69 3.63 1.51
## 5 5396 NA NA 3.38
## 6 7193 1.16 4.27 0.95
## 7 8629 -0.38 0.22 0.10
## 8 7297 1.30 NA 2.96
## 9 8013 2.01 4.16 0.64
## 10 7757 2.39 3.15 2.43
expr$gene3 # akses kolom bernama 'gene3'
## [1] 2.50 0.89 1.70 3.63 NA 4.27 0.22 NA 4.16 3.15 -0.20 NA
## [13] 3.18 0.69 2.69 -2.00 2.34 0.48 -0.95 2.86 2.60 3.91 0.97 3.20
## [25] 1.88 1.40 1.27 2.88 -0.46 0.88 1.38 -0.41 2.26 2.00 0.58 3.32
## [37] 1.58 NA 1.06 3.94 2.66 5.61 2.87 NA 3.09 1.67 3.34 2.36
## [49] 2.93 -0.89 4.35 1.62 2.92 3.07 2.04 NA 5.91 2.51 1.75 3.01
## [61] 4.07 0.93 3.92 3.00 2.75 4.12 -0.39 2.81 4.24 5.18 3.12 1.40
## [73] NA 0.16 2.99 -0.29 0.80 0.15 2.18 3.32 3.57 1.57 3.87 2.79
## [85] 2.79 0.89 2.32 4.39 -0.48 4.19 0.35 2.64
expr[expr[,3] > 4.5, ] # ambil semua baris di mana kolom ke-3 > 4.5
## sample gene1 gene2 gene3 gene4 gene5 gene6
## NA NA NA NA NA NA NA NA
## 6 7193 1.16 4.56 4.27 3.59 4.15 0.95
## 9 8013 2.01 4.55 4.16 3.72 5.44 0.64
## NA.1 NA NA NA NA NA NA NA
## NA.2 NA NA NA NA NA NA NA
## 57 8382 3.00 5.83 5.91 7.69 6.62 2.34
## NA.3 NA NA NA NA NA NA NA
expr[!is.na(expr[,3]) & expr[,3] > 4.5, ]
## sample gene1 gene2 gene3 gene4 gene5 gene6
## 6 7193 1.16 4.56 4.27 3.59 4.15 0.95
## 9 8013 2.01 4.55 4.16 3.72 5.44 0.64
## 57 8382 3.00 5.83 5.91 7.69 6.62 2.34
# ambil baris yang kolom ke-3-nya TIDAK NA dan > 4.5
- Use apply, any and is.na to find rows in biol with at least one NA.
biol[apply(is.na(biol), 1, any), ]
## sample f1 f2
## 6 X7193 NA T
## 17 A5515 NA S
## 23 Y9329 NA NA
- Use apply or sapply to find the means of each column of expr. Note that you may need to use the na.rm argument for the function mean.
sapply(expr, mean, na.rm = TRUE)
## sample gene1 gene2 gene3 gene4 gene5
## 7179.989130 1.142989 3.091591 2.212000 2.071111 2.169651
## gene6
## 2.107333
- Use dimnames and as.character to made the names of the rows in sample column for expr, and then delete the sample column. Get the column means again. Also get the SDs and ranges for each column.
rownames(expr) <- as.character(expr$sample)
expr$sample <- NULL
colMeans(expr, na.rm = TRUE)
## gene1 gene2 gene3 gene4 gene5 gene6
## 1.142989 3.091591 2.212000 2.071111 2.169651 2.107333
apply(expr, 2, sd, na.rm = TRUE)
## gene1 gene2 gene3 gene4 gene5 gene6
## 0.9279386 0.8775408 1.6011802 1.5699720 1.4791384 1.0812943
apply(expr, 2, range, na.rm = TRUE)
## gene1 gene2 gene3 gene4 gene5 gene6
## [1,] -1.53 0.86 -2.00 -2.21 -1.04 0.03
## [2,] 3.00 5.83 5.91 7.69 6.62 4.64
- Calculate the correlation matrix of expr using the function cor. You’ll need to use the argument use: try use=“complete.obs” and use=“pairwise.complete.obs”.
cor(expr, use = "complete.obs")
## gene1 gene2 gene3 gene4 gene5 gene6
## gene1 1.0000000 0.8109350 0.7884901 0.6846526 0.7392696 0.4381714
## gene2 0.8109350 1.0000000 0.7880232 0.7725580 0.7989718 0.3825137
## gene3 0.7884901 0.7880232 1.0000000 0.6800959 0.6889018 0.2336774
## gene4 0.6846526 0.7725580 0.6800959 1.0000000 0.6779315 0.1294823
## gene5 0.7392696 0.7989718 0.6889018 0.6779315 1.0000000 0.1527897
## gene6 0.4381714 0.3825137 0.2336774 0.1294823 0.1527897 1.0000000
cor(expr, use = "pairwise.complete.obs")
## gene1 gene2 gene3 gene4 gene5 gene6
## gene1 1.0000000 0.7948008 0.7636731 0.6975228 0.7574929 0.4463219
## gene2 0.7948008 1.0000000 0.7791394 0.7721993 0.7819576 0.4220130
## gene3 0.7636731 0.7791394 1.0000000 0.6636195 0.7004765 0.2776524
## gene4 0.6975228 0.7721993 0.6636195 1.0000000 0.6815926 0.1893331
## gene5 0.7574929 0.7819576 0.7004765 0.6815926 1.0000000 0.2020812
## gene6 0.4463219 0.4220130 0.2776524 0.1893331 0.2020812 1.0000000
- For each row, subtract the average of the first two columns from each of the other four columns, and then delete first two columns. Re-calculate the correlation matrix.
expr_adj <- expr
expr_adj[, 3:6] <- expr_adj[, 3:6] - rowMeans(expr_adj[, 1:2], na.rm = TRUE)
expr_adj <- expr_adj[, -c(1,2)]
cor(expr_adj, use = "pairwise.complete.obs")
## gene3 gene4 gene5 gene6
## gene3 1.0000000 0.2816152 0.2620337 -0.2717860
## gene4 0.2816152 1.0000000 0.3126408 -0.3274785
## gene5 0.2620337 0.3126408 1.0000000 -0.3748852
## gene6 -0.2717860 -0.3274785 -0.3748852 1.0000000
- Use pairs to get a scatterplot matrix of the columns of expr against each other.
pairs(expr)
> 13. Remove the character from the front of the sample, and create a
new column (a factor), f3, containing that character. (Use the function
substring.) Make the row names of biol the sample number (with the
character removed), and then get the rows of biol in the same order as
the rows of expr.
# Buat kolom baru f3 dari karakter pertama
biol$f3 <- substring(biol$sample, 1, 1)
# Hapus karakter pertama dan jadikan rowname
biol$sample_clean <- substring(biol$sample, 2)
rownames(biol) <- biol$sample_clean
# Samakan urutan dengan expr
biol <- biol[rownames(expr), ]
head(biol)
## sample f1 f2 f3 sample_clean
## 6021 Y6021 2 S Y 6021
## 8092 Y8092 2 S Y 8092
## 5240 X5240 1 T X 5240
## 9815 A9815 1 U A 9815
## 5396 Y5396 2 T Y 5396
## 7193 X7193 NA T X 7193
- Use apply and tapply to find the mean and SD of each column of expr within each group biol\(f1==1 and biol\)f1==2.
apply(expr, 2, function(x) tapply(x, biol$f1, mean, na.rm = TRUE))
## gene1 gene2 gene3 gene4 gene5 gene6
## 1 1.117838 2.853684 1.916667 1.354103 1.953947 2.112308
## 2 1.188723 3.277447 2.457907 2.654792 2.336000 2.154167
apply(expr, 2, function(x) tapply(x, biol$f1, sd, na.rm = TRUE))
## gene1 gene2 gene3 gene4 gene5 gene6
## 1 1.0359996 0.8455685 1.758154 1.340537 1.555466 1.149955
## 2 0.8657684 0.8550501 1.431775 1.533136 1.411438 1.028452
- Use t.test (you may need to first do library(c.test)) with apply and split to do t-tests comparing the groups defined by biol$f2 for the values in each column of expr.
library(stats)
# split berdasarkan f2
t_results <- apply(expr, 2, function(x) {
t.test(split(x, biol$f2)[[1]], split(x, biol$f2)[[2]])$p.value
})
t_results
## gene1 gene2 gene3 gene4 gene5 gene6
## 0.81449926 0.48424932 0.68870255 0.09942554 0.15953411 0.16031764
Uji t dilakukan untuk membandingkan rata-rata nilai ekspresi setiap gen antara dua kelompok yang didefinisikan oleh variabel biol$f2. Hasil uji menghasilkan p-value untuk masing-masing gen. Berdasarkan hasil perhitungan, seluruh p-value memiliki nilai lebih besar dari 0.05 (gene1 = 0.8145, gene2 = 0.4842, gene3 = 0.6887, gene4 = 0.0994, gene5 = 0.1595, gene6 = 0.1603).
Hal ini menunjukkan bahwa tidak terdapat perbedaan yang signifikan secara statistik dalam rata-rata ekspresi keenam gen antara dua kelompok f2. Dengan kata lain, faktor f2 tidak memiliki pengaruh yang bermakna terhadap tingkat ekspresi gen dalam data ini.