Quiz 1 Data Analysis (Retake)

Use the two data sets expr.csv and biol.csv from the class webpage.

Use read.table(), with appropriate values for the arguments header and sep, to load the data into objects expr and biol.

# Read expr.csv
expr <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/expr.csv", header=TRUE, sep=',')
# Read biol.csv
biol <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/biol.csv", header=TRUE, sep=',')

# Show head data of expr
head(expr)

##   sample gene1 gene2 gene3 gene4 gene5 gene6
## 1   6021  1.33  3.23   2.5  3.18  1.79  2.23
## 2   8092 -0.62  1.95  0.89 -0.09 -0.05  0.60
## 3   5240  1.53  3.18   1.7  1.26  1.31  4.64
## 4   9815  0.69  3.39  3.63  1.16  3.06  1.51
## 5   5396   -99   -99   -99  6.14   -99  3.38
## 6   7193  1.16  4.56  4.27  3.59  4.15  0.95

# Show head data of biol
head(biol)

##   sample f1 f2
## 1  Y6021  2  S
## 2  Y8092  2  S
## 3  X5240  1  T
## 4  A9815  1  U
## 5  Y5396  2  T
## 6  X7193  .  T

Re-load the data using read.table with the argument na.strings set so that these values become NA

# Read expr.csv
expr <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/expr.csv", header=TRUE, sep=",", na.strings=c("-99", "."))
# Read biol.csv
biol <- read.table("D:/Kuliah Tata/semester 5/Analisis Data Statistik/Quiz 1 ADS/biol.csv", header=TRUE, sep=",", na.strings=c("-99", "."))

head(expr)

##   sample gene1 gene2 gene3 gene4 gene5 gene6
## 1   6021  1.33  3.23  2.50  3.18  1.79  2.23
## 2   8092 -0.62  1.95  0.89 -0.09 -0.05  0.60
## 3   5240  1.53  3.18  1.70  1.26  1.31  4.64
## 4   9815  0.69  3.39  3.63  1.16  3.06  1.51
## 5   5396    NA    NA    NA  6.14    NA  3.38
## 6   7193  1.16  4.56  4.27  3.59  4.15  0.95

head(biol)

##   sample f1 f2
## 1  Y6021  2  S
## 2  Y8092  2  S
## 3  X5240  1  T
## 4  A9815  1  U
## 5  Y5396  2  T
## 6  X7193 NA  T

# Jumlah NA per kolom
colSums(is.na(expr))

## sample  gene1  gene2  gene3  gene4  gene5  gene6 
##      0      5      4      7      2      6      2

colSums(is.na(biol))

## sample     f1     f2 
##      0      3      0

Are the datasets expr and biol data frames, lists or matrices?

# Cek expr
is.data.frame(expr)

## [1] TRUE

is.list(expr)

## [1] TRUE

is.matrix(expr)

## [1] FALSE

# Cek biol
is.data.frame(biol)

## [1] TRUE

is.list(biol)

## [1] TRUE

is.matrix(biol)

## [1] FALSE

Use lapply or sapply with FUN, is.factor and is.numeric to see what the different columns of expr and biol were turned into.

# Cek tipe kolom dalam data expr
sapply(expr, function(x) c(is.factor=is.factor(x), is.numeric=is.numeric(x)))

##            sample gene1 gene2 gene3 gene4 gene5 gene6
## is.factor   FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## is.numeric   TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

# Cek tipe kolom dalam data biol
sapply(biol, function(x) c(is.factor=is.factor(x), is.numeric=is.numeric(x)))

##            sample    f1    f2
## is.factor   FALSE FALSE FALSE
## is.numeric  FALSE  TRUE FALSE

Play around with indexing data frames.

names(expr); dimnames(expr); names(biol)

## [1] "sample" "gene1"  "gene2"  "gene3"  "gene4"  "gene5"  "gene6"

## [[1]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [91] "91" "92"
## 
## [[2]]
## [1] "sample" "gene1"  "gene2"  "gene3"  "gene4"  "gene5"  "gene6"

## [1] "sample" "f1"     "f2"

biol[,-1]                 # semua kolom kecuali kolom pertama

##    f1 f2
## 1   2  S
## 2   2  S
## 3   1  T
## 4   1  U
## 5   2  T
## 6  NA  T
## 7   1  T
## 8   2  S
## 9   2  T
## 10  2  U
## 11  1  S
## 12  2  U
## 13  1  U
## 14  2  U
## 15  1 NA
## 16  1  U
## 17 NA  S
## 18  2  U
## 19  2  T
## 20  1  U
## 21  1  T
## 22  1  S
## 23 NA NA
## 24  2  T
## 25  1  U
## 26  1  U
## 27  2  T
## 28  2  U
## 29  1  U
## 30  1  U
## 31  1  U
## 32  1  U
## 33  1  U
## 34  1  U
## 35  2  S
## 36  2  U
## 37  2  T
## 38  1  S
## 39  2  T
## 40  1  U
## 41  2  S
## 42  1  T
## 43  2  T
## 44  2  S
## 45  1  S
## 46  1  U
## 47  2  U
## 48  2  U
## 49  2  U
## 50  1  U
## 51  2  S
## 52  2  S
## 53  1  T
## 54  2  T
## 55  2  S
## 56  2  S
## 57  2  T
## 58  1  U
## 59  2  S
## 60  2  S
## 61  1  U
## 62  2  T
## 63  2  T
## 64  1  T
## 65  2  U
## 66  1  U
## 67  1  T
## 68  2  S
## 69  2  T
## 70  2  T
## 71  1  T
## 72  2  T
## 73  2  S
## 74  1  S
## 75  2  T
## 76  2  T
## 77  2  U
## 78  1  U
## 79  2  T
## 80  2  S
## 81  1  S
## 82  2  T
## 83  2  S
## 84  2  S
## 85  1  T
## 86  1  T
## 87  2  U
## 88  1  T
## 89  1  T
## 90  2  T
## 91  1  S
## 92  1  S

biol[1:20, ]              # 20 baris pertama

##    sample f1 f2
## 1   Y6021  2  S
## 2   Y8092  2  S
## 3   X5240  1  T
## 4   A9815  1  U
## 5   Y5396  2  T
## 6   X7193 NA  T
## 7   Y8629  1  T
## 8   X7297  2  S
## 9   A8013  2  T
## 10  A7757  2  U
## 11  X5603  1  S
## 12  A8078  2  U
## 13  Y7953  1  U
## 14  Y6276  2  U
## 15  A5351  1 NA
## 16  X7634  1  U
## 17  A5515 NA  S
## 18  X9036  2  U
## 19  A8520  2  T
## 20  A6930  1  U

biol[["f1"]]              # akses kolom 'f1' (sama seperti biol$f1)

##  [1]  2  2  1  1  2 NA  1  2  2  2  1  2  1  2  1  1 NA  2  2  1  1  1 NA  2  1
## [26]  1  2  2  1  1  1  1  1  1  2  2  2  1  2  1  2  1  2  2  1  1  2  2  2  1
## [51]  2  2  1  2  2  2  2  1  2  2  1  2  2  1  2  1  1  2  2  2  1  2  2  1  2
## [76]  2  2  1  2  2  1  2  2  2  1  1  2  1  1  2  1  1

biol[,"f2"]               # akses kolom 'f2' pakai nama kolom

##  [1] "S"  "S"  "T"  "U"  "T"  "T"  "T"  "S"  "T"  "U"  "S"  "U"  "U"  "U"  "NA"
## [16] "U"  "S"  "U"  "T"  "U"  "T"  "S"  "NA" "T"  "U"  "U"  "T"  "U"  "U"  "U" 
## [31] "U"  "U"  "U"  "U"  "S"  "U"  "T"  "S"  "T"  "U"  "S"  "T"  "T"  "S"  "S" 
## [46] "U"  "U"  "U"  "U"  "U"  "S"  "S"  "T"  "T"  "S"  "S"  "T"  "U"  "S"  "S" 
## [61] "U"  "T"  "T"  "T"  "U"  "U"  "T"  "S"  "T"  "T"  "T"  "T"  "S"  "S"  "T" 
## [76] "T"  "U"  "U"  "T"  "S"  "S"  "T"  "S"  "S"  "T"  "T"  "U"  "T"  "T"  "T" 
## [91] "S"  "S"

expr[1:10, -c(3,5,6)]     # ambil baris 1–10, dan hapus kolom ke-3, 5, 6

##    sample gene1 gene3 gene6
## 1    6021  1.33  2.50  2.23
## 2    8092 -0.62  0.89  0.60
## 3    5240  1.53  1.70  4.64
## 4    9815  0.69  3.63  1.51
## 5    5396    NA    NA  3.38
## 6    7193  1.16  4.27  0.95
## 7    8629 -0.38  0.22  0.10
## 8    7297  1.30    NA  2.96
## 9    8013  2.01  4.16  0.64
## 10   7757  2.39  3.15  2.43

expr$gene3                # akses kolom bernama 'gene3'

##  [1]  2.50  0.89  1.70  3.63    NA  4.27  0.22    NA  4.16  3.15 -0.20    NA
## [13]  3.18  0.69  2.69 -2.00  2.34  0.48 -0.95  2.86  2.60  3.91  0.97  3.20
## [25]  1.88  1.40  1.27  2.88 -0.46  0.88  1.38 -0.41  2.26  2.00  0.58  3.32
## [37]  1.58    NA  1.06  3.94  2.66  5.61  2.87    NA  3.09  1.67  3.34  2.36
## [49]  2.93 -0.89  4.35  1.62  2.92  3.07  2.04    NA  5.91  2.51  1.75  3.01
## [61]  4.07  0.93  3.92  3.00  2.75  4.12 -0.39  2.81  4.24  5.18  3.12  1.40
## [73]    NA  0.16  2.99 -0.29  0.80  0.15  2.18  3.32  3.57  1.57  3.87  2.79
## [85]  2.79  0.89  2.32  4.39 -0.48  4.19  0.35  2.64

expr[expr[,3] > 4.5, ]    # ambil semua baris di mana kolom ke-3 > 4.5

##      sample gene1 gene2 gene3 gene4 gene5 gene6
## NA       NA    NA    NA    NA    NA    NA    NA
## 6      7193  1.16  4.56  4.27  3.59  4.15  0.95
## 9      8013  2.01  4.55  4.16  3.72  5.44  0.64
## NA.1     NA    NA    NA    NA    NA    NA    NA
## NA.2     NA    NA    NA    NA    NA    NA    NA
## 57     8382  3.00  5.83  5.91  7.69  6.62  2.34
## NA.3     NA    NA    NA    NA    NA    NA    NA

expr[!is.na(expr[,3]) & expr[,3] > 4.5, ]

##    sample gene1 gene2 gene3 gene4 gene5 gene6
## 6    7193  1.16  4.56  4.27  3.59  4.15  0.95
## 9    8013  2.01  4.55  4.16  3.72  5.44  0.64
## 57   8382  3.00  5.83  5.91  7.69  6.62  2.34

# ambil baris yang kolom ke-3-nya TIDAK NA dan > 4.5

Use apply, any and is.na to find rows in biol with at least one NA.

biol[apply(is.na(biol), 1, any), ]

##    sample f1 f2
## 6   X7193 NA  T
## 17  A5515 NA  S
## 23  Y9329 NA NA

Use apply or sapply to find the means of each column of expr. Note that you may need to use the na.rm argument for the function mean.

sapply(expr, mean, na.rm = TRUE)

##      sample       gene1       gene2       gene3       gene4       gene5 
## 7179.989130    1.142989    3.091591    2.212000    2.071111    2.169651 
##       gene6 
##    2.107333

Use dimnames and as.character to made the names of the rows in sample column for expr, and then delete the sample column. Get the column means again. Also get the SDs and ranges for each column.

rownames(expr) <- as.character(expr$sample)
expr$sample <- NULL

colMeans(expr, na.rm = TRUE)

##    gene1    gene2    gene3    gene4    gene5    gene6 
## 1.142989 3.091591 2.212000 2.071111 2.169651 2.107333

apply(expr, 2, sd, na.rm = TRUE)

##     gene1     gene2     gene3     gene4     gene5     gene6 
## 0.9279386 0.8775408 1.6011802 1.5699720 1.4791384 1.0812943

apply(expr, 2, range, na.rm = TRUE)

##      gene1 gene2 gene3 gene4 gene5 gene6
## [1,] -1.53  0.86 -2.00 -2.21 -1.04  0.03
## [2,]  3.00  5.83  5.91  7.69  6.62  4.64

Calculate the correlation matrix of expr using the function cor. You’ll need to use the argument use: try use=“complete.obs” and use=“pairwise.complete.obs”.

cor(expr, use = "complete.obs")

##           gene1     gene2     gene3     gene4     gene5     gene6
## gene1 1.0000000 0.8109350 0.7884901 0.6846526 0.7392696 0.4381714
## gene2 0.8109350 1.0000000 0.7880232 0.7725580 0.7989718 0.3825137
## gene3 0.7884901 0.7880232 1.0000000 0.6800959 0.6889018 0.2336774
## gene4 0.6846526 0.7725580 0.6800959 1.0000000 0.6779315 0.1294823
## gene5 0.7392696 0.7989718 0.6889018 0.6779315 1.0000000 0.1527897
## gene6 0.4381714 0.3825137 0.2336774 0.1294823 0.1527897 1.0000000

cor(expr, use = "pairwise.complete.obs")

##           gene1     gene2     gene3     gene4     gene5     gene6
## gene1 1.0000000 0.7948008 0.7636731 0.6975228 0.7574929 0.4463219
## gene2 0.7948008 1.0000000 0.7791394 0.7721993 0.7819576 0.4220130
## gene3 0.7636731 0.7791394 1.0000000 0.6636195 0.7004765 0.2776524
## gene4 0.6975228 0.7721993 0.6636195 1.0000000 0.6815926 0.1893331
## gene5 0.7574929 0.7819576 0.7004765 0.6815926 1.0000000 0.2020812
## gene6 0.4463219 0.4220130 0.2776524 0.1893331 0.2020812 1.0000000

For each row, subtract the average of the first two columns from each of the other four columns, and then delete first two columns. Re-calculate the correlation matrix.

expr_adj <- expr
expr_adj[, 3:6] <- expr_adj[, 3:6] - rowMeans(expr_adj[, 1:2], na.rm = TRUE)
expr_adj <- expr_adj[, -c(1,2)]
cor(expr_adj, use = "pairwise.complete.obs")

##            gene3      gene4      gene5      gene6
## gene3  1.0000000  0.2816152  0.2620337 -0.2717860
## gene4  0.2816152  1.0000000  0.3126408 -0.3274785
## gene5  0.2620337  0.3126408  1.0000000 -0.3748852
## gene6 -0.2717860 -0.3274785 -0.3748852  1.0000000

Use pairs to get a scatterplot matrix of the columns of expr against each other.

pairs(expr)

> 13. Remove the character from the front of the sample, and create a new column (a factor), f3, containing that character. (Use the function substring.) Make the row names of biol the sample number (with the character removed), and then get the rows of biol in the same order as the rows of expr.

# Buat kolom baru f3 dari karakter pertama
biol$f3 <- substring(biol$sample, 1, 1)

# Hapus karakter pertama dan jadikan rowname
biol$sample_clean <- substring(biol$sample, 2)
rownames(biol) <- biol$sample_clean

# Samakan urutan dengan expr
biol <- biol[rownames(expr), ]
head(biol)

##      sample f1 f2 f3 sample_clean
## 6021  Y6021  2  S  Y         6021
## 8092  Y8092  2  S  Y         8092
## 5240  X5240  1  T  X         5240
## 9815  A9815  1  U  A         9815
## 5396  Y5396  2  T  Y         5396
## 7193  X7193 NA  T  X         7193

Use apply and tapply to find the mean and SD of each column of expr within each group biol$f1==1 and biol$f1==2.

apply(expr, 2, function(x) tapply(x, biol$f1, mean, na.rm = TRUE))

##      gene1    gene2    gene3    gene4    gene5    gene6
## 1 1.117838 2.853684 1.916667 1.354103 1.953947 2.112308
## 2 1.188723 3.277447 2.457907 2.654792 2.336000 2.154167

apply(expr, 2, function(x) tapply(x, biol$f1, sd, na.rm = TRUE))

##       gene1     gene2    gene3    gene4    gene5    gene6
## 1 1.0359996 0.8455685 1.758154 1.340537 1.555466 1.149955
## 2 0.8657684 0.8550501 1.431775 1.533136 1.411438 1.028452

Use t.test (you may need to first do library(c.test)) with apply and split to do t-tests comparing the groups defined by biol$f2 for the values in each column of expr.

library(stats)

# split berdasarkan f2
t_results <- apply(expr, 2, function(x) {
  t.test(split(x, biol$f2)[[1]], split(x, biol$f2)[[2]])$p.value
})

t_results

##      gene1      gene2      gene3      gene4      gene5      gene6 
## 0.81449926 0.48424932 0.68870255 0.09942554 0.15953411 0.16031764

Uji t dilakukan untuk membandingkan rata-rata nilai ekspresi setiap gen antara dua kelompok yang didefinisikan oleh variabel biol$f2. Hasil uji menghasilkan p-value untuk masing-masing gen. Berdasarkan hasil perhitungan, seluruh p-value memiliki nilai lebih besar dari 0.05 (gene1 = 0.8145, gene2 = 0.4842, gene3 = 0.6887, gene4 = 0.0994, gene5 = 0.1595, gene6 = 0.1603).

Hal ini menunjukkan bahwa tidak terdapat perbedaan yang signifikan secara statistik dalam rata-rata ekspresi keenam gen antara dua kelompok f2. Dengan kata lain, faktor f2 tidak memiliki pengaruh yang bermakna terhadap tingkat ekspresi gen dalam data ini.

Quiz 1 Data Analysis (Retake)

Putu Calista Arthanti Dewi (5003231096) - Analisis Data Statistik - K

2025-10-10