1.Use readRDS() to store the contents of the R data file derisi.rds in a new variable derisi.
derisi = readRDS("/Users/cutiecorgi/Documents/biostats_harman/derisi.rds")
class(derisi)
## [1] "data.frame"
it is a data frame.
head(derisi)
## ORF Name R1 R2 R3 R4 R5 R6 R7 R1.Bkg R2.Bkg R3.Bkg
## 1 YHR007C ERG11 7896 7484 14679 14617 9853 7599 6490 1155 1984 1323
## 2 YBR218C PYC2 12144 11177 10241 4820 4950 7047 17035 1074 1694 1243
## 3 YAL051W FUN43 4478 6435 6230 6848 5111 7180 4497 1140 1950 1649
## 4 YAL053W 6343 8243 6743 3304 3556 4694 3849 1020 1897 1196
## 5 YAL054C ACS1 1542 3044 2076 1695 1753 4806 10802 1082 1940 1504
## 6 YAL055W 1769 3243 2094 1367 1853 3580 1956 975 1821 1185
## R4.Bkg R5.Bkg R6.Bkg R7.Bkg G1 G2 G3 G4 G5 G6 G7 G1.Bkg
## 1 1171 914 2445 981 8432 7173 11736 16798 12315 16111 13931 2404
## 2 876 1211 2444 742 11509 10226 13372 6500 6255 9024 6904 2148
## 3 1183 898 2637 927 5865 5895 5345 6302 5400 7933 5026 2422
## 4 881 1045 2518 697 6762 7454 6323 3595 4689 5660 4145 2107
## 5 1108 902 2610 980 3138 3785 2419 2114 2763 3561 1897 2405
## 6 851 1047 2536 698 2844 4069 2583 1651 2530 3484 1550 1674
## G2.Bkg G3.Bkg G4.Bkg G5.Bkg G6.Bkg G7.Bkg
## 1 2561 1598 1506 1696 2667 1244
## 2 2527 1641 1196 1553 2569 848
## 3 2496 1902 1501 1644 2808 1154
## 4 2663 1607 1162 1577 2544 857
## 5 2528 1847 1445 1713 2767 1142
## 6 2648 1591 1114 1528 2668 870
names(derisi)
## [1] "ORF" "Name" "R1" "R2" "R3" "R4" "R5" "R6"
## [9] "R7" "R1.Bkg" "R2.Bkg" "R3.Bkg" "R4.Bkg" "R5.Bkg" "R6.Bkg" "R7.Bkg"
## [17] "G1" "G2" "G3" "G4" "G5" "G6" "G7" "G1.Bkg"
## [25] "G2.Bkg" "G3.Bkg" "G4.Bkg" "G5.Bkg" "G6.Bkg" "G7.Bkg"
It is a list of some ORFs (open reading frame) with their expression (red (R) and green (G) fluoresecent) intensity and background values at different time point (1-7).
dim(derisi)
## [1] 6102 30
nrow(derisi)
## [1] 6102
ncol(derisi)
## [1] 30
Derisi is a data frame with a 6102 x 30 dimension (6102 rows, 30 columns)
derisi$R3[5]
## [1] 2076
it is 2076
fifth_row = derisi[5,]
# subset all green intensity values
G1 = derisi$G1
plot(ecdf(G1))
summary(ecdf(G1))
## Empirical CDF: 3916 unique values with summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1120 3839 5100 6732 7504 41019
10.For the same first timepoint, generate a scatter plot that directly compares the red intensity against the green intensity, with each point representing a different gene.
R1 = derisi$R1
G1 = derisi$G1
plot(R1,G1)
R5 = derisi$R5
G5 = derisi$G5
plot(R5,G5)
R7 = derisi$R7
G7 = derisi$G7
plot(R7,G7)
The Green/Red fluorescent intensities ratio is diverging. This makes
sense because during the diauxic shift, some genes are expressed whereas
some others are repressed, and this causes different genes binding to
different cDNAs (Cy3 or Cy5) and shows different colours.
which(derisi == "ACR1",arr.ind = TRUE)
## row col
## 3309 3293 2
derisi[3293,1]
## [1] YJR095W
## 6102 Levels: YAL001C YAL002W YAL003W YAL004W YAL005C YAL007C YAL008W ... YPR204W
R1 = derisi$R1
G1 = derisi$G1
# log2(R/G) = log2(R) - log2(G)
LR1 = log(R1,base = exp(2)) - log(G1, base = exp(2))
R1 = derisi$R1
G1 = derisi$G1
# log2(R/G) = log2(R) - log2(G)
LR1 = log(R1)/log(2) - log(G1)/log(2)
plot(ecdf(LR1))
R_delta = derisi$R1 - derisi$R1.Bkg
G_delta = derisi$G1 - derisi$G1.Bkg
LR1.normalized = log(R_delta)/log(2) - log(G_delta)/log(2)
plot(ecdf(LR1.normalized))
R_delta = derisi$R1 - derisi$R1.Bkg
G_delta = derisi$G1 - derisi$G1.Bkg
LR1.normalized = log(R_delta)/log(2) - log(G_delta)/log(2)
plot(ecdf(LR1.normalized))
R1 = derisi$R1
G1 = derisi$G1
# log2(R/G) = log2(R) - log2(G)
LR1 = log(R1)/log(2) - log(G1)/log(2)
lines(ecdf(LR1), col = "red")
when log(R/G) = 0, it means R/G = 1, so R = G. The normalised curve
shows that the proportions of green and red are more even, which makes
more sense.
derisi$LR1 = LR1.normalized
derisi$LR2 = log(derisi$R2)/log(2) - log(derisi$G2)/log(2)
derisi$LR3 = log(derisi$R3)/log(2) - log(derisi$G3)/log(2)
derisi$LR4 = log(derisi$R4)/log(2) - log(derisi$G4)/log(2)
derisi$LR5 = log(derisi$R5)/log(2) - log(derisi$G5)/log(2)
derisi$LR6 = log(derisi$R6)/log(2) - log(derisi$G6)/log(2)
derisi$LR7 = log(derisi$R7)/log(2) - log(derisi$G7)/log(2)
head(derisi)
## ORF Name R1 R2 R3 R4 R5 R6 R7 R1.Bkg R2.Bkg R3.Bkg
## 1 YHR007C ERG11 7896 7484 14679 14617 9853 7599 6490 1155 1984 1323
## 2 YBR218C PYC2 12144 11177 10241 4820 4950 7047 17035 1074 1694 1243
## 3 YAL051W FUN43 4478 6435 6230 6848 5111 7180 4497 1140 1950 1649
## 4 YAL053W 6343 8243 6743 3304 3556 4694 3849 1020 1897 1196
## 5 YAL054C ACS1 1542 3044 2076 1695 1753 4806 10802 1082 1940 1504
## 6 YAL055W 1769 3243 2094 1367 1853 3580 1956 975 1821 1185
## R4.Bkg R5.Bkg R6.Bkg R7.Bkg G1 G2 G3 G4 G5 G6 G7 G1.Bkg
## 1 1171 914 2445 981 8432 7173 11736 16798 12315 16111 13931 2404
## 2 876 1211 2444 742 11509 10226 13372 6500 6255 9024 6904 2148
## 3 1183 898 2637 927 5865 5895 5345 6302 5400 7933 5026 2422
## 4 881 1045 2518 697 6762 7454 6323 3595 4689 5660 4145 2107
## 5 1108 902 2610 980 3138 3785 2419 2114 2763 3561 1897 2405
## 6 851 1047 2536 698 2844 4069 2583 1651 2530 3484 1550 1674
## G2.Bkg G3.Bkg G4.Bkg G5.Bkg G6.Bkg G7.Bkg LR1 LR2 LR3
## 1 2561 1598 1506 1696 2667 1244 0.16128321 0.06123293 0.32281291
## 2 2527 1641 1196 1553 2569 848 0.24192066 0.12829108 -0.38485866
## 3 2496 1902 1501 1644 2808 1154 -0.04468223 0.12644834 0.22104222
## 4 2663 1607 1162 1577 2544 857 0.19345840 0.14515468 0.09278138
## 5 2528 1847 1445 1713 2767 1142 -0.67217934 -0.31432494 -0.22060433
## 6 2648 1591 1114 1528 2668 870 -0.55929762 -0.32734526 -0.30278620
## LR4 LR5 LR6 LR7
## 1 -0.2006422 -0.32178167 -1.08416456 -1.1020084
## 2 -0.4314066 -0.33758136 -0.35675785 1.3029976
## 3 0.1198729 -0.07935382 -0.14388270 -0.1604478
## 4 -0.1217781 -0.39902495 -0.26998421 -0.1068884
## 5 -0.3186901 -0.65640957 0.43255421 2.5095069
## 6 -0.2723269 -0.44927450 0.03921496 0.3356382
filter = grep("LR",names(derisi))
logratios = derisi[,filter]
Names = derisi$Name
ORF = derisi$ORF
logratios = cbind(ORF,Names,logratios)
head(logratios)
## ORF Names LR1 LR2 LR3 LR4 LR5
## 1 YHR007C ERG11 0.16128321 0.06123293 0.32281291 -0.2006422 -0.32178167
## 2 YBR218C PYC2 0.24192066 0.12829108 -0.38485866 -0.4314066 -0.33758136
## 3 YAL051W FUN43 -0.04468223 0.12644834 0.22104222 0.1198729 -0.07935382
## 4 YAL053W 0.19345840 0.14515468 0.09278138 -0.1217781 -0.39902495
## 5 YAL054C ACS1 -0.67217934 -0.31432494 -0.22060433 -0.3186901 -0.65640957
## 6 YAL055W -0.55929762 -0.32734526 -0.30278620 -0.2723269 -0.44927450
## LR6 LR7
## 1 -1.08416456 -1.1020084
## 2 -0.35675785 1.3029976
## 3 -0.14388270 -0.1604478
## 4 -0.26998421 -0.1068884
## 5 0.43255421 2.5095069
## 6 0.03921496 0.3356382
saveRDS(logratios,file = "logratios.rds")
grep("ACR1",logratios$Names) #to find out the index of gene ACR1
## [1] 3293
logratios$LR7[3293]
## [1] 2.771354
at the last time point, the fold induction (log(Red/Green)) is 2.77.
# samples were collected at each 2 hours interval after a 9 hour of initial inoculation
hours = c(9,11,13,15,17,19,21)
plot(hours,logratios[3293,3:9],type = "b")
yes, it is consistent with the paper’s figure.
grep("SAM1",logratios$Names) #to find out the index of gene ACR1
## [1] 3940
plot(hours,logratios[3940,3:9],type = "b")
up = max(logratios$LR7)
down = min(logratios$LR7)
logratios[which(logratios$LR7 == up),]
## ORF Names LR1 LR2 LR3 LR4 LR5
## 3700 YKR097W PCK1 -0.126823 -0.5332018 0.2154702 -0.2540825 -0.3816576
## LR6 LR7
## 3700 -0.02246456 3.309514
logratios[which(logratios$LR7 == down),]
## ORF Names LR1 LR2 LR3 LR4 LR5
## 5547 YOR309C 0.2026396 -0.03513594 0.1900498 -0.6378539 -0.3760874
## LR6 LR7
## 5547 -1.878142 -2.720188
gene YKR097W (PCK1) is most strongly upregulated, whereas gene YOR309C is most strongly downregulated.
plot(ecdf(logratios$LR1), col = "red")
lines(ecdf(logratios$LR7))
The spread of data is more even at the last time point than the first
time point, as data the first time point distributes more at two ends.
Relating to that, at the last time point, there are more data in the
middle, suggesting that there are more genes which expressed at roughly
equal levels before and after the diauxic shift, so they manifest as
yellow (green + red).