dta_69284 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69284.txt", skip = 17, header = T)
dta_02340 = read.delim("C:/Users/beste/Downloads/raw-3/raw/2340.txt", skip = 17, header = T)
dta_33091 = read.delim("C:/Users/beste/Downloads/raw-3/raw/33091.txt", skip = 18, header = T)
dta_69976 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69976.txt", skip = 19, header = T)
dta_69971 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69971.txt", skip = 19, header = T)
dta_69973 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69973.txt", skip = 19, header = T)
dta_64995 = read.delim("C:/Users/beste/Downloads/raw-3/raw/64995.txt", skip = 17, header = T)
dta_48033 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48033.txt", skip = 17, header = T)
dta_48035 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48035.txt", skip = 17, header = T)
dta_69980 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69980.txt", skip = 19, header = T)
dta_69977 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69977.txt", skip = 19, header = T)
dta_69972 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69972.txt", skip = 19, header = T)
dta_48038 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48038.txt", skip = 17, header = T)
dta_64992 = read.delim("C:/Users/beste/Downloads/raw-3/raw/64992.txt", skip = 17, header = T)
dta_69659 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69659.txt", skip = 17, header = T)
dta_69974 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69974.txt", skip = 19, header = T)
dta_70239 = read.delim("C:/Users/beste/Downloads/raw-3/raw/70239.txt", skip = 19, header = T)
dta_65000 = read.delim("C:/Users/beste/Downloads/raw-3/raw/65000GENEPIX0.txt", skip = 0, header = T)
dta_65003 = read.delim("C:/Users/beste/Downloads/raw-3/raw/65003.txt", skip = 17, header = T)
dta_33093 = read.delim("C:/Users/beste/Downloads/raw-3/raw/33093.txt", skip = 18, header = T)
dta_48031 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48031.txt", skip = 17, header = T)
dta_69979 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69979.txt", skip = 19, header = T)
dta_69981 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69981.txt", skip = 19, header = T)
n=23
meta = data.frame("strain" = rep(NA, n), "tech" = rep(NA, n), "year" = rep(NA, n))
meta$strain = c("control", "ale", "control", rep("fuel", 3), "bakers", "wine", "wine",
rep("fuel", 3), "wine", "bakers", "control", "fuel", "fuel", "bakers", "bakers",
"control", "wine", "fuel", "fuel")
meta$tech = c("dunn", rep("unknown", 2), rep("stambuk", 3), rep("dunn", 3),
rep("stambuk", 3), rep("dunn", 3), rep("stambuk", 2), "dunn", "dunn", "unknown",
"dunn", rep("stambuk", 2))
meta$year = c(2006, 1999, "unknown", rep(2006, 3), 2005, 2004, 2004, rep(2006, 3), 2004,
2005, rep(2006, 3), 2005, 2005, "unknown", 2004, 2006, 2006)
keep_ii = (1:n)[-c(2, 3, 20)]
meta = meta[keep_ii, ]; rownames(meta) = 1:20
n = 20
all_spots = table(c(dta_69284[,1], dta_69976[,1], dta_69971[,1], dta_69973[,1],
dta_64995[,1], dta_48033[,1], dta_48035[,1], dta_69980[,1], dta_69977[,1],
dta_69972[,1], dta_48038[,1], dta_64992[,1], dta_69659[,1], dta_69974[,1],
dta_70239[,1], dta_65000[,1], dta_65003[,1], dta_48031[,1], dta_69979[,1],
dta_69981[,1]))
uniq_spots = as.numeric(names(all_spots))
shared_ii = (1:length(uniq_spots))[all_spots == n]
shared_spots = uniq_spots[shared_ii]
m_shared = length(shared_spots)
## Keep only the data for the shared features. Leaves m = 6740 features.
dta_69284 = dta_69284[dta_69284[, 1] %in% shared_spots, ]
dta_69976 = dta_69976[dta_69976[, 1] %in% shared_spots, ]
dta_69971 = dta_69971[dta_69971[, 1] %in% shared_spots, ]
dta_69973 = dta_69973[dta_69973[, 1] %in% shared_spots, ]
dta_64995 = dta_64995[dta_64995[, 1] %in% shared_spots, ]
dta_48033 = dta_48033[dta_48033[, 1] %in% shared_spots, ]
dta_48035 = dta_48035[dta_48035[, 1] %in% shared_spots, ]
dta_69980 = dta_69980[dta_69980[, 1] %in% shared_spots, ]
dta_69977 = dta_69977[dta_69977[, 1] %in% shared_spots, ]
dta_69972 = dta_69972[dta_69972[, 1] %in% shared_spots, ]
dta_48038 = dta_48038[dta_48038[, 1] %in% shared_spots, ]
dta_64992 = dta_64992[dta_64992[, 1] %in% shared_spots, ]
dta_69659 = dta_69659[dta_69659[, 1] %in% shared_spots, ]
dta_69974 = dta_69974[dta_69974[, 1] %in% shared_spots, ]
dta_70239 = dta_70239[dta_70239[, 1] %in% shared_spots, ]
dta_65000 = dta_65000[dta_65000[, 1] %in% shared_spots, ]
dta_65003 = dta_65003[dta_65003[, 1] %in% shared_spots, ]
dta_48031 = dta_48031[dta_48031[, 1] %in% shared_spots, ]
dta_69979 = dta_69979[dta_69979[, 1] %in% shared_spots, ]
dta_69981 = dta_69981[dta_69981[, 1] %in% shared_spots, ]
m = 6740
## Pull out median intensities by channel, log transform.
Y_1 = log2(cbind(dta_69284[,46], dta_69976[,45], dta_69971[,45], dta_69973[,45],
dta_64995[,45], dta_48033[,45], dta_48035[,45], dta_69980[,45], dta_69977[,45],
dta_69972[,45], dta_48038[,45], dta_64992[,45], dta_69659[,45], dta_69974[,45],
dta_70239[,45], dta_65000[,45], dta_65003[,45], dta_48031[,45], dta_69979[,45],
dta_69981[,45]))
Y_2 = log2(cbind(dta_69284[,51], dta_69976[,50], dta_69971[,50], dta_69973[,50],
dta_64995[,50], dta_48033[,50], dta_48035[,50], dta_69980[,50], dta_69977[,50],
dta_69972[,50], dta_48038[,50], dta_64992[,50], dta_69659[,50], dta_69974[,50],
dta_70239[,50], dta_65000[,50], dta_65003[,50], dta_48031[,50], dta_69979[,50],
dta_69981[,50]))
Y = Y_2 - Y_1
#1) YEAST DATA ANALYSIS
#1a) Hierarchical clustering dendrograms
par(mfrow=c(1,3))
HC1 = hclust(dist(t(Y)), "average")
plot(HC1)
HC2 = hclust(dist(t(Y)), "complete")
plot(HC2)
HC3 = hclust(dist(t(Y)), "single")
plot(HC3)
#1ai) First merge
HC1$merge
## [,1] [,2]
## [1,] -2 -9
## [2,] -6 -18
## [3,] -19 1
## [4,] -16 -17
## [5,] -15 3
## [6,] -4 -14
## [7,] 4 5
## [8,] -5 -12
## [9,] -7 2
## [10,] 6 7
## [11,] -3 -10
## [12,] -8 11
## [13,] -13 8
## [14,] -1 9
## [15,] 10 13
## [16,] -11 14
## [17,] -20 12
## [18,] 15 17
## [19,] 16 18
HC1$height[1]
## [1] 17.37377
#1ai) The samples that merged first were samples 2 and 9, and the distance between these samples is 17.37377.
#1aii) Height plot
plot(HC1$height, main="Average")
#1aii) For the average linkage function, there appears to be an “elbow” in the height plot at about index 17. This is at a merge distance of about 60. Looking at the dendrogram, this suggests that there are 3 clusters.
#1aiii) Meta data for the clusters
x<-c(rep(1,5),rep(2,11),rep(3,4))
df<-cbind(HC1$order,x)
df2<-df[order(df[,1]),]
meta<-cbind(meta,cluster=df2[,2])
meta[order(meta$cluster),]
## strain tech year cluster
## 1 control dunn 2006 1
## 6 wine dunn 2004 1
## 7 wine dunn 2004 1
## 11 wine dunn 2004 1
## 18 wine dunn 2004 1
## 2 fuel stambuk 2006 2
## 4 fuel stambuk 2006 2
## 5 bakers dunn 2005 2
## 9 fuel stambuk 2006 2
## 12 bakers dunn 2005 2
## 13 control dunn 2006 2
## 14 fuel stambuk 2006 2
## 15 fuel stambuk 2006 2
## 16 bakers dunn 2005 2
## 17 bakers dunn 2005 2
## 19 fuel stambuk 2006 2
## 3 fuel stambuk 2006 3
## 8 fuel stambuk 2006 3
## 10 fuel stambuk 2006 3
## 20 fuel stambuk 2006 3
par(mfrow=c(1,2))
boxplot(Y_1, main = "Control channels")
## Mean log intensity ratios (Ch2 : Ch1) by strain.
X_str = model.matrix(~ factor(meta$strain) - 1)
Y_bar_str = t(t(Y %*% X_str) / as.numeric(table(meta$strain)))
boxplot(Y_bar_str[, 2], Y_bar_str[, 3], Y_bar_str[, 1], Y_bar_str[, 4],
names = c("Control", "Fuel", "Bakers", "Wine"), main = "Feature means by strain")
## Now by technician.
X_tech = model.matrix(~ factor(meta$tech) - 1)
Y_bar_tech = t(t(Y %*% X_tech) / as.numeric(table(meta$tech)))
boxplot(Y_bar_tech[, 1], Y_bar_tech[, 2], names = c("Dunn", "Stambuk"), main =
"Feature means by technician")
## And by year.
X_year = model.matrix(~ factor(meta$year) - 1)
Y_bar_year = t(t(Y %*% X_year) / as.numeric(table(meta$year)))
boxplot(Y_bar_year[, 1], Y_bar_year[, 2], Y_bar_year[, 3], names = c("2004", "2005",
"2006"), main = "Feature means by year")
#1aiii) The strain and technician both appear to play a strong role in determining the cluster. Cluster 1 consists of samples processed by only Dunn, and 4 out of the 5 samples where from the wine strain. Cluster 3 consists of samples processed by only Stambuk, and they are all from the fuel strain. It is difficult to separate the effects of strain vs. technician since the wine, bakers, and control strains were used by only Dunn and the fuel strain was used by only Stambuk. Cluster 2 is a mix of both technicians and several of the strains.
#1b) K-means clustering
km<-kmeans(dist(t(Y)),3)
km
## K-means clustering with 3 clusters of sizes 4, 11, 5
##
## Cluster means:
## 1 2 3 4 5 6 7 8
## 1 120.51210 52.62688 29.13918 67.83583 94.91882 127.45514 145.64610 34.21302
## 2 65.41009 32.51808 71.94684 31.97844 43.33981 74.38643 90.26437 48.49420
## 3 34.79078 91.30074 137.36832 79.21907 63.45656 28.50473 31.74489 111.16084
## 9 10 11 12 13 14 15 16
## 1 60.31700 33.59372 139.68620 78.95923 92.61507 52.21049 58.67631 73.58231
## 2 28.61618 56.25923 86.45536 33.23977 42.23202 33.69062 29.50708 30.16616
## 3 82.05279 118.82275 40.28088 70.08953 55.58452 94.44468 85.77077 74.47636
## 17 18 19 20
## 1 66.35630 130.29557 55.88779 40.30615
## 2 29.36694 76.92072 31.87356 97.47647
## 3 80.50771 28.02288 88.65858 163.52417
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 2 1 2 2 3 3 1 2 1 3 2 2 2 2 2 2 3 2 1
##
## Within cluster sum of squares by cluster:
## [1] 31221.91 38387.47 14230.00
## (between_SS / total_SS = 79.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(meta$cluster,km$cluster)
##
## 1 2 3
## 1 0 0 5
## 2 0 11 0
## 3 4 0 0
meta2<-cbind(meta,km$cluster)
meta2[order(meta$cluster),]
## strain tech year cluster km$cluster
## 1 control dunn 2006 1 3
## 6 wine dunn 2004 1 3
## 7 wine dunn 2004 1 3
## 11 wine dunn 2004 1 3
## 18 wine dunn 2004 1 3
## 2 fuel stambuk 2006 2 2
## 4 fuel stambuk 2006 2 2
## 5 bakers dunn 2005 2 2
## 9 fuel stambuk 2006 2 2
## 12 bakers dunn 2005 2 2
## 13 control dunn 2006 2 2
## 14 fuel stambuk 2006 2 2
## 15 fuel stambuk 2006 2 2
## 16 bakers dunn 2005 2 2
## 17 bakers dunn 2005 2 2
## 19 fuel stambuk 2006 2 2
## 3 fuel stambuk 2006 3 1
## 8 fuel stambuk 2006 3 1
## 10 fuel stambuk 2006 3 1
## 20 fuel stambuk 2006 3 1
#1b) Although the labels are different, we can see that the k-means clustering produced the same clusters as the hierarchical clustering using the average linkage function.
#2) CARDIO DATA ANALYSIS
#BiocManager::install("GEOquery")
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
gds<-getGEO(filename="C:/Users/beste/Downloads/GDS4308.soft",)
loggds<-GDS2MA(gds,do.log2=TRUE)
Z1<-cbind(P6=loggds$M[,1],P1=loggds$M[,2],P5=loggds$M[,3],P4=loggds$M[,4],P3=loggds$M[,5])
Z2<-cbind(P6=loggds$M[,6],P1=loggds$M[,7],P5=loggds$M[,8],P4=loggds$M[,9],P3=loggds$M[,10])
Z<-Z1-Z2
#2a) Hiearchical clustering
par(mfrow=c(1,2))
H4<-hclust(dist(t(Z)),"complete")
plot(H4)
plot(H4$height, main="Complete")
#2a) Looking at the height plot, there appears to be an “elbow” between the 3rd and 4th index starting at a height of about 140. Looking at our dendrogram, this corresponds to a model with two clusters: one cluster is patients 1, 3, 4, and 6 while the other cluster is patient 5. When looking at the meta data, it seems that age is a key factor in the clusters - the first few clusters consist of the oldest patients, and patient 5 is the youngest patient.
#2b) Principal component analysis
gds.pca<-prcomp(t(Z),center=T,scale=T)
gds.pca$x
## PC1 PC2 PC3 PC4 PC5
## P6 -115.71544 1.073683 -79.89080 128.726298 -7.004446e-13
## P1 -43.39557 165.756557 54.54176 -34.884923 -3.023250e-12
## P5 282.75266 -4.638492 -43.32329 4.201396 1.927066e-12
## P4 -105.11263 -64.950327 -76.74059 -119.851519 7.667220e-13
## P3 -18.52902 -97.241421 145.41291 21.808748 1.014956e-12
summary(gds.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 163.2592 101.4548 97.8114 90.3397 8.305e-13
## Proportion of Variance 0.4875 0.1883 0.1750 0.1493 0.000e+00
## Cumulative Proportion 0.4875 0.6757 0.8507 1.0000 1.000e+00
plot(gds.pca$x[,1],gds.pca$x[,2])
#reduce to 2 principal components
dim(Z)
## [1] 54675 5
red<-(Z)%*%gds.pca$x[,1:2]
red[1:5,]
## PC1 PC2
## [1,] 86.43747 46.603511
## [2,] 346.52273 -9.809721
## [3,] -444.40386 44.178868
## [4,] -170.77970 -16.104913
## [5,] -131.54657 93.110364
#2b) Looking at the principal component weights, there is no single patient that is significantly more influential than the rest; Patients 1 and 3 have relatively small PC1 values but the coefficients are still significant. When we look at the proportion of the variance, PC1 accounts for about 49%. This implies that there are likely several variables (patients) that account for the variability of the data. We do see that the coefficient for patient 1 is a large positive value, while all the rest of the coefficients are negative - this split corresponds with the clusters identified in part (a). Since the first two principal components account for about 68% of the variability, it would be sufficient to reduce the model to two principal components for ease of interpretation.
#2c) SVD
Z_centered = scale(t(Z), center = T, scale = T)
ss = svd(Z_centered)
dd = ss$d ^ 2 / sum(ss$d ^ 2)
round(dd, 4)
## [1] 0.4875 0.1883 0.1750 0.1493 0.0000
#eigenvalues:
e<-ss$d^2
round(e,4)
## [1] 106614.29 41172.34 38268.28 32645.08 0.00
e/sum(e)
## [1] 4.874911e-01 1.882595e-01 1.749807e-01 1.492688e-01 1.261556e-29
#2c) When we find the singular values and find the percentage of variation for each one, we see that the percentages correctly match the variation percentages for the PCA testing. The first eigenfeature accounts for about 49% of the variation.
#2d) Bootstrap t-test
m = nrow(Z)
n = ncol(Z)
## Paired differences.
t_stats <- apply(Z, 1, function(x) { t.test(x)$statistic })
## Bootstrap for statistical significance.
set.seed(101)
B <- 100
p_vals <- numeric(m)
num_na_t <- numeric(m)
for(i in 1:m) {
if(i == ((i %/% 1000) * 1000))
cat(i)
## To approximate the sampling distribution of the test statistic under the null
## hypothesis, we resample after centering to force the mean to be 0.
d_i_centered <- Z[i, ] - mean(Z[i, ])
t_b <- rep(NA, B)
for(b in 1:B) {
d_b <- sample(d_i_centered, replace = TRUE)
## If possible, compute t-statistic.
if(length(unique(d_b)) > 1)
t_b[b] <- t.test(d_b)$statistic
}
is_na_t <- is.na(t_b)
num_na_t[i] <- sum(is_na_t)
t_b_b <- t_b[!is_na_t]
p_vals[i] <- sum(abs(t_b[!is_na_t]) >= abs(t_stats[i])) / (B - num_na_t[i])
}
## 100020003000400050006000700080009000100001100012000130001400015000160001700018000190002000021000220002300024000250002600027000280002900030000310003200033000340003500036000370003800039000400004100042000430004400045000460004700048000490005000051000520005300054000
hist(p_vals)
fdr <- p.adjust(p_vals, method = "fdr")
sum(p_vals<.05)
## [1] 2070
sum(p_vals<.05)/length(p_vals)
## [1] 0.03786008
sum(fdr<.05)
## [1] 396
sum(fdr<.05)/length(fdr)
## [1] 0.007242798
#2d) Without adjusting for the multiple testing, there are 2070 p=values that are significant, about 3.8% of the features tested. When we adjust for mulitple testing, there are 396 features that are significant at an FDR of 0.05. This is roughly 0.7% of the features that were tested.
#3) DATA MATCHING
BRCA_raw <- read.delim("C:/Users/beste/Downloads/TCGA-BRCA_htseq_rawcounts.tsv")
BRCA_phen <- read.delim("C:/Users/beste/Downloads/TCGA-BRCA.GDC_phenotype-1.tsv")
#examine tables
BRCA_raw[1:5,1:5]
## Ensembl_ID TCGA.E9.A1NI.01A TCGA.A1.A0SP.01A TCGA.BH.A201.01A
## 1 ENSG00000000003.13 8.787903 12.064743 11.801304
## 2 ENSG00000000005.5 0.000000 2.807355 4.954196
## 3 ENSG00000000419.11 11.054604 11.292897 11.314017
## 4 ENSG00000000457.12 10.246741 9.905387 11.117643
## 5 ENSG00000000460.15 8.965784 10.053926 9.957102
## TCGA.E2.A14T.01A
## 1 10.723661
## 2 6.658211
## 3 11.214926
## 4 12.093748
## 5 9.503826
BRCA_phen[1:5,1:5]
## submitter_id.samples additional_pharmaceutical_therapy
## 1 TCGA-A2-A0CY-01A
## 2 TCGA-B6-A40B-01A
## 3 TCGA-AO-A0J8-01A
## 4 TCGA-A8-A08J-01A
## 5 TCGA-E2-A14N-01A
## additional_radiation_therapy additional_surgery_locoregional_procedure
## 1
## 2
## 3
## 4
## 5
## additional_surgery_metastatic_procedure
## 1
## 2
## 3
## 4
## 5
#rename rows/cols
rownames(BRCA_raw)<-BRCA_raw[,1]
BRCA_raw<-BRCA_raw[,-1]
BRCA_phen$submitter_id.samples<-gsub("-",".",BRCA_phen$submitter_id.samples)
rownames(BRCA_phen)<-BRCA_phen$submitter_id.samples
BRCA_phen<-BRCA_phen[,-1]
#find intersection
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
com<-intersect(colnames(BRCA_raw),rownames(BRCA_phen))
BRCA_raw_com = BRCA_raw[,colnames(BRCA_raw) %in% com]
BRCA_phen_com = BRCA_phen[rownames(BRCA_phen) %in% com, ]
#reorder
ord<-colnames(BRCA_raw_com)
BRCA_phen_com<-BRCA_phen_com[order(match(rownames(BRCA_phen_com),ord)),]
#show matching
identical(rownames(BRCA_phen_com),colnames(BRCA_raw_com))
## [1] TRUE
df<-cbind(rownames(BRCA_phen_com),colnames(BRCA_raw_com))
df[1:6,]
## [,1] [,2]
## [1,] "TCGA.E9.A1NI.01A" "TCGA.E9.A1NI.01A"
## [2,] "TCGA.A1.A0SP.01A" "TCGA.A1.A0SP.01A"
## [3,] "TCGA.BH.A201.01A" "TCGA.BH.A201.01A"
## [4,] "TCGA.E2.A14T.01A" "TCGA.E2.A14T.01A"
## [5,] "TCGA.AC.A8OS.01A" "TCGA.AC.A8OS.01A"
## [6,] "TCGA.A8.A09K.01A" "TCGA.A8.A09K.01A"