DATA PREP

dta_69284 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69284.txt", skip = 17, header = T)
dta_02340 = read.delim("C:/Users/beste/Downloads/raw-3/raw/2340.txt", skip = 17, header = T)
dta_33091 = read.delim("C:/Users/beste/Downloads/raw-3/raw/33091.txt", skip = 18, header = T)
dta_69976 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69976.txt", skip = 19, header = T)
dta_69971 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69971.txt", skip = 19, header = T)
dta_69973 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69973.txt", skip = 19, header = T)
dta_64995 = read.delim("C:/Users/beste/Downloads/raw-3/raw/64995.txt", skip = 17, header = T)
dta_48033 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48033.txt", skip = 17, header = T)
dta_48035 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48035.txt", skip = 17, header = T)
dta_69980 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69980.txt", skip = 19, header = T)
dta_69977 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69977.txt", skip = 19, header = T)
dta_69972 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69972.txt", skip = 19, header = T)
dta_48038 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48038.txt", skip = 17, header = T)
dta_64992 = read.delim("C:/Users/beste/Downloads/raw-3/raw/64992.txt", skip = 17, header = T)
dta_69659 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69659.txt", skip = 17, header = T)
dta_69974 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69974.txt", skip = 19, header = T)
dta_70239 = read.delim("C:/Users/beste/Downloads/raw-3/raw/70239.txt", skip = 19, header = T)
dta_65000 = read.delim("C:/Users/beste/Downloads/raw-3/raw/65000GENEPIX0.txt", skip = 0, header = T)
dta_65003 = read.delim("C:/Users/beste/Downloads/raw-3/raw/65003.txt", skip = 17, header = T)
dta_33093 = read.delim("C:/Users/beste/Downloads/raw-3/raw/33093.txt", skip = 18, header = T)
dta_48031 = read.delim("C:/Users/beste/Downloads/raw-3/raw/48031.txt", skip = 17, header = T)
dta_69979 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69979.txt", skip = 19, header = T)
dta_69981 = read.delim("C:/Users/beste/Downloads/raw-3/raw/69981.txt", skip = 19, header = T)
n=23

meta = data.frame("strain" = rep(NA, n), "tech" = rep(NA, n), "year" = rep(NA, n))
meta$strain = c("control", "ale", "control", rep("fuel", 3), "bakers", "wine", "wine", 
                rep("fuel", 3), "wine", "bakers", "control", "fuel", "fuel", "bakers", "bakers", 
                "control", "wine", "fuel", "fuel")
meta$tech = c("dunn", rep("unknown", 2), rep("stambuk", 3), rep("dunn", 3), 
              rep("stambuk", 3), rep("dunn", 3), rep("stambuk", 2), "dunn", "dunn", "unknown", 
              "dunn", rep("stambuk", 2))
meta$year = c(2006, 1999, "unknown", rep(2006, 3), 2005, 2004, 2004, rep(2006, 3), 2004, 
              2005, rep(2006, 3), 2005, 2005, "unknown", 2004, 2006, 2006)

keep_ii = (1:n)[-c(2, 3, 20)]
meta = meta[keep_ii, ]; rownames(meta) = 1:20
n = 20

all_spots = table(c(dta_69284[,1], dta_69976[,1], dta_69971[,1], dta_69973[,1], 
                    dta_64995[,1], dta_48033[,1], dta_48035[,1], dta_69980[,1], dta_69977[,1], 
                    dta_69972[,1], dta_48038[,1], dta_64992[,1], dta_69659[,1], dta_69974[,1],
                    dta_70239[,1], dta_65000[,1], dta_65003[,1], dta_48031[,1], dta_69979[,1],
                    dta_69981[,1]))
uniq_spots = as.numeric(names(all_spots))
shared_ii = (1:length(uniq_spots))[all_spots == n]
shared_spots = uniq_spots[shared_ii]
m_shared = length(shared_spots)

## Keep only the data for the shared features. Leaves m = 6740 features.
dta_69284 = dta_69284[dta_69284[, 1] %in% shared_spots, ]
dta_69976 = dta_69976[dta_69976[, 1] %in% shared_spots, ]
dta_69971 = dta_69971[dta_69971[, 1] %in% shared_spots, ]
dta_69973 = dta_69973[dta_69973[, 1] %in% shared_spots, ]
dta_64995 = dta_64995[dta_64995[, 1] %in% shared_spots, ]
dta_48033 = dta_48033[dta_48033[, 1] %in% shared_spots, ]
dta_48035 = dta_48035[dta_48035[, 1] %in% shared_spots, ]
dta_69980 = dta_69980[dta_69980[, 1] %in% shared_spots, ]
dta_69977 = dta_69977[dta_69977[, 1] %in% shared_spots, ]
dta_69972 = dta_69972[dta_69972[, 1] %in% shared_spots, ]
dta_48038 = dta_48038[dta_48038[, 1] %in% shared_spots, ]
dta_64992 = dta_64992[dta_64992[, 1] %in% shared_spots, ]
dta_69659 = dta_69659[dta_69659[, 1] %in% shared_spots, ]
dta_69974 = dta_69974[dta_69974[, 1] %in% shared_spots, ]
dta_70239 = dta_70239[dta_70239[, 1] %in% shared_spots, ]
dta_65000 = dta_65000[dta_65000[, 1] %in% shared_spots, ]
dta_65003 = dta_65003[dta_65003[, 1] %in% shared_spots, ]
dta_48031 = dta_48031[dta_48031[, 1] %in% shared_spots, ]
dta_69979 = dta_69979[dta_69979[, 1] %in% shared_spots, ]
dta_69981 = dta_69981[dta_69981[, 1] %in% shared_spots, ]

m = 6740

## Pull out median intensities by channel, log transform.  
Y_1 = log2(cbind(dta_69284[,46], dta_69976[,45], dta_69971[,45], dta_69973[,45], 
                 dta_64995[,45], dta_48033[,45], dta_48035[,45], dta_69980[,45], dta_69977[,45], 
                 dta_69972[,45], dta_48038[,45], dta_64992[,45], dta_69659[,45], dta_69974[,45], 
                 dta_70239[,45], dta_65000[,45], dta_65003[,45], dta_48031[,45], dta_69979[,45], 
                 dta_69981[,45]))

Y_2 = log2(cbind(dta_69284[,51], dta_69976[,50], dta_69971[,50], dta_69973[,50], 
                 dta_64995[,50], dta_48033[,50], dta_48035[,50], dta_69980[,50], dta_69977[,50], 
                 dta_69972[,50], dta_48038[,50], dta_64992[,50], dta_69659[,50], dta_69974[,50], 
                 dta_70239[,50], dta_65000[,50], dta_65003[,50], dta_48031[,50], dta_69979[,50], 
                 dta_69981[,50]))

Y = Y_2 - Y_1

#1) YEAST DATA ANALYSIS

#1a) Hierarchical clustering dendrograms

par(mfrow=c(1,3))
HC1 = hclust(dist(t(Y)), "average")
plot(HC1)
HC2 = hclust(dist(t(Y)), "complete")
plot(HC2)
HC3 = hclust(dist(t(Y)), "single")
plot(HC3)

#1ai) First merge
HC1$merge

##       [,1] [,2]
##  [1,]   -2   -9
##  [2,]   -6  -18
##  [3,]  -19    1
##  [4,]  -16  -17
##  [5,]  -15    3
##  [6,]   -4  -14
##  [7,]    4    5
##  [8,]   -5  -12
##  [9,]   -7    2
## [10,]    6    7
## [11,]   -3  -10
## [12,]   -8   11
## [13,]  -13    8
## [14,]   -1    9
## [15,]   10   13
## [16,]  -11   14
## [17,]  -20   12
## [18,]   15   17
## [19,]   16   18

HC1$height[1]

## [1] 17.37377

#1ai) The samples that merged first were samples 2 and 9, and the distance between these samples is 17.37377.

#1aii) Height plot

plot(HC1$height, main="Average")

#1aii) For the average linkage function, there appears to be an “elbow” in the height plot at about index 17. This is at a merge distance of about 60. Looking at the dendrogram, this suggests that there are 3 clusters.

#1aiii) Meta data for the clusters

x<-c(rep(1,5),rep(2,11),rep(3,4))
df<-cbind(HC1$order,x)
df2<-df[order(df[,1]),]
meta<-cbind(meta,cluster=df2[,2])
meta[order(meta$cluster),]

##     strain    tech year cluster
## 1  control    dunn 2006       1
## 6     wine    dunn 2004       1
## 7     wine    dunn 2004       1
## 11    wine    dunn 2004       1
## 18    wine    dunn 2004       1
## 2     fuel stambuk 2006       2
## 4     fuel stambuk 2006       2
## 5   bakers    dunn 2005       2
## 9     fuel stambuk 2006       2
## 12  bakers    dunn 2005       2
## 13 control    dunn 2006       2
## 14    fuel stambuk 2006       2
## 15    fuel stambuk 2006       2
## 16  bakers    dunn 2005       2
## 17  bakers    dunn 2005       2
## 19    fuel stambuk 2006       2
## 3     fuel stambuk 2006       3
## 8     fuel stambuk 2006       3
## 10    fuel stambuk 2006       3
## 20    fuel stambuk 2006       3

par(mfrow=c(1,2))
boxplot(Y_1, main = "Control channels")

## Mean log intensity ratios (Ch2 : Ch1) by strain. 
X_str = model.matrix(~ factor(meta$strain) - 1)
Y_bar_str = t(t(Y %*% X_str) / as.numeric(table(meta$strain)))
boxplot(Y_bar_str[, 2], Y_bar_str[, 3], Y_bar_str[, 1], Y_bar_str[, 4], 
        names = c("Control", "Fuel", "Bakers", "Wine"), main = "Feature means by strain")

## Now by technician.  
X_tech = model.matrix(~ factor(meta$tech) - 1)
Y_bar_tech = t(t(Y %*% X_tech) / as.numeric(table(meta$tech)))
boxplot(Y_bar_tech[, 1], Y_bar_tech[, 2], names = c("Dunn", "Stambuk"), main = 
          "Feature means by technician")

## And by year.
X_year = model.matrix(~ factor(meta$year) - 1)
Y_bar_year = t(t(Y %*% X_year) / as.numeric(table(meta$year)))
boxplot(Y_bar_year[, 1], Y_bar_year[, 2], Y_bar_year[, 3], names = c("2004", "2005", 
                                                                     "2006"), main = "Feature means by year")

#1aiii) The strain and technician both appear to play a strong role in determining the cluster. Cluster 1 consists of samples processed by only Dunn, and 4 out of the 5 samples where from the wine strain. Cluster 3 consists of samples processed by only Stambuk, and they are all from the fuel strain. It is difficult to separate the effects of strain vs. technician since the wine, bakers, and control strains were used by only Dunn and the fuel strain was used by only Stambuk. Cluster 2 is a mix of both technicians and several of the strains.

#1b) K-means clustering

km<-kmeans(dist(t(Y)),3)
km

## K-means clustering with 3 clusters of sizes 4, 11, 5
## 
## Cluster means:
##           1        2         3        4        5         6         7         8
## 1 120.51210 52.62688  29.13918 67.83583 94.91882 127.45514 145.64610  34.21302
## 2  65.41009 32.51808  71.94684 31.97844 43.33981  74.38643  90.26437  48.49420
## 3  34.79078 91.30074 137.36832 79.21907 63.45656  28.50473  31.74489 111.16084
##          9        10        11       12       13       14       15       16
## 1 60.31700  33.59372 139.68620 78.95923 92.61507 52.21049 58.67631 73.58231
## 2 28.61618  56.25923  86.45536 33.23977 42.23202 33.69062 29.50708 30.16616
## 3 82.05279 118.82275  40.28088 70.08953 55.58452 94.44468 85.77077 74.47636
##         17        18       19        20
## 1 66.35630 130.29557 55.88779  40.30615
## 2 29.36694  76.92072 31.87356  97.47647
## 3 80.50771  28.02288 88.65858 163.52417
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  3  2  1  2  2  3  3  1  2  1  3  2  2  2  2  2  2  3  2  1 
## 
## Within cluster sum of squares by cluster:
## [1] 31221.91 38387.47 14230.00
##  (between_SS / total_SS =  79.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

table(meta$cluster,km$cluster)

##    
##      1  2  3
##   1  0  0  5
##   2  0 11  0
##   3  4  0  0

meta2<-cbind(meta,km$cluster)
meta2[order(meta$cluster),]

##     strain    tech year cluster km$cluster
## 1  control    dunn 2006       1          3
## 6     wine    dunn 2004       1          3
## 7     wine    dunn 2004       1          3
## 11    wine    dunn 2004       1          3
## 18    wine    dunn 2004       1          3
## 2     fuel stambuk 2006       2          2
## 4     fuel stambuk 2006       2          2
## 5   bakers    dunn 2005       2          2
## 9     fuel stambuk 2006       2          2
## 12  bakers    dunn 2005       2          2
## 13 control    dunn 2006       2          2
## 14    fuel stambuk 2006       2          2
## 15    fuel stambuk 2006       2          2
## 16  bakers    dunn 2005       2          2
## 17  bakers    dunn 2005       2          2
## 19    fuel stambuk 2006       2          2
## 3     fuel stambuk 2006       3          1
## 8     fuel stambuk 2006       3          1
## 10    fuel stambuk 2006       3          1
## 20    fuel stambuk 2006       3          1

#1b) Although the labels are different, we can see that the k-means clustering produced the same clusters as the hierarchical clustering using the average linkage function.

#2) CARDIO DATA ANALYSIS

#BiocManager::install("GEOquery")
library(GEOquery)

## Loading required package: Biobase

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Setting options('download.file.method.GEOquery'='auto')

## Setting options('GEOquery.inmemory.gpl'=FALSE)

gds<-getGEO(filename="C:/Users/beste/Downloads/GDS4308.soft",)
loggds<-GDS2MA(gds,do.log2=TRUE)
Z1<-cbind(P6=loggds$M[,1],P1=loggds$M[,2],P5=loggds$M[,3],P4=loggds$M[,4],P3=loggds$M[,5])
Z2<-cbind(P6=loggds$M[,6],P1=loggds$M[,7],P5=loggds$M[,8],P4=loggds$M[,9],P3=loggds$M[,10])
Z<-Z1-Z2

#2a) Hiearchical clustering
par(mfrow=c(1,2))
H4<-hclust(dist(t(Z)),"complete")
plot(H4)
plot(H4$height, main="Complete")

#2a) Looking at the height plot, there appears to be an “elbow” between the 3rd and 4th index starting at a height of about 140. Looking at our dendrogram, this corresponds to a model with two clusters: one cluster is patients 1, 3, 4, and 6 while the other cluster is patient 5. When looking at the meta data, it seems that age is a key factor in the clusters - the first few clusters consist of the oldest patients, and patient 5 is the youngest patient.

#2b) Principal component analysis

gds.pca<-prcomp(t(Z),center=T,scale=T)
gds.pca$x

##           PC1        PC2       PC3         PC4           PC5
## P6 -115.71544   1.073683 -79.89080  128.726298 -7.004446e-13
## P1  -43.39557 165.756557  54.54176  -34.884923 -3.023250e-12
## P5  282.75266  -4.638492 -43.32329    4.201396  1.927066e-12
## P4 -105.11263 -64.950327 -76.74059 -119.851519  7.667220e-13
## P3  -18.52902 -97.241421 145.41291   21.808748  1.014956e-12

summary(gds.pca)

## Importance of components:
##                             PC1      PC2     PC3     PC4       PC5
## Standard deviation     163.2592 101.4548 97.8114 90.3397 8.305e-13
## Proportion of Variance   0.4875   0.1883  0.1750  0.1493 0.000e+00
## Cumulative Proportion    0.4875   0.6757  0.8507  1.0000 1.000e+00

plot(gds.pca$x[,1],gds.pca$x[,2])

#reduce to 2 principal components
dim(Z)

## [1] 54675     5

red<-(Z)%*%gds.pca$x[,1:2]
red[1:5,]

##             PC1        PC2
## [1,]   86.43747  46.603511
## [2,]  346.52273  -9.809721
## [3,] -444.40386  44.178868
## [4,] -170.77970 -16.104913
## [5,] -131.54657  93.110364

#2b) Looking at the principal component weights, there is no single patient that is significantly more influential than the rest; Patients 1 and 3 have relatively small PC1 values but the coefficients are still significant. When we look at the proportion of the variance, PC1 accounts for about 49%. This implies that there are likely several variables (patients) that account for the variability of the data. We do see that the coefficient for patient 1 is a large positive value, while all the rest of the coefficients are negative - this split corresponds with the clusters identified in part (a). Since the first two principal components account for about 68% of the variability, it would be sufficient to reduce the model to two principal components for ease of interpretation.

#2c) SVD

Z_centered = scale(t(Z), center = T, scale = T)
ss = svd(Z_centered)
dd = ss$d ^ 2 / sum(ss$d ^ 2)
round(dd, 4)

## [1] 0.4875 0.1883 0.1750 0.1493 0.0000

#eigenvalues:
e<-ss$d^2
round(e,4)

## [1] 106614.29  41172.34  38268.28  32645.08      0.00

e/sum(e)

## [1] 4.874911e-01 1.882595e-01 1.749807e-01 1.492688e-01 1.261556e-29

#2c) When we find the singular values and find the percentage of variation for each one, we see that the percentages correctly match the variation percentages for the PCA testing. The first eigenfeature accounts for about 49% of the variation.

#2d) Bootstrap t-test

m = nrow(Z)
n = ncol(Z)
## Paired differences.
t_stats <- apply(Z, 1, function(x) { t.test(x)$statistic })
## Bootstrap for statistical significance.
set.seed(101)
B <- 100
p_vals <- numeric(m)
num_na_t <- numeric(m)
for(i in 1:m) {
  if(i == ((i %/% 1000) * 1000))
    cat(i)
  ## To approximate the sampling distribution of the test statistic under the null
  ## hypothesis, we resample after centering to force the mean to be 0.
  d_i_centered <- Z[i, ] - mean(Z[i, ])
  t_b <- rep(NA, B)
  for(b in 1:B) {
    d_b <- sample(d_i_centered, replace = TRUE)
    ## If possible, compute t-statistic.
    if(length(unique(d_b)) > 1)
      t_b[b] <- t.test(d_b)$statistic
  }
  is_na_t <- is.na(t_b)
  num_na_t[i] <- sum(is_na_t)
  t_b_b <- t_b[!is_na_t]
  p_vals[i] <- sum(abs(t_b[!is_na_t]) >= abs(t_stats[i])) / (B - num_na_t[i])
}

## 100020003000400050006000700080009000100001100012000130001400015000160001700018000190002000021000220002300024000250002600027000280002900030000310003200033000340003500036000370003800039000400004100042000430004400045000460004700048000490005000051000520005300054000

hist(p_vals)

fdr <- p.adjust(p_vals, method = "fdr")

sum(p_vals<.05)

## [1] 2070

sum(p_vals<.05)/length(p_vals)

## [1] 0.03786008

sum(fdr<.05)

## [1] 396

sum(fdr<.05)/length(fdr)

## [1] 0.007242798

#2d) Without adjusting for the multiple testing, there are 2070 p=values that are significant, about 3.8% of the features tested. When we adjust for mulitple testing, there are 396 features that are significant at an FDR of 0.05. This is roughly 0.7% of the features that were tested.

#3) DATA MATCHING

BRCA_raw <- read.delim("C:/Users/beste/Downloads/TCGA-BRCA_htseq_rawcounts.tsv")
BRCA_phen <- read.delim("C:/Users/beste/Downloads/TCGA-BRCA.GDC_phenotype-1.tsv")

#examine tables
BRCA_raw[1:5,1:5]

##           Ensembl_ID TCGA.E9.A1NI.01A TCGA.A1.A0SP.01A TCGA.BH.A201.01A
## 1 ENSG00000000003.13         8.787903        12.064743        11.801304
## 2  ENSG00000000005.5         0.000000         2.807355         4.954196
## 3 ENSG00000000419.11        11.054604        11.292897        11.314017
## 4 ENSG00000000457.12        10.246741         9.905387        11.117643
## 5 ENSG00000000460.15         8.965784        10.053926         9.957102
##   TCGA.E2.A14T.01A
## 1        10.723661
## 2         6.658211
## 3        11.214926
## 4        12.093748
## 5         9.503826

BRCA_phen[1:5,1:5]

##   submitter_id.samples additional_pharmaceutical_therapy
## 1     TCGA-A2-A0CY-01A                                  
## 2     TCGA-B6-A40B-01A                                  
## 3     TCGA-AO-A0J8-01A                                  
## 4     TCGA-A8-A08J-01A                                  
## 5     TCGA-E2-A14N-01A                                  
##   additional_radiation_therapy additional_surgery_locoregional_procedure
## 1                                                                       
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
##   additional_surgery_metastatic_procedure
## 1                                        
## 2                                        
## 3                                        
## 4                                        
## 5

#rename rows/cols
rownames(BRCA_raw)<-BRCA_raw[,1]
BRCA_raw<-BRCA_raw[,-1]
BRCA_phen$submitter_id.samples<-gsub("-",".",BRCA_phen$submitter_id.samples)
rownames(BRCA_phen)<-BRCA_phen$submitter_id.samples
BRCA_phen<-BRCA_phen[,-1]

#find intersection
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:Biobase':
## 
##     combine

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

com<-intersect(colnames(BRCA_raw),rownames(BRCA_phen))
BRCA_raw_com = BRCA_raw[,colnames(BRCA_raw) %in% com]
BRCA_phen_com = BRCA_phen[rownames(BRCA_phen) %in% com, ]

#reorder
ord<-colnames(BRCA_raw_com)
BRCA_phen_com<-BRCA_phen_com[order(match(rownames(BRCA_phen_com),ord)),]

#show matching
identical(rownames(BRCA_phen_com),colnames(BRCA_raw_com))

## [1] TRUE

df<-cbind(rownames(BRCA_phen_com),colnames(BRCA_raw_com))
df[1:6,]

##      [,1]               [,2]              
## [1,] "TCGA.E9.A1NI.01A" "TCGA.E9.A1NI.01A"
## [2,] "TCGA.A1.A0SP.01A" "TCGA.A1.A0SP.01A"
## [3,] "TCGA.BH.A201.01A" "TCGA.BH.A201.01A"
## [4,] "TCGA.E2.A14T.01A" "TCGA.E2.A14T.01A"
## [5,] "TCGA.AC.A8OS.01A" "TCGA.AC.A8OS.01A"
## [6,] "TCGA.A8.A09K.01A" "TCGA.A8.A09K.01A"

Stat 646 HW 2

Bailey Stewart

2023-03-05

DATA PREP