count_matrix <-readRDS("F:/BAI HC/Research 2/Covid19/count_matrix_file.RDS")
patient_info <- readRDS("F:/BAI HC/Research 2/Covid19/patient_info_0404.RDS")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# read in all the expression estimate files
# List of file names
setwd("F:/BAI HC/Research 2/Expression_estimate")
ex_file <- list.files(pattern="*results")
ex_file
## [1] "Wuhan_case_1_rsem.genes.results"
## [2] "Wuhan_case_2_rsem.genes.results"
## [3] "Wuhan_case_3_rsem.genes.results"
## [4] "Wuhan_case_4_rsem.genes.results"
## [5] "Wuhan_case_5_rsem.genes.results"
## [6] "Wuhan_case_6_rsem.genes.results"
## [7] "Wuhan_case_7_rsem.genes.results"
## [8] "Wuhan_case_8_rsem.genes.results"
## [9] "Wuhan_case_9_rsem.genes.results"
## [10] "Wuhan_case_colon_1_rsem.genes.results"
## [11] "Wuhan_case_colon_2_rsem.genes.results"
## [12] "Wuhan_case_colon_3_rsem.genes.results"
## [13] "Wuhan_case_colon_4_rsem.genes.results"
## [14] "Wuhan_case_colon_5_rsem.genes.results"
## [15] "Wuhan_case_colon_6_rsem.genes.results"
## [16] "Wuhan_case_colon_7_rsem.genes.results"
## [17] "Wuhan_case_colon_8_rsem.genes.results"
## [18] "Wuhan_case_colon_9_rsem.genes.results"
## [19] "Wuhan_control_1_rsem.genes.results"
## [20] "Wuhan_control_10_rsem.genes.results"
## [21] "Wuhan_control_2_rsem.genes.results"
## [22] "Wuhan_control_3_rsem.genes.results"
## [23] "Wuhan_control_4_rsem.genes.results"
## [24] "Wuhan_control_5_rsem.genes.results"
## [25] "Wuhan_control_6_rsem.genes.results"
## [26] "Wuhan_control_7_rsem.genes.results"
## [27] "Wuhan_control_8_rsem.genes.results"
## [28] "Wuhan_control_9_rsem.genes.results"
## [29] "Wuhan_control_colon_1_rsem.genes.results"
## [30] "Wuhan_control_colon_10_rsem.genes.results"
## [31] "Wuhan_control_colon_2_rsem.genes.results"
## [32] "Wuhan_control_colon_3_rsem.genes.results"
## [33] "Wuhan_control_colon_4_rsem.genes.results"
## [34] "Wuhan_control_colon_5_rsem.genes.results"
## [35] "Wuhan_control_colon_6_rsem.genes.results"
## [36] "Wuhan_control_colon_7_rsem.genes.results"
## [37] "Wuhan_control_colon_8_rsem.genes.results"
## [38] "Wuhan_control_colon_9_rsem.genes.results"
Notice “Wuhan_control_10_rsem.genes.results” and “Wuhan_control_colon_10_rsem.genes.results” order in the ex_file vector .
setwd("F:/BAI HC/Research 2/Expression_estimate")
# Read in all the files
data_i <- list()
for (i in 1:38) {
data_i[[i]] <-read.table(ex_file[i])
}
# Select only the first two columns in each dataframe in the list
data_i_df <-lapply(data_i, '[', c(1,6))
data_df <- data_i_df %>%
as.data.frame()%>%
cbind()
# Remove the first four rows
data_df <- data_df[2:60684,]
nrow(data_df)
## [1] 60683
ncol(data_df)
## [1] 76
# Change order of the column for "wuhan_control_10" and "wuhan_control_colon_10"
data_adjusted <- data_df[, c(seq(from= 1, to= 38, by =1),
seq(from=41, to =56, by =1), 39, 40,
57, 58, seq(from=61, to = 76, by =1), 59,60)]
head(data_adjusted, 2)
## V1 V6 V1.1 V6.1 V1.2 V6.2
## 2 ENSG00000000003 1.95 ENSG00000000003 1.12 ENSG00000000003 9.39
## 3 ENSG00000000005 0.00 ENSG00000000005 0.00 ENSG00000000005 0.05
## V1.3 V6.3 V1.4 V6.4 V1.5 V6.5
## 2 ENSG00000000003 10.59 ENSG00000000003 1.96 ENSG00000000003 11.42
## 3 ENSG00000000005 0.12 ENSG00000000005 0.00 ENSG00000000005 0.00
## V1.6 V6.6 V1.7 V6.7 V1.8 V6.8
## 2 ENSG00000000003 7.53 ENSG00000000003 6.03 ENSG00000000003 3.27
## 3 ENSG00000000005 0.00 ENSG00000000005 0.18 ENSG00000000005 0.73
## V1.9 V6.9 V1.10 V6.10 V1.11 V6.11
## 2 ENSG00000000003 1.84 ENSG00000000003 55.29 ENSG00000000003 14.89
## 3 ENSG00000000005 0.20 ENSG00000000005 0.05 ENSG00000000005 0.00
## V1.12 V6.12 V1.13 V6.13 V1.14 V6.14
## 2 ENSG00000000003 2.92 ENSG00000000003 0.70 ENSG00000000003 0.18
## 3 ENSG00000000005 0.48 ENSG00000000005 0.00 ENSG00000000005 0.00
## V1.15 V6.15 V1.16 V6.16 V1.17 V6.17
## 2 ENSG00000000003 7.34 ENSG00000000003 1.51 ENSG00000000003 1.30
## 3 ENSG00000000005 0.02 ENSG00000000005 0.15 ENSG00000000005 0.00
## V1.18 V6.18 V1.20 V6.20 V1.21 V6.21
## 2 ENSG00000000003 4.78 ENSG00000000003 5.39 ENSG00000000003 7.04
## 3 ENSG00000000005 0.20 ENSG00000000005 0.03 ENSG00000000005 0.16
## V1.22 V6.22 V1.23 V6.23 V1.24 V6.24
## 2 ENSG00000000003 50.64 ENSG00000000003 6.26 ENSG00000000003 7.04
## 3 ENSG00000000005 0.00 ENSG00000000005 0.11 ENSG00000000005 0.50
## V1.25 V6.25 V1.26 V6.26 V1.27 V6.27
## 2 ENSG00000000003 2.85 ENSG00000000003 2.28 ENSG00000000003 8.98
## 3 ENSG00000000005 0.26 ENSG00000000005 0.28 ENSG00000000005 1.01
## V1.19 V6.19 V1.28 V6.28 V1.30 V6.30
## 2 ENSG00000000003 5.83 ENSG00000000003 5.29 ENSG00000000003 3.98
## 3 ENSG00000000005 0.72 ENSG00000000005 0.08 ENSG00000000005 0.13
## V1.31 V6.31 V1.32 V6.32 V1.33 V6.33
## 2 ENSG00000000003 5.37 ENSG00000000003 4.17 ENSG00000000003 5.54
## 3 ENSG00000000005 0.76 ENSG00000000005 0.03 ENSG00000000005 0.39
## V1.34 V6.34 V1.35 V6.35 V1.36 V6.36
## 2 ENSG00000000003 20.23 ENSG00000000003 4.10 ENSG00000000003 16.53
## 3 ENSG00000000005 0.20 ENSG00000000005 0.12 ENSG00000000005 0.49
## V1.37 V6.37 V1.29 V6.29
## 2 ENSG00000000003 15.05 ENSG00000000003 11.76
## 3 ENSG00000000005 0.48 ENSG00000000005 1.18
# Extract only 1 gene ID column and the TPM column
data_df_clean <- data_adjusted[, c(1,seq(from = 2, to = 76, by = 2))]
# Check if the TPM matrix has the same number of genes with the count_matrix (60683)
nrow(data_df_clean)
## [1] 60683
rownames(data_df_clean) <-data_df_clean$V1
data_df_clean <- data_df_clean[, 2:39]
# Set column name for the TPM matrix
wuhan_case <-paste("Wuhan_case", sep = "_", seq(from=1, to=9, by = 1))
wuhan_colon <- paste("Wuhan_case_colon", sep = "_", seq(from =1, to = 9, by =1))
wuhan_control <- paste("Wuhan_control", sep = "_", seq(from =1, to = 10, by =1))
wuhan_control_colon <- paste("Wuhan_control_colon", sep= "_", seq(from=1, to =10, by =1))
column_name_1 <- c(wuhan_case, wuhan_colon, wuhan_control, wuhan_control_colon)
column_name_1
## [1] "Wuhan_case_1" "Wuhan_case_2" "Wuhan_case_3"
## [4] "Wuhan_case_4" "Wuhan_case_5" "Wuhan_case_6"
## [7] "Wuhan_case_7" "Wuhan_case_8" "Wuhan_case_9"
## [10] "Wuhan_case_colon_1" "Wuhan_case_colon_2" "Wuhan_case_colon_3"
## [13] "Wuhan_case_colon_4" "Wuhan_case_colon_5" "Wuhan_case_colon_6"
## [16] "Wuhan_case_colon_7" "Wuhan_case_colon_8" "Wuhan_case_colon_9"
## [19] "Wuhan_control_1" "Wuhan_control_2" "Wuhan_control_3"
## [22] "Wuhan_control_4" "Wuhan_control_5" "Wuhan_control_6"
## [25] "Wuhan_control_7" "Wuhan_control_8" "Wuhan_control_9"
## [28] "Wuhan_control_10" "Wuhan_control_colon_1" "Wuhan_control_colon_2"
## [31] "Wuhan_control_colon_3" "Wuhan_control_colon_4" "Wuhan_control_colon_5"
## [34] "Wuhan_control_colon_6" "Wuhan_control_colon_7" "Wuhan_control_colon_8"
## [37] "Wuhan_control_colon_9" "Wuhan_control_colon_10"
# Change column name
colnames(data_df_clean) <- column_name_1
# Change the column order in the TPM matrix according to the column
#order of the real count matrix
sam_info_name <- patient_info$sample_name
data_df_clean_changed <-data_df_clean[, sam_info_name]
# Rearrange the order of the gene ID in the TPM matrix
#according to row name of the count matrix
geneID <- rownames(count_matrix)
data_df_clean_changed<-data_df_clean_changed[geneID,]
# Rename the column
colnames(data_df_clean_changed) <- patient_info$Run
tpm_df <- data_df_clean_changed
head(tpm_df, 3)
## SRR12816720 SRR12816722 SRR12816727 SRR12816730 SRR12816733
## ENSG00000223972 0.00 0.00 0.00 0.09 0.00
## ENSG00000227232 0.00 0.00 0.00 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12951211 SRR12951215 SRR12951216 SRR12951224 SRR12816718
## ENSG00000223972 0.00 0.00 0.00 0.21 0.17
## ENSG00000227232 0.00 0.00 0.00 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12816719 SRR12816721 SRR12816723 SRR12816724 SRR12816725
## ENSG00000223972 0.00 0.00 0.00 0.00 0.00
## ENSG00000227232 0.00 0.00 0.22 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12816726 SRR12816728 SRR12816729 SRR12816731 SRR12816732
## ENSG00000223972 0.00 0.00 0.00 0.00 0.00
## ENSG00000227232 0.00 0.00 0.00 0.15 0.29
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12816734 SRR12816735 SRR12816736 SRR12951212 SRR12951213
## ENSG00000223972 0.00 0.00 0.00 0.00 0.00
## ENSG00000227232 0.78 0.00 0.00 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12951214 SRR12951217 SRR12951218 SRR12951219 SRR12951220
## ENSG00000223972 0.00 0.00 0.00 0.00 0.00
## ENSG00000227232 0.00 0.00 0.15 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12951221 SRR12951222 SRR12951223 SRR12951225 SRR12951226
## ENSG00000223972 0.00 0.00 0.00 0.23 0.00
## ENSG00000227232 0.00 0.00 0.00 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00 0.00 0.00
## SRR12951227 SRR12951228 SRR12951229
## ENSG00000223972 0.00 0.00 0.00
## ENSG00000227232 0.00 0.00 0.00
## ENSG00000278267 0.00 0.00 0.00
head(count_matrix, 3)
## SRR12816720 SRR12816722 SRR12816727 SRR12816730 SRR12816733
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 0 0 0 0 0
## ENSG00000278267 0 0 0 0 0
## SRR12951211 SRR12951215 SRR12951216 SRR12951224 SRR12816718
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 0 0 0 1 0
## ENSG00000278267 0 0 0 0 0
## SRR12816719 SRR12816721 SRR12816723 SRR12816724 SRR12816725
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 1 0 0 0 0
## ENSG00000278267 0 0 0 0 0
## SRR12816726 SRR12816728 SRR12816729 SRR12816731 SRR12816732
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 0 0 0 0 0
## ENSG00000278267 0 0 0 0 0
## SRR12816734 SRR12816735 SRR12816736 SRR12951212 SRR12951213
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 0 0 0 0 0
## ENSG00000278267 0 0 0 0 0
## SRR12951214 SRR12951217 SRR12951218 SRR12951219 SRR12951220
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 0 0 0 0 0
## ENSG00000278267 0 0 0 0 0
## SRR12951221 SRR12951222 SRR12951223 SRR12951225 SRR12951226
## ENSG00000223972 0 0 0 0 0
## ENSG00000227232 0 0 0 0 0
## ENSG00000278267 0 0 0 0 0
## SRR12951227 SRR12951228 SRR12951229
## ENSG00000223972 0 0 0
## ENSG00000227232 0 0 2
## ENSG00000278267 0 0 0
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:purrr':
##
## reduce
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
two_matrix <- list(count_matrix,tpm_df)
# SummarizedExperiment object with count matrix and tpm_matrix
tpm_and_real_sum <- SummarizedExperiment::SummarizedExperiment(two_matrix, colData = patient_info)
tpm_and_real_sum
## class: SummarizedExperiment
## dim: 60683 38
## metadata(0):
## assays(2): '' ''
## rownames(60683): ENSG00000223972 ENSG00000227232 ... ENSG00000277475
## ENSG00000268674
## rowData names(0):
## colnames(38): SRR12816720 SRR12816722 ... SRR12951228 SRR12951229
## colData names(30): Run AGE ... Tissue condition