Untitled

count_matrix <-readRDS("F:/BAI HC/Research 2/Covid19/count_matrix_file.RDS")
patient_info <- readRDS("F:/BAI HC/Research 2/Covid19/patient_info_0404.RDS")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# read in all the expression estimate files
 # List of file names
setwd("F:/BAI HC/Research 2/Expression_estimate")

ex_file <- list.files(pattern="*results")

ex_file

##  [1] "Wuhan_case_1_rsem.genes.results"          
##  [2] "Wuhan_case_2_rsem.genes.results"          
##  [3] "Wuhan_case_3_rsem.genes.results"          
##  [4] "Wuhan_case_4_rsem.genes.results"          
##  [5] "Wuhan_case_5_rsem.genes.results"          
##  [6] "Wuhan_case_6_rsem.genes.results"          
##  [7] "Wuhan_case_7_rsem.genes.results"          
##  [8] "Wuhan_case_8_rsem.genes.results"          
##  [9] "Wuhan_case_9_rsem.genes.results"          
## [10] "Wuhan_case_colon_1_rsem.genes.results"    
## [11] "Wuhan_case_colon_2_rsem.genes.results"    
## [12] "Wuhan_case_colon_3_rsem.genes.results"    
## [13] "Wuhan_case_colon_4_rsem.genes.results"    
## [14] "Wuhan_case_colon_5_rsem.genes.results"    
## [15] "Wuhan_case_colon_6_rsem.genes.results"    
## [16] "Wuhan_case_colon_7_rsem.genes.results"    
## [17] "Wuhan_case_colon_8_rsem.genes.results"    
## [18] "Wuhan_case_colon_9_rsem.genes.results"    
## [19] "Wuhan_control_1_rsem.genes.results"       
## [20] "Wuhan_control_10_rsem.genes.results"      
## [21] "Wuhan_control_2_rsem.genes.results"       
## [22] "Wuhan_control_3_rsem.genes.results"       
## [23] "Wuhan_control_4_rsem.genes.results"       
## [24] "Wuhan_control_5_rsem.genes.results"       
## [25] "Wuhan_control_6_rsem.genes.results"       
## [26] "Wuhan_control_7_rsem.genes.results"       
## [27] "Wuhan_control_8_rsem.genes.results"       
## [28] "Wuhan_control_9_rsem.genes.results"       
## [29] "Wuhan_control_colon_1_rsem.genes.results" 
## [30] "Wuhan_control_colon_10_rsem.genes.results"
## [31] "Wuhan_control_colon_2_rsem.genes.results" 
## [32] "Wuhan_control_colon_3_rsem.genes.results" 
## [33] "Wuhan_control_colon_4_rsem.genes.results" 
## [34] "Wuhan_control_colon_5_rsem.genes.results" 
## [35] "Wuhan_control_colon_6_rsem.genes.results" 
## [36] "Wuhan_control_colon_7_rsem.genes.results" 
## [37] "Wuhan_control_colon_8_rsem.genes.results" 
## [38] "Wuhan_control_colon_9_rsem.genes.results"

Notice “Wuhan_control_10_rsem.genes.results” and “Wuhan_control_colon_10_rsem.genes.results” order in the ex_file vector .

setwd("F:/BAI HC/Research 2/Expression_estimate")

# Read in all the files
data_i <- list()
for (i in 1:38) {
data_i[[i]] <-read.table(ex_file[i])
}

# Select only the first two columns in each dataframe in the list
data_i_df <-lapply(data_i, '[', c(1,6))
data_df <- data_i_df %>%
as.data.frame()%>%
cbind()

# Remove the first four rows
data_df <- data_df[2:60684,]

nrow(data_df)

## [1] 60683

ncol(data_df)

## [1] 76

# Change order of the column for "wuhan_control_10" and "wuhan_control_colon_10"
data_adjusted <- data_df[, c(seq(from= 1, to= 38, by =1),
                             seq(from=41, to =56, by =1), 39, 40, 
                             57, 58, seq(from=61, to = 76, by =1), 59,60)]

head(data_adjusted, 2)

##                V1   V6            V1.1 V6.1            V1.2 V6.2
## 2 ENSG00000000003 1.95 ENSG00000000003 1.12 ENSG00000000003 9.39
## 3 ENSG00000000005 0.00 ENSG00000000005 0.00 ENSG00000000005 0.05
##              V1.3  V6.3            V1.4 V6.4            V1.5  V6.5
## 2 ENSG00000000003 10.59 ENSG00000000003 1.96 ENSG00000000003 11.42
## 3 ENSG00000000005  0.12 ENSG00000000005 0.00 ENSG00000000005  0.00
##              V1.6 V6.6            V1.7 V6.7            V1.8 V6.8
## 2 ENSG00000000003 7.53 ENSG00000000003 6.03 ENSG00000000003 3.27
## 3 ENSG00000000005 0.00 ENSG00000000005 0.18 ENSG00000000005 0.73
##              V1.9 V6.9           V1.10 V6.10           V1.11 V6.11
## 2 ENSG00000000003 1.84 ENSG00000000003 55.29 ENSG00000000003 14.89
## 3 ENSG00000000005 0.20 ENSG00000000005  0.05 ENSG00000000005  0.00
##             V1.12 V6.12           V1.13 V6.13           V1.14 V6.14
## 2 ENSG00000000003  2.92 ENSG00000000003  0.70 ENSG00000000003  0.18
## 3 ENSG00000000005  0.48 ENSG00000000005  0.00 ENSG00000000005  0.00
##             V1.15 V6.15           V1.16 V6.16           V1.17 V6.17
## 2 ENSG00000000003  7.34 ENSG00000000003  1.51 ENSG00000000003  1.30
## 3 ENSG00000000005  0.02 ENSG00000000005  0.15 ENSG00000000005  0.00
##             V1.18 V6.18           V1.20 V6.20           V1.21 V6.21
## 2 ENSG00000000003  4.78 ENSG00000000003  5.39 ENSG00000000003  7.04
## 3 ENSG00000000005  0.20 ENSG00000000005  0.03 ENSG00000000005  0.16
##             V1.22 V6.22           V1.23 V6.23           V1.24 V6.24
## 2 ENSG00000000003 50.64 ENSG00000000003  6.26 ENSG00000000003  7.04
## 3 ENSG00000000005  0.00 ENSG00000000005  0.11 ENSG00000000005  0.50
##             V1.25 V6.25           V1.26 V6.26           V1.27 V6.27
## 2 ENSG00000000003  2.85 ENSG00000000003  2.28 ENSG00000000003  8.98
## 3 ENSG00000000005  0.26 ENSG00000000005  0.28 ENSG00000000005  1.01
##             V1.19 V6.19           V1.28 V6.28           V1.30 V6.30
## 2 ENSG00000000003  5.83 ENSG00000000003  5.29 ENSG00000000003  3.98
## 3 ENSG00000000005  0.72 ENSG00000000005  0.08 ENSG00000000005  0.13
##             V1.31 V6.31           V1.32 V6.32           V1.33 V6.33
## 2 ENSG00000000003  5.37 ENSG00000000003  4.17 ENSG00000000003  5.54
## 3 ENSG00000000005  0.76 ENSG00000000005  0.03 ENSG00000000005  0.39
##             V1.34 V6.34           V1.35 V6.35           V1.36 V6.36
## 2 ENSG00000000003 20.23 ENSG00000000003  4.10 ENSG00000000003 16.53
## 3 ENSG00000000005  0.20 ENSG00000000005  0.12 ENSG00000000005  0.49
##             V1.37 V6.37           V1.29 V6.29
## 2 ENSG00000000003 15.05 ENSG00000000003 11.76
## 3 ENSG00000000005  0.48 ENSG00000000005  1.18

# Extract only 1 gene ID column and the TPM column
data_df_clean <- data_adjusted[, c(1,seq(from = 2, to = 76, by = 2))]

# Check if the TPM matrix has the same number of genes with the count_matrix (60683)
nrow(data_df_clean)

## [1] 60683

rownames(data_df_clean) <-data_df_clean$V1
data_df_clean <- data_df_clean[, 2:39]

# Set column name for the TPM matrix
wuhan_case <-paste("Wuhan_case", sep = "_", seq(from=1, to=9, by = 1))
wuhan_colon <- paste("Wuhan_case_colon", sep = "_", seq(from =1, to = 9, by =1))
wuhan_control <- paste("Wuhan_control", sep = "_", seq(from =1, to = 10, by =1))
wuhan_control_colon <- paste("Wuhan_control_colon", sep= "_", seq(from=1, to =10, by =1))

column_name_1 <- c(wuhan_case, wuhan_colon, wuhan_control, wuhan_control_colon)
column_name_1

##  [1] "Wuhan_case_1"           "Wuhan_case_2"           "Wuhan_case_3"          
##  [4] "Wuhan_case_4"           "Wuhan_case_5"           "Wuhan_case_6"          
##  [7] "Wuhan_case_7"           "Wuhan_case_8"           "Wuhan_case_9"          
## [10] "Wuhan_case_colon_1"     "Wuhan_case_colon_2"     "Wuhan_case_colon_3"    
## [13] "Wuhan_case_colon_4"     "Wuhan_case_colon_5"     "Wuhan_case_colon_6"    
## [16] "Wuhan_case_colon_7"     "Wuhan_case_colon_8"     "Wuhan_case_colon_9"    
## [19] "Wuhan_control_1"        "Wuhan_control_2"        "Wuhan_control_3"       
## [22] "Wuhan_control_4"        "Wuhan_control_5"        "Wuhan_control_6"       
## [25] "Wuhan_control_7"        "Wuhan_control_8"        "Wuhan_control_9"       
## [28] "Wuhan_control_10"       "Wuhan_control_colon_1"  "Wuhan_control_colon_2" 
## [31] "Wuhan_control_colon_3"  "Wuhan_control_colon_4"  "Wuhan_control_colon_5" 
## [34] "Wuhan_control_colon_6"  "Wuhan_control_colon_7"  "Wuhan_control_colon_8" 
## [37] "Wuhan_control_colon_9"  "Wuhan_control_colon_10"

# Change column name
colnames(data_df_clean) <- column_name_1

# Change the column order in the TPM matrix according to the column
#order of the real count matrix

sam_info_name <- patient_info$sample_name

data_df_clean_changed <-data_df_clean[, sam_info_name]

# Rearrange the order of the gene ID in the TPM matrix 
#according to row name of the count matrix

geneID <- rownames(count_matrix)
data_df_clean_changed<-data_df_clean_changed[geneID,]

# Rename the column
colnames(data_df_clean_changed) <- patient_info$Run
tpm_df <- data_df_clean_changed

head(tpm_df, 3)

##                 SRR12816720 SRR12816722 SRR12816727 SRR12816730 SRR12816733
## ENSG00000223972        0.00        0.00        0.00        0.09        0.00
## ENSG00000227232        0.00        0.00        0.00        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12951211 SRR12951215 SRR12951216 SRR12951224 SRR12816718
## ENSG00000223972        0.00        0.00        0.00        0.21        0.17
## ENSG00000227232        0.00        0.00        0.00        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12816719 SRR12816721 SRR12816723 SRR12816724 SRR12816725
## ENSG00000223972        0.00        0.00        0.00        0.00        0.00
## ENSG00000227232        0.00        0.00        0.22        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12816726 SRR12816728 SRR12816729 SRR12816731 SRR12816732
## ENSG00000223972        0.00        0.00        0.00        0.00        0.00
## ENSG00000227232        0.00        0.00        0.00        0.15        0.29
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12816734 SRR12816735 SRR12816736 SRR12951212 SRR12951213
## ENSG00000223972        0.00        0.00        0.00        0.00        0.00
## ENSG00000227232        0.78        0.00        0.00        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12951214 SRR12951217 SRR12951218 SRR12951219 SRR12951220
## ENSG00000223972        0.00        0.00        0.00        0.00        0.00
## ENSG00000227232        0.00        0.00        0.15        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12951221 SRR12951222 SRR12951223 SRR12951225 SRR12951226
## ENSG00000223972        0.00        0.00        0.00        0.23        0.00
## ENSG00000227232        0.00        0.00        0.00        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00        0.00        0.00
##                 SRR12951227 SRR12951228 SRR12951229
## ENSG00000223972        0.00        0.00        0.00
## ENSG00000227232        0.00        0.00        0.00
## ENSG00000278267        0.00        0.00        0.00

head(count_matrix, 3)

##                 SRR12816720 SRR12816722 SRR12816727 SRR12816730 SRR12816733
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           0           0           0           0           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12951211 SRR12951215 SRR12951216 SRR12951224 SRR12816718
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           0           0           0           1           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12816719 SRR12816721 SRR12816723 SRR12816724 SRR12816725
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           1           0           0           0           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12816726 SRR12816728 SRR12816729 SRR12816731 SRR12816732
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           0           0           0           0           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12816734 SRR12816735 SRR12816736 SRR12951212 SRR12951213
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           0           0           0           0           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12951214 SRR12951217 SRR12951218 SRR12951219 SRR12951220
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           0           0           0           0           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12951221 SRR12951222 SRR12951223 SRR12951225 SRR12951226
## ENSG00000223972           0           0           0           0           0
## ENSG00000227232           0           0           0           0           0
## ENSG00000278267           0           0           0           0           0
##                 SRR12951227 SRR12951228 SRR12951229
## ENSG00000223972           0           0           0
## ENSG00000227232           0           0           2
## ENSG00000278267           0           0           0

library(SummarizedExperiment)

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:purrr':
## 
##     reduce

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

two_matrix <- list(count_matrix,tpm_df)

# SummarizedExperiment object with count matrix and tpm_matrix

tpm_and_real_sum <- SummarizedExperiment::SummarizedExperiment(two_matrix, colData = patient_info)

tpm_and_real_sum

## class: SummarizedExperiment 
## dim: 60683 38 
## metadata(0):
## assays(2): '' ''
## rownames(60683): ENSG00000223972 ENSG00000227232 ... ENSG00000277475
##   ENSG00000268674
## rowData names(0):
## colnames(38): SRR12816720 SRR12816722 ... SRR12951228 SRR12951229
## colData names(30): Run AGE ... Tissue condition

Untitled

Phuc

5/14/2021