Study set:
coldata <- read.delim(paste(wd, "meta.data.table.txt", sep = ""), row.names = 1)
coldata$Condition <- factor(coldata$Condition)
coldata$PatientID <- factor(coldata$PatientID)
coldata
## Condition PatientID Reponse
## P01_S_L1.LB25 Pre-treatment P01 non_reponder
## P01_SOC_L1.LB23 Post-treatment P01 non_reponder
## P03_S_L1.LB20 Pre-treatment P03 non_reponder
## P03_SOC_L2.LB8 Post-treatment P03 non_reponder
## P04_S_L1.LB4 Pre-treatment P04 high_responder
## P04_SOC_L2.LB15 Post-treatment P04 high_responder
## P05_S_L2.LB9 Pre-treatment P05 non_reponder
## P05_SOC_L2.LB16 Post-treatment P05 non_reponder
## P06_S_L2.LB12 Pre-treatment P06 non_reponder
## P06_SOC_L2.LB13 Post-treatment P06 non_reponder
## P07_S_L1.LB5 Pre-treatment P07 non_reponder
## P07_SOC_L2.LB18 Post-treatment P07 non_reponder
## P09_S_L2.LB19 Pre-treatment P09 non_reponder
## P09_SOC_L2.LB14 Post-treatment P09 non_reponder
## P12_S_L1.LB6 Pre-treatment P12 non_reponder
## P12_SOC_L2.LB10 Post-treatment P12 non_reponder
## P14_S_L1.LB19 Pre-treatment P14 non_reponder
## P14_SOC_L2.LB21 Post-treatment P14 non_reponder
## P15_S_L2.LB11 Pre-treatment P15 high_responder
## P15_SOC_L2.LB20 Post-treatment P15 high_responder
## P16_S_L1.LB22 Pre-treatment P16 non_reponder
## P16_SOC_L1.LB21 Post-treatment P16 non_reponder
## P17_S_L1.LB7 Pre-treatment P17 non_reponder
## P17_SOC_L1.LB27 Post-treatment P17 non_reponder
## P18_S_L1.LB1 Pre-treatment P18 non_reponder
## P18_SOC_L1.LB2 Post-treatment P18 non_reponder
## P19_S_L1.LB3 Pre-treatment P19 non_reponder
table(coldata$PatientID)
##
## P01 P03 P04 P05 P06 P07 P09 P12 P14 P15 P16 P17 P18 P19
## 2 2 2 2 2 2 2 2 2 2 2 2 2 1
table(coldata$Condition)
##
## Post-treatment Pre-treatment
## 13 14
Read expression files:
tpm <- read.delim(paste(wd, "salmon.merged.gene_tpm.tsv", sep = ""), header = T, row.names = 1)
#Counts matrix
sscounts <- read.table(paste(wd, "salmon.merged.gene_counts.tsv", sep = ""), row.names = 1, header = T)
sscounts %>% dplyr::select(gene_name) -> geneSymbol.dic #Dictionary for gene symbols
sscounts %>% dplyr::select(-gene_name) -> cts
rm(sscounts)
#TPM matrix
tpm <- read.delim(paste(wd, "salmon.merged.gene_tpm.tsv", sep = ""), header = T, row.names = 1)
#Counts matrix
sscounts <- read.table(paste(wd, "salmon.merged.gene_counts.tsv", sep = ""), row.names = 1, header = T)
t2g1.dic<- sscounts %>% dplyr::select(gene_name) #Dictionary for gene symbols
sscounts %>% dplyr::select(-gene_name) -> cts
rm(sscounts)
Read the new metadata table:
#library(readxl)
sample_background<-readxl::read_excel("/Users/ebrukocakaya/HnN_Sven/updated_sample_overview_FFPE.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...8`
head(sample_background)
## # A tibble: 6 × 8
## `Group Name` ...2 `Pre-treatment` Lokalisation Sublokalisation `p16 Status`
## <chr> <lgl> <chr> <chr> <chr> <chr>
## 1 <NA> NA <NA> <NA> <NA> <NA>
## 2 Specimen 1 NA P01_S Oropharynx Uvula neg
## 3 Specimen 2 NA P03_S Oropharynx Tonsil pos
## 4 Specimen 3 NA P04_S Oropharynx Base of tongue pos
## 5 Specimen 4 NA P05_S Oropharynx Tonsil pos
## 6 Specimen 5 NA P06_S Larynx Supraglottis N/A
## # ℹ 2 more variables: `x-fold CD8/GranB` <chr>, ...8 <chr>
#What to add to the existing metadata from new metadata
#From sample background: Lokalisation Sublokalisation p16 Status x-fold CD8/GranB columns
#From Standard H&E Results Whole Slide Percent Tumor
#From QC TRR Pool DAY
#clean up the sample background df:
sample_background<-sample_background[-1,]
sample_background<-sample_background[,-8]
sample_background<-sample_background[,-2]
sample_background<-data.frame(sample_background)
rownames(sample_background)<-sample_background[,1]
#add Patient ID info to the table:
patid<-c(levels(coldata$PatientID))
sample_background$PatientID<-patid
sample_background
## Group.Name Pre.treatment Lokalisation Sublokalisation
## Specimen 1 Specimen 1 P01_S Oropharynx Uvula
## Specimen 2 Specimen 2 P03_S Oropharynx Tonsil
## Specimen 3 Specimen 3 P04_S Oropharynx Base of tongue
## Specimen 4 Specimen 4 P05_S Oropharynx Tonsil
## Specimen 5 Specimen 5 P06_S Larynx Supraglottis
## Specimen 6 Specimen 6 P07_S Oropharynx Tonsil
## Specimen 7 Specimen 7 P09_S oral cavity tongue
## Specimen 8 Specimen 8 P12_S oral cavity tongue
## Specimen 9 Specimen 9 P14_S Oropharynx Tonsil
## Specimen 10 Specimen 10 P15_S Oropharynx Tonsil
## Specimen 11 Specimen 11 P16 _S Oropharynx posterior pharyngeal wall
## Specimen 12 Specimen 12 P17_S Oropharynx Base of tongue
## Specimen 13 Specimen 13 P18_S Larynx Glottis
## Specimen 14 Specimen 14 P19_S oral cavity Floor of mouth
## p16.Status x.fold.CD8.GranB PatientID
## Specimen 1 neg 2.1998224 P01
## Specimen 2 pos 1.2671452999999999 P03
## Specimen 3 pos high responder P04
## Specimen 4 pos 0.42182770000000003 P05
## Specimen 5 N/A 23.7710422 P06
## Specimen 6 pos 10.010180500000001 P07
## Specimen 7 N/A 1.2193797 P09
## Specimen 8 N/A 6.9578436000000004 P12
## Specimen 9 pos 0.89413869999999995 P14
## Specimen 10 pos high responder P15
## Specimen 11 neg 6.5588876999999997 P16
## Specimen 12 pos 4.2564292999999997 P17
## Specimen 13 N/A 0.34499999999999997 P18
## Specimen 14 N/A high responder P19
#read percetn tumor table:
per_tumor<-readxl::read_excel("/Users/ebrukocakaya/HnN_Sven/Percentage_tumor_in_sample.xlsx")
head(per_tumor)
## # A tibble: 6 × 3
## `P. ID` `Percent tumor in processed sample` PatientID
## <chr> <chr> <chr>
## 1 P01_S 20 P01
## 2 P01_SOC 20 P01
## 3 P03_S 60 P03
## 4 P03_SOC 20 P03
## 5 P04_S 20 P04
## 6 P04_SOC 0 P04
#there is no data on P19_SOC remove row 28
per_tumor<-per_tumor[-28,]
#orders of the coldata metadata and the per tumor is same so just merge them:
coldata$percent_tumor_in_sample<-per_tumor$`Percent tumor in processed sample`
#add the sample ID as well (same order)
coldata$sampleID<-per_tumor$`P. ID`
head(coldata)
## Condition PatientID Reponse percent_tumor_in_sample
## P01_S_L1.LB25 Pre-treatment P01 non_reponder 20
## P01_SOC_L1.LB23 Post-treatment P01 non_reponder 20
## P03_S_L1.LB20 Pre-treatment P03 non_reponder 60
## P03_SOC_L2.LB8 Post-treatment P03 non_reponder 20
## P04_S_L1.LB4 Pre-treatment P04 high_responder 20
## P04_SOC_L2.LB15 Post-treatment P04 high_responder 0
## sampleID
## P01_S_L1.LB25 P01_S
## P01_SOC_L1.LB23 P01_SOC
## P03_S_L1.LB20 P03_S
## P03_SOC_L2.LB8 P03_SOC
## P04_S_L1.LB4 P04_S
## P04_SOC_L2.LB15 P04_SOC
#per_tumor<-excel_sheets(path=/Users/ebrukocakaya/HnN_Sven/Percentage_tumor_in_sample.xlsx)
#add sample background info
#sample_background$Pre.treatment is the same with coldata$sampleID:
colnames(sample_background)[2]<-"sampleID"
#since rownames can get lost at full_join:
#save rownames as another column and then remove them
rows_coldata<-rownames(coldata)
coldata<-full_join(coldata,sample_background,by="sampleID")
rownames(coldata)<-rows_coldata
coldata<-coldata[,-11]
colnames(coldata)[2]<-"PatientID"
head(coldata)
## Condition PatientID Reponse percent_tumor_in_sample
## P01_S_L1.LB25 Pre-treatment P01 non_reponder 20
## P01_SOC_L1.LB23 Post-treatment P01 non_reponder 20
## P03_S_L1.LB20 Pre-treatment P03 non_reponder 60
## P03_SOC_L2.LB8 Post-treatment P03 non_reponder 20
## P04_S_L1.LB4 Pre-treatment P04 high_responder 20
## P04_SOC_L2.LB15 Post-treatment P04 high_responder 0
## sampleID Group.Name Lokalisation Sublokalisation p16.Status
## P01_S_L1.LB25 P01_S Specimen 1 Oropharynx Uvula neg
## P01_SOC_L1.LB23 P01_SOC <NA> <NA> <NA> <NA>
## P03_S_L1.LB20 P03_S Specimen 2 Oropharynx Tonsil pos
## P03_SOC_L2.LB8 P03_SOC <NA> <NA> <NA> <NA>
## P04_S_L1.LB4 P04_S Specimen 3 Oropharynx Base of tongue pos
## P04_SOC_L2.LB15 P04_SOC <NA> <NA> <NA> <NA>
## x.fold.CD8.GranB
## P01_S_L1.LB25 2.1998224
## P01_SOC_L1.LB23 <NA>
## P03_S_L1.LB20 1.2671452999999999
## P03_SOC_L2.LB8 <NA>
## P04_S_L1.LB4 high responder
## P04_SOC_L2.LB15 <NA>
Select only protein coding genes
pcg <- read.delim(paste(wd, "protein_coding_genes.Ensembl.ids.txt", sep = ""), header = F)
tpm %>% dplyr::select(-gene_name) -> tpm
#Keep only Protein coding genes:
#table(rownames(tpm) %in% pcg$V1)
tpm[which(rownames(tpm) %in% pcg$V1),] -> tpm
cts[which(rownames(cts) %in% pcg$V1),] -> cts
#Convert to integers
cts <- round(cts, digits = 0)
cts[which(rowMeans(cts) > 10.0),] -> cts
#Filter out genes with mean exp less than 1.0 tpm across all samples.
tpm[which(rowMeans(tpm) > 1.0),] -> tpm
#Select top highly variable genes based on their Coefficient of Variation
tpm[order(apply(tpm, 1, sd, na.rm=TRUE) / apply(tpm, 1, mean, na.rm=TRUE), decreasing = T),] %>% head(2000) -> tpm.top.cv
tpm %>% reshape2::melt() %>% ggplot(aes(variable, log(value+.01) )) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## No id variables; using all as measure variables
#subset coldata to only project condition, response, localization and sublocalization:
heatmap_coldata<-coldata[,c("Condition","Reponse","Lokalisation","Sublokalisation")]
heatmap_coldata
## Condition Reponse Lokalisation
## P01_S_L1.LB25 Pre-treatment non_reponder Oropharynx
## P01_SOC_L1.LB23 Post-treatment non_reponder <NA>
## P03_S_L1.LB20 Pre-treatment non_reponder Oropharynx
## P03_SOC_L2.LB8 Post-treatment non_reponder <NA>
## P04_S_L1.LB4 Pre-treatment high_responder Oropharynx
## P04_SOC_L2.LB15 Post-treatment high_responder <NA>
## P05_S_L2.LB9 Pre-treatment non_reponder Oropharynx
## P05_SOC_L2.LB16 Post-treatment non_reponder <NA>
## P06_S_L2.LB12 Pre-treatment non_reponder Larynx
## P06_SOC_L2.LB13 Post-treatment non_reponder <NA>
## P07_S_L1.LB5 Pre-treatment non_reponder Oropharynx
## P07_SOC_L2.LB18 Post-treatment non_reponder <NA>
## P09_S_L2.LB19 Pre-treatment non_reponder oral cavity
## P09_SOC_L2.LB14 Post-treatment non_reponder <NA>
## P12_S_L1.LB6 Pre-treatment non_reponder oral cavity
## P12_SOC_L2.LB10 Post-treatment non_reponder <NA>
## P14_S_L1.LB19 Pre-treatment non_reponder Oropharynx
## P14_SOC_L2.LB21 Post-treatment non_reponder <NA>
## P15_S_L2.LB11 Pre-treatment high_responder Oropharynx
## P15_SOC_L2.LB20 Post-treatment high_responder <NA>
## P16_S_L1.LB22 Pre-treatment non_reponder Oropharynx
## P16_SOC_L1.LB21 Post-treatment non_reponder <NA>
## P17_S_L1.LB7 Pre-treatment non_reponder Oropharynx
## P17_SOC_L1.LB27 Post-treatment non_reponder <NA>
## P18_S_L1.LB1 Pre-treatment non_reponder Larynx
## P18_SOC_L1.LB2 Post-treatment non_reponder <NA>
## P19_S_L1.LB3 Pre-treatment non_reponder oral cavity
## Sublokalisation
## P01_S_L1.LB25 Uvula
## P01_SOC_L1.LB23 <NA>
## P03_S_L1.LB20 Tonsil
## P03_SOC_L2.LB8 <NA>
## P04_S_L1.LB4 Base of tongue
## P04_SOC_L2.LB15 <NA>
## P05_S_L2.LB9 Tonsil
## P05_SOC_L2.LB16 <NA>
## P06_S_L2.LB12 Supraglottis
## P06_SOC_L2.LB13 <NA>
## P07_S_L1.LB5 Tonsil
## P07_SOC_L2.LB18 <NA>
## P09_S_L2.LB19 tongue
## P09_SOC_L2.LB14 <NA>
## P12_S_L1.LB6 tongue
## P12_SOC_L2.LB10 <NA>
## P14_S_L1.LB19 Tonsil
## P14_SOC_L2.LB21 <NA>
## P15_S_L2.LB11 Tonsil
## P15_SOC_L2.LB20 <NA>
## P16_S_L1.LB22 posterior pharyngeal wall
## P16_SOC_L1.LB21 <NA>
## P17_S_L1.LB7 Base of tongue
## P17_SOC_L1.LB27 <NA>
## P18_S_L1.LB1 Glottis
## P18_SOC_L1.LB2 <NA>
## P19_S_L1.LB3 Floor of mouth
pheatmap::pheatmap(cor(tpm.top.cv), annotation_row = heatmap_coldata, annotation_col = heatmap_coldata)
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ PatientID + Condition)
smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 100) >= smallestGroupSize
dds <- dds[keep,]
vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Condition, vsd$PatientID, sep="-")
colnames(sampleDistMatrix) <- paste(vsd$Condition, vsd$PatientID, sep="-")
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
plotPCA(vsd, intgroup=c("Reponse"))+
geom_label(aes(label=vsd$PatientID))
plotPCA(vsd, intgroup=c("Condition"))+
geom_label(aes(label=vsd$PatientID))
plotPCA(vsd, intgroup=c("Lokalisation"))+
geom_label(aes(label=vsd$PatientID))
dds$Condition <- factor(dds$Condition, levels = c("Post-treatment", "Pre-treatment"))
dds$Condition <- relevel(dds$Condition, "Pre-treatment")
dds <- DESeq(dds)
rownames(dds) <- geneSymbol.dic[rownames(dds),"gene_name"]
res <- results(dds, alpha = 0.05)
#rownames(res) <- geneSymbol.dic[rownames(res),"gene_name"]
summary(res)
##
## out of 14209 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1225, 8.6%
## LFC < 0 (down) : 806, 5.7%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 17)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#res[order(res$padj, decreasing = F),] %>% head(350)
resultsNames(dds)
## [1] "Intercept"
## [2] "PatientID_P03_vs_P01"
## [3] "PatientID_P04_vs_P01"
## [4] "PatientID_P05_vs_P01"
## [5] "PatientID_P06_vs_P01"
## [6] "PatientID_P07_vs_P01"
## [7] "PatientID_P09_vs_P01"
## [8] "PatientID_P12_vs_P01"
## [9] "PatientID_P14_vs_P01"
## [10] "PatientID_P15_vs_P01"
## [11] "PatientID_P16_vs_P01"
## [12] "PatientID_P17_vs_P01"
## [13] "PatientID_P18_vs_P01"
## [14] "PatientID_P19_vs_P01"
## [15] "Condition_Post.treatment_vs_Pre.treatment"
EnhancedVolcano(res,
labSize = 2,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
pCutoff = 0.05,
FCcutoff = 1)
resOrdered <- res[order(res$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(resOrdered)
d <- plotCounts(dds, gene=which.min(res$padj), intgroup="Condition",
returnData=TRUE)
#matched_data <- subset(d, PatientID %in% matching_patient_ids)
d$PatientID<-coldata$PatientID
ggplot(d, aes(x=Condition, y=count,group = PatientID, color = PatientID)) +
geom_point() +
scale_y_log10(breaks=c(25,100,400))+
ggtitle(geneSymbol.dic[rownames(dds[which.min(res$padj),]),])+
geom_line()
sgenes <- resOrdered[order(abs(resOrdered$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()
rld <- vst(dds, blind=FALSE)
de_mat <- assay(rld)[sgenes,]
pheatmap(t(scale(t(de_mat))),show_rownames = F,show_colnames = T,annotation_col =heatmap_coldata)
To further investigate the 2 high responders, they are subsetted with
their counterparts. Only pre-treatment samples are taken into
account.
Sample from High Responder p4 taken from Oropharynx-base of tongue and
it’s p16 positive. For it’s non-responder counterpart p17 is subsettted.
Sample from High Responder p15 taken from Oropharynx-tonsil and it’s p16
positive. For it’s non-responder counterpart p3, p5, p14 are subsettted.
P4,P17,P15,P3,P5,P14 e subsetted and merged.
to_keep<-c("P03","P04","P05", "P14","P15","P17")
sub_coldata<- coldata %>%
filter(PatientID %in% to_keep & Condition == "Pre-treatment")
sub_coldata
## Condition PatientID Reponse percent_tumor_in_sample
## P03_S_L1.LB20 Pre-treatment P03 non_reponder 60
## P04_S_L1.LB4 Pre-treatment P04 high_responder 20
## P05_S_L2.LB9 Pre-treatment P05 non_reponder 40
## P14_S_L1.LB19 Pre-treatment P14 non_reponder 50
## P15_S_L2.LB11 Pre-treatment P15 high_responder 40
## P17_S_L1.LB7 Pre-treatment P17 non_reponder 60
## sampleID Group.Name Lokalisation Sublokalisation p16.Status
## P03_S_L1.LB20 P03_S Specimen 2 Oropharynx Tonsil pos
## P04_S_L1.LB4 P04_S Specimen 3 Oropharynx Base of tongue pos
## P05_S_L2.LB9 P05_S Specimen 4 Oropharynx Tonsil pos
## P14_S_L1.LB19 P14_S Specimen 9 Oropharynx Tonsil pos
## P15_S_L2.LB11 P15_S Specimen 10 Oropharynx Tonsil pos
## P17_S_L1.LB7 P17_S Specimen 12 Oropharynx Base of tongue pos
## x.fold.CD8.GranB
## P03_S_L1.LB20 1.2671452999999999
## P04_S_L1.LB4 high responder
## P05_S_L2.LB9 0.42182770000000003
## P14_S_L1.LB19 0.89413869999999995
## P15_S_L2.LB11 high responder
## P17_S_L1.LB7 4.2564292999999997
to_keep_tpm<-rownames(sub_coldata)
sub_tpm<- tpm %>%
dplyr::select(all_of(to_keep_tpm))
head(sub_tpm)
## P03_S_L1.LB20 P04_S_L1.LB4 P05_S_L2.LB9 P14_S_L1.LB19
## ENSG00000000003 4.385979 5.137219 0.035638 5.019029
## ENSG00000000419 81.059832 54.917647 110.597554 41.961887
## ENSG00000000457 7.298586 7.188495 9.206459 6.934495
## ENSG00000000460 12.892440 7.604308 23.650866 12.551062
## ENSG00000000938 2.967168 4.369779 2.587410 2.907970
## ENSG00000000971 67.588255 78.079333 13.130865 32.701267
## P15_S_L2.LB11 P17_S_L1.LB7
## ENSG00000000003 4.579444 8.556004
## ENSG00000000419 102.027287 105.243919
## ENSG00000000457 6.852893 14.134774
## ENSG00000000460 20.119463 34.528346
## ENSG00000000938 1.666307 2.177609
## ENSG00000000971 49.540988 17.372839
#Filter out genes with mean exp less than 1.0 tpm across all samples.
sub_tpm[which(rowMeans(sub_tpm) > 1.0),] -> sub_tpm
#Select top highly variable genes based on their Coefficient of Variation
sub_tpm[order(apply(sub_tpm, 1, sd, na.rm=TRUE) / apply(sub_tpm, 1, mean, na.rm=TRUE), decreasing = T),] %>% head(2000) -> sub_tpm.top.cv
sub_tpm %>% reshape2::melt() %>% ggplot(aes(variable, log(value+.01) )) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## No id variables; using all as measure variables
#to only project condition, response and sublokalization since they are all from oropharynx region
ann_df<-sub_coldata[, c("Reponse", "Sublokalisation")]
pheatmap::pheatmap(cor(sub_tpm.top.cv), annotation_row = ann_df, annotation_col = ann_df)
#some formatting
#sub counts mtx:
sub_cts<- cts %>%
dplyr::select(all_of(to_keep_tpm))
#add sublokalisation data into the post treatment samples:
head(sub_cts)
## P03_S_L1.LB20 P04_S_L1.LB4 P05_S_L2.LB9 P14_S_L1.LB19
## ENSG00000000003 304 370 6 321
## ENSG00000000419 4123 2391 4729 1924
## ENSG00000000457 1108 940 1191 916
## ENSG00000000460 1400 723 2279 1472
## ENSG00000000938 220 264 160 197
## ENSG00000000971 13821 13500 2121 5821
## P15_S_L2.LB11 P17_S_L1.LB7
## ENSG00000000003 344 511
## ENSG00000000419 4539 4412
## ENSG00000000457 948 1700
## ENSG00000000460 1992 3514
## ENSG00000000938 107 133
## ENSG00000000971 8671 2842
sub_dds <- DESeqDataSetFromMatrix(countData = sub_cts,
colData = sub_coldata,
design = ~Sublokalisation + Reponse)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
smallestGroupSize <- 3
keep <- rowSums(counts(sub_dds) >= 100) >= smallestGroupSize
sub_dds <- sub_dds[keep,]
sub_vsd <- vst(sub_dds, blind=FALSE)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
sub_sampleDists <- dist(t(assay(sub_vsd)))
sub_sampleDistMatrix <- as.matrix(sub_sampleDists)
rownames(sub_sampleDistMatrix ) <- paste(sub_vsd$Condition, sub_vsd$PatientID, sep="-")
colnames(sub_sampleDistMatrix ) <- paste(sub_vsd$Condition, sub_vsd$PatientID, sep="-")
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sub_sampleDistMatrix,
clustering_distance_rows=sub_sampleDists,
clustering_distance_cols=sub_sampleDists,
col=colors)
plotPCA(sub_vsd, intgroup=c("Reponse"))
#plotPCA(sub_vsd, intgroup=c("PatientID"))+
# geom_point(aes(shape=sub_vsd$Condition), size=3)+
#scale_shape_manual(values = c(5,1))+
#geom_line()
#To project names:
#geom_text(aes(label = PatientID))
sub_dds$Reponse <- factor(sub_dds$Reponse, levels = c("non_reponder", "high_responder"))
sub_dds$Response <- relevel(sub_dds$Reponse, "non_reponder")
sub_dds <- DESeq(sub_dds)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
rownames(sub_dds) <- geneSymbol.dic[rownames(sub_dds),"gene_name"]
sub_res <- results(sub_dds, alpha = 0.05)
#rownames(sub_res) <- geneSymbol.dic[rownames(sub_res),"gene_name"]
summary(sub_res)
##
## out of 12710 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 26, 0.2%
## LFC < 0 (down) : 25, 0.2%
## outliers [1] : 2, 0.016%
## low counts [2] : 0, 0%
## (mean count < 61)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#sub_res[order(res$padj, decreasing = F),] %>% head(350)
resultsNames(sub_dds)
## [1] "Intercept"
## [2] "Sublokalisation_Tonsil_vs_Base.of.tongue"
## [3] "Reponse_high_responder_vs_non_reponder"
EnhancedVolcano(sub_res,
labSize = 2,
lab = rownames(sub_res),
x = 'log2FoldChange',
y = 'pvalue',
pCutoff = 0.05,
FCcutoff = 1)
sub_resOrdered <- sub_res[order(sub_res$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(sub_resOrdered)
sub_d <- plotCounts(sub_dds, gene=which.min(sub_res$padj), intgroup="Reponse",
returnData=TRUE)
ggplot(sub_d, aes(x=Reponse, y=count)) +
geom_point(position=position_jitter(w=0.1,h=0)) +
scale_y_log10(breaks=c(25,100,400))+
ggtitle(geneSymbol.dic[rownames(sub_dds[which.min(sub_res$padj),]),])
sub
## standardGeneric for "sub" defined from package "base"
##
## function (pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
## fixed = FALSE, useBytes = FALSE)
## standardGeneric("sub")
## <environment: 0x7fb1b231c950>
## Methods may be defined for arguments: pattern, replacement, x, ignore.case, perl, fixed, useBytes
## Use showMethods(sub) for currently available ones.
sub_sgenes <- sub_resOrdered[order(abs(sub_resOrdered$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()
sub_rld <- vst(sub_dds, blind=FALSE)
sub_de_mat <- assay(sub_rld)[sub_sgenes,]
pheatmap(t(scale(t(sub_de_mat))),show_rownames = T,show_colnames = T,annotation_col =ann_df)
write_csv(coldata, "hnn_metadata.csv",col_names = T)
new_mtd<-readxl::read_excel("/Users/ebrukocakaya/HnN_Sven/hnn_metadata_v2.xlsx")
new_mtd
## # A tibble: 27 × 13
## Condition PatientID Reponse percent_tumor_in_sam…¹ sampleID Group.Name
## <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Pre-treatment P01 non_repo… 20 P01_S Specimen 1
## 2 Post-treatment P01 non_repo… 20 P01_SOC NA
## 3 Pre-treatment P03 non_repo… 60 P03_S Specimen 2
## 4 Post-treatment P03 non_repo… 20 P03_SOC NA
## 5 Pre-treatment P04 high_res… 20 P04_S Specimen 3
## 6 Post-treatment P04 high_res… 0 P04_SOC NA
## 7 Pre-treatment P05 non_repo… 40 P05_S Specimen 4
## 8 Post-treatment P05 non_repo… 35 P05_SOC NA
## 9 Pre-treatment P06 non_repo… 50 P06_S Specimen 5
## 10 Post-treatment P06 non_repo… 20 P06_SOC NA
## # ℹ 17 more rows
## # ℹ abbreviated name: ¹percent_tumor_in_sample
## # ℹ 7 more variables: Lokalisation <chr>, Sublokalisation <chr>,
## # p16.Status <chr>, x.fold.CD8.GranB <chr>, TStage_Response <chr>,
## # N_Stage_Response <chr>, Primary_Endpoint <chr>
dds_2<- DESeqDataSetFromMatrix(countData = cts,
colData = new_mtd,
design = ~ PatientID + Condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
keep_2 <- rowSums(counts(dds_2) >= 100) >= smallestGroupSize
dds_2 <- dds_2[keep_2,]
vsd_2 <- vst(dds_2, blind=FALSE)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
sampleDists_2 <- dist(t(assay(vsd_2)))
sampleDistMatrix_2 <- as.matrix(sampleDists_2)
rownames(sampleDistMatrix_2) <- paste(vsd_2$Condition, vsd_2$PatientID, sep="-")
colnames(sampleDistMatrix_2) <- paste(vsd_2$Condition, vsd_2$PatientID, sep="-")
plotPCA(vsd_2, intgroup=c("Reponse"))+
geom_label(aes(label=vsd_2$PatientID))
plotPCA(vsd_2, intgroup=c("TStage_Response"))+
geom_label(aes(label=vsd_2$PatientID))
plotPCA(vsd_2, intgroup=c("N_Stage_Response"))+
geom_label(aes(label=vsd_2$PatientID))
plotPCA(vsd_2, intgroup=c("Primary_Endpoint"))+
geom_label(aes(label=vsd_2$PatientID))
dds_2$Condition <- factor(dds_2$Condition, levels = c("Post-treatment", "Pre-treatment"))
dds_2$Condition <- relevel(dds_2$Condition, "Pre-treatment")
dds_2 <- DESeq(dds_2)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
rownames(dds_2) <- geneSymbol.dic[rownames(dds_2),"gene_name"]
res_2 <- results(dds_2, alpha = 0.05)
#rownames(res) <- geneSymbol.dic[rownames(res),"gene_name"]
summary(res_2)
##
## out of 14209 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1225, 8.6%
## LFC < 0 (down) : 806, 5.7%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 17)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resultsNames(dds_2)
## [1] "Intercept"
## [2] "PatientID_P03_vs_P01"
## [3] "PatientID_P04_vs_P01"
## [4] "PatientID_P05_vs_P01"
## [5] "PatientID_P06_vs_P01"
## [6] "PatientID_P07_vs_P01"
## [7] "PatientID_P09_vs_P01"
## [8] "PatientID_P12_vs_P01"
## [9] "PatientID_P14_vs_P01"
## [10] "PatientID_P15_vs_P01"
## [11] "PatientID_P16_vs_P01"
## [12] "PatientID_P17_vs_P01"
## [13] "PatientID_P18_vs_P01"
## [14] "PatientID_P19_vs_P01"
## [15] "Condition_Post.treatment_vs_Pre.treatment"
EnhancedVolcano(res_2,
labSize = 2,
lab = rownames(res_2),
x = 'log2FoldChange',
y = 'pvalue',
pCutoff = 0.05,
FCcutoff = 1)
resOrdered_2 <- res_2[order(res_2$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(resOrdered_2)
d_2 <- plotCounts(dds_2, gene=which.min(res_2$padj), intgroup="Condition",
returnData=TRUE)
#matched_data <- subset(d, PatientID %in% matching_patient_ids)
d_2$PatientID<-new_mtd$PatientID
ggplot(d_2, aes(x=Condition, y=count,group = PatientID, color = PatientID)) +
geom_point() +
scale_y_log10(breaks=c(25,100,400))+
ggtitle(geneSymbol.dic[rownames(dds_2[which.min(res_2$padj),]),])+
geom_line()
sgenes_2 <- resOrdered_2[order(abs(resOrdered_2$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()
rld_2 <- vst(dds_2, blind=FALSE)
de_mat_2 <- assay(rld_2)[sgenes_2,]
#Subset metadata for the clustering:
new_heatmap_coldata<-new_mtd[,c("Condition","Reponse","Lokalisation","Sublokalisation", "TStage_Response", "N_Stage_Response", "Primary_Endpoint")]
new_heatmap_coldata<-as.data.frame(new_heatmap_coldata)
rownames(new_heatmap_coldata)<-rownames(heatmap_coldata)
pheatmap(t(scale(t(de_mat_2))),show_rownames = T,show_colnames = T,annotation_col =new_heatmap_coldata)
When you compare the samples based on the T or N stage response, you can combine SD and PD as non-responder for an first insight. Merge SD + PD as NR in T Stage Response:
new_mtd <- new_mtd %>%
mutate(`TStage_Response` = if_else( `TStage_Response` %in% c("SD", "PD"), "NR", `TStage_Response`))
table(new_mtd$`TStage_Response`)
##
## NR R
## 10 17
#subset only pre values:
pre_cts<- cts %>%
select(seq(1, ncol(.), by = 2))
pre_mtd <- new_mtd %>%
dplyr::filter(Condition == "Pre-treatment") %>%
dplyr::select(everything())
rownames(pre_mtd)<- colnames(pre_cts)
## Warning: Setting row names on a tibble is deprecated.
dds_tsr<- DESeqDataSetFromMatrix(countData = pre_cts,
colData = pre_mtd,
design = ~ Lokalisation + TStage_Response)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
keep_tsr<- rowSums(counts(dds_tsr) >= 100) >= smallestGroupSize
dds_tsr <- dds_tsr[keep_tsr,]
vsd_tsr <- vst(dds_tsr, blind=FALSE)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
sampleDists_tsr <- dist(t(assay(vsd_tsr)))
sampleDistMatrix_tsr <- as.matrix(sampleDists_tsr)
rownames(sampleDistMatrix_tsr) <- paste(vsd_tsr$`T-Stage-Response`, vsd_tsr$PatientID, sep="-")
colnames(sampleDistMatrix_tsr) <- paste(vsd_tsr$`T-Stage-Response`, vsd_tsr$PatientID, sep="-")
heatmap(sampleDistMatrix_tsr,
clustering_distance_rows=sampleDists_tsr,
clustering_distance_cols=sampleDists_tsr,
col=colors)
## Warning in plot.window(...): "clustering_distance_rows" is not a graphical
## parameter
## Warning in plot.window(...): "clustering_distance_cols" is not a graphical
## parameter
## Warning in plot.xy(xy, type, ...): "clustering_distance_rows" is not a
## graphical parameter
## Warning in plot.xy(xy, type, ...): "clustering_distance_cols" is not a
## graphical parameter
## Warning in title(...): "clustering_distance_rows" is not a graphical parameter
## Warning in title(...): "clustering_distance_cols" is not a graphical parameter
plotPCA(vsd_tsr, intgroup=c("TStage_Response"))+
geom_label(aes(label=vsd_tsr$PatientID))
dds_tsr$Condition <- factor(dds_tsr$TStage_Response, levels = c("NR", "R"))
dds_tsr$Condition <- relevel(dds_tsr$TStage_Response, "R")
dds_tsr <- DESeq(dds_tsr)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 393 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
rownames(dds_tsr) <- geneSymbol.dic[rownames(dds_tsr),"gene_name"]
res_tsr <- results(dds_tsr, alpha = 0.05)
#rownames(sub_res) <- geneSymbol.dic[rownames(sub_res),"gene_name"]
summary(res_tsr)
##
## out of 13551 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6, 0.044%
## LFC < 0 (down) : 7, 0.052%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 25)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resultsNames(dds_tsr)
## [1] "Intercept" "Lokalisation_oral.cavity_vs_Larynx"
## [3] "Lokalisation_Oropharynx_vs_Larynx" "TStage_Response_R_vs_NR"
EnhancedVolcano(res_tsr,
labSize = 2,
lab = rownames(res_tsr),
x = 'log2FoldChange',
y = 'pvalue',
pCutoff = 0.05,
FCcutoff = 1)
resOrdered_tsr <- res_tsr[order(res_tsr$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(resOrdered_tsr)
d_tsr <- plotCounts(dds_tsr, gene=which.min(res_tsr$padj), intgroup="TStage_Response",
returnData=TRUE)
ggplot(d_tsr, aes(x=TStage_Response, y=count)) +
geom_point(position=position_jitter(w=0.1,h=0)) +
scale_y_log10(breaks=c(25,100,400))+
ggtitle(geneSymbol.dic[rownames(dds_tsr[which.min(res_tsr$padj),]),])
sgenes_tsr <- resOrdered_tsr[order(abs(resOrdered_tsr$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()
rld_tsr <- vst(dds_tsr, blind=FALSE)
heatmap_coldata_tsr<-as.data.frame(pre_mtd)
rownames(heatmap_coldata_tsr)<-colnames(pre_cts)
heatmap_coldata_tsr<-heatmap_coldata_tsr[,c("Reponse","Lokalisation","Sublokalisation","TStage_Response","N_Stage_Response","Primary_Endpoint")]
de_mat_tsr <- assay(rld_tsr)[sgenes_tsr,]
pheatmap(t(scale(t(de_mat_tsr))),show_rownames = F,show_colnames = T,annotation_col =heatmap_coldata_tsr)