Head and Neck Cancer Analysis

Bulk RNAseq of FFPE tissues from HnN cancer patients.

Study set:

coldata <- read.delim(paste(wd, "meta.data.table.txt", sep = ""), row.names = 1)
coldata$Condition <- factor(coldata$Condition)
coldata$PatientID <- factor(coldata$PatientID)

coldata

##                      Condition PatientID        Reponse
## P01_S_L1.LB25    Pre-treatment       P01   non_reponder
## P01_SOC_L1.LB23 Post-treatment       P01   non_reponder
## P03_S_L1.LB20    Pre-treatment       P03   non_reponder
## P03_SOC_L2.LB8  Post-treatment       P03   non_reponder
## P04_S_L1.LB4     Pre-treatment       P04 high_responder
## P04_SOC_L2.LB15 Post-treatment       P04 high_responder
## P05_S_L2.LB9     Pre-treatment       P05   non_reponder
## P05_SOC_L2.LB16 Post-treatment       P05   non_reponder
## P06_S_L2.LB12    Pre-treatment       P06   non_reponder
## P06_SOC_L2.LB13 Post-treatment       P06   non_reponder
## P07_S_L1.LB5     Pre-treatment       P07   non_reponder
## P07_SOC_L2.LB18 Post-treatment       P07   non_reponder
## P09_S_L2.LB19    Pre-treatment       P09   non_reponder
## P09_SOC_L2.LB14 Post-treatment       P09   non_reponder
## P12_S_L1.LB6     Pre-treatment       P12   non_reponder
## P12_SOC_L2.LB10 Post-treatment       P12   non_reponder
## P14_S_L1.LB19    Pre-treatment       P14   non_reponder
## P14_SOC_L2.LB21 Post-treatment       P14   non_reponder
## P15_S_L2.LB11    Pre-treatment       P15 high_responder
## P15_SOC_L2.LB20 Post-treatment       P15 high_responder
## P16_S_L1.LB22    Pre-treatment       P16   non_reponder
## P16_SOC_L1.LB21 Post-treatment       P16   non_reponder
## P17_S_L1.LB7     Pre-treatment       P17   non_reponder
## P17_SOC_L1.LB27 Post-treatment       P17   non_reponder
## P18_S_L1.LB1     Pre-treatment       P18   non_reponder
## P18_SOC_L1.LB2  Post-treatment       P18   non_reponder
## P19_S_L1.LB3     Pre-treatment       P19   non_reponder

table(coldata$PatientID)

## 
## P01 P03 P04 P05 P06 P07 P09 P12 P14 P15 P16 P17 P18 P19 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   1

table(coldata$Condition)

## 
## Post-treatment  Pre-treatment 
##             13             14

Read expression files:

tpm <- read.delim(paste(wd, "salmon.merged.gene_tpm.tsv", sep = ""), header = T, row.names = 1)
#Counts matrix
sscounts <- read.table(paste(wd, "salmon.merged.gene_counts.tsv", sep = ""), row.names = 1, header = T)
sscounts %>% dplyr::select(gene_name) -> geneSymbol.dic #Dictionary for gene symbols
sscounts %>% dplyr::select(-gene_name) -> cts
rm(sscounts)

#TPM matrix
tpm <- read.delim(paste(wd, "salmon.merged.gene_tpm.tsv", sep = ""), header = T, row.names = 1)
#Counts matrix
sscounts <- read.table(paste(wd, "salmon.merged.gene_counts.tsv", sep = ""), row.names = 1, header = T)
t2g1.dic<- sscounts %>% dplyr::select(gene_name) #Dictionary for gene symbols
sscounts %>% dplyr::select(-gene_name) -> cts
rm(sscounts)

Read the new metadata table:

#library(readxl)
sample_background<-readxl::read_excel("/Users/ebrukocakaya/HnN_Sven/updated_sample_overview_FFPE.xlsx")

## New names:
## • `` -> `...2`
## • `` -> `...8`

head(sample_background)

## # A tibble: 6 × 8
##   `Group Name` ...2  `Pre-treatment` Lokalisation Sublokalisation `p16 Status`
##   <chr>        <lgl> <chr>           <chr>        <chr>           <chr>       
## 1 <NA>         NA    <NA>            <NA>         <NA>            <NA>        
## 2 Specimen 1   NA    P01_S           Oropharynx   Uvula           neg         
## 3 Specimen 2   NA    P03_S           Oropharynx   Tonsil          pos         
## 4 Specimen 3   NA    P04_S           Oropharynx   Base of tongue  pos         
## 5 Specimen 4   NA    P05_S           Oropharynx   Tonsil          pos         
## 6 Specimen 5   NA    P06_S           Larynx       Supraglottis    N/A         
## # ℹ 2 more variables: `x-fold CD8/GranB` <chr>, ...8 <chr>

#What to add to the existing metadata from new metadata
#From sample background: Lokalisation   Sublokalisation p16 Status  x-fold CD8/GranB columns 
#From Standard H&E Results  Whole Slide Percent Tumor
#From QC TRR Pool DAY 

#clean up the sample background df:
sample_background<-sample_background[-1,]
sample_background<-sample_background[,-8]
sample_background<-sample_background[,-2]
sample_background<-data.frame(sample_background)
rownames(sample_background)<-sample_background[,1]
#add Patient ID info to the table:
patid<-c(levels(coldata$PatientID))
sample_background$PatientID<-patid
sample_background

##              Group.Name Pre.treatment Lokalisation           Sublokalisation
## Specimen 1   Specimen 1         P01_S   Oropharynx                     Uvula
## Specimen 2   Specimen 2         P03_S   Oropharynx                    Tonsil
## Specimen 3   Specimen 3         P04_S   Oropharynx            Base of tongue
## Specimen 4   Specimen 4         P05_S   Oropharynx                    Tonsil
## Specimen 5   Specimen 5         P06_S       Larynx              Supraglottis
## Specimen 6   Specimen 6         P07_S   Oropharynx                    Tonsil
## Specimen 7   Specimen 7         P09_S  oral cavity                    tongue
## Specimen 8   Specimen 8         P12_S  oral cavity                    tongue
## Specimen 9   Specimen 9         P14_S   Oropharynx                    Tonsil
## Specimen 10 Specimen 10         P15_S   Oropharynx                    Tonsil
## Specimen 11 Specimen 11        P16 _S   Oropharynx posterior pharyngeal wall
## Specimen 12 Specimen 12         P17_S   Oropharynx            Base of tongue
## Specimen 13 Specimen 13         P18_S       Larynx                   Glottis
## Specimen 14 Specimen 14         P19_S  oral cavity            Floor of mouth
##             p16.Status    x.fold.CD8.GranB PatientID
## Specimen 1         neg           2.1998224       P01
## Specimen 2         pos  1.2671452999999999       P03
## Specimen 3         pos      high responder       P04
## Specimen 4         pos 0.42182770000000003       P05
## Specimen 5         N/A          23.7710422       P06
## Specimen 6         pos  10.010180500000001       P07
## Specimen 7         N/A           1.2193797       P09
## Specimen 8         N/A  6.9578436000000004       P12
## Specimen 9         pos 0.89413869999999995       P14
## Specimen 10        pos      high responder       P15
## Specimen 11        neg  6.5588876999999997       P16
## Specimen 12        pos  4.2564292999999997       P17
## Specimen 13        N/A 0.34499999999999997       P18
## Specimen 14        N/A      high responder       P19

#read percetn tumor table:
per_tumor<-readxl::read_excel("/Users/ebrukocakaya/HnN_Sven/Percentage_tumor_in_sample.xlsx")
head(per_tumor)

## # A tibble: 6 × 3
##   `P. ID` `Percent tumor in processed sample` PatientID
##   <chr>   <chr>                               <chr>    
## 1 P01_S   20                                  P01      
## 2 P01_SOC 20                                  P01      
## 3 P03_S   60                                  P03      
## 4 P03_SOC 20                                  P03      
## 5 P04_S   20                                  P04      
## 6 P04_SOC 0                                   P04

#there is no data on P19_SOC remove row 28
per_tumor<-per_tumor[-28,]

#orders of the coldata metadata and the per tumor is same so just merge them:
coldata$percent_tumor_in_sample<-per_tumor$`Percent tumor in processed sample`
#add the sample ID as well (same order)
coldata$sampleID<-per_tumor$`P. ID`
head(coldata)

##                      Condition PatientID        Reponse percent_tumor_in_sample
## P01_S_L1.LB25    Pre-treatment       P01   non_reponder                      20
## P01_SOC_L1.LB23 Post-treatment       P01   non_reponder                      20
## P03_S_L1.LB20    Pre-treatment       P03   non_reponder                      60
## P03_SOC_L2.LB8  Post-treatment       P03   non_reponder                      20
## P04_S_L1.LB4     Pre-treatment       P04 high_responder                      20
## P04_SOC_L2.LB15 Post-treatment       P04 high_responder                       0
##                 sampleID
## P01_S_L1.LB25      P01_S
## P01_SOC_L1.LB23  P01_SOC
## P03_S_L1.LB20      P03_S
## P03_SOC_L2.LB8   P03_SOC
## P04_S_L1.LB4       P04_S
## P04_SOC_L2.LB15  P04_SOC

#per_tumor<-excel_sheets(path=/Users/ebrukocakaya/HnN_Sven/Percentage_tumor_in_sample.xlsx)

#add sample background info 
#sample_background$Pre.treatment is the same with coldata$sampleID:

colnames(sample_background)[2]<-"sampleID"
#since rownames can get lost at full_join:
#save rownames as another column and then remove them
rows_coldata<-rownames(coldata)

coldata<-full_join(coldata,sample_background,by="sampleID")
rownames(coldata)<-rows_coldata
coldata<-coldata[,-11]
colnames(coldata)[2]<-"PatientID"
head(coldata)

##                      Condition PatientID        Reponse percent_tumor_in_sample
## P01_S_L1.LB25    Pre-treatment       P01   non_reponder                      20
## P01_SOC_L1.LB23 Post-treatment       P01   non_reponder                      20
## P03_S_L1.LB20    Pre-treatment       P03   non_reponder                      60
## P03_SOC_L2.LB8  Post-treatment       P03   non_reponder                      20
## P04_S_L1.LB4     Pre-treatment       P04 high_responder                      20
## P04_SOC_L2.LB15 Post-treatment       P04 high_responder                       0
##                 sampleID Group.Name Lokalisation Sublokalisation p16.Status
## P01_S_L1.LB25      P01_S Specimen 1   Oropharynx           Uvula        neg
## P01_SOC_L1.LB23  P01_SOC       <NA>         <NA>            <NA>       <NA>
## P03_S_L1.LB20      P03_S Specimen 2   Oropharynx          Tonsil        pos
## P03_SOC_L2.LB8   P03_SOC       <NA>         <NA>            <NA>       <NA>
## P04_S_L1.LB4       P04_S Specimen 3   Oropharynx  Base of tongue        pos
## P04_SOC_L2.LB15  P04_SOC       <NA>         <NA>            <NA>       <NA>
##                   x.fold.CD8.GranB
## P01_S_L1.LB25            2.1998224
## P01_SOC_L1.LB23               <NA>
## P03_S_L1.LB20   1.2671452999999999
## P03_SOC_L2.LB8                <NA>
## P04_S_L1.LB4        high responder
## P04_SOC_L2.LB15               <NA>

Select only protein coding genes

pcg <- read.delim(paste(wd, "protein_coding_genes.Ensembl.ids.txt", sep = ""), header = F)

tpm %>% dplyr::select(-gene_name) -> tpm

#Keep only Protein coding genes:
#table(rownames(tpm) %in% pcg$V1)
tpm[which(rownames(tpm) %in% pcg$V1),] -> tpm

cts[which(rownames(cts) %in% pcg$V1),] -> cts

#Convert to integers
cts <- round(cts, digits = 0)
cts[which(rowMeans(cts) > 10.0),] -> cts

Filter genes based on low expression and variance

#Filter out genes with mean exp less than 1.0 tpm across all samples.
tpm[which(rowMeans(tpm) > 1.0),] -> tpm
#Select top highly variable genes based on their Coefficient of Variation
tpm[order(apply(tpm, 1, sd, na.rm=TRUE) / apply(tpm, 1, mean, na.rm=TRUE), decreasing = T),] %>% head(2000) -> tpm.top.cv

Gene expression distributions of each sample

tpm %>% reshape2::melt() %>% ggplot(aes(variable, log(value+.01) )) + 
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## No id variables; using all as measure variables

#subset coldata to only project condition, response, localization and sublocalization:
heatmap_coldata<-coldata[,c("Condition","Reponse","Lokalisation","Sublokalisation")]
heatmap_coldata

##                      Condition        Reponse Lokalisation
## P01_S_L1.LB25    Pre-treatment   non_reponder   Oropharynx
## P01_SOC_L1.LB23 Post-treatment   non_reponder         <NA>
## P03_S_L1.LB20    Pre-treatment   non_reponder   Oropharynx
## P03_SOC_L2.LB8  Post-treatment   non_reponder         <NA>
## P04_S_L1.LB4     Pre-treatment high_responder   Oropharynx
## P04_SOC_L2.LB15 Post-treatment high_responder         <NA>
## P05_S_L2.LB9     Pre-treatment   non_reponder   Oropharynx
## P05_SOC_L2.LB16 Post-treatment   non_reponder         <NA>
## P06_S_L2.LB12    Pre-treatment   non_reponder       Larynx
## P06_SOC_L2.LB13 Post-treatment   non_reponder         <NA>
## P07_S_L1.LB5     Pre-treatment   non_reponder   Oropharynx
## P07_SOC_L2.LB18 Post-treatment   non_reponder         <NA>
## P09_S_L2.LB19    Pre-treatment   non_reponder  oral cavity
## P09_SOC_L2.LB14 Post-treatment   non_reponder         <NA>
## P12_S_L1.LB6     Pre-treatment   non_reponder  oral cavity
## P12_SOC_L2.LB10 Post-treatment   non_reponder         <NA>
## P14_S_L1.LB19    Pre-treatment   non_reponder   Oropharynx
## P14_SOC_L2.LB21 Post-treatment   non_reponder         <NA>
## P15_S_L2.LB11    Pre-treatment high_responder   Oropharynx
## P15_SOC_L2.LB20 Post-treatment high_responder         <NA>
## P16_S_L1.LB22    Pre-treatment   non_reponder   Oropharynx
## P16_SOC_L1.LB21 Post-treatment   non_reponder         <NA>
## P17_S_L1.LB7     Pre-treatment   non_reponder   Oropharynx
## P17_SOC_L1.LB27 Post-treatment   non_reponder         <NA>
## P18_S_L1.LB1     Pre-treatment   non_reponder       Larynx
## P18_SOC_L1.LB2  Post-treatment   non_reponder         <NA>
## P19_S_L1.LB3     Pre-treatment   non_reponder  oral cavity
##                           Sublokalisation
## P01_S_L1.LB25                       Uvula
## P01_SOC_L1.LB23                      <NA>
## P03_S_L1.LB20                      Tonsil
## P03_SOC_L2.LB8                       <NA>
## P04_S_L1.LB4               Base of tongue
## P04_SOC_L2.LB15                      <NA>
## P05_S_L2.LB9                       Tonsil
## P05_SOC_L2.LB16                      <NA>
## P06_S_L2.LB12                Supraglottis
## P06_SOC_L2.LB13                      <NA>
## P07_S_L1.LB5                       Tonsil
## P07_SOC_L2.LB18                      <NA>
## P09_S_L2.LB19                      tongue
## P09_SOC_L2.LB14                      <NA>
## P12_S_L1.LB6                       tongue
## P12_SOC_L2.LB10                      <NA>
## P14_S_L1.LB19                      Tonsil
## P14_SOC_L2.LB21                      <NA>
## P15_S_L2.LB11                      Tonsil
## P15_SOC_L2.LB20                      <NA>
## P16_S_L1.LB22   posterior pharyngeal wall
## P16_SOC_L1.LB21                      <NA>
## P17_S_L1.LB7               Base of tongue
## P17_SOC_L1.LB27                      <NA>
## P18_S_L1.LB1                      Glottis
## P18_SOC_L1.LB2                       <NA>
## P19_S_L1.LB3               Floor of mouth

Sample to sample correlation heatmap with highly variable genes

pheatmap::pheatmap(cor(tpm.top.cv), annotation_row = heatmap_coldata, annotation_col = heatmap_coldata)

Computing VSD to investigate sample-to-sample further

dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ PatientID + Condition)

smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 100) >= smallestGroupSize
dds <- dds[keep,]

vsd <- vst(dds, blind=FALSE)

sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Condition, vsd$PatientID, sep="-")
colnames(sampleDistMatrix) <- paste(vsd$Condition, vsd$PatientID, sep="-")
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

plotPCA(vsd, intgroup=c("Reponse"))+
  geom_label(aes(label=vsd$PatientID))

plotPCA(vsd, intgroup=c("Condition"))+
  geom_label(aes(label=vsd$PatientID))

plotPCA(vsd, intgroup=c("Lokalisation"))+
  geom_label(aes(label=vsd$PatientID))

Differential gene expression analysis - Treatment: Pre vs Post

dds$Condition <- factor(dds$Condition, levels = c("Post-treatment", "Pre-treatment"))
dds$Condition <- relevel(dds$Condition, "Pre-treatment")

dds <- DESeq(dds)
rownames(dds) <- geneSymbol.dic[rownames(dds),"gene_name"]

res <- results(dds, alpha = 0.05)
#rownames(res) <- geneSymbol.dic[rownames(res),"gene_name"]
summary(res)

## 
## out of 14209 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1225, 8.6%
## LFC < 0 (down)     : 806, 5.7%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 17)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#res[order(res$padj, decreasing = F),] %>% head(350)
resultsNames(dds)

##  [1] "Intercept"                                
##  [2] "PatientID_P03_vs_P01"                     
##  [3] "PatientID_P04_vs_P01"                     
##  [4] "PatientID_P05_vs_P01"                     
##  [5] "PatientID_P06_vs_P01"                     
##  [6] "PatientID_P07_vs_P01"                     
##  [7] "PatientID_P09_vs_P01"                     
##  [8] "PatientID_P12_vs_P01"                     
##  [9] "PatientID_P14_vs_P01"                     
## [10] "PatientID_P15_vs_P01"                     
## [11] "PatientID_P16_vs_P01"                     
## [12] "PatientID_P17_vs_P01"                     
## [13] "PatientID_P18_vs_P01"                     
## [14] "PatientID_P19_vs_P01"                     
## [15] "Condition_Post.treatment_vs_Pre.treatment"

Volcano plot

EnhancedVolcano(res,
                labSize = 2,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'pvalue',
                pCutoff = 0.05, 
                FCcutoff = 1)

resOrdered <- res[order(res$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(resOrdered)

d <- plotCounts(dds, gene=which.min(res$padj), intgroup="Condition", 
                returnData=TRUE)

#matched_data <- subset(d, PatientID %in% matching_patient_ids)
d$PatientID<-coldata$PatientID

ggplot(d, aes(x=Condition, y=count,group = PatientID, color = PatientID)) + 
  geom_point() + 
  scale_y_log10(breaks=c(25,100,400))+ 
  ggtitle(geneSymbol.dic[rownames(dds[which.min(res$padj),]),])+
  geom_line()

Expression heatmap of top 500 highest LFC genes

sgenes <- resOrdered[order(abs(resOrdered$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()

rld <- vst(dds, blind=FALSE)

de_mat <- assay(rld)[sgenes,]
pheatmap(t(scale(t(de_mat))),show_rownames = F,show_colnames = T,annotation_col =heatmap_coldata)

Subset the high responder vs non responder

To further investigate the 2 high responders, they are subsetted with their counterparts. Only pre-treatment samples are taken into account.
Sample from High Responder p4 taken from Oropharynx-base of tongue and it’s p16 positive. For it’s non-responder counterpart p17 is subsettted. Sample from High Responder p15 taken from Oropharynx-tonsil and it’s p16 positive. For it’s non-responder counterpart p3, p5, p14 are subsettted. P4,P17,P15,P3,P5,P14 e subsetted and merged.

to_keep<-c("P03","P04","P05", "P14","P15","P17")

sub_coldata<- coldata %>%
  filter(PatientID %in% to_keep & Condition == "Pre-treatment")
sub_coldata

##                   Condition PatientID        Reponse percent_tumor_in_sample
## P03_S_L1.LB20 Pre-treatment       P03   non_reponder                      60
## P04_S_L1.LB4  Pre-treatment       P04 high_responder                      20
## P05_S_L2.LB9  Pre-treatment       P05   non_reponder                      40
## P14_S_L1.LB19 Pre-treatment       P14   non_reponder                      50
## P15_S_L2.LB11 Pre-treatment       P15 high_responder                      40
## P17_S_L1.LB7  Pre-treatment       P17   non_reponder                      60
##               sampleID  Group.Name Lokalisation Sublokalisation p16.Status
## P03_S_L1.LB20    P03_S  Specimen 2   Oropharynx          Tonsil        pos
## P04_S_L1.LB4     P04_S  Specimen 3   Oropharynx  Base of tongue        pos
## P05_S_L2.LB9     P05_S  Specimen 4   Oropharynx          Tonsil        pos
## P14_S_L1.LB19    P14_S  Specimen 9   Oropharynx          Tonsil        pos
## P15_S_L2.LB11    P15_S Specimen 10   Oropharynx          Tonsil        pos
## P17_S_L1.LB7     P17_S Specimen 12   Oropharynx  Base of tongue        pos
##                  x.fold.CD8.GranB
## P03_S_L1.LB20  1.2671452999999999
## P04_S_L1.LB4       high responder
## P05_S_L2.LB9  0.42182770000000003
## P14_S_L1.LB19 0.89413869999999995
## P15_S_L2.LB11      high responder
## P17_S_L1.LB7   4.2564292999999997

to_keep_tpm<-rownames(sub_coldata)

sub_tpm<- tpm %>%
  dplyr::select(all_of(to_keep_tpm))
head(sub_tpm)

##                 P03_S_L1.LB20 P04_S_L1.LB4 P05_S_L2.LB9 P14_S_L1.LB19
## ENSG00000000003      4.385979     5.137219     0.035638      5.019029
## ENSG00000000419     81.059832    54.917647   110.597554     41.961887
## ENSG00000000457      7.298586     7.188495     9.206459      6.934495
## ENSG00000000460     12.892440     7.604308    23.650866     12.551062
## ENSG00000000938      2.967168     4.369779     2.587410      2.907970
## ENSG00000000971     67.588255    78.079333    13.130865     32.701267
##                 P15_S_L2.LB11 P17_S_L1.LB7
## ENSG00000000003      4.579444     8.556004
## ENSG00000000419    102.027287   105.243919
## ENSG00000000457      6.852893    14.134774
## ENSG00000000460     20.119463    34.528346
## ENSG00000000938      1.666307     2.177609
## ENSG00000000971     49.540988    17.372839

#Filter out genes with mean exp less than 1.0 tpm across all samples.
sub_tpm[which(rowMeans(sub_tpm) > 1.0),] -> sub_tpm
#Select top highly variable genes based on their Coefficient of Variation
sub_tpm[order(apply(sub_tpm, 1, sd, na.rm=TRUE) / apply(sub_tpm, 1, mean, na.rm=TRUE), decreasing = T),] %>% head(2000) -> sub_tpm.top.cv

sub_tpm %>% reshape2::melt() %>% ggplot(aes(variable, log(value+.01) )) + 
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## No id variables; using all as measure variables

#to only project condition, response and sublokalization since they are all from oropharynx region
ann_df<-sub_coldata[, c("Reponse", "Sublokalisation")]

pheatmap::pheatmap(cor(sub_tpm.top.cv), annotation_row = ann_df, annotation_col = ann_df)

#some formatting
#sub counts mtx:
sub_cts<- cts %>%
  dplyr::select(all_of(to_keep_tpm))
#add sublokalisation data into the post treatment samples:

head(sub_cts)

##                 P03_S_L1.LB20 P04_S_L1.LB4 P05_S_L2.LB9 P14_S_L1.LB19
## ENSG00000000003           304          370            6           321
## ENSG00000000419          4123         2391         4729          1924
## ENSG00000000457          1108          940         1191           916
## ENSG00000000460          1400          723         2279          1472
## ENSG00000000938           220          264          160           197
## ENSG00000000971         13821        13500         2121          5821
##                 P15_S_L2.LB11 P17_S_L1.LB7
## ENSG00000000003           344          511
## ENSG00000000419          4539         4412
## ENSG00000000457           948         1700
## ENSG00000000460          1992         3514
## ENSG00000000938           107          133
## ENSG00000000971          8671         2842

sub_dds <- DESeqDataSetFromMatrix(countData = sub_cts,
                                  colData = sub_coldata,
                                  design = ~Sublokalisation + Reponse)

## converting counts to integer mode

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

smallestGroupSize <- 3
keep <- rowSums(counts(sub_dds) >= 100) >= smallestGroupSize
sub_dds <- sub_dds[keep,]

sub_vsd <- vst(sub_dds, blind=FALSE)

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

sub_sampleDists <- dist(t(assay(sub_vsd)))

sub_sampleDistMatrix <- as.matrix(sub_sampleDists)
rownames(sub_sampleDistMatrix ) <- paste(sub_vsd$Condition, sub_vsd$PatientID, sep="-")
colnames(sub_sampleDistMatrix ) <- paste(sub_vsd$Condition, sub_vsd$PatientID, sep="-")
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

pheatmap(sub_sampleDistMatrix,
         clustering_distance_rows=sub_sampleDists,
         clustering_distance_cols=sub_sampleDists,
         col=colors)

plotPCA(sub_vsd, intgroup=c("Reponse"))

#plotPCA(sub_vsd, intgroup=c("PatientID"))+
 # geom_point(aes(shape=sub_vsd$Condition), size=3)+
  #scale_shape_manual(values = c(5,1))+
  #geom_line()
  #To project names:
  #geom_text(aes(label = PatientID))

sub_dds$Reponse <- factor(sub_dds$Reponse, levels = c("non_reponder", "high_responder"))
sub_dds$Response <- relevel(sub_dds$Reponse, "non_reponder")

sub_dds <- DESeq(sub_dds)

## estimating size factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## final dispersion estimates

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## fitting model and testing

rownames(sub_dds) <- geneSymbol.dic[rownames(sub_dds),"gene_name"]

sub_res <- results(sub_dds, alpha = 0.05)
#rownames(sub_res) <- geneSymbol.dic[rownames(sub_res),"gene_name"]
summary(sub_res)

## 
## out of 12710 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 26, 0.2%
## LFC < 0 (down)     : 25, 0.2%
## outliers [1]       : 2, 0.016%
## low counts [2]     : 0, 0%
## (mean count < 61)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#sub_res[order(res$padj, decreasing = F),] %>% head(350)
resultsNames(sub_dds)

## [1] "Intercept"                               
## [2] "Sublokalisation_Tonsil_vs_Base.of.tongue"
## [3] "Reponse_high_responder_vs_non_reponder"

EnhancedVolcano(sub_res,
                labSize = 2,
                lab = rownames(sub_res),
                x = 'log2FoldChange',
                y = 'pvalue',
                pCutoff = 0.05, 
                FCcutoff = 1)

sub_resOrdered <- sub_res[order(sub_res$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(sub_resOrdered)

sub_d <- plotCounts(sub_dds, gene=which.min(sub_res$padj), intgroup="Reponse", 
                returnData=TRUE)

ggplot(sub_d, aes(x=Reponse, y=count)) + 
  geom_point(position=position_jitter(w=0.1,h=0)) + 
  scale_y_log10(breaks=c(25,100,400))+ 
  ggtitle(geneSymbol.dic[rownames(sub_dds[which.min(sub_res$padj),]),])

sub

## standardGeneric for "sub" defined from package "base"
## 
## function (pattern, replacement, x, ignore.case = FALSE, perl = FALSE, 
##     fixed = FALSE, useBytes = FALSE) 
## standardGeneric("sub")
## <environment: 0x7fb1b231c950>
## Methods may be defined for arguments: pattern, replacement, x, ignore.case, perl, fixed, useBytes
## Use  showMethods(sub)  for currently available ones.

sub_sgenes <- sub_resOrdered[order(abs(sub_resOrdered$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()

sub_rld <- vst(sub_dds, blind=FALSE)

sub_de_mat <- assay(sub_rld)[sub_sgenes,]
pheatmap(t(scale(t(sub_de_mat))),show_rownames = T,show_colnames = T,annotation_col =ann_df)

write_csv(coldata, "hnn_metadata.csv",col_names = T)

New Metadata from Hendrik with T Stage, N Stage and Primary Endpoint

new_mtd<-readxl::read_excel("/Users/ebrukocakaya/HnN_Sven/hnn_metadata_v2.xlsx")
new_mtd

## # A tibble: 27 × 13
##    Condition      PatientID Reponse   percent_tumor_in_sam…¹ sampleID Group.Name
##    <chr>          <chr>     <chr>                      <dbl> <chr>    <chr>     
##  1 Pre-treatment  P01       non_repo…                     20 P01_S    Specimen 1
##  2 Post-treatment P01       non_repo…                     20 P01_SOC  NA        
##  3 Pre-treatment  P03       non_repo…                     60 P03_S    Specimen 2
##  4 Post-treatment P03       non_repo…                     20 P03_SOC  NA        
##  5 Pre-treatment  P04       high_res…                     20 P04_S    Specimen 3
##  6 Post-treatment P04       high_res…                      0 P04_SOC  NA        
##  7 Pre-treatment  P05       non_repo…                     40 P05_S    Specimen 4
##  8 Post-treatment P05       non_repo…                     35 P05_SOC  NA        
##  9 Pre-treatment  P06       non_repo…                     50 P06_S    Specimen 5
## 10 Post-treatment P06       non_repo…                     20 P06_SOC  NA        
## # ℹ 17 more rows
## # ℹ abbreviated name: ¹percent_tumor_in_sample
## # ℹ 7 more variables: Lokalisation <chr>, Sublokalisation <chr>,
## #   p16.Status <chr>, x.fold.CD8.GranB <chr>, TStage_Response <chr>,
## #   N_Stage_Response <chr>, Primary_Endpoint <chr>

dds_2<- DESeqDataSetFromMatrix(countData = cts,
                              colData = new_mtd,
                              design = ~ PatientID + Condition)

## converting counts to integer mode

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

keep_2 <- rowSums(counts(dds_2) >= 100) >= smallestGroupSize
dds_2 <- dds_2[keep_2,]

vsd_2 <- vst(dds_2, blind=FALSE)

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

sampleDists_2 <- dist(t(assay(vsd_2)))

sampleDistMatrix_2 <- as.matrix(sampleDists_2)
rownames(sampleDistMatrix_2) <- paste(vsd_2$Condition, vsd_2$PatientID, sep="-")
colnames(sampleDistMatrix_2) <- paste(vsd_2$Condition, vsd_2$PatientID, sep="-")


plotPCA(vsd_2, intgroup=c("Reponse"))+
  geom_label(aes(label=vsd_2$PatientID))

plotPCA(vsd_2, intgroup=c("TStage_Response"))+
  geom_label(aes(label=vsd_2$PatientID))

plotPCA(vsd_2, intgroup=c("N_Stage_Response"))+
  geom_label(aes(label=vsd_2$PatientID))

plotPCA(vsd_2, intgroup=c("Primary_Endpoint"))+
  geom_label(aes(label=vsd_2$PatientID))

dds_2$Condition <- factor(dds_2$Condition, levels = c("Post-treatment", "Pre-treatment"))
dds_2$Condition <- relevel(dds_2$Condition, "Pre-treatment")

dds_2 <- DESeq(dds_2)

## estimating size factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## final dispersion estimates

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## fitting model and testing

rownames(dds_2) <- geneSymbol.dic[rownames(dds_2),"gene_name"]
res_2 <- results(dds_2, alpha = 0.05)
#rownames(res) <- geneSymbol.dic[rownames(res),"gene_name"]
summary(res_2)

## 
## out of 14209 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1225, 8.6%
## LFC < 0 (down)     : 806, 5.7%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 17)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

resultsNames(dds_2)

##  [1] "Intercept"                                
##  [2] "PatientID_P03_vs_P01"                     
##  [3] "PatientID_P04_vs_P01"                     
##  [4] "PatientID_P05_vs_P01"                     
##  [5] "PatientID_P06_vs_P01"                     
##  [6] "PatientID_P07_vs_P01"                     
##  [7] "PatientID_P09_vs_P01"                     
##  [8] "PatientID_P12_vs_P01"                     
##  [9] "PatientID_P14_vs_P01"                     
## [10] "PatientID_P15_vs_P01"                     
## [11] "PatientID_P16_vs_P01"                     
## [12] "PatientID_P17_vs_P01"                     
## [13] "PatientID_P18_vs_P01"                     
## [14] "PatientID_P19_vs_P01"                     
## [15] "Condition_Post.treatment_vs_Pre.treatment"

EnhancedVolcano(res_2,
                labSize = 2,
                lab = rownames(res_2),
                x = 'log2FoldChange',
                y = 'pvalue',
                pCutoff = 0.05, 
                FCcutoff = 1)

resOrdered_2 <- res_2[order(res_2$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(resOrdered_2)

d_2 <- plotCounts(dds_2, gene=which.min(res_2$padj), intgroup="Condition", 
                returnData=TRUE)

#matched_data <- subset(d, PatientID %in% matching_patient_ids)
d_2$PatientID<-new_mtd$PatientID

ggplot(d_2, aes(x=Condition, y=count,group = PatientID, color = PatientID)) + 
  geom_point() + 
  scale_y_log10(breaks=c(25,100,400))+ 
  ggtitle(geneSymbol.dic[rownames(dds_2[which.min(res_2$padj),]),])+
  geom_line()

sgenes_2 <- resOrdered_2[order(abs(resOrdered_2$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()

rld_2 <- vst(dds_2, blind=FALSE)

de_mat_2 <- assay(rld_2)[sgenes_2,]


#Subset  metadata for the clustering:
new_heatmap_coldata<-new_mtd[,c("Condition","Reponse","Lokalisation","Sublokalisation", "TStage_Response", "N_Stage_Response", "Primary_Endpoint")]
new_heatmap_coldata<-as.data.frame(new_heatmap_coldata)
rownames(new_heatmap_coldata)<-rownames(heatmap_coldata)
pheatmap(t(scale(t(de_mat_2))),show_rownames = T,show_colnames = T,annotation_col =new_heatmap_coldata)

DEG Based on T-Stage-Response

When you compare the samples based on the T or N stage response, you can combine SD and PD as non-responder for an first insight. Merge SD + PD as NR in T Stage Response:

new_mtd <- new_mtd %>%
  mutate(`TStage_Response` = if_else( `TStage_Response` %in% c("SD", "PD"), "NR", `TStage_Response`))
table(new_mtd$`TStage_Response`)

## 
## NR  R 
## 10 17

#subset only pre values:
pre_cts<- cts %>%
  select(seq(1, ncol(.), by = 2))

pre_mtd <- new_mtd %>% 
  dplyr::filter(Condition == "Pre-treatment") %>% 
  dplyr::select(everything())

rownames(pre_mtd)<- colnames(pre_cts)

## Warning: Setting row names on a tibble is deprecated.

dds_tsr<- DESeqDataSetFromMatrix(countData = pre_cts,
                              colData = pre_mtd,
                              design = ~ Lokalisation + TStage_Response)

## converting counts to integer mode

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

keep_tsr<- rowSums(counts(dds_tsr) >= 100) >= smallestGroupSize
dds_tsr <- dds_tsr[keep_tsr,]

vsd_tsr <- vst(dds_tsr, blind=FALSE)

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

sampleDists_tsr <- dist(t(assay(vsd_tsr)))

sampleDistMatrix_tsr <- as.matrix(sampleDists_tsr)
rownames(sampleDistMatrix_tsr) <- paste(vsd_tsr$`T-Stage-Response`, vsd_tsr$PatientID, sep="-")
colnames(sampleDistMatrix_tsr) <- paste(vsd_tsr$`T-Stage-Response`, vsd_tsr$PatientID, sep="-")

heatmap(sampleDistMatrix_tsr,
         clustering_distance_rows=sampleDists_tsr,
         clustering_distance_cols=sampleDists_tsr,
         col=colors)

## Warning in plot.window(...): "clustering_distance_rows" is not a graphical
## parameter

## Warning in plot.window(...): "clustering_distance_cols" is not a graphical
## parameter

## Warning in plot.xy(xy, type, ...): "clustering_distance_rows" is not a
## graphical parameter

## Warning in plot.xy(xy, type, ...): "clustering_distance_cols" is not a
## graphical parameter

## Warning in title(...): "clustering_distance_rows" is not a graphical parameter

## Warning in title(...): "clustering_distance_cols" is not a graphical parameter

plotPCA(vsd_tsr, intgroup=c("TStage_Response"))+
  geom_label(aes(label=vsd_tsr$PatientID))

dds_tsr$Condition <- factor(dds_tsr$TStage_Response, levels = c("NR", "R"))
dds_tsr$Condition <- relevel(dds_tsr$TStage_Response, "R")

dds_tsr <- DESeq(dds_tsr)

## estimating size factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## final dispersion estimates

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## fitting model and testing

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## -- replacing outliers and refitting for 393 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## fitting model and testing

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

rownames(dds_tsr) <- geneSymbol.dic[rownames(dds_tsr),"gene_name"]

res_tsr <- results(dds_tsr, alpha = 0.05)
#rownames(sub_res) <- geneSymbol.dic[rownames(sub_res),"gene_name"]
summary(res_tsr)

## 
## out of 13551 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 6, 0.044%
## LFC < 0 (down)     : 7, 0.052%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 25)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

resultsNames(dds_tsr)

## [1] "Intercept"                          "Lokalisation_oral.cavity_vs_Larynx"
## [3] "Lokalisation_Oropharynx_vs_Larynx"  "TStage_Response_R_vs_NR"

EnhancedVolcano(res_tsr,
                labSize = 2,
                lab = rownames(res_tsr),
                x = 'log2FoldChange',
                y = 'pvalue',
                pCutoff = 0.05, 
                FCcutoff = 1)

resOrdered_tsr <- res_tsr[order(res_tsr$padj, decreasing = F),] %>% as.data.frame() %>% filter(padj < 0.05) %>% filter(abs(log2FoldChange) > 1.0 )
reactable::reactable(resOrdered_tsr)

d_tsr <- plotCounts(dds_tsr, gene=which.min(res_tsr$padj), intgroup="TStage_Response",
                returnData=TRUE)

ggplot(d_tsr, aes(x=TStage_Response, y=count)) + 
  geom_point(position=position_jitter(w=0.1,h=0)) + 
  scale_y_log10(breaks=c(25,100,400))+ 
  ggtitle(geneSymbol.dic[rownames(dds_tsr[which.min(res_tsr$padj),]),])

sgenes_tsr <- resOrdered_tsr[order(abs(resOrdered_tsr$log2FoldChange),decreasing = T),] %>% head(500) %>% rownames()

rld_tsr <- vst(dds_tsr, blind=FALSE)

heatmap_coldata_tsr<-as.data.frame(pre_mtd)
rownames(heatmap_coldata_tsr)<-colnames(pre_cts)
heatmap_coldata_tsr<-heatmap_coldata_tsr[,c("Reponse","Lokalisation","Sublokalisation","TStage_Response","N_Stage_Response","Primary_Endpoint")]

de_mat_tsr <- assay(rld_tsr)[sgenes_tsr,]
pheatmap(t(scale(t(de_mat_tsr))),show_rownames = F,show_colnames = T,annotation_col =heatmap_coldata_tsr)