Analysis of log-FC BGC vs noBCG

The problem

Compare log-FC between two treatment group. FC is between Stimulated and Non-stimulated responses from each animal n=0.

Inputs

Two inputs are are:

log-FC-CPM table with 16 columns and G genes
design matrix with 16 rows that indicates what is the treatment of each column of the log-FC table

# read and extract treatment label from column names

setwd("C:/Users/marti/OneDrive/Documents/Penny/")
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(DT)
expression<-read_delim("lcpm.summary.table.lcpm.LPS.subtract.lcpm.NS.BCG.noBCG.16.columns.14889.genes.geneid.ordered.txt")

Rows: 14889 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (1): Gene
dbl (16): N_4, N_12, N_14, N_16, N_18, N_20, N_6, N_8, B_1, B_7, B_9, B_3, B...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

expression<-group_by(expression,Gene)
trt<-colnames(expression)[-1]%>%strsplit(split = "_")%>%sapply(.,function(x) x[1])%>%as.factor()
trt

 [1] N N N N N N N N B B B B B B B B
Levels: B N

Now the clasification criteria read from a file and checked against the one extracted from the column names of expression matrix

BCG<-read_delim("BCG.df.used.as.design.matrix.model.for.lcpm.summary.table.lcpm.LPS.subtract.lcpm.NS.BCG.noBCG.16.columns.14889.genes.geneid.ordered.txt")

Rows: 16 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): BCG
dbl (1): row

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

BCG<-BCG[,-1]
BCG

# A tibble: 16 × 1
   BCG     
   <chr>   
 1 IV_noBCG
 2 IV_noBCG
 3 IV_noBCG
 4 IV_noBCG
 5 IV_noBCG
 6 IV_noBCG
 7 IV_noBCG
 8 IV_noBCG
 9 IV_lvBCG
10 IV_lvBCG
11 IV_lvBCG
12 IV_lvBCG
13 IV_lvBCG
14 IV_lvBCG
15 IV_lvBCG
16 IV_lvBCG

length(as.vector(BCG[[1]]))

[1] 16

table(as.vector(BCG[[1]]),as.vector(trt))

          
           B N
  IV_lvBCG 8 0
  IV_noBCG 0 8

Everything checks, we can use either one.

prepare functions for analysis

expression

# A tibble: 14,889 × 17
# Groups:   Gene [14,889]
   Gene      N_4      N_12   N_14      N_16   N_18   N_20       N_6       N_8
   <chr>   <dbl>     <dbl>  <dbl>     <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
 1 A2ML1   0.980 -3.00e-15  0      8.16e- 1  2.42   0     -3.00e-15 -3.00e-15
 2 AADAC  -1.15  -5.67e- 1  0      3.00e-15  0.822  0     -3.00e-15  2.01e+ 0
 3 AAMDC  -0.448 -1.87e+ 0 -1.95  -9.86e- 1 -1.91   1.23  -1.06e+ 0 -1.15e+ 0
 4 AARS1   1.05  -1.64e+ 0 -1.39  -6.04e- 1 -0.254 -0.244  8.79e- 1 -3.39e+ 0
 5 AARSD1 -0.455 -7.44e- 1 -1.50   6.42e- 1 -2.32  -1.76   8.76e- 1 -3.66e+ 0
 6 ABCA2  -2.45   2.01e+ 0  1.22   2.34e- 1  1.46  -0.258 -8.42e- 1  1.67e+ 0
 7 ABCB1   1.43   2.97e+ 0  1.73   3.38e- 1  0.904  2.09  -1.72e- 1 -1.26e- 1
 8 ABCB4   0     -3.00e-15  0.658 -1.76e+ 0  0      0     -3.00e-15 -2.22e+ 0
 9 ABCC4  -0.818  2.74e- 2 -0.939 -1.97e+ 0 -1.26  -0.108 -9.20e- 1 -1.11e+ 0
10 ABCF2  -1.91  -3.18e+ 0 -2.55  -2.21e+ 0 -0.873 -1.48  -9.76e- 1 -2.27e+ 0
# ℹ 14,879 more rows
# ℹ 8 more variables: B_1 <dbl>, B_7 <dbl>, B_9 <dbl>, B_3 <dbl>, B_11 <dbl>,
#   B_13 <dbl>, B_17 <dbl>, B_19 <dbl>

#reformat, against tidyverse good practices, but it's more conveniente for traditional R programming.

GE<-as.matrix(expression[,-1])
dim(GE)

[1] 14889    16

genenames<-expression[,1]
length(genenames[[1]])

[1] 14889

rownames(GE)<-genenames[[1]]

#just a dry run with a single gene
tt<-t.test(GE[1,]~trt,var.equal = TRUE)

tt


    Two Sample t-test

data:  GE[1, ] by trt
t = 0.11986, df = 14, p-value = 0.9063
alternative hypothesis: true difference in means between group B and group N is not equal to 0
95 percent confidence interval:
 -1.003294  1.122071
sample estimates:
mean in group B mean in group N 
      0.5865607       0.5271721

tt$statistic

        t 
0.1198626

tt$estimate

mean in group B mean in group N 
      0.5865607       0.5271721

tt$parameter

df 
14

tt$stderr

[1] 0.4954725

t_test_ge<-function(exp_row,group=trt,homosk=TRUE){
tt<-t.test(exp_row~group,var.equal = homosk)
rest<-c(tt$estimate[1],tt$estimate[2],
                 dif=tt$estimate[1]-tt$estimate[2],
                 df=tt$parameter,SE=tt$stderr,
                 t_test=tt$statistic,p_value=tt$p.value)

return(rest)

}

t_test_ge(GE[1,])

    mean in group B     mean in group N dif.mean in group B               df.df 
         0.58656071          0.52717208          0.05938863         14.00000000 
                 SE            t_test.t             p_value 
         0.49547251          0.11986261          0.90629508

top_table<-apply(GE[1:3,],1,t_test_ge)%>%(t)
top_table

      mean in group B mean in group N dif.mean in group B df.df        SE
A2ML1       0.5865607       0.5271721          0.05938863    14 0.4954725
AADAC       0.1864595       0.1399737          0.04648578    14 0.3556881
AAMDC      -0.2649282      -1.0181096          0.75318135    14 0.8105920
       t_test.t   p_value
A2ML1 0.1198626 0.9062951
AADAC 0.1306925 0.8978779
AAMDC 0.9291745 0.3685433

Now apply it to all the table and format the output.

Homoskedastic analyses

top_table<-apply(GE,1,t_test_ge)%>%(t)
top_table<-as.data.frame(top_table)

report_table<-mutate(top_table,gene=rownames(top_table),p_adjust=p.adjust(p_value,method="BH"))
rownames(report_table)<-NULL
report_table<-report_table%>%relocate(gene)%>%arrange(p_value)


datatable(report_table)

Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Heteroskedastic analyses

top_table2<-apply(GE,1,t_test_ge,homosk=FALSE)%>%(t)
top_table2<-as.data.frame(top_table2)

report_table2<-mutate(top_table2,gene=rownames(top_table2),p_adjust=p.adjust(p_value,method="BH"))
rownames(report_table2)<-NULL
report_table2<-report_table2%>%relocate(gene)%>%arrange(p_value)


datatable(report_table2)

Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html