2x2 tables and mosaic plots for top Fisher’s tests of strength of correlation (RPs and nucleolar genes) and RP location

Author: Charleen D. Adams

Abbreviations and definitions

Gene-Tissue Expression (GTex) Project
Ribosomal protein (RP)
Transcripts per million (TPM)

Background

Below are the 2x2 tables and mosaic plots for the top findings from the analysis of the dichotomized absolute strength of correlation (RPs and nucleolar genes) and location (mitochondrial vs cytosolic) – in breast

Topic mosaics

##############################################################################
#### 2x2 tables and mosaic plots to look at top findings
##############################################################################
#use 'mydata4' since it has the binary correlations and the location variable
library(vcd)
library(corrplot)
library(graphics)

setwd("/n/home04/cdadams/TPM/")
mydata4=read.csv('data_for_top_mosaics_7-12.csv')
#head(colnames(mydata4))
mydata4$X <- NULL

#### IMP3_dich
tbl_IMP3_dich=table(mydata4$IMP3_dich, mydata4$location)
tbl_IMP3_dich

##         
##          cyto mito
##   <0.50    52   26
##   >=0.50    2   47

#chisq.test(tbl_IMP3_dich)
#fisher.test(tbl_IMP3_dich)
mosaicplot(tbl_IMP3_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & IMP3) & location", 
           type = c("pearson"))

assoc(tbl_IMP3_dich, shade = TRUE, main="Strength of correlation (RP & IMP3) & location")

#### POP5_dich
tbl_POP5_dich=table(mydata4$POP5_dich, mydata4$location)
tbl_POP5_dich

##         
##          cyto mito
##   <0.50    49   27
##   >=0.50    2   47

#chisq.test(tbl_POP5_dich)
#fisher.test(tbl_POP5_dich)
mosaicplot(tbl_POP5_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & POP5) & location", 
           type = c("pearson"))

assoc(tbl_POP5_dich, shade = TRUE, main="Strength of correlation (RP & POP5) & location")

#### TXN2_dich 
tbl_TXN2_dich=table(mydata4$TXN2_dich , mydata4$location)
tbl_TXN2_dich

##         
##          cyto mito
##   <0.50    46   24
##   >=0.50    4   50

#chisq.test(tbl_TXN2_dich)
#fisher.test(tbl_TXN2_dich)
mosaicplot(tbl_TXN2_dich , shade = TRUE, las=2,
           main = "Strength of correlation (RP & TXN2) & location", 
           type = c("pearson"))

assoc(tbl_TXN2_dich , shade = TRUE, main="Strength of correlation (RP & TXN2) & location")

#### RPL26L1_dich
tbl_RPL26L1_dich=table(mydata4$RPL26L1_dich, mydata4$location)
tbl_RPL26L1_dich

##         
##          cyto mito
##   <0.50    49   30
##   >=0.50    3   42

#chisq.test(tbl_RPL26L1_dich)
#fisher.test(tbl_RPL26L1_dich)
mosaicplot(tbl_RPL26L1_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & RPL26L1) & location", 
           type = c("pearson"))

assoc(tbl_RPL26L1_dich, shade = TRUE, main="Strength of correlation (RP & RPL26L1) & location")

#### GOLGA3_dich
tbl_GOLGA3_dich=table(mydata4$GOLGA3_dich, mydata4$location)
#chisq.test(tbl_GOLGA3_dich)
#fisher.test(tbl_GOLGA3_dich)
mosaicplot(tbl_GOLGA3_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & GOLGA3) & location", 
           type = c("pearson"))

assoc(tbl_GOLGA3_dich, shade = TRUE, main="Strength of correlation (RP & GOLGA3) & location")

#### XPO6_dich
tbl_XPO6_dich=table(mydata4$XPO6_dich, mydata4$location)
tbl_XPO6_dich

##         
##          cyto mito
##   <0.50    26   49
##   >=0.50   35    4

#chisq.test(tbl_XPO6_dich)
#fisher.test(tbl_XPO6_dich)
mosaicplot(tbl_XPO6_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & XPO6) & location", 
           type = c("pearson"))

assoc(tbl_XPO6_dich, shade = TRUE, main="Strength of correlation (RP & XPO6) & location")

TL;DR Full code

#### TPM
# Nucleolar and nucleolus searches were done in the protein atlas: 
# https://www.proteinatlas.org/humanproteome/tissue
# This provided lists of genes expression inthe nucleolus
# Also in doing this, I found more RP genes than were in the list I'd used previously.
# I reran the correlation analysis to include the expanded RP list against all 
# proteins in the nucleolus, as well as the ESR1, ESR2, ESRRA, ESRRB, ESRRG, MYC, and TP53

setwd("/n/home04/cdadams/TPM/")
(.packages())
#### Packages used in this script but also for the entire GTEx RP project
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
library(RBGL)
library(graph)
library(devtools)
library(Vennerable)
library(limma)
library(broom)
library(rstatix)
library(pastecs)
library(statmod)
library(stringr)
library(PerformanceAnalytics)
library(taRifx)
library(Hmisc)
library(EnsDb.Hsapiens.v86)
library(SNPlocs.Hsapiens.dbSNP144.GRCh38)
library(hyprcoloc)
library(e1071)
library(rcompanion)
library(car)
library(sqldf)
library(kableExtra)
library(gplots)
library(graphics)
library(gridExtra)
library(GGally)
library(CCA)
library(lme4)
library(CCP)
library(qqman)
library(dplyr)
library(tidyr)
library(EnsDb.Hsapiens.v79) 
library(biomaRt)
library(SNPlocs.Hsapiens.dbSNP.20120608)
library(data.table)
library(DataCombine)
library(Cairo)
library(devtools)
library(RColorBrewer)
library(MRInstruments) 
library(TwoSampleMR)
library(RadialMR)
library(tidyverse)
library(packcircles)
library(ggplot2)
library(viridis)
library(igraph)
library(CMplot)
library(oncofunco)
library(metaviz)
library(vcd)
library(corrplot)
library(CePa)
library(Biobase)
library(caret)
library(GO.db)
library(edgeR)
library(limma)
library(Glimma)
library(biomaRt)
library(annotate)
library(genefu)
library(xtable)
library(rmeta)
library(Biobase)
library(org.Hs.eg.db)

#### Read-in the TPM data
tpm=read.gct('GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm3gct')
rna=as.data.frame(tpm)
rna$GENEID=rownames(rna)

#### Read-in the TPM data (previously formatted and saved as a .csv)
#rna=read.csv('rna.csv') #deleted must re-transfer to cluster

#### Remove the version numbers at the end of the ensembl names
rna$GENEID <- gsub('\\..+$', '', rna$GENEID)

#### Fetch the gene names: convert from ensembl.gene to gene.symbol
ensembl.genes=as.character(rna$GENEID)
geneIDs1 <- ensembldb::select(EnsDb.Hsapiens.v79, keys= ensembl.genes, keytype = "GENEID", columns = c("SYMBOL","GENEID"));

#### Merge the gene names 
rna <- merge(x = geneIDs1, 
             y = rna, 
             by.x="GENEID",
             by.y="GENEID")

which(rna$GENEID=='ENSG00000273716')
which(rna$GENEID=='ENSG00000279483')

#### Remove the two RPL41 pseudogenes: "ENSG00000273716" "ENSG00000279483"
myData <- rna[-c(51625, 54471), ]

#### Read-in the set of nucleolar genes
nuc=read.csv('nucelolar.csv')
nuc2=nuc %>% distinct(nuc, .keep_all = TRUE)

#### Read-in the set of RPs from the Protein Atlas
expand=read.csv('expanded_verified.csv')#verified
dim(expand)
colnames(nuc2)[1]="all_RP"
vars=rbind(nuc2, expand)
vars2=vars %>% distinct(all_RP, .keep_all = TRUE)

#### Add the select cancer genes
lemos_vars=0
lemos_vars$all_RP=c("MYC", "TP53", "MKI67", "ESR1", "ESR2","ESRRA", "ESRRB", "ESRRG" )
lemos_vars=as.data.frame(lemos_vars)
lemos_vars$X0=NULL

#### Bind gene list together
complete_vars=rbind(lemos_vars, vars2)

complete_vars <- complete_vars[order(complete_vars$all_RP),]
complete_vars=as.data.frame(complete_vars)

complete_vars %>% n_distinct(complete_vars, .keep_all=TRUE)
complete_vars$complete_vars=as.character(complete_vars$complete_vars)

#### Select just the genes in the gene list: complete_vars
rps=myData[which(myData$SYMBOL %in% complete_vars$complete_vars),]

#rm(list=setdiff(ls(), c("rna", "rps", "complete_vars", "myData")))
rps2=rps
length(rps$SYMBOL)

rps3=rps2 %>% distinct(SYMBOL, .keep_all = TRUE)
rownames(rps3)=rps3$SYMBOL
dim(rps3)
head(colnames(rps3))
#write.csv(rps3, 'rps3_6-24.csv')

#### Remve GENEID, SYMBOL, and X from the colnames
rps4=rps3[,-1] 
rps4=rps4[,-1] 
rps4=rps4[,-1]

#dat <- japply( rps5, which(sapply(rps5, class)=="factor"), as.numeric )

#### Get just the breast cancer data 
read.url <- function(url, ...){
  tmpFile <- tempfile()
  download.file(url, destfile = tmpFile, method = "curl")
  url.data <- fread(tmpFile, ...)
  return(url.data)
}

samples_v8=read.url("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt")
#write.csv(samples_v8, 'samples_v8.csv')

breast=samples_v8[which(samples_v8$SMTSD=="Breast - Mammary Tissue"),]
breast$SAMPID <- gsub("\\-", ".", breast$SAMPID)

rps5=t(rps4)
rps6=as.data.frame(rps5)

breast_rp=rps6[which(rownames(rps6) %in% breast$SAMPID),]
breast_rp6=t(breast_rp)

#write.csv(breast_rp6, "breast_rp_6-24.csv")

# Which values in TPM are greater than 0.5?
thresh <- breast_rp6 > 0.5

# This produces a logical matrix with TRUEs and FALSEs
head(thresh)

# Summary of how many TRUEs there are in each row
table(rowSums(thresh))

# we would like to keep genes that have at least 2 TRUES in each row of thresh
keep <- rowSums(thresh) >= 2

# Subset the rows of rna2 to keep the more highly expressed genes
counts.keep <- breast_rp6[keep,]

dgeObj <- DGEList(counts.keep)

# have a look at dgeObj
dgeObj

# See what slots are stored in dgeObj
names(dgeObj)

# Library size information is stored in the samples slot
dgeObj$samples

# Get log2 counts per million
logcounts <- cpm(dgeObj,log=TRUE)

#####################################################################
#### Correlation matrix for RMD table
#####################################################################
logs2=t(logcounts)
logs2=as.data.frame(logs2)
logs2=logs2[,order(colnames(logs2))]
dim(logs2)
write.csv(logs2, 'logs2_6_28.csv')

expand=read.csv('expanded_verified.csv')#verified
dim(expand)
logs2=read.csv('logs2_6_28.csv')
str(logs2)
dim(logs2)
head(colnames(logs2))
head(logs2$X)
head(rownames(logs2))
rownames(logs2) <- logs2$X
logs2$X <- NULL
dim(logs2)

res3 <- rcorr(as.matrix(logs2))
str(res3)

#Get the pvals
cor.gather <- res3 %>% cor_gather()
dim(cor.gather)
colnames(cor.gather)
rownames(cor.gather)=cor.gather$var2
#write.csv(cor.gather, "cor.gather_6-28.csv")

#Get just the RPs in var2
#cor.gather=as.data.frame(cor.gather)
cor.gather_RP=cor.gather[which(cor.gather$var2 %in% expand$all_RP),]
dim(cor.gather_RP)
str(cor.gather_RP)
#cor.gather1<-cor.gather[grepl("RPL", cor.gather$var2),]

#Keep only the non-RP nucleolar & cancer genes in var1
'%ni%' <- Negate('%in%')
cor.gather_RP_not=cor.gather_RP[which(cor.gather_RP$var1 %ni% expand$all_RP),]
dim(cor.gather_RP_not)

cor.gather_RP_not_uniq_var1 <- unique(cor.gather_RP_not$var1)
length(cor.gather_RP_not_uniq_var1)

cor.gather_RP_not_uniq_var2 <- unique(cor.gather_RP_not$var2)
length(cor.gather_RP_not_uniq_var2)

bf_thresh=0.05/(1325*150)
bf_thresh
#write.csv(cor.gather_RP_not, 'cor.gather_RP_not_6_28.csv')

head(cor.gather_RP_not[1:4], n=50)
cor.gather_RP_not=cor.gather_RP_not[order(-cor.gather_RP_not$cor),]
dim(cor.gather_RP_not)

bf <- cor.gather_RP_not[ which(cor.gather_RP_not$p<2.515723e-07),]
str(bf)
dim(bf)

#write.csv(bf, 'bf_6-28.csv')

#Remove the pvalues
bf$p <- NULL

#Rename the variables
colnames(bf)[1] <- "gene"
colnames(bf)[2] <- "RP"

wide=xtabs(cor ~ RP + gene, data = bf)

wide2=as.matrix(wide)
colnames(wide2)
rownames(wide2)
str(wide2)
wide_final=as.data.frame(wide2)

#Change the 0's to NA
is.na(wide_final) <- !wide_final
head(wide_final[2:2])

write.csv(wide_final, 'wide_final_6-28.csv')

#Get the absolute values
dt_abs <- lapply(wide_final, abs)

dt_abs2=as.data.frame(dt_abs)
str(dt_abs2)

rownames(dt_abs2)=rownames(wide_final)

#Transform to a tibble to create the dichotomized variables
mydata=as_tibble(dt_abs2)
mydata2 = mutate_all(mydata, funs("dich"= ifelse(.>=.50, '>=0.50', '<0.50')))
mydata2=as.data.frame(mydata2)
rownames(mydata2)=rownames(dt_abs2)
colnames(mydata2)
#write.csv(mydata2, 'mydata2_strength_6_28.csv')

#Create the cell location variable
mydata2$location=0
mydata2$location=ifelse(startsWith(rownames(mydata2), 'RP'), 'cyto', 'mito')
dim(mydata2)
colnames(mydata2)[2597]
colnames(mydata2)[1298]
colnames(mydata2)[1299]

#Select just the dichotomized variables and the location variable
mydata3= mydata2[c(1299:2597)]

# Convert all the variables in factors for the chi-squared tests
for(i in 1:ncol(mydata3)){
  mydata3[,i] <- as.factor(mydata3[,i])
}

# Remove data with fewer than two levels
mydata4=mydata3[, sapply(mydata3, nlevels) > 1]
dim(mydata4)
head(mydata4[,2])
tail(colnames(mydata4))
rownames(mydata4)
dim(mydata4)
mydata4$location=0
mydata4$location=ifelse(startsWith(rownames(mydata2), 'RP'), 'cyto', 'mito')

colnames(mydata4)[1082]
str(mydata4)

fishers=data.frame(lapply(mydata4[,-1082], 
  function(x) fisher.test(table(x,mydata4$location), 
  simulate.p.value = TRUE)$p.value), B = 10000)

str(fishers)
#fishers.observed=data.frame(lapply(mydata3[,-1476], function(x) chisq.test(table(x,mydata3$location), simulate.p.value = TRUE)$observed))
#write.csv(fishers,'fishers_6-28.csv')

#Transpose the data to get the simulated Monte Carlo p-values in a column
fishers2=t(fishers)
fishers2=as.data.frame(fishers2)
colnames(fishers2)[1]='Pval'

fishers2$gene=rownames(fishers2)
myvars=rownames(dt_abs2)
fishers2$gene=str_remove_all(fishers2$gene, "[_dich]")
dim(fishers2)

#To get the proper FDR adjusted list, remove the RPs
fishers3=fishers2[which(!fishers2$gene %in% myvars),] 
dim(fishers3)

fishers3 <- fishers3[order(fishers3$Pval),]
fishers3$fdr=p.adjust(fishers3$Pval, 'fdr')
fishers3_fdr=fishers3[which(fishers3$fdr<0.05),]
dim(fishers3_fdr)
fishers3_fdr
head(fishers3)

#write.csv(fishers3, 'fishers_6-28.csv')
write.csv(fishers3_fdr, 'fishers3_fdr_7-10.csv')
write.csv(mydata4, 'data_for_top_mosaics_7-12.csv')

##############################################################################
#### 2x2 tables and mosaic plots to look at top findings
##############################################################################
#use 'mydata4' since it has the binary correlations and the location variable

setwd("/n/home04/cdadams/TPM/")
mydata5=read.csv('data_for_top_mosaics_7-12.csv')
head(colnames(mydata4))
mydata4$X <- NULL

#### IMP3_dich
tbl_IMP3_dich=table(mydata4$IMP3_dich, mydata4$location)
tbl_IMP3_dich
chisq.test(tbl_IMP3_dich)
fisher.test(tbl_IMP3_dich)
mosaicplot(tbl_IMP3_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & IMP3) & location", 
           type = c("pearson"))
assoc(tbl_IMP3_dich, shade = TRUE, main="Strength of correlation (RP & IMP3) & location", compress=FALSE)

#### POP5_dich
tbl_POP5_dich=table(mydata4$POP5_dich, mydata4$location)
tbl_POP5_dich
chisq.test(tbl_POP5_dich)
fisher.test(tbl_POP5_dich)
mosaicplot(tbl_POP5_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & POP5) & location", 
           type = c("pearson"))
assoc(tbl_POP5_dich, shade = TRUE, main="Strength of correlation (RP & POP5) & location")

#### TXN2_dich 
tbl_TXN2_dich=table(mydata4$TXN2_dich , mydata4$location)
tbl_TXN2_dich
chisq.test(tbl_TXN2_dich)
fisher.test(tbl_TXN2_dich)
mosaicplot(tbl_TXN2_dich , shade = TRUE, las=2,
           main = "Strength of correlation (RP & TXN2) & location", 
           type = c("pearson"))
assoc(tbl_TXN2_dich , shade = TRUE, main="Strength of correlation (RP & TXN2) & location")

#### RPL26L1_dich
tbl_RPL26L1_dich=table(mydata4$RPL26L1_dich, mydata4$location)
tbl_RPL26L1_dich
chisq.test(tbl_RPL26L1_dich)
fisher.test(tbl_RPL26L1_dich)
mosaicplot(tbl_RPL26L1_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & RPL26L1) & location", 
           type = c("pearson"))
assoc(tbl_RPL26L1_dich, shade = TRUE, main="Strength of correlation (RP & RPL26L1) & location")

#### GOLGA3_dich
tbl_GOLGA3_dich=table(mydata4$GOLGA3_dich, mydata4$location)
chisq.test(tbl_GOLGA3_dich)
fisher.test(tbl_GOLGA3_dich)

mosaicplot(tbl_GOLGA3_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & GOLGA3) & location", 
           type = c("pearson"))

assoc(tbl_GOLGA3_dich, shade = TRUE, main="Strength of correlation (RP & GOLGA3) & location")

#### XPO6_dich
tbl_XPO6_dich=table(mydata4$XPO6_dich, mydata4$location)
tbl_XPO6_dich
chisq.test(tbl_XPO6_dich)
fisher.test(tbl_XPO6_dich)
mosaicplot(tbl_XPO6_dich, shade = TRUE, las=2,
           main = "Strength of correlation (RP & XPO6) & location", 
           type = c("pearson"))
assoc(tbl_XPO6_dich, shade = TRUE, main="Strength of correlation (RP & XPO6) & location")

2x2 tables and mosaic plots for top Fisher’s tests of strength of correlation (RPs and nucleolar genes) and RP location

2020-July-12

Abbreviations and definitions

Background

Topic mosaics

TL;DR Full code