But if you do get an error, read it carefully to see if it is simply that you need to load another package
You will need to ask for the login credentials for this step. Storing them in your .Renviron makes it easy, but also make sure that you are careful not to push them up to a remote repo
metadata_df = getMetadata(database_info$kn99_host,
database_info$kn99_db_name,
Sys.getenv("DB_USERNAME"),
Sys.getenv("DB_PASSWORD"))
# I do plan to make this part of the getMetaData() function, but it isn't yet
metadata_df$fastqFileName = str_remove(metadata_df$fastqFileName, ".fastq.gz")
raw_counts_df = getRawCounts(database_info$kn99_host,
database_info$kn99_db_name,
Sys.getenv("DB_USERNAME"),
Sys.getenv("DB_PASSWORD"))
# subset to only protein coding
raw_counts_df = raw_counts_df[1:6967,]subset_metadata = metadata_df %>%
# replace empty strings with NA
mutate_if(is.character, list(~na_if(.,""))) %>%
# replace NAs with string entry
mutate(perturbation1 = replace_na(perturbation1, "noPerturbation")) %>%
mutate(perturbation2 = replace_na(perturbation2, "noPerturbation")) %>%
mutate(treatment = replace_na(treatment, 'noTreatment')) %>%
mutate(otherConditions = replace_na(otherConditions, 'noOtherConditions')) %>%
mutate(medium = replace_na(medium, 'noMedium')) %>%
mutate(atmosphere = replace_na(atmosphere, 'noAtmosphere')) %>%
mutate(pH = replace_na(pH, 'noPh')) %>%
mutate(timePoint = as.integer(timePoint)) %>%
filter(medium %in% c("DMEM"),
temperature %in% c(37),
atmosphere %in% c("CO2"),
treatment %in% c("noTreatment"),
otherConditions %in% c("noOtherConditions"),
pH %in% c("noPh"),
timePoint %in% c(90),
strain != "TDY1993",
purpose=="fullRNASeq",
perturbation1 == "deletion" | perturbation1 == "noPerturbation",
!is.na(fastqFileName))
df = tibble(doubles = nrow(subset_metadata %>% filter(!is.na(genotype2))),
overexpression = nrow(subset_metadata %>% filter(perturbation1 == "over")),
singles = nrow(subset_metadata %>% filter(is.na(genotype2), perturbation1 == "deletion")),
wt = nrow(subset_metadata %>% filter(genotype1 == "CNAG_00000"))
)
print(df)since I’m not evaluating this code for the vignette, this is what it looks like. If you copy and paste this code (after getting the login credentials, you can just run the code directly).
doubles: 116
overexpression: 0 (purposefully filtered out due to linear dependence issue in model matrix) singles: 947
wt: 238
NOTE! this function currently casts the column names to uppers. I am going to be removing this – previous, it was necessary b/c there were sometimes the same columns with different formatting. That can’t happen anymore
passing_subset_metadata = qualityAssessmentFilter(subset_metadata)
passing_subset_metadata = droplevels(passing_subset_metadata)
df = tibble(doubles = nrow(passing_subset_metadata %>% filter(!is.na(GENOTYPE2))),
overexpression = nrow(passing_subset_metadata %>% filter(PERTURBATION1 == "over")),
singles = nrow(passing_subset_metadata %>% filter(is.na(GENOTYPE2), PERTURBATION1 == "deletion")),
wt = nrow(passing_subset_metadata %>% filter(GENOTYPE1 == "CNAG_00000"))
)
print(df)doubles: 73
overexpression: 0
singles: 658
wt: 179
passing_subset_metadata_formatted = passing_subset_metadata %>%
mutate(genotype = paste(passing_subset_metadata$GENOTYPE1,
passing_subset_metadata$GENOTYPE2,
sep="_")) %>%
mutate(genotype = str_remove(genotype, "_NA")) %>%
mutate(LIBRARYDATE = as.factor(LIBRARYDATE)) %>%
mutate(LIBRARYPROTOCOL = as.factor(LIBRARYPROTOCOL)) %>%
mutate(genotype = as.factor(genotype))
passing_subset_metadata_formatted$LIBRARYDATE = relevel(passing_subset_metadata_formatted$LIBRARYDATE,
ref=min(levels(passing_subset_metadata_formatted$LIBRARYDATE)))
passing_subset_metadata_formatted$genotype = relevel(passing_subset_metadata_formatted$genotype,
ref='CNAG_00000')
passing_subset_metadata_formatted = droplevels(passing_subset_metadata_formatted)# create list of high dispersion genes to filter out. Note: you'll need to get this from me for the time being
old_protocol_high_disp_genes = read_csv("~/projects/brentlab/90minuteInduction/datafreeze_20210528/data/old_protocol_high_disp_genes.csv")$gene_ids
new_protocol_high_disp_genes = read_csv("~/projects/brentlab/90minuteInduction/datafreeze_20210528/data/new_protocol_high_disp_genes.csv")$gene_ids
high_disp_genes = union(old_protocol_high_disp_genes, new_protocol_high_disp_genes)
full_gene_id_vector = read_csv("~/Desktop/rnaseq_pipeline/rnaseq_pipeline/genome_files/KN99/KN99_gene_id_list.txt", col_names = 'gene_id')$gene_id
full_gene_id_vector = full_gene_id_vector[1:6967]
fltr_passing_subset_counts = raw_counts_df[!full_gene_id_vector %in% high_disp_genes,fltr_passing_subset_metadata_formatted$FASTQFILENAME]dds = DESeqDataSetFromMatrix(countData = fltr_passing_subset_counts,
colData = fltr_passing_subset_metadata_formatted,
design = design_formula)Error in checkFullRank(modelMatrix) : the model matrix is not full rank, so the model cannot be fit as specified. One or more variables or interaction terms in the design formula are linear combinations of the others and must be removed. Please read the vignette section ‘Model matrix not full rank’: vignette(‘DESeq2’) 4. stop(“the model matrix is not full rank, so the model cannot be fit as specified.One or more variables or interaction terms in the design formula are linearcombinations of the others and must be removed.Please read the vignette section ‘Model matrix not full rank’:vignette(‘DESeq2’)”) 3. checkFullRank(modelMatrix) 2. DESeqDataSet(se, design = design, ignoreRank) 1. DESeqDataSetFromMatrix(countData = fltr_passing_subset_counts, colData = fltr_passing_subset_metadata_formatted, design = design_formula)
library(caret)
model_matrix = model.matrix(design_formula, fltr_passing_subset_metadata_formatted)
caret_output = findLinearCombos(model_matrix)$linearCombos $linearCombos[[1]] [1] 59 1 2 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
$remove [1] 59
# we are filtering out this: "LIBRARYDATE2021-05-06", which is a 'new protocol' library date. Note that a
# 'old protocol' date was also dropped
augment_model_matrix = model_matrix[, -59]
findLinearCombos(augment_model_matrix)$linearCombos list()
$remove NULL ## get protocol specific size factors
# note: i am going to re-do this function so it only gives back the size factors. for now, it is purpose built for the 90minuteInduction
size_factor_dds = deseqObjectWithProtocolSpecificSizeFactors(fltr_passing_subset_metadata_formatted, fltr_passing_subset_counts)
size_factor_df = tibble(FASTQFILENAME = names(sizeFactors(dds)), size_factors = sizeFactors(dds))
fltr_passing_subset_metadata_formatted_with_sizeFactors = left_join(fltr_passing_subset_metadata_formatted, size_factor_df, by='FASTQFILENAME')dds = DESeqDataSetFromMatrix(countData = fltr_passing_subset_counts,
colData = fltr_passing_subset_metadata_formatted_with_sizeFactors,
design = augment_model_matrix)
output_dir = "/mnt/htcf_scratch/chasem/rnaseq_pipeline/experiments/20210610/frankenstein_set"
dir.create(output_dir, recursive=TRUE)
write_rds(dds, file.path(output_dir,"input_dds_before_rle.rds"))output_dir = "/mnt/htcf_scratch/chasem/rnaseq_pipeline/experiments/20210610/frankenstein_set"
filename = "deseq_output.rds"
dds = readRDS(file.path(output_dir, filename))
metadata_df = as_tibble(colData(dds)) replicate_split = as_tibble(colData(dds)) %>%
group_by(genotype) %>%
group_split()
replicate_names = lapply(replicate_split, function(x) as.character(unique(pull(x,genotype))))
replicate_sample_list = lapply(replicate_split, function(x) as.vector(pull(x,FASTQFILENAME)))
names(replicate_sample_list) = replicate_names
norm_counts_rle_by_replicate_group = rleByReplicateGroup(replicate_sample_list,
norm_counts,
log2_transformed_flag = FALSE)
norm_counts_rle_summary = bind_rows(lapply(norm_counts_rle_by_replicate_group, rleSummary)) %>%
left_join(metadata_df, by='FASTQFILENAME')
removed_effect_rle_by_replicate_group = rleByReplicateGroup(replicate_sample_list,
removed_librarydate_effect_log_norm_counts,
log2_transformed_flag = TRUE)
removed_effect_rle_summary = bind_rows(lapply(removed_effect_rle_by_replicate_group, rleSummary)) %>%
left_join(metadata_df, by='FASTQFILENAME')
lapply(high_iqr_genotype_groups, function(x) rlePlotCompareEffectRemoved(norm_counts_rle_by_replicate_group[[x]],
removed_effect_rle_by_replicate_group[[x]],
metadata_df,
x))
subset_samples = metadata_df %>% filter(LIBRARYDATE == '2013-04-17', GENOTYPE == "CNAG_00000") %>% pull(FASTQFILENAME)
rlePlotCompareEffectRemoved(norm_counts_rle_by_replicate_group$CNAG_00000[,subset_samples],
removed_effect_rle_by_replicate_group$CNAG_00000[,subset_samples],
metadata_df,
'cnag_00000')iqr_fltr_meta = fltrLowReplicateParams(iqr_fltr_rle_summary, design(dds))
View(colSums(model.matrix(design(dds), iqr_fltr_meta)))note: if samples are removed, it is likely that you’ll need to check for linear dependencies in the model matrix again
# note: i am going to re-do this function so it only gives back the size factors. for now, it is purpose built for the 90minuteInduction
size_factor_dds = deseqObjectWithProtocolSpecificSizeFactors(iqr_fltr_rle_summary, fltr_passing_subset_counts)
size_factor_df = tibble(FASTQFILENAME = names(sizeFactors(dds)), size_factors = sizeFactors(dds))
fltr_passing_subset_metadata_formatted_with_sizeFactors = left_join(fltr_passing_subset_metadata_formatted, size_factor_df, by='FASTQFILENAME')revised_dds = DESeqDataSetFromMatrix(countData = iqr_fltr_counts,
colData = iqr_fltr_meta,
design = # note -- this may need to be re-done after iqr filtering)
# write_rds(revised_dds, "/mnt/htcf_scratch/chasem/rnaseq_pipeline/experiments/20210605/other_regulators/after_rle/other_regulators_after_rle_deseq_input.rds")library(sva)
library(BiocParallel)
MulticoreParam(3)
sva_func = function(num_sv){
svaseq(as.matrix(iqr_fltr_counts),
mod=# note -- this may need to be re-done after iqr filtering,
mod0=# note -- this may need to be re-done after iqr filtering,
n.sv=num_sv)
}
# note: the first 1-58 is intercept to the last batch index
#sva_output_list = bplapply(1:3, sva_func) --> when trying, kept getting error about computationally singular matrix
# when trying without specifying n.sv, the estimate was 22 SV
sv_1 = sva_func(1)
sv_2 = sva_func(2)
sv_3 = sva_func(3)# dds_list = list(
# dds_0 = DESeqDataSetFromMatrix(countData = fltr_passing_subset_counts,
# colData = fltr_passing_subset_metadata_formatted,
# design = augment_model_matrix),
# dds_1 = DESeqDataSetFromMatrix(countData = fltr_passing_subset_counts,
# colData = fltr_passing_subset_metadata_formatted,
# design = augment_model_matrix_sv1),
# dds_2 = DESeqDataSetFromMatrix(countData = fltr_passing_subset_counts,
# colData = fltr_passing_subset_metadata_formatted,
# design = augment_model_matrix_sv2)
# dds_3
#
# )