library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##enter in the data for gene expression in the brain of a 90yo woman with alzheimers
genequal <- read.table("ENCFF166SFX.tsv", header = TRUE)
#FPKM is normalized reads
##create a plot to show the frequency distribution of all the gene lengths
ggplot(genequal, aes(x=length)) + geom_histogram(binwidth = 50)

##create another plot to show the frequency distribution of gene lengths, filtered to only show gene lengths that are less than 10,000.
normalLength <- genequal |>
filter(length<10000)
ggplot(normalLength, aes(x=length))+
geom_histogram(binwidth = 1000, fill = "pink")

##I tried to add the gene names in but cant figure out how to make it work, maybe you can take a look and see what I am missing?
#BiocManager::install("biomaRt")
library(biomaRt)
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
gene_annotations <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
filters = "ensembl_gene_id",
values = genequal$gene_id,
mart = ensembl)
#attempt to merge the gene names with the ensembl ID (did not work)
genequalannotation <- merge(genequal, gene_annotations,
by.x = "gene_id", by.y = "ensembl_gene_id",
all.x = TRUE)
#create a plot showing the genes with the highest normalized expression for each gene ID
highex <- genequal |>
filter(FPKM>5000)
ggplot(highex,aes(x=gene_id, y=FPKM, fill=FPKM, color = FPKM))+
geom_col()+
scale_fill_gradient(low = "cyan", high = "magenta") +
scale_color_gradient(low = "cyan", high = "magenta") +
theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1))

#create a linear regression to show the relation of TPM to FPKM using the filtered normalized gene lengths
regression <- lm(formula = TPM ~ FPKM, data = normalLength)
Coeff <- coefficients(regression)
intl <-Coeff[1]
slo <- Coeff[2]
ggplot(normalLength,aes(x=FPKM, y=TPM))+
geom_point(position = "jitter", alpha = 0.5)+
geom_abline(intercept = intl, slope = slo, color = "coral", linewidth = 2, alpha=0.2)
