Encode Expression

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

##enter in the data for gene expression in the brain of a 90yo woman with alzheimers 
genequal <- read.table("ENCFF166SFX.tsv", header = TRUE) 
#FPKM is normalized reads

##create a plot to show the frequency distribution of all the gene lengths 
ggplot(genequal, aes(x=length)) + geom_histogram(binwidth = 50)

##create another plot to show the frequency distribution of gene lengths, filtered to only show gene lengths that are less than 10,000. 
normalLength <- genequal |>
  filter(length<10000)
ggplot(normalLength, aes(x=length))+
  geom_histogram(binwidth = 1000, fill = "pink")

##I tried to add the gene names in but cant figure out how to make it work, maybe you can take a look and see what I am missing?

#BiocManager::install("biomaRt")

library(biomaRt)
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")


gene_annotations <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
                              filters = "ensembl_gene_id",
                              values = genequal$gene_id,
                              mart = ensembl)

#attempt to merge the gene names with the ensembl ID (did not work)
genequalannotation <- merge(genequal, gene_annotations,
                                       by.x = "gene_id", by.y = "ensembl_gene_id",
                                       all.x = TRUE)

#create a plot showing the genes with the highest normalized expression for each gene ID
highex <- genequal |>
  filter(FPKM>5000)
ggplot(highex,aes(x=gene_id, y=FPKM, fill=FPKM, color = FPKM))+ 
  geom_col()+
 scale_fill_gradient(low = "cyan", high = "magenta") +
  scale_color_gradient(low = "cyan", high = "magenta") +
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1))

#create a linear regression to show the relation of TPM to FPKM using the filtered normalized gene lengths 
regression <- lm(formula = TPM ~ FPKM, data = normalLength)
Coeff <- coefficients(regression)
intl <-Coeff[1]
slo <- Coeff[2]
ggplot(normalLength,aes(x=FPKM, y=TPM))+
  geom_point(position = "jitter", alpha = 0.5)+ 
  geom_abline(intercept = intl, slope = slo, color = "coral", linewidth = 2, alpha=0.2)

Encode Expression

2025-09-22