We will explore GWAS data for Diabates mellitus, produce Manhattan and QQ plots .
Load Libraries
library(data.table)
library(qqman)
library(tidyverse)
library(ggplot2)
library(CMplot)
library(dplyr)
# Load the data
diabetes <- read.csv("Gwas Diabetes 2 data.csv")
# Inspect column names
dim(diabetes)
## [1] 10996 5
colnames(diabetes)
## [1] "SNP" "CHR" "BP" "P" "Gene"
head(diabetes)
## SNP CHR BP P Gene
## 1 rs9273368-G 6 32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-? 6 32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3 rs689-? 11 2160994 1e-18 INS-IGF2,INS
## 4 rs2476601-? 1 113834946 5e-16 PTPN22,AP4B1-AS1
## 5 rs3184504-? 12 111446804 2e-08 SH2B3,ATXN2
## 6 rs1983890-C 10 6136651 3e-08 RBM17,PFKFB3
summary(diabetes)
## SNP CHR BP P
## Length:10996 Length:10996 Length:10996 Min. :0.000e+00
## Class :character Class :character Class :character 1st Qu.:0.000e+00
## Mode :character Mode :character Mode :character Median :3.000e-11
## Mean :3.861e-07
## 3rd Qu.:1.000e-08
## Max. :1.000e-05
## Gene
## Length:10996
## Class :character
## Mode :character
##
##
##
In the data set, there are a total of 5 variables and a total of 10,996 observations. The summary table shows the variable type and metrics of the data set.
# Histogram of P-values
hist(diabetes$P)
The histogram indicates that most of SNPs are not associated with Diabetes (p-values are not significant) and only a small subset of SNPs have significant p-values indicating strong association with Diabetes.
# Log-transform P-values
diabetes$logP <- -log10(diabetes$P)
# Histogram of -log10(P)
ggplot(diabetes, aes(x = logP)) +
geom_histogram(fill = "steelblue", color = "black", bins = 50) +
labs(
title = "Histogram of -log10(P) values (Diabetes Disease)",
x = expression(-log[10](P)),
y = "Frequency"
) +
theme_minimal()
The -log10(P)value plot also indicates that most of the SNPs have no significant association with Diabetes with only a small fraction of the SNPs being linked to the condition.
# Histogram of SNPs/Chromosomes
# Ensure chromosome order is numeric or natural (1–22, X, Y)
diabetes$CHR <- factor(diabetes$CHR,
levels = c(as.character(1:22), "X", "Y"))
# Histogram of SNPs per Chromosome
# ---- Order chromosomes (1–22, then X) ----
diabetes$CHR <- factor(diabetes$CHR,
levels = c(as.character(1:22), "X"))
ggplot(diabetes, aes(x = CHR)) +
geom_bar(fill = "steelblue", color = "black") +
labs(
title = "Histogram of SNPs per Chromosome",
x = "Chromosome",
y = "Frequency"
) +
theme_minimal()
SNP distribution across chromosome is uneven as there is a lot of missing data that contributes to the current distribution. The plot can be remade after the data is normalized according to chromosome size for a better understanding of the data.
# Normalize per chromosome size:
# ============================================================
# SNP Density per Chromosome (Normalized by Chromosome Size)
# ============================================================
# ---- Chromosome sizes (GRCh38, in base pairs) ----
chr_sizes <- data.frame(
CHR = as.character(c(1:22, "X")),
size_bp = c(
248956422, 242193529, 198295559, 190214555, 181538259,
170805979, 159345973, 145138636, 138394717, 133797422,
135086622, 133275309, 114364328, 107043718, 101991189,
90338345, 83257441, 80373285, 58617616, 64444167,
46709983, 50818468, 156040895 # X
)
)
# ---- Compute SNP count and density ----
snp_density <- diabetes %>%
filter(!is.na(CHR)) %>%
mutate(CHR = as.character(CHR)) %>%
group_by(CHR) %>%
summarise(
SNP_count = n(),
Significant = sum(P < 1e-5),
.groups = "drop"
) %>%
left_join(chr_sizes, by = "CHR") %>%
mutate(
SNPs_per_Mb = SNP_count / (size_bp / 1e6),
Sig_density = Significant / (size_bp / 1e6)
)
# ---- Order chromosomes (1–22, then X) ----
snp_density$CHR <- factor(snp_density$CHR,
levels = c(as.character(1:22), "X"))
# ---- Plot SNP density ----
ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_manual(
values = c("FALSE" = "steelblue", "TRUE" = "blue"),
name = "Contains Significant SNP (P < 1e-5)"
) +
labs(
title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
x = "Chromosome",
y = "SNPs per Megabase (Mb)"
) +
theme_minimal() +
theme(
legend.position = "top",
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 0, hjust = 0.5)
)
After adjusting according to chromosome size, the plot shows that chromosome 10, 11, and 6 have some the highest SNP densities. Most chromosomes also contain at least one SNP that significant association with Diabetes as indicated by their p-values.
# ============================================================
# Manhattan Plot
# ============================================================
# CMplot requires columns in the order: SNP, CHR, BP, P
# So if needed, reorder or subset:
# Clean P-values
diabetes_clean <- diabetes %>%
mutate(P = as.numeric(P)) %>% # ensure numeric
filter(!is.na(P) & P > 0 & P < 1) # keep valid range
diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]
# Manhattan plot
CMplot(
diabetes_cm,
plot.type = "m",
col = c("grey30", "skyblue3"), # alternating colors
LOG10 = TRUE,
threshold = c(1e-5, 5e-8), # suggestive & genome-wide
threshold.col = c("orange", "red"),
threshold.lty = c(2, 1),
main = "Diabetes",
cex = 0.6, # point size
file = "jpg", # output format
dpi = 300,
file.output = FALSE, # show in RStudio, don't save
verbose = TRUE
)
## Rectangular Manhattan plotting P.
The Manhattan plot shows the SNP variants of each chromosomes and indicates the statistical significance of their association to Diabetes. Chromosome 10 and 6 have loci that have the strongest association with this condition.
# ============================================================
# QQ Plot
# ============================================================
# Compute expected and observed
n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))
qq_df <- data.frame(expected, observed)
ggplot(qq_df, aes(x = expected, y = observed)) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
geom_point(color = "dodgerblue3", size = 1) +
labs(
title = "QQ Plot: Diabetes Disease",
x = "Expected -log10(P)",
y = "Observed -log10(P)"
) +
theme_minimal()
The QQ Plot shows that most of the SNPs lie along the red line (which represents the null hypothesis) and these points are consistent with random chance. The points that begin to sharply deviate from the line and spike represent SNPs that are the strongest genetic associations with Diabetes.