library(data.table)
library(qqman)
##
## For example usage please run: vignette('qqman')
##
## Citation appreciated but not required:
## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.
##
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(CMplot)
## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot
library(dplyr)
# Load the data
# adjust filepath to where you saved the summary stats
diabetes <- read.csv("Gwas Diabetes 2 data.csv")
# Inspect column names
dim(diabetes)
## [1] 10996 5
colnames(diabetes)
## [1] "SNP" "CHR" "BP" "P" "Gene"
head(diabetes)
## SNP CHR BP P Gene
## 1 rs9273368-G 6 32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-? 6 32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3 rs689-? 11 2160994 1e-18 INS-IGF2,INS
## 4 rs2476601-? 1 113834946 5e-16 PTPN22,AP4B1-AS1
## 5 rs3184504-? 12 111446804 2e-08 SH2B3,ATXN2
## 6 rs1983890-C 10 6136651 3e-08 RBM17,PFKFB3
summary(diabetes)
## SNP CHR BP P
## Length:10996 Length:10996 Length:10996 Min. :0.000e+00
## Class :character Class :character Class :character 1st Qu.:0.000e+00
## Mode :character Mode :character Mode :character Median :3.000e-11
## Mean :3.861e-07
## 3rd Qu.:1.000e-08
## Max. :1.000e-05
## Gene
## Length:10996
## Class :character
## Mode :character
##
##
##
# Histogram of P-values
hist(diabetes$P)

# The histogram above shows that almost all SNPs have large non-significant p-values, while a small group shows strong evidence of association with Diabetes.
# Log-transform P-values
diabetes$logP <- -log10(diabetes$P)
# Histogram of -log10(P)
ggplot(diabetes, aes(x = logP)) +
geom_histogram(fill = "steelblue", color = "black", bins = 50) +
labs(
title = "Histogram of -log10(P) values (Diabetes Disease)",
x = expression(-log[10](P)),
y = "Frequency"
) +
theme_minimal()

#The histogram above with the -log10 values for P confirms the information from the previous histogram.
# Histogram of SNPs/Chromosomes
# Ensure chromosome order is numeric or natural (1–22, X, Y)
diabetes$CHR <- factor(diabetes$CHR,
levels = c(as.character(1:22), "X", "Y"))
# Histogram of SNPs per Chromosome
# ---- Order chromosomes (1–22, then X) ----
diabetes$CHR <- factor(diabetes$CHR,
levels = c(as.character(1:22), "X"))
ggplot(diabetes, aes(x = CHR)) +
geom_bar(fill = "steelblue", color = "black") +
labs(
title = "Histogram of SNPs per Chromosome",
x = "Chromosome",
y = "Frequency"
) +
theme_minimal()

#The histogram above shows that some chromosomes are more densely represented like chromosome 6 while others have fewer SNPs or missing data.
# Normalize per chromosome size:
# ============================================================
# SNP Density per Chromosome (Normalized by Chromosome Size)
# ============================================================
# ---- Chromosome sizes (GRCh38, in base pairs) ----
chr_sizes <- data.frame(
CHR = as.character(c(1:22, "X")),
size_bp = c(
248956422, 242193529, 198295559, 190214555, 181538259,
170805979, 159345973, 145138636, 138394717, 133797422,
135086622, 133275309, 114364328, 107043718, 101991189,
90338345, 83257441, 80373285, 58617616, 64444167,
46709983, 50818468, 156040895 # X
)
)
# ---- Compute SNP count and density ----
snp_density <- diabetes %>%
filter(!is.na(CHR)) %>%
mutate(CHR = as.character(CHR)) %>%
group_by(CHR) %>%
summarise(
SNP_count = n(),
Significant = sum(P < 1e-5),
.groups = "drop"
) %>%
left_join(chr_sizes, by = "CHR") %>%
mutate(
SNPs_per_Mb = SNP_count / (size_bp / 1e6),
Sig_density = Significant / (size_bp / 1e6)
)
# ---- Order chromosomes (1–22, then X) ----
snp_density$CHR <- factor(snp_density$CHR,
levels = c(as.character(1:22), "X"))
# ---- Plot SNP density ----
ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_manual(
values = c("FALSE" = "steelblue", "TRUE" = "blue"),
name = "Contains Significant SNP (P < 1e-5)"
) +
labs(
title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
x = "Chromosome",
y = "SNPs per Megabase (Mb)"
) +
theme_minimal() +
theme(
legend.position = "top",
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 0, hjust = 0.5)
)

#After adjusting for chromosome size, SNP coverage is uneven across the genome, with chromosomes 10, 11 and 6 having relatively high SNP densities.
# ============================================================
# CMplot requires columns in the order: SNP, CHR, BP, P
# So if needed, reorder or subset:
# Clean P-values
diabetes_clean <- diabetes %>%
mutate(P = as.numeric(P)) %>% # ensure numeric
filter(!is.na(P) & P > 0 & P < 1) # keep valid range
diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]
# Manhattan plot
CMplot(
diabetes_cm,
plot.type = "m",
col = c("grey30", "skyblue3"), # alternating colors
LOG10 = TRUE,
threshold = c(1e-5, 5e-8), # suggestive & genome-wide
threshold.col = c("orange", "red"),
threshold.lty = c(2, 1),
main = "Diabetes",
cex = 0.6, # point size
file = "jpg", # output format
dpi = 300,
file.output = FALSE, # show in RStudio, don't save
verbose = TRUE
)
## Rectangular Manhattan plotting P.

#The Manhattan plot shows that chromsomes 10, 11, and 6 have genomic regions that harbor risks for diabetes.
# ============================================================
# QQ Plot
# ============================================================
# Compute expected and observed
n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))
qq_df <- data.frame(expected, observed)
ggplot(qq_df, aes(x = expected, y = observed)) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
geom_point(color = "dodgerblue3", size = 1) +
labs(
title = "QQ Plot: Diabetes Disease",
x = "Expected -log10(P)",
y = "Observed -log10(P)"
) +
theme_minimal()

#The plot below shows a few variants that don't lie along the red lines. This shows that a few are associated with Diabetes.