We will explore GWAS data for Diabates mellitus, produce Manhattan and QQ plots .


Libraries

library(data.table)
library(qqman)
library(tidyverse)
library(ggplot2)
library(CMplot)
library(dplyr)


Load & Inspect Data

# Load the data
# adjust filepath to where you saved the summary stats
diabetes <- read.csv("Gwas Diabetes 2 data.csv")

# Inspect column names
dim(diabetes)
## [1] 10996     5
colnames(diabetes)
## [1] "SNP"  "CHR"  "BP"   "P"    "Gene"
head(diabetes)
##           SNP CHR        BP     P              Gene
## 1 rs9273368-G   6  32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-?   6  32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3     rs689-?  11   2160994 1e-18      INS-IGF2,INS
## 4 rs2476601-?   1 113834946 5e-16  PTPN22,AP4B1-AS1
## 5 rs3184504-?  12 111446804 2e-08       SH2B3,ATXN2
## 6 rs1983890-C  10   6136651 3e-08      RBM17,PFKFB3
summary(diabetes)
##      SNP                CHR                 BP                  P            
##  Length:10996       Length:10996       Length:10996       Min.   :0.000e+00  
##  Class :character   Class :character   Class :character   1st Qu.:0.000e+00  
##  Mode  :character   Mode  :character   Mode  :character   Median :3.000e-11  
##                                                           Mean   :3.861e-07  
##                                                           3rd Qu.:1.000e-08  
##                                                           Max.   :1.000e-05  
##      Gene          
##  Length:10996      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 


Histogram of P-values

hist(diabetes$P)


Log-transform P-values & Histogram of -log10(P)

# Log-transform P-values
diabetes$logP <- -log10(diabetes$P)

# Histogram of -log10(P)
ggplot(diabetes, aes(x = logP)) +
  geom_histogram(fill = "steelblue", color = "black", bins = 50) +
  labs(
    title = "Histogram of -log10(P) values (Diabetes Disease)",
    x = expression(-log[10](P)),
    y = "Frequency"
  ) +
  theme_minimal()


Histogram of SNPs per Chromosome

# Histogram of SNPs/Chromosomes
# Ensure chromosome order is numeric or natural (1–22, X, Y)
diabetes$CHR <- factor(diabetes$CHR, 
                     levels = c(as.character(1:22), "X", "Y"))

# Histogram of SNPs per Chromosome
# ---- Order chromosomes (1–22, then X) ----
diabetes$CHR <- factor(diabetes$CHR,
                       levels = c(as.character(1:22), "X"))

ggplot(diabetes, aes(x = CHR)) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Histogram of SNPs per Chromosome",
    x = "Chromosome",
    y = "Frequency"
  ) +
  theme_minimal()


SNP Density per Chromosome (Normalized by Chromosome Size)

# ============================================================
# SNP Density per Chromosome (Normalized by Chromosome Size)
# ============================================================

# ---- Chromosome sizes (GRCh38, in base pairs) ----
chr_sizes <- data.frame(
  CHR = as.character(c(1:22, "X")),
  size_bp = c(
    248956422, 242193529, 198295559, 190214555, 181538259,
    170805979, 159345973, 145138636, 138394717, 133797422,
    135086622, 133275309, 114364328, 107043718, 101991189,
    90338345, 83257441, 80373285, 58617616, 64444167,
    46709983, 50818468, 156040895  # X
  )
)

# ---- Compute SNP count and density ----
snp_density <- diabetes %>%
  filter(!is.na(CHR)) %>%
  mutate(CHR = as.character(CHR)) %>%
  group_by(CHR) %>%
  summarise(
    SNP_count = n(),
    Significant = sum(P < 1e-5),
    .groups = "drop"
  ) %>%
  left_join(chr_sizes, by = "CHR") %>%
  mutate(
    SNPs_per_Mb = SNP_count / (size_bp / 1e6),
    Sig_density = Significant / (size_bp / 1e6)
  )

# ---- Order chromosomes (1–22, then X) ----
snp_density$CHR <- factor(snp_density$CHR, 
                          levels = c(as.character(1:22), "X"))

# ---- Plot SNP density ----
ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(
    values = c("FALSE" = "steelblue", "TRUE" = "blue"),
    name = "Contains Significant SNP (P < 1e-5)"
  ) +
  labs(
    title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
    x = "Chromosome",
    y = "SNPs per Megabase (Mb)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )


Manhattan Plot

# ============================================================
#  Manhattan Plot
# ============================================================

# CMplot requires columns in the order: SNP, CHR, BP, P
# So if needed, reorder or subset:

# Clean P-values
diabetes_clean <- diabetes %>%
  mutate(P = as.numeric(P)) %>%   # ensure numeric
  filter(!is.na(P) & P > 0 & P < 1)  # keep valid range

diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]

# Manhattan plot
CMplot(
  diabetes_cm,
  plot.type = "m",
  col = c("grey30", "skyblue3"),      # alternating colors
  LOG10 = TRUE,
  threshold = c(1e-5, 5e-8),          # suggestive & genome-wide
  threshold.col = c("orange", "red"),
  threshold.lty = c(2, 1),
  main = "Crohn's Disease",
  cex = 0.6,                          # point size
  file = "jpg",                       # output format
  dpi = 300,
  file.output = FALSE, # show in RStudio, don't save
  verbose = TRUE
)
##  Rectangular Manhattan plotting P.


QQ Plot

# ============================================================
#  QQ Plot
# ============================================================

# Compute expected and observed
n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))

qq_df <- data.frame(expected, observed)

ggplot(qq_df, aes(x = expected, y = observed)) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  geom_point(color = "dodgerblue3", size = 1) +
  labs(
    title = "QQ Plot: Diabetes Disease",
    x = "Expected -log10(P)",
    y = "Observed -log10(P)"
  ) +
  theme_minimal()