We will explore GWAS data for Diabates mellitus, produce Manhattan and QQ plots .

Load Libraries

library(data.table)
library(qqman)
library(tidyverse)
library(ggplot2)
library(CMplot)
library(dplyr)
# Load the data
diabetes <- read.csv("Gwas Diabetes 2 data.csv")

# Inspect column names
dim(diabetes)
## [1] 10996     5
colnames(diabetes)
## [1] "SNP"  "CHR"  "BP"   "P"    "Gene"
head(diabetes)
##           SNP CHR        BP     P              Gene
## 1 rs9273368-G   6  32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-?   6  32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3     rs689-?  11   2160994 1e-18      INS-IGF2,INS
## 4 rs2476601-?   1 113834946 5e-16  PTPN22,AP4B1-AS1
## 5 rs3184504-?  12 111446804 2e-08       SH2B3,ATXN2
## 6 rs1983890-C  10   6136651 3e-08      RBM17,PFKFB3
summary(diabetes)
##      SNP                CHR                 BP                  P            
##  Length:10996       Length:10996       Length:10996       Min.   :0.000e+00  
##  Class :character   Class :character   Class :character   1st Qu.:0.000e+00  
##  Mode  :character   Mode  :character   Mode  :character   Median :3.000e-11  
##                                                           Mean   :3.861e-07  
##                                                           3rd Qu.:1.000e-08  
##                                                           Max.   :1.000e-05  
##      Gene          
##  Length:10996      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

In the data set, there are a total of 5 variables and a total of 10,996 observations. The summary table shows the variable type and metrics of the data set.

# Histogram of P-values
hist(diabetes$P)

The histogram indicates that most of SNPs are not associated with Diabetes (p-values are not significant) and only a small subset of SNPs have significant p-values indicating strong association with Diabetes.

# Log-transform P-values
diabetes$logP <- -log10(diabetes$P)

# Histogram of -log10(P)
ggplot(diabetes, aes(x = logP)) +
  geom_histogram(fill = "steelblue", color = "black", bins = 50) +
  labs(
    title = "Histogram of -log10(P) values (Diabetes Disease)",
    x = expression(-log[10](P)),
    y = "Frequency"
  ) +
  theme_minimal()

The -log10(P)value plot also indicates that most of the SNPs have no significant association with Diabetes with only a small fraction of the SNPs being linked to the condition.

# Histogram of SNPs/Chromosomes
# Ensure chromosome order is numeric or natural (1–22, X, Y)
diabetes$CHR <- factor(diabetes$CHR, 
                     levels = c(as.character(1:22), "X", "Y"))


# Histogram of SNPs per Chromosome
# ---- Order chromosomes (1–22, then X) ----

diabetes$CHR <- factor(diabetes$CHR,
                       levels = c(as.character(1:22), "X"))

ggplot(diabetes, aes(x = CHR)) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Histogram of SNPs per Chromosome",
    x = "Chromosome",
    y = "Frequency"
  ) +
  theme_minimal()

SNP distribution across chromosome is uneven as there is a lot of missing data that contributes to the current distribution. The plot can be remade after the data is normalized according to chromosome size for a better understanding of the data.

# Normalize per chromosome size:
# ============================================================
# SNP Density per Chromosome (Normalized by Chromosome Size)
# ============================================================


# ---- Chromosome sizes (GRCh38, in base pairs) ----
chr_sizes <- data.frame(
  CHR = as.character(c(1:22, "X")),
  size_bp = c(
    248956422, 242193529, 198295559, 190214555, 181538259,
    170805979, 159345973, 145138636, 138394717, 133797422,
    135086622, 133275309, 114364328, 107043718, 101991189,
    90338345, 83257441, 80373285, 58617616, 64444167,
    46709983, 50818468, 156040895  # X
  )
)


# ---- Compute SNP count and density ----
snp_density <- diabetes %>%
  filter(!is.na(CHR)) %>%
  mutate(CHR = as.character(CHR)) %>%
  group_by(CHR) %>%
  summarise(
    SNP_count = n(),
    Significant = sum(P < 1e-5),
    .groups = "drop"
  ) %>%
  left_join(chr_sizes, by = "CHR") %>%
  mutate(
    SNPs_per_Mb = SNP_count / (size_bp / 1e6),
    Sig_density = Significant / (size_bp / 1e6)
  )


# ---- Order chromosomes (1–22, then X) ----
snp_density$CHR <- factor(snp_density$CHR, 
                          levels = c(as.character(1:22), "X"))


# ---- Plot SNP density ----
ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(
    values = c("FALSE" = "steelblue", "TRUE" = "blue"),
    name = "Contains Significant SNP (P < 1e-5)"
  ) +
  labs(
    title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
    x = "Chromosome",
    y = "SNPs per Megabase (Mb)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )

After adjusting according to chromosome size, the plot shows that chromosome 10, 11, and 6 have some the highest SNP densities. Most chromosomes also contain at least one SNP that significant association with Diabetes as indicated by their p-values.

# ============================================================
#  Manhattan Plot
# ============================================================

# CMplot requires columns in the order: SNP, CHR, BP, P
# So if needed, reorder or subset:

# Clean P-values
diabetes_clean <- diabetes %>%
  mutate(P = as.numeric(P)) %>%   # ensure numeric
  filter(!is.na(P) & P > 0 & P < 1)  # keep valid range

diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]

# Manhattan plot
CMplot(
  diabetes_cm,
  plot.type = "m",
  col = c("grey30", "skyblue3"),      # alternating colors
  LOG10 = TRUE,
  threshold = c(1e-5, 5e-8),          # suggestive & genome-wide
  threshold.col = c("orange", "red"),
  threshold.lty = c(2, 1),
  main = "Diabetes",
  cex = 0.6,                          # point size
  file = "jpg",                       # output format
  dpi = 300,
  file.output = FALSE, # show in RStudio, don't save
  verbose = TRUE
)
##  Rectangular Manhattan plotting P.

The Manhattan plot shows the SNP variants of each chromosomes and indicates the statistical significance of their association to Diabetes. Chromosome 10 and 6 have loci that have the strongest association with this condition.

# ============================================================
#  QQ Plot
# ============================================================

# Compute expected and observed
n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))

qq_df <- data.frame(expected, observed)

ggplot(qq_df, aes(x = expected, y = observed)) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  geom_point(color = "dodgerblue3", size = 1) +
  labs(
    title = "QQ Plot: Diabetes Disease",
    x = "Expected -log10(P)",
    y = "Observed -log10(P)"
  ) +
  theme_minimal()

The QQ Plot shows that most of the SNPs lie along the red line (which represents the null hypothesis) and these points are consistent with random chance. The points that begin to sharply deviate from the line and spike represent SNPs that are the strongest genetic associations with Diabetes.