We will explore GWAS data for Diabates mellitus, produce Manhattan and QQ plots.

library(data.table)
library(qqman)
library(tidyverse)
library(ggplot2)
library(CMplot)
library(dplyr)

Load and Inspect Data

# Load the data
diabetes <- read.csv("Gwas Diabetes 2 data.csv")

# Inspect column names
dim(diabetes)
## [1] 10996     5
colnames(diabetes)
## [1] "SNP"  "CHR"  "BP"   "P"    "Gene"
head(diabetes)
##           SNP CHR        BP     P              Gene
## 1 rs9273368-G   6  32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-?   6  32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3     rs689-?  11   2160994 1e-18      INS-IGF2,INS
## 4 rs2476601-?   1 113834946 5e-16  PTPN22,AP4B1-AS1
## 5 rs3184504-?  12 111446804 2e-08       SH2B3,ATXN2
## 6 rs1983890-C  10   6136651 3e-08      RBM17,PFKFB3
summary(diabetes)
##      SNP                CHR                 BP                  P            
##  Length:10996       Length:10996       Length:10996       Min.   :0.000e+00  
##  Class :character   Class :character   Class :character   1st Qu.:0.000e+00  
##  Mode  :character   Mode  :character   Mode  :character   Median :3.000e-11  
##                                                           Mean   :3.861e-07  
##                                                           3rd Qu.:1.000e-08  
##                                                           Max.   :1.000e-05  
##      Gene          
##  Length:10996      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Histogram of P-values

This histogram shows the distribution of p-values across all SNPs tested for association with diabetes. Most SNPs have very small p-values, indicating many variants show some statistical correlation, though only those below significance thresholds (e.g., p < 5e-8) are considered truly significant.

hist(diabetes$P)

Log-transform P-values and Plot

The log-transformed p-values highlight the strength of association. Taller bars at higher –log10(P) indicate more SNPs with stronger significance. The transformation makes it easier to visualize small p-values that would otherwise be compressed near zero.

diabetes$logP <- -log10(diabetes$P)

ggplot(diabetes, aes(x = logP)) +
  geom_histogram(fill = "steelblue", color = "black", bins = 50) +
  labs(
    title = "Histogram of -log10(P) values (Diabetes Disease)",
    x = expression(-log[10](P)),
    y = "Frequency"
  ) +
  theme_minimal()

Histogram of SNPs per Chromosome

This plot shows how many SNPs were analyzed per chromosome. Chromosomes vary in SNP count depending on their size and genetic variation density. A chromosome labeled “NA” may indicate unassigned SNPs due to missing chromosome data.

diabetes$CHR <- factor(diabetes$CHR, 
                     levels = c(as.character(1:22), "X", "Y"))

diabetes$CHR <- factor(diabetes$CHR,
                       levels = c(as.character(1:22), "X"))

ggplot(diabetes, aes(x = CHR)) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Histogram of SNPs per Chromosome",
    x = "Chromosome",
    y = "Frequency"
  ) +
  theme_minimal()

SNP Density per Chromosome (Normalized by Chromosome Size)

This figure normalizes SNP counts by chromosome length, showing the density of SNPs per megabase. Blue bars indicate chromosomes containing significant SNPs (p < 1e-5). Taller bars near the left suggest some chromosomes (e.g., 6 or 11) have higher SNP densities relative to size.

chr_sizes <- data.frame(
  CHR = as.character(c(1:22, "X")),
  size_bp = c(
    248956422, 242193529, 198295559, 190214555, 181538259,
    170805979, 159345973, 145138636, 138394717, 133797422,
    135086622, 133275309, 114364328, 107043718, 101991189,
    90338345, 83257441, 80373285, 58617616, 64444167,
    46709983, 50818468, 156040895
  )
)

snp_density <- diabetes %>%
  filter(!is.na(CHR)) %>%
  mutate(CHR = as.character(CHR)) %>%
  group_by(CHR) %>%
  summarise(
    SNP_count = n(),
    Significant = sum(P < 1e-5),
    .groups = "drop"
  ) %>%
  left_join(chr_sizes, by = "CHR") %>%
  mutate(
    SNPs_per_Mb = SNP_count / (size_bp / 1e6),
    Sig_density = Significant / (size_bp / 1e6)
  )

snp_density$CHR <- factor(snp_density$CHR, 
                          levels = c(as.character(1:22), "X"))

ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(
    values = c("FALSE" = "steelblue", "TRUE" = "blue"),
    name = "Contains Significant SNP (P < 1e-5)"
  ) +
  labs(
    title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
    x = "Chromosome",
    y = "SNPs per Megabase (Mb)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )

Manhattan Plot

The Manhattan plot visualizes each SNP’s genomic position (x-axis) versus its –log10(p-value) (y-axis). Peaks above the red or orange thresholds mark genome-wide significant loci associated with diabetes, particularly on chromosomes with key immune or insulin-related genes such as HLA-DQB1 and INS.

diabetes_clean <- diabetes %>%
  mutate(P = as.numeric(P)) %>%
  filter(!is.na(P) & P > 0 & P < 1)

diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]

CMplot(
  diabetes_cm,
  plot.type = "m",
  col = c("grey30", "skyblue3"),
  LOG10 = TRUE,
  threshold = c(1e-5, 5e-8),
  threshold.col = c("orange", "red"),
  threshold.lty = c(2, 1),
  main = "Crohn's Disease",
  cex = 0.6,
  file = "jpg",
  dpi = 300,
  file.output = FALSE,
  verbose = TRUE
)
##  Rectangular Manhattan plotting P.

QQ Plot

The QQ plot compares observed versus expected p-values under the null hypothesis. Points deviating upward from the red dashed line indicate SNPs with stronger associations than expected by chance, suggesting potential true genetic associations with diabetes.

n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))

qq_df <- data.frame(expected, observed)

ggplot(qq_df, aes(x = expected, y = observed)) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  geom_point(color = "dodgerblue3", size = 1) +
  labs(
    title = "QQ Plot: Diabetes Disease",
    x = "Expected -log10(P)",
    y = "Observed -log10(P)"
  ) +
  theme_minimal()