We will explore GWAS data for Diabates mellitus, produce Manhattan and QQ plots.

library(data.table)
library(qqman)
library(tidyverse)
library(ggplot2)
library(CMplot)
library(dplyr)

Load and Inspect Data

diabetes <- read.csv("Gwas Diabetes 2 data.csv")
dim(diabetes)
## [1] 10996     5
colnames(diabetes)
## [1] "SNP"  "CHR"  "BP"   "P"    "Gene"
head(diabetes)
##           SNP CHR        BP     P              Gene
## 1 rs9273368-G   6  32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-?   6  32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3     rs689-?  11   2160994 1e-18      INS-IGF2,INS
## 4 rs2476601-?   1 113834946 5e-16  PTPN22,AP4B1-AS1
## 5 rs3184504-?  12 111446804 2e-08       SH2B3,ATXN2
## 6 rs1983890-C  10   6136651 3e-08      RBM17,PFKFB3
summary(diabetes)
##      SNP                CHR                 BP                  P            
##  Length:10996       Length:10996       Length:10996       Min.   :0.000e+00  
##  Class :character   Class :character   Class :character   1st Qu.:0.000e+00  
##  Mode  :character   Mode  :character   Mode  :character   Median :3.000e-11  
##                                                           Mean   :3.861e-07  
##                                                           3rd Qu.:1.000e-08  
##                                                           Max.   :1.000e-05  
##      Gene          
##  Length:10996      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

The summary of the data shows the basic statistic analysis for this data set obtained from GWAS.

Histogram of P-values

hist(diabetes$P)

The histogram shows that the most frequent P-values are in the 0e+00 range. This means that most of the SNPs are not significant.

Log-transform P-values

diabetes$logP <- -log10(diabetes$P)

ggplot(diabetes, aes(x = logP)) +
  geom_histogram(fill = "steelblue", color = "black", bins = 50) +
  labs(
    title = "Histogram of -log10(P) values (Diabetes Disease)",
    x = expression(-log[10](P)),
    y = "Frequency"
  ) +
  theme_minimal()

When the log of the P-values is taken, the histogram also shows similar results. This is another way to interpret the P value plot, and it shows that most SNPs are not significant.

Histogram of SNPs per Chromosome

diabetes$CHR <- factor(diabetes$CHR, levels = c(as.character(1:22), "X", "Y"))

ggplot(diabetes, aes(x = CHR)) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Histogram of SNPs per Chromosome",
    x = "Chromosome",
    y = "Frequency"
  ) +
  theme_minimal()

This figure shows that the number of SNPs per chromosome but the data is not represented well. It shows a lot of missing and unlabeled data.

SNP Density per Chromosome

chr_sizes <- data.frame(
  CHR = as.character(c(1:22, "X")),
  size_bp = c(
    248956422, 242193529, 198295559, 190214555, 181538259,
    170805979, 159345973, 145138636, 138394717, 133797422,
    135086622, 133275309, 114364328, 107043718, 101991189,
    90338345, 83257441, 80373285, 58617616, 64444167,
    46709983, 50818468, 156040895
  )
)

snp_density <- diabetes %>%
  filter(!is.na(CHR)) %>%
  mutate(CHR = as.character(CHR)) %>%
  group_by(CHR) %>%
  summarise(
    SNP_count = n(),
    Significant = sum(P < 1e-5),
    .groups = "drop"
  ) %>%
  left_join(chr_sizes, by = "CHR") %>%
  mutate(
    SNPs_per_Mb = SNP_count / (size_bp / 1e6),
    Sig_density = Significant / (size_bp / 1e6)
  )

snp_density$CHR <- factor(snp_density$CHR, levels = c(as.character(1:22), "X"))

ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(
    values = c("FALSE" = "steelblue", "TRUE" = "blue"),
    name = "Contains Significant SNP (P < 1e-5)"
  ) +
  labs(
    title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
    x = "Chromosome",
    y = "SNPs per Megabase (Mb)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )

When the plot is normalized, the data is able to be visualized better.This plot only contains significant SNPs. It shows that almost all of the chromosomes contain significant SNPs for diabetes. The chromosome with the highest density of SNPs associated with diabetes 2 is chromosome 10.

Manhattan Plot

diabetes_clean <- diabetes %>%
  mutate(P = as.numeric(P)) %>%
  filter(!is.na(P) & P > 0 & P < 1)

diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]

CMplot(
  diabetes_cm,
  plot.type = "m",
  col = c("grey30", "skyblue3"),
  LOG10 = TRUE,
  threshold = c(1e-5, 5e-8),
  threshold.col = c("orange", "red"),
  threshold.lty = c(2, 1),
  main = "Diabetes",
  cex = 0.6,
  file = "jpg",
  dpi = 300,
  file.output = FALSE,
  verbose = TRUE
)
##  Rectangular Manhattan plotting P.

The manhattan plot is similar to what can be viewed on the GWAS page. The plot shows that 5 chromosomes have the highest SNPs that are significant for diabetes. The chromosomes 3, 6, 9, 10, and 16 seem to have significant SNPs, with chromosome 10 having the most significant SNP.

QQ Plot

n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))

qq_df <- data.frame(expected, observed)

ggplot(qq_df, aes(x = expected, y = observed)) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  geom_point(color = "dodgerblue3", size = 1) +
  labs(
    title = "QQ Plot: Diabetes Disease",
    x = "Expected -log10(P)",
    y = "Observed -log10(P)"
  ) +
  theme_minimal()

The QQ plot shows that a majority of the observed values (blue) lie near the expected (red) so they’re not significant but the others that rise above show that these SNPs may be significant.