Diabetes_GWAS_Assign9.1

library(data.table)
library(qqman)

##

## For example usage please run: vignette('qqman')

##

## Citation appreciated but not required:

## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.

##

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(CMplot)

## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot

library(dplyr)

# Load the data
# adjust filepath to where you saved the summary stats
diabetes <- read.csv("Gwas Diabetes 2 data.csv")

# Inspect column names
dim(diabetes)

## [1] 10996     5

colnames(diabetes)

## [1] "SNP"  "CHR"  "BP"   "P"    "Gene"

head(diabetes)

##           SNP CHR        BP     P              Gene
## 1 rs9273368-G   6  32658698 8e-40 HLA-DQB1,HLA-DQA1
## 2 rs9273368-?   6  32658698 3e-78 HLA-DQB1,HLA-DQA1
## 3     rs689-?  11   2160994 1e-18      INS-IGF2,INS
## 4 rs2476601-?   1 113834946 5e-16  PTPN22,AP4B1-AS1
## 5 rs3184504-?  12 111446804 2e-08       SH2B3,ATXN2
## 6 rs1983890-C  10   6136651 3e-08      RBM17,PFKFB3

summary(diabetes)

##      SNP                CHR                 BP                  P            
##  Length:10996       Length:10996       Length:10996       Min.   :0.000e+00  
##  Class :character   Class :character   Class :character   1st Qu.:0.000e+00  
##  Mode  :character   Mode  :character   Mode  :character   Median :3.000e-11  
##                                                           Mean   :3.861e-07  
##                                                           3rd Qu.:1.000e-08  
##                                                           Max.   :1.000e-05  
##      Gene          
##  Length:10996      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Histogram of P-values
hist(diabetes$P)

# The histogram above shows that almost all SNPs have large non-significant p-values, while a small group shows strong evidence of association with Diabetes.

# Log-transform P-values
diabetes$logP <- -log10(diabetes$P)

# Histogram of -log10(P)
ggplot(diabetes, aes(x = logP)) +
  geom_histogram(fill = "steelblue", color = "black", bins = 50) +
  labs(
    title = "Histogram of -log10(P) values (Diabetes Disease)",
    x = expression(-log[10](P)),
    y = "Frequency"
  ) +
  theme_minimal()

#The histogram above with the -log10 values for P confirms the information from the previous histogram.

# Histogram of SNPs/Chromosomes
# Ensure chromosome order is numeric or natural (1–22, X, Y)
diabetes$CHR <- factor(diabetes$CHR, 
                     levels = c(as.character(1:22), "X", "Y"))

# Histogram of SNPs per Chromosome
# ---- Order chromosomes (1–22, then X) ----

diabetes$CHR <- factor(diabetes$CHR,
                       levels = c(as.character(1:22), "X"))

ggplot(diabetes, aes(x = CHR)) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Histogram of SNPs per Chromosome",
    x = "Chromosome",
    y = "Frequency"
  ) +
  theme_minimal()

#The histogram above shows that some chromosomes are more densely represented like chromosome 6 while others have fewer SNPs or missing data.

# Normalize per chromosome size:
# ============================================================
# SNP Density per Chromosome (Normalized by Chromosome Size)
# ============================================================


# ---- Chromosome sizes (GRCh38, in base pairs) ----
chr_sizes <- data.frame(
  CHR = as.character(c(1:22, "X")),
  size_bp = c(
    248956422, 242193529, 198295559, 190214555, 181538259,
    170805979, 159345973, 145138636, 138394717, 133797422,
    135086622, 133275309, 114364328, 107043718, 101991189,
    90338345, 83257441, 80373285, 58617616, 64444167,
    46709983, 50818468, 156040895  # X
  )
)

# ---- Compute SNP count and density ----
snp_density <- diabetes %>%
  filter(!is.na(CHR)) %>%
  mutate(CHR = as.character(CHR)) %>%
  group_by(CHR) %>%
  summarise(
    SNP_count = n(),
    Significant = sum(P < 1e-5),
    .groups = "drop"
  ) %>%
  left_join(chr_sizes, by = "CHR") %>%
  mutate(
    SNPs_per_Mb = SNP_count / (size_bp / 1e6),
    Sig_density = Significant / (size_bp / 1e6)
  )

# ---- Order chromosomes (1–22, then X) ----
snp_density$CHR <- factor(snp_density$CHR, 
                          levels = c(as.character(1:22), "X"))

# ---- Plot SNP density ----
ggplot(snp_density, aes(x = CHR, y = SNPs_per_Mb, fill = Sig_density > 0)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(
    values = c("FALSE" = "steelblue", "TRUE" = "blue"),
    name = "Contains Significant SNP (P < 1e-5)"
  ) +
  labs(
    title = "SNP Density per Chromosome (Normalized by Chromosome Size)",
    x = "Chromosome",
    y = "SNPs per Megabase (Mb)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )

#After adjusting for chromosome size, SNP coverage is uneven across the genome, with chromosomes 10, 11 and 6 having relatively high SNP densities.

# ============================================================

# CMplot requires columns in the order: SNP, CHR, BP, P
# So if needed, reorder or subset:

# Clean P-values
diabetes_clean <- diabetes %>%
  mutate(P = as.numeric(P)) %>%   # ensure numeric
  filter(!is.na(P) & P > 0 & P < 1)  # keep valid range

diabetes_cm <- diabetes_clean[, c("SNP", "CHR", "BP", "P")]

# Manhattan plot
CMplot(
  diabetes_cm,
  plot.type = "m",
  col = c("grey30", "skyblue3"),      # alternating colors
  LOG10 = TRUE,
  threshold = c(1e-5, 5e-8),          # suggestive & genome-wide
  threshold.col = c("orange", "red"),
  threshold.lty = c(2, 1),
  main = "Diabetes",
  cex = 0.6,                          # point size
  file = "jpg",                       # output format
  dpi = 300,
  file.output = FALSE, # show in RStudio, don't save
  verbose = TRUE
)

##  Rectangular Manhattan plotting P.

#The Manhattan plot shows that chromsomes 10, 11, and 6 have genomic regions that harbor risks for diabetes.

# ============================================================
#  QQ Plot
# ============================================================

# Compute expected and observed
n <- length(diabetes$P)
expected <- -log10(ppoints(n))
observed <- -log10(sort(diabetes$P))

qq_df <- data.frame(expected, observed)

ggplot(qq_df, aes(x = expected, y = observed)) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  geom_point(color = "dodgerblue3", size = 1) +
  labs(
    title = "QQ Plot: Diabetes Disease",
    x = "Expected -log10(P)",
    y = "Observed -log10(P)"
  ) +
  theme_minimal()

#The plot below shows a few variants that don't lie along the red lines.  This shows that a few are associated with Diabetes.

Diabetes_GWAS_Assign9.1

L Daniels

2025-10-27