Data preparation

setDTthreads(threads = 28)

Array2 <- fread("/mnt/data/pgs/ArrayPGSnew.csv")
Array2$Population <- sub("^1KG\\.", "", Array2$Population)
Array2$Latitude=NULL

Fst <- fread("/mnt/data/newfiles/merged_new/pairwise_fst_array_hudson_with1kg.fst.summary")
colnames(Fst)[colnames(Fst) == "#POP1"] <- "POP1"

Heatmap of Fst distances

Distance matrix calculation

## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
##   POP1     POP2 EA3_distance EA4_distance SKINC_distance Height_distance
## 1  ACB  Algeria   0.85669742    1.5854682      0.6580122       0.6904268
## 2  ACB Amazonia   0.29660255    1.5010435      0.7580104       1.0260928
## 3  ACB    Andes   0.17054116    1.7295879      0.9022190       0.6180687
## 4  ACB Armenian   1.37302935    2.7790826      1.1739626       0.6212062
## 5  ACB      ASW   0.05052031    0.4585567      0.1741379       0.1838117
## 6  ACB Athabask   0.03542741    1.9414306      1.0343298       0.4326350
##   Skin_colour_distance NIQ_seb_distance Height_seb_distance
## 1                  4.0             1.84                 3.5
## 2                  1.5             3.51                18.0
## 3                  1.5             3.51                19.5
## 4                  6.8             6.14                 5.2
## 5                  0.0             2.51                 1.6
## 6                  3.5             8.82                13.2

Permutation test

Piffer-Mantel test: linear regression

## 
## Call:
## lm(formula = NIQ_seb_distance ~ EA3_distance + HUDSON_FST, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.059  -6.410  -1.261   6.047  25.594 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.5366     0.3657  20.611   <2e-16 ***
## EA3_distance   5.6716     0.4204  13.493   <2e-16 ***
## HUDSON_FST     3.8752     4.7985   0.808    0.419    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.25 on 1805 degrees of freedom
## Multiple R-squared:  0.1401, Adjusted R-squared:  0.1392 
## F-statistic: 147.1 on 2 and 1805 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = NIQ_seb_distance ~ EA3_distance + HUDSON_FST, data = merged_df)
## 
## Standardized Coefficients::
##  (Intercept) EA3_distance   HUDSON_FST 
##           NA   0.36136106   0.02162895
## 
## Call:
## lm(formula = NIQ_seb_distance ~ EA4_distance + HUDSON_FST, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.994  -6.113  -1.379   5.429  25.757 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.3239     0.3399  27.431  < 2e-16 ***
## EA4_distance   5.0037     0.2718  18.409  < 2e-16 ***
## HUDSON_FST   -19.2418     5.0023  -3.847 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.943 on 1805 degrees of freedom
## Multiple R-squared:  0.203,  Adjusted R-squared:  0.2021 
## F-statistic: 229.9 on 2 and 1805 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = NIQ_seb_distance ~ EA4_distance + HUDSON_FST, data = merged_df)
## 
## Standardized Coefficients::
##  (Intercept) EA4_distance   HUDSON_FST 
##           NA    0.5139860   -0.1073951
## 
## Call:
## lm(formula = Height_seb_distance ~ Height_distance + HUDSON_FST, 
##     data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7358 -3.0797 -0.6242  2.7492 14.4989 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.5138     0.1946  18.058  < 2e-16 ***
## Height_distance   2.3200     0.2295  10.109  < 2e-16 ***
## HUDSON_FST       14.0429     2.0926   6.711 2.58e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.096 on 1805 degrees of freedom
## Multiple R-squared:  0.113,  Adjusted R-squared:  0.112 
## F-statistic:   115 on 2 and 1805 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = Height_seb_distance ~ Height_distance + HUDSON_FST, 
##     data = merged_df)
## 
## Standardized Coefficients::
##     (Intercept) Height_distance      HUDSON_FST 
##              NA       0.2415568       0.1603586
## 
## Call:
## lm(formula = Skin_colour_distance ~ SKINC_distance + HUDSON_FST, 
##     data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0641 -1.1179 -0.1157  1.2261  4.6946 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.53216    0.07358   7.232 6.99e-13 ***
## SKINC_distance  1.99071    0.06143  32.405  < 2e-16 ***
## HUDSON_FST      9.96777    0.80776  12.340  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.57 on 1805 degrees of freedom
## Multiple R-squared:  0.4971, Adjusted R-squared:  0.4965 
## F-statistic: 892.1 on 2 and 1805 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = Skin_colour_distance ~ SKINC_distance + HUDSON_FST, 
##     data = merged_df)
## 
## Standardized Coefficients::
##    (Intercept) SKINC_distance     HUDSON_FST 
##             NA      0.5872355      0.2236193

Prepare dataset for K Neighbours analysis

# Ensure the kknn package is installed
fst_distance_matrix <- fread("/mnt/data/newfiles/merged_new/pairwise_fst_array_hudson_with1kg.fst.summary")
fst_distance_matrix=fst_distance_matrix[fst_distance_matrix$HUDSON_FST>0,]
colnames(fst_distance_matrix)[colnames(fst_distance_matrix) == "#POP1"] <- "POP1"
Array2 <- fread("/mnt/data/pgs/ArrayPGSnew.csv")
Array2$Population <- sub("^1KG\\.", "", Array2$Population)

populations_fst <- unique(c(fst_distance_matrix$POP1, fst_distance_matrix$POP2))
populations_phenotypic <- Array2$Population
common_populations <- intersect(populations_fst, populations_phenotypic)

filtered_fst_distance_matrix <- fst_distance_matrix %>%
  filter(POP1 %in% common_populations & POP2 %in% common_populations)

filtered_phenotypic_data <- Array2 %>%
  filter(Population %in% common_populations)

fst_matrix <- matrix(NA, nrow = length(common_populations), ncol = length(common_populations),
                     dimnames = list(common_populations, common_populations))

for (i in 1:nrow(filtered_fst_distance_matrix)) {
  fst_matrix[filtered_fst_distance_matrix$POP1[i], filtered_fst_distance_matrix$POP2[i]] <- filtered_fst_distance_matrix$HUDSON_FST[i]
  fst_matrix[filtered_fst_distance_matrix$POP2[i], filtered_fst_distance_matrix$POP1[i]] <- filtered_fst_distance_matrix$HUDSON_FST[i]
}
fst_matrix[is.na(fst_matrix)] <- max(fst_matrix, na.rm = TRUE) + 1

ordered_phenotypic_data <- filtered_phenotypic_data[match(common_populations, filtered_phenotypic_data$Population),]

library(spdep)
library(kirkegaard)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## Loading required package: weights
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:gt':
## 
##     html
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: assertthat
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:effsize':
## 
##     cohen.d
## The following object is masked from 'package:effectsize':
## 
##     phi
## 
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych':
## 
##     rescale
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## The following object is masked from 'package:purrr':
## 
##     is_logical
## The following object is masked from 'package:effectsize':
## 
##     standardize
## The following object is masked from 'package:base':
## 
##     +
# Check for missing values in NIQ_seb
missing_values <- sum(is.na(ordered_phenotypic_data$NIQ_seb))
if (missing_values > 0) {
  cat("There are", missing_values, "missing values in NIQ_seb.\n")
  # Option 1: Remove entries with missing NIQ_seb values
  ordered_phenotypic_data <- ordered_phenotypic_data[!is.na(ordered_phenotypic_data$NIQ_seb),]
  
  # Option 2: Impute missing values (e.g., with the mean or median)
  # ordered_phenotypic_data$NIQ_seb[is.na(ordered_phenotypic_data$NIQ_seb)] <- mean(ordered_phenotypic_data$NIQ_seb, na.rm = TRUE)
}
## There are 5 missing values in NIQ_seb.
# Ensure fst_matrix and ordered_phenotypic_data are still aligned after removing/imputing missing values
common_populations <- intersect(rownames(fst_matrix), ordered_phenotypic_data$Population)
fst_matrix <- fst_matrix[common_populations, common_populations]
ordered_phenotypic_data <- ordered_phenotypic_data[match(common_populations, ordered_phenotypic_data$Population),]

for (k in 1:10) {
  ordered_phenotypic_data = spatial_knn(ordered_phenotypic_data, 
                       k = k, 
                       vars = c("Skin_colour", "NIQ_seb", "Height_seb"),
                       suffix = "_k" + k,
                       dists = fst_matrix
                       )
}

library(spdep)

# Define the function to compute Moran's I for the lagged variables

Correlation between PGS and lag PGS for K 1 to 10

# Check for missing values in NIQ_seb
missing_values_NIQ <- sum(is.na(ordered_phenotypic_data$NIQ_seb))
if (missing_values_NIQ > 0) {
  cat("There are", missing_values_NIQ, "missing values in NIQ_seb.\n")
  # Option 1: Remove entries with missing NIQ_seb values
  ordered_phenotypic_data <- ordered_phenotypic_data[!is.na(ordered_phenotypic_data$NIQ_seb),]
  
  # Option 2: Impute missing values (e.g., with the mean or median)
  # ordered_phenotypic_data$NIQ_seb[is.na(ordered_phenotypic_data$NIQ_seb)] <- mean(ordered_phenotypic_data$NIQ_seb, na.rm = TRUE)
}

# Ensure fst_matrix and ordered_phenotypic_data are still aligned after removing/imputing missing values
common_populations <- intersect(rownames(fst_matrix), ordered_phenotypic_data$Population)
fst_matrix <- fst_matrix[common_populations, common_populations]
ordered_phenotypic_data <- ordered_phenotypic_data[match(common_populations, ordered_phenotypic_data$Population),]

# Define the function to compute correlation between NIQ_seb and its lags
compute_correlation_with_lag <- function(k) {
  variable_name <- paste0("NIQ_seb_k", k)
  correlation <- cor(ordered_phenotypic_data$NIQ_seb, ordered_phenotypic_data[[variable_name]])
  return(correlation)
}

# Compute correlations for k values from 1 to 10
k_values <- 1:10
correlation_values <- sapply(k_values, compute_correlation_with_lag)

# Plot the results
plot(k_values, correlation_values, type='b', xlab='Number of Neighbors (k)', ylab="Correlation",
     main="Correlation between NIQ_seb and its Spatial Lags")

Regression with spatial lag computed using K nearest neighbour. K= 3

## 
## Call:
## lm(formula = NIQ_seb ~ EA3 + NIQ_seb_k10, data = ordered_phenotypic_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.1675  -5.2255   0.9065   5.8425  18.4382 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  45.4910    19.0139   2.393  0.01902 * 
## EA3           7.2836     2.3273   3.130  0.00242 **
## NIQ_seb_k10   0.4739     0.2074   2.285  0.02488 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.858 on 82 degrees of freedom
## Multiple R-squared:  0.5435, Adjusted R-squared:  0.5323 
## F-statistic: 48.81 on 2 and 82 DF,  p-value: 1.092e-14
## 
## Call:
## lm(formula = NIQ_seb ~ EA3 + NIQ_seb_k10, data = ordered_phenotypic_data)
## 
## Standardized Coefficients::
## (Intercept)         EA3 NIQ_seb_k10 
##          NA   0.4426437   0.3232060