Data preparation
setDTthreads(threads = 28)
Array2 <- fread("/mnt/data/pgs/ArrayPGSnew.csv")
Array2$Population <- sub("^1KG\\.", "", Array2$Population)
Array2$Latitude=NULL
Fst <- fread("/mnt/data/newfiles/merged_new/pairwise_fst_array_hudson_with1kg.fst.summary")
colnames(Fst)[colnames(Fst) == "#POP1"] <- "POP1"
Heatmap of Fst distances
Distance matrix calculation
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## Warning: The melt generic in data.table has been passed a matrix and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(distance_matrix). In the next version, this warning will become
## an error.
## POP1 POP2 EA3_distance EA4_distance SKINC_distance Height_distance
## 1 ACB Algeria 0.85669742 1.5854682 0.6580122 0.6904268
## 2 ACB Amazonia 0.29660255 1.5010435 0.7580104 1.0260928
## 3 ACB Andes 0.17054116 1.7295879 0.9022190 0.6180687
## 4 ACB Armenian 1.37302935 2.7790826 1.1739626 0.6212062
## 5 ACB ASW 0.05052031 0.4585567 0.1741379 0.1838117
## 6 ACB Athabask 0.03542741 1.9414306 1.0343298 0.4326350
## Skin_colour_distance NIQ_seb_distance Height_seb_distance
## 1 4.0 1.84 3.5
## 2 1.5 3.51 18.0
## 3 1.5 3.51 19.5
## 4 6.8 6.14 5.2
## 5 0.0 2.51 1.6
## 6 3.5 8.82 13.2
Permutation test
Piffer-Mantel test: linear regression
##
## Call:
## lm(formula = NIQ_seb_distance ~ EA3_distance + HUDSON_FST, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.059 -6.410 -1.261 6.047 25.594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.5366 0.3657 20.611 <2e-16 ***
## EA3_distance 5.6716 0.4204 13.493 <2e-16 ***
## HUDSON_FST 3.8752 4.7985 0.808 0.419
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.25 on 1805 degrees of freedom
## Multiple R-squared: 0.1401, Adjusted R-squared: 0.1392
## F-statistic: 147.1 on 2 and 1805 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = NIQ_seb_distance ~ EA3_distance + HUDSON_FST, data = merged_df)
##
## Standardized Coefficients::
## (Intercept) EA3_distance HUDSON_FST
## NA 0.36136106 0.02162895
##
## Call:
## lm(formula = NIQ_seb_distance ~ EA4_distance + HUDSON_FST, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.994 -6.113 -1.379 5.429 25.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.3239 0.3399 27.431 < 2e-16 ***
## EA4_distance 5.0037 0.2718 18.409 < 2e-16 ***
## HUDSON_FST -19.2418 5.0023 -3.847 0.000124 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.943 on 1805 degrees of freedom
## Multiple R-squared: 0.203, Adjusted R-squared: 0.2021
## F-statistic: 229.9 on 2 and 1805 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = NIQ_seb_distance ~ EA4_distance + HUDSON_FST, data = merged_df)
##
## Standardized Coefficients::
## (Intercept) EA4_distance HUDSON_FST
## NA 0.5139860 -0.1073951
##
## Call:
## lm(formula = Height_seb_distance ~ Height_distance + HUDSON_FST,
## data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7358 -3.0797 -0.6242 2.7492 14.4989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5138 0.1946 18.058 < 2e-16 ***
## Height_distance 2.3200 0.2295 10.109 < 2e-16 ***
## HUDSON_FST 14.0429 2.0926 6.711 2.58e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.096 on 1805 degrees of freedom
## Multiple R-squared: 0.113, Adjusted R-squared: 0.112
## F-statistic: 115 on 2 and 1805 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = Height_seb_distance ~ Height_distance + HUDSON_FST,
## data = merged_df)
##
## Standardized Coefficients::
## (Intercept) Height_distance HUDSON_FST
## NA 0.2415568 0.1603586
##
## Call:
## lm(formula = Skin_colour_distance ~ SKINC_distance + HUDSON_FST,
## data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0641 -1.1179 -0.1157 1.2261 4.6946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.53216 0.07358 7.232 6.99e-13 ***
## SKINC_distance 1.99071 0.06143 32.405 < 2e-16 ***
## HUDSON_FST 9.96777 0.80776 12.340 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.57 on 1805 degrees of freedom
## Multiple R-squared: 0.4971, Adjusted R-squared: 0.4965
## F-statistic: 892.1 on 2 and 1805 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = Skin_colour_distance ~ SKINC_distance + HUDSON_FST,
## data = merged_df)
##
## Standardized Coefficients::
## (Intercept) SKINC_distance HUDSON_FST
## NA 0.5872355 0.2236193
Prepare dataset for K Neighbours analysis
# Ensure the kknn package is installed
fst_distance_matrix <- fread("/mnt/data/newfiles/merged_new/pairwise_fst_array_hudson_with1kg.fst.summary")
fst_distance_matrix=fst_distance_matrix[fst_distance_matrix$HUDSON_FST>0,]
colnames(fst_distance_matrix)[colnames(fst_distance_matrix) == "#POP1"] <- "POP1"
Array2 <- fread("/mnt/data/pgs/ArrayPGSnew.csv")
Array2$Population <- sub("^1KG\\.", "", Array2$Population)
populations_fst <- unique(c(fst_distance_matrix$POP1, fst_distance_matrix$POP2))
populations_phenotypic <- Array2$Population
common_populations <- intersect(populations_fst, populations_phenotypic)
filtered_fst_distance_matrix <- fst_distance_matrix %>%
filter(POP1 %in% common_populations & POP2 %in% common_populations)
filtered_phenotypic_data <- Array2 %>%
filter(Population %in% common_populations)
fst_matrix <- matrix(NA, nrow = length(common_populations), ncol = length(common_populations),
dimnames = list(common_populations, common_populations))
for (i in 1:nrow(filtered_fst_distance_matrix)) {
fst_matrix[filtered_fst_distance_matrix$POP1[i], filtered_fst_distance_matrix$POP2[i]] <- filtered_fst_distance_matrix$HUDSON_FST[i]
fst_matrix[filtered_fst_distance_matrix$POP2[i], filtered_fst_distance_matrix$POP1[i]] <- filtered_fst_distance_matrix$HUDSON_FST[i]
}
fst_matrix[is.na(fst_matrix)] <- max(fst_matrix, na.rm = TRUE) + 1
ordered_phenotypic_data <- filtered_phenotypic_data[match(common_populations, filtered_phenotypic_data$Population),]
library(spdep)
library(kirkegaard)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: weights
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:gt':
##
## html
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: assertthat
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:effsize':
##
## cohen.d
## The following object is masked from 'package:effectsize':
##
## phi
##
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych':
##
## rescale
## The following object is masked from 'package:assertthat':
##
## are_equal
## The following object is masked from 'package:purrr':
##
## is_logical
## The following object is masked from 'package:effectsize':
##
## standardize
## The following object is masked from 'package:base':
##
## +
# Check for missing values in NIQ_seb
missing_values <- sum(is.na(ordered_phenotypic_data$NIQ_seb))
if (missing_values > 0) {
cat("There are", missing_values, "missing values in NIQ_seb.\n")
# Option 1: Remove entries with missing NIQ_seb values
ordered_phenotypic_data <- ordered_phenotypic_data[!is.na(ordered_phenotypic_data$NIQ_seb),]
# Option 2: Impute missing values (e.g., with the mean or median)
# ordered_phenotypic_data$NIQ_seb[is.na(ordered_phenotypic_data$NIQ_seb)] <- mean(ordered_phenotypic_data$NIQ_seb, na.rm = TRUE)
}
## There are 5 missing values in NIQ_seb.
# Ensure fst_matrix and ordered_phenotypic_data are still aligned after removing/imputing missing values
common_populations <- intersect(rownames(fst_matrix), ordered_phenotypic_data$Population)
fst_matrix <- fst_matrix[common_populations, common_populations]
ordered_phenotypic_data <- ordered_phenotypic_data[match(common_populations, ordered_phenotypic_data$Population),]
for (k in 1:10) {
ordered_phenotypic_data = spatial_knn(ordered_phenotypic_data,
k = k,
vars = c("Skin_colour", "NIQ_seb", "Height_seb"),
suffix = "_k" + k,
dists = fst_matrix
)
}
library(spdep)
# Define the function to compute Moran's I for the lagged variables
Correlation between PGS and lag PGS for K 1 to 10
# Check for missing values in NIQ_seb
missing_values_NIQ <- sum(is.na(ordered_phenotypic_data$NIQ_seb))
if (missing_values_NIQ > 0) {
cat("There are", missing_values_NIQ, "missing values in NIQ_seb.\n")
# Option 1: Remove entries with missing NIQ_seb values
ordered_phenotypic_data <- ordered_phenotypic_data[!is.na(ordered_phenotypic_data$NIQ_seb),]
# Option 2: Impute missing values (e.g., with the mean or median)
# ordered_phenotypic_data$NIQ_seb[is.na(ordered_phenotypic_data$NIQ_seb)] <- mean(ordered_phenotypic_data$NIQ_seb, na.rm = TRUE)
}
# Ensure fst_matrix and ordered_phenotypic_data are still aligned after removing/imputing missing values
common_populations <- intersect(rownames(fst_matrix), ordered_phenotypic_data$Population)
fst_matrix <- fst_matrix[common_populations, common_populations]
ordered_phenotypic_data <- ordered_phenotypic_data[match(common_populations, ordered_phenotypic_data$Population),]
# Define the function to compute correlation between NIQ_seb and its lags
compute_correlation_with_lag <- function(k) {
variable_name <- paste0("NIQ_seb_k", k)
correlation <- cor(ordered_phenotypic_data$NIQ_seb, ordered_phenotypic_data[[variable_name]])
return(correlation)
}
# Compute correlations for k values from 1 to 10
k_values <- 1:10
correlation_values <- sapply(k_values, compute_correlation_with_lag)
# Plot the results
plot(k_values, correlation_values, type='b', xlab='Number of Neighbors (k)', ylab="Correlation",
main="Correlation between NIQ_seb and its Spatial Lags")
Regression with spatial lag computed using K nearest neighbour. K= 3
##
## Call:
## lm(formula = NIQ_seb ~ EA3 + NIQ_seb_k10, data = ordered_phenotypic_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.1675 -5.2255 0.9065 5.8425 18.4382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.4910 19.0139 2.393 0.01902 *
## EA3 7.2836 2.3273 3.130 0.00242 **
## NIQ_seb_k10 0.4739 0.2074 2.285 0.02488 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.858 on 82 degrees of freedom
## Multiple R-squared: 0.5435, Adjusted R-squared: 0.5323
## F-statistic: 48.81 on 2 and 82 DF, p-value: 1.092e-14
##
## Call:
## lm(formula = NIQ_seb ~ EA3 + NIQ_seb_k10, data = ordered_phenotypic_data)
##
## Standardized Coefficients::
## (Intercept) EA3 NIQ_seb_k10
## NA 0.4426437 0.3232060