A summary of the performance of different classification algorithms to predict ethnicity in COG’s AAML-1031 and AAML-0531 patients with AML as well as AML patients from Germany, Canada, and Australia. The control population is derived from the Healthy Retirement Study (HRS) acquired via dbGAP. The HapMap 3 is using as the training dataset. The patients and controls have been genotyped on the Illumina 2.5M Omni bead chip array, and only the common variants with HapMap 3 data are included in the current analysis.
The following methods are shown:
K-nearest neighbors
Support Vector Machines
Gradient Boosted Machine
Naive Bayes
Random Forest
Linear Discrimant Analysis
Logistic Regression
The data were formatted in plink. PCA was performed with GCTA. Next, separate dummy variables for HapMap 3 patients to distinguish European, Hispanic, and African ancestry. This is done so that our models are not influenced by potential multiclassification issues.
# Dummy variables: Ethnicity (EUR, MEX, AFR)
pca$EUR <- as.factor(ifelse(pca$Ethnicity == "EUR", 1, 0))
pca$MEX <- as.factor(ifelse(pca$Ethnicity == "MEX", 1, 0))
pca$AFR <- as.factor(ifelse(pca$Ethnicity == "AFR", 1, 0))
# define groups: AML, Controls (HRS), and HAP
pca$AML <- as.factor(ifelse(pca$Ethnicity == "TEST", ifelse(is.na(pca$PHE) == FALSE, "AML", "HRS"), "HAP"))
Seperate data into a training (hapmap) and test set (AML patients + HRS controls).
# create train and test set
train <- subset(pca, pca$AML == "HAP")
test <- subset(pca, pca$AML != "HAP")
The PCA’s of the clean hapmap individuals has the following shape and is used to compare our classification models against. Evaluating admixture will be of high importance.
# HAPMAP REFERENCE
ggplot(train, aes(x = PCA1, y = PCA2)) + geom_point(aes(colour = Ethnicity), size = 1)
ggplot(train, aes(x = PCA1, y = PCA3)) + geom_point(aes(colour = Ethnicity), size = 1)
That is looking super clean, there are well defined clusters, so that should be helpful for the classification algorithms.
# K-Nearest Neighbors
library(kknn)
knn.fit.eur <- kknn(EUR ~ PCA1 + PCA2 + PCA3, train, test)
knn.fit.mex <- kknn(MEX ~ PCA1 + PCA2 + PCA3, train, test)
knn.fit.afr <- kknn(AFR ~ PCA1 + PCA2 + PCA3, train, test)
knn.fit.eur <- fitted(knn.fit.eur)
knn.fit.mex <- fitted(knn.fit.mex)
knn.fit.afr <- fitted(knn.fit.afr)
knn.eur <- cbind(test, knn.fit.eur)
knn.mex <- cbind(test, knn.fit.mex)
knn.afr <- cbind(test, knn.fit.afr)
# SUPPORT VECTOR MACHINES
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:kknn':
##
## contr.dummy
# 5-fold repeated cross-validation
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
svm.fit.eur <- train(EUR ~ PCA1 + PCA2 + PCA3, method = 'svmRadial', trControl = fitControl, data = train)
## Loading required package: kernlab
svm.fit.mex <- train(MEX ~ PCA1 + PCA2 + PCA3, method = 'svmRadial', trControl = fitControl, data = train)
svm.fit.afr <- train(AFR ~ PCA1 + PCA2 + PCA3, method = 'svmRadial', trControl = fitControl, data = train)
svm.pred.eur <- predict(svm.fit.eur, test)
svm.pred.mex <- predict(svm.fit.mex, test)
svm.pred.afr <- predict(svm.fit.afr, test)
svm.eur <- cbind(test, svm.pred.eur)
svm.mex <- cbind(test, svm.pred.mex)
svm.afr <- cbind(test, svm.pred.afr)
# GRADIENT BOOSTED MACHINE
gbm.fit.eur <- train(EUR ~ PCA1 + PCA2 + PCA3, method = 'gbm', trControl = fitControl, data = train, verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
gbm.fit.mex <- train(MEX ~ PCA1 + PCA2 + PCA3, method = 'gbm', trControl = fitControl, data = train, verbose = FALSE)
gbm.fit.afr <- train(AFR ~ PCA1 + PCA2 + PCA3, method = 'gbm', trControl = fitControl, data = train, verbose = FALSE)
gbm.pred.eur <- predict(gbm.fit.eur, test)
gbm.pred.mex <- predict(gbm.fit.mex, test)
gbm.pred.afr <- predict(gbm.fit.afr, test)
gbm.eur <- cbind(test, gbm.pred.eur)
gbm.mex <- cbind(test, gbm.pred.mex)
gbm.afr <- cbind(test, gbm.pred.afr)
# NAIVE BAYES
library(e1071)
nb.fit.eur <- naiveBayes(EUR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
nb.fit.mex <- naiveBayes(MEX ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
nb.fit.afr <- naiveBayes(AFR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
nb.pred.eur <- predict(nb.fit.eur, test)
nb.pred.mex <- predict(nb.fit.mex, test)
nb.pred.afr <- predict(nb.fit.afr, test)
nb.eur <- cbind(test, nb.pred.eur)
nb.mex <- cbind(test, nb.pred.mex)
nb.afr <- cbind(test, nb.pred.afr)
# Random Forests
rf.fit.eur <- naiveBayes(EUR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
rf.fit.mex <- naiveBayes(MEX ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
rf.fit.afr <- naiveBayes(AFR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
rf.pred.eur <- predict(rf.fit.eur, test)
rf.pred.mex <- predict(rf.fit.mex, test)
rf.pred.afr <- predict(rf.fit.afr, test)
rf.eur <- cbind(test, rf.pred.eur)
rf.mex <- cbind(test, rf.pred.mex)
rf.afr <- cbind(test, rf.pred.afr)
#LINEAR DISCRIMINANT ANALYSIS
library(MASS)
lda.eur <- lda(EUR ~ PCA1 + PCA2 + PCA3, data = train)
lda.mex <- lda(MEX ~ PCA1 + PCA2 + PCA3, data = train)
lda.afr <- lda(AFR ~ PCA1 + PCA2 + PCA3, data = train)
lda.pred.eur <- predict(lda.eur, test)
lda.pred.mex <- predict(lda.mex, test)
lda.pred.afr <- predict(lda.afr, test)
lda.eur <- cbind(test, lda.pred.eur)
lda.mex <- cbind(test, lda.pred.mex)
lda.afr <- cbind(test, lda.pred.afr)
# Logistic Regression
glm.eur <- glm(EUR ~ PCA1 + PCA2 + PCA3, data = train, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.mex <- lda(MEX ~ PCA1 + PCA2 + PCA3, data = train, family = binomial)
glm.afr <- lda(AFR ~ PCA1 + PCA2 + PCA3, data = train, family = binomial)
glm.pred.eur <- predict(glm.eur, type = "response", newdata = test)
glm.pred.mex <- predict(glm.mex, type = "response", newdata = test)
glm.pred.afr <- predict(glm.afr, type = "response", newdata = test)
glm.eur <- cbind(test, glm.pred.eur)
glm.mex <- cbind(test, glm.pred.mex)
glm.afr <- cbind(test, glm.pred.afr)
# problem with perfect separation for whites in logistic regression.
# Model coefficient estimates are inflated.
glm.eur$class <- as.factor(ifelse(glm.eur$glm.pred.eur > 0.5, 1, 0))
Combining all output into one dataset
# cobmine
eur.total <- do.call("cbind", list(test[,c("IID.x", "FID", "AML", "PCA1", "PCA2", "PCA3")],
knn = knn.fit.eur, svm = svm.pred.eur,
gbm = gbm.pred.eur, nb = nb.pred.eur, rf = rf.pred.eur,
lda = lda.pred.eur$class, glm = glm.eur$class))
mex.total <- do.call("cbind", list(test[,c("IID.x", "FID", "AML", "PCA1", "PCA2", "PCA3")],
knn = knn.fit.mex, svm = svm.pred.mex,
gbm = gbm.pred.mex, nb = nb.pred.mex, rf = rf.pred.mex,
lda = lda.pred.mex$class, glm = glm.mex$class))
afr.total <- do.call("cbind", list(test[,c("IID.x", "FID", "AML", "PCA1", "PCA2", "PCA3")],
knn = knn.fit.afr, svm = svm.pred.afr,
gbm = gbm.pred.afr, nb = nb.pred.afr, rf = rf.pred.afr,
lda = lda.pred.afr$class, glm = glm.afr$class))
total <- do.call("rbind", list(eur.total, mex.total, afr.total))
EUR: many admix patients classified as white
MEX: many admix patients classified as hispanic
AFR: many admix patients classified as black
## Loading required package: gridExtra
## Loading required package: grid
##
## AML HRS
## EUR 1237 9473
## MEX 298 1034
## AFR 139 1665
## Loading required package: data.table
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
EUR: clean subset of white patients
MEX: clean subset of hispanic patients
AFR: clean subset of black patients
##
## AML HRS
## EUR 1135 9138
## MEX 268 971
## AFR 98 1586
EUR: many admix patients classified as white
MEX: many admix patients classified as hispanic [no missed hispanics]
AFR: many admix patients classified as black
##
## AML HRS
## EUR 1171 8996
## MEX 281 975
## AFR 127 1644
EUR: clean subset of white patients [true hits]
MEX: clean subset of hispanic patients [+ misses true hispanic patients]
AFR: many admix patients classified as black
##
## AML HRS
## EUR 1129 9087
## MEX 214 785
## AFR 119 1641
EUR: clean subset of white patients [true hits]
MEX: clean subset of hispanic patients [+ misses true hispanic patients]
AFR: many admix patients classified as black
##
## AML HRS
## EUR 1129 9087
## MEX 214 785
## AFR 119 1641
EUR: many admix patients classified as white
MEX: many admix patients classified as hispanic [+ misses true hispanic patients]
AFR: many admix patients classified as black
##
## AML HRS
## EUR 1338 9629
## MEX 227 801
## AFR 117 1626
EUR: many admix patients classified as white
MEX: many admix patients classified as hispanic [+ misses true hispanic patients]
AFR: many admix patients classified as black
##
## AML HRS
## EUR 1221 9421
## MEX 227 801
## AFR 117 1626
In terms of further genome-wide analysis, this is the current list of prioritized ethnicity classification to discuss further.
Support Vector Machine
Naive Bayes
Random Forest
Gradient Boosted Machine
K-Nearest Neighbors
Linear Discriminant Analysis
Logistic Regression