A summary of the performance of different classification algorithms to predict ethnicity in COG’s AAML-1031 and AAML-0531 patients with AML as well as AML patients from Germany, Canada, and Australia. The control population is derived from the Healthy Retirement Study (HRS) acquired via dbGAP. The HapMap 3 is using as the training dataset. The patients and controls have been genotyped on the Illumina 2.5M Omni bead chip array, and only the common variants with HapMap 3 data are included in the current analysis.

The following methods are shown:

The data were formatted in plink. PCA was performed with GCTA. Next, separate dummy variables for HapMap 3 patients to distinguish European, Hispanic, and African ancestry. This is done so that our models are not influenced by potential multiclassification issues.

# Dummy variables: Ethnicity (EUR, MEX, AFR)
pca$EUR <- as.factor(ifelse(pca$Ethnicity == "EUR", 1, 0))
pca$MEX <- as.factor(ifelse(pca$Ethnicity == "MEX", 1, 0))
pca$AFR <- as.factor(ifelse(pca$Ethnicity == "AFR", 1, 0))

# define groups: AML, Controls (HRS), and HAP
pca$AML <- as.factor(ifelse(pca$Ethnicity == "TEST", ifelse(is.na(pca$PHE) == FALSE, "AML", "HRS"), "HAP"))

Seperate data into a training (hapmap) and test set (AML patients + HRS controls).

# create train and test set
train <- subset(pca, pca$AML == "HAP")
test <- subset(pca, pca$AML != "HAP")

The PCA’s of the clean hapmap individuals has the following shape and is used to compare our classification models against. Evaluating admixture will be of high importance.

# HAPMAP REFERENCE
ggplot(train, aes(x = PCA1, y = PCA2)) + geom_point(aes(colour = Ethnicity), size = 1) 

ggplot(train, aes(x = PCA1, y = PCA3)) + geom_point(aes(colour = Ethnicity), size = 1) 

That is looking super clean, there are well defined clusters, so that should be helpful for the classification algorithms.

Building the Model

K-Nearest Neighbors

# K-Nearest Neighbors 
library(kknn)
knn.fit.eur <- kknn(EUR ~ PCA1 + PCA2 + PCA3, train, test)
knn.fit.mex <- kknn(MEX ~ PCA1 + PCA2 + PCA3, train, test)
knn.fit.afr <- kknn(AFR ~ PCA1 + PCA2 + PCA3, train, test)

knn.fit.eur <- fitted(knn.fit.eur)
knn.fit.mex <- fitted(knn.fit.mex)
knn.fit.afr <- fitted(knn.fit.afr)

knn.eur <- cbind(test, knn.fit.eur)
knn.mex <- cbind(test, knn.fit.mex)
knn.afr <- cbind(test, knn.fit.afr)

Support Vector Machine

# SUPPORT VECTOR MACHINES
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:kknn':
## 
##     contr.dummy
# 5-fold repeated cross-validation
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)

svm.fit.eur <- train(EUR ~ PCA1 + PCA2 + PCA3, method = 'svmRadial', trControl = fitControl, data = train)
## Loading required package: kernlab
svm.fit.mex <- train(MEX ~ PCA1 + PCA2 + PCA3, method = 'svmRadial', trControl = fitControl, data = train)
svm.fit.afr <- train(AFR ~ PCA1 + PCA2 + PCA3, method = 'svmRadial', trControl = fitControl, data = train)

svm.pred.eur <- predict(svm.fit.eur, test)
svm.pred.mex <- predict(svm.fit.mex, test)
svm.pred.afr <- predict(svm.fit.afr, test)

svm.eur <- cbind(test, svm.pred.eur)
svm.mex <- cbind(test, svm.pred.mex)
svm.afr <- cbind(test, svm.pred.afr)

Gradient Boosted Machine

# GRADIENT BOOSTED MACHINE
gbm.fit.eur <- train(EUR ~ PCA1 + PCA2 + PCA3, method = 'gbm', trControl = fitControl, data = train, verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
gbm.fit.mex <- train(MEX ~ PCA1 + PCA2 + PCA3, method = 'gbm', trControl = fitControl, data = train, verbose = FALSE)
gbm.fit.afr <- train(AFR ~ PCA1 + PCA2 + PCA3, method = 'gbm', trControl = fitControl, data = train, verbose = FALSE)

gbm.pred.eur <- predict(gbm.fit.eur, test)
gbm.pred.mex <- predict(gbm.fit.mex, test)
gbm.pred.afr <- predict(gbm.fit.afr, test)

gbm.eur <- cbind(test, gbm.pred.eur)
gbm.mex <- cbind(test, gbm.pred.mex)
gbm.afr <- cbind(test, gbm.pred.afr)

Naive Bayes

# NAIVE BAYES
library(e1071)
nb.fit.eur <- naiveBayes(EUR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
nb.fit.mex <- naiveBayes(MEX ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
nb.fit.afr <- naiveBayes(AFR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)

nb.pred.eur <- predict(nb.fit.eur, test)
nb.pred.mex <- predict(nb.fit.mex, test)
nb.pred.afr <- predict(nb.fit.afr, test)

nb.eur  <- cbind(test, nb.pred.eur)
nb.mex  <- cbind(test, nb.pred.mex)
nb.afr  <- cbind(test, nb.pred.afr)

Random Forests

# Random Forests
rf.fit.eur <- naiveBayes(EUR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
rf.fit.mex <- naiveBayes(MEX ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)
rf.fit.afr <- naiveBayes(AFR ~ PCA1 + PCA2 + PCA3, data = train, laplace = 3)

rf.pred.eur <- predict(rf.fit.eur, test)
rf.pred.mex <- predict(rf.fit.mex, test)
rf.pred.afr <- predict(rf.fit.afr, test)

rf.eur  <- cbind(test, rf.pred.eur)
rf.mex  <- cbind(test, rf.pred.mex)
rf.afr  <- cbind(test, rf.pred.afr)

Linear Discriminant Analysis

#LINEAR DISCRIMINANT ANALYSIS
library(MASS)
lda.eur <- lda(EUR ~ PCA1 + PCA2 + PCA3, data = train)
lda.mex <- lda(MEX ~ PCA1 + PCA2 + PCA3, data = train)
lda.afr <- lda(AFR ~ PCA1 + PCA2 + PCA3, data = train)

lda.pred.eur <- predict(lda.eur, test)
lda.pred.mex <- predict(lda.mex, test)
lda.pred.afr <- predict(lda.afr, test)

lda.eur <- cbind(test, lda.pred.eur)
lda.mex <- cbind(test, lda.pred.mex)
lda.afr <- cbind(test, lda.pred.afr)

Logistic Regression

# Logistic Regression
glm.eur <- glm(EUR ~ PCA1 + PCA2 + PCA3, data = train, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.mex <- lda(MEX ~ PCA1 + PCA2 + PCA3, data = train, family = binomial)
glm.afr <- lda(AFR ~ PCA1 + PCA2 + PCA3, data = train, family = binomial)

glm.pred.eur <- predict(glm.eur, type = "response", newdata = test)
glm.pred.mex <- predict(glm.mex, type = "response", newdata = test)
glm.pred.afr <- predict(glm.afr, type = "response", newdata = test)

glm.eur <- cbind(test, glm.pred.eur)
glm.mex <- cbind(test, glm.pred.mex)
glm.afr <- cbind(test, glm.pred.afr)

# problem with perfect separation for whites in logistic regression. 
# Model coefficient estimates are inflated. 
glm.eur$class <- as.factor(ifelse(glm.eur$glm.pred.eur > 0.5, 1, 0))

Output

Combining all output into one dataset

# cobmine
eur.total <- do.call("cbind", list(test[,c("IID.x", "FID", "AML", "PCA1", "PCA2", "PCA3")],
                                   knn = knn.fit.eur, svm = svm.pred.eur, 
                                   gbm = gbm.pred.eur, nb = nb.pred.eur, rf = rf.pred.eur, 
                                   lda = lda.pred.eur$class, glm = glm.eur$class))

mex.total <- do.call("cbind", list(test[,c("IID.x", "FID", "AML", "PCA1", "PCA2", "PCA3")],
                                   knn = knn.fit.mex, svm = svm.pred.mex, 
                                   gbm = gbm.pred.mex, nb = nb.pred.mex, rf = rf.pred.mex, 
                                   lda = lda.pred.mex$class, glm = glm.mex$class))

afr.total <- do.call("cbind", list(test[,c("IID.x", "FID", "AML", "PCA1", "PCA2", "PCA3")],
                                   knn = knn.fit.afr, svm = svm.pred.afr, 
                                   gbm = gbm.pred.afr, nb = nb.pred.afr, rf = rf.pred.afr, 
                                   lda = lda.pred.afr$class, glm = glm.afr$class))
total <- do.call("rbind", list(eur.total, mex.total, afr.total))

K-Nearest Neighbors

  • EUR: many admix patients classified as white

  • MEX: many admix patients classified as hispanic

  • AFR: many admix patients classified as black

## Loading required package: gridExtra
## Loading required package: grid

##      
##        AML  HRS
##   EUR 1237 9473
##   MEX  298 1034
##   AFR  139 1665
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

Support Vector Machine

  • EUR: clean subset of white patients

  • MEX: clean subset of hispanic patients

  • AFR: clean subset of black patients

##      
##        AML  HRS
##   EUR 1135 9138
##   MEX  268  971
##   AFR   98 1586

Gradient Boosted Machine

  • EUR: many admix patients classified as white

  • MEX: many admix patients classified as hispanic [no missed hispanics]

  • AFR: many admix patients classified as black

##      
##        AML  HRS
##   EUR 1171 8996
##   MEX  281  975
##   AFR  127 1644

Naive Bayes

  • EUR: clean subset of white patients [true hits]

  • MEX: clean subset of hispanic patients [+ misses true hispanic patients]

  • AFR: many admix patients classified as black

##      
##        AML  HRS
##   EUR 1129 9087
##   MEX  214  785
##   AFR  119 1641

Random Forests

  • EUR: clean subset of white patients [true hits]

  • MEX: clean subset of hispanic patients [+ misses true hispanic patients]

  • AFR: many admix patients classified as black

##      
##        AML  HRS
##   EUR 1129 9087
##   MEX  214  785
##   AFR  119 1641

Linear Discriminant Analysis

  • EUR: many admix patients classified as white

  • MEX: many admix patients classified as hispanic [+ misses true hispanic patients]

  • AFR: many admix patients classified as black

##      
##        AML  HRS
##   EUR 1338 9629
##   MEX  227  801
##   AFR  117 1626

Logistic Regression

  • EUR: many admix patients classified as white

  • MEX: many admix patients classified as hispanic [+ misses true hispanic patients]

  • AFR: many admix patients classified as black

##      
##        AML  HRS
##   EUR 1221 9421
##   MEX  227  801
##   AFR  117 1626

In terms of further genome-wide analysis, this is the current list of prioritized ethnicity classification to discuss further.

  1. Support Vector Machine

  2. Naive Bayes

  3. Random Forest

  4. Gradient Boosted Machine

  5. K-Nearest Neighbors

  6. Linear Discriminant Analysis

  7. Logistic Regression