# populations
eur <- read.table("JAN2017.eur", T, stringsAsFactors = F)
eur$race <- "EUR"
mex <- read.table("JAN2017.mex", T, stringsAsFactors = F)
mex$race <- "MEX"
afr <- read.table("JAN2017.afr", T, stringsAsFactors = F)
afr$race <- "AFR"

pop <- rbind(eur, mex, afr)

#
# AML.A: REMOVE PCA4 > 0.01 | PCA6 > 0.2
#
AML.A <- read.table("110302_IRA-AMLx864-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.A) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.A <- merge(AML.A, pop, by = c("FID", "IID"), all.x = T)
AML.A$race <- ifelse(is.na(AML.A$race) == T, "EXCL", AML.A$race)
table(AML.A$race)
## 
##  EUR EXCL  MEX 
##  381   20   81
ggpairs(data = AML.A,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.A$CUT <- ifelse(AML.A$PCA4 > 0.1 | AML.A$PCA6 > 0.2, "OUT", "IN")
ggpairs(data = AML.A,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = CUT),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

write.table(AML.A[which(AML.A$CUT == "OUT"), c("FID", "IID")], 
            "110302_IRA-AMLx864-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# AML.B: LOOKING GOOD
#
AML.B <- read.table("110621_IRA-AMLx48-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.B) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.B <- merge(AML.B, pop, by = c("FID", "IID"), all.x = T)
AML.B$race <- ifelse(is.na(AML.B$race) == T, "EXCL", AML.B$race)
table(AML.B$race)
## 
##  AFR  EUR EXCL  MEX 
##    2   19    4   10
ggpairs(data = AML.B,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.B$CUT <- NA
write.table(AML.B[which(AML.B$CUT == "OUT"), c("FID", "IID")], 
            "110621_IRA-AMLx48-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# AML.C: LOOKING GOOD
#
AML.C <- read.table("120221_IRA-AMLx119-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.C) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.C <- merge(AML.C, pop, by = c("FID", "IID"), all.x = T)
AML.C$race <- ifelse(is.na(AML.C$race) == T, "EXCL", AML.C$race)
table(AML.C$race)
## 
##  EUR EXCL  MEX 
##   89    9   15
ggpairs(data = AML.C,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.C$CUT <- NA
write.table(AML.C[which(AML.C$CUT == "OUT"), c("FID", "IID")], 
            "120221_IRA-AMLx119-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#  
# AML.D: PCA3 < -0.2 | PCA4 < -0.2 | PCA7 > 0.2 
#
AML.D <- read.table("130625_IRA-AMLx376-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.D) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.D <- merge(AML.D, pop, by = c("FID", "IID"), all.x = T)
AML.D$race <- ifelse(is.na(AML.D$race) == T, "EXCL", AML.D$race)
table(AML.D$race)
## 
##  AFR  EUR EXCL  MEX 
##    4  277   30   49
ggpairs(data = AML.D,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.D$CUT <- ifelse(AML.D$PCA3 < -0.2 | AML.D$PCA4 < -0.2 | AML.D$PCA7 > 0.2, "OUT", "IN")
ggpairs(data = AML.D,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = CUT),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

write.table(AML.D[which(AML.D$CUT == "OUT"), c("FID", "IID")], 
            "130625_IRA-AMLx376-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# AML.E: LOOKING GOOD
#
AML.E <- read.table("140214_IRA-AMLx264-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.E) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.E <- merge(AML.E, pop, by = c("FID", "IID"), all.x = T)
AML.E$race <- ifelse(is.na(AML.E$race) == T, "EXCL", AML.E$race)
table(AML.E$race)
## 
##  AFR  EUR EXCL  MEX 
##    5  193   24   11
ggpairs(data = AML.E,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.E$CUT <- NA
write.table(AML.E[which(AML.E$CUT == "OUT"), c("FID", "IID")], 
            "140214_IRA-AMLx264-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# AML.F: LOOKING GOOD
#
AML.F <- read.table("140923_IRA-AMLx264-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.F) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.F <- merge(AML.F, pop, by = c("FID", "IID"), all.x = T)
AML.F$race <- ifelse(is.na(AML.F$race) == T, "EXCL", AML.F$race)
table(AML.F$race)
## 
##  AFR  EUR EXCL  MEX 
##    1  179   28   30
ggpairs(data = AML.F,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.F$CUT <- NA
write.table(AML.F[which(AML.F$CUT == "OUT"), c("FID", "IID")], 
            "140923_IRA-AMLx264-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# AML.G: LOOKING GOOD
#
AML.G <- read.table("150728_IRA-AMLx440-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.G) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.G <- merge(AML.G, pop, by = c("FID", "IID"), all.x = T)
AML.G$race <- ifelse(is.na(AML.G$race) == T, "EXCL", AML.G$race)
table(AML.G$race)
## 
##  AFR  EUR EXCL  MEX 
##   87  188   57   39
ggpairs(data = AML.G,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.G$CUT <- NA
write.table(AML.G[which(AML.G$CUT == "OUT"), c("FID", "IID")], 
            "150728_IRA-AMLx440-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)
#
# AML.H: LOOKING GOOD
#
AML.H <- read.table("151014_IRA-AMLx26-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(AML.H) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
AML.H <- merge(AML.H, pop, by = c("FID", "IID"), all.x = T)
AML.H$race <- ifelse(is.na(AML.H$race) == T, "EXCL", AML.H$race)
table(AML.H$race)
## 
##  AFR  EUR EXCL  MEX 
##    3   11    1   10
ggpairs(data = AML.H,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

AML.H$CUT <- NA
write.table(AML.H[which(AML.H$CUT == "OUT"), c("FID", "IID")], 
            "151014_IRA-AMLx26-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)
#
# HRS: LOOKING GOOD
#
hrs <- read.table("HRS-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(hrs) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
hrs <- merge(hrs, pop, by = c("FID", "IID"), all.x = T)
hrs$race <- ifelse(is.na(hrs$race) == T, "EXCL", hrs$race)
table(hrs$race)
## 
##  AFR  EUR EXCL  MEX 
##  412 9525  224  814
ggpairs(data = hrs,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

hrs$CUT <- NA
write.table(hrs[which(hrs$CUT == "OUT"), c("FID", "IID")], 
            "HRS-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)
#
# FUCHS: REMOVE PATIENTS WITH PCA > 0.03
#
fuchs <- read.table("FUCHS-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(fuchs) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
fuchs <- merge(fuchs, pop, by = c("FID", "IID"), all.x = T)
fuchs$race <- ifelse(is.na(fuchs$race) == T, "EXCL", fuchs$race)
table(fuchs$race)
## 
##  AFR  EUR EXCL  MEX 
##    4 1585 1658    1
ggpairs(data = fuchs,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

fuchs$CUT <- ifelse(fuchs$PCA1 > 0.03, "OUT", "IN")
ggpairs(data = fuchs,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = CUT),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

write.table(fuchs[which(fuchs$CUT == "OUT"), c("FID", "IID")], 
            "FUCHS-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# KORA: LOOKING GOOD
#
kora <- read.table("KORA-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(kora) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
kora <- merge(kora, pop, by = c("FID", "IID"), all.x = T)
kora$race <- ifelse(is.na(kora$race) == T, "EXCL", kora$race)
table(kora$race)
## 
##  EUR EXCL 
## 1867    1
ggpairs(data = kora,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

kora$CUT <- NA
write.table(kora[which(kora$CUT == "OUT"), c("FID", "IID")], 
            "KORA-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# SAGE: LOOKING GOOD
#
sage <- read.table("SAGE-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(sage) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
sage <- merge(sage, pop, by = c("FID", "IID"), all.x = T)
sage$race <- ifelse(is.na(sage$race) == T, "EXCL", sage$race)
table(sage$race)
## 
##  AFR  EUR EXCL  MEX 
##  481  927   24   22
ggpairs(data = sage,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

sage$CUT <- NA
write.table(sage[which(sage$CUT == "OUT"), c("FID", "IID")], 
            "SAGE-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)

#
# AREDS: REMOVE PEOPE WITH PCA1 < -0.02 
# THE SEPERATION IN PCA2 IS ACTUALLY HIGH VALUES IN PCA1
# so its' residual variance captured from PCA1 (DON"T REMOVE)
#
areds <- read.table("AREDS-b37.eigenvec", header = F, sep = " ", stringsAsFactors = F)
names(areds) <- c("FID", "IID", "PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", 
                "PCA7", "PCA8", "PCA9", "PCA10", "PCA11", "PCA12", "PCA13", "PCA14", 
                "PCA15", "PCA16", "PCA17", "PCA18", "PCA19", "PCA20")
areds <- merge(areds, pop, by = c("FID", "IID"), all.x = T)
areds$race <- ifelse(is.na(areds$race) == T, "EXCL", areds$race)
table(areds$race)
## 
##  EUR EXCL  MEX 
## 1660    1    2
ggpairs(data = areds,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = race),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

areds$CUT <- ifelse(areds$PCA1 < -0.02, "OUT", "IN")
ggpairs(data = areds,
        diag=list(continuous = "density"),
        columns=3:10,
        mapping=ggplot2::aes(colour = CUT),
        axisLabels="show")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'density' to 'densityDiag'

write.table(areds[which(areds$CUT == "OUT"), c("FID", "IID")], 
            "AREDS-b37.batch-remove", 
            sep = "\t", 
            col.names = T,
            row.names = F,
            quote = F)