first need to deal with families that have more than 1 or 2 sibs/twins
# read in raw data files
acspsw03 <- fread("/Users/clairemorrison/Desktop/ctc_sleep_psych/acspsw03.txt", header=T, data.table=F)
# just keep baseline for dems/ids
# it looks like zygo is only in baseline
ids1 <- acspsw03[ which(acspsw03$eventname=='baseline_year_1_arm_1'), ]
# look for family that have multiple sets of twins/sibs
# if families have multiple sets of sibs/twins we're prioritizing:
# 1 - MZs over any other pairs
# 2 - DZs over sibs
# 3 - if 2 pairs of sibs pick one set at random
table(ids1$rel_relationship)
##
## 0 1 2 3
## 7898 1810 2138 30
# create a subset of data with just subjectkey and family ID info that is easier to work with
vars <- c("subjectkey", "interview_age", "rel_family_id", "rel_group_id", "rel_ingroup_order",
"rel_relationship", "rel_same_sex", "genetic_zygosity_status_1", "genetic_zygosity_status_2", "genetic_zygosity_status_3", "genetic_zygosity_status_4",
"genetic_paired_subjectid_1", "genetic_paired_subjectid_2", "genetic_paired_subjectid_3", "genetic_paired_subjectid_4", "genetic_pi_hat_1", "genetic_pi_hat_2", "genetic_pi_hat_3", "genetic_pi_hat_4")
idsub<-ids1[vars]
# need to ID folks whose family IDs repeat more than once
# order and then count fam IDs
idsub <- idsub[order(idsub$rel_family_id, idsub$rel_group_id),]
idsub$person_count <- with(idsub, ave(as.character(rel_family_id), rel_family_id, FUN = seq_along))
table(idsub$person_count)
##
## 1 2 3 4 5
## 9854 1956 63 2 1
FamMems<-idsub %>% count(rel_family_id)
idsub<-merge(idsub, FamMems, by = "rel_family_id", all = TRUE)
names(idsub)[names(idsub) == 'n'] <- 'fam_mems'
# dealing with the fams with 3 kids, 4 kids, and 5
# create subsets of each by fam_mems
idsub1 <- idsub[ which(idsub$fam_mems==1), ]
idsub2 <- idsub[ which(idsub$fam_mems==2), ]
idsub3 <- idsub[ which(idsub$fam_mems==3), ]
idsub4 <- idsub[ which(idsub$fam_mems==4), ]
idsub5 <- idsub[ which(idsub$fam_mems==5), ]
# don't need to mess with 1s and 2s
# start with fams of 3s
table(idsub3$rel_relationship)
##
## 1 2 3
## 86 70 27
# subset within fams of 3 to different sib types for reducing to 2 sibs per family
idsub3trips <- idsub3[ which(idsub3$rel_relationship==3), ]
idsub3twinssibs <- idsub3[ which(idsub3$rel_relationship==2 | idsub3$rel_relationship==1), ]
#### fams of 3 ####
# start with triplets
# in theory, I'd start by prioritizing MZs within trip pairs
# but genetic zygostiy has a lot of NAs and the only ones listed are DZ
# so taking a random sample of 2 of the 3 trips
# set seed so random person IDs don't change everytime you run the code
set.seed(13)
idsub3trips$rand<-runif(nrow(idsub3trips), min=0, max=1)
# order by fam and new random ID
idsub3trips <- idsub3trips[order(idsub3trips$rel_family_id, idsub3trips$rand),]
# keep first 2 rows based on duplicates in rel_family_id
idsub3trips<-idsub3trips %>%
group_by(rel_family_id) %>%
filter(row_number() <= 2)
# onto the families of 3 that have some combination of twins and sibs
table(idsub3twinssibs$rel_relationship)
##
## 1 2
## 86 70
# prioritize twins
# ie if a family has a twin set and 1 sibling, just keep the twins
# if any of these families just have 3 non twin siblings, randomly pick 2
# use Group ID (twins and triplets in the same family share a group ID) here
# a Group ID = 3 indicates 3 sibs, drop person 3 as group ID seems to have been randomly assigned
idsub3twinssibs <- idsub3twinssibs[ which(idsub3twinssibs$rel_group_id <3),]
# now deal with the twin/sib combos
idsub3twinssibs <- idsub3twinssibs[order(idsub3twinssibs$rel_family_id, idsub3twinssibs$rel_group_id),]
# keep first 2 rows based on duplicates in rel_family_id and rel_group_id
idsub3twinssibs<-idsub3twinssibs %>%
group_by(rel_family_id) %>%
filter(row_number() <= 2)
# combine families of 3 back into one dataframe
idsub3clean<-rbind(idsub3trips,idsub3twinssibs)
#### deal with fams of 4 ####
# it's only 1 fam, both pairs are MZ
# pick one at random
sample(idsub4$rel_group_id, 1)
## [1] 2
# 2
idsub4clean <- idsub4[ which(idsub4$rel_group_id == 2),]
#### deal with fams of 5 ####
# one set of triplets, one set of DZ twins
# 2 combos appear to be flagged as DZ, pick one at random
sample(idsub5$rel_group_id, 1)
## [1] 2
# 2
idsub5clean <- idsub5[ which(idsub5$rel_group_id == 2),]
#### rbind all the cleaned up ID files back together
idsub3clean$rand<-NULL
idsclean<-rbind(idsub1, idsub2, idsub3clean, idsub4clean, idsub5clean)
# double check no more than 2 people per fam
FamMems<-idsclean %>% count(rel_family_id)
head(idsclean)
## rel_family_id subjectkey interview_age rel_group_id rel_ingroup_order
## 1 0 NDAR_INV0A86UD86 117 1 1
## 4 3 NDAR_INV0HALXUFY 126 1 1
## 5 4 NDAR_INV0HL5D3DF 120 1 1
## 6 5 NDAR_INV0JVUB582 112 1 1
## 7 6 NDAR_INV0JYWL5T8 115 1 1
## 13 12 NDAR_INV0UV5WZUN 121 1 1
## rel_relationship rel_same_sex genetic_zygosity_status_1
## 1 0 NA NA
## 4 0 NA NA
## 5 0 NA NA
## 6 0 NA NA
## 7 0 NA NA
## 13 0 NA NA
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 13 NA
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1
## 4
## 5
## 6
## 7
## 13
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1 NA NA 1 1
## 4 NA NA 1 1
## 5 NA NA 1 1
## 6 NA NA 1 1
## 7 NA NA 1 1
## 13 NA NA 1 1
table(idsclean$rel_relationship) # singleton, sib, twin, trip
##
## 0 1 2 3
## 7898 1771 2123 18
table(idsclean$fam_mems) # how many were in each family -- before we removed
##
## 1 2 3 4 5
## 7898 3786 122 2 2
nrow(ids1)
## [1] 11876
nrow(idsclean)
## [1] 11810
nrow(idsclean)
## [1] 11810
# sam has selected 11810 to keep
sam picked those 11810 people… and now we need to match them to a sib or twin based on the genetic paired subjects 1-5
basically what we need to do is pick sib/twin pairs and extract zygosity. if we look at sam’s file it has multiple permutations of relatedness and their zyg, but lots of NAs
genetic_zygosity_status: 1= monozygotic ; 2= dizygotic ; 3=siblings ;-1= not available (twins/sibs, genetic_pi_hat not calculated)
library(stringr)
head(idsclean)
## rel_family_id subjectkey interview_age rel_group_id rel_ingroup_order
## 1 0 NDAR_INV0A86UD86 117 1 1
## 4 3 NDAR_INV0HALXUFY 126 1 1
## 5 4 NDAR_INV0HL5D3DF 120 1 1
## 6 5 NDAR_INV0JVUB582 112 1 1
## 7 6 NDAR_INV0JYWL5T8 115 1 1
## 13 12 NDAR_INV0UV5WZUN 121 1 1
## rel_relationship rel_same_sex genetic_zygosity_status_1
## 1 0 NA NA
## 4 0 NA NA
## 5 0 NA NA
## 6 0 NA NA
## 7 0 NA NA
## 13 0 NA NA
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 13 NA
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1
## 4
## 5
## 6
## 7
## 13
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1 NA NA 1 1
## 4 NA NA 1 1
## 5 NA NA 1 1
## 6 NA NA 1 1
## 7 NA NA 1 1
## 13 NA NA 1 1
twins<- idsclean %>% filter(rel_relationship==2)
head(twins)
## rel_family_id subjectkey interview_age rel_group_id rel_ingroup_order
## 85 87 NDAR_INVARKPPU88 115 1 2
## 86 87 NDAR_INV6XTE4VEF 115 1 1
## 109 113 NDAR_INV9WJ1WBFC 125 1 1
## 110 113 NDAR_INVBMFRB748 125 1 2
## 114 118 NDAR_INV24D005FC 121 1 1
## 115 118 NDAR_INVK598LFL1 121 1 2
## rel_relationship rel_same_sex genetic_zygosity_status_1
## 85 2 0 NA
## 86 2 0 NA
## 109 2 1 NA
## 110 2 1 NA
## 114 2 0 NA
## 115 2 0 NA
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 85 NA NA
## 86 NA NA
## 109 NA NA
## 110 NA NA
## 114 NA NA
## 115 NA NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 85 NA NDAR_INV6XTE4VEF
## 86 NA NDAR_INVARKPPU88
## 109 NA NDAR_INVBMFRB748
## 110 NA NDAR_INV9WJ1WBFC
## 114 NA NDAR_INVK598LFL1
## 115 NA NDAR_INV24D005FC
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 85
## 86
## 109
## 110
## 114
## 115
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 85 NA NA
## 86 NA NA
## 109 NA NA
## 110 NA NA
## 114 NA NA
## 115 NA NA
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 85 NA NA 1 2
## 86 NA NA 2 2
## 109 NA NA 1 2
## 110 NA NA 2 2
## 114 NA NA 1 2
## 115 NA NA 2 2
sibs<- idsclean %>% filter(rel_relationship==1)
head(sibs)
## rel_family_id subjectkey interview_age rel_group_id rel_ingroup_order
## 2 1 NDAR_INV0FAET6YA 128 1 1
## 3 1 NDAR_INV2VD87TK5 113 2 1
## 8 9 NDAR_INV0N0JE94U 108 1 1
## 9 9 NDAR_INVJPKAU189 111 2 1
## 18 17 NDAR_INV098DVK8B 112 1 1
## 19 17 NDAR_INVMP7JC377 127 2 1
## rel_relationship rel_same_sex genetic_zygosity_status_1
## 2 1 NA NA
## 3 1 NA NA
## 8 1 NA NA
## 9 1 NA NA
## 18 1 NA NA
## 19 1 NA NA
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 2 NA NA
## 3 NA NA
## 8 NA NA
## 9 NA NA
## 18 NA NA
## 19 NA NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 2 NA NDAR_INV2VD87TK5
## 3 NA NDAR_INV0FAET6YA
## 8 NA NDAR_INVJPKAU189
## 9 NA NDAR_INV0N0JE94U
## 18 NA NDAR_INVMP7JC377
## 19 NA NDAR_INV098DVK8B
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 2
## 3
## 8
## 9
## 18
## 19
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 2 NA NA
## 3 NA NA
## 8 NA NA
## 9 NA NA
## 18 NA NA
## 19 NA NA
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 2 NA NA 1 2
## 3 NA NA 2 2
## 8 NA NA 1 2
## 9 NA NA 2 2
## 18 NA NA 1 2
## 19 NA NA 2 2
trips<- idsclean %>% filter(rel_relationship==3)
trips
## rel_family_id subjectkey interview_age rel_group_id
## 1387 11 NDAR_INVY4FZKKCZ 115 1
## 2250 11 NDAR_INV0MCARYLD 115 1
## 395 811 NDAR_INVWP2U8M5Y 120 1
## 4322 811 NDAR_INVTMFT5TJF 120 1
## 5143 3072 NDAR_INVTDXHD95M 126 1
## 6334 3072 NDAR_INV35DUHZ83 126 1
## 7339 5159 NDAR_INV6K2HE999 117 1
## 8210 5159 NDAR_INVCV07Y8GY 117 1
## 9435 5875 NDAR_INVRDMA0DZT 123 1
## 10 5875 NDAR_INVNEBJA9J9 123 1
## 11 6578 NDAR_INVF20K5ZNE 110 1
## 12 6578 NDAR_INVBR0R4R8M 110 1
## 1388 6931 NDAR_INVMLFXTJVE 107 1
## 1416 6931 NDAR_INV9RYUGDF2 107 1
## 1567 8409 NDAR_INV5PWUMKR0 118 1
## 1625 8409 NDAR_INV7P9PRTBA 118 1
## 17100 11671 NDAR_INVGMYU4GT3 126 1
## 18100 11671 NDAR_INVBKNVGWPC 126 1
## rel_ingroup_order rel_relationship rel_same_sex genetic_zygosity_status_1
## 1387 3 3 0 NA
## 2250 2 3 0 NA
## 395 1 3 1 NA
## 4322 3 3 1 NA
## 5143 1 3 1 NA
## 6334 2 3 1 NA
## 7339 2 3 1 NA
## 8210 3 3 1 NA
## 9435 2 3 0 2
## 10 1 3 0 2
## 11 3 3 0 NA
## 12 2 3 0 NA
## 1388 1 3 0 NA
## 1416 2 3 0 NA
## 1567 2 3 1 NA
## 1625 1 3 1 NA
## 17100 3 3 0 2
## 18100 2 3 0 2
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 1387 NA NA
## 2250 NA NA
## 395 NA NA
## 4322 NA NA
## 5143 NA NA
## 6334 NA NA
## 7339 NA NA
## 8210 NA NA
## 9435 NA NA
## 10 2 NA
## 11 NA NA
## 12 NA NA
## 1388 NA NA
## 1416 NA NA
## 1567 NA NA
## 1625 NA NA
## 17100 2 NA
## 18100 2 NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1387 NA NDAR_INV0TEH16CM
## 2250 NA NDAR_INV0TEH16CM
## 395 NA NDAR_INVCPUDHWZM
## 4322 NA NDAR_INVWP2U8M5Y
## 5143 NA NDAR_INV35DUHZ83
## 6334 NA NDAR_INVTDXHD95M
## 7339 NA NDAR_INVNU2URG3T
## 8210 NA NDAR_INVNU2URG3T
## 9435 NA NDAR_INVNEBJA9J9
## 10 NA NDAR_INV851T3JYC
## 11 NA NDAR_INVV5X50ZMM
## 12 NA NDAR_INVV5X50ZMM
## 1388 NA NDAR_INV9RYUGDF2
## 1416 NA NDAR_INVMLFXTJVE
## 1567 NA NDAR_INV7P9PRTBA
## 1625 NA NDAR_INV5PWUMKR0
## 17100 NA NDAR_INVBKNVGWPC
## 18100 NA NDAR_INVGMYU4GT3
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1387 NDAR_INV0MCARYLD
## 2250 NDAR_INVY4FZKKCZ
## 395 NDAR_INVTMFT5TJF
## 4322 NDAR_INVCPUDHWZM
## 5143 NDAR_INVRLL3TP2P
## 6334 NDAR_INVRLL3TP2P
## 7339 NDAR_INVCV07Y8GY
## 8210 NDAR_INV6K2HE999
## 9435 NDAR_INV851T3JYC
## 10 NDAR_INVRDMA0DZT
## 11 NDAR_INVBR0R4R8M
## 12 NDAR_INVF20K5ZNE
## 1388 NDAR_INVRL95P8LJ
## 1416 NDAR_INVRL95P8LJ
## 1567 NDAR_INV1X4F1YB5
## 1625 NDAR_INV1X4F1YB5
## 17100 NDAR_INVH1BGGDYR
## 18100 NDAR_INVH1BGGDYR
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1387 NA NA
## 2250 NA NA
## 395 NA NA
## 4322 NA NA
## 5143 NA NA
## 6334 NA NA
## 7339 NA NA
## 8210 NA NA
## 9435 0.4672 NA
## 10 0.4927 0.4672
## 11 NA NA
## 12 NA NA
## 1388 NA NA
## 1416 NA NA
## 1567 NA NA
## 1625 NA NA
## 17100 0.5002 0.5418
## 18100 0.5002 0.4813
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1387 NA NA 2 3
## 2250 NA NA 3 3
## 395 NA NA 3 3
## 4322 NA NA 1 3
## 5143 NA NA 1 3
## 6334 NA NA 2 3
## 7339 NA NA 1 3
## 8210 NA NA 2 3
## 9435 NA NA 2 3
## 10 NA NA 3 3
## 11 NA NA 2 3
## 12 NA NA 1 3
## 1388 NA NA 3 3
## 1416 NA NA 2 3
## 1567 NA NA 3 3
## 1625 NA NA 1 3
## 17100 NA NA 3 3
## 18100 NA NA 1 3
nrow(idsclean) # we selected 11810 subjects for inclusion
## [1] 11810
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_1) # 3882 are matched in this column
## [1] 3882
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_2) # 29 are matched in this column
## [1] 29
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_3) # 0 are matched in this column
## [1] 0
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_4) # 1 are matched in this column
## [1] 1
## which means.... 3882+29+1 are the number of twins/sibs?
head(idsclean)
## rel_family_id subjectkey interview_age rel_group_id rel_ingroup_order
## 1 0 NDAR_INV0A86UD86 117 1 1
## 4 3 NDAR_INV0HALXUFY 126 1 1
## 5 4 NDAR_INV0HL5D3DF 120 1 1
## 6 5 NDAR_INV0JVUB582 112 1 1
## 7 6 NDAR_INV0JYWL5T8 115 1 1
## 13 12 NDAR_INV0UV5WZUN 121 1 1
## rel_relationship rel_same_sex genetic_zygosity_status_1
## 1 0 NA NA
## 4 0 NA NA
## 5 0 NA NA
## 6 0 NA NA
## 7 0 NA NA
## 13 0 NA NA
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 13 NA
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1
## 4
## 5
## 6
## 7
## 13
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1 NA NA 1 1
## 4 NA NA 1 1
## 5 NA NA 1 1
## 6 NA NA 1 1
## 7 NA NA 1 1
## 13 NA NA 1 1
subj1<- idsclean %>% filter(str_detect(genetic_paired_subjectid_1, 'NDAR')) %>%
filter(!is.na(genetic_zygosity_status_1)) %>%
select(rel_family_id, subjectkey, genetic_zygosity_status_1, genetic_paired_subjectid_1) %>%
filter(genetic_paired_subjectid_1 %in% idsclean$subjectkey) %>%
rename(matched_subject= genetic_paired_subjectid_1,
zyg=genetic_zygosity_status_1)
table(subj1$zyg)
##
## 1 2 3
## 666 857 93
# 666 MZs, 857 DZs, 93 sibs
subj2<- idsclean %>% filter(str_detect(genetic_paired_subjectid_2, 'NDAR')) %>%
filter(!is.na(genetic_zygosity_status_2)) %>%
select(rel_family_id, subjectkey, genetic_zygosity_status_2, genetic_paired_subjectid_2) %>%
filter(genetic_paired_subjectid_2 %in% idsclean$subjectkey) %>%
rename(matched_subject= genetic_paired_subjectid_2,
zyg=genetic_zygosity_status_2)
table(subj2$zyg)
##
## 1 2 3
## 6 10 1
# 6 MZs, 10 DZs, 3 sibs
subj3<- idsclean %>% filter(str_detect(genetic_paired_subjectid_3, 'NDAR')) %>%
filter(!is.na(genetic_zygosity_status_3)) %>%
select(rel_family_id, subjectkey, genetic_zygosity_status_3, genetic_paired_subjectid_3) %>%
filter(genetic_paired_subjectid_3 %in% idsclean$subjectkey) %>%
rename(matched_subject= genetic_paired_subjectid_3,
zyg=genetic_zygosity_status_3)
table(subj3$zyg)
## < table of extent 0 >
# none here as expected
subj4<- idsclean %>% filter(str_detect(genetic_paired_subjectid_4, 'NDAR')) %>%
filter(!is.na(genetic_zygosity_status_4)) %>%
select(rel_family_id, subjectkey, genetic_zygosity_status_4, genetic_paired_subjectid_4) %>%
filter(genetic_paired_subjectid_4 %in% idsclean$subjectkey) %>%
rename(matched_subject= genetic_paired_subjectid_4,
zyg=genetic_zygosity_status_4)
table(subj4$zyg)
##
## 2
## 1
# 2 DZs
zygs<- rbind(subj1,subj2,subj3,subj4)
head(zygs)
## rel_family_id subjectkey zyg matched_subject
## 314 320 NDAR_INVNDGUJMHY 3 NDAR_INV6XT4AD09
## 315 320 NDAR_INV6XT4AD09 3 NDAR_INVNDGUJMHY
## 3187 3189 NDAR_INV0A9K5L4R 2 NDAR_INV3X41ZHVL
## 3188 3189 NDAR_INV3X41ZHVL 2 NDAR_INV0A9K5L4R
## 3191 3193 NDAR_INV0CBPKF6W 2 NDAR_INV9LRE3A49
## 3192 3193 NDAR_INV9LRE3A49 2 NDAR_INV0CBPKF6W
nrow(zygs)
## [1] 1634
sum(zygs$subjectkey %in% zygs$matched_subject)
## [1] 1634
sum(zygs$subjectkey %in% idsclean$subjectkey)
## [1] 1634
sum(zygs$matched_subject %in% idsclean$subjectkey)
## [1] 1634
# 1634 all around... so there are 1634 individuals we have zygosity for
table(zygs$zyg)
##
## 1 2 3
## 672 868 94
# 672 MZs, 868 DZs and 94 sibs
# ok sooooo does that make sense? i remember sam saying we wouldn't expect the N here to be 11k bc not everyone had zygo. so if i merge these 1634 with the idsclean file (by subjectkey) will we have a full set?
head(zygs)
## rel_family_id subjectkey zyg matched_subject
## 314 320 NDAR_INVNDGUJMHY 3 NDAR_INV6XT4AD09
## 315 320 NDAR_INV6XT4AD09 3 NDAR_INVNDGUJMHY
## 3187 3189 NDAR_INV0A9K5L4R 2 NDAR_INV3X41ZHVL
## 3188 3189 NDAR_INV3X41ZHVL 2 NDAR_INV0A9K5L4R
## 3191 3193 NDAR_INV0CBPKF6W 2 NDAR_INV9LRE3A49
## 3192 3193 NDAR_INV9LRE3A49 2 NDAR_INV0CBPKF6W
head(idsclean)
## rel_family_id subjectkey interview_age rel_group_id rel_ingroup_order
## 1 0 NDAR_INV0A86UD86 117 1 1
## 4 3 NDAR_INV0HALXUFY 126 1 1
## 5 4 NDAR_INV0HL5D3DF 120 1 1
## 6 5 NDAR_INV0JVUB582 112 1 1
## 7 6 NDAR_INV0JYWL5T8 115 1 1
## 13 12 NDAR_INV0UV5WZUN 121 1 1
## rel_relationship rel_same_sex genetic_zygosity_status_1
## 1 0 NA NA
## 4 0 NA NA
## 5 0 NA NA
## 6 0 NA NA
## 7 0 NA NA
## 13 0 NA NA
## genetic_zygosity_status_2 genetic_zygosity_status_3
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 13 NA
## genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1
## 4
## 5
## 6
## 7
## 13
## genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 13 NA NA
## genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1 NA NA 1 1
## 4 NA NA 1 1
## 5 NA NA 1 1
## 6 NA NA 1 1
## 7 NA NA 1 1
## 13 NA NA 1 1
nozyg<- idsclean %>% filter(!idsclean$subjectkey%in%zygs$subjectkey) %>%
select(rel_family_id, subjectkey)
nozyg$zyg<- NA
nozyg$matched_subject<- NA
head(zygs)
## rel_family_id subjectkey zyg matched_subject
## 314 320 NDAR_INVNDGUJMHY 3 NDAR_INV6XT4AD09
## 315 320 NDAR_INV6XT4AD09 3 NDAR_INVNDGUJMHY
## 3187 3189 NDAR_INV0A9K5L4R 2 NDAR_INV3X41ZHVL
## 3188 3189 NDAR_INV3X41ZHVL 2 NDAR_INV0A9K5L4R
## 3191 3193 NDAR_INV0CBPKF6W 2 NDAR_INV9LRE3A49
## 3192 3193 NDAR_INV9LRE3A49 2 NDAR_INV0CBPKF6W
head(nozyg)
## rel_family_id subjectkey zyg matched_subject
## 1 0 NDAR_INV0A86UD86 NA NA
## 4 3 NDAR_INV0HALXUFY NA NA
## 5 4 NDAR_INV0HL5D3DF NA NA
## 6 5 NDAR_INV0JVUB582 NA NA
## 7 6 NDAR_INV0JYWL5T8 NA NA
## 13 12 NDAR_INV0UV5WZUN NA NA
full<- rbind(zygs,nozyg)
nrow(full)
## [1] 11810
nrow(idsclean)
## [1] 11810
sum(full$subjectkey%in%idsclean$subjectkey)
## [1] 11810
# 11810 all match
head(full)
## rel_family_id subjectkey zyg matched_subject
## 314 320 NDAR_INVNDGUJMHY 3 NDAR_INV6XT4AD09
## 315 320 NDAR_INV6XT4AD09 3 NDAR_INVNDGUJMHY
## 3187 3189 NDAR_INV0A9K5L4R 2 NDAR_INV3X41ZHVL
## 3188 3189 NDAR_INV3X41ZHVL 2 NDAR_INV0A9K5L4R
## 3191 3193 NDAR_INV0CBPKF6W 2 NDAR_INV9LRE3A49
## 3192 3193 NDAR_INV9LRE3A49 2 NDAR_INV0CBPKF6W
table(full$zyg)
##
## 1 2 3
## 672 868 94
rel201_zygosity <- read.csv("/Users/clairemorrison/Documents/IBG:Lab/ABCD/ABCD_Econtexts/data/rel201_zygosity.dat", header = T)
colnames(rel201_zygosity)
## [1] "FID" "IID1" "IID2" "zygosity"
## [5] "Z0" "Z1" "Z2" "PI_HAT"
## [9] "interview_date_1" "interview_date_2" "interview_age_1" "interview_age_2"
## [13] "gender_1" "gender_2" "twinsite_1" "twinsite_2"
#View(rel201_zygosity[1:40,1:16])
nrow(rel201_zygosity)
## [1] 1592
#rel201_zygosity<- merge(rel201_zygosity, idskeep, by.y='subjectkey', by.x='IID1', all=T) working on this...
zyg<- subset(rel201_zygosity, rel201_zygosity$zygosity<6)
# 1= M MZ
# 2= M DZ
# 3= F MZ
# 4= F DZ
# 5= OS DZ
nrow(zyg)
## [1] 950
t1<- zyg[c(1,2,4,8)]
t2<- zyg[c(1,3,4,8)]
colnames(t1)[colnames(t1)=="IID1"] <- "subjectkey"
colnames(t2)[colnames(t2)=="IID2"] <- "subjectkey"
t1$twin<-1
t2$twin<-2
t1<-t1[complete.cases(t1), ]
t2<-t2[complete.cases(t2), ]
zygfull<- rbind(t1,t2)
zygfull <- zygfull[!duplicated(zygfull$subjectkey),]
table(zygfull$twin)
##
## 1 2
## 935 923
test<- merge(zygfull, full, by='subjectkey', all=T)
nrow(test)
## [1] 11836
nrow(zygfull)
## [1] 1858
nrow(full)
## [1] 11810
## from jeff MZs=1 or 3, DZs=2 or 4
table(test$zygosity);table(test$zyg)
##
## 1 2 3 4 5
## 372 492 368 499 127
##
## 1 2 3
## 672 868 94
# we have slightly more MZs and slightly fewer DZs?