zygosity

first need to deal with families that have more than 1 or 2 sibs/twins

# read in raw data files
acspsw03 <- fread("/Users/clairemorrison/Desktop/ctc_sleep_psych/acspsw03.txt", header=T, data.table=F)

# just keep baseline for dems/ids
# it looks like zygo is only in baseline 
ids1 <- acspsw03[ which(acspsw03$eventname=='baseline_year_1_arm_1'), ]

# look for family that have multiple sets of twins/sibs
# if families have multiple sets of sibs/twins we're prioritizing: 
# 1 - MZs over any other pairs
# 2 - DZs over sibs
# 3 - if 2 pairs of sibs pick one set at random 
table(ids1$rel_relationship)
## 
##    0    1    2    3 
## 7898 1810 2138   30
# create a subset of data with just subjectkey and family ID info that is easier to work with
vars <- c("subjectkey", "interview_age", "rel_family_id", "rel_group_id", "rel_ingroup_order",
          "rel_relationship", "rel_same_sex", "genetic_zygosity_status_1", "genetic_zygosity_status_2", "genetic_zygosity_status_3", "genetic_zygosity_status_4", 
          "genetic_paired_subjectid_1", "genetic_paired_subjectid_2", "genetic_paired_subjectid_3", "genetic_paired_subjectid_4", "genetic_pi_hat_1", "genetic_pi_hat_2", "genetic_pi_hat_3", "genetic_pi_hat_4")

idsub<-ids1[vars]

# need to ID folks whose family IDs repeat more than once
# order and then count fam IDs
idsub <- idsub[order(idsub$rel_family_id, idsub$rel_group_id),]
idsub$person_count <- with(idsub, ave(as.character(rel_family_id), rel_family_id, FUN = seq_along))

table(idsub$person_count)
## 
##    1    2    3    4    5 
## 9854 1956   63    2    1
FamMems<-idsub %>% count(rel_family_id)

idsub<-merge(idsub, FamMems, by = "rel_family_id", all = TRUE)

names(idsub)[names(idsub) == 'n'] <- 'fam_mems'

# dealing with the fams with 3 kids, 4 kids, and 5
# create subsets of each by fam_mems
idsub1 <- idsub[ which(idsub$fam_mems==1), ]
idsub2 <- idsub[ which(idsub$fam_mems==2), ]
idsub3 <- idsub[ which(idsub$fam_mems==3), ]
idsub4 <- idsub[ which(idsub$fam_mems==4), ]
idsub5 <- idsub[ which(idsub$fam_mems==5), ]

# don't need to mess with 1s and 2s 
# start with fams of 3s
table(idsub3$rel_relationship)
## 
##  1  2  3 
## 86 70 27
# subset within fams of 3 to different sib types for reducing to 2 sibs per family
idsub3trips <- idsub3[ which(idsub3$rel_relationship==3), ]
idsub3twinssibs <- idsub3[ which(idsub3$rel_relationship==2 | idsub3$rel_relationship==1), ]

#### fams of 3 ####
# start with triplets
# in theory, I'd start by prioritizing MZs within trip pairs 
# but genetic zygostiy has a lot of NAs and the only ones listed are DZ
# so taking a random sample of 2 of the 3 trips

# set seed so random person IDs don't change everytime you run the code
set.seed(13)

idsub3trips$rand<-runif(nrow(idsub3trips), min=0, max=1)

# order by fam and new random ID 
idsub3trips <- idsub3trips[order(idsub3trips$rel_family_id, idsub3trips$rand),]

# keep first 2 rows based on duplicates in rel_family_id 
idsub3trips<-idsub3trips %>%
  group_by(rel_family_id) %>%
  filter(row_number() <= 2)

# onto the families of 3 that have some combination of twins and sibs
table(idsub3twinssibs$rel_relationship)
## 
##  1  2 
## 86 70
# prioritize twins
# ie if a family has a twin set and 1 sibling, just keep the twins 
# if any of these families just have 3 non twin siblings, randomly pick 2 

# use Group ID (twins and triplets in the same family share a group ID) here 
# a Group ID = 3 indicates 3 sibs, drop person 3 as group ID seems to have been randomly assigned
idsub3twinssibs <- idsub3twinssibs[ which(idsub3twinssibs$rel_group_id <3),]
                         
# now deal with the twin/sib combos
idsub3twinssibs <- idsub3twinssibs[order(idsub3twinssibs$rel_family_id, idsub3twinssibs$rel_group_id),]

# keep first 2 rows based on duplicates in rel_family_id and rel_group_id
idsub3twinssibs<-idsub3twinssibs %>%
  group_by(rel_family_id) %>%
  filter(row_number() <= 2)

# combine families of 3 back into one dataframe
idsub3clean<-rbind(idsub3trips,idsub3twinssibs)

#### deal with fams of 4 ####
# it's only 1 fam, both pairs are MZ 
# pick one at random
sample(idsub4$rel_group_id, 1)
## [1] 2
# 2 
idsub4clean <- idsub4[ which(idsub4$rel_group_id == 2),]

#### deal with fams of 5 ####
# one set of triplets, one set of DZ twins
# 2 combos appear to be flagged as DZ, pick one at random
sample(idsub5$rel_group_id, 1)
## [1] 2
# 2
idsub5clean <- idsub5[ which(idsub5$rel_group_id == 2),]

#### rbind all the cleaned up ID files back together
idsub3clean$rand<-NULL
idsclean<-rbind(idsub1, idsub2, idsub3clean, idsub4clean, idsub5clean)

# double check no more than 2 people per fam
FamMems<-idsclean %>% count(rel_family_id)

head(idsclean)
##    rel_family_id       subjectkey interview_age rel_group_id rel_ingroup_order
## 1              0 NDAR_INV0A86UD86           117            1                 1
## 4              3 NDAR_INV0HALXUFY           126            1                 1
## 5              4 NDAR_INV0HL5D3DF           120            1                 1
## 6              5 NDAR_INV0JVUB582           112            1                 1
## 7              6 NDAR_INV0JYWL5T8           115            1                 1
## 13            12 NDAR_INV0UV5WZUN           121            1                 1
##    rel_relationship rel_same_sex genetic_zygosity_status_1
## 1                 0           NA                        NA
## 4                 0           NA                        NA
## 5                 0           NA                        NA
## 6                 0           NA                        NA
## 7                 0           NA                        NA
## 13                0           NA                        NA
##    genetic_zygosity_status_2 genetic_zygosity_status_3
## 1                         NA                        NA
## 4                         NA                        NA
## 5                         NA                        NA
## 6                         NA                        NA
## 7                         NA                        NA
## 13                        NA                        NA
##    genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1                         NA                           
## 4                         NA                           
## 5                         NA                           
## 6                         NA                           
## 7                         NA                           
## 13                        NA                           
##    genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1                                                       
## 4                                                       
## 5                                                       
## 6                                                       
## 7                                                       
## 13                                                      
##    genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1                                           NA               NA
## 4                                           NA               NA
## 5                                           NA               NA
## 6                                           NA               NA
## 7                                           NA               NA
## 13                                          NA               NA
##    genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1                NA               NA            1        1
## 4                NA               NA            1        1
## 5                NA               NA            1        1
## 6                NA               NA            1        1
## 7                NA               NA            1        1
## 13               NA               NA            1        1
table(idsclean$rel_relationship) # singleton, sib, twin, trip
## 
##    0    1    2    3 
## 7898 1771 2123   18
table(idsclean$fam_mems) # how many were in each family -- before we removed
## 
##    1    2    3    4    5 
## 7898 3786  122    2    2
nrow(ids1)
## [1] 11876
nrow(idsclean)
## [1] 11810
nrow(idsclean)
## [1] 11810
# sam has selected 11810 to keep

sam picked those 11810 people… and now we need to match them to a sib or twin based on the genetic paired subjects 1-5

  1. filter for genetic paired subjects 1-5 and non NA for genetic zygosity status
  2. filter if the paired subject ID is in idsclean
  3. repeat for other paired subejcts
  4. make sure no duplicates

basically what we need to do is pick sib/twin pairs and extract zygosity. if we look at sam’s file it has multiple permutations of relatedness and their zyg, but lots of NAs

genetic_zygosity_status: 1= monozygotic ; 2= dizygotic ; 3=siblings ;-1= not available (twins/sibs, genetic_pi_hat not calculated)

library(stringr)
head(idsclean)
##    rel_family_id       subjectkey interview_age rel_group_id rel_ingroup_order
## 1              0 NDAR_INV0A86UD86           117            1                 1
## 4              3 NDAR_INV0HALXUFY           126            1                 1
## 5              4 NDAR_INV0HL5D3DF           120            1                 1
## 6              5 NDAR_INV0JVUB582           112            1                 1
## 7              6 NDAR_INV0JYWL5T8           115            1                 1
## 13            12 NDAR_INV0UV5WZUN           121            1                 1
##    rel_relationship rel_same_sex genetic_zygosity_status_1
## 1                 0           NA                        NA
## 4                 0           NA                        NA
## 5                 0           NA                        NA
## 6                 0           NA                        NA
## 7                 0           NA                        NA
## 13                0           NA                        NA
##    genetic_zygosity_status_2 genetic_zygosity_status_3
## 1                         NA                        NA
## 4                         NA                        NA
## 5                         NA                        NA
## 6                         NA                        NA
## 7                         NA                        NA
## 13                        NA                        NA
##    genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1                         NA                           
## 4                         NA                           
## 5                         NA                           
## 6                         NA                           
## 7                         NA                           
## 13                        NA                           
##    genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1                                                       
## 4                                                       
## 5                                                       
## 6                                                       
## 7                                                       
## 13                                                      
##    genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1                                           NA               NA
## 4                                           NA               NA
## 5                                           NA               NA
## 6                                           NA               NA
## 7                                           NA               NA
## 13                                          NA               NA
##    genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1                NA               NA            1        1
## 4                NA               NA            1        1
## 5                NA               NA            1        1
## 6                NA               NA            1        1
## 7                NA               NA            1        1
## 13               NA               NA            1        1
twins<- idsclean %>% filter(rel_relationship==2)
head(twins)
##     rel_family_id       subjectkey interview_age rel_group_id rel_ingroup_order
## 85             87 NDAR_INVARKPPU88           115            1                 2
## 86             87 NDAR_INV6XTE4VEF           115            1                 1
## 109           113 NDAR_INV9WJ1WBFC           125            1                 1
## 110           113 NDAR_INVBMFRB748           125            1                 2
## 114           118 NDAR_INV24D005FC           121            1                 1
## 115           118 NDAR_INVK598LFL1           121            1                 2
##     rel_relationship rel_same_sex genetic_zygosity_status_1
## 85                 2            0                        NA
## 86                 2            0                        NA
## 109                2            1                        NA
## 110                2            1                        NA
## 114                2            0                        NA
## 115                2            0                        NA
##     genetic_zygosity_status_2 genetic_zygosity_status_3
## 85                         NA                        NA
## 86                         NA                        NA
## 109                        NA                        NA
## 110                        NA                        NA
## 114                        NA                        NA
## 115                        NA                        NA
##     genetic_zygosity_status_4 genetic_paired_subjectid_1
## 85                         NA           NDAR_INV6XTE4VEF
## 86                         NA           NDAR_INVARKPPU88
## 109                        NA           NDAR_INVBMFRB748
## 110                        NA           NDAR_INV9WJ1WBFC
## 114                        NA           NDAR_INVK598LFL1
## 115                        NA           NDAR_INV24D005FC
##     genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 85                                                       
## 86                                                       
## 109                                                      
## 110                                                      
## 114                                                      
## 115                                                      
##     genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 85                                           NA               NA
## 86                                           NA               NA
## 109                                          NA               NA
## 110                                          NA               NA
## 114                                          NA               NA
## 115                                          NA               NA
##     genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 85                NA               NA            1        2
## 86                NA               NA            2        2
## 109               NA               NA            1        2
## 110               NA               NA            2        2
## 114               NA               NA            1        2
## 115               NA               NA            2        2
sibs<- idsclean %>% filter(rel_relationship==1)
head(sibs)
##    rel_family_id       subjectkey interview_age rel_group_id rel_ingroup_order
## 2              1 NDAR_INV0FAET6YA           128            1                 1
## 3              1 NDAR_INV2VD87TK5           113            2                 1
## 8              9 NDAR_INV0N0JE94U           108            1                 1
## 9              9 NDAR_INVJPKAU189           111            2                 1
## 18            17 NDAR_INV098DVK8B           112            1                 1
## 19            17 NDAR_INVMP7JC377           127            2                 1
##    rel_relationship rel_same_sex genetic_zygosity_status_1
## 2                 1           NA                        NA
## 3                 1           NA                        NA
## 8                 1           NA                        NA
## 9                 1           NA                        NA
## 18                1           NA                        NA
## 19                1           NA                        NA
##    genetic_zygosity_status_2 genetic_zygosity_status_3
## 2                         NA                        NA
## 3                         NA                        NA
## 8                         NA                        NA
## 9                         NA                        NA
## 18                        NA                        NA
## 19                        NA                        NA
##    genetic_zygosity_status_4 genetic_paired_subjectid_1
## 2                         NA           NDAR_INV2VD87TK5
## 3                         NA           NDAR_INV0FAET6YA
## 8                         NA           NDAR_INVJPKAU189
## 9                         NA           NDAR_INV0N0JE94U
## 18                        NA           NDAR_INVMP7JC377
## 19                        NA           NDAR_INV098DVK8B
##    genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 2                                                       
## 3                                                       
## 8                                                       
## 9                                                       
## 18                                                      
## 19                                                      
##    genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 2                                           NA               NA
## 3                                           NA               NA
## 8                                           NA               NA
## 9                                           NA               NA
## 18                                          NA               NA
## 19                                          NA               NA
##    genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 2                NA               NA            1        2
## 3                NA               NA            2        2
## 8                NA               NA            1        2
## 9                NA               NA            2        2
## 18               NA               NA            1        2
## 19               NA               NA            2        2
trips<- idsclean %>% filter(rel_relationship==3)
trips
##       rel_family_id       subjectkey interview_age rel_group_id
## 1387             11 NDAR_INVY4FZKKCZ           115            1
## 2250             11 NDAR_INV0MCARYLD           115            1
## 395             811 NDAR_INVWP2U8M5Y           120            1
## 4322            811 NDAR_INVTMFT5TJF           120            1
## 5143           3072 NDAR_INVTDXHD95M           126            1
## 6334           3072 NDAR_INV35DUHZ83           126            1
## 7339           5159 NDAR_INV6K2HE999           117            1
## 8210           5159 NDAR_INVCV07Y8GY           117            1
## 9435           5875 NDAR_INVRDMA0DZT           123            1
## 10             5875 NDAR_INVNEBJA9J9           123            1
## 11             6578 NDAR_INVF20K5ZNE           110            1
## 12             6578 NDAR_INVBR0R4R8M           110            1
## 1388           6931 NDAR_INVMLFXTJVE           107            1
## 1416           6931 NDAR_INV9RYUGDF2           107            1
## 1567           8409 NDAR_INV5PWUMKR0           118            1
## 1625           8409 NDAR_INV7P9PRTBA           118            1
## 17100         11671 NDAR_INVGMYU4GT3           126            1
## 18100         11671 NDAR_INVBKNVGWPC           126            1
##       rel_ingroup_order rel_relationship rel_same_sex genetic_zygosity_status_1
## 1387                  3                3            0                        NA
## 2250                  2                3            0                        NA
## 395                   1                3            1                        NA
## 4322                  3                3            1                        NA
## 5143                  1                3            1                        NA
## 6334                  2                3            1                        NA
## 7339                  2                3            1                        NA
## 8210                  3                3            1                        NA
## 9435                  2                3            0                         2
## 10                    1                3            0                         2
## 11                    3                3            0                        NA
## 12                    2                3            0                        NA
## 1388                  1                3            0                        NA
## 1416                  2                3            0                        NA
## 1567                  2                3            1                        NA
## 1625                  1                3            1                        NA
## 17100                 3                3            0                         2
## 18100                 2                3            0                         2
##       genetic_zygosity_status_2 genetic_zygosity_status_3
## 1387                         NA                        NA
## 2250                         NA                        NA
## 395                          NA                        NA
## 4322                         NA                        NA
## 5143                         NA                        NA
## 6334                         NA                        NA
## 7339                         NA                        NA
## 8210                         NA                        NA
## 9435                         NA                        NA
## 10                            2                        NA
## 11                           NA                        NA
## 12                           NA                        NA
## 1388                         NA                        NA
## 1416                         NA                        NA
## 1567                         NA                        NA
## 1625                         NA                        NA
## 17100                         2                        NA
## 18100                         2                        NA
##       genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1387                         NA           NDAR_INV0TEH16CM
## 2250                         NA           NDAR_INV0TEH16CM
## 395                          NA           NDAR_INVCPUDHWZM
## 4322                         NA           NDAR_INVWP2U8M5Y
## 5143                         NA           NDAR_INV35DUHZ83
## 6334                         NA           NDAR_INVTDXHD95M
## 7339                         NA           NDAR_INVNU2URG3T
## 8210                         NA           NDAR_INVNU2URG3T
## 9435                         NA           NDAR_INVNEBJA9J9
## 10                           NA           NDAR_INV851T3JYC
## 11                           NA           NDAR_INVV5X50ZMM
## 12                           NA           NDAR_INVV5X50ZMM
## 1388                         NA           NDAR_INV9RYUGDF2
## 1416                         NA           NDAR_INVMLFXTJVE
## 1567                         NA           NDAR_INV7P9PRTBA
## 1625                         NA           NDAR_INV5PWUMKR0
## 17100                        NA           NDAR_INVBKNVGWPC
## 18100                        NA           NDAR_INVGMYU4GT3
##       genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1387            NDAR_INV0MCARYLD                           
## 2250            NDAR_INVY4FZKKCZ                           
## 395             NDAR_INVTMFT5TJF                           
## 4322            NDAR_INVCPUDHWZM                           
## 5143            NDAR_INVRLL3TP2P                           
## 6334            NDAR_INVRLL3TP2P                           
## 7339            NDAR_INVCV07Y8GY                           
## 8210            NDAR_INV6K2HE999                           
## 9435            NDAR_INV851T3JYC                           
## 10              NDAR_INVRDMA0DZT                           
## 11              NDAR_INVBR0R4R8M                           
## 12              NDAR_INVF20K5ZNE                           
## 1388            NDAR_INVRL95P8LJ                           
## 1416            NDAR_INVRL95P8LJ                           
## 1567            NDAR_INV1X4F1YB5                           
## 1625            NDAR_INV1X4F1YB5                           
## 17100           NDAR_INVH1BGGDYR                           
## 18100           NDAR_INVH1BGGDYR                           
##       genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1387                                           NA               NA
## 2250                                           NA               NA
## 395                                            NA               NA
## 4322                                           NA               NA
## 5143                                           NA               NA
## 6334                                           NA               NA
## 7339                                           NA               NA
## 8210                                           NA               NA
## 9435                                       0.4672               NA
## 10                                         0.4927           0.4672
## 11                                             NA               NA
## 12                                             NA               NA
## 1388                                           NA               NA
## 1416                                           NA               NA
## 1567                                           NA               NA
## 1625                                           NA               NA
## 17100                                      0.5002           0.5418
## 18100                                      0.5002           0.4813
##       genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1387                NA               NA            2        3
## 2250                NA               NA            3        3
## 395                 NA               NA            3        3
## 4322                NA               NA            1        3
## 5143                NA               NA            1        3
## 6334                NA               NA            2        3
## 7339                NA               NA            1        3
## 8210                NA               NA            2        3
## 9435                NA               NA            2        3
## 10                  NA               NA            3        3
## 11                  NA               NA            2        3
## 12                  NA               NA            1        3
## 1388                NA               NA            3        3
## 1416                NA               NA            2        3
## 1567                NA               NA            3        3
## 1625                NA               NA            1        3
## 17100               NA               NA            3        3
## 18100               NA               NA            1        3
nrow(idsclean) # we selected 11810 subjects for inclusion
## [1] 11810
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_1) # 3882 are matched in this column
## [1] 3882
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_2) # 29 are matched in this column
## [1] 29
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_3) # 0 are matched in this column
## [1] 0
sum(idsclean$subjectkey %in% idsclean$genetic_paired_subjectid_4) # 1 are matched in this column
## [1] 1
## which means.... 3882+29+1 are the number of twins/sibs? 

head(idsclean)
##    rel_family_id       subjectkey interview_age rel_group_id rel_ingroup_order
## 1              0 NDAR_INV0A86UD86           117            1                 1
## 4              3 NDAR_INV0HALXUFY           126            1                 1
## 5              4 NDAR_INV0HL5D3DF           120            1                 1
## 6              5 NDAR_INV0JVUB582           112            1                 1
## 7              6 NDAR_INV0JYWL5T8           115            1                 1
## 13            12 NDAR_INV0UV5WZUN           121            1                 1
##    rel_relationship rel_same_sex genetic_zygosity_status_1
## 1                 0           NA                        NA
## 4                 0           NA                        NA
## 5                 0           NA                        NA
## 6                 0           NA                        NA
## 7                 0           NA                        NA
## 13                0           NA                        NA
##    genetic_zygosity_status_2 genetic_zygosity_status_3
## 1                         NA                        NA
## 4                         NA                        NA
## 5                         NA                        NA
## 6                         NA                        NA
## 7                         NA                        NA
## 13                        NA                        NA
##    genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1                         NA                           
## 4                         NA                           
## 5                         NA                           
## 6                         NA                           
## 7                         NA                           
## 13                        NA                           
##    genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1                                                       
## 4                                                       
## 5                                                       
## 6                                                       
## 7                                                       
## 13                                                      
##    genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1                                           NA               NA
## 4                                           NA               NA
## 5                                           NA               NA
## 6                                           NA               NA
## 7                                           NA               NA
## 13                                          NA               NA
##    genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1                NA               NA            1        1
## 4                NA               NA            1        1
## 5                NA               NA            1        1
## 6                NA               NA            1        1
## 7                NA               NA            1        1
## 13               NA               NA            1        1
subj1<- idsclean %>% filter(str_detect(genetic_paired_subjectid_1, 'NDAR')) %>%
  filter(!is.na(genetic_zygosity_status_1)) %>%
  select(rel_family_id, subjectkey, genetic_zygosity_status_1, genetic_paired_subjectid_1) %>%
  filter(genetic_paired_subjectid_1 %in% idsclean$subjectkey) %>%
  rename(matched_subject= genetic_paired_subjectid_1,
         zyg=genetic_zygosity_status_1)

table(subj1$zyg)
## 
##   1   2   3 
## 666 857  93
# 666 MZs, 857 DZs, 93 sibs

subj2<- idsclean %>% filter(str_detect(genetic_paired_subjectid_2, 'NDAR')) %>%
  filter(!is.na(genetic_zygosity_status_2)) %>%
  select(rel_family_id, subjectkey, genetic_zygosity_status_2, genetic_paired_subjectid_2) %>%
  filter(genetic_paired_subjectid_2 %in% idsclean$subjectkey) %>%
  rename(matched_subject= genetic_paired_subjectid_2,
         zyg=genetic_zygosity_status_2)

table(subj2$zyg)
## 
##  1  2  3 
##  6 10  1
# 6 MZs, 10 DZs, 3 sibs

subj3<- idsclean %>% filter(str_detect(genetic_paired_subjectid_3, 'NDAR')) %>%
  filter(!is.na(genetic_zygosity_status_3)) %>%
  select(rel_family_id, subjectkey, genetic_zygosity_status_3, genetic_paired_subjectid_3) %>%
  filter(genetic_paired_subjectid_3 %in% idsclean$subjectkey) %>%
  rename(matched_subject= genetic_paired_subjectid_3,
         zyg=genetic_zygosity_status_3)

table(subj3$zyg)
## < table of extent 0 >
# none here as expected

subj4<- idsclean %>% filter(str_detect(genetic_paired_subjectid_4, 'NDAR')) %>%
  filter(!is.na(genetic_zygosity_status_4)) %>%
  select(rel_family_id, subjectkey, genetic_zygosity_status_4, genetic_paired_subjectid_4) %>%
  filter(genetic_paired_subjectid_4 %in% idsclean$subjectkey) %>%
  rename(matched_subject= genetic_paired_subjectid_4,
         zyg=genetic_zygosity_status_4)

table(subj4$zyg)
## 
## 2 
## 1
# 2 DZs 


zygs<- rbind(subj1,subj2,subj3,subj4)
head(zygs)
##      rel_family_id       subjectkey zyg  matched_subject
## 314            320 NDAR_INVNDGUJMHY   3 NDAR_INV6XT4AD09
## 315            320 NDAR_INV6XT4AD09   3 NDAR_INVNDGUJMHY
## 3187          3189 NDAR_INV0A9K5L4R   2 NDAR_INV3X41ZHVL
## 3188          3189 NDAR_INV3X41ZHVL   2 NDAR_INV0A9K5L4R
## 3191          3193 NDAR_INV0CBPKF6W   2 NDAR_INV9LRE3A49
## 3192          3193 NDAR_INV9LRE3A49   2 NDAR_INV0CBPKF6W
nrow(zygs)
## [1] 1634
sum(zygs$subjectkey %in% zygs$matched_subject)
## [1] 1634
sum(zygs$subjectkey %in% idsclean$subjectkey)
## [1] 1634
sum(zygs$matched_subject %in% idsclean$subjectkey)
## [1] 1634
# 1634 all around... so there are 1634 individuals we have zygosity for
table(zygs$zyg)
## 
##   1   2   3 
## 672 868  94
# 672 MZs, 868 DZs and 94 sibs

# ok sooooo does that make sense? i remember sam saying we wouldn't expect the N here to be 11k bc not everyone had zygo. so if i merge these 1634 with the idsclean file (by subjectkey) will we have a full set?

head(zygs)
##      rel_family_id       subjectkey zyg  matched_subject
## 314            320 NDAR_INVNDGUJMHY   3 NDAR_INV6XT4AD09
## 315            320 NDAR_INV6XT4AD09   3 NDAR_INVNDGUJMHY
## 3187          3189 NDAR_INV0A9K5L4R   2 NDAR_INV3X41ZHVL
## 3188          3189 NDAR_INV3X41ZHVL   2 NDAR_INV0A9K5L4R
## 3191          3193 NDAR_INV0CBPKF6W   2 NDAR_INV9LRE3A49
## 3192          3193 NDAR_INV9LRE3A49   2 NDAR_INV0CBPKF6W
head(idsclean)
##    rel_family_id       subjectkey interview_age rel_group_id rel_ingroup_order
## 1              0 NDAR_INV0A86UD86           117            1                 1
## 4              3 NDAR_INV0HALXUFY           126            1                 1
## 5              4 NDAR_INV0HL5D3DF           120            1                 1
## 6              5 NDAR_INV0JVUB582           112            1                 1
## 7              6 NDAR_INV0JYWL5T8           115            1                 1
## 13            12 NDAR_INV0UV5WZUN           121            1                 1
##    rel_relationship rel_same_sex genetic_zygosity_status_1
## 1                 0           NA                        NA
## 4                 0           NA                        NA
## 5                 0           NA                        NA
## 6                 0           NA                        NA
## 7                 0           NA                        NA
## 13                0           NA                        NA
##    genetic_zygosity_status_2 genetic_zygosity_status_3
## 1                         NA                        NA
## 4                         NA                        NA
## 5                         NA                        NA
## 6                         NA                        NA
## 7                         NA                        NA
## 13                        NA                        NA
##    genetic_zygosity_status_4 genetic_paired_subjectid_1
## 1                         NA                           
## 4                         NA                           
## 5                         NA                           
## 6                         NA                           
## 7                         NA                           
## 13                        NA                           
##    genetic_paired_subjectid_2 genetic_paired_subjectid_3
## 1                                                       
## 4                                                       
## 5                                                       
## 6                                                       
## 7                                                       
## 13                                                      
##    genetic_paired_subjectid_4 genetic_pi_hat_1 genetic_pi_hat_2
## 1                                           NA               NA
## 4                                           NA               NA
## 5                                           NA               NA
## 6                                           NA               NA
## 7                                           NA               NA
## 13                                          NA               NA
##    genetic_pi_hat_3 genetic_pi_hat_4 person_count fam_mems
## 1                NA               NA            1        1
## 4                NA               NA            1        1
## 5                NA               NA            1        1
## 6                NA               NA            1        1
## 7                NA               NA            1        1
## 13               NA               NA            1        1
nozyg<- idsclean %>% filter(!idsclean$subjectkey%in%zygs$subjectkey) %>%
  select(rel_family_id, subjectkey)

nozyg$zyg<- NA
nozyg$matched_subject<- NA

head(zygs)
##      rel_family_id       subjectkey zyg  matched_subject
## 314            320 NDAR_INVNDGUJMHY   3 NDAR_INV6XT4AD09
## 315            320 NDAR_INV6XT4AD09   3 NDAR_INVNDGUJMHY
## 3187          3189 NDAR_INV0A9K5L4R   2 NDAR_INV3X41ZHVL
## 3188          3189 NDAR_INV3X41ZHVL   2 NDAR_INV0A9K5L4R
## 3191          3193 NDAR_INV0CBPKF6W   2 NDAR_INV9LRE3A49
## 3192          3193 NDAR_INV9LRE3A49   2 NDAR_INV0CBPKF6W
head(nozyg)
##    rel_family_id       subjectkey zyg matched_subject
## 1              0 NDAR_INV0A86UD86  NA              NA
## 4              3 NDAR_INV0HALXUFY  NA              NA
## 5              4 NDAR_INV0HL5D3DF  NA              NA
## 6              5 NDAR_INV0JVUB582  NA              NA
## 7              6 NDAR_INV0JYWL5T8  NA              NA
## 13            12 NDAR_INV0UV5WZUN  NA              NA
full<- rbind(zygs,nozyg)
nrow(full)
## [1] 11810
nrow(idsclean)
## [1] 11810
sum(full$subjectkey%in%idsclean$subjectkey)
## [1] 11810
# 11810 all match

head(full)
##      rel_family_id       subjectkey zyg  matched_subject
## 314            320 NDAR_INVNDGUJMHY   3 NDAR_INV6XT4AD09
## 315            320 NDAR_INV6XT4AD09   3 NDAR_INVNDGUJMHY
## 3187          3189 NDAR_INV0A9K5L4R   2 NDAR_INV3X41ZHVL
## 3188          3189 NDAR_INV3X41ZHVL   2 NDAR_INV0A9K5L4R
## 3191          3193 NDAR_INV0CBPKF6W   2 NDAR_INV9LRE3A49
## 3192          3193 NDAR_INV9LRE3A49   2 NDAR_INV0CBPKF6W
table(full$zyg)
## 
##   1   2   3 
## 672 868  94

comparing ABCD zyg to jeff’s

rel201_zygosity <- read.csv("/Users/clairemorrison/Documents/IBG:Lab/ABCD/ABCD_Econtexts/data/rel201_zygosity.dat", header = T)

colnames(rel201_zygosity)
##  [1] "FID"              "IID1"             "IID2"             "zygosity"        
##  [5] "Z0"               "Z1"               "Z2"               "PI_HAT"          
##  [9] "interview_date_1" "interview_date_2" "interview_age_1"  "interview_age_2" 
## [13] "gender_1"         "gender_2"         "twinsite_1"       "twinsite_2"
#View(rel201_zygosity[1:40,1:16])
nrow(rel201_zygosity)
## [1] 1592
#rel201_zygosity<- merge(rel201_zygosity, idskeep, by.y='subjectkey', by.x='IID1', all=T) working on this...

zyg<- subset(rel201_zygosity, rel201_zygosity$zygosity<6)
  
# 1= M MZ
# 2= M DZ
# 3= F MZ
# 4= F DZ
# 5= OS DZ

nrow(zyg)
## [1] 950
t1<- zyg[c(1,2,4,8)]
t2<- zyg[c(1,3,4,8)]

colnames(t1)[colnames(t1)=="IID1"] <- "subjectkey"
colnames(t2)[colnames(t2)=="IID2"] <- "subjectkey"

t1$twin<-1
t2$twin<-2

t1<-t1[complete.cases(t1), ]
t2<-t2[complete.cases(t2), ]

zygfull<- rbind(t1,t2)
zygfull <- zygfull[!duplicated(zygfull$subjectkey),]

table(zygfull$twin)
## 
##   1   2 
## 935 923
test<- merge(zygfull, full, by='subjectkey', all=T)
nrow(test)
## [1] 11836
nrow(zygfull)
## [1] 1858
nrow(full)
## [1] 11810
## from jeff MZs=1 or 3, DZs=2 or 4

table(test$zygosity);table(test$zyg)
## 
##   1   2   3   4   5 
## 372 492 368 499 127
## 
##   1   2   3 
## 672 868  94
# we have slightly more MZs and slightly fewer DZs?