#Topics to review:
#What is genetic linage?
#Parametric linkage analysis
# LOD score method
# Factors influencing linage analysis
#1. Phenotypes and penetrance
#2. heterogeneity
#Linkage analysis using sibling pairs
# Affected sib pairs
# Quantitative trait analysis
sibpair <- read.csv("HW2_sibpairs.csv")
head(sibpair)
## famid id1 id2 ibd trait1 trait2 resid1 resid2 disease1 disease2
## 1 1 1 2 1 30.7 33.0 0.2285 0.8570 1 1
## 2 2 3 4 1 29.0 31.0 -0.2430 -0.0543 2 2
## 3 3 5 6 0 31.0 28.6 0.3557 -0.4258 2 2
## 4 4 7 8 0 28.1 29.7 -0.6286 -0.1067 2 1
## 5 5 9 10 2 29.1 27.7 -0.3164 -0.6969 2 1
## 6 6 11 12 1 31.1 29.8 0.3804 -0.0122 1 1
n <- table(sibpair $ibd) #identity by descent
print(n)
##
## 0 1 2
## 275 600 325
print(n/sum(n))
##
## 0 1 2
## 0.2291667 0.5000000 0.2708333
#need to restrict to affected sib pairs: table to check affection status
table(sibpair$disease1,sibpair$disease2)
##
## 1 2
## 1 229 296
## 2 301 374
asp <- sibpair[sibpair$disease1==2 & sibpair$disease2==2,]
print(asp[1:10,])
## famid id1 id2 ibd trait1 trait2 resid1 resid2 disease1 disease2
## 2 2 3 4 1 29.0 31.0 -0.2430 -0.0543 2 2
## 3 3 5 6 0 31.0 28.6 0.3557 -0.4258 2 2
## 12 12 23 24 1 29.2 29.7 -0.2459 -0.0897 2 2
## 15 15 29 30 2 30.7 31.4 0.2875 0.3908 2 2
## 20 20 39 40 2 31.3 30.8 0.2817 0.2261 2 2
## 23 23 45 46 2 30.0 28.8 0.0028 -0.4146 2 2
## 24 24 47 48 2 31.6 30.4 0.5209 0.2066 2 2
## 29 29 57 58 1 31.0 30.5 0.2815 0.1879 2 2
## 30 30 59 60 1 28.9 30.5 -0.3377 0.1781 2 2
## 32 32 63 64 0 32.3 29.9 0.7601 -0.1831 2 2
ibd.count <- table(asp$ibd)
print(ibd.count)
##
## 0 1 2
## 72 176 126
print(ibd.count/sum(ibd.count))
##
## 0 1 2
## 0.1925134 0.4705882 0.3368984
#one-sided test- increased sharing
goodness.test <- chisq.test(ibd.count,p=c(0.25,0.5,0.25))
print(goodness.test)
##
## Chi-squared test for given probabilities
##
## data: ibd.count
## X-squared = 16.888, df = 2, p-value = 0.0002152
print(goodness.test$p.value/2)
## [1] 0.0001076099
#p-value is sig, evidence of linkage between marker and disease status
N <- nrow(asp)
n2 <- sum(asp$ibd==2)
n1 <- sum(asp$ibd==1)
T2 <- sqrt(2*N)*(n1/N + 2*n2/N - 1)
print(T2)
## [1] 3.948871
pval2 <- pt(T2,N-1,lower=F)
print(pval2)
## [1] 4.693788e-05
##################################################################
sibpair$Yd <- (sibpair$resid1 - sibpair$resid2)^2
sibpair$pi <- sibpair$ibd/2
print(sibpair[1:10,])
## famid id1 id2 ibd trait1 trait2 resid1 resid2 disease1 disease2
## 1 1 1 2 1 30.7 33.0 0.2285 0.8570 1 1
## 2 2 3 4 1 29.0 31.0 -0.2430 -0.0543 2 2
## 3 3 5 6 0 31.0 28.6 0.3557 -0.4258 2 2
## 4 4 7 8 0 28.1 29.7 -0.6286 -0.1067 2 1
## 5 5 9 10 2 29.1 27.7 -0.3164 -0.6969 2 1
## 6 6 11 12 1 31.1 29.8 0.3804 -0.0122 1 1
## 7 7 13 14 1 31.4 29.4 0.4067 -0.1985 2 1
## 8 8 15 16 2 30.1 28.9 0.0110 -0.3645 1 2
## 9 9 17 18 1 28.7 29.5 -0.4094 -0.1296 2 1
## 10 10 19 20 1 32.5 29.7 0.8240 -0.1536 1 2
## Yd pi
## 1 0.39501225 0.5
## 2 0.03560769 0.5
## 3 0.61074225 0.0
## 4 0.27237961 0.0
## 5 0.14478025 1.0
## 6 0.15413476 0.5
## 7 0.36626704 0.5
## 8 0.14100025 1.0
## 9 0.07828804 0.5
## 10 0.95570176 0.5
original.he <- lm(Yd ~ pi,data=sibpair)
#as expected, slope is negative, p-value indictes high significance
#marker is linked to the quantitative trait locus in sib
x<- summary(original.he)
print(summary(original.he))
##
## Call:
## lm(formula = Yd ~ pi, data = sibpair)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2070 -0.1543 -0.1043 0.0230 6.7799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.20716 0.02081 9.956 <2e-16 ***
## pi -0.07402 0.03307 -2.238 0.0254 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4043 on 1198 degrees of freedom
## Multiple R-squared: 0.004164, Adjusted R-squared: 0.003333
## F-statistic: 5.01 on 1 and 1198 DF, p-value: 0.02539
tvalue<- summary(original.he)$coef[2,3]
tvalue
## [1] -2.238273
onesidedp <- pt(tvalue,original.he$df)
#left tail (since t #value is neg)
print(onesidedp)
## [1] 0.0126932
lod <- tvalue^2/(2*log(10)) #log in R is the narual log
print(lod) #HE stats
## [1] 1.087878
##################################################################
#product of centered phenotype and the slope is expected to be positive
popmean <- mean(c(sibpair$resid1,sibpair$resid2)) #vector
sibpair$Yp <- (sibpair$resid1 - popmean)*(sibpair$resid2-popmean)
revised.he <- lm(Yp ~ pi,data=sibpair)
print(summary(revised.he))
##
## Call:
## lm(formula = Yp ~ pi, data = sibpair)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98541 -0.04267 -0.01135 0.03541 0.62567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.001077 0.005402 -0.199 0.842
## pi 0.043593 0.008586 5.077 4.44e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.105 on 1198 degrees of freedom
## Multiple R-squared: 0.02107, Adjusted R-squared: 0.02025
## F-statistic: 25.78 on 1 and 1198 DF, p-value: 4.435e-07
revised.tvalue <- summary(revised.he)$coef[2,3] #positive t values
revised.onesidedp <- pt(revised.tvalue,revised.he$df,lower=F)
print(revised.onesidedp)
## [1] 2.217511e-07
revised.lod <- revised.tvalue^2/(2*log(10))
print(revised.lod)
## [1] 5.597882