tr(k3) tr(k4) for real data(3K.DEMO.UKB)

k

2021-09-28

key module

from pandas_plink import read_plink1_bin as rb
import numpy as np
import pandas as pd

data

G=rb('C:/Users/xuxiyun/Desktop/zherenyi/K4/code/3K.DEMO/3K.DEMO.UKB.bed','C:/Users/xuxiyun/Desktop/zherenyi/K4/code/3K.DEMO/3K.DEMO.UKB.bim','C:/Users/xuxiyun/Desktop/zherenyi/K4/code/3K.DEMO/3K.DEMO.UKB.fam',verbose=False)
G.values.shape
## (3000, 128800)

scale and fillna

scale

data=G.values
data=pd.DataFrame(data)
data1=data.apply(lambda x:(x-x.mean())/x.std())
data1.iloc[:5,:10]
##           0         1         2  ...         7         8         9
## 0  0.166689 -0.915431  1.160644  ...  0.371925  0.272879  0.486555
## 1  0.166689  0.786950  1.160644  ... -2.549784  0.272879 -1.898442
## 2  0.166689  0.786950  1.160644  ...  0.371925  0.272879  0.486555
## 3  0.166689 -0.915431 -1.731294  ...  0.371925  0.272879  0.486555
## 4  0.166689  0.786950 -0.285325  ...  0.371925  0.272879  0.486555
## 
## [5 rows x 10 columns]

fiina

data1.apply(lambda x:sum(x.isnull()))[0:10]
## 0     3
## 1    32
## 2    10
## 3     8
## 4     4
## 5     4
## 6     2
## 7     7
## 8     2
## 9     5
## dtype: int64
dat=data1.fillna(0)

GRM

K=dat.values@dat.values.T/dat.shape[1]
K[0:10,0:10]
## array([[ 9.88062828e-01, -1.78662472e-04, -5.73319796e-04,
##          3.50844673e-03, -6.87652552e-04,  2.33545694e-03,
##         -3.12459413e-03, -3.23467018e-03, -5.95952739e-04,
##          1.39147693e-03],
##        [-1.78662472e-04,  9.88673877e-01,  1.00515247e-03,
##          4.49886109e-03,  2.47705661e-03,  2.86454598e-03,
##          5.64560801e-04, -4.87256139e-04,  3.50549828e-03,
##          3.36872267e-03],
##        [-5.73319796e-04,  1.00515247e-03,  9.99318165e-01,
##          5.88256504e-03, -5.00237506e-03, -5.31566336e-04,
##         -1.01830311e-03,  2.47129097e-03, -3.40911320e-03,
##          1.54622783e-03],
##        [ 3.50844673e-03,  4.49886109e-03,  5.88256504e-03,
##          1.01129258e+00,  9.19627107e-05, -5.17398005e-03,
##          1.65046046e-04,  1.33345160e-03,  2.84834962e-03,
##         -1.11263216e-03],
##        [-6.87652552e-04,  2.47705661e-03, -5.00237506e-03,
##          9.19627107e-05,  1.00903709e+00, -2.36103579e-03,
##          3.06225842e-03,  1.26534124e-03, -1.62574981e-03,
##         -1.33268309e-03],
##        [ 2.33545694e-03,  2.86454598e-03, -5.31566336e-04,
##         -5.17398005e-03, -2.36103579e-03,  1.01439635e+00,
##         -3.54515407e-03, -1.22486683e-03, -4.84228045e-04,
##          1.45171574e-03],
##        [-3.12459413e-03,  5.64560801e-04, -1.01830311e-03,
##          1.65046046e-04,  3.06225842e-03, -3.54515407e-03,
##          1.01083948e+00,  3.54029898e-03, -5.01244492e-03,
##          5.58963859e-04],
##        [-3.23467018e-03, -4.87256139e-04,  2.47129097e-03,
##          1.33345160e-03,  1.26534124e-03, -1.22486683e-03,
##          3.54029898e-03,  9.80340099e-01, -6.70922854e-04,
##          1.73421836e-03],
##        [-5.95952739e-04,  3.50549828e-03, -3.40911320e-03,
##          2.84834962e-03, -1.62574981e-03, -4.84228045e-04,
##         -5.01244492e-03, -6.70922854e-04,  9.98234484e-01,
##          1.36143524e-03],
##        [ 1.39147693e-03,  3.36872267e-03,  1.54622783e-03,
##         -1.11263216e-03, -1.33268309e-03,  1.45171574e-03,
##          5.58963859e-04,  1.73421836e-03,  1.36143524e-03,
##          1.02183084e+00]])

me

tril=[K[i,j] for i in range(3000) for j in range(i)]
me=1/np.var(tril)
me
## 104254.66223384863

trace accuracy

real tr(k2)

K2=K@K
np.trace(K2)
## 3070.369458205084

\(estimate:\quad tr(k2)=n+n^2/me\)

n=dat.shape[0]
estimate2=n+n**2/me
estimate2
## 3086.327074561064

real tr(k3)

K3=K2@K
np.trace(K3)
## 3238.6133714200278

\(estimate:\quad tr(k3)=n+2+3n^2/me\)

estimate3=n+2+3*n**2/me
estimate3
## 3260.981223683192

real tr(k4)

K4=K3@K
np.trace(K4)
## 3503.679616350926

\(estimate:\quad tr(k4)=n+3+(6n^2+4n+4)/me+(2n^3+2n^2-n)/me^2\)

estimate4=n+3+(6*n**2+4*n+4)/me+(2*n**3+2*n**2-n)/me**2
estimate4
## 3526.0474868397337