gene

Liao

2022-11-23

print(Sys.time())
## [1] "2022-11-23 11:24:04 CST"

数量趋势

总体

hist(miss_rate*100, 
     main='缺失率分布', 
     breaks = 100, 
     xlab='missing_rate', 
     ylab='', 
     xlim=range(0, 100), 
     ylim=range(0, 500))

参考基因组和测试组比对图

# 载入数据
dat_1 <- read.table("D:/NIPT/CHB_CHS_CHR_ALL.Hg38.reference.frq", header = T)
dat_2 <- read.table("D:/NIPT/CHB_CHS_CHR_ALL.Hg38.bim", header = F)
exa = dat[which(dat$F_MISS <= 0.2),]   #筛选出缺失率小于20%的行
position = exa$POS                    #提取数字
dat_22 <- subset(dat_2,dat_2$V4 %in% position)  #根据bim文件提取相同位置的SNP名字
dat_11_return <- subset(dat_1,dat_1$SNP %in% c(dat_22$V2))   ##在freq文件中根据名字筛选共同行
d=data.frame(dat_22$V2,dat_22$V4)
colnames(d) <- c("SNP","POS")
dat_11_return <- left_join(dat_11_return,d,by="SNP") 


dat_3 <- read.table("D:/NIPT/snp_info.frq", header = T)
map <- read.table("D:/NIPT/snp_info.map", header = F)
map_1 <- subset(map,map$V4 %in% c(dat_22$V4))     #根据上述相同的pos位置找出测试中的相同SNP名字
dat_33 <- subset(dat_3,dat_3$SNP %in% c(map_1$V2)) #筛选相同行
e=data.frame(map_1$V2,map_1$V4)
colnames(e)<-c("SNP","POS")
dat_33_return <- left_join(dat_33, e, by="SNP")
dat_33_return <- dat_33_return %>% distinct(POS, .keep_all = TRUE)  #由于一个位置找到了多个SNP名字所以先删除了


position_1 <- Reduce(intersect,list(dat_11_return$POS,dat_33_return$POS))
length(unique(position_1))
## [1] 64
dat_11_return <- subset(dat_11_return,dat_11_return$POS %in% position_1)
dat_33_return <- subset(dat_33_return,dat_33_return$POS %in% position_1)

dt = left_join(dat_11_return, dat_33_return, by="POS")    #根据pos定位相同的SNP,A1,A2位置相反的1-maf
dt = dt[which(dt$CHR.x == dt$CHR.y),]
# Change major and minor
changeA1 = which(dt[,3]!=dt[,10])
newMaf = dt[,12]
newMaf[changeA1]=1-dt[changeA1,12]
Frq=data.frame(dt[,5],newMaf)
frq=na.omit(Frq)
cor(frq[,1],frq[,2])         ##找出两列freq做一个correlation
## [1] -0.004855248
dt[1:4,]
##    CHR.x       SNP.x A1.x A2.x   MAF.x NCHROBS.x      POS CHR.y
## 6      3  rs80283855    G    C 0.16350       416 31061439     3
## 8      4  rs62302678    T    C 0.29570       416 46400317     4
## 9      4  rs60435887    G    A 0.27880       416 56831606     4
## 10     5 rs117759241    A    C 0.07212       416 31068325     5
##                SNP.y A1.y A2.y   MAF.y NCHROBS.y
## 6  chr3_31061439_C_G    G    C 0.02439        82
## 8  chr4_46400317_C_T    T    C 0.03636       110
## 9  chr4_56831606_A_G    G    A 0.02083        96
## 10 chr5_31068325_C_A    A    C 0.01370       146

散点图

ggplot(data = frq) + 
  geom_point(mapping = aes(x = frq$dt...5., y = frq$newMaf))+
  labs(x = 'reference',y = 'test',family='serif') +
  theme(axis.title =element_text(size = 12),
        axis.text =element_text(size = 12, color = 'black'))+
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
  theme(text=element_text(family='serif'))
## Warning: Use of `frq$dt...5.` is discouraged. Use `dt...5.` instead.
## Warning: Use of `frq$newMaf` is discouraged. Use `newMaf` instead.