print(Sys.time())
## [1] "2022-11-23 11:24:04 CST"
数量趋势

总体
hist(miss_rate*100,
main='缺失率分布',
breaks = 100,
xlab='missing_rate',
ylab='',
xlim=range(0, 100),
ylim=range(0, 500))

参考基因组和测试组比对图
# 载入数据
dat_1 <- read.table("D:/NIPT/CHB_CHS_CHR_ALL.Hg38.reference.frq", header = T)
dat_2 <- read.table("D:/NIPT/CHB_CHS_CHR_ALL.Hg38.bim", header = F)
exa = dat[which(dat$F_MISS <= 0.2),] #筛选出缺失率小于20%的行
position = exa$POS #提取数字
dat_22 <- subset(dat_2,dat_2$V4 %in% position) #根据bim文件提取相同位置的SNP名字
dat_11_return <- subset(dat_1,dat_1$SNP %in% c(dat_22$V2)) ##在freq文件中根据名字筛选共同行
d=data.frame(dat_22$V2,dat_22$V4)
colnames(d) <- c("SNP","POS")
dat_11_return <- left_join(dat_11_return,d,by="SNP")
dat_3 <- read.table("D:/NIPT/snp_info.frq", header = T)
map <- read.table("D:/NIPT/snp_info.map", header = F)
map_1 <- subset(map,map$V4 %in% c(dat_22$V4)) #根据上述相同的pos位置找出测试中的相同SNP名字
dat_33 <- subset(dat_3,dat_3$SNP %in% c(map_1$V2)) #筛选相同行
e=data.frame(map_1$V2,map_1$V4)
colnames(e)<-c("SNP","POS")
dat_33_return <- left_join(dat_33, e, by="SNP")
dat_33_return <- dat_33_return %>% distinct(POS, .keep_all = TRUE) #由于一个位置找到了多个SNP名字所以先删除了
position_1 <- Reduce(intersect,list(dat_11_return$POS,dat_33_return$POS))
length(unique(position_1))
## [1] 64
dat_11_return <- subset(dat_11_return,dat_11_return$POS %in% position_1)
dat_33_return <- subset(dat_33_return,dat_33_return$POS %in% position_1)
dt = left_join(dat_11_return, dat_33_return, by="POS") #根据pos定位相同的SNP,A1,A2位置相反的1-maf
dt = dt[which(dt$CHR.x == dt$CHR.y),]
# Change major and minor
changeA1 = which(dt[,3]!=dt[,10])
newMaf = dt[,12]
newMaf[changeA1]=1-dt[changeA1,12]
Frq=data.frame(dt[,5],newMaf)
frq=na.omit(Frq)
cor(frq[,1],frq[,2]) ##找出两列freq做一个correlation
## [1] -0.004855248
dt[1:4,]
## CHR.x SNP.x A1.x A2.x MAF.x NCHROBS.x POS CHR.y
## 6 3 rs80283855 G C 0.16350 416 31061439 3
## 8 4 rs62302678 T C 0.29570 416 46400317 4
## 9 4 rs60435887 G A 0.27880 416 56831606 4
## 10 5 rs117759241 A C 0.07212 416 31068325 5
## SNP.y A1.y A2.y MAF.y NCHROBS.y
## 6 chr3_31061439_C_G G C 0.02439 82
## 8 chr4_46400317_C_T T C 0.03636 110
## 9 chr4_56831606_A_G G A 0.02083 96
## 10 chr5_31068325_C_A A C 0.01370 146
散点图
ggplot(data = frq) +
geom_point(mapping = aes(x = frq$dt...5., y = frq$newMaf))+
labs(x = 'reference',y = 'test',family='serif') +
theme(axis.title =element_text(size = 12),
axis.text =element_text(size = 12, color = 'black'))+
theme(axis.text.x = element_text(angle = 0, hjust = 0.3))+
theme(text=element_text(family='serif'))
## Warning: Use of `frq$dt...5.` is discouraged. Use `dt...5.` instead.
## Warning: Use of `frq$newMaf` is discouraged. Use `newMaf` instead.
