2020 Biostat Seminar Final project

Load libraries

library(readxl)
require(psych) 
require(plyr)
require(dplyr)
require(graphics)
require(ggplot2)
require(svglite)
require(reshape2)
require(IDPmisc)
require(GLDEX)
require(ggfortify)
require(cluster)
require(ggrepel)
require(wesanderson)
require(gridExtra)
require(devtools)
require(finalfit)
require(table1)
require(MatchIt)
require(plotly)
require(rms)
require(xtable)
require(knitr)
require(corrplot)
require(EnvStats)

define phi matrix functions

phimat_fun<-function(x) { 
 xcol<-dim(x)[2] 
 newx<-matrix(NA,nrow=xcol,ncol=xcol) 
 for(i in 1:xcol) { 
  
  for(j in 1:xcol) {
    a = as.numeric(unlist(x[,i]))
    b = as.numeric(unlist(x[,j]))
    newx[i,j]<-phi(table(a,b)) 
  }
 } 
 rownames(newx)<-colnames(newx)<-colnames(x) 
 return(newx) 
} 



# Get lower triangle of the correlation matrix
  get_lower_tri<-function(phimat){
    phimat[upper.tri(phimat)] <- NA
    return(phimat)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(phimat){
    phimat[lower.tri(phimat)]<- NA
    return(phimat)
  }
  


reorder_phimat <- function(phimat){
# Use correlation between variables as distance
dd <- as.dist((1-phimat)/2)
hc <- hclust(dd)
phimat <-phimat[hc$order, hc$order]
}

load("/data/JMICC_cohort/R_workspace/DataClean.RData")

PCA analysis of 食事内容: 好みの傾向

##  男性全体
pca_diet_dat = subset(cleaned_survey, 性別 ==1 & 塩分制限!=1 & カロリー制限!=1 & 糖分制限!=1 & 脂肪制限!=1, select = c(ご飯, パン, めん,  肉類, 魚類, 油類, 脂類,乳類, 卵類, 野菜類, 豆類, 果実類,  菓子類, 有糖飲料類,コーヒー, 日本緑茶_量))

pca_diet = clara(scale(pca_diet_dat), 2)
autoplot(pca_diet,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "royalblue", loadings.label.size  = 5, loadings.colour = "royalblue") + ggtitle("PCA analysis of 食事内容: 好みの傾向（食事制限なし男性）") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("slategray1", "slategray1"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

##  女性全体
pca_diet_dat = subset(cleaned_survey, 性別 ==2 & 塩分制限!=1 & カロリー制限!=1 & 糖分制限!=1 & 脂肪制限!=1, select = c(ご飯, パン, めん,  肉類, 魚類, 油類, 脂類,乳類, 卵類, 野菜類, 豆類, 果実類,  菓子類, 有糖飲料類,コーヒー, 日本緑茶_量))

pca_diet = clara(scale(pca_diet_dat), 2)
autoplot(pca_diet,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "palevioletred", loadings.label.size  = 5, loadings.colour = "palevioletred") + ggtitle("PCA analysis of 食事内容: 好みの傾向（食事制限なし女性）") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("seashell", "seashell"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

Basic biographic information plot

ggplot(data=cleaned_survey, aes(x=身長_調査票,y=体重_現在, color=as.factor(性別))) + geom_point(alpha=0.2)+ 
  scale_color_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("身長体重") + xlab("身長")

ggplot(data=cleaned_survey, aes(x=開始時年齢,fill=as.factor(性別))) + geom_histogram(alpha=0.5, bins = 35)+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("年齢分布")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=BMI, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("BMI") + xlab("開始時年齢")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=エネルギー, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("摂取エネルギー") + xlab("開始時年齢")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=Daily身体活動メッツ, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("Daily身体活動メッツ") + xlab("開始時年齢")

Biomarker plots

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=中性脂肪, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HbA1c")+ ylim(c(0,2000))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HbA1c, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HbA1c")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=総コレステロール, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("総コレステロール")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HDLコレステロール, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HDLコレステロール")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GOT, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("GOT") + ylim(c(0,250))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GPT, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("GPT") + ylim(c(0,500))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=γGTP, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("γGTP") + ylim(c(0,500))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=収縮期血圧, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("収縮期血圧")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=拡張期血圧, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("拡張期血圧")

Biomarker and diabetes plots

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HbA1c, color=as.factor(糖尿病))) + geom_point(size=2) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")

PCA analysis of Biomarkers

biomarker_dat = cleaned_survey %>%
  select(性別, 開始時年齢, 中性脂肪 , HbA1c , 総コレステロール , GOT , GPT , γGTP , 尿酸 , クレアチニン , 収縮期血圧 , 拡張期血圧) %>%
  na.omit()

biomarker_dat$中性脂肪[biomarker_dat$中性脂肪 > 2000] = NA
biomarker_dat$GOT[biomarker_dat$GOT > 250] = NA
biomarker_dat$GPT[biomarker_dat$GPT > 500] = NA
biomarker_dat$γGTP[biomarker_dat$γGTP > 500] = NA
biomarker_dat$クレアチニン[biomarker_dat$クレアチニン > 10] = NA




biomarker_male = biomarker_dat %>% na.omit() %>% filter(性別==1, 開始時年齢>=35)  %>% select(-c(性別,開始時年齢))

biomarker_pca_male = clara(scale(biomarker_male), 2)
autoplot(biomarker_pca_male,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "royalblue", loadings.label.size  = 5, loadings.colour = "royalblue") + theme_minimal() +
  ggtitle("PCA analysis of Biomarker") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("slategray1", "slategray1"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

biomarker_female = biomarker_dat %>% na.omit() %>% filter(性別==2, 開始時年齢>=35 &開始時年齢<=45) %>% select(-c(性別,開始時年齢))

biomarker_pca_female = clara(scale(biomarker_female), 2)
autoplot(biomarker_pca_female,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "palevioletred", loadings.label.size  = 5, loadings.colour = "palevioletred") + theme_minimal() +
  ggtitle("PCA analysis of Biomarkers") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("seashell", "seashell"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

list of diseases

disease_freq = cleaned_survey %>%
  select(胃かいよう,十二指腸かいよう,慢性胃炎,大腸ポリープ,Ｂ型肝炎,Ｃ型肝炎,肝硬変,脂肪肝,結核,気管支喘息,慢性気管支炎,糖尿病,
           高脂血症,高血圧,狭心症_心筋梗塞,脳卒中,乳腺症,卵巣の病気,卵巣切除,子宮内膜異型増殖症,子宮の病気,子宮切除,がん1)

disease_freq[which(disease_freq == 1, arr.ind = T)] = 0
disease_freq[which(disease_freq == 2, arr.ind = T)] = 1
disease_freq[which(disease_freq == 3, arr.ind = T)] = 1
disease_freq[which(disease_freq == 4, arr.ind = T)] = 0
disease_freq[which(disease_freq == 9, arr.ind = T)] = 0



disease_freq = cbind(subset(cleaned_survey, select=c(性別, 開始時年齢)), disease_freq)  

disease_freq = disease_freq %>% 
  mutate(ag_group = factor(開始時年齢 > 55, labels = c("Below 55", "Above 55")), sex =factor(性別, labels = c("Male", "Female"))) 
  

my.render.cont <- function(x) {
  x = x*100
    with(stats.apply.rounding(stats.default(x), digits=2), c("% "=sprintf("%s %%", MEAN)))
}


table1(data=disease_freq, ~胃かいよう + 十二指腸かいよう + 慢性胃炎 + 大腸ポリープ + Ｂ型肝炎 + Ｃ型肝炎 + 肝硬変 + 脂肪肝 + 結核 + 気管支喘息 + 慢性気管支炎 + 糖尿病 + 
           高脂血症 + 高血圧 + 狭心症_心筋梗塞 + 脳卒中+ がん1 |ag_group*sex, render.continuous = my.render.cont, topclass = "Rtable1-zebra", overall=F)

	Below 55		Above 55
	Male (n=17217)	Female (n=25168)	Male (n=23665)	Female (n=26562)
胃かいよう	14 %	8.6 %	22 %	11 %
Missing	3112 (18.1%)	4963 (19.7%)	5697 (24.1%)	7740 (29.1%)
十二指腸かいよう	12 %	5.6 %	16 %	6.9 %
Missing	2705 (15.7%)	4435 (17.6%)	5464 (23.1%)	7475 (28.1%)
慢性胃炎	9.8 %	11 %	16 %	14 %
Missing	3128 (18.2%)	4976 (19.8%)	5724 (24.2%)	7765 (29.2%)
大腸ポリープ	7.0 %	3.0 %	20 %	8.8 %
Missing	2709 (15.7%)	4441 (17.6%)	5464 (23.1%)	7476 (28.1%)
Ｂ型肝炎	1.4 %	0.98 %	2.0 %	1.3 %
Missing	3106 (18.0%)	4954 (19.7%)	5700 (24.1%)	7753 (29.2%)
Ｃ型肝炎	0.77 %	0.61 %	1.9 %	1.5 %
Missing	3112 (18.1%)	4955 (19.7%)	5699 (24.1%)	7744 (29.2%)
肝硬変	0.26 %	0.11 %	0.69 %	0.25 %
Missing	926 (5.4%)	1131 (4.5%)	2136 (9.0%)	2525 (9.5%)
脂肪肝	16 %	4.2 %	13 %	7.5 %
Missing	3148 (18.3%)	4976 (19.8%)	5746 (24.3%)	7788 (29.3%)
結核	1.1 %	1.0 %	3.8 %	2.7 %
Missing	2702 (15.7%)	4441 (17.6%)	5456 (23.1%)	7471 (28.1%)
気管支喘息	6.7 %	7.5 %	4.7 %	5.8 %
Missing	497 (2.9%)	589 (2.3%)	1843 (7.8%)	2173 (8.2%)
慢性気管支炎	1.8 %	2.4 %	2.2 %	3.0 %
Missing	3117 (18.1%)	4956 (19.7%)	5705 (24.1%)	7760 (29.2%)
糖尿病	4.7 %	1.4 %	14 %	5.7 %
Missing	487 (2.8%)	599 (2.4%)	1627 (6.9%)	2141 (8.1%)
高脂血症	13 %	6.7 %	19 %	25 %
Missing	487 (2.8%)	577 (2.3%)	1695 (7.2%)	1789 (6.7%)
高血圧	14 %	6.7 %	34 %	26 %
Missing	432 (2.5%)	533 (2.1%)	1302 (5.5%)	1670 (6.3%)
狭心症_心筋梗塞	1.6 %	0.57 %	6.2 %	3.4 %
Missing	909 (5.3%)	1132 (4.5%)	2035 (8.6%)	2471 (9.3%)
脳卒中	0.90 %	0.55 %	3.6 %	2.0 %
Missing	527 (3.1%)	634 (2.5%)	1851 (7.8%)	2258 (8.5%)
がん1	3.4 %	4.6 %	10 %	7.2 %

list of female diseases

table1(data=subset(disease_freq,性別 ==2), ~ 乳腺症 + 卵巣の病気 + 卵巣切除 + 子宮内膜異型増殖症 + 子宮の病気 + 子宮切除 |ag_group,  render.continuous = my.render.cont, topclass = "Rtable1-zebra", overall=F)

	Below 55 (n=25168)	Above 55 (n=26562)
乳腺症	16 %	15 %
Missing	5027 (20.0%)	7800 (29.4%)
卵巣の病気	7.0 %	8.8 %
Missing	4995 (19.8%)	7824 (29.5%)
卵巣切除	87 %	82 %
Missing	19377 (77.0%)	17487 (65.8%)
子宮内膜異型増殖症	3.5 %	2.8 %
Missing	5115 (20.3%)	7953 (29.9%)
子宮の病気	13 %	17 %
Missing	5035 (20.0%)	7863 (29.6%)
子宮切除	81 %	55 %
Missing	19468 (77.4%)	19668 (74.0%)

Family correlation for diabetes - Male

## Male
fam_diabetes_male = cleaned_survey %>%
  filter(性別==1)%>%
  select(糖尿病,父親_糖尿病, 母親_糖尿病) %>%
  mutate(息子 = 糖尿病==2 | 糖尿病==3, 父=父親_糖尿病==3, 母=母親_糖尿病==3) %>%
  select(-c(糖尿病,父親_糖尿病, 母親_糖尿病))

phimat = round(phimat_fun(fam_diabetes_male),2) 

# Reorder the Phi correlation matrix
phimat <- reorder_phimat(phimat)
upper_tri <- get_upper_tri(phimat)
# Melt the correlation matrix
melted_phimat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_phimat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "seagreen", high = "salmon", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Phi\nCorrelation") +
#  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1), axis.text.y = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1))+
 coord_fixed()
# Print the heatmap
#print(ggheatmap)
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 8) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_rect(colour = "black", fill=NA, size=2),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))+
ggtitle("糖尿病の家族相関性：息子の場合")

Family correlation for diabetes - Female

## female
fam_diabetes_female = cleaned_survey %>%
  filter(性別==2)%>%
  select(糖尿病,父親_糖尿病, 母親_糖尿病) %>%
  mutate(娘 = 糖尿病==2 | 糖尿病==3, 父=父親_糖尿病==3, 母=母親_糖尿病==3) %>%
  select(-c(糖尿病,父親_糖尿病, 母親_糖尿病))

phimat = round(phimat_fun(fam_diabetes_female),2) 

# Reorder the Phi correlation matrix
phimat <- reorder_phimat(phimat)
upper_tri <- get_upper_tri(phimat)
# Melt the correlation matrix
melted_phimat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_phimat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "seagreen", high = "salmon", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Phi\nCorrelation") +
#  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1), axis.text.y = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1))+
 coord_fixed()
# Print the heatmap
#print(ggheatmap)
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 8) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_rect(colour = "black", fill=NA, size=2),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))+
ggtitle("糖尿病の家族相関性：娘の場合")

2020 Biostat Seminar Final project

Mindy Fang

2020/1/15

Load libraries

define phi matrix functions

PCA analysis of 食事内容: 好みの傾向

Basic biographic information plot

Biomarker plots

Biomarker and diabetes plots

PCA analysis of Biomarkers

list of diseases

list of female diseases

Family correlation for diabetes - Male

Family correlation for diabetes - Female