Load libraries

library(readxl)
require(psych) 
require(plyr)
require(dplyr)
require(graphics)
require(ggplot2)
require(svglite)
require(reshape2)
require(IDPmisc)
require(GLDEX)
require(ggfortify)
require(cluster)
require(ggrepel)
require(wesanderson)
require(gridExtra)
require(devtools)
require(finalfit)
require(table1)
require(MatchIt)
require(plotly)
require(rms)
require(xtable)
require(knitr)
require(corrplot)
require(EnvStats)

define phi matrix functions

phimat_fun<-function(x) { 
 xcol<-dim(x)[2] 
 newx<-matrix(NA,nrow=xcol,ncol=xcol) 
 for(i in 1:xcol) { 
  
  for(j in 1:xcol) {
    a = as.numeric(unlist(x[,i]))
    b = as.numeric(unlist(x[,j]))
    newx[i,j]<-phi(table(a,b)) 
  }
 } 
 rownames(newx)<-colnames(newx)<-colnames(x) 
 return(newx) 
} 



# Get lower triangle of the correlation matrix
  get_lower_tri<-function(phimat){
    phimat[upper.tri(phimat)] <- NA
    return(phimat)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(phimat){
    phimat[lower.tri(phimat)]<- NA
    return(phimat)
  }
  


reorder_phimat <- function(phimat){
# Use correlation between variables as distance
dd <- as.dist((1-phimat)/2)
hc <- hclust(dd)
phimat <-phimat[hc$order, hc$order]
}
load("/data/JMICC_cohort/R_workspace/DataClean.RData")

PCA analysis of 食事内容: 好みの傾向

##  男性全体
pca_diet_dat = subset(cleaned_survey, 性別 ==1 & 塩分制限!=1 & カロリー制限!=1 & 糖分制限!=1 & 脂肪制限!=1, select = c(ご飯, パン, めん,  肉類, 魚類, 油類, 脂類,乳類, 卵類, 野菜類, 豆類, 果実類,  菓子類, 有糖飲料類,コーヒー, 日本緑茶_量))

pca_diet = clara(scale(pca_diet_dat), 2)
autoplot(pca_diet,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "royalblue", loadings.label.size  = 5, loadings.colour = "royalblue") + ggtitle("PCA analysis of 食事内容: 好みの傾向(食事制限なし男性)") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("slategray1", "slategray1"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

##  女性全体
pca_diet_dat = subset(cleaned_survey, 性別 ==2 & 塩分制限!=1 & カロリー制限!=1 & 糖分制限!=1 & 脂肪制限!=1, select = c(ご飯, パン, めん,  肉類, 魚類, 油類, 脂類,乳類, 卵類, 野菜類, 豆類, 果実類,  菓子類, 有糖飲料類,コーヒー, 日本緑茶_量))

pca_diet = clara(scale(pca_diet_dat), 2)
autoplot(pca_diet,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "palevioletred", loadings.label.size  = 5, loadings.colour = "palevioletred") + ggtitle("PCA analysis of 食事内容: 好みの傾向(食事制限なし女性)") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("seashell", "seashell"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

Basic biographic information plot

ggplot(data=cleaned_survey, aes(x=身長_調査票,y=体重_現在, color=as.factor(性別))) + geom_point(alpha=0.2)+ 
  scale_color_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("身長体重") + xlab("身長")

ggplot(data=cleaned_survey, aes(x=開始時年齢,fill=as.factor(性別))) + geom_histogram(alpha=0.5, bins = 35)+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("年齢分布") 

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=BMI, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("BMI") + xlab("開始時年齢")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=エネルギー, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("摂取エネルギー") + xlab("開始時年齢")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=Daily身体活動メッツ, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("Daily身体活動メッツ") + xlab("開始時年齢")

Biomarker plots

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=中性脂肪, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HbA1c")+ ylim(c(0,2000))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HbA1c, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HbA1c")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=総コレステロール, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("総コレステロール")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HDLコレステロール, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HDLコレステロール")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GOT, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("GOT") + ylim(c(0,250))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GPT, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("GPT") + ylim(c(0,500))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=γGTP, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("γGTP") + ylim(c(0,500))

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=収縮期血圧, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("収縮期血圧")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=拡張期血圧, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("拡張期血圧")

Biomarker and diabetes plots

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HbA1c, color=as.factor(糖尿病))) + geom_point(size=2) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢") 

PCA analysis of Biomarkers

biomarker_dat = cleaned_survey %>%
  select(性別, 開始時年齢, 中性脂肪 , HbA1c , 総コレステロール , GOT , GPT , γGTP , 尿酸 , クレアチニン , 収縮期血圧 , 拡張期血圧) %>%
  na.omit()

biomarker_dat$中性脂肪[biomarker_dat$中性脂肪 > 2000] = NA
biomarker_dat$GOT[biomarker_dat$GOT > 250] = NA
biomarker_dat$GPT[biomarker_dat$GPT > 500] = NA
biomarker_dat$γGTP[biomarker_dat$γGTP > 500] = NA
biomarker_dat$クレアチニン[biomarker_dat$クレアチニン > 10] = NA




biomarker_male = biomarker_dat %>% na.omit() %>% filter(性別==1, 開始時年齢>=35)  %>% select(-c(性別,開始時年齢))

biomarker_pca_male = clara(scale(biomarker_male), 2)
autoplot(biomarker_pca_male,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "royalblue", loadings.label.size  = 5, loadings.colour = "royalblue") + theme_minimal() +
  ggtitle("PCA analysis of Biomarker") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("slategray1", "slategray1"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

biomarker_female = biomarker_dat %>% na.omit() %>% filter(性別==2, 開始時年齢>=35 &開始時年齢<=45) %>% select(-c(性別,開始時年齢))

biomarker_pca_female = clara(scale(biomarker_female), 2)
autoplot(biomarker_pca_female,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "palevioletred", loadings.label.size  = 5, loadings.colour = "palevioletred") + theme_minimal() +
  ggtitle("PCA analysis of Biomarkers") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("seashell", "seashell"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

list of diseases

disease_freq = cleaned_survey %>%
  select(胃かいよう,十二指腸かいよう,慢性胃炎,大腸ポリープ,B型肝炎,C型肝炎,肝硬変,脂肪肝,結核,気管支喘息,慢性気管支炎,糖尿病,
           高脂血症,高血圧,狭心症_心筋梗塞,脳卒中,乳腺症,卵巣の病気,卵巣切除,子宮内膜異型増殖症,子宮の病気,子宮切除,がん1)

disease_freq[which(disease_freq == 1, arr.ind = T)] = 0
disease_freq[which(disease_freq == 2, arr.ind = T)] = 1
disease_freq[which(disease_freq == 3, arr.ind = T)] = 1
disease_freq[which(disease_freq == 4, arr.ind = T)] = 0
disease_freq[which(disease_freq == 9, arr.ind = T)] = 0



disease_freq = cbind(subset(cleaned_survey, select=c(性別, 開始時年齢)), disease_freq)  

disease_freq = disease_freq %>% 
  mutate(ag_group = factor(開始時年齢 > 55, labels = c("Below 55", "Above 55")), sex =factor(性別, labels = c("Male", "Female"))) 
  

my.render.cont <- function(x) {
  x = x*100
    with(stats.apply.rounding(stats.default(x), digits=2), c("% "=sprintf("%s %%", MEAN)))
}


table1(data=disease_freq, ~胃かいよう + 十二指腸かいよう + 慢性胃炎 + 大腸ポリープ + B型肝炎 + C型肝炎 + 肝硬変 + 脂肪肝 + 結核 + 気管支喘息 + 慢性気管支炎 + 糖尿病 + 
           高脂血症 + 高血圧 + 狭心症_心筋梗塞 + 脳卒中+ がん1 |ag_group*sex, render.continuous = my.render.cont, topclass = "Rtable1-zebra", overall=F)
Below 55
Above 55
Male
(n=17217)
Female
(n=25168)
Male
(n=23665)
Female
(n=26562)
胃かいよう 14 % 8.6 % 22 % 11 %
Missing 3112 (18.1%) 4963 (19.7%) 5697 (24.1%) 7740 (29.1%)
十二指腸かいよう 12 % 5.6 % 16 % 6.9 %
Missing 2705 (15.7%) 4435 (17.6%) 5464 (23.1%) 7475 (28.1%)
慢性胃炎 9.8 % 11 % 16 % 14 %
Missing 3128 (18.2%) 4976 (19.8%) 5724 (24.2%) 7765 (29.2%)
大腸ポリープ 7.0 % 3.0 % 20 % 8.8 %
Missing 2709 (15.7%) 4441 (17.6%) 5464 (23.1%) 7476 (28.1%)
B型肝炎 1.4 % 0.98 % 2.0 % 1.3 %
Missing 3106 (18.0%) 4954 (19.7%) 5700 (24.1%) 7753 (29.2%)
C型肝炎 0.77 % 0.61 % 1.9 % 1.5 %
Missing 3112 (18.1%) 4955 (19.7%) 5699 (24.1%) 7744 (29.2%)
肝硬変 0.26 % 0.11 % 0.69 % 0.25 %
Missing 926 (5.4%) 1131 (4.5%) 2136 (9.0%) 2525 (9.5%)
脂肪肝 16 % 4.2 % 13 % 7.5 %
Missing 3148 (18.3%) 4976 (19.8%) 5746 (24.3%) 7788 (29.3%)
結核 1.1 % 1.0 % 3.8 % 2.7 %
Missing 2702 (15.7%) 4441 (17.6%) 5456 (23.1%) 7471 (28.1%)
気管支喘息 6.7 % 7.5 % 4.7 % 5.8 %
Missing 497 (2.9%) 589 (2.3%) 1843 (7.8%) 2173 (8.2%)
慢性気管支炎 1.8 % 2.4 % 2.2 % 3.0 %
Missing 3117 (18.1%) 4956 (19.7%) 5705 (24.1%) 7760 (29.2%)
糖尿病 4.7 % 1.4 % 14 % 5.7 %
Missing 487 (2.8%) 599 (2.4%) 1627 (6.9%) 2141 (8.1%)
高脂血症 13 % 6.7 % 19 % 25 %
Missing 487 (2.8%) 577 (2.3%) 1695 (7.2%) 1789 (6.7%)
高血圧 14 % 6.7 % 34 % 26 %
Missing 432 (2.5%) 533 (2.1%) 1302 (5.5%) 1670 (6.3%)
狭心症_心筋梗塞 1.6 % 0.57 % 6.2 % 3.4 %
Missing 909 (5.3%) 1132 (4.5%) 2035 (8.6%) 2471 (9.3%)
脳卒中 0.90 % 0.55 % 3.6 % 2.0 %
Missing 527 (3.1%) 634 (2.5%) 1851 (7.8%) 2258 (8.5%)
がん1 3.4 % 4.6 % 10 % 7.2 %

list of female diseases

table1(data=subset(disease_freq,性別 ==2), ~ 乳腺症 + 卵巣の病気 + 卵巣切除 + 子宮内膜異型増殖症 + 子宮の病気 + 子宮切除 |ag_group,  render.continuous = my.render.cont, topclass = "Rtable1-zebra", overall=F)
Below 55
(n=25168)
Above 55
(n=26562)
乳腺症 16 % 15 %
Missing 5027 (20.0%) 7800 (29.4%)
卵巣の病気 7.0 % 8.8 %
Missing 4995 (19.8%) 7824 (29.5%)
卵巣切除 87 % 82 %
Missing 19377 (77.0%) 17487 (65.8%)
子宮内膜異型増殖症 3.5 % 2.8 %
Missing 5115 (20.3%) 7953 (29.9%)
子宮の病気 13 % 17 %
Missing 5035 (20.0%) 7863 (29.6%)
子宮切除 81 % 55 %
Missing 19468 (77.4%) 19668 (74.0%)

Family correlation for diabetes - Male

## Male
fam_diabetes_male = cleaned_survey %>%
  filter(性別==1)%>%
  select(糖尿病,父親_糖尿病, 母親_糖尿病) %>%
  mutate(息子 = 糖尿病==2 | 糖尿病==3, 父=父親_糖尿病==3, 母=母親_糖尿病==3) %>%
  select(-c(糖尿病,父親_糖尿病, 母親_糖尿病))

phimat = round(phimat_fun(fam_diabetes_male),2) 

# Reorder the Phi correlation matrix
phimat <- reorder_phimat(phimat)
upper_tri <- get_upper_tri(phimat)
# Melt the correlation matrix
melted_phimat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_phimat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "seagreen", high = "salmon", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Phi\nCorrelation") +
#  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1), axis.text.y = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1))+
 coord_fixed()
# Print the heatmap
#print(ggheatmap)
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 8) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_rect(colour = "black", fill=NA, size=2),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))+
ggtitle("糖尿病の家族相関性:息子の場合")

Family correlation for diabetes - Female

## female
fam_diabetes_female = cleaned_survey %>%
  filter(性別==2)%>%
  select(糖尿病,父親_糖尿病, 母親_糖尿病) %>%
  mutate(娘 = 糖尿病==2 | 糖尿病==3, 父=父親_糖尿病==3, 母=母親_糖尿病==3) %>%
  select(-c(糖尿病,父親_糖尿病, 母親_糖尿病))

phimat = round(phimat_fun(fam_diabetes_female),2) 

# Reorder the Phi correlation matrix
phimat <- reorder_phimat(phimat)
upper_tri <- get_upper_tri(phimat)
# Melt the correlation matrix
melted_phimat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_phimat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "seagreen", high = "salmon", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Phi\nCorrelation") +
#  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1), axis.text.y = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1))+
 coord_fixed()
# Print the heatmap
#print(ggheatmap)
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 8) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_rect(colour = "black", fill=NA, size=2),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))+
ggtitle("糖尿病の家族相関性:娘の場合")