2020 Biostat Seminar Final project

Load libraries

library(readxl)
#library(summarytools)
require(psych) 
require(plyr)
require(dplyr)
require(graphics)
require(ggplot2)
require(svglite)
require(reshape2)
require(IDPmisc)
require(GLDEX)
require(ggfortify)
require(cluster)
require(ggrepel)
require(wesanderson)
require(gridExtra)
require(devtools)
#devtools::install_github("ewenharrison/finalfit")
require(finalfit)
require(table1)
require(MatchIt)
require(plotly)
require(rms)
require(xtable)
require(knitr)
require(corrplot)
require(EnvStats)

library(data.table)
library(stringr)
library(lubridate)


library(tidyr)
library(purrr)
library(readr)


library(tidytext)
library(widyr)

library(ggraph)
library(igraph)

define phi matrix functions

phimat_fun<-function(x) { 
 xcol<-dim(x)[2] 
 newx<-matrix(NA,nrow=xcol,ncol=xcol) 
 for(i in 1:xcol) { 
  
  for(j in 1:xcol) {
    a = as.numeric(unlist(x[,i]))
    b = as.numeric(unlist(x[,j]))
    newx[i,j]<-phi(table(a,b)) 
  }
 } 
 rownames(newx)<-colnames(newx)<-colnames(x) 
 return(newx) 
} 



# Get lower triangle of the correlation matrix
  get_lower_tri<-function(phimat){
    phimat[upper.tri(phimat)] <- NA
    return(phimat)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(phimat){
    phimat[lower.tri(phimat)]<- NA
    return(phimat)
  }
  


reorder_phimat <- function(phimat){
# Use correlation between variables as distance
dd <- as.dist((1-phimat)/2)
hc <- hclust(dd)
phimat <-phimat[hc$order, hc$order]
}

load("/data/JMICC_cohort/R_workspace/DataClean.RData")
#load("/data/JMICC_cohort/R_workspace/DataModeling.RData")

PCA analysis of 食事内容: 好みの傾向

##  男性全体
pca_diet_dat = subset(cleaned_survey, 性別 ==1 & 塩分制限!=1 & カロリー制限!=1 & 糖分制限!=1 & 脂肪制限!=1, select = c(ご飯, パン, めん,  肉類, 魚類, 油類, 脂類,乳類, 卵類, 野菜類, 豆類, 果実類,  菓子類, 有糖飲料類,コーヒー, 日本緑茶_量))

pca_diet = clara(scale(pca_diet_dat), 2)
autoplot(pca_diet,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "royalblue", loadings.label.size  = 5, loadings.colour = "royalblue") + ggtitle("PCA analysis of 食事内容: 好みの傾向（食事制限なし男性）") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("grey", "grey"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

##  女性全体
pca_diet_dat = subset(cleaned_survey, 性別 ==2 & 塩分制限!=1 & カロリー制限!=1 & 糖分制限!=1 & 脂肪制限!=1, select = c(ご飯, パン, めん,  肉類, 魚類, 油類, 脂類,乳類, 卵類, 野菜類, 豆類, 果実類,  菓子類, 有糖飲料類,コーヒー, 日本緑茶_量))

pca_diet = clara(scale(pca_diet_dat), 2)
autoplot(pca_diet,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "rosybrown", loadings.label.size  = 5, loadings.colour = "rosybrown") + ggtitle("PCA analysis of 食事内容: 好みの傾向（食事制限なし女性）") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("grey", "grey"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

Basic biographic information plot

ggplot(data=cleaned_survey, aes(x=身長_調査票,y=体重_現在, color=as.factor(性別))) + geom_point(alpha=0.2)+ 
  scale_color_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("身長体重") + xlab("身長")

## Warning: Removed 1982 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=開始時年齢,fill=as.factor(性別))) + geom_histogram(alpha=0.5, bins = 35)+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("年齢分布")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=BMI, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("BMI") + xlab("開始時年齢")

## Warning: Removed 1982 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=エネルギー, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("摂取エネルギー") + xlab("開始時年齢")

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢),y=Daily身体活動メッツ, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey")+ 
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"),values=c("royalblue", "palevioletred")) + theme_minimal() + facet_grid(~性別) + ggtitle("Daily身体活動メッツ") + xlab("開始時年齢")

Biomarker plots

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=中性脂肪, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HbA1c")

## Warning: Removed 18111 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HbA1c, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HbA1c")

## Warning: Removed 31096 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=総コレステロール, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("総コレステロール")

## Warning: Removed 25557 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HDLコレステロール, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("HDLコレステロール")

## Warning: Removed 18106 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GOT, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("GOT") + ylim(c(0,250))

## Warning: Removed 18118 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GPT, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("GPT") + ylim(c(0,500))

## Warning: Removed 18110 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=γGTP, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("γGTP") + ylim(c(0,500))

## Warning: Removed 19115 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=尿酸, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("尿酸")

## Warning: Removed 22549 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=クレアチニン, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢")  + ggtitle("クレアチニン")+ ylim(c(0,10))

## Warning: Removed 20141 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=収縮期血圧, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("収縮期血圧")

## Warning: Removed 19694 rows containing non-finite values (stat_boxplot).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=拡張期血圧, fill=as.factor(性別))) + geom_boxplot(alpha=0.7, color="lightgrey") + facet_grid(~性別)+ theme_minimal()+
  scale_fill_manual(name="性別", labels=c("男性","女性"), breaks = c("1", "2"), values=c("royalblue", "palevioletred"))+ xlab("年齢") + ggtitle("拡張期血圧")

## Warning: Removed 19736 rows containing non-finite values (stat_boxplot).

Biomarker and diabetes plots

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=中性脂肪, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢") + ylim(c(0,2000))

## Warning: Removed 22884 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HbA1c, color=as.factor(糖尿病))) + geom_point(size=2) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")

## Warning: Removed 35817 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=総コレステロール, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")

## Warning: Removed 30311 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=HDLコレステロール, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")

## Warning: Removed 22876 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GOT, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")  + ylim(c(0,250))

## Warning: Removed 22888 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=GPT, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢") + ylim(c(0,500))

## Warning: Removed 22880 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=γGTP, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢") + ylim(c(0,500))

## Warning: Removed 23865 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=尿酸, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")

## Warning: Removed 27316 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=クレアチニン, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢") + ylim(c(0,10))

## Warning: Removed 24896 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=収縮期血圧, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")+ xlab("年齢")

## Warning: Removed 24463 rows containing missing values (geom_point).

ggplot(data=cleaned_survey, aes(x=as.factor(開始時年齢), y=拡張期血圧, color=as.factor(糖尿病))) + geom_point(size=0.5) + facet_grid(~性別)+ theme_minimal()+
  scale_color_manual(name="糖尿病", labels=c("なし","かかっている","かかったことある"), breaks = c("1", "2", "3"), values=c("lightgrey", "salmon", "salmon"))+ xlab("年齢")

## Warning: Removed 24505 rows containing missing values (geom_point).

biomarker_dat = cleaned_survey %>%
  select(性別, 開始時年齢, 中性脂肪 , HbA1c , 総コレステロール , GOT , GPT , γGTP , 尿酸 , クレアチニン , 収縮期血圧 , 拡張期血圧) %>%
  na.omit()

biomarker_dat$中性脂肪[biomarker_dat$中性脂肪 > 2000] = NA
biomarker_dat$GOT[biomarker_dat$GOT > 250] = NA
biomarker_dat$GPT[biomarker_dat$GPT > 500] = NA
biomarker_dat$γGTP[biomarker_dat$γGTP > 500] = NA
biomarker_dat$クレアチニン[biomarker_dat$クレアチニン > 10] = NA




biomarker_male = biomarker_dat %>% na.omit() %>% filter(性別==1, 開始時年齢>=35)  %>% select(-c(性別,開始時年齢))

biomarker_pca_male = clara(scale(biomarker_male), 2)
autoplot(biomarker_pca_male,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "royalblue", loadings.label.size  = 5, loadings.colour = "royalblue") + theme_minimal() +
  ggtitle("PCA analysis of Biomarker") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("slategray1", "slategray1"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

biomarker_female = biomarker_dat %>% na.omit() %>% filter(性別==2, 開始時年齢>=35 &開始時年齢<=45) %>% select(-c(性別,開始時年齢))

biomarker_pca_female = clara(scale(biomarker_female), 2)
autoplot(biomarker_pca_female,loadings=TRUE, loadings.label = TRUE, loadings.label.colour = "palevioletred", loadings.label.size  = 5, loadings.colour = "palevioletred") + theme_minimal() +
  ggtitle("PCA analysis of Biomarkers") +scale_color_manual(name="", labels=c("",""), breaks = c("1", "2"), values=c("seashell", "seashell"))+theme(legend.text = element_text(size = 16)) + theme_minimal()

list of diseases

revalue_disease = function(x){
  if(is.na(x)){x=0}
  else{
    x = as.character(x)
  x = revalue(x, c("1"=0, "2"=1, "3"=1, "4"=0, "9" =0)) 
  x = as.numeric(x)
}}
  

disease_freq = cleaned_survey %>%
  select(胃かいよう,十二指腸かいよう,慢性胃炎,大腸ポリープ,Ｂ型肝炎,Ｃ型肝炎,肝硬変,脂肪肝,結核,気管支喘息,慢性気管支炎,糖尿病,
           高脂血症,高血圧,狭心症_心筋梗塞,脳卒中,乳腺症,卵巣の病気,卵巣切除,子宮内膜異型増殖症,子宮の病気,子宮切除,がん1)

disease_freq[which(disease_freq == 1, arr.ind = T)] = 0
disease_freq[which(disease_freq == 2, arr.ind = T)] = 1
disease_freq[which(disease_freq == 3, arr.ind = T)] = 1
disease_freq[which(disease_freq == 4, arr.ind = T)] = 0
disease_freq[which(disease_freq == 9, arr.ind = T)] = 0



disease_freq = cbind(subset(cleaned_survey, select=c(性別, 開始時年齢)), disease_freq)  

disease_freq = disease_freq %>% 
  mutate(ag_group = factor(開始時年齢 > 55, labels = c("Below 55", "Above 55")), sex =factor(性別, labels = c("Male", "Female"))) 
  

my.render.cont <- function(x) {
  x = x*100
    with(stats.apply.rounding(stats.default(x), digits=2), c("% "=sprintf("%s %%", MEAN)))
}


table1(data=disease_freq, ~胃かいよう + 十二指腸かいよう + 慢性胃炎 + 大腸ポリープ + Ｂ型肝炎 + Ｃ型肝炎 + 肝硬変 + 脂肪肝 + 結核 + 気管支喘息 + 慢性気管支炎 + 糖尿病 + 
           高脂血症 + 高血圧 + 狭心症_心筋梗塞 + 脳卒中+ がん1 |ag_group*sex, render.continuous = my.render.cont, topclass = "Rtable1-zebra", overall=F)

	Below 55		Above 55
	Male (n=17217)	Female (n=25168)	Male (n=23665)	Female (n=26562)
胃かいよう	14 %	8.6 %	22 %	11 %
Missing	3112 (18.1%)	4963 (19.7%)	5697 (24.1%)	7740 (29.1%)
十二指腸かいよう	12 %	5.6 %	16 %	6.9 %
Missing	2705 (15.7%)	4435 (17.6%)	5464 (23.1%)	7475 (28.1%)
慢性胃炎	9.8 %	11 %	16 %	14 %
Missing	3128 (18.2%)	4976 (19.8%)	5724 (24.2%)	7765 (29.2%)
大腸ポリープ	7.0 %	3.0 %	20 %	8.8 %
Missing	2709 (15.7%)	4441 (17.6%)	5464 (23.1%)	7476 (28.1%)
Ｂ型肝炎	1.4 %	0.98 %	2.0 %	1.3 %
Missing	3106 (18.0%)	4954 (19.7%)	5700 (24.1%)	7753 (29.2%)
Ｃ型肝炎	0.77 %	0.61 %	1.9 %	1.5 %
Missing	3112 (18.1%)	4955 (19.7%)	5699 (24.1%)	7744 (29.2%)
肝硬変	0.26 %	0.11 %	0.69 %	0.25 %
Missing	926 (5.4%)	1131 (4.5%)	2136 (9.0%)	2525 (9.5%)
脂肪肝	16 %	4.2 %	13 %	7.5 %
Missing	3148 (18.3%)	4976 (19.8%)	5746 (24.3%)	7788 (29.3%)
結核	1.1 %	1.0 %	3.8 %	2.7 %
Missing	2702 (15.7%)	4441 (17.6%)	5456 (23.1%)	7471 (28.1%)
気管支喘息	6.7 %	7.5 %	4.7 %	5.8 %
Missing	497 (2.9%)	589 (2.3%)	1843 (7.8%)	2173 (8.2%)
慢性気管支炎	1.8 %	2.4 %	2.2 %	3.0 %
Missing	3117 (18.1%)	4956 (19.7%)	5705 (24.1%)	7760 (29.2%)
糖尿病	4.7 %	1.4 %	14 %	5.7 %
Missing	487 (2.8%)	599 (2.4%)	1627 (6.9%)	2141 (8.1%)
高脂血症	13 %	6.7 %	19 %	25 %
Missing	487 (2.8%)	577 (2.3%)	1695 (7.2%)	1789 (6.7%)
高血圧	14 %	6.7 %	34 %	26 %
Missing	432 (2.5%)	533 (2.1%)	1302 (5.5%)	1670 (6.3%)
狭心症_心筋梗塞	1.6 %	0.57 %	6.2 %	3.4 %
Missing	909 (5.3%)	1132 (4.5%)	2035 (8.6%)	2471 (9.3%)
脳卒中	0.90 %	0.55 %	3.6 %	2.0 %
Missing	527 (3.1%)	634 (2.5%)	1851 (7.8%)	2258 (8.5%)
がん1	3.4 %	4.6 %	10 %	7.2 %

table1(data=subset(disease_freq,性別 ==2), ~ 乳腺症 + 卵巣の病気 + 卵巣切除 + 子宮内膜異型増殖症 + 子宮の病気 + 子宮切除 |ag_group,  render.continuous = my.render.cont, topclass = "Rtable1-zebra", overall=F)

	Below 55 (n=25168)	Above 55 (n=26562)
乳腺症	16 %	15 %
Missing	5027 (20.0%)	7800 (29.4%)
卵巣の病気	7.0 %	8.8 %
Missing	4995 (19.8%)	7824 (29.5%)
卵巣切除	87 %	82 %
Missing	19377 (77.0%)	17487 (65.8%)
子宮内膜異型増殖症	3.5 %	2.8 %
Missing	5115 (20.3%)	7953 (29.9%)
子宮の病気	13 %	17 %
Missing	5035 (20.0%)	7863 (29.6%)
子宮切除	81 %	55 %
Missing	19468 (77.4%)	19668 (74.0%)

Family correlation for diabetes - Male

## Male
fam_diabetes_male = cleaned_survey %>%
  filter(性別==1)%>%
  select(糖尿病,父親_糖尿病, 母親_糖尿病) %>%
  mutate(息子 = 糖尿病==2 | 糖尿病==3, 父=父親_糖尿病==3, 母=母親_糖尿病==3) %>%
  select(-c(糖尿病,父親_糖尿病, 母親_糖尿病))

phimat = round(phimat_fun(fam_diabetes_male),2) 

# Reorder the Phi correlation matrix
phimat <- reorder_phimat(phimat)
upper_tri <- get_upper_tri(phimat)
# Melt the correlation matrix
melted_phimat <- melt(upper_tri, na.rm = TRUE)

## Warning in melt(upper_tri, na.rm = TRUE): The melt generic in data.table
## has been passed a matrix and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(upper_tri). In the next version, this
## warning will become an error.

# Create a ggheatmap
ggheatmap <- ggplot(melted_phimat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "seagreen", high = "salmon", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Phi\nCorrelation") +
#  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1), axis.text.y = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1))+
 coord_fixed()
# Print the heatmap
#print(ggheatmap)
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 8) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_rect(colour = "black", fill=NA, size=2),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))+
ggtitle("糖尿病の家族相関性：息子の場合")

Family correlation for diabetes - Female

## female
fam_diabetes_female = cleaned_survey %>%
  filter(性別==2)%>%
  select(糖尿病,父親_糖尿病, 母親_糖尿病) %>%
  mutate(娘 = 糖尿病==2 | 糖尿病==3, 父=父親_糖尿病==3, 母=母親_糖尿病==3) %>%
  select(-c(糖尿病,父親_糖尿病, 母親_糖尿病))

phimat = round(phimat_fun(fam_diabetes_female),2) 

# Reorder the Phi correlation matrix
phimat <- reorder_phimat(phimat)
upper_tri <- get_upper_tri(phimat)
# Melt the correlation matrix
melted_phimat <- melt(upper_tri, na.rm = TRUE)

## Warning in melt(upper_tri, na.rm = TRUE): The melt generic in data.table
## has been passed a matrix and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(upper_tri). In the next version, this
## warning will become an error.

# Create a ggheatmap
ggheatmap <- ggplot(melted_phimat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "seagreen", high = "salmon", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Phi\nCorrelation") +
#  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1), axis.text.y = element_text(angle = 0, vjust = 1, 
    size = 16, hjust = 1))+
 coord_fixed()
# Print the heatmap
#print(ggheatmap)
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 8) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_rect(colour = "black", fill=NA, size=2),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))+
ggtitle("糖尿病の家族相関性：娘の場合")

2020 Biostat Seminar Final project

Mindy Fang

2020/1/15

Load libraries

define phi matrix functions

PCA analysis of 食事内容: 好みの傾向

Basic biographic information plot

Biomarker plots

Biomarker and diabetes plots

list of diseases

Family correlation for diabetes - Male

Family correlation for diabetes - Female