SHARE data: imputation, grade-of-membership clustering, and flash factor analysis

library("data.table")
#dat = data.frame(data.table::fread("cat SHARE.csv | sed -e 's/ *//g; s/\"\"//g'", na.strings=c("",NA)))###this allows us to preserve variable types

#nums <- unlist(lapply(dat, is.numeric))  
#quant=dat[,nums]

#imputes = mice(quant, m=1, maxit=5, method='cart', seed=500)


library(mice)

## Loading required package: lattice

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

# Load the cached imputation object and materialise the filled-in data once
# (the original called complete() twice on the same object).
imputes <- readRDS("~/Dropbox/impute.rds")
imputed <- complete(imputes)
namat <- is.na(imputed)

# Keep only the columns that have no NA values left after imputation.
# drop = FALSE keeps a data frame even if a single column survives; without it
# the result collapses to a vector and the colSums() call below fails.
# NOTE(review): this variable shares its name with mice::complete(); the name
# is kept so that any later code relying on it keeps working.
complete <- imputed[, colSums(namat) == 0, drop = FALSE]

# Discard columns whose sum is exactly zero.
cc <- complete[, colSums(complete) != 0, drop = FALSE]

# Drop the first four columns and convert the rest to a matrix.
# NOTE(review): presumably the first four columns are identifiers/metadata
# rather than features -- confirm against the data dictionary.
dd <- as.matrix(cc[, -(1:4), drop = FALSE])

# Clamp negative entries to zero before the matrix is passed on as count data.
dd[dd < 0] <- 0
library("CountClust")

## Loading required package: ggplot2

#f=FitGoM(dd,K = 5)



#saveRDS(f,"~/Dropbox/fk5.rds")

# Load the cached grade-of-membership fit (saved by the commented-out
# FitGoM(dd, K = 5) / saveRDS() calls above) rather than refitting.
f <- readRDS("~/Dropbox/fk5.rds")

# Membership matrix stored in the fit.
omega <- f$fit$omega

# Colours handed to the plot palette (five entries, matching K = 5).
cols <- c("blue", "darkgoldenrod1", "cyan", "red", "green")

# Styling for the separator lines and the axis ticks of the structure plot.
split_line_style <- list(
  split_lwd = 0.4,
  split_col = "white"
)
axis_tick_style <- list(
  axis_ticks_length = 0.1,
  axis_ticks_lwd_y = 0.1,
  axis_ticks_lwd_x = 0.1,
  axis_label_size = 7,
  axis_label_face = "bold"
)

# Structure plot of the memberships, with samples reordered by the plot
# function (order_sample = TRUE) and no y-axis label.
StructureGGplot(
  omega = omega,
  palette = cols,
  yaxis_label = "",
  order_sample = TRUE,
  split_line = split_line_style,
  axis_tick = axis_tick_style
)

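This makes us think that the clusters are driven by different variables, so we next extract the top features that distinguish each cluster.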

# Feature-by-cluster parameter matrix stored in the GoM fit.
theta_mat <- f$fit$theta

# Ask for up to 100 distinguishing features per cluster.
top_features <- ExtractTopFeatures(theta_mat, top_features = 100,
                                   method = "poisson", options = "min")

# Translate the feature indices into column names of dd.
# The result is a character matrix with one ROW per row of
# top_features$indices (i.e. per cluster) and one COLUMN per rank.
# Entries are NA wherever the corresponding index is NA.
# seq_len(nrow(.)) replaces 1:dim(.)[1], which would yield c(1, 0) and fail
# if the index matrix had no rows.
variable_list <- do.call(
  rbind,
  lapply(
    seq_len(nrow(top_features$indices)),
    function(k) colnames(dd)[top_features$indices[k, ]]
  )
)

library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("kableExtra")

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library("flashr")
# Tabulate the top-ranked variables of each cluster.
#
# variable_list has one row per cluster and one column per rank, so the
# features of cluster i are variable_list[i, ]. The previous code indexed
# variable_list[, i] instead, which put the i-th ranked feature of *every*
# cluster into the row labelled "Cluster i".
n_clusters <- nrow(variable_list)
n_show <- 5 # number of top variables listed per cluster

tmp <- do.call(rbind, lapply(seq_len(n_clusters), function(i) {
  cluster_vars <- variable_list[i, ]
  # Drop the NA padding of clusters that have fewer top features.
  cluster_vars <- cluster_vars[!is.na(cluster_vars)]
  toString(head(cluster_vars, n_show))
}))
rownames(tmp) <- paste("Cluster", seq_len(n_clusters))

# Render the one-column table as styled HTML.
tmp %>%
  kable("html") %>%
  kable_styling()

Cluster 1	Echo_Z_ParentID, Demographics_Death_Year, Genetics_Nucleotide_Locus, LabValues_NT_proBNP, Genetics_Enrichment
Cluster 2	NA, Demographics_Diagnosis_Year, NA, LabValues_Creatinine, Echo_Z_Septal_E_prime
Cluster 3	NA, Demographics_Birth_Year, NA, MedicalHx_LVEF35, Echo_Z_Lateral_E_prime
Cluster 4	NA, CMRI_LV_Mass, NA, NA, NA
Cluster 5	NA, CMRI_LVEDV, NA, NA, NA

# Recipe kept for reference: fit flash on the centred and scaled data and
# cache the result (presumably how the file loaded below was produced).
# flash <- flashr::flash(as.matrix(scale(cc, center = TRUE, scale = TRUE)),
#                        Kmax = 10, var_type = "by_row", verbose = FALSE)
# saveRDS(flash, file = "~/Dropbox/flash10.rds")

# Load the cached flash fit.
# NOTE: this rebinds `f`, which held the grade-of-membership fit earlier in
# the script.
f <- readRDS("~/Dropbox/flash10.rds")

# Plot the first four factors of the fit.
plot(
  f,
  plot_factors = TRUE,
  factor_kset = 1:4
)