Clustering.utf8.md

rm(list = ls(all.names  = TRUE))
library(fpc)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(DRrequiredAgeing)

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

## 
##  >===================================================================================<
##  "DRrequiredAgeing" is developed by International Mouse Phenotyping Consortium (IMPC) 
##  More details  https://www.mousephenotype.org/                                        
##  Contact us hamedhm@ebi.ac.uk                                                         
##  Source code and issues:  https://git.io/fj9WP                                        
##  >===================================================================================<

expList = c('time', 'data_point (for continuous data)')
df      = read.csv(file = UnzipAndfilePath('https://www.ebi.ac.uk/~hamedhm/CC/cc_all.zip'),
                   as.is = TRUE,
                   check.names = FALSE)

## 2019-11-25 15:35:58. Unziping file ...

## 2019-11-25 15:35:59. Successfully unziped. List of files:
##      1 ==> C:\Users\hamedhm\AppData\Local\Temp\RtmpiWRzSR/GSLS/cc_all.csv

df$time = as.integer(as.Date(df$DOT_Reformatted, tryFormats = '%d/%m/%Y'))
dfbck = df

df = df[!is.na(df$time), ]
numVars = sapply(df, function(x) {
  is.numeric(x) &&
    sum(!is.na(x)) > 40
})
numVars = numVars & !(names(df) %in% expList)
############
a = aggregate(
  x = df[, numVars],
  by = list(Strain = df$`strain_original`),
  FUN = function(x) {
    sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
  }
)
a2 = a[, sapply(a, function(x) {
  sum(is.na(x))
}) == 0]

rownames(a2) = interaction(a2[, 1])
heatmap.2(
  (t(as.matrix(a2[, -1]))),
  scale = 'row',
  col = bluered(100),
  trace = "none",
  density.info = "none",
  Rowv = FALSE,
  labRow = FALSE,
  #NULL
  cexRow = .8,
  cexCol = 1,
  margins = c(6.5, 3),
  dendrogram = 'col',
  keysize = 1,
  key = FALSE,
  ylab = 'Parameters'
)

plot(hclust(dist(as.matrix(a2[, -1]))),
     xlab = 'Strains',
     sub = '')

# ############
# for (col in names(df)[numVars]) {
#   message('~> variable: ', col)
#   df2 = df[, c(col, 'time')]
#   df2 = df2[complete.cases(df2), ]
#   df2$time = jitter(df2$time)
#
#   sp = smooth.spline(x = df2[, 2], y = df2[, 1])
#   p = predict(sp, df[, 'time'])
#   df[is.na(df[, col]), col] = p$y[is.na(df[, col])]
#
#
#   # sp = lm(reformulate(termlabels = 'time', response = col), data = df2)
#   # p = predict(sp, newdata = df[, 'time', drop = FALSE])
#   # df[is.na(df[, col]), col] = p[is.na(df[, col])]
#
# }
#
#
# my_data = df[, numVars]
# my_data = my_data[,sapply(my_data,function(x){sd(x)>0})]
#
# heatmap(cor(my_data))
#
# cl = kmeans(cov(my_data), centers = 50)
# hc = hclust(d = as.dist(cor(my_data)))
# plot(hc,cex=.7,hang = 10)
#

Clustering.R

hamedhm

2019-11-25