Grouping Data in R

Principal Components Analysis and Singular Value Decomposition:

Are useful for both data exploration and in the final modeling phase. - data set with no apparent pattern

set.seed(12345)
par(mar = rep(0.2, 4))
dataMatrix <- matrix(rnorm(400), nrow = 40)
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

par(mar = rep(0.2, 4))
heatmap(dataMatrix)

- data set with a pattern

set.seed(678910)
for (i in 1:40) {
 # flip a coin
 coinFlip <- rbinom(1, size = 1, prob = 0.5)
 # if coin is heads add a common pattern to that row
 if (coinFlip) {
 dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 3), each = 5)
 }
}

par(mar = rep(0.2, 4))
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

par(mar = rep(0.2, 4))
heatmap(dataMatrix)

hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rowMeans(dataMatrixOrdered), 40:1, , xlab = "Row Mean", ylab = "Row", pch = 19)
plot(colMeans(dataMatrixOrdered), xlab = "Column", ylab = "Column Mean", pch = 19)

Components of the SVD - Components of the SVD - u and v

Variance:

svd1 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd1$u[, 1], 40:1, , xlab = "Row", ylab = "First left singular vector",
 pch = 19)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)

- variance explained

par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

### Relationship of SVD to PCA:

svd1 <- svd(scale(dataMatrixOrdered))
pca1 <- prcomp(dataMatrixOrdered, scale = TRUE)
plot(pca1$rotation[, 1], svd1$v[, 1], pch = 19, xlab = "Principal Component 1",
 ylab = "Right Singular Vector 1")
abline(c(0, 1))

### Example with 0 1 matrix:

constantMatrix <- dataMatrixOrdered*0
for(i in 1:dim(dataMatrixOrdered)[1]){constantMatrix[i,] <- rep(c(0,1),each=5)}
svd1 <- svd(constantMatrix)
par(mfrow=c(1,3))
image(t(constantMatrix)[,nrow(constantMatrix):1])
plot(svd1$d,xlab="Column",ylab="Singular value",pch=19)
plot(svd1$d^2/sum(svd1$d^2),xlab="Column",ylab="Prop. of variance explained",pch=19)

## SVD, True Patterns:

set.seed(678910)
for (i in 1:40) {
 # flip a coin
 coinFlip1 <- rbinom(1, size = 1, prob = 0.5)
 coinFlip2 <- rbinom(1, size = 1, prob = 0.5)
 # if coin is heads add a common pattern to that row
 if (coinFlip1) {
 dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), each = 5)
 }
 if (coinFlip2) {
 dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), 5)
 }
}
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]

svd2 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rep(c(0, 1), each = 5), pch = 19, xlab = "Column", ylab = "Pattern 1")
plot(rep(c(0, 1), 5), pch = 19, xlab = "Column", ylab = "Pattern 2")

- v and patterns of variance in rows and patterns of variance in rows:

svd2 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd2$v[, 1], pch = 19, xlab = "Column", ylab = "First right singular vector")
plot(svd2$v[, 2], pch = 19, xlab = "Column", ylab = "Second right singular vector")

- d and variance explained

svd1 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained",
 pch = 19)

### Dealing with missing values:

dataMatrix2 <- dataMatrixOrdered
## Randomly insert some missing data
dataMatrix2[sample(1:100, size = 40, replace = FALSE)] <- NA
#svd1 <- svd(scale(dataMatrix2)) ## Doesn't work!
#Error in svd(scale(dataMatrix2)) : infinite or missing values in 'x'
summary(dataMatrix2)

##        V1                V2               V3                V4         
##  Min.   :-1.6621   Min.   :-2.347   Min.   :-1.3248   Min.   :-1.6193  
##  1st Qu.:-0.6037   1st Qu.: 0.311   1st Qu.:-0.6472   1st Qu.:-0.2743  
##  Median : 0.3950   Median : 1.593   Median :-0.3044   Median : 1.4262  
##  Mean   : 0.2437   Mean   : 2.152   Mean   : 0.1745   Mean   : 2.0516  
##  3rd Qu.: 0.7621   3rd Qu.: 4.140   3rd Qu.: 0.8293   3rd Qu.: 4.4068  
##  Max.   : 2.1968   Max.   : 6.129   Max.   : 2.4771   Max.   : 7.6558  
##  NA's   :14        NA's   :17       NA's   :9                          
##        V5                V6               V7               V8         
##  Min.   :-1.8157   Min.   :-2.290   Min.   :-2.113   Min.   :-0.6753  
##  1st Qu.:-0.4444   1st Qu.: 4.080   1st Qu.: 2.358   1st Qu.: 3.1898  
##  Median : 0.1382   Median : 8.073   Median : 5.558   Median : 8.0382  
##  Mean   : 0.1611   Mean   : 7.069   Mean   : 4.872   Mean   : 7.1558  
##  3rd Qu.: 0.7897   3rd Qu.: 9.906   3rd Qu.: 7.552   3rd Qu.:10.2048  
##  Max.   : 2.4836   Max.   :13.672   Max.   : 9.483   Max.   :14.3203  
##                                                                       
##        V9              V10        
##  Min.   :-2.582   Min.   :-1.195  
##  1st Qu.: 3.657   1st Qu.: 3.744  
##  Median : 5.421   Median : 8.263  
##  Mean   : 5.077   Mean   : 7.248  
##  3rd Qu.: 8.022   3rd Qu.:10.272  
##  Max.   : 9.550   Max.   :13.673  
##

library(mice)

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

md.pattern(dataMatrix2)

##    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## 11    1    1    1    1    1    1    1    1    1     1     0
## 8     1    1    1    1    1    1    1    1    1     0     1
## 9     1    1    1    1    1    1    1    1    0     1     1
## 3     1    1    1    1    1    1    1    1    0     0     2
## 2     1    1    1    1    1    1    1    0    1     1     1
## 5     1    1    1    1    1    1    1    0    1     0     2
## 1     1    1    1    1    1    1    1    0    0     1     2
## 1     1    1    1    1    1    1    1    0    0     0     3
##       0    0    0    0    0    0    0    9   14    17    40

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

mice_plot <- aggr(dataMatrix2, col=c('navyblue','yellow'),
                    numbers=TRUE, sortVars=TRUE,
                    labels=names(dataMatrix2), cex.axis=.7,
                    gap=3, ylab=c("Missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##  Variable Count
##        V2 0.425
##        V1 0.350
##        V3 0.225
##        V4 0.000
##        V5 0.000
##        V6 0.000
##        V7 0.000
##        V8 0.000
##        V9 0.000
##       V10 0.000

imputed_Data <- mice(dataMatrix2, m=5, maxit = 50, method = 'pmm', seed = 500)

## 
##  iter imp variable
##   1   1  V1  V2  V3
##   1   2  V1  V2  V3
##   1   3  V1  V2  V3
##   1   4  V1  V2  V3
##   1   5  V1  V2  V3
##   2   1  V1  V2  V3
##   2   2  V1  V2  V3
##   2   3  V1  V2  V3
##   2   4  V1  V2  V3
##   2   5  V1  V2  V3
##   3   1  V1  V2  V3
##   3   2  V1  V2  V3
##   3   3  V1  V2  V3
##   3   4  V1  V2  V3
##   3   5  V1  V2  V3
##   4   1  V1  V2  V3
##   4   2  V1  V2  V3
##   4   3  V1  V2  V3
##   4   4  V1  V2  V3
##   4   5  V1  V2  V3
##   5   1  V1  V2  V3
##   5   2  V1  V2  V3
##   5   3  V1  V2  V3
##   5   4  V1  V2  V3
##   5   5  V1  V2  V3
##   6   1  V1  V2  V3
##   6   2  V1  V2  V3
##   6   3  V1  V2  V3
##   6   4  V1  V2  V3
##   6   5  V1  V2  V3
##   7   1  V1  V2  V3
##   7   2  V1  V2  V3
##   7   3  V1  V2  V3
##   7   4  V1  V2  V3
##   7   5  V1  V2  V3
##   8   1  V1  V2  V3
##   8   2  V1  V2  V3
##   8   3  V1  V2  V3
##   8   4  V1  V2  V3
##   8   5  V1  V2  V3
##   9   1  V1  V2  V3
##   9   2  V1  V2  V3
##   9   3  V1  V2  V3
##   9   4  V1  V2  V3
##   9   5  V1  V2  V3
##   10   1  V1  V2  V3
##   10   2  V1  V2  V3
##   10   3  V1  V2  V3
##   10   4  V1  V2  V3
##   10   5  V1  V2  V3
##   11   1  V1  V2  V3
##   11   2  V1  V2  V3
##   11   3  V1  V2  V3
##   11   4  V1  V2  V3
##   11   5  V1  V2  V3
##   12   1  V1  V2  V3
##   12   2  V1  V2  V3
##   12   3  V1  V2  V3
##   12   4  V1  V2  V3
##   12   5  V1  V2  V3
##   13   1  V1  V2  V3
##   13   2  V1  V2  V3
##   13   3  V1  V2  V3
##   13   4  V1  V2  V3
##   13   5  V1  V2  V3
##   14   1  V1  V2  V3
##   14   2  V1  V2  V3
##   14   3  V1  V2  V3
##   14   4  V1  V2  V3
##   14   5  V1  V2  V3
##   15   1  V1  V2  V3
##   15   2  V1  V2  V3
##   15   3  V1  V2  V3
##   15   4  V1  V2  V3
##   15   5  V1  V2  V3
##   16   1  V1  V2  V3
##   16   2  V1  V2  V3
##   16   3  V1  V2  V3
##   16   4  V1  V2  V3
##   16   5  V1  V2  V3
##   17   1  V1  V2  V3
##   17   2  V1  V2  V3
##   17   3  V1  V2  V3
##   17   4  V1  V2  V3
##   17   5  V1  V2  V3
##   18   1  V1  V2  V3
##   18   2  V1  V2  V3
##   18   3  V1  V2  V3
##   18   4  V1  V2  V3
##   18   5  V1  V2  V3
##   19   1  V1  V2  V3
##   19   2  V1  V2  V3
##   19   3  V1  V2  V3
##   19   4  V1  V2  V3
##   19   5  V1  V2  V3
##   20   1  V1  V2  V3
##   20   2  V1  V2  V3
##   20   3  V1  V2  V3
##   20   4  V1  V2  V3
##   20   5  V1  V2  V3
##   21   1  V1  V2  V3
##   21   2  V1  V2  V3
##   21   3  V1  V2  V3
##   21   4  V1  V2  V3
##   21   5  V1  V2  V3
##   22   1  V1  V2  V3
##   22   2  V1  V2  V3
##   22   3  V1  V2  V3
##   22   4  V1  V2  V3
##   22   5  V1  V2  V3
##   23   1  V1  V2  V3
##   23   2  V1  V2  V3
##   23   3  V1  V2  V3
##   23   4  V1  V2  V3
##   23   5  V1  V2  V3
##   24   1  V1  V2  V3
##   24   2  V1  V2  V3
##   24   3  V1  V2  V3
##   24   4  V1  V2  V3
##   24   5  V1  V2  V3
##   25   1  V1  V2  V3
##   25   2  V1  V2  V3
##   25   3  V1  V2  V3
##   25   4  V1  V2  V3
##   25   5  V1  V2  V3
##   26   1  V1  V2  V3
##   26   2  V1  V2  V3
##   26   3  V1  V2  V3
##   26   4  V1  V2  V3
##   26   5  V1  V2  V3
##   27   1  V1  V2  V3
##   27   2  V1  V2  V3
##   27   3  V1  V2  V3
##   27   4  V1  V2  V3
##   27   5  V1  V2  V3
##   28   1  V1  V2  V3
##   28   2  V1  V2  V3
##   28   3  V1  V2  V3
##   28   4  V1  V2  V3
##   28   5  V1  V2  V3
##   29   1  V1  V2  V3
##   29   2  V1  V2  V3
##   29   3  V1  V2  V3
##   29   4  V1  V2  V3
##   29   5  V1  V2  V3
##   30   1  V1  V2  V3
##   30   2  V1  V2  V3
##   30   3  V1  V2  V3
##   30   4  V1  V2  V3
##   30   5  V1  V2  V3
##   31   1  V1  V2  V3
##   31   2  V1  V2  V3
##   31   3  V1  V2  V3
##   31   4  V1  V2  V3
##   31   5  V1  V2  V3
##   32   1  V1  V2  V3
##   32   2  V1  V2  V3
##   32   3  V1  V2  V3
##   32   4  V1  V2  V3
##   32   5  V1  V2  V3
##   33   1  V1  V2  V3
##   33   2  V1  V2  V3
##   33   3  V1  V2  V3
##   33   4  V1  V2  V3
##   33   5  V1  V2  V3
##   34   1  V1  V2  V3
##   34   2  V1  V2  V3
##   34   3  V1  V2  V3
##   34   4  V1  V2  V3
##   34   5  V1  V2  V3
##   35   1  V1  V2  V3
##   35   2  V1  V2  V3
##   35   3  V1  V2  V3
##   35   4  V1  V2  V3
##   35   5  V1  V2  V3
##   36   1  V1  V2  V3
##   36   2  V1  V2  V3
##   36   3  V1  V2  V3
##   36   4  V1  V2  V3
##   36   5  V1  V2  V3
##   37   1  V1  V2  V3
##   37   2  V1  V2  V3
##   37   3  V1  V2  V3
##   37   4  V1  V2  V3
##   37   5  V1  V2  V3
##   38   1  V1  V2  V3
##   38   2  V1  V2  V3
##   38   3  V1  V2  V3
##   38   4  V1  V2  V3
##   38   5  V1  V2  V3
##   39   1  V1  V2  V3
##   39   2  V1  V2  V3
##   39   3  V1  V2  V3
##   39   4  V1  V2  V3
##   39   5  V1  V2  V3
##   40   1  V1  V2  V3
##   40   2  V1  V2  V3
##   40   3  V1  V2  V3
##   40   4  V1  V2  V3
##   40   5  V1  V2  V3
##   41   1  V1  V2  V3
##   41   2  V1  V2  V3
##   41   3  V1  V2  V3
##   41   4  V1  V2  V3
##   41   5  V1  V2  V3
##   42   1  V1  V2  V3
##   42   2  V1  V2  V3
##   42   3  V1  V2  V3
##   42   4  V1  V2  V3
##   42   5  V1  V2  V3
##   43   1  V1  V2  V3
##   43   2  V1  V2  V3
##   43   3  V1  V2  V3
##   43   4  V1  V2  V3
##   43   5  V1  V2  V3
##   44   1  V1  V2  V3
##   44   2  V1  V2  V3
##   44   3  V1  V2  V3
##   44   4  V1  V2  V3
##   44   5  V1  V2  V3
##   45   1  V1  V2  V3
##   45   2  V1  V2  V3
##   45   3  V1  V2  V3
##   45   4  V1  V2  V3
##   45   5  V1  V2  V3
##   46   1  V1  V2  V3
##   46   2  V1  V2  V3
##   46   3  V1  V2  V3
##   46   4  V1  V2  V3
##   46   5  V1  V2  V3
##   47   1  V1  V2  V3
##   47   2  V1  V2  V3
##   47   3  V1  V2  V3
##   47   4  V1  V2  V3
##   47   5  V1  V2  V3
##   48   1  V1  V2  V3
##   48   2  V1  V2  V3
##   48   3  V1  V2  V3
##   48   4  V1  V2  V3
##   48   5  V1  V2  V3
##   49   1  V1  V2  V3
##   49   2  V1  V2  V3
##   49   3  V1  V2  V3
##   49   4  V1  V2  V3
##   49   5  V1  V2  V3
##   50   1  V1  V2  V3
##   50   2  V1  V2  V3
##   50   3  V1  V2  V3
##   50   4  V1  V2  V3
##   50   5  V1  V2  V3

summary(imputed_Data)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10 
## "pmm" "pmm" "pmm"    ""    ""    ""    ""    ""    ""    "" 
## PredictorMatrix:
##    V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## V1  0  1  1  1  1  1  1  1  1   1
## V2  1  0  1  1  1  1  1  1  1   1
## V3  1  1  0  1  1  1  1  1  1   1
## V4  1  1  1  0  1  1  1  1  1   1
## V5  1  1  1  1  0  1  1  1  1   1
## V6  1  1  1  1  1  0  1  1  1   1

mice(data = dataMatrix2, m = 5, method = "pmm", maxit = 50, seed = 500)

## 
##  iter imp variable
##   1   1  V1  V2  V3
##   1   2  V1  V2  V3
##   1   3  V1  V2  V3
##   1   4  V1  V2  V3
##   1   5  V1  V2  V3
##   2   1  V1  V2  V3
##   2   2  V1  V2  V3
##   2   3  V1  V2  V3
##   2   4  V1  V2  V3
##   2   5  V1  V2  V3
##   3   1  V1  V2  V3
##   3   2  V1  V2  V3
##   3   3  V1  V2  V3
##   3   4  V1  V2  V3
##   3   5  V1  V2  V3
##   4   1  V1  V2  V3
##   4   2  V1  V2  V3
##   4   3  V1  V2  V3
##   4   4  V1  V2  V3
##   4   5  V1  V2  V3
##   5   1  V1  V2  V3
##   5   2  V1  V2  V3
##   5   3  V1  V2  V3
##   5   4  V1  V2  V3
##   5   5  V1  V2  V3
##   6   1  V1  V2  V3
##   6   2  V1  V2  V3
##   6   3  V1  V2  V3
##   6   4  V1  V2  V3
##   6   5  V1  V2  V3
##   7   1  V1  V2  V3
##   7   2  V1  V2  V3
##   7   3  V1  V2  V3
##   7   4  V1  V2  V3
##   7   5  V1  V2  V3
##   8   1  V1  V2  V3
##   8   2  V1  V2  V3
##   8   3  V1  V2  V3
##   8   4  V1  V2  V3
##   8   5  V1  V2  V3
##   9   1  V1  V2  V3
##   9   2  V1  V2  V3
##   9   3  V1  V2  V3
##   9   4  V1  V2  V3
##   9   5  V1  V2  V3
##   10   1  V1  V2  V3
##   10   2  V1  V2  V3
##   10   3  V1  V2  V3
##   10   4  V1  V2  V3
##   10   5  V1  V2  V3
##   11   1  V1  V2  V3
##   11   2  V1  V2  V3
##   11   3  V1  V2  V3
##   11   4  V1  V2  V3
##   11   5  V1  V2  V3
##   12   1  V1  V2  V3
##   12   2  V1  V2  V3
##   12   3  V1  V2  V3
##   12   4  V1  V2  V3
##   12   5  V1  V2  V3
##   13   1  V1  V2  V3
##   13   2  V1  V2  V3
##   13   3  V1  V2  V3
##   13   4  V1  V2  V3
##   13   5  V1  V2  V3
##   14   1  V1  V2  V3
##   14   2  V1  V2  V3
##   14   3  V1  V2  V3
##   14   4  V1  V2  V3
##   14   5  V1  V2  V3
##   15   1  V1  V2  V3
##   15   2  V1  V2  V3
##   15   3  V1  V2  V3
##   15   4  V1  V2  V3
##   15   5  V1  V2  V3
##   16   1  V1  V2  V3
##   16   2  V1  V2  V3
##   16   3  V1  V2  V3
##   16   4  V1  V2  V3
##   16   5  V1  V2  V3
##   17   1  V1  V2  V3
##   17   2  V1  V2  V3
##   17   3  V1  V2  V3
##   17   4  V1  V2  V3
##   17   5  V1  V2  V3
##   18   1  V1  V2  V3
##   18   2  V1  V2  V3
##   18   3  V1  V2  V3
##   18   4  V1  V2  V3
##   18   5  V1  V2  V3
##   19   1  V1  V2  V3
##   19   2  V1  V2  V3
##   19   3  V1  V2  V3
##   19   4  V1  V2  V3
##   19   5  V1  V2  V3
##   20   1  V1  V2  V3
##   20   2  V1  V2  V3
##   20   3  V1  V2  V3
##   20   4  V1  V2  V3
##   20   5  V1  V2  V3
##   21   1  V1  V2  V3
##   21   2  V1  V2  V3
##   21   3  V1  V2  V3
##   21   4  V1  V2  V3
##   21   5  V1  V2  V3
##   22   1  V1  V2  V3
##   22   2  V1  V2  V3
##   22   3  V1  V2  V3
##   22   4  V1  V2  V3
##   22   5  V1  V2  V3
##   23   1  V1  V2  V3
##   23   2  V1  V2  V3
##   23   3  V1  V2  V3
##   23   4  V1  V2  V3
##   23   5  V1  V2  V3
##   24   1  V1  V2  V3
##   24   2  V1  V2  V3
##   24   3  V1  V2  V3
##   24   4  V1  V2  V3
##   24   5  V1  V2  V3
##   25   1  V1  V2  V3
##   25   2  V1  V2  V3
##   25   3  V1  V2  V3
##   25   4  V1  V2  V3
##   25   5  V1  V2  V3
##   26   1  V1  V2  V3
##   26   2  V1  V2  V3
##   26   3  V1  V2  V3
##   26   4  V1  V2  V3
##   26   5  V1  V2  V3
##   27   1  V1  V2  V3
##   27   2  V1  V2  V3
##   27   3  V1  V2  V3
##   27   4  V1  V2  V3
##   27   5  V1  V2  V3
##   28   1  V1  V2  V3
##   28   2  V1  V2  V3
##   28   3  V1  V2  V3
##   28   4  V1  V2  V3
##   28   5  V1  V2  V3
##   29   1  V1  V2  V3
##   29   2  V1  V2  V3
##   29   3  V1  V2  V3
##   29   4  V1  V2  V3
##   29   5  V1  V2  V3
##   30   1  V1  V2  V3
##   30   2  V1  V2  V3
##   30   3  V1  V2  V3
##   30   4  V1  V2  V3
##   30   5  V1  V2  V3
##   31   1  V1  V2  V3
##   31   2  V1  V2  V3
##   31   3  V1  V2  V3
##   31   4  V1  V2  V3
##   31   5  V1  V2  V3
##   32   1  V1  V2  V3
##   32   2  V1  V2  V3
##   32   3  V1  V2  V3
##   32   4  V1  V2  V3
##   32   5  V1  V2  V3
##   33   1  V1  V2  V3
##   33   2  V1  V2  V3
##   33   3  V1  V2  V3
##   33   4  V1  V2  V3
##   33   5  V1  V2  V3
##   34   1  V1  V2  V3
##   34   2  V1  V2  V3
##   34   3  V1  V2  V3
##   34   4  V1  V2  V3
##   34   5  V1  V2  V3
##   35   1  V1  V2  V3
##   35   2  V1  V2  V3
##   35   3  V1  V2  V3
##   35   4  V1  V2  V3
##   35   5  V1  V2  V3
##   36   1  V1  V2  V3
##   36   2  V1  V2  V3
##   36   3  V1  V2  V3
##   36   4  V1  V2  V3
##   36   5  V1  V2  V3
##   37   1  V1  V2  V3
##   37   2  V1  V2  V3
##   37   3  V1  V2  V3
##   37   4  V1  V2  V3
##   37   5  V1  V2  V3
##   38   1  V1  V2  V3
##   38   2  V1  V2  V3
##   38   3  V1  V2  V3
##   38   4  V1  V2  V3
##   38   5  V1  V2  V3
##   39   1  V1  V2  V3
##   39   2  V1  V2  V3
##   39   3  V1  V2  V3
##   39   4  V1  V2  V3
##   39   5  V1  V2  V3
##   40   1  V1  V2  V3
##   40   2  V1  V2  V3
##   40   3  V1  V2  V3
##   40   4  V1  V2  V3
##   40   5  V1  V2  V3
##   41   1  V1  V2  V3
##   41   2  V1  V2  V3
##   41   3  V1  V2  V3
##   41   4  V1  V2  V3
##   41   5  V1  V2  V3
##   42   1  V1  V2  V3
##   42   2  V1  V2  V3
##   42   3  V1  V2  V3
##   42   4  V1  V2  V3
##   42   5  V1  V2  V3
##   43   1  V1  V2  V3
##   43   2  V1  V2  V3
##   43   3  V1  V2  V3
##   43   4  V1  V2  V3
##   43   5  V1  V2  V3
##   44   1  V1  V2  V3
##   44   2  V1  V2  V3
##   44   3  V1  V2  V3
##   44   4  V1  V2  V3
##   44   5  V1  V2  V3
##   45   1  V1  V2  V3
##   45   2  V1  V2  V3
##   45   3  V1  V2  V3
##   45   4  V1  V2  V3
##   45   5  V1  V2  V3
##   46   1  V1  V2  V3
##   46   2  V1  V2  V3
##   46   3  V1  V2  V3
##   46   4  V1  V2  V3
##   46   5  V1  V2  V3
##   47   1  V1  V2  V3
##   47   2  V1  V2  V3
##   47   3  V1  V2  V3
##   47   4  V1  V2  V3
##   47   5  V1  V2  V3
##   48   1  V1  V2  V3
##   48   2  V1  V2  V3
##   48   3  V1  V2  V3
##   48   4  V1  V2  V3
##   48   5  V1  V2  V3
##   49   1  V1  V2  V3
##   49   2  V1  V2  V3
##   49   3  V1  V2  V3
##   49   4  V1  V2  V3
##   49   5  V1  V2  V3
##   50   1  V1  V2  V3
##   50   2  V1  V2  V3
##   50   3  V1  V2  V3
##   50   4  V1  V2  V3
##   50   5  V1  V2  V3

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10 
## "pmm" "pmm" "pmm"    ""    ""    ""    ""    ""    ""    "" 
## PredictorMatrix:
##    V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## V1  0  1  1  1  1  1  1  1  1   1
## V2  1  0  1  1  1  1  1  1  1   1
## V3  1  1  0  1  1  1  1  1  1   1
## V4  1  1  1  0  1  1  1  1  1   1
## V5  1  1  1  1  0  1  1  1  1   1
## V6  1  1  1  1  1  0  1  1  1   1

summary(dataMatrix2)

##        V1                V2               V3                V4         
##  Min.   :-1.6621   Min.   :-2.347   Min.   :-1.3248   Min.   :-1.6193  
##  1st Qu.:-0.6037   1st Qu.: 0.311   1st Qu.:-0.6472   1st Qu.:-0.2743  
##  Median : 0.3950   Median : 1.593   Median :-0.3044   Median : 1.4262  
##  Mean   : 0.2437   Mean   : 2.152   Mean   : 0.1745   Mean   : 2.0516  
##  3rd Qu.: 0.7621   3rd Qu.: 4.140   3rd Qu.: 0.8293   3rd Qu.: 4.4068  
##  Max.   : 2.1968   Max.   : 6.129   Max.   : 2.4771   Max.   : 7.6558  
##  NA's   :14        NA's   :17       NA's   :9                          
##        V5                V6               V7               V8         
##  Min.   :-1.8157   Min.   :-2.290   Min.   :-2.113   Min.   :-0.6753  
##  1st Qu.:-0.4444   1st Qu.: 4.080   1st Qu.: 2.358   1st Qu.: 3.1898  
##  Median : 0.1382   Median : 8.073   Median : 5.558   Median : 8.0382  
##  Mean   : 0.1611   Mean   : 7.069   Mean   : 4.872   Mean   : 7.1558  
##  3rd Qu.: 0.7897   3rd Qu.: 9.906   3rd Qu.: 7.552   3rd Qu.:10.2048  
##  Max.   : 2.4836   Max.   :13.672   Max.   : 9.483   Max.   :14.3203  
##                                                                       
##        V9              V10        
##  Min.   :-2.582   Min.   :-1.195  
##  1st Qu.: 3.657   1st Qu.: 3.744  
##  Median : 5.421   Median : 8.263  
##  Mean   : 5.077   Mean   : 7.248  
##  3rd Qu.: 8.022   3rd Qu.:10.272  
##  Max.   : 9.550   Max.   :13.673  
##

summary(imputed_Data)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10 
## "pmm" "pmm" "pmm"    ""    ""    ""    ""    ""    ""    "" 
## PredictorMatrix:
##    V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## V1  0  1  1  1  1  1  1  1  1   1
## V2  1  0  1  1  1  1  1  1  1   1
## V3  1  1  0  1  1  1  1  1  1   1
## V4  1  1  1  0  1  1  1  1  1   1
## V5  1  1  1  1  0  1  1  1  1   1
## V6  1  1  1  1  1  0  1  1  1   1

completeData <- complete(imputed_Data,1)

summary(completeData)

##        V1                V2                V3                 V4         
##  Min.   :-1.6621   Min.   :-2.3469   Min.   :-1.32476   Min.   :-1.6193  
##  1st Qu.:-0.3635   1st Qu.: 0.5674   1st Qu.:-0.59980   1st Qu.:-0.2743  
##  Median : 0.5957   Median : 1.5935   Median :-0.06902   Median : 1.4262  
##  Mean   : 0.4076   Mean   : 2.0845   Mean   : 0.25670   Mean   : 2.0516  
##  3rd Qu.: 1.6324   3rd Qu.: 3.9506   3rd Qu.: 1.26124   3rd Qu.: 4.4068  
##  Max.   : 2.1968   Max.   : 6.1285   Max.   : 2.47711   Max.   : 7.6558  
##        V5                V6               V7               V8         
##  Min.   :-1.8157   Min.   :-2.290   Min.   :-2.113   Min.   :-0.6753  
##  1st Qu.:-0.4444   1st Qu.: 4.080   1st Qu.: 2.358   1st Qu.: 3.1898  
##  Median : 0.1382   Median : 8.073   Median : 5.558   Median : 8.0382  
##  Mean   : 0.1611   Mean   : 7.069   Mean   : 4.872   Mean   : 7.1558  
##  3rd Qu.: 0.7897   3rd Qu.: 9.906   3rd Qu.: 7.552   3rd Qu.:10.2048  
##  Max.   : 2.4836   Max.   :13.672   Max.   : 9.483   Max.   :14.3203  
##        V9              V10        
##  Min.   :-2.582   Min.   :-1.195  
##  1st Qu.: 3.657   1st Qu.: 3.744  
##  Median : 5.421   Median : 8.263  
##  Mean   : 5.077   Mean   : 7.248  
##  3rd Qu.: 8.022   3rd Qu.:10.272  
##  Max.   : 9.550   Max.   :13.673

svd1 <- svd(scale(dataMatrixOrdered)); svd2 <- svd(scale(completeData))
par(mfrow=c(1,2)); plot(svd1$v[,1],pch=19); plot(svd2$v[,1],pch=19)

### Face Example:

load("C:/Users/angul/OneDrive/R/ExploreData/Data/face.rda")
image(t(faceData)[, nrow(faceData):1])

svd1 <- svd(scale(faceData))
plot(svd1$d^2/sum(svd1$d^2), pch = 19, xlab = "Singular vector", ylab = "Variance explained")

svd1 <- svd(scale(faceData))
## Note that %*% is matrix multiplication
# Here svd1$d[1] is a constant
approx1 <- svd1$u[, 1] %*% t(svd1$v[, 1]) * svd1$d[1]
# In these examples we need to make the diagonal matrix out of d
approx5 <- svd1$u[, 1:5] %*% diag(svd1$d[1:5]) %*% t(svd1$v[, 1:5])
approx10 <- svd1$u[, 1:10] %*% diag(svd1$d[1:10]) %*% t(svd1$v[, 1:10])

par(mfrow = c(2, 3))
image(t(approx1)[, nrow(approx1):1], main = "(a)")
image(t(approx5)[, nrow(approx5):1], main = "(b)")
image(t(approx10)[, nrow(approx10):1], main = "(c)")
image(t(faceData)[, nrow(faceData):1], main = "(d)") ## Original data

K-means clustering:

Uses a partitioning approach by fixing “centroids” of each cluster then by iteration observations are assigned to the closest centroid. This can also be visualized with a heat map function or by plotting the points.

set.seed(1234); par(mar=c(0,0,0,0))
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
plot(x,y,col="blue",pch=19,cex=2)
text(x+0.05,y+0.05,labels=as.character(1:12))

dataFrame <- data.frame(x,y)
kmeansObj <- kmeans(dataFrame,centers=3)
names(kmeansObj)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

kmeansObj$cluster

##  [1] 3 1 1 3 2 2 2 2 2 2 2 2

par(mar=rep(0.2,4))
plot(x,y,col=kmeansObj$cluster,pch=19,cex=2)
points(kmeansObj$centers,col=1:3,pch=3,cex=3,lwd=3)

set.seed(1234)
dataMatrix <- as.matrix(dataFrame)[sample(1:12),]
kmeansObj2 <- kmeans(dataMatrix,centers=3)
par(mfrow=c(1,2), mar = c(2, 4, 0.1, 0.1))
image(t(dataMatrix)[,nrow(dataMatrix):1],yaxt="n")
image(t(dataMatrix)[,order(kmeansObj2$cluster)],yaxt="n")

Hierarchical clustering

Hierarchical clustering adopts an iterative merge of points by nearest distance, to form clusters. Hence the as the notion of distance is the most important basis for classification.

This type of analysis is good for exploratory analysis, as the picture may be unstable and is deterministic as you choose the cut of point and the distance chosen is the most important basis for classification. The global distance is usually defined as a weighted sum of the local distances, but it’s better to experiment and inspect the results to see if it makes sense.

set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

dataFrame <- data.frame(x = x, y = y)
dist(dataFrame)

##             1          2          3          4          5          6          7
## 2  0.34120511                                                                  
## 3  0.57493739 0.24102750                                                       
## 4  0.26381786 0.52578819 0.71861759                                            
## 5  1.69424700 1.35818182 1.11952883 1.80666768                                 
## 6  1.65812902 1.31960442 1.08338841 1.78081321 0.08150268                      
## 7  1.49823399 1.16620981 0.92568723 1.60131659 0.21110433 0.21666557           
## 8  1.99149025 1.69093111 1.45648906 2.02849490 0.61704200 0.69791931 0.65062566
## 9  2.13629539 1.83167669 1.67835968 2.35675598 1.18349654 1.11500116 1.28582631
## 10 2.06419586 1.76999236 1.63109790 2.29239480 1.23847877 1.16550201 1.32063059
## 11 2.14702468 1.85183204 1.71074417 2.37461984 1.28153948 1.21077373 1.37369662
## 12 2.05664233 1.74662555 1.58658782 2.27232243 1.07700974 1.00777231 1.17740375
##             8          9         10         11
## 2                                             
## 3                                             
## 4                                             
## 5                                             
## 6                                             
## 7                                             
## 8                                             
## 9  1.76460709                                 
## 10 1.83517785 0.14090406                      
## 11 1.86999431 0.11624471 0.08317570           
## 12 1.66223814 0.10848966 0.19128645 0.20802789

dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
plot(hClustering)

library(rafalib)

dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
myplclust(hClustering, lab = rep(1:3, each = 4), lab.col = rep(1:3, each = 4))

myplclust <- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)),
 hang = 0.1, ...) {
 ## modifiction of plclust for plotting hclust objects *in colour*! Copyright
 ## Eva KF Chan 2009 Arguments: hclust: hclust object lab: a character vector
 ## of labels of the leaves of the tree lab.col: colour for the labels;
 ## NA=default device foreground colour hang: as in hclust & plclust Side
 ## effect: A display of hierarchical cluster with coloured leaf labels.
 y <- rep(hclust$height, 2)
 x <- as.numeric(hclust$merge)
 y <- y[which(x < 0)]
 x <- x[which(x < 0)]
 x <- abs(x)
 y <- y[order(x)]
 x <- x[order(x)]
 plot(hclust, labels = FALSE, hang = hang, ...)
 text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order],
 col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}

dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
myplclust(hClustering, lab = rep(1:3, each = 4), lab.col = rep(1:3, each = 4))

dataFrame <- data.frame(x = x, y = y )
set.seed(143)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
heatmap(dataMatrix)

learn more about distance:, and see myCOURSENOTES

Grouping Data in R

Linda Angulo Lopez

19/09/2020

Principal Components Analysis and Singular Value Decomposition:

Components of the SVD - Components of the SVD - u and v

K-means clustering:

Hierarchical clustering