<< Data Analysis >> Note part 1

Created on 21 June 2013
Revised on Thu Aug 15 15:25:38 2013

Study notes of << Data Analysis >> from coursera.

1 Representing data in R

1.1 Character

firstName = "felix"
class(firstName)

## [1] "character"

firstName

## [1] "felix"

1.2 Numeric

heightCM = 188.2
class(heightCM)

## [1] "numeric"

heightCM

## [1] 188.2

1.3 Integer

numberSons = 1L
class(numberSons)  ## Integer

## [1] "integer"

numberSons

## [1] 1

numberSons = 1
class(numberSons)  ## numeric

## [1] "numeric"

numberSons

## [1] 1

1.4 Logical

teachingCoursera = TRUE
class(teachingCoursera)

## [1] "logical"

teachingCoursera

## [1] TRUE

1.5 Vectors

A set of values with the same class

heights = c(188.2, 181.3, 193.4)
heights

## [1] 188.2 181.3 193.4


firstNames = c("jeff", "roger", "andrew", "brian")
firstNames

## [1] "jeff"   "roger"  "andrew" "brian"

1.6 Lists

A vector of values of possibly different classes

vector1 = c(188.2, 181.3, 193.4)
vector2 = c("jeff", "roger", "andrew", "brian")
myList = list(heights = vector1, firstNames = vector2)
myList

## $heights
## [1] 188.2 181.3 193.4
## 
## $firstNames
## [1] "jeff"   "roger"  "andrew" "brian"

1.7 Matrices

Vectors with multiple dimensions

myMatrix = matrix(c(1, 2, 3, 4), byrow = T, nrow = 2)
myMatrix

##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4

1.8 Data frames

Multiple vectors of possibly different classes, of the same length

vector1 = c(188.2, 181.3, 193.4, 192.3)
vector2 = c("jeff", "roger", "andrew", "brian")
myDataFrame = data.frame(heights = vector1, firstNames = vector2)
myDataFrame

##   heights firstNames
## 1   188.2       jeff
## 2   181.3      roger
## 3   193.4     andrew
## 4   192.3      brian

1.9 Factors

Qualitative variables that can be included in models

smoker = c("yes", "no", "yes", "yes")
smokerFactor = as.factor(smoker)
smokerFactor

## [1] yes no  yes yes
## Levels: no yes

1.10 Missing values

In R they are usually coded NA

vector1 = c(188.2, 181.3, 193.4, NA)
vector1

## [1] 188.2 181.3 193.4    NA

is.na(vector1)

## [1] FALSE FALSE FALSE  TRUE

1.11 Subsetting

vector1 = c(188.2, 181.3, 193.4, 192.3)
vector2 = c("jeff", "roger", "andrew", "brian")

vector1[1]

## [1] 188.2

vector1[c(1, 2, 4)]

## [1] 188.2 181.3 192.3


myDataFrame = data.frame(heights = vector1, firstNames = vector2)
myDataFrame[1, 1:2]

##   heights firstNames
## 1   188.2       jeff

myDataFrame$firstNames

## [1] jeff   roger  andrew brian 
## Levels: andrew brian jeff roger

1.12 Logical subsetting

myDataFrame[myDataFrame$firstNames == "jeff", ]

##   heights firstNames
## 1   188.2       jeff

myDataFrame[myDataFrame$heights < 190, ]

##   heights firstNames
## 1   188.2       jeff
## 2   181.3      roger

2 Simulation basics

Important simulation functions

Distributions

rbeta, rbinom, rcauchy, rchisq, rexp, rf, rgamma, rgeom, rhyper, rlogis, rlnorm, rnbinom, rnorm, rpois, rt, runif, rweibull

Densities

dbeta,dbinom, dcauchy, dchisq, dexp, df, dgamma, dgeom, dhyper, dlogis, dlnorm, dnbinom, dnorm, dpois, dt, dunif, dweibull

Sampling

sample(,replace=TRUE),sample(replace=FALSE)

heights = rnorm(10, mean = 188, sd = 3)
heights

##  [1] 184.4 184.9 191.9 186.4 188.1 191.6 192.1 190.0 182.1 187.7

coinFlips = rbinom(10, size = 10, prob = 0.5)
coinFlips

##  [1] 2 5 5 7 6 5 4 4 5 5

x = seq(from = -5, to = 5, length = 10)
normalDensity = dnorm(x, mean = 0, sd = 1)
round(normalDensity, 2)

##  [1] 0.00 0.00 0.01 0.10 0.34 0.34 0.10 0.01 0.00 0.00

x = seq(0, 10, by = 1)
binomialDensity = dbinom(x, size = 10, prob = 0.5)
round(binomialDensity, 2)

##  [1] 0.00 0.01 0.04 0.12 0.21 0.25 0.21 0.12 0.04 0.01 0.00

heights = rnorm(10, mean = 188, sd = 3)
heights

##  [1] 182.1 186.8 181.2 189.4 186.7 180.7 184.4 190.9 186.5 187.3

sample(heights, size = 10, replace = TRUE)

##  [1] 184.4 181.2 182.1 180.7 186.7 186.5 186.7 189.4 182.1 184.4

sample(heights, size = 10, replace = FALSE)

##  [1] 186.7 181.2 180.7 187.3 189.4 186.8 182.1 184.4 186.5 190.9


probs = c(0.4, 0.3, 0.2, 0.1, 0, 0, 0, 0, 0, 0)  ## Sample can draw according to a set of probabilities
sum(probs)

## [1] 1

sample(heights, size = 10, replace = TRUE, prob = probs)

##  [1] 181.2 186.8 182.1 182.1 182.1 182.1 182.1 182.1 182.1 181.2

set.seed(12345)
rnorm(5, mean = 0, sd = 1)

## [1]  0.5855  0.7095 -0.1093 -0.4535  0.6059


set.seed(12345)
rnorm(5, mean = 0, sd = 1)

## [1]  0.5855  0.7095 -0.1093 -0.4535  0.6059

set.seed(5)
sample(1:8, size = 4, replace = FALSE)

## [1] 2 5 6 8


probs = c(5, 5, 5, 5, 1, 1, 1, 1)/24
sample(1:8, size = 4, replace = FALSE, prob = probs)

## [1] 4 1 2 5


treat1 = sample(1:8, size = 2, replace = FALSE)
treat2 = sample(2:7, size = 2, replace = FALSE)
c(treat1, treat2)

## [1] 8 1 3 4

3 Structure of a Data Analysis

library(kernlab)
data(spam)                                      ## Obtain the data
dim(spam)

set.seed(3435)                                  ## generate a test and training set
trainIndicator = rbinom(4601,size=1,prob=0.5)
table(trainIndicator)

trainSpam = spam[trainIndicator==1,]
testSpam = spam[trainIndicator==0,]

dim(trainSpam)

names(trainSpam)

head(trainSpam)

table(trainSpam$type)                               ## Look at summaries of the data

plot(trainSpam$capitalAve ~ trainSpam$type)         ## Create exploratory plots

plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)

plot(log10(trainSpam[, 1:4] + 1))                   ## Relationships between predictors

hCluster = hclust(dist(t(trainSpam[, 1:57])))      ## Perform exploratory analyses 
plot(hCluster)

hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterUpdated)

trainSpam$numType = as.numeric(trainSpam$type) - 1   ## Statistical prediction/modeling
costFunction = function(x, y) {
  sum(x != (y > 0.5))
}

cvError = rep(NA, 55)

library(boot)

for (i in 1:55) {
  lmFormula = as.formula(paste("numType~", names(trainSpam)[i], sep = ""))
  glmFit = glm(lmFormula, family = "binomial", data = trainSpam)
  cvError[i] = cv.glm(trainSpam, glmFit, costFunction, 2)$delta[2]
}

which.min(cvError)

names(trainSpam)[which.min(cvError)]

predictionModel = glm(numType ~ charDollar, family = "binomial", data = trainSpam) 
predictionTest = predict(predictionModel, testSpam)
predictedSpam = rep("nonspam", dim(testSpam)[1])
predictedSpam[predictionModel$fitted > 0.5] = "spam"
table(predictedSpam, testSpam$type)  ## Get a measure of uncertainty

4 Getting Data

getwd()                                   ## Get/set your working directory
setwd("C:\\Users\\Felix\\Downloads")
setwd("./data")
setwd("../")

4.1 Types of files data may come from

Tab-delimited text
Comma-separated text
Excel file
JSON File
HTML/XML file
Database

4.2 Getting data from the internet - download.file()

Downloads a file from the internet
Even if you could do this by hand, helps with reproducibility
Important parameters are url, destfile, method
Useful for downloading tab-delimited, csv, etc.

fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./data/cameras.csv",method="curl")
list.files("./data")

dateDownloaded <- date()
dateDownloaded

## [1] "Thu Aug 15 15:25:39 2013"

Some notes about download.file()

If the url starts with http you can use download.file()
If the url starts with https on Windows you may be ok
If the url starts with https on Mac you may need to set method=“curl”
If the file is big, this might take a while
Be sure to record when you downloaded.

Loading data you have saved - read.table()

This is the main function for reading data into R
Flexible and robust but requires more parameters
Reads the data into RAM - big data can cause problems
Important parameters file, header, sep, row.names, nrows
Related: read.csv(), read.csv2()

read.xlsx(), read.xlsx2() {xlsx package}

Reads .xlsx files, but slow
Important parameters file, sheetIndex, rowIndex, colIndex, header
read.xlsx2() relies more on low level Java functions so may be a bit faster

library(xlsx)
read.xlsx2("./data/camera.xlsx",sheetIndex=1)

Picking a file - less reproducible, but useful

read.csv(file.choose())

Writing data - write.table()

cameraData <- read.csv("./data/cameras.csv")
tmpData <- cameraData[,-1]
write.table(tmpData,file="./data/camerasModified.csv",sep=",")

Writing data - save(), save.image()

save is used to save R objects
Important parameters: list of objects, file
save.image saves everything in your working directory

cameraData <- read.csv("./data/cameras.csv")
tmpData <- cameraData[,-1]
save(tmpData,cameraData,file="./data/cameras.rda")

Reading saved data - load()

Opposite of save()
Important parameters: file

rm(list=ls())               ## Remove everything from the workspace
load("./data/cameras.rda")  ## Load data

paste() and paste0()

These functions are for pasting character strings together.
Important parameters: list of text strings, sep
paste0() is the same as paste but with sep=“”
Great for looping over files

5 Summarizing data

Looking at data - dim(),names(),nrow(),ncol()
Looking at the data - quantile(),summary()
Looking at data - class()
Looking at data - unique(),length(),table()
Looking at data - any(), all()
Looking at subsets - &
Looking at subsets - |
Find if there are missing values - is.na()
Summarizing columns/rows - rowSums(),rowMeans(),colSums(),colMeans()

Important table()/NA issue:

table(c(0, 1, 2, 3, NA, 3, 3, 2, 2, 3))

## 
## 0 1 2 3 
## 1 1 3 4

table(c(0, 1, 2, 3, NA, 3, 3, 2, 2, 3), useNA = "ifany")

## 
##    0    1    2    3 <NA> 
##    1    1    3    4    1

6 Data munging basics

Fixing character vectors - tolower(), toupper()
Fixing character vectors - strsplit()
Fixing character vectors - sapply()
Fixing character vectors - sub(),gsub()
Quantitative variables in ranges - cut()
Quantitative variables in ranges - cut2() {Hmisc}
Merging data - merge()
Sorting values - sort()
Ordering values - order()
Reshaping data - melt()

7 Exploratory graphs

works on windows 7 + Rstudio  ## simple
pData <- read.table("https://dl.dropboxusercontent.com/u/8272421/ss06pid.csv",sep=",",header=TRUE)

library(RCurl)     ## complicated, download data from https
library(XML)
fileUrl <- "https://dl.dropboxusercontent.com/u/8272421/ss06pid.csv"
myCsv<-getURL(fileUrl, ssl.verifypeer = FALSE)
temporaryFile <- tempfile()
con <- file(temporaryFile, open = "w")
cat(myCsv, file = con) 
close(con)
pData <- read.csv(temporaryFile)

dim(pData)

boxplot(pData$AGEP,col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE)

barplot(table(pData$CIT),col="blue")

hist(pData$AGEP,col="blue")
hist(pData$AGEP,col="blue",breaks=100,main="Age")

dens <- density(pData$AGEP)   ## Density plots
plot(dens,lwd=3,col="blue")

dens <- density(pData$AGEP)  ## Density plots - multiple distributions
densMales <- density(pData$AGEP[which(pData$SEX==1)])
plot(dens,lwd=3,col="blue")
lines(densMales,lwd=3,col="orange")

plot(pData$JWMNP,pData$WAGP,pch=19,col="blue")                       ## Scatterplots
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)               ## Scatterplots - size matters
plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$SEX,cex=0.5)            ## Scatterplots - using color

percentMaxAge <- pData$AGEP/max(pData$AGEP)
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=percentMaxAge*0.5) ## Scatterplots - using size

plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)
lines(rep(100,dim(pData)[1]),pData$WAGP,col="grey",lwd=5) ## Scatterplots - overlaying lines/points
points(seq(0,200,length=100),seq(0,20e5,length=100),col="red",pch=19)

library(Hmisc)
ageGroups <- cut2(pData$AGEP,g=5)
plot(pData$JWMNP,pData$WAGP,pch=19,col=ageGroups,cex=0.5) ## Scatterplots - numeric variables as factors

x <- rnorm(1e+05)
y <- rnorm(1e+05)
plot(x, y, pch = 19)  ## If you have a lot of points

plot of chunk unnamed-chunk-23

sampledValues <- sample(1:1e+05, size = 1000, replace = FALSE)  ## If you have a lot of points - sampling
plot(x[sampledValues], y[sampledValues], pch = 19)

plot of chunk unnamed-chunk-24

smoothScatter(x, y)  ## If you have a lot of points - smoothScatter

## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009

plot of chunk unnamed-chunk-25

library(hexbin)  ## If you have a lot of points - hexbin {hexbin}

## Warning: package 'hexbin' was built under R version 3.0.1

## Loading required package: grid

## Loading required package: lattice

x <- rnorm(1e+05)
y <- rnorm(1e+05)
hbo <- hexbin(x, y)
plot(hbo)

plot of chunk unnamed-chunk-26

x <- rnorm(20)
y <- rnorm(20)
qqplot(x, y)  ## QQ-plots
abline(c(0, 1))

plot of chunk unnamed-chunk-27

X <- matrix(rnorm(20 * 5), nrow = 20)
matplot(X, type = "b")  ## Matplot and spaghetti

plot of chunk unnamed-chunk-28

image(1:10,161:236,as.matrix(pData[1:10,161:236])) ## Heatmaps

newMatrix <- as.matrix(pData[1:10,161:236])
newMatrix <- t(newMatrix)[,nrow(newMatrix):1]
image(161:236, 1:10, newMatrix)                        ## Heatmaps - matching intuition

library(maps)  ## Maps - very basics
map("world")
lat <- runif(40, -180, 180)
lon <- runif(40, -90, 90)
points(lat, lon, col = "blue", pch = 19)

plot of chunk unnamed-chunk-29

x <- c(NA, NA, NA, 4, 5, 6, 7, 8, 9, 10)
y <- 1:10
plot(x, y, pch = 19, xlim = c(0, 11), ylim = c(0, 11))  ## Missing values and plots

plot of chunk unnamed-chunk-30

x <- rnorm(100)
y <- rnorm(100)
y[x < 0] <- NA
boxplot(x ~ is.na(y))  ## Missing values and plots

plot of chunk unnamed-chunk-31

8 Expository graphs

plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5, xlab="Travel time (min)",ylab="Last 12 month wages (dollars)") ## Axes, Important parameters: xlab,ylab,cex.lab,cex.axis
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5, xlab="Travel time (min)",ylab="Last 12 month wages (dollars)",cex.lab=2,cex.axis=1.5)

plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5,xlab="TT (min)",ylab="Wages (dollars)")
legend(100,200000,legend="All surveyed",col="blue",pch=19,cex=0.5) ## Legends, Important paramters: x,y,legend, other plotting parameters

plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="TT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))

plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)", ylab="Wages (dollars)",
     col=pData$SEX,main ="Wages earned versus commute time")      ## Titles
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))

par(mfrow=c(1,2))          ## Multiple panels
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))

par(mfrow=c(1,2))  ## Adding text
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)

9 Save the plot file

pdf(file="twoPanel.pdf",height=4,width=8)           ## pdf
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
dev.off()

png(file="twoPanel.png",height=480,width=(2*480))   ## png
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
dev.off()

par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
dev.copy2pdf(file="twoPanelv2.pdf")        ## dev.copy2pdf

10 Hierarchical clustering

set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

plot of chunk unnamed-chunk-32

dataFrame <- data.frame(x = x, y = y)
dist(dataFrame)

##          1       2       3       4       5       6       7       8       9
## 2  0.34121                                                                
## 3  0.57494 0.24103                                                        
## 4  0.26382 0.52579 0.71862                                                
## 5  1.69425 1.35818 1.11953 1.80667                                        
## 6  1.65813 1.31960 1.08339 1.78081 0.08150                                
## 7  1.49823 1.16621 0.92569 1.60132 0.21110 0.21667                        
## 8  1.99149 1.69093 1.45649 2.02849 0.61704 0.69792 0.65063                
## 9  2.13630 1.83168 1.67836 2.35676 1.18350 1.11500 1.28583 1.76461        
## 10 2.06420 1.76999 1.63110 2.29239 1.23848 1.16550 1.32063 1.83518 0.14090
## 11 2.14702 1.85183 1.71074 2.37462 1.28154 1.21077 1.37370 1.86999 0.11624
## 12 2.05664 1.74663 1.58659 2.27232 1.07701 1.00777 1.17740 1.66224 0.10849
##         10      11
## 2                 
## 3                 
## 4                 
## 5                 
## 6                 
## 7                 
## 8                 
## 9                 
## 10                
## 11 0.08318        
## 12 0.19129 0.20803

dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
plot(hClustering)

plot of chunk unnamed-chunk-34

Prettier dendrograms:

modifiction of plclust for plotting hclust objects *in colour*!
Copyright Eva KF Chan 2009
Arguments:
hclust: hclust object
lab: a character vector of labels of the leaves of the tree
lab.col: colour for the labels; NA=default device foreground colour
hang: as in hclust & plclust
Side effect:
A display of hierarchical cluster with coloured leaf labels.

myplclust <- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)), 
    hang = 0.1, ...) {
    y <- rep(hclust$height, 2)
    x <- as.numeric(hclust$merge)
    y <- y[which(x < 0)]
    x <- x[which(x < 0)]
    x <- abs(x)
    y <- y[order(x)]
    x <- x[order(x)]
    plot(hclust, labels = FALSE, hang = hang, ...)
    text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order], 
        col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}

op = par(mfrow = c(1, 1))
dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
myplclust(hClustering, lab = rep(1:3, each = 4), lab.col = rep(1:3, each = 4))

plot of chunk unnamed-chunk-35

par(op)

dataFrame <- data.frame(x = x, y = y)
set.seed(143)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
heatmap(dataMatrix)  ## heatmap

plot of chunk unnamed-chunk-36

11 K-means clustering

set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

plot of chunk unnamed-chunk-37

dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
names(kmeansObj)

## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"

kmeansObj$cluster

##  [1] 3 3 3 3 1 1 1 1 2 2 2 2

par(mar = rep(0.2, 4))
plot(x, y, col = kmeansObj$cluster, pch = 19, cex = 2)
points(kmeansObj$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)

plot of chunk unnamed-chunk-39

set.seed(1234)  ## Heatmaps
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj2 <- kmeans(dataMatrix, centers = 3)
par(mfrow = c(1, 2), mar = rep(0.2, 4))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n")
image(t(dataMatrix)[, order(kmeansObj$cluster)], yaxt = "n")

plot of chunk unnamed-chunk-40

12 Principal components analysis and singular value decomposition

set.seed(12345)
par(mar = rep(0.2, 4))
dataMatrix <- matrix(rnorm(400), nrow = 40)  ## Matrix data
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

plot of chunk unnamed-chunk-41

par(mar = rep(0.2, 4))
heatmap(dataMatrix)  ## Cluster the data

plot of chunk unnamed-chunk-42

set.seed(678910)
for (i in 1:40) {
    ## What if we add a pattern?
    coinFlip <- rbinom(1, size = 1, prob = 0.5)
    if (coinFlip) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 3), each = 5)
    }
}

par(mar = rep(0.2, 4))
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

plot of chunk unnamed-chunk-44

par(mar = rep(0.2, 4))
heatmap(dataMatrix)

plot of chunk unnamed-chunk-45

hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rowMeans(dataMatrixOrdered), 40:1, , xlab = "Row", ylab = "Row Mean", pch = 19)
plot(colMeans(dataMatrixOrdered), xlab = "Column", ylab = "Column Mean", pch = 19)

plot of chunk unnamed-chunk-46

svd1 <- svd(scale(dataMatrixOrdered))  ## Components of the SVD - u and v
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd1$u[, 1], 40:1, , xlab = "Row", ylab = "First left singular vector", 
    pch = 19)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)

plot of chunk unnamed-chunk-47

svd1 <- svd(scale(dataMatrixOrdered))  ## Components of the SVD - d and variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained", 
    pch = 19)

plot of chunk unnamed-chunk-48

par(mfrow = c(1, 1))
svd1 <- svd(scale(dataMatrixOrdered))  ## Relationship to principal components
pca1 <- prcomp(dataMatrixOrdered, scale = TRUE)
plot(pca1$rotation[, 1], svd1$v[, 1], pch = 19, xlab = "Principal Component 1", 
    ylab = "Right Singular Vector 1")
abline(c(0, 1))

plot of chunk unnamed-chunk-49

constantMatrix <- dataMatrixOrdered * 0  ## Components of the SVD - variance explained
for (i in 1:dim(dataMatrixOrdered)[1]) {
    constantMatrix[i, ] <- rep(c(0, 1), each = 5)
}
svd1 <- svd(constantMatrix)
par(mfrow = c(1, 3))
image(t(constantMatrix)[, nrow(constantMatrix):1])
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained", 
    pch = 19)

plot of chunk unnamed-chunk-50

set.seed(678910)
for (i in 1:40) {
    ## What if we add a second pattern?
    coinFlip1 <- rbinom(1, size = 1, prob = 0.5)
    coinFlip2 <- rbinom(1, size = 1, prob = 0.5)
    if (coinFlip1) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), each = 5)
    }
    if (coinFlip2) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), 5)
    }
}

hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]

svd2 <- svd(scale(dataMatrixOrdered))  ## Singular value decomposition - true patterns
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rep(c(0, 1), each = 5), pch = 19, xlab = "Column", ylab = "Pattern 1")
plot(rep(c(0, 1), 5), pch = 19, xlab = "Column", ylab = "Pattern 2")

plot of chunk unnamed-chunk-52

svd2 <- svd(scale(dataMatrixOrdered))  ## v and patterns of variance in rows
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd2$v[, 1], pch = 19, xlab = "Column", ylab = "First right singluar vector")
plot(svd2$v[, 2], pch = 19, xlab = "Column", ylab = "Second right singluar vector")

plot of chunk unnamed-chunk-53

svd1 <- svd(scale(dataMatrixOrdered))  ## d and variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained", 
    pch = 19)

plot of chunk unnamed-chunk-54

library(corpcor)
bigMatrix <- matrix(rnorm(10000 * 40), nrow = 10000)
system.time(svd(scale(bigMatrix)))

##    user  system elapsed 
##    0.33    0.00    0.33

system.time(fast.svd(scale(bigMatrix), tol = 0))

##    user  system elapsed 
##    0.09    0.03    0.12

library(impute)
dataMatrix2 <- dataMatrixOrdered
dataMatrix2[sample(1:100,size=40,replace=F)] <- NA
dataMatrix2 <- impute.knn(dataMatrix2)$data           ## Imputing missing data
svd1 <- svd(scale(dataMatrixOrdered))
svd2 <- svd(scale(dataMatrix2))
par(mfrow=c(1,2))
plot(svd1$v[,1],pch=19)
plot(svd2$v[,1],pch=19)