<< Data Analysis >> Note part 1

Created on 21 June 2013
Revised on Thu Aug 15 15:25:38 2013

Study notes of << Data Analysis >> from coursera.

1 Representing data in R

1.1 Character

firstName = "felix"
class(firstName)
## [1] "character"
firstName
## [1] "felix"

1.2 Numeric

heightCM = 188.2
class(heightCM)
## [1] "numeric"
heightCM
## [1] 188.2

1.3 Integer

numberSons = 1L
class(numberSons)  ## Integer
## [1] "integer"
numberSons
## [1] 1
numberSons = 1
class(numberSons)  ## numeric
## [1] "numeric"
numberSons
## [1] 1

1.4 Logical

teachingCoursera = TRUE
class(teachingCoursera)
## [1] "logical"
teachingCoursera
## [1] TRUE

1.5 Vectors

A set of values with the same class

heights = c(188.2, 181.3, 193.4)
heights
## [1] 188.2 181.3 193.4

firstNames = c("jeff", "roger", "andrew", "brian")
firstNames
## [1] "jeff"   "roger"  "andrew" "brian"

1.6 Lists

A vector of values of possibly different classes

vector1 = c(188.2, 181.3, 193.4)
vector2 = c("jeff", "roger", "andrew", "brian")
myList = list(heights = vector1, firstNames = vector2)
myList
## $heights
## [1] 188.2 181.3 193.4
## 
## $firstNames
## [1] "jeff"   "roger"  "andrew" "brian"

1.7 Matrices

Vectors with multiple dimensions

myMatrix = matrix(c(1, 2, 3, 4), byrow = T, nrow = 2)
myMatrix
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4

1.8 Data frames

Multiple vectors of possibly different classes, of the same length

vector1 = c(188.2, 181.3, 193.4, 192.3)
vector2 = c("jeff", "roger", "andrew", "brian")
myDataFrame = data.frame(heights = vector1, firstNames = vector2)
myDataFrame
##   heights firstNames
## 1   188.2       jeff
## 2   181.3      roger
## 3   193.4     andrew
## 4   192.3      brian

1.9 Factors

Qualitative variables that can be included in models

smoker = c("yes", "no", "yes", "yes")
smokerFactor = as.factor(smoker)
smokerFactor
## [1] yes no  yes yes
## Levels: no yes

1.10 Missing values

In R they are usually coded NA

vector1 = c(188.2, 181.3, 193.4, NA)
vector1
## [1] 188.2 181.3 193.4    NA
is.na(vector1)
## [1] FALSE FALSE FALSE  TRUE

1.11 Subsetting

vector1 = c(188.2, 181.3, 193.4, 192.3)
vector2 = c("jeff", "roger", "andrew", "brian")

vector1[1]
## [1] 188.2
vector1[c(1, 2, 4)]
## [1] 188.2 181.3 192.3

myDataFrame = data.frame(heights = vector1, firstNames = vector2)
myDataFrame[1, 1:2]
##   heights firstNames
## 1   188.2       jeff
myDataFrame$firstNames
## [1] jeff   roger  andrew brian 
## Levels: andrew brian jeff roger

1.12 Logical subsetting

myDataFrame[myDataFrame$firstNames == "jeff", ]
##   heights firstNames
## 1   188.2       jeff
myDataFrame[myDataFrame$heights < 190, ]
##   heights firstNames
## 1   188.2       jeff
## 2   181.3      roger

2 Simulation basics

Important simulation functions

Distributions

Densities

Sampling

heights = rnorm(10, mean = 188, sd = 3)
heights
##  [1] 184.4 184.9 191.9 186.4 188.1 191.6 192.1 190.0 182.1 187.7
coinFlips = rbinom(10, size = 10, prob = 0.5)
coinFlips
##  [1] 2 5 5 7 6 5 4 4 5 5
x = seq(from = -5, to = 5, length = 10)
normalDensity = dnorm(x, mean = 0, sd = 1)
round(normalDensity, 2)
##  [1] 0.00 0.00 0.01 0.10 0.34 0.34 0.10 0.01 0.00 0.00
x = seq(0, 10, by = 1)
binomialDensity = dbinom(x, size = 10, prob = 0.5)
round(binomialDensity, 2)
##  [1] 0.00 0.01 0.04 0.12 0.21 0.25 0.21 0.12 0.04 0.01 0.00
heights = rnorm(10, mean = 188, sd = 3)
heights
##  [1] 182.1 186.8 181.2 189.4 186.7 180.7 184.4 190.9 186.5 187.3
sample(heights, size = 10, replace = TRUE)
##  [1] 184.4 181.2 182.1 180.7 186.7 186.5 186.7 189.4 182.1 184.4
sample(heights, size = 10, replace = FALSE)
##  [1] 186.7 181.2 180.7 187.3 189.4 186.8 182.1 184.4 186.5 190.9

probs = c(0.4, 0.3, 0.2, 0.1, 0, 0, 0, 0, 0, 0)  ## Sample can draw according to a set of probabilities
sum(probs)
## [1] 1
sample(heights, size = 10, replace = TRUE, prob = probs)
##  [1] 181.2 186.8 182.1 182.1 182.1 182.1 182.1 182.1 182.1 181.2
set.seed(12345)
rnorm(5, mean = 0, sd = 1)
## [1]  0.5855  0.7095 -0.1093 -0.4535  0.6059

set.seed(12345)
rnorm(5, mean = 0, sd = 1)
## [1]  0.5855  0.7095 -0.1093 -0.4535  0.6059
set.seed(5)
sample(1:8, size = 4, replace = FALSE)
## [1] 2 5 6 8

probs = c(5, 5, 5, 5, 1, 1, 1, 1)/24
sample(1:8, size = 4, replace = FALSE, prob = probs)
## [1] 4 1 2 5

treat1 = sample(1:8, size = 2, replace = FALSE)
treat2 = sample(2:7, size = 2, replace = FALSE)
c(treat1, treat2)
## [1] 8 1 3 4

3 Structure of a Data Analysis

library(kernlab)
data(spam)                                      ## Obtain the data
dim(spam)

set.seed(3435)                                  ## generate a test and training set
trainIndicator = rbinom(4601,size=1,prob=0.5)
table(trainIndicator)

trainSpam = spam[trainIndicator==1,]
testSpam = spam[trainIndicator==0,]

dim(trainSpam)

names(trainSpam)

head(trainSpam)

table(trainSpam$type)                               ## Look at summaries of the data

plot(trainSpam$capitalAve ~ trainSpam$type)         ## Create exploratory plots

plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)

plot(log10(trainSpam[, 1:4] + 1))                   ## Relationships between predictors

hCluster = hclust(dist(t(trainSpam[, 1:57])))      ## Perform exploratory analyses 
plot(hCluster)

hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterUpdated)

trainSpam$numType = as.numeric(trainSpam$type) - 1   ## Statistical prediction/modeling
costFunction = function(x, y) {
  sum(x != (y > 0.5))
}

cvError = rep(NA, 55)

library(boot)

for (i in 1:55) {
  lmFormula = as.formula(paste("numType~", names(trainSpam)[i], sep = ""))
  glmFit = glm(lmFormula, family = "binomial", data = trainSpam)
  cvError[i] = cv.glm(trainSpam, glmFit, costFunction, 2)$delta[2]
}

which.min(cvError)

names(trainSpam)[which.min(cvError)]

predictionModel = glm(numType ~ charDollar, family = "binomial", data = trainSpam) 
predictionTest = predict(predictionModel, testSpam)
predictedSpam = rep("nonspam", dim(testSpam)[1])
predictedSpam[predictionModel$fitted > 0.5] = "spam"
table(predictedSpam, testSpam$type)  ## Get a measure of uncertainty

4 Getting Data

getwd()                                   ## Get/set your working directory
setwd("C:\\Users\\Felix\\Downloads")
setwd("./data")
setwd("../")

4.1 Types of files data may come from

4.2 Getting data from the internet - download.file()

fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./data/cameras.csv",method="curl")
list.files("./data")
dateDownloaded <- date()
dateDownloaded
## [1] "Thu Aug 15 15:25:39 2013"

Some notes about download.file()

Loading data you have saved - read.table()

read.xlsx(), read.xlsx2() {xlsx package}

library(xlsx)
read.xlsx2("./data/camera.xlsx",sheetIndex=1)

Picking a file - less reproducible, but useful

read.csv(file.choose())

Writing data - write.table()

cameraData <- read.csv("./data/cameras.csv")
tmpData <- cameraData[,-1]
write.table(tmpData,file="./data/camerasModified.csv",sep=",")

Writing data - save(), save.image()

cameraData <- read.csv("./data/cameras.csv")
tmpData <- cameraData[,-1]
save(tmpData,cameraData,file="./data/cameras.rda")

Reading saved data - load()

rm(list=ls())               ## Remove everything from the workspace
load("./data/cameras.rda")  ## Load data

paste() and paste0()

5 Summarizing data

Important table()/NA issue:

table(c(0, 1, 2, 3, NA, 3, 3, 2, 2, 3))
## 
## 0 1 2 3 
## 1 1 3 4
table(c(0, 1, 2, 3, NA, 3, 3, 2, 2, 3), useNA = "ifany")
## 
##    0    1    2    3 <NA> 
##    1    1    3    4    1

6 Data munging basics

7 Exploratory graphs

works on windows 7 + Rstudio  ## simple
pData <- read.table("https://dl.dropboxusercontent.com/u/8272421/ss06pid.csv",sep=",",header=TRUE)
library(RCurl)     ## complicated, download data from https
library(XML)
fileUrl <- "https://dl.dropboxusercontent.com/u/8272421/ss06pid.csv"
myCsv<-getURL(fileUrl, ssl.verifypeer = FALSE)
temporaryFile <- tempfile()
con <- file(temporaryFile, open = "w")
cat(myCsv, file = con) 
close(con)
pData <- read.csv(temporaryFile)
dim(pData)
boxplot(pData$AGEP,col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE)
barplot(table(pData$CIT),col="blue")
hist(pData$AGEP,col="blue")
hist(pData$AGEP,col="blue",breaks=100,main="Age")
dens <- density(pData$AGEP)   ## Density plots
plot(dens,lwd=3,col="blue")
dens <- density(pData$AGEP)  ## Density plots - multiple distributions
densMales <- density(pData$AGEP[which(pData$SEX==1)])
plot(dens,lwd=3,col="blue")
lines(densMales,lwd=3,col="orange")
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue")                       ## Scatterplots
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)               ## Scatterplots - size matters
plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$SEX,cex=0.5)            ## Scatterplots - using color
percentMaxAge <- pData$AGEP/max(pData$AGEP)
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=percentMaxAge*0.5) ## Scatterplots - using size
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)
lines(rep(100,dim(pData)[1]),pData$WAGP,col="grey",lwd=5) ## Scatterplots - overlaying lines/points
points(seq(0,200,length=100),seq(0,20e5,length=100),col="red",pch=19)
library(Hmisc)
ageGroups <- cut2(pData$AGEP,g=5)
plot(pData$JWMNP,pData$WAGP,pch=19,col=ageGroups,cex=0.5) ## Scatterplots - numeric variables as factors
x <- rnorm(1e+05)
y <- rnorm(1e+05)
plot(x, y, pch = 19)  ## If you have a lot of points

plot of chunk unnamed-chunk-23

sampledValues <- sample(1:1e+05, size = 1000, replace = FALSE)  ## If you have a lot of points - sampling
plot(x[sampledValues], y[sampledValues], pch = 19)

plot of chunk unnamed-chunk-24

smoothScatter(x, y)  ## If you have a lot of points - smoothScatter
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009

plot of chunk unnamed-chunk-25

library(hexbin)  ## If you have a lot of points - hexbin {hexbin}
## Warning: package 'hexbin' was built under R version 3.0.1
## Loading required package: grid
## Loading required package: lattice
x <- rnorm(1e+05)
y <- rnorm(1e+05)
hbo <- hexbin(x, y)
plot(hbo)

plot of chunk unnamed-chunk-26

x <- rnorm(20)
y <- rnorm(20)
qqplot(x, y)  ## QQ-plots
abline(c(0, 1))

plot of chunk unnamed-chunk-27

X <- matrix(rnorm(20 * 5), nrow = 20)
matplot(X, type = "b")  ## Matplot and spaghetti

plot of chunk unnamed-chunk-28

image(1:10,161:236,as.matrix(pData[1:10,161:236])) ## Heatmaps
newMatrix <- as.matrix(pData[1:10,161:236])
newMatrix <- t(newMatrix)[,nrow(newMatrix):1]
image(161:236, 1:10, newMatrix)                        ## Heatmaps - matching intuition
library(maps)  ## Maps - very basics
map("world")
lat <- runif(40, -180, 180)
lon <- runif(40, -90, 90)
points(lat, lon, col = "blue", pch = 19)

plot of chunk unnamed-chunk-29

x <- c(NA, NA, NA, 4, 5, 6, 7, 8, 9, 10)
y <- 1:10
plot(x, y, pch = 19, xlim = c(0, 11), ylim = c(0, 11))  ## Missing values and plots

plot of chunk unnamed-chunk-30

x <- rnorm(100)
y <- rnorm(100)
y[x < 0] <- NA
boxplot(x ~ is.na(y))  ## Missing values and plots

plot of chunk unnamed-chunk-31

8 Expository graphs

plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5, xlab="Travel time (min)",ylab="Last 12 month wages (dollars)") ## Axes, Important parameters: xlab,ylab,cex.lab,cex.axis
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5, xlab="Travel time (min)",ylab="Last 12 month wages (dollars)",cex.lab=2,cex.axis=1.5)
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5,xlab="TT (min)",ylab="Wages (dollars)")
legend(100,200000,legend="All surveyed",col="blue",pch=19,cex=0.5) ## Legends, Important paramters: x,y,legend, other plotting parameters
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="TT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)", ylab="Wages (dollars)",
     col=pData$SEX,main ="Wages earned versus commute time")      ## Titles
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
par(mfrow=c(1,2))          ## Multiple panels
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
par(mfrow=c(1,2))  ## Adding text
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)

9 Save the plot file

pdf(file="twoPanel.pdf",height=4,width=8)           ## pdf
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
dev.off()
png(file="twoPanel.png",height=480,width=(2*480))   ## png
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
dev.off()
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
dev.copy2pdf(file="twoPanelv2.pdf")        ## dev.copy2pdf

10 Hierarchical clustering

set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

plot of chunk unnamed-chunk-32

dataFrame <- data.frame(x = x, y = y)
dist(dataFrame)
##          1       2       3       4       5       6       7       8       9
## 2  0.34121                                                                
## 3  0.57494 0.24103                                                        
## 4  0.26382 0.52579 0.71862                                                
## 5  1.69425 1.35818 1.11953 1.80667                                        
## 6  1.65813 1.31960 1.08339 1.78081 0.08150                                
## 7  1.49823 1.16621 0.92569 1.60132 0.21110 0.21667                        
## 8  1.99149 1.69093 1.45649 2.02849 0.61704 0.69792 0.65063                
## 9  2.13630 1.83168 1.67836 2.35676 1.18350 1.11500 1.28583 1.76461        
## 10 2.06420 1.76999 1.63110 2.29239 1.23848 1.16550 1.32063 1.83518 0.14090
## 11 2.14702 1.85183 1.71074 2.37462 1.28154 1.21077 1.37370 1.86999 0.11624
## 12 2.05664 1.74663 1.58659 2.27232 1.07701 1.00777 1.17740 1.66224 0.10849
##         10      11
## 2                 
## 3                 
## 4                 
## 5                 
## 6                 
## 7                 
## 8                 
## 9                 
## 10                
## 11 0.08318        
## 12 0.19129 0.20803
dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
plot(hClustering)

plot of chunk unnamed-chunk-34

Prettier dendrograms:

modifiction of plclust for plotting hclust objects *in colour*!
Copyright Eva KF Chan 2009
Arguments:
hclust: hclust object
lab: a character vector of labels of the leaves of the tree
lab.col: colour for the labels; NA=default device foreground colour
hang: as in hclust & plclust
Side effect:
A display of hierarchical cluster with coloured leaf labels.
myplclust <- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)), 
    hang = 0.1, ...) {
    y <- rep(hclust$height, 2)
    x <- as.numeric(hclust$merge)
    y <- y[which(x < 0)]
    x <- x[which(x < 0)]
    x <- abs(x)
    y <- y[order(x)]
    x <- x[order(x)]
    plot(hclust, labels = FALSE, hang = hang, ...)
    text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order], 
        col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}

op = par(mfrow = c(1, 1))
dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
myplclust(hClustering, lab = rep(1:3, each = 4), lab.col = rep(1:3, each = 4))

plot of chunk unnamed-chunk-35

par(op)
dataFrame <- data.frame(x = x, y = y)
set.seed(143)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
heatmap(dataMatrix)  ## heatmap

plot of chunk unnamed-chunk-36

11 K-means clustering

set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

plot of chunk unnamed-chunk-37

dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
names(kmeansObj)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"
kmeansObj$cluster
##  [1] 3 3 3 3 1 1 1 1 2 2 2 2
par(mar = rep(0.2, 4))
plot(x, y, col = kmeansObj$cluster, pch = 19, cex = 2)
points(kmeansObj$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)

plot of chunk unnamed-chunk-39

set.seed(1234)  ## Heatmaps
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj2 <- kmeans(dataMatrix, centers = 3)
par(mfrow = c(1, 2), mar = rep(0.2, 4))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n")
image(t(dataMatrix)[, order(kmeansObj$cluster)], yaxt = "n")

plot of chunk unnamed-chunk-40

12 Principal components analysis and singular value decomposition

set.seed(12345)
par(mar = rep(0.2, 4))
dataMatrix <- matrix(rnorm(400), nrow = 40)  ## Matrix data
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

plot of chunk unnamed-chunk-41

par(mar = rep(0.2, 4))
heatmap(dataMatrix)  ## Cluster the data

plot of chunk unnamed-chunk-42

set.seed(678910)
for (i in 1:40) {
    ## What if we add a pattern?
    coinFlip <- rbinom(1, size = 1, prob = 0.5)
    if (coinFlip) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 3), each = 5)
    }
}
par(mar = rep(0.2, 4))
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

plot of chunk unnamed-chunk-44

par(mar = rep(0.2, 4))
heatmap(dataMatrix)

plot of chunk unnamed-chunk-45

hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rowMeans(dataMatrixOrdered), 40:1, , xlab = "Row", ylab = "Row Mean", pch = 19)
plot(colMeans(dataMatrixOrdered), xlab = "Column", ylab = "Column Mean", pch = 19)

plot of chunk unnamed-chunk-46

svd1 <- svd(scale(dataMatrixOrdered))  ## Components of the SVD - u and v
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd1$u[, 1], 40:1, , xlab = "Row", ylab = "First left singular vector", 
    pch = 19)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)

plot of chunk unnamed-chunk-47

svd1 <- svd(scale(dataMatrixOrdered))  ## Components of the SVD - d and variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained", 
    pch = 19)

plot of chunk unnamed-chunk-48

par(mfrow = c(1, 1))
svd1 <- svd(scale(dataMatrixOrdered))  ## Relationship to principal components
pca1 <- prcomp(dataMatrixOrdered, scale = TRUE)
plot(pca1$rotation[, 1], svd1$v[, 1], pch = 19, xlab = "Principal Component 1", 
    ylab = "Right Singular Vector 1")
abline(c(0, 1))

plot of chunk unnamed-chunk-49

constantMatrix <- dataMatrixOrdered * 0  ## Components of the SVD - variance explained
for (i in 1:dim(dataMatrixOrdered)[1]) {
    constantMatrix[i, ] <- rep(c(0, 1), each = 5)
}
svd1 <- svd(constantMatrix)
par(mfrow = c(1, 3))
image(t(constantMatrix)[, nrow(constantMatrix):1])
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained", 
    pch = 19)

plot of chunk unnamed-chunk-50

set.seed(678910)
for (i in 1:40) {
    ## What if we add a second pattern?
    coinFlip1 <- rbinom(1, size = 1, prob = 0.5)
    coinFlip2 <- rbinom(1, size = 1, prob = 0.5)
    if (coinFlip1) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), each = 5)
    }
    if (coinFlip2) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), 5)
    }
}

hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
svd2 <- svd(scale(dataMatrixOrdered))  ## Singular value decomposition - true patterns
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rep(c(0, 1), each = 5), pch = 19, xlab = "Column", ylab = "Pattern 1")
plot(rep(c(0, 1), 5), pch = 19, xlab = "Column", ylab = "Pattern 2")

plot of chunk unnamed-chunk-52

svd2 <- svd(scale(dataMatrixOrdered))  ## v and patterns of variance in rows
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd2$v[, 1], pch = 19, xlab = "Column", ylab = "First right singluar vector")
plot(svd2$v[, 2], pch = 19, xlab = "Column", ylab = "Second right singluar vector")

plot of chunk unnamed-chunk-53

svd1 <- svd(scale(dataMatrixOrdered))  ## d and variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained", 
    pch = 19)

plot of chunk unnamed-chunk-54

library(corpcor)
bigMatrix <- matrix(rnorm(10000 * 40), nrow = 10000)
system.time(svd(scale(bigMatrix)))
##    user  system elapsed 
##    0.33    0.00    0.33
system.time(fast.svd(scale(bigMatrix), tol = 0))
##    user  system elapsed 
##    0.09    0.03    0.12
library(impute)
dataMatrix2 <- dataMatrixOrdered
dataMatrix2[sample(1:100,size=40,replace=F)] <- NA
dataMatrix2 <- impute.knn(dataMatrix2)$data           ## Imputing missing data
svd1 <- svd(scale(dataMatrixOrdered))
svd2 <- svd(scale(dataMatrix2))
par(mfrow=c(1,2))
plot(svd1$v[,1],pch=19)
plot(svd2$v[,1],pch=19)