Created on 21 June 2013
Revised on Thu Aug 15 15:25:38 2013
Study notes of << Data Analysis >> from coursera.
firstName = "felix"
class(firstName)
## [1] "character"
firstName
## [1] "felix"
heightCM = 188.2
class(heightCM)
## [1] "numeric"
heightCM
## [1] 188.2
numberSons = 1L
class(numberSons) ## Integer
## [1] "integer"
numberSons
## [1] 1
numberSons = 1
class(numberSons) ## numeric
## [1] "numeric"
numberSons
## [1] 1
teachingCoursera = TRUE
class(teachingCoursera)
## [1] "logical"
teachingCoursera
## [1] TRUE
A set of values with the same class
heights = c(188.2, 181.3, 193.4)
heights
## [1] 188.2 181.3 193.4
firstNames = c("jeff", "roger", "andrew", "brian")
firstNames
## [1] "jeff" "roger" "andrew" "brian"
A vector of values of possibly different classes
vector1 = c(188.2, 181.3, 193.4)
vector2 = c("jeff", "roger", "andrew", "brian")
myList = list(heights = vector1, firstNames = vector2)
myList
## $heights
## [1] 188.2 181.3 193.4
##
## $firstNames
## [1] "jeff" "roger" "andrew" "brian"
Vectors with multiple dimensions
myMatrix = matrix(c(1, 2, 3, 4), byrow = T, nrow = 2)
myMatrix
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
Multiple vectors of possibly different classes, of the same length
vector1 = c(188.2, 181.3, 193.4, 192.3)
vector2 = c("jeff", "roger", "andrew", "brian")
myDataFrame = data.frame(heights = vector1, firstNames = vector2)
myDataFrame
## heights firstNames
## 1 188.2 jeff
## 2 181.3 roger
## 3 193.4 andrew
## 4 192.3 brian
Qualitative variables that can be included in models
smoker = c("yes", "no", "yes", "yes")
smokerFactor = as.factor(smoker)
smokerFactor
## [1] yes no yes yes
## Levels: no yes
In R they are usually coded NA
vector1 = c(188.2, 181.3, 193.4, NA)
vector1
## [1] 188.2 181.3 193.4 NA
is.na(vector1)
## [1] FALSE FALSE FALSE TRUE
vector1 = c(188.2, 181.3, 193.4, 192.3)
vector2 = c("jeff", "roger", "andrew", "brian")
vector1[1]
## [1] 188.2
vector1[c(1, 2, 4)]
## [1] 188.2 181.3 192.3
myDataFrame = data.frame(heights = vector1, firstNames = vector2)
myDataFrame[1, 1:2]
## heights firstNames
## 1 188.2 jeff
myDataFrame$firstNames
## [1] jeff roger andrew brian
## Levels: andrew brian jeff roger
myDataFrame[myDataFrame$firstNames == "jeff", ]
## heights firstNames
## 1 188.2 jeff
myDataFrame[myDataFrame$heights < 190, ]
## heights firstNames
## 1 188.2 jeff
## 2 181.3 roger
Important simulation functions
Distributions
Densities
Sampling
heights = rnorm(10, mean = 188, sd = 3)
heights
## [1] 184.4 184.9 191.9 186.4 188.1 191.6 192.1 190.0 182.1 187.7
coinFlips = rbinom(10, size = 10, prob = 0.5)
coinFlips
## [1] 2 5 5 7 6 5 4 4 5 5
x = seq(from = -5, to = 5, length = 10)
normalDensity = dnorm(x, mean = 0, sd = 1)
round(normalDensity, 2)
## [1] 0.00 0.00 0.01 0.10 0.34 0.34 0.10 0.01 0.00 0.00
x = seq(0, 10, by = 1)
binomialDensity = dbinom(x, size = 10, prob = 0.5)
round(binomialDensity, 2)
## [1] 0.00 0.01 0.04 0.12 0.21 0.25 0.21 0.12 0.04 0.01 0.00
heights = rnorm(10, mean = 188, sd = 3)
heights
## [1] 182.1 186.8 181.2 189.4 186.7 180.7 184.4 190.9 186.5 187.3
sample(heights, size = 10, replace = TRUE)
## [1] 184.4 181.2 182.1 180.7 186.7 186.5 186.7 189.4 182.1 184.4
sample(heights, size = 10, replace = FALSE)
## [1] 186.7 181.2 180.7 187.3 189.4 186.8 182.1 184.4 186.5 190.9
probs = c(0.4, 0.3, 0.2, 0.1, 0, 0, 0, 0, 0, 0) ## Sample can draw according to a set of probabilities
sum(probs)
## [1] 1
sample(heights, size = 10, replace = TRUE, prob = probs)
## [1] 181.2 186.8 182.1 182.1 182.1 182.1 182.1 182.1 182.1 181.2
set.seed(12345)
rnorm(5, mean = 0, sd = 1)
## [1] 0.5855 0.7095 -0.1093 -0.4535 0.6059
set.seed(12345)
rnorm(5, mean = 0, sd = 1)
## [1] 0.5855 0.7095 -0.1093 -0.4535 0.6059
set.seed(5)
sample(1:8, size = 4, replace = FALSE)
## [1] 2 5 6 8
probs = c(5, 5, 5, 5, 1, 1, 1, 1)/24
sample(1:8, size = 4, replace = FALSE, prob = probs)
## [1] 4 1 2 5
treat1 = sample(1:8, size = 2, replace = FALSE)
treat2 = sample(2:7, size = 2, replace = FALSE)
c(treat1, treat2)
## [1] 8 1 3 4
library(kernlab)
data(spam) ## Obtain the data
dim(spam)
set.seed(3435) ## generate a test and training set
trainIndicator = rbinom(4601,size=1,prob=0.5)
table(trainIndicator)
trainSpam = spam[trainIndicator==1,]
testSpam = spam[trainIndicator==0,]
dim(trainSpam)
names(trainSpam)
head(trainSpam)
table(trainSpam$type) ## Look at summaries of the data
plot(trainSpam$capitalAve ~ trainSpam$type) ## Create exploratory plots
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)
plot(log10(trainSpam[, 1:4] + 1)) ## Relationships between predictors
hCluster = hclust(dist(t(trainSpam[, 1:57]))) ## Perform exploratory analyses
plot(hCluster)
hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterUpdated)
trainSpam$numType = as.numeric(trainSpam$type) - 1 ## Statistical prediction/modeling
costFunction = function(x, y) {
sum(x != (y > 0.5))
}
cvError = rep(NA, 55)
library(boot)
for (i in 1:55) {
lmFormula = as.formula(paste("numType~", names(trainSpam)[i], sep = ""))
glmFit = glm(lmFormula, family = "binomial", data = trainSpam)
cvError[i] = cv.glm(trainSpam, glmFit, costFunction, 2)$delta[2]
}
which.min(cvError)
names(trainSpam)[which.min(cvError)]
predictionModel = glm(numType ~ charDollar, family = "binomial", data = trainSpam)
predictionTest = predict(predictionModel, testSpam)
predictedSpam = rep("nonspam", dim(testSpam)[1])
predictedSpam[predictionModel$fitted > 0.5] = "spam"
table(predictedSpam, testSpam$type) ## Get a measure of uncertainty
getwd() ## Get/set your working directory
setwd("C:\\Users\\Felix\\Downloads")
setwd("./data")
setwd("../")
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./data/cameras.csv",method="curl")
list.files("./data")
dateDownloaded <- date()
dateDownloaded
## [1] "Thu Aug 15 15:25:39 2013"
Some notes about download.file()
Loading data you have saved - read.table()
read.xlsx(), read.xlsx2() {xlsx package}
library(xlsx)
read.xlsx2("./data/camera.xlsx",sheetIndex=1)
Picking a file - less reproducible, but useful
read.csv(file.choose())
Writing data - write.table()
cameraData <- read.csv("./data/cameras.csv")
tmpData <- cameraData[,-1]
write.table(tmpData,file="./data/camerasModified.csv",sep=",")
Writing data - save(), save.image()
cameraData <- read.csv("./data/cameras.csv")
tmpData <- cameraData[,-1]
save(tmpData,cameraData,file="./data/cameras.rda")
Reading saved data - load()
rm(list=ls()) ## Remove everything from the workspace
load("./data/cameras.rda") ## Load data
paste() and paste0()
Important table()/NA issue:
table(c(0, 1, 2, 3, NA, 3, 3, 2, 2, 3))
##
## 0 1 2 3
## 1 1 3 4
table(c(0, 1, 2, 3, NA, 3, 3, 2, 2, 3), useNA = "ifany")
##
## 0 1 2 3 <NA>
## 1 1 3 4 1
works on windows 7 + Rstudio ## simple
pData <- read.table("https://dl.dropboxusercontent.com/u/8272421/ss06pid.csv",sep=",",header=TRUE)
library(RCurl) ## complicated, download data from https
library(XML)
fileUrl <- "https://dl.dropboxusercontent.com/u/8272421/ss06pid.csv"
myCsv<-getURL(fileUrl, ssl.verifypeer = FALSE)
temporaryFile <- tempfile()
con <- file(temporaryFile, open = "w")
cat(myCsv, file = con)
close(con)
pData <- read.csv(temporaryFile)
dim(pData)
boxplot(pData$AGEP,col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE)
barplot(table(pData$CIT),col="blue")
hist(pData$AGEP,col="blue")
hist(pData$AGEP,col="blue",breaks=100,main="Age")
dens <- density(pData$AGEP) ## Density plots
plot(dens,lwd=3,col="blue")
dens <- density(pData$AGEP) ## Density plots - multiple distributions
densMales <- density(pData$AGEP[which(pData$SEX==1)])
plot(dens,lwd=3,col="blue")
lines(densMales,lwd=3,col="orange")
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue") ## Scatterplots
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5) ## Scatterplots - size matters
plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$SEX,cex=0.5) ## Scatterplots - using color
percentMaxAge <- pData$AGEP/max(pData$AGEP)
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=percentMaxAge*0.5) ## Scatterplots - using size
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5)
lines(rep(100,dim(pData)[1]),pData$WAGP,col="grey",lwd=5) ## Scatterplots - overlaying lines/points
points(seq(0,200,length=100),seq(0,20e5,length=100),col="red",pch=19)
library(Hmisc)
ageGroups <- cut2(pData$AGEP,g=5)
plot(pData$JWMNP,pData$WAGP,pch=19,col=ageGroups,cex=0.5) ## Scatterplots - numeric variables as factors
x <- rnorm(1e+05)
y <- rnorm(1e+05)
plot(x, y, pch = 19) ## If you have a lot of points
sampledValues <- sample(1:1e+05, size = 1000, replace = FALSE) ## If you have a lot of points - sampling
plot(x[sampledValues], y[sampledValues], pch = 19)
smoothScatter(x, y) ## If you have a lot of points - smoothScatter
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
library(hexbin) ## If you have a lot of points - hexbin {hexbin}
## Warning: package 'hexbin' was built under R version 3.0.1
## Loading required package: grid
## Loading required package: lattice
x <- rnorm(1e+05)
y <- rnorm(1e+05)
hbo <- hexbin(x, y)
plot(hbo)
x <- rnorm(20)
y <- rnorm(20)
qqplot(x, y) ## QQ-plots
abline(c(0, 1))
X <- matrix(rnorm(20 * 5), nrow = 20)
matplot(X, type = "b") ## Matplot and spaghetti
image(1:10,161:236,as.matrix(pData[1:10,161:236])) ## Heatmaps
newMatrix <- as.matrix(pData[1:10,161:236])
newMatrix <- t(newMatrix)[,nrow(newMatrix):1]
image(161:236, 1:10, newMatrix) ## Heatmaps - matching intuition
library(maps) ## Maps - very basics
map("world")
lat <- runif(40, -180, 180)
lon <- runif(40, -90, 90)
points(lat, lon, col = "blue", pch = 19)
x <- c(NA, NA, NA, 4, 5, 6, 7, 8, 9, 10)
y <- 1:10
plot(x, y, pch = 19, xlim = c(0, 11), ylim = c(0, 11)) ## Missing values and plots
x <- rnorm(100)
y <- rnorm(100)
y[x < 0] <- NA
boxplot(x ~ is.na(y)) ## Missing values and plots
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5, xlab="Travel time (min)",ylab="Last 12 month wages (dollars)") ## Axes, Important parameters: xlab,ylab,cex.lab,cex.axis
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5, xlab="Travel time (min)",ylab="Last 12 month wages (dollars)",cex.lab=2,cex.axis=1.5)
plot(pData$JWMNP,pData$WAGP,pch=19,col="blue",cex=0.5,xlab="TT (min)",ylab="Wages (dollars)")
legend(100,200000,legend="All surveyed",col="blue",pch=19,cex=0.5) ## Legends, Important paramters: x,y,legend, other plotting parameters
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="TT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)", ylab="Wages (dollars)",
col=pData$SEX,main ="Wages earned versus commute time") ## Titles
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
par(mfrow=c(1,2)) ## Multiple panels
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
par(mfrow=c(1,2)) ## Adding text
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
pdf(file="twoPanel.pdf",height=4,width=8) ## pdf
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
dev.off()
png(file="twoPanel.png",height=480,width=(2*480)) ## png
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
mtext(text="(a)",side=3,line=1)
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
legend(100,200000,legend=c("men","women"),col=c("black","red"),pch=c(19,19),cex=c(0.5,0.5))
mtext(text="(b)",side=3,line=1)
dev.off()
par(mfrow=c(1,2))
hist(pData$JWMNP,xlab="CT (min)",col="blue",breaks=100,main="")
plot(pData$JWMNP,pData$WAGP,pch=19,cex=0.5,xlab="CT (min)",ylab="Wages (dollars)",col=pData$SEX)
dev.copy2pdf(file="twoPanelv2.pdf") ## dev.copy2pdf
set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))
dataFrame <- data.frame(x = x, y = y)
dist(dataFrame)
## 1 2 3 4 5 6 7 8 9
## 2 0.34121
## 3 0.57494 0.24103
## 4 0.26382 0.52579 0.71862
## 5 1.69425 1.35818 1.11953 1.80667
## 6 1.65813 1.31960 1.08339 1.78081 0.08150
## 7 1.49823 1.16621 0.92569 1.60132 0.21110 0.21667
## 8 1.99149 1.69093 1.45649 2.02849 0.61704 0.69792 0.65063
## 9 2.13630 1.83168 1.67836 2.35676 1.18350 1.11500 1.28583 1.76461
## 10 2.06420 1.76999 1.63110 2.29239 1.23848 1.16550 1.32063 1.83518 0.14090
## 11 2.14702 1.85183 1.71074 2.37462 1.28154 1.21077 1.37370 1.86999 0.11624
## 12 2.05664 1.74663 1.58659 2.27232 1.07701 1.00777 1.17740 1.66224 0.10849
## 10 11
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11 0.08318
## 12 0.19129 0.20803
dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
plot(hClustering)
Prettier dendrograms:
modifiction of plclust for plotting hclust objects *in colour*!
Copyright Eva KF Chan 2009
Arguments:
hclust: hclust object
lab: a character vector of labels of the leaves of the tree
lab.col: colour for the labels; NA=default device foreground colour
hang: as in hclust & plclust
Side effect:
A display of hierarchical cluster with coloured leaf labels.
myplclust <- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)),
hang = 0.1, ...) {
y <- rep(hclust$height, 2)
x <- as.numeric(hclust$merge)
y <- y[which(x < 0)]
x <- x[which(x < 0)]
x <- abs(x)
y <- y[order(x)]
x <- x[order(x)]
plot(hclust, labels = FALSE, hang = hang, ...)
text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order],
col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}
op = par(mfrow = c(1, 1))
dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
myplclust(hClustering, lab = rep(1:3, each = 4), lab.col = rep(1:3, each = 4))
par(op)
dataFrame <- data.frame(x = x, y = y)
set.seed(143)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
heatmap(dataMatrix) ## heatmap
set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))
dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
names(kmeansObj)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size"
kmeansObj$cluster
## [1] 3 3 3 3 1 1 1 1 2 2 2 2
par(mar = rep(0.2, 4))
plot(x, y, col = kmeansObj$cluster, pch = 19, cex = 2)
points(kmeansObj$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)
set.seed(1234) ## Heatmaps
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj2 <- kmeans(dataMatrix, centers = 3)
par(mfrow = c(1, 2), mar = rep(0.2, 4))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n")
image(t(dataMatrix)[, order(kmeansObj$cluster)], yaxt = "n")
set.seed(12345)
par(mar = rep(0.2, 4))
dataMatrix <- matrix(rnorm(400), nrow = 40) ## Matrix data
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])
par(mar = rep(0.2, 4))
heatmap(dataMatrix) ## Cluster the data
set.seed(678910)
for (i in 1:40) {
## What if we add a pattern?
coinFlip <- rbinom(1, size = 1, prob = 0.5)
if (coinFlip) {
dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 3), each = 5)
}
}
par(mar = rep(0.2, 4))
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])
par(mar = rep(0.2, 4))
heatmap(dataMatrix)
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rowMeans(dataMatrixOrdered), 40:1, , xlab = "Row", ylab = "Row Mean", pch = 19)
plot(colMeans(dataMatrixOrdered), xlab = "Column", ylab = "Column Mean", pch = 19)
svd1 <- svd(scale(dataMatrixOrdered)) ## Components of the SVD - u and v
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd1$u[, 1], 40:1, , xlab = "Row", ylab = "First left singular vector",
pch = 19)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)
svd1 <- svd(scale(dataMatrixOrdered)) ## Components of the SVD - d and variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained",
pch = 19)
par(mfrow = c(1, 1))
svd1 <- svd(scale(dataMatrixOrdered)) ## Relationship to principal components
pca1 <- prcomp(dataMatrixOrdered, scale = TRUE)
plot(pca1$rotation[, 1], svd1$v[, 1], pch = 19, xlab = "Principal Component 1",
ylab = "Right Singular Vector 1")
abline(c(0, 1))
constantMatrix <- dataMatrixOrdered * 0 ## Components of the SVD - variance explained
for (i in 1:dim(dataMatrixOrdered)[1]) {
constantMatrix[i, ] <- rep(c(0, 1), each = 5)
}
svd1 <- svd(constantMatrix)
par(mfrow = c(1, 3))
image(t(constantMatrix)[, nrow(constantMatrix):1])
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained",
pch = 19)
set.seed(678910)
for (i in 1:40) {
## What if we add a second pattern?
coinFlip1 <- rbinom(1, size = 1, prob = 0.5)
coinFlip2 <- rbinom(1, size = 1, prob = 0.5)
if (coinFlip1) {
dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), each = 5)
}
if (coinFlip2) {
dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), 5)
}
}
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
svd2 <- svd(scale(dataMatrixOrdered)) ## Singular value decomposition - true patterns
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rep(c(0, 1), each = 5), pch = 19, xlab = "Column", ylab = "Pattern 1")
plot(rep(c(0, 1), 5), pch = 19, xlab = "Column", ylab = "Pattern 2")
svd2 <- svd(scale(dataMatrixOrdered)) ## v and patterns of variance in rows
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd2$v[, 1], pch = 19, xlab = "Column", ylab = "First right singluar vector")
plot(svd2$v[, 2], pch = 19, xlab = "Column", ylab = "Second right singluar vector")
svd1 <- svd(scale(dataMatrixOrdered)) ## d and variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singluar value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Percent of variance explained",
pch = 19)
library(corpcor)
bigMatrix <- matrix(rnorm(10000 * 40), nrow = 10000)
system.time(svd(scale(bigMatrix)))
## user system elapsed
## 0.33 0.00 0.33
system.time(fast.svd(scale(bigMatrix), tol = 0))
## user system elapsed
## 0.09 0.03 0.12
library(impute)
dataMatrix2 <- dataMatrixOrdered
dataMatrix2[sample(1:100,size=40,replace=F)] <- NA
dataMatrix2 <- impute.knn(dataMatrix2)$data ## Imputing missing data
svd1 <- svd(scale(dataMatrixOrdered))
svd2 <- svd(scale(dataMatrix2))
par(mfrow=c(1,2))
plot(svd1$v[,1],pch=19)
plot(svd2$v[,1],pch=19)