Problem 3

rm(list=ls())

#load the data
H <- matrix(readBin("./histograms.bin", "double", 640000), 40000, 16)

#define the function
MultinomialEM <- function(H,K,tau)
  #H the matrix of input histograms, K the number of clusters, and tau the threshold parameter Ï„
{
  n <- nrow(H)
  d <- ncol(H)
  sigma <- 1.0
  normalize <- function(x) if(sum(x) != 0) {x / sum(x)} else x
  T_dK <- replicate(K, normalize(runif(d,min=0.01, max=10))) 
  C <- rep(1, K)
  A <- replicate(K, rep(0, n))
  while(sigma >= tau) 
  {
    T_dK[T_dK <= 0] = 1e-20
    Phi = exp( H %*% log(T_dK) ) # n by K matrix
    pre_A = A
    A = (C * Phi) / (rowSums(C * Phi)) # n by K matrix
    A[abs(A) <= 0] = 1e-20
    C = colMeans(A) 
    B = t(H) %*% A
    T_dK = apply(B, 2, normalize) 
    sigma = max(colSums(abs(A - pre_A)))
  }
  m = apply(A, 1, function(x) which.max(x))
  return(m)
}

#get the EM
K <- 3
tau <- 0.01
m <- MultinomialEM(H,K,tau)
image(matrix(m, 200), main = "Visulization of EM Algorithm (K=3)")

K <- 4
m <- MultinomialEM(H,K,tau)
image(matrix(m, 200), main = "Visulization of EM Algorithm (K=4)")

K <- 5
m <- MultinomialEM(H,K,tau)
image(matrix(m, 200), main = "Visulization of EM Algorithm (K=5)")

Problem 4

#1.

#read the data
DowName<-read.csv("./Dow2010Names.csv", header = T) 
##source: http://siblisresearch.com/data/dow-jones-30-weightings/

#construct the data matrix by reading the URLs
matrix<-c()
for (i in 1:30){
  name = DowName$Ticker[i]
  url = paste("http://ichart.finance.yahoo.com/table.csv?s=",name,
              "&a=00&b=1&c=2010&d=00&e=1&f=2011&g=d&ignore=.csv", sep="")
  dat = read.table(url,header=TRUE,sep=",")
  matrix = cbind(matrix, dat[,5])}

#change data name to stock symbols
df<-as.data.frame(matrix)
names(df)<-DowName$Ticker


#2.
pca<-princomp(df,cor=F)

#create sectors
library(TTR)
sectors<-subset(stockSymbols(), Symbol %in% DowName$Ticker)
## Fetching AMEX symbols...
## Fetching NASDAQ symbols...
## Fetching NYSE symbols...
s<-factor(sectors$Sector)
names(s)<-sectors$Symbol
s<-s[names(df)]
cols = as.double(s)

library(RColorBrewer)
palette(brewer.pal(9, "Paired"))  

#biplot
plot(pca$loadings[,1], pca$loadings[,2], type='p', col=cols,pch=20, xlab='Comp.1', ylab='Comp.2')
text(pca$loadings[,1], pca$loadings[,2], names(df), col=cols,cex=.8, pos=4)
legend('bottomleft', cex=.8,  legend = levels(s), fill = 1:nlevels(s), merge = F, bty = 'n') 

##it seems that the stock in the same sector (category) has more similarities compared to the ones in other groups (except there is discripancy in consumer service and energy.)

#screeplot
plot(pca, type = "l")

##accorting to the screeplot, first 2 or 3 components are enough to explain the variablity in the dataset.


#3.
pca <- princomp(df, cor = T)

#biplot
plot(pca$loadings[,1], pca$loadings[,2], type='p', col=cols,pch=20, xlab='Comp.1', ylab='Comp.2')
text(pca$loadings[,1], pca$loadings[,2], names(df), col=cols,cex=.8, pos=4)
legend('topright', cex=.8,  legend = levels(s), fill = 1:nlevels(s), merge = F, bty = 'n') 

##this time the pattern in the stock sector is not that obvious compared to the result in question 2.

#screeplot
plot(pca, type = "l")

##accorting to the screeplot, first 4 or 5 components are enough to explain the variablity in the dataset.


#4.
#calculate the return
ret<-exp(diff(log(matrix))) - 1
df.ret<-as.data.frame(ret) 

pca <- princomp(df.ret, cor = T)

#biplot
plot(pca$loadings[,1], pca$loadings[,2], type='p', col=cols,pch=20, xlab='Comp.1', ylab='Comp.2')
text(pca$loadings[,1], pca$loadings[,2], names(df), col=cols,cex=.8, pos=4)
legend('topleft', cex=.6,  legend = levels(s), fill = 1:nlevels(s), merge = F, bty = 'n') 

##still, the pattern in the stock sector is not that obvious compared to the result in question 2.

#screeplot
plot(pca, type = "l")

##accorting to the screeplot, the first component is pretty enough to explain the majority of variablity. This means the dependency in the data is relatively large. If the data sets are independent, we would need a lot more components so has flatter scree plot as a result.