Question 1: College Data

#Prepare Data

#College Data
college_data <- read.csv("college_data.csv", header=FALSE)
#Numeric College Data (recorded as factor type)
number <- college_data[-c(1),-c(1,2,3)]
#Convert it to Numeric Type
college_num <- sapply(number,function(x) as.numeric(as.character(x)))
#Scale Numeric College Data (Centered is a requirement for PCA)
college_scale <- scale(college_num, center=TRUE, scale=TRUE)

Q1a

library(cluster)
#Dissimilarity of Scaled College Data
D = dist(college_scale)
#Clusters for Scaled College Data
tree.college.scale = hclust(D,method="complete")
#Dendrogram for Scaled College Data
plot(tree.college.scale, xlab="Clusters", main = "College Data Complete Linkage")

Q1b

#Remove CSU-Chico, Columbia, Northwestern, SFSU, Berkeley, UCDavid, UCSB, WPI from College Data
new_college <- subset(college_data, V3 != 'CSU-Chico' & V3 != 'Columbia' & V3 != 'Northwestern' 
  & V3 != 'SFSU' & V3 != 'Berkeley' & V3 != 'UCDavis' & V1 != '110705' & V3 != 'WPI')

#Change New College Data Factors to Numeric Type and Scale
number2 <- new_college[-c(1),-c(1,2,3)]
college_num2 <- sapply(number2, function(x) as.numeric(as.character(x)))
college_scale2 <- scale(college_num2, center=TRUE, scale=TRUE)

#Dissimilarity New College Data
D2 <- dist(college_scale2)

#Complete Linkage Clusters
complete.newcollege.scale <- hclust(D2, method="complete")

#Average Linkage Clusters
average.newcollege.scale <- hclust(D2, method="average")

#Complete Linkage Dendrogram
plot(complete.newcollege.scale, xlab = "Clusters", main="New College Data Complete Linkage")

#Average Linkage Dendrogram
plot(average.newcollege.scale, xlab = "Clusters", main="New College Data Average Linkage")

Q1c

#Single Linkage Clusters
single.newcollege.scale <- hclust(D2, method="single")
#Single Linkage Dendrogram
plot(single.newcollege.scale, xlab="Clusters", main="New College Data Single Linkage")

Cluster Assignments (k=3)

Cluster 1 (Stanford,CMU,MIT)
Cluster 2 (UDel,MTU,MSU,CSUEB,SJSU,CUNY,CalPoly,UCSD)
Cluster 3 (NYU,UMinn)

Q1d

library(ggplot2)
library(ggfortify)

#Principal Component Analysis

#college_scale from part a)

#School Labels to Numeric College Data
row.names(college_scale) <- c("CalPoly","CSU-Chico","CSUEB","CMU","Columbia","CUNY","MIT","MTU","MSU","NYU"
      ,"Northwestern","SFSU","SJSU","Stanford","Berkeley","UCDavis","UCSD","UCSB","UDel","UMinn","WPI")

#Principal Components
pc <- prcomp(college_scale)

#Plot of 1st Two Component Scores
autoplot(pc, shape=FALSE, label.size=3, loadings=TRUE, loadings.label = TRUE, loadings.label.size = 4)

Q1e

#K-means Clustering

#college_scale2 from part b)

row.names(college_scale2) <- c("CalPoly","CSUEB","CMU","CUNY","MIT","MTU","MSU","NYU","SJSU","Stanford"
                               ,"UCSD","UDel","UMinn")

km <- kmeans(college_scale2, centers=3, algorithm="Lloyd")
fitted(km,method = 'classes')

##  CalPoly    CSUEB      CMU     CUNY      MIT      MTU      MSU      NYU 
##        3        1        3        1        3        1        1        2 
##     SJSU Stanford     UCSD     UDel    UMinn 
##        1        3        3        2        2

Cluster Assignments (k = 3)

Cluster	Elements	Method
1	(CalPoly,CMU,MIT,NYU,Stanford,UCSD,UDel,UMinn)	K-means
2	(CUNY)	K-means
3	(CSUEB,MTU,MSU,SJSU)	K-means
——-	——–	——
1	(Stanford,CMU,MIT)	Single Linkage
2	(UDel,MTU,MSU,CSUEB,SJSU,CUNY,CalPoly,UCSD)	Single Linkage
3	(NYU,UMinn)	Single Linkage

The Single linkage clusterings are easier to see how the colleges are grouped. With K-means, the clusterings seem more random; CUNY is all by itself in a cluster and UMinn is clustered with MIT and Stanford.

Q1f

#K-medoids Clustering

#college_scale2 from part b)

pm <- pam(college_scale2,3)
pm$clustering

##  CalPoly    CSUEB      CMU     CUNY      MIT      MTU      MSU      NYU 
##        1        2        3        2        3        2        2        1 
##     SJSU Stanford     UCSD     UDel    UMinn 
##        2        3        1        1        1

Cluster	Elements	Method
1	(CalPoly,CMU,MIT,NYU,Stanford,UCSD,UDel,UMinn)	K-means
2	(CUNY)	K-means
3	(CSUEB,MTU,MSU,SJSU)	K-means
——-	——–	——
1	(Stanford,CMU,MIT)	Single Linkage
2	(UDel,MTU,MSU,CSUEB,SJSU,CUNY,CalPoly,UCSD)	Single Linkage
3	(NYU,UMinn)	Single Linkage
——-	——–	——
1	(CalPoly,NYU,UCSD,UDel,UMinn)	K-medoids
2	(CSUEB,CUNY,MTU,MSU,SJSU)	K-medoids
3	(CMU,MIT,Stanford)	K-medoids

K-mediods seems like a slightly better clustering than single linkage. The clusterings are more stratified.

Q1g

#SOURCE
#https://cran.r-project.org/web/packages/protoclust/protoclust.pdf

#Minimax Linkage of College Data
library(protoclust)
#Clusters for Scaled College Data
tree.college.scale <- protoclust(D)
#Dendrogram for Scaled College Data
plot(tree.college.scale, xlab="Clusters", main = "College Data Minimax Linkage")

Question 2: Music Data

#Import Music Data
music2 <- read.csv("music2.csv")
#Numerical Music Data
musicNum <- music2[,-c(1,2,3)]
#Standardized Music Data
music_scaled <- scale(musicNum, center=TRUE, scale=TRUE)
row.names(music_scaled) <- music2$Song

#Single Linkage (Type)
row.names(music_scaled) <- music2$Type
D <- dist(music_scaled)
single.music <- hclust(D,method="single")
plot(single.music, xlab="Clusters", main="Music Data Single Linkage by Type", cex=0.8)

#Complete Linkage (Type)
complete.music <- hclust(D,method="complete")
plot(complete.music, xlab="Clusters", main="Music Data Complete Linkage by Type", cex=0.8)

#Single Linkage (Artist)
row.names(music_scaled) <- music2$Artist
D <- dist(music_scaled)
single.artist.music <- hclust(D, method="single")
plot(single.artist.music, xlab="Clusters", main="Music Data Single Linkage by Artist", cex=0.8)

#Complete Linkage (Artist)
complete.artist.music <- hclust(D, method="complete")
plot(complete.artist.music, xlab="Clusters", main="Music Data Complete Linkage by Artist", cex=0.8)

The Complete Linkage method looks easier to decipher because the clusters are more uniform on the dedrograms when considering Type and Artist. More information is required to tell whether clustering by Artist or Type is better. It seems that Artist may be more telling because you can view the similarities in artists whereas we already know the similarities in genre.

Question 3: Voting Data

## 20 Random Sample
#Import Voting Data
H114 <- read.fwf(file = "H114.ord", widths = c(3,5,2,2,8,3,2,11,10000), strip.white = T
                 , colClasses = "character")
#Remove First 5 votes
H114$V9 = substr(H114$V9, 6, nchar(H114$V9))
#Standardize the Votes
for(i in 1:nrow(H114))
{
  votes <- H114$V9[i]
  votesplit <- strsplit(votes, "")
  vU = votesplit[[1]]
  myList = ""
  
  for(j in 1:length(vU))
  {
    k = vU[j]
    
    if(k == 0)
    {
      myList = paste0(myList, "0")
    }
    else if (k == 1 || k ==2 || k ==3)
    {
      myList = paste0(myList, "1")
    }
    else if (k == 4 || k ==5 || k ==6)
    {
      myList = paste0(myList, "2")
    }
    else
    {
      myList = paste0(myList, "3") 
    } 
  }
  
  H114[[paste("V10")]][i] = myList
}

for(i in 1:nrow(H114))
{
  if(H114$V6[i] == "100")
  {
    H114$V11[i] = "blue"
  }
  else
  {
    H114$V11[i] = "red"
  }
}

#Split the Votes into columns
vote <- lapply(H114$V10,function(x) strsplit(x,split = ""))
#List to Matrix
vote.max <- matrix(unlist(vote), ncol = 1322 ,byrow=TRUE)
row.names(vote.max) <- H114$V11
set.seed(43446553)
samp <- sample(1:1322, 20)
v.matrix <- vote.max[,samp]
v.num <- apply(v.matrix,1,as.numeric)
v.num <- t(v.num)

#PCA (20)
pca <- prcomp(v.num)

autoplot(pca, label.size=3, loadings=TRUE, loadings.size = 1, shape = FALSE, loadings.colour='green'
         , col=rownames(v.num))

#100 Sample
set.seed(43443215)
samp2 <- sample(1:1322, 100)
v.matrix2 <- vote.max[,samp2]
v.num2 <- apply(v.matrix2,1,as.numeric)
v.num2 <- t(v.num2)

Comments

There are a lot of democrats clustered on the left, and republicans clustered on the right. You can see that some democrats may vote more republican-like, but those are outliers to the rest of the democrat data.

100 Random Sample

#PCA (100)
pca2 <- prcomp(v.num2)
autoplot(pca2, label.size=3, loadings=TRUE, loadings.size = 1, shape = FALSE, loadings.colour='green'
         , col=rownames(v.num2))

#Variance of first 10 PCA (from 100 Sample)
pca.num <- sapply(pca2,function(x) as.numeric(x))
Var1 = pca.num$sdev/sum(pca.num$sdev)
Var1[1:10]

##  [1] 0.10124204 0.05281862 0.02260990 0.02047705 0.02000581 0.01968867
##  [7] 0.01858561 0.01805787 0.01689017 0.01625032

Comments

You should be able to predict a Congress person’s party affiliation from their voting record. The variance amounts are low from the sample of 100 data and it is shown in the PCA plot by having all democrats clustered on the left and republicans clustered on the right.

Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-11

Question 1: College Data

Q1a

Q1b

Q1c

Q1d

Q1e

Q1f

Q1g

Question 2: Music Data

Question 3: Voting Data

Comments

100 Random Sample

Comments