#Prepare Data
#College Data
college_data <- read.csv("college_data.csv", header=FALSE)
#Numeric College Data (recorded as factor type)
number <- college_data[-c(1),-c(1,2,3)]
#Convert it to Numeric Type
college_num <- sapply(number,function(x) as.numeric(as.character(x)))
#Scale Numeric College Data (Centered is a requirement for PCA)
college_scale <- scale(college_num, center=TRUE, scale=TRUE)
library(cluster)
#Dissimilarity of Scaled College Data
D = dist(college_scale)
#Clusters for Scaled College Data
tree.college.scale = hclust(D,method="complete")
#Dendrogram for Scaled College Data
plot(tree.college.scale, xlab="Clusters", main = "College Data Complete Linkage")
#Remove CSU-Chico, Columbia, Northwestern, SFSU, Berkeley, UCDavid, UCSB, WPI from College Data
new_college <- subset(college_data, V3 != 'CSU-Chico' & V3 != 'Columbia' & V3 != 'Northwestern'
& V3 != 'SFSU' & V3 != 'Berkeley' & V3 != 'UCDavis' & V1 != '110705' & V3 != 'WPI')
#Change New College Data Factors to Numeric Type and Scale
number2 <- new_college[-c(1),-c(1,2,3)]
college_num2 <- sapply(number2, function(x) as.numeric(as.character(x)))
college_scale2 <- scale(college_num2, center=TRUE, scale=TRUE)
#Dissimilarity New College Data
D2 <- dist(college_scale2)
#Complete Linkage Clusters
complete.newcollege.scale <- hclust(D2, method="complete")
#Average Linkage Clusters
average.newcollege.scale <- hclust(D2, method="average")
#Complete Linkage Dendrogram
plot(complete.newcollege.scale, xlab = "Clusters", main="New College Data Complete Linkage")
#Average Linkage Dendrogram
plot(average.newcollege.scale, xlab = "Clusters", main="New College Data Average Linkage")
#Single Linkage Clusters
single.newcollege.scale <- hclust(D2, method="single")
#Single Linkage Dendrogram
plot(single.newcollege.scale, xlab="Clusters", main="New College Data Single Linkage")
Cluster Assignments (k=3)
Cluster 1 (Stanford,CMU,MIT)
Cluster 2 (UDel,MTU,MSU,CSUEB,SJSU,CUNY,CalPoly,UCSD)
Cluster 3 (NYU,UMinn)
library(ggplot2)
library(ggfortify)
#Principal Component Analysis
#college_scale from part a)
#School Labels to Numeric College Data
row.names(college_scale) <- c("CalPoly","CSU-Chico","CSUEB","CMU","Columbia","CUNY","MIT","MTU","MSU","NYU"
,"Northwestern","SFSU","SJSU","Stanford","Berkeley","UCDavis","UCSD","UCSB","UDel","UMinn","WPI")
#Principal Components
pc <- prcomp(college_scale)
#Plot of 1st Two Component Scores
autoplot(pc, shape=FALSE, label.size=3, loadings=TRUE, loadings.label = TRUE, loadings.label.size = 4)
#K-means Clustering
#college_scale2 from part b)
row.names(college_scale2) <- c("CalPoly","CSUEB","CMU","CUNY","MIT","MTU","MSU","NYU","SJSU","Stanford"
,"UCSD","UDel","UMinn")
km <- kmeans(college_scale2, centers=3, algorithm="Lloyd")
fitted(km,method = 'classes')
## CalPoly CSUEB CMU CUNY MIT MTU MSU NYU
## 3 1 3 1 3 1 1 2
## SJSU Stanford UCSD UDel UMinn
## 1 3 3 2 2
Cluster Assignments (k = 3)
| Cluster | Elements | Method |
|---|---|---|
| 1 | (CalPoly,CMU,MIT,NYU,Stanford,UCSD,UDel,UMinn) | K-means |
| 2 | (CUNY) | K-means |
| 3 | (CSUEB,MTU,MSU,SJSU) | K-means |
| ——- | ——– | —— |
| 1 | (Stanford,CMU,MIT) | Single Linkage |
| 2 | (UDel,MTU,MSU,CSUEB,SJSU,CUNY,CalPoly,UCSD) | Single Linkage |
| 3 | (NYU,UMinn) | Single Linkage |
The Single linkage clusterings are easier to see how the colleges are grouped. With K-means, the clusterings seem more random; CUNY is all by itself in a cluster and UMinn is clustered with MIT and Stanford.
#K-medoids Clustering
#college_scale2 from part b)
pm <- pam(college_scale2,3)
pm$clustering
## CalPoly CSUEB CMU CUNY MIT MTU MSU NYU
## 1 2 3 2 3 2 2 1
## SJSU Stanford UCSD UDel UMinn
## 2 3 1 1 1
| Cluster | Elements | Method |
|---|---|---|
| 1 | (CalPoly,CMU,MIT,NYU,Stanford,UCSD,UDel,UMinn) | K-means |
| 2 | (CUNY) | K-means |
| 3 | (CSUEB,MTU,MSU,SJSU) | K-means |
| ——- | ——– | —— |
| 1 | (Stanford,CMU,MIT) | Single Linkage |
| 2 | (UDel,MTU,MSU,CSUEB,SJSU,CUNY,CalPoly,UCSD) | Single Linkage |
| 3 | (NYU,UMinn) | Single Linkage |
| ——- | ——– | —— |
| 1 | (CalPoly,NYU,UCSD,UDel,UMinn) | K-medoids |
| 2 | (CSUEB,CUNY,MTU,MSU,SJSU) | K-medoids |
| 3 | (CMU,MIT,Stanford) | K-medoids |
K-mediods seems like a slightly better clustering than single linkage. The clusterings are more stratified.
#SOURCE
#https://cran.r-project.org/web/packages/protoclust/protoclust.pdf
#Minimax Linkage of College Data
library(protoclust)
#Clusters for Scaled College Data
tree.college.scale <- protoclust(D)
#Dendrogram for Scaled College Data
plot(tree.college.scale, xlab="Clusters", main = "College Data Minimax Linkage")
#Import Music Data
music2 <- read.csv("music2.csv")
#Numerical Music Data
musicNum <- music2[,-c(1,2,3)]
#Standardized Music Data
music_scaled <- scale(musicNum, center=TRUE, scale=TRUE)
row.names(music_scaled) <- music2$Song
#Single Linkage (Type)
row.names(music_scaled) <- music2$Type
D <- dist(music_scaled)
single.music <- hclust(D,method="single")
plot(single.music, xlab="Clusters", main="Music Data Single Linkage by Type", cex=0.8)
#Complete Linkage (Type)
complete.music <- hclust(D,method="complete")
plot(complete.music, xlab="Clusters", main="Music Data Complete Linkage by Type", cex=0.8)
#Single Linkage (Artist)
row.names(music_scaled) <- music2$Artist
D <- dist(music_scaled)
single.artist.music <- hclust(D, method="single")
plot(single.artist.music, xlab="Clusters", main="Music Data Single Linkage by Artist", cex=0.8)
#Complete Linkage (Artist)
complete.artist.music <- hclust(D, method="complete")
plot(complete.artist.music, xlab="Clusters", main="Music Data Complete Linkage by Artist", cex=0.8)
The Complete Linkage method looks easier to decipher because the clusters are more uniform on the dedrograms when considering Type and Artist. More information is required to tell whether clustering by Artist or Type is better. It seems that Artist may be more telling because you can view the similarities in artists whereas we already know the similarities in genre.
## 20 Random Sample
#Import Voting Data
H114 <- read.fwf(file = "H114.ord", widths = c(3,5,2,2,8,3,2,11,10000), strip.white = T
, colClasses = "character")
#Remove First 5 votes
H114$V9 = substr(H114$V9, 6, nchar(H114$V9))
#Standardize the Votes
for(i in 1:nrow(H114))
{
votes <- H114$V9[i]
votesplit <- strsplit(votes, "")
vU = votesplit[[1]]
myList = ""
for(j in 1:length(vU))
{
k = vU[j]
if(k == 0)
{
myList = paste0(myList, "0")
}
else if (k == 1 || k ==2 || k ==3)
{
myList = paste0(myList, "1")
}
else if (k == 4 || k ==5 || k ==6)
{
myList = paste0(myList, "2")
}
else
{
myList = paste0(myList, "3")
}
}
H114[[paste("V10")]][i] = myList
}
for(i in 1:nrow(H114))
{
if(H114$V6[i] == "100")
{
H114$V11[i] = "blue"
}
else
{
H114$V11[i] = "red"
}
}
#Split the Votes into columns
vote <- lapply(H114$V10,function(x) strsplit(x,split = ""))
#List to Matrix
vote.max <- matrix(unlist(vote), ncol = 1322 ,byrow=TRUE)
row.names(vote.max) <- H114$V11
set.seed(43446553)
samp <- sample(1:1322, 20)
v.matrix <- vote.max[,samp]
v.num <- apply(v.matrix,1,as.numeric)
v.num <- t(v.num)
#PCA (20)
pca <- prcomp(v.num)
autoplot(pca, label.size=3, loadings=TRUE, loadings.size = 1, shape = FALSE, loadings.colour='green'
, col=rownames(v.num))
#100 Sample
set.seed(43443215)
samp2 <- sample(1:1322, 100)
v.matrix2 <- vote.max[,samp2]
v.num2 <- apply(v.matrix2,1,as.numeric)
v.num2 <- t(v.num2)
#PCA (100)
pca2 <- prcomp(v.num2)
autoplot(pca2, label.size=3, loadings=TRUE, loadings.size = 1, shape = FALSE, loadings.colour='green'
, col=rownames(v.num2))
#Variance of first 10 PCA (from 100 Sample)
pca.num <- sapply(pca2,function(x) as.numeric(x))
Var1 = pca.num$sdev/sum(pca.num$sdev)
Var1[1:10]
## [1] 0.10124204 0.05281862 0.02260990 0.02047705 0.02000581 0.01968867
## [7] 0.01858561 0.01805787 0.01689017 0.01625032
You should be able to predict a Congress person’s party affiliation from their voting record. The variance amounts are low from the sample of 100 data and it is shown in the PCA plot by having all democrats clustered on the left and republicans clustered on the right.
Comments
There are a lot of democrats clustered on the left, and republicans clustered on the right. You can see that some democrats may vote more republican-like, but those are outliers to the rest of the democrat data.