# Clustering example
library(MASS)
library(ggplot2)
wages<-read.csv('http://inta.gatech.s3.amazonaws.com/wage2.csv')
subwages <-wages[,c("wage","IQ","KWW","educ","age","married")]
# Cluster based only on a subset of the data
cluster.results<-kmeans(subwages, 4, nstart=5)
# Calculate clusters, using 5 different starting samples
print(cluster.results)
## K-means clustering with 4 clusters of sizes 214, 52, 376, 293
##
## Cluster means:
## wage IQ KWW educ age married
## 1 1319.3645 106.32243 38.33178 14.24299 33.34112 0.9485981
## 2 2013.2692 111.05769 41.38462 15.53846 34.28846 0.9230769
## 3 920.2074 101.18617 35.73936 13.25798 33.30585 0.9069149
## 4 555.1092 95.98976 32.86007 12.80546 32.38567 0.8293515
##
## Clustering vector:
## [1] 3 3 3 4 4 1 4 3 1 3 3 3 3 1 2 3 1 3 3 4 1 1 3 2 3 1 1 3 1 4 3 1 2 4 2 3 1
## [38] 1 4 3 4 1 1 3 2 3 3 1 1 3 3 1 2 3 4 3 3 1 1 3 2 1 3 3 3 4 4 4 1 1 1 2 2 1
## [75] 3 3 3 3 4 3 3 4 1 2 4 3 3 3 4 4 2 3 3 1 3 4 1 1 3 3 1 3 3 4 1 3 1 3 1 3 3
## [112] 3 1 3 4 4 4 4 1 4 1 1 3 3 3 3 3 2 1 3 1 1 1 4 1 3 3 3 3 3 3 4 3 4 3 1 2 3
## [149] 3 1 1 4 4 1 4 3 2 2 3 1 1 3 3 4 1 1 2 1 1 4 3 3 3 3 3 4 1 4 4 4 4 3 1 3 1
## [186] 4 4 3 4 3 3 3 3 2 1 1 4 2 2 3 3 4 4 4 4 3 4 4 4 3 3 1 4 4 4 1 3 4 1 4 3 4
## [223] 3 1 3 1 3 1 4 4 1 1 1 1 3 3 4 4 2 2 1 2 3 3 4 1 3 3 3 3 1 1 3 1 3 3 3 4 4
## [260] 3 3 1 4 4 3 3 4 2 3 4 1 1 3 1 4 1 3 3 4 3 3 1 1 1 2 3 3 3 4 4 4 1 3 3 3 1
## [297] 4 3 3 3 3 4 3 3 2 1 4 3 3 3 3 4 3 3 1 3 1 3 3 1 3 4 3 2 3 4 3 4 1 3 3 4 1
## [334] 1 1 4 3 3 3 4 1 3 1 3 4 3 3 4 3 4 1 1 3 1 1 3 4 3 3 4 3 3 4 4 3 4 4 3 4 3
## [371] 1 4 3 3 3 1 3 4 3 1 3 3 1 4 4 4 3 3 3 3 3 3 3 4 3 4 3 1 3 3 1 4 4 4 4 4 4
## [408] 4 3 1 1 1 1 1 3 3 1 4 1 3 2 1 3 1 4 1 3 4 1 4 2 4 3 4 3 1 2 4 1 1 4 3 3 3
## [445] 4 1 1 1 4 4 2 3 4 3 3 3 4 3 2 4 3 1 4 3 3 4 4 3 3 1 4 4 1 1 1 3 1 1 3 4 1
## [482] 3 4 4 4 3 3 4 3 3 3 4 1 3 1 4 3 4 3 3 3 4 3 1 3 4 3 3 3 4 3 4 3 4 4 4 3 4
## [519] 1 1 1 4 3 1 3 4 3 4 1 3 3 4 4 4 4 1 4 4 3 4 1 1 4 4 4 2 3 1 3 1 1 1 3 4 3
## [556] 3 3 3 1 2 4 4 4 3 1 4 4 3 3 4 4 4 4 1 3 3 4 1 3 4 1 1 1 4 4 4 4 4 4 4 1 1
## [593] 1 4 1 3 3 3 3 1 4 1 2 3 3 4 3 4 3 2 3 3 4 4 3 2 3 1 4 4 1 4 4 3 1 1 1 1 4
## [630] 3 2 3 4 3 2 4 4 4 3 4 4 1 4 1 1 1 3 4 4 3 1 3 4 4 3 4 1 3 3 3 3 1 3 4 3 4
## [667] 1 3 4 3 3 4 4 4 4 4 4 4 4 3 4 4 3 3 3 1 1 4 3 4 3 1 3 3 3 3 4 1 3 4 2 4 2
## [704] 3 2 1 4 1 1 3 1 3 3 3 1 1 3 3 3 3 4 1 4 1 3 1 4 1 2 3 2 4 3 2 4 3 3 3 1 4
## [741] 1 4 3 3 1 1 2 1 3 4 2 2 3 3 4 4 3 1 1 3 2 3 2 1 3 3 3 2 1 3 3 3 1 3 1 3 1
## [778] 3 3 3 1 1 3 4 4 3 3 3 4 2 3 3 3 3 3 4 3 3 3 3 1 4 3 3 3 3 1 4 3 3 3 1 3 4
## [815] 4 4 3 2 1 4 4 1 3 3 4 1 4 3 3 1 3 3 4 3 4 3 1 4 4 4 4 4 4 1 4 3 4 4 3 4 4
## [852] 4 4 4 4 4 3 3 4 3 3 3 3 3 3 1 4 4 4 4 4 3 1 4 3 3 4 4 3 4 3 3 4 4 4 3 4 3
## [889] 1 4 4 3 4 3 3 4 4 3 3 3 1 4 4 4 3 4 3 4 1 4 4 1 3 3 3 4 4 4 4 1 4 3 4 4 1
## [926] 4 3 4 4 4 4 1 4 3 3
##
## Within cluster sum of squares by cluster:
## [1] 4471404 6153621 4231857 4165182
## (between_SS / total_SS = 87.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Look at cluster results
subwages$cluster<-as.factor(cluster.results$cluster)
# The 'cluster' element of the results is the cluster which each data
# point belongs
ggplot(data=subwages, aes(x=educ, y=wage)) + geom_point(aes(colour=cluster)) +
geom_point(data=data.frame(cluster.results$centers), colour='red', size=7)

ggplot(data=subwages, aes(x=educ, y=IQ)) + geom_point(aes(colour=cluster)) +
geom_point(data=data.frame(cluster.results$centers), colour='red', size=7)

# We can plot the data points with colors for the different clusters.
# The colors match up differently along different dimensions
# If you want to see these plots with the data points less on top of each other,
# 'geom_jitter' adds soem noise to each point to move them off of each other:
ggplot(data=subwages, aes(x=educ, y=wage)) + geom_jitter(aes(colour=cluster)) +
geom_point(data=data.frame(cluster.results$centers), colour='red', size=7)

# When you look at these graphs, it's clear that clustering is mostly happening
# on wage. Why is that? How could you fix it?
####################
# Standardize before doing clustering
zscore<-function(x){
out<-(x - mean(x))/sd(x)
return(out)
}
subwages2<-subwages
subwages2$cluster<-NULL
for (col in names(subwages2)){
subwages2[,col]<-zscore(subwages2[,col])
}
# This loop standardizes the data (subtracts mean and divides by standard deviation)
# so that the data is in z-scores
cluster.results2<-kmeans(subwages2, 4, nstart=5)
subwages$cluster2<-as.factor(cluster.results2$cluster)
ggplot(data=subwages, aes(x=educ, y=wage)) + geom_point(aes(colour=cluster2))

ggplot(data=subwages, aes(x=educ, y=IQ)) + geom_point(aes(colour=cluster2))

# Plot the original data using the new clusters - note there is more of an IQ-based
# pattern and less a pure wages one.
# Explore clusters in this data:
# 1) What happens if you use a different number of clusters?
# 2) How many clusters do you think there should be?
# 3) How good a fit can you get of wages using these clusters?
##########
# Principal Components Analysis
wages<-read.csv('http://inta.gatech.s3.amazonaws.com/wage2.csv')
subwages <-wages[,c("IQ","KWW","educ")]
# Create a subset of wages with just a few columns
principal.components <- prcomp(subwages, retx=T, center=T, scale=T)
# Do a principal components analysis on just the columns in subwages
print(principal.components$rotation) # The weights on each variable
## PC1 PC2 PC3
## IQ 0.5994431 -0.3116398 -0.73725745
## KWW 0.5413277 0.8363384 0.08661666
## educ 0.5896035 -0.4510196 0.67003657
summary(principal.components) # The proportion of variance explained
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.3714 0.7979 0.6948
## Proportion of Variance 0.6269 0.2122 0.1609
## Cumulative Proportion 0.6269 0.8391 1.0000
plot(principal.components) # Graphical depiction of proportion of variance

sw <- cbind(wages,data.frame(principal.components$x))
# Add the new principal components as columns to a new data frame, sw
# (New so we don't muck up going back and running other stuff on wages now)
one.pc <- lm(wage ~ PC1, data= sw)
all.variables <- lm(wage ~ IQ + KWW + educ, data= sw)
summary(one.pc)
##
## Call:
## lm(formula = wage ~ PC1, data = sw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -914.52 -241.02 -35.99 194.77 2215.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 957.945 12.100 79.17 <2e-16 ***
## PC1 119.264 8.828 13.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 370 on 933 degrees of freedom
## Multiple R-squared: 0.1636, Adjusted R-squared: 0.1627
## F-statistic: 182.5 on 1 and 933 DF, p-value: < 2.2e-16
summary(all.variables)
##
## Call:
## lm(formula = wage ~ IQ + KWW + educ, data = sw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -927.30 -240.12 -32.46 199.57 2250.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -231.6018 92.1449 -2.513 0.012123 *
## IQ 3.5675 0.9749 3.660 0.000267 ***
## KWW 10.6471 1.7859 5.962 3.54e-09 ***
## educ 33.2366 6.5997 5.036 5.71e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 931 degrees of freedom
## Multiple R-squared: 0.1657, Adjusted R-squared: 0.163
## F-statistic: 61.64 on 3 and 931 DF, p-value: < 2.2e-16
# How much better does having three variables do compared to the one principal
# component here? (Look at R^2)
# Try to replicate this using more variables, including perhaps age and marital
# Status. Would you see the same thing?
###############################################################################
# CHANGE DESCRIPTION
################################## CODE CHANGE STARTS #########################
summary(wages)
## wage hours IQ KWW
## Min. : 115.0 Min. :20.00 Min. : 50.0 Min. :12.00
## 1st Qu.: 669.0 1st Qu.:40.00 1st Qu.: 92.0 1st Qu.:31.00
## Median : 905.0 Median :40.00 Median :102.0 Median :37.00
## Mean : 957.9 Mean :43.93 Mean :101.3 Mean :35.74
## 3rd Qu.:1160.0 3rd Qu.:48.00 3rd Qu.:112.0 3rd Qu.:41.00
## Max. :3078.0 Max. :80.00 Max. :145.0 Max. :56.00
##
## educ exper tenure age
## Min. : 9.00 Min. : 1.00 Min. : 0.000 Min. :28.00
## 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.: 3.000 1st Qu.:30.00
## Median :12.00 Median :11.00 Median : 7.000 Median :33.00
## Mean :13.47 Mean :11.56 Mean : 7.234 Mean :33.08
## 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:11.000 3rd Qu.:36.00
## Max. :18.00 Max. :23.00 Max. :22.000 Max. :38.00
##
## married black south urban
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.893 Mean :0.1283 Mean :0.3412 Mean :0.7176
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## sibs brthord meduc feduc
## Min. : 0.000 Min. : 1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 8.00 1st Qu.: 8.00
## Median : 2.000 Median : 2.000 Median :12.00 Median :10.00
## Mean : 2.941 Mean : 2.277 Mean :10.68 Mean :10.22
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.:12.00 3rd Qu.:12.00
## Max. :14.000 Max. :10.000 Max. :18.00 Max. :18.00
## NA's :83 NA's :78 NA's :194
## lwage
## Min. :4.745
## 1st Qu.:6.506
## Median :6.808
## Mean :6.779
## 3rd Qu.:7.056
## Max. :8.032
##
cluster_wages <- wages[,c("wage", "married", "age", "exper", "educ")]
kmeans_cluster <- kmeans(cluster_wages, 5, nstart = 6)
print(kmeans_cluster)
## K-means clustering with 5 clusters of sizes 281, 126, 30, 274, 224
##
## Cluster means:
## wage married age exper educ
## 1 805.6192 0.9003559 33.19929 11.66548 13.23132
## 2 1496.5000 0.9365079 33.49206 11.22222 14.88095
## 3 2212.1667 0.9666667 35.10000 11.33333 15.66667
## 4 1093.7409 0.9270073 33.30657 12.10949 13.39051
## 5 512.0134 0.8080357 32.15179 10.99107 12.77232
##
## Clustering vector:
## [1] 1 1 1 5 5 2 5 4 4 4 1 1 1 2 2 4 2 1 1 5 4 2 1 3 4 2 4 1 2 5 4 2 3 1 2 1 2
## [38] 4 1 4 1 2 4 4 2 4 1 4 4 1 4 4 2 4 5 1 1 2 2 1 2 4 1 4 4 5 5 5 2 2 2 3 2 4
## [75] 1 4 4 4 1 4 4 1 4 2 1 1 1 1 5 1 3 4 1 4 1 5 4 4 1 1 2 1 1 5 2 4 2 4 4 1 4
## [112] 1 4 1 5 5 5 5 4 5 4 4 1 4 4 4 4 3 2 1 2 2 4 5 2 1 1 1 4 4 1 5 1 5 4 4 3 1
## [149] 4 2 2 5 1 4 5 4 3 3 1 4 4 4 4 1 2 2 3 4 4 5 1 4 1 4 4 5 4 1 5 5 5 1 4 4 4
## [186] 5 5 4 5 1 4 1 1 3 4 4 1 2 3 4 4 5 5 1 5 4 5 5 1 1 4 2 5 5 5 4 4 5 4 5 1 5
## [223] 1 4 4 4 1 2 5 5 4 2 2 2 1 1 5 5 2 3 4 3 1 4 5 2 1 1 4 4 4 4 1 2 4 1 1 5 5
## [260] 4 1 4 5 5 1 1 5 3 4 5 4 4 4 4 1 4 1 4 5 4 1 2 2 2 3 1 1 4 5 5 5 2 4 4 1 4
## [297] 5 4 1 4 4 5 1 4 3 4 1 1 1 1 1 5 1 4 4 4 2 4 1 4 1 1 4 3 1 5 4 1 4 4 1 5 4
## [334] 4 2 5 1 1 1 1 2 1 4 4 5 1 4 1 1 5 4 2 1 2 2 4 1 4 4 5 4 4 1 1 4 5 5 1 1 1
## [371] 2 1 1 1 4 4 1 5 1 4 1 1 4 5 5 5 1 1 4 1 1 4 4 5 4 5 4 4 4 1 4 5 1 5 5 1 5
## [408] 1 4 2 4 4 2 2 1 1 4 5 4 1 2 2 1 2 5 4 4 5 4 5 3 1 1 5 4 2 3 1 4 4 1 1 4 4
## [445] 5 4 2 4 5 5 2 4 5 1 1 1 5 1 3 5 1 4 5 4 4 5 5 4 4 2 5 5 2 2 2 4 4 2 1 5 4
## [482] 4 1 1 5 1 1 1 1 1 4 5 2 1 2 1 1 5 4 4 4 5 1 2 4 5 1 1 1 5 1 1 4 5 5 5 1 5
## [519] 4 4 4 5 4 2 1 5 1 5 2 1 1 1 5 5 5 2 5 5 4 1 2 2 5 5 1 2 4 4 1 4 2 2 1 1 4
## [556] 1 1 4 2 2 5 1 5 4 4 5 5 4 4 5 5 5 1 2 4 4 5 2 4 5 4 2 4 5 5 5 5 5 5 1 2 2
## [593] 4 5 4 1 4 4 1 2 5 2 2 1 1 5 4 5 1 3 1 1 5 5 4 3 4 4 5 5 2 5 5 1 2 2 2 2 1
## [630] 4 3 1 5 4 2 5 5 1 1 5 5 4 5 2 2 4 4 1 5 1 2 4 5 5 4 5 2 1 4 1 4 4 4 1 1 5
## [667] 4 4 5 1 1 1 5 5 5 5 5 1 5 4 5 5 4 4 1 4 2 1 1 1 4 2 1 1 4 1 5 4 4 1 2 5 3
## [704] 1 3 2 5 2 4 4 4 4 1 1 2 2 1 1 1 4 5 4 5 4 1 2 5 4 2 1 2 5 1 3 1 1 1 1 2 5
## [741] 2 5 1 4 4 2 2 2 4 1 2 2 4 4 5 1 1 4 4 1 3 4 3 2 1 1 1 2 4 1 4 4 4 4 4 1 4
## [778] 1 1 1 4 4 4 5 1 4 4 1 5 3 1 1 4 4 1 1 1 4 1 1 4 1 4 1 4 1 4 1 1 1 4 2 1 5
## [815] 1 1 1 3 4 1 1 2 4 1 5 4 5 1 4 4 1 1 5 4 5 1 2 5 5 5 5 5 5 4 1 4 5 1 1 5 5
## [852] 5 5 1 5 5 1 4 5 4 1 1 1 1 1 2 5 1 5 5 5 4 2 5 1 1 1 5 1 5 1 1 5 5 5 1 5 1
## [889] 4 5 5 1 5 4 1 5 5 1 4 4 2 5 1 5 1 5 1 5 2 5 5 2 1 4 4 5 5 5 5 2 5 4 5 5 2
## [926] 5 1 5 5 1 5 4 5 1 4
##
## Within cluster sum of squares by cluster:
## [1] 1907765 2673541 3277405 2754561 2279640
## (between_SS / total_SS = 91.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
cluster_wages$cluster <- as.factor(kmeans_cluster$cluster)
ggplot(data=cluster_wages, aes(x=educ, y=wage)) + geom_jitter(aes(colour=cluster)) +
geom_point(data=data.frame(kmeans_cluster$centers), colour='blue', size=7)

################################## CODE CHANGE ENDS ###########################
# For this change, I wanted to create a cluster to see how wages (y-axis) would be
# depicted using clusters, and the x-axis would be education.
# The variables I included were wage, marital status, age, experience, and
# education. 5 clusters were created, and 6 starting samples were initialized.
# Interpreting the output:
# The center clusters (big blue circles) show us what the
# average wage of an employee might be in a cluster. The five clusters (multiple
# colored points) show us different wages corresponding to varying education
# levels.
# For example, a person with 15 years of education could be expected on average
# to earn approximately $1500 (wage), and this is looking at the blue circles,
# which means the average of points within a cluster.
# Another example would be if someone is just starting out their career (the pink
# cluster), their average wage would be approximately $500.
# END OF CHANGE DESCRIPTION
###############################################################################