# Clustering example
library(MASS)
library(ggplot2)
wages<-read.csv('http://inta.gatech.s3.amazonaws.com/wage2.csv')
subwages <-wages[,c("wage","IQ","KWW","educ","age","married")]
# Cluster based only on a subset of the data
cluster.results<-kmeans(subwages, 4, nstart=5)
# Calculate clusters, using 5 different starting samples
print(cluster.results)
## K-means clustering with 4 clusters of sizes 214, 52, 376, 293
## 
## Cluster means:
##        wage        IQ      KWW     educ      age   married
## 1 1319.3645 106.32243 38.33178 14.24299 33.34112 0.9485981
## 2 2013.2692 111.05769 41.38462 15.53846 34.28846 0.9230769
## 3  920.2074 101.18617 35.73936 13.25798 33.30585 0.9069149
## 4  555.1092  95.98976 32.86007 12.80546 32.38567 0.8293515
## 
## Clustering vector:
##   [1] 3 3 3 4 4 1 4 3 1 3 3 3 3 1 2 3 1 3 3 4 1 1 3 2 3 1 1 3 1 4 3 1 2 4 2 3 1
##  [38] 1 4 3 4 1 1 3 2 3 3 1 1 3 3 1 2 3 4 3 3 1 1 3 2 1 3 3 3 4 4 4 1 1 1 2 2 1
##  [75] 3 3 3 3 4 3 3 4 1 2 4 3 3 3 4 4 2 3 3 1 3 4 1 1 3 3 1 3 3 4 1 3 1 3 1 3 3
## [112] 3 1 3 4 4 4 4 1 4 1 1 3 3 3 3 3 2 1 3 1 1 1 4 1 3 3 3 3 3 3 4 3 4 3 1 2 3
## [149] 3 1 1 4 4 1 4 3 2 2 3 1 1 3 3 4 1 1 2 1 1 4 3 3 3 3 3 4 1 4 4 4 4 3 1 3 1
## [186] 4 4 3 4 3 3 3 3 2 1 1 4 2 2 3 3 4 4 4 4 3 4 4 4 3 3 1 4 4 4 1 3 4 1 4 3 4
## [223] 3 1 3 1 3 1 4 4 1 1 1 1 3 3 4 4 2 2 1 2 3 3 4 1 3 3 3 3 1 1 3 1 3 3 3 4 4
## [260] 3 3 1 4 4 3 3 4 2 3 4 1 1 3 1 4 1 3 3 4 3 3 1 1 1 2 3 3 3 4 4 4 1 3 3 3 1
## [297] 4 3 3 3 3 4 3 3 2 1 4 3 3 3 3 4 3 3 1 3 1 3 3 1 3 4 3 2 3 4 3 4 1 3 3 4 1
## [334] 1 1 4 3 3 3 4 1 3 1 3 4 3 3 4 3 4 1 1 3 1 1 3 4 3 3 4 3 3 4 4 3 4 4 3 4 3
## [371] 1 4 3 3 3 1 3 4 3 1 3 3 1 4 4 4 3 3 3 3 3 3 3 4 3 4 3 1 3 3 1 4 4 4 4 4 4
## [408] 4 3 1 1 1 1 1 3 3 1 4 1 3 2 1 3 1 4 1 3 4 1 4 2 4 3 4 3 1 2 4 1 1 4 3 3 3
## [445] 4 1 1 1 4 4 2 3 4 3 3 3 4 3 2 4 3 1 4 3 3 4 4 3 3 1 4 4 1 1 1 3 1 1 3 4 1
## [482] 3 4 4 4 3 3 4 3 3 3 4 1 3 1 4 3 4 3 3 3 4 3 1 3 4 3 3 3 4 3 4 3 4 4 4 3 4
## [519] 1 1 1 4 3 1 3 4 3 4 1 3 3 4 4 4 4 1 4 4 3 4 1 1 4 4 4 2 3 1 3 1 1 1 3 4 3
## [556] 3 3 3 1 2 4 4 4 3 1 4 4 3 3 4 4 4 4 1 3 3 4 1 3 4 1 1 1 4 4 4 4 4 4 4 1 1
## [593] 1 4 1 3 3 3 3 1 4 1 2 3 3 4 3 4 3 2 3 3 4 4 3 2 3 1 4 4 1 4 4 3 1 1 1 1 4
## [630] 3 2 3 4 3 2 4 4 4 3 4 4 1 4 1 1 1 3 4 4 3 1 3 4 4 3 4 1 3 3 3 3 1 3 4 3 4
## [667] 1 3 4 3 3 4 4 4 4 4 4 4 4 3 4 4 3 3 3 1 1 4 3 4 3 1 3 3 3 3 4 1 3 4 2 4 2
## [704] 3 2 1 4 1 1 3 1 3 3 3 1 1 3 3 3 3 4 1 4 1 3 1 4 1 2 3 2 4 3 2 4 3 3 3 1 4
## [741] 1 4 3 3 1 1 2 1 3 4 2 2 3 3 4 4 3 1 1 3 2 3 2 1 3 3 3 2 1 3 3 3 1 3 1 3 1
## [778] 3 3 3 1 1 3 4 4 3 3 3 4 2 3 3 3 3 3 4 3 3 3 3 1 4 3 3 3 3 1 4 3 3 3 1 3 4
## [815] 4 4 3 2 1 4 4 1 3 3 4 1 4 3 3 1 3 3 4 3 4 3 1 4 4 4 4 4 4 1 4 3 4 4 3 4 4
## [852] 4 4 4 4 4 3 3 4 3 3 3 3 3 3 1 4 4 4 4 4 3 1 4 3 3 4 4 3 4 3 3 4 4 4 3 4 3
## [889] 1 4 4 3 4 3 3 4 4 3 3 3 1 4 4 4 3 4 3 4 1 4 4 1 3 3 3 4 4 4 4 1 4 3 4 4 1
## [926] 4 3 4 4 4 4 1 4 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 4471404 6153621 4231857 4165182
##  (between_SS / total_SS =  87.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Look at cluster results
subwages$cluster<-as.factor(cluster.results$cluster)
# The 'cluster' element of the results is the cluster which each data
# point belongs

ggplot(data=subwages, aes(x=educ, y=wage)) + geom_point(aes(colour=cluster))  + 
  geom_point(data=data.frame(cluster.results$centers), colour='red', size=7)

ggplot(data=subwages, aes(x=educ, y=IQ)) + geom_point(aes(colour=cluster)) + 
  geom_point(data=data.frame(cluster.results$centers), colour='red', size=7)

# We can plot the data points with colors for the different clusters.
# The colors match up differently along different dimensions

# If you want to see these plots with the data points less on top of each other,
# 'geom_jitter' adds soem noise to each point to move them off of each other:
ggplot(data=subwages, aes(x=educ, y=wage)) + geom_jitter(aes(colour=cluster)) + 
  geom_point(data=data.frame(cluster.results$centers), colour='red', size=7)

# When you look at these graphs, it's clear that clustering is mostly happening 
# on wage. Why is that? How could you fix it?

####################
# Standardize before doing clustering
zscore<-function(x){
  out<-(x - mean(x))/sd(x)
  return(out)
}
subwages2<-subwages
subwages2$cluster<-NULL
for (col in names(subwages2)){
  subwages2[,col]<-zscore(subwages2[,col])
}
# This loop standardizes the data (subtracts mean and divides by standard deviation)
# so that the data is in z-scores

cluster.results2<-kmeans(subwages2, 4, nstart=5)
subwages$cluster2<-as.factor(cluster.results2$cluster)
ggplot(data=subwages, aes(x=educ, y=wage)) + geom_point(aes(colour=cluster2))

ggplot(data=subwages, aes(x=educ, y=IQ)) + geom_point(aes(colour=cluster2))

# Plot the original data using the new clusters - note there is more of an IQ-based 
# pattern and less a pure wages one.

# Explore clusters in this data:
# 1) What happens if you use a different number of clusters?
# 2) How many clusters do you think there should be?
# 3) How good a fit can you get of wages using these clusters?

##########
# Principal Components Analysis
wages<-read.csv('http://inta.gatech.s3.amazonaws.com/wage2.csv')
subwages <-wages[,c("IQ","KWW","educ")]
# Create a subset of wages with just a few columns
principal.components <- prcomp(subwages, retx=T, center=T, scale=T)
# Do a principal components analysis on just the columns in subwages
print(principal.components$rotation) # The weights on each variable
##            PC1        PC2         PC3
## IQ   0.5994431 -0.3116398 -0.73725745
## KWW  0.5413277  0.8363384  0.08661666
## educ 0.5896035 -0.4510196  0.67003657
summary(principal.components) # The proportion of variance explained
## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.3714 0.7979 0.6948
## Proportion of Variance 0.6269 0.2122 0.1609
## Cumulative Proportion  0.6269 0.8391 1.0000
plot(principal.components) # Graphical depiction of proportion of variance

sw <- cbind(wages,data.frame(principal.components$x))
# Add the new principal components as columns to a new data frame, sw
# (New so we don't muck up going back and running other stuff on wages now)
one.pc <- lm(wage ~ PC1, data= sw)
all.variables <- lm(wage ~ IQ + KWW + educ, data= sw)
summary(one.pc)
## 
## Call:
## lm(formula = wage ~ PC1, data = sw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -914.52 -241.02  -35.99  194.77 2215.61 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  957.945     12.100   79.17   <2e-16 ***
## PC1          119.264      8.828   13.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 370 on 933 degrees of freedom
## Multiple R-squared:  0.1636, Adjusted R-squared:  0.1627 
## F-statistic: 182.5 on 1 and 933 DF,  p-value: < 2.2e-16
summary(all.variables)
## 
## Call:
## lm(formula = wage ~ IQ + KWW + educ, data = sw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -927.30 -240.12  -32.46  199.57 2250.92 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -231.6018    92.1449  -2.513 0.012123 *  
## IQ             3.5675     0.9749   3.660 0.000267 ***
## KWW           10.6471     1.7859   5.962 3.54e-09 ***
## educ          33.2366     6.5997   5.036 5.71e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 931 degrees of freedom
## Multiple R-squared:  0.1657, Adjusted R-squared:  0.163 
## F-statistic: 61.64 on 3 and 931 DF,  p-value: < 2.2e-16
# How much better does having three variables do compared to the one principal
# component here? (Look at R^2)

# Try to replicate this using more variables, including perhaps age and marital 
# Status. Would you see the same thing?


###############################################################################
# CHANGE DESCRIPTION

################################## CODE CHANGE STARTS #########################
summary(wages)
##       wage            hours             IQ             KWW       
##  Min.   : 115.0   Min.   :20.00   Min.   : 50.0   Min.   :12.00  
##  1st Qu.: 669.0   1st Qu.:40.00   1st Qu.: 92.0   1st Qu.:31.00  
##  Median : 905.0   Median :40.00   Median :102.0   Median :37.00  
##  Mean   : 957.9   Mean   :43.93   Mean   :101.3   Mean   :35.74  
##  3rd Qu.:1160.0   3rd Qu.:48.00   3rd Qu.:112.0   3rd Qu.:41.00  
##  Max.   :3078.0   Max.   :80.00   Max.   :145.0   Max.   :56.00  
##                                                                  
##       educ           exper           tenure            age       
##  Min.   : 9.00   Min.   : 1.00   Min.   : 0.000   Min.   :28.00  
##  1st Qu.:12.00   1st Qu.: 8.00   1st Qu.: 3.000   1st Qu.:30.00  
##  Median :12.00   Median :11.00   Median : 7.000   Median :33.00  
##  Mean   :13.47   Mean   :11.56   Mean   : 7.234   Mean   :33.08  
##  3rd Qu.:16.00   3rd Qu.:15.00   3rd Qu.:11.000   3rd Qu.:36.00  
##  Max.   :18.00   Max.   :23.00   Max.   :22.000   Max.   :38.00  
##                                                                  
##     married          black            south            urban       
##  Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.893   Mean   :0.1283   Mean   :0.3412   Mean   :0.7176  
##  3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                    
##       sibs           brthord           meduc           feduc      
##  Min.   : 0.000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 8.00   1st Qu.: 8.00  
##  Median : 2.000   Median : 2.000   Median :12.00   Median :10.00  
##  Mean   : 2.941   Mean   : 2.277   Mean   :10.68   Mean   :10.22  
##  3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.:12.00   3rd Qu.:12.00  
##  Max.   :14.000   Max.   :10.000   Max.   :18.00   Max.   :18.00  
##                   NA's   :83       NA's   :78      NA's   :194    
##      lwage      
##  Min.   :4.745  
##  1st Qu.:6.506  
##  Median :6.808  
##  Mean   :6.779  
##  3rd Qu.:7.056  
##  Max.   :8.032  
## 
cluster_wages <- wages[,c("wage", "married", "age", "exper", "educ")]

kmeans_cluster <- kmeans(cluster_wages, 5, nstart = 6)

print(kmeans_cluster)
## K-means clustering with 5 clusters of sizes 281, 126, 30, 274, 224
## 
## Cluster means:
##        wage   married      age    exper     educ
## 1  805.6192 0.9003559 33.19929 11.66548 13.23132
## 2 1496.5000 0.9365079 33.49206 11.22222 14.88095
## 3 2212.1667 0.9666667 35.10000 11.33333 15.66667
## 4 1093.7409 0.9270073 33.30657 12.10949 13.39051
## 5  512.0134 0.8080357 32.15179 10.99107 12.77232
## 
## Clustering vector:
##   [1] 1 1 1 5 5 2 5 4 4 4 1 1 1 2 2 4 2 1 1 5 4 2 1 3 4 2 4 1 2 5 4 2 3 1 2 1 2
##  [38] 4 1 4 1 2 4 4 2 4 1 4 4 1 4 4 2 4 5 1 1 2 2 1 2 4 1 4 4 5 5 5 2 2 2 3 2 4
##  [75] 1 4 4 4 1 4 4 1 4 2 1 1 1 1 5 1 3 4 1 4 1 5 4 4 1 1 2 1 1 5 2 4 2 4 4 1 4
## [112] 1 4 1 5 5 5 5 4 5 4 4 1 4 4 4 4 3 2 1 2 2 4 5 2 1 1 1 4 4 1 5 1 5 4 4 3 1
## [149] 4 2 2 5 1 4 5 4 3 3 1 4 4 4 4 1 2 2 3 4 4 5 1 4 1 4 4 5 4 1 5 5 5 1 4 4 4
## [186] 5 5 4 5 1 4 1 1 3 4 4 1 2 3 4 4 5 5 1 5 4 5 5 1 1 4 2 5 5 5 4 4 5 4 5 1 5
## [223] 1 4 4 4 1 2 5 5 4 2 2 2 1 1 5 5 2 3 4 3 1 4 5 2 1 1 4 4 4 4 1 2 4 1 1 5 5
## [260] 4 1 4 5 5 1 1 5 3 4 5 4 4 4 4 1 4 1 4 5 4 1 2 2 2 3 1 1 4 5 5 5 2 4 4 1 4
## [297] 5 4 1 4 4 5 1 4 3 4 1 1 1 1 1 5 1 4 4 4 2 4 1 4 1 1 4 3 1 5 4 1 4 4 1 5 4
## [334] 4 2 5 1 1 1 1 2 1 4 4 5 1 4 1 1 5 4 2 1 2 2 4 1 4 4 5 4 4 1 1 4 5 5 1 1 1
## [371] 2 1 1 1 4 4 1 5 1 4 1 1 4 5 5 5 1 1 4 1 1 4 4 5 4 5 4 4 4 1 4 5 1 5 5 1 5
## [408] 1 4 2 4 4 2 2 1 1 4 5 4 1 2 2 1 2 5 4 4 5 4 5 3 1 1 5 4 2 3 1 4 4 1 1 4 4
## [445] 5 4 2 4 5 5 2 4 5 1 1 1 5 1 3 5 1 4 5 4 4 5 5 4 4 2 5 5 2 2 2 4 4 2 1 5 4
## [482] 4 1 1 5 1 1 1 1 1 4 5 2 1 2 1 1 5 4 4 4 5 1 2 4 5 1 1 1 5 1 1 4 5 5 5 1 5
## [519] 4 4 4 5 4 2 1 5 1 5 2 1 1 1 5 5 5 2 5 5 4 1 2 2 5 5 1 2 4 4 1 4 2 2 1 1 4
## [556] 1 1 4 2 2 5 1 5 4 4 5 5 4 4 5 5 5 1 2 4 4 5 2 4 5 4 2 4 5 5 5 5 5 5 1 2 2
## [593] 4 5 4 1 4 4 1 2 5 2 2 1 1 5 4 5 1 3 1 1 5 5 4 3 4 4 5 5 2 5 5 1 2 2 2 2 1
## [630] 4 3 1 5 4 2 5 5 1 1 5 5 4 5 2 2 4 4 1 5 1 2 4 5 5 4 5 2 1 4 1 4 4 4 1 1 5
## [667] 4 4 5 1 1 1 5 5 5 5 5 1 5 4 5 5 4 4 1 4 2 1 1 1 4 2 1 1 4 1 5 4 4 1 2 5 3
## [704] 1 3 2 5 2 4 4 4 4 1 1 2 2 1 1 1 4 5 4 5 4 1 2 5 4 2 1 2 5 1 3 1 1 1 1 2 5
## [741] 2 5 1 4 4 2 2 2 4 1 2 2 4 4 5 1 1 4 4 1 3 4 3 2 1 1 1 2 4 1 4 4 4 4 4 1 4
## [778] 1 1 1 4 4 4 5 1 4 4 1 5 3 1 1 4 4 1 1 1 4 1 1 4 1 4 1 4 1 4 1 1 1 4 2 1 5
## [815] 1 1 1 3 4 1 1 2 4 1 5 4 5 1 4 4 1 1 5 4 5 1 2 5 5 5 5 5 5 4 1 4 5 1 1 5 5
## [852] 5 5 1 5 5 1 4 5 4 1 1 1 1 1 2 5 1 5 5 5 4 2 5 1 1 1 5 1 5 1 1 5 5 5 1 5 1
## [889] 4 5 5 1 5 4 1 5 5 1 4 4 2 5 1 5 1 5 1 5 2 5 5 2 1 4 4 5 5 5 5 2 5 4 5 5 2
## [926] 5 1 5 5 1 5 4 5 1 4
## 
## Within cluster sum of squares by cluster:
## [1] 1907765 2673541 3277405 2754561 2279640
##  (between_SS / total_SS =  91.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
cluster_wages$cluster <- as.factor(kmeans_cluster$cluster)

ggplot(data=cluster_wages, aes(x=educ, y=wage)) + geom_jitter(aes(colour=cluster)) +
  geom_point(data=data.frame(kmeans_cluster$centers), colour='blue', size=7)

################################## CODE CHANGE ENDS ###########################

# For this change, I wanted to create a cluster to see how wages (y-axis) would be 
# depicted using clusters, and the x-axis would be education. 

# The variables I included were wage, marital status, age, experience, and
# education. 5 clusters were created, and 6 starting samples were initialized.

# Interpreting the output:
# The center clusters (big blue circles) show us what the 
# average wage of an employee might be in a cluster. The five clusters (multiple
# colored points) show us different wages corresponding to varying education 
# levels.

# For example, a person with 15 years of education could be expected on average
# to earn approximately $1500 (wage), and this is looking at the blue circles,
# which means the average of points within a cluster. 

# Another example would be if someone is just starting out their career (the pink 
# cluster), their average wage would be approximately $500. 

# END OF CHANGE DESCRIPTION
###############################################################################