library(RColorBrewer)
library(kohonen)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.2
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
Instructions: For the exercise, you will need to build use an unsupervised classification method with the California housing dataset using a self-organizing map. Your answer should use the following variables (you can use code from a previous example to create the average number of rooms per house, and the ratio of bedrooms to rooms): * housing_median_age * population * median_income * median_house_value * avg_rooms * bedroom_ratio
Your answer should consist of the following: * Your R code * A plot showing that the loss function has stabilized * A plot of the counts per nodes * A plot of the codebook vectors * One or more heatmaps of the individual features on the SOM * A set of clusters constructed from the SOM nodes using the method in the lab (and a figure showing these on the SOM grid) * A table given the average values of each feature per cluster
housing <- read.csv("/Users/brenna/Downloads/housing.csv")
# 20640 obs, 10 var
5*sqrt(20640) #target: 718 nodes
## [1] 718.3314
# grid
som_grid <- somgrid(xdim = 34, ydim = 21, topo = "hexagonal")
# making ocean_proximity numeric
housing$ocean_proximity[housing$ocean_proximity == "INLAND"] <- 0
housing$ocean_proximity[housing$ocean_proximity == "<1H OCEAN"] <- 1
housing$ocean_proximity[housing$ocean_proximity == "NEAR BAY"] <- 2
housing$ocean_proximity[housing$ocean_proximity == "NEAR OCEAN"] <- 3
housing$ocean_proximity[housing$ocean_proximity == "ISLAND"] <- 4
housing$ocean_proximity <- as.numeric(housing$ocean_proximity)
# missingness
meanlog <- log(mean(housing$total_bedrooms, na.rm = TRUE))
sdlog <- log(sd(housing$total_bedrooms, na.rm = TRUE))
housing$total_bedrooms[is.na(housing$total_bedrooms)] <- rlnorm(1, meanlog = meanlog, sdlog = sdlog)
# scaling
housing_scale <- housing %>%
mutate(total_rooms = scale(log(total_rooms)),
total_bedrooms = scale(log(total_bedrooms)),
population = scale(log(population)),
households = scale(log(households)),
median_income = scale(log(median_income)),
housing_median_age = scale(housing_median_age)
)
# training
housing_mat <- as.matrix(housing_scale[,c(1:8, 10)])
housing_som <- som(housing_mat, som_grid, rlen = 1000)
plot(housing_som, type = "changes")
#stabilized around 950
# plots
plot(housing_som, type = "counts", shape = "straight", border = "white")
# not much trend
plot(housing_som, type = "code", shape = "straight")
# a bit hard to interpret, with nine covariates
par(mfrow = c(3, 3))
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"population"],
main = "Population", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"longitude"],
main = "Longitude", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"latitude"],
main = "Latitude", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"housing_median_age"],
main = "housing_median_age", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"total_rooms"],
main = "total_rooms", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"total_bedrooms"],
main = "total_bedrooms", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"households"],
main = "households", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"median_income"],
main = "median_income", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"ocean_proximity"],
main = "ocean_proximity", shape = "straight", border = "white", lwd = 0)
# cluster analysis
dist_codes <- dist(housing_som$codes[[1]])
som_cluster <- cutree(hclust(dist_codes), 6)
# non-contiguous for all values greater than 2
dist_grid <- unit.distances(som_grid)
dist_adj <- as.matrix(dist_codes) * dist_grid
cluster_adj <- hclust(as.dist(dist_adj), "ward.D2")
som_cluster_adj <- cutree(cluster_adj, 6)
mypal = brewer.pal(6, "Set2")
par(mfrow = c(1, 2))
plot(housing_som, type="mapping", bgcol = mypal[som_cluster],
main = "Non-Contiguous Clusters", pch = '.', shape = "straight")
add.cluster.boundaries(housing_som, som_cluster)
plot(housing_som, type = "mapping", bgcol = mypal[som_cluster_adj],
main = "Housing Clusters", pch = ".", shape = "straight")
add.cluster.boundaries(housing_som, som_cluster_adj)
node_assignments <- housing_som$unit.classif
housing$somcluster <- as.factor(som_cluster_adj[node_assignments])
# back-converting ocean proximity
housing$ocean_proximity_ch[housing$ocean_proximity == 0] <- "INLAND"
housing$ocean_proximity_ch[housing$ocean_proximity == 1] <- "<1H OCEAN"
housing$ocean_proximity_ch[housing$ocean_proximity == 2] <- "NEAR BAY"
housing$ocean_proximity_ch[housing$ocean_proximity == 3] <- "NEAR OCEAN"
housing$ocean_proximity_ch[housing$ocean_proximity == 4] <- "ISLAND"
ftable(housing$somcluster, housing$ocean_proximity_ch)
## <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
##
## 1 1515 1693 0 2178 773
## 2 1 2241 0 0 0
## 3 4200 167 0 0 59
## 4 3095 1041 1 0 924
## 5 70 1156 0 112 42
## 6 255 253 4 0 860
cluster.vals <- aggregate(housing[,c(1:9)],
by = list(housing$somclust), mean)
cluster.vals
## Group.1 longitude latitude housing_median_age total_rooms total_bedrooms
## 1 1 -122.0787 38.03167 29.53580 2921.642 605.3574
## 2 2 -119.3285 36.13440 29.34880 2216.950 493.6238
## 3 3 -118.2557 34.03332 36.80547 1820.698 477.0111
## 4 4 -117.9192 33.82428 19.93124 3938.041 811.5618
## 5 5 -121.0066 38.04217 25.72971 1131.672 246.1893
## 6 6 -117.5829 33.43792 32.16327 1375.229 298.9862
## population households median_income median_house_value
## 1 1468.3229 543.8747 4.097283 226435.1
## 2 1281.9755 426.1017 2.833984 106052.8
## 3 1197.8552 382.1541 4.013930 234575.7
## 4 2066.8275 730.0041 4.138832 218689.7
## 5 500.3826 170.8210 3.548474 151747.2
## 6 766.6188 279.7063 3.420198 206041.2
Interpretation: In terms of ocean proximity, all island homes are located in Cluster 1. Most homes in Clusters 2, 4, and 6 are inland, while most homes in Cluster 3 are near bay, most in Cluster 5 are <1H Ocean.
Cluster 5 seems to contain homes in more upscale areas, with a centroid in Los Angeles. It has has relatively young, quite large homes, located in areas with higher population and housing concentrations and relatively high median income.
Cluster 1, with the most homes near or within an hour of the ocean, also contains the most expensive homes. Median income is quite high, population density is second-lowest, and homes are relatively small. These homes are probably located in desirable areas where the selling feature is not necessarily the expansiveness of the home.
Cluster 4 contains the homes in poorest areas, with the highest inland proportion. The rooms and bedrooms are approximately the median for the whole dataset — typical-sized homes, in other words. Home value is also approximately the median ($179K). Interestingly, these homes are also the newest.
Cluster 2 has the fewest members, with only 833 (Cluster 1 has the most members at 6,583). Cluster 2 seems to capture the unusual homes, which are 5.4-times smaller than homes in the next-largest cluster. Population is sparse, but the mean centroid isn’t far from that of Cluster 4 (both located in the Central Valley).
Cluster 6 contains relatively large homes in areas with slightly densely population areas. Median income is lower than average, home value is low, and homes are mostly inland.