Load libraries

library(kohonen)
## Warning: package 'kohonen' was built under R version 4.1.2
library(tidyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggpubr)
## Loading required package: ggplot2
library(RColorBrewer)
library(countrycode)
## Warning: package 'countrycode' was built under R version 4.1.2
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(tmap)
## Registered S3 methods overwritten by 'stars':
##   method             from
##   st_bbox.SpatRaster sf  
##   st_crs.SpatRaster  sf

Read in Data

housing = read.csv("housing.csv")
head(housing)
##   longitude latitude housing_median_age total_rooms total_bedrooms population
## 1   -122.23    37.88                 41         880            129        322
## 2   -122.22    37.86                 21        7099           1106       2401
## 3   -122.24    37.85                 52        1467            190        496
## 4   -122.25    37.85                 52        1274            235        558
## 5   -122.25    37.85                 52        1627            280        565
## 6   -122.25    37.85                 52         919            213        413
##   households median_income median_house_value ocean_proximity
## 1        126        8.3252             452600        NEAR BAY
## 2       1138        8.3014             358500        NEAR BAY
## 3        177        7.2574             352100        NEAR BAY
## 4        219        5.6431             341300        NEAR BAY
## 5        259        3.8462             342200        NEAR BAY
## 6        193        4.0368             269700        NEAR BAY

Clean up data to only variables needed

housing <- housing %>% 
  filter(!is.na(total_bedrooms))

housing <- housing %>%
  mutate(avg_rooms = total_rooms / households,
         bedroom_ratio = total_bedrooms / total_rooms)
housing <- housing %>%
  select(-longitude, -latitude, -total_rooms, -total_bedrooms, 
         -ocean_proximity, -households)
names(housing)
## [1] "housing_median_age" "population"         "median_income"     
## [4] "median_house_value" "avg_rooms"          "bedroom_ratio"
housing_scale <- housing %>%
  mutate(
    housing_median_age = scale(log(housing_median_age)),
    population = scale(log(population)),
    median_income = scale(log(median_income)),
    median_house_value = scale(log(median_house_value)),
    avg_rooms = scale(log(avg_rooms)),
    bedroom_ratio = scale(log(bedroom_ratio))
  )

Train the SOM

som_grid = somgrid(xdim = 30, ydim = 20, topo = "hexagonal")
housing_mat = as.matrix(housing_scale)
housing_som = som(housing_mat, som_grid)

Plotting the change in the loss function

plot(housing_som, type = "changes")

Stabilize the loss function

housing_som = som(housing_mat, som_grid, rlen = 1000)
plot(housing_som, type = "changes")

A plot of the counts per nodes

plot(housing_som, type = "counts")

A plot of the codebook vectors

plot(housing_som, type = "code")

One or more heatmaps of the individual features on the SOM

plot(housing_som, type = "property", property = housing_som$codes[[1]][,"population"],
     main = "population")

plot(housing_som, type = "property", property = housing_som$codes[[1]][,"housing_median_age"],
     main = "housing_median_age")

A set of clusters constructed from the SOM nodes using the method in the lab (and a figure showing these on the SOM grid)

dist_codes = dist(housing_som$codes[[1]])
som_cluster = cutree(hclust(dist_codes), 6)
mypal = brewer.pal(6, "Set2")
plot(housing_som, type="mapping", bgcol = mypal[som_cluster], 
     main = "Clusters", pch = '.') 
add.cluster.boundaries(housing_som, som_cluster)

Make clusters contiguous

dist_grid = unit.distances(som_grid)
dist_adj = as.matrix(dist_codes) * dist_grid

clust_adj = hclust(as.dist(dist_adj), 'ward.D2')
som_cluster_adj = cutree(clust_adj, 6)

plot(housing_som, type="mapping", bgcol = mypal[som_cluster_adj], 
     main = "Clusters", pch = '.') 
add.cluster.boundaries(housing_som, som_cluster_adj)

A table given the average values of each feature per cluster

nodeByVariable <- housing_som$unit.classif
housing_scale$somclust <-
  as.factor(som_cluster_adj[nodeByVariable])

housing$somclust <- housing_scale$somclust
housing_clusters <- merge(som_cluster_adj, housing_scale)

cluster_vals <- aggregate(housing[,1:6],
                          by = list(housing_scale$somclust), mean)
cluster_vals
##   Group.1 housing_median_age population median_income median_house_value
## 1       1           29.59195  1306.1487      7.044211           362435.4
## 2       2           38.94354   863.0682      4.084544           251648.4
## 3       3           35.04350  1394.9215      2.309950           149567.6
## 4       4           17.53032  2009.6817      4.606481           202741.2
## 5       5           24.79307  1283.7228      2.899186           105242.1
## 6       6           24.23419  1524.2623      3.562486           254211.7
##   avg_rooms bedroom_ratio
## 1  6.955522     0.1505183
## 2  5.260730     0.2004679
## 3  4.148897     0.2688367
## 4  5.856763     0.1839476
## 5  6.456228     0.2033079
## 6  4.608714     0.2411684