library(RColorBrewer)
library(kohonen)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.2
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

Instructions: For the exercise, you will need to build use an unsupervised classification method with the California housing dataset using a self-organizing map. Your answer should use the following variables (you can use code from a previous example to create the average number of rooms per house, and the ratio of bedrooms to rooms): * housing_median_age * population * median_income * median_house_value * avg_rooms * bedroom_ratio

Your answer should consist of the following: * Your R code * A plot showing that the loss function has stabilized * A plot of the counts per nodes * A plot of the codebook vectors * One or more heatmaps of the individual features on the SOM * A set of clusters constructed from the SOM nodes using the method in the lab (and a figure showing these on the SOM grid) * A table given the average values of each feature per cluster

housing <- read.csv("/Users/brenna/Downloads/housing.csv")
# 20640 obs, 10 var
5*sqrt(20640) #target: 718 nodes
## [1] 718.3314
# grid
som_grid <- somgrid(xdim = 34, ydim = 21, topo = "hexagonal")

# making ocean_proximity numeric
housing$ocean_proximity[housing$ocean_proximity == "INLAND"] <- 0
housing$ocean_proximity[housing$ocean_proximity == "<1H OCEAN"] <- 1
housing$ocean_proximity[housing$ocean_proximity == "NEAR BAY"] <- 2
housing$ocean_proximity[housing$ocean_proximity == "NEAR OCEAN"] <- 3
housing$ocean_proximity[housing$ocean_proximity == "ISLAND"] <- 4

housing$ocean_proximity <- as.numeric(housing$ocean_proximity)

# missingness
meanlog <- log(mean(housing$total_bedrooms, na.rm = TRUE))
sdlog <- log(sd(housing$total_bedrooms, na.rm = TRUE))
housing$total_bedrooms[is.na(housing$total_bedrooms)] <- rlnorm(1, meanlog = meanlog, sdlog = sdlog)

# scaling
housing_scale <- housing %>%
  mutate(total_rooms = scale(log(total_rooms)),
         total_bedrooms = scale(log(total_bedrooms)),
         population = scale(log(population)),
         households = scale(log(households)),
         median_income = scale(log(median_income)),
         housing_median_age = scale(housing_median_age)
  )

# training
housing_mat <- as.matrix(housing_scale[,c(1:8, 10)])
housing_som <- som(housing_mat, som_grid, rlen = 1000)

plot(housing_som, type = "changes")

#stabilized around 950
# plots

plot(housing_som, type = "counts", shape = "straight", border = "white")

# not much trend

plot(housing_som, type = "code", shape = "straight")

# a bit hard to interpret, with nine covariates

par(mfrow = c(3, 3))
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"population"],
     main = "Population", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"longitude"],
     main = "Longitude", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"latitude"],
     main = "Latitude", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"housing_median_age"],
     main = "housing_median_age", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"total_rooms"],
     main = "total_rooms", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"total_bedrooms"],
     main = "total_bedrooms", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"households"],
     main = "households", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"median_income"],
     main = "median_income", shape = "straight", border = "white")
plot(housing_som, type = "property", property = housing_som$codes[[1]][,"ocean_proximity"],
     main = "ocean_proximity", shape = "straight", border = "white", lwd = 0)

# cluster analysis

dist_codes <- dist(housing_som$codes[[1]])
som_cluster <- cutree(hclust(dist_codes), 6)


# non-contiguous for all values greater than 2

dist_grid <- unit.distances(som_grid)
dist_adj <- as.matrix(dist_codes) * dist_grid

cluster_adj <- hclust(as.dist(dist_adj), "ward.D2")
som_cluster_adj <- cutree(cluster_adj, 6)

mypal = brewer.pal(6, "Set2")
par(mfrow = c(1, 2))
plot(housing_som, type="mapping", bgcol = mypal[som_cluster], 
     main = "Non-Contiguous Clusters", pch = '.', shape = "straight") 
add.cluster.boundaries(housing_som, som_cluster)
plot(housing_som, type = "mapping", bgcol = mypal[som_cluster_adj],
     main = "Housing Clusters", pch = ".", shape = "straight")
add.cluster.boundaries(housing_som, som_cluster_adj)

node_assignments <- housing_som$unit.classif
housing$somcluster <- as.factor(som_cluster_adj[node_assignments])

# back-converting ocean proximity
housing$ocean_proximity_ch[housing$ocean_proximity == 0] <- "INLAND"
housing$ocean_proximity_ch[housing$ocean_proximity == 1] <- "<1H OCEAN"
housing$ocean_proximity_ch[housing$ocean_proximity == 2] <- "NEAR BAY"
housing$ocean_proximity_ch[housing$ocean_proximity == 3] <- "NEAR OCEAN"
housing$ocean_proximity_ch[housing$ocean_proximity == 4] <- "ISLAND"

ftable(housing$somcluster, housing$ocean_proximity_ch)
##    <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
##                                               
## 1       1515   1693      0     2178        773
## 2          1   2241      0        0          0
## 3       4200    167      0        0         59
## 4       3095   1041      1        0        924
## 5         70   1156      0      112         42
## 6        255    253      4        0        860
cluster.vals <- aggregate(housing[,c(1:9)], 
                         by = list(housing$somclust), mean)
cluster.vals
##   Group.1 longitude latitude housing_median_age total_rooms total_bedrooms
## 1       1 -122.0787 38.03167           29.53580    2921.642       605.3574
## 2       2 -119.3285 36.13440           29.34880    2216.950       493.6238
## 3       3 -118.2557 34.03332           36.80547    1820.698       477.0111
## 4       4 -117.9192 33.82428           19.93124    3938.041       811.5618
## 5       5 -121.0066 38.04217           25.72971    1131.672       246.1893
## 6       6 -117.5829 33.43792           32.16327    1375.229       298.9862
##   population households median_income median_house_value
## 1  1468.3229   543.8747      4.097283           226435.1
## 2  1281.9755   426.1017      2.833984           106052.8
## 3  1197.8552   382.1541      4.013930           234575.7
## 4  2066.8275   730.0041      4.138832           218689.7
## 5   500.3826   170.8210      3.548474           151747.2
## 6   766.6188   279.7063      3.420198           206041.2

Interpretation: In terms of ocean proximity, all island homes are located in Cluster 1. Most homes in Clusters 2, 4, and 6 are inland, while most homes in Cluster 3 are near bay, most in Cluster 5 are <1H Ocean.

Cluster 5 seems to contain homes in more upscale areas, with a centroid in Los Angeles. It has has relatively young, quite large homes, located in areas with higher population and housing concentrations and relatively high median income.

Cluster 1, with the most homes near or within an hour of the ocean, also contains the most expensive homes. Median income is quite high, population density is second-lowest, and homes are relatively small. These homes are probably located in desirable areas where the selling feature is not necessarily the expansiveness of the home.

Cluster 4 contains the homes in poorest areas, with the highest inland proportion. The rooms and bedrooms are approximately the median for the whole dataset — typical-sized homes, in other words. Home value is also approximately the median ($179K). Interestingly, these homes are also the newest.

Cluster 2 has the fewest members, with only 833 (Cluster 1 has the most members at 6,583). Cluster 2 seems to capture the unusual homes, which are 5.4-times smaller than homes in the next-largest cluster. Population is sparse, but the mean centroid isn’t far from that of Cluster 4 (both located in the Central Valley).

Cluster 6 contains relatively large homes in areas with slightly densely population areas. Median income is lower than average, home value is low, and homes are mostly inland.