library(tokenizers)
library(text2vec)
library(Matrix)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggrepel)
library(broom)
library(stringr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
## 
##     expand
library(readr)
phm_collection <- read_tsv("phm_collection.txt") 
names(phm_collection) <- names(phm_collection) %>% 
  str_to_lower() %>% 
  str_replace_all("\\ ", "_") %>% 
  str_replace("description.", "description")

I am importing a CSV of a dataset that contains over 90 thousand observations. This dataset was created by the Powerhouse Museum of Australia and includes the metadata for objects in the museum’s collections. The Powerhouse Museum is one of three institutions that make up the Museum of Applied Arts and Sciences (the Sydney Observatory and Discovery Centre are the other institutions). The Museum of Applied Arts and Sciences (MAAS) is, according to their About page, “Australia’s conetmporary museum for excellence and innovation in applied arts and sciences.” MAAS was founded in 1879, and they are “uniquely placed to demonstrate how technology, engineering, science and design impact Australia and the world.” The collection of the MAAS includes items relevant to history, science, technology, design, industry, decorative arts, music, transportation, and space exploration. In addition, the MAAS houses items that reflect the culture, history, and lifestyle of Australia. The dataset they published was compiled in March of 2012, so it might not reflect the current holdings of the museum.

Part One - Exploring the Data

glimpse(phm_collection)
## Observations: 90,079
## Variables: 16
## $ record_id               (int) 7801, 7793, 7773, 7763, 7753, 7751, 77...
## $ object_title            (chr) "19142 Meteorite (cast), Weight 12 lbs...
## $ registration_number     (chr) "19142", "19131", "19110", "19101", "1...
## $ description             (chr) "Meteorite (cast), Weight 12 lbs; 3.84...
## $ marks                   (chr) NA, NA, NA, "Text on the tag reads '19...
## $ production_date         (chr) NA, NA, NA, "1888", "1888", "1888", NA...
## $ provenance_(production) (chr) NA, NA, NA, "Made: Minter, Michael; Mu...
## $ provenance_(history)    (chr) NA, NA, NA, NA, NA, NA, "Used: unknown...
## $ categories              (chr) "Meteorites|Models|Models|Mineral Samp...
## $ persistent_link         (chr) "http://www.powerhousemuseum.com/colle...
## $ height                  (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ width                   (chr) NA, NA, NA, "100 mm", NA, "190 mm", "1...
## $ depth                   (chr) NA, NA, NA, "25 mm", NA, "85 mm", NA, ...
## $ diameter                (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ weight                  (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ license_info            (chr) "This text content licensed under<a hr...

Does each item in the dataset have a description?

phm_collection %>% count(description)
## Source: local data frame [83,626 x 2]
## 
##                                                                    description
##                                                                          (chr)
## 1  ... Patent 'Janus' weather-tight fastening for preventing wind & rain from 
## 2  ...Maroon fur felt hat with flat dented crown and flat rim with band of gro
## 3  ...Navy blue silk dress with pleated bodice with V neckline, set in cummerb
## 4  .32 Calibre sub-machine gun with moulded plastic grip under barrel. Numbere
## 5  .38 Smith and Wesson revolver. Body trade marked, Made in U.S.A., A/F and n
## 6  .44 magnum calibre liberty revolver, West Germany, serial No. D0494. Black 
## 7  .45 Service Revolver, 6 chamber, made by Samuel Buckley & Co., Birmingham. 
## 8  .Model, five furrow stump jump disc plough, 'Sundercut', made by H.V. McKay
## 9                  '3 in 1' Television, radio & record player, Precedent, 1960
## 10 'Aeolian Orchestrelle' player reed organ, domestic model, either O or S, Th
## ..                                                                         ...
## Variables not shown: n (int)

How many objects described in the dataset have marks?

phm_collection %>% count(marks)
## Source: local data frame [16,825 x 2]
## 
##                                                                          marks
##                                                                          (chr)
## 1                                 ^ [broad arrow] 9CD/ 1065241 Z4200 S/N: L145
## 2  - Donor's fabric tag sewn in to one sock - "Name: B11" - Ribbon tags sewn i
## 3  - White print "Modaltex style 602", "it's the tops", "9 1/2 (M)" on welting
## 4  -1 Embroidered makers label sewn at inside right breast of jacket, label ha
## 5  -1 Impressed 'WEDGWOOD'; incised, "c / x " and indecipherable mark -2 Impre
## 6  -1 No marks. -2 The hat from Assam has a paper label placed inside hat. Pri
## 7  -1 Paper label slipped inside rim. Printed in ink 'A hand made weed hat, / 
## 8  -1 Paper label stuck to inside rim. Printed in ink 'Philippines'. -2 Donor'
## 9  -1 wallet embossed inside "NOTES / CARDS" Wallet embossed in gold on front 
## 10 -1:2. Fabric label 'Henriette Lamotte/SYDNEY'. -3. Fabric label 'SYDNEY/Hen
## ..                                                                         ...
## Variables not shown: n (int)

How many unique production dates are represented in the dataset?

phm_collection %>% count(production_date)
## Source: local data frame [2,996 x 2]
## 
##    production_date     n
##              (chr) (int)
## 1           - 1800     1
## 2           - 1881     1
## 3           - 1883     2
## 4           - 1884     7
## 5           - 1885     2
## 6           - 1887     1
## 7           - 1888     1
## 8           - 1892     2
## 9           - 1895     2
## 10          - 1903     1
## ..             ...   ...

How many categories of items are represented in the dataset?

phm_collection %>% count(categories)
## Source: local data frame [14,747 x 2]
## 
##                                                                     categories
##                                                                          (chr)
## 1                                                                       120 mm
## 2                                                                       185 mm
## 3                       35mm cameras|Camera accessories|Photographic Equipment
## 4  35mm cameras|Camera stands|Audio and Visual Equipment|Photographic Equipmen
## 5                          35mm cameras|Folding cameras|Photographic Equipment
## 6                            35mm cameras|Movie cameras|Photographic Equipment
## 7                                          35mm cameras|Photographic Equipment
## 8                        35mm cameras|Roll film cameras|Photographic Equipment
## 9  35mm cameras|Roll film cameras|Single lens reflex cameras|Photographic Equi
## 10          35mm cameras|Single lens reflex cameras|Audio and Visual Equipment
## ..                                                                         ...
## Variables not shown: n (int)

Part Two - Textual Analysis

I want to take a closer look at the description of the various items contained in the dataset. The tokenize_words function tokenizes the words in the description column of the Powerhouse Museum dataset.

tokens <- tokenize_words(phm_collection$description)

Next, I create a vocabulary of unique terms found in the tokens variable. Then I prune the vocabulary with the prune_vocabulary function, which discards frequent and very infrequent terms. Following that, I create a corpus from the it and vectorizer vectors. Finally, a term co-occurrence matrix is created from the corpus.

it <- itoken(tokens)
vocab <- create_vocabulary(it)
vocab_pruned <- prune_vocabulary(vocab, term_count_min = 5,
                                 doc_proportion_max = 0.9)
vectorizer <- vocab_vectorizer(vocab_pruned, grow_dtm = TRUE, 
                               skip_grams_window = 10L)
it <- itoken(tokens)
corpus <- create_corpus(it, vectorizer)
tcm <- get_tcm(corpus)

This reveals the sparseness of the matrix.

nnzero(tcm) / length(tcm)
## [1] 0.006961058

Using the GloVe Word Embeddings library–the text2vec package I loaded earlier–I create a term co-occurence matrix that includes no more than ten co-occurrences, and I create 20 iterations, or epochs.

RcppParallel::setThreadOptions(numThreads = 8)

glove_fit <- glove(tcm, word_vectors_size = 100, x_max = 10, num_iters = 20)
## 2016-05-10 18:11:42 - epoch 1, expected cost 0.1698
## 2016-05-10 18:12:05 - epoch 2, expected cost 0.1069
## 2016-05-10 18:12:30 - epoch 3, expected cost 0.0905
## 2016-05-10 18:12:56 - epoch 4, expected cost 0.0811
## 2016-05-10 18:13:20 - epoch 5, expected cost 0.0746
## 2016-05-10 18:13:43 - epoch 6, expected cost 0.0697
## 2016-05-10 18:14:08 - epoch 7, expected cost 0.0658
## 2016-05-10 18:14:32 - epoch 8, expected cost 0.0627
## 2016-05-10 18:14:55 - epoch 9, expected cost 0.0601
## 2016-05-10 18:15:18 - epoch 10, expected cost 0.0579
## 2016-05-10 18:15:42 - epoch 11, expected cost 0.0560
## 2016-05-10 18:16:05 - epoch 12, expected cost 0.0543
## 2016-05-10 18:16:28 - epoch 13, expected cost 0.0528
## 2016-05-10 18:16:52 - epoch 14, expected cost 0.0515
## 2016-05-10 18:17:15 - epoch 15, expected cost 0.0503
## 2016-05-10 18:17:38 - epoch 16, expected cost 0.0493
## 2016-05-10 18:18:04 - epoch 17, expected cost 0.0483
## 2016-05-10 18:18:32 - epoch 18, expected cost 0.0474
## 2016-05-10 18:18:58 - epoch 19, expected cost 0.0466
## 2016-05-10 18:19:25 - epoch 20, expected cost 0.0458
word_vectors <- glove_fit$word_vectors[[1]] + glove_fit$word_vectors[[2]]
rownames(word_vectors) <- rownames(tcm)
word_vectors_norm <- sqrt(rowSums(word_vectors ^ 2))

Then, using the trigonometric function of cosine, I create functions that will determine words that are close to specific words (which I will specify later) and similarities. Cosine similarity, according to Wikipedia, is “a measure of similarity between two vectors of an inner product space that measures the cosine of the angle between them.”

word_vec <- function(word) {
  word_vectors[word, , drop = FALSE]
}

closest_to <- function(word_vec, n = 10) {
  cos_dist <- text2vec:::cosine(word_vec, word_vectors, word_vectors_norm)
  head(sort(cos_dist[1, ], decreasing = TRUE), n)
}

similarities <- function(word_vec) {
  cos_dist <- text2vec:::cosine(word_vec, word_vectors, word_vectors_norm)
  cos_dist %>% t() %>% tidy() %>% rename(word = .rownames)
}

I then create word vectors that examine the proximity of words that are closest to words I have specified. In this case, I am asking to determine the words closest to wool, specimen, replica, and collection.

word_vec("wool") %>% closest_to()
##        wool    specimen        stud         ewe        bred         ram 
##   1.0000000   0.7054008   0.6840878   0.6718078   0.6502491   0.6260729 
##      sample        1896 interactive         was 
##   0.5792103   0.5499659   0.5275464   0.5187954
word_vec("specimen") %>% closest_to()
##  specimen      wool       ewe      stud       ram      bred specimens 
## 1.0000000 0.7054008 0.6802257 0.6745069 0.6135635 0.6055073 0.5883966 
##      1896    timber   station 
## 0.5146197 0.5142583 0.5095527
word_vec("replica") %>% closest_to()
##   replica   ancient    greece   plaster        ce      coin     greek 
## 1.0000000 0.5916847 0.5380461 0.5334335 0.5325236 0.4758436 0.4531475 
##  original        ci         i 
## 0.4307530 0.4195715 0.4184604
word_vec("collection") %>% closest_to()
##               collection www.powerhousemuseum.com                     http 
##                1.0000000                0.8592429                0.8549174 
##                 database                      irn                       sa 
##                0.8357085                0.8144820                0.6980424 
##               sa_content                       cc                      _cc 
##                0.6858084                0.6835054                0.6761848 
##                  samples 
##                0.6508073
(word_vec("wool") - word_vec("specimen")) %>% closest_to()
##     rayon     blend    cotton polyester    womens      wool    jacket 
## 0.5065882 0.4995136 0.4569539 0.4500444 0.4402923 0.4331987 0.4307646 
##     crepe   viscose  trousers 
## 0.4296762 0.4190238 0.4170890

Next, I create a vector that combines specimen, exhibition, replica, and model. I chose those four words because they are fairly generic and are frequently found in descriptions of items belonging to any museum’s collection. After that, I create a vector of interesting words. These are words that I frequently saw when skimming the description column of the dataset, and I also included some words that I saw when I ran the code block asking for words similar to wool, specimen, collection, and replica. Finally, I plot the results I find when analyzing the similarity or dissimilarity of those words.

test_words <- c("specimen", "exhibition", "replica", "model")
word_sim <- word_vec(test_words) %>% similarities()
interesting_words <- c("wool", "wood", "plough",
                       "ceramics", "earthenware", "casts", "figure", "marble", "furnace",
                       "stones", "dress", "shoes", "badge",
                       "dish", "vase", "jug", "tureen", "chair",
                       "quilt", "cinema", "film", "australia", "handbag",
                       "clothing", "photograph", "mural", "ivory", "casts", "stud", "ewe", 
                       "bred", "timber", "plaster", "fleece", "coin", "ancient", "modern", "yarn",
                       "blouse", "blend")

NB: Instead of using geom_text or geom_point to display the words, I chose geom_text_repel because it made the words much more readable than either of the other aesthetic options.

word_sim %>%
  filter(word %in% interesting_words) %>%
ggplot(aes(x = specimen, y = exhibition, label = word)) +
  geom_text_repel() + 
  theme_bw() +
  lims(x = c(-1, 1), y = c(-1, 1)) +
  labs(title = "Relationship of words to 'specimen' and 'exhibition'")

word_sim %>%
  filter(word %in% interesting_words) %>%
ggplot(aes(x = replica, y = model, label = word)) +
  geom_text_repel() + 
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Relationship of words to 'replica' and 'model'")

filter_interesting <- function(df, threshold = 0.33) {
  filter(df, abs(df[[2]]) >= threshold |
             abs(df[[3]]) >= threshold |
             df[[1]] %in% names(df)[2:3])
}
word_sim %>%
  select(word, specimen, exhibition) %>%
  filter_interesting(threshold = 0.38) %>%
  ggplot(aes(x = specimen, y = exhibition, label = word)) +
  geom_rect(xmin = -0.5, xmax = 0.5, ymin = -0.5, ymax = 0.5,
            fill = "lightgray", alpha = 0.1) +
  geom_point() +
  geom_text_repel() +
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Words unusually related to 'specimen' and 'exhibition'")