library(tokenizers)
library(text2vec)
library(Matrix)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggrepel)
library(broom)
library(stringr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
library(readr)
phm_collection <- read_tsv("phm_collection.txt")
names(phm_collection) <- names(phm_collection) %>%
  str_to_lower() %>%
  str_replace_all("\\ ", "_") %>%
  str_replace("description.", "description")
I am importing a tab-separated file of a dataset that contains over 90,000 observations, and cleaning up its column names by lower-casing them and replacing spaces with underscores. This dataset was created by the Powerhouse Museum of Australia and includes the metadata for objects in the museum’s collections. The Powerhouse Museum is one of three institutions that make up the Museum of Applied Arts and Sciences (the Sydney Observatory and Discovery Centre are the other institutions). The Museum of Applied Arts and Sciences (MAAS) is, according to their About page, “Australia’s contemporary museum for excellence and innovation in applied arts and sciences.” MAAS was founded in 1879, and they are “uniquely placed to demonstrate how technology, engineering, science and design impact Australia and the world.” The collection of the MAAS includes items relevant to history, science, technology, design, industry, decorative arts, music, transportation, and space exploration. In addition, the MAAS houses items that reflect the culture, history, and lifestyle of Australia. The dataset they published was compiled in March of 2012, so it might not reflect the current holdings of the museum.
glimpse(phm_collection)
## Observations: 90,079
## Variables: 16
## $ record_id (int) 7801, 7793, 7773, 7763, 7753, 7751, 77...
## $ object_title (chr) "19142 Meteorite (cast), Weight 12 lbs...
## $ registration_number (chr) "19142", "19131", "19110", "19101", "1...
## $ description (chr) "Meteorite (cast), Weight 12 lbs; 3.84...
## $ marks (chr) NA, NA, NA, "Text on the tag reads '19...
## $ production_date (chr) NA, NA, NA, "1888", "1888", "1888", NA...
## $ provenance_(production) (chr) NA, NA, NA, "Made: Minter, Michael; Mu...
## $ provenance_(history) (chr) NA, NA, NA, NA, NA, NA, "Used: unknown...
## $ categories (chr) "Meteorites|Models|Models|Mineral Samp...
## $ persistent_link (chr) "http://www.powerhousemuseum.com/colle...
## $ height (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ width (chr) NA, NA, NA, "100 mm", NA, "190 mm", "1...
## $ depth (chr) NA, NA, NA, "25 mm", NA, "85 mm", NA, ...
## $ diameter (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ weight (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ license_info (chr) "This text content licensed under<a hr...
To get a feel for the free-text fields, I count the distinct values of several columns.
phm_collection %>% count(description)
## Source: local data frame [83,626 x 2]
##
## description
## (chr)
## 1 ... Patent 'Janus' weather-tight fastening for preventing wind & rain from
## 2 ...Maroon fur felt hat with flat dented crown and flat rim with band of gro
## 3 ...Navy blue silk dress with pleated bodice with V neckline, set in cummerb
## 4 .32 Calibre sub-machine gun with moulded plastic grip under barrel. Numbere
## 5 .38 Smith and Wesson revolver. Body trade marked, Made in U.S.A., A/F and n
## 6 .44 magnum calibre liberty revolver, West Germany, serial No. D0494. Black
## 7 .45 Service Revolver, 6 chamber, made by Samuel Buckley & Co., Birmingham.
## 8 .Model, five furrow stump jump disc plough, 'Sundercut', made by H.V. McKay
## 9 '3 in 1' Television, radio & record player, Precedent, 1960
## 10 'Aeolian Orchestrelle' player reed organ, domestic model, either O or S, Th
## .. ...
## Variables not shown: n (int)
phm_collection %>% count(marks)
## Source: local data frame [16,825 x 2]
##
## marks
## (chr)
## 1 ^ [broad arrow] 9CD/ 1065241 Z4200 S/N: L145
## 2 - Donor's fabric tag sewn in to one sock - "Name: B11" - Ribbon tags sewn i
## 3 - White print "Modaltex style 602", "it's the tops", "9 1/2 (M)" on welting
## 4 -1 Embroidered makers label sewn at inside right breast of jacket, label ha
## 5 -1 Impressed 'WEDGWOOD'; incised, "c / x " and indecipherable mark -2 Impre
## 6 -1 No marks. -2 The hat from Assam has a paper label placed inside hat. Pri
## 7 -1 Paper label slipped inside rim. Printed in ink 'A hand made weed hat, /
## 8 -1 Paper label stuck to inside rim. Printed in ink 'Philippines'. -2 Donor'
## 9 -1 wallet embossed inside "NOTES / CARDS" Wallet embossed in gold on front
## 10 -1:2. Fabric label 'Henriette Lamotte/SYDNEY'. -3. Fabric label 'SYDNEY/Hen
## .. ...
## Variables not shown: n (int)
phm_collection %>% count(production_date)
## Source: local data frame [2,996 x 2]
##
## production_date n
## (chr) (int)
## 1 - 1800 1
## 2 - 1881 1
## 3 - 1883 2
## 4 - 1884 7
## 5 - 1885 2
## 6 - 1887 1
## 7 - 1888 1
## 8 - 1892 2
## 9 - 1895 2
## 10 - 1903 1
## .. ... ...
phm_collection %>% count(categories)
## Source: local data frame [14,747 x 2]
##
## categories
## (chr)
## 1 120 mm
## 2 185 mm
## 3 35mm cameras|Camera accessories|Photographic Equipment
## 4 35mm cameras|Camera stands|Audio and Visual Equipment|Photographic Equipmen
## 5 35mm cameras|Folding cameras|Photographic Equipment
## 6 35mm cameras|Movie cameras|Photographic Equipment
## 7 35mm cameras|Photographic Equipment
## 8 35mm cameras|Roll film cameras|Photographic Equipment
## 9 35mm cameras|Roll film cameras|Single lens reflex cameras|Photographic Equi
## 10 35mm cameras|Single lens reflex cameras|Audio and Visual Equipment
## .. ...
## Variables not shown: n (int)
I want to take a closer look at the descriptions of the various items contained in the dataset. The tokenize_words function splits the text in the description column of the Powerhouse Museum dataset into individual lowercase word tokens.
tokens <- tokenize_words(phm_collection$description)
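As a quick check, I can look at the first few tokens from the first description. tokenize_words lowercases the text and strips punctuation, so each description becomes a plain character vector of words.
head(tokens[[1]])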
Next, I create a vocabulary of unique terms found in the tokens variable. Then I prune the vocabulary with the prune_vocabulary function, which discards very infrequent terms (appearing fewer than five times) as well as terms that occur in more than 90 percent of the documents. Following that, I create a corpus from the it iterator and the vectorizer; the vectorizer is set to grow a document-term matrix and to count co-occurrences within a skip-gram window of ten words. Finally, a term co-occurrence matrix is extracted from the corpus.
it <- itoken(tokens)
vocab <- create_vocabulary(it)
vocab_pruned <- prune_vocabulary(vocab, term_count_min = 5,
                                 doc_proportion_max = 0.9)
vectorizer <- vocab_vectorizer(vocab_pruned, grow_dtm = TRUE,
                               skip_grams_window = 10L)
# the iterator is consumed by create_vocabulary(), so re-create it
it <- itoken(tokens)
corpus <- create_corpus(it, vectorizer)
tcm <- get_tcm(corpus)
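Before measuring how sparse the matrix is, it helps to confirm its shape: it should be a square sparse matrix with one row and one column for each term kept after pruning.
dim(tcm)    # square: rows and columns are the pruned vocabulary terms
class(tcm)  # a sparse matrix class from the Matrix package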
This reveals the sparsity of the matrix: only about 0.7 percent of its entries are nonzero.
nnzero(tcm) / length(tcm)
## [1] 0.006961058
Using GloVe word embeddings (via the text2vec package I loaded earlier), I fit 100-dimensional word vectors on the term co-occurrence matrix. The x_max = 10 argument caps the weight given to very frequent co-occurrences in the GloVe cost function, and I run 20 iterations, or epochs.
RcppParallel::setThreadOptions(numThreads = 8)
glove_fit <- glove(tcm, word_vectors_size = 100, x_max = 10, num_iters = 20)
## 2016-05-10 18:11:42 - epoch 1, expected cost 0.1698
## 2016-05-10 18:12:05 - epoch 2, expected cost 0.1069
## 2016-05-10 18:12:30 - epoch 3, expected cost 0.0905
## 2016-05-10 18:12:56 - epoch 4, expected cost 0.0811
## 2016-05-10 18:13:20 - epoch 5, expected cost 0.0746
## 2016-05-10 18:13:43 - epoch 6, expected cost 0.0697
## 2016-05-10 18:14:08 - epoch 7, expected cost 0.0658
## 2016-05-10 18:14:32 - epoch 8, expected cost 0.0627
## 2016-05-10 18:14:55 - epoch 9, expected cost 0.0601
## 2016-05-10 18:15:18 - epoch 10, expected cost 0.0579
## 2016-05-10 18:15:42 - epoch 11, expected cost 0.0560
## 2016-05-10 18:16:05 - epoch 12, expected cost 0.0543
## 2016-05-10 18:16:28 - epoch 13, expected cost 0.0528
## 2016-05-10 18:16:52 - epoch 14, expected cost 0.0515
## 2016-05-10 18:17:15 - epoch 15, expected cost 0.0503
## 2016-05-10 18:17:38 - epoch 16, expected cost 0.0493
## 2016-05-10 18:18:04 - epoch 17, expected cost 0.0483
## 2016-05-10 18:18:32 - epoch 18, expected cost 0.0474
## 2016-05-10 18:18:58 - epoch 19, expected cost 0.0466
## 2016-05-10 18:19:25 - epoch 20, expected cost 0.0458
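For reference, the “expected cost” printed at each epoch is (up to scaling) the GloVe weighted least-squares objective from Pennington, Socher, and Manning (2014), which down-weights very common co-occurrences; this sketch follows the paper rather than the text2vec source:

$$J = \sum_{i,j} f(X_{ij}) \left( w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2, \qquad f(x) = \min\!\left( \left( x / x_{\max} \right)^{\alpha},\, 1 \right)$$

Here $X_{ij}$ is the co-occurrence count for a pair of words, $x_{\max}$ corresponds to the x_max = 10 argument above, and the paper uses $\alpha = 3/4$.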
# add the main and context vectors GloVe learns for each word
word_vectors <- glove_fit$word_vectors[[1]] + glove_fit$word_vectors[[2]]
rownames(word_vectors) <- rownames(tcm)
# L2 norm of each word vector, used for cosine similarity below
word_vectors_norm <- sqrt(rowSums(word_vectors ^ 2))
Then, using cosine similarity, I create helper functions that find the words closest to a given word (which I will specify later) and that compute similarities between one or more word vectors and every word in the vocabulary. Cosine similarity, according to Wikipedia, is “a measure of similarity between two vectors of an inner product space that measures the cosine of the angle between them.” A toy calculation follows the function definitions below.
# return the embedding for one or more words as a (row) matrix
word_vec <- function(word) {
  word_vectors[word, , drop = FALSE]
}
# the n vocabulary words with the highest cosine similarity to a vector
closest_to <- function(word_vec, n = 10) {
  cos_dist <- text2vec:::cosine(word_vec, word_vectors, word_vectors_norm)
  head(sort(cos_dist[1, ], decreasing = TRUE), n)
}
# cosine similarity of every vocabulary word to each input vector,
# returned as a tidy data frame with one row per word
similarities <- function(word_vec) {
  cos_dist <- text2vec:::cosine(word_vec, word_vectors, word_vectors_norm)
  cos_dist %>% t() %>% tidy() %>% rename(word = .rownames)
}
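To make the measure concrete, here is a toy calculation (illustrative only, not part of the analysis): the cosine of the angle between two vectors is their dot product divided by the product of their lengths.
a <- c(1, 2, 3)
b <- c(2, 4, 6)
sum(a * b) / (sqrt(sum(a ^ 2)) * sqrt(sum(b ^ 2)))
# equals 1, because b points in exactly the same direction as a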
I then use these functions to find the words closest to words I specify. In this case, I ask for the words closest to wool, specimen, replica, and collection.
word_vec("wool") %>% closest_to()
## wool specimen stud ewe bred ram
## 1.0000000 0.7054008 0.6840878 0.6718078 0.6502491 0.6260729
## sample 1896 interactive was
## 0.5792103 0.5499659 0.5275464 0.5187954
word_vec("specimen") %>% closest_to()
## specimen wool ewe stud ram bred specimens
## 1.0000000 0.7054008 0.6802257 0.6745069 0.6135635 0.6055073 0.5883966
## 1896 timber station
## 0.5146197 0.5142583 0.5095527
word_vec("replica") %>% closest_to()
## replica ancient greece plaster ce coin greek
## 1.0000000 0.5916847 0.5380461 0.5334335 0.5325236 0.4758436 0.4531475
## original ci i
## 0.4307530 0.4195715 0.4184604
word_vec("collection") %>% closest_to()
## collection www.powerhousemuseum.com http
## 1.0000000 0.8592429 0.8549174
## database irn sa
## 0.8357085 0.8144820 0.6980424
## sa_content cc _cc
## 0.6858084 0.6835054 0.6761848
## samples
## 0.6508073
(word_vec("wool") - word_vec("specimen")) %>% closest_to()
## rayon blend cotton polyester womens wool jacket
## 0.5065882 0.4995136 0.4569539 0.4500444 0.4402923 0.4331987 0.4307646
## crepe viscose trousers
## 0.4296762 0.4190238 0.4170890
Subtracting the specimen vector from the wool vector shifts the query away from specimen-like contexts, which is why the nearest words above are fabrics and garments rather than agricultural terms. Next, I create a vector that combines specimen, exhibition, replica, and model. I chose those four words because they are fairly generic and are frequently found in descriptions of items belonging to any museum’s collection. After that, I create a vector of interesting words. These are words that I frequently saw when skimming the description column of the dataset, and I also included some words that turned up when I asked for the words similar to wool, specimen, collection, and replica. Finally, I plot the similarity or dissimilarity of those words; a quick check of the shape of word_sim follows the word lists below.
test_words <- c("specimen", "exhibition", "replica", "model")
word_sim <- word_vec(test_words) %>% similarities()
interesting_words <- c("wool", "wood", "plough",
  "ceramics", "earthenware", "casts", "figure", "marble", "furnace",
  "stones", "dress", "shoes", "badge",
  "dish", "vase", "jug", "tureen", "chair",
  "quilt", "cinema", "film", "australia", "handbag",
  "clothing", "photograph", "mural", "ivory", "casts", "stud", "ewe",
  "bred", "timber", "plaster", "fleece", "coin", "ancient", "modern", "yarn",
  "blouse", "blend")
NB: Instead of using geom_text or geom_point to display the words, I chose geom_text_repel because it made the words much more readable than either of the other two geoms.
word_sim %>%
  filter(word %in% interesting_words) %>%
  ggplot(aes(x = specimen, y = exhibition, label = word)) +
  geom_text_repel() +
  theme_bw() +
  lims(x = c(-1, 1), y = c(-1, 1)) +
  labs(title = "Relationship of words to 'specimen' and 'exhibition'")
word_sim %>%
  filter(word %in% interesting_words) %>%
  ggplot(aes(x = replica, y = model, label = word)) +
  geom_text_repel() +
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Relationship of words to 'replica' and 'model'")
# keep words whose similarity to either axis word is unusually large in
# absolute value, plus the two axis words themselves
filter_interesting <- function(df, threshold = 0.33) {
  filter(df, abs(df[[2]]) >= threshold |
           abs(df[[3]]) >= threshold |
           df[[1]] %in% names(df)[2:3])
}
word_sim %>%
  select(word, specimen, exhibition) %>%
  filter_interesting(threshold = 0.38) %>%
  ggplot(aes(x = specimen, y = exhibition, label = word)) +
  geom_rect(xmin = -0.5, xmax = 0.5, ymin = -0.5, ymax = 0.5,
            fill = "lightgray", alpha = 0.1) +
  geom_point() +
  geom_text_repel() +
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Words unusually related to 'specimen' and 'exhibition'")
word_sim %>%
  select(word, replica, model) %>%
  filter_interesting(threshold = 0.38) %>%
  ggplot(aes(x = replica, y = model, label = word)) +
  geom_rect(xmin = -0.5, xmax = 0.5, ymin = -0.5, ymax = 0.5,
            fill = "lightgray", alpha = 0.1) +
  geom_point() +
  geom_text_repel() +
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Words unusually related to 'replica' and 'model'")
specimenvector <- word_vec("specimen") - word_vec("exhibition")
replicavector <- word_vec("replica") - word_vec("model")
cf <- rbind(specimenvector, replicavector)
rownames(cf) <- c("specimenvector", "replicavector")
cf_sim <- cf %>% similarities()
unusual <- 0.38
cf_sim %>%
  filter_interesting(threshold = unusual) %>%
  ggplot(aes(x = specimenvector, y = replicavector, label = word)) +
  geom_rect(xmin = -unusual, xmax = unusual, ymin = -unusual, ymax = unusual,
            fill = "gray90", alpha = 0.1) +
  geom_text() +
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Words compared on multiple axes",
x = "specimen <---------- -----------> exhibition",
y = "replica <----------- ----------> model")
It is evident from these visualizations that this data warrants further study. Textual analysis can be useful for museums and other public history institutions that have datasets of the metadata of items in their collections. For example, it is clear from the output of word_vec(“replica”) %>% closest_to() that the Museum contains many replicas of items from ancient Greece and Britain, including coins and plaster casts. It is also evident from the words most closely related to specimen and exhibition (quilt, shoes, dress, jug, blouse, furnace, stones, marble, wool, stud, ewe, etc.) that the Museum has many items relating to the history of Australia’s domestic realm, agriculture, and fashion. The analysis of words unusually related to specimen and exhibition demonstrates the places where co-occurrences happen in unexpected ways. Textual analysis using GloVe word embeddings is helpful not only for discovering what sort of items a collection contains, but also for analyzing the items in a bottom-up way, taking the metadata and finding where items are similar and where they are different. This can be helpful to people not familiar with the museum’s holdings, or to a museum employee trying to showcase items in a specific exhibit.