The data today come from the Tate Art Gallery (https://github.com/tategallery/collection)
library(tidyverse)
library(tidytuesdayR)
library(lubridate)
library(dplyr)
library(tidyr)
library(broom)
library(countrycode)
library(praise)
#install.packages("tidymodels")
library(tidymodels)
#install.packages("ranger")
praise()
## [1] "You are tiptop!"
tt_data <- tt_load("2021-01-12")
##
## Downloading file 1 of 2: `artists.csv`
## Downloading file 2 of 2: `artwork.csv`
# make and save csvs
artwork <- tt_data$artwork
write_csv(artwork, "artwork.csv")
artists <- tt_data$artists
write_csv(artists, "artists.csv")
head(artwork)
## # A tibble: 6 x 20
## id accession_number artist artistRole artistId title dateText medium
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1035 A00001 Blake… artist 38 A Fi… date no… Water…
## 2 1036 A00002 Blake… artist 38 Two … date no… Graph…
## 3 1037 A00003 Blake… artist 38 The … ?c.1785 Graph…
## 4 1038 A00004 Blake… artist 38 Six … date no… Graph…
## 5 1039 A00005 Blake… artist 39 The … 1826–7,… Line …
## 6 1040 A00006 Blake… artist 39 Ciam… 1826–7,… Line …
## # … with 12 more variables: creditLine <chr>, year <dbl>,
## # acquisitionYear <dbl>, dimensions <chr>, width <dbl>, height <dbl>,
## # depth <dbl>, units <chr>, inscription <chr>, thumbnailCopyright <lgl>,
## # thumbnailUrl <chr>, url <chr>
glimpse(artwork)
## Rows: 69,201
## Columns: 20
## $ id <dbl> 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 10…
## $ accession_number <chr> "A00001", "A00002", "A00003", "A00004", "A00005", …
## $ artist <chr> "Blake, Robert", "Blake, Robert", "Blake, Robert",…
## $ artistRole <chr> "artist", "artist", "artist", "artist", "artist", …
## $ artistId <dbl> 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39…
## $ title <chr> "A Figure Bowing before a Seated Old Man with his …
## $ dateText <chr> "date not known", "date not known", "?c.1785", "da…
## $ medium <chr> "Watercolour, ink, chalk and graphite on paper. Ve…
## $ creditLine <chr> "Presented by Mrs John Richmond 1922", "Presented …
## $ year <dbl> NA, NA, 1785, NA, 1826, 1826, 1826, 1826, 1826, 18…
## $ acquisitionYear <dbl> 1922, 1922, 1922, 1922, 1919, 1919, 1919, 1919, 19…
## $ dimensions <chr> "support: 394 x 419 mm", "support: 311 x 213 mm", …
## $ width <dbl> 394, 311, 343, 318, 243, 240, 242, 246, 241, 243, …
## $ height <dbl> 419, 213, 467, 394, 335, 338, 334, 340, 335, 340, …
## $ depth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ units <chr> "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "m…
## $ inscription <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ thumbnailCopyright <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ thumbnailUrl <chr> "http://www.tate.org.uk/art/images/work/A/A00/A000…
## $ url <chr> "http://www.tate.org.uk/art/artworks/blake-a-figur…
head(artists)
## # A tibble: 6 x 9
## id name gender dates yearOfBirth yearOfDeath placeOfBirth placeOfDeath
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 10093 Abak… Female born… 1930 NA Polska <NA>
## 2 0 Abbe… Male 1852… 1852 1911 Philadelphi… London, Uni…
## 3 2756 Abbo… Female 1898… 1898 1991 Springfield… Monson, Uni…
## 4 1 Abbo… Male 1760… 1760 1803 Leicestersh… London, Uni…
## 5 622 Abra… Male born… 1935 NA Wigan, Unit… <NA>
## 6 2606 Absa… Male 1964… 1964 1993 Tel Aviv-Ya… Paris, Fran…
## # … with 1 more variable: url <chr>
glimpse(artists)
## Rows: 3,532
## Columns: 9
## $ id <dbl> 10093, 0, 2756, 1, 622, 2606, 9550, 623, 624, 625, 2411,…
## $ name <chr> "Abakanowicz, Magdalena", "Abbey, Edwin Austin", "Abbott…
## $ gender <chr> "Female", "Male", "Female", "Male", "Male", "Male", "Fem…
## $ dates <chr> "born 1930", "1852–1911", "1898–1991", "1760–1803", "bor…
## $ yearOfBirth <dbl> 1930, 1852, 1898, 1760, 1935, 1964, 1967, 1940, 1947, 19…
## $ yearOfDeath <dbl> NA, 1911, 1991, 1803, NA, 1993, NA, NA, 2014, NA, 1792, …
## $ placeOfBirth <chr> "Polska", "Philadelphia, United States", "Springfield, U…
## $ placeOfDeath <chr> NA, "London, United Kingdom", "Monson, United States", "…
## $ url <chr> "http://www.tate.org.uk/art/artists/magdalena-abakanowic…
Note: over half of the pieces are by Joseph Turner! We might be interested in analyzing the data without his work.
artwork %>%
group_by(artistId) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head()
## # A tibble: 6 x 2
## artistId count
## <dbl> <int>
## 1 558 39389
## 2 300 1046
## 3 1659 623
## 4 138 612
## 5 747 578
## 6 2638 388
artists %>%
filter(id == 558)
## # A tibble: 1 x 9
## id name gender dates yearOfBirth yearOfDeath placeOfBirth placeOfDeath
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 558 Turn… Male 1775… 1775 1851 London, Uni… Chelsea, Un…
## # … with 1 more variable: url <chr>
Both placeofBirth and placeofDeath are given as City, Country. Let’s look to split these into different categories - it might make it easier to analyze or model the data.
artists <- artists %>%
separate(placeOfBirth, c("cityBirth", "countryBirth"), sep = ", ", fill = "left") %>%
separate(placeOfDeath, c("cityDeath", "countryDeath"), sep = ", ", fill = "left")
artists %>% select(countryBirth) %>% table()
## .
## Al-‘Iraq Al-Jaza'ir
## 1 2
## Al-Lubnan Argentina
## 8 17
## Armenia As-Sudan
## 1 1
## Australia Auteuil
## 23 1
## Bahamas Bangladesh
## 1 2
## Barbados Beckington
## 1 1
## Belarus België
## 4 37
## Bénin Bermondsey
## 1 1
## Bharat Blackheath
## 24 1
## Bosna i Hercegovina Braintree
## 2 1
## Brasil Bristol
## 30 2
## Bulgaria Cameroun
## 2 1
## Canada Canterbury
## 40 1
## Ceská Republika Charlieu
## 14 1
## Charlotte Amalie Chile
## 1 5
## Choson Minjujuui In'min Konghwaguk Chung-hua Min-kuo
## 1 1
## Colombia Costa Rica
## 8 1
## Cuba D.C.
## 9 2
## Danmark Département de la
## 9 1
## Deutschland Douglas
## 142 1
## Edinburgh Eesti
## 1 1
## Egremont Éire
## 1 51
## Ellás Epsom
## 10 1
## España France
## 29 157
## Guyana Hertfordshire
## 2 1
## Hrvatska Indonesia
## 8 2
## Îran Ísland
## 10 1
## Isle of Man Italia
## 2 80
## Jamaica Jugoslavija
## 2 3
## Kensington Kenya
## 1 1
## Lao Latvija
## 1 3
## Lietuva Liverpool
## 4 1
## London Luxembourg
## 3 1
## Magyarország Makedonija
## 13 1
## Malaysia Malta
## 1 1
## Mauritius Mehoz
## 2 2
## Melmerby México
## 1 13
## Misr Moldova
## 8 1
## Montserrat Myanmar
## 1 1
## Nederland New Zealand
## 35 10
## Nicaragua Niederschlesien
## 1 1
## Nigeria Nihon
## 1 27
## Norge Österreich
## 3 29
## Otok Pakistan
## 1 5
## Panamá Perth
## 1 1
## Perú Pilipinas
## 4 1
## Plymouth Polska
## 1 41
## Portugal Prathet Thai
## 10 1
## Rochdale România
## 1 13
## Rossiya Saint Hélier
## 32 2
## Samoa Schlesien
## 1 2
## Schweiz Shqipëria
## 29 1
## Singapore Slovenija
## 2 6
## Slovenská Republika Solothurn
## 3 1
## South Africa Sri Lanka
## 20 3
## Staten Island Stockholm
## 1 1
## Stoke on Trent Suomi
## 1 1
## Suriyah Sverige
## 2 12
## Taehan Min'guk Tanzania
## 3 1
## Tunis Türkiye
## 1 6
## Uganda Ukrayina
## 1 17
## United Kingdom United States
## 1496 339
## Venezuela Viet Nam
## 7 2
## Wimbledon Yisra'el
## 1 11
## Zambia Zhonghua
## 1 22
## Zimbabwe
## 1
artists %>%
group_by(countryBirth) %>%
mutate(count = n()) %>%
filter(count >= 10) %>%
ungroup() %>%
ggplot(aes(x = countryBirth, y = yearOfBirth)) +
geom_boxplot()
Combining datasets with shared unique identifiers: Inner Join = keep everything that is shared in both datasets Full Union = keep everything from each dataset
Let’s combine the datasets so that if we want to use gender to ask questions about the art, the information is available!
artartist1 <-
inner_join(artists, artwork, by = c("id" = "artistId"))
artartist2 <-
full_join(artists, artwork, by = c("id" = "artistId"))
nrow(artwork)
## [1] 69201
nrow(artartist1)
## [1] 69195
nrow(artartist2)
## [1] 69395
Some inportant insights from these runs:
There are 6 pieces of art that do not have a listed artist. They are left out in the intersect.
There are ALSO 194 artists in the dataset that do not have artwork in the database, and that is why we get MORE data in artartist2 than in artwork itself.
artwork %>%
select(artistRole) %>%
table()
## .
## after and a pupil and assistants
## 2014 4 3
## and other artists and studio artist
## 12 1 66907
## attributed to circle of doubtfully attributed to
## 164 1 1
## follower of formerly attributed to imitator of
## 2 14 5
## manner of prints after pseudo
## 24 11 8
## pupil of school of studio of
## 18 4 7
## style of
## 1
#density plot
artartist1 %>%
filter(id != 558) %>%
ggplot(aes(x = year, color = gender)) +
geom_density()
#histogram
artartist1 %>%
filter(id != 558) %>%
ggplot(aes(x = year, color = gender, fill = gender)) +
geom_histogram()
We see that the relative proportion of female artwork increases as we turn towards modern times. Removing artist # 558 helps remove his skew.
Let’s explore a RFM.
#install.packages('ranger')
set.seed(47)
library(tidymodels)
library(vip)
# remove the artist that is listed an enormous number of times
# because he might skew the data
artRF <- artartist1 %>%
filter(id != 558)
# 75% train, 25% test
data_split <- initial_split(artRF, prop = 0.75)
art_train <- training(data_split)
# for better data science, we would perform more analysis on these
# individual fields, but removing them is sufficient for our low-stakes
# purposes now
art_test <- testing(data_split) %>%
filter(!is.na(gender)) %>%
filter(!is.na(width)) %>%
filter(!is.na(height)) %>%
filter(!is.na(acquisitionYear)) %>%
filter(!is.na(yearOfBirth)) %>%
filter(!is.na(yearOfDeath))
# give it the right fields to look at
# birth and death year might make the model overfit to the training data
rand_forest(mode = "regression") %>%
set_args(importance = "permutation") %>%
fit(year ~ gender + width + height + acquisitionYear +
yearOfBirth + yearOfDeath,
data = art_train) %>%
vip::vip()
We see that the model is relying very heavily on
yearOfBirth and yearOfDeath to make its predictions, as we hypothesized.
rand_forest(mode = "regression") %>%
fit(year ~ gender + width + height + acquisitionYear +
yearOfBirth + yearOfDeath, data = art_train) %>%
predict(new_data = art_test) %>%
ggplot(aes(y = .pred, x = art_test$year)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
ylab("Predicted Year") +
xlab("Year of Creation")
The above graph shows that our model has a very high accuracy - probably too accurate.