The data today come from the Tate Art Gallery (https://github.com/tategallery/collection)

library(tidyverse)
library(tidytuesdayR)
library(lubridate)
library(dplyr)
library(tidyr)
library(broom)
library(countrycode)
library(praise)
#install.packages("tidymodels")
library(tidymodels)
#install.packages("ranger")
praise()
## [1] "You are tiptop!"

Get the Data

tt_data <- tt_load("2021-01-12")
## 
##  Downloading file 1 of 2: `artists.csv`
##  Downloading file 2 of 2: `artwork.csv`
# make and save csvs
artwork <- tt_data$artwork
write_csv(artwork, "artwork.csv")
artists <- tt_data$artists
write_csv(artists, "artists.csv")

Preliminary looks at the data

head(artwork)
## # A tibble: 6 x 20
##      id accession_number artist artistRole artistId title dateText medium
##   <dbl> <chr>            <chr>  <chr>         <dbl> <chr> <chr>    <chr> 
## 1  1035 A00001           Blake… artist           38 A Fi… date no… Water…
## 2  1036 A00002           Blake… artist           38 Two … date no… Graph…
## 3  1037 A00003           Blake… artist           38 The … ?c.1785  Graph…
## 4  1038 A00004           Blake… artist           38 Six … date no… Graph…
## 5  1039 A00005           Blake… artist           39 The … 1826–7,… Line …
## 6  1040 A00006           Blake… artist           39 Ciam… 1826–7,… Line …
## # … with 12 more variables: creditLine <chr>, year <dbl>,
## #   acquisitionYear <dbl>, dimensions <chr>, width <dbl>, height <dbl>,
## #   depth <dbl>, units <chr>, inscription <chr>, thumbnailCopyright <lgl>,
## #   thumbnailUrl <chr>, url <chr>
glimpse(artwork)
## Rows: 69,201
## Columns: 20
## $ id                 <dbl> 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 10…
## $ accession_number   <chr> "A00001", "A00002", "A00003", "A00004", "A00005", …
## $ artist             <chr> "Blake, Robert", "Blake, Robert", "Blake, Robert",…
## $ artistRole         <chr> "artist", "artist", "artist", "artist", "artist", …
## $ artistId           <dbl> 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39…
## $ title              <chr> "A Figure Bowing before a Seated Old Man with his …
## $ dateText           <chr> "date not known", "date not known", "?c.1785", "da…
## $ medium             <chr> "Watercolour, ink, chalk and graphite on paper. Ve…
## $ creditLine         <chr> "Presented by Mrs John Richmond 1922", "Presented …
## $ year               <dbl> NA, NA, 1785, NA, 1826, 1826, 1826, 1826, 1826, 18…
## $ acquisitionYear    <dbl> 1922, 1922, 1922, 1922, 1919, 1919, 1919, 1919, 19…
## $ dimensions         <chr> "support: 394 x 419 mm", "support: 311 x 213 mm", …
## $ width              <dbl> 394, 311, 343, 318, 243, 240, 242, 246, 241, 243, …
## $ height             <dbl> 419, 213, 467, 394, 335, 338, 334, 340, 335, 340, …
## $ depth              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ units              <chr> "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "m…
## $ inscription        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ thumbnailCopyright <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ thumbnailUrl       <chr> "http://www.tate.org.uk/art/images/work/A/A00/A000…
## $ url                <chr> "http://www.tate.org.uk/art/artworks/blake-a-figur…
head(artists)
## # A tibble: 6 x 9
##      id name  gender dates yearOfBirth yearOfDeath placeOfBirth placeOfDeath
##   <dbl> <chr> <chr>  <chr>       <dbl>       <dbl> <chr>        <chr>       
## 1 10093 Abak… Female born…        1930          NA Polska       <NA>        
## 2     0 Abbe… Male   1852…        1852        1911 Philadelphi… London, Uni…
## 3  2756 Abbo… Female 1898…        1898        1991 Springfield… Monson, Uni…
## 4     1 Abbo… Male   1760…        1760        1803 Leicestersh… London, Uni…
## 5   622 Abra… Male   born…        1935          NA Wigan, Unit… <NA>        
## 6  2606 Absa… Male   1964…        1964        1993 Tel Aviv-Ya… Paris, Fran…
## # … with 1 more variable: url <chr>
glimpse(artists)
## Rows: 3,532
## Columns: 9
## $ id           <dbl> 10093, 0, 2756, 1, 622, 2606, 9550, 623, 624, 625, 2411,…
## $ name         <chr> "Abakanowicz, Magdalena", "Abbey, Edwin Austin", "Abbott…
## $ gender       <chr> "Female", "Male", "Female", "Male", "Male", "Male", "Fem…
## $ dates        <chr> "born 1930", "1852–1911", "1898–1991", "1760–1803", "bor…
## $ yearOfBirth  <dbl> 1930, 1852, 1898, 1760, 1935, 1964, 1967, 1940, 1947, 19…
## $ yearOfDeath  <dbl> NA, 1911, 1991, 1803, NA, 1993, NA, NA, 2014, NA, 1792, …
## $ placeOfBirth <chr> "Polska", "Philadelphia, United States", "Springfield, U…
## $ placeOfDeath <chr> NA, "London, United Kingdom", "Monson, United States", "…
## $ url          <chr> "http://www.tate.org.uk/art/artists/magdalena-abakanowic…

Note: over half of the pieces are by Joseph Turner! We might be interested in analyzing the data without his work.

artwork %>%
  group_by(artistId) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head()
## # A tibble: 6 x 2
##   artistId count
##      <dbl> <int>
## 1      558 39389
## 2      300  1046
## 3     1659   623
## 4      138   612
## 5      747   578
## 6     2638   388
artists %>%
  filter(id == 558)
## # A tibble: 1 x 9
##      id name  gender dates yearOfBirth yearOfDeath placeOfBirth placeOfDeath
##   <dbl> <chr> <chr>  <chr>       <dbl>       <dbl> <chr>        <chr>       
## 1   558 Turn… Male   1775…        1775        1851 London, Uni… Chelsea, Un…
## # … with 1 more variable: url <chr>

Both placeofBirth and placeofDeath are given as City, Country. Let’s look to split these into different categories - it might make it easier to analyze or model the data.

artists <- artists %>%
  separate(placeOfBirth, c("cityBirth", "countryBirth"), sep = ", ", fill = "left") %>%
  separate(placeOfDeath, c("cityDeath", "countryDeath"), sep = ", ", fill = "left")

artists %>% select(countryBirth) %>% table()
## .
##                           Al-‘Iraq                         Al-Jaza'ir 
##                                  1                                  2 
##                          Al-Lubnan                          Argentina 
##                                  8                                 17 
##                            Armenia                           As-Sudan 
##                                  1                                  1 
##                          Australia                            Auteuil 
##                                 23                                  1 
##                            Bahamas                         Bangladesh 
##                                  1                                  2 
##                           Barbados                         Beckington 
##                                  1                                  1 
##                            Belarus                             België 
##                                  4                                 37 
##                              Bénin                         Bermondsey 
##                                  1                                  1 
##                             Bharat                         Blackheath 
##                                 24                                  1 
##                Bosna i Hercegovina                          Braintree 
##                                  2                                  1 
##                             Brasil                            Bristol 
##                                 30                                  2 
##                           Bulgaria                           Cameroun 
##                                  2                                  1 
##                             Canada                         Canterbury 
##                                 40                                  1 
##                    Ceská Republika                           Charlieu 
##                                 14                                  1 
##                   Charlotte Amalie                              Chile 
##                                  1                                  5 
## Choson Minjujuui In'min Konghwaguk                  Chung-hua Min-kuo 
##                                  1                                  1 
##                           Colombia                         Costa Rica 
##                                  8                                  1 
##                               Cuba                               D.C. 
##                                  9                                  2 
##                            Danmark                  Département de la 
##                                  9                                  1 
##                        Deutschland                            Douglas 
##                                142                                  1 
##                          Edinburgh                              Eesti 
##                                  1                                  1 
##                           Egremont                               Éire 
##                                  1                                 51 
##                              Ellás                              Epsom 
##                                 10                                  1 
##                             España                             France 
##                                 29                                157 
##                             Guyana                      Hertfordshire 
##                                  2                                  1 
##                           Hrvatska                          Indonesia 
##                                  8                                  2 
##                               Îran                             Ísland 
##                                 10                                  1 
##                        Isle of Man                             Italia 
##                                  2                                 80 
##                            Jamaica                        Jugoslavija 
##                                  2                                  3 
##                         Kensington                              Kenya 
##                                  1                                  1 
##                                Lao                            Latvija 
##                                  1                                  3 
##                            Lietuva                          Liverpool 
##                                  4                                  1 
##                             London                         Luxembourg 
##                                  3                                  1 
##                       Magyarország                         Makedonija 
##                                 13                                  1 
##                           Malaysia                              Malta 
##                                  1                                  1 
##                          Mauritius                              Mehoz 
##                                  2                                  2 
##                           Melmerby                             México 
##                                  1                                 13 
##                               Misr                            Moldova 
##                                  8                                  1 
##                         Montserrat                            Myanmar 
##                                  1                                  1 
##                          Nederland                        New Zealand 
##                                 35                                 10 
##                          Nicaragua                    Niederschlesien 
##                                  1                                  1 
##                            Nigeria                              Nihon 
##                                  1                                 27 
##                              Norge                         Österreich 
##                                  3                                 29 
##                               Otok                           Pakistan 
##                                  1                                  5 
##                             Panamá                              Perth 
##                                  1                                  1 
##                               Perú                          Pilipinas 
##                                  4                                  1 
##                           Plymouth                             Polska 
##                                  1                                 41 
##                           Portugal                       Prathet Thai 
##                                 10                                  1 
##                           Rochdale                            România 
##                                  1                                 13 
##                            Rossiya                       Saint Hélier 
##                                 32                                  2 
##                              Samoa                          Schlesien 
##                                  1                                  2 
##                            Schweiz                          Shqipëria 
##                                 29                                  1 
##                          Singapore                          Slovenija 
##                                  2                                  6 
##                Slovenská Republika                          Solothurn 
##                                  3                                  1 
##                       South Africa                          Sri Lanka 
##                                 20                                  3 
##                      Staten Island                          Stockholm 
##                                  1                                  1 
##                     Stoke on Trent                              Suomi 
##                                  1                                  1 
##                            Suriyah                            Sverige 
##                                  2                                 12 
##                     Taehan Min'guk                           Tanzania 
##                                  3                                  1 
##                              Tunis                            Türkiye 
##                                  1                                  6 
##                             Uganda                           Ukrayina 
##                                  1                                 17 
##                     United Kingdom                      United States 
##                               1496                                339 
##                          Venezuela                           Viet Nam 
##                                  7                                  2 
##                          Wimbledon                           Yisra'el 
##                                  1                                 11 
##                             Zambia                           Zhonghua 
##                                  1                                 22 
##                           Zimbabwe 
##                                  1
artists %>%
  group_by(countryBirth) %>%
  mutate(count = n()) %>%
  filter(count >= 10) %>%
  ungroup() %>%
  ggplot(aes(x = countryBirth, y = yearOfBirth)) +
  geom_boxplot()

Joins!

Combining datasets with shared unique identifiers: Inner Join = keep everything that is shared in both datasets Full Union = keep everything from each dataset

Let’s combine the datasets so that if we want to use gender to ask questions about the art, the information is available!

artartist1 <- 
  inner_join(artists, artwork, by = c("id" = "artistId"))

artartist2 <- 
  full_join(artists, artwork, by = c("id" = "artistId"))

nrow(artwork)
## [1] 69201
nrow(artartist1)
## [1] 69195
nrow(artartist2)
## [1] 69395

Some inportant insights from these runs:

There are 6 pieces of art that do not have a listed artist. They are left out in the intersect.

There are ALSO 194 artists in the dataset that do not have artwork in the database, and that is why we get MORE data in artartist2 than in artwork itself.

Visualization

artwork %>%
  select(artistRole) %>%
  table()
## .
##                    after              and a pupil           and assistants 
##                     2014                        4                        3 
##        and other artists               and studio                   artist 
##                       12                        1                    66907 
##            attributed to                circle of doubtfully attributed to 
##                      164                        1                        1 
##              follower of   formerly attributed to              imitator of 
##                        2                       14                        5 
##                manner of             prints after                   pseudo 
##                       24                       11                        8 
##                 pupil of                school of                studio of 
##                       18                        4                        7 
##                 style of 
##                        1
#density plot
artartist1 %>%
  filter(id != 558) %>%
  ggplot(aes(x = year, color = gender)) + 
  geom_density()

#histogram
artartist1 %>%
  filter(id != 558) %>%
  ggplot(aes(x = year, color = gender, fill = gender)) + 
  geom_histogram()

We see that the relative proportion of female artwork increases as we turn towards modern times. Removing artist # 558 helps remove his skew.

Random Forest Model

Let’s explore a RFM.

#install.packages('ranger')
set.seed(47)
library(tidymodels)
library(vip)


# remove the artist that is listed an enormous number of times 
# because he might skew the data
artRF <- artartist1 %>%
  filter(id != 558)

# 75% train, 25% test
data_split <- initial_split(artRF, prop = 0.75)


art_train <- training(data_split)

# for better data science, we would perform more analysis on these
# individual fields, but removing them is sufficient for our low-stakes 
# purposes now
art_test  <- testing(data_split) %>%
  filter(!is.na(gender)) %>%
  filter(!is.na(width)) %>%
  filter(!is.na(height)) %>%
  filter(!is.na(acquisitionYear)) %>%
  filter(!is.na(yearOfBirth)) %>%
  filter(!is.na(yearOfDeath))

# give it the right fields to look at
# birth and death year might make the model overfit to the training data
rand_forest(mode = "regression") %>% 
   set_args(importance = "permutation") %>%
   fit(year ~ gender + width + height + acquisitionYear +
         yearOfBirth + yearOfDeath, 
       data = art_train) %>% 
   vip::vip() 

We see that the model is relying very heavily on yearOfBirth and yearOfDeath to make its predictions, as we hypothesized.

rand_forest(mode = "regression") %>% 
   fit(year ~ gender + width + height + acquisitionYear +
         yearOfBirth + yearOfDeath, data = art_train) %>% 
   predict(new_data = art_test) %>%
   ggplot(aes(y = .pred, x = art_test$year)) + 
   geom_point() +
   geom_abline(intercept = 0, slope = 1) +
  ylab("Predicted Year") +
  xlab("Year of Creation")

The above graph shows that our model has a very high accuracy - probably too accurate.