Last week, the new season of the Chinese Super League began. Over the past few months, this league has been getting a lot of attention due to its lavish spending on players from top leagues to fill out the permitted foreigner places on teams.

I thought it would be a nice exercise in visualing nested data to see just how much of the leagues value is taken up by the relatively small number of foreign players.

To start, we need to get a table of every team and the links to their squads from wikipedia and transfermarkt. The table also has loads of extra information I won’t use here but is nice to have. We’ll do this using the rvest scraping package and the tidyverse to try and keep the code as neat as possible.

#libraries
library(magrittr)
library(rvest)
library(dplyr)
#links for information to scrape
league_wiki <- "https://en.wikipedia.org/wiki/2017_Chinese_Super_League"
league_trmarkt <- "http://www.transfermarkt.com/chinese-super-league/startseite/wettbewerb/CSL"
#function to scrape from the links
scrape_func <- function(wiki, trmarkt){
  #get the wikipedia information
  wiki_read <- read_html(wiki)
    wiki_table <- wiki_read %>%
      html_nodes("#mw-content-text > table:nth-child(17)") %>%
        html_table(fill = TRUE) %>% 
          data.frame() %>%
            arrange(Team)
    
    #get links to each clubs wikipedia page
    wiki_club_links <- wiki_read %>%
      html_nodes("#mw-content-text > table:nth-child(17) > tr > td:nth-child(1) > a") %>%
        html_attr("href") %>%
          sort()
    wiki_table <- mutate(wiki_table, wiki_link = paste0("https://en.wikipedia.org", wiki_club_links)) 
    
  #get the transfermarkt information  
  trmarkt_read <- read_html(trmarkt)
    trmarkt_table <- trmarkt_read %>%
      html_nodes("#yw1 > table") %>%
        html_table(fill = TRUE) %>%
          data.frame() %>%
            filter(nchar(name) > 2) %>%
                arrange(name)
  trmarkt_table[trmarkt_table == ""] <- NA
  #get links to each clubs transfermarkt page
  trmarkt_club_links <- trmarkt_read %>%
    html_nodes(".hide-for-pad .vereinprofil_tooltip") %>%
      html_attr("href")
  trmarkt_table <- mutate(trmarkt_table, trmakrt_link = paste0("http://www.transfermarkt.com", trmarkt_club_links)) 
  #mash everything together
  wiki_table$Team <- trmarkt_table$Club.s..1
    names(trmarkt_table)[which(names(trmarkt_table) == "Club.s..1")] <- "Team"
  
  table <- merge(wiki_table, trmarkt_table, by = "Team")
  
  return(table)
}
table <- scrape_func(league_wiki, league_trmarkt) %>%
  select_if(colSums(!is.na(.)) > 0)

I wanted to have some colour parameter for each team to plot. The teams kits gives a nice simple way to scrape this. We’ll need the png package to work with the png pictures of kits, as well as the data.table and stringr packages to manipulate data.

I also took the top function to check if a string is a valid colour from this answer on stackoverflow.

#libraries
library(png)
library(data.table)
library(stringr)

#Josh O'Brien's stackoverflow answer (see above)
#checks if a string is a valid colour
areColors <- function(x) {
    sapply(x, function(X) {
        tryCatch(is.matrix(col2rgb(X)), 
                 error = function(e) FALSE)
    })
}

#function to get the colour of each teams shirts from their wikipedia page
shirt.col_func <- function(link){
  #grab the background colour for the model pictures of kits
  bg_col <- read_html(link) %>%
    html_nodes("td:nth-child(1) > div > div:nth-child(1) > div:nth-child(3)") %>%
    str_extract("#......")
  #for some, no background is listed. We'll use white instead to make the next steps work
  if(is.na(bg_col)){bg_col <- "#ffffff"}

  #most kits have a 'white' background on which colour is overlayed
  #if a background is lsited as white, we ignore it and use the overlay instead
  if(areColors(bg_col) && tolower(bg_col) != "#ffffff"){kit_col <- bg_col}else{
  
  #find and download the png picture of the kit overlay
  kit_link <- paste0("https:", read_html(link) %>%
    html_nodes("td:nth-child(1) > div > div:nth-child(1) > div:nth-child(3) > img") %>%
      html_attr('src'))
  
  temp <- tempfile()
  download.file(kit_link, temp, mode="wb", quiet = TRUE)
  #once read we need to find the colour of every pixel on these overlays
  kit_pic <- readPNG(temp) %>%
    melt() %>%
      reshape(timevar = "Var3",
              idvar = c("Var1", "Var2"),
              direction = "wide")
  #if there is an alpha channel (value.4), remove all these pixels
  if(names(kit_pic)[ncol(kit_pic)] == "value.4"){kit_pic <- filter(kit_pic, value.4 > 0)}
  kit_pic$Var1 <- rgb(kit_pic[,3:5])
  
  #sum the pixels by colour and choose the most frequent
  #this is actually a little biased towards white, which works nicely given our ignoring of white backgrounds above
  kit_cols <- data.frame(table(kit_pic$Var1))
  kit_col <- as.character(kit_cols$Var1[which.max(kit_cols$Freq)])
  }
  return(kit_col)
}

#run the function and mutate a column for the kit colour to each row
table$kit_colour <- unlist(lapply(table$wiki_link, shirt.col_func))
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpMYXN0IHdlZWssIHRoZSBuZXcgc2Vhc29uIG9mIHRoZSBbQ2hpbmVzZSBTdXBlciBMZWFndWVdKGh0dHBzOi8vZW4ud2lraXBlZGlhLm9yZy93aWtpLzIwMTdfQ2hpbmVzZV9TdXBlcl9MZWFndWUpIGJlZ2FuLiBPdmVyIHRoZSBwYXN0IGZldyBtb250aHMsIHRoaXMgbGVhZ3VlIGhhcyBiZWVuIGdldHRpbmcgYSBsb3Qgb2YgYXR0ZW50aW9uIGR1ZSB0byBpdHMgbGF2aXNoIHNwZW5kaW5nIG9uIHBsYXllcnMgZnJvbSB0b3AgbGVhZ3VlcyB0byBmaWxsIG91dCB0aGUgcGVybWl0dGVkIGZvcmVpZ25lciBwbGFjZXMgb24gdGVhbXMuCgpJIHRob3VnaHQgaXQgd291bGQgYmUgYSBuaWNlIGV4ZXJjaXNlIGluIHZpc3VhbGluZyBuZXN0ZWQgZGF0YSB0byBzZWUganVzdCBob3cgbXVjaCBvZiB0aGUgbGVhZ3VlcyB2YWx1ZSBpcyB0YWtlbiB1cCBieSB0aGUgcmVsYXRpdmVseSBzbWFsbCBudW1iZXIgb2YgZm9yZWlnbiBwbGF5ZXJzLgoKVG8gc3RhcnQsIHdlIG5lZWQgdG8gZ2V0IGEgdGFibGUgb2YgZXZlcnkgdGVhbSBhbmQgdGhlIGxpbmtzIHRvIHRoZWlyIHNxdWFkcyBmcm9tIHdpa2lwZWRpYSBhbmQgdHJhbnNmZXJtYXJrdC4gVGhlIHRhYmxlIGFsc28gaGFzIGxvYWRzIG9mIGV4dHJhIGluZm9ybWF0aW9uIEkgd29uJ3QgdXNlIGhlcmUgYnV0IGlzIG5pY2UgdG8gaGF2ZS4gV2UnbGwgZG8gdGhpcyB1c2luZyB0aGUgcnZlc3Qgc2NyYXBpbmcgcGFja2FnZSBhbmQgdGhlIHRpZHl2ZXJzZSB0byB0cnkgYW5kIGtlZXAgdGhlIGNvZGUgYXMgbmVhdCBhcyBwb3NzaWJsZS4KYGBge3J9CiNsaWJyYXJpZXMKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkocnZlc3QpCgojbGlua3MgZm9yIGluZm9ybWF0aW9uIHRvIHNjcmFwZQpsZWFndWVfd2lraSA8LSAiaHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvMjAxN19DaGluZXNlX1N1cGVyX0xlYWd1ZSIKbGVhZ3VlX3RybWFya3QgPC0gImh0dHA6Ly93d3cudHJhbnNmZXJtYXJrdC5jb20vY2hpbmVzZS1zdXBlci1sZWFndWUvc3RhcnRzZWl0ZS93ZXR0YmV3ZXJiL0NTTCIKCiNmdW5jdGlvbiB0byBzY3JhcGUgZnJvbSB0aGUgbGlua3MKc2NyYXBlX2Z1bmMgPC0gZnVuY3Rpb24od2lraSwgdHJtYXJrdCl7CiAgI2dldCB0aGUgd2lraXBlZGlhIGluZm9ybWF0aW9uCiAgd2lraV9yZWFkIDwtIHJlYWRfaHRtbCh3aWtpKQogICAgd2lraV90YWJsZSA8LSB3aWtpX3JlYWQgJT4lCiAgICAgIGh0bWxfbm9kZXMoIiNtdy1jb250ZW50LXRleHQgPiB0YWJsZTpudGgtY2hpbGQoMTcpIikgJT4lCiAgICAgICAgaHRtbF90YWJsZShmaWxsID0gVFJVRSkgJT4lIAogICAgICAgICAgZGF0YS5mcmFtZSgpICU+JQogICAgICAgICAgICBhcnJhbmdlKFRlYW0pCiAgICAKICAgICNnZXQgbGlua3MgdG8gZWFjaCBjbHVicyB3aWtpcGVkaWEgcGFnZQogICAgd2lraV9jbHViX2xpbmtzIDwtIHdpa2lfcmVhZCAlPiUKICAgICAgaHRtbF9ub2RlcygiI213LWNvbnRlbnQtdGV4dCA+IHRhYmxlOm50aC1jaGlsZCgxNykgPiB0ciA+IHRkOm50aC1jaGlsZCgxKSA+IGEiKSAlPiUKICAgICAgICBodG1sX2F0dHIoImhyZWYiKSAlPiUKICAgICAgICAgIHNvcnQoKQogICAgd2lraV90YWJsZSA8LSBtdXRhdGUod2lraV90YWJsZSwgd2lraV9saW5rID0gcGFzdGUwKCJodHRwczovL2VuLndpa2lwZWRpYS5vcmciLCB3aWtpX2NsdWJfbGlua3MpKSAKICAgIAogICNnZXQgdGhlIHRyYW5zZmVybWFya3QgaW5mb3JtYXRpb24gIAogIHRybWFya3RfcmVhZCA8LSByZWFkX2h0bWwodHJtYXJrdCkKICAgIHRybWFya3RfdGFibGUgPC0gdHJtYXJrdF9yZWFkICU+JQogICAgICBodG1sX25vZGVzKCIjeXcxID4gdGFibGUiKSAlPiUKICAgICAgICBodG1sX3RhYmxlKGZpbGwgPSBUUlVFKSAlPiUKICAgICAgICAgIGRhdGEuZnJhbWUoKSAlPiUKICAgICAgICAgICAgZmlsdGVyKG5jaGFyKG5hbWUpID4gMikgJT4lCiAgICAgICAgICAgICAgICBhcnJhbmdlKG5hbWUpCiAgdHJtYXJrdF90YWJsZVt0cm1hcmt0X3RhYmxlID09ICIiXSA8LSBOQQoKICAjZ2V0IGxpbmtzIHRvIGVhY2ggY2x1YnMgdHJhbnNmZXJtYXJrdCBwYWdlCiAgdHJtYXJrdF9jbHViX2xpbmtzIDwtIHRybWFya3RfcmVhZCAlPiUKICAgIGh0bWxfbm9kZXMoIi5oaWRlLWZvci1wYWQgLnZlcmVpbnByb2ZpbF90b29sdGlwIikgJT4lCiAgICAgIGh0bWxfYXR0cigiaHJlZiIpCiAgdHJtYXJrdF90YWJsZSA8LSBtdXRhdGUodHJtYXJrdF90YWJsZSwgdHJtYWtydF9saW5rID0gcGFzdGUwKCJodHRwOi8vd3d3LnRyYW5zZmVybWFya3QuY29tIiwgdHJtYXJrdF9jbHViX2xpbmtzKSkgCgogICNtYXNoIGV2ZXJ5dGhpbmcgdG9nZXRoZXIKICB3aWtpX3RhYmxlJFRlYW0gPC0gdHJtYXJrdF90YWJsZSRDbHViLnMuLjEKICAgIG5hbWVzKHRybWFya3RfdGFibGUpW3doaWNoKG5hbWVzKHRybWFya3RfdGFibGUpID09ICJDbHViLnMuLjEiKV0gPC0gIlRlYW0iCiAgCiAgdGFibGUgPC0gbWVyZ2Uod2lraV90YWJsZSwgdHJtYXJrdF90YWJsZSwgYnkgPSAiVGVhbSIpCiAgCiAgcmV0dXJuKHRhYmxlKQp9CgojcnVuIHRoZSBmdW5jdGlvbiBhbmQgcmVtb3ZlIGNvbHVtbnMgb2YgTkEKdGFibGUgPC0gc2NyYXBlX2Z1bmMobGVhZ3VlX3dpa2ksIGxlYWd1ZV90cm1hcmt0KSAlPiUKICBzZWxlY3RfaWYoY29sU3VtcyghaXMubmEoLikpID4gMCkKCgpgYGAKCkkgd2FudGVkIHRvIGhhdmUgc29tZSBjb2xvdXIgcGFyYW1ldGVyIGZvciBlYWNoIHRlYW0gdG8gcGxvdC4gVGhlIHRlYW1zIGtpdHMgZ2l2ZXMgYSBuaWNlIHNpbXBsZSB3YXkgdG8gc2NyYXBlIHRoaXMuIFdlJ2xsIG5lZWQgdGhlIHBuZyBwYWNrYWdlIHRvIHdvcmsgd2l0aCB0aGUgcG5nIHBpY3R1cmVzIG9mIGtpdHMsIGFzIHdlbGwgYXMgdGhlIGRhdGEudGFibGUgYW5kIHN0cmluZ3IgcGFja2FnZXMgdG8gbWFuaXB1bGF0ZSBkYXRhLgoKSSBhbHNvIHRvb2sgdGhlIHRvcCBmdW5jdGlvbiB0byBjaGVjayBpZiBhIHN0cmluZyBpcyBhIHZhbGlkIGNvbG91ciBmcm9tIFt0aGlzXShodHRwOi8vc3RhY2tvdmVyZmxvdy5jb20vcXVlc3Rpb25zLzEzMjg5MDA5L2NoZWNrLWlmLWNoYXJhY3Rlci1zdHJpbmctaXMtYS12YWxpZC1jb2xvci1yZXByZXNlbnRhdGlvbikgYW5zd2VyIG9uIHN0YWNrb3ZlcmZsb3cuCmBgYHtyfQojbGlicmFyaWVzCmxpYnJhcnkocG5nKQpsaWJyYXJ5KGRhdGEudGFibGUpCmxpYnJhcnkoc3RyaW5ncikKCiNKb3NoIE8nQnJpZW4ncyBzdGFja292ZXJmbG93IGFuc3dlciAoc2VlIGFib3ZlKQojY2hlY2tzIGlmIGEgc3RyaW5nIGlzIGEgdmFsaWQgY29sb3VyCmFyZUNvbG9ycyA8LSBmdW5jdGlvbih4KSB7CiAgICBzYXBwbHkoeCwgZnVuY3Rpb24oWCkgewogICAgICAgIHRyeUNhdGNoKGlzLm1hdHJpeChjb2wycmdiKFgpKSwgCiAgICAgICAgICAgICAgICAgZXJyb3IgPSBmdW5jdGlvbihlKSBGQUxTRSkKICAgIH0pCn0KCiNmdW5jdGlvbiB0byBnZXQgdGhlIGNvbG91ciBvZiBlYWNoIHRlYW1zIHNoaXJ0cyBmcm9tIHRoZWlyIHdpa2lwZWRpYSBwYWdlCnNoaXJ0LmNvbF9mdW5jIDwtIGZ1bmN0aW9uKGxpbmspewogICNncmFiIHRoZSBiYWNrZ3JvdW5kIGNvbG91ciBmb3IgdGhlIG1vZGVsIHBpY3R1cmVzIG9mIGtpdHMKICBiZ19jb2wgPC0gcmVhZF9odG1sKGxpbmspICU+JQogICAgaHRtbF9ub2RlcygidGQ6bnRoLWNoaWxkKDEpID4gZGl2ID4gZGl2Om50aC1jaGlsZCgxKSA+IGRpdjpudGgtY2hpbGQoMykiKSAlPiUKICAgIHN0cl9leHRyYWN0KCIjLi4uLi4uIikKICAjZm9yIHNvbWUsIG5vIGJhY2tncm91bmQgaXMgbGlzdGVkLiBXZSdsbCB1c2Ugd2hpdGUgaW5zdGVhZCB0byBtYWtlIHRoZSBuZXh0IHN0ZXBzIHdvcmsKICBpZihpcy5uYShiZ19jb2wpKXtiZ19jb2wgPC0gIiNmZmZmZmYifQoKICAjbW9zdCBraXRzIGhhdmUgYSAnd2hpdGUnIGJhY2tncm91bmQgb24gd2hpY2ggY29sb3VyIGlzIG92ZXJsYXllZAogICNpZiBhIGJhY2tncm91bmQgaXMgbHNpdGVkIGFzIHdoaXRlLCB3ZSBpZ25vcmUgaXQgYW5kIHVzZSB0aGUgb3ZlcmxheSBpbnN0ZWFkCiAgaWYoYXJlQ29sb3JzKGJnX2NvbCkgJiYgdG9sb3dlcihiZ19jb2wpICE9ICIjZmZmZmZmIil7a2l0X2NvbCA8LSBiZ19jb2x9ZWxzZXsKICAKICAjZmluZCBhbmQgZG93bmxvYWQgdGhlIHBuZyBwaWN0dXJlIG9mIHRoZSBraXQgb3ZlcmxheQogIGtpdF9saW5rIDwtIHBhc3RlMCgiaHR0cHM6IiwgcmVhZF9odG1sKGxpbmspICU+JQogICAgaHRtbF9ub2RlcygidGQ6bnRoLWNoaWxkKDEpID4gZGl2ID4gZGl2Om50aC1jaGlsZCgxKSA+IGRpdjpudGgtY2hpbGQoMykgPiBpbWciKSAlPiUKICAgICAgaHRtbF9hdHRyKCdzcmMnKSkKICAKICB0ZW1wIDwtIHRlbXBmaWxlKCkKICBkb3dubG9hZC5maWxlKGtpdF9saW5rLCB0ZW1wLCBtb2RlPSJ3YiIsIHF1aWV0ID0gVFJVRSkKICAjb25jZSByZWFkIHdlIG5lZWQgdG8gZmluZCB0aGUgY29sb3VyIG9mIGV2ZXJ5IHBpeGVsIG9uIHRoZXNlIG92ZXJsYXlzCiAga2l0X3BpYyA8LSByZWFkUE5HKHRlbXApICU+JQogICAgbWVsdCgpICU+JQogICAgICByZXNoYXBlKHRpbWV2YXIgPSAiVmFyMyIsCiAgICAgICAgICAgICAgaWR2YXIgPSBjKCJWYXIxIiwgIlZhcjIiKSwKICAgICAgICAgICAgICBkaXJlY3Rpb24gPSAid2lkZSIpCiAgI2lmIHRoZXJlIGlzIGFuIGFscGhhIGNoYW5uZWwgKHZhbHVlLjQpLCByZW1vdmUgYWxsIHRoZXNlIHBpeGVscwogIGlmKG5hbWVzKGtpdF9waWMpW25jb2woa2l0X3BpYyldID09ICJ2YWx1ZS40Iil7a2l0X3BpYyA8LSBmaWx0ZXIoa2l0X3BpYywgdmFsdWUuNCA+IDApfQogIGtpdF9waWMkVmFyMSA8LSByZ2Ioa2l0X3BpY1ssMzo1XSkKICAKICAjc3VtIHRoZSBwaXhlbHMgYnkgY29sb3VyIGFuZCBjaG9vc2UgdGhlIG1vc3QgZnJlcXVlbnQKICAjdGhpcyBpcyBhY3R1YWxseSBhIGxpdHRsZSBiaWFzZWQgdG93YXJkcyB3aGl0ZSwgd2hpY2ggd29ya3MgbmljZWx5IGdpdmVuIG91ciBpZ25vcmluZyBvZiB3aGl0ZSBiYWNrZ3JvdW5kcyBhYm92ZQogIGtpdF9jb2xzIDwtIGRhdGEuZnJhbWUodGFibGUoa2l0X3BpYyRWYXIxKSkKICBraXRfY29sIDwtIGFzLmNoYXJhY3RlcihraXRfY29scyRWYXIxW3doaWNoLm1heChraXRfY29scyRGcmVxKV0pCiAgfQogIHJldHVybihraXRfY29sKQp9CgojcnVuIHRoZSBmdW5jdGlvbiBhbmQgbXV0YXRlIGEgY29sdW1uIGZvciB0aGUga2l0IGNvbG91ciB0byBlYWNoIHJvdwp0YWJsZSRraXRfY29sb3VyIDwtIHVubGlzdChsYXBwbHkodGFibGUkd2lraV9saW5rLCBzaGlydC5jb2xfZnVuYykpCmBgYAoKYGBge3J9CgpgYGA=