リーガ所属選手の国籍一覧

やっつけ間がハンパなくて欠けてる選手もいたりで正確な値ではないですが、googleVizライブラリ使って、結果をGoogle Visualisation APIで描写する

初期設定

library(hash, quietly = TRUE)
## hash-2.2.3 provided by Decision Patterns
suppressPackageStartupMessages(library(googleVis))

関数定義

# チーム名から国一覧取得
get.countries.fromName <- function(fc = "FC_Barcelona") {
    url <- paste("http://en.wikipedia.org/w/api.php?format=xmlfm&action=query&export&titles=", 
        fc, sep = "")
    return(get.countries.fromURL(url))
}

# urlから国一覧取得
get.countries.fromURL <- function(url.str) {
    allxml <- readLines(con = url.str, n = -1L, warn = FALSE)
    if (length(grep("= *Out on loan", allxml)) > 0) {
        allxml <- allxml[1:grep("= *Out on loan", allxml)]
    }
    players <- allxml[grep("(player.*(nat=|nb=)|nac=.*|pos=)", allxml)]
    if (length(grep("( in| out| ss| disc| mfs)", players)) > 0) {
        players <- players[-grep("( in| out| ss| disc| mfs)", players)]
    } else {

    }
    countries <- unlist(sapply(strsplit(players, "\\|"), function(x) {
        x[grep("(nb=|nat=|nac=)", x)]
    }))
    return(countries)
}

# リーグ名からurlを構築(今のところリーガしかテストしてない)
get.url.fromLeagueName <- function(league = "La_Liga") {
    base.url <- "http://en.wikipedia.org/w/api.php?format=xmlfm&action=query&export&titles="
    mk.url <- paste(base.url, league, sep = "")
    return(URLencode(mk.url))
}

# urlから、そのurl中のチーム名を取得
get.teamNames.fromURL <- function(test.url) {
    # 一行ずつ読み込んでいく
    allxml <- readLines(con = test.url, n = -1L, warn = FALSE)
    teams.pre <- allxml[grep("\\{\\{Location map~", allxml)]
    teams.pre2 <- sapply(strsplit(teams.pre, "\\|"), function(x) {
        return(strsplit(x[5], "\\[\\["))
    })
    teams <- sapply(teams.pre2, function(x) {
        return(x[2])
    })
    teams <- sapply(strsplit(teams, "\\]\\]"), function(x) {
        return(x[1])
    })
    return(teams)
}

# wikipediaから国名の対応表をgetする
get.countries.name.tbl <- function(url.str = "http://en.wikipedia.org/w/api.php?format=xmlfm&action=query&export&titles=ISO_3166-1") {
    allxml <- readLines(con = url.str, n = -1L, warn = FALSE)
    # フル表記を含む文字列を取得
    idx <- grep("\\{\\{flag\\|.*\\}\\}", allxml)
    fullname <- allxml[idx]
    # 2文字、3文字表記を含む文字列を取得
    twoname <- allxml[idx + 1]
    # データフレーム化
    countries.name <- data.frame(fullname = fullname, twoname = sapply(strsplit(twoname, 
        "\\|"), function(x) {
        x[9]
    }), threename = sapply(strsplit(twoname, "\\|"), function(x) {
        x[5]
    }))
    # フル表記を作る
    countries.name <- countries.name[-grep("hide=y", countries.name$fullname), 
        ]
    countries.name$fullname <- substr(iconv(countries.name$fullname, "UTF-8"), 
        as.vector(regexpr("\\{\\{flag\\|", countries.name$fullname)), 1000)
    countries.name$fullname <- gsub("(\\{\\{|\\}\\})", "", countries.name$fullname)
    countries.name$fullname <- sapply(strsplit(countries.name$fullname, "\\|"), 
        function(x) {
            x[2]
        })
    countries.name <- countries.name[grep("^[A-Za-z]", countries.name$fullname), 
        ]
    # 2文字を作る
    countries.name$twoname <- sapply(strsplit(as.character(countries.name$twoname), 
        ":"), function(x) {
        substring(x[2], 1, 2)
    })
    # 3文字を作る
    countries.name$threename <- sapply(strsplit(as.character(countries.name$threename), 
        ";"), function(x) {
        substring(x[7], 1, 3)
    })
    return(countries.name)
}

get.countries.fromLeagueName <- function(league = "La_Liga") {
    test.url <- get.url.fromLeagueName(league = league)
    # team名のみ取得
    team.names <- get.teamNames.fromURL(test.url)
    this.year.list <- list()
    for (i in 1:length(team.names)) {
        # team情報を取得するurlを構築
        team.year.url <- get.url.fromLeagueName(league = team.names[i])
        # 国名を含む文字列を抽出
        team.countries <- get.countries.fromURL(team.year.url)
        if (!is.null(team.countries)) {
            this.year.list[[length(this.year.list) + 1]] <- team.countries
            names(this.year.list)[i] <- team.names[i]
        }
        Sys.sleep(3)  #念のため間隔を空ける
    }
    # listをベクトル化して
    pre.countries <- unlist(this.year.list)
    # 国名だけを抽出(空白が含まれる)
    return(gsub(" ", "", sapply(strsplit(pre.countries, "="), function(x) {
        return(x[length(x)])
    })))
}

ここからメイン

############################# 国のカウント数を取得
countries <- get.countries.fromLeagueName("La_Liga")

### 国名の変換表を取得
c.name.tbl <- get.countries.name.tbl()
# hash化
h <- hash(c(c.name.tbl$fullname, c.name.tbl$threename), c(c.name.tbl$twoname, 
    c.name.tbl$twoname))


# 2文字表記に変える
countries2 <- countries
for (i in 1:length(countries)) {
    countries2[i] <- ifelse(!is.null(h[[countries[i]]]), h[[countries[i]]], 
        countries[i])
}

## 表記揺れはメンドいから今回は無視 国別選手数のベクトル
c.players <- table(countries2[nchar(countries2) == 2])

# googleVisする
exp.data <- data.frame(c.players)
names(exp.data) <- c("Country", "Counts")
T <- gvisTable(exp.data, options = list(width = 200, height = 280))
G <- gvisGeoChart(exp.data, locationvar = "Country", colorvar = "Counts", 
    options = list(width = 720, height = 560, dataMode = "regions"))
TG <- gvisMerge(T, G, horizontal = TRUE, tableOptions = "bgcolor=\"#CCCCCC\" cellspacing=10")
print(TG, "chart")