Scout Analysis - Brazilian soccer championship

This R Markdown document is an initial study and analysis of data about the players of the 2017 Brazilian soccer championship.

Getting data

After each round, the scout data is available on the Cartola FC site, a fantasy game that reports the performance of players, coaches and teams. To get the data, there is an open API, documented at https://github.com/wgenial/cartrolandofc/blob/master/nova-api.md, that exposes some endpoints. Unfortunately, there is no endpoint that returns the scout for each round; because of that, it is necessary to manually hit the URL https://api.cartolafc.globo.com/atletas/mercado just after the end of a round to get the data for that round.

Given this scenario, the data had to be fetched manually from the site and saved in JSON format, producing a list of JSON files. That is, the file rodada1.json contains the round 1 data, and so on.

Defining the libraries and creating the list of files

setwd("~/Desktop/Cartola")
library("rjson")
library("stringr")

library(reshape2)
library(ggplot2)

# Collect the names of all JSON data files to be analysed.
# `pattern` is a regular expression, not a shell glob: "\\.json$" matches only
# file names that end in ".json".
json_files <- list.files(path = "./Data/", pattern = "\\.json$")

Creating a data frame that will receive the ‘parsed’ data.

Important information about the data:

Standard information for each player:

id, nickname, club id, position

Variable information

Depending on the player's position and actions during the game, a given column may or may not exist. For example, whether a player scored a goal, or whether a goalkeeper saved a penalty, etc.

# Empty data frame that declares the columns collected for each athlete/round.
# Every column is declared with `=`: using `<-` inside data.frame() would
# create global variables as a side effect and leave the columns with
# auto-generated names instead of the intended ones.
listOfAtletas <- data.frame(
  # identification of the athlete and of the round
  atleta_id = character(),
  apelido = character(),
  clube_id = character(),
  posicao_id = character(),
  rodada_id = character(),

  # per-round figures; declared numeric because the scout weights listed
  # below are fractional
  pontos_num = numeric(),
  preco_num = numeric(),
  variacao_num = numeric(),
  media_num = numeric(),

  # Scout codes and their weight in the score:
  # RB - ball steal (+1.7)
  # G  - goal (+8.0)
  # A  - assist (+5.0)
  # SG - game without conceding a goal (+5.0)
  # FS - foul suffered (+0.5)
  # FF - shot off target (+0.7)
  # FD - shot saved (+1.0)
  # FT - shot against the post (+3.5)
  # DD - difficult save (+3.0)
  # DP - penalty save (+7.0)

  # GC - own goal (-6.0)
  # CV - red card (-5.0)
  # CA - yellow card (-2.0)
  # GS - goal conceded (-5.0)
  # PP - missed penalty (-3.5)
  # FC - foul committed (-0.5)
  # I  - offside (-0.5)
  # PE - wrong pass (-0.3)

  # positive scouts
  scout_RB = integer(),
  scout_G = integer(),
  scout_A = integer(),
  scout_SG = integer(),
  scout_FS = integer(),
  scout_FF = integer(),
  scout_FD = integer(),
  scout_FT = integer(),
  scout_DD = integer(),
  scout_DP = integer(),

  # negative scouts
  scout_GC = integer(),
  scout_CV = integer(),
  scout_CA = integer(),
  scout_GS = integer(),
  scout_PP = integer(),
  scout_FC = integer(),
  scout_I = integer(),
  scout_PE = integer()
)

Helper functions

Helper function to get data and parse it

## Read one JSON data file.
## file_to_read: path of the JSON file to load.
## Returns whatever fromJSON() produces for that file.
dataFromJSON <- function(file_to_read) {
  fromJSON(file = file_to_read)
}


## Normalise an optional scout value.
# Returns `tmp_var` unchanged when it holds at least one element; otherwise
# (for example NULL) returns 0.
scout_var <- function(tmp_var) {
  if (length(tmp_var) == 0) {
    return(0)
  }
  tmp_var
}
## Iterate over the round files and collect one row per athlete that has
## scout data.
# The rows are gathered in a list and bound to listOfAtletas once at the end,
# instead of calling rbind() on the growing data frame for every athlete.
atleta_rows <- list()

for (file_idx in seq_along(json_files)) {
  # paste0() joins without a separator, so no spaces need to be stripped
  # afterwards (stripping them would corrupt file names that contain spaces).
  result <- dataFromJSON(paste0("./Data/", json_files[file_idx]))

  # The athlete loop has its own index so it does not overwrite the file index.
  for (atleta_idx in seq_along(result$atletas)) {
    atleta <- result$atletas[[atleta_idx]]
    scout <- atleta[["scout"]]

    # Athletes without any scout entry are skipped.
    if (length(scout) > 0) {
      apelido <- atleta[["apelido"]]

      atleta_id <- atleta[["atleta_id"]]
      clube_id <- atleta[["clube_id"]]
      posicao_id <- atleta[["posicao_id"]]
      rodada_id <- atleta[["rodada_id"]]

      pontos_num <- atleta[["pontos_num"]]
      preco_num <- atleta[["preco_num"]]
      variacao_num <- atleta[["variacao_num"]]
      media_num <- atleta[["media_num"]]

      # `[[` matches names exactly. `$` on a list matches partially, so
      # scout$G would return the value of GC or GS whenever G itself is absent
      # and only one of them is present.

      # positive scouts
      scout_RB <- scout_var(scout[["RB"]])
      scout_G <- scout_var(scout[["G"]])
      scout_A <- scout_var(scout[["A"]])
      scout_SG <- scout_var(scout[["SG"]])
      scout_FS <- scout_var(scout[["FS"]])
      scout_FF <- scout_var(scout[["FF"]])
      scout_FD <- scout_var(scout[["FD"]])
      scout_FT <- scout_var(scout[["FT"]])
      scout_DD <- scout_var(scout[["DD"]])
      scout_DP <- scout_var(scout[["DP"]])

      # negative scouts
      scout_GC <- scout_var(scout[["GC"]])
      scout_CV <- scout_var(scout[["CV"]])
      scout_CA <- scout_var(scout[["CA"]])
      scout_GS <- scout_var(scout[["GS"]])
      scout_PP <- scout_var(scout[["PP"]])
      scout_FC <- scout_var(scout[["FC"]])
      scout_I <- scout_var(scout[["I"]])
      scout_PE <- scout_var(scout[["PE"]])

      # One-row data frame; the column names come from the variable names.
      tmpDF <- data.frame(
        atleta_id, apelido, clube_id, posicao_id, rodada_id,
        pontos_num, preco_num, variacao_num, media_num,
        # positive scouts
        scout_RB, scout_G, scout_A, scout_SG, scout_FS,
        scout_FF, scout_FD, scout_FT, scout_DD, scout_DP,
        # negative scouts
        scout_GC, scout_CV, scout_CA, scout_GS,
        scout_PP, scout_FC, scout_I, scout_PE
      )
      atleta_rows[[length(atleta_rows) + 1L]] <- tmpDF
    }
  }
}

# Bind all collected rows in a single step. When nothing was collected,
# do.call() yields NULL and listOfAtletas is left as it was.
listOfAtletas <- rbind(listOfAtletas, do.call(rbind, atleta_rows))

Creating a subset to analyse

Top 10 players ‘Atacante’

Selecting only ‘Atacante’ players who recorded all of: FF - Finalizacao para fora (+0,7), FD - Finalizacao defendida (+1,0) and FT - Finalizacao na trave (+3,5)

## Subset to analyse: rows with posicao_id == 5 in which all three shot
## scouts (FF, FD, FT) are greater than zero.
subset_listOfAtletas <- subset(
  listOfAtletas,
  select = c("atleta_id", "posicao_id", "apelido",
             "scout_FF", "scout_FD", "scout_FT"),
  posicao_id == 5 & scout_FF > 0 & scout_FD > 0 & scout_FT > 0
)

## Mean of each shot scout per player nickname.
# The grouping column of the result is named Group.1 by aggregate().
subset_listOfAtletas_aggregated <- aggregate(
  subset_listOfAtletas[, c("scout_FF", "scout_FD", "scout_FT")],
  by = list(subset_listOfAtletas$apelido),
  FUN = mean
)

## Order the rows, highest means first, and keep the ordered result.
# with() makes order() read the columns of the aggregated frame itself, and
# the trailing comma in [row_order, ] reorders rows rather than selecting
# columns. Descending order puts the best players at the top, so that taking
# the first rows yields the "top" players.
row_order <- with(
  subset_listOfAtletas_aggregated,
  order(scout_FF, scout_FD, scout_FT, decreasing = TRUE)
)
subset_listOfAtletas_aggregated <- subset_listOfAtletas_aggregated[row_order, ]
subset_listOfAtletas_aggregated
##                Group.1
## 1         Rafael Moura
## 2  Wellington Paulista
## 3     Ricardo Oliveira
## 4              Willian
## 5                 Fred
## 6              Everton
## 7            Guilherme
## 8               Walter
## 9                 Luan
## 10               André
## 11    Henrique Almeida
## 12        Júnior Dutra
## 13      Bruno Henrique
## 14        Edigar Junio
## 15             Arthur 
## 16            Everaldo
## 17            Paulinho
## 18               Rossi
## 19        Róger Guedes
## 20                Joel
## 21             Neilton
## 22                Keno
## 23             Mendoza
## 24         Pedro Rocha
## 25               David
## 26        Felipe Vizeu
## 27         Paulo Vitor
## 28     Vinicius Junior
## 29             Ribamar
## 30             Penilla
## 31             Tréllez
## Bar chart for the first 10 rows of the aggregated subset
# Reshape to long format: one row per (Group.1, scout variable) pair.
data.m <- melt(
  head(subset_listOfAtletas_aggregated, n = 10),
  id.vars = "Group.1"
)

# Dodged bars, one per scout variable, drawn from the values as they are.
bar_layer <- geom_bar(
  aes(fill = variable),
  stat = "identity",
  width = 0.4,
  position = position_dodge(width = 0.5)
)

# Legend on top without a title; no axis titles.
plot_theme <- theme(
  legend.position = "top",
  legend.title = element_blank(),
  axis.title.x = element_blank(),
  axis.title.y = element_blank()
)

ggplot(data.m, aes(Group.1, value)) + bar_layer + plot_theme