This application scrapes data from ‘www.transfermarkt.com’ about two of the world’s best players, ‘Messi’ and ‘Ronaldo’. The function is written in R; it tabulates and plots the number of goals scored and assists made by Messi or Ronaldo.
The user must input the name of the player as ‘cristiano-ronaldo’ with id = 8198 or ‘lionel-messi’ with id = 28003, and the seasons whose data the user wants to see.
The packages used are:
-httr: For making a connection to the website
-XML: For extracting the tabular data from the website (web scraping)
-ggplot2: For making graphical plots
-dplyr: For manipulating the data tables
-xtable: For Tabularizing Data Frame
library(httr)
library(XML)
library(ggplot2)
library(dplyr)
library(xtable)
The R function which does the job of extracting the data from the website and tabularizing the data:
The GET function takes a URL as input and returns the server's response; the content function then extracts the HTML source code from that response.
The key function used here is the readHTMLTable function of the XML package which takes the content as input and extracts the HTML Table data from it. Then the function creates a data frame which we use for our analysis.
#' Scrape a player's per-competition season statistics from transfermarkt.
#'
#' Downloads one performance page per season, keeps the fourth HTML table on
#' the page, and stacks the seasons into a single data frame.
#'
#' @param name Player slug used in the transfermarkt URL, e.g. "lionel-messi".
#'   The text after the last "-" (first letter upper-cased) becomes the
#'   `Player` column.
#' @param id Transfermarkt player id placed in the URL, e.g. 28003.
#' @param seasons Either a range of start years such as "2009-2015", which
#'   covers the seasons starting in 2009 up to and including 2014, or a single
#'   start year such as 2014, which covers that one season.
#' @return A data frame with columns Player, Comp, Season ("2014-15" style),
#'   Aps, Goals, Asts, YC, YC/RC, RC and Mins (the last seven numeric), sorted
#'   by Season in descending order.
playerstats <- function(name = character(), id = integer(), seasons = 2014) {
  # Validate everything before the first request so bad input fails fast.
  if (length(name) != 1L || is.na(name) || length(id) != 1L || is.na(id)) {
    stop("`name` and `id` must each be a single non-missing value",
         call. = FALSE)
  }
  seasons <- as.character(seasons)
  if (length(seasons) != 1L || is.na(seasons)) {
    stop("`seasons` must be a single start year or a range like \"2009-2015\"",
         call. = FALSE)
  }

  first_year <- suppressWarnings(as.integer(sub("-.*", "", seasons)))
  last_year <- if (grepl("-", seasons, fixed = TRUE)) {
    suppressWarnings(as.integer(sub(".*-", "", seasons)))
  } else {
    # A lone start year means exactly that season. The previous
    # `1:(end - start)` loop counted down over c(1, 0) in this case and
    # fetched two seasons by accident.
    first_year + 1L
  }
  if (is.na(first_year) || is.na(last_year) || last_year <= first_year) {
    stop("`seasons` must be a start year or an increasing range like ",
         "\"2009-2015\"", call. = FALSE)
  }
  start_years <- seq(first_year, last_year - 1L)

  stat_cols <- c("Aps", "Goals", "Asts", "YC", "YC/RC", "RC", "Mins")

  fetch_season <- function(start_year) {
    response <- GET(paste0(
      "http://www.transfermarkt.com/", name, "/leistungsdaten/spieler/", id,
      "/saison/", start_year, "/plus/"
    ))
    stop_for_status(response)

    # Parse from raw text with the XML package: readHTMLTable() needs an XML
    # document, while content()'s default parsing of HTML may hand back an
    # xml2 object on current httr versions.
    page_html <- content(response, as = "text", encoding = "UTF-8")
    tables <- readHTMLTable(htmlParse(page_html, asText = TRUE))
    if (length(tables) < 4L || is.null(tables[[4]])) {
      stop("No statistics table found for season ", start_year, call. = FALSE)
    }

    # Drop the leading column, then expect competition + 7 statistics.
    playerdata <- as.data.frame(tables[[4]])[, -1, drop = FALSE]
    if (ncol(playerdata) != length(stat_cols) + 1L) {
      stop("Unexpected statistics table layout for season ", start_year,
           call. = FALSE)
    }
    names(playerdata) <- c("Comp", stat_cols)
    # Plain character columns stack cleanly across seasons.
    playerdata[] <- lapply(playerdata, as.character)

    # "2009-10" style label; the modulo keeps "1999-00" and "2019-20" correct.
    playerdata$Season <- sprintf(
      "%d-%02d", start_year, (start_year + 1L) %% 100L
    )
    playerdata[, c("Comp", "Season", stat_cols)]
  }

  finaldata <- do.call(rbind, lapply(start_years, fetch_season))
  finaldata <- arrange(finaldata, desc(Season))

  # "lionel-messi" -> "Messi"
  surname <- sub(".*-", "", name)
  player_label <- paste0(toupper(substring(surname, 1, 1)),
                         substring(surname, 2))
  finaldata <- cbind(Player = player_label, finaldata)

  # Minutes arrive as e.g. "1.145'": strip the separator and the tick mark.
  finaldata$Mins <- gsub(pattern = "\\.|'", replacement = "", finaldata$Mins)
  # A "-" cell stands for zero; afterwards every statistic is numeric.
  finaldata[stat_cols] <- lapply(finaldata[stat_cols], function(column) {
    as.numeric(gsub("-", "0", column, fixed = TRUE))
  })
  finaldata
}
We use the R function to extract the data of the two players for the seasons 2009-10 through 2014-15 (passed as the range "2009-2015").
# Scrape both players over the same season range.
cr7data <- playerstats(
  name = "cristiano-ronaldo",
  id = 8198,
  seasons = "2009-2015"
)
messidata <- playerstats(
  name = "lionel-messi",
  id = 28003,
  seasons = "2009-2015"
)
Viewing Collected Data in Dataframes:
# MESSI: print the data frame returned by playerstats() for "lionel-messi"
messidata
## Player Comp Season Aps Goals Asts YC YC/RC RC Mins
## 1 Messi Champions League 2014-15 13 10 6 1 0 0 1145
## 2 Messi Copa del Rey 2014-15 6 5 4 1 0 0 540
## 3 Messi La Liga 2014-15 38 43 21 4 0 0 3375
## 4 Messi Champions League 2013-14 7 8 1 0 0 0 630
## 5 Messi Copa del Rey 2013-14 6 5 3 1 0 0 477
## 6 Messi La Liga 2013-14 31 28 12 2 0 0 2498
## 7 Messi Supercopa 2013-14 2 0 0 0 0 0 135
## 8 Messi Champions League 2012-13 11 8 3 0 0 0 827
## 9 Messi Copa del Rey 2012-13 5 4 1 1 0 0 442
## 10 Messi La Liga 2012-13 32 46 14 1 0 0 2629
## 11 Messi Supercopa 2012-13 2 2 0 0 0 0 180
## 12 Messi Champions League 2011-12 11 14 9 2 0 0 990
## 13 Messi Club World Cup 2011-12 2 2 1 0 0 0 180
## 14 Messi Copa del Rey 2011-12 7 3 4 1 0 0 514
## 15 Messi La Liga 2011-12 37 50 20 6 0 0 3270
## 16 Messi Supercopa 2011-12 2 3 2 0 0 0 180
## 17 Messi UEFA Supercup 2011-12 1 1 1 0 0 0 90
## 18 Messi Champions League 2010-11 13 12 4 0 0 0 1050
## 19 Messi Copa del Rey 2010-11 7 7 3 1 0 0 542
## 20 Messi La Liga 2010-11 33 31 21 4 0 0 2862
## 21 Messi Supercopa 2010-11 2 3 0 0 0 0 129
## 22 Messi Champions League 2009-10 11 8 0 0 0 0 985
## 23 Messi Club World Cup 2009-10 2 2 0 1 0 0 158
## 24 Messi Copa del Rey 2009-10 3 1 0 1 0 0 212
## 25 Messi La Liga 2009-10 35 34 13 3 0 0 2841
## 26 Messi Supercopa 2009-10 1 2 0 0 0 0 90
## 27 Messi UEFA Supercup 2009-10 1 0 1 1 0 0 120
# RONALDO: print the data frame returned by playerstats() for "cristiano-ronaldo"
cr7data
## Player Comp Season Aps Goals Asts YC YC/RC RC Mins
## 1 Ronaldo Champions League 2014-15 12 10 4 1 0 0 1064
## 2 Ronaldo La Liga 2014-15 35 48 16 4 0 1 3096
## 3 Ronaldo Copa del Rey 2014-15 2 1 0 0 0 0 118
## 4 Ronaldo Supercopa 2014-15 2 0 0 1 0 0 89
## 5 Ronaldo Club World Cup 2014-15 2 0 2 0 0 0 180
## 6 Ronaldo UEFA Supercup 2014-15 1 2 0 0 0 0 90
## 7 Ronaldo Champions League 2013-14 11 17 6 1 0 0 991
## 8 Ronaldo La Liga 2013-14 30 31 11 4 0 1 2537
## 9 Ronaldo Copa del Rey 2013-14 6 3 2 2 0 0 495
## 10 Ronaldo Champions League 2012-13 12 12 1 1 0 0 1080
## 11 Ronaldo La Liga 2012-13 34 34 11 9 0 0 2716
## 12 Ronaldo Copa del Rey 2012-13 7 7 2 3 0 1 655
## 13 Ronaldo Supercopa 2012-13 2 2 0 0 0 0 180
## 14 Ronaldo Champions League 2011-12 10 10 4 1 0 0 930
## 15 Ronaldo La Liga 2011-12 38 46 13 4 0 0 3353
## 16 Ronaldo Copa del Rey 2011-12 5 3 0 1 0 0 437
## 17 Ronaldo Supercopa 2011-12 2 1 0 1 0 0 180
## 18 Ronaldo Champions League 2010-11 12 6 4 2 0 0 1018
## 19 Ronaldo La Liga 2010-11 34 40 13 2 0 0 2914
## 20 Ronaldo Copa del Rey 2010-11 8 7 1 3 0 0 684
## 21 Ronaldo Champions League 2009-10 6 7 2 0 0 0 450
## 22 Ronaldo La Liga 2009-10 29 26 11 3 1 1 2462
We then clean the data and combine the datasets of the 2 players.
# Stack both players' rows: a full outer merge over all shared columns keeps
# every row from either data frame.
bothdata <- merge(messidata, cr7data, all = TRUE)

# Total goals and assists per season and player, summed over the rows of each
# (Season, Player) group.
bothdata <- group_by(bothdata, Season, Player)
allgoals <- summarise(bothdata, Goals = sum(Goals))
allasts <- summarise(bothdata, Asts = sum(Asts))

# Show the goal totals with the most recent season first.
allgoals[order(desc(allgoals$Season)), ]
## Source: local data frame [12 x 3]
## Groups: Season
##
## Season Player Goals
## 1 2014-15 Messi 58
## 2 2014-15 Ronaldo 61
## 3 2013-14 Messi 41
## 4 2013-14 Ronaldo 51
## 5 2012-13 Messi 60
## 6 2012-13 Ronaldo 55
## 7 2011-12 Messi 73
## 8 2011-12 Ronaldo 60
## 9 2010-11 Messi 53
## 10 2010-11 Ronaldo 53
## 11 2009-10 Messi 47
## 12 2009-10 Ronaldo 33
# Show the assist totals with the most recent season first.
allasts[order(desc(allasts[["Season"]])), ]
## Source: local data frame [12 x 3]
## Groups: Season
##
## Season Player Asts
## 1 2014-15 Messi 31
## 2 2014-15 Ronaldo 22
## 3 2013-14 Messi 16
## 4 2013-14 Ronaldo 19
## 5 2012-13 Messi 18
## 6 2012-13 Ronaldo 14
## 7 2011-12 Messi 37
## 8 2011-12 Ronaldo 17
## 9 2010-11 Messi 28
## 10 2010-11 Ronaldo 18
## 11 2009-10 Messi 14
## 12 2009-10 Ronaldo 13
After refining the datasets we plot the data graphically.
# Goals per season: one point per player and season, a faint connecting line
# and a dashed linear trend line for each player.
goals <- ggplot(data = allgoals, mapping = aes(x = Season, y = Goals))
goals +
  geom_point(mapping = aes(color = Player), size = 3.5) +
  geom_smooth(
    mapping = aes(color = Player, group = Player),
    method = "lm",
    linetype = 2,
    size = 1,
    fill = NA
  ) +
  geom_line(
    mapping = aes(color = Player, group = Player),
    size = 1,
    alpha = .3
  ) +
  scale_y_continuous(
    breaks = as.numeric(allgoals$Goals),
    limits = c(
      min(as.numeric(allgoals$Goals)) - 2,
      max(as.numeric(allgoals$Goals)) + 2
    )
  ) +
  labs(
    x = "Season",
    y = "Goals",
    title = "MESSI VS. RONALDO, 2009 TO PRESENT, GOALS"
  ) +
  scale_color_manual(values = c("steelblue", "red")) +
  theme(
    axis.text.x = element_text(face = "bold", color = "darkblue"),
    axis.text.y = element_text(face = "bold", color = "darkgreen"),
    axis.title = element_text(face = "bold"),
    legend.position = c(1, 1),
    legend.justification = c(1, 1),
    plot.title = element_text(face = "bold")
  )

# Assists per season, drawn with the same layers as the goals plot.
asts <- ggplot(data = allasts, mapping = aes(x = Season, y = Asts))
asts +
  geom_point(mapping = aes(color = Player), size = 3.5) +
  geom_smooth(
    mapping = aes(color = Player, group = Player),
    method = "lm",
    linetype = 2,
    size = 1,
    fill = NA
  ) +
  geom_line(
    mapping = aes(color = Player, group = Player),
    size = 1,
    alpha = .3
  ) +
  scale_y_continuous(
    breaks = as.numeric(allasts$Asts),
    limits = c(
      min(as.numeric(allasts$Asts)) - 2,
      max(as.numeric(allasts$Asts)) + 2
    )
  ) +
  labs(
    x = "Season",
    y = "Assists",
    title = "MESSI VS. RONALDO, 2009 TO PRESENT, ASSISTS"
  ) +
  scale_color_manual(values = c("steelblue", "red")) +
  theme(
    axis.text.x = element_text(face = "bold", color = "darkblue"),
    axis.text.y = element_text(face = "bold", color = "darkgreen"),
    axis.title = element_text(face = "bold"),
    legend.position = c(1, 1),
    legend.justification = c(1, 1),
    plot.title = element_text(face = "bold")
  )
We then analyze the data by competition, i.e. we see how many goals and assists each of them has scored or made in the Champions League and in the domestic league (La Liga).
# Shared layers of the player-versus-player season plots: one point per player
# and season, a faint connecting line and a dashed linear trend per player.
#   values      the plotted y values; they define the axis breaks and limits
#   y_label     y axis title
#   plot_title  plot title
#   pin_legend  when TRUE, also set legend.position = c(1, 1)
comparison_layers <- function(values, y_label, plot_title, pin_legend = TRUE) {
  values <- as.numeric(values)
  layers <- list(
    geom_point(aes(color = Player), size = 3.5),
    geom_smooth(
      method = "lm", linetype = 2, size = 1,
      aes(color = Player, group = Player), fill = NA
    ),
    geom_line(aes(color = Player, group = Player), size = 1, alpha = .3),
    scale_y_continuous(
      breaks = values,
      limits = c(min(values) - 2, max(values) + 2)
    ),
    labs(x = "Season", y = y_label, title = plot_title),
    scale_color_manual(values = c("steelblue", "red")),
    theme(
      axis.text.x = element_text(face = "bold", color = "darkblue"),
      axis.text.y = element_text(face = "bold", color = "darkgreen"),
      axis.title = element_text(face = "bold"),
      legend.justification = c(1, 1),
      plot.title = element_text(face = "bold")
    )
  )
  if (pin_legend) {
    layers <- c(layers, list(theme(legend.position = c(1, 1))))
  }
  layers
}

cr7data <- arrange(cr7data, Season)

# La Liga only. These two plots never set legend.position, so the legend
# stays at its default place.
cr7laliga <- filter(cr7data, Comp == "La Liga")
messilaliga <- filter(messidata, Comp == "La Liga")
laliga <- merge(messilaliga, cr7laliga, all = TRUE)

lgoals <- ggplot(laliga, aes(Season, as.numeric(Goals)))
lgoals + comparison_layers(
  laliga$Goals,
  y_label = "Goals",
  plot_title = "La Liga: MESSI VS. RONALDO, 2009 -> PRESENT, GOALS",
  pin_legend = FALSE
)

lassists <- ggplot(laliga, aes(Season, as.numeric(Asts)))
lassists + comparison_layers(
  laliga$Asts,
  y_label = "Assists",
  plot_title = "La Liga: MESSI VS. RONALDO, 2009 -> PRESENT, ASSISTS",
  pin_legend = FALSE
)

# Champions League only.
messicl <- filter(messidata, Comp == "Champions League")
cr7cl <- filter(cr7data, Comp == "Champions League")
cl <- merge(messicl, cr7cl, all = TRUE)

clgoals <- ggplot(cl, aes(Season, as.numeric(Goals)))
clgoals + comparison_layers(
  cl$Goals,
  y_label = "Goals",
  plot_title = "CL: MESSI VS. RONALDO, 2009 -> PRESENT, GOALS"
)

classists <- ggplot(cl, aes(Season, as.numeric(Asts)))
classists + comparison_layers(
  cl$Asts,
  y_label = "Assists",
  plot_title = "CL: MESSI VS. RONALDO, 2009 -> PRESENT, ASSISTS"
)
| Season | Player | Liga_Goals | Liga_Asts | Liga_Aps | Liga_Mins | CL_Goals | CL_Asts | CL_Aps | CL_Mins |
|---|---|---|---|---|---|---|---|---|---|
| 2014-15 | Messi | 43 | 21 | 38 | 3375 | 10 | 6 | 13 | 1145 |
| 2014-15 | Ronaldo | 48 | 16 | 35 | 3096 | 10 | 4 | 12 | 1064 |
| 2013-14 | Messi | 28 | 12 | 31 | 2498 | 8 | 1 | 7 | 630 |
| 2013-14 | Ronaldo | 31 | 11 | 30 | 2537 | 17 | 6 | 11 | 991 |
| 2012-13 | Messi | 46 | 14 | 32 | 2629 | 8 | 3 | 11 | 827 |
| 2012-13 | Ronaldo | 34 | 11 | 34 | 2716 | 12 | 1 | 12 | 1080 |
| 2011-12 | Messi | 50 | 20 | 37 | 3270 | 14 | 9 | 11 | 990 |
| 2011-12 | Ronaldo | 46 | 13 | 38 | 3353 | 10 | 4 | 10 | 930 |
| 2010-11 | Messi | 31 | 21 | 33 | 2862 | 12 | 4 | 13 | 1050 |
| 2010-11 | Ronaldo | 40 | 13 | 34 | 2914 | 6 | 4 | 12 | 1018 |
| 2009-10 | Messi | 34 | 13 | 35 | 2841 | 8 | 0 | 11 | 985 |
| 2009-10 | Ronaldo | 26 | 11 | 29 | 2462 | 7 | 2 | 6 | 450 |
cr7data <- arrange(cr7data, desc(Season))

# One row per season for a single player, with the La Liga and Champions
# League figures side by side. Rows are matched on Season and Player rather
# than on row position, so the result does not depend on how `player_data` is
# sorted; a season missing one of the two competitions gets NA in its columns.
competition_summary <- function(player_data) {
  key_cols <- c("Season", "Player")
  stat_cols <- c("Goals", "Asts", "Aps", "Mins")

  liga <- filter(player_data, Comp == "La Liga")[, c(key_cols, stat_cols)]
  names(liga) <- c(key_cols, paste0("Liga_", stat_cols))

  cl <- filter(player_data, Comp == "Champions League")[
    , c(key_cols, stat_cols)
  ]
  names(cl) <- c(key_cols, paste0("CL_", stat_cols))

  merge(liga, cl, by = key_cols, all = TRUE)
}

crmaindf <- competition_summary(cr7data)
messimaindf <- competition_summary(messidata)

# Most recent season first; within a season the players appear by name.
mainmerge <- arrange(rbind(messimaindf, crmaindf), desc(Season), Player)

# Turn every statistic column (all but Season and Player) into text before
# building the table.
# NOTE(review): presumably this keeps xtable from printing the counts with
# decimal places - confirm.
stat_positions <- setdiff(seq_len(ncol(mainmerge)), 1:2)
mainmerge[stat_positions] <- lapply(mainmerge[stat_positions], as.character)

xt <- xtable(mainmerge)
print(xt, type = "html", include.rownames = FALSE)