History of Rugby Union Matches Between England and Wales

1. Loading Essential Libraries

# Load the library
library(magrittr)
library(rvest)
library(ggplot2)
library(gtable)
library(grid)
library(taRifx)
library(xtable)
library(pander)
library(stringr)
library(plyr)

2. Setting Up the Data Frame

# Page that lists every England v Wales rugby union match
URL <- "http://en.wikipedia.org/wiki/History_of_rugby_union_matches_between_England_and_Wales"

# Download and parse the page.
# read_html() replaces html(), which rvest deprecated and later removed.
rugbyHTML <- read_html(URL)

# Take the third table carrying the "wikitable" class and convert it
# into a data frame
rugbyData <- rugbyHTML %>%
  html_nodes("table.wikitable") %>%
  .[[3]] %>%
  html_table()

# Viewing data
head(rugbyData)

##   No.             Date                       Venue   Score  Winner
## 1 126  6 February 2015 Millennium Stadium, Cardiff 16 – 21 England
## 2 125     9 March 2014  Twickenham Stadium, London 29 – 18 England
## 3 124    16 March 2013 Millennium Stadium, Cardiff  30 – 3   Wales
## 4 123 25 February 2012  Twickenham Stadium, London 12 – 19   Wales
## 5 122   13 August 2011 Millennium Stadium, Cardiff  19 – 9   Wales
## 6 121    6 August 2011          Twickenham, London 23 – 19 England
##                         Competition Match report
## 1                  2015 Six Nations             
## 2                  2014 Six Nations             
## 3                  2013 Six Nations          BBC
## 4                  2012 Six Nations          BBC
## 5 2011 Rugby World Cup warm up test          BBC
## 6 2011 Rugby World Cup warm up test          BBC

# Parse the "day month year" strings into Date objects
rugbyData[["Date"]] <- as.Date(rugbyData[["Date"]], format = "%d %b %Y")

# Drop the columns this tutorial does not use (positions 1, 6 and 7)
rugbyData <- rugbyData[, -c(1, 6, 7)]

# Work on a copy without the first row (the just-announced 2015 result);
# rugbyData itself keeps that row so it stays available for testing
rugbyData0 <- rugbyData[-1, ]

# Dropping a row leaves the old row labels behind, so reset them
rownames(rugbyData0) <- NULL

# Viewing final table
head(rugbyData0)

##         Date                       Venue   Score  Winner
## 1 2014-03-09  Twickenham Stadium, London 29 – 18 England
## 2 2013-03-16 Millennium Stadium, Cardiff  30 – 3   Wales
## 3 2012-02-25  Twickenham Stadium, London 12 – 19   Wales
## 4 2011-08-13 Millennium Stadium, Cardiff  19 – 9   Wales
## 5 2011-08-06          Twickenham, London 23 – 19 England
## 6 2011-02-04 Millennium Stadium, Cardiff 19 – 26 England

# The scores are still in "xx - yy" text form,
# so pull every run of digits out of each score string
matches <- regmatches(rugbyData0$Score, gregexpr("[[:digit:]]+", rugbyData0$Score))
#matches <- as.data.frame(as.numeric(unlist(matches)))

# One column per score string, one row per number found in it
matches <- as.data.frame(matches)

# Number the columns from the data itself instead of a hard-coded 1:125,
# so this step keeps working when the number of matches changes
colnames(matches) <- seq_len(ncol(matches))

# Keep row 1 (first number of each score) and row 2 (second number);
# rows 3 and 4, when present, are discarded
matches1 <- matches[-c(2,3,4), ]
matches2 <- matches[-c(1,3,4), ]

tail(matches1)

##    1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1 29 30 12 19 23 19 30 23 19 62 27 47 11 31 28  9  9 50 15 46 32 60 13 21
##   25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1  9 15 10 24  6 34 12  3  3 19 21 24 15 13 17 21  9 27  6 14  9 20 16 25
##   49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1  3 22 13 30 11 34  6 14  6  6  0  6 14  5  3  0  3  3  9  3  6 23  5  9
##   73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 1  3  6  3 14  4  0  3  0  3 12 11  3  8  8 11  3 12  9  7 28 18 19 10  0
##   97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 1  8 15 11   8  18  22   3  25  14  21   8  13   3  26  14  11  25   6  24
##   116 117 118 119 120 121 122 123 124 125
## 1  12  17   3   0   0   0   1   1   0   8

# Show matches2, the row-2 values of every column
# (presumably the second number of each score - verify against the page)
tail(matches2)

##    1 2  3 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 2 18 3 19 9 19 26 17 15 26  5 18 13  9 21 17 43 26 10 44 12 31 26 34 15 23
##   26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 2  8  9  0 25  6  9 11 16 12 18 15 24 13  7 19  8  3  9  9 21  4 12  9 12
##   50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## 2  6 17  9 11 21 11  3  6 13  0  3  6  0  3  3  8  0  6  8  8  5 11  3  3
##   74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
## 2  9  0  8  3  0  3  9  7  5 11 11  3 10  9  3  6 17  3  6  3  5  9 12  0
##   98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 2 11  6   0  28   0  16   0  14   5   9   0  13   3   7   0   0  14   3
##   116 117 118 119 120 121 122 123 124 125
## 2  11   0   7   1   0   5   4   5   0  30

# Overwrite columns 121 to 125 (the last five) with hand-entered scores.
# NOTE(review): the replacement values are hard-coded and tied to the
# column positions of the table as it was scraped - confirm before reuse.
matches1[, 121:125] <- c(5,4,5,0,30)
matches2[, 121:125] <- c(3,7,3,10,0)

# Transpose the one-row tables so each match becomes a row, then attach
# them to the main data frame as Score1 (first number) and Score2 (second)
t(matches1)
rugbyData0$Score1  <-  t(matches1)
rugbyData0$Score2  <-  t(matches2)

# Remove the original text "Score" column (third column)
rugbyData0  <- rugbyData0[, -3]

# Checking Mode and Class of the Data Frame
sapply(rugbyData0, mode)
sapply(rugbyData0, class)
# The two score columns are not plain numeric vectors yet, so convert them

rugbyData0 <- transform(rugbyData0, Score1 = as.numeric(Score1))
rugbyData0 <- transform(rugbyData0, Score2 = as.numeric(Score2))

### Separating Winner from Looser

# Derive three columns from the two parsed scores of every match:
#   WinnerScore - the larger of Score1 and Score2
#   LooserScore - the smaller of Score1 and Score2
#   DrawScore   - the shared score when both are equal, otherwise 0
# Computed on whole columns instead of a row-by-row loop, and the new
# columns are created by name rather than renamed by position.
# A missing score now yields NA for that row instead of stopping the script.
rugbyData1 <- rugbyData0
rugbyData1$WinnerScore <- pmax(rugbyData1$Score1, rugbyData1$Score2)
rugbyData1$LooserScore <- pmin(rugbyData1$Score1, rugbyData1$Score2)
rugbyData1$DrawScore <- ifelse(
  rugbyData1$Score1 == rugbyData1$Score2,
  rugbyData1$Score1,
  0
)

# The raw scores are no longer needed
rugbyData1$Score1 <- NULL
rugbyData1$Score2 <- NULL

# So our data frame requires to be converted into workable dataset

# Allocate the scores to each nation using the "Winner" column:
#   - the named winner receives WinnerScore, the other side LooserScore;
#   - for rows labelled "draw" both sides receive WinnerScore.
# This replaces four row-by-row loops (one of them an exact duplicate of
# another) with two whole-column assignments.
# NOTE(review): as in the original loops, DrawScore is added for rows not
# labelled "draw". DrawScore is 0 whenever the two parsed scores differ, so
# this only matters if a row names a winner while its scores are equal -
# confirm whether that addition is intended.
rugbyData1$EnglandScore <- ifelse(
  rugbyData1$Winner == "draw",
  rugbyData1$WinnerScore,
  ifelse(
    rugbyData1$Winner == "England",
    rugbyData1$WinnerScore,
    rugbyData1$LooserScore
  ) + rugbyData1$DrawScore
)

rugbyData1$WalesScore <- ifelse(
  rugbyData1$Winner == "draw",
  rugbyData1$WinnerScore,
  ifelse(
    rugbyData1$Winner == "Wales",
    rugbyData1$WinnerScore,
    rugbyData1$LooserScore
  ) + rugbyData1$DrawScore
)




# Lookup tables mapping the last word of a venue string ("GameVenue")
# to the side hosting the game ("Venue": England, Wales or Other)
England <- data.frame(
  Venue = "England",
  GameVenue = c("London", "Leeds", "Birkenhead", "Gloucester", "Leicester",
                "Richmond", "Yorkshire", "Bristol")
)

Wales <- data.frame(
  Venue = "Wales",
  GameVenue = c("Cardiff", "Swansea", "Newport", "Llanelli")
)

Other <- data.frame(
  Venue = "Other",
  GameVenue = "Australia"
)

# Stack the three tables with nested full outer merges on both columns
Venue <- merge(
  England,
  merge(Wales, Other, by = c("Venue", "GameVenue"), all = TRUE),
  by = c("Venue", "GameVenue"),
  all = TRUE
)

# Extracting the last space-separated word of every venue string
# (for example "Cardiff" from "Millennium Stadium, Cardiff").
# Done for the whole column at once instead of filling it row by row.
rugbyData1$GameVenue <- NULL
rugbyData1$GameVenue <- vapply(
  strsplit(rugbyData1$Venue, split = " "),
  function(words) tail(words, 1),
  character(1),
  USE.NAMES = FALSE
)

# So converting all into one Final Data Set:
# attach the hosting side to every match through the GameVenue key
rugbyDataFinal <- join(rugbyData1, Venue, by = 'GameVenue')

# Both inputs carry a column called "Venue"; removing by name drops the
# first one, so the output below shows the hosting side under "Venue"
# (presumably the stadium text was the first of the two - verify)
rugbyDataFinal$Venue <- NULL
rugbyDataFinal$WinnerScore <- NULL
rugbyDataFinal$LooserScore <- NULL
rugbyDataFinal$DrawScore <- NULL
rugbyDataFinal$GameVenue <- NULL

head(rugbyDataFinal)

##         Date  Winner EnglandScore WalesScore   Venue
## 1 2014-03-09 England           29         18 England
## 2 2013-03-16   Wales            3         30   Wales
## 3 2012-02-25   Wales           12         19 England
## 4 2011-08-13   Wales            9         19   Wales
## 5 2011-08-06 England           23         19 England
## 6 2011-02-04 England           26         19   Wales

# Wow... this dataset can be used for many statistical purposes

# As done in one of the tutorials on the internet:
# last part of the data cleaning, making the data set workable.
# We need to know whether the winner won at a "Home" venue or "Away".

# A match without a hosting side cannot be classified. Fail with a clear
# message rather than the cryptic error the row-by-row `if` used to raise.
if (anyNA(rugbyDataFinal$Venue)) {
  stop("Some matches have no England/Wales/Other venue mapping; ",
       "extend the venue lookup table.", call. = FALSE)
}

# "Home" when the winner's name equals the hosting side, otherwise "Away".
# Rows whose Winner is "draw", or whose venue is "Other", can never match
# and are therefore labelled "Away" as well.
rugbyDataFinal$WinnerVenue <- ifelse(
  rugbyDataFinal$Winner == rugbyDataFinal$Venue,
  "Home",
  "Away"
)

# Copy both venue columns back to the detailed table
# (this relies on both tables having their rows in the same order)
rugbyData1$GamesVenue <- rugbyDataFinal$Venue
rugbyData1$WinnersVenue <- rugbyDataFinal$WinnerVenue

# Persist the detailed table in three formats: rds, RData and csv
saveRDS(rugbyData1, file = "rugbyData.rds")
save(rugbyData1, file = "rugbyData.RData")
write.csv(rugbyData1, file = "rugbyData.csv")

# Persist the reduced table in the same three formats
saveRDS(rugbyDataFinal, file = "rugbyDataFinal.rds")
save(rugbyDataFinal, file = "rugbyDataFinal.RData")
write.csv(rugbyDataFinal, file = "rugbyDataFinal.csv")

# All the files are now saved in the working directory.
# Clear every object from the R environment to free memory,
# then reload what is needed from disk.

rm(list = ls())

# Loading files from the working directory; my preference is "rds" files
RugbyData <- readRDS("rugbyData.rds")
RugbyDataFinal <- readRDS("rugbyDataFinal.rds")

3. Plotting the Data

# Let us plot the data first.
# Blue dots are the winner's score of each match and red dots the
# loser's score; the blue line is a loess smoother through the
# winner's scores.

p <- ggplot(RugbyData, aes(x = Date, y = WinnerScore))
p +
  geom_point(colour = "blue", size = 3, shape = 20) +
  geom_point(data = RugbyData, aes(x = Date, y = LooserScore),
             colour = "red", size = 3, shape = 20) +
  theme(axis.text.x = element_text(angle = 90, size = 11, vjust = 0.5, face = "bold", color = "black"),
        axis.text.y = element_text(size = 11, vjust = 0.5, face = "bold", color = "black"),
        axis.title.x = element_text(size = 15, color = "forestgreen", vjust = 0.35, face = "bold"),
        axis.title.y = element_text(size = 13, color = "blue", vjust = 0.35, face = "bold")) +
  stat_smooth(method = "loess", se = FALSE, fill = "blue", colour = "blue", size = 1) +
  # Pass the labels as named arguments: wrapping them in list() is not
  # honoured by current ggplot2 releases, which leaves the defaults in place
  labs(x = "Year", y = "Winner's Score",
       title = "Winner's(blue, with smoothing line) \nAnd Looser's (red) Scores - Yearly")

# But.... YYYuuuukkkkk!!!
# This plot looks like missing many things and requires some make-up

4. Make-up

# Guys, I am deliberately not showing my R code here.
# You have to "LIKE" my post and my Facebook page, and reply to the post...
# Then I will email you the code for that graph.

## ggplot - plotting on dual axis (both the Y axis)

# So in the above plot we cannot see the significant differences between two teams.

5. Predict: who will win this year

# If we look at historical data about who has won the previous encounters,
# we see that the two sides are almost level (England lead by a single win),
# with nothing statistically significant between them.
# Here is the result

# Count the matches by outcome label
England_Win <- sum(RugbyData[["Winner"]] == "England")
Wales_Win <- sum(RugbyData[["Winner"]] == "Wales")
Draw <- sum(RugbyData[["Winner"]] == "draw")

# One-row summary table with readable column headers
cnames <- c("Wales Wins", "England Wins", "Draw")
test <- setNames(data.frame(Wales_Win, England_Win, Draw), cnames)

pander(test)

Wales Wins	England Wins	Draw
56	57	12

# Clearly there is no reason to say that either England or Wales
# has won more games; over the years they are almost the same.
# One might say that England has won 1 game more than Wales,
# but 1 in 113 (excluding 12 draws) doesn't make much difference.

# Let us perform a t-test to get a basic statistical idea.
# Compares the mean of the winners' scores with the mean of the losers'
# scores; with these default arguments the output below reports a
# Welch two-sample t-test.
# NOTE(review): both columns come from the same matches, so the two samples
# are not independent - confirm whether a paired test was intended.
ttest <- t.test(RugbyData$WinnerScore, RugbyData$LooserScore)

# Results of the t-test (printing the stored test object)
ttest

## 
##  Welch Two Sample t-test
## 
## data:  RugbyData$WinnerScore and RugbyData$LooserScore
## t = 8.3961, df = 188.195, p-value = 1.094e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   7.687248 12.408752
## sample estimates:
## mean of x mean of y 
##    17.352     7.304

# Clearly two means are significantly different from each other and from zero.

# Density plot of p-values in T-test
# NOTE(review): replicate() re-reads the p-value already stored in `ttest`,
# so all 100 values are identical and the curve reflects a single number,
# not a distribution of p-values - confirm this is what was intended.
plot(density(replicate(100, ttest$p.value)), 
     main = "Plot of p-values", col="red", lwd=2)

# Preparing a small dataset for logistic regression:
# keep the date, the winner, both scores and the winner's venue flag
data1 <- RugbyData[, c("Date", "Winner", "WinnerScore", "LooserScore",
                       "WinnersVenue")]
summary(data1)

##       Date               Winner           WinnerScore     LooserScore    
##  Min.   :1881-02-19   Length:125         Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:1920-01-17   Class :character   1st Qu.: 9.00   1st Qu.: 3.000  
##  Median :1958-01-18   Mode  :character   Median :14.00   Median : 6.000  
##  Mean   :1953-03-31                      Mean   :17.35   Mean   : 7.304  
##  3rd Qu.:1988-02-06                      3rd Qu.:24.00   3rd Qu.:11.000  
##  Max.   :2014-03-09                      Max.   :62.00   Max.   :31.000  
##  WinnersVenue      
##  Length:125        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Inspect the column types of data1 before converting text columns to factors
str(data1)

## 'data.frame':    125 obs. of  5 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : chr  "England" "Wales" "Wales" "Wales" ...
##  $ WinnerScore : num  29 30 19 19 23 26 30 23 26 62 ...
##  $ LooserScore : num  18 3 12 9 19 19 17 15 19 5 ...
##  $ WinnersVenue: chr  "Home" "Home" "Away" "Home" ...

# Turn the two text columns into factors, then re-check the structure
data1$Winner <- as.factor(data1$Winner)
data1$WinnersVenue <- as.factor(data1$WinnersVenue)
str(data1)

## 'data.frame':    125 obs. of  5 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 ...
##  $ WinnerScore : num  29 30 19 19 23 26 30 23 26 62 ...
##  $ LooserScore : num  18 3 12 9 19 19 17 15 19 5 ...
##  $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 ...

# Binary outcome for the regression: 1 when England won the match,
# 0 otherwise (Wales wins and draws). Computed for the whole column at
# once instead of being filled in one row at a time.
data1$EnglandWins <- as.numeric(data1$Winner == "England")
# Done....

saveRDS(data1, file = "rugbyData1.rds", refhook = NULL)

# View Dataset
summary(data1)

##       Date                Winner    WinnerScore     LooserScore    
##  Min.   :1881-02-19   draw   :12   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:1920-01-17   England:57   1st Qu.: 9.00   1st Qu.: 3.000  
##  Median :1958-01-18   Wales  :56   Median :14.00   Median : 6.000  
##  Mean   :1953-03-31                Mean   :17.35   Mean   : 7.304  
##  3rd Qu.:1988-02-06                3rd Qu.:24.00   3rd Qu.:11.000  
##  Max.   :2014-03-09                Max.   :62.00   Max.   :31.000  
##  WinnersVenue  EnglandWins   
##  Away:53      Min.   :0.000  
##  Home:72      1st Qu.:0.000  
##               Median :0.000  
##               Mean   :0.456  
##               3rd Qu.:1.000  
##               Max.   :1.000

# Keep the date, the winner, each nation's score, the hosting side and
# the winner's Home/Away flag in a separate data frame
data2 <- RugbyData[, c("Date", "Winner", "EnglandScore", "WalesScore",
                       "GamesVenue", "WinnersVenue")]
summary(data2)

##       Date               Winner           EnglandScore     WalesScore  
##  Min.   :1881-02-19   Length:125         Min.   : 0.00   Min.   : 0.0  
##  1st Qu.:1920-01-17   Class :character   1st Qu.: 4.00   1st Qu.: 5.0  
##  Median :1958-01-18   Mode  :character   Median : 9.00   Median :10.0  
##  Mean   :1953-03-31                      Mean   :13.06   Mean   :11.6  
##  3rd Qu.:1988-02-06                      3rd Qu.:17.00   3rd Qu.:18.0  
##  Max.   :2014-03-09                      Max.   :62.00   Max.   :34.0  
##    GamesVenue WinnersVenue      
##  England:63   Length:125        
##  Wales  :60   Class :character  
##  Other  : 2   Mode  :character  
##                                 
##                                 
##

# Inspect the column types of data2 before converting text columns to factors
str(data2)

## 'data.frame':    125 obs. of  6 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : chr  "England" "Wales" "Wales" "Wales" ...
##  $ EnglandScore: num  29 3 12 9 23 26 30 15 19 62 ...
##  $ WalesScore  : num  18 30 19 19 19 19 17 23 26 5 ...
##  $ GamesVenue  : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 ...
##  $ WinnersVenue: chr  "Home" "Home" "Away" "Home" ...

# Turn the two text columns into factors, then re-check the structure
data2$Winner <- as.factor(data2$Winner)
data2$WinnersVenue <- as.factor(data2$WinnersVenue)
str(data2)

## 'data.frame':    125 obs. of  6 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 ...
##  $ EnglandScore: num  29 3 12 9 23 26 30 15 19 62 ...
##  $ WalesScore  : num  18 30 19 19 19 19 17 23 26 5 ...
##  $ GamesVenue  : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 ...
##  $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 ...

# 1 when the winner's venue flag is "Home", 0 otherwise.
# Computed once on the whole column; the original filled it row by row
# and then repeated the identical loop a second time.
data2$HomeVenue <- as.numeric(data2$WinnersVenue == "Home")

plot(data2$WinnersVenue~data2$Winner, col=c("red", "green"), 
     xlab="Winner Team", ylab="Venue of Game", main="Winning w.r.t. Home or Away Venue")

# Label each match by who won at home:
#   "HomeEngland" - England won and the venue flag is "Home"
#   "HomeWales"   - Wales won and the venue flag is "Home"
#   "HomeOther"   - every remaining match
data2$Home <- ifelse(
  data2$WinnersVenue == "Home" & data2$Winner == "England",
  "HomeEngland",
  ifelse(
    data2$WinnersVenue == "Home" & data2$Winner == "Wales",
    "HomeWales",
    "HomeOther"
  )
)
data2[, 'Home'] <- as.factor(data2[, 'Home'])

# Saving into rds
saveRDS(data2, file = "rugbyData2.rds", refhook = NULL)

# Clear the environment, then reload the first prepared data set from disk
rm(list=ls())

Data1 <- readRDS("rugbyData1.rds", refhook = NULL)
head(Data1)

##         Date  Winner WinnerScore LooserScore WinnersVenue EnglandWins
## 1 2014-03-09 England          29          18         Home           1
## 2 2013-03-16   Wales          30           3         Home           0
## 3 2012-02-25   Wales          19          12         Away           0
## 4 2011-08-13   Wales          19           9         Home           0
## 5 2011-08-06 England          23          19         Home           1
## 6 2011-02-04 England          26          19         Away           1

# Reload the second prepared data set from disk and preview it
Data2 <- readRDS(file = "rugbyData2.rds")
head(Data2)

##         Date  Winner EnglandScore WalesScore GamesVenue WinnersVenue
## 1 2014-03-09 England           29         18    England         Home
## 2 2013-03-16   Wales            3         30      Wales         Home
## 3 2012-02-25   Wales           12         19    England         Away
## 4 2011-08-13   Wales            9         19      Wales         Home
## 5 2011-08-06 England           23         19    England         Home
## 6 2011-02-04 England           26         19      Wales         Away
##   HomeVenue        Home
## 1         1 HomeEngland
## 2         1   HomeWales
## 3         0   HomeOther
## 4         1   HomeWales
## 5         1 HomeEngland
## 6         0   HomeOther

# Let us work on prediction now

History of Rugby Union Matches Between England and Wales

Manoj Kumar

1. Loading Essential Libraries

2. Setting Up the Data Frame

3. Plotting the Data

4. Make-up

5. Predict: who will win this year

COMING SOON…..

Part 2