# Load the library
library(magrittr)
library(rvest)
library(ggplot2)
library(gtable)
library(grid)
library(taRifx)
library(xtable)
library(pander)
library(stringr)
library(plyr)
# setting URL to fetch data from
URL <- "http://en.wikipedia.org/wiki/History_of_rugby_union_matches_between_England_and_Wales"
# Download and parse the page. read_html() (re-exported by rvest) replaces
# html(), which was deprecated in rvest 0.3.0 and removed in later releases.
rugbyHTML <- read_html(URL)
# Take the third "wikitable" on the page and convert it to a data frame
rugbyData <- rugbyHTML %>%
  html_nodes("table.wikitable") %>%
  .[[3]] %>%
  html_table()
# Viewing data
head(rugbyData)
## No. Date Venue Score Winner
## 1 126 6 February 2015 Millennium Stadium, Cardiff 16 – 21 England
## 2 125 9 March 2014 Twickenham Stadium, London 29 – 18 England
## 3 124 16 March 2013 Millennium Stadium, Cardiff 30 – 3 Wales
## 4 123 25 February 2012 Twickenham Stadium, London 12 – 19 Wales
## 5 122 13 August 2011 Millennium Stadium, Cardiff 19 – 9 Wales
## 6 121 6 August 2011 Twickenham, London 23 – 19 England
## Competition Match report
## 1 2015 Six Nations
## 2 2014 Six Nations
## 3 2013 Six Nations BBC
## 4 2012 Six Nations BBC
## 5 2011 Rugby World Cup warm up test BBC
## 6 2011 Rugby World Cup warm up test BBC
# Converting Dates format
rugbyData$Date <- as.Date(rugbyData$Date, format = "%d %b %Y")
# Drop the columns this tutorial does not use (positions 1, 6 and 7)
rugbyData <- rugbyData[, -c(1, 6, 7)]
# The first row is the freshly announced 2015 result. rugbyData keeps it;
# rugbyData0 is the working copy without it, held back for testing.
rugbyData0 <- rugbyData[-1, ]
# Dropping a row leaves the remaining row names starting at 2; reset them
row.names(rugbyData0) <- NULL
# Viewing final table
head(rugbyData0)
## Date Venue Score Winner
## 1 2014-03-09 Twickenham Stadium, London 29 – 18 England
## 2 2013-03-16 Millennium Stadium, Cardiff 30 – 3 Wales
## 3 2012-02-25 Twickenham Stadium, London 12 – 19 Wales
## 4 2011-08-13 Millennium Stadium, Cardiff 19 – 9 Wales
## 5 2011-08-06 Twickenham, London 23 – 19 England
## 6 2011-02-04 Millennium Stadium, Cardiff 19 – 26 England
# Ok. we still have scores in "xx - yy" format
# So lets grep it to "xx" and "yy" in different columns
# Pull every run of digits out of each Score string. The result is a list with
# one character vector per match row of rugbyData0.
matches <- regmatches(rugbyData0$Score, gregexpr("[[:digit:]]+", rugbyData0$Score))
#matches <- as.data.frame(as.numeric(unlist(matches)))
# Turn the list into a data frame: one column per match, one row per number
# found in the score string (the row indices used below imply up to four rows).
matches <- as.data.frame(matches)
# NOTE(review): 125 is the hard-coded number of matches; this assignment fails
# if the scraped table has a different number of rows.
colnames(matches) <- 1:125
# Keep only the first number of each score (rows 2-4 dropped) ...
matches1 <- matches[-c(2,3,4), ]
# ... and only the second number of each score (rows 1, 3 and 4 dropped)
matches2 <- matches[-c(1,3,4), ]
# Both are one-row data frames, so tail() prints the whole row
tail(matches1)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1 29 30 12 19 23 19 30 23 19 62 27 47 11 31 28 9 9 50 15 46 32 60 13 21
## 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1 9 15 10 24 6 34 12 3 3 19 21 24 15 13 17 21 9 27 6 14 9 20 16 25
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 3 22 13 30 11 34 6 14 6 6 0 6 14 5 3 0 3 3 9 3 6 23 5 9
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 1 3 6 3 14 4 0 3 0 3 12 11 3 8 8 11 3 12 9 7 28 18 19 10 0
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 1 8 15 11 8 18 22 3 25 14 21 8 13 3 26 14 11 25 6 24
## 116 117 118 119 120 121 122 123 124 125
## 1 12 17 3 0 0 0 1 1 0 8
# Print the single row holding the second number of every score
tail(matches2)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 2 18 3 19 9 19 26 17 15 26 5 18 13 9 21 17 43 26 10 44 12 31 26 34 15 23
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 2 8 9 0 25 6 9 11 16 12 18 15 24 13 7 19 8 3 9 9 21 4 12 9 12
## 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## 2 6 17 9 11 21 11 3 6 13 0 3 6 0 3 3 8 0 6 8 8 5 11 3 3
## 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
## 2 9 0 8 3 0 3 9 7 5 11 11 3 10 9 3 6 17 3 6 3 5 9 12 0
## 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 2 11 6 0 28 0 16 0 14 5 9 0 13 3 7 0 0 14 3
## 116 117 118 119 120 121 122 123 124 125
## 2 11 0 7 1 0 5 4 5 0 30
# replacing last 5 elements in the tail with actual score
# NOTE(review): these values are hard-coded for columns 121-125 (the five
# oldest matches). The numbers written into matches1 are the ones tail(matches2)
# printed above for those columns; presumably the old score notation made the
# digit regex pick up the wrong numbers - TODO confirm against the source table.
matches1[, 121:125] <- c(5,4,5,0,30)
matches2[, 121:125] <- c(3,7,3,10,0)
# Print the transposed first scores (one row per match)
t(matches1)
# t() returns a matrix, so Score1 and Score2 are attached as one-column
# matrices rather than plain vectors; they are coerced to numeric further down.
rugbyData0$Score1 <- t(matches1)
rugbyData0$Score2 <- t(matches2)
# remove "Score" column
# (selected by position: "Score" is the third column at this point)
rugbyData0 <- rugbyData0[, -3]
# Inspect the storage mode and the class of every column
sapply(rugbyData0, mode)
sapply(rugbyData0, class)
# The two score columns are not plain numeric vectors yet, so coerce both of
# them in a single transform() call
rugbyData0 <- transform(
  rugbyData0,
  Score1 = as.numeric(Score1),
  Score2 = as.numeric(Score2)
)
### Separating Winner from Looser
# Derive, for every match, the higher score (WinnerScore), the lower score
# (LooserScore) and the shared score when the match was level (DrawScore,
# 0 otherwise). For a level match WinnerScore and LooserScore both hold the
# shared score.
# Vectorised: no row loop, and the new columns are created by name instead of
# being renamed by position afterwards.
rugbyData1 <- rugbyData0
rugbyData1$WinnerScore <- pmax(rugbyData1$Score1, rugbyData1$Score2)
rugbyData1$LooserScore <- pmin(rugbyData1$Score1, rugbyData1$Score2)
rugbyData1$DrawScore <- ifelse(
  rugbyData1$Score1 == rugbyData1$Score2,
  rugbyData1$Score1,
  0
)
# The raw per-side scores are no longer needed
rugbyData1$Score1 <- NULL
rugbyData1$Score2 <- NULL
# So our data frame requires to be converted into workable dataset
# Per-team scores. A side gets WinnerScore when it is named as the winner or
# when the match is recorded as a "draw" (both sides share that score), and
# LooserScore otherwise.
# WinnerScore/LooserScore already include the shared score of a level match,
# so DrawScore must not be added again here: doing so double-counts whenever a
# level score is labelled with a winner.
rugbyData1$EnglandScore <- ifelse(
  rugbyData1$Winner %in% c("England", "draw"),
  rugbyData1$WinnerScore,
  rugbyData1$LooserScore
)
rugbyData1$WalesScore <- ifelse(
  rugbyData1$Winner %in% c("Wales", "draw"),
  rugbyData1$WinnerScore,
  rugbyData1$LooserScore
)
# Separating Venue as whether "Home" or "Away" or "Other"
# Lookup table: "GameVenue" is a town (or country) name, "Venue" is the side
# that hosts matches played there
England <- data.frame(
  Venue = "England",
  GameVenue = c("London", "Leeds", "Birkenhead", "Gloucester", "Leicester",
                "Richmond", "Yorkshire", "Bristol")
)
Wales <- data.frame(
  Venue = "Wales",
  GameVenue = c("Cardiff", "Swansea", "Newport", "Llanelli")
)
Other <- data.frame(Venue = "Other", GameVenue = "Australia")
# Stack the three tables with full outer merges on both columns
Venue <- merge(
  England,
  merge(Wales, Other, by = c("Venue", "GameVenue"), all = TRUE),
  by = c("Venue", "GameVenue"),
  all = TRUE
)
# Extracting last word from each row in a column
# The lookup key is the last space-separated word of each venue string
# (e.g. "Twickenham Stadium, London" -> "London"). Computed for the whole
# column at once instead of growing the column row by row.
rugbyData1$GameVenue <- vapply(
  strsplit(rugbyData1$Venue, split = " "),
  function(words) tail(words, 1),
  character(1)
)
# So converting all into one Final Data Set
# Attach the host side to every match by looking up GameVenue in the Venue
# table (join() presumably comes from plyr, loaded at the top - confirm).
rugbyDataFinal <- join(rugbyData1, Venue, by = 'GameVenue')
# NOTE(review): both inputs carry a column named "Venue" (the stadium text in
# rugbyData1, the host side in the lookup table). Judging by the head() output
# below, this removal drops the stadium text and leaves the host-side column,
# so the order of these statements matters.
rugbyDataFinal$Venue <- NULL
# Drop the intermediate score columns and the join key
rugbyDataFinal$WinnerScore <- NULL
rugbyDataFinal$LooserScore <- NULL
rugbyDataFinal$DrawScore <- NULL
rugbyDataFinal$GameVenue <- NULL
# Remaining columns: Date, Winner, EnglandScore, WalesScore, Venue
head(rugbyDataFinal)
## Date Winner EnglandScore WalesScore Venue
## 1 2014-03-09 England 29 18 England
## 2 2013-03-16 Wales 3 30 Wales
## 3 2012-02-25 Wales 12 19 England
## 4 2011-08-13 Wales 9 19 Wales
## 5 2011-08-06 England 23 19 England
## 6 2011-02-04 England 26 19 Wales
# Wow... this dataset can be used for many statistical purposes
# We do like in one of the tutorials on internet
# Last Part of Data Cleaning and Converting into Workable
# We need whether winner won in "Home" venue or "Away"
# "Home" when the winning side is also the host side, "Away" otherwise (this
# includes draws and matches at a venue hosted by neither side).
# Vectorised; as.character() keeps the comparison valid whether the columns
# are character or factor. A venue with no match in the lookup table gives NA
# here instead of aborting the script.
rugbyDataFinal$WinnerVenue <- ifelse(
  as.character(rugbyDataFinal$Winner) == as.character(rugbyDataFinal$Venue),
  "Home",
  "Away"
)
# Copy both venue columns back onto rugbyData1; this assumes the join kept
# the rows in the same order as rugbyData1
rugbyData1$GamesVenue <- rugbyDataFinal$Venue
rugbyData1$WinnersVenue <- rugbyDataFinal$WinnerVenue
# Saving into rds
saveRDS(rugbyData1, file = "rugbyData.rds")
saveRDS(rugbyDataFinal, file = "rugbyDataFinal.rds")
# Saving into RData
save(rugbyData1, file = "rugbyData.RData")
save(rugbyDataFinal, file = "rugbyDataFinal.RData")
# Saving into csv
write.csv(rugbyData1, file = "rugbyData.csv")
write.csv(rugbyDataFinal, file = "rugbyDataFinal.csv")
# All files above are written to the current working directory.
# Clear the workspace, then reload from the saved files
rm(list = ls())
# Reload from the working directory; "rds" is the preferred format here
RugbyDataFinal <- readRDS("rugbyDataFinal.rds")
RugbyData <- readRDS("rugbyData.rds")
# Let us plot the data first
# In the following plot the blue dots are the winner's score of each match
# and the red dots are the loser's score.
# Furthermore, the blue line is a smoothing line through the winners' scores.
# Winner's score (blue) and loser's score (red) against match date, with a
# loess smoother through the winners' scores.
p <- ggplot(RugbyData, aes(x = Date, y = WinnerScore))
p + geom_point(colour = "blue", size = 3, shape = 20) +
  geom_point(data = RugbyData, aes(x = Date, y = LooserScore),
             colour = "red", size = 3, shape = 20) +
  theme(
    axis.text.x = element_text(angle = 90, size = 11, vjust = 0.5,
                               face = "bold", color = "black"),
    axis.text.y = element_text(size = 11, vjust = 0.5,
                               face = "bold", color = "black"),
    axis.title.x = element_text(size = 15, color = "forestgreen",
                                vjust = 0.35, face = "bold"),
    axis.title.y = element_text(size = 13, color = "blue",
                                vjust = 0.35, face = "bold")
  ) +
  stat_smooth(method = "loess", se = FALSE, fill = "blue", colour = "blue", size = 1) +
  # labs() takes the labels as named arguments; wrapping them in list() is
  # not supported by current ggplot2 releases
  labs(
    x = "Year",
    y = "Winner's Score",
    title = "Winner's(blue, with smoothing line) \nAnd Looser's (red) Scores - Yearly"
  )
# But.... YYYuuuukkkkk!!!
# This plot looks like missing many things and requires some make-up
# Guys, deliberately I am not showing my R-code over here
# You have to "LIKE" my post, "Facebook page" and "Reply to the post"...
# Then I will email you the code for that graph
## ggplot - plotting on dual axis (both the Y axis)
# So in the above plot we cannot see the significant differences between two teams.
# If we look at historical data about who has won on the previous encounters,
# we see that Wales have a slight edge but nothing statistically significant.
# Here is the result
# If we look at historical data about who has won on the previous encounters,
# we see that Wales have a slight edge but nothing statistically significant.
# Here is the result
# Count matches per recorded outcome
Wales_Win <- sum(RugbyData$Winner == "Wales")
England_Win <- sum(RugbyData$Winner == "England")
Draw <- sum(RugbyData$Winner == "draw")
# One-row summary table; the display names contain spaces, so they are set
# after the data frame has been built
cnames <- c("Wales Wins", "England Wins", "Draw")
test <- setNames(data.frame(Wales_Win, England_Win, Draw), cnames)
pander(test)
## | Wales Wins | England Wins | Draw |
## |---|---|---|
## | 56 | 57 | 12 |
# The win counts give no reason to say that either England or Wales
# has won more games; over the years they are almost level.
# One might say that England has won 1 game more than Wales,
# but 1 in 113 (excluding the 12 draws) doesn't make much difference.
# Let us perform a T-test to have a basic statistical idea
# Two-sample t-test comparing winners' scores with losers' scores across all
# matches. With default arguments this is the unpaired Welch test shown in the
# output below.
# NOTE(review): both columns come from the same matches, so the samples are
# not independent; a paired test may be more appropriate - confirm.
ttest <- t.test(RugbyData$WinnerScore, RugbyData$LooserScore)
# Results of T-Test
ttest
##
## Welch Two Sample t-test
##
## data: RugbyData$WinnerScore and RugbyData$LooserScore
## t = 8.3961, df = 188.195, p-value = 1.094e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.687248 12.408752
## sample estimates:
## mean of x mean of y
## 17.352 7.304
# Clearly two means are significantly different from each other and from zero.
# Density plot of p-values in T-test
# NOTE(review): replicate() re-evaluates ttest$p.value, a stored constant, so
# all 100 values are identical. No resampling happens here and the density is
# that of a single repeated number.
plot(density(replicate(100, ttest$p.value)),
main = "Plot of p-values", col="red", lwd=2)
# Preparing a small dataset for logistic regression
# Select the modelling columns by name rather than by position, so the subset
# stays correct if the column order of RugbyData changes
data1 <- RugbyData[, c("Date", "Winner", "WinnerScore", "LooserScore",
                       "WinnersVenue")]
summary(data1)
## Date Winner WinnerScore LooserScore
## Min. :1881-02-19 Length:125 Min. : 0.00 Min. : 0.000
## 1st Qu.:1920-01-17 Class :character 1st Qu.: 9.00 1st Qu.: 3.000
## Median :1958-01-18 Mode :character Median :14.00 Median : 6.000
## Mean :1953-03-31 Mean :17.35 Mean : 7.304
## 3rd Qu.:1988-02-06 3rd Qu.:24.00 3rd Qu.:11.000
## Max. :2014-03-09 Max. :62.00 Max. :31.000
## WinnersVenue
## Length:125
## Class :character
## Mode :character
##
##
##
# Structure before conversion: Winner and WinnersVenue are character columns
str(data1)
## 'data.frame': 125 obs. of 5 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" ...
## $ Winner : chr "England" "Wales" "Wales" "Wales" ...
## $ WinnerScore : num 29 30 19 19 23 26 30 23 26 62 ...
## $ LooserScore : num 18 3 12 9 19 19 17 15 19 5 ...
## $ WinnersVenue: chr "Home" "Home" "Away" "Home" ...
# Both columns are categorical, so store them as factors
data1[c("Winner", "WinnersVenue")] <- lapply(
  data1[c("Winner", "WinnersVenue")],
  as.factor
)
str(data1)
## 'data.frame': 125 obs. of 5 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" ...
## $ Winner : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 ...
## $ WinnerScore : num 29 30 19 19 23 26 30 23 26 62 ...
## $ LooserScore : num 18 3 12 9 19 19 17 15 19 5 ...
## $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 ...
# Binary response: 1 when England won the match, 0 otherwise (Wales wins and
# draws). Computed for the whole column at once instead of row by row.
data1$EnglandWins <- as.numeric(data1$Winner == "England")
# Done....
saveRDS(data1, file = "rugbyData1.rds", refhook = NULL)
# View Dataset
summary(data1)
## Date Winner WinnerScore LooserScore
## Min. :1881-02-19 draw :12 Min. : 0.00 Min. : 0.000
## 1st Qu.:1920-01-17 England:57 1st Qu.: 9.00 1st Qu.: 3.000
## Median :1958-01-18 Wales :56 Median :14.00 Median : 6.000
## Mean :1953-03-31 Mean :17.35 Mean : 7.304
## 3rd Qu.:1988-02-06 3rd Qu.:24.00 3rd Qu.:11.000
## Max. :2014-03-09 Max. :62.00 Max. :31.000
## WinnersVenue EnglandWins
## Away:53 Min. :0.000
## Home:72 1st Qu.:0.000
## Median :0.000
## Mean :0.456
## 3rd Qu.:1.000
## Max. :1.000
# Storing "Date", "Winner", "EnglandScore", "WalesScore", "GamesVenue", "WinnersVenue"
# in a separate data frame
# Select the columns by name rather than by position, so the subset stays
# correct if the column order of RugbyData changes
data2 <- RugbyData[, c("Date", "Winner", "EnglandScore", "WalesScore",
                       "GamesVenue", "WinnersVenue")]
summary(data2)
## Date Winner EnglandScore WalesScore
## Min. :1881-02-19 Length:125 Min. : 0.00 Min. : 0.0
## 1st Qu.:1920-01-17 Class :character 1st Qu.: 4.00 1st Qu.: 5.0
## Median :1958-01-18 Mode :character Median : 9.00 Median :10.0
## Mean :1953-03-31 Mean :13.06 Mean :11.6
## 3rd Qu.:1988-02-06 3rd Qu.:17.00 3rd Qu.:18.0
## Max. :2014-03-09 Max. :62.00 Max. :34.0
## GamesVenue WinnersVenue
## England:63 Length:125
## Wales :60 Class :character
## Other : 2 Mode :character
##
##
##
# Structure before conversion: Winner and WinnersVenue are character columns
str(data2)
## 'data.frame': 125 obs. of 6 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" ...
## $ Winner : chr "England" "Wales" "Wales" "Wales" ...
## $ EnglandScore: num 29 3 12 9 23 26 30 15 19 62 ...
## $ WalesScore : num 18 30 19 19 19 19 17 23 26 5 ...
## $ GamesVenue : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 ...
## $ WinnersVenue: chr "Home" "Home" "Away" "Home" ...
# Both columns are categorical, so store them as factors
data2[c("Winner", "WinnersVenue")] <- lapply(
  data2[c("Winner", "WinnersVenue")],
  as.factor
)
str(data2)
## 'data.frame': 125 obs. of 6 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" ...
## $ Winner : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 ...
## $ EnglandScore: num 29 3 12 9 23 26 30 15 19 62 ...
## $ WalesScore : num 18 30 19 19 19 19 17 23 26 5 ...
## $ GamesVenue : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 ...
## $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 ...
# Binary indicator: 1 when the winner was playing at home, 0 otherwise.
# Computed once for the whole column; the row loop that built it used to be
# run twice, before and after the plot, with the same result.
data2$HomeVenue <- as.numeric(data2$WinnersVenue == "Home")
# Winner (x) against home/away outcome (y); both are factors at this point
plot(data2$WinnersVenue~data2$Winner, col=c("red", "green"),
     xlab="Winner Team", ylab="Venue of Game", main="Winning w.r.t. Home or Away Venue")
# Three-level factor: "HomeEngland" / "HomeWales" when that side won at home,
# "HomeOther" for everything else (away wins and draws).
# The column is built in one step and assigned with [[ ]], which matches the
# name exactly. Reading data2$Home before the column exists partial-matches
# the existing HomeVenue column, which seeded the old row loop with numeric
# text and raised a partial-match warning.
data2[["Home"]] <- as.factor(ifelse(
  data2$WinnersVenue == "Home" & data2$Winner == "England",
  "HomeEngland",
  ifelse(
    data2$WinnersVenue == "Home" & data2$Winner == "Wales",
    "HomeWales",
    "HomeOther"
  )
))
# Saving into rds
saveRDS(data2, file = "rugbyData2.rds", refhook = NULL)
# Clear the workspace and reload the two modelling datasets from disk
rm(list = ls())
Data1 <- readRDS("rugbyData1.rds")
head(Data1)
## Date Winner WinnerScore LooserScore WinnersVenue EnglandWins
## 1 2014-03-09 England 29 18 Home 1
## 2 2013-03-16 Wales 30 3 Home 0
## 3 2012-02-25 Wales 19 12 Away 0
## 4 2011-08-13 Wales 19 9 Home 0
## 5 2011-08-06 England 23 19 Home 1
## 6 2011-02-04 England 26 19 Away 1
Data2 <- readRDS("rugbyData2.rds")
head(Data2)
## Date Winner EnglandScore WalesScore GamesVenue WinnersVenue
## 1 2014-03-09 England 29 18 England Home
## 2 2013-03-16 Wales 3 30 Wales Home
## 3 2012-02-25 Wales 12 19 England Away
## 4 2011-08-13 Wales 9 19 Wales Home
## 5 2011-08-06 England 23 19 England Home
## 6 2011-02-04 England 26 19 Wales Away
## HomeVenue Home
## 1 1 HomeEngland
## 2 1 HomeWales
## 3 0 HomeOther
## 4 1 HomeWales
## 5 1 HomeEngland
## 6 0 HomeOther
# Let us work on prediction now