# Coding Presentation (Stata v R v Python)
# R Syntax
# Fors Marsh Group
# July 8, 2018
# Author: Michael Siebel
#######################
##Load data set
#set working directory
setwd('C:\\Users\\Siebelm\\Documents\\4 FMG\\Coding Presentation')
##Load data set
#Import CSV & drop variables
df = read.csv('results.csv')
df$city = NULL
df$country = NULL
#Check first few rows
head(df)
## date home_team away_team home_score away_score tournament neutral
## 1 1872-11-30 Scotland England 0 0 Friendly FALSE
## 2 1873-03-08 England Scotland 4 2 Friendly FALSE
## 3 1874-03-07 Scotland England 2 1 Friendly FALSE
## 4 1875-03-06 England Scotland 2 2 Friendly FALSE
## 5 1876-03-04 Scotland England 3 0 Friendly FALSE
## 6 1876-03-25 Scotland Wales 4 0 Friendly FALSE
#Check variables
sapply(df, class)
## date home_team away_team home_score away_score tournament
## "factor" "factor" "factor" "integer" "integer" "factor"
## neutral
## "logical"
#Check missings (NA's)
sapply(df, function(x)sum(is.na(x)))
## date home_team away_team home_score away_score tournament
## 0 0 0 0 0 0
## neutral
## 0
#Subset dataset to only include Germany matches
df_ger = df[df$home_team=='Germany' | df$away_team=='Germany',]
head(df_ger)
## date home_team away_team home_score away_score tournament
## 210 1908-04-05 Switzerland Germany 5 3 Friendly
## 217 1908-06-07 Austria Germany 3 2 Friendly
## 235 1909-04-04 Germany Switzerland 1 0 Friendly
## 236 1909-04-04 Hungary Germany 3 3 Friendly
## 254 1910-04-03 Switzerland Germany 2 3 Friendly
## 257 1910-04-24 Netherlands Germany 4 2 Friendly
## neutral
## 210 FALSE
## 217 FALSE
## 235 FALSE
## 236 FALSE
## 254 FALSE
## 257 FALSE
##############################
##Recodes and Data Manpulation
#
df_ger$match = 1
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match
## 38677 Friendly FALSE 1
## 38818 Friendly FALSE 1
## 38884 Friendly FALSE 1
## 38988 Friendly FALSE 1
## 39031 Friendly FALSE 1
## 39060 FIFA World Cup TRUE 1
#Conditional recode (Define Goals Scored)
df_ger$goals = ifelse(df_ger$home_team=='Germany', df_ger$home_score, ifelse(df_ger$away_team=='Germany', df_ger$away_score, NA))
#Conditional recode (Define Goals Conceded)
df_ger$conceded = ifelse(df_ger$home_team=='Germany', df_ger$away_score, ifelse(df_ger$away_team=='Germany', df_ger$home_score, NA))
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match goals conceded
## 38677 Friendly FALSE 1 2 2
## 38818 Friendly FALSE 1 1 1
## 38884 Friendly FALSE 1 0 1
## 38988 Friendly FALSE 1 1 2
## 39031 Friendly FALSE 1 2 1
## 39060 FIFA World Cup TRUE 1 0 1
#Simple recode (Define Goal Differential)
df_ger$goaldiff = df_ger$goals-df_ger$conceded
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match goals conceded goaldiff
## 38677 Friendly FALSE 1 2 2 0
## 38818 Friendly FALSE 1 1 1 0
## 38884 Friendly FALSE 1 0 1 -1
## 38988 Friendly FALSE 1 1 2 -1
## 39031 Friendly FALSE 1 2 1 1
## 39060 FIFA World Cup TRUE 1 0 1 -1
#Dummy recode (Define Friendly v Competitive match)
df_ger$friendly = as.logical(df_ger$tournament=='Friendly')
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match goals conceded goaldiff friendly
## 38677 Friendly FALSE 1 2 2 0 TRUE
## 38818 Friendly FALSE 1 1 1 0 TRUE
## 38884 Friendly FALSE 1 0 1 -1 TRUE
## 38988 Friendly FALSE 1 1 2 -1 TRUE
## 39031 Friendly FALSE 1 2 1 1 TRUE
## 39060 FIFA World Cup TRUE 1 0 1 -1 FALSE
#Categorical recode (home vs away vs neutral matches)
df_ger$home[df_ger$home_team=='Germany'] = 'home'
df_ger$home[df_ger$away_team=='Germany'] = 'away'
df_ger$home[df_ger$neutral=='TRUE'] = 'neutral'
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match goals conceded goaldiff friendly
## 38677 Friendly FALSE 1 2 2 0 TRUE
## 38818 Friendly FALSE 1 1 1 0 TRUE
## 38884 Friendly FALSE 1 0 1 -1 TRUE
## 38988 Friendly FALSE 1 1 2 -1 TRUE
## 39031 Friendly FALSE 1 2 1 1 TRUE
## 39060 FIFA World Cup TRUE 1 0 1 -1 FALSE
## home
## 38677 home
## 38818 home
## 38884 home
## 38988 away
## 39031 home
## 39060 neutral
#Conditional recode (opponent)
df_ger$opponent = ifelse(df_ger$home_team=='Germany', as.character(df_ger$away_team),
ifelse(df_ger$away_team=='Germany', as.character(df_ger$home_team), NA))
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match goals conceded goaldiff friendly
## 38677 Friendly FALSE 1 2 2 0 TRUE
## 38818 Friendly FALSE 1 1 1 0 TRUE
## 38884 Friendly FALSE 1 0 1 -1 TRUE
## 38988 Friendly FALSE 1 1 2 -1 TRUE
## 39031 Friendly FALSE 1 2 1 1 TRUE
## 39060 FIFA World Cup TRUE 1 0 1 -1 FALSE
## home opponent
## 38677 home France
## 38818 home Spain
## 38884 home Brazil
## 38988 away Austria
## 39031 home Saudi Arabia
## 39060 neutral Mexico
#String variables (Define year)
df_ger$year = as.numeric(substr(df_ger$date,1,4))
tail(df_ger)
## date home_team away_team home_score away_score
## 38677 2017-11-14 Germany France 2 2
## 38818 2018-03-23 Germany Spain 1 1
## 38884 2018-03-27 Germany Brazil 0 1
## 38988 2018-06-02 Austria Germany 2 1
## 39031 2018-06-08 Germany Saudi Arabia 2 1
## 39060 2018-06-17 Germany Mexico 0 1
## tournament neutral match goals conceded goaldiff friendly
## 38677 Friendly FALSE 1 2 2 0 TRUE
## 38818 Friendly FALSE 1 1 1 0 TRUE
## 38884 Friendly FALSE 1 0 1 -1 TRUE
## 38988 Friendly FALSE 1 1 2 -1 TRUE
## 39031 Friendly FALSE 1 2 1 1 TRUE
## 39060 FIFA World Cup TRUE 1 0 1 -1 FALSE
## home opponent year
## 38677 home France 2017
## 38818 home Spain 2018
## 38884 home Brazil 2018
## 38988 away Austria 2018
## 39031 home Saudi Arabia 2018
## 39060 neutral Mexico 2018
#Drop redundant variables
df_ger = subset(df_ger, select = -c(home_score, away_score, tournament,
neutral, home_team, away_team))
tail(df_ger)
## date match goals conceded goaldiff friendly home
## 38677 2017-11-14 1 2 2 0 TRUE home
## 38818 2018-03-23 1 1 1 0 TRUE home
## 38884 2018-03-27 1 0 1 -1 TRUE home
## 38988 2018-06-02 1 1 2 -1 TRUE away
## 39031 2018-06-08 1 2 1 1 TRUE home
## 39060 2018-06-17 1 0 1 -1 FALSE neutral
## opponent year
## 38677 France 2017
## 38818 Spain 2018
## 38884 Brazil 2018
## 38988 Austria 2018
## 39031 Saudi Arabia 2018
## 39060 Mexico 2018
########################
##Descriptive Statistics
#Tabs
tabHome = table(df_ger$home, df_ger$friendly)
tabHome
##
## FALSE TRUE
## away 108 271
## home 119 285
## neutral 139 9
#Unweighted proportions
propHome = prop.table(tabHome, 2)*100
round(propHome, 2)
##
## FALSE TRUE
## away 29.51 47.96
## home 32.51 50.44
## neutral 37.98 1.59
#Goal difference table
library('plyr')
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
goaldiffTable = group_by(df_ger, home, friendly)
goaldiffTable = summarize(goaldiffTable,
conceded = mean(conceded),
goaldiff = mean(goaldiff),
goals = mean(goals))
goaldiffTable
## # A tibble: 6 x 5
## # Groups: home [?]
## home friendly conceded goaldiff goals
## <chr> <lgl> <dbl> <dbl> <dbl>
## 1 away FALSE 0.806 1.54 2.34
## 2 away TRUE 1.45 0.446 1.89
## 3 home FALSE 0.723 2.17 2.89
## 4 home TRUE 1.16 1.26 2.42
## 5 neutral FALSE 1.17 0.755 1.93
## 6 neutral TRUE 1.67 0.222 1.89
#Opponent table
threeOpponents = df_ger[df_ger$opponent=='Mexico' | df_ger$opponent=='Sweden' | df_ger$opponent=='Korea Republic',]
opponentTable = group_by(threeOpponents, opponent)
opponentTable = summarize(opponentTable,
goaldiff = mean(goaldiff),
match = sum(match))
opponentTable
## # A tibble: 3 x 3
## opponent goaldiff match
## <chr> <dbl> <dbl>
## 1 Korea Republic 0 3
## 2 Mexico 1.08 12
## 3 Sweden 0.278 36
#######################
##Graphs
#Histogram
library('ggplot2')
ggplot(df_ger, aes(goaldiff)) +
geom_histogram(fill = "blue", binwidth=.75, position="dodge") +
ylab("Count") + xlab("Goal Differential") + ggtitle('Histogram of Goal Differences')

ggplot(df_ger, aes(goaldiff, fill = friendly)) +
geom_histogram(binwidth=.75, position="dodge") +
ylab("Count") + xlab("Goal Differential") + ggtitle('Histogram of Goal Differences') +
theme(legend.position="bottom")

#Line Graphs
df_ger_byyear = ddply(df_ger, .(year), summarize, goals=mean(goals),
conceded=mean(conceded), goaldiff=mean(goaldiff))
df_ger_byyear$conceded = df_ger_byyear$conceded*-1
tail(df_ger_byyear)
## year goals conceded goaldiff
## 94 2013 2.916667 -1.2500000 1.666667
## 95 2014 2.176471 -0.8823529 1.294118
## 96 2015 2.222222 -1.2222222 1.000000
## 97 2016 2.125000 -0.6250000 1.500000
## 98 2017 2.866667 -0.8000000 2.066667
## 99 2018 0.800000 -1.2000000 -0.400000
df_goals_byyear = ggplot(df_ger_byyear) + geom_line(aes(x=year, y=goals), color='dark green') +
ylab("Goals") + xlab("Year") + ggtitle('Goals Scored by Year')
df_goals_byyear

df_conceded_byyear = ggplot(df_ger_byyear) + geom_line(aes(x=year, y=conceded), color='maroon') +
ylab("Conceded") + xlab("Year") + ggtitle('Goals Conceded by Year')
df_conceded_byyear

df_goaldiff_overlay_byyear = ggplot(df_ger_byyear) + geom_line(aes(x=year, y=goaldiff), color='blue') +
ylab("Goal Differential") + xlab("Year") + ggtitle('Goal Differential by Year') + labs(subtitle='Overlay Chart') +
geom_line(aes(x=year, y=goals), color='dark green') + geom_line(aes(x=year, y=conceded), color='maroon') +
geom_hline(yintercept = 0, color='dark grey')
df_goaldiff_overlay_byyear
