# Coding Presentation (Stata v R v Python)
# R Syntax
# Fors Marsh Group
# July 8, 2018
# Author: Michael Siebel

#######################
##Load data set
#set working directory
setwd('C:\\Users\\Siebelm\\Documents\\4 FMG\\Coding Presentation')

##Load data set
#Import CSV & drop variables
df = read.csv('results.csv') 
df$city = NULL
df$country = NULL

#Check first few rows
head(df)
##         date home_team away_team home_score away_score tournament neutral
## 1 1872-11-30  Scotland   England          0          0   Friendly   FALSE
## 2 1873-03-08   England  Scotland          4          2   Friendly   FALSE
## 3 1874-03-07  Scotland   England          2          1   Friendly   FALSE
## 4 1875-03-06   England  Scotland          2          2   Friendly   FALSE
## 5 1876-03-04  Scotland   England          3          0   Friendly   FALSE
## 6 1876-03-25  Scotland     Wales          4          0   Friendly   FALSE
#Check variables
sapply(df, class)
##       date  home_team  away_team home_score away_score tournament 
##   "factor"   "factor"   "factor"  "integer"  "integer"   "factor" 
##    neutral 
##  "logical"
#Check missings (NA's)
sapply(df, function(x)sum(is.na(x)))
##       date  home_team  away_team home_score away_score tournament 
##          0          0          0          0          0          0 
##    neutral 
##          0
#Subset dataset to only include Germany matches 
df_ger = df[df$home_team=='Germany' | df$away_team=='Germany',]

head(df_ger)
##           date   home_team   away_team home_score away_score tournament
## 210 1908-04-05 Switzerland     Germany          5          3   Friendly
## 217 1908-06-07     Austria     Germany          3          2   Friendly
## 235 1909-04-04     Germany Switzerland          1          0   Friendly
## 236 1909-04-04     Hungary     Germany          3          3   Friendly
## 254 1910-04-03 Switzerland     Germany          2          3   Friendly
## 257 1910-04-24 Netherlands     Germany          4          2   Friendly
##     neutral
## 210   FALSE
## 217   FALSE
## 235   FALSE
## 236   FALSE
## 254   FALSE
## 257   FALSE
##############################
##Recodes and Data Manpulation
#
df_ger$match = 1

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match
## 38677       Friendly   FALSE     1
## 38818       Friendly   FALSE     1
## 38884       Friendly   FALSE     1
## 38988       Friendly   FALSE     1
## 39031       Friendly   FALSE     1
## 39060 FIFA World Cup    TRUE     1
#Conditional recode (Define Goals Scored)
df_ger$goals    = ifelse(df_ger$home_team=='Germany', df_ger$home_score, ifelse(df_ger$away_team=='Germany', df_ger$away_score, NA))

#Conditional recode (Define Goals Conceded)
df_ger$conceded = ifelse(df_ger$home_team=='Germany', df_ger$away_score, ifelse(df_ger$away_team=='Germany', df_ger$home_score, NA))

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match goals conceded
## 38677       Friendly   FALSE     1     2        2
## 38818       Friendly   FALSE     1     1        1
## 38884       Friendly   FALSE     1     0        1
## 38988       Friendly   FALSE     1     1        2
## 39031       Friendly   FALSE     1     2        1
## 39060 FIFA World Cup    TRUE     1     0        1
#Simple recode (Define Goal Differential)
df_ger$goaldiff = df_ger$goals-df_ger$conceded

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match goals conceded goaldiff
## 38677       Friendly   FALSE     1     2        2        0
## 38818       Friendly   FALSE     1     1        1        0
## 38884       Friendly   FALSE     1     0        1       -1
## 38988       Friendly   FALSE     1     1        2       -1
## 39031       Friendly   FALSE     1     2        1        1
## 39060 FIFA World Cup    TRUE     1     0        1       -1
#Dummy recode (Define Friendly v Competitive match)
df_ger$friendly = as.logical(df_ger$tournament=='Friendly')

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match goals conceded goaldiff friendly
## 38677       Friendly   FALSE     1     2        2        0     TRUE
## 38818       Friendly   FALSE     1     1        1        0     TRUE
## 38884       Friendly   FALSE     1     0        1       -1     TRUE
## 38988       Friendly   FALSE     1     1        2       -1     TRUE
## 39031       Friendly   FALSE     1     2        1        1     TRUE
## 39060 FIFA World Cup    TRUE     1     0        1       -1    FALSE
#Categorical recode (home vs away vs neutral matches)
df_ger$home[df_ger$home_team=='Germany'] = 'home'
df_ger$home[df_ger$away_team=='Germany'] = 'away'
df_ger$home[df_ger$neutral=='TRUE']      = 'neutral'

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match goals conceded goaldiff friendly
## 38677       Friendly   FALSE     1     2        2        0     TRUE
## 38818       Friendly   FALSE     1     1        1        0     TRUE
## 38884       Friendly   FALSE     1     0        1       -1     TRUE
## 38988       Friendly   FALSE     1     1        2       -1     TRUE
## 39031       Friendly   FALSE     1     2        1        1     TRUE
## 39060 FIFA World Cup    TRUE     1     0        1       -1    FALSE
##          home
## 38677    home
## 38818    home
## 38884    home
## 38988    away
## 39031    home
## 39060 neutral
#Conditional recode (opponent)
df_ger$opponent = ifelse(df_ger$home_team=='Germany', as.character(df_ger$away_team), 
                         ifelse(df_ger$away_team=='Germany', as.character(df_ger$home_team), NA))

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match goals conceded goaldiff friendly
## 38677       Friendly   FALSE     1     2        2        0     TRUE
## 38818       Friendly   FALSE     1     1        1        0     TRUE
## 38884       Friendly   FALSE     1     0        1       -1     TRUE
## 38988       Friendly   FALSE     1     1        2       -1     TRUE
## 39031       Friendly   FALSE     1     2        1        1     TRUE
## 39060 FIFA World Cup    TRUE     1     0        1       -1    FALSE
##          home     opponent
## 38677    home       France
## 38818    home        Spain
## 38884    home       Brazil
## 38988    away      Austria
## 39031    home Saudi Arabia
## 39060 neutral       Mexico
#String variables (Define year)
df_ger$year = as.numeric(substr(df_ger$date,1,4))

tail(df_ger)
##             date home_team    away_team home_score away_score
## 38677 2017-11-14   Germany       France          2          2
## 38818 2018-03-23   Germany        Spain          1          1
## 38884 2018-03-27   Germany       Brazil          0          1
## 38988 2018-06-02   Austria      Germany          2          1
## 39031 2018-06-08   Germany Saudi Arabia          2          1
## 39060 2018-06-17   Germany       Mexico          0          1
##           tournament neutral match goals conceded goaldiff friendly
## 38677       Friendly   FALSE     1     2        2        0     TRUE
## 38818       Friendly   FALSE     1     1        1        0     TRUE
## 38884       Friendly   FALSE     1     0        1       -1     TRUE
## 38988       Friendly   FALSE     1     1        2       -1     TRUE
## 39031       Friendly   FALSE     1     2        1        1     TRUE
## 39060 FIFA World Cup    TRUE     1     0        1       -1    FALSE
##          home     opponent year
## 38677    home       France 2017
## 38818    home        Spain 2018
## 38884    home       Brazil 2018
## 38988    away      Austria 2018
## 39031    home Saudi Arabia 2018
## 39060 neutral       Mexico 2018
#Drop redundant variables
df_ger = subset(df_ger, select = -c(home_score, away_score, tournament,
               neutral, home_team, away_team))

tail(df_ger)
##             date match goals conceded goaldiff friendly    home
## 38677 2017-11-14     1     2        2        0     TRUE    home
## 38818 2018-03-23     1     1        1        0     TRUE    home
## 38884 2018-03-27     1     0        1       -1     TRUE    home
## 38988 2018-06-02     1     1        2       -1     TRUE    away
## 39031 2018-06-08     1     2        1        1     TRUE    home
## 39060 2018-06-17     1     0        1       -1    FALSE neutral
##           opponent year
## 38677       France 2017
## 38818        Spain 2018
## 38884       Brazil 2018
## 38988      Austria 2018
## 39031 Saudi Arabia 2018
## 39060       Mexico 2018
########################
##Descriptive Statistics
#Tabs
tabHome = table(df_ger$home, df_ger$friendly)
tabHome
##          
##           FALSE TRUE
##   away      108  271
##   home      119  285
##   neutral   139    9
#Unweighted proportions
propHome = prop.table(tabHome, 2)*100
round(propHome, 2)
##          
##           FALSE  TRUE
##   away    29.51 47.96
##   home    32.51 50.44
##   neutral 37.98  1.59
#Goal difference table
library('plyr')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
goaldiffTable = group_by(df_ger, home, friendly)
goaldiffTable = summarize(goaldiffTable,
                       conceded = mean(conceded),
                       goaldiff = mean(goaldiff),
                       goals = mean(goals))
goaldiffTable
## # A tibble: 6 x 5
## # Groups:   home [?]
##   home    friendly conceded goaldiff goals
##   <chr>   <lgl>       <dbl>    <dbl> <dbl>
## 1 away    FALSE       0.806    1.54   2.34
## 2 away    TRUE        1.45     0.446  1.89
## 3 home    FALSE       0.723    2.17   2.89
## 4 home    TRUE        1.16     1.26   2.42
## 5 neutral FALSE       1.17     0.755  1.93
## 6 neutral TRUE        1.67     0.222  1.89
#Opponent table
threeOpponents = df_ger[df_ger$opponent=='Mexico' | df_ger$opponent=='Sweden' | df_ger$opponent=='Korea Republic',]
opponentTable = group_by(threeOpponents, opponent)
opponentTable = summarize(opponentTable, 
                          goaldiff = mean(goaldiff),
                          match = sum(match))
opponentTable
## # A tibble: 3 x 3
##   opponent       goaldiff match
##   <chr>             <dbl> <dbl>
## 1 Korea Republic    0         3
## 2 Mexico            1.08     12
## 3 Sweden            0.278    36
#######################
##Graphs
#Histogram
library('ggplot2')
ggplot(df_ger, aes(goaldiff)) +
  geom_histogram(fill = "blue", binwidth=.75, position="dodge") + 
  ylab("Count") + xlab("Goal Differential") + ggtitle('Histogram of Goal Differences')

ggplot(df_ger, aes(goaldiff, fill = friendly)) +
  geom_histogram(binwidth=.75, position="dodge") + 
  ylab("Count") + xlab("Goal Differential") + ggtitle('Histogram of Goal Differences') + 
  theme(legend.position="bottom")

#Line Graphs
df_ger_byyear = ddply(df_ger, .(year), summarize,  goals=mean(goals), 
                      conceded=mean(conceded), goaldiff=mean(goaldiff))
df_ger_byyear$conceded = df_ger_byyear$conceded*-1
tail(df_ger_byyear)
##    year    goals   conceded  goaldiff
## 94 2013 2.916667 -1.2500000  1.666667
## 95 2014 2.176471 -0.8823529  1.294118
## 96 2015 2.222222 -1.2222222  1.000000
## 97 2016 2.125000 -0.6250000  1.500000
## 98 2017 2.866667 -0.8000000  2.066667
## 99 2018 0.800000 -1.2000000 -0.400000
df_goals_byyear = ggplot(df_ger_byyear) + geom_line(aes(x=year, y=goals), color='dark green') +
  ylab("Goals") + xlab("Year") + ggtitle('Goals Scored by Year')
df_goals_byyear

df_conceded_byyear = ggplot(df_ger_byyear) + geom_line(aes(x=year, y=conceded), color='maroon') +
  ylab("Conceded") + xlab("Year") + ggtitle('Goals Conceded by Year')
df_conceded_byyear

df_goaldiff_overlay_byyear = ggplot(df_ger_byyear) + geom_line(aes(x=year, y=goaldiff), color='blue') +
  ylab("Goal Differential") + xlab("Year") + ggtitle('Goal Differential by Year') + labs(subtitle='Overlay Chart') +
  geom_line(aes(x=year, y=goals), color='dark green') + geom_line(aes(x=year, y=conceded), color='maroon') +
  geom_hline(yintercept = 0, color='dark grey')
df_goaldiff_overlay_byyear