#######################################################################
##########################  TENNIS PROJECT  ###########################
#######################################################################

# Libraries
library(MASS) ; library(ggplot2) ; library(scales) ; library(tidyverse) ; library(corrplot) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::select()     masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
library(corrplot) ; library(car) ; library(caret) ; library(readxl) ; library(e1071) ;
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest); library(dplyr)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
####
#### Reading in data.
####

# Load last 20 years of ATP tour level matches
matches2023 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2023.csv')
matches2022 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2022.csv')
matches2021 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2021.csv')
matches2020 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2020.csv')
matches2019 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2019.csv')
matches2018 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2018.csv')
matches2017 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2017.csv')
matches2016 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2016.csv')
matches2015 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2015.csv')
matches2014 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2014.csv')
matches2013 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2013.csv')
matches2012 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2012.csv')
matches2011 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2011.csv')
matches2010 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2010.csv')
matches2009 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2009.csv')
matches2008 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2008.csv')
matches2007 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2007.csv')
matches2006 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2006.csv')
matches2005 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2005.csv')
matches2004 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2004.csv')
matches2003 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2003.csv')

# Load ATP Player Data -> decided not to use
#atp_players <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_players.csv')

# Load Ranking Data (per decade) -> decided not to use.
#atp_rankings_00s <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_rankings_00s.csv')
#atp_rankings_10s <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_rankings_10s.csv')
#atp_rankings_20s <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_rankings_20s.csv')

# Combine matches into one data frame
atp_matches <- rbind(matches2003, matches2004, matches2005, matches2006, matches2007,
                     matches2008, matches2009, matches2010, matches2011, matches2012,
                     matches2013, matches2014, matches2015, matches2016, matches2017,
                     matches2018, matches2019, matches2020, matches2021, matches2022,
                     matches2023) 
# 61932 observations

# remove after combining to de-clutter
rm(matches2003, matches2004, matches2005, matches2006, matches2007,
   matches2008, matches2009, matches2010, matches2011, matches2012,
   matches2013, matches2014, matches2015, matches2016, matches2017,
   matches2018, matches2019, matches2020, matches2021, matches2022,
   matches2023)

####
#### Data Exploration & Manipulation
####

#instead of listing seed or NA for unseeded, create binary seeded variable
atp_matches$winner_seeded <- ifelse(is.na(atp_matches$winner_seed), "No", "Yes")
atp_matches$loser_seeded <- ifelse(is.na(atp_matches$loser_seed),"No", "Yes")

names(atp_matches)
##  [1] "tourney_id"         "tourney_name"       "surface"           
##  [4] "draw_size"          "tourney_level"      "tourney_date"      
##  [7] "match_num"          "winner_id"          "winner_seed"       
## [10] "winner_entry"       "winner_name"        "winner_hand"       
## [13] "winner_ht"          "winner_ioc"         "winner_age"        
## [16] "loser_id"           "loser_seed"         "loser_entry"       
## [19] "loser_name"         "loser_hand"         "loser_ht"          
## [22] "loser_ioc"          "loser_age"          "score"             
## [25] "best_of"            "round"              "minutes"           
## [28] "w_ace"              "w_df"               "w_svpt"            
## [31] "w_1stIn"            "w_1stWon"           "w_2ndWon"          
## [34] "w_SvGms"            "w_bpSaved"          "w_bpFaced"         
## [37] "l_ace"              "l_df"               "l_svpt"            
## [40] "l_1stIn"            "l_1stWon"           "l_2ndWon"          
## [43] "l_SvGms"            "l_bpSaved"          "l_bpFaced"         
## [46] "winner_rank"        "winner_rank_points" "loser_rank"        
## [49] "loser_rank_points"  "winner_seeded"      "loser_seeded"
str(atp_matches)
## 'data.frame':    61932 obs. of  51 variables:
##  $ tourney_id        : chr  "2003-1536" "2003-1536" "2003-1536" "2003-1536" ...
##  $ tourney_name      : chr  "Madrid Masters" "Madrid Masters" "Madrid Masters" "Madrid Masters" ...
##  $ surface           : chr  "Hard" "Hard" "Hard" "Hard" ...
##  $ draw_size         : int  48 48 48 48 48 48 48 48 48 48 ...
##  $ tourney_level     : chr  "M" "M" "M" "M" ...
##  $ tourney_date      : int  20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 ...
##  $ match_num         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ winner_id         : int  101965 102358 102998 102610 102374 103888 103852 103292 103970 102434 ...
##  $ winner_seed       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ winner_entry      : chr  "" "Q" "Q" "" ...
##  $ winner_name       : chr  "Wayne Ferreira" "Thomas Enqvist" "Jan Michael Gambill" "Albert Costa" ...
##  $ winner_hand       : chr  "R" "R" "R" "R" ...
##  $ winner_ht         : int  185 190 190 180 180 188 188 175 175 183 ...
##  $ winner_ioc        : chr  "RSA" "SWE" "USA" "ESP" ...
##  $ winner_age        : num  32 29.5 26.3 28.3 29.5 21.8 22 24.8 21.5 29.2 ...
##  $ loser_id          : int  103344 102338 103786 103602 104745 102450 103151 103813 104022 103294 ...
##  $ loser_seed        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ loser_entry       : chr  "" "" "" "" ...
##  $ loser_name        : chr  "Ivan Ljubicic" "Yevgeny Kafelnikov" "Nikolay Davydenko" "Fernando Gonzalez" ...
##  $ loser_hand        : chr  "R" "R" "R" "R" ...
##  $ loser_ht          : int  193 190 178 183 185 185 183 185 183 170 ...
##  $ loser_ioc         : chr  "CRO" "RUS" "RUS" "CHI" ...
##  $ loser_age         : num  24.5 29.6 22.3 23.2 17.3 29.1 25.6 22.2 21.3 24.8 ...
##  $ score             : chr  "7-6(7) 7-6(5)" "6-3 RET" "6-3 6-3" "6-3 7-6(3)" ...
##  $ best_of           : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ round             : chr  "R64" "R64" "R64" "R64" ...
##  $ minutes           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_ace             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_df              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_svpt            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_1stIn           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_1stWon          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_2ndWon          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_SvGms           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_bpSaved         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ w_bpFaced         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_ace             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_df              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_svpt            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_1stIn           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_1stWon          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_2ndWon          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_SvGms           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_bpSaved         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ l_bpFaced         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ winner_rank       : int  28 146 57 23 127 25 35 37 72 33 ...
##  $ winner_rank_points: int  1090 258 660 1170 290 1145 1025 1000 480 1040 ...
##  $ loser_rank        : int  42 40 43 22 49 30 27 31 29 87 ...
##  $ loser_rank_points : int  865 950 855 1190 788 1055 1133 1050 1060 421 ...
##  $ winner_seeded     : chr  "No" "No" "No" "No" ...
##  $ loser_seeded      : chr  "No" "No" "No" "No" ...
levels(as.factor(atp_matches$surface))
## [1] ""       "Carpet" "Clay"   "Grass"  "Hard"
levels(as.factor(atp_matches$winner_entry))
## [1] ""    "Alt" "ALT" "LL"  "PR"  "Q"   "SE"  "WC"
#creating outcome variable for winners and losers
match_winners = atp_matches
match_winners$outcome = 1

match_losers = atp_matches
match_losers$outcome = 0

#renaming columns to prep for consolidating winners and losers

match_winners = match_winners %>% dplyr::rename(
  id = winner_id,
  seed = winner_seed,
  entry = winner_entry,
  name = winner_name,
  hand = winner_hand,
  ht = winner_ht,
  ioc = winner_ioc,
  age = winner_age,
  opponent_id = loser_id,
  opponent_seed = loser_seed,
  opponent_entry = loser_entry,
  opponent_name = loser_name,
  opponent_hand = loser_hand,
  opponent_ht = loser_ht,
  opponent_ioc = loser_ioc,
  opponent_age = loser_age,
  ace = w_ace,
  df = w_df,
  svpt = w_svpt,
  firstIn = w_1stIn,
  firstWon = w_1stWon,
  secWon = w_2ndWon,
  SvGms = w_SvGms,
  bpSaved = w_bpSaved,
  bpFaced = w_bpFaced,
  opponent_ace = l_ace,
  opponent_df = l_df,
  opponent_svpt = l_svpt,
  opponent_firstIn = l_1stIn,
  opponent_firstWon = l_1stWon,
  opponent_secWon = l_2ndWon,
  opponent_SvGms = l_SvGms,
  opponent_bpSaved = l_bpSaved,
  opponent_bpFaced = l_bpFaced,
  rank = winner_rank,
  rank_points = winner_rank_points,
  opponent_rank = loser_rank,
  opponent_rank_points = loser_rank_points,
  seeded = winner_seeded,
  opponent_seeded = loser_seeded
)

match_losers = match_losers %>% dplyr::rename(
  id = loser_id,
  seed = loser_seed,
  entry = loser_entry,
  name = loser_name,
  hand = loser_hand,
  ht = loser_ht,
  ioc = loser_ioc,
  age = loser_age,
  opponent_id = winner_id,
  opponent_seed = winner_seed,
  opponent_entry = winner_entry,
  opponent_name = winner_name,
  opponent_hand = winner_hand,
  opponent_ht = winner_ht,
  opponent_ioc = winner_ioc,
  opponent_age = winner_age,
  ace = l_ace,
  df = l_df,
  svpt = l_svpt,
  firstIn = l_1stIn,
  firstWon = l_1stWon,
  secWon = l_2ndWon,
  SvGms = l_SvGms,
  bpSaved = l_bpSaved,
  bpFaced = l_bpFaced,
  opponent_ace = w_ace,
  opponent_df = w_df,
  opponent_svpt = w_svpt,
  opponent_firstIn = w_1stIn,
  opponent_firstWon = w_1stWon,
  opponent_secWon = w_2ndWon,
  opponent_SvGms = w_SvGms,
  opponent_bpSaved = w_bpSaved,
  opponent_bpFaced = w_bpFaced,
  rank = loser_rank,
  rank_points = loser_rank_points,
  opponent_rank = winner_rank,
  opponent_rank_points = winner_rank_points,
  seeded = loser_seeded,
  opponent_seeded = winner_seeded
)

# combine match_winners and losers
match_outcomes <- rbind(match_winners, match_losers)
rm(match_winners,match_losers)

# additional variables
match_outcomes$ht_dif = match_outcomes$ht - match_outcomes$opponent_ht
match_outcomes$age_dif = match_outcomes$age - match_outcomes$opponent_age
match_outcomes$rank_dif = match_outcomes$rank - match_outcomes$opponent_rank
names(match_outcomes)
##  [1] "tourney_id"           "tourney_name"         "surface"             
##  [4] "draw_size"            "tourney_level"        "tourney_date"        
##  [7] "match_num"            "id"                   "seed"                
## [10] "entry"                "name"                 "hand"                
## [13] "ht"                   "ioc"                  "age"                 
## [16] "opponent_id"          "opponent_seed"        "opponent_entry"      
## [19] "opponent_name"        "opponent_hand"        "opponent_ht"         
## [22] "opponent_ioc"         "opponent_age"         "score"               
## [25] "best_of"              "round"                "minutes"             
## [28] "ace"                  "df"                   "svpt"                
## [31] "firstIn"              "firstWon"             "secWon"              
## [34] "SvGms"                "bpSaved"              "bpFaced"             
## [37] "opponent_ace"         "opponent_df"          "opponent_svpt"       
## [40] "opponent_firstIn"     "opponent_firstWon"    "opponent_secWon"     
## [43] "opponent_SvGms"       "opponent_bpSaved"     "opponent_bpFaced"    
## [46] "rank"                 "rank_points"          "opponent_rank"       
## [49] "opponent_rank_points" "seeded"               "opponent_seeded"     
## [52] "outcome"              "ht_dif"               "age_dif"             
## [55] "rank_dif"
# create averages of player match outcomes
match_outcomes <- match_outcomes %>%
  group_by(id) %>%
  mutate(
    avg_ace = mean(ace, na.rm = TRUE),
    avg_df = mean(df, na.rm = TRUE),
    avg_svpt = mean(svpt, na.rm = TRUE),
    avg_firstIn = mean(firstIn, na.rm = TRUE),
    avg_firstWon = mean(firstWon, na.rm = TRUE),
    avg_secWon = mean(secWon, na.rm = TRUE),
    avg_SvGms = mean(SvGms, na.rm = TRUE),
    avg_bpSaved = mean(bpSaved, na.rm = TRUE),
    avg_bpFaced = mean(bpFaced, na.rm = TRUE)
  )

match_outcomes <- match_outcomes %>%
  group_by(opponent_id) %>%
  mutate(    
    avg_opponent_ace = mean(opponent_ace, na.rm = TRUE),
    avg_opponent_df = mean(opponent_df, na.rm = TRUE),
    avg_opponent_svpt = mean(opponent_svpt, na.rm = TRUE),
    avg_opponent_firstIn = mean(opponent_firstIn, na.rm = TRUE),
    avg_opponent_firstWon = mean(opponent_firstWon, na.rm = TRUE),
    avg_opponent_secWon = mean(opponent_secWon, na.rm = TRUE),
    avg_opponent_SvGms = mean(opponent_SvGms, na.rm = TRUE),
    avg_opponent_bpSaved = mean(opponent_bpSaved, na.rm = TRUE),
    avg_opponent_bpFaced = mean(opponent_bpFaced, na.rm = TRUE)
  )


match_outcomes$surface_clay <- ifelse(match_outcomes$surface == "Clay",1,0)
match_outcomes$surface_grass <- ifelse(match_outcomes$surface == "Grass",1,0)
match_outcomes$surface_hard <- ifelse(match_outcomes$surface == "Hard",1,0)
str(match_outcomes)
## gropd_df [123,864 × 76] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ tourney_id           : chr [1:123864] "2003-1536" "2003-1536" "2003-1536" "2003-1536" ...
##  $ tourney_name         : chr [1:123864] "Madrid Masters" "Madrid Masters" "Madrid Masters" "Madrid Masters" ...
##  $ surface              : chr [1:123864] "Hard" "Hard" "Hard" "Hard" ...
##  $ draw_size            : int [1:123864] 48 48 48 48 48 48 48 48 48 48 ...
##  $ tourney_level        : chr [1:123864] "M" "M" "M" "M" ...
##  $ tourney_date         : int [1:123864] 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 ...
##  $ match_num            : int [1:123864] 1 2 3 4 5 6 7 8 9 10 ...
##  $ id                   : int [1:123864] 101965 102358 102998 102610 102374 103888 103852 103292 103970 102434 ...
##  $ seed                 : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ entry                : chr [1:123864] "" "Q" "Q" "" ...
##  $ name                 : chr [1:123864] "Wayne Ferreira" "Thomas Enqvist" "Jan Michael Gambill" "Albert Costa" ...
##  $ hand                 : chr [1:123864] "R" "R" "R" "R" ...
##  $ ht                   : int [1:123864] 185 190 190 180 180 188 188 175 175 183 ...
##  $ ioc                  : chr [1:123864] "RSA" "SWE" "USA" "ESP" ...
##  $ age                  : num [1:123864] 32 29.5 26.3 28.3 29.5 21.8 22 24.8 21.5 29.2 ...
##  $ opponent_id          : int [1:123864] 103344 102338 103786 103602 104745 102450 103151 103813 104022 103294 ...
##  $ opponent_seed        : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_entry       : chr [1:123864] "" "" "" "" ...
##  $ opponent_name        : chr [1:123864] "Ivan Ljubicic" "Yevgeny Kafelnikov" "Nikolay Davydenko" "Fernando Gonzalez" ...
##  $ opponent_hand        : chr [1:123864] "R" "R" "R" "R" ...
##  $ opponent_ht          : int [1:123864] 193 190 178 183 185 185 183 185 183 170 ...
##  $ opponent_ioc         : chr [1:123864] "CRO" "RUS" "RUS" "CHI" ...
##  $ opponent_age         : num [1:123864] 24.5 29.6 22.3 23.2 17.3 29.1 25.6 22.2 21.3 24.8 ...
##  $ score                : chr [1:123864] "7-6(7) 7-6(5)" "6-3 RET" "6-3 6-3" "6-3 7-6(3)" ...
##  $ best_of              : int [1:123864] 3 3 3 3 3 3 3 3 3 3 ...
##  $ round                : chr [1:123864] "R64" "R64" "R64" "R64" ...
##  $ minutes              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ ace                  : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ df                   : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ svpt                 : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ firstIn              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ firstWon             : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ secWon               : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ SvGms                : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ bpSaved              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ bpFaced              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_ace         : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_df          : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_svpt        : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_firstIn     : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_firstWon    : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_secWon      : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_SvGms       : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_bpSaved     : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_bpFaced     : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ rank                 : int [1:123864] 28 146 57 23 127 25 35 37 72 33 ...
##  $ rank_points          : int [1:123864] 1090 258 660 1170 290 1145 1025 1000 480 1040 ...
##  $ opponent_rank        : int [1:123864] 42 40 43 22 49 30 27 31 29 87 ...
##  $ opponent_rank_points : int [1:123864] 865 950 855 1190 788 1055 1133 1050 1060 421 ...
##  $ seeded               : chr [1:123864] "No" "No" "No" "No" ...
##  $ opponent_seeded      : chr [1:123864] "No" "No" "No" "No" ...
##  $ outcome              : num [1:123864] 1 1 1 1 1 1 1 1 1 1 ...
##  $ ht_dif               : int [1:123864] -8 0 12 -3 -5 3 5 -10 -8 13 ...
##  $ age_dif              : num [1:123864] 7.5 -0.1 4 5.1 12.2 ...
##  $ rank_dif             : int [1:123864] -14 106 14 1 78 -5 8 6 43 -54 ...
##  $ avg_ace              : num [1:123864] 7.04 8.12 11.1 4.61 3.62 ...
##  $ avg_df               : num [1:123864] 2.54 5.34 3.36 2.35 3.11 ...
##  $ avg_svpt             : num [1:123864] 80.9 85.7 78 84.2 81.2 ...
##  $ avg_firstIn          : num [1:123864] 46.5 46.5 44.7 52.6 46.5 ...
##  $ avg_firstWon         : num [1:123864] 33.6 34.6 34 35.5 31.5 ...
##  $ avg_secWon           : num [1:123864] 18 18.9 15.6 16 17.4 ...
##  $ avg_SvGms            : num [1:123864] 12.5 13.2 12 12.8 12.5 ...
##  $ avg_bpSaved          : num [1:123864] 3.97 4.4 4.1 4.84 4.71 ...
##  $ avg_bpFaced          : num [1:123864] 6.38 7.43 6.61 8.03 8.19 ...
##  $ avg_opponent_ace     : num [1:123864] 12.28 3.98 3.23 6.89 3.19 ...
##  $ avg_opponent_df      : num [1:123864] 2.28 3.25 2.59 3.55 1.71 ...
##  $ avg_opponent_svpt    : num [1:123864] 80.5 79.1 74 81.5 73.6 ...
##  $ avg_opponent_firstIn : num [1:123864] 47.9 46.6 49.9 50 50 ...
##  $ avg_opponent_firstWon: num [1:123864] 37.2 32.4 34.2 37.2 36.1 ...
##  $ avg_opponent_secWon  : num [1:123864] 17.1 16.1 12.5 16.4 13.5 ...
##  $ avg_opponent_SvGms   : num [1:123864] 12.9 12.5 11.7 13 12.1 ...
##  $ avg_opponent_bpSaved : num [1:123864] 3.35 3.73 3.93 3.77 3.37 ...
##  $ avg_opponent_bpFaced : num [1:123864] 5.12 6.92 6.47 5.92 5.1 ...
##  $ surface_clay         : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
##  $ surface_grass        : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
##  $ surface_hard         : num [1:123864] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "groups")= tibble [2,261 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ opponent_id: int [1:2261] 100644 101316 101404 101532 101662 101723 101736 101746 101750 101774 ...
##   ..$ .rows      : list<int> [1:2261] 
##   .. ..$ : int [1:573] 33442 35678 36325 36416 36473 36511 37213 37718 37802 37918 ...
##   .. ..$ : int 6453
##   .. ..$ : int [1:3] 25319 27592 28329
##   .. ..$ : int [1:6] 325 882 2591 2794 62750 62782
##   .. ..$ : int [1:2] 12952 12954
##   .. ..$ : int [1:3] 1861 2534 63780
##   .. ..$ : int [1:176] 225 1164 1271 1377 1610 2183 2337 2469 2661 2665 ...
##   .. ..$ : int [1:5] 142 403 983 2619 64509
##   .. ..$ : int 648
##   .. ..$ : int [1:54] 204 790 907 931 1240 1438 1495 2133 2320 2459 ...
##   .. ..$ : int 3110
##   .. ..$ : int 19117
##   .. ..$ : int [1:2] 2951 3118
##   .. ..$ : int [1:2] 1831 63750
##   .. ..$ : int [1:2] 2972 64902
##   .. ..$ : int [1:27] 218 491 776 1651 1823 1864 2067 2260 2540 3268 ...
##   .. ..$ : int [1:5] 2980 3126 3129 9631 64915
##   .. ..$ : int [1:49] 48 212 534 2094 2252 2566 2855 3491 3599 3618 ...
##   .. ..$ : int 3231
##   .. ..$ : int 4723
##   .. ..$ : int [1:134] 162 180 242 358 508 821 941 976 1038 1118 ...
##   .. ..$ : int [1:4] 2897 3031 68401 71378
##   .. ..$ : int [1:3] 6423 6463 68397
##   .. ..$ : int [1:14] 743 852 1952 2000 2050 2589 2727 2799 62661 62752 ...
##   .. ..$ : int [1:7] 259 564 3579 3951 4432 5515 67414
##   .. ..$ : int [1:4] 1804 6346 63727 64912
##   .. ..$ : int 4270
##   .. ..$ : int 2922
##   .. ..$ : int [1:108] 46 300 384 434 662 696 755 909 913 1011 ...
##   .. ..$ : int [1:17] 194 935 1884 3372 3409 4069 4238 4275 4289 4436 ...
##   .. ..$ : int [1:78] 17 613 616 693 759 798 894 943 1085 1158 ...
##   .. ..$ : int [1:2] 3124 65054
##   .. ..$ : int [1:2] 3074 81153
##   .. ..$ : int [1:21] 134 219 595 644 827 928 994 1031 1689 2607 ...
##   .. ..$ : int 6449
##   .. ..$ : int [1:12] 802 881 929 1279 1386 1467 1478 1732 2085 2352 ...
##   .. ..$ : int [1:3] 59 189 2223
##   .. ..$ : int [1:6] 6365 12992 16143 19247 22372 74926
##   .. ..$ : int [1:235] 27 313 580 721 860 992 1032 1068 1588 1604 ...
##   .. ..$ : int [1:6] 3119 6393 6456 65053 68324 68386
##   .. ..$ : int [1:3] 3063 6241 65105
##   .. ..$ : int [1:3] 2923 68316 68317
##   .. ..$ : int 2948
##   .. ..$ : int [1:3] 3052 64866 64869
##   .. ..$ : int [1:4] 1812 6341 64919 65059
##   .. ..$ : int [1:171] 57 91 129 203 568 668 738 804 890 915 ...
##   .. ..$ : int [1:18] 144 1569 2943 3163 3170 3643 4800 9636 65094 65559 ...
##   .. ..$ : int [1:7] 1362 4569 9501 11229 63271 64975 65122
##   .. ..$ : int [1:2] 64863 64980
##   .. ..$ : int [1:20] 75 270 1319 1452 3266 3539 3578 3743 6575 6837 ...
##   .. ..$ : int [1:267] 51 487 676 695 782 927 999 1045 1107 1169 ...
##   .. ..$ : int [1:2] 1463 63380
##   .. ..$ : int [1:108] 93 137 221 246 351 409 643 655 739 867 ...
##   .. ..$ : int [1:6] 1632 1760 1976 2582 2912 5178
##   .. ..$ : int [1:153] 66 96 112 143 543 794 836 1020 1291 1471 ...
##   .. ..$ : int 121
##   .. ..$ : int 1764
##   .. ..$ : int [1:112] 116 238 385 426 562 789 840 921 1039 1144 ...
##   .. ..$ : int [1:42] 1909 2120 2508 3683 3959 4917 5055 5618 5709 6009 ...
##   .. ..$ : int [1:14] 2859 6320 9417 19204 64859 64861 64979 64982 68242 68244 ...
##   .. ..$ : int 366
##   .. ..$ : int [1:2] 1014 2037
##   .. ..$ : int [1:10] 1025 1179 1765 1784 2024 2032 2580 62943 63939 63950
##   .. ..$ : int 64913
##   .. ..$ : int [1:2] 19244 22314
##   .. ..$ : int [1:138] 217 1266 1303 1364 1425 2061 2289 2358 3202 3204 ...
##   .. ..$ : int [1:12] 2065 2558 2861 3481 4894 5166 5627 5812 67084 67093 ...
##   .. ..$ : int [1:5] 3165 9551 12956 13014 64998
##   .. ..$ : int [1:29] 285 511 2382 2845 3476 4055 4140 4483 4519 5167 ...
##   .. ..$ : int [1:81] 206 312 328 666 718 753 783 1013 1432 1517 ...
##   .. ..$ : int 10310
##   .. ..$ : int [1:14] 1363 1482 1647 1879 3064 3065 6255 63274 63569 65106 ...
##   .. ..$ : int [1:83] 310 332 372 457 727 1074 1194 1535 1699 1835 ...
##   .. ..$ : int [1:26] 532 2594 2929 3741 6306 6470 6473 7017 9590 9593 ...
##   .. ..$ : int [1:16] 156 2864 3280 3761 4938 5692 6641 8181 14987 15932 ...
##   .. ..$ : int [1:204] 452 675 724 758 864 919 990 1601 2475 2577 ...
##   .. ..$ : int [1:2] 3076 64891
##   .. ..$ : int [1:55] 2 170 768 887 945 1002 1037 1079 1224 1265 ...
##   .. ..$ : int [1:79] 29 118 556 683 824 1060 1111 1177 1380 1405 ...
##   .. ..$ : int [1:94] 18 175 591 620 657 773 896 937 991 1030 ...
##   .. ..$ : int [1:2] 3005 6339
##   .. ..$ : int [1:20] 354 398 1097 1761 2119 2268 2384 3961 5020 5290 ...
##   .. ..$ : int [1:75] 21 244 376 435 458 765 853 946 979 1042 ...
##   .. ..$ : int [1:46] 50 387 513 1528 2205 2388 3598 5716 10165 13097 ...
##   .. ..$ : int [1:19] 178 280 1659 1825 2612 2863 3349 3775 4384 5118 ...
##   .. ..$ : int [1:2] 15949 26231
##   .. ..$ : int [1:33] 182 278 547 597 617 1132 1256 1317 1335 1407 ...
##   .. ..$ : int 19203
##   .. ..$ : int [1:317] 37 102 577 633 688 816 857 973 1098 1120 ...
##   .. ..$ : int [1:28] 150 172 309 322 599 645 822 982 1021 1114 ...
##   .. ..$ : int [1:225] 6 160 224 501 583 855 955 984 1155 1167 ...
##   .. ..$ : int [1:114] 33 107 422 478 699 760 875 936 1070 1134 ...
##   .. ..$ : int [1:2] 1234 2896
##   .. ..$ : int 6215
##   .. ..$ : int [1:6] 2109 2194 2262 2598 5753 64467
##   .. ..$ : int [1:2] 3114 65041
##   .. ..$ : int 3582
##   .. ..$ : int [1:11] 368 423 1522 1791 2035 2216 62284 62339 64129 64140 ...
##   .. ..$ : int [1:79] 151 240 375 438 462 498 665 1867 2096 2246 ...
##   .. .. [list output truncated]
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
summary(match_outcomes)
##   tourney_id        tourney_name         surface            draw_size     
##  Length:123864      Length:123864      Length:123864      Min.   :  2.00  
##  Class :character   Class :character   Class :character   1st Qu.: 32.00  
##  Mode  :character   Mode  :character   Mode  :character   Median : 32.00  
##                                                           Mean   : 55.35  
##                                                           3rd Qu.: 64.00  
##                                                           Max.   :128.00  
##                                                                           
##  tourney_level       tourney_date        match_num            id        
##  Length:123864      Min.   :20021230   Min.   :   1.0   Min.   :100644  
##  Class :character   1st Qu.:20070813   1st Qu.:  12.0   1st Qu.:103819  
##  Mode  :character   Median :20120827   Median :  31.0   Median :104607  
##                     Mean   :20126107   Mean   : 106.8   Mean   :110110  
##                     3rd Qu.:20180202   3rd Qu.: 247.0   3rd Qu.:105583  
##                     Max.   :20231127   Max.   :1701.0   Max.   :212051  
##                                                                         
##       seed          entry               name               hand          
##  Min.   : 1      Length:123864      Length:123864      Length:123864     
##  1st Qu.: 3      Class :character   Class :character   Class :character  
##  Median : 6      Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8                                                              
##  3rd Qu.:10                                                              
##  Max.   :35                                                              
##  NA's   :83347                                                           
##        ht            ioc                 age         opponent_id    
##  Min.   : 71.0   Length:123864      Min.   :14.50   Min.   :100644  
##  1st Qu.:183.0   Class :character   1st Qu.:23.60   1st Qu.:103819  
##  Median :185.0   Mode  :character   Median :26.40   Median :104607  
##  Mean   :186.1                      Mean   :26.54   Mean   :110110  
##  3rd Qu.:190.0                      3rd Qu.:29.30   3rd Qu.:105583  
##  Max.   :211.0                      Max.   :46.00   Max.   :212051  
##  NA's   :3746                       NA's   :11                      
##  opponent_seed   opponent_entry     opponent_name      opponent_hand     
##  Min.   : 1      Length:123864      Length:123864      Length:123864     
##  1st Qu.: 3      Class :character   Class :character   Class :character  
##  Median : 6      Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8                                                              
##  3rd Qu.:10                                                              
##  Max.   :35                                                              
##  NA's   :83347                                                           
##   opponent_ht    opponent_ioc        opponent_age      score          
##  Min.   : 71.0   Length:123864      Min.   :14.50   Length:123864     
##  1st Qu.:183.0   Class :character   1st Qu.:23.60   Class :character  
##  Median :185.0   Mode  :character   Median :26.40   Mode  :character  
##  Mean   :186.1                      Mean   :26.54                     
##  3rd Qu.:190.0                      3rd Qu.:29.30                     
##  Max.   :211.0                      Max.   :46.00                     
##  NA's   :3746                       NA's   :11                        
##     best_of         round              minutes            ace         
##  Min.   :3.000   Length:123864      Min.   :   0.0   Min.   :  0.000  
##  1st Qu.:3.000   Class :character   1st Qu.:  77.0   1st Qu.:  2.000  
##  Median :3.000   Mode  :character   Median : 100.0   Median :  5.000  
##  Mean   :3.453                      Mean   : 107.8   Mean   :  6.067  
##  3rd Qu.:3.000                      3rd Qu.: 131.0   3rd Qu.:  8.000  
##  Max.   :5.000                      Max.   :1146.0   Max.   :113.000  
##                                     NA's   :13556    NA's   :10624    
##        df              svpt           firstIn          firstWon     
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 1.000   1st Qu.: 58.00   1st Qu.: 35.00   1st Qu.: 25.00  
##  Median : 2.000   Median : 74.00   Median : 45.00   Median : 32.00  
##  Mean   : 2.939   Mean   : 79.52   Mean   : 48.62   Mean   : 34.63  
##  3rd Qu.: 4.000   3rd Qu.: 96.00   3rd Qu.: 59.00   3rd Qu.: 42.00  
##  Max.   :26.000   Max.   :491.00   Max.   :361.00   Max.   :292.00  
##  NA's   :10624    NA's   :10624    NA's   :10624    NA's   :10624   
##      secWon           SvGms          bpSaved          bpFaced      
##  Min.   :  0.00   Min.   : 0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 11.00   1st Qu.: 9.00   1st Qu.: 2.000   1st Qu.: 3.000  
##  Median : 15.00   Median :11.00   Median : 3.000   Median : 6.000  
##  Mean   : 15.65   Mean   :12.43   Mean   : 4.088   Mean   : 6.769  
##  3rd Qu.: 20.00   3rd Qu.:15.00   3rd Qu.: 6.000   3rd Qu.: 9.000  
##  Max.   :101.00   Max.   :91.00   Max.   :27.000   Max.   :38.000  
##  NA's   :10624    NA's   :10622   NA's   :10624    NA's   :10624   
##   opponent_ace      opponent_df     opponent_svpt    opponent_firstIn
##  Min.   :  0.000   Min.   : 0.000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  2.000   1st Qu.: 1.000   1st Qu.: 58.00   1st Qu.: 35.00  
##  Median :  5.000   Median : 2.000   Median : 74.00   Median : 45.00  
##  Mean   :  6.067   Mean   : 2.939   Mean   : 79.52   Mean   : 48.62  
##  3rd Qu.:  8.000   3rd Qu.: 4.000   3rd Qu.: 96.00   3rd Qu.: 59.00  
##  Max.   :113.000   Max.   :26.000   Max.   :491.00   Max.   :361.00  
##  NA's   :10624     NA's   :10624    NA's   :10624    NA's   :10624   
##  opponent_firstWon opponent_secWon  opponent_SvGms  opponent_bpSaved
##  Min.   :  0.00    Min.   :  0.00   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 25.00    1st Qu.: 11.00   1st Qu.: 9.00   1st Qu.: 2.000  
##  Median : 32.00    Median : 15.00   Median :11.00   Median : 3.000  
##  Mean   : 34.63    Mean   : 15.65   Mean   :12.43   Mean   : 4.088  
##  3rd Qu.: 42.00    3rd Qu.: 20.00   3rd Qu.:15.00   3rd Qu.: 6.000  
##  Max.   :292.00    Max.   :101.00   Max.   :91.00   Max.   :27.000  
##  NA's   :10624     NA's   :10624    NA's   :10622   NA's   :10624   
##  opponent_bpFaced      rank          rank_points    opponent_rank    
##  Min.   : 0.000   Min.   :   1.00   Min.   :    1   Min.   :   1.00  
##  1st Qu.: 3.000   1st Qu.:  25.00   1st Qu.:  510   1st Qu.:  25.00  
##  Median : 6.000   Median :  56.00   Median :  825   Median :  56.00  
##  Mean   : 6.769   Mean   :  98.58   Mean   : 1342   Mean   :  98.58  
##  3rd Qu.: 9.000   3rd Qu.: 100.00   3rd Qu.: 1405   3rd Qu.: 100.00  
##  Max.   :38.000   Max.   :2159.00   Max.   :16950   Max.   :2159.00  
##  NA's   :10624    NA's   :1567      NA's   :1567    NA's   :1567     
##  opponent_rank_points    seeded          opponent_seeded       outcome   
##  Min.   :    1        Length:123864      Length:123864      Min.   :0.0  
##  1st Qu.:  510        Class :character   Class :character   1st Qu.:0.0  
##  Median :  825        Mode  :character   Mode  :character   Median :0.5  
##  Mean   : 1342                                              Mean   :0.5  
##  3rd Qu.: 1405                                              3rd Qu.:1.0  
##  Max.   :16950                                              Max.   :1.0  
##  NA's   :1567                                                            
##      ht_dif        age_dif         rank_dif        avg_ace      
##  Min.   :-112   Min.   :-25.9   Min.   :-2125   Min.   : 0.000  
##  1st Qu.:  -5   1st Qu.: -3.7   1st Qu.:  -43   1st Qu.: 3.786  
##  Median :   0   Median :  0.0   Median :    0   Median : 5.429  
##  Mean   :   0   Mean   :  0.0   Mean   :    0   Mean   : 6.021  
##  3rd Qu.:   5   3rd Qu.:  3.7   3rd Qu.:   43   3rd Qu.: 7.670  
##  Max.   : 112   Max.   : 25.9   Max.   : 2125   Max.   :19.821  
##  NA's   :5792   NA's   :22      NA's   :2746    NA's   :2469    
##      avg_df          avg_svpt       avg_firstIn      avg_firstWon  
##  Min.   : 0.000   Min.   : 20.00   Min.   : 11.00   Min.   : 6.00  
##  1st Qu.: 2.332   1st Qu.: 77.35   1st Qu.: 46.57   1st Qu.:32.56  
##  Median : 2.803   Median : 79.49   Median : 48.54   Median :34.43  
##  Mean   : 2.946   Mean   : 79.49   Mean   : 48.60   Mean   :34.55  
##  3rd Qu.: 3.461   3rd Qu.: 81.52   3rd Qu.: 50.45   3rd Qu.:36.23  
##  Max.   :15.000   Max.   :163.00   Max.   :123.50   Max.   :74.50  
##  NA's   :2469     NA's   :2469     NA's   :2469     NA's   :2469   
##    avg_secWon      avg_SvGms      avg_bpSaved      avg_bpFaced    
##  Min.   : 0.00   Min.   : 4.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:14.48   1st Qu.:12.09   1st Qu.: 3.797   1st Qu.: 6.100  
##  Median :15.65   Median :12.41   Median : 4.107   Median : 6.798  
##  Mean   :15.62   Mean   :12.42   Mean   : 4.106   Mean   : 6.810  
##  3rd Qu.:16.77   3rd Qu.:12.77   3rd Qu.: 4.440   3rd Qu.: 7.500  
##  Max.   :39.00   Max.   :25.00   Max.   :18.000   Max.   :24.000  
##  NA's   :2469    NA's   :2469    NA's   :2469     NA's   :2469    
##  avg_opponent_ace avg_opponent_df  avg_opponent_svpt avg_opponent_firstIn
##  Min.   : 0.000   Min.   : 0.000   Min.   : 20.00    Min.   : 11.00      
##  1st Qu.: 3.786   1st Qu.: 2.332   1st Qu.: 77.35    1st Qu.: 46.57      
##  Median : 5.429   Median : 2.803   Median : 79.49    Median : 48.54      
##  Mean   : 6.021   Mean   : 2.946   Mean   : 79.49    Mean   : 48.60      
##  3rd Qu.: 7.670   3rd Qu.: 3.461   3rd Qu.: 81.52    3rd Qu.: 50.45      
##  Max.   :19.821   Max.   :15.000   Max.   :163.00    Max.   :123.50      
##  NA's   :2469     NA's   :2469     NA's   :2469      NA's   :2469        
##  avg_opponent_firstWon avg_opponent_secWon avg_opponent_SvGms
##  Min.   : 6.00         Min.   : 0.00       Min.   : 4.00     
##  1st Qu.:32.56         1st Qu.:14.48       1st Qu.:12.09     
##  Median :34.43         Median :15.65       Median :12.41     
##  Mean   :34.55         Mean   :15.62       Mean   :12.42     
##  3rd Qu.:36.23         3rd Qu.:16.77       3rd Qu.:12.77     
##  Max.   :74.50         Max.   :39.00       Max.   :25.00     
##  NA's   :2469          NA's   :2469        NA's   :2469      
##  avg_opponent_bpSaved avg_opponent_bpFaced  surface_clay    surface_grass   
##  Min.   : 0.000       Min.   : 0.000       Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 3.797       1st Qu.: 6.100       1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 4.107       Median : 6.798       Median :0.0000   Median :0.0000  
##  Mean   : 4.106       Mean   : 6.810       Mean   :0.3211   Mean   :0.1036  
##  3rd Qu.: 4.440       3rd Qu.: 7.500       3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :18.000       Max.   :24.000       Max.   :1.0000   Max.   :1.0000  
##  NA's   :2469         NA's   :2469                                          
##   surface_hard   
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.5529  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 
## match_outcomes has all of my variables ##

# new DF for modeling
tennis_df = match_outcomes

# Removing pointless variables
tennis_df = select(tennis_df, -seed, -entry, -opponent_seed, -opponent_entry,
                   -tourney_id,-tourney_name,-tourney_date,-match_num,-name,-ioc,
                   -opponent_name,-opponent_ioc,-score,-round,-best_of,-surface)

# Removing variables related to match outcome
tennis_df = select(tennis_df,-minutes,-ace,-df,-svpt,-firstIn,-firstWon,-secWon,-SvGms,-bpSaved,
                   -bpFaced,-opponent_ace,-opponent_df,-opponent_svpt,-opponent_firstIn,
                   -opponent_firstWon,-opponent_secWon,-opponent_SvGms,-opponent_bpSaved,
                   -opponent_bpFaced)

# Removing height, age, rank since I have the differences instead. also rank points
tennis_df = select(tennis_df,-ht,-age,-rank,-rank_points,-opponent_ht,-opponent_age,-opponent_rank,
                   -opponent_rank_points)

# Removing IDs
tennis_df = ungroup(tennis_df)
tennis_df = select(tennis_df,-id,-opponent_id)

#coding U (unknown) hand as NAs
tennis_df$hand = ifelse(tennis_df$hand == 'U',NA, tennis_df$hand)
tennis_df$opponent_hand = ifelse(tennis_df$opponent_hand == 'U',NA, tennis_df$opponent_hand)


# remove rows containing missing values
tennis_df = na.omit(tennis_df) #123,864 to 116,166

# Looking at the relationships between the numeric variables
tennis_df_num <- dplyr::select_if(tennis_df, is.numeric)
corrplot(cor(tennis_df_num), method = c("number"), type = c("lower"),
         number.cex=0.65, tl.cex = 0.65,tl.col="black")

#pairs(tennis_df_num)
summary(tennis_df_num)
##    draw_size         outcome        ht_dif       age_dif         rank_dif    
##  Min.   :  2.00   Min.   :0.0   Min.   :-41   Min.   :-25.9   Min.   :-2094  
##  1st Qu.: 32.00   1st Qu.:0.0   1st Qu.: -5   1st Qu.: -3.7   1st Qu.:  -41  
##  Median : 32.00   Median :0.5   Median :  0   Median :  0.0   Median :    0  
##  Mean   : 57.93   Mean   :0.5   Mean   :  0   Mean   :  0.0   Mean   :    0  
##  3rd Qu.: 64.00   3rd Qu.:1.0   3rd Qu.:  5   3rd Qu.:  3.7   3rd Qu.:   41  
##  Max.   :128.00   Max.   :1.0   Max.   : 41   Max.   : 25.9   Max.   : 2094  
##     avg_ace           avg_df          avg_svpt       avg_firstIn    
##  Min.   : 0.000   Min.   : 0.000   Min.   : 36.00   Min.   : 16.00  
##  1st Qu.: 3.842   1st Qu.: 2.331   1st Qu.: 77.39   1st Qu.: 46.60  
##  Median : 5.482   Median : 2.799   Median : 79.50   Median : 48.54  
##  Mean   : 6.078   Mean   : 2.932   Mean   : 79.55   Mean   : 48.64  
##  3rd Qu.: 7.712   3rd Qu.: 3.430   3rd Qu.: 81.45   3rd Qu.: 50.33  
##  Max.   :19.821   Max.   :10.000   Max.   :159.00   Max.   :100.00  
##   avg_firstWon     avg_secWon      avg_SvGms      avg_bpSaved    
##  Min.   : 8.00   Min.   : 1.00   Min.   : 6.00   Min.   : 0.000  
##  1st Qu.:32.65   1st Qu.:14.55   1st Qu.:12.11   1st Qu.: 3.797  
##  Median :34.57   Median :15.66   Median :12.42   Median : 4.100  
##  Mean   :34.66   Mean   :15.67   Mean   :12.44   Mean   : 4.087  
##  3rd Qu.:36.23   3rd Qu.:16.78   3rd Qu.:12.77   3rd Qu.: 4.437  
##  Max.   :62.00   Max.   :39.00   Max.   :25.00   Max.   :15.000  
##   avg_bpFaced     avg_opponent_ace avg_opponent_df  avg_opponent_svpt
##  Min.   : 1.000   Min.   : 0.000   Min.   : 0.000   Min.   : 36.00   
##  1st Qu.: 6.099   1st Qu.: 3.842   1st Qu.: 2.331   1st Qu.: 77.39   
##  Median : 6.793   Median : 5.482   Median : 2.799   Median : 79.50   
##  Mean   : 6.763   Mean   : 6.078   Mean   : 2.932   Mean   : 79.55   
##  3rd Qu.: 7.470   3rd Qu.: 7.712   3rd Qu.: 3.430   3rd Qu.: 81.45   
##  Max.   :23.000   Max.   :19.821   Max.   :10.000   Max.   :159.00   
##  avg_opponent_firstIn avg_opponent_firstWon avg_opponent_secWon
##  Min.   : 16.00       Min.   : 8.00         Min.   : 1.00      
##  1st Qu.: 46.60       1st Qu.:32.65         1st Qu.:14.55      
##  Median : 48.54       Median :34.57         Median :15.66      
##  Mean   : 48.64       Mean   :34.66         Mean   :15.67      
##  3rd Qu.: 50.33       3rd Qu.:36.23         3rd Qu.:16.78      
##  Max.   :100.00       Max.   :62.00         Max.   :39.00      
##  avg_opponent_SvGms avg_opponent_bpSaved avg_opponent_bpFaced  surface_clay   
##  Min.   : 6.00      Min.   : 0.000       Min.   : 1.000       Min.   :0.0000  
##  1st Qu.:12.11      1st Qu.: 3.797       1st Qu.: 6.099       1st Qu.:0.0000  
##  Median :12.42      Median : 4.100       Median : 6.793       Median :0.0000  
##  Mean   :12.44      Mean   : 4.087       Mean   : 6.763       Mean   :0.3187  
##  3rd Qu.:12.77      3rd Qu.: 4.437       3rd Qu.: 7.470       3rd Qu.:1.0000  
##  Max.   :25.00      Max.   :15.000       Max.   :23.000       Max.   :1.0000  
##  surface_grass     surface_hard   
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000  
##  Mean   :0.1074   Mean   :0.5544  
##  3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000
high_correlation <- which(cor(tennis_df_num) > 0.8 & cor(tennis_df_num) < 1, arr.ind = TRUE)
print(high_correlation)
##                       row col
## avg_SvGms              12   8
## avg_SvGms              12  10
## avg_svpt                8  12
## avg_firstWon           10  12
## avg_bpFaced            14  13
## avg_bpSaved            13  14
## avg_opponent_SvGms     21  17
## avg_opponent_SvGms     21  19
## avg_opponent_svpt      17  21
## avg_opponent_firstWon  19  21
## avg_opponent_bpFaced   23  22
## avg_opponent_bpSaved   22  23
# bp faced & saved are highly correlated (greater than 0.9), will remove faced
tennis_df = select(tennis_df,-avg_bpFaced,-avg_opponent_bpFaced)
# service points and service games are highly correlated (0.88) will remove games
tennis_df = select(tennis_df,-avg_SvGms,-avg_opponent_SvGms)

# Look again
tennis_df_num <- dplyr::select_if(tennis_df, is.numeric)
corrplot(cor(tennis_df_num), method = c("number"), type = c("lower"),
         number.cex=0.65, tl.cex = 0.65,tl.col="black")

# Data Type Changes 
#tennis_df$tourney_date = as.Date(as.character(tennis_df$tourney_date),format = "%Y%m%d")
#tennis_df$surface = as.factor(tennis_df$surface)
tennis_df$tourney_level = as.factor(tennis_df$tourney_level)
tennis_df$hand = as.factor(tennis_df$hand)
tennis_df$opponent_hand = as.factor(tennis_df$opponent_hand)
tennis_df$seeded = as.factor(tennis_df$seeded)
tennis_df$opponent_seeded = as.factor(tennis_df$opponent_seeded)
tennis_df$outcome = as.factor(tennis_df$outcome)
tennis_df$surface_clay = as.factor(tennis_df$surface_clay)
tennis_df$surface_grass = as.factor(tennis_df$surface_grass)
tennis_df$surface_hard = as.factor(tennis_df$surface_hard)

str(tennis_df)
## tibble [116,974 × 27] (S3: tbl_df/tbl/data.frame)
##  $ draw_size            : int [1:116974] 48 48 48 48 48 48 48 48 48 48 ...
##  $ tourney_level        : Factor w/ 5 levels "A","D","F","G",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ hand                 : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 1 2 2 2 ...
##  $ opponent_hand        : Factor w/ 2 levels "L","R": 2 2 2 2 1 2 2 1 2 2 ...
##  $ seeded               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ opponent_seeded      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ outcome              : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ht_dif               : int [1:116974] -8 0 12 -3 -5 3 5 -10 -8 13 ...
##  $ age_dif              : num [1:116974] 7.5 -0.1 4 5.1 12.2 ...
##  $ rank_dif             : int [1:116974] -14 106 14 1 78 -5 8 6 43 -54 ...
##  $ avg_ace              : num [1:116974] 7.04 8.12 11.1 4.61 3.62 ...
##  $ avg_df               : num [1:116974] 2.54 5.34 3.36 2.35 3.11 ...
##  $ avg_svpt             : num [1:116974] 80.9 85.7 78 84.2 81.2 ...
##  $ avg_firstIn          : num [1:116974] 46.5 46.5 44.7 52.6 46.5 ...
##  $ avg_firstWon         : num [1:116974] 33.6 34.6 34 35.5 31.5 ...
##  $ avg_secWon           : num [1:116974] 18 18.9 15.6 16 17.4 ...
##  $ avg_bpSaved          : num [1:116974] 3.97 4.4 4.1 4.84 4.71 ...
##  $ avg_opponent_ace     : num [1:116974] 12.28 3.98 3.23 6.89 3.19 ...
##  $ avg_opponent_df      : num [1:116974] 2.28 3.25 2.59 3.55 1.71 ...
##  $ avg_opponent_svpt    : num [1:116974] 80.5 79.1 74 81.5 73.6 ...
##  $ avg_opponent_firstIn : num [1:116974] 47.9 46.6 49.9 50 50 ...
##  $ avg_opponent_firstWon: num [1:116974] 37.2 32.4 34.2 37.2 36.1 ...
##  $ avg_opponent_secWon  : num [1:116974] 17.1 16.1 12.5 16.4 13.5 ...
##  $ avg_opponent_bpSaved : num [1:116974] 3.35 3.73 3.93 3.77 3.37 ...
##  $ surface_clay         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ surface_grass        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ surface_hard         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:6890] 85 121 295 409 425 514 618 734 736 1227 ...
##   ..- attr(*, "names")= chr [1:6890] "85" "121" "295" "409" ...
names(tennis_df)
##  [1] "draw_size"             "tourney_level"         "hand"                 
##  [4] "opponent_hand"         "seeded"                "opponent_seeded"      
##  [7] "outcome"               "ht_dif"                "age_dif"              
## [10] "rank_dif"              "avg_ace"               "avg_df"               
## [13] "avg_svpt"              "avg_firstIn"           "avg_firstWon"         
## [16] "avg_secWon"            "avg_bpSaved"           "avg_opponent_ace"     
## [19] "avg_opponent_df"       "avg_opponent_svpt"     "avg_opponent_firstIn" 
## [22] "avg_opponent_firstWon" "avg_opponent_secWon"   "avg_opponent_bpSaved" 
## [25] "surface_clay"          "surface_grass"         "surface_hard"
### Splitting into train and test
set.seed(12)
index = sample(nrow(tennis_df), 0.8*nrow(tennis_df), replace = F) # 80/20 split
tennis_train = tennis_df[index,]
tennis_test = tennis_df[-index,]

#checking for balance
table(tennis_train$outcome)
## 
##     0     1 
## 46743 46836
#0     1 
#46477 46455
table(tennis_test$outcome)
## 
##     0     1 
## 11744 11651
#0     1 
#11606 11628
str(tennis_train)
## tibble [93,579 × 27] (S3: tbl_df/tbl/data.frame)
##  $ draw_size            : int [1:93579] 56 128 32 32 32 48 64 128 128 32 ...
##  $ tourney_level        : Factor w/ 5 levels "A","D","F","G",..: 1 4 1 1 1 1 5 4 4 1 ...
##  $ hand                 : Factor w/ 2 levels "L","R": 2 2 2 1 2 2 2 2 2 2 ...
##  $ opponent_hand        : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 1 2 2 2 ...
##  $ seeded               : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 2 2 2 ...
##  $ opponent_seeded      : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 2 1 1 1 1 ...
##  $ outcome              : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 2 2 ...
##  $ ht_dif               : int [1:93579] -13 0 -5 10 -15 18 3 -3 0 0 ...
##  $ age_dif              : num [1:93579] -6 -3.8 9.4 -2.9 -0.7 ...
##  $ rank_dif             : int [1:93579] -22 -50 -143 78 51 53 62 -42 -66 -107 ...
##  $ avg_ace              : num [1:93579] 4.22 3.88 7.29 7.23 4.11 ...
##  $ avg_df               : num [1:93579] 3.32 3.25 3.27 2.7 1.84 ...
##  $ avg_svpt             : num [1:93579] 76.5 80.4 79.8 79.3 79.3 ...
##  $ avg_firstIn          : num [1:93579] 45 47.5 42.4 48 51.7 ...
##  $ avg_firstWon         : num [1:93579] 30.7 33.1 31.7 35.2 35.9 ...
##  $ avg_secWon           : num [1:93579] 15.7 17.2 19.3 15.4 14.5 ...
##  $ avg_bpSaved          : num [1:93579] 4.32 4.07 3.82 3.81 4.18 ...
##  $ avg_opponent_ace     : num [1:93579] 4.84 4.88 8.21 6.49 9.61 ...
##  $ avg_opponent_df      : num [1:93579] 1.64 2.75 4.46 2.48 5.24 ...
##  $ avg_opponent_svpt    : num [1:93579] 79.6 76.1 79.6 78.4 86.8 ...
##  $ avg_opponent_firstIn : num [1:93579] 55 47 46.4 47.5 53.8 ...
##  $ avg_opponent_firstWon: num [1:93579] 38.1 31.3 34.2 35.5 40.2 ...
##  $ avg_opponent_secWon  : num [1:93579] 12.4 13.3 16.2 17 16.2 ...
##  $ avg_opponent_bpSaved : num [1:93579] 4.18 3.88 3.61 3.72 4.32 ...
##  $ surface_clay         : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 2 1 ...
##  $ surface_grass        : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
##  $ surface_hard         : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 1 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:6890] 85 121 295 409 425 514 618 734 736 1227 ...
##   ..- attr(*, "names")= chr [1:6890] "85" "121" "295" "409" ...
#frequency table for tennis_train$outcome
train_outcome_table <- table(tennis_train$outcome)

# frequency table for tennis_test$outcome
test_outcome_table <- table(tennis_test$outcome)

# Create a dataframe for visualization
outcome_data <- data.frame(
  dataset = rep(c("Train", "Test"), each = 2),
  outcome = rep(c("0", "1"), times = 2),
  count = c(train_outcome_table, test_outcome_table)
)
# Create bar graph using ggplot
ggplot(outcome_data, aes(x = outcome, y = count, fill = dataset)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Distribution of Outcome in Train and Test Datasets",
       x = "Outcome",
       y = "Count",
       fill = "Dataset") +
  theme_minimal()

# Histograms to understand skew
dev.new(width = 1500, height = 1000, unit = "px")
par(mfrow = c(3, 4))
hist(tennis_train$draw_size, main = "Distribution of Draw Size", xlab = "")
hist(tennis_train$ht_dif, main = "Distribution of Height Difference", xlab = "")
hist(tennis_train$age_dif, main = "Distribution of Age Difference", xlab = "")
hist(tennis_train$rank_dif, main = "Distribution of Rank Difference", xlab = "")
hist(tennis_train$avg_ace, main = "Distribution of Average Aces", xlab = "") #right skew
hist(tennis_train$avg_df, main = "Distribution of Average Double Faults", xlab = "") #right skew
hist(tennis_train$avg_svpt, main = "Distribution of Average Serve Points", xlab = "")
hist(tennis_train$avg_ace, main = "Distribution of Average Aces", xlab = "")
hist(tennis_train$avg_firstIn, main = "Distribution of Avg 1st Serve In", xlab = "")
hist(tennis_train$avg_firstWon, main = "Distribution of Avg 1st Serve Won", xlab = "")
hist(tennis_train$avg_secWon, main = "Distribution of Avg 2nd Serve Won", xlab = "") 
hist(tennis_train$avg_bpSaved, main = "Distribution of Avg BP Saved", xlab = "")
library(ggplot2)

# Prettier Histograms
ggplot(tennis_train, aes(x = draw_size)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Draw Size", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = ht_dif)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Height Difference", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = age_dif)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Age Difference", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = rank_dif)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Rank Difference", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_ace)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Average Aces", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_df)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Average Double Faults", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_svpt)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Average Serve Points", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_ace)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Average Aces", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_firstIn)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Avg 1st Serve In", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_firstWon)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Avg 1st Serve Won", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_secWon)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Avg 2nd Serve Won", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)

ggplot(tennis_train, aes(x = avg_bpSaved)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Avg BP Saved", x = "") +
  theme_minimal() +
  scale_y_continuous(NULL)


####
#### Which variables impact whether a tennis player wins a match?
####

#Logistic Regression
log.all = glm(outcome ~ .,data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ ., family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.518e-01  2.182e-01  -0.696 0.486695    
## draw_size              2.518e-05  5.331e-04   0.047 0.962321    
## tourney_levelD         1.219e-02  3.818e-02   0.319 0.749473    
## tourney_levelF         6.091e-02  9.077e-02   0.671 0.502164    
## tourney_levelG         1.467e-03  5.329e-02   0.028 0.978036    
## tourney_levelM         2.358e-03  2.942e-02   0.080 0.936118    
## handR                  7.998e-02  2.244e-02   3.564 0.000365 ***
## opponent_handR        -1.012e-01  2.242e-02  -4.514 6.36e-06 ***
## seededYes              3.691e-01  1.749e-02  21.101  < 2e-16 ***
## opponent_seededYes    -3.637e-01  1.750e-02 -20.787  < 2e-16 ***
## ht_dif                 2.046e-03  1.107e-03   1.847 0.064683 .  
## age_dif               -1.126e-02  1.324e-03  -8.503  < 2e-16 ***
## rank_dif              -2.770e-03  7.860e-05 -35.243  < 2e-16 ***
## avg_ace               -1.435e-01  6.415e-03 -22.374  < 2e-16 ***
## avg_df                 3.451e-02  1.094e-02   3.156 0.001599 ** 
## avg_svpt              -1.751e-01  9.968e-03 -17.570  < 2e-16 ***
## avg_firstIn           -3.251e-02  1.285e-02  -2.531 0.011375 *  
## avg_firstWon           3.151e-01  1.267e-02  24.874  < 2e-16 ***
## avg_secWon             2.834e-01  1.676e-02  16.901  < 2e-16 ***
## avg_bpSaved            7.039e-02  3.037e-02   2.318 0.020451 *  
## avg_opponent_ace       1.399e-01  6.376e-03  21.939  < 2e-16 ***
## avg_opponent_df       -3.780e-02  1.092e-02  -3.462 0.000537 ***
## avg_opponent_svpt      1.858e-01  1.004e-02  18.499  < 2e-16 ***
## avg_opponent_firstIn   2.220e-02  1.288e-02   1.724 0.084757 .  
## avg_opponent_firstWon -3.097e-01  1.261e-02 -24.560  < 2e-16 ***
## avg_opponent_secWon   -3.029e-01  1.689e-02 -17.932  < 2e-16 ***
## avg_opponent_bpSaved  -8.791e-02  3.042e-02  -2.889 0.003861 ** 
## surface_clay1          4.946e-02  5.313e-02   0.931 0.351840    
## surface_grass1         3.664e-02  5.660e-02   0.647 0.517349    
## surface_hard1          3.989e-02  5.255e-02   0.759 0.447839    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 114378  on 93549  degrees of freedom
## AIC: 114438
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: first removing avg_opponent_firstIn (43.77)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              8.868828  1        2.978058
## tourney_level         10.339998  4        1.339106
## hand                   1.119116  1        1.057883
## opponent_hand          1.121494  1        1.059006
## seeded                 1.354503  1        1.163831
## opponent_seeded        1.356026  1        1.164485
## ht_dif                 2.222873  1        1.490930
## age_dif                1.039727  1        1.019670
## rank_dif               1.253642  1        1.119661
## avg_ace                8.134836  1        2.852163
## avg_df                 1.772299  1        1.331277
## avg_svpt              30.586788  1        5.530532
## avg_firstIn           42.934662  1        6.552455
## avg_firstWon          32.628871  1        5.712169
## avg_secWon            16.722916  1        4.089366
## avg_bpSaved            5.617343  1        2.370093
## avg_opponent_ace       8.099690  1        2.845995
## avg_opponent_df        1.768078  1        1.329691
## avg_opponent_svpt     31.410080  1        5.604470
## avg_opponent_firstIn  43.595122  1        6.602660
## avg_opponent_firstWon 32.816154  1        5.728539
## avg_opponent_secWon   17.059079  1        4.130264
## avg_opponent_bpSaved   5.640536  1        2.374981
## surface_clay          12.118586  1        3.481176
## surface_grass          6.081792  1        2.466129
## surface_hard          13.515292  1        3.676315
log.all = glm(outcome ~ . -avg_opponent_firstIn,data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn, family = binomial, 
##     data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.557e-01  2.182e-01  -0.714 0.475429    
## draw_size              2.569e-05  5.331e-04   0.048 0.961559    
## tourney_levelD         1.289e-02  3.818e-02   0.338 0.735584    
## tourney_levelF         5.959e-02  9.075e-02   0.657 0.511403    
## tourney_levelG         1.632e-03  5.329e-02   0.031 0.975570    
## tourney_levelM         2.611e-03  2.942e-02   0.089 0.929267    
## handR                  8.012e-02  2.244e-02   3.570 0.000356 ***
## opponent_handR        -1.065e-01  2.220e-02  -4.796 1.62e-06 ***
## seededYes              3.691e-01  1.749e-02  21.100  < 2e-16 ***
## opponent_seededYes    -3.631e-01  1.749e-02 -20.754  < 2e-16 ***
## ht_dif                 2.328e-03  1.095e-03   2.126 0.033532 *  
## age_dif               -1.125e-02  1.324e-03  -8.497  < 2e-16 ***
## rank_dif              -2.772e-03  7.860e-05 -35.262  < 2e-16 ***
## avg_ace               -1.436e-01  6.415e-03 -22.381  < 2e-16 ***
## avg_df                 3.470e-02  1.094e-02   3.173 0.001508 ** 
## avg_svpt              -1.755e-01  9.965e-03 -17.612  < 2e-16 ***
## avg_firstIn           -3.167e-02  1.284e-02  -2.467 0.013624 *  
## avg_firstWon           3.142e-01  1.266e-02  24.825  < 2e-16 ***
## avg_secWon             2.843e-01  1.676e-02  16.966  < 2e-16 ***
## avg_bpSaved            7.007e-02  3.036e-02   2.308 0.021017 *  
## avg_opponent_ace       1.331e-01  5.028e-03  26.474  < 2e-16 ***
## avg_opponent_df       -4.633e-02  9.736e-03  -4.759 1.95e-06 ***
## avg_opponent_svpt      1.981e-01  7.059e-03  28.070  < 2e-16 ***
## avg_opponent_firstWon -2.936e-01  8.509e-03 -34.509  < 2e-16 ***
## avg_opponent_secWon   -3.268e-01  9.650e-03 -33.871  < 2e-16 ***
## avg_opponent_bpSaved  -8.801e-02  3.041e-02  -2.894 0.003808 ** 
## surface_clay1          4.957e-02  5.313e-02   0.933 0.350766    
## surface_grass1         3.556e-02  5.660e-02   0.628 0.529786    
## surface_hard1          3.918e-02  5.255e-02   0.746 0.455937    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 114381  on 93550  degrees of freedom
## AIC: 114439
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: next removing avg_firstIn (43.414)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              8.868515  1        2.978005
## tourney_level         10.337036  4        1.339058
## hand                   1.119099  1        1.057875
## opponent_hand          1.100471  1        1.049034
## seeded                 1.354505  1        1.163832
## opponent_seeded        1.355508  1        1.164263
## ht_dif                 2.173863  1        1.474403
## age_dif                1.039719  1        1.019666
## rank_dif               1.253577  1        1.119633
## avg_ace                8.134073  1        2.852030
## avg_df                 1.772207  1        1.331242
## avg_svpt              30.570504  1        5.529060
## avg_firstIn           42.874997  1        6.547900
## avg_firstWon          32.571055  1        5.707106
## avg_secWon            16.706364  1        4.087342
## avg_bpSaved            5.616995  1        2.370020
## avg_opponent_ace       5.034226  1        2.243708
## avg_opponent_df        1.404763  1        1.185227
## avg_opponent_svpt     15.509788  1        3.938247
## avg_opponent_firstWon 14.937719  1        3.864935
## avg_opponent_secWon    5.568464  1        2.359759
## avg_opponent_bpSaved   5.638733  1        2.374602
## surface_clay          12.120242  1        3.481414
## surface_grass          6.080515  1        2.465870
## surface_hard          13.515549  1        3.676350
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn,data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn, 
##     family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.518e-01  2.182e-01  -0.696  0.48655    
## draw_size              2.204e-05  5.330e-04   0.041  0.96702    
## tourney_levelD         1.198e-02  3.818e-02   0.314  0.75376    
## tourney_levelF         6.136e-02  9.074e-02   0.676  0.49888    
## tourney_levelG         1.607e-03  5.328e-02   0.030  0.97594    
## tourney_levelM         2.331e-03  2.941e-02   0.079  0.93683    
## handR                  8.745e-02  2.223e-02   3.933 8.38e-05 ***
## opponent_handR        -1.063e-01  2.220e-02  -4.788 1.69e-06 ***
## seededYes              3.684e-01  1.749e-02  21.061  < 2e-16 ***
## opponent_seededYes    -3.629e-01  1.749e-02 -20.745  < 2e-16 ***
## ht_dif                 2.717e-03  1.084e-03   2.507  0.01217 *  
## age_dif               -1.123e-02  1.324e-03  -8.483  < 2e-16 ***
## rank_dif              -2.774e-03  7.861e-05 -35.294  < 2e-16 ***
## avg_ace               -1.338e-01  5.044e-03 -26.527  < 2e-16 ***
## avg_df                 4.687e-02  9.763e-03   4.801 1.58e-06 ***
## avg_svpt              -1.929e-01  7.046e-03 -27.377  < 2e-16 ***
## avg_firstWon           2.911e-01  8.509e-03  34.210  < 2e-16 ***
## avg_secWon             3.182e-01  9.631e-03  33.033  < 2e-16 ***
## avg_bpSaved            6.980e-02  3.034e-02   2.300  0.02144 *  
## avg_opponent_ace       1.336e-01  5.025e-03  26.578  < 2e-16 ***
## avg_opponent_df       -4.619e-02  9.736e-03  -4.744 2.09e-06 ***
## avg_opponent_svpt      1.980e-01  7.058e-03  28.053  < 2e-16 ***
## avg_opponent_firstWon -2.932e-01  8.507e-03 -34.469  < 2e-16 ***
## avg_opponent_secWon   -3.269e-01  9.649e-03 -33.878  < 2e-16 ***
## avg_opponent_bpSaved  -8.750e-02  3.041e-02  -2.877  0.00401 ** 
## surface_clay1          4.945e-02  5.313e-02   0.931  0.35199    
## surface_grass1         3.713e-02  5.659e-02   0.656  0.51175    
## surface_hard1          4.019e-02  5.255e-02   0.765  0.44439    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 114387  on 93551  degrees of freedom
## AIC: 114443
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: next removing avg_svpt (15.61)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              8.867371  1        2.977813
## tourney_level         10.333753  4        1.339005
## hand                   1.099359  1        1.048503
## opponent_hand          1.100480  1        1.049038
## seeded                 1.354326  1        1.163755
## opponent_seeded        1.355472  1        1.164247
## ht_dif                 2.128183  1        1.458829
## age_dif                1.039689  1        1.019652
## rank_dif               1.253458  1        1.119579
## avg_ace                5.023229  1        2.241256
## avg_df                 1.411406  1        1.188026
## avg_svpt              15.281833  1        3.909198
## avg_firstWon          14.710534  1        3.835431
## avg_secWon             5.522235  1        2.349944
## avg_bpSaved            5.612448  1        2.369061
## avg_opponent_ace       5.028257  1        2.242377
## avg_opponent_df        1.404790  1        1.185238
## avg_opponent_svpt     15.508422  1        3.938073
## avg_opponent_firstWon 14.930058  1        3.863943
## avg_opponent_secWon    5.568079  1        2.359678
## avg_opponent_bpSaved   5.638981  1        2.374654
## surface_clay          12.122032  1        3.481671
## surface_grass          6.078993  1        2.465561
## surface_hard          13.515815  1        3.676386
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn -avg_svpt,
              data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn - 
##     avg_svpt, family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.8123803  0.2083117  -8.700  < 2e-16 ***
## draw_size             -0.0005080  0.0005300  -0.958  0.33781    
## tourney_levelD         0.0100152  0.0379604   0.264  0.79191    
## tourney_levelF         0.1679421  0.0901555   1.863  0.06249 .  
## tourney_levelG         0.0600236  0.0529612   1.133  0.25707    
## tourney_levelM         0.0659128  0.0291736   2.259  0.02386 *  
## handR                 -0.0492628  0.0215634  -2.285  0.02234 *  
## opponent_handR        -0.1021406  0.0221236  -4.617  3.9e-06 ***
## seededYes              0.4683647  0.0171148  27.366  < 2e-16 ***
## opponent_seededYes    -0.3417574  0.0173989 -19.643  < 2e-16 ***
## ht_dif                 0.0027225  0.0010792   2.523  0.01165 *  
## age_dif               -0.0121910  0.0013181  -9.249  < 2e-16 ***
## rank_dif              -0.0029205  0.0000791 -36.921  < 2e-16 ***
## avg_ace               -0.0945144  0.0048472 -19.499  < 2e-16 ***
## avg_df                -0.0151459  0.0094924  -1.596  0.11058    
## avg_firstWon           0.0791349  0.0034027  23.257  < 2e-16 ***
## avg_secWon             0.0935516  0.0049372  18.948  < 2e-16 ***
## avg_bpSaved           -0.6024699  0.0187646 -32.107  < 2e-16 ***
## avg_opponent_ace       0.1297252  0.0050029  25.930  < 2e-16 ***
## avg_opponent_df       -0.0444049  0.0097006  -4.578  4.7e-06 ***
## avg_opponent_svpt      0.1933085  0.0070402  27.458  < 2e-16 ***
## avg_opponent_firstWon -0.2861071  0.0084810 -33.735  < 2e-16 ***
## avg_opponent_secWon   -0.3190889  0.0096151 -33.186  < 2e-16 ***
## avg_opponent_bpSaved  -0.0928105  0.0304101  -3.052  0.00227 ** 
## surface_clay1          0.0447359  0.0528975   0.846  0.39771    
## surface_grass1         0.0285444  0.0563555   0.507  0.61250    
## surface_hard1          0.0360902  0.0523240   0.690  0.49035    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 115165  on 93552  degrees of freedom
## AIC: 115219
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: next removing avg_opponent_svpt (15.38)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              8.870047  1        2.978262
## tourney_level         10.268747  4        1.337949
## hand                   1.045349  1        1.022423
## opponent_hand          1.101116  1        1.049341
## seeded                 1.309215  1        1.144209
## opponent_seeded        1.354258  1        1.163726
## ht_dif                 2.119729  1        1.455929
## age_dif                1.039816  1        1.019713
## rank_dif               1.255487  1        1.120485
## avg_ace                4.614727  1        2.148192
## avg_df                 1.347853  1        1.160971
## avg_firstWon           2.360432  1        1.536370
## avg_secWon             1.468842  1        1.211958
## avg_bpSaved            2.105905  1        1.451174
## avg_opponent_ace       5.031061  1        2.243003
## avg_opponent_df        1.405957  1        1.185731
## avg_opponent_svpt     15.451261  1        3.930809
## avg_opponent_firstWon 14.934654  1        3.864538
## avg_opponent_secWon    5.565545  1        2.359141
## avg_opponent_bpSaved   5.668957  1        2.380957
## surface_clay          12.125031  1        3.482102
## surface_grass          6.076622  1        2.465081
## surface_hard          13.517881  1        3.676667
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt -avg_opponent_svpt,
              data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn - 
##     avg_svpt - avg_opponent_svpt, family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.004e-01  1.970e-01  -0.510   0.6104    
## draw_size             -1.323e-05  5.272e-04  -0.025   0.9800    
## tourney_levelD         8.211e-03  3.777e-02   0.217   0.8279    
## tourney_levelF         5.826e-02  8.940e-02   0.652   0.5146    
## tourney_levelG         4.386e-03  5.266e-02   0.083   0.9336    
## tourney_levelM         3.286e-03  2.894e-02   0.114   0.9096    
## handR                 -5.171e-02  2.148e-02  -2.407   0.0161 *  
## opponent_handR         3.586e-02  2.144e-02   1.673   0.0944 .  
## seededYes              4.430e-01  1.701e-02  26.038   <2e-16 ***
## opponent_seededYes    -4.421e-01  1.702e-02 -25.979   <2e-16 ***
## ht_dif                 2.746e-03  1.075e-03   2.554   0.0106 *  
## age_dif               -1.326e-02  1.312e-03 -10.105   <2e-16 ***
## rank_dif              -3.070e-03  7.956e-05 -38.582   <2e-16 ***
## avg_ace               -9.149e-02  4.825e-03 -18.960   <2e-16 ***
## avg_df                -1.633e-02  9.456e-03  -1.727   0.0841 .  
## avg_firstWon           7.696e-02  3.393e-03  22.684   <2e-16 ***
## avg_secWon             9.122e-02  4.919e-03  18.547   <2e-16 ***
## avg_bpSaved           -5.811e-01  1.868e-02 -31.118   <2e-16 ***
## avg_opponent_ace       9.112e-02  4.808e-03  18.949   <2e-16 ***
## avg_opponent_df        1.699e-02  9.430e-03   1.801   0.0716 .  
## avg_opponent_firstWon -7.397e-02  3.374e-03 -21.922   <2e-16 ***
## avg_opponent_secWon   -9.384e-02  4.898e-03 -19.160   <2e-16 ***
## avg_opponent_bpSaved   5.838e-01  1.868e-02  31.253   <2e-16 ***
## surface_clay1          4.922e-02  5.265e-02   0.935   0.3499    
## surface_grass1         3.696e-02  5.611e-02   0.659   0.5100    
## surface_hard1          3.959e-02  5.209e-02   0.760   0.4473    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 115946  on 93553  degrees of freedom
## AIC: 115998
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: surface_hard (13.44)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              8.877252  1        2.979472
## tourney_level         10.200758  4        1.336839
## hand                   1.045722  1        1.022606
## opponent_hand          1.045412  1        1.022454
## seeded                 1.306411  1        1.142983
## opponent_seeded        1.307957  1        1.143659
## ht_dif                 2.111508  1        1.453103
## age_dif                1.039630  1        1.019623
## rank_dif               1.256823  1        1.121081
## avg_ace                4.618392  1        2.149045
## avg_df                 1.348226  1        1.161131
## avg_firstWon           2.364897  1        1.537822
## avg_secWon             1.468370  1        1.211763
## avg_bpSaved            2.100523  1        1.449318
## avg_opponent_ace       4.626291  1        2.150881
## avg_opponent_df        1.342632  1        1.158720
## avg_opponent_firstWon  2.375001  1        1.541104
## avg_opponent_secWon    1.461782  1        1.209042
## avg_opponent_bpSaved   2.104156  1        1.450571
## surface_clay          12.121841  1        3.481643
## surface_grass          6.073133  1        2.464373
## surface_hard          13.514212  1        3.676168
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt 
              -avg_opponent_svpt -surface_hard,
              data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn - 
##     avg_svpt - avg_opponent_svpt - surface_hard, family = binomial, 
##     data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -6.806e-02  1.924e-01  -0.354   0.7235    
## draw_size              1.900e-05  5.255e-04   0.036   0.9712    
## tourney_levelD         7.678e-03  3.776e-02   0.203   0.8389    
## tourney_levelF         5.875e-02  8.939e-02   0.657   0.5111    
## tourney_levelG         2.192e-03  5.258e-02   0.042   0.9667    
## tourney_levelM         2.313e-03  2.891e-02   0.080   0.9362    
## handR                 -5.179e-02  2.148e-02  -2.411   0.0159 *  
## opponent_handR         3.577e-02  2.144e-02   1.669   0.0952 .  
## seededYes              4.430e-01  1.701e-02  26.042   <2e-16 ***
## opponent_seededYes    -4.420e-01  1.702e-02 -25.974   <2e-16 ***
## ht_dif                 2.745e-03  1.075e-03   2.554   0.0107 *  
## age_dif               -1.326e-02  1.312e-03 -10.105   <2e-16 ***
## rank_dif              -3.070e-03  7.956e-05 -38.584   <2e-16 ***
## avg_ace               -9.151e-02  4.825e-03 -18.964   <2e-16 ***
## avg_df                -1.634e-02  9.456e-03  -1.728   0.0839 .  
## avg_firstWon           7.703e-02  3.392e-03  22.710   <2e-16 ***
## avg_secWon             9.123e-02  4.919e-03  18.548   <2e-16 ***
## avg_bpSaved           -5.810e-01  1.867e-02 -31.113   <2e-16 ***
## avg_opponent_ace       9.112e-02  4.808e-03  18.950   <2e-16 ***
## avg_opponent_df        1.695e-02  9.430e-03   1.797   0.0723 .  
## avg_opponent_firstWon -7.391e-02  3.373e-03 -21.911   <2e-16 ***
## avg_opponent_secWon   -9.385e-02  4.897e-03 -19.164   <2e-16 ***
## avg_opponent_bpSaved   5.840e-01  1.868e-02  31.262   <2e-16 ***
## surface_clay1          1.114e-02  1.619e-02   0.688   0.4912    
## surface_grass1        -1.497e-03  2.423e-02  -0.062   0.9507    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 115947  on 93554  degrees of freedom
## AIC: 115997
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: tourney_level (10.14)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              8.819879  1        2.969828
## tourney_level         10.153540  4        1.336064
## hand                   1.045694  1        1.022592
## opponent_hand          1.045384  1        1.022440
## seeded                 1.306367  1        1.142964
## opponent_seeded        1.307875  1        1.143624
## ht_dif                 2.111503  1        1.453101
## age_dif                1.039630  1        1.019622
## rank_dif               1.256807  1        1.121074
## avg_ace                4.618365  1        2.149038
## avg_df                 1.348247  1        1.161140
## avg_firstWon           2.363485  1        1.537363
## avg_secWon             1.468384  1        1.211769
## avg_bpSaved            2.100382  1        1.449269
## avg_opponent_ace       4.626193  1        2.150859
## avg_opponent_df        1.342570  1        1.158693
## avg_opponent_firstWon  2.373891  1        1.540744
## avg_opponent_secWon    1.461746  1        1.209027
## avg_opponent_bpSaved   2.103963  1        1.450504
## surface_clay           1.145435  1        1.070250
## surface_grass          1.132469  1        1.064175
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt 
              -avg_opponent_svpt -surface_hard -tourney_level,
              data = tennis_train, family = binomial)
summary(log.all)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn - 
##     avg_svpt - avg_opponent_svpt - surface_hard - tourney_level, 
##     family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -6.836e-02  1.917e-01  -0.357   0.7214    
## draw_size              1.525e-05  1.816e-04   0.084   0.9331    
## handR                 -5.164e-02  2.148e-02  -2.405   0.0162 *  
## opponent_handR         3.591e-02  2.144e-02   1.675   0.0939 .  
## seededYes              4.427e-01  1.683e-02  26.302   <2e-16 ***
## opponent_seededYes    -4.424e-01  1.683e-02 -26.289   <2e-16 ***
## ht_dif                 2.746e-03  1.075e-03   2.555   0.0106 *  
## age_dif               -1.326e-02  1.312e-03 -10.106   <2e-16 ***
## rank_dif              -3.070e-03  7.956e-05 -38.584   <2e-16 ***
## avg_ace               -9.169e-02  4.814e-03 -19.046   <2e-16 ***
## avg_df                -1.635e-02  9.456e-03  -1.729   0.0838 .  
## avg_firstWon           7.716e-02  3.383e-03  22.807   <2e-16 ***
## avg_secWon             9.133e-02  4.909e-03  18.604   <2e-16 ***
## avg_bpSaved           -5.820e-01  1.854e-02 -31.385   <2e-16 ***
## avg_opponent_ace       9.093e-02  4.796e-03  18.957   <2e-16 ***
## avg_opponent_df        1.695e-02  9.430e-03   1.797   0.0723 .  
## avg_opponent_firstWon -7.377e-02  3.363e-03 -21.936   <2e-16 ***
## avg_opponent_secWon   -9.375e-02  4.889e-03 -19.177   <2e-16 ***
## avg_opponent_bpSaved   5.830e-01  1.854e-02  31.437   <2e-16 ***
## surface_clay1          1.042e-02  1.606e-02   0.649   0.5164    
## surface_grass1        -2.384e-03  2.361e-02  -0.101   0.9195    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 115947  on 93558  degrees of freedom
## AIC: 115989
## 
## Number of Fisher Scoring iterations: 4
vif(log.all) # no multicollinearity :)
##             draw_size                  hand         opponent_hand 
##              1.054087              1.045211              1.044903 
##                seeded       opponent_seeded                ht_dif 
##              1.278557              1.278951              2.111483 
##               age_dif              rank_dif               avg_ace 
##              1.039621              1.256802              4.597178 
##                avg_df          avg_firstWon            avg_secWon 
##              1.348139              2.351309              1.462836 
##           avg_bpSaved      avg_opponent_ace       avg_opponent_df 
##              2.070417              4.603377              1.342438 
## avg_opponent_firstWon   avg_opponent_secWon  avg_opponent_bpSaved 
##              2.359852              1.456513              2.074092 
##          surface_clay         surface_grass 
##              1.127387              1.075109
predprob_log <- predict.glm(log.all, tennis_test, type = "response")
predclass_log = ifelse(predprob_log >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log), tennis_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7704 3904
##          1 4040 7747
##                                           
##                Accuracy : 0.6604          
##                  95% CI : (0.6543, 0.6665)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3209          
##                                           
##  Mcnemar's Test P-Value : 0.1299          
##                                           
##             Sensitivity : 0.6649          
##             Specificity : 0.6560          
##          Pos Pred Value : 0.6572          
##          Neg Pred Value : 0.6637          
##              Prevalence : 0.4980          
##          Detection Rate : 0.3311          
##    Detection Prevalence : 0.5038          
##       Balanced Accuracy : 0.6605          
##                                           
##        'Positive' Class : 1               
## 
# Accuracy    : 0.6596  
# Sensitivity : 0.6660         
# Specificity : 0.6529  

#Logistic Regression with Stepwise Selection
null_model = glm(outcome ~ 1, data = tennis_train, family = binomial) 
full_model = log.all

step.model.AIC = step(null_model, scope = list(upper = full_model),
                      direction = "both", test = "Chisq", trace = F) 
summary(step.model.AIC) 
## 
## Call:
## glm(formula = outcome ~ rank_dif + seeded + opponent_seeded + 
##     avg_bpSaved + avg_opponent_bpSaved + age_dif + avg_firstWon + 
##     avg_ace + avg_secWon + avg_opponent_firstWon + avg_opponent_ace + 
##     avg_opponent_secWon + ht_dif + hand + avg_df + avg_opponent_df + 
##     opponent_hand, family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -5.562e-02  1.907e-01  -0.292   0.7705    
## rank_dif              -3.070e-03  7.956e-05 -38.584   <2e-16 ***
## seededYes              4.430e-01  1.679e-02  26.384   <2e-16 ***
## opponent_seededYes    -4.420e-01  1.679e-02 -26.334   <2e-16 ***
## avg_bpSaved           -5.816e-01  1.853e-02 -31.392   <2e-16 ***
## avg_opponent_bpSaved   5.834e-01  1.853e-02  31.489   <2e-16 ***
## age_dif               -1.326e-02  1.312e-03 -10.106   <2e-16 ***
## avg_firstWon           7.712e-02  3.376e-03  22.842   <2e-16 ***
## avg_ace               -9.183e-02  4.807e-03 -19.104   <2e-16 ***
## avg_secWon             9.110e-02  4.890e-03  18.629   <2e-16 ***
## avg_opponent_firstWon -7.382e-02  3.356e-03 -21.994   <2e-16 ***
## avg_opponent_ace       9.080e-02  4.790e-03  18.958   <2e-16 ***
## avg_opponent_secWon   -9.398e-02  4.870e-03 -19.298   <2e-16 ***
## ht_dif                 2.747e-03  1.075e-03   2.556   0.0106 *  
## handR                 -5.163e-02  2.148e-02  -2.404   0.0162 *  
## avg_df                -1.640e-02  9.455e-03  -1.735   0.0828 .  
## avg_opponent_df        1.689e-02  9.428e-03   1.791   0.0733 .  
## opponent_handR         3.593e-02  2.143e-02   1.676   0.0937 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 115948  on 93561  degrees of freedom
## AIC: 115984
## 
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise 
log.sel <- glm(outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + opponent_seeded +
                 seeded + age_dif + avg_firstWon + avg_ace + avg_secWon + avg_opponent_firstWon +
                 avg_opponent_ace + avg_opponent_secWon + hand + ht_dif + avg_df,
               tennis_train, family = binomial)
summary(log.sel)
## 
## Call:
## glm(formula = outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + 
##     opponent_seeded + seeded + age_dif + avg_firstWon + avg_ace + 
##     avg_secWon + avg_opponent_firstWon + avg_opponent_ace + avg_opponent_secWon + 
##     hand + ht_dif + avg_df, family = binomial, data = tennis_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.576e-02  1.878e-01   0.084   0.9331    
## rank_dif              -3.075e-03  7.954e-05 -38.662   <2e-16 ***
## avg_opponent_bpSaved   5.919e-01  1.714e-02  34.536   <2e-16 ***
## avg_bpSaved           -5.815e-01  1.853e-02 -31.385   <2e-16 ***
## opponent_seededYes    -4.413e-01  1.678e-02 -26.292   <2e-16 ***
## seededYes              4.423e-01  1.679e-02  26.350   <2e-16 ***
## age_dif               -1.312e-02  1.311e-03 -10.010   <2e-16 ***
## avg_firstWon           7.722e-02  3.376e-03  22.871   <2e-16 ***
## avg_ace               -9.182e-02  4.807e-03 -19.102   <2e-16 ***
## avg_secWon             9.104e-02  4.890e-03  18.616   <2e-16 ***
## avg_opponent_firstWon -7.561e-02  3.259e-03 -23.198   <2e-16 ***
## avg_opponent_ace       9.402e-02  4.449e-03  21.130   <2e-16 ***
## avg_opponent_secWon   -9.311e-02  4.850e-03 -19.197   <2e-16 ***
## handR                 -5.154e-02  2.148e-02  -2.400   0.0164 *  
## ht_dif                 2.725e-03  1.075e-03   2.536   0.0112 *  
## avg_df                -1.619e-02  9.454e-03  -1.712   0.0868 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 115954  on 93563  degrees of freedom
## AIC: 115986
## 
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2 <- predict(log.sel, newdata = tennis_test, type = "response")
logistic_pred_class2 <- ifelse(logistic_pred2 > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2), tennis_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7693 3908
##          1 4051 7743
##                                           
##                Accuracy : 0.6598          
##                  95% CI : (0.6537, 0.6659)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3196          
##                                           
##  Mcnemar's Test P-Value : 0.1115          
##                                           
##             Sensitivity : 0.6646          
##             Specificity : 0.6551          
##          Pos Pred Value : 0.6565          
##          Neg Pred Value : 0.6631          
##              Prevalence : 0.4980          
##          Detection Rate : 0.3310          
##    Detection Prevalence : 0.5041          
##       Balanced Accuracy : 0.6598          
##                                           
##        'Positive' Class : 1               
## 
# Accuracy :    0.6592     
# Sensitivity : 0.6659        
# Specificity : 0.6525 


#LDA
lda_model = lda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded + 
                  ht_dif + age_dif + rank_dif + avg_ace + avg_df +  avg_firstWon + avg_secWon + 
                  avg_bpSaved + avg_opponent_ace + avg_opponent_df + avg_opponent_firstWon + 
                  avg_opponent_secWon + avg_opponent_bpSaved +  surface_clay + surface_grass,
                data = tennis_train)
lda_model
## Call:
## lda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded + 
##     ht_dif + age_dif + rank_dif + avg_ace + avg_df + avg_firstWon + 
##     avg_secWon + avg_bpSaved + avg_opponent_ace + avg_opponent_df + 
##     avg_opponent_firstWon + avg_opponent_secWon + avg_opponent_bpSaved + 
##     surface_clay + surface_grass, data = tennis_train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4995031 0.5004969 
## 
## Group means:
##   draw_size     handR opponent_handR seededYes opponent_seededYes     ht_dif
## 0  58.01326 0.8678947      0.8751685 0.2458336          0.4428898 -0.6534668
## 1  58.02882 0.8747118      0.8661500 0.4432701          0.2461141  0.6196943
##      age_dif  rank_dif  avg_ace   avg_df avg_firstWon avg_secWon avg_bpSaved
## 0  0.1491197  33.49761 5.876086 2.986887     34.32419   15.60362    4.176661
## 1 -0.1595653 -33.67920 6.277315 2.878460     34.98529   15.73356    3.998110
##   avg_opponent_ace avg_opponent_df avg_opponent_firstWon avg_opponent_secWon
## 0         6.292537        2.880003              34.99085            15.74220
## 1         5.887530        2.986144              34.34224            15.59955
##   avg_opponent_bpSaved surface_clay1 surface_grass1
## 0             3.996073     0.3170742      0.1078450
## 1             4.174689     0.3190067      0.1080152
## 
## Coefficients of linear discriminants:
##                                 LD1
## draw_size              8.322884e-06
## handR                 -6.490700e-02
## opponent_handR         4.344118e-02
## seededYes              6.875411e-01
## opponent_seededYes    -6.845001e-01
## ht_dif                 3.650945e-03
## age_dif               -1.694301e-02
## rank_dif              -2.935947e-03
## avg_ace               -1.023734e-01
## avg_df                -3.275343e-02
## avg_firstWon           9.014533e-02
## avg_secWon             1.049756e-01
## avg_bpSaved           -6.677011e-01
## avg_opponent_ace       1.029193e-01
## avg_opponent_df        3.326777e-02
## avg_opponent_firstWon -8.726589e-02
## avg_opponent_secWon   -1.094587e-01
## avg_opponent_bpSaved   6.735015e-01
## surface_clay1          1.244285e-02
## surface_grass1        -2.389925e-03
predictions.lda = predict(lda_model, tennis_test)
caret::confusionMatrix(as.factor(predictions.lda$class), tennis_test$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7686 3934
##          1 4058 7717
##                                           
##                Accuracy : 0.6584          
##                  95% CI : (0.6523, 0.6645)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3168          
##                                           
##  Mcnemar's Test P-Value : 0.1689          
##                                           
##             Sensitivity : 0.6545          
##             Specificity : 0.6623          
##          Pos Pred Value : 0.6614          
##          Neg Pred Value : 0.6554          
##              Prevalence : 0.5020          
##          Detection Rate : 0.3285          
##    Detection Prevalence : 0.4967          
##       Balanced Accuracy : 0.6584          
##                                           
##        'Positive' Class : 0               
## 
# Accuracy    : 0.6584 
# Sensitivity : 0.6521       
# Specificity : 0.6647 

#QDA
qda_model = qda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded + ht_dif + 
                  age_dif + rank_dif + avg_ace + avg_df +  avg_firstWon + avg_secWon + avg_bpSaved + 
                  avg_opponent_ace + avg_opponent_df + avg_opponent_firstWon + avg_opponent_secWon + 
                  avg_opponent_bpSaved +  surface_clay + surface_grass, data = tennis_train)
qda_model
## Call:
## qda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded + 
##     ht_dif + age_dif + rank_dif + avg_ace + avg_df + avg_firstWon + 
##     avg_secWon + avg_bpSaved + avg_opponent_ace + avg_opponent_df + 
##     avg_opponent_firstWon + avg_opponent_secWon + avg_opponent_bpSaved + 
##     surface_clay + surface_grass, data = tennis_train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4995031 0.5004969 
## 
## Group means:
##   draw_size     handR opponent_handR seededYes opponent_seededYes     ht_dif
## 0  58.01326 0.8678947      0.8751685 0.2458336          0.4428898 -0.6534668
## 1  58.02882 0.8747118      0.8661500 0.4432701          0.2461141  0.6196943
##      age_dif  rank_dif  avg_ace   avg_df avg_firstWon avg_secWon avg_bpSaved
## 0  0.1491197  33.49761 5.876086 2.986887     34.32419   15.60362    4.176661
## 1 -0.1595653 -33.67920 6.277315 2.878460     34.98529   15.73356    3.998110
##   avg_opponent_ace avg_opponent_df avg_opponent_firstWon avg_opponent_secWon
## 0         6.292537        2.880003              34.99085            15.74220
## 1         5.887530        2.986144              34.34224            15.59955
##   avg_opponent_bpSaved surface_clay1 surface_grass1
## 0             3.996073     0.3170742      0.1078450
## 1             4.174689     0.3190067      0.1080152
predictions.qda = predict(qda_model, tennis_test)
caret::confusionMatrix(as.factor(predictions.qda$class), tennis_test$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7618 4057
##          1 4126 7594
##                                           
##                Accuracy : 0.6502          
##                  95% CI : (0.6441, 0.6563)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3005          
##                                           
##  Mcnemar's Test P-Value : 0.4522          
##                                           
##             Sensitivity : 0.6487          
##             Specificity : 0.6518          
##          Pos Pred Value : 0.6525          
##          Neg Pred Value : 0.6480          
##              Prevalence : 0.5020          
##          Detection Rate : 0.3256          
##    Detection Prevalence : 0.4990          
##       Balanced Accuracy : 0.6502          
##                                           
##        'Positive' Class : 0               
## 
# Accuracy    : 0.6511
# Sensitivity : 0.6533       
# Specificity : 0.6489 

#Random Forest
set.seed(29)
rf <- randomForest(outcome ~ ., data = tennis_train, importance = TRUE)
rf
## 
## Call:
##  randomForest(formula = outcome ~ ., data = tennis_train, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 34.79%
## Confusion matrix:
##       0     1 class.error
## 0 30218 16525   0.3535289
## 1 16034 30802   0.3423435
rf.preds = predict(rf, tennis_test,type="class")
caret::confusionMatrix(as.factor(rf.preds), tennis_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7552 4047
##          1 4192 7604
##                                          
##                Accuracy : 0.6478         
##                  95% CI : (0.6417, 0.654)
##     No Information Rate : 0.502          
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.2957         
##                                          
##  Mcnemar's Test P-Value : 0.1126         
##                                          
##             Sensitivity : 0.6526         
##             Specificity : 0.6431         
##          Pos Pred Value : 0.6446         
##          Neg Pred Value : 0.6511         
##              Prevalence : 0.4980         
##          Detection Rate : 0.3250         
##    Detection Prevalence : 0.5042         
##       Balanced Accuracy : 0.6478         
##                                          
##        'Positive' Class : 1              
## 
# Accuracy    : 0.6503
# Sensitivity : 0.6482       
# Specificity : 0.6525


data <- data.frame(model = c("log.all", "log.sel", "LDA", "QDA", "RF"),
                   acc = c(0.6596, 0.6592, 0.6584, 0.6511, 0.6503))

custom_colors <- c("log.all" = "#5B9F9A", "log.sel" = "#7DAFCA", "LDA" = "#E9909D", 
                   "QDA" = "#AABAE4", "RF" = "#D2C3EE")

ggplot(data, aes(x = model, y = acc, fill = model)) +
  geom_bar(stat = "identity", width = 0.5, fill = custom_colors) +
  geom_text(aes(label = acc), vjust = -0.5, size = 3) +
  labs(title = "Test Accuracy Comparison of Models", x = "", y = "Accuracy") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1),
        legend.position = "none")




####
#### Do player age and height influence winning a match?
####
tennis_df2 = match_outcomes
tennis_df2 = dplyr::select(tennis_df2, -seed, -entry, -opponent_seed, -opponent_entry,
                          -tourney_id,-tourney_name,-match_num,-name,-opponent_name,-score)
tennis_df2$hand = ifelse(tennis_df2$hand == 'U',NA, tennis_df2$hand)
tennis_df2$opponent_hand = ifelse(tennis_df2$opponent_hand == 'U',NA, tennis_df2$opponent_hand)
# Data Type Changes
tennis_df2$tourney_date = as.Date(as.character(tennis_df2$tourney_date),format = "%Y%m%d")
tennis_df2$surface = as.factor(tennis_df2$surface)
tennis_df2$tourney_level = as.factor(tennis_df2$tourney_level)
tennis_df2$hand = as.factor(tennis_df2$hand)
tennis_df2$opponent_hand = as.factor(tennis_df2$opponent_hand)
tennis_df2$seeded = as.factor(tennis_df2$seeded)
tennis_df2$opponent_seeded = as.factor(tennis_df2$opponent_seeded)
tennis_df2$outcome = as.factor(tennis_df2$outcome)
str(tennis_df2)
## gropd_df [123,864 × 66] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ surface              : Factor w/ 5 levels "","Carpet","Clay",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ draw_size            : int [1:123864] 48 48 48 48 48 48 48 48 48 48 ...
##  $ tourney_level        : Factor w/ 5 levels "A","D","F","G",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ tourney_date         : Date[1:123864], format: "2003-10-13" "2003-10-13" ...
##  $ id                   : int [1:123864] 101965 102358 102998 102610 102374 103888 103852 103292 103970 102434 ...
##  $ hand                 : Factor w/ 4 levels "","A","L","R": 4 4 4 4 4 4 3 4 4 4 ...
##  $ ht                   : int [1:123864] 185 190 190 180 180 188 188 175 175 183 ...
##  $ ioc                  : chr [1:123864] "RSA" "SWE" "USA" "ESP" ...
##  $ age                  : num [1:123864] 32 29.5 26.3 28.3 29.5 21.8 22 24.8 21.5 29.2 ...
##  $ opponent_id          : int [1:123864] 103344 102338 103786 103602 104745 102450 103151 103813 104022 103294 ...
##  $ opponent_hand        : Factor w/ 4 levels "","A","L","R": 4 4 4 4 3 4 4 3 4 4 ...
##  $ opponent_ht          : int [1:123864] 193 190 178 183 185 185 183 185 183 170 ...
##  $ opponent_ioc         : chr [1:123864] "CRO" "RUS" "RUS" "CHI" ...
##  $ opponent_age         : num [1:123864] 24.5 29.6 22.3 23.2 17.3 29.1 25.6 22.2 21.3 24.8 ...
##  $ best_of              : int [1:123864] 3 3 3 3 3 3 3 3 3 3 ...
##  $ round                : chr [1:123864] "R64" "R64" "R64" "R64" ...
##  $ minutes              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ ace                  : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ df                   : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ svpt                 : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ firstIn              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ firstWon             : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ secWon               : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ SvGms                : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ bpSaved              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ bpFaced              : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_ace         : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_df          : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_svpt        : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_firstIn     : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_firstWon    : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_secWon      : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_SvGms       : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_bpSaved     : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ opponent_bpFaced     : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
##  $ rank                 : int [1:123864] 28 146 57 23 127 25 35 37 72 33 ...
##  $ rank_points          : int [1:123864] 1090 258 660 1170 290 1145 1025 1000 480 1040 ...
##  $ opponent_rank        : int [1:123864] 42 40 43 22 49 30 27 31 29 87 ...
##  $ opponent_rank_points : int [1:123864] 865 950 855 1190 788 1055 1133 1050 1060 421 ...
##  $ seeded               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ opponent_seeded      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ outcome              : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ht_dif               : int [1:123864] -8 0 12 -3 -5 3 5 -10 -8 13 ...
##  $ age_dif              : num [1:123864] 7.5 -0.1 4 5.1 12.2 ...
##  $ rank_dif             : int [1:123864] -14 106 14 1 78 -5 8 6 43 -54 ...
##  $ avg_ace              : num [1:123864] 7.04 8.12 11.1 4.61 3.62 ...
##  $ avg_df               : num [1:123864] 2.54 5.34 3.36 2.35 3.11 ...
##  $ avg_svpt             : num [1:123864] 80.9 85.7 78 84.2 81.2 ...
##  $ avg_firstIn          : num [1:123864] 46.5 46.5 44.7 52.6 46.5 ...
##  $ avg_firstWon         : num [1:123864] 33.6 34.6 34 35.5 31.5 ...
##  $ avg_secWon           : num [1:123864] 18 18.9 15.6 16 17.4 ...
##  $ avg_SvGms            : num [1:123864] 12.5 13.2 12 12.8 12.5 ...
##  $ avg_bpSaved          : num [1:123864] 3.97 4.4 4.1 4.84 4.71 ...
##  $ avg_bpFaced          : num [1:123864] 6.38 7.43 6.61 8.03 8.19 ...
##  $ avg_opponent_ace     : num [1:123864] 12.28 3.98 3.23 6.89 3.19 ...
##  $ avg_opponent_df      : num [1:123864] 2.28 3.25 2.59 3.55 1.71 ...
##  $ avg_opponent_svpt    : num [1:123864] 80.5 79.1 74 81.5 73.6 ...
##  $ avg_opponent_firstIn : num [1:123864] 47.9 46.6 49.9 50 50 ...
##  $ avg_opponent_firstWon: num [1:123864] 37.2 32.4 34.2 37.2 36.1 ...
##  $ avg_opponent_secWon  : num [1:123864] 17.1 16.1 12.5 16.4 13.5 ...
##  $ avg_opponent_SvGms   : num [1:123864] 12.9 12.5 11.7 13 12.1 ...
##  $ avg_opponent_bpSaved : num [1:123864] 3.35 3.73 3.93 3.77 3.37 ...
##  $ avg_opponent_bpFaced : num [1:123864] 5.12 6.92 6.47 5.92 5.1 ...
##  $ surface_clay         : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
##  $ surface_grass        : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
##  $ surface_hard         : num [1:123864] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "groups")= tibble [2,261 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ opponent_id: int [1:2261] 100644 101316 101404 101532 101662 101723 101736 101746 101750 101774 ...
##   ..$ .rows      : list<int> [1:2261] 
##   .. ..$ : int [1:573] 33442 35678 36325 36416 36473 36511 37213 37718 37802 37918 ...
##   .. ..$ : int 6453
##   .. ..$ : int [1:3] 25319 27592 28329
##   .. ..$ : int [1:6] 325 882 2591 2794 62750 62782
##   .. ..$ : int [1:2] 12952 12954
##   .. ..$ : int [1:3] 1861 2534 63780
##   .. ..$ : int [1:176] 225 1164 1271 1377 1610 2183 2337 2469 2661 2665 ...
##   .. ..$ : int [1:5] 142 403 983 2619 64509
##   .. ..$ : int 648
##   .. ..$ : int [1:54] 204 790 907 931 1240 1438 1495 2133 2320 2459 ...
##   .. ..$ : int 3110
##   .. ..$ : int 19117
##   .. ..$ : int [1:2] 2951 3118
##   .. ..$ : int [1:2] 1831 63750
##   .. ..$ : int [1:2] 2972 64902
##   .. ..$ : int [1:27] 218 491 776 1651 1823 1864 2067 2260 2540 3268 ...
##   .. ..$ : int [1:5] 2980 3126 3129 9631 64915
##   .. ..$ : int [1:49] 48 212 534 2094 2252 2566 2855 3491 3599 3618 ...
##   .. ..$ : int 3231
##   .. ..$ : int 4723
##   .. ..$ : int [1:134] 162 180 242 358 508 821 941 976 1038 1118 ...
##   .. ..$ : int [1:4] 2897 3031 68401 71378
##   .. ..$ : int [1:3] 6423 6463 68397
##   .. ..$ : int [1:14] 743 852 1952 2000 2050 2589 2727 2799 62661 62752 ...
##   .. ..$ : int [1:7] 259 564 3579 3951 4432 5515 67414
##   .. ..$ : int [1:4] 1804 6346 63727 64912
##   .. ..$ : int 4270
##   .. ..$ : int 2922
##   .. ..$ : int [1:108] 46 300 384 434 662 696 755 909 913 1011 ...
##   .. ..$ : int [1:17] 194 935 1884 3372 3409 4069 4238 4275 4289 4436 ...
##   .. ..$ : int [1:78] 17 613 616 693 759 798 894 943 1085 1158 ...
##   .. ..$ : int [1:2] 3124 65054
##   .. ..$ : int [1:2] 3074 81153
##   .. ..$ : int [1:21] 134 219 595 644 827 928 994 1031 1689 2607 ...
##   .. ..$ : int 6449
##   .. ..$ : int [1:12] 802 881 929 1279 1386 1467 1478 1732 2085 2352 ...
##   .. ..$ : int [1:3] 59 189 2223
##   .. ..$ : int [1:6] 6365 12992 16143 19247 22372 74926
##   .. ..$ : int [1:235] 27 313 580 721 860 992 1032 1068 1588 1604 ...
##   .. ..$ : int [1:6] 3119 6393 6456 65053 68324 68386
##   .. ..$ : int [1:3] 3063 6241 65105
##   .. ..$ : int [1:3] 2923 68316 68317
##   .. ..$ : int 2948
##   .. ..$ : int [1:3] 3052 64866 64869
##   .. ..$ : int [1:4] 1812 6341 64919 65059
##   .. ..$ : int [1:171] 57 91 129 203 568 668 738 804 890 915 ...
##   .. ..$ : int [1:18] 144 1569 2943 3163 3170 3643 4800 9636 65094 65559 ...
##   .. ..$ : int [1:7] 1362 4569 9501 11229 63271 64975 65122
##   .. ..$ : int [1:2] 64863 64980
##   .. ..$ : int [1:20] 75 270 1319 1452 3266 3539 3578 3743 6575 6837 ...
##   .. ..$ : int [1:267] 51 487 676 695 782 927 999 1045 1107 1169 ...
##   .. ..$ : int [1:2] 1463 63380
##   .. ..$ : int [1:108] 93 137 221 246 351 409 643 655 739 867 ...
##   .. ..$ : int [1:6] 1632 1760 1976 2582 2912 5178
##   .. ..$ : int [1:153] 66 96 112 143 543 794 836 1020 1291 1471 ...
##   .. ..$ : int 121
##   .. ..$ : int 1764
##   .. ..$ : int [1:112] 116 238 385 426 562 789 840 921 1039 1144 ...
##   .. ..$ : int [1:42] 1909 2120 2508 3683 3959 4917 5055 5618 5709 6009 ...
##   .. ..$ : int [1:14] 2859 6320 9417 19204 64859 64861 64979 64982 68242 68244 ...
##   .. ..$ : int 366
##   .. ..$ : int [1:2] 1014 2037
##   .. ..$ : int [1:10] 1025 1179 1765 1784 2024 2032 2580 62943 63939 63950
##   .. ..$ : int 64913
##   .. ..$ : int [1:2] 19244 22314
##   .. ..$ : int [1:138] 217 1266 1303 1364 1425 2061 2289 2358 3202 3204 ...
##   .. ..$ : int [1:12] 2065 2558 2861 3481 4894 5166 5627 5812 67084 67093 ...
##   .. ..$ : int [1:5] 3165 9551 12956 13014 64998
##   .. ..$ : int [1:29] 285 511 2382 2845 3476 4055 4140 4483 4519 5167 ...
##   .. ..$ : int [1:81] 206 312 328 666 718 753 783 1013 1432 1517 ...
##   .. ..$ : int 10310
##   .. ..$ : int [1:14] 1363 1482 1647 1879 3064 3065 6255 63274 63569 65106 ...
##   .. ..$ : int [1:83] 310 332 372 457 727 1074 1194 1535 1699 1835 ...
##   .. ..$ : int [1:26] 532 2594 2929 3741 6306 6470 6473 7017 9590 9593 ...
##   .. ..$ : int [1:16] 156 2864 3280 3761 4938 5692 6641 8181 14987 15932 ...
##   .. ..$ : int [1:204] 452 675 724 758 864 919 990 1601 2475 2577 ...
##   .. ..$ : int [1:2] 3076 64891
##   .. ..$ : int [1:55] 2 170 768 887 945 1002 1037 1079 1224 1265 ...
##   .. ..$ : int [1:79] 29 118 556 683 824 1060 1111 1177 1380 1405 ...
##   .. ..$ : int [1:94] 18 175 591 620 657 773 896 937 991 1030 ...
##   .. ..$ : int [1:2] 3005 6339
##   .. ..$ : int [1:20] 354 398 1097 1761 2119 2268 2384 3961 5020 5290 ...
##   .. ..$ : int [1:75] 21 244 376 435 458 765 853 946 979 1042 ...
##   .. ..$ : int [1:46] 50 387 513 1528 2205 2388 3598 5716 10165 13097 ...
##   .. ..$ : int [1:19] 178 280 1659 1825 2612 2863 3349 3775 4384 5118 ...
##   .. ..$ : int [1:2] 15949 26231
##   .. ..$ : int [1:33] 182 278 547 597 617 1132 1256 1317 1335 1407 ...
##   .. ..$ : int 19203
##   .. ..$ : int [1:317] 37 102 577 633 688 816 857 973 1098 1120 ...
##   .. ..$ : int [1:28] 150 172 309 322 599 645 822 982 1021 1114 ...
##   .. ..$ : int [1:225] 6 160 224 501 583 855 955 984 1155 1167 ...
##   .. ..$ : int [1:114] 33 107 422 478 699 760 875 936 1070 1134 ...
##   .. ..$ : int [1:2] 1234 2896
##   .. ..$ : int 6215
##   .. ..$ : int [1:6] 2109 2194 2262 2598 5753 64467
##   .. ..$ : int [1:2] 3114 65041
##   .. ..$ : int 3582
##   .. ..$ : int [1:11] 368 423 1522 1791 2035 2216 62284 62339 64129 64140 ...
##   .. ..$ : int [1:79] 151 240 375 438 462 498 665 1867 2096 2246 ...
##   .. .. [list output truncated]
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# remove rows containing missing values
tennis_df2 = na.omit(tennis_df2) #123,864 to 107,536

# log for just these two vars
age_ht_log = glm(outcome ~ age + ht,tennis_df2, family = binomial)
summary(age_ht_log)
## 
## Call:
## glm(formula = outcome ~ age + ht, family = binomial, data = tennis_df2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.1201176  0.1740671 -12.180  < 2e-16 ***
## age         -0.0098096  0.0015244  -6.435 1.24e-10 ***
## ht           0.0127946  0.0008929  14.330  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 150094  on 108269  degrees of freedom
## Residual deviance: 149832  on 108267  degrees of freedom
## AIC: 149838
## 
## Number of Fisher Scoring iterations: 3
vif(age_ht_log)
##      age       ht 
## 1.005258 1.005258
#age and height are significant

log.dif <- glm(outcome ~ age_dif + ht_dif, tennis_train, family = binomial)
summary(log.dif)
## 
## Call:
## glm(formula = outcome ~ age_dif + ht_dif, family = binomial, 
##     data = tennis_train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.002176   0.006554   0.332     0.74    
## age_dif     -0.007706   0.001200  -6.422 1.34e-10 ***
## ht_dif       0.013568   0.000694  19.550  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129728  on 93578  degrees of freedom
## Residual deviance: 129269  on 93576  degrees of freedom
## AIC: 129275
## 
## Number of Fisher Scoring iterations: 3
####
#### Do top players face fewer break points?
####
#let's consider top players as ranked in the top 25 in the world
tennis_df3 = select(ungroup(tennis_df2),id,surface,rank,bpFaced,ioc)
str(tennis_df3)
## tibble [108,270 × 5] (S3: tbl_df/tbl/data.frame)
##  $ id     : int [1:108270] 103507 104166 102880 102571 103387 102202 104339 103344 103813 102539 ...
##  $ surface: Factor w/ 5 levels "","Carpet","Clay",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ rank   : int [1:108270] 1 116 115 85 11 68 71 45 37 151 ...
##  $ bpFaced: int [1:108270] 7 3 8 13 7 5 3 4 4 0 ...
##  $ ioc    : chr [1:108270] "ESP" "RUS" "GER" "FRA" ...
tennis_df3$top_player <- ifelse(tennis_df3$rank <= 25, "Yes", "No")
tennis_df3$top_player <- as.factor(tennis_df3$top_player)


# Check for Normality
qqnorm(tennis_df3$bpFaced)
qqline(tennis_df3$bpFaced)
# the qq plot does not look normal

# Because not normal, will use Wilcoxon-Mann Whitney test
wilcox.test(bpFaced ~ top_player, data = tennis_df3,int = TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  bpFaced by top_player
## W = 1344811274, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#p-value is 2.2e-16, less than 0.05, therefore significant
# There is a statistically significant difference between top player vs other players
#in terms of how many break points they face.

# median break points faced for top players vs other players
median(tennis_df3$bpFaced[tennis_df3$top_player == "Yes"])
## [1] 5
median(tennis_df3$bpFaced[tennis_df3$top_player == "No"])
## [1] 7
#median is 5 for top layers, 7 for others
#top players face fewer break points


####
#### Which countries produce top tennis players?
####
str(tennis_df3)
## tibble [108,270 × 6] (S3: tbl_df/tbl/data.frame)
##  $ id        : int [1:108270] 103507 104166 102880 102571 103387 102202 104339 103344 103813 102539 ...
##  $ surface   : Factor w/ 5 levels "","Carpet","Clay",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ rank      : int [1:108270] 1 116 115 85 11 68 71 45 37 151 ...
##  $ bpFaced   : int [1:108270] 7 3 8 13 7 5 3 4 4 0 ...
##  $ ioc       : chr [1:108270] "ESP" "RUS" "GER" "FRA" ...
##  $ top_player: Factor w/ 2 levels "No","Yes": 2 1 1 1 2 1 1 1 1 1 ...
tennis_df4 = select(tennis_df3, -surface, -rank, -bpFaced)
tennis_df4 = distinct(tennis_df4)
#chi square test 
chisq.test(table(tennis_df4$top_player, tennis_df4$ioc))
## Warning in chisq.test(table(tennis_df4$top_player, tennis_df4$ioc)):
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(tennis_df4$top_player, tennis_df4$ioc)
## X-squared = 56.335, df = 79, p-value = 0.9748
#p-value p-value = 0.9768

player_country = as.data.frame.matrix(table(tennis_df4$ioc,tennis_df4$top_player))
print(player_country)
##      No Yes
## ALG   1   0
## ARG  60  15
## ARM   1   0
## AUS  58   5
## AUT  29   2
## BAH   1   0
## BAR   1   0
## BEL  18   3
## BIH   6   1
## BLR   8   1
## BOL   2   0
## BRA  29   2
## BUL   6   1
## CAN  19   3
## CHI  15   4
## CHN  17   0
## COL   9   0
## CRC   1   0
## CRO  25   5
## CYP   3   1
## CZE  35   3
## DEN   7   1
## DOM   4   0
## ECU   6   0
## EGY   2   0
## ESA   2   0
## ESP  68  19
## EST   3   0
## FIN   7   1
## FRA  90  14
## GBR  36   5
## GEO   4   1
## GER  65   7
## GRE   5   1
## GUA   1   0
## HUN   8   0
## IND  17   0
## IRL   3   0
## ISR   9   0
## ITA  63   8
## JOR   1   0
## JPN  21   1
## KAZ  11   1
## KOR   9   1
## LAT   2   1
## LBN   1   0
## LTU   5   0
## LUX   2   1
## MAR   9   1
## MDA   3   0
## MEX  13   0
## MON   4   0
## NED  23   4
## NOR   3   1
## NZL   8   0
## PAK   1   0
## PAR   1   0
## PER   6   0
## PHI   2   0
## POL  12   2
## POR  11   0
## ROU  15   1
## RSA  11   2
## RUS  34  10
## SLO   5   0
## SRB  19   4
## SUI  22   2
## SVK  16   2
## SWE  27   4
## THA   6   1
## TOG   1   0
## TPE   8   0
## TUN   3   0
## TUR   4   0
## UKR  10   1
## URU   3   1
## USA 118  17
## UZB   4   0
## VEN   1   0
## ZIM   3   0
player_country %>% filter(Yes >= 10)
##      No Yes
## ARG  60  15
## ESP  68  19
## FRA  90  14
## RUS  34  10
## USA 118  17
# Spain, USA, Argentina, France, Russia

####
#### Do the variables that impact whether a tennis player wins a match depend on the surface type?
####

# Subset based on surface type
clay_subset <- tennis_df[tennis_df$surface_clay == 1, ]
grass_subset <- tennis_df[tennis_df$surface_grass == 1, ]
hard_subset <- tennis_df[tennis_df$surface_hard == 1, ]

clay_subset = select(clay_subset,-surface_clay,-surface_grass,-surface_hard)
grass_subset = select(grass_subset,-surface_clay,-surface_grass,-surface_hard)
hard_subset = select(hard_subset,-surface_clay,-surface_grass,-surface_hard)

### Splitting into train and test
set.seed(12)
#clay
index = sample(nrow(clay_subset), 0.8*nrow(clay_subset), replace = F) # 80/20 split
clay_train = clay_subset[index,]
clay_test = clay_subset[-index,]
#grass
index = sample(nrow(grass_subset), 0.8*nrow(grass_subset), replace = F) # 80/20 split
grass_train = grass_subset[index,]
grass_test = grass_subset[-index,]
#hard
index = sample(nrow(hard_subset), 0.8*nrow(hard_subset), replace = F) # 80/20 split
hard_train = hard_subset[index,]
hard_test = hard_subset[-index,]
#checking for balance
table(clay_train$outcome)
## 
##     0     1 
## 14909 14915
table(clay_test$outcome)
## 
##    0    1 
## 3731 3725
table(grass_train$outcome)
## 
##    0    1 
## 5051 5000
table(grass_test$outcome)
## 
##    0    1 
## 1231 1282
table(hard_train$outcome)
## 
##     0     1 
## 25932 25948
table(hard_test$outcome)
## 
##    0    1 
## 6493 6477
#Logistic Regression: CLAY
log.clay = glm(outcome ~ .,data = clay_train, family = binomial)
summary(log.clay)
## 
## Call:
## glm(formula = outcome ~ ., family = binomial, data = clay_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.3258311  0.3671204  -0.888 0.374792    
## draw_size             -0.0002169  0.0013062  -0.166 0.868104    
## tourney_levelD         0.0100330  0.0706793   0.142 0.887118    
## tourney_levelG         0.0253023  0.1276140   0.198 0.842832    
## tourney_levelM         0.0172425  0.0515901   0.334 0.738213    
## handR                  0.0843234  0.0388875   2.168 0.030129 *  
## opponent_handR        -0.0477343  0.0388304  -1.229 0.218958    
## seededYes              0.3718472  0.0311488  11.938  < 2e-16 ***
## opponent_seededYes    -0.3514566  0.0310688 -11.312  < 2e-16 ***
## ht_dif                 0.0059231  0.0019369   3.058 0.002228 ** 
## age_dif               -0.0215519  0.0024309  -8.866  < 2e-16 ***
## rank_dif              -0.0031208  0.0001456 -21.433  < 2e-16 ***
## avg_ace               -0.1803798  0.0118919 -15.168  < 2e-16 ***
## avg_df                 0.0709136  0.0204436   3.469 0.000523 ***
## avg_svpt              -0.1949795  0.0177214 -11.002  < 2e-16 ***
## avg_firstIn           -0.0036370  0.0230651  -0.158 0.874707    
## avg_firstWon           0.2987039  0.0217582  13.728  < 2e-16 ***
## avg_secWon             0.2878787  0.0303439   9.487  < 2e-16 ***
## avg_bpSaved            0.1429848  0.0496479   2.880 0.003977 ** 
## avg_opponent_ace       0.1781188  0.0118679  15.008  < 2e-16 ***
## avg_opponent_df       -0.0824849  0.0204559  -4.032 5.52e-05 ***
## avg_opponent_svpt      0.2097942  0.0176860  11.862  < 2e-16 ***
## avg_opponent_firstIn  -0.0057736  0.0229944  -0.251 0.801747    
## avg_opponent_firstWon -0.2960938  0.0217586 -13.608  < 2e-16 ***
## avg_opponent_secWon   -0.3117384  0.0302114 -10.319  < 2e-16 ***
## avg_opponent_bpSaved  -0.1662104  0.0490544  -3.388 0.000703 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41345  on 29823  degrees of freedom
## Residual deviance: 36628  on 29798  degrees of freedom
## AIC: 36680
## 
## Number of Fisher Scoring iterations: 4
vif(log.clay) # multicollinearity in the model: first removing avg_opponent_firstIn & avg_firstIn
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size             13.350640  1        3.653853
## tourney_level         15.472014  3        1.578548
## hand                   1.124467  1        1.060409
## opponent_hand          1.121790  1        1.059146
## seeded                 1.348204  1        1.161122
## opponent_seeded        1.347216  1        1.160697
## ht_dif                 1.988186  1        1.410031
## age_dif                1.047534  1        1.023491
## rank_dif               1.295764  1        1.138316
## avg_ace                7.146519  1        2.673297
## avg_df                 1.787240  1        1.336877
## avg_svpt              32.699202  1        5.718322
## avg_firstIn           45.385369  1        6.736866
## avg_firstWon          29.058929  1        5.390633
## avg_secWon            19.300416  1        4.393224
## avg_bpSaved            4.838756  1        2.199717
## avg_opponent_ace       7.052528  1        2.655660
## avg_opponent_df        1.791631  1        1.338518
## avg_opponent_svpt     32.765316  1        5.724100
## avg_opponent_firstIn  45.358971  1        6.734907
## avg_opponent_firstWon 29.119418  1        5.396241
## avg_opponent_secWon   19.218661  1        4.383909
## avg_opponent_bpSaved   4.813423  1        2.193951
log.clay = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn,data = clay_train, family = binomial)
summary(log.clay)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn, 
##     family = binomial, data = clay_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.3265690  0.3671225  -0.890 0.373715    
## draw_size             -0.0002057  0.0013057  -0.158 0.874828    
## tourney_levelD         0.0101175  0.0706786   0.143 0.886173    
## tourney_levelG         0.0243706  0.1275740   0.191 0.848501    
## tourney_levelM         0.0169567  0.0515808   0.329 0.742352    
## handR                  0.0852868  0.0384129   2.220 0.026401 *  
## opponent_handR        -0.0462005  0.0383470  -1.205 0.228281    
## seededYes              0.3716849  0.0311330  11.939  < 2e-16 ***
## opponent_seededYes    -0.3517009  0.0310549 -11.325  < 2e-16 ***
## ht_dif                 0.0058998  0.0018942   3.115 0.001841 ** 
## age_dif               -0.0215559  0.0024304  -8.869  < 2e-16 ***
## rank_dif              -0.0031206  0.0001456 -21.434  < 2e-16 ***
## avg_ace               -0.1791480  0.0094157 -19.027  < 2e-16 ***
## avg_df                 0.0724162  0.0182218   3.974 7.06e-05 ***
## avg_svpt              -0.1970575  0.0119891 -16.436  < 2e-16 ***
## avg_firstWon           0.2961881  0.0144225  20.537  < 2e-16 ***
## avg_secWon             0.2918856  0.0165034  17.686  < 2e-16 ***
## avg_bpSaved            0.1431648  0.0496465   2.884 0.003931 ** 
## avg_opponent_ace       0.1800052  0.0093758  19.199  < 2e-16 ***
## avg_opponent_df       -0.0801316  0.0182370  -4.394 1.11e-05 ***
## avg_opponent_svpt      0.2065342  0.0120481  17.142  < 2e-16 ***
## avg_opponent_firstWon -0.3001553  0.0144104 -20.829  < 2e-16 ***
## avg_opponent_secWon   -0.3054085  0.0165406 -18.464  < 2e-16 ***
## avg_opponent_bpSaved  -0.1661864  0.0490487  -3.388 0.000704 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41345  on 29823  degrees of freedom
## Residual deviance: 36628  on 29800  degrees of freedom
## AIC: 36676
## 
## Number of Fisher Scoring iterations: 4
vif(log.clay) # multicollinearity in the model: next removing tourney level & avg_svpt & avg_opponent_svpt
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size             13.339209  1        3.652288
## tourney_level         15.460497  3        1.578352
## hand                   1.097317  1        1.047529
## opponent_hand          1.093858  1        1.045877
## seeded                 1.346841  1        1.160535
## opponent_seeded        1.346009  1        1.160176
## ht_dif                 1.901405  1        1.378915
## age_dif                1.047044  1        1.023252
## rank_dif               1.295529  1        1.138213
## avg_ace                4.480368  1        2.116688
## avg_df                 1.419792  1        1.191550
## avg_svpt              14.965402  1        3.868514
## avg_firstWon          12.767963  1        3.573229
## avg_secWon             5.709229  1        2.389399
## avg_bpSaved            4.838270  1        2.199607
## avg_opponent_ace       4.401105  1        2.097881
## avg_opponent_df        1.424108  1        1.193360
## avg_opponent_svpt     15.203005  1        3.899103
## avg_opponent_firstWon 12.771268  1        3.573691
## avg_opponent_secWon    5.760324  1        2.400068
## avg_opponent_bpSaved   4.811531  1        2.193520
log.clay = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt 
               -avg_opponent_svpt -tourney_level,data = clay_train, family = binomial)
summary(log.clay)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn - 
##     avg_svpt - avg_opponent_svpt - tourney_level, family = binomial, 
##     data = clay_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.877e-01  3.173e-01  -0.591  0.55429    
## draw_size             -3.859e-05  3.639e-04  -0.106  0.91554    
## handR                 -5.780e-02  3.703e-02  -1.561  0.11851    
## opponent_handR         9.826e-02  3.704e-02   2.653  0.00798 ** 
## seededYes              4.400e-01  2.984e-02  14.747  < 2e-16 ***
## opponent_seededYes    -4.300e-01  2.980e-02 -14.428  < 2e-16 ***
## ht_dif                 5.202e-03  1.877e-03   2.771  0.00559 ** 
## age_dif               -2.309e-02  2.406e-03  -9.595  < 2e-16 ***
## rank_dif              -3.470e-03  1.477e-04 -23.488  < 2e-16 ***
## avg_ace               -1.330e-01  8.974e-03 -14.819  < 2e-16 ***
## avg_df                -2.875e-03  1.756e-02  -0.164  0.86997    
## avg_firstWon           8.081e-02  5.781e-03  13.980  < 2e-16 ***
## avg_secWon             6.238e-02  8.544e-03   7.301 2.86e-13 ***
## avg_bpSaved           -4.959e-01  3.177e-02 -15.609  < 2e-16 ***
## avg_opponent_ace       1.313e-01  8.928e-03  14.713  < 2e-16 ***
## avg_opponent_df       -8.219e-04  1.754e-02  -0.047  0.96262    
## avg_opponent_firstWon -7.510e-02  5.719e-03 -13.130  < 2e-16 ***
## avg_opponent_secWon   -6.404e-02  8.513e-03  -7.523 5.34e-14 ***
## avg_opponent_bpSaved   4.967e-01  3.147e-02  15.786  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41345  on 29823  degrees of freedom
## Residual deviance: 37191  on 29805  degrees of freedom
## AIC: 37229
## 
## Number of Fisher Scoring iterations: 4
vif(log.clay) # no multicollinearity
##             draw_size                  hand         opponent_hand 
##              1.067150              1.042132              1.042489 
##                seeded       opponent_seeded                ht_dif 
##              1.265340              1.267727              1.896591 
##               age_dif              rank_dif               avg_ace 
##              1.048968              1.299819              4.130822 
##                avg_df          avg_firstWon            avg_secWon 
##              1.344295              2.107831              1.562453 
##           avg_bpSaved      avg_opponent_ace       avg_opponent_df 
##              1.923550              4.054069              1.342097 
## avg_opponent_firstWon   avg_opponent_secWon  avg_opponent_bpSaved 
##              2.072292              1.555920              1.912698
predprob_log_clay <- predict.glm(log.clay, clay_test, type = "response")
predclass_log_clay = ifelse(predprob_log_clay >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log_clay), clay_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2428 1268
##          1 1303 2457
##                                          
##                Accuracy : 0.6552         
##                  95% CI : (0.6443, 0.666)
##     No Information Rate : 0.5004         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.3104         
##                                          
##  Mcnemar's Test P-Value : 0.5025         
##                                          
##             Sensitivity : 0.6596         
##             Specificity : 0.6508         
##          Pos Pred Value : 0.6535         
##          Neg Pred Value : 0.6569         
##              Prevalence : 0.4996         
##          Detection Rate : 0.3295         
##    Detection Prevalence : 0.5043         
##       Balanced Accuracy : 0.6552         
##                                          
##        'Positive' Class : 1              
## 
# Accuracy    : 0.6582 
# Sensitivity : 0.6524         
# Specificity : 0.6441  

#Logistic Regression with Stepwise Selection
null_model_clay = glm(outcome ~ 1, data = clay_train, family = binomial) 
full_model_clay = log.clay

step.model.AIC.clay = step(null_model_clay, scope = list(upper = full_model_clay),
                      direction = "both", test = "Chisq", trace = F) 
summary(step.model.AIC.clay) 
## 
## Call:
## glm(formula = outcome ~ rank_dif + seeded + opponent_seeded + 
##     age_dif + avg_bpSaved + avg_opponent_bpSaved + avg_opponent_ace + 
##     avg_opponent_firstWon + avg_ace + avg_firstWon + avg_opponent_secWon + 
##     avg_secWon + ht_dif + opponent_hand + hand, family = binomial, 
##     data = clay_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.1894962  0.3154610  -0.601  0.54804    
## rank_dif              -0.0034700  0.0001476 -23.502  < 2e-16 ***
## seededYes              0.4397208  0.0297794  14.766  < 2e-16 ***
## opponent_seededYes    -0.4301902  0.0297434 -14.463  < 2e-16 ***
## age_dif               -0.0230806  0.0024037  -9.602  < 2e-16 ***
## avg_bpSaved           -0.4976465  0.0295829 -16.822  < 2e-16 ***
## avg_opponent_bpSaved   0.4963505  0.0293100  16.934  < 2e-16 ***
## avg_opponent_ace       0.1312353  0.0084006  15.622  < 2e-16 ***
## avg_opponent_firstWon -0.0751005  0.0055438 -13.547  < 2e-16 ***
## avg_ace               -0.1334481  0.0084528 -15.787  < 2e-16 ***
## avg_firstWon           0.0809710  0.0056133  14.425  < 2e-16 ***
## avg_opponent_secWon   -0.0641456  0.0084699  -7.573 3.64e-14 ***
## avg_secWon             0.0622087  0.0084992   7.319 2.49e-13 ***
## ht_dif                 0.0052011  0.0018773   2.770  0.00560 ** 
## opponent_handR         0.0983094  0.0369720   2.659  0.00784 ** 
## handR                 -0.0574733  0.0369594  -1.555  0.11994    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41345  on 29823  degrees of freedom
## Residual deviance: 37191  on 29808  degrees of freedom
## AIC: 37223
## 
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise 
log.sel.clay <- glm(outcome ~ rank_dif + opponent_seeded + seeded + age_dif + avg_opponent_bpSaved +
                      avg_opponent_ace + avg_opponent_firstWon + avg_ace + avg_firstWon + avg_opponent_secWon +
                      avg_secWon + ht_dif + opponent_hand + hand,
               clay_train, family = binomial)
summary(log.sel.clay)
## 
## Call:
## glm(formula = outcome ~ rank_dif + opponent_seeded + seeded + 
##     age_dif + avg_opponent_bpSaved + avg_opponent_ace + avg_opponent_firstWon + 
##     avg_ace + avg_firstWon + avg_opponent_secWon + avg_secWon + 
##     ht_dif + opponent_hand + hand, family = binomial, data = clay_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.4069672  0.3055966  -4.604 4.14e-06 ***
## rank_dif              -0.0036981  0.0001495 -24.732  < 2e-16 ***
## opponent_seededYes    -0.4000148  0.0295300 -13.546  < 2e-16 ***
## seededYes              0.5795525  0.0286277  20.244  < 2e-16 ***
## age_dif               -0.0247842  0.0023921 -10.361  < 2e-16 ***
## avg_opponent_bpSaved   0.4529503  0.0289840  15.628  < 2e-16 ***
## avg_opponent_ace       0.1253500  0.0083447  15.021  < 2e-16 ***
## avg_opponent_firstWon -0.0712278  0.0055154 -12.914  < 2e-16 ***
## avg_ace               -0.0608434  0.0072146  -8.433  < 2e-16 ***
## avg_firstWon           0.0617944  0.0054473  11.344  < 2e-16 ***
## avg_opponent_secWon   -0.0576471  0.0084165  -6.849 7.42e-12 ***
## avg_secWon             0.0157417  0.0079618   1.977   0.0480 *  
## ht_dif                 0.0045750  0.0018672   2.450   0.0143 *  
## opponent_handR         0.0895571  0.0368133   2.433   0.0150 *  
## handR                  0.0091659  0.0364372   0.252   0.8014    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41345  on 29823  degrees of freedom
## Residual deviance: 37490  on 29809  degrees of freedom
## AIC: 37520
## 
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2_clay <- predict(log.sel.clay, newdata = clay_test, type = "response")
logistic_pred_class2_clay <- ifelse(logistic_pred2_clay > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2_clay), clay_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2430 1308
##          1 1301 2417
##                                           
##                Accuracy : 0.6501          
##                  95% CI : (0.6391, 0.6609)
##     No Information Rate : 0.5004          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3002          
##                                           
##  Mcnemar's Test P-Value : 0.9065          
##                                           
##             Sensitivity : 0.6489          
##             Specificity : 0.6513          
##          Pos Pred Value : 0.6501          
##          Neg Pred Value : 0.6501          
##              Prevalence : 0.4996          
##          Detection Rate : 0.3242          
##    Detection Prevalence : 0.4987          
##       Balanced Accuracy : 0.6501          
##                                           
##        'Positive' Class : 1               
## 
# Accuracy :    0.6451    
# Sensitivity : 0.6426        
# Specificity : 0.6476  

# Logistic Regression: GRASS
log.grass <- glm(outcome ~ ., data = grass_train, family = binomial)
summary(log.grass)
## 
## Call:
## glm(formula = outcome ~ ., family = binomial, data = grass_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.590e-01  7.185e-01  -0.221  0.82486    
## draw_size              6.533e-05  2.753e-03   0.024  0.98107    
## tourney_levelD        -1.196e-02  2.180e-01  -0.055  0.95624    
## tourney_levelG         6.170e-04  2.544e-01   0.002  0.99806    
## handR                  7.219e-02  6.914e-02   1.044  0.29637    
## opponent_handR        -4.056e-02  6.865e-02  -0.591  0.55460    
## seededYes              3.726e-01  5.542e-02   6.724 1.77e-11 ***
## opponent_seededYes    -2.884e-01  5.525e-02  -5.220 1.79e-07 ***
## ht_dif                 3.340e-03  3.362e-03   0.993  0.32063    
## age_dif                1.256e-02  3.990e-03   3.147  0.00165 ** 
## rank_dif              -2.196e-03  2.207e-04  -9.950  < 2e-16 ***
## avg_ace               -1.811e-01  2.070e-02  -8.749  < 2e-16 ***
## avg_df                 2.161e-02  3.111e-02   0.695  0.48723    
## avg_svpt              -1.904e-01  3.100e-02  -6.142 8.15e-10 ***
## avg_firstIn           -1.322e-01  4.035e-02  -3.277  0.00105 ** 
## avg_firstWon           4.901e-01  4.242e-02  11.555  < 2e-16 ***
## avg_secWon             3.209e-01  5.087e-02   6.310 2.79e-10 ***
## avg_bpSaved            2.027e-01  9.973e-02   2.033  0.04210 *  
## avg_opponent_ace       1.652e-01  2.080e-02   7.943 1.97e-15 ***
## avg_opponent_df        8.525e-04  3.152e-02   0.027  0.97842    
## avg_opponent_svpt      1.968e-01  3.170e-02   6.210 5.31e-10 ***
## avg_opponent_firstIn   1.156e-01  4.093e-02   2.825  0.00473 ** 
## avg_opponent_firstWon -4.738e-01  4.247e-02 -11.158  < 2e-16 ***
## avg_opponent_secWon   -3.313e-01  5.202e-02  -6.369 1.90e-10 ***
## avg_opponent_bpSaved  -1.972e-01  1.010e-01  -1.952  0.05099 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13933  on 10050  degrees of freedom
## Residual deviance: 11877  on 10026  degrees of freedom
## AIC: 11927
## 
## Number of Fisher Scoring iterations: 4
vif(log.grass)  # Check for multicollinearity in the model
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size             32.185822  1        5.673255
## tourney_level         32.739287  2        2.392034
## hand                   1.129083  1        1.062583
## opponent_hand          1.132450  1        1.064166
## seeded                 1.401319  1        1.183773
## opponent_seeded        1.396307  1        1.181654
## ht_dif                 2.232477  1        1.494148
## age_dif                1.046681  1        1.023074
## rank_dif               1.278566  1        1.130737
## avg_ace                9.265607  1        3.043946
## avg_df                 1.793296  1        1.339140
## avg_svpt              27.994225  1        5.290957
## avg_firstIn           42.556577  1        6.523540
## avg_firstWon          36.833044  1        6.069023
## avg_secWon            14.578655  1        3.818200
## avg_bpSaved            6.054831  1        2.460657
## avg_opponent_ace       9.113666  1        3.018885
## avg_opponent_df        1.802179  1        1.342453
## avg_opponent_svpt     27.898705  1        5.281922
## avg_opponent_firstIn  42.672727  1        6.532437
## avg_opponent_firstWon 35.984623  1        5.998718
## avg_opponent_secWon   15.109383  1        3.887079
## avg_opponent_bpSaved   6.037939  1        2.457222
log.grass = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn,data = grass_train, family = binomial)
summary(log.grass)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn, 
##     family = binomial, data = grass_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.1623039  0.7202110  -0.225  0.82170    
## draw_size              0.0001289  0.0027487   0.047  0.96259    
## tourney_levelD        -0.0039259  0.2179471  -0.018  0.98563    
## tourney_levelG        -0.0053831  0.2540003  -0.021  0.98309    
## handR                  0.0945248  0.0686668   1.377  0.16864    
## opponent_handR        -0.0626694  0.0680216  -0.921  0.35689    
## seededYes              0.3749291  0.0553853   6.769 1.29e-11 ***
## opponent_seededYes    -0.2893566  0.0552117  -5.241 1.60e-07 ***
## ht_dif                 0.0057254  0.0033111   1.729  0.08378 .  
## age_dif                0.0119339  0.0039849   2.995  0.00275 ** 
## rank_dif              -0.0021672  0.0002199  -9.857  < 2e-16 ***
## avg_ace               -0.1381040  0.0157273  -8.781  < 2e-16 ***
## avg_df                 0.0650148  0.0282422   2.302  0.02133 *  
## avg_svpt              -0.2571421  0.0231787 -11.094  < 2e-16 ***
## avg_firstWon           0.3842614  0.0281062  13.672  < 2e-16 ***
## avg_secWon             0.4509656  0.0318101  14.177  < 2e-16 ***
## avg_bpSaved            0.1889544  0.1002023   1.886  0.05933 .  
## avg_opponent_ace       0.1273364  0.0157673   8.076 6.69e-16 ***
## avg_opponent_df       -0.0364848  0.0284553  -1.282  0.19978    
## avg_opponent_svpt      0.2557677  0.0234120  10.925  < 2e-16 ***
## avg_opponent_firstWon -0.3814238  0.0283334 -13.462  < 2e-16 ***
## avg_opponent_secWon   -0.4453007  0.0322620 -13.803  < 2e-16 ***
## avg_opponent_bpSaved  -0.1884972  0.1014692  -1.858  0.06321 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13933  on 10050  degrees of freedom
## Residual deviance: 11895  on 10028  degrees of freedom
## AIC: 11941
## 
## Number of Fisher Scoring iterations: 4
vif(log.grass)
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size             32.157829  1        5.670787
## tourney_level         32.661972  2        2.390620
## hand                   1.116630  1        1.056707
## opponent_hand          1.115075  1        1.055971
## seeded                 1.401770  1        1.183964
## opponent_seeded        1.396611  1        1.181783
## ht_dif                 2.159519  1        1.469530
## age_dif                1.045118  1        1.022310
## rank_dif               1.273769  1        1.128614
## avg_ace                5.301774  1        2.302558
## avg_df                 1.466883  1        1.211150
## avg_svpt              15.453427  1        3.931085
## avg_firstWon          16.009525  1        4.001190
## avg_secWon             5.710721  1        2.389712
## avg_bpSaved            6.096597  1        2.469129
## avg_opponent_ace       5.196379  1        2.279557
## avg_opponent_df        1.460520  1        1.208520
## avg_opponent_svpt     15.101703  1        3.886091
## avg_opponent_firstWon 15.889777  1        3.986198
## avg_opponent_secWon    5.826479  1        2.413810
## avg_opponent_bpSaved   6.078088  1        2.465378
log.grass = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn -tourney_level -avg_firstWon
                -avg_opponent_firstWon,data = grass_train, family = binomial)
summary(log.grass)
## 
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn - 
##     tourney_level - avg_firstWon - avg_opponent_firstWon, family = binomial, 
##     data = grass_train)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.378e-01  6.763e-01  -0.352  0.72510    
## draw_size             4.394e-05  4.807e-04   0.091  0.92716    
## handR                -1.160e-01  6.519e-02  -1.780  0.07513 .  
## opponent_handR        1.360e-01  6.475e-02   2.100  0.03570 *  
## seededYes             5.336e-01  5.233e-02  10.197  < 2e-16 ***
## opponent_seededYes   -4.465e-01  5.218e-02  -8.557  < 2e-16 ***
## ht_dif                8.528e-03  3.259e-03   2.616  0.00889 ** 
## age_dif               7.233e-03  3.900e-03   1.855  0.06366 .  
## rank_dif             -2.414e-03  2.213e-04 -10.908  < 2e-16 ***
## avg_ace              -1.882e-02  1.295e-02  -1.453  0.14624    
## avg_df               -4.297e-02  2.643e-02  -1.626  0.10391    
## avg_svpt              2.633e-02  8.782e-03   2.998  0.00271 ** 
## avg_secWon            8.036e-02  1.581e-02   5.083 3.72e-07 ***
## avg_bpSaved          -7.009e-01  7.113e-02  -9.855  < 2e-16 ***
## avg_opponent_ace      1.061e-02  1.295e-02   0.820  0.41248    
## avg_opponent_df       6.849e-02  2.668e-02   2.567  0.01026 *  
## avg_opponent_svpt    -2.625e-02  8.879e-03  -2.957  0.00311 ** 
## avg_opponent_secWon  -7.299e-02  1.587e-02  -4.600 4.23e-06 ***
## avg_opponent_bpSaved  7.097e-01  7.134e-02   9.948  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13933  on 10050  degrees of freedom
## Residual deviance: 12277  on 10032  degrees of freedom
## AIC: 12315
## 
## Number of Fisher Scoring iterations: 4
vif(log.grass) # no multicollinearity
##            draw_size                 hand        opponent_hand 
##             1.028236             1.050715             1.052344 
##               seeded      opponent_seeded               ht_dif 
##             1.303862             1.300475             2.131194 
##              age_dif             rank_dif              avg_ace 
##             1.040036             1.280145             3.670248 
##               avg_df             avg_svpt           avg_secWon 
##             1.348390             2.390852             1.474428 
##          avg_bpSaved     avg_opponent_ace      avg_opponent_df 
##             3.161097             3.577481             1.343232 
##    avg_opponent_svpt  avg_opponent_secWon avg_opponent_bpSaved 
##             2.332993             1.470113             3.080825
# Prediction on test set
predprob_log_grass <- predict.glm(log.grass, grass_test, type = "response")
predclass_log_grass <- ifelse(predprob_log_grass >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log_grass), grass_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 816 410
##          1 415 872
##                                          
##                Accuracy : 0.6717         
##                  95% CI : (0.653, 0.6901)
##     No Information Rate : 0.5101         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.3431         
##                                          
##  Mcnemar's Test P-Value : 0.8892         
##                                          
##             Sensitivity : 0.6802         
##             Specificity : 0.6629         
##          Pos Pred Value : 0.6775         
##          Neg Pred Value : 0.6656         
##              Prevalence : 0.5101         
##          Detection Rate : 0.3470         
##    Detection Prevalence : 0.5121         
##       Balanced Accuracy : 0.6715         
##                                          
##        'Positive' Class : 1              
## 
# Accuracy :    0.6811    
# Sensitivity : 0.6940        
# Specificity : 0.6675 

#Logistic Regression with Stepwise Selection
null_model_grass = glm(outcome ~ 1, data = grass_train, family = binomial) 
full_model_grass = log.grass

step.model.AIC.grass = step(null_model_grass, scope = list(upper = full_model_grass),
                           direction = "both", test = "Chisq", trace = F) 
summary(step.model.AIC.grass) 
## 
## Call:
## glm(formula = outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + 
##     seeded + opponent_seeded + avg_secWon + avg_opponent_secWon + 
##     avg_opponent_svpt + avg_svpt + avg_opponent_df + opponent_hand + 
##     avg_df + ht_dif + age_dif + hand, family = binomial, data = grass_train)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -0.1387030  0.6477310  -0.214  0.83044    
## rank_dif             -0.0024195  0.0002211 -10.946  < 2e-16 ***
## avg_opponent_bpSaved  0.6786043  0.0502779  13.497  < 2e-16 ***
## avg_bpSaved          -0.6322881  0.0495441 -12.762  < 2e-16 ***
## seededYes             0.5424305  0.0518621  10.459  < 2e-16 ***
## opponent_seededYes   -0.4512300  0.0517121  -8.726  < 2e-16 ***
## avg_secWon            0.0769666  0.0156630   4.914 8.93e-07 ***
## avg_opponent_secWon  -0.0698150  0.0157051  -4.445 8.77e-06 ***
## avg_opponent_svpt    -0.0237746  0.0076310  -3.116  0.00184 ** 
## avg_svpt              0.0203315  0.0074957   2.712  0.00668 ** 
## avg_opponent_df       0.0727106  0.0251184   2.895  0.00379 ** 
## opponent_handR        0.1325162  0.0644790   2.055  0.03986 *  
## avg_df               -0.0544727  0.0248139  -2.195  0.02815 *  
## ht_dif                0.0059726  0.0027771   2.151  0.03151 *  
## age_dif               0.0067419  0.0038876   1.734  0.08288 .  
## handR                -0.1077899  0.0649021  -1.661  0.09675 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13933  on 10050  degrees of freedom
## Residual deviance: 12280  on 10035  degrees of freedom
## AIC: 12312
## 
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise 
log.sel.grass <- glm(outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + opponent_seeded + seeded +
                       avg_opponent_secWon + avg_secWon + ht_dif + avg_df + avg_svpt + age_dif + avg_opponent_svpt + 
                       avg_opponent_df + opponent_hand + hand,
                    grass_train, family = binomial)
summary(log.sel.grass)
## 
## Call:
## glm(formula = outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + 
##     opponent_seeded + seeded + avg_opponent_secWon + avg_secWon + 
##     ht_dif + avg_df + avg_svpt + age_dif + avg_opponent_svpt + 
##     avg_opponent_df + opponent_hand + hand, family = binomial, 
##     data = grass_train)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -0.1387030  0.6477310  -0.214  0.83044    
## rank_dif             -0.0024195  0.0002211 -10.946  < 2e-16 ***
## avg_opponent_bpSaved  0.6786043  0.0502779  13.497  < 2e-16 ***
## avg_bpSaved          -0.6322881  0.0495441 -12.762  < 2e-16 ***
## opponent_seededYes   -0.4512300  0.0517121  -8.726  < 2e-16 ***
## seededYes             0.5424305  0.0518621  10.459  < 2e-16 ***
## avg_opponent_secWon  -0.0698150  0.0157051  -4.445 8.77e-06 ***
## avg_secWon            0.0769666  0.0156630   4.914 8.93e-07 ***
## ht_dif                0.0059726  0.0027771   2.151  0.03151 *  
## avg_df               -0.0544727  0.0248139  -2.195  0.02815 *  
## avg_svpt              0.0203315  0.0074957   2.712  0.00668 ** 
## age_dif               0.0067419  0.0038876   1.734  0.08288 .  
## avg_opponent_svpt    -0.0237746  0.0076310  -3.116  0.00184 ** 
## avg_opponent_df       0.0727106  0.0251184   2.895  0.00379 ** 
## opponent_handR        0.1325162  0.0644790   2.055  0.03986 *  
## handR                -0.1077899  0.0649021  -1.661  0.09675 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13933  on 10050  degrees of freedom
## Residual deviance: 12280  on 10035  degrees of freedom
## AIC: 12312
## 
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2_grass <- predict(log.sel.grass, newdata = grass_test, type = "response")
logistic_pred_class2_grass <- ifelse(logistic_pred2_grass > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2_grass), grass_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 818 412
##          1 413 870
##                                          
##                Accuracy : 0.6717         
##                  95% CI : (0.653, 0.6901)
##     No Information Rate : 0.5101         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.3431         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.6786         
##             Specificity : 0.6645         
##          Pos Pred Value : 0.6781         
##          Neg Pred Value : 0.6650         
##              Prevalence : 0.5101         
##          Detection Rate : 0.3462         
##    Detection Prevalence : 0.5105         
##       Balanced Accuracy : 0.6716         
##                                          
##        'Positive' Class : 1              
## 
# Accuracy :    0.6803     
# Sensitivity : 0.6924         
# Specificity : 0.6675


# Logistic Regression: HARD
log.hard <- glm(outcome ~ ., data = hard_train, family = binomial)
summary(log.hard)
## 
## Call:
## glm(formula = outcome ~ ., family = binomial, data = hard_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.979e-01  2.935e-01  -0.674 0.500312    
## draw_size              4.644e-04  6.325e-04   0.734 0.462749    
## tourney_levelD         1.208e-03  5.082e-02   0.024 0.981029    
## tourney_levelF         8.157e-03  9.143e-02   0.089 0.928907    
## tourney_levelG        -3.193e-02  6.530e-02  -0.489 0.624818    
## tourney_levelM        -1.284e-02  3.877e-02  -0.331 0.740519    
## handR                  1.074e-01  3.074e-02   3.492 0.000479 ***
## opponent_handR        -8.407e-02  3.084e-02  -2.726 0.006410 ** 
## seededYes              3.493e-01  2.361e-02  14.793  < 2e-16 ***
## opponent_seededYes    -3.848e-01  2.363e-02 -16.285  < 2e-16 ***
## ht_dif                 3.621e-05  1.514e-03   0.024 0.980926    
## age_dif               -1.308e-02  1.768e-03  -7.401 1.35e-13 ***
## rank_dif              -2.879e-03  1.084e-04 -26.561  < 2e-16 ***
## avg_ace               -1.468e-01  8.554e-03 -17.158  < 2e-16 ***
## avg_df                 2.519e-02  1.467e-02   1.717 0.085988 .  
## avg_svpt              -1.632e-01  1.363e-02 -11.974  < 2e-16 ***
## avg_firstIn           -4.973e-02  1.737e-02  -2.863 0.004191 ** 
## avg_firstWon           3.280e-01  1.730e-02  18.964  < 2e-16 ***
## avg_secWon             2.898e-01  2.276e-02  12.732  < 2e-16 ***
## avg_bpSaved           -1.712e-03  4.261e-02  -0.040 0.967948    
## avg_opponent_ace       1.334e-01  8.532e-03  15.630  < 2e-16 ***
## avg_opponent_df       -1.438e-02  1.469e-02  -0.979 0.327482    
## avg_opponent_svpt      1.619e-01  1.373e-02  11.792  < 2e-16 ***
## avg_opponent_firstIn   4.022e-02  1.739e-02   2.313 0.020715 *  
## avg_opponent_firstWon -3.055e-01  1.726e-02 -17.704  < 2e-16 ***
## avg_opponent_secWon   -2.868e-01  2.288e-02 -12.535  < 2e-16 ***
## avg_opponent_bpSaved  -8.678e-03  4.303e-02  -0.202 0.840173    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71921  on 51879  degrees of freedom
## Residual deviance: 62967  on 51853  degrees of freedom
## AIC: 63021
## 
## Number of Fisher Scoring iterations: 4
vif(log.hard)  # Check for multicollinearity in the model
##                            GVIF Df GVIF^(1/(2*Df))
## draw_size              7.046722  1        2.654566
## tourney_level          7.884077  4        1.294476
## hand                   1.129175  1        1.062626
## opponent_hand          1.131329  1        1.063640
## seeded                 1.373379  1        1.171913
## opponent_seeded        1.376127  1        1.173084
## ht_dif                 2.382946  1        1.543679
## age_dif                1.041681  1        1.020628
## rank_dif               1.248194  1        1.117226
## avg_ace                8.486426  1        2.913147
## avg_df                 1.770458  1        1.330586
## avg_svpt              30.557885  1        5.527919
## avg_firstIn           43.364966  1        6.585208
## avg_firstWon          34.243652  1        5.851808
## avg_secWon            15.783104  1        3.972795
## avg_bpSaved            6.001001  1        2.449694
## avg_opponent_ace       8.390360  1        2.896612
## avg_opponent_df        1.774788  1        1.332212
## avg_opponent_svpt     31.163999  1        5.582472
## avg_opponent_firstIn  43.070123  1        6.562783
## avg_opponent_firstWon 33.936159  1        5.825475
## avg_opponent_secWon   15.857414  1        3.982137
## avg_opponent_bpSaved   6.109295  1        2.471699
log.hard = glm(outcome ~ . -avg_firstIn -avg_opponent_firstIn -avg_firstWon
               -avg_opponent_firstWon,data = hard_train, family = binomial)
summary(log.hard)
## 
## Call:
## glm(formula = outcome ~ . - avg_firstIn - avg_opponent_firstIn - 
##     avg_firstWon - avg_opponent_firstWon, family = binomial, 
##     data = hard_train)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -0.1780813  0.2883215  -0.618  0.53681    
## draw_size             0.0003151  0.0006236   0.505  0.61330    
## tourney_levelD       -0.0112444  0.0501275  -0.224  0.82251    
## tourney_levelF        0.0133216  0.0891697   0.149  0.88124    
## tourney_levelG       -0.0165964  0.0643032  -0.258  0.79633    
## tourney_levelM       -0.0016953  0.0379620  -0.045  0.96438    
## handR                -0.0638445  0.0292062  -2.186  0.02882 *  
## opponent_handR        0.0814752  0.0292509   2.785  0.00535 ** 
## seededYes             0.4635500  0.0226958  20.424  < 2e-16 ***
## opponent_seededYes   -0.4908463  0.0226906 -21.632  < 2e-16 ***
## ht_dif                0.0026397  0.0014642   1.803  0.07141 .  
## age_dif              -0.0176499  0.0017455 -10.112  < 2e-16 ***
## rank_dif             -0.0033310  0.0001102 -30.216  < 2e-16 ***
## avg_ace              -0.0442513  0.0057542  -7.690 1.47e-14 ***
## avg_df               -0.0530250  0.0123623  -4.289 1.79e-05 ***
## avg_svpt              0.0243822  0.0038295   6.367 1.93e-10 ***
## avg_secWon            0.0663137  0.0068963   9.616  < 2e-16 ***
## avg_bpSaved          -0.6842932  0.0317877 -21.527  < 2e-16 ***
## avg_opponent_ace      0.0395826  0.0057536   6.880 6.00e-12 ***
## avg_opponent_df       0.0616188  0.0123677   4.982 6.29e-07 ***
## avg_opponent_svpt    -0.0199646  0.0038442  -5.193 2.06e-07 ***
## avg_opponent_secWon  -0.0679136  0.0069297  -9.800  < 2e-16 ***
## avg_opponent_bpSaved  0.6433371  0.0316674  20.315  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71921  on 51879  degrees of freedom
## Residual deviance: 64126  on 51857  degrees of freedom
## AIC: 64172
## 
## Number of Fisher Scoring iterations: 4
vif(log.hard) #no multicolienarity 
##                          GVIF Df GVIF^(1/(2*Df))
## draw_size            7.044136  1        2.654079
## tourney_level        7.680766  4        1.290255
## hand                 1.046553  1        1.023012
## opponent_hand        1.045216  1        1.022358
## seeded               1.301851  1        1.140987
## opponent_seeded      1.301567  1        1.140862
## ht_dif               2.255708  1        1.501902
## age_dif              1.037123  1        1.018393
## rank_dif             1.254125  1        1.119877
## avg_ace              3.876487  1        1.968880
## avg_df               1.288771  1        1.135241
## avg_svpt             2.490405  1        1.578102
## avg_secWon           1.491040  1        1.221081
## avg_bpSaved          3.337821  1        1.826971
## avg_opponent_ace     3.859613  1        1.964590
## avg_opponent_df      1.289440  1        1.135535
## avg_opponent_svpt    2.497862  1        1.580463
## avg_opponent_secWon  1.495456  1        1.222889
## avg_opponent_bpSaved 3.315152  1        1.820756
# Prediction on test set
predprob_log_hard <- predict.glm(log.hard, hard_test, type = "response")
predclass_log_hard <- ifelse(predprob_log_hard >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log_hard), hard_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4306 2164
##          1 2187 4313
##                                           
##                Accuracy : 0.6645          
##                  95% CI : (0.6563, 0.6727)
##     No Information Rate : 0.5006          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3291          
##                                           
##  Mcnemar's Test P-Value : 0.7387          
##                                           
##             Sensitivity : 0.6659          
##             Specificity : 0.6632          
##          Pos Pred Value : 0.6635          
##          Neg Pred Value : 0.6655          
##              Prevalence : 0.4994          
##          Detection Rate : 0.3325          
##    Detection Prevalence : 0.5012          
##       Balanced Accuracy : 0.6645          
##                                           
##        'Positive' Class : 1               
## 
# Accuracy :    0.6666    
# Sensitivity : 0.6645        
# Specificity : 0.6687

#Logistic Regression with Stepwise Selection
null_model_hard = glm(outcome ~ 1, data = hard_train, family = binomial) 
full_model_hard = log.hard

step.model.AIC.hard = step(null_model_hard, scope = list(upper = full_model_hard),
                            direction = "both", test = "Chisq", trace = F) 
summary(step.model.AIC.hard) 
## 
## Call:
## glm(formula = outcome ~ rank_dif + avg_bpSaved + avg_opponent_bpSaved + 
##     opponent_seeded + seeded + avg_secWon + age_dif + avg_opponent_secWon + 
##     avg_opponent_df + avg_df + avg_ace + avg_svpt + avg_opponent_ace + 
##     avg_opponent_svpt + opponent_hand + hand + ht_dif, family = binomial, 
##     data = hard_train)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -0.1717329  0.2878726  -0.597  0.55080    
## rank_dif             -0.0033310  0.0001102 -30.215  < 2e-16 ***
## avg_bpSaved          -0.6850011  0.0315975 -21.679  < 2e-16 ***
## avg_opponent_bpSaved  0.6424905  0.0314932  20.401  < 2e-16 ***
## opponent_seededYes   -0.4882382  0.0224056 -21.791  < 2e-16 ***
## seededYes             0.4661486  0.0224093  20.802  < 2e-16 ***
## avg_secWon            0.0663468  0.0068889   9.631  < 2e-16 ***
## age_dif              -0.0176554  0.0017454 -10.115  < 2e-16 ***
## avg_opponent_secWon  -0.0678954  0.0069242  -9.806  < 2e-16 ***
## avg_opponent_df       0.0616078  0.0123637   4.983 6.26e-07 ***
## avg_df               -0.0530378  0.0123589  -4.291 1.77e-05 ***
## avg_ace              -0.0443733  0.0057424  -7.727 1.10e-14 ***
## avg_svpt              0.0244759  0.0038221   6.404 1.52e-10 ***
## avg_opponent_ace      0.0394434  0.0057419   6.869 6.45e-12 ***
## avg_opponent_svpt    -0.0198636  0.0038391  -5.174 2.29e-07 ***
## opponent_handR        0.0808226  0.0292391   2.764  0.00571 ** 
## handR                -0.0645497  0.0291934  -2.211  0.02703 *  
## ht_dif                0.0026368  0.0014641   1.801  0.07172 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71921  on 51879  degrees of freedom
## Residual deviance: 64127  on 51862  degrees of freedom
## AIC: 64163
## 
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise 
log.sel.hard <- glm(outcome ~ rank_dif + avg_bpSaved + avg_opponent_bpSaved + seeded + opponent_seeded +
                       age_dif + avg_secWon + avg_opponent_secWon + avg_opponent_df + avg_ace + avg_svpt + 
                      avg_opponent_ace + avg_opponent_svpt + avg_df + hand + opponent_hand + ht_dif,
                     hard_train, family = binomial)
summary(log.sel.hard)
## 
## Call:
## glm(formula = outcome ~ rank_dif + avg_bpSaved + avg_opponent_bpSaved + 
##     seeded + opponent_seeded + age_dif + avg_secWon + avg_opponent_secWon + 
##     avg_opponent_df + avg_ace + avg_svpt + avg_opponent_ace + 
##     avg_opponent_svpt + avg_df + hand + opponent_hand + ht_dif, 
##     family = binomial, data = hard_train)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -0.1717329  0.2878726  -0.597  0.55080    
## rank_dif             -0.0033310  0.0001102 -30.215  < 2e-16 ***
## avg_bpSaved          -0.6850011  0.0315975 -21.679  < 2e-16 ***
## avg_opponent_bpSaved  0.6424905  0.0314932  20.401  < 2e-16 ***
## seededYes             0.4661486  0.0224093  20.802  < 2e-16 ***
## opponent_seededYes   -0.4882382  0.0224056 -21.791  < 2e-16 ***
## age_dif              -0.0176554  0.0017454 -10.115  < 2e-16 ***
## avg_secWon            0.0663468  0.0068889   9.631  < 2e-16 ***
## avg_opponent_secWon  -0.0678954  0.0069242  -9.806  < 2e-16 ***
## avg_opponent_df       0.0616078  0.0123637   4.983 6.26e-07 ***
## avg_ace              -0.0443733  0.0057424  -7.727 1.10e-14 ***
## avg_svpt              0.0244759  0.0038221   6.404 1.52e-10 ***
## avg_opponent_ace      0.0394434  0.0057419   6.869 6.45e-12 ***
## avg_opponent_svpt    -0.0198636  0.0038391  -5.174 2.29e-07 ***
## avg_df               -0.0530378  0.0123589  -4.291 1.77e-05 ***
## handR                -0.0645497  0.0291934  -2.211  0.02703 *  
## opponent_handR        0.0808226  0.0292391   2.764  0.00571 ** 
## ht_dif                0.0026368  0.0014641   1.801  0.07172 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71921  on 51879  degrees of freedom
## Residual deviance: 64127  on 51862  degrees of freedom
## AIC: 64163
## 
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2_hard <- predict(log.sel.hard, newdata = hard_test, type = "response")
logistic_pred_class2_hard <- ifelse(logistic_pred2_hard > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2_hard), hard_test$outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4306 2170
##          1 2187 4307
##                                           
##                Accuracy : 0.6641          
##                  95% CI : (0.6559, 0.6722)
##     No Information Rate : 0.5006          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3281          
##                                           
##  Mcnemar's Test P-Value : 0.8085          
##                                           
##             Sensitivity : 0.6650          
##             Specificity : 0.6632          
##          Pos Pred Value : 0.6632          
##          Neg Pred Value : 0.6649          
##              Prevalence : 0.4994          
##          Detection Rate : 0.3321          
##    Detection Prevalence : 0.5007          
##       Balanced Accuracy : 0.6641          
##                                           
##        'Positive' Class : 1               
## 
# Accuracy :    0.6672     
# Sensitivity : 0.6648         
# Specificity : 0.6696  

#Common Variables: Some variables appear consistently across different surface types, such as:
#rank_dif, seeded, opponent_seeded, avg_secWon, avg_opponent_secWon, and age_dif. 
#This suggests that these variables have a consistent impact on match outcomes regardless of the surface type.

#Differing Variables:
#In the clay model, avg_ace has a negative coefficient, indicating that a higher average number of aces is associated with a lower probability of winning.
#In the grass model, avg_ace has a positive coefficient, suggesting that a higher average number of aces is associated with a higher probability of winning on grass.
#Similarly, other variables like ht_dif, avg_df, and avg_svpt also have coefficients that vary across surface types.



####
#### Does match length depend on the surface type?
####
levels(as.factor(match_outcomes$surface))
## [1] ""       "Carpet" "Clay"   "Grass"  "Hard"
surface_subset <- match_outcomes[match_outcomes$surface != "Carpet", ]
levels(as.factor(surface_subset$best_of))
## [1] "3" "5"
surface_bestof3 <- surface_subset[surface_subset$best_of == 3, ]
surface_bestof5 <- surface_subset[surface_subset$best_of == 5, ]

lm_surface3 <- lm(minutes ~ surface, data = surface_bestof3)
summary(lm_surface3)
## 
## Call:
## lm(formula = minutes ~ surface, data = surface_bestof3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100.73  -24.27   -5.47   21.27 1048.73 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  100.7297     0.1888  533.60   <2e-16 ***
## surfaceGrass  -8.2614     0.4324  -19.11   <2e-16 ***
## surfaceHard   -3.4627     0.2385  -14.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.69 on 87295 degrees of freedom
##   (6400 observations deleted due to missingness)
## Multiple R-squared:  0.004972,   Adjusted R-squared:  0.004949 
## F-statistic: 218.1 on 2 and 87295 DF,  p-value: < 2.2e-16
# FOR BEST OF 3:
#The intercept (for matches played on clay) is estimated to be 100.7297 minutes.
#Matches played on Grass surface have, on average, 8.26 minutes shorter duration compared to matches played on clay.
#Matches played on Hard surface have, on average, 3.4627 minutes shorter duration compared to matches played on clay.

lm_surface5 <- lm(minutes ~ surface, data = surface_bestof5)
summary(lm_surface5)
## 
## Call:
## lm(formula = minutes ~ surface, data = surface_bestof5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -151.34  -36.34   -7.34   32.66  521.96 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  151.9310     0.6693 226.985   <2e-16 ***
## surfaceGrass  -8.8946     0.9724  -9.147   <2e-16 ***
## surfaceHard   -0.5864     0.8176  -0.717    0.473    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.02 on 21089 degrees of freedom
##   (6400 observations deleted due to missingness)
## Multiple R-squared:  0.005307,   Adjusted R-squared:  0.005213 
## F-statistic: 56.26 on 2 and 21089 DF,  p-value: < 2.2e-16
# FOR BEST OF 5:
#The intercept (for matches played on clay) is estimated to be 151.931 minutes.
#Matches played on Grass surface have, on average 8.89 minutes shorter duration than matches played on clay.
#Matches played on Hard surface have, on average 0.5864 minutes shorter duration than matches played on clay (NOT SIGNIFICANT)