#######################################################################
########################## TENNIS PROJECT ###########################
#######################################################################
# Libraries
library(MASS) ; library(ggplot2) ; library(scales) ; library(tidyverse) ; library(corrplot)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
library(corrplot) ; library(car) ; library(caret) ; library(readxl) ; library(e1071) ;
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
##
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest); library(dplyr)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
####
#### Reading in data.
####
# Load last 20 years of ATP tour level matches
matches2023 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2023.csv')
matches2022 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2022.csv')
matches2021 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2021.csv')
matches2020 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2020.csv')
matches2019 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2019.csv')
matches2018 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2018.csv')
matches2017 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2017.csv')
matches2016 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2016.csv')
matches2015 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2015.csv')
matches2014 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2014.csv')
matches2013 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2013.csv')
matches2012 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2012.csv')
matches2011 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2011.csv')
matches2010 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2010.csv')
matches2009 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2009.csv')
matches2008 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2008.csv')
matches2007 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2007.csv')
matches2006 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2006.csv')
matches2005 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2005.csv')
matches2004 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2004.csv')
matches2003 <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2003.csv')
# Load ATP Player Data -> decided not to use
#atp_players <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_players.csv')
# Load Ranking Data (per decade) -> decided not to use.
#atp_rankings_00s <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_rankings_00s.csv')
#atp_rankings_10s <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_rankings_10s.csv')
#atp_rankings_20s <- read.csv('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_rankings_20s.csv')
# Combine matches into one data frame
atp_matches <- rbind(matches2003, matches2004, matches2005, matches2006, matches2007,
matches2008, matches2009, matches2010, matches2011, matches2012,
matches2013, matches2014, matches2015, matches2016, matches2017,
matches2018, matches2019, matches2020, matches2021, matches2022,
matches2023)
# 61932 observations
# remove after combining to de-clutter
rm(matches2003, matches2004, matches2005, matches2006, matches2007,
matches2008, matches2009, matches2010, matches2011, matches2012,
matches2013, matches2014, matches2015, matches2016, matches2017,
matches2018, matches2019, matches2020, matches2021, matches2022,
matches2023)
####
#### Data Exploration & Manipulation
####
#instead of listing seed or NA for unseeded, create binary seeded variable
atp_matches$winner_seeded <- ifelse(is.na(atp_matches$winner_seed), "No", "Yes")
atp_matches$loser_seeded <- ifelse(is.na(atp_matches$loser_seed),"No", "Yes")
names(atp_matches)
## [1] "tourney_id" "tourney_name" "surface"
## [4] "draw_size" "tourney_level" "tourney_date"
## [7] "match_num" "winner_id" "winner_seed"
## [10] "winner_entry" "winner_name" "winner_hand"
## [13] "winner_ht" "winner_ioc" "winner_age"
## [16] "loser_id" "loser_seed" "loser_entry"
## [19] "loser_name" "loser_hand" "loser_ht"
## [22] "loser_ioc" "loser_age" "score"
## [25] "best_of" "round" "minutes"
## [28] "w_ace" "w_df" "w_svpt"
## [31] "w_1stIn" "w_1stWon" "w_2ndWon"
## [34] "w_SvGms" "w_bpSaved" "w_bpFaced"
## [37] "l_ace" "l_df" "l_svpt"
## [40] "l_1stIn" "l_1stWon" "l_2ndWon"
## [43] "l_SvGms" "l_bpSaved" "l_bpFaced"
## [46] "winner_rank" "winner_rank_points" "loser_rank"
## [49] "loser_rank_points" "winner_seeded" "loser_seeded"
str(atp_matches)
## 'data.frame': 61932 obs. of 51 variables:
## $ tourney_id : chr "2003-1536" "2003-1536" "2003-1536" "2003-1536" ...
## $ tourney_name : chr "Madrid Masters" "Madrid Masters" "Madrid Masters" "Madrid Masters" ...
## $ surface : chr "Hard" "Hard" "Hard" "Hard" ...
## $ draw_size : int 48 48 48 48 48 48 48 48 48 48 ...
## $ tourney_level : chr "M" "M" "M" "M" ...
## $ tourney_date : int 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 ...
## $ match_num : int 1 2 3 4 5 6 7 8 9 10 ...
## $ winner_id : int 101965 102358 102998 102610 102374 103888 103852 103292 103970 102434 ...
## $ winner_seed : int NA NA NA NA NA NA NA NA NA NA ...
## $ winner_entry : chr "" "Q" "Q" "" ...
## $ winner_name : chr "Wayne Ferreira" "Thomas Enqvist" "Jan Michael Gambill" "Albert Costa" ...
## $ winner_hand : chr "R" "R" "R" "R" ...
## $ winner_ht : int 185 190 190 180 180 188 188 175 175 183 ...
## $ winner_ioc : chr "RSA" "SWE" "USA" "ESP" ...
## $ winner_age : num 32 29.5 26.3 28.3 29.5 21.8 22 24.8 21.5 29.2 ...
## $ loser_id : int 103344 102338 103786 103602 104745 102450 103151 103813 104022 103294 ...
## $ loser_seed : int NA NA NA NA NA NA NA NA NA NA ...
## $ loser_entry : chr "" "" "" "" ...
## $ loser_name : chr "Ivan Ljubicic" "Yevgeny Kafelnikov" "Nikolay Davydenko" "Fernando Gonzalez" ...
## $ loser_hand : chr "R" "R" "R" "R" ...
## $ loser_ht : int 193 190 178 183 185 185 183 185 183 170 ...
## $ loser_ioc : chr "CRO" "RUS" "RUS" "CHI" ...
## $ loser_age : num 24.5 29.6 22.3 23.2 17.3 29.1 25.6 22.2 21.3 24.8 ...
## $ score : chr "7-6(7) 7-6(5)" "6-3 RET" "6-3 6-3" "6-3 7-6(3)" ...
## $ best_of : int 3 3 3 3 3 3 3 3 3 3 ...
## $ round : chr "R64" "R64" "R64" "R64" ...
## $ minutes : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_ace : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_df : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_svpt : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_1stIn : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_1stWon : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_2ndWon : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_SvGms : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_bpSaved : int NA NA NA NA NA NA NA NA NA NA ...
## $ w_bpFaced : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_ace : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_df : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_svpt : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_1stIn : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_1stWon : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_2ndWon : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_SvGms : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_bpSaved : int NA NA NA NA NA NA NA NA NA NA ...
## $ l_bpFaced : int NA NA NA NA NA NA NA NA NA NA ...
## $ winner_rank : int 28 146 57 23 127 25 35 37 72 33 ...
## $ winner_rank_points: int 1090 258 660 1170 290 1145 1025 1000 480 1040 ...
## $ loser_rank : int 42 40 43 22 49 30 27 31 29 87 ...
## $ loser_rank_points : int 865 950 855 1190 788 1055 1133 1050 1060 421 ...
## $ winner_seeded : chr "No" "No" "No" "No" ...
## $ loser_seeded : chr "No" "No" "No" "No" ...
levels(as.factor(atp_matches$surface))
## [1] "" "Carpet" "Clay" "Grass" "Hard"
levels(as.factor(atp_matches$winner_entry))
## [1] "" "Alt" "ALT" "LL" "PR" "Q" "SE" "WC"
#creating outcome variable for winners and losers
match_winners = atp_matches
match_winners$outcome = 1
match_losers = atp_matches
match_losers$outcome = 0
#renaming columns to prep for consolidating winners and losers
match_winners = match_winners %>% dplyr::rename(
id = winner_id,
seed = winner_seed,
entry = winner_entry,
name = winner_name,
hand = winner_hand,
ht = winner_ht,
ioc = winner_ioc,
age = winner_age,
opponent_id = loser_id,
opponent_seed = loser_seed,
opponent_entry = loser_entry,
opponent_name = loser_name,
opponent_hand = loser_hand,
opponent_ht = loser_ht,
opponent_ioc = loser_ioc,
opponent_age = loser_age,
ace = w_ace,
df = w_df,
svpt = w_svpt,
firstIn = w_1stIn,
firstWon = w_1stWon,
secWon = w_2ndWon,
SvGms = w_SvGms,
bpSaved = w_bpSaved,
bpFaced = w_bpFaced,
opponent_ace = l_ace,
opponent_df = l_df,
opponent_svpt = l_svpt,
opponent_firstIn = l_1stIn,
opponent_firstWon = l_1stWon,
opponent_secWon = l_2ndWon,
opponent_SvGms = l_SvGms,
opponent_bpSaved = l_bpSaved,
opponent_bpFaced = l_bpFaced,
rank = winner_rank,
rank_points = winner_rank_points,
opponent_rank = loser_rank,
opponent_rank_points = loser_rank_points,
seeded = winner_seeded,
opponent_seeded = loser_seeded
)
match_losers = match_losers %>% dplyr::rename(
id = loser_id,
seed = loser_seed,
entry = loser_entry,
name = loser_name,
hand = loser_hand,
ht = loser_ht,
ioc = loser_ioc,
age = loser_age,
opponent_id = winner_id,
opponent_seed = winner_seed,
opponent_entry = winner_entry,
opponent_name = winner_name,
opponent_hand = winner_hand,
opponent_ht = winner_ht,
opponent_ioc = winner_ioc,
opponent_age = winner_age,
ace = l_ace,
df = l_df,
svpt = l_svpt,
firstIn = l_1stIn,
firstWon = l_1stWon,
secWon = l_2ndWon,
SvGms = l_SvGms,
bpSaved = l_bpSaved,
bpFaced = l_bpFaced,
opponent_ace = w_ace,
opponent_df = w_df,
opponent_svpt = w_svpt,
opponent_firstIn = w_1stIn,
opponent_firstWon = w_1stWon,
opponent_secWon = w_2ndWon,
opponent_SvGms = w_SvGms,
opponent_bpSaved = w_bpSaved,
opponent_bpFaced = w_bpFaced,
rank = loser_rank,
rank_points = loser_rank_points,
opponent_rank = winner_rank,
opponent_rank_points = winner_rank_points,
seeded = loser_seeded,
opponent_seeded = winner_seeded
)
# combine match_winners and losers
match_outcomes <- rbind(match_winners, match_losers)
rm(match_winners,match_losers)
# additional variables
match_outcomes$ht_dif = match_outcomes$ht - match_outcomes$opponent_ht
match_outcomes$age_dif = match_outcomes$age - match_outcomes$opponent_age
match_outcomes$rank_dif = match_outcomes$rank - match_outcomes$opponent_rank
names(match_outcomes)
## [1] "tourney_id" "tourney_name" "surface"
## [4] "draw_size" "tourney_level" "tourney_date"
## [7] "match_num" "id" "seed"
## [10] "entry" "name" "hand"
## [13] "ht" "ioc" "age"
## [16] "opponent_id" "opponent_seed" "opponent_entry"
## [19] "opponent_name" "opponent_hand" "opponent_ht"
## [22] "opponent_ioc" "opponent_age" "score"
## [25] "best_of" "round" "minutes"
## [28] "ace" "df" "svpt"
## [31] "firstIn" "firstWon" "secWon"
## [34] "SvGms" "bpSaved" "bpFaced"
## [37] "opponent_ace" "opponent_df" "opponent_svpt"
## [40] "opponent_firstIn" "opponent_firstWon" "opponent_secWon"
## [43] "opponent_SvGms" "opponent_bpSaved" "opponent_bpFaced"
## [46] "rank" "rank_points" "opponent_rank"
## [49] "opponent_rank_points" "seeded" "opponent_seeded"
## [52] "outcome" "ht_dif" "age_dif"
## [55] "rank_dif"
# create averages of player match outcomes
match_outcomes <- match_outcomes %>%
group_by(id) %>%
mutate(
avg_ace = mean(ace, na.rm = TRUE),
avg_df = mean(df, na.rm = TRUE),
avg_svpt = mean(svpt, na.rm = TRUE),
avg_firstIn = mean(firstIn, na.rm = TRUE),
avg_firstWon = mean(firstWon, na.rm = TRUE),
avg_secWon = mean(secWon, na.rm = TRUE),
avg_SvGms = mean(SvGms, na.rm = TRUE),
avg_bpSaved = mean(bpSaved, na.rm = TRUE),
avg_bpFaced = mean(bpFaced, na.rm = TRUE)
)
match_outcomes <- match_outcomes %>%
group_by(opponent_id) %>%
mutate(
avg_opponent_ace = mean(opponent_ace, na.rm = TRUE),
avg_opponent_df = mean(opponent_df, na.rm = TRUE),
avg_opponent_svpt = mean(opponent_svpt, na.rm = TRUE),
avg_opponent_firstIn = mean(opponent_firstIn, na.rm = TRUE),
avg_opponent_firstWon = mean(opponent_firstWon, na.rm = TRUE),
avg_opponent_secWon = mean(opponent_secWon, na.rm = TRUE),
avg_opponent_SvGms = mean(opponent_SvGms, na.rm = TRUE),
avg_opponent_bpSaved = mean(opponent_bpSaved, na.rm = TRUE),
avg_opponent_bpFaced = mean(opponent_bpFaced, na.rm = TRUE)
)
match_outcomes$surface_clay <- ifelse(match_outcomes$surface == "Clay",1,0)
match_outcomes$surface_grass <- ifelse(match_outcomes$surface == "Grass",1,0)
match_outcomes$surface_hard <- ifelse(match_outcomes$surface == "Hard",1,0)
str(match_outcomes)
## gropd_df [123,864 × 76] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ tourney_id : chr [1:123864] "2003-1536" "2003-1536" "2003-1536" "2003-1536" ...
## $ tourney_name : chr [1:123864] "Madrid Masters" "Madrid Masters" "Madrid Masters" "Madrid Masters" ...
## $ surface : chr [1:123864] "Hard" "Hard" "Hard" "Hard" ...
## $ draw_size : int [1:123864] 48 48 48 48 48 48 48 48 48 48 ...
## $ tourney_level : chr [1:123864] "M" "M" "M" "M" ...
## $ tourney_date : int [1:123864] 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 20031013 ...
## $ match_num : int [1:123864] 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int [1:123864] 101965 102358 102998 102610 102374 103888 103852 103292 103970 102434 ...
## $ seed : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ entry : chr [1:123864] "" "Q" "Q" "" ...
## $ name : chr [1:123864] "Wayne Ferreira" "Thomas Enqvist" "Jan Michael Gambill" "Albert Costa" ...
## $ hand : chr [1:123864] "R" "R" "R" "R" ...
## $ ht : int [1:123864] 185 190 190 180 180 188 188 175 175 183 ...
## $ ioc : chr [1:123864] "RSA" "SWE" "USA" "ESP" ...
## $ age : num [1:123864] 32 29.5 26.3 28.3 29.5 21.8 22 24.8 21.5 29.2 ...
## $ opponent_id : int [1:123864] 103344 102338 103786 103602 104745 102450 103151 103813 104022 103294 ...
## $ opponent_seed : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_entry : chr [1:123864] "" "" "" "" ...
## $ opponent_name : chr [1:123864] "Ivan Ljubicic" "Yevgeny Kafelnikov" "Nikolay Davydenko" "Fernando Gonzalez" ...
## $ opponent_hand : chr [1:123864] "R" "R" "R" "R" ...
## $ opponent_ht : int [1:123864] 193 190 178 183 185 185 183 185 183 170 ...
## $ opponent_ioc : chr [1:123864] "CRO" "RUS" "RUS" "CHI" ...
## $ opponent_age : num [1:123864] 24.5 29.6 22.3 23.2 17.3 29.1 25.6 22.2 21.3 24.8 ...
## $ score : chr [1:123864] "7-6(7) 7-6(5)" "6-3 RET" "6-3 6-3" "6-3 7-6(3)" ...
## $ best_of : int [1:123864] 3 3 3 3 3 3 3 3 3 3 ...
## $ round : chr [1:123864] "R64" "R64" "R64" "R64" ...
## $ minutes : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ ace : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ df : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ svpt : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ firstIn : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ firstWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ secWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ SvGms : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ bpSaved : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ bpFaced : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_ace : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_df : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_svpt : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_firstIn : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_firstWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_secWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_SvGms : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_bpSaved : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_bpFaced : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ rank : int [1:123864] 28 146 57 23 127 25 35 37 72 33 ...
## $ rank_points : int [1:123864] 1090 258 660 1170 290 1145 1025 1000 480 1040 ...
## $ opponent_rank : int [1:123864] 42 40 43 22 49 30 27 31 29 87 ...
## $ opponent_rank_points : int [1:123864] 865 950 855 1190 788 1055 1133 1050 1060 421 ...
## $ seeded : chr [1:123864] "No" "No" "No" "No" ...
## $ opponent_seeded : chr [1:123864] "No" "No" "No" "No" ...
## $ outcome : num [1:123864] 1 1 1 1 1 1 1 1 1 1 ...
## $ ht_dif : int [1:123864] -8 0 12 -3 -5 3 5 -10 -8 13 ...
## $ age_dif : num [1:123864] 7.5 -0.1 4 5.1 12.2 ...
## $ rank_dif : int [1:123864] -14 106 14 1 78 -5 8 6 43 -54 ...
## $ avg_ace : num [1:123864] 7.04 8.12 11.1 4.61 3.62 ...
## $ avg_df : num [1:123864] 2.54 5.34 3.36 2.35 3.11 ...
## $ avg_svpt : num [1:123864] 80.9 85.7 78 84.2 81.2 ...
## $ avg_firstIn : num [1:123864] 46.5 46.5 44.7 52.6 46.5 ...
## $ avg_firstWon : num [1:123864] 33.6 34.6 34 35.5 31.5 ...
## $ avg_secWon : num [1:123864] 18 18.9 15.6 16 17.4 ...
## $ avg_SvGms : num [1:123864] 12.5 13.2 12 12.8 12.5 ...
## $ avg_bpSaved : num [1:123864] 3.97 4.4 4.1 4.84 4.71 ...
## $ avg_bpFaced : num [1:123864] 6.38 7.43 6.61 8.03 8.19 ...
## $ avg_opponent_ace : num [1:123864] 12.28 3.98 3.23 6.89 3.19 ...
## $ avg_opponent_df : num [1:123864] 2.28 3.25 2.59 3.55 1.71 ...
## $ avg_opponent_svpt : num [1:123864] 80.5 79.1 74 81.5 73.6 ...
## $ avg_opponent_firstIn : num [1:123864] 47.9 46.6 49.9 50 50 ...
## $ avg_opponent_firstWon: num [1:123864] 37.2 32.4 34.2 37.2 36.1 ...
## $ avg_opponent_secWon : num [1:123864] 17.1 16.1 12.5 16.4 13.5 ...
## $ avg_opponent_SvGms : num [1:123864] 12.9 12.5 11.7 13 12.1 ...
## $ avg_opponent_bpSaved : num [1:123864] 3.35 3.73 3.93 3.77 3.37 ...
## $ avg_opponent_bpFaced : num [1:123864] 5.12 6.92 6.47 5.92 5.1 ...
## $ surface_clay : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
## $ surface_grass : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
## $ surface_hard : num [1:123864] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "groups")= tibble [2,261 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ opponent_id: int [1:2261] 100644 101316 101404 101532 101662 101723 101736 101746 101750 101774 ...
## ..$ .rows : list<int> [1:2261]
## .. ..$ : int [1:573] 33442 35678 36325 36416 36473 36511 37213 37718 37802 37918 ...
## .. ..$ : int 6453
## .. ..$ : int [1:3] 25319 27592 28329
## .. ..$ : int [1:6] 325 882 2591 2794 62750 62782
## .. ..$ : int [1:2] 12952 12954
## .. ..$ : int [1:3] 1861 2534 63780
## .. ..$ : int [1:176] 225 1164 1271 1377 1610 2183 2337 2469 2661 2665 ...
## .. ..$ : int [1:5] 142 403 983 2619 64509
## .. ..$ : int 648
## .. ..$ : int [1:54] 204 790 907 931 1240 1438 1495 2133 2320 2459 ...
## .. ..$ : int 3110
## .. ..$ : int 19117
## .. ..$ : int [1:2] 2951 3118
## .. ..$ : int [1:2] 1831 63750
## .. ..$ : int [1:2] 2972 64902
## .. ..$ : int [1:27] 218 491 776 1651 1823 1864 2067 2260 2540 3268 ...
## .. ..$ : int [1:5] 2980 3126 3129 9631 64915
## .. ..$ : int [1:49] 48 212 534 2094 2252 2566 2855 3491 3599 3618 ...
## .. ..$ : int 3231
## .. ..$ : int 4723
## .. ..$ : int [1:134] 162 180 242 358 508 821 941 976 1038 1118 ...
## .. ..$ : int [1:4] 2897 3031 68401 71378
## .. ..$ : int [1:3] 6423 6463 68397
## .. ..$ : int [1:14] 743 852 1952 2000 2050 2589 2727 2799 62661 62752 ...
## .. ..$ : int [1:7] 259 564 3579 3951 4432 5515 67414
## .. ..$ : int [1:4] 1804 6346 63727 64912
## .. ..$ : int 4270
## .. ..$ : int 2922
## .. ..$ : int [1:108] 46 300 384 434 662 696 755 909 913 1011 ...
## .. ..$ : int [1:17] 194 935 1884 3372 3409 4069 4238 4275 4289 4436 ...
## .. ..$ : int [1:78] 17 613 616 693 759 798 894 943 1085 1158 ...
## .. ..$ : int [1:2] 3124 65054
## .. ..$ : int [1:2] 3074 81153
## .. ..$ : int [1:21] 134 219 595 644 827 928 994 1031 1689 2607 ...
## .. ..$ : int 6449
## .. ..$ : int [1:12] 802 881 929 1279 1386 1467 1478 1732 2085 2352 ...
## .. ..$ : int [1:3] 59 189 2223
## .. ..$ : int [1:6] 6365 12992 16143 19247 22372 74926
## .. ..$ : int [1:235] 27 313 580 721 860 992 1032 1068 1588 1604 ...
## .. ..$ : int [1:6] 3119 6393 6456 65053 68324 68386
## .. ..$ : int [1:3] 3063 6241 65105
## .. ..$ : int [1:3] 2923 68316 68317
## .. ..$ : int 2948
## .. ..$ : int [1:3] 3052 64866 64869
## .. ..$ : int [1:4] 1812 6341 64919 65059
## .. ..$ : int [1:171] 57 91 129 203 568 668 738 804 890 915 ...
## .. ..$ : int [1:18] 144 1569 2943 3163 3170 3643 4800 9636 65094 65559 ...
## .. ..$ : int [1:7] 1362 4569 9501 11229 63271 64975 65122
## .. ..$ : int [1:2] 64863 64980
## .. ..$ : int [1:20] 75 270 1319 1452 3266 3539 3578 3743 6575 6837 ...
## .. ..$ : int [1:267] 51 487 676 695 782 927 999 1045 1107 1169 ...
## .. ..$ : int [1:2] 1463 63380
## .. ..$ : int [1:108] 93 137 221 246 351 409 643 655 739 867 ...
## .. ..$ : int [1:6] 1632 1760 1976 2582 2912 5178
## .. ..$ : int [1:153] 66 96 112 143 543 794 836 1020 1291 1471 ...
## .. ..$ : int 121
## .. ..$ : int 1764
## .. ..$ : int [1:112] 116 238 385 426 562 789 840 921 1039 1144 ...
## .. ..$ : int [1:42] 1909 2120 2508 3683 3959 4917 5055 5618 5709 6009 ...
## .. ..$ : int [1:14] 2859 6320 9417 19204 64859 64861 64979 64982 68242 68244 ...
## .. ..$ : int 366
## .. ..$ : int [1:2] 1014 2037
## .. ..$ : int [1:10] 1025 1179 1765 1784 2024 2032 2580 62943 63939 63950
## .. ..$ : int 64913
## .. ..$ : int [1:2] 19244 22314
## .. ..$ : int [1:138] 217 1266 1303 1364 1425 2061 2289 2358 3202 3204 ...
## .. ..$ : int [1:12] 2065 2558 2861 3481 4894 5166 5627 5812 67084 67093 ...
## .. ..$ : int [1:5] 3165 9551 12956 13014 64998
## .. ..$ : int [1:29] 285 511 2382 2845 3476 4055 4140 4483 4519 5167 ...
## .. ..$ : int [1:81] 206 312 328 666 718 753 783 1013 1432 1517 ...
## .. ..$ : int 10310
## .. ..$ : int [1:14] 1363 1482 1647 1879 3064 3065 6255 63274 63569 65106 ...
## .. ..$ : int [1:83] 310 332 372 457 727 1074 1194 1535 1699 1835 ...
## .. ..$ : int [1:26] 532 2594 2929 3741 6306 6470 6473 7017 9590 9593 ...
## .. ..$ : int [1:16] 156 2864 3280 3761 4938 5692 6641 8181 14987 15932 ...
## .. ..$ : int [1:204] 452 675 724 758 864 919 990 1601 2475 2577 ...
## .. ..$ : int [1:2] 3076 64891
## .. ..$ : int [1:55] 2 170 768 887 945 1002 1037 1079 1224 1265 ...
## .. ..$ : int [1:79] 29 118 556 683 824 1060 1111 1177 1380 1405 ...
## .. ..$ : int [1:94] 18 175 591 620 657 773 896 937 991 1030 ...
## .. ..$ : int [1:2] 3005 6339
## .. ..$ : int [1:20] 354 398 1097 1761 2119 2268 2384 3961 5020 5290 ...
## .. ..$ : int [1:75] 21 244 376 435 458 765 853 946 979 1042 ...
## .. ..$ : int [1:46] 50 387 513 1528 2205 2388 3598 5716 10165 13097 ...
## .. ..$ : int [1:19] 178 280 1659 1825 2612 2863 3349 3775 4384 5118 ...
## .. ..$ : int [1:2] 15949 26231
## .. ..$ : int [1:33] 182 278 547 597 617 1132 1256 1317 1335 1407 ...
## .. ..$ : int 19203
## .. ..$ : int [1:317] 37 102 577 633 688 816 857 973 1098 1120 ...
## .. ..$ : int [1:28] 150 172 309 322 599 645 822 982 1021 1114 ...
## .. ..$ : int [1:225] 6 160 224 501 583 855 955 984 1155 1167 ...
## .. ..$ : int [1:114] 33 107 422 478 699 760 875 936 1070 1134 ...
## .. ..$ : int [1:2] 1234 2896
## .. ..$ : int 6215
## .. ..$ : int [1:6] 2109 2194 2262 2598 5753 64467
## .. ..$ : int [1:2] 3114 65041
## .. ..$ : int 3582
## .. ..$ : int [1:11] 368 423 1522 1791 2035 2216 62284 62339 64129 64140 ...
## .. ..$ : int [1:79] 151 240 375 438 462 498 665 1867 2096 2246 ...
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
summary(match_outcomes)
## tourney_id tourney_name surface draw_size
## Length:123864 Length:123864 Length:123864 Min. : 2.00
## Class :character Class :character Class :character 1st Qu.: 32.00
## Mode :character Mode :character Mode :character Median : 32.00
## Mean : 55.35
## 3rd Qu.: 64.00
## Max. :128.00
##
## tourney_level tourney_date match_num id
## Length:123864 Min. :20021230 Min. : 1.0 Min. :100644
## Class :character 1st Qu.:20070813 1st Qu.: 12.0 1st Qu.:103819
## Mode :character Median :20120827 Median : 31.0 Median :104607
## Mean :20126107 Mean : 106.8 Mean :110110
## 3rd Qu.:20180202 3rd Qu.: 247.0 3rd Qu.:105583
## Max. :20231127 Max. :1701.0 Max. :212051
##
## seed entry name hand
## Min. : 1 Length:123864 Length:123864 Length:123864
## 1st Qu.: 3 Class :character Class :character Class :character
## Median : 6 Mode :character Mode :character Mode :character
## Mean : 8
## 3rd Qu.:10
## Max. :35
## NA's :83347
## ht ioc age opponent_id
## Min. : 71.0 Length:123864 Min. :14.50 Min. :100644
## 1st Qu.:183.0 Class :character 1st Qu.:23.60 1st Qu.:103819
## Median :185.0 Mode :character Median :26.40 Median :104607
## Mean :186.1 Mean :26.54 Mean :110110
## 3rd Qu.:190.0 3rd Qu.:29.30 3rd Qu.:105583
## Max. :211.0 Max. :46.00 Max. :212051
## NA's :3746 NA's :11
## opponent_seed opponent_entry opponent_name opponent_hand
## Min. : 1 Length:123864 Length:123864 Length:123864
## 1st Qu.: 3 Class :character Class :character Class :character
## Median : 6 Mode :character Mode :character Mode :character
## Mean : 8
## 3rd Qu.:10
## Max. :35
## NA's :83347
## opponent_ht opponent_ioc opponent_age score
## Min. : 71.0 Length:123864 Min. :14.50 Length:123864
## 1st Qu.:183.0 Class :character 1st Qu.:23.60 Class :character
## Median :185.0 Mode :character Median :26.40 Mode :character
## Mean :186.1 Mean :26.54
## 3rd Qu.:190.0 3rd Qu.:29.30
## Max. :211.0 Max. :46.00
## NA's :3746 NA's :11
## best_of round minutes ace
## Min. :3.000 Length:123864 Min. : 0.0 Min. : 0.000
## 1st Qu.:3.000 Class :character 1st Qu.: 77.0 1st Qu.: 2.000
## Median :3.000 Mode :character Median : 100.0 Median : 5.000
## Mean :3.453 Mean : 107.8 Mean : 6.067
## 3rd Qu.:3.000 3rd Qu.: 131.0 3rd Qu.: 8.000
## Max. :5.000 Max. :1146.0 Max. :113.000
## NA's :13556 NA's :10624
## df svpt firstIn firstWon
## Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 58.00 1st Qu.: 35.00 1st Qu.: 25.00
## Median : 2.000 Median : 74.00 Median : 45.00 Median : 32.00
## Mean : 2.939 Mean : 79.52 Mean : 48.62 Mean : 34.63
## 3rd Qu.: 4.000 3rd Qu.: 96.00 3rd Qu.: 59.00 3rd Qu.: 42.00
## Max. :26.000 Max. :491.00 Max. :361.00 Max. :292.00
## NA's :10624 NA's :10624 NA's :10624 NA's :10624
## secWon SvGms bpSaved bpFaced
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 11.00 1st Qu.: 9.00 1st Qu.: 2.000 1st Qu.: 3.000
## Median : 15.00 Median :11.00 Median : 3.000 Median : 6.000
## Mean : 15.65 Mean :12.43 Mean : 4.088 Mean : 6.769
## 3rd Qu.: 20.00 3rd Qu.:15.00 3rd Qu.: 6.000 3rd Qu.: 9.000
## Max. :101.00 Max. :91.00 Max. :27.000 Max. :38.000
## NA's :10624 NA's :10622 NA's :10624 NA's :10624
## opponent_ace opponent_df opponent_svpt opponent_firstIn
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 58.00 1st Qu.: 35.00
## Median : 5.000 Median : 2.000 Median : 74.00 Median : 45.00
## Mean : 6.067 Mean : 2.939 Mean : 79.52 Mean : 48.62
## 3rd Qu.: 8.000 3rd Qu.: 4.000 3rd Qu.: 96.00 3rd Qu.: 59.00
## Max. :113.000 Max. :26.000 Max. :491.00 Max. :361.00
## NA's :10624 NA's :10624 NA's :10624 NA's :10624
## opponent_firstWon opponent_secWon opponent_SvGms opponent_bpSaved
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 25.00 1st Qu.: 11.00 1st Qu.: 9.00 1st Qu.: 2.000
## Median : 32.00 Median : 15.00 Median :11.00 Median : 3.000
## Mean : 34.63 Mean : 15.65 Mean :12.43 Mean : 4.088
## 3rd Qu.: 42.00 3rd Qu.: 20.00 3rd Qu.:15.00 3rd Qu.: 6.000
## Max. :292.00 Max. :101.00 Max. :91.00 Max. :27.000
## NA's :10624 NA's :10624 NA's :10622 NA's :10624
## opponent_bpFaced rank rank_points opponent_rank
## Min. : 0.000 Min. : 1.00 Min. : 1 Min. : 1.00
## 1st Qu.: 3.000 1st Qu.: 25.00 1st Qu.: 510 1st Qu.: 25.00
## Median : 6.000 Median : 56.00 Median : 825 Median : 56.00
## Mean : 6.769 Mean : 98.58 Mean : 1342 Mean : 98.58
## 3rd Qu.: 9.000 3rd Qu.: 100.00 3rd Qu.: 1405 3rd Qu.: 100.00
## Max. :38.000 Max. :2159.00 Max. :16950 Max. :2159.00
## NA's :10624 NA's :1567 NA's :1567 NA's :1567
## opponent_rank_points seeded opponent_seeded outcome
## Min. : 1 Length:123864 Length:123864 Min. :0.0
## 1st Qu.: 510 Class :character Class :character 1st Qu.:0.0
## Median : 825 Mode :character Mode :character Median :0.5
## Mean : 1342 Mean :0.5
## 3rd Qu.: 1405 3rd Qu.:1.0
## Max. :16950 Max. :1.0
## NA's :1567
## ht_dif age_dif rank_dif avg_ace
## Min. :-112 Min. :-25.9 Min. :-2125 Min. : 0.000
## 1st Qu.: -5 1st Qu.: -3.7 1st Qu.: -43 1st Qu.: 3.786
## Median : 0 Median : 0.0 Median : 0 Median : 5.429
## Mean : 0 Mean : 0.0 Mean : 0 Mean : 6.021
## 3rd Qu.: 5 3rd Qu.: 3.7 3rd Qu.: 43 3rd Qu.: 7.670
## Max. : 112 Max. : 25.9 Max. : 2125 Max. :19.821
## NA's :5792 NA's :22 NA's :2746 NA's :2469
## avg_df avg_svpt avg_firstIn avg_firstWon
## Min. : 0.000 Min. : 20.00 Min. : 11.00 Min. : 6.00
## 1st Qu.: 2.332 1st Qu.: 77.35 1st Qu.: 46.57 1st Qu.:32.56
## Median : 2.803 Median : 79.49 Median : 48.54 Median :34.43
## Mean : 2.946 Mean : 79.49 Mean : 48.60 Mean :34.55
## 3rd Qu.: 3.461 3rd Qu.: 81.52 3rd Qu.: 50.45 3rd Qu.:36.23
## Max. :15.000 Max. :163.00 Max. :123.50 Max. :74.50
## NA's :2469 NA's :2469 NA's :2469 NA's :2469
## avg_secWon avg_SvGms avg_bpSaved avg_bpFaced
## Min. : 0.00 Min. : 4.00 Min. : 0.000 Min. : 0.000
## 1st Qu.:14.48 1st Qu.:12.09 1st Qu.: 3.797 1st Qu.: 6.100
## Median :15.65 Median :12.41 Median : 4.107 Median : 6.798
## Mean :15.62 Mean :12.42 Mean : 4.106 Mean : 6.810
## 3rd Qu.:16.77 3rd Qu.:12.77 3rd Qu.: 4.440 3rd Qu.: 7.500
## Max. :39.00 Max. :25.00 Max. :18.000 Max. :24.000
## NA's :2469 NA's :2469 NA's :2469 NA's :2469
## avg_opponent_ace avg_opponent_df avg_opponent_svpt avg_opponent_firstIn
## Min. : 0.000 Min. : 0.000 Min. : 20.00 Min. : 11.00
## 1st Qu.: 3.786 1st Qu.: 2.332 1st Qu.: 77.35 1st Qu.: 46.57
## Median : 5.429 Median : 2.803 Median : 79.49 Median : 48.54
## Mean : 6.021 Mean : 2.946 Mean : 79.49 Mean : 48.60
## 3rd Qu.: 7.670 3rd Qu.: 3.461 3rd Qu.: 81.52 3rd Qu.: 50.45
## Max. :19.821 Max. :15.000 Max. :163.00 Max. :123.50
## NA's :2469 NA's :2469 NA's :2469 NA's :2469
## avg_opponent_firstWon avg_opponent_secWon avg_opponent_SvGms
## Min. : 6.00 Min. : 0.00 Min. : 4.00
## 1st Qu.:32.56 1st Qu.:14.48 1st Qu.:12.09
## Median :34.43 Median :15.65 Median :12.41
## Mean :34.55 Mean :15.62 Mean :12.42
## 3rd Qu.:36.23 3rd Qu.:16.77 3rd Qu.:12.77
## Max. :74.50 Max. :39.00 Max. :25.00
## NA's :2469 NA's :2469 NA's :2469
## avg_opponent_bpSaved avg_opponent_bpFaced surface_clay surface_grass
## Min. : 0.000 Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 3.797 1st Qu.: 6.100 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 4.107 Median : 6.798 Median :0.0000 Median :0.0000
## Mean : 4.106 Mean : 6.810 Mean :0.3211 Mean :0.1036
## 3rd Qu.: 4.440 3rd Qu.: 7.500 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :18.000 Max. :24.000 Max. :1.0000 Max. :1.0000
## NA's :2469 NA's :2469
## surface_hard
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5529
## 3rd Qu.:1.0000
## Max. :1.0000
##
## match_outcomes has all of my variables ##
# new DF for modeling
tennis_df = match_outcomes
# Removing pointless variables
tennis_df = select(tennis_df, -seed, -entry, -opponent_seed, -opponent_entry,
-tourney_id,-tourney_name,-tourney_date,-match_num,-name,-ioc,
-opponent_name,-opponent_ioc,-score,-round,-best_of,-surface)
# Removing variables related to match outcome
tennis_df = select(tennis_df,-minutes,-ace,-df,-svpt,-firstIn,-firstWon,-secWon,-SvGms,-bpSaved,
-bpFaced,-opponent_ace,-opponent_df,-opponent_svpt,-opponent_firstIn,
-opponent_firstWon,-opponent_secWon,-opponent_SvGms,-opponent_bpSaved,
-opponent_bpFaced)
# Removing height, age, rank since I have the differences instead. also rank points
tennis_df = select(tennis_df,-ht,-age,-rank,-rank_points,-opponent_ht,-opponent_age,-opponent_rank,
-opponent_rank_points)
# Removing IDs
tennis_df = ungroup(tennis_df)
tennis_df = select(tennis_df,-id,-opponent_id)
#coding U (unknown) hand as NAs
tennis_df$hand = ifelse(tennis_df$hand == 'U',NA, tennis_df$hand)
tennis_df$opponent_hand = ifelse(tennis_df$opponent_hand == 'U',NA, tennis_df$opponent_hand)
# remove rows containing missing values
tennis_df = na.omit(tennis_df) #123,864 to 116,166
# Looking at the relationships between the numeric variables
tennis_df_num <- dplyr::select_if(tennis_df, is.numeric)
corrplot(cor(tennis_df_num), method = c("number"), type = c("lower"),
number.cex=0.65, tl.cex = 0.65,tl.col="black")

#pairs(tennis_df_num)
summary(tennis_df_num)
## draw_size outcome ht_dif age_dif rank_dif
## Min. : 2.00 Min. :0.0 Min. :-41 Min. :-25.9 Min. :-2094
## 1st Qu.: 32.00 1st Qu.:0.0 1st Qu.: -5 1st Qu.: -3.7 1st Qu.: -41
## Median : 32.00 Median :0.5 Median : 0 Median : 0.0 Median : 0
## Mean : 57.93 Mean :0.5 Mean : 0 Mean : 0.0 Mean : 0
## 3rd Qu.: 64.00 3rd Qu.:1.0 3rd Qu.: 5 3rd Qu.: 3.7 3rd Qu.: 41
## Max. :128.00 Max. :1.0 Max. : 41 Max. : 25.9 Max. : 2094
## avg_ace avg_df avg_svpt avg_firstIn
## Min. : 0.000 Min. : 0.000 Min. : 36.00 Min. : 16.00
## 1st Qu.: 3.842 1st Qu.: 2.331 1st Qu.: 77.39 1st Qu.: 46.60
## Median : 5.482 Median : 2.799 Median : 79.50 Median : 48.54
## Mean : 6.078 Mean : 2.932 Mean : 79.55 Mean : 48.64
## 3rd Qu.: 7.712 3rd Qu.: 3.430 3rd Qu.: 81.45 3rd Qu.: 50.33
## Max. :19.821 Max. :10.000 Max. :159.00 Max. :100.00
## avg_firstWon avg_secWon avg_SvGms avg_bpSaved
## Min. : 8.00 Min. : 1.00 Min. : 6.00 Min. : 0.000
## 1st Qu.:32.65 1st Qu.:14.55 1st Qu.:12.11 1st Qu.: 3.797
## Median :34.57 Median :15.66 Median :12.42 Median : 4.100
## Mean :34.66 Mean :15.67 Mean :12.44 Mean : 4.087
## 3rd Qu.:36.23 3rd Qu.:16.78 3rd Qu.:12.77 3rd Qu.: 4.437
## Max. :62.00 Max. :39.00 Max. :25.00 Max. :15.000
## avg_bpFaced avg_opponent_ace avg_opponent_df avg_opponent_svpt
## Min. : 1.000 Min. : 0.000 Min. : 0.000 Min. : 36.00
## 1st Qu.: 6.099 1st Qu.: 3.842 1st Qu.: 2.331 1st Qu.: 77.39
## Median : 6.793 Median : 5.482 Median : 2.799 Median : 79.50
## Mean : 6.763 Mean : 6.078 Mean : 2.932 Mean : 79.55
## 3rd Qu.: 7.470 3rd Qu.: 7.712 3rd Qu.: 3.430 3rd Qu.: 81.45
## Max. :23.000 Max. :19.821 Max. :10.000 Max. :159.00
## avg_opponent_firstIn avg_opponent_firstWon avg_opponent_secWon
## Min. : 16.00 Min. : 8.00 Min. : 1.00
## 1st Qu.: 46.60 1st Qu.:32.65 1st Qu.:14.55
## Median : 48.54 Median :34.57 Median :15.66
## Mean : 48.64 Mean :34.66 Mean :15.67
## 3rd Qu.: 50.33 3rd Qu.:36.23 3rd Qu.:16.78
## Max. :100.00 Max. :62.00 Max. :39.00
## avg_opponent_SvGms avg_opponent_bpSaved avg_opponent_bpFaced surface_clay
## Min. : 6.00 Min. : 0.000 Min. : 1.000 Min. :0.0000
## 1st Qu.:12.11 1st Qu.: 3.797 1st Qu.: 6.099 1st Qu.:0.0000
## Median :12.42 Median : 4.100 Median : 6.793 Median :0.0000
## Mean :12.44 Mean : 4.087 Mean : 6.763 Mean :0.3187
## 3rd Qu.:12.77 3rd Qu.: 4.437 3rd Qu.: 7.470 3rd Qu.:1.0000
## Max. :25.00 Max. :15.000 Max. :23.000 Max. :1.0000
## surface_grass surface_hard
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000
## Mean :0.1074 Mean :0.5544
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
high_correlation <- which(cor(tennis_df_num) > 0.8 & cor(tennis_df_num) < 1, arr.ind = TRUE)
print(high_correlation)
## row col
## avg_SvGms 12 8
## avg_SvGms 12 10
## avg_svpt 8 12
## avg_firstWon 10 12
## avg_bpFaced 14 13
## avg_bpSaved 13 14
## avg_opponent_SvGms 21 17
## avg_opponent_SvGms 21 19
## avg_opponent_svpt 17 21
## avg_opponent_firstWon 19 21
## avg_opponent_bpFaced 23 22
## avg_opponent_bpSaved 22 23
# bp faced & saved are highly correlated (greater than 0.9), will remove faced
tennis_df = select(tennis_df,-avg_bpFaced,-avg_opponent_bpFaced)
# service points and service games are highly correlated (0.88) will remove games
tennis_df = select(tennis_df,-avg_SvGms,-avg_opponent_SvGms)
# Look again
tennis_df_num <- dplyr::select_if(tennis_df, is.numeric)
corrplot(cor(tennis_df_num), method = c("number"), type = c("lower"),
number.cex=0.65, tl.cex = 0.65,tl.col="black")

# Data Type Changes
#tennis_df$tourney_date = as.Date(as.character(tennis_df$tourney_date),format = "%Y%m%d")
#tennis_df$surface = as.factor(tennis_df$surface)
tennis_df$tourney_level = as.factor(tennis_df$tourney_level)
tennis_df$hand = as.factor(tennis_df$hand)
tennis_df$opponent_hand = as.factor(tennis_df$opponent_hand)
tennis_df$seeded = as.factor(tennis_df$seeded)
tennis_df$opponent_seeded = as.factor(tennis_df$opponent_seeded)
tennis_df$outcome = as.factor(tennis_df$outcome)
tennis_df$surface_clay = as.factor(tennis_df$surface_clay)
tennis_df$surface_grass = as.factor(tennis_df$surface_grass)
tennis_df$surface_hard = as.factor(tennis_df$surface_hard)
str(tennis_df)
## tibble [116,974 × 27] (S3: tbl_df/tbl/data.frame)
## $ draw_size : int [1:116974] 48 48 48 48 48 48 48 48 48 48 ...
## $ tourney_level : Factor w/ 5 levels "A","D","F","G",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ hand : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 1 2 2 2 ...
## $ opponent_hand : Factor w/ 2 levels "L","R": 2 2 2 2 1 2 2 1 2 2 ...
## $ seeded : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ opponent_seeded : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ outcome : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ht_dif : int [1:116974] -8 0 12 -3 -5 3 5 -10 -8 13 ...
## $ age_dif : num [1:116974] 7.5 -0.1 4 5.1 12.2 ...
## $ rank_dif : int [1:116974] -14 106 14 1 78 -5 8 6 43 -54 ...
## $ avg_ace : num [1:116974] 7.04 8.12 11.1 4.61 3.62 ...
## $ avg_df : num [1:116974] 2.54 5.34 3.36 2.35 3.11 ...
## $ avg_svpt : num [1:116974] 80.9 85.7 78 84.2 81.2 ...
## $ avg_firstIn : num [1:116974] 46.5 46.5 44.7 52.6 46.5 ...
## $ avg_firstWon : num [1:116974] 33.6 34.6 34 35.5 31.5 ...
## $ avg_secWon : num [1:116974] 18 18.9 15.6 16 17.4 ...
## $ avg_bpSaved : num [1:116974] 3.97 4.4 4.1 4.84 4.71 ...
## $ avg_opponent_ace : num [1:116974] 12.28 3.98 3.23 6.89 3.19 ...
## $ avg_opponent_df : num [1:116974] 2.28 3.25 2.59 3.55 1.71 ...
## $ avg_opponent_svpt : num [1:116974] 80.5 79.1 74 81.5 73.6 ...
## $ avg_opponent_firstIn : num [1:116974] 47.9 46.6 49.9 50 50 ...
## $ avg_opponent_firstWon: num [1:116974] 37.2 32.4 34.2 37.2 36.1 ...
## $ avg_opponent_secWon : num [1:116974] 17.1 16.1 12.5 16.4 13.5 ...
## $ avg_opponent_bpSaved : num [1:116974] 3.35 3.73 3.93 3.77 3.37 ...
## $ surface_clay : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ surface_grass : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ surface_hard : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:6890] 85 121 295 409 425 514 618 734 736 1227 ...
## ..- attr(*, "names")= chr [1:6890] "85" "121" "295" "409" ...
names(tennis_df)
## [1] "draw_size" "tourney_level" "hand"
## [4] "opponent_hand" "seeded" "opponent_seeded"
## [7] "outcome" "ht_dif" "age_dif"
## [10] "rank_dif" "avg_ace" "avg_df"
## [13] "avg_svpt" "avg_firstIn" "avg_firstWon"
## [16] "avg_secWon" "avg_bpSaved" "avg_opponent_ace"
## [19] "avg_opponent_df" "avg_opponent_svpt" "avg_opponent_firstIn"
## [22] "avg_opponent_firstWon" "avg_opponent_secWon" "avg_opponent_bpSaved"
## [25] "surface_clay" "surface_grass" "surface_hard"
### Splitting into train and test
set.seed(12)
index = sample(nrow(tennis_df), 0.8*nrow(tennis_df), replace = F) # 80/20 split
tennis_train = tennis_df[index,]
tennis_test = tennis_df[-index,]
#checking for balance
table(tennis_train$outcome)
##
## 0 1
## 46743 46836
#0 1
#46477 46455
table(tennis_test$outcome)
##
## 0 1
## 11744 11651
#0 1
#11606 11628
str(tennis_train)
## tibble [93,579 × 27] (S3: tbl_df/tbl/data.frame)
## $ draw_size : int [1:93579] 56 128 32 32 32 48 64 128 128 32 ...
## $ tourney_level : Factor w/ 5 levels "A","D","F","G",..: 1 4 1 1 1 1 5 4 4 1 ...
## $ hand : Factor w/ 2 levels "L","R": 2 2 2 1 2 2 2 2 2 2 ...
## $ opponent_hand : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 1 2 2 2 ...
## $ seeded : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 2 2 2 ...
## $ opponent_seeded : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 2 1 1 1 1 ...
## $ outcome : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 2 2 ...
## $ ht_dif : int [1:93579] -13 0 -5 10 -15 18 3 -3 0 0 ...
## $ age_dif : num [1:93579] -6 -3.8 9.4 -2.9 -0.7 ...
## $ rank_dif : int [1:93579] -22 -50 -143 78 51 53 62 -42 -66 -107 ...
## $ avg_ace : num [1:93579] 4.22 3.88 7.29 7.23 4.11 ...
## $ avg_df : num [1:93579] 3.32 3.25 3.27 2.7 1.84 ...
## $ avg_svpt : num [1:93579] 76.5 80.4 79.8 79.3 79.3 ...
## $ avg_firstIn : num [1:93579] 45 47.5 42.4 48 51.7 ...
## $ avg_firstWon : num [1:93579] 30.7 33.1 31.7 35.2 35.9 ...
## $ avg_secWon : num [1:93579] 15.7 17.2 19.3 15.4 14.5 ...
## $ avg_bpSaved : num [1:93579] 4.32 4.07 3.82 3.81 4.18 ...
## $ avg_opponent_ace : num [1:93579] 4.84 4.88 8.21 6.49 9.61 ...
## $ avg_opponent_df : num [1:93579] 1.64 2.75 4.46 2.48 5.24 ...
## $ avg_opponent_svpt : num [1:93579] 79.6 76.1 79.6 78.4 86.8 ...
## $ avg_opponent_firstIn : num [1:93579] 55 47 46.4 47.5 53.8 ...
## $ avg_opponent_firstWon: num [1:93579] 38.1 31.3 34.2 35.5 40.2 ...
## $ avg_opponent_secWon : num [1:93579] 12.4 13.3 16.2 17 16.2 ...
## $ avg_opponent_bpSaved : num [1:93579] 4.18 3.88 3.61 3.72 4.32 ...
## $ surface_clay : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 2 1 ...
## $ surface_grass : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
## $ surface_hard : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:6890] 85 121 295 409 425 514 618 734 736 1227 ...
## ..- attr(*, "names")= chr [1:6890] "85" "121" "295" "409" ...
#frequency table for tennis_train$outcome
train_outcome_table <- table(tennis_train$outcome)
# frequency table for tennis_test$outcome
test_outcome_table <- table(tennis_test$outcome)
# Create a dataframe for visualization
outcome_data <- data.frame(
dataset = rep(c("Train", "Test"), each = 2),
outcome = rep(c("0", "1"), times = 2),
count = c(train_outcome_table, test_outcome_table)
)
# Create bar graph using ggplot
ggplot(outcome_data, aes(x = outcome, y = count, fill = dataset)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = "Distribution of Outcome in Train and Test Datasets",
x = "Outcome",
y = "Count",
fill = "Dataset") +
theme_minimal()

# Histograms to understand skew
dev.new(width = 1500, height = 1000, unit = "px")
par(mfrow = c(3, 4))
hist(tennis_train$draw_size, main = "Distribution of Draw Size", xlab = "")
hist(tennis_train$ht_dif, main = "Distribution of Height Difference", xlab = "")
hist(tennis_train$age_dif, main = "Distribution of Age Difference", xlab = "")
hist(tennis_train$rank_dif, main = "Distribution of Rank Difference", xlab = "")
hist(tennis_train$avg_ace, main = "Distribution of Average Aces", xlab = "") #right skew
hist(tennis_train$avg_df, main = "Distribution of Average Double Faults", xlab = "") #right skew
hist(tennis_train$avg_svpt, main = "Distribution of Average Serve Points", xlab = "")
hist(tennis_train$avg_ace, main = "Distribution of Average Aces", xlab = "")
hist(tennis_train$avg_firstIn, main = "Distribution of Avg 1st Serve In", xlab = "")
hist(tennis_train$avg_firstWon, main = "Distribution of Avg 1st Serve Won", xlab = "")
hist(tennis_train$avg_secWon, main = "Distribution of Avg 2nd Serve Won", xlab = "")
hist(tennis_train$avg_bpSaved, main = "Distribution of Avg BP Saved", xlab = "")
library(ggplot2)
# Prettier Histograms
ggplot(tennis_train, aes(x = draw_size)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
labs(title = "Distribution of Draw Size", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = ht_dif)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Height Difference", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = age_dif)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Age Difference", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = rank_dif)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Rank Difference", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_ace)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Average Aces", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_df)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Average Double Faults", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_svpt)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Average Serve Points", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_ace)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Average Aces", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_firstIn)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
labs(title = "Distribution of Avg 1st Serve In", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_firstWon)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
labs(title = "Distribution of Avg 1st Serve Won", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_secWon)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
labs(title = "Distribution of Avg 2nd Serve Won", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
ggplot(tennis_train, aes(x = avg_bpSaved)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
labs(title = "Distribution of Avg BP Saved", x = "") +
theme_minimal() +
scale_y_continuous(NULL)
####
#### Which variables impact whether a tennis player wins a match?
####
#Logistic Regression
log.all = glm(outcome ~ .,data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ ., family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.518e-01 2.182e-01 -0.696 0.486695
## draw_size 2.518e-05 5.331e-04 0.047 0.962321
## tourney_levelD 1.219e-02 3.818e-02 0.319 0.749473
## tourney_levelF 6.091e-02 9.077e-02 0.671 0.502164
## tourney_levelG 1.467e-03 5.329e-02 0.028 0.978036
## tourney_levelM 2.358e-03 2.942e-02 0.080 0.936118
## handR 7.998e-02 2.244e-02 3.564 0.000365 ***
## opponent_handR -1.012e-01 2.242e-02 -4.514 6.36e-06 ***
## seededYes 3.691e-01 1.749e-02 21.101 < 2e-16 ***
## opponent_seededYes -3.637e-01 1.750e-02 -20.787 < 2e-16 ***
## ht_dif 2.046e-03 1.107e-03 1.847 0.064683 .
## age_dif -1.126e-02 1.324e-03 -8.503 < 2e-16 ***
## rank_dif -2.770e-03 7.860e-05 -35.243 < 2e-16 ***
## avg_ace -1.435e-01 6.415e-03 -22.374 < 2e-16 ***
## avg_df 3.451e-02 1.094e-02 3.156 0.001599 **
## avg_svpt -1.751e-01 9.968e-03 -17.570 < 2e-16 ***
## avg_firstIn -3.251e-02 1.285e-02 -2.531 0.011375 *
## avg_firstWon 3.151e-01 1.267e-02 24.874 < 2e-16 ***
## avg_secWon 2.834e-01 1.676e-02 16.901 < 2e-16 ***
## avg_bpSaved 7.039e-02 3.037e-02 2.318 0.020451 *
## avg_opponent_ace 1.399e-01 6.376e-03 21.939 < 2e-16 ***
## avg_opponent_df -3.780e-02 1.092e-02 -3.462 0.000537 ***
## avg_opponent_svpt 1.858e-01 1.004e-02 18.499 < 2e-16 ***
## avg_opponent_firstIn 2.220e-02 1.288e-02 1.724 0.084757 .
## avg_opponent_firstWon -3.097e-01 1.261e-02 -24.560 < 2e-16 ***
## avg_opponent_secWon -3.029e-01 1.689e-02 -17.932 < 2e-16 ***
## avg_opponent_bpSaved -8.791e-02 3.042e-02 -2.889 0.003861 **
## surface_clay1 4.946e-02 5.313e-02 0.931 0.351840
## surface_grass1 3.664e-02 5.660e-02 0.647 0.517349
## surface_hard1 3.989e-02 5.255e-02 0.759 0.447839
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 114378 on 93549 degrees of freedom
## AIC: 114438
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: first removing avg_opponent_firstIn (43.77)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 8.868828 1 2.978058
## tourney_level 10.339998 4 1.339106
## hand 1.119116 1 1.057883
## opponent_hand 1.121494 1 1.059006
## seeded 1.354503 1 1.163831
## opponent_seeded 1.356026 1 1.164485
## ht_dif 2.222873 1 1.490930
## age_dif 1.039727 1 1.019670
## rank_dif 1.253642 1 1.119661
## avg_ace 8.134836 1 2.852163
## avg_df 1.772299 1 1.331277
## avg_svpt 30.586788 1 5.530532
## avg_firstIn 42.934662 1 6.552455
## avg_firstWon 32.628871 1 5.712169
## avg_secWon 16.722916 1 4.089366
## avg_bpSaved 5.617343 1 2.370093
## avg_opponent_ace 8.099690 1 2.845995
## avg_opponent_df 1.768078 1 1.329691
## avg_opponent_svpt 31.410080 1 5.604470
## avg_opponent_firstIn 43.595122 1 6.602660
## avg_opponent_firstWon 32.816154 1 5.728539
## avg_opponent_secWon 17.059079 1 4.130264
## avg_opponent_bpSaved 5.640536 1 2.374981
## surface_clay 12.118586 1 3.481176
## surface_grass 6.081792 1 2.466129
## surface_hard 13.515292 1 3.676315
log.all = glm(outcome ~ . -avg_opponent_firstIn,data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn, family = binomial,
## data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.557e-01 2.182e-01 -0.714 0.475429
## draw_size 2.569e-05 5.331e-04 0.048 0.961559
## tourney_levelD 1.289e-02 3.818e-02 0.338 0.735584
## tourney_levelF 5.959e-02 9.075e-02 0.657 0.511403
## tourney_levelG 1.632e-03 5.329e-02 0.031 0.975570
## tourney_levelM 2.611e-03 2.942e-02 0.089 0.929267
## handR 8.012e-02 2.244e-02 3.570 0.000356 ***
## opponent_handR -1.065e-01 2.220e-02 -4.796 1.62e-06 ***
## seededYes 3.691e-01 1.749e-02 21.100 < 2e-16 ***
## opponent_seededYes -3.631e-01 1.749e-02 -20.754 < 2e-16 ***
## ht_dif 2.328e-03 1.095e-03 2.126 0.033532 *
## age_dif -1.125e-02 1.324e-03 -8.497 < 2e-16 ***
## rank_dif -2.772e-03 7.860e-05 -35.262 < 2e-16 ***
## avg_ace -1.436e-01 6.415e-03 -22.381 < 2e-16 ***
## avg_df 3.470e-02 1.094e-02 3.173 0.001508 **
## avg_svpt -1.755e-01 9.965e-03 -17.612 < 2e-16 ***
## avg_firstIn -3.167e-02 1.284e-02 -2.467 0.013624 *
## avg_firstWon 3.142e-01 1.266e-02 24.825 < 2e-16 ***
## avg_secWon 2.843e-01 1.676e-02 16.966 < 2e-16 ***
## avg_bpSaved 7.007e-02 3.036e-02 2.308 0.021017 *
## avg_opponent_ace 1.331e-01 5.028e-03 26.474 < 2e-16 ***
## avg_opponent_df -4.633e-02 9.736e-03 -4.759 1.95e-06 ***
## avg_opponent_svpt 1.981e-01 7.059e-03 28.070 < 2e-16 ***
## avg_opponent_firstWon -2.936e-01 8.509e-03 -34.509 < 2e-16 ***
## avg_opponent_secWon -3.268e-01 9.650e-03 -33.871 < 2e-16 ***
## avg_opponent_bpSaved -8.801e-02 3.041e-02 -2.894 0.003808 **
## surface_clay1 4.957e-02 5.313e-02 0.933 0.350766
## surface_grass1 3.556e-02 5.660e-02 0.628 0.529786
## surface_hard1 3.918e-02 5.255e-02 0.746 0.455937
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 114381 on 93550 degrees of freedom
## AIC: 114439
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: next removing avg_firstIn (43.414)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 8.868515 1 2.978005
## tourney_level 10.337036 4 1.339058
## hand 1.119099 1 1.057875
## opponent_hand 1.100471 1 1.049034
## seeded 1.354505 1 1.163832
## opponent_seeded 1.355508 1 1.164263
## ht_dif 2.173863 1 1.474403
## age_dif 1.039719 1 1.019666
## rank_dif 1.253577 1 1.119633
## avg_ace 8.134073 1 2.852030
## avg_df 1.772207 1 1.331242
## avg_svpt 30.570504 1 5.529060
## avg_firstIn 42.874997 1 6.547900
## avg_firstWon 32.571055 1 5.707106
## avg_secWon 16.706364 1 4.087342
## avg_bpSaved 5.616995 1 2.370020
## avg_opponent_ace 5.034226 1 2.243708
## avg_opponent_df 1.404763 1 1.185227
## avg_opponent_svpt 15.509788 1 3.938247
## avg_opponent_firstWon 14.937719 1 3.864935
## avg_opponent_secWon 5.568464 1 2.359759
## avg_opponent_bpSaved 5.638733 1 2.374602
## surface_clay 12.120242 1 3.481414
## surface_grass 6.080515 1 2.465870
## surface_hard 13.515549 1 3.676350
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn,data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn,
## family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.518e-01 2.182e-01 -0.696 0.48655
## draw_size 2.204e-05 5.330e-04 0.041 0.96702
## tourney_levelD 1.198e-02 3.818e-02 0.314 0.75376
## tourney_levelF 6.136e-02 9.074e-02 0.676 0.49888
## tourney_levelG 1.607e-03 5.328e-02 0.030 0.97594
## tourney_levelM 2.331e-03 2.941e-02 0.079 0.93683
## handR 8.745e-02 2.223e-02 3.933 8.38e-05 ***
## opponent_handR -1.063e-01 2.220e-02 -4.788 1.69e-06 ***
## seededYes 3.684e-01 1.749e-02 21.061 < 2e-16 ***
## opponent_seededYes -3.629e-01 1.749e-02 -20.745 < 2e-16 ***
## ht_dif 2.717e-03 1.084e-03 2.507 0.01217 *
## age_dif -1.123e-02 1.324e-03 -8.483 < 2e-16 ***
## rank_dif -2.774e-03 7.861e-05 -35.294 < 2e-16 ***
## avg_ace -1.338e-01 5.044e-03 -26.527 < 2e-16 ***
## avg_df 4.687e-02 9.763e-03 4.801 1.58e-06 ***
## avg_svpt -1.929e-01 7.046e-03 -27.377 < 2e-16 ***
## avg_firstWon 2.911e-01 8.509e-03 34.210 < 2e-16 ***
## avg_secWon 3.182e-01 9.631e-03 33.033 < 2e-16 ***
## avg_bpSaved 6.980e-02 3.034e-02 2.300 0.02144 *
## avg_opponent_ace 1.336e-01 5.025e-03 26.578 < 2e-16 ***
## avg_opponent_df -4.619e-02 9.736e-03 -4.744 2.09e-06 ***
## avg_opponent_svpt 1.980e-01 7.058e-03 28.053 < 2e-16 ***
## avg_opponent_firstWon -2.932e-01 8.507e-03 -34.469 < 2e-16 ***
## avg_opponent_secWon -3.269e-01 9.649e-03 -33.878 < 2e-16 ***
## avg_opponent_bpSaved -8.750e-02 3.041e-02 -2.877 0.00401 **
## surface_clay1 4.945e-02 5.313e-02 0.931 0.35199
## surface_grass1 3.713e-02 5.659e-02 0.656 0.51175
## surface_hard1 4.019e-02 5.255e-02 0.765 0.44439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 114387 on 93551 degrees of freedom
## AIC: 114443
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: next removing avg_svpt (15.61)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 8.867371 1 2.977813
## tourney_level 10.333753 4 1.339005
## hand 1.099359 1 1.048503
## opponent_hand 1.100480 1 1.049038
## seeded 1.354326 1 1.163755
## opponent_seeded 1.355472 1 1.164247
## ht_dif 2.128183 1 1.458829
## age_dif 1.039689 1 1.019652
## rank_dif 1.253458 1 1.119579
## avg_ace 5.023229 1 2.241256
## avg_df 1.411406 1 1.188026
## avg_svpt 15.281833 1 3.909198
## avg_firstWon 14.710534 1 3.835431
## avg_secWon 5.522235 1 2.349944
## avg_bpSaved 5.612448 1 2.369061
## avg_opponent_ace 5.028257 1 2.242377
## avg_opponent_df 1.404790 1 1.185238
## avg_opponent_svpt 15.508422 1 3.938073
## avg_opponent_firstWon 14.930058 1 3.863943
## avg_opponent_secWon 5.568079 1 2.359678
## avg_opponent_bpSaved 5.638981 1 2.374654
## surface_clay 12.122032 1 3.481671
## surface_grass 6.078993 1 2.465561
## surface_hard 13.515815 1 3.676386
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn -avg_svpt,
data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn -
## avg_svpt, family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8123803 0.2083117 -8.700 < 2e-16 ***
## draw_size -0.0005080 0.0005300 -0.958 0.33781
## tourney_levelD 0.0100152 0.0379604 0.264 0.79191
## tourney_levelF 0.1679421 0.0901555 1.863 0.06249 .
## tourney_levelG 0.0600236 0.0529612 1.133 0.25707
## tourney_levelM 0.0659128 0.0291736 2.259 0.02386 *
## handR -0.0492628 0.0215634 -2.285 0.02234 *
## opponent_handR -0.1021406 0.0221236 -4.617 3.9e-06 ***
## seededYes 0.4683647 0.0171148 27.366 < 2e-16 ***
## opponent_seededYes -0.3417574 0.0173989 -19.643 < 2e-16 ***
## ht_dif 0.0027225 0.0010792 2.523 0.01165 *
## age_dif -0.0121910 0.0013181 -9.249 < 2e-16 ***
## rank_dif -0.0029205 0.0000791 -36.921 < 2e-16 ***
## avg_ace -0.0945144 0.0048472 -19.499 < 2e-16 ***
## avg_df -0.0151459 0.0094924 -1.596 0.11058
## avg_firstWon 0.0791349 0.0034027 23.257 < 2e-16 ***
## avg_secWon 0.0935516 0.0049372 18.948 < 2e-16 ***
## avg_bpSaved -0.6024699 0.0187646 -32.107 < 2e-16 ***
## avg_opponent_ace 0.1297252 0.0050029 25.930 < 2e-16 ***
## avg_opponent_df -0.0444049 0.0097006 -4.578 4.7e-06 ***
## avg_opponent_svpt 0.1933085 0.0070402 27.458 < 2e-16 ***
## avg_opponent_firstWon -0.2861071 0.0084810 -33.735 < 2e-16 ***
## avg_opponent_secWon -0.3190889 0.0096151 -33.186 < 2e-16 ***
## avg_opponent_bpSaved -0.0928105 0.0304101 -3.052 0.00227 **
## surface_clay1 0.0447359 0.0528975 0.846 0.39771
## surface_grass1 0.0285444 0.0563555 0.507 0.61250
## surface_hard1 0.0360902 0.0523240 0.690 0.49035
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 115165 on 93552 degrees of freedom
## AIC: 115219
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: next removing avg_opponent_svpt (15.38)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 8.870047 1 2.978262
## tourney_level 10.268747 4 1.337949
## hand 1.045349 1 1.022423
## opponent_hand 1.101116 1 1.049341
## seeded 1.309215 1 1.144209
## opponent_seeded 1.354258 1 1.163726
## ht_dif 2.119729 1 1.455929
## age_dif 1.039816 1 1.019713
## rank_dif 1.255487 1 1.120485
## avg_ace 4.614727 1 2.148192
## avg_df 1.347853 1 1.160971
## avg_firstWon 2.360432 1 1.536370
## avg_secWon 1.468842 1 1.211958
## avg_bpSaved 2.105905 1 1.451174
## avg_opponent_ace 5.031061 1 2.243003
## avg_opponent_df 1.405957 1 1.185731
## avg_opponent_svpt 15.451261 1 3.930809
## avg_opponent_firstWon 14.934654 1 3.864538
## avg_opponent_secWon 5.565545 1 2.359141
## avg_opponent_bpSaved 5.668957 1 2.380957
## surface_clay 12.125031 1 3.482102
## surface_grass 6.076622 1 2.465081
## surface_hard 13.517881 1 3.676667
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt -avg_opponent_svpt,
data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn -
## avg_svpt - avg_opponent_svpt, family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.004e-01 1.970e-01 -0.510 0.6104
## draw_size -1.323e-05 5.272e-04 -0.025 0.9800
## tourney_levelD 8.211e-03 3.777e-02 0.217 0.8279
## tourney_levelF 5.826e-02 8.940e-02 0.652 0.5146
## tourney_levelG 4.386e-03 5.266e-02 0.083 0.9336
## tourney_levelM 3.286e-03 2.894e-02 0.114 0.9096
## handR -5.171e-02 2.148e-02 -2.407 0.0161 *
## opponent_handR 3.586e-02 2.144e-02 1.673 0.0944 .
## seededYes 4.430e-01 1.701e-02 26.038 <2e-16 ***
## opponent_seededYes -4.421e-01 1.702e-02 -25.979 <2e-16 ***
## ht_dif 2.746e-03 1.075e-03 2.554 0.0106 *
## age_dif -1.326e-02 1.312e-03 -10.105 <2e-16 ***
## rank_dif -3.070e-03 7.956e-05 -38.582 <2e-16 ***
## avg_ace -9.149e-02 4.825e-03 -18.960 <2e-16 ***
## avg_df -1.633e-02 9.456e-03 -1.727 0.0841 .
## avg_firstWon 7.696e-02 3.393e-03 22.684 <2e-16 ***
## avg_secWon 9.122e-02 4.919e-03 18.547 <2e-16 ***
## avg_bpSaved -5.811e-01 1.868e-02 -31.118 <2e-16 ***
## avg_opponent_ace 9.112e-02 4.808e-03 18.949 <2e-16 ***
## avg_opponent_df 1.699e-02 9.430e-03 1.801 0.0716 .
## avg_opponent_firstWon -7.397e-02 3.374e-03 -21.922 <2e-16 ***
## avg_opponent_secWon -9.384e-02 4.898e-03 -19.160 <2e-16 ***
## avg_opponent_bpSaved 5.838e-01 1.868e-02 31.253 <2e-16 ***
## surface_clay1 4.922e-02 5.265e-02 0.935 0.3499
## surface_grass1 3.696e-02 5.611e-02 0.659 0.5100
## surface_hard1 3.959e-02 5.209e-02 0.760 0.4473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 115946 on 93553 degrees of freedom
## AIC: 115998
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: surface_hard (13.44)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 8.877252 1 2.979472
## tourney_level 10.200758 4 1.336839
## hand 1.045722 1 1.022606
## opponent_hand 1.045412 1 1.022454
## seeded 1.306411 1 1.142983
## opponent_seeded 1.307957 1 1.143659
## ht_dif 2.111508 1 1.453103
## age_dif 1.039630 1 1.019623
## rank_dif 1.256823 1 1.121081
## avg_ace 4.618392 1 2.149045
## avg_df 1.348226 1 1.161131
## avg_firstWon 2.364897 1 1.537822
## avg_secWon 1.468370 1 1.211763
## avg_bpSaved 2.100523 1 1.449318
## avg_opponent_ace 4.626291 1 2.150881
## avg_opponent_df 1.342632 1 1.158720
## avg_opponent_firstWon 2.375001 1 1.541104
## avg_opponent_secWon 1.461782 1 1.209042
## avg_opponent_bpSaved 2.104156 1 1.450571
## surface_clay 12.121841 1 3.481643
## surface_grass 6.073133 1 2.464373
## surface_hard 13.514212 1 3.676168
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt
-avg_opponent_svpt -surface_hard,
data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn -
## avg_svpt - avg_opponent_svpt - surface_hard, family = binomial,
## data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.806e-02 1.924e-01 -0.354 0.7235
## draw_size 1.900e-05 5.255e-04 0.036 0.9712
## tourney_levelD 7.678e-03 3.776e-02 0.203 0.8389
## tourney_levelF 5.875e-02 8.939e-02 0.657 0.5111
## tourney_levelG 2.192e-03 5.258e-02 0.042 0.9667
## tourney_levelM 2.313e-03 2.891e-02 0.080 0.9362
## handR -5.179e-02 2.148e-02 -2.411 0.0159 *
## opponent_handR 3.577e-02 2.144e-02 1.669 0.0952 .
## seededYes 4.430e-01 1.701e-02 26.042 <2e-16 ***
## opponent_seededYes -4.420e-01 1.702e-02 -25.974 <2e-16 ***
## ht_dif 2.745e-03 1.075e-03 2.554 0.0107 *
## age_dif -1.326e-02 1.312e-03 -10.105 <2e-16 ***
## rank_dif -3.070e-03 7.956e-05 -38.584 <2e-16 ***
## avg_ace -9.151e-02 4.825e-03 -18.964 <2e-16 ***
## avg_df -1.634e-02 9.456e-03 -1.728 0.0839 .
## avg_firstWon 7.703e-02 3.392e-03 22.710 <2e-16 ***
## avg_secWon 9.123e-02 4.919e-03 18.548 <2e-16 ***
## avg_bpSaved -5.810e-01 1.867e-02 -31.113 <2e-16 ***
## avg_opponent_ace 9.112e-02 4.808e-03 18.950 <2e-16 ***
## avg_opponent_df 1.695e-02 9.430e-03 1.797 0.0723 .
## avg_opponent_firstWon -7.391e-02 3.373e-03 -21.911 <2e-16 ***
## avg_opponent_secWon -9.385e-02 4.897e-03 -19.164 <2e-16 ***
## avg_opponent_bpSaved 5.840e-01 1.868e-02 31.262 <2e-16 ***
## surface_clay1 1.114e-02 1.619e-02 0.688 0.4912
## surface_grass1 -1.497e-03 2.423e-02 -0.062 0.9507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 115947 on 93554 degrees of freedom
## AIC: 115997
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # multicollinearity in the model: tourney_level (10.14)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 8.819879 1 2.969828
## tourney_level 10.153540 4 1.336064
## hand 1.045694 1 1.022592
## opponent_hand 1.045384 1 1.022440
## seeded 1.306367 1 1.142964
## opponent_seeded 1.307875 1 1.143624
## ht_dif 2.111503 1 1.453101
## age_dif 1.039630 1 1.019622
## rank_dif 1.256807 1 1.121074
## avg_ace 4.618365 1 2.149038
## avg_df 1.348247 1 1.161140
## avg_firstWon 2.363485 1 1.537363
## avg_secWon 1.468384 1 1.211769
## avg_bpSaved 2.100382 1 1.449269
## avg_opponent_ace 4.626193 1 2.150859
## avg_opponent_df 1.342570 1 1.158693
## avg_opponent_firstWon 2.373891 1 1.540744
## avg_opponent_secWon 1.461746 1 1.209027
## avg_opponent_bpSaved 2.103963 1 1.450504
## surface_clay 1.145435 1 1.070250
## surface_grass 1.132469 1 1.064175
log.all = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt
-avg_opponent_svpt -surface_hard -tourney_level,
data = tennis_train, family = binomial)
summary(log.all)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn -
## avg_svpt - avg_opponent_svpt - surface_hard - tourney_level,
## family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.836e-02 1.917e-01 -0.357 0.7214
## draw_size 1.525e-05 1.816e-04 0.084 0.9331
## handR -5.164e-02 2.148e-02 -2.405 0.0162 *
## opponent_handR 3.591e-02 2.144e-02 1.675 0.0939 .
## seededYes 4.427e-01 1.683e-02 26.302 <2e-16 ***
## opponent_seededYes -4.424e-01 1.683e-02 -26.289 <2e-16 ***
## ht_dif 2.746e-03 1.075e-03 2.555 0.0106 *
## age_dif -1.326e-02 1.312e-03 -10.106 <2e-16 ***
## rank_dif -3.070e-03 7.956e-05 -38.584 <2e-16 ***
## avg_ace -9.169e-02 4.814e-03 -19.046 <2e-16 ***
## avg_df -1.635e-02 9.456e-03 -1.729 0.0838 .
## avg_firstWon 7.716e-02 3.383e-03 22.807 <2e-16 ***
## avg_secWon 9.133e-02 4.909e-03 18.604 <2e-16 ***
## avg_bpSaved -5.820e-01 1.854e-02 -31.385 <2e-16 ***
## avg_opponent_ace 9.093e-02 4.796e-03 18.957 <2e-16 ***
## avg_opponent_df 1.695e-02 9.430e-03 1.797 0.0723 .
## avg_opponent_firstWon -7.377e-02 3.363e-03 -21.936 <2e-16 ***
## avg_opponent_secWon -9.375e-02 4.889e-03 -19.177 <2e-16 ***
## avg_opponent_bpSaved 5.830e-01 1.854e-02 31.437 <2e-16 ***
## surface_clay1 1.042e-02 1.606e-02 0.649 0.5164
## surface_grass1 -2.384e-03 2.361e-02 -0.101 0.9195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 115947 on 93558 degrees of freedom
## AIC: 115989
##
## Number of Fisher Scoring iterations: 4
vif(log.all) # no multicollinearity :)
## draw_size hand opponent_hand
## 1.054087 1.045211 1.044903
## seeded opponent_seeded ht_dif
## 1.278557 1.278951 2.111483
## age_dif rank_dif avg_ace
## 1.039621 1.256802 4.597178
## avg_df avg_firstWon avg_secWon
## 1.348139 2.351309 1.462836
## avg_bpSaved avg_opponent_ace avg_opponent_df
## 2.070417 4.603377 1.342438
## avg_opponent_firstWon avg_opponent_secWon avg_opponent_bpSaved
## 2.359852 1.456513 2.074092
## surface_clay surface_grass
## 1.127387 1.075109
predprob_log <- predict.glm(log.all, tennis_test, type = "response")
predclass_log = ifelse(predprob_log >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log), tennis_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7704 3904
## 1 4040 7747
##
## Accuracy : 0.6604
## 95% CI : (0.6543, 0.6665)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3209
##
## Mcnemar's Test P-Value : 0.1299
##
## Sensitivity : 0.6649
## Specificity : 0.6560
## Pos Pred Value : 0.6572
## Neg Pred Value : 0.6637
## Prevalence : 0.4980
## Detection Rate : 0.3311
## Detection Prevalence : 0.5038
## Balanced Accuracy : 0.6605
##
## 'Positive' Class : 1
##
# Accuracy : 0.6596
# Sensitivity : 0.6660
# Specificity : 0.6529
#Logistic Regression with Stepwise Selection
null_model = glm(outcome ~ 1, data = tennis_train, family = binomial)
full_model = log.all
step.model.AIC = step(null_model, scope = list(upper = full_model),
direction = "both", test = "Chisq", trace = F)
summary(step.model.AIC)
##
## Call:
## glm(formula = outcome ~ rank_dif + seeded + opponent_seeded +
## avg_bpSaved + avg_opponent_bpSaved + age_dif + avg_firstWon +
## avg_ace + avg_secWon + avg_opponent_firstWon + avg_opponent_ace +
## avg_opponent_secWon + ht_dif + hand + avg_df + avg_opponent_df +
## opponent_hand, family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.562e-02 1.907e-01 -0.292 0.7705
## rank_dif -3.070e-03 7.956e-05 -38.584 <2e-16 ***
## seededYes 4.430e-01 1.679e-02 26.384 <2e-16 ***
## opponent_seededYes -4.420e-01 1.679e-02 -26.334 <2e-16 ***
## avg_bpSaved -5.816e-01 1.853e-02 -31.392 <2e-16 ***
## avg_opponent_bpSaved 5.834e-01 1.853e-02 31.489 <2e-16 ***
## age_dif -1.326e-02 1.312e-03 -10.106 <2e-16 ***
## avg_firstWon 7.712e-02 3.376e-03 22.842 <2e-16 ***
## avg_ace -9.183e-02 4.807e-03 -19.104 <2e-16 ***
## avg_secWon 9.110e-02 4.890e-03 18.629 <2e-16 ***
## avg_opponent_firstWon -7.382e-02 3.356e-03 -21.994 <2e-16 ***
## avg_opponent_ace 9.080e-02 4.790e-03 18.958 <2e-16 ***
## avg_opponent_secWon -9.398e-02 4.870e-03 -19.298 <2e-16 ***
## ht_dif 2.747e-03 1.075e-03 2.556 0.0106 *
## handR -5.163e-02 2.148e-02 -2.404 0.0162 *
## avg_df -1.640e-02 9.455e-03 -1.735 0.0828 .
## avg_opponent_df 1.689e-02 9.428e-03 1.791 0.0733 .
## opponent_handR 3.593e-02 2.143e-02 1.676 0.0937 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 115948 on 93561 degrees of freedom
## AIC: 115984
##
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise
log.sel <- glm(outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + opponent_seeded +
seeded + age_dif + avg_firstWon + avg_ace + avg_secWon + avg_opponent_firstWon +
avg_opponent_ace + avg_opponent_secWon + hand + ht_dif + avg_df,
tennis_train, family = binomial)
summary(log.sel)
##
## Call:
## glm(formula = outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved +
## opponent_seeded + seeded + age_dif + avg_firstWon + avg_ace +
## avg_secWon + avg_opponent_firstWon + avg_opponent_ace + avg_opponent_secWon +
## hand + ht_dif + avg_df, family = binomial, data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.576e-02 1.878e-01 0.084 0.9331
## rank_dif -3.075e-03 7.954e-05 -38.662 <2e-16 ***
## avg_opponent_bpSaved 5.919e-01 1.714e-02 34.536 <2e-16 ***
## avg_bpSaved -5.815e-01 1.853e-02 -31.385 <2e-16 ***
## opponent_seededYes -4.413e-01 1.678e-02 -26.292 <2e-16 ***
## seededYes 4.423e-01 1.679e-02 26.350 <2e-16 ***
## age_dif -1.312e-02 1.311e-03 -10.010 <2e-16 ***
## avg_firstWon 7.722e-02 3.376e-03 22.871 <2e-16 ***
## avg_ace -9.182e-02 4.807e-03 -19.102 <2e-16 ***
## avg_secWon 9.104e-02 4.890e-03 18.616 <2e-16 ***
## avg_opponent_firstWon -7.561e-02 3.259e-03 -23.198 <2e-16 ***
## avg_opponent_ace 9.402e-02 4.449e-03 21.130 <2e-16 ***
## avg_opponent_secWon -9.311e-02 4.850e-03 -19.197 <2e-16 ***
## handR -5.154e-02 2.148e-02 -2.400 0.0164 *
## ht_dif 2.725e-03 1.075e-03 2.536 0.0112 *
## avg_df -1.619e-02 9.454e-03 -1.712 0.0868 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 115954 on 93563 degrees of freedom
## AIC: 115986
##
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2 <- predict(log.sel, newdata = tennis_test, type = "response")
logistic_pred_class2 <- ifelse(logistic_pred2 > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2), tennis_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7693 3908
## 1 4051 7743
##
## Accuracy : 0.6598
## 95% CI : (0.6537, 0.6659)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3196
##
## Mcnemar's Test P-Value : 0.1115
##
## Sensitivity : 0.6646
## Specificity : 0.6551
## Pos Pred Value : 0.6565
## Neg Pred Value : 0.6631
## Prevalence : 0.4980
## Detection Rate : 0.3310
## Detection Prevalence : 0.5041
## Balanced Accuracy : 0.6598
##
## 'Positive' Class : 1
##
# Accuracy : 0.6592
# Sensitivity : 0.6659
# Specificity : 0.6525
#LDA
lda_model = lda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded +
ht_dif + age_dif + rank_dif + avg_ace + avg_df + avg_firstWon + avg_secWon +
avg_bpSaved + avg_opponent_ace + avg_opponent_df + avg_opponent_firstWon +
avg_opponent_secWon + avg_opponent_bpSaved + surface_clay + surface_grass,
data = tennis_train)
lda_model
## Call:
## lda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded +
## ht_dif + age_dif + rank_dif + avg_ace + avg_df + avg_firstWon +
## avg_secWon + avg_bpSaved + avg_opponent_ace + avg_opponent_df +
## avg_opponent_firstWon + avg_opponent_secWon + avg_opponent_bpSaved +
## surface_clay + surface_grass, data = tennis_train)
##
## Prior probabilities of groups:
## 0 1
## 0.4995031 0.5004969
##
## Group means:
## draw_size handR opponent_handR seededYes opponent_seededYes ht_dif
## 0 58.01326 0.8678947 0.8751685 0.2458336 0.4428898 -0.6534668
## 1 58.02882 0.8747118 0.8661500 0.4432701 0.2461141 0.6196943
## age_dif rank_dif avg_ace avg_df avg_firstWon avg_secWon avg_bpSaved
## 0 0.1491197 33.49761 5.876086 2.986887 34.32419 15.60362 4.176661
## 1 -0.1595653 -33.67920 6.277315 2.878460 34.98529 15.73356 3.998110
## avg_opponent_ace avg_opponent_df avg_opponent_firstWon avg_opponent_secWon
## 0 6.292537 2.880003 34.99085 15.74220
## 1 5.887530 2.986144 34.34224 15.59955
## avg_opponent_bpSaved surface_clay1 surface_grass1
## 0 3.996073 0.3170742 0.1078450
## 1 4.174689 0.3190067 0.1080152
##
## Coefficients of linear discriminants:
## LD1
## draw_size 8.322884e-06
## handR -6.490700e-02
## opponent_handR 4.344118e-02
## seededYes 6.875411e-01
## opponent_seededYes -6.845001e-01
## ht_dif 3.650945e-03
## age_dif -1.694301e-02
## rank_dif -2.935947e-03
## avg_ace -1.023734e-01
## avg_df -3.275343e-02
## avg_firstWon 9.014533e-02
## avg_secWon 1.049756e-01
## avg_bpSaved -6.677011e-01
## avg_opponent_ace 1.029193e-01
## avg_opponent_df 3.326777e-02
## avg_opponent_firstWon -8.726589e-02
## avg_opponent_secWon -1.094587e-01
## avg_opponent_bpSaved 6.735015e-01
## surface_clay1 1.244285e-02
## surface_grass1 -2.389925e-03
predictions.lda = predict(lda_model, tennis_test)
caret::confusionMatrix(as.factor(predictions.lda$class), tennis_test$outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7686 3934
## 1 4058 7717
##
## Accuracy : 0.6584
## 95% CI : (0.6523, 0.6645)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3168
##
## Mcnemar's Test P-Value : 0.1689
##
## Sensitivity : 0.6545
## Specificity : 0.6623
## Pos Pred Value : 0.6614
## Neg Pred Value : 0.6554
## Prevalence : 0.5020
## Detection Rate : 0.3285
## Detection Prevalence : 0.4967
## Balanced Accuracy : 0.6584
##
## 'Positive' Class : 0
##
# Accuracy : 0.6584
# Sensitivity : 0.6521
# Specificity : 0.6647
#QDA
qda_model = qda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded + ht_dif +
age_dif + rank_dif + avg_ace + avg_df + avg_firstWon + avg_secWon + avg_bpSaved +
avg_opponent_ace + avg_opponent_df + avg_opponent_firstWon + avg_opponent_secWon +
avg_opponent_bpSaved + surface_clay + surface_grass, data = tennis_train)
qda_model
## Call:
## qda(outcome ~ draw_size + hand + opponent_hand + seeded + opponent_seeded +
## ht_dif + age_dif + rank_dif + avg_ace + avg_df + avg_firstWon +
## avg_secWon + avg_bpSaved + avg_opponent_ace + avg_opponent_df +
## avg_opponent_firstWon + avg_opponent_secWon + avg_opponent_bpSaved +
## surface_clay + surface_grass, data = tennis_train)
##
## Prior probabilities of groups:
## 0 1
## 0.4995031 0.5004969
##
## Group means:
## draw_size handR opponent_handR seededYes opponent_seededYes ht_dif
## 0 58.01326 0.8678947 0.8751685 0.2458336 0.4428898 -0.6534668
## 1 58.02882 0.8747118 0.8661500 0.4432701 0.2461141 0.6196943
## age_dif rank_dif avg_ace avg_df avg_firstWon avg_secWon avg_bpSaved
## 0 0.1491197 33.49761 5.876086 2.986887 34.32419 15.60362 4.176661
## 1 -0.1595653 -33.67920 6.277315 2.878460 34.98529 15.73356 3.998110
## avg_opponent_ace avg_opponent_df avg_opponent_firstWon avg_opponent_secWon
## 0 6.292537 2.880003 34.99085 15.74220
## 1 5.887530 2.986144 34.34224 15.59955
## avg_opponent_bpSaved surface_clay1 surface_grass1
## 0 3.996073 0.3170742 0.1078450
## 1 4.174689 0.3190067 0.1080152
predictions.qda = predict(qda_model, tennis_test)
caret::confusionMatrix(as.factor(predictions.qda$class), tennis_test$outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7618 4057
## 1 4126 7594
##
## Accuracy : 0.6502
## 95% CI : (0.6441, 0.6563)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3005
##
## Mcnemar's Test P-Value : 0.4522
##
## Sensitivity : 0.6487
## Specificity : 0.6518
## Pos Pred Value : 0.6525
## Neg Pred Value : 0.6480
## Prevalence : 0.5020
## Detection Rate : 0.3256
## Detection Prevalence : 0.4990
## Balanced Accuracy : 0.6502
##
## 'Positive' Class : 0
##
# Accuracy : 0.6511
# Sensitivity : 0.6533
# Specificity : 0.6489
#Random Forest
set.seed(29)
rf <- randomForest(outcome ~ ., data = tennis_train, importance = TRUE)
rf
##
## Call:
## randomForest(formula = outcome ~ ., data = tennis_train, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 34.79%
## Confusion matrix:
## 0 1 class.error
## 0 30218 16525 0.3535289
## 1 16034 30802 0.3423435
rf.preds = predict(rf, tennis_test,type="class")
caret::confusionMatrix(as.factor(rf.preds), tennis_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7552 4047
## 1 4192 7604
##
## Accuracy : 0.6478
## 95% CI : (0.6417, 0.654)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.2957
##
## Mcnemar's Test P-Value : 0.1126
##
## Sensitivity : 0.6526
## Specificity : 0.6431
## Pos Pred Value : 0.6446
## Neg Pred Value : 0.6511
## Prevalence : 0.4980
## Detection Rate : 0.3250
## Detection Prevalence : 0.5042
## Balanced Accuracy : 0.6478
##
## 'Positive' Class : 1
##
# Accuracy : 0.6503
# Sensitivity : 0.6482
# Specificity : 0.6525
data <- data.frame(model = c("log.all", "log.sel", "LDA", "QDA", "RF"),
acc = c(0.6596, 0.6592, 0.6584, 0.6511, 0.6503))
custom_colors <- c("log.all" = "#5B9F9A", "log.sel" = "#7DAFCA", "LDA" = "#E9909D",
"QDA" = "#AABAE4", "RF" = "#D2C3EE")
ggplot(data, aes(x = model, y = acc, fill = model)) +
geom_bar(stat = "identity", width = 0.5, fill = custom_colors) +
geom_text(aes(label = acc), vjust = -0.5, size = 3) +
labs(title = "Test Accuracy Comparison of Models", x = "", y = "Accuracy") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1),
legend.position = "none")
####
#### Do player age and height influence winning a match?
####
tennis_df2 = match_outcomes
tennis_df2 = dplyr::select(tennis_df2, -seed, -entry, -opponent_seed, -opponent_entry,
-tourney_id,-tourney_name,-match_num,-name,-opponent_name,-score)
tennis_df2$hand = ifelse(tennis_df2$hand == 'U',NA, tennis_df2$hand)
tennis_df2$opponent_hand = ifelse(tennis_df2$opponent_hand == 'U',NA, tennis_df2$opponent_hand)
# Data Type Changes
tennis_df2$tourney_date = as.Date(as.character(tennis_df2$tourney_date),format = "%Y%m%d")
tennis_df2$surface = as.factor(tennis_df2$surface)
tennis_df2$tourney_level = as.factor(tennis_df2$tourney_level)
tennis_df2$hand = as.factor(tennis_df2$hand)
tennis_df2$opponent_hand = as.factor(tennis_df2$opponent_hand)
tennis_df2$seeded = as.factor(tennis_df2$seeded)
tennis_df2$opponent_seeded = as.factor(tennis_df2$opponent_seeded)
tennis_df2$outcome = as.factor(tennis_df2$outcome)
str(tennis_df2)
## gropd_df [123,864 × 66] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ surface : Factor w/ 5 levels "","Carpet","Clay",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ draw_size : int [1:123864] 48 48 48 48 48 48 48 48 48 48 ...
## $ tourney_level : Factor w/ 5 levels "A","D","F","G",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ tourney_date : Date[1:123864], format: "2003-10-13" "2003-10-13" ...
## $ id : int [1:123864] 101965 102358 102998 102610 102374 103888 103852 103292 103970 102434 ...
## $ hand : Factor w/ 4 levels "","A","L","R": 4 4 4 4 4 4 3 4 4 4 ...
## $ ht : int [1:123864] 185 190 190 180 180 188 188 175 175 183 ...
## $ ioc : chr [1:123864] "RSA" "SWE" "USA" "ESP" ...
## $ age : num [1:123864] 32 29.5 26.3 28.3 29.5 21.8 22 24.8 21.5 29.2 ...
## $ opponent_id : int [1:123864] 103344 102338 103786 103602 104745 102450 103151 103813 104022 103294 ...
## $ opponent_hand : Factor w/ 4 levels "","A","L","R": 4 4 4 4 3 4 4 3 4 4 ...
## $ opponent_ht : int [1:123864] 193 190 178 183 185 185 183 185 183 170 ...
## $ opponent_ioc : chr [1:123864] "CRO" "RUS" "RUS" "CHI" ...
## $ opponent_age : num [1:123864] 24.5 29.6 22.3 23.2 17.3 29.1 25.6 22.2 21.3 24.8 ...
## $ best_of : int [1:123864] 3 3 3 3 3 3 3 3 3 3 ...
## $ round : chr [1:123864] "R64" "R64" "R64" "R64" ...
## $ minutes : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ ace : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ df : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ svpt : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ firstIn : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ firstWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ secWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ SvGms : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ bpSaved : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ bpFaced : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_ace : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_df : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_svpt : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_firstIn : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_firstWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_secWon : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_SvGms : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_bpSaved : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ opponent_bpFaced : int [1:123864] NA NA NA NA NA NA NA NA NA NA ...
## $ rank : int [1:123864] 28 146 57 23 127 25 35 37 72 33 ...
## $ rank_points : int [1:123864] 1090 258 660 1170 290 1145 1025 1000 480 1040 ...
## $ opponent_rank : int [1:123864] 42 40 43 22 49 30 27 31 29 87 ...
## $ opponent_rank_points : int [1:123864] 865 950 855 1190 788 1055 1133 1050 1060 421 ...
## $ seeded : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ opponent_seeded : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ outcome : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ht_dif : int [1:123864] -8 0 12 -3 -5 3 5 -10 -8 13 ...
## $ age_dif : num [1:123864] 7.5 -0.1 4 5.1 12.2 ...
## $ rank_dif : int [1:123864] -14 106 14 1 78 -5 8 6 43 -54 ...
## $ avg_ace : num [1:123864] 7.04 8.12 11.1 4.61 3.62 ...
## $ avg_df : num [1:123864] 2.54 5.34 3.36 2.35 3.11 ...
## $ avg_svpt : num [1:123864] 80.9 85.7 78 84.2 81.2 ...
## $ avg_firstIn : num [1:123864] 46.5 46.5 44.7 52.6 46.5 ...
## $ avg_firstWon : num [1:123864] 33.6 34.6 34 35.5 31.5 ...
## $ avg_secWon : num [1:123864] 18 18.9 15.6 16 17.4 ...
## $ avg_SvGms : num [1:123864] 12.5 13.2 12 12.8 12.5 ...
## $ avg_bpSaved : num [1:123864] 3.97 4.4 4.1 4.84 4.71 ...
## $ avg_bpFaced : num [1:123864] 6.38 7.43 6.61 8.03 8.19 ...
## $ avg_opponent_ace : num [1:123864] 12.28 3.98 3.23 6.89 3.19 ...
## $ avg_opponent_df : num [1:123864] 2.28 3.25 2.59 3.55 1.71 ...
## $ avg_opponent_svpt : num [1:123864] 80.5 79.1 74 81.5 73.6 ...
## $ avg_opponent_firstIn : num [1:123864] 47.9 46.6 49.9 50 50 ...
## $ avg_opponent_firstWon: num [1:123864] 37.2 32.4 34.2 37.2 36.1 ...
## $ avg_opponent_secWon : num [1:123864] 17.1 16.1 12.5 16.4 13.5 ...
## $ avg_opponent_SvGms : num [1:123864] 12.9 12.5 11.7 13 12.1 ...
## $ avg_opponent_bpSaved : num [1:123864] 3.35 3.73 3.93 3.77 3.37 ...
## $ avg_opponent_bpFaced : num [1:123864] 5.12 6.92 6.47 5.92 5.1 ...
## $ surface_clay : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
## $ surface_grass : num [1:123864] 0 0 0 0 0 0 0 0 0 0 ...
## $ surface_hard : num [1:123864] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "groups")= tibble [2,261 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ opponent_id: int [1:2261] 100644 101316 101404 101532 101662 101723 101736 101746 101750 101774 ...
## ..$ .rows : list<int> [1:2261]
## .. ..$ : int [1:573] 33442 35678 36325 36416 36473 36511 37213 37718 37802 37918 ...
## .. ..$ : int 6453
## .. ..$ : int [1:3] 25319 27592 28329
## .. ..$ : int [1:6] 325 882 2591 2794 62750 62782
## .. ..$ : int [1:2] 12952 12954
## .. ..$ : int [1:3] 1861 2534 63780
## .. ..$ : int [1:176] 225 1164 1271 1377 1610 2183 2337 2469 2661 2665 ...
## .. ..$ : int [1:5] 142 403 983 2619 64509
## .. ..$ : int 648
## .. ..$ : int [1:54] 204 790 907 931 1240 1438 1495 2133 2320 2459 ...
## .. ..$ : int 3110
## .. ..$ : int 19117
## .. ..$ : int [1:2] 2951 3118
## .. ..$ : int [1:2] 1831 63750
## .. ..$ : int [1:2] 2972 64902
## .. ..$ : int [1:27] 218 491 776 1651 1823 1864 2067 2260 2540 3268 ...
## .. ..$ : int [1:5] 2980 3126 3129 9631 64915
## .. ..$ : int [1:49] 48 212 534 2094 2252 2566 2855 3491 3599 3618 ...
## .. ..$ : int 3231
## .. ..$ : int 4723
## .. ..$ : int [1:134] 162 180 242 358 508 821 941 976 1038 1118 ...
## .. ..$ : int [1:4] 2897 3031 68401 71378
## .. ..$ : int [1:3] 6423 6463 68397
## .. ..$ : int [1:14] 743 852 1952 2000 2050 2589 2727 2799 62661 62752 ...
## .. ..$ : int [1:7] 259 564 3579 3951 4432 5515 67414
## .. ..$ : int [1:4] 1804 6346 63727 64912
## .. ..$ : int 4270
## .. ..$ : int 2922
## .. ..$ : int [1:108] 46 300 384 434 662 696 755 909 913 1011 ...
## .. ..$ : int [1:17] 194 935 1884 3372 3409 4069 4238 4275 4289 4436 ...
## .. ..$ : int [1:78] 17 613 616 693 759 798 894 943 1085 1158 ...
## .. ..$ : int [1:2] 3124 65054
## .. ..$ : int [1:2] 3074 81153
## .. ..$ : int [1:21] 134 219 595 644 827 928 994 1031 1689 2607 ...
## .. ..$ : int 6449
## .. ..$ : int [1:12] 802 881 929 1279 1386 1467 1478 1732 2085 2352 ...
## .. ..$ : int [1:3] 59 189 2223
## .. ..$ : int [1:6] 6365 12992 16143 19247 22372 74926
## .. ..$ : int [1:235] 27 313 580 721 860 992 1032 1068 1588 1604 ...
## .. ..$ : int [1:6] 3119 6393 6456 65053 68324 68386
## .. ..$ : int [1:3] 3063 6241 65105
## .. ..$ : int [1:3] 2923 68316 68317
## .. ..$ : int 2948
## .. ..$ : int [1:3] 3052 64866 64869
## .. ..$ : int [1:4] 1812 6341 64919 65059
## .. ..$ : int [1:171] 57 91 129 203 568 668 738 804 890 915 ...
## .. ..$ : int [1:18] 144 1569 2943 3163 3170 3643 4800 9636 65094 65559 ...
## .. ..$ : int [1:7] 1362 4569 9501 11229 63271 64975 65122
## .. ..$ : int [1:2] 64863 64980
## .. ..$ : int [1:20] 75 270 1319 1452 3266 3539 3578 3743 6575 6837 ...
## .. ..$ : int [1:267] 51 487 676 695 782 927 999 1045 1107 1169 ...
## .. ..$ : int [1:2] 1463 63380
## .. ..$ : int [1:108] 93 137 221 246 351 409 643 655 739 867 ...
## .. ..$ : int [1:6] 1632 1760 1976 2582 2912 5178
## .. ..$ : int [1:153] 66 96 112 143 543 794 836 1020 1291 1471 ...
## .. ..$ : int 121
## .. ..$ : int 1764
## .. ..$ : int [1:112] 116 238 385 426 562 789 840 921 1039 1144 ...
## .. ..$ : int [1:42] 1909 2120 2508 3683 3959 4917 5055 5618 5709 6009 ...
## .. ..$ : int [1:14] 2859 6320 9417 19204 64859 64861 64979 64982 68242 68244 ...
## .. ..$ : int 366
## .. ..$ : int [1:2] 1014 2037
## .. ..$ : int [1:10] 1025 1179 1765 1784 2024 2032 2580 62943 63939 63950
## .. ..$ : int 64913
## .. ..$ : int [1:2] 19244 22314
## .. ..$ : int [1:138] 217 1266 1303 1364 1425 2061 2289 2358 3202 3204 ...
## .. ..$ : int [1:12] 2065 2558 2861 3481 4894 5166 5627 5812 67084 67093 ...
## .. ..$ : int [1:5] 3165 9551 12956 13014 64998
## .. ..$ : int [1:29] 285 511 2382 2845 3476 4055 4140 4483 4519 5167 ...
## .. ..$ : int [1:81] 206 312 328 666 718 753 783 1013 1432 1517 ...
## .. ..$ : int 10310
## .. ..$ : int [1:14] 1363 1482 1647 1879 3064 3065 6255 63274 63569 65106 ...
## .. ..$ : int [1:83] 310 332 372 457 727 1074 1194 1535 1699 1835 ...
## .. ..$ : int [1:26] 532 2594 2929 3741 6306 6470 6473 7017 9590 9593 ...
## .. ..$ : int [1:16] 156 2864 3280 3761 4938 5692 6641 8181 14987 15932 ...
## .. ..$ : int [1:204] 452 675 724 758 864 919 990 1601 2475 2577 ...
## .. ..$ : int [1:2] 3076 64891
## .. ..$ : int [1:55] 2 170 768 887 945 1002 1037 1079 1224 1265 ...
## .. ..$ : int [1:79] 29 118 556 683 824 1060 1111 1177 1380 1405 ...
## .. ..$ : int [1:94] 18 175 591 620 657 773 896 937 991 1030 ...
## .. ..$ : int [1:2] 3005 6339
## .. ..$ : int [1:20] 354 398 1097 1761 2119 2268 2384 3961 5020 5290 ...
## .. ..$ : int [1:75] 21 244 376 435 458 765 853 946 979 1042 ...
## .. ..$ : int [1:46] 50 387 513 1528 2205 2388 3598 5716 10165 13097 ...
## .. ..$ : int [1:19] 178 280 1659 1825 2612 2863 3349 3775 4384 5118 ...
## .. ..$ : int [1:2] 15949 26231
## .. ..$ : int [1:33] 182 278 547 597 617 1132 1256 1317 1335 1407 ...
## .. ..$ : int 19203
## .. ..$ : int [1:317] 37 102 577 633 688 816 857 973 1098 1120 ...
## .. ..$ : int [1:28] 150 172 309 322 599 645 822 982 1021 1114 ...
## .. ..$ : int [1:225] 6 160 224 501 583 855 955 984 1155 1167 ...
## .. ..$ : int [1:114] 33 107 422 478 699 760 875 936 1070 1134 ...
## .. ..$ : int [1:2] 1234 2896
## .. ..$ : int 6215
## .. ..$ : int [1:6] 2109 2194 2262 2598 5753 64467
## .. ..$ : int [1:2] 3114 65041
## .. ..$ : int 3582
## .. ..$ : int [1:11] 368 423 1522 1791 2035 2216 62284 62339 64129 64140 ...
## .. ..$ : int [1:79] 151 240 375 438 462 498 665 1867 2096 2246 ...
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# remove rows containing missing values
tennis_df2 = na.omit(tennis_df2) #123,864 to 107,536
# log for just these two vars
age_ht_log = glm(outcome ~ age + ht,tennis_df2, family = binomial)
summary(age_ht_log)
##
## Call:
## glm(formula = outcome ~ age + ht, family = binomial, data = tennis_df2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1201176 0.1740671 -12.180 < 2e-16 ***
## age -0.0098096 0.0015244 -6.435 1.24e-10 ***
## ht 0.0127946 0.0008929 14.330 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 150094 on 108269 degrees of freedom
## Residual deviance: 149832 on 108267 degrees of freedom
## AIC: 149838
##
## Number of Fisher Scoring iterations: 3
vif(age_ht_log)
## age ht
## 1.005258 1.005258
#age and height are significant
log.dif <- glm(outcome ~ age_dif + ht_dif, tennis_train, family = binomial)
summary(log.dif)
##
## Call:
## glm(formula = outcome ~ age_dif + ht_dif, family = binomial,
## data = tennis_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.002176 0.006554 0.332 0.74
## age_dif -0.007706 0.001200 -6.422 1.34e-10 ***
## ht_dif 0.013568 0.000694 19.550 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129728 on 93578 degrees of freedom
## Residual deviance: 129269 on 93576 degrees of freedom
## AIC: 129275
##
## Number of Fisher Scoring iterations: 3
####
#### Do top players face fewer break points?
####
#let's consider top players as ranked in the top 25 in the world
tennis_df3 = select(ungroup(tennis_df2),id,surface,rank,bpFaced,ioc)
str(tennis_df3)
## tibble [108,270 × 5] (S3: tbl_df/tbl/data.frame)
## $ id : int [1:108270] 103507 104166 102880 102571 103387 102202 104339 103344 103813 102539 ...
## $ surface: Factor w/ 5 levels "","Carpet","Clay",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ rank : int [1:108270] 1 116 115 85 11 68 71 45 37 151 ...
## $ bpFaced: int [1:108270] 7 3 8 13 7 5 3 4 4 0 ...
## $ ioc : chr [1:108270] "ESP" "RUS" "GER" "FRA" ...
tennis_df3$top_player <- ifelse(tennis_df3$rank <= 25, "Yes", "No")
tennis_df3$top_player <- as.factor(tennis_df3$top_player)
# Check for Normality
qqnorm(tennis_df3$bpFaced)
qqline(tennis_df3$bpFaced)
# the qq plot does not look normal
# Because not normal, will use Wilcoxon-Mann Whitney test
wilcox.test(bpFaced ~ top_player, data = tennis_df3,int = TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bpFaced by top_player
## W = 1344811274, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#p-value is 2.2e-16, less than 0.05, therefore significant
# There is a statistically significant difference between top player vs other players
#in terms of how many break points they face.
# median break points faced for top players vs other players
median(tennis_df3$bpFaced[tennis_df3$top_player == "Yes"])
## [1] 5
median(tennis_df3$bpFaced[tennis_df3$top_player == "No"])
## [1] 7
#median is 5 for top layers, 7 for others
#top players face fewer break points
####
#### Which countries produce top tennis players?
####
str(tennis_df3)
## tibble [108,270 × 6] (S3: tbl_df/tbl/data.frame)
## $ id : int [1:108270] 103507 104166 102880 102571 103387 102202 104339 103344 103813 102539 ...
## $ surface : Factor w/ 5 levels "","Carpet","Clay",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ rank : int [1:108270] 1 116 115 85 11 68 71 45 37 151 ...
## $ bpFaced : int [1:108270] 7 3 8 13 7 5 3 4 4 0 ...
## $ ioc : chr [1:108270] "ESP" "RUS" "GER" "FRA" ...
## $ top_player: Factor w/ 2 levels "No","Yes": 2 1 1 1 2 1 1 1 1 1 ...
tennis_df4 = select(tennis_df3, -surface, -rank, -bpFaced)
tennis_df4 = distinct(tennis_df4)
#chi square test
chisq.test(table(tennis_df4$top_player, tennis_df4$ioc))
## Warning in chisq.test(table(tennis_df4$top_player, tennis_df4$ioc)):
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(tennis_df4$top_player, tennis_df4$ioc)
## X-squared = 56.335, df = 79, p-value = 0.9748
#p-value p-value = 0.9768
player_country = as.data.frame.matrix(table(tennis_df4$ioc,tennis_df4$top_player))
print(player_country)
## No Yes
## ALG 1 0
## ARG 60 15
## ARM 1 0
## AUS 58 5
## AUT 29 2
## BAH 1 0
## BAR 1 0
## BEL 18 3
## BIH 6 1
## BLR 8 1
## BOL 2 0
## BRA 29 2
## BUL 6 1
## CAN 19 3
## CHI 15 4
## CHN 17 0
## COL 9 0
## CRC 1 0
## CRO 25 5
## CYP 3 1
## CZE 35 3
## DEN 7 1
## DOM 4 0
## ECU 6 0
## EGY 2 0
## ESA 2 0
## ESP 68 19
## EST 3 0
## FIN 7 1
## FRA 90 14
## GBR 36 5
## GEO 4 1
## GER 65 7
## GRE 5 1
## GUA 1 0
## HUN 8 0
## IND 17 0
## IRL 3 0
## ISR 9 0
## ITA 63 8
## JOR 1 0
## JPN 21 1
## KAZ 11 1
## KOR 9 1
## LAT 2 1
## LBN 1 0
## LTU 5 0
## LUX 2 1
## MAR 9 1
## MDA 3 0
## MEX 13 0
## MON 4 0
## NED 23 4
## NOR 3 1
## NZL 8 0
## PAK 1 0
## PAR 1 0
## PER 6 0
## PHI 2 0
## POL 12 2
## POR 11 0
## ROU 15 1
## RSA 11 2
## RUS 34 10
## SLO 5 0
## SRB 19 4
## SUI 22 2
## SVK 16 2
## SWE 27 4
## THA 6 1
## TOG 1 0
## TPE 8 0
## TUN 3 0
## TUR 4 0
## UKR 10 1
## URU 3 1
## USA 118 17
## UZB 4 0
## VEN 1 0
## ZIM 3 0
player_country %>% filter(Yes >= 10)
## No Yes
## ARG 60 15
## ESP 68 19
## FRA 90 14
## RUS 34 10
## USA 118 17
# Spain, USA, Argentina, France, Russia
####
#### Do the variables that impact whether a tennis player wins a match depend on the surface type?
####
# Subset based on surface type
clay_subset <- tennis_df[tennis_df$surface_clay == 1, ]
grass_subset <- tennis_df[tennis_df$surface_grass == 1, ]
hard_subset <- tennis_df[tennis_df$surface_hard == 1, ]
clay_subset = select(clay_subset,-surface_clay,-surface_grass,-surface_hard)
grass_subset = select(grass_subset,-surface_clay,-surface_grass,-surface_hard)
hard_subset = select(hard_subset,-surface_clay,-surface_grass,-surface_hard)
### Splitting into train and test
set.seed(12)
#clay
index = sample(nrow(clay_subset), 0.8*nrow(clay_subset), replace = F) # 80/20 split
clay_train = clay_subset[index,]
clay_test = clay_subset[-index,]
#grass
index = sample(nrow(grass_subset), 0.8*nrow(grass_subset), replace = F) # 80/20 split
grass_train = grass_subset[index,]
grass_test = grass_subset[-index,]
#hard
index = sample(nrow(hard_subset), 0.8*nrow(hard_subset), replace = F) # 80/20 split
hard_train = hard_subset[index,]
hard_test = hard_subset[-index,]
#checking for balance
table(clay_train$outcome)
##
## 0 1
## 14909 14915
table(clay_test$outcome)
##
## 0 1
## 3731 3725
table(grass_train$outcome)
##
## 0 1
## 5051 5000
table(grass_test$outcome)
##
## 0 1
## 1231 1282
table(hard_train$outcome)
##
## 0 1
## 25932 25948
table(hard_test$outcome)
##
## 0 1
## 6493 6477
#Logistic Regression: CLAY
log.clay = glm(outcome ~ .,data = clay_train, family = binomial)
summary(log.clay)
##
## Call:
## glm(formula = outcome ~ ., family = binomial, data = clay_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3258311 0.3671204 -0.888 0.374792
## draw_size -0.0002169 0.0013062 -0.166 0.868104
## tourney_levelD 0.0100330 0.0706793 0.142 0.887118
## tourney_levelG 0.0253023 0.1276140 0.198 0.842832
## tourney_levelM 0.0172425 0.0515901 0.334 0.738213
## handR 0.0843234 0.0388875 2.168 0.030129 *
## opponent_handR -0.0477343 0.0388304 -1.229 0.218958
## seededYes 0.3718472 0.0311488 11.938 < 2e-16 ***
## opponent_seededYes -0.3514566 0.0310688 -11.312 < 2e-16 ***
## ht_dif 0.0059231 0.0019369 3.058 0.002228 **
## age_dif -0.0215519 0.0024309 -8.866 < 2e-16 ***
## rank_dif -0.0031208 0.0001456 -21.433 < 2e-16 ***
## avg_ace -0.1803798 0.0118919 -15.168 < 2e-16 ***
## avg_df 0.0709136 0.0204436 3.469 0.000523 ***
## avg_svpt -0.1949795 0.0177214 -11.002 < 2e-16 ***
## avg_firstIn -0.0036370 0.0230651 -0.158 0.874707
## avg_firstWon 0.2987039 0.0217582 13.728 < 2e-16 ***
## avg_secWon 0.2878787 0.0303439 9.487 < 2e-16 ***
## avg_bpSaved 0.1429848 0.0496479 2.880 0.003977 **
## avg_opponent_ace 0.1781188 0.0118679 15.008 < 2e-16 ***
## avg_opponent_df -0.0824849 0.0204559 -4.032 5.52e-05 ***
## avg_opponent_svpt 0.2097942 0.0176860 11.862 < 2e-16 ***
## avg_opponent_firstIn -0.0057736 0.0229944 -0.251 0.801747
## avg_opponent_firstWon -0.2960938 0.0217586 -13.608 < 2e-16 ***
## avg_opponent_secWon -0.3117384 0.0302114 -10.319 < 2e-16 ***
## avg_opponent_bpSaved -0.1662104 0.0490544 -3.388 0.000703 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41345 on 29823 degrees of freedom
## Residual deviance: 36628 on 29798 degrees of freedom
## AIC: 36680
##
## Number of Fisher Scoring iterations: 4
vif(log.clay) # multicollinearity in the model: first removing avg_opponent_firstIn & avg_firstIn
## GVIF Df GVIF^(1/(2*Df))
## draw_size 13.350640 1 3.653853
## tourney_level 15.472014 3 1.578548
## hand 1.124467 1 1.060409
## opponent_hand 1.121790 1 1.059146
## seeded 1.348204 1 1.161122
## opponent_seeded 1.347216 1 1.160697
## ht_dif 1.988186 1 1.410031
## age_dif 1.047534 1 1.023491
## rank_dif 1.295764 1 1.138316
## avg_ace 7.146519 1 2.673297
## avg_df 1.787240 1 1.336877
## avg_svpt 32.699202 1 5.718322
## avg_firstIn 45.385369 1 6.736866
## avg_firstWon 29.058929 1 5.390633
## avg_secWon 19.300416 1 4.393224
## avg_bpSaved 4.838756 1 2.199717
## avg_opponent_ace 7.052528 1 2.655660
## avg_opponent_df 1.791631 1 1.338518
## avg_opponent_svpt 32.765316 1 5.724100
## avg_opponent_firstIn 45.358971 1 6.734907
## avg_opponent_firstWon 29.119418 1 5.396241
## avg_opponent_secWon 19.218661 1 4.383909
## avg_opponent_bpSaved 4.813423 1 2.193951
log.clay = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn,data = clay_train, family = binomial)
summary(log.clay)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn,
## family = binomial, data = clay_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3265690 0.3671225 -0.890 0.373715
## draw_size -0.0002057 0.0013057 -0.158 0.874828
## tourney_levelD 0.0101175 0.0706786 0.143 0.886173
## tourney_levelG 0.0243706 0.1275740 0.191 0.848501
## tourney_levelM 0.0169567 0.0515808 0.329 0.742352
## handR 0.0852868 0.0384129 2.220 0.026401 *
## opponent_handR -0.0462005 0.0383470 -1.205 0.228281
## seededYes 0.3716849 0.0311330 11.939 < 2e-16 ***
## opponent_seededYes -0.3517009 0.0310549 -11.325 < 2e-16 ***
## ht_dif 0.0058998 0.0018942 3.115 0.001841 **
## age_dif -0.0215559 0.0024304 -8.869 < 2e-16 ***
## rank_dif -0.0031206 0.0001456 -21.434 < 2e-16 ***
## avg_ace -0.1791480 0.0094157 -19.027 < 2e-16 ***
## avg_df 0.0724162 0.0182218 3.974 7.06e-05 ***
## avg_svpt -0.1970575 0.0119891 -16.436 < 2e-16 ***
## avg_firstWon 0.2961881 0.0144225 20.537 < 2e-16 ***
## avg_secWon 0.2918856 0.0165034 17.686 < 2e-16 ***
## avg_bpSaved 0.1431648 0.0496465 2.884 0.003931 **
## avg_opponent_ace 0.1800052 0.0093758 19.199 < 2e-16 ***
## avg_opponent_df -0.0801316 0.0182370 -4.394 1.11e-05 ***
## avg_opponent_svpt 0.2065342 0.0120481 17.142 < 2e-16 ***
## avg_opponent_firstWon -0.3001553 0.0144104 -20.829 < 2e-16 ***
## avg_opponent_secWon -0.3054085 0.0165406 -18.464 < 2e-16 ***
## avg_opponent_bpSaved -0.1661864 0.0490487 -3.388 0.000704 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41345 on 29823 degrees of freedom
## Residual deviance: 36628 on 29800 degrees of freedom
## AIC: 36676
##
## Number of Fisher Scoring iterations: 4
vif(log.clay) # multicollinearity in the model: next removing tourney level & avg_svpt & avg_opponent_svpt
## GVIF Df GVIF^(1/(2*Df))
## draw_size 13.339209 1 3.652288
## tourney_level 15.460497 3 1.578352
## hand 1.097317 1 1.047529
## opponent_hand 1.093858 1 1.045877
## seeded 1.346841 1 1.160535
## opponent_seeded 1.346009 1 1.160176
## ht_dif 1.901405 1 1.378915
## age_dif 1.047044 1 1.023252
## rank_dif 1.295529 1 1.138213
## avg_ace 4.480368 1 2.116688
## avg_df 1.419792 1 1.191550
## avg_svpt 14.965402 1 3.868514
## avg_firstWon 12.767963 1 3.573229
## avg_secWon 5.709229 1 2.389399
## avg_bpSaved 4.838270 1 2.199607
## avg_opponent_ace 4.401105 1 2.097881
## avg_opponent_df 1.424108 1 1.193360
## avg_opponent_svpt 15.203005 1 3.899103
## avg_opponent_firstWon 12.771268 1 3.573691
## avg_opponent_secWon 5.760324 1 2.400068
## avg_opponent_bpSaved 4.811531 1 2.193520
log.clay = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn - avg_svpt
-avg_opponent_svpt -tourney_level,data = clay_train, family = binomial)
summary(log.clay)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn -
## avg_svpt - avg_opponent_svpt - tourney_level, family = binomial,
## data = clay_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.877e-01 3.173e-01 -0.591 0.55429
## draw_size -3.859e-05 3.639e-04 -0.106 0.91554
## handR -5.780e-02 3.703e-02 -1.561 0.11851
## opponent_handR 9.826e-02 3.704e-02 2.653 0.00798 **
## seededYes 4.400e-01 2.984e-02 14.747 < 2e-16 ***
## opponent_seededYes -4.300e-01 2.980e-02 -14.428 < 2e-16 ***
## ht_dif 5.202e-03 1.877e-03 2.771 0.00559 **
## age_dif -2.309e-02 2.406e-03 -9.595 < 2e-16 ***
## rank_dif -3.470e-03 1.477e-04 -23.488 < 2e-16 ***
## avg_ace -1.330e-01 8.974e-03 -14.819 < 2e-16 ***
## avg_df -2.875e-03 1.756e-02 -0.164 0.86997
## avg_firstWon 8.081e-02 5.781e-03 13.980 < 2e-16 ***
## avg_secWon 6.238e-02 8.544e-03 7.301 2.86e-13 ***
## avg_bpSaved -4.959e-01 3.177e-02 -15.609 < 2e-16 ***
## avg_opponent_ace 1.313e-01 8.928e-03 14.713 < 2e-16 ***
## avg_opponent_df -8.219e-04 1.754e-02 -0.047 0.96262
## avg_opponent_firstWon -7.510e-02 5.719e-03 -13.130 < 2e-16 ***
## avg_opponent_secWon -6.404e-02 8.513e-03 -7.523 5.34e-14 ***
## avg_opponent_bpSaved 4.967e-01 3.147e-02 15.786 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41345 on 29823 degrees of freedom
## Residual deviance: 37191 on 29805 degrees of freedom
## AIC: 37229
##
## Number of Fisher Scoring iterations: 4
vif(log.clay) # no multicollinearity
## draw_size hand opponent_hand
## 1.067150 1.042132 1.042489
## seeded opponent_seeded ht_dif
## 1.265340 1.267727 1.896591
## age_dif rank_dif avg_ace
## 1.048968 1.299819 4.130822
## avg_df avg_firstWon avg_secWon
## 1.344295 2.107831 1.562453
## avg_bpSaved avg_opponent_ace avg_opponent_df
## 1.923550 4.054069 1.342097
## avg_opponent_firstWon avg_opponent_secWon avg_opponent_bpSaved
## 2.072292 1.555920 1.912698
predprob_log_clay <- predict.glm(log.clay, clay_test, type = "response")
predclass_log_clay = ifelse(predprob_log_clay >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log_clay), clay_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2428 1268
## 1 1303 2457
##
## Accuracy : 0.6552
## 95% CI : (0.6443, 0.666)
## No Information Rate : 0.5004
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3104
##
## Mcnemar's Test P-Value : 0.5025
##
## Sensitivity : 0.6596
## Specificity : 0.6508
## Pos Pred Value : 0.6535
## Neg Pred Value : 0.6569
## Prevalence : 0.4996
## Detection Rate : 0.3295
## Detection Prevalence : 0.5043
## Balanced Accuracy : 0.6552
##
## 'Positive' Class : 1
##
# Accuracy : 0.6582
# Sensitivity : 0.6524
# Specificity : 0.6441
#Logistic Regression with Stepwise Selection
null_model_clay = glm(outcome ~ 1, data = clay_train, family = binomial)
full_model_clay = log.clay
step.model.AIC.clay = step(null_model_clay, scope = list(upper = full_model_clay),
direction = "both", test = "Chisq", trace = F)
summary(step.model.AIC.clay)
##
## Call:
## glm(formula = outcome ~ rank_dif + seeded + opponent_seeded +
## age_dif + avg_bpSaved + avg_opponent_bpSaved + avg_opponent_ace +
## avg_opponent_firstWon + avg_ace + avg_firstWon + avg_opponent_secWon +
## avg_secWon + ht_dif + opponent_hand + hand, family = binomial,
## data = clay_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1894962 0.3154610 -0.601 0.54804
## rank_dif -0.0034700 0.0001476 -23.502 < 2e-16 ***
## seededYes 0.4397208 0.0297794 14.766 < 2e-16 ***
## opponent_seededYes -0.4301902 0.0297434 -14.463 < 2e-16 ***
## age_dif -0.0230806 0.0024037 -9.602 < 2e-16 ***
## avg_bpSaved -0.4976465 0.0295829 -16.822 < 2e-16 ***
## avg_opponent_bpSaved 0.4963505 0.0293100 16.934 < 2e-16 ***
## avg_opponent_ace 0.1312353 0.0084006 15.622 < 2e-16 ***
## avg_opponent_firstWon -0.0751005 0.0055438 -13.547 < 2e-16 ***
## avg_ace -0.1334481 0.0084528 -15.787 < 2e-16 ***
## avg_firstWon 0.0809710 0.0056133 14.425 < 2e-16 ***
## avg_opponent_secWon -0.0641456 0.0084699 -7.573 3.64e-14 ***
## avg_secWon 0.0622087 0.0084992 7.319 2.49e-13 ***
## ht_dif 0.0052011 0.0018773 2.770 0.00560 **
## opponent_handR 0.0983094 0.0369720 2.659 0.00784 **
## handR -0.0574733 0.0369594 -1.555 0.11994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41345 on 29823 degrees of freedom
## Residual deviance: 37191 on 29808 degrees of freedom
## AIC: 37223
##
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise
log.sel.clay <- glm(outcome ~ rank_dif + opponent_seeded + seeded + age_dif + avg_opponent_bpSaved +
avg_opponent_ace + avg_opponent_firstWon + avg_ace + avg_firstWon + avg_opponent_secWon +
avg_secWon + ht_dif + opponent_hand + hand,
clay_train, family = binomial)
summary(log.sel.clay)
##
## Call:
## glm(formula = outcome ~ rank_dif + opponent_seeded + seeded +
## age_dif + avg_opponent_bpSaved + avg_opponent_ace + avg_opponent_firstWon +
## avg_ace + avg_firstWon + avg_opponent_secWon + avg_secWon +
## ht_dif + opponent_hand + hand, family = binomial, data = clay_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4069672 0.3055966 -4.604 4.14e-06 ***
## rank_dif -0.0036981 0.0001495 -24.732 < 2e-16 ***
## opponent_seededYes -0.4000148 0.0295300 -13.546 < 2e-16 ***
## seededYes 0.5795525 0.0286277 20.244 < 2e-16 ***
## age_dif -0.0247842 0.0023921 -10.361 < 2e-16 ***
## avg_opponent_bpSaved 0.4529503 0.0289840 15.628 < 2e-16 ***
## avg_opponent_ace 0.1253500 0.0083447 15.021 < 2e-16 ***
## avg_opponent_firstWon -0.0712278 0.0055154 -12.914 < 2e-16 ***
## avg_ace -0.0608434 0.0072146 -8.433 < 2e-16 ***
## avg_firstWon 0.0617944 0.0054473 11.344 < 2e-16 ***
## avg_opponent_secWon -0.0576471 0.0084165 -6.849 7.42e-12 ***
## avg_secWon 0.0157417 0.0079618 1.977 0.0480 *
## ht_dif 0.0045750 0.0018672 2.450 0.0143 *
## opponent_handR 0.0895571 0.0368133 2.433 0.0150 *
## handR 0.0091659 0.0364372 0.252 0.8014
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41345 on 29823 degrees of freedom
## Residual deviance: 37490 on 29809 degrees of freedom
## AIC: 37520
##
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2_clay <- predict(log.sel.clay, newdata = clay_test, type = "response")
logistic_pred_class2_clay <- ifelse(logistic_pred2_clay > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2_clay), clay_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2430 1308
## 1 1301 2417
##
## Accuracy : 0.6501
## 95% CI : (0.6391, 0.6609)
## No Information Rate : 0.5004
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3002
##
## Mcnemar's Test P-Value : 0.9065
##
## Sensitivity : 0.6489
## Specificity : 0.6513
## Pos Pred Value : 0.6501
## Neg Pred Value : 0.6501
## Prevalence : 0.4996
## Detection Rate : 0.3242
## Detection Prevalence : 0.4987
## Balanced Accuracy : 0.6501
##
## 'Positive' Class : 1
##
# Accuracy : 0.6451
# Sensitivity : 0.6426
# Specificity : 0.6476
# Logistic Regression: GRASS
log.grass <- glm(outcome ~ ., data = grass_train, family = binomial)
summary(log.grass)
##
## Call:
## glm(formula = outcome ~ ., family = binomial, data = grass_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.590e-01 7.185e-01 -0.221 0.82486
## draw_size 6.533e-05 2.753e-03 0.024 0.98107
## tourney_levelD -1.196e-02 2.180e-01 -0.055 0.95624
## tourney_levelG 6.170e-04 2.544e-01 0.002 0.99806
## handR 7.219e-02 6.914e-02 1.044 0.29637
## opponent_handR -4.056e-02 6.865e-02 -0.591 0.55460
## seededYes 3.726e-01 5.542e-02 6.724 1.77e-11 ***
## opponent_seededYes -2.884e-01 5.525e-02 -5.220 1.79e-07 ***
## ht_dif 3.340e-03 3.362e-03 0.993 0.32063
## age_dif 1.256e-02 3.990e-03 3.147 0.00165 **
## rank_dif -2.196e-03 2.207e-04 -9.950 < 2e-16 ***
## avg_ace -1.811e-01 2.070e-02 -8.749 < 2e-16 ***
## avg_df 2.161e-02 3.111e-02 0.695 0.48723
## avg_svpt -1.904e-01 3.100e-02 -6.142 8.15e-10 ***
## avg_firstIn -1.322e-01 4.035e-02 -3.277 0.00105 **
## avg_firstWon 4.901e-01 4.242e-02 11.555 < 2e-16 ***
## avg_secWon 3.209e-01 5.087e-02 6.310 2.79e-10 ***
## avg_bpSaved 2.027e-01 9.973e-02 2.033 0.04210 *
## avg_opponent_ace 1.652e-01 2.080e-02 7.943 1.97e-15 ***
## avg_opponent_df 8.525e-04 3.152e-02 0.027 0.97842
## avg_opponent_svpt 1.968e-01 3.170e-02 6.210 5.31e-10 ***
## avg_opponent_firstIn 1.156e-01 4.093e-02 2.825 0.00473 **
## avg_opponent_firstWon -4.738e-01 4.247e-02 -11.158 < 2e-16 ***
## avg_opponent_secWon -3.313e-01 5.202e-02 -6.369 1.90e-10 ***
## avg_opponent_bpSaved -1.972e-01 1.010e-01 -1.952 0.05099 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13933 on 10050 degrees of freedom
## Residual deviance: 11877 on 10026 degrees of freedom
## AIC: 11927
##
## Number of Fisher Scoring iterations: 4
vif(log.grass) # Check for multicollinearity in the model
## GVIF Df GVIF^(1/(2*Df))
## draw_size 32.185822 1 5.673255
## tourney_level 32.739287 2 2.392034
## hand 1.129083 1 1.062583
## opponent_hand 1.132450 1 1.064166
## seeded 1.401319 1 1.183773
## opponent_seeded 1.396307 1 1.181654
## ht_dif 2.232477 1 1.494148
## age_dif 1.046681 1 1.023074
## rank_dif 1.278566 1 1.130737
## avg_ace 9.265607 1 3.043946
## avg_df 1.793296 1 1.339140
## avg_svpt 27.994225 1 5.290957
## avg_firstIn 42.556577 1 6.523540
## avg_firstWon 36.833044 1 6.069023
## avg_secWon 14.578655 1 3.818200
## avg_bpSaved 6.054831 1 2.460657
## avg_opponent_ace 9.113666 1 3.018885
## avg_opponent_df 1.802179 1 1.342453
## avg_opponent_svpt 27.898705 1 5.281922
## avg_opponent_firstIn 42.672727 1 6.532437
## avg_opponent_firstWon 35.984623 1 5.998718
## avg_opponent_secWon 15.109383 1 3.887079
## avg_opponent_bpSaved 6.037939 1 2.457222
log.grass = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn,data = grass_train, family = binomial)
summary(log.grass)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn,
## family = binomial, data = grass_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1623039 0.7202110 -0.225 0.82170
## draw_size 0.0001289 0.0027487 0.047 0.96259
## tourney_levelD -0.0039259 0.2179471 -0.018 0.98563
## tourney_levelG -0.0053831 0.2540003 -0.021 0.98309
## handR 0.0945248 0.0686668 1.377 0.16864
## opponent_handR -0.0626694 0.0680216 -0.921 0.35689
## seededYes 0.3749291 0.0553853 6.769 1.29e-11 ***
## opponent_seededYes -0.2893566 0.0552117 -5.241 1.60e-07 ***
## ht_dif 0.0057254 0.0033111 1.729 0.08378 .
## age_dif 0.0119339 0.0039849 2.995 0.00275 **
## rank_dif -0.0021672 0.0002199 -9.857 < 2e-16 ***
## avg_ace -0.1381040 0.0157273 -8.781 < 2e-16 ***
## avg_df 0.0650148 0.0282422 2.302 0.02133 *
## avg_svpt -0.2571421 0.0231787 -11.094 < 2e-16 ***
## avg_firstWon 0.3842614 0.0281062 13.672 < 2e-16 ***
## avg_secWon 0.4509656 0.0318101 14.177 < 2e-16 ***
## avg_bpSaved 0.1889544 0.1002023 1.886 0.05933 .
## avg_opponent_ace 0.1273364 0.0157673 8.076 6.69e-16 ***
## avg_opponent_df -0.0364848 0.0284553 -1.282 0.19978
## avg_opponent_svpt 0.2557677 0.0234120 10.925 < 2e-16 ***
## avg_opponent_firstWon -0.3814238 0.0283334 -13.462 < 2e-16 ***
## avg_opponent_secWon -0.4453007 0.0322620 -13.803 < 2e-16 ***
## avg_opponent_bpSaved -0.1884972 0.1014692 -1.858 0.06321 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13933 on 10050 degrees of freedom
## Residual deviance: 11895 on 10028 degrees of freedom
## AIC: 11941
##
## Number of Fisher Scoring iterations: 4
vif(log.grass)
## GVIF Df GVIF^(1/(2*Df))
## draw_size 32.157829 1 5.670787
## tourney_level 32.661972 2 2.390620
## hand 1.116630 1 1.056707
## opponent_hand 1.115075 1 1.055971
## seeded 1.401770 1 1.183964
## opponent_seeded 1.396611 1 1.181783
## ht_dif 2.159519 1 1.469530
## age_dif 1.045118 1 1.022310
## rank_dif 1.273769 1 1.128614
## avg_ace 5.301774 1 2.302558
## avg_df 1.466883 1 1.211150
## avg_svpt 15.453427 1 3.931085
## avg_firstWon 16.009525 1 4.001190
## avg_secWon 5.710721 1 2.389712
## avg_bpSaved 6.096597 1 2.469129
## avg_opponent_ace 5.196379 1 2.279557
## avg_opponent_df 1.460520 1 1.208520
## avg_opponent_svpt 15.101703 1 3.886091
## avg_opponent_firstWon 15.889777 1 3.986198
## avg_opponent_secWon 5.826479 1 2.413810
## avg_opponent_bpSaved 6.078088 1 2.465378
log.grass = glm(outcome ~ . -avg_opponent_firstIn -avg_firstIn -tourney_level -avg_firstWon
-avg_opponent_firstWon,data = grass_train, family = binomial)
summary(log.grass)
##
## Call:
## glm(formula = outcome ~ . - avg_opponent_firstIn - avg_firstIn -
## tourney_level - avg_firstWon - avg_opponent_firstWon, family = binomial,
## data = grass_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.378e-01 6.763e-01 -0.352 0.72510
## draw_size 4.394e-05 4.807e-04 0.091 0.92716
## handR -1.160e-01 6.519e-02 -1.780 0.07513 .
## opponent_handR 1.360e-01 6.475e-02 2.100 0.03570 *
## seededYes 5.336e-01 5.233e-02 10.197 < 2e-16 ***
## opponent_seededYes -4.465e-01 5.218e-02 -8.557 < 2e-16 ***
## ht_dif 8.528e-03 3.259e-03 2.616 0.00889 **
## age_dif 7.233e-03 3.900e-03 1.855 0.06366 .
## rank_dif -2.414e-03 2.213e-04 -10.908 < 2e-16 ***
## avg_ace -1.882e-02 1.295e-02 -1.453 0.14624
## avg_df -4.297e-02 2.643e-02 -1.626 0.10391
## avg_svpt 2.633e-02 8.782e-03 2.998 0.00271 **
## avg_secWon 8.036e-02 1.581e-02 5.083 3.72e-07 ***
## avg_bpSaved -7.009e-01 7.113e-02 -9.855 < 2e-16 ***
## avg_opponent_ace 1.061e-02 1.295e-02 0.820 0.41248
## avg_opponent_df 6.849e-02 2.668e-02 2.567 0.01026 *
## avg_opponent_svpt -2.625e-02 8.879e-03 -2.957 0.00311 **
## avg_opponent_secWon -7.299e-02 1.587e-02 -4.600 4.23e-06 ***
## avg_opponent_bpSaved 7.097e-01 7.134e-02 9.948 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13933 on 10050 degrees of freedom
## Residual deviance: 12277 on 10032 degrees of freedom
## AIC: 12315
##
## Number of Fisher Scoring iterations: 4
vif(log.grass) # no multicollinearity
## draw_size hand opponent_hand
## 1.028236 1.050715 1.052344
## seeded opponent_seeded ht_dif
## 1.303862 1.300475 2.131194
## age_dif rank_dif avg_ace
## 1.040036 1.280145 3.670248
## avg_df avg_svpt avg_secWon
## 1.348390 2.390852 1.474428
## avg_bpSaved avg_opponent_ace avg_opponent_df
## 3.161097 3.577481 1.343232
## avg_opponent_svpt avg_opponent_secWon avg_opponent_bpSaved
## 2.332993 1.470113 3.080825
# Prediction on test set
predprob_log_grass <- predict.glm(log.grass, grass_test, type = "response")
predclass_log_grass <- ifelse(predprob_log_grass >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log_grass), grass_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 816 410
## 1 415 872
##
## Accuracy : 0.6717
## 95% CI : (0.653, 0.6901)
## No Information Rate : 0.5101
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3431
##
## Mcnemar's Test P-Value : 0.8892
##
## Sensitivity : 0.6802
## Specificity : 0.6629
## Pos Pred Value : 0.6775
## Neg Pred Value : 0.6656
## Prevalence : 0.5101
## Detection Rate : 0.3470
## Detection Prevalence : 0.5121
## Balanced Accuracy : 0.6715
##
## 'Positive' Class : 1
##
# Accuracy : 0.6811
# Sensitivity : 0.6940
# Specificity : 0.6675
#Logistic Regression with Stepwise Selection
null_model_grass = glm(outcome ~ 1, data = grass_train, family = binomial)
full_model_grass = log.grass
step.model.AIC.grass = step(null_model_grass, scope = list(upper = full_model_grass),
direction = "both", test = "Chisq", trace = F)
summary(step.model.AIC.grass)
##
## Call:
## glm(formula = outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved +
## seeded + opponent_seeded + avg_secWon + avg_opponent_secWon +
## avg_opponent_svpt + avg_svpt + avg_opponent_df + opponent_hand +
## avg_df + ht_dif + age_dif + hand, family = binomial, data = grass_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1387030 0.6477310 -0.214 0.83044
## rank_dif -0.0024195 0.0002211 -10.946 < 2e-16 ***
## avg_opponent_bpSaved 0.6786043 0.0502779 13.497 < 2e-16 ***
## avg_bpSaved -0.6322881 0.0495441 -12.762 < 2e-16 ***
## seededYes 0.5424305 0.0518621 10.459 < 2e-16 ***
## opponent_seededYes -0.4512300 0.0517121 -8.726 < 2e-16 ***
## avg_secWon 0.0769666 0.0156630 4.914 8.93e-07 ***
## avg_opponent_secWon -0.0698150 0.0157051 -4.445 8.77e-06 ***
## avg_opponent_svpt -0.0237746 0.0076310 -3.116 0.00184 **
## avg_svpt 0.0203315 0.0074957 2.712 0.00668 **
## avg_opponent_df 0.0727106 0.0251184 2.895 0.00379 **
## opponent_handR 0.1325162 0.0644790 2.055 0.03986 *
## avg_df -0.0544727 0.0248139 -2.195 0.02815 *
## ht_dif 0.0059726 0.0027771 2.151 0.03151 *
## age_dif 0.0067419 0.0038876 1.734 0.08288 .
## handR -0.1077899 0.0649021 -1.661 0.09675 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13933 on 10050 degrees of freedom
## Residual deviance: 12280 on 10035 degrees of freedom
## AIC: 12312
##
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise
log.sel.grass <- glm(outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved + opponent_seeded + seeded +
avg_opponent_secWon + avg_secWon + ht_dif + avg_df + avg_svpt + age_dif + avg_opponent_svpt +
avg_opponent_df + opponent_hand + hand,
grass_train, family = binomial)
summary(log.sel.grass)
##
## Call:
## glm(formula = outcome ~ rank_dif + avg_opponent_bpSaved + avg_bpSaved +
## opponent_seeded + seeded + avg_opponent_secWon + avg_secWon +
## ht_dif + avg_df + avg_svpt + age_dif + avg_opponent_svpt +
## avg_opponent_df + opponent_hand + hand, family = binomial,
## data = grass_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1387030 0.6477310 -0.214 0.83044
## rank_dif -0.0024195 0.0002211 -10.946 < 2e-16 ***
## avg_opponent_bpSaved 0.6786043 0.0502779 13.497 < 2e-16 ***
## avg_bpSaved -0.6322881 0.0495441 -12.762 < 2e-16 ***
## opponent_seededYes -0.4512300 0.0517121 -8.726 < 2e-16 ***
## seededYes 0.5424305 0.0518621 10.459 < 2e-16 ***
## avg_opponent_secWon -0.0698150 0.0157051 -4.445 8.77e-06 ***
## avg_secWon 0.0769666 0.0156630 4.914 8.93e-07 ***
## ht_dif 0.0059726 0.0027771 2.151 0.03151 *
## avg_df -0.0544727 0.0248139 -2.195 0.02815 *
## avg_svpt 0.0203315 0.0074957 2.712 0.00668 **
## age_dif 0.0067419 0.0038876 1.734 0.08288 .
## avg_opponent_svpt -0.0237746 0.0076310 -3.116 0.00184 **
## avg_opponent_df 0.0727106 0.0251184 2.895 0.00379 **
## opponent_handR 0.1325162 0.0644790 2.055 0.03986 *
## handR -0.1077899 0.0649021 -1.661 0.09675 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13933 on 10050 degrees of freedom
## Residual deviance: 12280 on 10035 degrees of freedom
## AIC: 12312
##
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2_grass <- predict(log.sel.grass, newdata = grass_test, type = "response")
logistic_pred_class2_grass <- ifelse(logistic_pred2_grass > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2_grass), grass_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 818 412
## 1 413 870
##
## Accuracy : 0.6717
## 95% CI : (0.653, 0.6901)
## No Information Rate : 0.5101
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3431
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.6786
## Specificity : 0.6645
## Pos Pred Value : 0.6781
## Neg Pred Value : 0.6650
## Prevalence : 0.5101
## Detection Rate : 0.3462
## Detection Prevalence : 0.5105
## Balanced Accuracy : 0.6716
##
## 'Positive' Class : 1
##
# Accuracy : 0.6803
# Sensitivity : 0.6924
# Specificity : 0.6675
# Logistic Regression: HARD
log.hard <- glm(outcome ~ ., data = hard_train, family = binomial)
summary(log.hard)
##
## Call:
## glm(formula = outcome ~ ., family = binomial, data = hard_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.979e-01 2.935e-01 -0.674 0.500312
## draw_size 4.644e-04 6.325e-04 0.734 0.462749
## tourney_levelD 1.208e-03 5.082e-02 0.024 0.981029
## tourney_levelF 8.157e-03 9.143e-02 0.089 0.928907
## tourney_levelG -3.193e-02 6.530e-02 -0.489 0.624818
## tourney_levelM -1.284e-02 3.877e-02 -0.331 0.740519
## handR 1.074e-01 3.074e-02 3.492 0.000479 ***
## opponent_handR -8.407e-02 3.084e-02 -2.726 0.006410 **
## seededYes 3.493e-01 2.361e-02 14.793 < 2e-16 ***
## opponent_seededYes -3.848e-01 2.363e-02 -16.285 < 2e-16 ***
## ht_dif 3.621e-05 1.514e-03 0.024 0.980926
## age_dif -1.308e-02 1.768e-03 -7.401 1.35e-13 ***
## rank_dif -2.879e-03 1.084e-04 -26.561 < 2e-16 ***
## avg_ace -1.468e-01 8.554e-03 -17.158 < 2e-16 ***
## avg_df 2.519e-02 1.467e-02 1.717 0.085988 .
## avg_svpt -1.632e-01 1.363e-02 -11.974 < 2e-16 ***
## avg_firstIn -4.973e-02 1.737e-02 -2.863 0.004191 **
## avg_firstWon 3.280e-01 1.730e-02 18.964 < 2e-16 ***
## avg_secWon 2.898e-01 2.276e-02 12.732 < 2e-16 ***
## avg_bpSaved -1.712e-03 4.261e-02 -0.040 0.967948
## avg_opponent_ace 1.334e-01 8.532e-03 15.630 < 2e-16 ***
## avg_opponent_df -1.438e-02 1.469e-02 -0.979 0.327482
## avg_opponent_svpt 1.619e-01 1.373e-02 11.792 < 2e-16 ***
## avg_opponent_firstIn 4.022e-02 1.739e-02 2.313 0.020715 *
## avg_opponent_firstWon -3.055e-01 1.726e-02 -17.704 < 2e-16 ***
## avg_opponent_secWon -2.868e-01 2.288e-02 -12.535 < 2e-16 ***
## avg_opponent_bpSaved -8.678e-03 4.303e-02 -0.202 0.840173
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71921 on 51879 degrees of freedom
## Residual deviance: 62967 on 51853 degrees of freedom
## AIC: 63021
##
## Number of Fisher Scoring iterations: 4
vif(log.hard) # Check for multicollinearity in the model
## GVIF Df GVIF^(1/(2*Df))
## draw_size 7.046722 1 2.654566
## tourney_level 7.884077 4 1.294476
## hand 1.129175 1 1.062626
## opponent_hand 1.131329 1 1.063640
## seeded 1.373379 1 1.171913
## opponent_seeded 1.376127 1 1.173084
## ht_dif 2.382946 1 1.543679
## age_dif 1.041681 1 1.020628
## rank_dif 1.248194 1 1.117226
## avg_ace 8.486426 1 2.913147
## avg_df 1.770458 1 1.330586
## avg_svpt 30.557885 1 5.527919
## avg_firstIn 43.364966 1 6.585208
## avg_firstWon 34.243652 1 5.851808
## avg_secWon 15.783104 1 3.972795
## avg_bpSaved 6.001001 1 2.449694
## avg_opponent_ace 8.390360 1 2.896612
## avg_opponent_df 1.774788 1 1.332212
## avg_opponent_svpt 31.163999 1 5.582472
## avg_opponent_firstIn 43.070123 1 6.562783
## avg_opponent_firstWon 33.936159 1 5.825475
## avg_opponent_secWon 15.857414 1 3.982137
## avg_opponent_bpSaved 6.109295 1 2.471699
log.hard = glm(outcome ~ . -avg_firstIn -avg_opponent_firstIn -avg_firstWon
-avg_opponent_firstWon,data = hard_train, family = binomial)
summary(log.hard)
##
## Call:
## glm(formula = outcome ~ . - avg_firstIn - avg_opponent_firstIn -
## avg_firstWon - avg_opponent_firstWon, family = binomial,
## data = hard_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1780813 0.2883215 -0.618 0.53681
## draw_size 0.0003151 0.0006236 0.505 0.61330
## tourney_levelD -0.0112444 0.0501275 -0.224 0.82251
## tourney_levelF 0.0133216 0.0891697 0.149 0.88124
## tourney_levelG -0.0165964 0.0643032 -0.258 0.79633
## tourney_levelM -0.0016953 0.0379620 -0.045 0.96438
## handR -0.0638445 0.0292062 -2.186 0.02882 *
## opponent_handR 0.0814752 0.0292509 2.785 0.00535 **
## seededYes 0.4635500 0.0226958 20.424 < 2e-16 ***
## opponent_seededYes -0.4908463 0.0226906 -21.632 < 2e-16 ***
## ht_dif 0.0026397 0.0014642 1.803 0.07141 .
## age_dif -0.0176499 0.0017455 -10.112 < 2e-16 ***
## rank_dif -0.0033310 0.0001102 -30.216 < 2e-16 ***
## avg_ace -0.0442513 0.0057542 -7.690 1.47e-14 ***
## avg_df -0.0530250 0.0123623 -4.289 1.79e-05 ***
## avg_svpt 0.0243822 0.0038295 6.367 1.93e-10 ***
## avg_secWon 0.0663137 0.0068963 9.616 < 2e-16 ***
## avg_bpSaved -0.6842932 0.0317877 -21.527 < 2e-16 ***
## avg_opponent_ace 0.0395826 0.0057536 6.880 6.00e-12 ***
## avg_opponent_df 0.0616188 0.0123677 4.982 6.29e-07 ***
## avg_opponent_svpt -0.0199646 0.0038442 -5.193 2.06e-07 ***
## avg_opponent_secWon -0.0679136 0.0069297 -9.800 < 2e-16 ***
## avg_opponent_bpSaved 0.6433371 0.0316674 20.315 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71921 on 51879 degrees of freedom
## Residual deviance: 64126 on 51857 degrees of freedom
## AIC: 64172
##
## Number of Fisher Scoring iterations: 4
vif(log.hard) #no multicolienarity
## GVIF Df GVIF^(1/(2*Df))
## draw_size 7.044136 1 2.654079
## tourney_level 7.680766 4 1.290255
## hand 1.046553 1 1.023012
## opponent_hand 1.045216 1 1.022358
## seeded 1.301851 1 1.140987
## opponent_seeded 1.301567 1 1.140862
## ht_dif 2.255708 1 1.501902
## age_dif 1.037123 1 1.018393
## rank_dif 1.254125 1 1.119877
## avg_ace 3.876487 1 1.968880
## avg_df 1.288771 1 1.135241
## avg_svpt 2.490405 1 1.578102
## avg_secWon 1.491040 1 1.221081
## avg_bpSaved 3.337821 1 1.826971
## avg_opponent_ace 3.859613 1 1.964590
## avg_opponent_df 1.289440 1 1.135535
## avg_opponent_svpt 2.497862 1 1.580463
## avg_opponent_secWon 1.495456 1 1.222889
## avg_opponent_bpSaved 3.315152 1 1.820756
# Prediction on test set
predprob_log_hard <- predict.glm(log.hard, hard_test, type = "response")
predclass_log_hard <- ifelse(predprob_log_hard >= 0.5, yes = 1, 0)
caret::confusionMatrix(as.factor(predclass_log_hard), hard_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4306 2164
## 1 2187 4313
##
## Accuracy : 0.6645
## 95% CI : (0.6563, 0.6727)
## No Information Rate : 0.5006
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3291
##
## Mcnemar's Test P-Value : 0.7387
##
## Sensitivity : 0.6659
## Specificity : 0.6632
## Pos Pred Value : 0.6635
## Neg Pred Value : 0.6655
## Prevalence : 0.4994
## Detection Rate : 0.3325
## Detection Prevalence : 0.5012
## Balanced Accuracy : 0.6645
##
## 'Positive' Class : 1
##
# Accuracy : 0.6666
# Sensitivity : 0.6645
# Specificity : 0.6687
#Logistic Regression with Stepwise Selection
null_model_hard = glm(outcome ~ 1, data = hard_train, family = binomial)
full_model_hard = log.hard
step.model.AIC.hard = step(null_model_hard, scope = list(upper = full_model_hard),
direction = "both", test = "Chisq", trace = F)
summary(step.model.AIC.hard)
##
## Call:
## glm(formula = outcome ~ rank_dif + avg_bpSaved + avg_opponent_bpSaved +
## opponent_seeded + seeded + avg_secWon + age_dif + avg_opponent_secWon +
## avg_opponent_df + avg_df + avg_ace + avg_svpt + avg_opponent_ace +
## avg_opponent_svpt + opponent_hand + hand + ht_dif, family = binomial,
## data = hard_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1717329 0.2878726 -0.597 0.55080
## rank_dif -0.0033310 0.0001102 -30.215 < 2e-16 ***
## avg_bpSaved -0.6850011 0.0315975 -21.679 < 2e-16 ***
## avg_opponent_bpSaved 0.6424905 0.0314932 20.401 < 2e-16 ***
## opponent_seededYes -0.4882382 0.0224056 -21.791 < 2e-16 ***
## seededYes 0.4661486 0.0224093 20.802 < 2e-16 ***
## avg_secWon 0.0663468 0.0068889 9.631 < 2e-16 ***
## age_dif -0.0176554 0.0017454 -10.115 < 2e-16 ***
## avg_opponent_secWon -0.0678954 0.0069242 -9.806 < 2e-16 ***
## avg_opponent_df 0.0616078 0.0123637 4.983 6.26e-07 ***
## avg_df -0.0530378 0.0123589 -4.291 1.77e-05 ***
## avg_ace -0.0443733 0.0057424 -7.727 1.10e-14 ***
## avg_svpt 0.0244759 0.0038221 6.404 1.52e-10 ***
## avg_opponent_ace 0.0394434 0.0057419 6.869 6.45e-12 ***
## avg_opponent_svpt -0.0198636 0.0038391 -5.174 2.29e-07 ***
## opponent_handR 0.0808226 0.0292391 2.764 0.00571 **
## handR -0.0645497 0.0291934 -2.211 0.02703 *
## ht_dif 0.0026368 0.0014641 1.801 0.07172 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71921 on 51879 degrees of freedom
## Residual deviance: 64127 on 51862 degrees of freedom
## AIC: 64163
##
## Number of Fisher Scoring iterations: 4
# Best model based on stepwise
log.sel.hard <- glm(outcome ~ rank_dif + avg_bpSaved + avg_opponent_bpSaved + seeded + opponent_seeded +
age_dif + avg_secWon + avg_opponent_secWon + avg_opponent_df + avg_ace + avg_svpt +
avg_opponent_ace + avg_opponent_svpt + avg_df + hand + opponent_hand + ht_dif,
hard_train, family = binomial)
summary(log.sel.hard)
##
## Call:
## glm(formula = outcome ~ rank_dif + avg_bpSaved + avg_opponent_bpSaved +
## seeded + opponent_seeded + age_dif + avg_secWon + avg_opponent_secWon +
## avg_opponent_df + avg_ace + avg_svpt + avg_opponent_ace +
## avg_opponent_svpt + avg_df + hand + opponent_hand + ht_dif,
## family = binomial, data = hard_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1717329 0.2878726 -0.597 0.55080
## rank_dif -0.0033310 0.0001102 -30.215 < 2e-16 ***
## avg_bpSaved -0.6850011 0.0315975 -21.679 < 2e-16 ***
## avg_opponent_bpSaved 0.6424905 0.0314932 20.401 < 2e-16 ***
## seededYes 0.4661486 0.0224093 20.802 < 2e-16 ***
## opponent_seededYes -0.4882382 0.0224056 -21.791 < 2e-16 ***
## age_dif -0.0176554 0.0017454 -10.115 < 2e-16 ***
## avg_secWon 0.0663468 0.0068889 9.631 < 2e-16 ***
## avg_opponent_secWon -0.0678954 0.0069242 -9.806 < 2e-16 ***
## avg_opponent_df 0.0616078 0.0123637 4.983 6.26e-07 ***
## avg_ace -0.0443733 0.0057424 -7.727 1.10e-14 ***
## avg_svpt 0.0244759 0.0038221 6.404 1.52e-10 ***
## avg_opponent_ace 0.0394434 0.0057419 6.869 6.45e-12 ***
## avg_opponent_svpt -0.0198636 0.0038391 -5.174 2.29e-07 ***
## avg_df -0.0530378 0.0123589 -4.291 1.77e-05 ***
## handR -0.0645497 0.0291934 -2.211 0.02703 *
## opponent_handR 0.0808226 0.0292391 2.764 0.00571 **
## ht_dif 0.0026368 0.0014641 1.801 0.07172 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71921 on 51879 degrees of freedom
## Residual deviance: 64127 on 51862 degrees of freedom
## AIC: 64163
##
## Number of Fisher Scoring iterations: 4
# predictions based on stepwise model
logistic_pred2_hard <- predict(log.sel.hard, newdata = hard_test, type = "response")
logistic_pred_class2_hard <- ifelse(logistic_pred2_hard > 0.5, yes = 1,0)
caret::confusionMatrix(as.factor(logistic_pred_class2_hard), hard_test$outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4306 2170
## 1 2187 4307
##
## Accuracy : 0.6641
## 95% CI : (0.6559, 0.6722)
## No Information Rate : 0.5006
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3281
##
## Mcnemar's Test P-Value : 0.8085
##
## Sensitivity : 0.6650
## Specificity : 0.6632
## Pos Pred Value : 0.6632
## Neg Pred Value : 0.6649
## Prevalence : 0.4994
## Detection Rate : 0.3321
## Detection Prevalence : 0.5007
## Balanced Accuracy : 0.6641
##
## 'Positive' Class : 1
##
# Accuracy : 0.6672
# Sensitivity : 0.6648
# Specificity : 0.6696
#Common Variables: Some variables appear consistently across different surface types, such as:
#rank_dif, seeded, opponent_seeded, avg_secWon, avg_opponent_secWon, and age_dif.
#This suggests that these variables have a consistent impact on match outcomes regardless of the surface type.
#Differing Variables:
#In the clay model, avg_ace has a negative coefficient, indicating that a higher average number of aces is associated with a lower probability of winning.
#In the grass model, avg_ace has a positive coefficient, suggesting that a higher average number of aces is associated with a higher probability of winning on grass.
#Similarly, other variables like ht_dif, avg_df, and avg_svpt also have coefficients that vary across surface types.
####
#### Does match length depend on the surface type?
####
levels(as.factor(match_outcomes$surface))
## [1] "" "Carpet" "Clay" "Grass" "Hard"
surface_subset <- match_outcomes[match_outcomes$surface != "Carpet", ]
levels(as.factor(surface_subset$best_of))
## [1] "3" "5"
surface_bestof3 <- surface_subset[surface_subset$best_of == 3, ]
surface_bestof5 <- surface_subset[surface_subset$best_of == 5, ]
lm_surface3 <- lm(minutes ~ surface, data = surface_bestof3)
summary(lm_surface3)
##
## Call:
## lm(formula = minutes ~ surface, data = surface_bestof3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.73 -24.27 -5.47 21.27 1048.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 100.7297 0.1888 533.60 <2e-16 ***
## surfaceGrass -8.2614 0.4324 -19.11 <2e-16 ***
## surfaceHard -3.4627 0.2385 -14.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.69 on 87295 degrees of freedom
## (6400 observations deleted due to missingness)
## Multiple R-squared: 0.004972, Adjusted R-squared: 0.004949
## F-statistic: 218.1 on 2 and 87295 DF, p-value: < 2.2e-16
# FOR BEST OF 3:
#The intercept (for matches played on clay) is estimated to be 100.7297 minutes.
#Matches played on Grass surface have, on average, 8.26 minutes shorter duration compared to matches played on clay.
#Matches played on Hard surface have, on average, 3.4627 minutes shorter duration compared to matches played on clay.
lm_surface5 <- lm(minutes ~ surface, data = surface_bestof5)
summary(lm_surface5)
##
## Call:
## lm(formula = minutes ~ surface, data = surface_bestof5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -151.34 -36.34 -7.34 32.66 521.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 151.9310 0.6693 226.985 <2e-16 ***
## surfaceGrass -8.8946 0.9724 -9.147 <2e-16 ***
## surfaceHard -0.5864 0.8176 -0.717 0.473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.02 on 21089 degrees of freedom
## (6400 observations deleted due to missingness)
## Multiple R-squared: 0.005307, Adjusted R-squared: 0.005213
## F-statistic: 56.26 on 2 and 21089 DF, p-value: < 2.2e-16
# FOR BEST OF 5:
#The intercept (for matches played on clay) is estimated to be 151.931 minutes.
#Matches played on Grass surface have, on average 8.89 minutes shorter duration than matches played on clay.
#Matches played on Hard surface have, on average 0.5864 minutes shorter duration than matches played on clay (NOT SIGNIFICANT)