Data - 2018 Tennis ATP Tour Results

I will build a regression model for predicting the point difference in a match, based on the 2018 Tennis ATP Tour results.

Reading data

library(rvest) 
library(dplyr)
library(knitr)
library(rcompanion)
library(MASS)
library(tidyverse)
library(caret)


# Load the match-level results file; it has a header row and comma-separated fields.
t_data <- read.csv("atp_matches_2018.csv", header = TRUE, sep = ",")

Drop matches that ended in a retirement or a walkover

# Treat the score as plain text so it can be pattern-matched.
t_data$score <- as.character(t_data$score)

# Keep only rows whose score does not contain "RET" and is not exactly "W/O".
t_data <- t_data[!grepl("RET", t_data$score) & t_data$score != 'W/O', ]

Calculating point difference

# Strip whitespace from the score string. The first pass removes every run of
# whitespace; the second pass targets runs of spaces, which the first pass has
# already removed.
t_data$score <- t_data$score %>%
  gsub('\\s+', '', .) %>%
  gsub(' +', '', .)


# Start the per-match point tally at zero.
t_data$points<-0
# tie(spos): for every match, look at the character at position `spos` of the
# whitespace-free score. If it is "(" (presumably the start of a tiebreak
# score -- TODO confirm against the data), derive an adjustment from
#   - the digit three characters earlier (7 selects the positive branch), and
#   - the digit right after the bracket: if it is below 6 the adjustment is
#     7 minus that digit, otherwise 2 (negated in the other branch).
# Any other character at `spos` contributes 0.
# NOTE(review): the assignment to t_data$points inside the function works on a
# function-local copy of t_data, so the global data frame is not modified, and
# the value returned by each call below is discarded. In addition,
# t_data$points is reassigned from the serve statistics right after these
# calls, so as written this block has no effect on the result -- confirm
# whether that is intended.
tie<-function(spos){t_data$points<-t_data$points+ifelse(str_sub(t_data$score,spos,spos)=='(',
                      ifelse(as.numeric(str_sub(t_data$score,spos-3,spos-3))==7,
                             ifelse(as.numeric(str_sub(t_data$score,spos+1,spos+1))<6,7-as.numeric(str_sub(t_data$score,spos+1,spos+1)),2),
                              ifelse(as.numeric(str_sub(t_data$score,spos+1,spos+1))<6,-7+as.numeric(str_sub(t_data$score,spos+1,spos+1)),-2)),0)}


# Probe the score positions 4, 7, 10 and 13 for an opening bracket.
tie(4)
tie(7)
tie(10)
tie(13)

# Point difference: points won by the winner minus points won by the loser.
# Going by the column names, each side's total is its own serve points won
# (1st + 2nd serve) plus the opponent's serve points that the opponent did
# not win.
t_data$points <- with(t_data, {
  winner_pts <- w_1stWon + w_2ndWon + (l_svpt - l_1stWon - l_2ndWon)
  loser_pts <- l_1stWon + l_2ndWon + (w_svpt - w_1stWon - w_2ndWon)
  winner_pts - loser_pts
})

Keeping only variables to be used later

# Keep only the columns used in the analysis. Each column is named once;
# select() returns a column a single time however often it is listed.
t_data1 <- dplyr::select(
  t_data,
  surface, draw_size, tourney_level,
  winner_seed, winner_hand, winner_ht, winner_age,
  winner_rank, winner_rank_points,
  loser_seed, loser_hand, loser_ht, loser_age,
  loser_rank, loser_rank_points,
  minutes, best_of, round,
  w_ace, w_df, w_svpt, w_1stIn, w_SvGms, w_bpFaced,
  l_ace, l_df, l_svpt, l_1stIn, l_SvGms, l_bpFaced,
  points
)

Regression

To review a data sample

# Show the first six rows of the working data set.
head(t_data1, n = 6L)
##   surface draw_size tourney_level winner_seed winner_hand winner_ht
## 1    Hard        32             A           3           R       193
## 2    Hard        32             A           3           R       193
## 3    Hard        32             A          NA           R       183
## 4    Hard        32             A           1           R       188
## 5    Hard        32             A           3           R       193
## 6    Hard        32             A          NA           R        NA
##   winner_age winner_rank winner_rank_points loser_seed loser_hand loser_ht
## 1   22.68309          21               2010         NA          R      183
## 2   22.68309          21               2010          1          R      188
## 3   25.65366          47               1010         NA          R       NA
## 4   26.63107           3               5150         NA          R       NA
## 5   22.68309          21               2010         NA          R      180
## 6   18.87201         208                245         NA          R       NA
##   loser_age loser_rank loser_rank_points minutes best_of round w_ace w_df
## 1  25.65366         47              1010      73       3     F    17    1
## 2  26.63107          3              5150      93       3    SF    19    2
## 3  18.87201        208               245     157       3    SF    14    5
## 4  22.98152         50               992     145       3    QF    12    9
## 5  29.14990         38              1231      90       3    QF    18    5
## 6  19.97536        175               299      74       3    QF     6    1
##   w_svpt w_1stIn w_SvGms w_bpFaced l_ace l_df l_svpt l_1stIn l_SvGms
## 1     56      40       9         5     8    3     58      32       9
## 2     72      50      13         2     5    6     74      44      13
## 3    115      70      16         6     9    5     94      62      16
## 4    103      65      16         2     6    1     89      53      15
## 5     69      43      13         3     6    3     75      41      13
## 6     46      23       8         3     3    5     51      29       8
##   l_bpFaced points
## 1         7     18
## 2         6     10
## 3         3     -5
## 4         6      6
## 5         4      6
## 6         7     23

Summary of data

# Per-column summary statistics rendered as a table.
t_data1 %>%
  summary() %>%
  kable()
surface draw_size tourney_level winner_seed winner_hand winner_ht winner_age winner_rank winner_rank_points loser_seed loser_hand loser_ht loser_age loser_rank loser_rank_points minutes best_of round w_ace w_df w_svpt w_1stIn w_SvGms w_bpFaced l_ace l_df l_svpt l_1stIn l_SvGms l_bpFaced points
Clay : 879 Min. : 4.00 A:1254 Min. : 1.000 : 3 Min. :163.0 Min. :14.93 Min. : 1.00 Min. : 1 Min. : 1.000 : 11 Min. :163 Min. :14.51 Min. : 1.0 Min. : 1.0 Min. : 31.0 Min. :3.000 R32 :736 Min. : 0.00 Min. : 0.000 Min. : 27.00 Min. : 15.00 Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 32.00 Min. : 15.00 Min. : 0.00 Min. : 0.000 Min. :-12.00
Grass: 323 1st Qu.: 32.00 D: 231 1st Qu.: 3.000 L: 342 1st Qu.:183.0 1st Qu.:24.00 1st Qu.: 21.00 1st Qu.: 634 1st Qu.: 4.000 L: 372 1st Qu.:183 1st Qu.:23.93 1st Qu.: 39.0 1st Qu.: 476.5 1st Qu.: 80.0 1st Qu.:3.000 R16 :429 1st Qu.: 3.00 1st Qu.: 1.000 1st Qu.: 59.00 1st Qu.: 36.00 1st Qu.: 9.00 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.: 62.00 1st Qu.: 37.00 1st Qu.: 9.00 1st Qu.: 6.000 1st Qu.: 8.00
Hard :1226 Median : 32.00 G: 488 Median : 6.000 R:2005 Median :185.0 Median :28.01 Median : 51.00 Median : 980 Median : 7.000 R:1921 Median :185 Median :27.74 Median : 72.0 Median : 755.0 Median :105.5 Median :3.000 R64 :354 Median : 6.00 Median : 2.000 Median : 77.00 Median : 47.00 Median :11.00 Median : 5.000 Median : 4.000 Median : 3.000 Median : 79.00 Median : 48.00 Median :11.00 Median : 8.000 Median : 14.00
NA Mean : 61.34 M: 455 Mean : 8.114 U: 78 Mean :186.9 Mean :27.63 Mean : 89.14 Mean : 1598 Mean : 9.307 U: 124 Mean :186 Mean :27.55 Mean : 120.8 Mean : 1006.8 Mean :112.8 Mean :3.449 R128 :306 Mean : 7.49 Mean : 2.918 Mean : 81.64 Mean : 50.65 Mean :11.98 Mean : 5.152 Mean : 5.641 Mean : 3.578 Mean : 84.69 Mean : 51.22 Mean :11.76 Mean : 8.729 Mean : 14.09
NA 3rd Qu.:128.00 NA 3rd Qu.:10.000 NA 3rd Qu.:190.0 3rd Qu.:31.10 3rd Qu.: 91.00 3rd Qu.: 1755 3rd Qu.:13.000 NA 3rd Qu.:190 3rd Qu.:30.81 3rd Qu.: 118.8 3rd Qu.: 1189.0 3rd Qu.:136.0 3rd Qu.:3.000 RR :231 3rd Qu.:10.00 3rd Qu.: 4.000 3rd Qu.: 97.00 3rd Qu.: 61.00 3rd Qu.:15.00 3rd Qu.: 7.000 3rd Qu.: 8.000 3rd Qu.: 5.000 3rd Qu.:101.00 3rd Qu.: 62.00 3rd Qu.:15.00 3rd Qu.:11.000 3rd Qu.: 20.00
NA Max. :128.00 NA Max. :32.000 NA Max. :208.0 Max. :39.38 Max. :1821.00 Max. :10600 Max. :32.000 NA Max. :208 Max. :39.42 Max. :1770.0 Max. :10060.0 Max. :396.0 Max. :5.000 QF :214 Max. :64.00 Max. :16.000 Max. :278.00 Max. :198.00 Max. :49.00 Max. :25.000 Max. :61.000 Max. :18.000 Max. :291.00 Max. :218.00 Max. :50.00 Max. :28.000 Max. : 46.00
NA NA NA NA’s :1508 NA NA’s :813 NA NA’s :7 NA’s :7 NA’s :1867 NA NA’s :1002 NA’s :3 NA’s :30 NA’s :30 NA’s :10 NA (Other):158 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13 NA’s :13

Cleaning Data

Deleting records with missing values and best-of-5 matches

# Keep best-of-3 matches that have a point difference and ranking points for
# both players. subset() also drops rows where the condition evaluates to NA.
t_data1 <- subset(
  t_data1,
  !is.na(points) &
    !is.na(winner_rank_points) &
    !is.na(loser_rank_points) &
    best_of == 3
)

Take care of missing values

# Blank handedness is recoded to "U".
t_data1$winner_hand <- replace(
  t_data1$winner_hand, t_data1$winner_hand == '', 'U'
)
t_data1$loser_hand <- replace(
  t_data1$loser_hand, t_data1$loser_hand == '', 'U'
)

# Seed becomes a two-level flag: "Y" when a seed is recorded, "N" when missing.
t_data1$winner_seed <- factor(ifelse(!is.na(t_data1$winner_seed), "Y", "N"))
t_data1$loser_seed <- factor(ifelse(!is.na(t_data1$loser_seed), "Y", "N"))

# Missing heights and match durations are filled with the column mean.
t_data1$winner_ht <- replace(
  t_data1$winner_ht,
  is.na(t_data1$winner_ht),
  mean(t_data1$winner_ht, na.rm = TRUE)
)
t_data1$loser_ht <- replace(
  t_data1$loser_ht,
  is.na(t_data1$loser_ht),
  mean(t_data1$loser_ht, na.rm = TRUE)
)
t_data1$minutes <- replace(
  t_data1$minutes,
  is.na(t_data1$minutes),
  mean(t_data1$minutes, na.rm = TRUE)
)

# Winner-minus-loser differences; ranks and ranking points are differenced on
# the log scale.
t_data1[["ht"]] <- with(t_data1, winner_ht - loser_ht)
t_data1[["age"]] <- with(t_data1, winner_age - loser_age)
t_data1[["rank"]] <- with(t_data1, log(winner_rank) - log(loser_rank))
t_data1[["rank_points"]] <- with(
  t_data1, log(winner_rank_points) - log(loser_rank_points)
)
t_data1[["ace"]] <- with(t_data1, w_ace - l_ace)
t_data1[["df"]] <- with(t_data1, w_df - l_df)

# Drop the raw columns that the differences replace, plus draw_size and best_of.
t_data1 <- dplyr::select(
  t_data1,
  -c(
    winner_ht, loser_ht, winner_age, loser_age, draw_size,
    winner_rank, loser_rank, winner_rank_points, loser_rank_points,
    w_ace, l_ace, best_of, w_df, l_df
  )
)

# Log-transform the serve-point counts and the match duration.
cols <- c("w_svpt", "minutes", "l_svpt")
t_data1[cols] <- lapply(t_data1[cols], log)

Histogram and density

library(purrr)
library(tidyr)
library(ggplot2)

# One histogram per numeric column: reshape the numeric columns into key/value
# pairs and facet on the key, with free axes per panel.
ggplot(gather(keep(t_data1, is.numeric)), aes(value)) +
  geom_histogram() +
  facet_wrap(~ key, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# One density curve per numeric column, laid out in separate panels with
# independent axes.
numeric_long <- function(d) gather(keep(d, is.numeric))
ggplot(numeric_long(t_data1), aes(value)) +
  geom_density() +
  facet_wrap(~ key, scales = "free")

Boxplot

# Draw a boxplot of one column of a data frame, split by the values of another.
#
# d        data frame holding both columns.
# column   name (string) of the column whose distribution is plotted.
# column1  name (string) of the grouping column.
#
# Returns, invisibly, the statistics list produced by boxplot().
bpf <- function(d, column, column1) {
  # `[[` extracts a plain vector from data frames and tibbles alike. The formula
  # is evaluated in this function's environment (no `data =`), so a column of
  # `d` that happens to be called "d", "column" or "column1" cannot shadow the
  # arguments. Explicit axis labels replace the indexing expression that would
  # otherwise be used as the label.
  boxplot(
    d[[column]] ~ d[[column1]],
    main = paste0("Different boxplots for each ", column1),
    xlab = column1,
    ylab = column,
    col = "orange",
    border = "brown"
  )
}
  
# Boxplots of the point difference against each categorical predictor, in turn.
invisible(lapply(
  c(
    "surface", "tourney_level", "winner_hand", "loser_hand",
    "round", "winner_seed", "loser_seed"
  ),
  function(group_col) bpf(t_data1, "points", group_col)
))

Correlation

# Pearson correlations between all numeric columns, using only rows that are
# complete across those columns.
cor(keep(t_data1, is.numeric), method = "pearson", use = "complete.obs")
##                 minutes      w_svpt     w_1stIn     w_SvGms    w_bpFaced
## minutes      1.00000000  0.90706404  0.82283117  0.87734323  0.565600290
## w_svpt       0.90706404  1.00000000  0.90682917  0.89806580  0.634738017
## w_1stIn      0.82283117  0.90682917  1.00000000  0.84272569  0.547959942
## w_SvGms      0.87734323  0.89806580  0.84272569  1.00000000  0.451275433
## w_bpFaced    0.56560029  0.63473802  0.54795994  0.45127543  1.000000000
## l_svpt       0.90078195  0.81177941  0.75520780  0.86243508  0.433041902
## l_1stIn      0.81419649  0.75469897  0.73704787  0.81545264  0.378876263
## l_SvGms      0.88279916  0.87763500  0.82705420  0.97417316  0.444799052
## l_bpFaced    0.23412931  0.07138044  0.04689095  0.08308295  0.274685092
## points      -0.61696829 -0.69622426 -0.60881545 -0.60828226 -0.435350225
## ht          -0.03336441 -0.04642657 -0.04879748 -0.03398647 -0.060261475
## age         -0.03148375 -0.05192563 -0.03705980 -0.04572126 -0.005346589
## rank         0.13380195  0.18562193  0.16752070  0.13698531  0.165410369
## rank_points -0.15130730 -0.19256349 -0.18318185 -0.14104043 -0.165046118
## ace         -0.11251196 -0.10651705 -0.11888773 -0.08023636 -0.099231608
## df           0.11111239  0.17123251  0.10398473  0.12462953  0.118140487
##                  l_svpt     l_1stIn     l_SvGms     l_bpFaced      points
## minutes      0.90078195  0.81419649  0.88279916  0.2341293083 -0.61696829
## w_svpt       0.81177941  0.75469897  0.87763500  0.0713804431 -0.69622426
## w_1stIn      0.75520780  0.73704787  0.82705420  0.0468909532 -0.60881545
## w_SvGms      0.86243508  0.81545264  0.97417316  0.0830829483 -0.60828226
## w_bpFaced    0.43304190  0.37887626  0.44479905  0.2746850915 -0.43535022
## l_svpt       1.00000000  0.90125158  0.88762842  0.3297270068 -0.53620640
## l_1stIn      0.90125158  1.00000000  0.83417994  0.2680567423 -0.49006819
## l_SvGms      0.88762842  0.83417994  1.00000000  0.1044841131 -0.63002580
## l_bpFaced    0.32972701  0.26805674  0.10448411  1.0000000000  0.12242738
## points      -0.53620640 -0.49006819 -0.63002580  0.1224273824  1.00000000
## ht          -0.02501984 -0.02165872 -0.04697112 -0.0017226107  0.04365116
## age         -0.03035786 -0.03015498 -0.04553559  0.0158403974  0.04865192
## rank         0.10125776  0.08278245  0.13523782 -0.0219307256 -0.22644056
## rank_points -0.10889058 -0.08770898 -0.13970057  0.0198387316  0.22753306
## ace         -0.07654856 -0.05226763 -0.10747125 -0.0007608165  0.15588392
## df           0.03979827  0.13510184  0.10687796 -0.0783594441 -0.17379845
##                       ht          age        rank rank_points
## minutes     -0.033364411 -0.031483747  0.13380195 -0.15130730
## w_svpt      -0.046426572 -0.051925634  0.18562193 -0.19256349
## w_1stIn     -0.048797478 -0.037059804  0.16752070 -0.18318185
## w_SvGms     -0.033986467 -0.045721260  0.13698531 -0.14104043
## w_bpFaced   -0.060261475 -0.005346589  0.16541037 -0.16504612
## l_svpt      -0.025019841 -0.030357865  0.10125776 -0.10889058
## l_1stIn     -0.021658719 -0.030154981  0.08278245 -0.08770898
## l_SvGms     -0.046971119 -0.045535590  0.13523782 -0.13970057
## l_bpFaced   -0.001722611  0.015840397 -0.02193073  0.01983873
## points       0.043651159  0.048651918 -0.22644056  0.22753306
## ht           1.000000000  0.056113981 -0.08965611  0.05796211
## age          0.056113981  1.000000000 -0.17213426  0.17069516
## rank        -0.089656111 -0.172134263  1.00000000 -0.92096983
## rank_points  0.057962107  0.170695160 -0.92096983  1.00000000
## ace          0.451850590  0.007985649 -0.14917280  0.14710197
## df           0.070537240 -0.050622335 -0.02472582  0.02920247
##                       ace          df
## minutes     -0.1125119626  0.11111239
## w_svpt      -0.1065170534  0.17123251
## w_1stIn     -0.1188877286  0.10398473
## w_SvGms     -0.0802363576  0.12462953
## w_bpFaced   -0.0992316084  0.11814049
## l_svpt      -0.0765485638  0.03979827
## l_1stIn     -0.0522676308  0.13510184
## l_SvGms     -0.1074712466  0.10687796
## l_bpFaced   -0.0007608165 -0.07835944
## points       0.1558839170 -0.17379845
## ht           0.4518505898  0.07053724
## age          0.0079856486 -0.05062234
## rank        -0.1491728003 -0.02472582
## rank_points  0.1471019736  0.02920247
## ace          1.0000000000  0.12523177
## df           0.1252317706  1.00000000

Scatter plots

# Scatter-plot matrix of every pair of numeric columns.
pairs(keep(t_data1, is.numeric))

Model

# Full model: every remaining column as a main effect (`.`), plus cubic
# polynomial terms in rank, w_SvGms, l_SvGms, ace and l_bpFaced, plus a
# w_SvGms-by-round interaction.
# The first-degree polynomial terms repeat main effects already brought in by
# `.`, which shows up in the summary as coefficients "not defined because of
# singularities" (the poly(..., 3)1 rows are NA).
t_mod<-lm(points~.+poly(rank,3)+poly(w_SvGms,3)+poly(l_SvGms,3)+poly(ace,3)+poly(l_bpFaced,3)+w_SvGms:round,data=t_data1)

# Coefficient table and fit statistics for the full model.
summary(t_mod)
## 
## Call:
## lm(formula = points ~ . + poly(rank, 3) + poly(w_SvGms, 3) + 
##     poly(l_SvGms, 3) + poly(ace, 3) + poly(l_bpFaced, 3) + w_SvGms:round, 
##     data = t_data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.3985  -3.0528  -0.0197   2.9315  19.0875 
## 
## Coefficients: (6 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          65.853637   6.730959   9.784  < 2e-16 ***
## surfaceGrass          0.482025   0.428010   1.126 0.260232    
## surfaceHard           0.102314   0.265256   0.386 0.699752    
## tourney_levelD       -1.979206   3.579800  -0.553 0.580414    
## tourney_levelM       -0.583972   0.364138  -1.604 0.108954    
## winner_seedY          0.015464   0.354920   0.044 0.965252    
## winner_handR          0.048760   0.337545   0.144 0.885157    
## winner_handU         -0.163911   0.738533  -0.222 0.824385    
## loser_seedY           0.537969   0.361129   1.490 0.136483    
## loser_handR           0.054910   0.321405   0.171 0.864366    
## loser_handU           0.358227   0.621555   0.576 0.564458    
## minutes               0.669506   1.362033   0.492 0.623098    
## roundQF              -1.953921   3.322488  -0.588 0.556546    
## roundR128            -2.912138   4.180985  -0.697 0.486194    
## roundR16             -0.789701   3.111882  -0.254 0.799703    
## roundR32             -0.893791   3.040630  -0.294 0.768831    
## roundR64              2.167675   3.263280   0.664 0.506608    
## roundRR                     NA         NA      NA       NA    
## roundSF              -0.583361   3.631176  -0.161 0.872384    
## w_svpt              -15.164169   1.670482  -9.078  < 2e-16 ***
## w_1stIn               0.041597   0.020860   1.994 0.046297 *  
## w_SvGms               1.602108   0.335711   4.772 1.97e-06 ***
## w_bpFaced            -0.142344   0.051581  -2.760 0.005846 ** 
## l_svpt                2.347317   1.630926   1.439 0.150253    
## l_1stIn               0.001196   0.020885   0.057 0.954355    
## l_SvGms              -2.298915   0.222573 -10.329  < 2e-16 ***
## l_bpFaced             0.321044   0.044498   7.215 7.94e-13 ***
## ht                   -0.028640   0.015507  -1.847 0.064935 .  
## age                  -0.012273   0.017920  -0.685 0.493537    
## rank                 -0.615147   0.295805  -2.080 0.037707 *  
## rank_points          -0.060966   0.293289  -0.208 0.835355    
## ace                   0.101947   0.023800   4.283 1.94e-05 ***
## df                   -0.165141   0.043957  -3.757 0.000178 ***
## poly(rank, 3)1              NA         NA      NA       NA    
## poly(rank, 3)2       -9.000463   5.240710  -1.717 0.086077 .  
## poly(rank, 3)3        0.950378   5.244131   0.181 0.856210    
## poly(w_SvGms, 3)1           NA         NA      NA       NA    
## poly(w_SvGms, 3)2    50.777773  10.566886   4.805 1.67e-06 ***
## poly(w_SvGms, 3)3   -12.588775  10.236604  -1.230 0.218941    
## poly(l_SvGms, 3)1           NA         NA      NA       NA    
## poly(l_SvGms, 3)2     6.492725  10.620336   0.611 0.541047    
## poly(l_SvGms, 3)3   -10.482666  10.197410  -1.028 0.304101    
## poly(ace, 3)1               NA         NA      NA       NA    
## poly(ace, 3)2        -0.926986   5.418644  -0.171 0.864185    
## poly(ace, 3)3         4.774438   5.013485   0.952 0.341064    
## poly(l_bpFaced, 3)1         NA         NA      NA       NA    
## poly(l_bpFaced, 3)2 -14.752997   5.035549  -2.930 0.003435 ** 
## poly(l_bpFaced, 3)3   8.506884   4.953958   1.717 0.086118 .  
## roundQF:w_SvGms       0.237135   0.281406   0.843 0.399521    
## roundR128:w_SvGms     0.374742   0.342926   1.093 0.274638    
## roundR16:w_SvGms      0.140587   0.264390   0.532 0.594970    
## roundR32:w_SvGms      0.157678   0.258739   0.609 0.542331    
## roundR64:w_SvGms     -0.029393   0.277081  -0.106 0.915530    
## roundRR:w_SvGms       0.249312   0.306888   0.812 0.416677    
## roundSF:w_SvGms       0.154621   0.304575   0.508 0.611754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.884 on 1790 degrees of freedom
## Multiple R-squared:  0.601,  Adjusted R-squared:  0.5903 
## F-statistic: 56.18 on 48 and 1790 DF,  p-value: < 2.2e-16
# Stepwise selection by AIC starting from the full model; terms may be both
# dropped and re-added, and per-step tracing is switched off.
step.model <- stepAIC(t_mod, trace = FALSE, direction = "both")
summary(step.model)
## 
## Call:
## lm(formula = points ~ w_svpt + w_1stIn + w_bpFaced + l_svpt + 
##     ht + df + poly(rank, 3) + poly(w_SvGms, 3) + poly(l_SvGms, 
##     3) + poly(ace, 3) + poly(l_bpFaced, 3), data = t_data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.8894  -3.0546  -0.0335   2.9248  20.1625 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           62.00494    6.72134   9.225  < 2e-16 ***
## w_svpt               -14.63942    1.45549 -10.058  < 2e-16 ***
## w_1stIn                0.03848    0.02006   1.918  0.05526 .  
## w_bpFaced             -0.14290    0.05084  -2.811  0.00499 ** 
## l_svpt                 2.64999    1.15448   2.295  0.02182 *  
## ht                    -0.02868    0.01509  -1.901  0.05744 .  
## df                    -0.16962    0.04235  -4.005 6.46e-05 ***
## poly(rank, 3)1       -26.16770    5.04666  -5.185 2.40e-07 ***
## poly(rank, 3)2        -8.35304    4.89288  -1.707  0.08796 .  
## poly(rank, 3)3        -0.05906    4.90220  -0.012  0.99039    
## poly(w_SvGms, 3)1    211.72805   26.98437   7.846 7.25e-15 ***
## poly(w_SvGms, 3)2     52.99859   10.44477   5.074 4.29e-07 ***
## poly(w_SvGms, 3)3    -12.02266   10.12479  -1.187  0.23521    
## poly(l_SvGms, 3)1   -282.78442   27.13481 -10.421  < 2e-16 ***
## poly(l_SvGms, 3)2      4.47016   10.43385   0.428  0.66839    
## poly(l_SvGms, 3)3    -11.74202   10.07716  -1.165  0.24409    
## poly(ace, 3)1         25.64685    5.66006   4.531 6.25e-06 ***
## poly(ace, 3)2          0.40084    5.30278   0.076  0.93975    
## poly(ace, 3)3          3.98442    4.92987   0.808  0.41907    
## poly(l_bpFaced, 3)1   48.68234    6.61617   7.358 2.81e-13 ***
## poly(l_bpFaced, 3)2  -14.64239    4.97579  -2.943  0.00329 ** 
## poly(l_bpFaced, 3)3    8.38289    4.91266   1.706  0.08811 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.868 on 1817 degrees of freedom
## Multiple R-squared:  0.5978, Adjusted R-squared:  0.5931 
## F-statistic: 128.6 on 21 and 1817 DF,  p-value: < 2.2e-16
# Hand-specified reduced model: quadratic polynomials in l_bpFaced and w_SvGms,
# and linear terms for w_svpt, ht, rank, df, l_SvGms and ace.
t_mod1<-lm(points~w_svpt+poly(l_bpFaced,2)+ht+rank+df+poly(w_SvGms,2)+l_SvGms+ace,data=t_data1)

# Coefficient table and fit statistics for the reduced model.
summary(t_mod1)
## 
## Call:
## lm(formula = points ~ w_svpt + poly(l_bpFaced, 2) + ht + rank + 
##     df + poly(w_SvGms, 2) + l_SvGms + ace, data = t_data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.0609  -3.1273  -0.1428   2.9893  20.1162 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          99.60471    4.45347  22.366  < 2e-16 ***
## w_svpt              -15.02717    0.94584 -15.888  < 2e-16 ***
## poly(l_bpFaced, 2)1  55.40578    4.98762  11.109  < 2e-16 ***
## poly(l_bpFaced, 2)2 -13.54057    4.98159  -2.718  0.00663 ** 
## ht                   -0.02352    0.01514  -1.553  0.12056    
## rank                 -0.50455    0.09003  -5.604 2.41e-08 ***
## df                   -0.19383    0.04195  -4.620 4.10e-06 ***
## poly(w_SvGms, 2)1   229.57728   24.08265   9.533  < 2e-16 ***
## poly(w_SvGms, 2)2    54.65038    5.18289  10.544  < 2e-16 ***
## l_SvGms              -2.08802    0.17870 -11.684  < 2e-16 ***
## ace                   0.10760    0.02329   4.620 4.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.908 on 1828 degrees of freedom
## Multiple R-squared:  0.5885, Adjusted R-squared:  0.5863 
## F-statistic: 261.5 on 10 and 1828 DF,  p-value: < 2.2e-16
# Further simplification of t_mod1: ht is removed and l_bpFaced enters
# linearly instead of as a quadratic polynomial.
t_mod2<-lm(points~w_svpt+l_bpFaced+rank+df+poly(w_SvGms,2)+l_SvGms+ace,data=t_data1)

# Coefficient table and fit statistics for the simplified model.
summary(t_mod2)
## 
## Call:
## lm(formula = points ~ w_svpt + l_bpFaced + rank + df + poly(w_SvGms, 
##     2) + l_SvGms + ace, data = t_data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2580  -3.0727  -0.1241   2.9743  20.6482 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        97.01864    4.44973  21.803  < 2e-16 ***
## w_svpt            -15.06569    0.94753 -15.900  < 2e-16 ***
## l_bpFaced           0.36649    0.03303  11.096  < 2e-16 ***
## rank               -0.50779    0.09016  -5.632 2.05e-08 ***
## df                 -0.19928    0.04200  -4.744 2.25e-06 ***
## poly(w_SvGms, 2)1 228.86547   24.13242   9.484  < 2e-16 ***
## poly(w_SvGms, 2)2  55.75806    5.18128  10.761  < 2e-16 ***
## l_SvGms            -2.09421    0.17907 -11.695  < 2e-16 ***
## ace                 0.08992    0.02102   4.279 1.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.919 on 1830 degrees of freedom
## Multiple R-squared:  0.5864, Adjusted R-squared:  0.5845 
## F-statistic: 324.3 on 8 and 1830 DF,  p-value: < 2.2e-16
# Standard lm diagnostic plots for the simplified OLS model.
plot(t_mod2)

# Cook's distance for every observation used in the fit; the vector is named
# with the row names of t_data1.
cooksd <- cooks.distance(t_mod2)

# Influential observations: Cook's distance above four times the mean.
# which() skips any NA/NaN distances instead of producing NA names.
inf_rows <- names(cooksd)[which(cooksd > 4 * mean(cooksd, na.rm = TRUE))]
inf <- as.numeric(inf_rows)

# Remove them by matching row NAMES. The row names of t_data1 are no longer
# 1..n after the earlier filtering, so using them as negative positions would
# drop the wrong rows; a logical mask also keeps all rows when nothing is
# flagged (negative indexing with an empty vector would return zero rows).
t_data2 <- t_data1[!(rownames(t_data1) %in% inf_rows), ]

# Refit the same specification without the influential observations.
t_mod3 <- lm(
  points ~ w_svpt + l_bpFaced + rank + df + poly(w_SvGms, 2) + l_SvGms + ace,
  data = t_data2
)

plot(t_mod3)

# Explicit squared term for w_SvGms, used below in place of poly(w_SvGms, 2).
t_data2$w_SvGms2<-t_data2$w_SvGms^2
  
# Robust linear model (rlm) on the reduced data.
# NOTE(review): only the squared term enters the formula; the linear w_SvGms
# term that poly(w_SvGms, 2) supplied in t_mod3 is absent here -- confirm that
# this is intended.
t_mod4<-rlm(points~w_svpt+l_bpFaced+rank+df+w_SvGms2+l_SvGms+ace,data=t_data2) 

# Coefficient table for the robust fit.
summary(t_mod4)
## 
## Call: rlm(formula = points ~ w_svpt + l_bpFaced + rank + df + w_SvGms2 + 
##     l_SvGms + ace, data = t_data2)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -25.47514  -3.05171   0.02002   3.01778  20.52267 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept)  96.0234   3.1245    30.7325
## w_svpt      -17.0353   0.8476   -20.0989
## l_bpFaced     0.3595   0.0328    10.9784
## rank         -0.4912   0.0895    -5.4879
## df           -0.1586   0.0415    -3.8167
## w_SvGms2      0.1007   0.0069    14.5965
## l_SvGms      -2.5386   0.1683   -15.0878
## ace           0.0827   0.0208     3.9812
## 
## Residual standard error: 4.505 on 1759 degrees of freedom
# Diagnostic plots for the rlm fit.
plot(t_mod4)

#install.packages("robust")

# lmRob comes from the robust package.
library(robust)

# Same formula as t_mod4, fitted with lmRob on the same reduced data.
mod6 <- lmRob(points ~ w_svpt+l_bpFaced+rank+df+w_SvGms2+l_SvGms+ace, data = t_data2)

# Coefficient table, R-squared and bias tests for the lmRob fit.
summary(mod6)
## 
## Call:
## lmRob(formula = points ~ w_svpt + l_bpFaced + rank + df + w_SvGms2 + 
##     l_SvGms + ace, data = t_data2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -25.73699  -3.06687   0.04556   3.04337  20.64371 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  95.696140   3.329193  28.745  < 2e-16 ***
## w_svpt      -16.797513   0.902577 -18.611  < 2e-16 ***
## l_bpFaced     0.349820   0.035247   9.925  < 2e-16 ***
## rank         -0.487689   0.095320  -5.116 3.46e-07 ***
## df           -0.132247   0.044679  -2.960 0.003118 ** 
## w_SvGms2      0.101994   0.007387  13.807  < 2e-16 ***
## l_SvGms      -2.610718   0.179696 -14.529  < 2e-16 ***
## ace           0.084792   0.022329   3.797 0.000151 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.538 on 1759 degrees of freedom
## Multiple R-Squared: 0.5068 
## 
## Test for Bias:
##             statistic  p-value
## M-estimate      23.17 0.003154
## LS-estimate    106.14 0.000000

Conclusion

The selected OLS model (t_mod2) has a modest adjusted R-squared of 0.58, and the final robust fit reports a multiple R-squared of 0.51. It is somewhat surprising to me that R-squared is that low, because I use a number of predictors that are only known after the match. Nevertheless, there is one predictor that is known before the match: the difference in (log) ranks.