I will build a regression model for predicting the point difference in a match, based on the 2018 Tennis ATP Tour results.
library(rvest)
library(dplyr)
library(knitr)
library(rcompanion)
library(MASS)
library(tidyverse)
library(caret)
# Load the 2018 ATP match file (comma-separated, first row holds the column names).
t_data <- read.csv("atp_matches_2018.csv")
Drop matches that ended in a retirement or a walkover
# Work on plain character scores so pattern matching and comparison behave predictably.
t_data$score <- as.character(t_data$score)
# Remove matches whose score contains "RET" ...
t_data <- t_data[!grepl("RET", t_data$score), ]
# ... and matches whose score is exactly "W/O".
t_data <- t_data[t_data$score != "W/O", ]
Calculating point difference
# Strip all whitespace from the score string. The inner gsub already removes
# every whitespace run, so the outer gsub(' +', ...) has nothing left to remove.
t_data$score<-gsub(' +','',(gsub('\\s+','',t_data$score)))
# Start every match at zero; this column is overwritten further down.
t_data$points<-0
# tie(spos): inspects the character at position `spos` of each score string.
# Where that character is '(' (presumably the start of a tiebreak annotation
# such as "7-6(4)" -- TODO confirm the score format) it derives a signed
# adjustment from the digit that follows '(':
#   * sign: positive when the digit three characters before '(' is 7,
#     negative otherwise;
#   * size: 7 minus the digit when the digit is below 6, otherwise 2.
# Everywhere else the adjustment is 0.
# NOTE(review): the assignment to t_data$points happens inside the function, so
# R modifies a function-local copy of t_data and the global t_data is left
# unchanged. The function only returns the updated vector (invisibly), and the
# four calls below discard that value.
tie<-function(spos){t_data$points<-t_data$points+ifelse(str_sub(t_data$score,spos,spos)=='(',
ifelse(as.numeric(str_sub(t_data$score,spos-3,spos-3))==7,
ifelse(as.numeric(str_sub(t_data$score,spos+1,spos+1))<6,7-as.numeric(str_sub(t_data$score,spos+1,spos+1)),2),
ifelse(as.numeric(str_sub(t_data$score,spos+1,spos+1))<6,-7+as.numeric(str_sub(t_data$score,spos+1,spos+1)),-2)),0)}
# Evaluate the adjustment at four candidate positions of '(' in the score string.
tie(4)
tie(7)
tie(10)
tie(13)
# Replaces `points` outright (it does not add to the previous value):
#   (w_1stWon + w_2ndWon + (l_svpt - l_1stWon - l_2ndWon))
# - (l_1stWon + l_2ndWon + (w_svpt - w_1stWon - w_2ndWon))
# Presumably: total points won by the winner minus total points won by the
# loser, each being own service points won plus the opponent's service points
# lost -- confirm against the data dictionary.
t_data$points<-t_data$w_1stWon+t_data$w_2ndWon+(t_data$l_svpt-t_data$l_1stWon-t_data$l_2ndWon)-(t_data$l_1stWon+t_data$l_2ndWon+(t_data$w_svpt-t_data$w_1stWon-t_data$w_2ndWon))
Keeping only variables to be used later
# Analysis table: match context, player attributes, serve statistics and the
# response (`points`). Each column is named once, in output order.
t_data1 <- t_data %>%
  dplyr::select(
    surface, draw_size, tourney_level,
    winner_seed, winner_hand, winner_ht, winner_age,
    winner_rank, winner_rank_points,
    loser_seed, loser_hand, loser_ht, loser_age,
    loser_rank, loser_rank_points,
    minutes, best_of, round,
    w_ace, w_df, w_svpt, w_1stIn, w_SvGms, w_bpFaced,
    l_ace, l_df, l_svpt, l_1stIn, l_SvGms, l_bpFaced,
    points
  )
head(t_data1)
## surface draw_size tourney_level winner_seed winner_hand winner_ht
## 1 Hard 32 A 3 R 193
## 2 Hard 32 A 3 R 193
## 3 Hard 32 A NA R 183
## 4 Hard 32 A 1 R 188
## 5 Hard 32 A 3 R 193
## 6 Hard 32 A NA R NA
## winner_age winner_rank winner_rank_points loser_seed loser_hand loser_ht
## 1 22.68309 21 2010 NA R 183
## 2 22.68309 21 2010 1 R 188
## 3 25.65366 47 1010 NA R NA
## 4 26.63107 3 5150 NA R NA
## 5 22.68309 21 2010 NA R 180
## 6 18.87201 208 245 NA R NA
## loser_age loser_rank loser_rank_points minutes best_of round w_ace w_df
## 1 25.65366 47 1010 73 3 F 17 1
## 2 26.63107 3 5150 93 3 SF 19 2
## 3 18.87201 208 245 157 3 SF 14 5
## 4 22.98152 50 992 145 3 QF 12 9
## 5 29.14990 38 1231 90 3 QF 18 5
## 6 19.97536 175 299 74 3 QF 6 1
## w_svpt w_1stIn w_SvGms w_bpFaced l_ace l_df l_svpt l_1stIn l_SvGms
## 1 56 40 9 5 8 3 58 32 9
## 2 72 50 13 2 5 6 74 44 13
## 3 115 70 16 6 9 5 94 62 16
## 4 103 65 16 2 6 1 89 53 15
## 5 69 43 13 3 6 3 75 41 13
## 6 46 23 8 3 3 5 51 29 8
## l_bpFaced points
## 1 7 18
## 2 6 10
## 3 3 -5
## 4 6 6
## 5 4 6
## 6 7 23
# Per-column summary statistics, rendered as a table.
t_data1 %>%
  summary() %>%
  kable()
| surface | draw_size | tourney_level | winner_seed | winner_hand | winner_ht | winner_age | winner_rank | winner_rank_points | loser_seed | loser_hand | loser_ht | loser_age | loser_rank | loser_rank_points | minutes | best_of | round | w_ace | w_df | w_svpt | w_1stIn | w_SvGms | w_bpFaced | l_ace | l_df | l_svpt | l_1stIn | l_SvGms | l_bpFaced | points | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Clay : 879 | Min. : 4.00 | A:1254 | Min. : 1.000 | : 3 | Min. :163.0 | Min. :14.93 | Min. : 1.00 | Min. : 1 | Min. : 1.000 | : 11 | Min. :163 | Min. :14.51 | Min. : 1.0 | Min. : 1.0 | Min. : 31.0 | Min. :3.000 | R32 :736 | Min. : 0.00 | Min. : 0.000 | Min. : 27.00 | Min. : 15.00 | Min. : 0.00 | Min. : 0.000 | Min. : 0.000 | Min. : 0.000 | Min. : 32.00 | Min. : 15.00 | Min. : 0.00 | Min. : 0.000 | Min. :-12.00 | |
| Grass: 323 | 1st Qu.: 32.00 | D: 231 | 1st Qu.: 3.000 | L: 342 | 1st Qu.:183.0 | 1st Qu.:24.00 | 1st Qu.: 21.00 | 1st Qu.: 634 | 1st Qu.: 4.000 | L: 372 | 1st Qu.:183 | 1st Qu.:23.93 | 1st Qu.: 39.0 | 1st Qu.: 476.5 | 1st Qu.: 80.0 | 1st Qu.:3.000 | R16 :429 | 1st Qu.: 3.00 | 1st Qu.: 1.000 | 1st Qu.: 59.00 | 1st Qu.: 36.00 | 1st Qu.: 9.00 | 1st Qu.: 2.000 | 1st Qu.: 2.000 | 1st Qu.: 2.000 | 1st Qu.: 62.00 | 1st Qu.: 37.00 | 1st Qu.: 9.00 | 1st Qu.: 6.000 | 1st Qu.: 8.00 | |
| Hard :1226 | Median : 32.00 | G: 488 | Median : 6.000 | R:2005 | Median :185.0 | Median :28.01 | Median : 51.00 | Median : 980 | Median : 7.000 | R:1921 | Median :185 | Median :27.74 | Median : 72.0 | Median : 755.0 | Median :105.5 | Median :3.000 | R64 :354 | Median : 6.00 | Median : 2.000 | Median : 77.00 | Median : 47.00 | Median :11.00 | Median : 5.000 | Median : 4.000 | Median : 3.000 | Median : 79.00 | Median : 48.00 | Median :11.00 | Median : 8.000 | Median : 14.00 | |
| NA | Mean : 61.34 | M: 455 | Mean : 8.114 | U: 78 | Mean :186.9 | Mean :27.63 | Mean : 89.14 | Mean : 1598 | Mean : 9.307 | U: 124 | Mean :186 | Mean :27.55 | Mean : 120.8 | Mean : 1006.8 | Mean :112.8 | Mean :3.449 | R128 :306 | Mean : 7.49 | Mean : 2.918 | Mean : 81.64 | Mean : 50.65 | Mean :11.98 | Mean : 5.152 | Mean : 5.641 | Mean : 3.578 | Mean : 84.69 | Mean : 51.22 | Mean :11.76 | Mean : 8.729 | Mean : 14.09 | |
| NA | 3rd Qu.:128.00 | NA | 3rd Qu.:10.000 | NA | 3rd Qu.:190.0 | 3rd Qu.:31.10 | 3rd Qu.: 91.00 | 3rd Qu.: 1755 | 3rd Qu.:13.000 | NA | 3rd Qu.:190 | 3rd Qu.:30.81 | 3rd Qu.: 118.8 | 3rd Qu.: 1189.0 | 3rd Qu.:136.0 | 3rd Qu.:3.000 | RR :231 | 3rd Qu.:10.00 | 3rd Qu.: 4.000 | 3rd Qu.: 97.00 | 3rd Qu.: 61.00 | 3rd Qu.:15.00 | 3rd Qu.: 7.000 | 3rd Qu.: 8.000 | 3rd Qu.: 5.000 | 3rd Qu.:101.00 | 3rd Qu.: 62.00 | 3rd Qu.:15.00 | 3rd Qu.:11.000 | 3rd Qu.: 20.00 | |
| NA | Max. :128.00 | NA | Max. :32.000 | NA | Max. :208.0 | Max. :39.38 | Max. :1821.00 | Max. :10600 | Max. :32.000 | NA | Max. :208 | Max. :39.42 | Max. :1770.0 | Max. :10060.0 | Max. :396.0 | Max. :5.000 | QF :214 | Max. :64.00 | Max. :16.000 | Max. :278.00 | Max. :198.00 | Max. :49.00 | Max. :25.000 | Max. :61.000 | Max. :18.000 | Max. :291.00 | Max. :218.00 | Max. :50.00 | Max. :28.000 | Max. : 46.00 | |
| NA | NA | NA | NA’s :1508 | NA | NA’s :813 | NA | NA’s :7 | NA’s :7 | NA’s :1867 | NA | NA’s :1002 | NA’s :3 | NA’s :30 | NA’s :30 | NA’s :10 | NA | (Other):158 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 | NA’s :13 |
Deleting incomplete records and best-of-5 matches
# Keep only rows with a known point difference, known ranking points for both
# players, and a best-of-3 format. subset() also drops rows where the
# condition evaluates to NA, and it preserves the original row names.
t_data1 <- subset(
  t_data1,
  !is.na(points) &
    !is.na(winner_rank_points) &
    !is.na(loser_rank_points) &
    best_of == 3
)
Take care of missing values
# Handedness: an empty string is recoded to "U".
t_data1$winner_hand[t_data1$winner_hand == ""] <- "U"
t_data1$loser_hand[t_data1$loser_hand == ""] <- "U"

# Seeds: collapse to a two-level factor -- "N" where the seed is NA, "Y" otherwise.
t_data1$winner_seed <- factor(ifelse(is.na(t_data1$winner_seed), "N", "Y"))
t_data1$loser_seed <- factor(ifelse(is.na(t_data1$loser_seed), "N", "Y"))

# Heights and match duration: fill missing values with the column mean.
t_data1$winner_ht <- replace(
  t_data1$winner_ht,
  is.na(t_data1$winner_ht),
  mean(t_data1$winner_ht, na.rm = TRUE)
)
t_data1$loser_ht <- replace(
  t_data1$loser_ht,
  is.na(t_data1$loser_ht),
  mean(t_data1$loser_ht, na.rm = TRUE)
)
t_data1$minutes <- replace(
  t_data1$minutes,
  is.na(t_data1$minutes),
  mean(t_data1$minutes, na.rm = TRUE)
)

# Winner-minus-loser features; ranks and ranking points are differenced on the log scale.
t_data1$ht <- with(t_data1, winner_ht - loser_ht)
t_data1$age <- with(t_data1, winner_age - loser_age)
t_data1$rank <- with(t_data1, log(winner_rank) - log(loser_rank))
t_data1$rank_points <- with(t_data1, log(winner_rank_points) - log(loser_rank_points))
t_data1$ace <- with(t_data1, w_ace - l_ace)
t_data1$df <- with(t_data1, w_df - l_df)

# Drop the raw columns that the difference features replace, plus draw_size and best_of.
t_data1 <- t_data1 %>%
  dplyr::select(
    -c(
      winner_ht, loser_ht, winner_age, loser_age, draw_size,
      winner_rank, loser_rank, winner_rank_points, loser_rank_points,
      w_ace, l_ace, best_of, w_df, l_df
    )
  )

# Log-transform the service-point counts and the match duration.
cols <- c("w_svpt", "minutes", "l_svpt")
t_data1[cols] <- lapply(t_data1[cols], log)
library(purrr)
library(tidyr)
library(ggplot2)
# Histogram of every numeric column, one panel per variable with its own axes.
t_data1 %>%
  keep(is.numeric) %>%
  gather(key = "key", value = "value") %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~key, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density estimate of every numeric column: keep the numeric columns, stack
# them into key/value pairs, then draw one free-scaled panel per variable.
t_data1 %>%
  keep(is.numeric) %>%
  gather(key = "key", value = "value") %>%
  ggplot(mapping = aes(x = value)) +
  geom_density() +
  facet_wrap(~key, scales = "free")
# Draw side-by-side boxplots of one column, split by the levels of another.
#
# d        Data frame (or tibble) holding both columns.
# column   Name (string) of the column whose distribution is plotted.
# column1  Name (string) of the grouping column; one box per distinct value.
#
# Returns (invisibly) the list of box statistics produced by boxplot().
# Stops with an error if either column name is not present in `d`.
bpf <- function(d, column, column1) {
  stopifnot(
    is.data.frame(d),
    is.character(column), length(column) == 1L,
    is.character(column1), length(column1) == 1L
  )
  missing_cols <- setdiff(c(column, column1), names(d))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `d`: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # `[[` always yields a plain vector, including for tibbles, where
  # `d[, column]` would return a one-column table instead.
  response <- d[[column]]
  grouping <- d[[column1]]

  boxplot(
    response ~ grouping,
    main = paste0("Different boxplots for each ", column1),
    # Label the axes with the column names rather than the R expressions.
    xlab = column1,
    ylab = column,
    col = "orange",
    border = "brown"
  )
}
# Point difference split by each categorical predictor, one plot per predictor.
invisible(lapply(
  c(
    "surface", "tourney_level", "winner_hand", "loser_hand",
    "round", "winner_seed", "loser_seed"
  ),
  function(group_col) bpf(t_data1, "points", group_col)
))
# Pearson correlations between all numeric columns, using complete rows only.
cor(keep(t_data1, is.numeric), method = "pearson", use = "complete.obs")
## minutes w_svpt w_1stIn w_SvGms w_bpFaced
## minutes 1.00000000 0.90706404 0.82283117 0.87734323 0.565600290
## w_svpt 0.90706404 1.00000000 0.90682917 0.89806580 0.634738017
## w_1stIn 0.82283117 0.90682917 1.00000000 0.84272569 0.547959942
## w_SvGms 0.87734323 0.89806580 0.84272569 1.00000000 0.451275433
## w_bpFaced 0.56560029 0.63473802 0.54795994 0.45127543 1.000000000
## l_svpt 0.90078195 0.81177941 0.75520780 0.86243508 0.433041902
## l_1stIn 0.81419649 0.75469897 0.73704787 0.81545264 0.378876263
## l_SvGms 0.88279916 0.87763500 0.82705420 0.97417316 0.444799052
## l_bpFaced 0.23412931 0.07138044 0.04689095 0.08308295 0.274685092
## points -0.61696829 -0.69622426 -0.60881545 -0.60828226 -0.435350225
## ht -0.03336441 -0.04642657 -0.04879748 -0.03398647 -0.060261475
## age -0.03148375 -0.05192563 -0.03705980 -0.04572126 -0.005346589
## rank 0.13380195 0.18562193 0.16752070 0.13698531 0.165410369
## rank_points -0.15130730 -0.19256349 -0.18318185 -0.14104043 -0.165046118
## ace -0.11251196 -0.10651705 -0.11888773 -0.08023636 -0.099231608
## df 0.11111239 0.17123251 0.10398473 0.12462953 0.118140487
## l_svpt l_1stIn l_SvGms l_bpFaced points
## minutes 0.90078195 0.81419649 0.88279916 0.2341293083 -0.61696829
## w_svpt 0.81177941 0.75469897 0.87763500 0.0713804431 -0.69622426
## w_1stIn 0.75520780 0.73704787 0.82705420 0.0468909532 -0.60881545
## w_SvGms 0.86243508 0.81545264 0.97417316 0.0830829483 -0.60828226
## w_bpFaced 0.43304190 0.37887626 0.44479905 0.2746850915 -0.43535022
## l_svpt 1.00000000 0.90125158 0.88762842 0.3297270068 -0.53620640
## l_1stIn 0.90125158 1.00000000 0.83417994 0.2680567423 -0.49006819
## l_SvGms 0.88762842 0.83417994 1.00000000 0.1044841131 -0.63002580
## l_bpFaced 0.32972701 0.26805674 0.10448411 1.0000000000 0.12242738
## points -0.53620640 -0.49006819 -0.63002580 0.1224273824 1.00000000
## ht -0.02501984 -0.02165872 -0.04697112 -0.0017226107 0.04365116
## age -0.03035786 -0.03015498 -0.04553559 0.0158403974 0.04865192
## rank 0.10125776 0.08278245 0.13523782 -0.0219307256 -0.22644056
## rank_points -0.10889058 -0.08770898 -0.13970057 0.0198387316 0.22753306
## ace -0.07654856 -0.05226763 -0.10747125 -0.0007608165 0.15588392
## df 0.03979827 0.13510184 0.10687796 -0.0783594441 -0.17379845
## ht age rank rank_points
## minutes -0.033364411 -0.031483747 0.13380195 -0.15130730
## w_svpt -0.046426572 -0.051925634 0.18562193 -0.19256349
## w_1stIn -0.048797478 -0.037059804 0.16752070 -0.18318185
## w_SvGms -0.033986467 -0.045721260 0.13698531 -0.14104043
## w_bpFaced -0.060261475 -0.005346589 0.16541037 -0.16504612
## l_svpt -0.025019841 -0.030357865 0.10125776 -0.10889058
## l_1stIn -0.021658719 -0.030154981 0.08278245 -0.08770898
## l_SvGms -0.046971119 -0.045535590 0.13523782 -0.13970057
## l_bpFaced -0.001722611 0.015840397 -0.02193073 0.01983873
## points 0.043651159 0.048651918 -0.22644056 0.22753306
## ht 1.000000000 0.056113981 -0.08965611 0.05796211
## age 0.056113981 1.000000000 -0.17213426 0.17069516
## rank -0.089656111 -0.172134263 1.00000000 -0.92096983
## rank_points 0.057962107 0.170695160 -0.92096983 1.00000000
## ace 0.451850590 0.007985649 -0.14917280 0.14710197
## df 0.070537240 -0.050622335 -0.02472582 0.02920247
## ace df
## minutes -0.1125119626 0.11111239
## w_svpt -0.1065170534 0.17123251
## w_1stIn -0.1188877286 0.10398473
## w_SvGms -0.0802363576 0.12462953
## w_bpFaced -0.0992316084 0.11814049
## l_svpt -0.0765485638 0.03979827
## l_1stIn -0.0522676308 0.13510184
## l_SvGms -0.1074712466 0.10687796
## l_bpFaced -0.0007608165 -0.07835944
## points 0.1558839170 -0.17379845
## ht 0.4518505898 0.07053724
## age 0.0079856486 -0.05062234
## rank -0.1491728003 -0.02472582
## rank_points 0.1471019736 0.02920247
## ace 1.0000000000 0.12523177
## df 0.1252317706 1.00000000
# Scatterplot matrix of all numeric columns.
pairs(keep(t_data1, is.numeric))
# Full model: every column as a main effect, cubic polynomials for five
# predictors, and a w_SvGms-by-round interaction.
# Each of those five predictors already enters linearly through `.`, and the
# first-degree poly() terms are reported as NA (not estimable) in the summary.
t_mod <- lm(
  points ~ . +
    poly(rank, 3) +
    poly(w_SvGms, 3) +
    poly(l_SvGms, 3) +
    poly(ace, 3) +
    poly(l_bpFaced, 3) +
    w_SvGms:round,
  data = t_data1
)
summary(t_mod)
##
## Call:
## lm(formula = points ~ . + poly(rank, 3) + poly(w_SvGms, 3) +
## poly(l_SvGms, 3) + poly(ace, 3) + poly(l_bpFaced, 3) + w_SvGms:round,
## data = t_data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.3985 -3.0528 -0.0197 2.9315 19.0875
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.853637 6.730959 9.784 < 2e-16 ***
## surfaceGrass 0.482025 0.428010 1.126 0.260232
## surfaceHard 0.102314 0.265256 0.386 0.699752
## tourney_levelD -1.979206 3.579800 -0.553 0.580414
## tourney_levelM -0.583972 0.364138 -1.604 0.108954
## winner_seedY 0.015464 0.354920 0.044 0.965252
## winner_handR 0.048760 0.337545 0.144 0.885157
## winner_handU -0.163911 0.738533 -0.222 0.824385
## loser_seedY 0.537969 0.361129 1.490 0.136483
## loser_handR 0.054910 0.321405 0.171 0.864366
## loser_handU 0.358227 0.621555 0.576 0.564458
## minutes 0.669506 1.362033 0.492 0.623098
## roundQF -1.953921 3.322488 -0.588 0.556546
## roundR128 -2.912138 4.180985 -0.697 0.486194
## roundR16 -0.789701 3.111882 -0.254 0.799703
## roundR32 -0.893791 3.040630 -0.294 0.768831
## roundR64 2.167675 3.263280 0.664 0.506608
## roundRR NA NA NA NA
## roundSF -0.583361 3.631176 -0.161 0.872384
## w_svpt -15.164169 1.670482 -9.078 < 2e-16 ***
## w_1stIn 0.041597 0.020860 1.994 0.046297 *
## w_SvGms 1.602108 0.335711 4.772 1.97e-06 ***
## w_bpFaced -0.142344 0.051581 -2.760 0.005846 **
## l_svpt 2.347317 1.630926 1.439 0.150253
## l_1stIn 0.001196 0.020885 0.057 0.954355
## l_SvGms -2.298915 0.222573 -10.329 < 2e-16 ***
## l_bpFaced 0.321044 0.044498 7.215 7.94e-13 ***
## ht -0.028640 0.015507 -1.847 0.064935 .
## age -0.012273 0.017920 -0.685 0.493537
## rank -0.615147 0.295805 -2.080 0.037707 *
## rank_points -0.060966 0.293289 -0.208 0.835355
## ace 0.101947 0.023800 4.283 1.94e-05 ***
## df -0.165141 0.043957 -3.757 0.000178 ***
## poly(rank, 3)1 NA NA NA NA
## poly(rank, 3)2 -9.000463 5.240710 -1.717 0.086077 .
## poly(rank, 3)3 0.950378 5.244131 0.181 0.856210
## poly(w_SvGms, 3)1 NA NA NA NA
## poly(w_SvGms, 3)2 50.777773 10.566886 4.805 1.67e-06 ***
## poly(w_SvGms, 3)3 -12.588775 10.236604 -1.230 0.218941
## poly(l_SvGms, 3)1 NA NA NA NA
## poly(l_SvGms, 3)2 6.492725 10.620336 0.611 0.541047
## poly(l_SvGms, 3)3 -10.482666 10.197410 -1.028 0.304101
## poly(ace, 3)1 NA NA NA NA
## poly(ace, 3)2 -0.926986 5.418644 -0.171 0.864185
## poly(ace, 3)3 4.774438 5.013485 0.952 0.341064
## poly(l_bpFaced, 3)1 NA NA NA NA
## poly(l_bpFaced, 3)2 -14.752997 5.035549 -2.930 0.003435 **
## poly(l_bpFaced, 3)3 8.506884 4.953958 1.717 0.086118 .
## roundQF:w_SvGms 0.237135 0.281406 0.843 0.399521
## roundR128:w_SvGms 0.374742 0.342926 1.093 0.274638
## roundR16:w_SvGms 0.140587 0.264390 0.532 0.594970
## roundR32:w_SvGms 0.157678 0.258739 0.609 0.542331
## roundR64:w_SvGms -0.029393 0.277081 -0.106 0.915530
## roundRR:w_SvGms 0.249312 0.306888 0.812 0.416677
## roundSF:w_SvGms 0.154621 0.304575 0.508 0.611754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.884 on 1790 degrees of freedom
## Multiple R-squared: 0.601, Adjusted R-squared: 0.5903
## F-statistic: 56.18 on 48 and 1790 DF, p-value: < 2.2e-16
# Stepwise selection by AIC starting from the full model; terms may be both
# dropped and re-added, and the per-step trace is suppressed.
step.model <- stepAIC(
  t_mod,
  direction = "both",
  trace = FALSE
)
summary(step.model)
##
## Call:
## lm(formula = points ~ w_svpt + w_1stIn + w_bpFaced + l_svpt +
## ht + df + poly(rank, 3) + poly(w_SvGms, 3) + poly(l_SvGms,
## 3) + poly(ace, 3) + poly(l_bpFaced, 3), data = t_data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.8894 -3.0546 -0.0335 2.9248 20.1625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.00494 6.72134 9.225 < 2e-16 ***
## w_svpt -14.63942 1.45549 -10.058 < 2e-16 ***
## w_1stIn 0.03848 0.02006 1.918 0.05526 .
## w_bpFaced -0.14290 0.05084 -2.811 0.00499 **
## l_svpt 2.64999 1.15448 2.295 0.02182 *
## ht -0.02868 0.01509 -1.901 0.05744 .
## df -0.16962 0.04235 -4.005 6.46e-05 ***
## poly(rank, 3)1 -26.16770 5.04666 -5.185 2.40e-07 ***
## poly(rank, 3)2 -8.35304 4.89288 -1.707 0.08796 .
## poly(rank, 3)3 -0.05906 4.90220 -0.012 0.99039
## poly(w_SvGms, 3)1 211.72805 26.98437 7.846 7.25e-15 ***
## poly(w_SvGms, 3)2 52.99859 10.44477 5.074 4.29e-07 ***
## poly(w_SvGms, 3)3 -12.02266 10.12479 -1.187 0.23521
## poly(l_SvGms, 3)1 -282.78442 27.13481 -10.421 < 2e-16 ***
## poly(l_SvGms, 3)2 4.47016 10.43385 0.428 0.66839
## poly(l_SvGms, 3)3 -11.74202 10.07716 -1.165 0.24409
## poly(ace, 3)1 25.64685 5.66006 4.531 6.25e-06 ***
## poly(ace, 3)2 0.40084 5.30278 0.076 0.93975
## poly(ace, 3)3 3.98442 4.92987 0.808 0.41907
## poly(l_bpFaced, 3)1 48.68234 6.61617 7.358 2.81e-13 ***
## poly(l_bpFaced, 3)2 -14.64239 4.97579 -2.943 0.00329 **
## poly(l_bpFaced, 3)3 8.38289 4.91266 1.706 0.08811 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.868 on 1817 degrees of freedom
## Multiple R-squared: 0.5978, Adjusted R-squared: 0.5931
## F-statistic: 128.6 on 21 and 1817 DF, p-value: < 2.2e-16
# Hand-simplified model: quadratic terms only for l_bpFaced and w_SvGms,
# remaining predictors linear.
t_mod1 <- lm(
  points ~ w_svpt + poly(l_bpFaced, 2) + ht + rank + df +
    poly(w_SvGms, 2) + l_SvGms + ace,
  data = t_data1
)
summary(t_mod1)
##
## Call:
## lm(formula = points ~ w_svpt + poly(l_bpFaced, 2) + ht + rank +
## df + poly(w_SvGms, 2) + l_SvGms + ace, data = t_data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.0609 -3.1273 -0.1428 2.9893 20.1162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.60471 4.45347 22.366 < 2e-16 ***
## w_svpt -15.02717 0.94584 -15.888 < 2e-16 ***
## poly(l_bpFaced, 2)1 55.40578 4.98762 11.109 < 2e-16 ***
## poly(l_bpFaced, 2)2 -13.54057 4.98159 -2.718 0.00663 **
## ht -0.02352 0.01514 -1.553 0.12056
## rank -0.50455 0.09003 -5.604 2.41e-08 ***
## df -0.19383 0.04195 -4.620 4.10e-06 ***
## poly(w_SvGms, 2)1 229.57728 24.08265 9.533 < 2e-16 ***
## poly(w_SvGms, 2)2 54.65038 5.18289 10.544 < 2e-16 ***
## l_SvGms -2.08802 0.17870 -11.684 < 2e-16 ***
## ace 0.10760 0.02329 4.620 4.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.908 on 1828 degrees of freedom
## Multiple R-squared: 0.5885, Adjusted R-squared: 0.5863
## F-statistic: 261.5 on 10 and 1828 DF, p-value: < 2.2e-16
# Further reduction: `ht` removed and l_bpFaced entered linearly.
t_mod2 <- lm(
  points ~ w_svpt + l_bpFaced + rank + df +
    poly(w_SvGms, 2) + l_SvGms + ace,
  data = t_data1
)
summary(t_mod2)
##
## Call:
## lm(formula = points ~ w_svpt + l_bpFaced + rank + df + poly(w_SvGms,
## 2) + l_SvGms + ace, data = t_data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.2580 -3.0727 -0.1241 2.9743 20.6482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.01864 4.44973 21.803 < 2e-16 ***
## w_svpt -15.06569 0.94753 -15.900 < 2e-16 ***
## l_bpFaced 0.36649 0.03303 11.096 < 2e-16 ***
## rank -0.50779 0.09016 -5.632 2.05e-08 ***
## df -0.19928 0.04200 -4.744 2.25e-06 ***
## poly(w_SvGms, 2)1 228.86547 24.13242 9.484 < 2e-16 ***
## poly(w_SvGms, 2)2 55.75806 5.18128 10.761 < 2e-16 ***
## l_SvGms -2.09421 0.17907 -11.695 < 2e-16 ***
## ace 0.08992 0.02102 4.279 1.98e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.919 on 1830 degrees of freedom
## Multiple R-squared: 0.5864, Adjusted R-squared: 0.5845
## F-statistic: 324.3 on 8 and 1830 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots for the reduced model.
plot(t_mod2)

# Flag influential observations: Cook's distance above four times the mean.
cooksd <- cooks.distance(t_mod2)
# `inf` holds the row NAMES of the flagged observations. which() skips any NA
# distance instead of producing an NA name.
inf <- names(cooksd)[which(cooksd > 4 * mean(cooksd, na.rm = TRUE))]

# Drop the flagged rows by name. Rows were filtered out earlier, so the row
# names of t_data1 no longer equal row positions; negative positional indexing
# (t_data1[-as.numeric(inf), ]) would remove the wrong rows, and would remove
# every row when nothing is flagged.
# NOTE(review): printed model output produced with the positional version will
# differ slightly after re-running.
t_data2 <- t_data1[!(rownames(t_data1) %in% inf), ]

# Refit the same specification without the influential rows and re-check diagnostics.
t_mod3 <- lm(
  points ~ w_svpt + l_bpFaced + rank + df +
    poly(w_SvGms, 2) + l_SvGms + ace,
  data = t_data2
)
plot(t_mod3)
# Explicit squared term for the winner's service games.
t_data2$w_SvGms2 <- t_data2$w_SvGms^2
# Robust regression (MASS::rlm) with the squared term in place of the polynomial.
t_mod4 <- rlm(
  points ~ w_svpt + l_bpFaced + rank + df +
    w_SvGms2 + l_SvGms + ace,
  data = t_data2
)
summary(t_mod4)
##
## Call: rlm(formula = points ~ w_svpt + l_bpFaced + rank + df + w_SvGms2 +
## l_SvGms + ace, data = t_data2)
## Residuals:
## Min 1Q Median 3Q Max
## -25.47514 -3.05171 0.02002 3.01778 20.52267
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 96.0234 3.1245 30.7325
## w_svpt -17.0353 0.8476 -20.0989
## l_bpFaced 0.3595 0.0328 10.9784
## rank -0.4912 0.0895 -5.4879
## df -0.1586 0.0415 -3.8167
## w_SvGms2 0.1007 0.0069 14.5965
## l_SvGms -2.5386 0.1683 -15.0878
## ace 0.0827 0.0208 3.9812
##
## Residual standard error: 4.505 on 1759 degrees of freedom
# Diagnostic plots for the rlm fit.
plot(t_mod4)
#install.packages("robust")
library(robust)
# Same specification fitted with robust::lmRob; its summary reports p-values.
mod6 <- lmRob(
  points ~ w_svpt + l_bpFaced + rank + df +
    w_SvGms2 + l_SvGms + ace,
  data = t_data2
)
summary(mod6)
##
## Call:
## lmRob(formula = points ~ w_svpt + l_bpFaced + rank + df + w_SvGms2 +
## l_SvGms + ace, data = t_data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.73699 -3.06687 0.04556 3.04337 20.64371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.696140 3.329193 28.745 < 2e-16 ***
## w_svpt -16.797513 0.902577 -18.611 < 2e-16 ***
## l_bpFaced 0.349820 0.035247 9.925 < 2e-16 ***
## rank -0.487689 0.095320 -5.116 3.46e-07 ***
## df -0.132247 0.044679 -2.960 0.003118 **
## w_SvGms2 0.101994 0.007387 13.807 < 2e-16 ***
## l_SvGms -2.610718 0.179696 -14.529 < 2e-16 ***
## ace 0.084792 0.022329 3.797 0.000151 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.538 on 1759 degrees of freedom
## Multiple R-Squared: 0.5068
##
## Test for Bias:
## statistic p-value
## M-estimate 23.17 0.003154
## LS-estimate 106.14 0.000000
The final model has a modest adjusted R-squared of 0.58. It is somewhat surprising to me that R-squared is that low, because I use a bunch of predictors that are only known after the game. Nevertheless, there is one predictor that is known before the game: the difference in ranks.