options(repos = "https://cran.rstudio.com/")
library("StepReg")
library("rms")
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library("aod")
library("knitr")
library("car")
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:rms':
## 
##     Predict, vif
library("broom")
library("sjPlot")
library("effects")
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library("outliers")
library("spatstat")
## Loading required package: spatstat.data
## Loading required package: spatstat.geom
## spatstat.geom 3.2-5
## 
## Attaching package: 'spatstat.geom'
## The following object is masked from 'package:car':
## 
##     ellipse
## The following object is masked from 'package:rms':
## 
##     perimeter
## Loading required package: spatstat.random
## spatstat.random 3.1-6
## Loading required package: spatstat.explore
## Loading required package: nlme
## spatstat.explore 3.2-3
## Loading required package: spatstat.model
## Loading required package: rpart
## spatstat.model 3.2-6
## 
## Attaching package: 'spatstat.model'
## The following object is masked from 'package:car':
## 
##     bc
## Loading required package: spatstat.linnet
## spatstat.linnet 3.1-1
## 
## spatstat 3.0-6 
## For an introduction to spatstat, type 'beginner'
library("DMwR2")
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library("ggplot2")
library("gridExtra")
library("grid")
## 
## Attaching package: 'grid'
## The following object is masked from 'package:spatstat.geom':
## 
##     as.mask
library("dbscan")
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:DMwR2':
## 
##     kNN
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()  masks nlme::collapse()
## ✖ dplyr::combine()   masks gridExtra::combine()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::recode()    masks car::recode()
## ✖ purrr::some()      masks car::some()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("MASS")
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## The following object is masked from 'package:spatstat.geom':
## 
##     area
library("pROC")
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:spatstat.explore':
## 
##     auc, roc
## 
## The following object is masked from 'package:spatstat.geom':
## 
##     coords
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:spatstat.model':
## 
##     panel.histogram
## 
## The following object is masked from 'package:spatstat.explore':
## 
##     panel.histogram
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart.plot)
library(rpart)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(corrplot)
## corrplot 0.92 loaded
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:outliers':
## 
##     outlier
library(readr)
library(parsnip)
## 
## Attaching package: 'parsnip'
## 
## The following object is masked from 'package:Hmisc':
## 
##     translate
library(party)
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following object is masked from 'package:parsnip':
## 
##     fit
## 
## The following object is masked from 'package:spatstat.model':
## 
##     parameters
## 
## The following object is masked from 'package:car':
## 
##     Predict
## 
## The following object is masked from 'package:rms':
## 
##     Predict
## 
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
## 
## The following object is masked from 'package:spatstat.model':
## 
##     response
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:randomForest':
## 
##     outlier
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## The following objects are masked from 'package:spatstat.geom':
## 
##     reflect, rescale
## 
## The following object is masked from 'package:outliers':
## 
##     outlier
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe

Step 1

[1] The dataset that I obtained to analyze is from a dataset originally published by Jeff Sackmann, which contains records of tennis matches dating back to 1968. However, for the purposes of this assignment, only data from the years 2010-2020 was collected, specifically focusing on Rafael Nadal’s matches in all four Grand Slams from the initial to exit round.

data <- read.csv("SelectedVariables1.csv")
str(data)
## 'data.frame':    210 obs. of  9 variables:
##  $ Y           : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ tourney_name: chr  "Australian Open" "Australian Open" "Australian Open" "Australian Open" ...
##  $ surface     : chr  "Hard" "Hard" "Hard" "Hard" ...
##  $ ace         : int  11 1 6 2 1 0 1 1 2 4 ...
##  $ bpSaved     : int  6 0 12 1 6 9 5 3 5 1 ...
##  $ df          : int  1 1 3 2 1 0 2 1 0 0 ...
##  $ X1stWon     : int  55 25 58 56 30 44 47 42 41 59 ...
##  $ X2ndWon     : int  18 20 28 22 10 15 7 10 15 17 ...
##  $ minutes     : int  154 113 210 156 150 143 105 148 153 155 ...

Step 2: Parametric Statistics

Logistic Regression

Logistic regression was chosen since Class is the dependent variable (response variable) and there is only two outcomes, whether Nadal won or loss.

data$Y <- as.factor(data$Y)
data$tourney_name <- as.factor(data$tourney_name)
data$surface <- as.factor(data$surface)

data$tourney_name <- relevel(data$tourney_name, ref = "Wimbledon")
data$surface <- relevel(data$surface, ref = "Hard")

set.seed(1234)
dt <- sort(sample(nrow(data), nrow(data) *.70))
train <- data[dt,]
test <- data[-dt,] 

model <- glm(Y ~ tourney_name + surface + ace + bpSaved + df + X1stWon + X2ndWon,
             data = train, family = binomial(link = "logit"))
summary(model)
## 
## Call:
## glm(formula = Y ~ tourney_name + surface + ace + bpSaved + df + 
##     X1stWon + X2ndWon, family = binomial(link = "logit"), data = train)
## 
## Coefficients: (2 not defined because of singularities)
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  3.7188670  1.7251087   2.156 0.031105 *  
## tourney_nameAustralian Open  0.3538581  0.9871464   0.358 0.719995    
## tourney_nameRoland Garros    2.1117088  1.4202830   1.487 0.137062    
## tourney_nameUS Open          0.5063037  1.0332196   0.490 0.624116    
## surfaceClay                         NA         NA      NA       NA    
## surfaceGrass                        NA         NA      NA       NA    
## ace                         -0.1338933  0.1343991  -0.996 0.319135    
## bpSaved                     -0.4501797  0.1213284  -3.710 0.000207 ***
## df                          -0.2270983  0.2182608  -1.040 0.298112    
## X1stWon                     -0.0009426  0.0312908  -0.030 0.975968    
## X2ndWon                      0.0911637  0.0658584   1.384 0.166285    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 96.886  on 146  degrees of freedom
## Residual deviance: 61.315  on 138  degrees of freedom
## AIC: 79.315
## 
## Number of Fisher Scoring iterations: 7

Odds ratio and confidence intervals

exp(model$coefficients)
##                 (Intercept) tourney_nameAustralian Open 
##                  41.2176673                   1.4245531 
##   tourney_nameRoland Garros         tourney_nameUS Open 
##                   8.2623480                   1.6591471 
##                 surfaceClay                surfaceGrass 
##                          NA                          NA 
##                         ace                     bpSaved 
##                   0.8746834                   0.6375136 
##                          df                     X1stWon 
##                   0.7968424                   0.9990578 
##                     X2ndWon 
##                   1.0954483
exp(confint(model))
## Waiting for profiling to be done...
##                                 2.5 %       97.5 %
## (Intercept)                 1.5825332 1514.8577691
## tourney_nameAustralian Open 0.1941230   10.3134310
## tourney_nameRoland Garros   0.6093668  233.3681061
## tourney_nameUS Open         0.2078647   13.1996239
## surfaceClay                        NA           NA
## surfaceGrass                       NA           NA
## ace                         0.6639030    1.1374578
## bpSaved                     0.4885537    0.7916893
## df                          0.5100819    1.2254360
## X1stWon                     0.9387120    1.0632473
## X2ndWon                     0.9658716    1.2561840

Predicted probabilities

train$predicted.prob <- fitted(model)
modelChi <- model$null.deviance - model$deviance
modelChi
## [1] 35.57064
chidf <- model$df.null - model$df.residual
chidf
## [1] 8
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob
## [1] 2.103993e-05

Thus because this probability is less than 0.05, we can reject the null hypothesis that the model is not better than chance at predicting the outcome.

Forward Selection Method

stepwiseLogit(Y ~ ace + bpSaved + df + X1stWon + X2ndWon + tourney_name + surface, 
              data = train, selection = "forward", select = "SL", sle = 0.05)
##      Table 1. Summary of Parameters     
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##            Paramters             Value  
## ————————————————————————————————————————
## Response Variable              Y         
## Included Variable              NULL      
## Selection Method               forward   
## Select Criterion               SL        
## Entry Significance Level(sle)  0.05      
## Variable significance test     Rao       
## Multicollinearity Terms        NULL      
## Intercept                      1         
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##          Table 2. Variables Type         
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##   class              variable            
## —————————————————————————————————————————
## factor   Y tourney_name surface           
## numeric  ace bpSaved df X1stWon X2ndWon   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                      Table 3. Process of Selection                      
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  Step  EnteredEffect  RemovedEffect  DF  NumberIn           SL          
## ————————————————————————————————————————————————————————————————————————
## 0     1                             1   1         1                      
## 1     bpSaved                       1   2         2.86692869592454e-08   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
## Table 4. Selected Varaibles
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  variables1  variables2 
## ————————————————————————
## 1           bpSaved      
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                        Table 5. Coefficients of the Selected Variables                       
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##   Variable         Estimate            StdError           t.value             P.value        
## —————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept)  4.37531622855915    0.684799811445662   6.38919017708625  1.66766552957952e-10   
## bpSaved      -0.386162582223795  0.0875593364565924  -4.4102958959178  1.03229464735747e-05   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗

Correlation Analysis

If you watch tennis you know what variables are highly correlated. Australian Open and US Open will be highly correlated to hard court surface since these are the only two Grand Slam tournaments played are hard court. Wimbledon will be highly correlated to grass court surface since Wimbledon is the only Grand Slam tournament that is played on grass. Finally, Roland Garros will be highly correlated to clay court surface since, once again, Rolland Garros is the only grand slams that is played on clay court.

data_matrix <- cor(data[, c(4:9)])
corrplot(data_matrix, method="number")

corrplot(data_matrix, order="hclust", type='upper',tl.srt = 45)

stepwiseLogit(Y ~ ace + bpSaved + df + X1stWon + X2ndWon + tourney_name + surface, 
              train, selection = "forward", select = "SL", sle = 0.05)
##      Table 1. Summary of Parameters     
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##            Paramters             Value  
## ————————————————————————————————————————
## Response Variable              Y         
## Included Variable              NULL      
## Selection Method               forward   
## Select Criterion               SL        
## Entry Significance Level(sle)  0.05      
## Variable significance test     Rao       
## Multicollinearity Terms        NULL      
## Intercept                      1         
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##          Table 2. Variables Type         
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##   class              variable            
## —————————————————————————————————————————
## factor   Y tourney_name surface           
## numeric  ace bpSaved df X1stWon X2ndWon   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                      Table 3. Process of Selection                      
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  Step  EnteredEffect  RemovedEffect  DF  NumberIn           SL          
## ————————————————————————————————————————————————————————————————————————
## 0     1                             1   1         1                      
## 1     bpSaved                       1   2         2.86692869592454e-08   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
## Table 4. Selected Varaibles
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  variables1  variables2 
## ————————————————————————
## 1           bpSaved      
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                        Table 5. Coefficients of the Selected Variables                       
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##   Variable         Estimate            StdError           t.value             P.value        
## —————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept)  4.37531622855915    0.684799811445662   6.38919017708625  1.66766552957952e-10   
## bpSaved      -0.386162582223795  0.0875593364565924  -4.4102958959178  1.03229464735747e-05   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗

Anomaly Detection (outliers)

[1] Using the statistical approach, the Grubb’s Test identifies outliers for a single attribute, univariate normal (Pang et al., 2020). The Grubb’s Test successfully identified one anomaly in the X1stWon variable and another in the X2ndWon variable. Due to the observed right-skewed distribution of the ace, bpSaved, and df variables, a non-parametric approach was deemed necessary for anomaly detection. Accordingly, the local outlier factor (LOF) was selected as it does not rely on any predefined distribution assumptions

data2<-data
data2$orig_rows<-1:nrow(data)

res<-0
ids_M <- NULL

while(res<0.05){
  res<-grubbs.test(data2$X1stWon)$p.value
  id<-which.max(data2$X1stWon)
  
  
  ids_M<-c(ids_M,data2$orig_rows[id])
  data2<-data2[-id,]
}

data[ids_M[1:5],]
##         Y tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 166     0    Wimbledon   Grass   9      15  4     100      34     315
## NA   <NA>         <NA>    <NA>  NA      NA NA      NA      NA      NA
## NA.1 <NA>         <NA>    <NA>  NA      NA NA      NA      NA      NA
## NA.2 <NA>         <NA>    <NA>  NA      NA NA      NA      NA      NA
## NA.3 <NA>         <NA>    <NA>  NA      NA NA      NA      NA      NA
res<-0
ids_M<-NULL

while(res<0.05){
  res<-grubbs.test(data2$X2ndWon)$p.value
  id<-which.max(data2$X2ndWon)
  
  
  ids_M<-c(ids_M,data2$orig_rows[id])
  data2<-data2[-id,]
}

data[ids_M[1:5],]
##         Y  tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 74      1 Roland Garros    Clay   6       4  3      73      35     277
## NA   <NA>          <NA>    <NA>  NA      NA NA      NA      NA      NA
## NA.1 <NA>          <NA>    <NA>  NA      NA NA      NA      NA      NA
## NA.2 <NA>          <NA>    <NA>  NA      NA NA      NA      NA      NA
## NA.3 <NA>          <NA>    <NA>  NA      NA NA      NA      NA      NA
x <- as.matrix(data$ace)
lof_scores_ace <- lof(x)
threshold <- mean(data$ace) + (3 * sd(data$ace))
anomalies <- which(lof_scores_ace > threshold)
plot(x, col = ifelse(lof_scores_ace > threshold, "red", "black"), pch = 16)
legend("topright", legend = c("Normal", "Anomaly"), col = c("black", "red"), pch = 16)

anomaly_rows <- data[anomalies, ]
anomaly_rows
##     Y    tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 15  1       Wimbledon   Grass  12       1  1      77      28     225
## 17  1       Wimbledon   Grass  12       4  3      59      25     163
## 59  0 Australian Open    Hard  10      13  4      88      32     353
## 64  1   Roland Garros    Clay  10       4  1      56      17     166
## 87  1 Australian Open    Hard  12       6  1      67      14     197
## 188 1       Wimbledon   Grass  10       0  2      60      30     184
## 191 1       Wimbledon   Grass  10       6  2      45      20     127
## 202 1 Australian Open    Hard  12       1  4      66      27     218
y <- as.matrix(data$bpSaved)
lof_scores_bpSaved <- lof(y)
threshold <- mean(data$bpSaved) + (3 * sd(data$bpSaved))
anomalies <- which(lof_scores_bpSaved > threshold)
plot(y, col = ifelse(lof_scores_bpSaved > threshold, "red", "black"), pch = 16)
legend("topright", legend = c("Normal", "Anomaly"), col = c("black", "red"), pch = 16)

anomaly_rows <- data[anomalies, ]
anomaly_rows
##     Y    tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 3   1 Australian Open    Hard   6      12  3      58      28     210
## 6   1   Roland Garros    Clay   0       9  0      44      15     143
## 75  1   Roland Garros    Clay   5       9  2      43      11     136
## 123 0         US Open    Hard   1       9  4      64      30     246
## 129 1 Australian Open    Hard   8      12  3      93      27     296
## 130 0 Australian Open    Hard   4      14  3      69      23     217
## 153 0 Australian Open    Hard   3      14  2      83      13     227
## 209 1   Roland Garros    Clay   3       9  0      55      19     189
z <- as.matrix(data$df)
lof_scores_df <- lof(z)
threshold <- mean(data$df) + (3 * sd(data$df))
anomalies <- which(lof_scores_df > threshold)
plot(z, col = ifelse(lof_scores_df > threshold, "red", "black"), pch = 16)
legend("topright", legend = c("Normal", "Anomaly"), col = c("black", "red"), pch = 16)

anomaly_rows <- data[anomalies, ]
anomaly_rows
##     Y    tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 88  1 Australian Open    Hard   3       3  7      66      22     217
## 103 1 Australian Open    Hard   3       2  7      68      29     252

Step 3: Non-Parametric Statistics

Classification Tree

dt <- sort (sample(nrow(data), nrow(data) *.7))
train <- data[dt,]
test <- data[-dt,]
rtree <- rpart(Y ~ ., data, method = "class")
rpart.plot(rtree)

Factor Analysis

Factor Analysis Test KMO

data_fa <- data[c(4:9)]
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.62
## MSA for each item = 
##     ace bpSaved      df X1stWon X2ndWon minutes 
##    0.54    0.76    0.94    0.61    0.62    0.59

Bartlett Test

cortest.bartlett(datamatrix, nrow(data))
## $chisq
## [1] 645.2857
## 
## $p.value
## [1] 8.343873e-128
## 
## $df
## [1] 15

Scree plot

ev <- eigen(cor(data_fa))
ev$values
## [1] 3.0679519 1.0743213 0.8065280 0.6065931 0.3716501 0.0729555
factors <- 1:length(ev$values)
Eigen_Values <-ev$values

scree_data <- data.frame(factors, Eigen_Values)

plot(scree_data, main = "Scree Plot", col = "blue", ylim = c(0, 4))
lines(scree_data, col = "red")
abline(h = 1, col = "green")

Varimax Rotation

nfactors <- 1
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
## 
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"),     rotation = "varimax")
## 
## Uniquenesses:
##     ace bpSaved      df X1stWon X2ndWon minutes 
##   0.897   0.727   0.855   0.253   0.478   0.005 
## 
## Loadings:
##         Factor1
## ace     0.322  
## bpSaved 0.522  
## df      0.381  
## X1stWon 0.864  
## X2ndWon 0.723  
## minutes 0.998  
## 
##                Factor1
## SS loadings      2.785
## Proportion Var   0.464
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 92.85 on 9 degrees of freedom.
## The p-value is 4.36e-16

Diagram

fa_var <-  fa(r=data_fa, nfactors = 1, rotate="varimax",fm="pa")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
fa.diagram(fa_var)

head(fa_var$scores)
##              PA1
## [1,]  0.02597377
## [2,] -0.81151979
## [3,]  0.78149046
## [4,] -0.15789778
## [5,]  0.28035023
## [6,] -0.43492250

References

[1] Tan, P.-N, Steinbach, M, Karpatne, A., and Kumar, V. (2020). ”Introduction to Data Mining.” Pearson Education. Second Edition

[2] https://github.com/JeffSackmann/tennis_atp , Jeff Sackmann