options(repos = "https://cran.rstudio.com/")
library("StepReg")
library("rms")
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library("aod")
library("knitr")
library("car")
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:rms':
##
## Predict, vif
library("broom")
library("sjPlot")
library("effects")
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library("outliers")
library("spatstat")
## Loading required package: spatstat.data
## Loading required package: spatstat.geom
## spatstat.geom 3.2-5
##
## Attaching package: 'spatstat.geom'
## The following object is masked from 'package:car':
##
## ellipse
## The following object is masked from 'package:rms':
##
## perimeter
## Loading required package: spatstat.random
## spatstat.random 3.1-6
## Loading required package: spatstat.explore
## Loading required package: nlme
## spatstat.explore 3.2-3
## Loading required package: spatstat.model
## Loading required package: rpart
## spatstat.model 3.2-6
##
## Attaching package: 'spatstat.model'
## The following object is masked from 'package:car':
##
## bc
## Loading required package: spatstat.linnet
## spatstat.linnet 3.1-1
##
## spatstat 3.0-6
## For an introduction to spatstat, type 'beginner'
library("DMwR2")
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library("ggplot2")
library("gridExtra")
library("grid")
##
## Attaching package: 'grid'
## The following object is masked from 'package:spatstat.geom':
##
## as.mask
library("dbscan")
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:DMwR2':
##
## kNN
## The following object is masked from 'package:stats':
##
## as.dendrogram
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks nlme::collapse()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("MASS")
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## The following object is masked from 'package:spatstat.geom':
##
## area
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:spatstat.explore':
##
## auc, roc
##
## The following object is masked from 'package:spatstat.geom':
##
## coords
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:spatstat.model':
##
## panel.histogram
##
## The following object is masked from 'package:spatstat.explore':
##
## panel.histogram
##
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart.plot)
library(rpart)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(corrplot)
## corrplot 0.92 loaded
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:outliers':
##
## outlier
library(readr)
library(parsnip)
##
## Attaching package: 'parsnip'
##
## The following object is masked from 'package:Hmisc':
##
## translate
library(party)
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following object is masked from 'package:parsnip':
##
## fit
##
## The following object is masked from 'package:spatstat.model':
##
## parameters
##
## The following object is masked from 'package:car':
##
## Predict
##
## The following object is masked from 'package:rms':
##
## Predict
##
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following object is masked from 'package:dplyr':
##
## where
##
## The following object is masked from 'package:spatstat.model':
##
## response
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:randomForest':
##
## outlier
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## The following objects are masked from 'package:spatstat.geom':
##
## reflect, rescale
##
## The following object is masked from 'package:outliers':
##
## outlier
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:Hmisc':
##
## describe
[1] The dataset that I obtained to analyze is from a dataset originally published by Jeff Sackmann, which contains records of tennis matches dating back to 1968. However, for the purposes of this assignment, only data from the years 2010-2020 was collected, specifically focusing on Rafael Nadal’s matches in all four Grand Slams from the initial to exit round.
data <- read.csv("SelectedVariables1.csv")
str(data)
## 'data.frame': 210 obs. of 9 variables:
## $ Y : int 1 1 1 1 0 1 1 1 1 1 ...
## $ tourney_name: chr "Australian Open" "Australian Open" "Australian Open" "Australian Open" ...
## $ surface : chr "Hard" "Hard" "Hard" "Hard" ...
## $ ace : int 11 1 6 2 1 0 1 1 2 4 ...
## $ bpSaved : int 6 0 12 1 6 9 5 3 5 1 ...
## $ df : int 1 1 3 2 1 0 2 1 0 0 ...
## $ X1stWon : int 55 25 58 56 30 44 47 42 41 59 ...
## $ X2ndWon : int 18 20 28 22 10 15 7 10 15 17 ...
## $ minutes : int 154 113 210 156 150 143 105 148 153 155 ...
Logistic regression was chosen since Class is the dependent variable (response variable) and there is only two outcomes, whether Nadal won or loss.
data$Y <- as.factor(data$Y)
data$tourney_name <- as.factor(data$tourney_name)
data$surface <- as.factor(data$surface)
data$tourney_name <- relevel(data$tourney_name, ref = "Wimbledon")
data$surface <- relevel(data$surface, ref = "Hard")
set.seed(1234)
dt <- sort(sample(nrow(data), nrow(data) *.70))
train <- data[dt,]
test <- data[-dt,]
model <- glm(Y ~ tourney_name + surface + ace + bpSaved + df + X1stWon + X2ndWon,
data = train, family = binomial(link = "logit"))
summary(model)
##
## Call:
## glm(formula = Y ~ tourney_name + surface + ace + bpSaved + df +
## X1stWon + X2ndWon, family = binomial(link = "logit"), data = train)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.7188670 1.7251087 2.156 0.031105 *
## tourney_nameAustralian Open 0.3538581 0.9871464 0.358 0.719995
## tourney_nameRoland Garros 2.1117088 1.4202830 1.487 0.137062
## tourney_nameUS Open 0.5063037 1.0332196 0.490 0.624116
## surfaceClay NA NA NA NA
## surfaceGrass NA NA NA NA
## ace -0.1338933 0.1343991 -0.996 0.319135
## bpSaved -0.4501797 0.1213284 -3.710 0.000207 ***
## df -0.2270983 0.2182608 -1.040 0.298112
## X1stWon -0.0009426 0.0312908 -0.030 0.975968
## X2ndWon 0.0911637 0.0658584 1.384 0.166285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 96.886 on 146 degrees of freedom
## Residual deviance: 61.315 on 138 degrees of freedom
## AIC: 79.315
##
## Number of Fisher Scoring iterations: 7
exp(model$coefficients)
## (Intercept) tourney_nameAustralian Open
## 41.2176673 1.4245531
## tourney_nameRoland Garros tourney_nameUS Open
## 8.2623480 1.6591471
## surfaceClay surfaceGrass
## NA NA
## ace bpSaved
## 0.8746834 0.6375136
## df X1stWon
## 0.7968424 0.9990578
## X2ndWon
## 1.0954483
exp(confint(model))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.5825332 1514.8577691
## tourney_nameAustralian Open 0.1941230 10.3134310
## tourney_nameRoland Garros 0.6093668 233.3681061
## tourney_nameUS Open 0.2078647 13.1996239
## surfaceClay NA NA
## surfaceGrass NA NA
## ace 0.6639030 1.1374578
## bpSaved 0.4885537 0.7916893
## df 0.5100819 1.2254360
## X1stWon 0.9387120 1.0632473
## X2ndWon 0.9658716 1.2561840
train$predicted.prob <- fitted(model)
modelChi <- model$null.deviance - model$deviance
modelChi
## [1] 35.57064
chidf <- model$df.null - model$df.residual
chidf
## [1] 8
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob
## [1] 2.103993e-05
Thus because this probability is less than 0.05, we can reject the null hypothesis that the model is not better than chance at predicting the outcome.
stepwiseLogit(Y ~ ace + bpSaved + df + X1stWon + X2ndWon + tourney_name + surface,
data = train, selection = "forward", select = "SL", sle = 0.05)
## Table 1. Summary of Parameters
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Paramters Value
## ————————————————————————————————————————
## Response Variable Y
## Included Variable NULL
## Selection Method forward
## Select Criterion SL
## Entry Significance Level(sle) 0.05
## Variable significance test Rao
## Multicollinearity Terms NULL
## Intercept 1
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 2. Variables Type
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## class variable
## —————————————————————————————————————————
## factor Y tourney_name surface
## numeric ace bpSaved df X1stWon X2ndWon
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 3. Process of Selection
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Step EnteredEffect RemovedEffect DF NumberIn SL
## ————————————————————————————————————————————————————————————————————————
## 0 1 1 1 1
## 1 bpSaved 1 2 2.86692869592454e-08
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 4. Selected Varaibles
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## variables1 variables2
## ————————————————————————
## 1 bpSaved
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 5. Coefficients of the Selected Variables
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Variable Estimate StdError t.value P.value
## —————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept) 4.37531622855915 0.684799811445662 6.38919017708625 1.66766552957952e-10
## bpSaved -0.386162582223795 0.0875593364565924 -4.4102958959178 1.03229464735747e-05
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
If you watch tennis you know what variables are highly correlated. Australian Open and US Open will be highly correlated to hard court surface since these are the only two Grand Slam tournaments played are hard court. Wimbledon will be highly correlated to grass court surface since Wimbledon is the only Grand Slam tournament that is played on grass. Finally, Roland Garros will be highly correlated to clay court surface since, once again, Rolland Garros is the only grand slams that is played on clay court.
data_matrix <- cor(data[, c(4:9)])
corrplot(data_matrix, method="number")
corrplot(data_matrix, order="hclust", type='upper',tl.srt = 45)
stepwiseLogit(Y ~ ace + bpSaved + df + X1stWon + X2ndWon + tourney_name + surface,
train, selection = "forward", select = "SL", sle = 0.05)
## Table 1. Summary of Parameters
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Paramters Value
## ————————————————————————————————————————
## Response Variable Y
## Included Variable NULL
## Selection Method forward
## Select Criterion SL
## Entry Significance Level(sle) 0.05
## Variable significance test Rao
## Multicollinearity Terms NULL
## Intercept 1
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 2. Variables Type
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## class variable
## —————————————————————————————————————————
## factor Y tourney_name surface
## numeric ace bpSaved df X1stWon X2ndWon
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 3. Process of Selection
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Step EnteredEffect RemovedEffect DF NumberIn SL
## ————————————————————————————————————————————————————————————————————————
## 0 1 1 1 1
## 1 bpSaved 1 2 2.86692869592454e-08
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 4. Selected Varaibles
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## variables1 variables2
## ————————————————————————
## 1 bpSaved
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 5. Coefficients of the Selected Variables
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Variable Estimate StdError t.value P.value
## —————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept) 4.37531622855915 0.684799811445662 6.38919017708625 1.66766552957952e-10
## bpSaved -0.386162582223795 0.0875593364565924 -4.4102958959178 1.03229464735747e-05
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
[1] Using the statistical approach, the Grubb’s Test identifies outliers for a single attribute, univariate normal (Pang et al., 2020). The Grubb’s Test successfully identified one anomaly in the X1stWon variable and another in the X2ndWon variable. Due to the observed right-skewed distribution of the ace, bpSaved, and df variables, a non-parametric approach was deemed necessary for anomaly detection. Accordingly, the local outlier factor (LOF) was selected as it does not rely on any predefined distribution assumptions
data2<-data
data2$orig_rows<-1:nrow(data)
res<-0
ids_M <- NULL
while(res<0.05){
res<-grubbs.test(data2$X1stWon)$p.value
id<-which.max(data2$X1stWon)
ids_M<-c(ids_M,data2$orig_rows[id])
data2<-data2[-id,]
}
data[ids_M[1:5],]
## Y tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 166 0 Wimbledon Grass 9 15 4 100 34 315
## NA <NA> <NA> <NA> NA NA NA NA NA NA
## NA.1 <NA> <NA> <NA> NA NA NA NA NA NA
## NA.2 <NA> <NA> <NA> NA NA NA NA NA NA
## NA.3 <NA> <NA> <NA> NA NA NA NA NA NA
res<-0
ids_M<-NULL
while(res<0.05){
res<-grubbs.test(data2$X2ndWon)$p.value
id<-which.max(data2$X2ndWon)
ids_M<-c(ids_M,data2$orig_rows[id])
data2<-data2[-id,]
}
data[ids_M[1:5],]
## Y tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 74 1 Roland Garros Clay 6 4 3 73 35 277
## NA <NA> <NA> <NA> NA NA NA NA NA NA
## NA.1 <NA> <NA> <NA> NA NA NA NA NA NA
## NA.2 <NA> <NA> <NA> NA NA NA NA NA NA
## NA.3 <NA> <NA> <NA> NA NA NA NA NA NA
x <- as.matrix(data$ace)
lof_scores_ace <- lof(x)
threshold <- mean(data$ace) + (3 * sd(data$ace))
anomalies <- which(lof_scores_ace > threshold)
plot(x, col = ifelse(lof_scores_ace > threshold, "red", "black"), pch = 16)
legend("topright", legend = c("Normal", "Anomaly"), col = c("black", "red"), pch = 16)
anomaly_rows <- data[anomalies, ]
anomaly_rows
## Y tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 15 1 Wimbledon Grass 12 1 1 77 28 225
## 17 1 Wimbledon Grass 12 4 3 59 25 163
## 59 0 Australian Open Hard 10 13 4 88 32 353
## 64 1 Roland Garros Clay 10 4 1 56 17 166
## 87 1 Australian Open Hard 12 6 1 67 14 197
## 188 1 Wimbledon Grass 10 0 2 60 30 184
## 191 1 Wimbledon Grass 10 6 2 45 20 127
## 202 1 Australian Open Hard 12 1 4 66 27 218
y <- as.matrix(data$bpSaved)
lof_scores_bpSaved <- lof(y)
threshold <- mean(data$bpSaved) + (3 * sd(data$bpSaved))
anomalies <- which(lof_scores_bpSaved > threshold)
plot(y, col = ifelse(lof_scores_bpSaved > threshold, "red", "black"), pch = 16)
legend("topright", legend = c("Normal", "Anomaly"), col = c("black", "red"), pch = 16)
anomaly_rows <- data[anomalies, ]
anomaly_rows
## Y tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 3 1 Australian Open Hard 6 12 3 58 28 210
## 6 1 Roland Garros Clay 0 9 0 44 15 143
## 75 1 Roland Garros Clay 5 9 2 43 11 136
## 123 0 US Open Hard 1 9 4 64 30 246
## 129 1 Australian Open Hard 8 12 3 93 27 296
## 130 0 Australian Open Hard 4 14 3 69 23 217
## 153 0 Australian Open Hard 3 14 2 83 13 227
## 209 1 Roland Garros Clay 3 9 0 55 19 189
z <- as.matrix(data$df)
lof_scores_df <- lof(z)
threshold <- mean(data$df) + (3 * sd(data$df))
anomalies <- which(lof_scores_df > threshold)
plot(z, col = ifelse(lof_scores_df > threshold, "red", "black"), pch = 16)
legend("topright", legend = c("Normal", "Anomaly"), col = c("black", "red"), pch = 16)
anomaly_rows <- data[anomalies, ]
anomaly_rows
## Y tourney_name surface ace bpSaved df X1stWon X2ndWon minutes
## 88 1 Australian Open Hard 3 3 7 66 22 217
## 103 1 Australian Open Hard 3 2 7 68 29 252
dt <- sort (sample(nrow(data), nrow(data) *.7))
train <- data[dt,]
test <- data[-dt,]
rtree <- rpart(Y ~ ., data, method = "class")
rpart.plot(rtree)
data_fa <- data[c(4:9)]
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA = 0.62
## MSA for each item =
## ace bpSaved df X1stWon X2ndWon minutes
## 0.54 0.76 0.94 0.61 0.62 0.59
cortest.bartlett(datamatrix, nrow(data))
## $chisq
## [1] 645.2857
##
## $p.value
## [1] 8.343873e-128
##
## $df
## [1] 15
ev <- eigen(cor(data_fa))
ev$values
## [1] 3.0679519 1.0743213 0.8065280 0.6065931 0.3716501 0.0729555
factors <- 1:length(ev$values)
Eigen_Values <-ev$values
scree_data <- data.frame(factors, Eigen_Values)
plot(scree_data, main = "Scree Plot", col = "blue", ylim = c(0, 4))
lines(scree_data, col = "red")
abline(h = 1, col = "green")
nfactors <- 1
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
##
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"), rotation = "varimax")
##
## Uniquenesses:
## ace bpSaved df X1stWon X2ndWon minutes
## 0.897 0.727 0.855 0.253 0.478 0.005
##
## Loadings:
## Factor1
## ace 0.322
## bpSaved 0.522
## df 0.381
## X1stWon 0.864
## X2ndWon 0.723
## minutes 0.998
##
## Factor1
## SS loadings 2.785
## Proportion Var 0.464
##
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 92.85 on 9 degrees of freedom.
## The p-value is 4.36e-16
fa_var <- fa(r=data_fa, nfactors = 1, rotate="varimax",fm="pa")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
fa.diagram(fa_var)
head(fa_var$scores)
## PA1
## [1,] 0.02597377
## [2,] -0.81151979
## [3,] 0.78149046
## [4,] -0.15789778
## [5,] 0.28035023
## [6,] -0.43492250
[1] Tan, P.-N, Steinbach, M, Karpatne, A., and Kumar, V. (2020). ”Introduction to Data Mining.” Pearson Education. Second Edition
[2] https://github.com/JeffSackmann/tennis_atp , Jeff Sackmann