library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(knitr)
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
setwd("C:/Users/nika/Downloads/R/Other/csv")
Beginner_Players <- read_excel("Beta Testing Data.xlsx")
#Basic filtering (Removing the juniors and splitting classes by class level)
clean_beginner_players <- Beginner_Players |>
mutate(`Session Start Time` = format(`Session Start Time`, "%H:%M")) |>
mutate(`Session End Time` = format(`Session End Time`, "%H:%M")) |>
mutate(`Session Date` = format(`Session Date`, "%m/%d/%Y")) |>
filter(!`Account Status` %in% c("Inactive", "Archived")) |>
filter(!`Admin Session Title` %in% c("Junior Stars", "Little Juniors", "Summer Junior Academy", "Junior Academy", "National Cathedral School Middle School", "National Cathedral School", "Junior Stars Clinic", "Summer Junior Academy Session", "Little Juniors", "Summer Junior Academy", "US Squash Junior Silver Tournament", "NCS Middle School", "Junior Academy", "National Cathedral School Middle School", "Junior Stars", "Free Junior Intro Session", "Clinic - Junior Stars", "Clinic: Junior Stars", "Waitlist Junior Team Tournament", "Junior Academy Matchplay", "Junior - Advanced", "Junior - Elite", "Toddlers", "Toddler Ball Court", "Junior - Quick Start", "Clinic Junior Stars", "CLINIC: TODDLERS", "EastBanc Corporate", "Elite Junior Training", "Future Stars", "Jr. Winter Break Camp", "Jr. Winter Camp", "Junio Stars", "Junior", "Junior -", "Junior - Future Stars", "Junior Academy +", "Junior Academy Ladder Matches", "Junior Academy Ladder/ Matchplay", "Junior Academy Lesson", "Junior Academy Practice", "Junior Academy+", "Junior Clinic- Future Stars", "Junior Club", "JUNIOR DAY CAMP", "Junior Package +", "Junior Round Robin", "Junior Stars Course", "Junior Stars Course - Mondays (5:45 PM)", "Junior Stars Course - Wednesdays (5:45 pm)", "Junior Stars Course (Mon 5:45 PM)", "Junior Stars Course (Wed 5:45 PM)", "Junjior Academy +", "Little Junior Course", "Little Juniors Clinic", "Little Juniors Course", "Little Juniors Course - Mondays (5:00 PM)", "Little Juniors Course - Wednesdays (5:00 PM)", "Little Juniors Course (Mon 5:00 PM)", "Little Juniors Course (Wed 5.00 pm)", "Middle School", "MIDDLE SCHOOL CLINIC", "Philippe's Birthday", "Private Beginners Clinic (P.Lanier)", "Private Beginners", "Reserved", "School Without Walls Middle School")) |>
mutate('Class Type' = case_when(
`Admin Session Title` %in% c("Beginners Level 1", "Intro to Squash", "Clinic Discounted Beginners Level 2", "Beginners Level 1 course", "Beginners Tournament", "NEW PLAYERS CLINIC", "Beginners Round Robin", "Discounted Beginners Level 2", "Beginners Level 1 3-Week Course", "Clinic: Discounted Beginners Level 2", "Beginners Workshop", "Ladies Beginners Workshop", "Clinic: Beginners 3-Week", "Discounted Beginners Level 2 Plus Fitness", "Beginners Learn and Play", "Women's Beginners Level 1", "Discounted Beginners Level 1", "Discounted Beginners Level 1", "Discounted Beginner Clinic Level 2", "New Players", "Beginners Level 2", "Clinic: Discounted Beginners Level 2 Plus Fitness", "New Player Clinic", "Women's Beginner Level 1", "Squash Basics", "BEGINNERS COURSE 1x/ week", "Intro to Squash Course (THURS 6:15 PM)", "New Player" ,"Beginners", "Beginners Clinic", "Beginners Course", "Beginners Course - Tues & Thurs (6:30 PM)", "BEGINNERS COURSE (1x/ week)", "BEGINNERS COURSE ( 2x/ week Tue-Thu)", "Beginners Course (1x/week)", "Beginners Course (Mon & Wed 7:15 PM)", "Beginners Course (Mon & Wed 7:30 PM)", "Beginners Course (Tues & Thurs 7:00 PM)", "Beginners Course (Tues & Thurs 7:15 PM)", "BEGINNERS COURSE 1 x /week", "BEGINNERS COURSE 1x/week", "BEGINNERS COURSE 2 x /WEEK (Mon-Wed)", "Beginners Course Level 1", "Beginners Course Level 2", "Beginners Level 2 course", "Beginners Round Robin World Squash Day", "Beginners World Squash Day Clinic", "Intro to Squash (Mon 6:30 PM)", "Intro to Squash Clinic", "Intro to Squash course", "Intro to Squash Course - Mondays (6:30 PM)", "Intro to Squash Course - Tuesdays (7:45 AM)", "Intro to Squash Course (Tues 7:45 AM)", "Intro to Sqush") ~ "Beginner",
`Admin Session Title` %in% c("Intermediate", "Clinic - Intermediate", "Clinic: Discounted Intermediate Plus Fitness", "Clinic:Intermediate", "Intermediate Clinic", "Clinic: Discounted Intermediate", "Clinic Discounted Intermediate", "Clinic: Intermediate", "Discounted Intermediate", "Discounted Intermediate Plus Fitness", "Intermediate Round Robin", "Discounted Intermediate Clinic", "Intermediate Course", "Intermediate Course - Tues & Thurs (12:15 PM)", "INTERMEDIATE COURSE (Mon-Wed 6.15 PM)", "Intermediate Course (Thurs only)", "INTERMEDIATE COURSE (Tue + Thu)", "INTERMEDIATE COURSE (Tue and Thu)", "Intermediate Course (TUES & THURS 12:15 PM)", "Intermediate Course (TUES & THURS 6:15 PM)", "Intermediate Course 2x / week") ~ "Intermediate",
`Admin Session Title` %in% c("Play the pro", "Advanced clinic", "Advanced", "Advanced Round Robin", "HIGH PERFORMANCE", "High Performance Camp", "Thanksgiving High Performance Camp", "Advanced Team Training", "Elite performance", "Elite performance Session", "Elite Training Program", "Group Training Advanced Players", "Play the Pro (Tinne Gilis)", "Play the Pro Clinic", "Pro session", "Round Robin - Advanced and Intermediate", "Round Robin - Intermediate & Advanced", "Round Robin Advanced", "Round Robin Advanced + intermediate", "Round Robin Intermediate and Advanced", "Round Robin with Rodrigo (Intermediate and Advanced)", "Round Robin with the Pros") ~ "Advanced",
`Admin Session Title` %in% c("4th of July Round Robin", "80's Themed Round Robin (open)", "ALAN PERRY TOURNAMENT", "Back to school night", "Black Friday Round Robin (Open)", "Cambria Hamburg Party", "Christmas Fun Tournament in Round Robin Style", "Christmas Round Robin", "Clinic", "Custom Clinic", "Gilis Exhibition (Free!)", "HL Play", "HL Play (Vic)", "Howe Cup Training & Round Robin - Ladies", "Labor Day Round Robin", "Ladder Night", "Ladies' Clinic", "Ladies Course", "Ladies Course - Thursdays (7:45 AM)", "Ladies Round Robin", "Ladies Team Training", "League", "League 4.0", "Lesson", "Lucky Lesson!", "Lunch Round Robin", "Matchplay", "Memorial Day Round Robin", "MLK Day Round Robin", "Morning Round Robin", "NCS", "NCS Team", "NCS TEAM CLINIC", "New Year's Eve Round Robin", "Open Round Robin World Squash Day", "Performance", "Play", "Play (60 mins $25)", "Presidents' Day Round Robin", "Round Robin", "Round Robin-Open", "Round Robin - Ladies", "Round Robin - Open", "Round Robin (open)", "Round Robin Open", "SOF Ladies Team", "Sharks Session", "WINTER BREAK CAMP", "Thanksgiving Camp", "Women's Squash Week Special Clinic", "Women's Squash Week Team Event") ~ "Shared",
`Session Type` %in% c("Play", "Lesson") ~ "Practice",
TRUE ~ NA_character_))
#Assigning random values to each player
clean_beginner_players <- clean_beginner_players |>
left_join(distinct(clean_beginner_players, `Full Name`) |>
mutate(Player_ID = sample(10000:99999, size = n(), replace = FALSE)), by = "Full Name") |>
mutate(Start_Time = strptime(`Session Start Time`, "%H:%M"), End_Time = strptime(`Session End Time`, "%H:%M"), Session_Duration = difftime(End_Time, Start_Time, units = "mins")) |>
select(-Start_Time, -End_Time)
#Making the clean data the correct format
clean_beginner_players <- clean_beginner_players |>
mutate(`Session Date` = as.Date(`Session Date`, format = "%m/%d/%Y")) |>
group_by(Player_ID) |>
mutate(First_Session = min(`Session Date`), Last_Session = max(`Session Date`), Total_Weeks = as.numeric(difftime(Last_Session, First_Session, units = "weeks")), Total_Sessions = n(), Training_Frequency = Total_Sessions / (Total_Weeks + 1)) |>
ungroup() |>
select(Player_ID, Gender, `Age Range`, `Skill Level`, `Class Type`, `Session Date`, Session_Duration, Total_Weeks, Total_Sessions, Training_Frequency)
#Identify each player's progression through class types
player_progression <- clean_beginner_players |>
group_by(Player_ID) |>
arrange(`Session Date`) |>
mutate(Class_Order = row_number()) |>
select(Player_ID, Gender, `Age Range`, `Session Date`, `Class Type`, Class_Order) |>
ungroup()
#Find the date each player first took a Beginner class and first took an Intermediate class
player_transitions <- player_progression |>
group_by(Player_ID) |>
mutate(
First_Beginner_Date = min(`Session Date`[`Class Type` == "Beginner"], na.rm = TRUE),
First_Intermediate_Date = min(`Session Date`[`Class Type` == "Intermediate"], na.rm = TRUE)
) |>
distinct(Player_ID, Gender, `Age Range`, First_Beginner_Date, First_Intermediate_Date) |>
ungroup()
## Warning: There were 6834 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `First_Beginner_Date = min(`Session Date`[`Class Type` ==
## "Beginner"], na.rm = TRUE)`.
## ℹ In group 1: `Player_ID = 10059`.
## Caused by warning in `min.default()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6833 remaining warnings.
#Filter to only players who progressed from Beginner to Intermediate
transitioned_players <- player_transitions |>
filter(!is.infinite(First_Beginner_Date) & !is.infinite(First_Intermediate_Date)) |>
mutate(
Weeks_to_Intermediate = as.numeric(difftime(First_Intermediate_Date, First_Beginner_Date, units = "weeks")),
Days_to_Intermediate = as.numeric(difftime(First_Intermediate_Date, First_Beginner_Date, units = "days"))
)
#Calculate training frequency during beginner phase
beginner_phase_data <- clean_beginner_players |>
filter(`Class Type` == "Beginner") |>
group_by(Player_ID) |>
summarise(
Beginner_Sessions = n(),
First_Beginner_Session = min(`Session Date`),
Last_Beginner_Session = max(`Session Date`),
.groups = 'drop'
) |>
mutate(
Beginner_Weeks = as.numeric(difftime(Last_Beginner_Session, First_Beginner_Session, units = "weeks"))
)
#Calculate training frequency during intermediate phase
intermediate_phase_data <- clean_beginner_players |>
filter(`Class Type` == "Intermediate") |>
group_by(Player_ID) |>
summarise(
Intermediate_Sessions = n(),
First_Intermediate_Session = min(`Session Date`),
Last_Intermediate_Session = max(`Session Date`),
.groups = 'drop'
) |>
mutate(
Intermediate_Weeks = as.numeric(difftime(Last_Intermediate_Session, First_Intermediate_Session, units = "weeks"))
)
#Getting final analysis data set
analysis_data <- transitioned_players |>
left_join(beginner_phase_data, by = "Player_ID") |>
left_join(intermediate_phase_data, by = "Player_ID") |>
mutate(
Beginner_Training_Frequency = Beginner_Sessions / (Beginner_Weeks + 1),
Intermediate_Training_Frequency = Intermediate_Sessions / (Intermediate_Weeks + 1)
) |>
filter(!is.na(Beginner_Training_Frequency) & !is.na(Weeks_to_Intermediate))
#Data Analysis
#Summary statistics
summary(analysis_data$Weeks_to_Intermediate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -88.0000 -0.4286 3.7143 8.9714 12.3214 106.2857
summary(analysis_data$Beginner_Training_Frequency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03406 0.33110 1.00000 0.69548 1.00000 1.27273
#Scatter Plot
ggplot(analysis_data, aes(x = Beginner_Training_Frequency, y = Weeks_to_Intermediate)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = "Training Frequency vs. Time to Intermediate Level",
x = "Beginner Training Frequency (sessions per week)",
y = "Weeks to Reach Intermediate Level") +
theme_minimal()
#Histogram Weeks to Intermediate
ggplot(analysis_data, aes(x = Weeks_to_Intermediate)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Weeks to Intermediate Level",
x = "Weeks", y = "Frequency") +
theme_minimal()
#Histogram Training Frequency
ggplot(analysis_data, aes(x = Beginner_Training_Frequency)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Beginner Training Frequency",
x = "Sessions per Week", y = "Frequency") +
theme_minimal()
#Box Plots
ggplot(analysis_data, aes(y = Weeks_to_Intermediate)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
labs(title = "Boxplot: Weeks to Intermediate Level",
y = "Weeks") +
theme_minimal()
ggplot(analysis_data, aes(y = Beginner_Training_Frequency)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
labs(title = "Boxplot: Beginner Training Frequency",
y = "Sessions per Week") +
theme_minimal()
#Pearsons Correlation
correlation <- cor.test(analysis_data$Beginner_Training_Frequency,
analysis_data$Weeks_to_Intermediate,
method = "pearson")
print(correlation)
##
## Pearson's product-moment correlation
##
## data: analysis_data$Beginner_Training_Frequency and analysis_data$Weeks_to_Intermediate
## t = -0.70871, df = 58, p-value = 0.4813
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3386155 0.1651526
## sample estimates:
## cor
## -0.09265781
#Regression Analysis
regression_model <- lm(Weeks_to_Intermediate ~ Beginner_Training_Frequency, data = analysis_data)
summary(regression_model)
##
## Call:
## lm(formula = Weeks_to_Intermediate ~ Beginner_Training_Frequency,
## data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.522 -10.088 -5.438 3.067 92.603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.926 7.959 1.750 0.0855 .
## Beginner_Training_Frequency -7.124 10.051 -0.709 0.4813
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.48 on 58 degrees of freedom
## Multiple R-squared: 0.008585, Adjusted R-squared: -0.008508
## F-statistic: 0.5023 on 1 and 58 DF, p-value: 0.4813
#Residuals vs Order
plot(resid(regression_model), type="b", main="Residuals vs Order", ylab="Residuals")
abline(h=0, lty=2)
# Diagnostic plots
par(mfrow=c(2,2))
plot(regression_model)
par(mfrow=c(1,1))
# Component-Component plus Residual Plots
crPlots(regression_model)
# Correlation matrix
cor(analysis_data[, c("Weeks_to_Intermediate", "Beginner_Training_Frequency", "Beginner_Sessions")], use = "complete.obs")
## Weeks_to_Intermediate Beginner_Training_Frequency
## Weeks_to_Intermediate 1.00000000 -0.09265781
## Beginner_Training_Frequency -0.09265781 1.00000000
## Beginner_Sessions 0.13346495 -0.40002475
## Beginner_Sessions
## Weeks_to_Intermediate 0.1334649
## Beginner_Training_Frequency -0.4000247
## Beginner_Sessions 1.0000000