Provide all the needed libraries and import the database

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(knitr)
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
setwd("C:/Users/nika/Downloads/R/Other/csv")
Beginner_Players <- read_excel("Final Research Data.xlsx")

Data Cleaning

Initial cleaning:

Basic filtering (Removing the juniors/children and splitting classes by class level)

clean_beginner_players <- Beginner_Players |>
  mutate(`Session Start Time` = format(`Session Start Time`, "%H:%M")) |>
  mutate(`Session End Time` = format(`Session End Time`, "%H:%M")) |>
  mutate(`Session Date` = format(`Session Date`, "%m/%d/%Y")) |>
  filter(!`Account Status` %in% c("Inactive", "Archived")) |>
  filter(!`Admin Session Title` %in% c("Junior Stars", "Little Juniors", "Summer Junior Academy", "Junior Academy", "National Cathedral School Middle School", "National Cathedral School", "Junior Stars Clinic", "Summer Junior Academy Session", "Little Juniors", "Summer Junior Academy", "US Squash Junior Silver Tournament", "NCS Middle School", "Junior Academy", "National Cathedral School Middle School", "Junior Stars", "Free Junior Intro Session", "Clinic - Junior Stars", "Clinic: Junior Stars", "Waitlist Junior Team Tournament", "Junior Academy Matchplay", "Junior - Advanced", "Junior - Elite", "Toddlers", "Toddler Ball Court", "Junior - Quick Start", "Clinic Junior Stars", "CLINIC: TODDLERS", "EastBanc Corporate", "Elite Junior Training", "Future Stars", "Jr. Winter Break Camp", "Jr. Winter Camp", "Junio Stars", "Junior", "Junior -", "Junior - Future Stars", "Junior Academy +", "Junior Academy Ladder Matches", "Junior Academy Ladder/ Matchplay", "Junior Academy Lesson", "Junior Academy Practice", "Junior Academy+", "Junior Clinic- Future Stars", "Junior Club", "JUNIOR DAY CAMP", "Junior Package +", "Junior Round Robin", "Junior Stars Course", "Junior Stars Course - Mondays (5:45 PM)", "Junior Stars Course - Wednesdays (5:45 pm)", "Junior Stars Course (Mon 5:45 PM)", "Junior Stars Course (Wed 5:45 PM)", "Junjior Academy +", "Little Junior Course", "Little Juniors Clinic", "Little Juniors Course", "Little Juniors Course - Mondays (5:00 PM)", "Little Juniors Course - Wednesdays (5:00 PM)", "Little Juniors Course (Mon 5:00 PM)", "Little Juniors Course (Wed 5.00 pm)", "Middle School", "MIDDLE SCHOOL CLINIC", "Philippe's Birthday", "Private Beginners Clinic (P.Lanier)", "Private Beginners", "Reserved", "School Without Walls Middle School")) |>
  mutate('Class Type' = case_when(
    `Admin Session Title` %in% c("Beginners Level 1", "Intro to Squash", "Clinic Discounted Beginners Level 2", "Beginners Level 1 course", "Beginners Tournament", "NEW PLAYERS CLINIC", "Beginners Round Robin", "Discounted Beginners Level 2", "Beginners Level 1 3-Week Course", "Clinic: Discounted Beginners Level 2", "Beginners Workshop", "Ladies Beginners Workshop", "Clinic: Beginners 3-Week", "Discounted Beginners Level 2 Plus Fitness", "Beginners Learn and Play", "Women's Beginners Level 1", "Discounted Beginners Level 1", "Discounted Beginners Level 1", "Discounted Beginner Clinic Level 2", "New Players", "Beginners Level 2", "Clinic: Discounted Beginners Level 2 Plus Fitness", "New Player Clinic", "Women's Beginner Level 1", "Squash Basics", "BEGINNERS COURSE 1x/ week", "Intro to Squash Course (THURS 6:15 PM)", "New Player" ,"Beginners", "Beginners Clinic", "Beginners Course", "Beginners Course - Tues & Thurs (6:30 PM)", "BEGINNERS COURSE (1x/ week)", "BEGINNERS COURSE ( 2x/ week Tue-Thu)", "Beginners Course (1x/week)", "Beginners Course (Mon & Wed 7:15 PM)", "Beginners Course (Mon & Wed 7:30 PM)", "Beginners Course (Tues & Thurs 7:00 PM)", "Beginners Course (Tues & Thurs 7:15 PM)", "BEGINNERS COURSE 1 x /week", "BEGINNERS COURSE 1x/week", "BEGINNERS COURSE 2 x /WEEK (Mon-Wed)", "Beginners Course Level 1", "Beginners Course Level 2", "Beginners Level 2 course", "Beginners Round Robin World Squash Day", "Beginners World Squash Day Clinic", "Intro to Squash (Mon 6:30 PM)", "Intro to Squash Clinic", "Intro to Squash course", "Intro to Squash Course - Mondays (6:30 PM)", "Intro to Squash Course - Tuesdays (7:45 AM)", "Intro to Squash Course (Tues 7:45 AM)", "Intro to Sqush") ~ "Beginner",
    `Admin Session Title` %in% c("Intermediate", "Clinic - Intermediate", "Clinic: Discounted Intermediate Plus Fitness", "Clinic:Intermediate", "Intermediate Clinic", "Clinic: Discounted Intermediate", "Clinic Discounted Intermediate", "Clinic: Intermediate", "Discounted Intermediate", "Discounted Intermediate Plus Fitness", "Intermediate Round Robin", "Discounted Intermediate Clinic", "Intermediate Course", "Intermediate Course - Tues & Thurs (12:15 PM)", "INTERMEDIATE COURSE (Mon-Wed 6.15 PM)", "Intermediate Course (Thurs only)", "INTERMEDIATE COURSE (Tue + Thu)", "INTERMEDIATE COURSE (Tue and Thu)", "Intermediate Course (TUES & THURS 12:15 PM)", "Intermediate Course (TUES & THURS 6:15 PM)", "Intermediate Course 2x / week") ~ "Intermediate",
    `Admin Session Title` %in% c("Play the pro", "Advanced clinic", "Advanced", "Advanced Round Robin", "HIGH PERFORMANCE", "High Performance Camp", "Thanksgiving High Performance Camp", "Advanced Team Training", "Elite performance", "Elite performance Session", "Elite Training Program", "Group Training Advanced Players", "Play the Pro (Tinne Gilis)", "Play the Pro Clinic", "Pro session", "Round Robin - Advanced and Intermediate", "Round Robin - Intermediate & Advanced", "Round Robin Advanced", "Round Robin Advanced + intermediate", "Round Robin Intermediate and Advanced", "Round Robin with Rodrigo (Intermediate and Advanced)", "Round Robin with the Pros") ~ "Advanced",
    `Admin Session Title` %in% c("4th of July Round Robin", "80's Themed Round Robin (open)", "ALAN PERRY TOURNAMENT", "Back to school night", "Black Friday Round Robin (Open)", "Cambria Hamburg Party", "Christmas Fun Tournament in Round Robin Style", "Christmas Round Robin", "Clinic", "Custom Clinic", "Gilis Exhibition (Free!)", "HL Play", "HL Play (Vic)", "Howe Cup Training & Round Robin - Ladies", "Labor Day Round Robin", "Ladder Night", "Ladies' Clinic", "Ladies Course", "Ladies Course - Thursdays (7:45 AM)", "Ladies Round Robin", "Ladies Team Training", "League", "League 4.0", "Lesson", "Lucky Lesson!", "Lunch Round Robin", "Matchplay", "Memorial Day Round Robin", "MLK Day Round Robin", "Morning Round Robin", "NCS", "NCS Team", "NCS TEAM CLINIC", "New Year's Eve Round Robin", "Open Round Robin World Squash Day", "Performance", "Presidents' Day Round Robin", "Round Robin", "Round Robin-Open", "Round Robin - Ladies", "Round Robin - Open", "Round Robin (open)", "Round Robin Open", "SOF Ladies Team", "Sharks Session", "WINTER BREAK CAMP", "Thanksgiving Camp", "Women's Squash Week Special Clinic", "Women's Squash Week Team Event", "World Squash Day Invitational", "Open Round Robin") ~ "Shared",
     `Admin Session Title` %in% c("Play", "Lesson", "$15", "Play (60 mins $25)") ~ "Practice",
    is.na(`Admin Session Title`) ~ "Practice",
    TRUE ~ NA_character_))

Assigning random values to each player

clean_beginner_players <- clean_beginner_players |>
  left_join(distinct(clean_beginner_players, `Full Name`) |>
  mutate(Player_ID = sample(10000:99999, size = n(), replace = FALSE)), by = "Full Name") |>
  mutate(Start_Time = strptime(`Session Start Time`, "%H:%M"), End_Time = strptime(`Session End Time`, "%H:%M"), Session_Duration = difftime(End_Time, Start_Time, units = "mins")) |>
  select(-Start_Time, -End_Time)

Forming the clean data the correct format

clean_beginner_players <- clean_beginner_players |>
  mutate(`Session Date` = as.Date(`Session Date`, format = "%m/%d/%Y")) |>
  group_by(Player_ID) |>
  mutate(First_Session = min(`Session Date`), Last_Session = max(`Session Date`), Total_Weeks = as.numeric(difftime(Last_Session, First_Session, units = "weeks")), Total_Sessions = n(), Training_Frequency = Total_Sessions / (Total_Weeks + 1)) |>
  ungroup() |>
  select(Player_ID, Gender, `Age Range`, `Skill Level`, `Class Type`, `Session Date`, Session_Duration, Total_Weeks, Total_Sessions, Training_Frequency)

Calculating the needed values:

Identify each player’s progression through class types taken (player progression)

player_progression <- clean_beginner_players |>
  group_by(Player_ID) |>
  arrange(`Session Date`) |>
  mutate(Class_Order = row_number()) |>
  select(Player_ID, Gender, `Age Range`, `Session Date`, `Class Type`, Class_Order) |>
  ungroup()

Find the date each player first took a Beginner class and first took an Intermediate class (player transition)

player_transitions <- player_progression |>
  group_by(Player_ID) |>
  summarise(
    Gender = first(Gender),
    `Age Range` = first(`Age Range`),
    First_Beginner_Date = if(any(`Class Type` == "Beginner", na.rm = TRUE)) min(`Session Date`[`Class Type` == "Beginner" & !is.na(`Class Type`)]) else NA,
    First_Intermediate_Date = if(any(`Class Type` == "Intermediate", na.rm = TRUE)) min(`Session Date`[`Class Type` == "Intermediate" & !is.na(`Class Type`)]) else NA
  ) |>
  ungroup() |>
  mutate(
    Days_to_Transition = as.numeric(difftime(First_Intermediate_Date, First_Beginner_Date, units = "days")),
    Transition_Status = case_when(
      is.na(First_Beginner_Date) & !is.na(First_Intermediate_Date) ~ "Started at Intermediate",
      !is.na(First_Beginner_Date) & is.na(First_Intermediate_Date) ~ "Stayed at Beginner",
      is.na(First_Beginner_Date) & is.na(First_Intermediate_Date) ~ "Never took either class",
      Days_to_Transition < 0 ~ "Took Intermediate before Beginner",
      Days_to_Transition >= 0 ~ "Progressed Beginner to Intermediate"
    )
  )

player_transitions <- player_transitions |>
  mutate(
    Days_to_Transition = as.numeric(difftime(First_Intermediate_Date, First_Beginner_Date, units = "days"))
  )

Filter to only players who progressed from Beginner to Intermediate (transitioned players)

transitioned_players <- player_transitions |>
  filter(Transition_Status == "Progressed Beginner to Intermediate") |>
  mutate(
    Weeks_to_Intermediate = as.numeric(difftime(First_Intermediate_Date, First_Beginner_Date, units = "weeks")),
    Days_to_Intermediate = as.numeric(difftime(First_Intermediate_Date, First_Beginner_Date, units = "days"))
  )

Calculate training frequency during beginner phase (beginner phase)

beginner_phase_data <- clean_beginner_players |>
  filter(`Class Type` == "Beginner") |>
  group_by(Player_ID) |>
  summarise(
    Beginner_Sessions = n(),
    First_Beginner_Session = min(`Session Date`),
    Last_Beginner_Session = max(`Session Date`),
    .groups = 'drop'
  ) |>
  mutate(
    Beginner_Weeks = as.numeric(difftime(Last_Beginner_Session, First_Beginner_Session, units = "weeks"))
  )|>
  filter(Beginner_Weeks > 0)

Calculate training frequency during intermediate phase (intermediate phase)

intermediate_phase_data <- clean_beginner_players |>
  filter(`Class Type` == "Intermediate") |>
  group_by(Player_ID) |>
  summarise(
    Intermediate_Sessions = n(),
    First_Intermediate_Session = min(`Session Date`),
    Last_Intermediate_Session = max(`Session Date`),
    .groups = 'drop'
  ) |>
  mutate(
    Intermediate_Weeks = as.numeric(difftime(Last_Intermediate_Session, First_Intermediate_Session, units = "weeks"))
  )|>
  filter(Intermediate_Weeks > 0)

Final analysis data-set:

analysis_data <- transitioned_players |>
  left_join(beginner_phase_data, by = "Player_ID") |>
  left_join(intermediate_phase_data, by = "Player_ID") |>
  mutate(
    Beginner_Training_Frequency = Beginner_Sessions / (Beginner_Weeks + 1),
    Intermediate_Training_Frequency = Intermediate_Sessions / (Intermediate_Weeks + 1)
  ) |>
  filter(!is.na(Beginner_Training_Frequency) & !is.na(Weeks_to_Intermediate))|>
  select(Player_ID, Gender, `Age Range`, Days_to_Intermediate, Weeks_to_Intermediate, Beginner_Sessions, Beginner_Weeks, Beginner_Training_Frequency)

Data Analysis

General Graphs:

Summary statistics

summary(analysis_data$Weeks_to_Intermediate)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.4286   3.8571  15.3571  28.6429  44.7857 122.1429
summary(analysis_data$Beginner_Training_Frequency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02177 0.18124 0.33159 0.48080 0.68750 1.90000

Scatter Plot

ggplot(analysis_data, aes(x = Beginner_Training_Frequency, y = Weeks_to_Intermediate)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "Training Frequency vs. Time to Intermediate Level",
       x = "Beginner Training Frequency (sessions per week)",
       y = "Weeks to Reach Intermediate Level") +
  theme_minimal()

Histogram Weeks to Intermediate

ggplot(analysis_data, aes(x = Weeks_to_Intermediate)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Weeks to Intermediate Level",
       x = "Weeks", y = "Frequency") +
  theme_minimal()

Histogram Training Frequency

ggplot(analysis_data, aes(x = Beginner_Training_Frequency)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Beginner Training Frequency",
       x = "Sessions per Week", y = "Frequency") +
  theme_minimal()

Box Plots

ggplot(analysis_data, aes(y = Weeks_to_Intermediate)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  labs(title = "Boxplot: Weeks to Intermediate Level",
       y = "Weeks") +
  theme_minimal()

ggplot(analysis_data, aes(y = Beginner_Training_Frequency)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  labs(title = "Boxplot: Beginner Training Frequency",
       y = "Sessions per Week") +
  theme_minimal()

Pearson’s Correlation:

correlation <- cor.test(analysis_data$Beginner_Training_Frequency, 
                       analysis_data$Weeks_to_Intermediate, 
                       method = "pearson")
print(correlation)
## 
##  Pearson's product-moment correlation
## 
## data:  analysis_data$Beginner_Training_Frequency and analysis_data$Weeks_to_Intermediate
## t = -1.824, df = 70, p-value = 0.07242
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.42376349  0.01963095
## sample estimates:
##        cor 
## -0.2130063

Regression Analysis:

Regression model

regression_model <- lm(Weeks_to_Intermediate ~ Beginner_Training_Frequency, data = analysis_data)
summary(regression_model)
## 
## Call:
## lm(formula = Weeks_to_Intermediate ~ Beginner_Training_Frequency, 
##     data = analysis_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33.66 -22.53 -12.94  18.60  88.78 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   36.410      5.534   6.580 7.22e-09 ***
## Beginner_Training_Frequency  -16.155      8.857  -1.824   0.0724 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.99 on 70 degrees of freedom
## Multiple R-squared:  0.04537,    Adjusted R-squared:  0.03173 
## F-statistic: 3.327 on 1 and 70 DF,  p-value: 0.07242

Checking diagnostics

#Residuals vs Order
plot(resid(regression_model), type="b", main="Residuals vs Order", ylab="Residuals")
abline(h=0, lty=2)

# Diagnostic plots
par(mfrow=c(2,2))
plot(regression_model)

par(mfrow=c(1,1))

# Component-Component plus Residual Plots
crPlots(regression_model)

# Correlation matrix
cor(analysis_data[, c("Weeks_to_Intermediate", "Beginner_Training_Frequency", "Beginner_Sessions")], use = "complete.obs")
##                             Weeks_to_Intermediate Beginner_Training_Frequency
## Weeks_to_Intermediate                   1.0000000                  -0.2130063
## Beginner_Training_Frequency            -0.2130063                   1.0000000
## Beginner_Sessions                       0.3969350                   0.1888307
##                             Beginner_Sessions
## Weeks_to_Intermediate               0.3969350
## Beginner_Training_Frequency         0.1888307
## Beginner_Sessions                   1.0000000