Final Project

DSA410_Julian_Iveyls_Andrew

Author

Andrew Markizon & Julian Bennett and Ivelys Garcia Sanabria

Published

April 9, 2025

Pickleball Game Analysis

##Loading the DataSet

library(readr)
Warning: package 'readr' was built under R version 4.3.3
library(readxl)
Warning: package 'readxl' was built under R version 4.3.3
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.3.3
Warning: package 'ggplot2' was built under R version 4.3.3
Warning: package 'tibble' was built under R version 4.3.3
Warning: package 'tidyr' was built under R version 4.3.3
Warning: package 'purrr' was built under R version 4.3.3
Warning: package 'dplyr' was built under R version 4.3.3
Warning: package 'stringr' was built under R version 4.3.3
Warning: package 'forcats' was built under R version 4.3.3
Warning: package 'lubridate' was built under R version 4.3.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.2
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(skimr)
Warning: package 'skimr' was built under R version 4.3.3
library(naniar)
Warning: package 'naniar' was built under R version 4.3.3

Attaching package: 'naniar'

The following object is masked from 'package:skimr':

    n_complete
pickleball <- read_csv("Data/NCSGpickleballData.csv")
New names:
Rows: 21750 Columns: 9
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(3): EventType, Gender, EventYearParticipantAge...7 dbl (4): ParticipantID,
EventYear, BirthYear, EventYearParticipantAge...6 lgl (2): ...8, ...9
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `EventYearParticipantAge` -> `EventYearParticipantAge...6`
• `EventYearParticipantAge` -> `EventYearParticipantAge...7`
• `` -> `...8`
• `` -> `...9`

Inspect Data

#Displays first 6 rows of the pickleball dataset
head(pickleball)
# A tibble: 6 × 9
  ParticipantID EventType      Gender EventYear BirthYear EventYearParticipant…¹
          <dbl> <chr>          <chr>      <dbl>     <dbl>                  <dbl>
1        105925 Bowling        Male        2016      1944                     72
2        105927 Pickleball     Female      2016      1954                     62
3        105927 Softball Tour… Female      2016      1954                     62
4        105938 Basketball Sh… Male        2016      1931                     85
5        105947 Football Throw Female      2016      1921                     95
6        105950 Performing Ar… Female      2016      1947                     69
# ℹ abbreviated name: ¹​EventYearParticipantAge...6
# ℹ 3 more variables: EventYearParticipantAge...7 <chr>, ...8 <lgl>, ...9 <lgl>
## Displays the structure of the 'pickleball' dataset, including data types and a preview of the contents
str(pickleball)
spc_tbl_ [21,750 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ParticipantID              : num [1:21750] 105925 105927 105927 105938 105947 ...
 $ EventType                  : chr [1:21750] "Bowling" "Pickleball" "Softball Tournament" "Basketball Shooting" ...
 $ Gender                     : chr [1:21750] "Male" "Female" "Female" "Male" ...
 $ EventYear                  : num [1:21750] 2016 2016 2016 2016 2016 ...
 $ BirthYear                  : num [1:21750] 1944 1954 1954 1931 1921 ...
 $ EventYearParticipantAge...6: num [1:21750] 72 62 62 85 95 69 79 68 57 73 ...
 $ EventYearParticipantAge...7: chr [1:21750] "70-74" "60-64" "60-64" "85-89" ...
 $ ...8                       : logi [1:21750] NA NA NA NA NA NA ...
 $ ...9                       : logi [1:21750] NA NA NA NA NA NA ...
 - attr(*, "spec")=
  .. cols(
  ..   ParticipantID = col_double(),
  ..   EventType = col_character(),
  ..   Gender = col_character(),
  ..   EventYear = col_double(),
  ..   BirthYear = col_double(),
  ..   EventYearParticipantAge...6 = col_double(),
  ..   EventYearParticipantAge...7 = col_character(),
  ..   ...8 = col_logical(),
  ..   ...9 = col_logical()
  .. )
 - attr(*, "problems")=<externalptr> 
# Provides a transposed view of the dataset's structure, showing variable names, types, and a preview of the values
glimpse(pickleball)
Rows: 21,750
Columns: 9
$ ParticipantID               <dbl> 105925, 105927, 105927, 105938, 105947, 10…
$ EventType                   <chr> "Bowling", "Pickleball", "Softball Tournam…
$ Gender                      <chr> "Male", "Female", "Female", "Male", "Femal…
$ EventYear                   <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, …
$ BirthYear                   <dbl> 1944, 1954, 1954, 1931, 1921, 1947, 1937, …
$ EventYearParticipantAge...6 <dbl> 72, 62, 62, 85, 95, 69, 79, 68, 57, 73, 66…
$ EventYearParticipantAge...7 <chr> "70-74", "60-64", "60-64", "85-89", "95-99…
$ ...8                        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ...9                        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## Generates summary statistics for each variable in the 'pickleball' dataset (e.g., min, max, mean, median for numeric variables; counts for factors)
summary(pickleball)
 ParticipantID     EventType            Gender            EventYear   
 Min.   :105925   Length:21750       Length:21750       Min.   :2016  
 1st Qu.:110202   Class :character   Class :character   1st Qu.:2017  
 Median :114939   Mode  :character   Mode  :character   Median :2019  
 Mean   :126711                                         Mean   :2020  
 3rd Qu.:140210                                         3rd Qu.:2023  
 Max.   :175766                                         Max.   :2024  
   BirthYear    EventYearParticipantAge...6 EventYearParticipantAge...7
 Min.   :1918   Min.   : 50.00              Length:21750               
 1st Qu.:1945   1st Qu.: 62.00              Class :character           
 Median :1951   Median : 69.00              Mode  :character           
 Mean   :1951   Mean   : 68.55                                         
 3rd Qu.:1958   3rd Qu.: 75.00                                         
 Max.   :1974   Max.   :101.00                                         
   ...8           ...9        
 Mode:logical   Mode:logical  
 NA's:21750     NA's:21750    
                              
                              
                              
                              

Age of Participants

# Calculate summary statistics (mean, median, min, max) of participant age by event year
AgeofParticipant <- pickleball %>%
  # Select relevant columns
  select(EventType, EventYearParticipantAge...6, ParticipantID, EventYear) %>%
  
  # Filter for events specifically labeled as "Pickleball"
  filter(EventType == "Pickleball") %>%
  
  # Group data by the year of the event
  group_by(EventYear) %>%
  
  # Summarize age statistics for each year
  summarise(
    MeanAge = mean(EventYearParticipantAge...6, na.rm = TRUE),   # Average age
    MedianAge = median(EventYearParticipantAge...6, na.rm = TRUE), # Median age
    MinAge = min(EventYearParticipantAge...6, na.rm = TRUE),     # Minimum age
    MaxAge = max(EventYearParticipantAge...6, na.rm = TRUE)      # Maximum age
  )

# Display the summarized data
AgeofParticipant
# A tibble: 7 × 5
  EventYear MeanAge MedianAge MinAge MaxAge
      <dbl>   <dbl>     <dbl>  <dbl>  <dbl>
1      2016    66.3        66     50     94
2      2017    67.4        67     50     91
3      2018    67.0        67     50     96
4      2019    66.9        67     50     97
5      2022    67.6        68     50     93
6      2023    66.8        66     50     89
7      2024    66.4        66     50     90

Is Pickleball Thriving or Declining

# Count the number of Pickleball participants per event year
ThrivePickleball <- pickleball %>%
  # Select only the relevant columns
  select(EventType, EventYearParticipantAge...6, ParticipantID, EventYear) %>%
  
  # Filter for rows where the event type is "Pickleball"
  filter(EventType == "Pickleball") %>%
  
  # Group by the year of the event
  group_by(EventYear) %>%
  
  # Count the number of participants per year
  summarise(count = n())

# Display the result
ThrivePickleball
# A tibble: 7 × 2
  EventYear count
      <dbl> <int>
1      2016   466
2      2017   467
3      2018   552
4      2019   563
5      2022   501
6      2023   719
7      2024   864

Do Pickleball Players play other sports

MultiSportTrend <- pickleball %>%
  
  # Identify participants who played Pickleball in a given year
  filter(EventType == "Pickleball") %>%
  
  # Select only relevant columns
  select(ParticipantID, EventYear) %>%
  
  # Join back to the original dataset to find other sports played by the same participants
  inner_join(pickleball, by = c("ParticipantID", "EventYear")) %>%
  
  # Filter out Pickleball to keep only other sports
  filter(EventType != "Pickleball") %>%
  
  # Count the number of unique Pickleball players who also played other sports per year
  group_by(EventYear) %>%
  summarise(
    MultiSportPlayerCount = n_distinct(ParticipantID),
    .groups = "drop"
  )
  
MultiSportTrend
# A tibble: 7 × 2
  EventYear MultiSportPlayerCount
      <dbl>                 <int>
1      2016                   145
2      2017                   127
3      2018                   138
4      2019                   120
5      2022                   104
6      2023                   113
7      2024                   148

What other Sports do Pickleball Players Play

MultiSportTrend <- pickleball %>%
  
  # Identify participants who played Pickleball in a given year
  filter(EventType == "Pickleball") %>%
  
  # Select only relevant columns
  select(ParticipantID, EventYear) %>%
  
  # Join back to the original dataset to find other sports played by the same participants
  inner_join(pickleball, by = c("ParticipantID", "EventYear")) %>%
  
  # Filter out Pickleball to keep only other sports
  filter(EventType != "Pickleball") %>%
  
  # Group by event year and sport to count occurrences
  group_by(EventYear, EventType) %>%
  
  # Count the number of Pickleball players who also played other sports
  summarise(MultiSportCount = n(), .groups = "drop") %>%
  
  # Rename EventType to OtherSport for clarity
  rename(OtherSport = EventType)
  
MultiSportTrend
# A tibble: 166 × 3
   EventYear OtherSport            MultiSportCount
       <dbl> <chr>                           <int>
 1      2016 Badminton                          15
 2      2016 Basketball Shooting                11
 3      2016 Basketball Tournament              12
 4      2016 Billiards                           3
 5      2016 Bocce                              21
 6      2016 Bowling                             4
 7      2016 Cornhole                            7
 8      2016 Cycling                             5
 9      2016 Discus Throw                        4
10      2016 Distance Run                        2
# ℹ 156 more rows

Most Played Sport Pickleball Players Play by Year

MultiSportTrend <- pickleball %>%
  
  # Filter for participants who played Pickleball
  filter(EventType == "Pickleball") %>%
  
  # Keep only participant ID and the year of the event
  select(ParticipantID, EventYear) %>%
  
  # Join back with the full dataset to find what *other* sports these Pickleball players played in the same year
  inner_join(pickleball, by = c("ParticipantID", "EventYear")) %>%
  
  # Remove Pickleball to focus only on other sports
  filter(EventType != "Pickleball") %>%
  
  # Group by year and sport
  group_by(EventYear, EventType) %>%
  
  # Count how many Pickleball players also played each other sport in each year
  summarise(MultiSportCount = n(), .groups = "drop") %>%
  
  # Rename EventType column to OtherSport for clarity
  rename(OtherSport = EventType) %>%
  
  # Rank the sports by number of players in each year and get the most played sport
  group_by(EventYear) %>%
  mutate(MostPlayedSport = if_else(MultiSportCount == max(MultiSportCount), OtherSport, NA_character_)) %>%
  
  # Keep only the most played sport for each year (removes duplicates)
  filter(!is.na(MostPlayedSport)) %>%
  
  # Clean up by removing the MostPlayedSport column after extraction
  select(-MostPlayedSport)

MultiSportTrend
# A tibble: 8 × 3
# Groups:   EventYear [7]
  EventYear OtherSport            MultiSportCount
      <dbl> <chr>                           <int>
1      2016 Bocce                              21
2      2017 Badminton                          14
3      2017 Table Tennis                       14
4      2018 Tennis                             18
5      2019 Softball Tournament                15
6      2022 Basketball Tournament              12
7      2023 Basketball Shooting                14
8      2024 Tennis                             18

Do Pickleball Players play Silver Arts

#read in Silver Arts dataset
silverArts <- read_excel("Data/NCSGSilverArtsData.xlsx")
#Identify unique Pickleball participants by year
pb_players <- pickleball %>%
  filter(EventType == "Pickleball") %>%
  select(ParticipantID, EventYear) %>%
  distinct()

#Identify participants in other sports (excluding Pickleball) by year
other_sports <- pickleball %>%
  filter(EventType != "Pickleball") %>%
  select(ParticipantID, EventYear) %>%
  distinct()

#Identify participants in Silver Arts by year
silver_arts_participation <- silverArts %>%
  select(ParticipantID, EventYear) %>%
  distinct()

#Combine participation data for Pickleball players
pb_participation <- pb_players %>%
  # Join with other sports participation, marking TRUE where matched
  left_join(other_sports %>% mutate(PlayedOtherSport = TRUE),
            by = c("ParticipantID", "EventYear")) %>%
  
  # Join with Silver Arts participation, marking TRUE where matched
  left_join(silver_arts_participation %>% mutate(PlayedSilverArts = TRUE),
            by = c("ParticipantID", "EventYear")) %>%
  
  # Fill NA values with FALSE for boolean participation indicators
  mutate(
    PlayedOtherSport = ifelse(is.na(PlayedOtherSport), FALSE, TRUE),
    PlayedSilverArts = ifelse(is.na(PlayedSilverArts), FALSE, TRUE)
  )

#Categorize each participant's type of participation and count each category
participation_summary <- pb_participation %>%
  mutate(
    ParticipationType = case_when(
      PlayedOtherSport & PlayedSilverArts ~ "Pickleball + Other Sports + Silver Arts",
      PlayedOtherSport ~ "Pickleball + Other Sports Only",
      PlayedSilverArts ~ "Pickleball + Silver Arts Only",
      TRUE ~ "Pickleball Only"
    )
  ) %>%
  count(ParticipationType)  # Count number of participants in each category

# View the summary
participation_summary
# A tibble: 3 × 2
  ParticipationType                           n
  <chr>                                   <int>
1 Pickleball + Other Sports + Silver Arts    58
2 Pickleball + Other Sports Only            837
3 Pickleball Only                          3237