DSC 385 - Project 1 Report

Name and EID

Arvin Zapanta (acz492)

Setup

## Loading all the data files
## DO NOT CHANGE THIS CODE
championships <- read_tsv("WCA_export_championships.tsv.bz2")
competitions <- read_tsv("WCA_export_Competitions.tsv.bz2")
continents <- read_tsv("WCA_export_Continents.tsv.bz2")
countries <- read_tsv("WCA_export_Countries.tsv.bz2")
eligible_country_iso2s_for_championship <- read_tsv("WCA_export_eligible_country_iso2s_for_championship.tsv.bz2")
events <- read_tsv("WCA_export_Events.tsv.bz2")
formats <- read_tsv("WCA_export_Formats.tsv.bz2")
persons <- read_tsv("WCA_export_Persons.tsv.bz2")
ranksaverage <- read_tsv("WCA_export_RanksAverage_333.tsv.bz2")
rankssingle <- read_tsv("WCA_export_RanksSingle_333.tsv.bz2")
results <- read_tsv("WCA_export_Results_333.tsv.bz2")
roundtypes <- read_tsv("WCA_export_RoundTypes.tsv.bz2")
scrambles <- read_tsv("WCA_export_Scrambles.tsv.bz2")

Required Questions

Active Speed Cubers

How many active (3x3x3) speedcubers are there registered with the WCA?

For this question an active speeedcuber is defined as any person registered in the WCA who has competed in at least two competitions in the years 2022–2024.

## Add your code here
# Step 1: Filter (3x3x3) active speedcubers who competed in year 2022--2024.
filtered_results <- results |>
  filter(eventId == 333, grepl("2022$|2023$|2024$", competitionId))

# Identifying personId that participated in at least two different competitions
person_participation <- filtered_results |>
  group_by(personId) |>
  summarise(competition_count = n_distinct(competitionId)) |>
  filter(competition_count >= 2)

# Displaying the relevant personIds
num_participants = nrow(person_participation)


# Display the results
cat("The number of active (3x3x3) speedcubers that are registered with the WCA is:",num_participants)

The number of active (3x3x3) speedcubers that are registered with the WCA is: 39484

Write your answer here.

World Records

This question has two parts:

According to the data, who holds the world record single best solve?

## Add your code here

# Finding the world record best solve
world_record <- filtered_results |>
  filter(best > 0) |> # Discard negative values to remove DNF attempts which are marked by -1
   arrange(best)|> # Arrange best to place shortest time at top
  filter(best == min(best, na.rm = TRUE)) |>  # Get the row with the best (lowest) solve
  select(personName, personId, competitionId, best)  # Selecting relevant columns


# Display the results
cat(world_record$personName, "holds the world record single best solve.")

Max Park holds the world record single best solve.

Write your answer here.

On what date was this record set?

## Add your code here
# Extract the competition ID where the world record single best solve occurred
event_of_single_best_solve <- world_record$competitionId[1]

# Retrieve competition details (month, day, year) for the event where the record was set
get_info_competitions  <- competitions |>
  filter(id==event_of_single_best_solve)|>
  select(id,month,day,year) 

# Convert month number to full month name, and pull day and year
record_month <- month.name[get_info_competitions$month]
record_day <- get_info_competitions$day
record_year <- get_info_competitions$year


# Display the results
cat( "The record was set on", formatted_date <- paste0(record_month, " ", record_day, ", ", record_year, "."))

The record was set on June 11, 2023.

Write your answer here.

According to the data, who previously held the world record best single solve?

## Add your code here
# Filtering results for eventId 333 and competitionId ending in a year **before** 2022
part_2_filtered_results <- results |>
  filter(eventId == 333, grepl("^(.*)(2019$|2020$|2021$|201[0-8]$)", competitionId)) 

# Identifying personId that participated in at least two different competitions
prev_person_participation <- part_2_filtered_results |>
  group_by(personId) |>
  summarise(competition_count = n_distinct(competitionId)) |>
  filter(competition_count >= 2)

# Finding the world record best solve
prev_world_record <- part_2_filtered_results |>
  filter(best > 0) |>
   arrange(best)|>
  filter(best == min(best, na.rm = TRUE)) |>  # Get the row with the best (lowest) solve
  select(personName, personId, competitionId, best)  # Selecting relevant columns


# Display the results
cat(prev_world_record$personName, "holds the previous world record single best solve.")

Yusheng Du (杜宇生) holds the previous world record single best solve.

Write your answer here.

On what date was this previous record set?

## Add your code here
# Retrieve competition details (month, day, year) for the event where the record was set
prev_event_of_single_best_solve <- prev_world_record$competitionId[1]
# Retrieve competition details (month, day, year) for the event where the record was set
prev_get_info_competitions  <- competitions |>
  filter(id==prev_event_of_single_best_solve)|>
  select(id,month,day,year) 

# Convert month number to full month name
prev_record_month <- month.name[prev_get_info_competitions$month]
prev_record_day <- prev_get_info_competitions$day
prev_record_year <- prev_get_info_competitions$year


# Display the results
cat( "The record was set on", formatted_date <- paste0(prev_record_month, " ", prev_record_day, ", ", prev_record_year, "."))

The record was set on November 24, 2018.

Write your answer here.

NOTE: For these questions, consider all speedcubers (not just active ones) and define “best” as the fastest time for a single solve (not for an average).

Regional Rankings

This question has two parts:

Amongst all speedcubers, who is the top ranked male speedcuber (for best single solve) in Australia?

## Add your code here
# Step 1: Filter male participants from Australia
find_male<- persons |>
  filter(gender=="m",countryId=="Australia")|>
  select(id, name, countryId, gender)  # Keep relevant columns

# Step 2: Filter competition results for Australian participants in 3x3x3
regional_filtered_results <- results |>
  filter( eventId == 333,personCountryId=="Australia") |>
  filter(best > 0) |>
   arrange(best)|>
  select(personName, personId, competitionId, best)  # Selecting relevant columns
 
# Step 3:Identifying personId that participated in at least two different competitions
regional_person_participation <- regional_filtered_results |>
  group_by(personId) |>
  summarise(competition_count = n_distinct(competitionId),# Count distinct competitions
            best = min(best)) |> # Keep the best (lowest) solve for each person
  
  filter(competition_count >= 2)# Filter for people with at least 2 competitions



# Step 4: Left Join to link 'id' from find_male with 'personId' from regional_grouped_results
regional_linked_results <- left_join(regional_person_participation, find_male, by = c("personId" = "id")) |> 
  drop_na() |>   # Remove NA rows
  arrange(best)




# Display the results
cat(regional_linked_results$name[1], "is the top ranked male speedcuber (for best single solve) in Australia.")

Jode Brewster is the top ranked male speedcuber (for best single solve) in Australia.

Write your answer here.

Amongst all speedcubers, who is the top ranked female speedcuber (for best single solve time) in Europe?

NOTE: Europe is identified under the continentId column of the countries table.

## Add your code here
# Step 1: Filter female participants from Australia
find_female<- persons |>
  filter(gender=="f")|>
  select(id, name, countryId, gender)  # Keep relevant columns

# Step 2: Filter Europe under continentId
Europe<- countries |>
  filter(continentId=="_Europe")|>
  select(id, continentId)



# Step 3: Filter competition results for European contries participants in 3x3x3
part2_regional_filtered_results <- results |>
  filter(eventId == 333) |>  # 3x3x3 event
  filter(personId %in% find_female$id) |>  # Ensure they are female
  filter(personCountryId %in% Europe$id) |>  # Ensure they are from Europe
  filter(best > 0) %>%  # Exclude negative or missing times
  arrange(best) %>%  # Sort in ascending order (fastest solve first)
  select(personName, personId, competitionId, best)  # Keep relevant columns

## Step 4: Group by 'personId', count unique competitions, and filter for at least 2 competitions
female_grouped_results <- part2_regional_filtered_results |>
  group_by(personId, personName) |>
  summarise(
    competition_count = n_distinct(competitionId),  # Count distinct competitions
    best = min(best),  # Keep the best (lowest) solve for each person
    .groups = "drop"  # Ensures the output is ungrouped, preventing summarise() warning
  ) |>
  filter(competition_count >= 2)  # Ensure they have competed in at least 2 competitions

## Step 5: Get the top-ranked female speedcuber (best single solve)
top_female_speedcuber <- female_grouped_results |>
  filter(best == min(best, na.rm = TRUE))|>  # Get the best (lowest) solve time
  arrange(best)


# Display the results
cat(top_female_speedcuber$personName[1], "is the top ranked female speedcuber (for best single solve) in Europe.")

Magdalena Pabisz is the top ranked female speedcuber (for best single solve) in Europe.

Write your answer here.

Time Until Sub-5

Having a time below 5 seconds is considered an elite achievement and most speedcubers have to complete a large number of solves before they can obtain a sub-5 second solve.

For the current top 10 speedcubers in the world (as recorded in the RanksSingle table), on average, how many solves did they have to do before achieving a sub-5 second solve?

NOTE: Each round of a competition has 5 solves that should be considered separately when counting the number of solves.

## Add your code here


# Step 1: Extract the top 10 world-ranked speedcubers from rankssingle
top_10_speedcubers <- rankssingle |>
  filter(worldRank <= 10) |>
  select(personId)

# Step 2: Match these cubers with their competition results from results table
top_10_results <- results |>
  filter(personId %in% top_10_speedcubers$personId) |>
  select(competitionId, personId, roundTypeId, value1, value2, value3, value4, value5)  # Include roundTypeId


# Step 3: Extract the dates from competitions
top_10_dates <- competitions |>
  filter(id %in% top_10_results$competitionId) |>
  select(id, month, day, year)

# Step 4: Extract the dates from competitions and join with top_10_results
top_10_results_with_dates <- top_10_results |>
  left_join(
    competitions %>% select(id, month, day, year),
    by = c("competitionId" = "id")
  ) |>
  mutate(solve_date = as.Date(paste(year, month, day, sep = "-")))  # Create a proper date column

# Step 5: Join with RoundTypes to get round rank order
top_10_results_with_dates <- top_10_results_with_dates |>
  left_join(roundtypes |>select(id, rank), by = c("roundTypeId" = "id"))  # Merge round order

# Step 6: Convert to long format (one solve per row)
long_format_results <- top_10_results_with_dates |>
  pivot_longer(cols = starts_with("value"), names_to = "solve_attempt", values_to = "solve_time") |>
  filter(solve_time > 0) |> # Remove invalid solves (-1 or missing)
  arrange(personId, solve_date, roundTypeId, competitionId, rank)  # Arrange by date and round order

# Step 7: Count number of solves before achieving a sub-5 second solve per person
sub_5_solves_per_person <- long_format_results |>
  group_by(personId) |>
  mutate(solve_number = row_number()) 

# Step 8: Find the first solve_number where solve_time is < 500 per person
sub_5_solves_per_person <- sub_5_solves_per_person |>
  filter(solve_time < 500) |>
  summarise(
    first_sub_5_attempt = min(solve_number),  # Find first sub-5 solve considering all rounds
    .groups = "drop"
  )

# Step 9: Compute the average number of solves before achieving a sub-5 second solve per person
average_solves_per_person <- sub_5_solves_per_person |>
  summarise(average_solves = mean(first_sub_5_attempt, na.rm = TRUE))

# Step 10: Display results
cat("On average, the top 10 speedcubers had to do", round(average_solves_per_person$average_solves, 2), 
    "solves before achieving a sub-5 second solve.")

On average, the top 10 speedcubers had to do 502.5 solves before achieving a sub-5 second solve.

Write your answer here.

For one of the top 10 speedcubers make a scatterplot of their individual single solve times vs. the date of the solve, with date on the x-axis and solve time on the y-axis.

## Add your code here

# Step 1: Extract the top 10 world-ranked speedcubers from rankssingle
top_10_speedcubers <- rankssingle|>
  filter(worldRank <= 10) |>  # Select only top 10
  select(personId)  # Keep only personId

# Step 2: Match these cubers with their competition results from results table
top_10_results <- results |>
  filter(personId %in% top_10_speedcubers$personId) |>  # Keep only top 10 competitors
  select(competitionId, personName, personId, value1, value2, value3, value4, value5)  # Select solve values


# Step 3: Extract the dates from competitions
top_10_dates <- competitions |>
  filter(id %in% top_10_results$competitionId) |>
  select(id, month, day, year)

# Step 4: Extract the dates from competitions and join with top_10_results
top_10_results_with_dates <- top_10_results |>
  left_join(
    competitions |> select(id, month, day, year),
    by = c("competitionId" = "id")
  )


# Step 5: Convert the results to long format (one solve per row)
long_format_results <- top_10_results_with_dates |>
  pivot_longer(cols = starts_with("value"), names_to = "solve_attempt", values_to = "solve_time") |>
  filter(solve_time > 0)  # Remove invalid solves (-1 or missing)

# Step 6: Create a proper date column for plotting
long_format_results <- long_format_results |>
  mutate(solve_date = as.Date(paste(year, month, day, sep = "-")))  # Combine year, month, day into a date




# Step 7: Select one top speedcuber (modify as needed)
chosen_person <- long_format_results|>
  filter(personId == "2019WANY36")  # Change this to any specific personId from top 10

# Extract the person's name for dynamic title
person_name <- unique(chosen_person$personName)



# Step 8: Create a scatterplot of solve times vs. competition year
ggplot(chosen_person, aes(x = solve_date, y = solve_time)) +
  geom_point(color = "blue", size = 2) +
    labs(title = paste("Single Solve Times vs. Date of the Solve for:", person_name),
       x = "Date of Solve",
       y = "Solve Time (Milliseconds)") +
  theme_minimal()

Up-and-Coming Speed Cubers

Which speed cubers not in the top 10,000 (worldwide for single best time) should we keep an eye on for the near future?

The idea here is to identify “up-and-coming” speedcubers who are not yet achieving elite times. Come up with a list of five speedcubers (provide their names and WCA IDs) that you have identified as “up-and-coming”. There is no one way to answer this question and the goal is to provide an analysis of the data that justifies the selection of your five names.

## Add your code here

Write your result here.

Name and EID

Setup

Required Questions

Active Speed Cubers

World Records

Regional Rankings

Time Until Sub-5

Up-and-Coming Speed Cubers

Discussion