# Location of the Houston Marathon results export on disk
file_path <- "C:/Users/user/OneDrive/Desktop/Google Data Analystics/Datasets/Houston Marathon results.csv"

# Load the CSV into a data frame, keeping text columns as character vectors
Houston_Marathon <- read.csv(file = file_path, stringsAsFactors = FALSE)

# Preview the first six rows
head(Houston_Marathon, n = 6L)
##             Name Gun.Time Age Sex Place     Time Year Marathon
## 1        Zhao Yu  3:56:21  NA   F  <NA> 236.3500 2013  Houston
## 2        Zhao Yu  4:25:38  NA   F  <NA> 265.6333 2014  Houston
## 3   Elisa Wagner  4:58:44  43   F (F43) 298.7333 2015  Houston
## 4 Susan Thompson  4:57:47  66   F (F66) 297.7833 2013  Houston
## 5   Carlos Salas  4:10:54  43   M (M43) 250.9000 2014  Houston
## 6   Carlos Salas  4:57:02  44   M (M44) 297.0333 2015  Houston
# Count participants per Sex value, convert the counts to percentages rounded
# to one decimal place, and print them as "<value>: <pct>%" labels
sex_table <- table(Houston_Marathon[["Sex"]])
sex_percent <- round(100 * prop.table(sex_table), digits = 1)
sex_labels <- paste0(names(sex_percent), ": ", sex_percent, "%")
print(sex_labels)
## [1] "F: 36.9%" "M: 63.1%"



``` r
# Tabulate the Sex column, express each count as a percentage of the total
# (one decimal place), and print the "<value>: <pct>%" labels
sex_table <- table(Houston_Marathon[["Sex"]])
sex_percent <- round(100 * prop.table(sex_table), digits = 1)
sex_labels <- paste0(names(sex_percent), ": ", sex_percent, "%")
print(sex_labels)
## [1] "F: 36.9%" "M: 63.1%"



# 1. Participant Demographics

``` r
# Five-number summary, mean and NA count of the Age column
summary(Houston_Marathon[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.00   32.00   40.00   40.08   47.00   99.00   13437

# Histogram of ages

library(ggplot2)
# Histogram of Age using bins of width 5; rows with a missing Age are dropped
# silently because na.rm = TRUE
ggplot(data = Houston_Marathon, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "blue",
    color = "black",
    na.rm = TRUE
  ) +
  labs(
    title = "Age Distribution of Houston Marathon Participants",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal()

# No plot or size is given, so ggsave() writes the last plot at its default size
ggsave(filename = "age_distribution.png")
## Saving 7 x 5 in image

# Sex distribution

# Percentage share of each Sex value (one decimal place), derived from the
# sex_table counts computed earlier in the script
sex_percentages <- round(100 * prop.table(sex_table), digits = 1)
paste0(names(sex_percentages), ": ", sex_percentages, "%") # Proportions
## [1] "F: 36.9%" "M: 63.1%"

# Bar plot of sex distribution

# One bar per Sex value, with bar height equal to the number of rows
ggplot(data = Houston_Marathon, mapping = aes(x = Sex)) +
  geom_bar(
    fill = "green",
    color = "black"
  ) +
  labs(
    title = "Sex Distribution of Houston Marathon Participants",
    x = "Sex",
    y = "Count"
  ) +
  theme_minimal()

# No plot or size is given, so ggsave() writes the last plot at its default size
ggsave(filename = "sex_distribution.png")
## Saving 7 x 5 in image

# Participation by year

# Number of result rows recorded for each Year value
year_table <- table(Houston_Marathon[["Year"]])
year_table
## 
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 
## 4556 3932 4300 4673 5382 5727 5425 5333 5592 5345 6300 6902 7636 6667 7042 7131 
## 2016 2017 2018 2019 
## 7807 7109 6710 6525

# Plot participation over years

# Year is converted to a factor so every year gets its own discrete bar
ggplot(data = Houston_Marathon, mapping = aes(x = factor(Year))) +
  geom_bar(
    fill = "purple",
    color = "black"
  ) +
  labs(
    title = "Participation by Year in Houston Marathon",
    x = "Year",
    y = "Number of Participants"
  ) +
  theme_minimal()

# No plot or size is given, so ggsave() writes the last plot at its default size
ggsave(filename = "participation_by_year.png")
## Saving 7 x 5 in image

# 2. Performance Metrics

# Net time distribution

# Five-number summary and mean of the Time column
summary(Houston_Marathon[["Time"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   126.8   234.2   267.5   268.0   301.4   745.8

# Histogram of net times

# Histogram of Time using bins of width 10; rows with a missing Time are
# dropped silently because na.rm = TRUE
ggplot(data = Houston_Marathon, mapping = aes(x = Time)) +
  geom_histogram(
    binwidth = 10,
    fill = "orange",
    color = "black",
    na.rm = TRUE
  ) +
  labs(
    title = "Net Time Distribution (Minutes)",
    x = "Net Time (Minutes)",
    y = "Count"
  ) +
  theme_minimal()

# No plot or size is given, so ggsave() writes the last plot at its default size
ggsave(filename = "time_distribution.png")
## Saving 7 x 5 in image
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Top 5 performers by sex and year

# For every Sex/Year combination keep the five rows with the smallest Time.
# with_ties = FALSE caps each group at exactly five rows even when times tie.
# The result is left grouped by Sex and Year.
top_performers <- Houston_Marathon %>%
  group_by(Sex, Year) %>%
  slice_min(order_by = Time, n = 5, with_ties = FALSE) %>%
  select(Name, Sex, Year, Time)

print(top_performers)
## # A tibble: 200 × 4
## # Groups:   Sex, Year [40]
##    Name                Sex    Year  Time
##    <chr>               <chr> <int> <dbl>
##  1 Tetyana Pozdnyakova F      2000  152.
##  2 Zinaida Semenova    F      2000  153.
##  3 Wioletta Kryza      F      2000  154.
##  4 Jacqueline Jerotich F      2000  156.
##  5 Hilde Hovdenak      F      2000  157.
##  6 Stacie Alboucrek    F      2001  164.
##  7 Joy Smith           F      2001  176.
##  8 Wizz Bayou          F      2001  176.
##  9 Vasiliki Sikopeti   F      2001  178.
## 10 Emily Mccann        F      2001  180.
## # ℹ 190 more rows

# Average time by year and sex

# Mean Time for each Year/Sex pair, ignoring missing times; .groups = "drop"
# returns an ungrouped tibble
avg_time <- Houston_Marathon %>%
  group_by(Year, Sex) %>%
  summarise(
    Avg_Time = mean(Time, na.rm = TRUE),
    .groups = "drop"
  )

print(avg_time)
## # A tibble: 40 × 3
##     Year Sex   Avg_Time
##    <int> <chr>    <dbl>
##  1  2000 F         278.
##  2  2000 M         262.
##  3  2001 F         275.
##  4  2001 M         257.
##  5  2002 F         276.
##  6  2002 M         254.
##  7  2003 F         282.
##  8  2003 M         257.
##  9  2004 F         285.
## 10  2004 M         258.
## # ℹ 30 more rows
# Format a duration given in (possibly fractional) minutes as "HH:MM:SS".
#
# mins: numeric vector of durations in minutes.
# Returns a character vector of the same length as `mins`.
#
# The duration is rounded to whole seconds BEFORE being split into fields, so
# a rounded-up second carries into the minute/hour fields (59.999 gives
# "01:00:00") instead of producing an invalid ":60" seconds field.
minutes_to_hms <- function(mins) {
  total_sec <- round(mins * 60)
  hrs <- total_sec %/% 3600
  mm <- (total_sec %% 3600) %/% 60
  ss <- total_sec %% 60
  sprintf("%02d:%02d:%02d", hrs, mm, ss)
}
# Mean Time for each Year/Sex pair (missing times ignored), plus the same
# value formatted by minutes_to_hms() in the Avg_HMS column
avg_time <- Houston_Marathon %>%
  group_by(Year, Sex) %>%
  summarise(
    Avg_Time = mean(Time, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Avg_HMS = minutes_to_hms(Avg_Time))

print(avg_time)
## # A tibble: 40 × 4
##     Year Sex   Avg_Time Avg_HMS 
##    <int> <chr>    <dbl> <chr>   
##  1  2000 F         278. 04:37:37
##  2  2000 M         262. 04:22:23
##  3  2001 F         275. 04:35:25
##  4  2001 M         257. 04:16:35
##  5  2002 F         276. 04:36:06
##  6  2002 M         254. 04:14:21
##  7  2003 F         282. 04:42:29
##  8  2003 M         257. 04:16:42
##  9  2004 F         285. 04:44:37
## 10  2004 M         258. 04:18:03
## # ℹ 30 more rows

# Line plot of average time by year and sex

# One line (with point markers) per Sex, tracing Avg_Time across Year
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Average Net Time by Year and Sex",
    x = "Year",
    y = "Average Net Time (Minutes)"
  ) +
  theme_minimal()

# No plot or size is given, so ggsave() writes the last plot at its default size
ggsave(filename = "avg_time_by_year_sex.png")
## Saving 7 x 5 in image
# Format a duration given in (possibly fractional) minutes as "HH:MM:SS".
#
# mins: numeric vector of durations in minutes.
# Returns a character vector of the same length as `mins`.
#
# The duration is rounded to whole seconds BEFORE being split into fields, so
# a rounded-up second carries into the minute/hour fields (59.999 gives
# "01:00:00") instead of producing an invalid ":60" seconds field.
minutes_to_hms <- function(mins) {
  total_sec <- round(mins * 60)
  hrs <- total_sec %/% 3600
  mm <- (total_sec %% 3600) %/% 60
  ss <- total_sec %% 60
  sprintf("%02d:%02d:%02d", hrs, mm, ss)
}

# Mean Time for each Year/Sex pair (missing times ignored), plus the same
# value formatted by minutes_to_hms() in the Avg_HMS column
avg_time <- Houston_Marathon %>%
  group_by(Year, Sex) %>%
  summarise(
    Avg_Time = mean(Time, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Avg_HMS = minutes_to_hms(Avg_Time))
library(ggplot2)

# One line per Sex tracing Avg_Time across Year; every point is annotated
# with its Avg_HMS text, nudged above the marker via vjust
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  geom_text(
    mapping = aes(label = Avg_HMS),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Average Net Time by Year and Sex",
    x = "Year",
    y = "Average Net Time (Minutes)"
  ) +
  theme_minimal()

# Save the last plot with an explicit size
ggsave(
  filename = "avg_time_by_year_sex.png",
  width = 7.29,
  height = 4.5
)
# Format a duration given in (possibly fractional) minutes as "HH:MM:SS".
#
# mins: numeric vector of durations in minutes.
# Returns a character vector of the same length as `mins`.
#
# The duration is rounded to whole seconds BEFORE being split into fields, so
# a rounded-up second carries into the minute/hour fields (59.999 gives
# "01:00:00") instead of producing an invalid ":60" seconds field.
minutes_to_hms <- function(mins) {
  total_sec <- round(mins * 60)
  hrs <- total_sec %/% 3600
  mm <- (total_sec %% 3600) %/% 60
  ss <- total_sec %% 60
  sprintf("%02d:%02d:%02d", hrs, mm, ss)
}
library(ggplot2)

# Same line chart, but the y-axis tick labels are rendered through
# minutes_to_hms() so they read as HH:MM:SS
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Average Net Time by Year and Sex",
    x = "Year",
    y = "Average Net Time (HH:MM:SS)"
  ) +
  scale_y_continuous(labels = minutes_to_hms) +
  theme_minimal()

library(dplyr)
library(tidyr)
# Rows of avg_time holding the overall smallest and largest Avg_Time.
#
# The rows are picked directly with slice_min()/slice_max() rather than by
# computing min/max and joining back on Avg_Time: that avoids matching rows by
# equality on a floating-point key, and a missing Avg_Time no longer turns
# both extremes into NA.
#
# Result columns: Type ("min_time" / "max_time"), Avg_Time, then the remaining
# avg_time columns; the min row(s) come first. Ties are all kept.
extremes <- bind_rows(
  min_time = slice_min(avg_time, order_by = Avg_Time, n = 1),
  max_time = slice_max(avg_time, order_by = Avg_Time, n = 1),
  .id = "Type"
) %>%
  relocate(Avg_Time, .after = Type)

# Line chart of Avg_Time by Year and Sex; only the rows in `extremes` receive
# a text label (formatted by minutes_to_hms), and those labels are kept out of
# the legend
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  geom_text(
    data = extremes,
    mapping = aes(label = minutes_to_hms(Avg_Time)),
    vjust = -0.8,
    size = 3,
    show.legend = FALSE
  ) +
  labs(
    title = "Average Net Time by Year and Sex",
    x = "Year",
    y = "Average Net Time (HH:MM:SS)"
  ) +
  scale_y_continuous(labels = minutes_to_hms) +
  theme_minimal()

library(dplyr)

# For each Sex, the avg_time row(s) with the smallest Avg_Time followed by the
# row(s) with the largest; grouping is removed from the combined result
extremes <- bind_rows(
  avg_time %>% group_by(Sex) %>% slice_min(order_by = Avg_Time, n = 1),
  avg_time %>% group_by(Sex) %>% slice_max(order_by = Avg_Time, n = 1)
) %>%
  ungroup()
library(ggplot2)

# Line chart of Avg_Time by Year and Sex; only the rows in `extremes` are
# labelled, using their precomputed Avg_HMS text, and those labels are kept
# out of the legend
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  geom_text(
    data = extremes,
    mapping = aes(label = Avg_HMS),
    vjust = -0.8,
    size = 3,
    show.legend = FALSE
  ) +
  labs(
    title = "Average Net Time by Year and Sex",
    x = "Year",
    y = "Average Net Time (HH:MM:SS)"
  ) +
  scale_y_continuous(labels = minutes_to_hms) +
  theme_minimal()