# Location of the Houston Marathon results export on the local machine.
file_path <- "C:/Users/user/OneDrive/Desktop/Google Data Analystics/Datasets/Houston Marathon results.csv"

# Load the results; text columns stay character vectors instead of factors.
Houston_Marathon <- read.csv(
  file = file_path,
  stringsAsFactors = FALSE
)

# Preview the leading rows to sanity-check the import.
head(Houston_Marathon)
## Name Gun.Time Age Sex Place Time Year Marathon
## 1 Zhao Yu 3:56:21 NA F <NA> 236.3500 2013 Houston
## 2 Zhao Yu 4:25:38 NA F <NA> 265.6333 2014 Houston
## 3 Elisa Wagner 4:58:44 43 F (F43) 298.7333 2015 Houston
## 4 Susan Thompson 4:57:47 66 F (F66) 297.7833 2013 Houston
## 5 Carlos Salas 4:10:54 43 M (M43) 250.9000 2014 Houston
## 6 Carlos Salas 4:57:02 44 M (M44) 297.0333 2015 Houston
# Count rows per sex, then convert the counts to percentages (one decimal).
sex_table <- table(Houston_Marathon[["Sex"]])
sex_percent <- round(100 * prop.table(sex_table), digits = 1)

# Build "<sex>: <pct>%" labels and show them.
sex_labels <- paste0(names(sex_percent), ": ", sex_percent, "%")
print(sex_labels)
## [1] "F: 36.9%" "M: 63.1%"
``` r
# Frequency of each value in the Sex column.
sex_table <- table(Houston_Marathon[["Sex"]])

# Each sex's share of all rows, as a percentage rounded to one decimal place.
sex_percent <- round(
  100 * prop.table(sex_table),
  digits = 1
)

# Labels of the form "<sex>: <pct>%".
sex_labels <- paste0(names(sex_percent), ": ", sex_percent, "%")
print(sex_labels)
## [1] "F: 36.9%" "M: 63.1%"
# 1. Participant Demographics
``` r
# Summary statistics of participant ages.
summary(Houston_Marathon[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.00 32.00 40.00 40.08 47.00 99.00 13437
Histogram of ages
library(ggplot2)
# Histogram of ages in 5-year bins; rows with a missing Age are dropped
# silently because of na.rm = TRUE.
ggplot(data = Houston_Marathon, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "blue",
    color = "black",
    na.rm = TRUE
  ) +
  labs(
    title = "Age Distribution of Houston Marathon Participants",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal()

# Write the most recent plot to disk at the default size.
ggsave(filename = "age_distribution.png")
## Saving 7 x 5 in image
Sex distribution
# Percentage share of each sex (one decimal), derived from sex_table.
sex_percentages <- round(100 * prop.table(sex_table), digits = 1)

# Proportions, auto-printed as "<sex>: <pct>%".
paste0(names(sex_percentages), ": ", sex_percentages, "%")
## [1] "F: 36.9%" "M: 63.1%"
Bar plot of sex distribution
# Bar chart with one bar per value of Sex; bar height is the row count.
ggplot(data = Houston_Marathon, mapping = aes(x = Sex)) +
  geom_bar(
    fill = "green",
    color = "black"
  ) +
  labs(
    title = "Sex Distribution of Houston Marathon Participants",
    x = "Sex",
    y = "Count"
  ) +
  theme_minimal()

# Write the most recent plot to disk at the default size.
ggsave(filename = "sex_distribution.png")
## Saving 7 x 5 in image
Participation by year
# Number of result rows per race year; evaluating the table prints it.
year_table <- table(Houston_Marathon[["Year"]])
year_table
##
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
## 4556 3932 4300 4673 5382 5727 5425 5333 5592 5345 6300 6902 7636 6667 7042 7131
## 2016 2017 2018 2019
## 7807 7109 6710 6525
Plot participation over years
# Bar chart of row counts per year. Year is converted to a factor so each
# year gets its own discrete bar.
ggplot(data = Houston_Marathon, mapping = aes(x = factor(Year))) +
  geom_bar(
    fill = "purple",
    color = "black"
  ) +
  labs(
    title = "Participation by Year in Houston Marathon",
    x = "Year",
    y = "Number of Participants"
  ) +
  theme_minimal()

# Write the most recent plot to disk at the default size.
ggsave(filename = "participation_by_year.png")
## Saving 7 x 5 in image
# 2. Performance Metrics
Net time distribution
# Summary statistics of the Time column.
summary(Houston_Marathon[["Time"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 126.8 234.2 267.5 268.0 301.4 745.8
Histogram of net times
# Histogram of Time in bins of width 10; rows with a missing Time are
# dropped silently because of na.rm = TRUE.
ggplot(data = Houston_Marathon, mapping = aes(x = Time)) +
  geom_histogram(
    binwidth = 10,
    fill = "orange",
    color = "black",
    na.rm = TRUE
  ) +
  labs(
    title = "Net Time Distribution (Minutes)",
    x = "Net Time (Minutes)",
    y = "Count"
  ) +
  theme_minimal()

# Write the most recent plot to disk at the default size.
ggsave(filename = "time_distribution.png")
## Saving 7 x 5 in image
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 3. Yearly Trends
Average time by year and sex
# Mean Time for every Year x Sex combination, ignoring missing times.
# .groups = "drop" returns an ungrouped result.
avg_time <- summarise(
  group_by(Houston_Marathon, Year, Sex),
  Avg_Time = mean(Time, na.rm = TRUE),
  .groups = "drop"
)
print(avg_time)
## # A tibble: 40 × 3
## Year Sex Avg_Time
## <int> <chr> <dbl>
## 1 2000 F 278.
## 2 2000 M 262.
## 3 2001 F 275.
## 4 2001 M 257.
## 5 2002 F 276.
## 6 2002 M 254.
## 7 2003 F 282.
## 8 2003 M 257.
## 9 2004 F 285.
## 10 2004 M 258.
## # ℹ 30 more rows
#' Format durations given in minutes as "HH:MM:SS" strings.
#'
#' The duration is rounded to whole seconds first and only then split into
#' hours, minutes and seconds. Rounding the seconds component on its own can
#' yield 60 without carrying (e.g. 59.999 minutes became "00:59:60").
#'
#' @param mins Numeric vector of durations in minutes.
#' @return Character vector the same length as `mins`. Hours are zero-padded
#'   to at least two digits and are not wrapped at 24.
minutes_to_hms <- function(mins) {
  total_sec <- round(mins * 60)
  hrs <- total_sec %/% 3600
  min <- (total_sec %% 3600) %/% 60
  sec <- total_sec %% 60
  sprintf("%02d:%02d:%02d", hrs, min, sec)
}
# Mean Time per Year x Sex (missing times ignored), with an extra column
# holding the same mean formatted by minutes_to_hms().
avg_time <- mutate(
  summarise(
    group_by(Houston_Marathon, Year, Sex),
    Avg_Time = mean(Time, na.rm = TRUE),
    .groups = "drop"
  ),
  Avg_HMS = minutes_to_hms(Avg_Time)
)
print(avg_time)
## # A tibble: 40 × 4
## Year Sex Avg_Time Avg_HMS
## <int> <chr> <dbl> <chr>
## 1 2000 F 278. 04:37:37
## 2 2000 M 262. 04:22:23
## 3 2001 F 275. 04:35:25
## 4 2001 M 257. 04:16:35
## 5 2002 F 276. 04:36:06
## 6 2002 M 254. 04:14:21
## 7 2003 F 282. 04:42:29
## 8 2003 M 257. 04:16:42
## 9 2004 F 285. 04:44:37
## 10 2004 M 258. 04:18:03
## # ℹ 30 more rows
Line plot of average time by year and sex
# One line per sex showing the yearly mean time, with a marker at each year.
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Average Net Time by Year and Sex",
    x = "Year",
    y = "Average Net Time (Minutes)"
  ) +
  theme_minimal()

# Write the most recent plot to disk at the default size.
ggsave(filename = "avg_time_by_year_sex.png")
## Saving 7 x 5 in image
#' Format durations given in minutes as "HH:MM:SS" strings.
#'
#' The duration is rounded to whole seconds first and only then split into
#' hours, minutes and seconds. Rounding the seconds component on its own can
#' yield 60 without carrying (e.g. 59.999 minutes became "00:59:60").
#'
#' @param mins Numeric vector of durations in minutes.
#' @return Character vector the same length as `mins`. Hours are zero-padded
#'   to at least two digits and are not wrapped at 24.
minutes_to_hms <- function(mins) {
  total_sec <- round(mins * 60)
  hrs <- total_sec %/% 3600
  min <- (total_sec %% 3600) %/% 60
  sec <- total_sec %% 60
  sprintf("%02d:%02d:%02d", hrs, min, sec)
}
# Yearly mean Time per sex (missing times ignored), plus the formatted
# version of that mean produced by minutes_to_hms().
avg_time <- mutate(
  summarise(
    group_by(Houston_Marathon, Year, Sex),
    Avg_Time = mean(Time, na.rm = TRUE),
    .groups = "drop"
  ),
  Avg_HMS = minutes_to_hms(Avg_Time)
)
library(ggplot2)
# Yearly mean time per sex, with every point annotated by its Avg_HMS value
# placed just above the marker.
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  geom_text(
    mapping = aes(label = Avg_HMS),
    vjust = -0.5,
    size = 3
  ) +
  labs(title = "Average Net Time by Year and Sex", x = "Year", y = "Average Net Time (Minutes)") +
  theme_minimal()

# Write the most recent plot to disk with an explicit size (inches by default).
ggsave(
  filename = "avg_time_by_year_sex.png",
  width = 7.29,
  height = 4.5
)
#' Format durations given in minutes as "HH:MM:SS" strings.
#'
#' The duration is rounded to whole seconds first and only then split into
#' hours, minutes and seconds. Rounding the seconds component on its own can
#' yield 60 without carrying (e.g. 59.999 minutes became "00:59:60").
#'
#' @param mins Numeric vector of durations in minutes.
#' @return Character vector the same length as `mins`. Hours are zero-padded
#'   to at least two digits and are not wrapped at 24.
minutes_to_hms <- function(mins) {
  total_sec <- round(mins * 60)
  hrs <- total_sec %/% 3600
  min <- (total_sec %% 3600) %/% 60
  sec <- total_sec %% 60
  sprintf("%02d:%02d:%02d", hrs, min, sec)
}
library(ggplot2)
# Yearly mean time per sex; the y-axis tick labels are rendered through
# minutes_to_hms() so they read as HH:MM:SS.
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  labs(title = "Average Net Time by Year and Sex", x = "Year", y = "Average Net Time (HH:MM:SS)") +
  scale_y_continuous(labels = minutes_to_hms) +
  theme_minimal()

library(dplyr)
library(tidyr)
# Rows of avg_time holding the overall smallest and largest Avg_Time, tagged
# in a Type column as "min_time" / "max_time".
# The rows are selected directly with slice_min()/slice_max() instead of
# computing min/max and joining back on the double-valued Avg_Time key,
# which would depend on exact floating-point equality.
# select() keeps the previous column order: Type, Avg_Time, then the rest.
extremes <- bind_rows(
  min_time = slice_min(avg_time, order_by = Avg_Time, n = 1),
  max_time = slice_max(avg_time, order_by = Avg_Time, n = 1),
  .id = "Type"
) %>%
  select(Type, Avg_Time, everything())
# Yearly mean time per sex with an HH:MM:SS y-axis. Only the rows in
# `extremes` are labelled, and the text layer is kept out of the legend.
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  geom_text(
    data = extremes,
    mapping = aes(label = minutes_to_hms(Avg_Time)),
    size = 3,
    vjust = -0.8,
    show.legend = FALSE
  ) +
  labs(title = "Average Net Time by Year and Sex", x = "Year", y = "Average Net Time (HH:MM:SS)") +
  scale_y_continuous(labels = minutes_to_hms) +
  theme_minimal()

library(dplyr)
# For each sex, the row with the smallest Avg_Time followed by the row with
# the largest Avg_Time; grouping is removed from the combined result.
extremes <- ungroup(
  bind_rows(
    slice_min(group_by(avg_time, Sex), order_by = Avg_Time, n = 1),
    slice_max(group_by(avg_time, Sex), order_by = Avg_Time, n = 1)
  )
)
library(ggplot2)
# Yearly mean time per sex with an HH:MM:SS y-axis. The rows in `extremes`
# are annotated with their Avg_HMS value; the text layer is kept out of the
# legend.
ggplot(
  data = avg_time,
  mapping = aes(x = Year, y = Avg_Time, color = Sex, group = Sex)
) +
  geom_line() +
  geom_point() +
  geom_text(
    data = extremes,
    mapping = aes(label = Avg_HMS),
    size = 3,
    vjust = -0.8,
    show.legend = FALSE
  ) +
  labs(title = "Average Net Time by Year and Sex", x = "Year", y = "Average Net Time (HH:MM:SS)") +
  scale_y_continuous(labels = minutes_to_hms) +
  theme_minimal()
