Maratona de Boston - 2023

Everton Gonçalves

2024-09-26

BAA Logo

Descrição:

A Maratona de Boston é uma corrida anual que acontece na cidade de Boston, Massachusetts. Ano após ano, ela atrai milhares de corredores de diferentes níveis de habilidade e experiência: participantes de todo o mundo se reúnem para competir por recordes mundiais, recordes de divisão e melhores marcas pessoais. Analisar os resultados desses atletas oferece uma visão interessante sobre a distribuição e a variação dos tempos de todos os corredores que completaram com sucesso essa corrida histórica.

Fonte / Metadados:

Fonte de dados: Download

Informações de metadados: Ver

Instalação dos pacotes utilizados

# Packages this analysis depends on.
required_packages <- c(
  "here", "skimr", "janitor", "dplyr", "rmarkdown",
  "lubridate", "tidyverse", "gridExtra"
)

# Install only the packages that are not available yet, so re-running the
# script does not download and reinstall everything each time.
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach the packages whose functions are called unqualified below
# (`%>%`, filter(), ggplot(), seconds_to_period(), tableGrob(), ...).
# Attaching a package that is already attached is a no-op.
library(tidyverse)
library(lubridate)
library(gridExtra)

Coleta dos Dados

# Collect the data: download the 2023 Boston Marathon results as a
# comma-separated file and load them into a data frame.
ds <- read.csv(
  file = "https://data.scorenetwork.org/data/boston_marathon_2023.csv",
  sep = ","
)

Analisando o data set

# Print a compact overview of the data set: dimensions, column names,
# column types and the first few values of each column.
str(ds)
## 'data.frame':    26598 obs. of  15 variables:
##  $ age_group         : chr  "18-39" "18-39" "18-39" "18-39" ...
##  $ place_overall     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ place_gender      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ place_division    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ name              : chr  "Chebet, Evans" "Geay, Gabriel" "Kipruto, Benson" "Korir, Albert" ...
##  $ team              : chr  "Team–" "Team–" "Team–" "Team–" ...
##  $ bib_number        : chr  "1" "3" "5" "19" ...
##  $ half_time         : chr  "1H 2M 20S" "1H 2M 20S" "1H 2M 19S" "1H 2M 20S" ...
##  $ finish_net        : chr  "2H 5M 54S" "2H 6M 4S" "2H 6M 6S" "2H 8M 1S" ...
##  $ finish_gun        : chr  "2H 5M 54S" "2H 6M 4S" "2H 6M 6S" "2H 8M 1S" ...
##  $ gender            : chr  "M" "M" "M" "M" ...
##  $ half_time_sec     : int  3740 3740 3739 3740 3740 3739 3839 3839 3739 3839 ...
##  $ finish_net_sec    : int  7554 7564 7566 7681 7715 7763 7784 7786 7804 7817 ...
##  $ finish_gun_sec    : int  7554 7564 7566 7681 7715 7763 7784 7786 7804 7817 ...
##  $ finish_net_minutes: num  126 126 126 128 129 ...
# Missing values per column.
# vapply() pins the result to one integer per column, whereas sapply()
# picks its return type from the input.
vapply(ds, function(x) sum(is.na(x)), integer(1))
##          age_group      place_overall       place_gender     place_division 
##                  0                  0                  0                  0 
##               name               team         bib_number          half_time 
##                  0                  0                  0                 72 
##         finish_net         finish_gun             gender      half_time_sec 
##                  0                  0                  0                 72 
##     finish_net_sec     finish_gun_sec finish_net_minutes 
##                  0                  0                  0
# Keep only the runners that have a recorded half-marathon split
# (rows where half_time_sec is NA are dropped).
ds_run <- filter(ds, !is.na(half_time_sec))
# Longest marathon time: keep the row(s) whose net finish time equals the
# maximum. The column is referenced through dplyr's data mask instead of
# `ds_run$finish_net_sec`, so the comparison always uses the rows that
# filter() is actually working on.
ds_run_max <- ds_run %>%
  filter(finish_net_sec == max(finish_net_sec))

# Turn the result into a table graphic (gridExtra::tableGrob) ...
ds_run_max_table <- tableGrob(ds_run_max)

# ... and draw it on the current graphics device.
grid.arrange(ds_run_max_table)

# Shortest marathon time: keep the row(s) whose net finish time equals the
# minimum. The column is referenced through dplyr's data mask instead of
# `ds_run$finish_net_sec`, so the comparison always uses the rows that
# filter() is actually working on.
ds_run_min <- ds_run %>%
  filter(finish_net_sec == min(finish_net_sec))

# Turn the result into a table graphic (gridExtra::tableGrob) ...
ds_run_min_table <- tableGrob(ds_run_min)

# ... and draw it on the current graphics device.
grid.arrange(ds_run_min_table)

# Half-marathon time vs. marathon time, one panel per age group.
ggplot(ds_run, aes(x = half_time_sec, y = finish_net_sec)) +
  geom_point(color = "#008DD5") +
  facet_wrap(vars(age_group)) +
  theme_light()

# Half-marathon time vs. marathon time, one panel per gender.
ggplot(ds_run, aes(x = half_time_sec, y = finish_net_sec)) +
  geom_point(color = "#008DD5") +
  facet_wrap(vars(gender)) +
  theme_light()

# Participants per gender: one bar per gender with the count printed above.
# after_stat(count) reads the value computed by the "count" stat; it
# replaces the `..count..` notation deprecated in ggplot2 3.4.0.
ggplot(data = ds_run, aes(x = gender, fill = gender)) +
  geom_bar() +
  scale_fill_manual(values = c("M" = "#008DD5", "W" = "#F56476")) +
  theme_minimal() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5)

# Fastest man in each age group: the smallest net finish time (seconds)
# per group, converted to a lubridate Period for display.
faster_gender_M <- filter(ds_run, gender == "M") %>%
  group_by(age_group) %>%
  summarise(
    best_result_M = seconds_to_period(min(finish_net_sec))
  )

# Build the table graphic and draw it on the current graphics device.
faster_gender_M_table <- tableGrob(faster_gender_M)
grid.arrange(faster_gender_M_table)

# Fastest woman in each age group: the smallest net finish time (seconds)
# per group, converted to a lubridate Period for display.
faster_gender_W <- filter(ds_run, gender == "W") %>%
  group_by(age_group) %>%
  summarise(
    best_result_W = seconds_to_period(min(finish_net_sec))
  )

# Build the table graphic and draw it on the current graphics device.
faster_gender_W_table <- tableGrob(faster_gender_W)
grid.arrange(faster_gender_W_table)

# Fastest gender per age group: put the best male and best female result
# side by side (only age groups present in both tables are kept).
faster_gender <- inner_join(faster_gender_M, faster_gender_W, by = "age_group")

# WIN names the gender with the shorter best time. Equal best times are
# reported as "Tie" instead of being silently attributed to one gender.
faster_gender <- faster_gender %>%
  mutate(
    WIN = case_when(
      best_result_M < best_result_W ~ "M",
      best_result_W < best_result_M ~ "W",
      best_result_M == best_result_W ~ "Tie"
    )
  )

# Build the table graphic and draw it on the current graphics device.
faster_gender_table <- tableGrob(faster_gender)
grid.arrange(faster_gender_table)

# Fastest categories: number of finishers per age group in each band of
# net finish time.

# Count the finishers per age group whose net finish time (in seconds) lies
# in the half-open interval (lower, upper].
#   data  : data frame with `age_group` and `finish_net_sec` columns
#   lower : exclusive lower bound in seconds (-Inf for "no lower bound")
#   upper : inclusive upper bound in seconds (Inf for "no upper bound")
#   name  : name of the count column in the result
# Returns an ungrouped data frame with one row per age group that has at
# least one finisher in the band.
count_time_band <- function(data, lower, upper, name) {
  data %>%
    filter(finish_net_sec > lower, finish_net_sec <= upper) %>%
    count(age_group, name = name)
}

# Band edges in seconds: 9000 = 2h30m, 10800 = 3h, 12600 = 3h30m, 14400 = 4h.
time_2h30m <- count_time_band(ds_run, -Inf, 9000, "time_2h30m")
time_2h30m_3h <- count_time_band(ds_run, 9000, 10800, "time_2h30m_3h")
time_3h_3h30m <- count_time_band(ds_run, 10800, 12600, "time_3h_3h30m")
time_3h30m_4h <- count_time_band(ds_run, 12600, 14400, "time_3h30m_4h")
time_4h <- count_time_band(ds_run, 14400, Inf, "time_4h")

# One row per age group, one column per band. An age group with nobody in a
# band is absent from that band's table, so the full join yields NA there;
# such a cell really means "zero finishers" and is filled with 0.
faster_category <- time_2h30m %>%
  full_join(time_2h30m_3h, by = "age_group") %>%
  full_join(time_3h_3h30m, by = "age_group") %>%
  full_join(time_3h30m_4h, by = "age_group") %>%
  full_join(time_4h, by = "age_group") %>%
  mutate(across(-age_group, function(x) coalesce(x, 0L))) %>%
  arrange(age_group)

# The spreadsheet-style viewer only makes sense in an interactive session;
# skip it when the script is rendered or run in batch mode.
if (interactive()) {
  View(faster_category)
}

# Build the table graphic and draw it on the current graphics device.
faster_category_table <- tableGrob(faster_category)
grid.arrange(faster_category_table)

# Total number of finishers in each time band, summed over all age groups.
total_time_2h30m <- sum(time_2h30m[["time_2h30m"]])
total_time_2h30m_3h <- sum(time_2h30m_3h[["time_2h30m_3h"]])
total_time_3h_3h30m <- sum(time_3h_3h30m[["time_3h_3h30m"]])
total_time_3h30m_4h <- sum(time_3h30m_4h[["time_3h30m_4h"]])
total_time_4h <- sum(time_4h[["time_4h"]])

# Collect the totals in a two-column data frame: band label and its total.
faster_category_total <- data.frame(
  Category = c(
    "time_2h30m",
    "time_2h30m_3h",
    "time_3h_3h30m",
    "time_3h30m_4h",
    "time_4h"
  ),
  Total = c(
    total_time_2h30m,
    total_time_2h30m_3h,
    total_time_3h_3h30m,
    total_time_3h30m_4h,
    total_time_4h
  )
)

# Build the table graphic and draw it on the current graphics device.
faster_category_total_table <- tableGrob(faster_category_total)
grid.arrange(faster_category_total_table)

# Bar chart of the total number of finishers per time band.
# Category is turned into a factor whose levels follow the row order of the
# data frame (fastest band first). Left as plain text, ggplot2 sorts the
# labels alphabetically, which puts "time_3h30m_4h" before "time_3h_3h30m".
ggplot(
  data = faster_category_total,
  mapping = aes(
    x = factor(Category, levels = unique(as.character(Category))),
    y = Total
  )
) +
  geom_col(fill = "#F56476") +
  # vjust lifts the label just above the bar; size sets the font size.
  geom_text(
    mapping = aes(label = Total),
    vjust = -0.5, size = 5, color = "black"
  ) +
  labs(title = "Total por Categoria", x = "Categoria", y = "Total") +
  theme_minimal()

# Find outliers: start from the five-number summary of the net finish time.
# The column is converted to double first; missing values are ignored when
# computing the quantiles.
ds_run[["finish_net_sec"]] <- as.numeric(ds_run[["finish_net_sec"]])
quartis <- quantile(
  ds_run[["finish_net_sec"]],
  probs = seq(0, 1, by = 0.25),
  na.rm = TRUE
)
print(quartis)
##       0%      25%      50%      75%     100% 
##  7554.00 11464.25 12835.50 14666.75 26431.00
# Interquartile range (IQR). Missing values are ignored here as well, to
# stay consistent with how `quartis` was computed.
iqr <- IQR(ds_run$finish_net_sec, na.rm = TRUE)

# First and third quartile, extracted by name; `[[` drops the "25%"/"75%"
# labels so they do not leak into the fences computed from them.
q1 <- quartis[["25%"]]
q3 <- quartis[["75%"]]

# Lower and upper fences: 1.5 * IQR below Q1 and above Q3.
limite_inferior <- q1 - 1.5 * iqr
limite_superior <- q3 + 1.5 * iqr

# Finish times outside the fences. which() keeps only the positions where
# the test is TRUE, so a missing time cannot show up as an NA "outlier".
outliers <- with(
  ds_run,
  finish_net_sec[
    which(finish_net_sec < limite_inferior | finish_net_sec > limite_superior)
  ]
)
print(outliers)
##   [1] 19495 19509 19539 19593 19598 19609 19612 19635 19640 19650 19681 19683
##  [13] 19694 19694 19713 19718 19726 19731 19761 19773 19774 19784 19789 19789
##  [25] 19798 19808 19837 19854 19893 19898 19924 19926 19934 19953 19961 19961
##  [37] 20000 20030 20055 20059 20099 20102 20139 20202 20239 20322 20371 20398
##  [49] 20439 20445 20460 20460 20470 20511 20523 20546 20577 20597 20647 20648
##  [61] 20706 20745 20817 20827 20839 20882 20920 20955 20956 20963 20986 21068
##  [73] 21102 21144 21231 21380 21477 21482 21535 21537 21572 21582 21641 21764
##  [85] 21881 22003 19502 19511 19576 19580 19693 19715 19725 19733 19777 19780
##  [97] 19814 19831 19854 19985 20082 20086 20128 20220 20235 20236 20247 20379
## [109] 20457 20501 20503 20543 20646 20707 20760 20762 20864 20890 20918 21100
## [121] 21226 21238 21239 21329 21493 21705 24188 19516 19527 19540 19583 19652
## [133] 19814 19821 19826 19868 19911 20051 20104 20125 20137 20256 20266 20361
## [145] 20469 20564 20599 20601 20728 20809 20812 20813 20916 21016 21161 21210
## [157] 21290 21314 21349 21363 21386 21393 21502 21504 21513 21605 21618 21889
## [169] 22109 22146 22616 19490 19517 19535 19593 19595 19653 19721 19742 19757
## [181] 19802 19847 19874 19892 19928 19928 20004 20010 20049 20239 20240 20279
## [193] 20294 20355 20385 20506 20507 20507 20727 20764 20824 20861 20878 20905
## [205] 20917 20918 20972 21003 21011 21021 21075 21142 21152 21190 21260 21269
## [217] 21288 21332 21571 21755 21801 21827 21903 23049 24056 25460 26019 19580
## [229] 19582 19640 19659 19663 19805 19910 19929 20123 20147 20173 20279 20281
## [241] 20283 20300 20339 20360 20362 20436 20455 20467 20512 20572 20727 20741
## [253] 20781 20990 21004 21016 21023 21054 21134 21184 21270 21291 21372 21419
## [265] 21582 21625 21648 21752 21803 21850 21936 23865 19499 19525 19566 19590
## [277] 19598 19714 19716 19727 19750 19762 19845 19846 19924 19945 20045 20148
## [289] 20157 20236 20415 20468 20510 20545 20749 20790 20816 20822 20827 20994
## [301] 21015 21032 21082 21157 21314 21365 21577 21584 21601 21674 21709 21724
## [313] 21752 22130 22702 22871 23939 19662 19700 19763 19769 19774 19774 19779
## [325] 19816 20010 20186 20377 20388 20422 20443 20523 20533 20583 20617 20626
## [337] 20632 20686 20822 20828 20837 21268 21280 21396 21606 21702 21731 21805
## [349] 21883 22120 23514 19608 19631 19740 19864 20216 20237 20295 20414 20596
## [361] 20652 20671 20733 21059 21115 21141 21214 21457 21490 21826 21916 22414
## [373] 22712 23306 23641 19493 19641 19976 20213 20286 20502 20736 21031 21157
## [385] 22101 22187 22318 20175 19479 19493 19493 19497 19507 19528 19530 19549
## [397] 19563 19586 19598 19605 19612 19631 19633 19634 19634 19636 19641 19659
## [409] 19667 19680 19695 19709 19716 19721 19732 19733 19768 19770 19771 19784
## [421] 19807 19817 19831 19836 19845 19849 19851 19851 19881 19897 19902 19906
## [433] 19920 19923 19935 19938 19956 19957 19962 19962 19970 19979 19980 20003
## [445] 20005 20005 20009 20010 20031 20044 20053 20054 20068 20074 20084 20091
## [457] 20108 20108 20109 20114 20131 20133 20133 20147 20156 20188 20208 20218
## [469] 20232 20237 20261 20280 20297 20311 20321 20328 20330 20359 20370 20376
## [481] 20388 20397 20412 20412 20412 20435 20446 20452 20454 20462 20474 20506
## [493] 20526 20533 20539 20543 20543 20557 20560 20564 20571 20575 20581 20598
## [505] 20611 20624 20631 20637 20651 20669 20678 20686 20701 20706 20738 20743
## [517] 20761 20822 20823 20834 20856 20872 20872 20901 20918 20937 20943 20964
## [529] 20983 20987 20994 21001 21049 21071 21073 21121 21144 21173 21191 21192
## [541] 21206 21227 21244 21252 21270 21279 21287 21355 21383 21395 21396 21421
## [553] 21476 21477 21482 21508 21514 21541 21543 21551 21570 21572 21585 21624
## [565] 21652 21655 21715 21744 21760 21765 21795 21797 21829 21910 21911 21938
## [577] 21949 21957 22007 22102 22739 23169 23407 26431 19472 19475 19478 19490
## [589] 19496 19580 19639 19639 19639 19704 19720 19725 19732 19747 19779 19797
## [601] 19845 19858 19875 19881 19894 19910 19961 20040 20118 20135 20139 20250
## [613] 20268 20337 20427 20475 20479 20528 20529 20531 20554 20586 20597 20625
## [625] 20631 20673 20700 20704 20708 20714 20741 20773 20781 20832 20836 20879
## [637] 20902 20945 20991 21055 21056 21160 21166 21200 21280 21309 21339 21403
## [649] 21440 21441 21499 21500 21500 21511 21520 21543 21607 21614 21741 21762
## [661] 21857 19582 19620 19634 19648 19669 19673 19795 19812 19821 19868 19895
## [673] 19962 19967 20010 20063 20097 20097 20141 20153 20156 20212 20229 20233
## [685] 20255 20281 20341 20365 20446 20451 20534 20611 20662 20695 20719 20719
## [697] 20746 20773 20906 20906 20957 21003 21037 21061 21141 21215 21275 21286
## [709] 21341 21444 21505 21650 21735 21825 21886 19475 19564 19567 19588 19612
## [721] 19620 19643 19658 19790 19818 19819 19840 19861 19871 19878 19933 19955
## [733] 19974 20058 20059 20121 20190 20203 20216 20232 20232 20254 20286 20323
## [745] 20405 20565 20588 20611 20652 20706 20775 20779 20827 20891 20892 20893
## [757] 20900 20904 21005 21060 21081 21133 21184 21210 21287 21309 21324 21340
## [769] 21448 21451 21527 21631 21698 22053 22145 22187 22225 19510 19581 19646
## [781] 19735 19774 19875 19887 19889 19985 20010 20227 20270 20343 20345 20456
## [793] 20592 20612 20701 20740 20780 20874 20922 20955 21005 21075 21089 21093
## [805] 21129 21239 21299 21312 21417 21456 21469 21554 21620 21746 21747 21836
## [817] 21916 21936 22262 24994 19700 19866 19923 19977 20021 20041 20062 20117
## [829] 20166 20218 20316 20322 20331 20394 20448 20597 21058 21139 21169 21193
## [841] 21531 21615 21693 21865 21935 23134 19991 20183 20233 20465 20780 20867
## [853] 20906 21064 21065 21352 21359 21382 21483 19537 20075 20229 20643 20916
## [865] 21003 21155 21417 21799 22101 22606 19728 20437 20579
# Boxplot of the net finish time; values beyond the whiskers are drawn as
# individual points.
boxplot(
  ds_run$finish_net_sec,
  main = "Boxplot com Outliers",
  ylab = "Valores",
  col = "lightblue"
)

# Label each value in `quartis` (rounded to two decimals) to the right of
# the box, at the height of that value.
text(
  1, quartis,
  labels = round(quartis, digits = 2),
  pos = 4,
  cex = 0.8,
  col = "red"
)