knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
grp_NOC = dataset_olympics %>% group_by(NOC) %>%
summarise(Gold_Medals = sum((Medal == 'Gold') & !is.na(Medal)),
Silver_Medals = sum((Medal == 'Silver') & !is.na(Medal)),
Bronze_Medals = sum((Medal == 'Bronze') & !is.na(Medal)),
Total_NoMedals = sum(is.na(Medal)),
Average_Age = mean(Age),
Average_Weight = mean(Weight),
Count = n()
) %>% mutate(Probability = round(Count/nrow(dataset_olympics), 4))
## Ideally we would use head but print.data,frame prints all columns on for the data
# head(grp_NOC,20)
print.data.frame(head(grp_NOC %>% arrange(desc(Count)),20))
## NOC Gold_Medals Silver_Medals Bronze_Medals Total_NoMedals Average_Age
## 1 USA 747 448 366 3655 NA
## 2 FRA 185 218 257 4242 NA
## 3 ITA 217 193 194 3572 NA
## 4 GBR 211 207 186 2922 NA
## 5 CAN 119 114 127 2302 NA
## 6 AUS 98 124 151 1897 NA
## 7 SWE 84 136 145 1775 NA
## 8 GER 148 142 144 1626 NA
## 9 ESP 38 87 53 1689 NA
## 10 SUI 44 70 66 1492 NA
## 11 BRA 32 65 76 1384 NA
## 12 HUN 94 67 80 1271 NA
## 13 NED 62 94 116 1208 NA
## 14 BEL 35 70 57 1206 NA
## 15 ROU 46 61 91 1162 NA
## 16 NOR 104 94 68 936 26.66889
## 17 URS 216 150 149 686 NA
## 18 ARG 25 36 32 1029 NA
## 19 RUS 74 70 83 843 NA
## 20 POL 13 27 47 916 NA
## Average_Weight Count Probability
## 1 NA 5216 0.0745
## 2 NA 4902 0.0700
## 3 NA 4176 0.0597
## 4 NA 3526 0.0504
## 5 NA 2662 0.0380
## 6 NA 2270 0.0324
## 7 NA 2140 0.0306
## 8 NA 2060 0.0294
## 9 NA 1867 0.0267
## 10 NA 1672 0.0239
## 11 NA 1557 0.0222
## 12 NA 1512 0.0216
## 13 NA 1480 0.0211
## 14 NA 1368 0.0195
## 15 NA 1360 0.0194
## 16 NA 1202 0.0172
## 17 NA 1201 0.0172
## 18 NA 1122 0.0160
## 19 NA 1070 0.0153
## 20 NA 1003 0.0143
print.data.frame(tail(grp_NOC %>% arrange(desc(Count)),10))
## NOC Gold_Medals Silver_Medals Bronze_Medals Total_NoMedals Average_Age
## 1 TGA 0 0 0 3 24.66667
## 2 TLS 0 0 0 3 28.33333
## 3 MHL 0 0 0 2 24.00000
## 4 NBO 0 0 0 2 22.50000
## 5 BDI 0 0 0 1 23.00000
## 6 BUR 0 0 0 1 27.00000
## 7 KIR 0 0 0 1 17.00000
## 8 TUV 0 0 0 1 21.00000
## 9 UNK 0 0 0 1 NA
## 10 YMD 0 0 0 1 20.00000
## Average_Weight Count Probability
## 1 82.66667 3 0
## 2 NA 3 0
## 3 NA 2 0
## 4 NA 2 0
## 5 69.00000 1 0
## 6 67.00000 1 0
## 7 67.00000 1 0
## 8 69.00000 1 0
## 9 NA 1 0
## 10 65.00000 1 0
ggplot(data = head(grp_NOC %>% arrange(desc(Count)),30), aes(x = NOC, y = Gold_Medals)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
This data shows that there are many NA values throughout the dataset where Age,Weight, or Height are missing. This affects the NOC’s mean calculations. This can be avoided by adding rm.na = TRUE however that might not represent the data correctly (it might be important to first summarize the total NA within the columns). There are multiple countries with fewer than 10 players and no medals. However, these do have all datapoints present. Moving to further groupings to find stronger anomalies!
grp_Sport = dataset_olympics %>% group_by(Sport) %>%
summarise(MenCount = sum(Sex == 'M'),
WomenCount = sum(Sex == 'F'),
TotalCount = n()
) %>% mutate(AthleteProbability = round(TotalCount/nrow(dataset_olympics), 4)) %>% arrange(desc(AthleteProbability))
head(grp_Sport,20)
## # A tibble: 20 × 5
## Sport MenCount WomenCount TotalCount AthleteProbability
## <chr> <int> <int> <int> <dbl>
## 1 Athletics 7526 3103 10629 0.152
## 2 Gymnastics 4039 2209 6248 0.0893
## 3 Swimming 3504 2460 5964 0.0852
## 4 Fencing 2650 446 3096 0.0442
## 5 Shooting 2597 443 3040 0.0434
## 6 Cycling 2469 325 2794 0.0399
## 7 Rowing 2158 532 2690 0.0384
## 8 Alpine Skiing 1511 815 2326 0.0332
## 9 Cross Country Skiing 1398 755 2153 0.0308
## 10 Football 1738 221 1959 0.028
## 11 Sailing 1652 274 1926 0.0275
## 12 Wrestling 1766 71 1837 0.0262
## 13 Equestrianism 1521 306 1827 0.0261
## 14 Boxing 1641 19 1660 0.0237
## 15 Hockey 1133 374 1507 0.0215
## 16 Canoeing 1168 287 1455 0.0208
## 17 Basketball 921 330 1251 0.0179
## 18 Speed Skating 795 443 1238 0.0177
## 19 Biathlon 792 406 1198 0.0171
## 20 Ice Hockey 1009 172 1181 0.0169
tail(grp_Sport,10)
## # A tibble: 10 × 5
## Sport MenCount WomenCount TotalCount AthleteProbability
## <chr> <int> <int> <int> <dbl>
## 1 Lacrosse 20 0 20 0.0003
## 2 Cricket 13 0 13 0.0002
## 3 Alpinism 6 1 7 0.0001
## 4 Croquet 4 4 8 0.0001
## 5 Military Ski Patrol 9 0 9 0.0001
## 6 Motorboating 5 0 5 0.0001
## 7 Racquets 6 0 6 0.0001
## 8 Basque Pelota 1 0 1 0
## 9 Jeu De Paume 2 0 2 0
## 10 Roque 2 0 2 0
ggplot(data=head(grp_Sport,20), aes(x=Sport,y=TotalCount))+
geom_tile(color="red",size=0.3)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Sports such as Roque and Racquets have a very low probability of
occurence and can be seen as anomalies since the data is scarce or
discontinued such as in the case of Jeu De Paume, which ironically
translates to ‘Real Tennis’, a 17th century sport. This gives us a
strong basis to add filters to overall data to remove sports played in
less than 5 Olympics and before the 1900s.
grp_Sport = dataset_olympics %>% group_by(Season) %>%
summarise(MenCount = sum(Sex == 'M'),
WomenCount = sum(Sex == 'F'),
Average_Weight = mean(Weight, na.rm = TRUE),
TotalCount = n()
) %>% mutate(AthleteProbability = round(TotalCount/nrow(dataset_olympics), 4)) %>% arrange(desc(AthleteProbability))
head(grp_Sport,20)
## # A tibble: 2 × 6
## Season MenCount WomenCount Average_Weight TotalCount AthleteProbability
## <chr> <int> <int> <dbl> <int> <dbl>
## 1 Summer 43715 14752 70.9 58467 0.835
## 2 Winter 8162 3371 71.0 11533 0.165
tail(grp_Sport,10)
## # A tibble: 2 × 6
## Season MenCount WomenCount Average_Weight TotalCount AthleteProbability
## <chr> <int> <int> <dbl> <int> <dbl>
## 1 Summer 43715 14752 70.9 58467 0.835
## 2 Winter 8162 3371 71.0 11533 0.165
Given the weather, aone could hypothesize athletes weight more for Winter sports over Summer sports however, the averages are very even in between both seasons. The men to women ratio is smaller during the summer despite my assumptions.
We outlined above combinations and groupings that went against hypothesis of the data. Combinations with sports not played past the 20th century have no clear data and thus a low probability of having data on Women.Further classification of categorical data into ranges can lead to better structure and visualizations