# Load the workbook into a tibble and print a preview of it
data <- read_excel(path = "../00_data/myData_apply2.xlsx")
print(data)
## # A tibble: 271,116 × 16
## Column1 id name sex age height weight team noc games year season
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 270479 135289 Zzim… M 20 NA NA Braz… BRA 1952… 1952 Summer
## 2 207677 104222 zzet… M 26 178 74 Turk… TUR 2016… 2016 Summer
## 3 102956 52087 zzet… M 23 172 85 Turk… TUR 2004… 2004 Summer
## 4 102957 52087 zzet… M 27 172 85 Turk… TUR 2008… 2008 Summer
## 5 106601 53910 Zyta… F 26 187 85 Pola… POL 1988… 1988 Summer
## 6 219493 110259 Zygm… M 25 185 82 Pola… POL 1932… 1932 Summer
## 7 183779 92370 Zygm… M 21 179 72 Pola… POL 1952… 1952 Summer
## 8 183780 92370 Zygm… M 26 179 72 Pola… POL 1956… 1956 Summer
## 9 152047 76313 Zygm… M 27 175 71 Pola… POL 1972… 1972 Summer
## 10 152048 76313 Zygm… M 31 175 71 Pola… POL 1976… 1976 Summer
## # ℹ 271,106 more rows
## # ℹ 4 more variables: city <chr>, sport <chr>, event <chr>, medal <chr>
What is the mean age per medal?
# Normalise the two columns used by the medal/age summary below.
#
# medal: every "missing" marker (empty text, whitespace-only text, or the
#        literal text "NA" in any casing) is turned into a real NA so that all
#        non-medal rows fall into a single group when grouping by medal.
# age:   the column is read as text, so it is converted to numeric.
data <- data %>%
  mutate(
    # "[\\h\\v]" strips all Unicode horizontal/vertical whitespace. The default
    # trimws() class only covers space, tab, CR and LF, so a value such as "NA"
    # followed by a non-breaking space would otherwise survive as its own group.
    medal = trimws(medal, whitespace = "[\\h\\v]"),
    # %in% never yields NA, so rows that are already NA simply stay NA.
    medal = if_else(toupper(medal) %in% c("", "NA"), NA_character_, medal),
    age = as.numeric(age)
  )
# Average athlete age within each medal category; missing ages are ignored
plotdata <- data %>%
  group_by(medal) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    .groups = "drop"
  )
# Bar chart with one bar per medal category, bar height = mean age
ggplot(data = plotdata, mapping = aes(x = medal, y = mean_age)) +
  geom_col(fill = "steelblue") +
  labs(x = "Medal", y = "Mean Age", title = "Mean Age per Medal")
Bronze: The average age of athletes who won a Bronze medal is 25.9 years. Gold: The average age of Gold medal winners is also 25.9 years. Silver: Athletes who won a Silver medal have an average age of 26.0 years. NA (25.5): These are the rows for athletes who did not win any medal (NA); their average age is 25.5 years. NA (34): This is a second group in which the medal field is also missing (NA), but its average age is much higher, at 34 years. The two NA groups most likely come from missing values that are encoded differently in the dataset; both mean "no medal" and should be combined into a single group.