library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(readr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
data <- read_csv("C:/Users/hp/Downloads/olympics_dataset.csv")
## Rows: 252565 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Name, Sex, Team, NOC, Season, City, Sport, Event, Medal
## dbl (2): player_id, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(data)
## spc_tbl_ [252,565 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ player_id: num [1:252565] 0 1 2 3 26 27 29 31 32 33 ...
## $ Name : chr [1:252565] "A Dijiang" "A Lamusi" "Gunnar Aaby" "Edgar Aabye" ...
## $ Sex : chr [1:252565] "M" "M" "M" "M" ...
## $ Team : chr [1:252565] "China" "China" "Denmark" "Denmark/Sweden" ...
## $ NOC : chr [1:252565] "CHN" "CHN" "DEN" "DEN" ...
## $ Year : num [1:252565] 1992 2012 1920 1900 1932 ...
## $ Season : chr [1:252565] "Summer" "Summer" "Summer" "Summer" ...
## $ City : chr [1:252565] "Barcelona" "London" "Antwerpen" "Paris" ...
## $ Sport : chr [1:252565] "Basketball" "Judo" "Football" "Tug-Of-War" ...
## $ Event : chr [1:252565] "Basketball Men's Basketball" "Judo Men's Extra-Lightweight" "Football Men's Football" "Tug-Of-War Men's Tug-Of-War" ...
## $ Medal : chr [1:252565] "No medal" "No medal" "No medal" "Gold" ...
## - attr(*, "spec")=
## .. cols(
## .. player_id = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Team = col_character(),
## .. NOC = col_character(),
## .. Year = col_double(),
## .. Season = col_character(),
## .. City = col_character(),
## .. Sport = col_character(),
## .. Event = col_character(),
## .. Medal = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
colnames(data)
## [1] "player_id" "Name" "Sex" "Team" "NOC" "Year"
## [7] "Season" "City" "Sport" "Event" "Medal"
#Summary of the data(include min,max,quartile)
summary(data)
## player_id Name Sex Team
## Min. : 0 Length:252565 Length:252565 Length:252565
## 1st Qu.: 57137 Class :character Class :character Class :character
## Median : 135611 Mode :character Mode :character Mode :character
## Mean : 230550
## 3rd Qu.: 211859
## Max. :9460001
## NOC Year Season City
## Length:252565 Min. :1896 Length:252565 Length:252565
## Class :character 1st Qu.:1960 Class :character Class :character
## Mode :character Median :1988 Mode :character Mode :character
## Mean :1982
## 3rd Qu.:2008
## Max. :2024
## Sport Event Medal
## Length:252565 Length:252565 Length:252565
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
Interpretation: It Gives the minimum, maximum, mean, and quartile for each column.
#check for missing value
colSums(is.na(data))
## player_id Name Sex Team NOC Year Season City
## 0 0 0 0 0 0 0 0
## Sport Event Medal
## 0 0 0
Interpretation: This helps identify how many missing values (NA) are present in each column.
ggplot(data, aes(x = Medal)) +
geom_bar(fill = "skyblue") +
labs(title = "Medal Distribution", x = "Medal", y = "Count")
Interpretation: The bar plot shows the number of athletes who won each type of medal—Gold, Silver, or Bronze. It helps compare the frequency of each medal type and highlights which medals were awarded more often. A taller bar means more athletes won that particular medal during the Olympics.
ggplot(data, aes(x = Year)) +
geom_histogram(binwidth = 4, fill = "lightgreen", color = "black") +
labs(title = "Histogram of Olympic Years", x = "Year", y = "Number of Athletes")
Interpretation: This histogram shows how the number of athletes varied across different Olympic years. It helps us understand which years had more participation and how frequently the Olympics were organized over time.
ggplot(data, aes(x = Year)) +
geom_histogram(aes(y = ..density..), binwidth = 4, fill = "lightblue", color = "black") +
geom_density(color = "darkred", size = 1.2) +
labs(title = "Histogram with Density Curve of Participation Years",
x = "Year of Participation",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Interpretation: The histogram with density curve shows that athlete participation in the Olympics increased over time, peaking during the modern era. The smooth density curve highlights periods of higher participation and reflects the growing popularity of the Games.
# Filter medal winners and count medals per team
top_10_teams <- data %>%
filter(Medal %in% c("Gold", "Silver", "Bronze")) %>%
count(Team, name = "Medal_Count") %>%
arrange(desc(Medal_Count)) %>%
head(10)
# Scatter plot
ggplot(top_10_teams, aes(x = reorder(Team, -Medal_Count), y = Medal_Count)) +
geom_point(size = 4, color = "steelblue") +
labs(title = "Top 10 Teams by Medal Count", x = "Team", y = "Medals") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Interpretation: This scatter plot illustrates the number of medals won by the top 10 Olympic teams. The United States is a clear outlier with significantly more medals. This visualization helps to identify the most dominant countries in Olympic history.
ggplot(data, aes(x = Sex, y = Year)) +
geom_boxplot(fill = "orange") +
labs(title = "Boxplot of Year by Sex", x = "Sex", y = "Year")
Interpretation: This boxplot compares the distribution of Olympic participation years between male and female athletes. It helps visualize differences in median year, range, and variability of participation across genders over time.
quantile(data$Year, probs = c(0.25, 0.5, 0.75))
## 25% 50% 75%
## 1960 1988 2008
Interpretation: This calculates the 25th, 50th (median), and 75th percentiles of the Olympic years, giving a summary of how the events are spread over time. It helps identify the central tendency and the overall distribution range of the years.
anova_result <- aov(Year ~ Sport, data = data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Sport 75 47136118 628482 717.3 <2e-16 ***
## Residuals 252489 221221957 876
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation: This ANOVA test checks whether the average Olympic participation year differs significantly across various sports. It helps identify if certain sports were more active or introduced at different periods in Olympic history.
sex_count <- data %>%
group_by(Sex) %>%
summarise(Count = n())
ggplot(sex_count, aes(x = "", y = Count, fill = Sex)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
labs(title = "Gender Distribution of Athletes") +
theme_void()
Interpretation: This pie chart visualizes the proportion of male and female athletes. It clearly shows the gender participation gap and highlights how diverse the representation is.
#Group by Year and count number of athletes
line_data <- data %>%
group_by(Year) %>%
summarise(Num_Athletes = n())
# View first few rows
head(line_data)
## # A tibble: 6 × 2
## Year Num_Athletes
## <dbl> <int>
## 1 1896 380
## 2 1900 1936
## 3 1904 1301
## 4 1906 1733
## 5 1908 3101
## 6 1912 4040
#ggplot(line_data, aes(x = Year, y = Num_Athletes)) +
ggplot(line_data, aes(x = Year, y = Num_Athletes)) +
geom_line(color = "blue", linewidth = 1) + # <- use linewidth, not size
geom_point(color = "red") +
labs(title = "Number of Athletes Over the Years",
x = "Year",
y = "Number of Athletes") +
theme_minimal()
Interpretation: The X-axis represents Years when the Olympics were held. The Y-axis represents the Number of athletes who participated each year. Each point shows the number of participating athletes for that particular year. The blue line connects these points, showing the trend over time.
medal_count <- data %>%
filter(!is.na(Medal)) %>%
count(Medal)
# Second: Plot the bar chart
ggplot(medal_count, aes(x = Medal, y = n, fill = Medal)) +
geom_col() +
labs(title = "Medal Distribution", x = "Medal Type", y = "Count") +
theme_minimal()
Interpretation: The bar chart displays the count of each medal type side-by-side, making it easier to compare the exact number of Gold, Silver, and Bronze medals. It confirms the observation from the pie chart: Bronze medals are the most awarded, followed by Silver, and then Gold. Bar charts are more effective than pie charts when comparing specific numbers because humans read length better than area.
grouped_sport_medal <- data %>%
filter(!is.na(Medal)) %>%
group_by(Sport, Medal) %>%
summarise(count = n()) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'Sport'. You can override using the
## `.groups` argument.
print(grouped_sport_medal)
## # A tibble: 274 × 3
## # Groups: Sport [76]
## Sport Medal count
## <chr> <chr> <int>
## 1 Athletics No medal 38865
## 2 Gymnastics No medal 24451
## 3 Swimming No medal 22946
## 4 Shooting No medal 11244
## 5 Fencing No medal 9635
## 6 Cycling No medal 9596
## 7 Rowing No medal 8392
## 8 Wrestling No medal 6294
## 9 Football No medal 6079
## 10 Sailing No medal 5947
## # ℹ 264 more rows
Interpretation: It tells us which sports have won the most medals and what type of medal (Gold, Silver, Bronze) was most common.