library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(readr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
data <- read_csv("C:/Users/hp/Downloads/olympics_dataset.csv")
## Rows: 252565 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Name, Sex, Team, NOC, Season, City, Sport, Event, Medal
## dbl (2): player_id, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(data)
## spc_tbl_ [252,565 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ player_id: num [1:252565] 0 1 2 3 26 27 29 31 32 33 ...
##  $ Name     : chr [1:252565] "A Dijiang" "A Lamusi" "Gunnar Aaby" "Edgar Aabye" ...
##  $ Sex      : chr [1:252565] "M" "M" "M" "M" ...
##  $ Team     : chr [1:252565] "China" "China" "Denmark" "Denmark/Sweden" ...
##  $ NOC      : chr [1:252565] "CHN" "CHN" "DEN" "DEN" ...
##  $ Year     : num [1:252565] 1992 2012 1920 1900 1932 ...
##  $ Season   : chr [1:252565] "Summer" "Summer" "Summer" "Summer" ...
##  $ City     : chr [1:252565] "Barcelona" "London" "Antwerpen" "Paris" ...
##  $ Sport    : chr [1:252565] "Basketball" "Judo" "Football" "Tug-Of-War" ...
##  $ Event    : chr [1:252565] "Basketball Men's Basketball" "Judo Men's Extra-Lightweight" "Football Men's Football" "Tug-Of-War Men's Tug-Of-War" ...
##  $ Medal    : chr [1:252565] "No medal" "No medal" "No medal" "Gold" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   player_id = col_double(),
##   ..   Name = col_character(),
##   ..   Sex = col_character(),
##   ..   Team = col_character(),
##   ..   NOC = col_character(),
##   ..   Year = col_double(),
##   ..   Season = col_character(),
##   ..   City = col_character(),
##   ..   Sport = col_character(),
##   ..   Event = col_character(),
##   ..   Medal = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
colnames(data)
##  [1] "player_id" "Name"      "Sex"       "Team"      "NOC"       "Year"     
##  [7] "Season"    "City"      "Sport"     "Event"     "Medal"

#Summary of the data(include min,max,quartile)

summary(data)
##    player_id           Name               Sex                Team          
##  Min.   :      0   Length:252565      Length:252565      Length:252565     
##  1st Qu.:  57137   Class :character   Class :character   Class :character  
##  Median : 135611   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 230550                                                           
##  3rd Qu.: 211859                                                           
##  Max.   :9460001                                                           
##      NOC                 Year         Season              City          
##  Length:252565      Min.   :1896   Length:252565      Length:252565     
##  Class :character   1st Qu.:1960   Class :character   Class :character  
##  Mode  :character   Median :1988   Mode  :character   Mode  :character  
##                     Mean   :1982                                        
##                     3rd Qu.:2008                                        
##                     Max.   :2024                                        
##     Sport              Event              Medal          
##  Length:252565      Length:252565      Length:252565     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

Interpretation: It Gives the minimum, maximum, mean, and quartile for each column.

#check for missing value

colSums(is.na(data))
## player_id      Name       Sex      Team       NOC      Year    Season      City 
##         0         0         0         0         0         0         0         0 
##     Sport     Event     Medal 
##         0         0         0

Interpretation: This helps identify how many missing values (NA) are present in each column.

1. Bar Plot: Medal Distribution

ggplot(data, aes(x = Medal)) +
  geom_bar(fill = "skyblue") +
  labs(title = "Medal Distribution", x = "Medal", y = "Count")

Interpretation: The bar plot shows the number of athletes who won each type of medal—Gold, Silver, or Bronze. It helps compare the frequency of each medal type and highlights which medals were awarded more often. A taller bar means more athletes won that particular medal during the Olympics.

2. Histogram: Year Distribution

ggplot(data, aes(x = Year)) +
  geom_histogram(binwidth = 4, fill = "lightgreen", color = "black") +
  labs(title = "Histogram of Olympic Years", x = "Year", y = "Number of Athletes")

Interpretation: This histogram shows how the number of athletes varied across different Olympic years. It helps us understand which years had more participation and how frequently the Olympics were organized over time.

3. Density Curve Analysis for Year:

ggplot(data, aes(x = Year)) +
  geom_histogram(aes(y = ..density..), binwidth = 4, fill = "lightblue", color = "black") +
  geom_density(color = "darkred", size = 1.2) +
  labs(title = "Histogram with Density Curve of Participation Years",
       x = "Year of Participation",
       y = "Density") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Interpretation: The histogram with density curve shows that athlete participation in the Olympics increased over time, peaking during the modern era. The smooth density curve highlights periods of higher participation and reflects the growing popularity of the Games.

4. Scatter Plot: Year vs Season

# Filter medal winners and count medals per team
top_10_teams <- data %>%
  filter(Medal %in% c("Gold", "Silver", "Bronze")) %>%
  count(Team, name = "Medal_Count") %>%
  arrange(desc(Medal_Count)) %>%
  head(10)

# Scatter plot
ggplot(top_10_teams, aes(x = reorder(Team, -Medal_Count), y = Medal_Count)) +
  geom_point(size = 4, color = "steelblue") +
  labs(title = "Top 10 Teams by Medal Count", x = "Team", y = "Medals") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation: This scatter plot illustrates the number of medals won by the top 10 Olympic teams. The United States is a clear outlier with significantly more medals. This visualization helps to identify the most dominant countries in Olympic history.

5. Boxplot: Year by Sex

ggplot(data, aes(x = Sex, y = Year)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Boxplot of Year by Sex", x = "Sex", y = "Year")

Interpretation: This boxplot compares the distribution of Olympic participation years between male and female athletes. It helps visualize differences in median year, range, and variability of participation across genders over time.

6.Quartiles of Year

quantile(data$Year, probs = c(0.25, 0.5, 0.75))
##  25%  50%  75% 
## 1960 1988 2008

Interpretation: This calculates the 25th, 50th (median), and 75th percentiles of the Olympic years, giving a summary of how the events are spread over time. It helps identify the central tendency and the overall distribution range of the years.

7.ANOVA: Year by Sport

anova_result <- aov(Year ~ Sport, data = data)
summary(anova_result)
##                 Df    Sum Sq Mean Sq F value Pr(>F)    
## Sport           75  47136118  628482   717.3 <2e-16 ***
## Residuals   252489 221221957     876                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: This ANOVA test checks whether the average Olympic participation year differs significantly across various sports. It helps identify if certain sports were more active or introduced at different periods in Olympic history.

8. Pie Chart (Sex Distribution of Athletes)

sex_count <- data %>%
  group_by(Sex) %>%
  summarise(Count = n())

ggplot(sex_count, aes(x = "", y = Count, fill = Sex)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "Gender Distribution of Athletes") +
  theme_void()

Interpretation: This pie chart visualizes the proportion of male and female athletes. It clearly shows the gender participation gap and highlights how diverse the representation is.

9. Single line chart of Number of Athletes vs Year

#Group by Year and count number of athletes
line_data <- data %>%
  group_by(Year) %>%
  summarise(Num_Athletes = n())

# View first few rows
head(line_data)
## # A tibble: 6 × 2
##    Year Num_Athletes
##   <dbl>        <int>
## 1  1896          380
## 2  1900         1936
## 3  1904         1301
## 4  1906         1733
## 5  1908         3101
## 6  1912         4040
#ggplot(line_data, aes(x = Year, y = Num_Athletes)) +
  ggplot(line_data, aes(x = Year, y = Num_Athletes)) +
  geom_line(color = "blue", linewidth = 1) +   # <- use linewidth, not size
  geom_point(color = "red") +
  labs(title = "Number of Athletes Over the Years",
       x = "Year",
       y = "Number of Athletes") +
  theme_minimal()

Interpretation: The X-axis represents Years when the Olympics were held. The Y-axis represents the Number of athletes who participated each year. Each point shows the number of participating athletes for that particular year. The blue line connects these points, showing the trend over time.

10.Bar chart

medal_count <- data %>%
  filter(!is.na(Medal)) %>%
  count(Medal)

# Second: Plot the bar chart
ggplot(medal_count, aes(x = Medal, y = n, fill = Medal)) +
  geom_col() +
  labs(title = "Medal Distribution", x = "Medal Type", y = "Count") +
  theme_minimal()

Interpretation: The bar chart displays the count of each medal type side-by-side, making it easier to compare the exact number of Gold, Silver, and Bronze medals. It confirms the observation from the pie chart: Bronze medals are the most awarded, followed by Silver, and then Gold. Bar charts are more effective than pie charts when comparing specific numbers because humans read length better than area.

11. Grouping the data

grouped_sport_medal <- data %>%
  filter(!is.na(Medal)) %>%
  group_by(Sport, Medal) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## `summarise()` has grouped output by 'Sport'. You can override using the
## `.groups` argument.
print(grouped_sport_medal)
## # A tibble: 274 × 3
## # Groups:   Sport [76]
##    Sport      Medal    count
##    <chr>      <chr>    <int>
##  1 Athletics  No medal 38865
##  2 Gymnastics No medal 24451
##  3 Swimming   No medal 22946
##  4 Shooting   No medal 11244
##  5 Fencing    No medal  9635
##  6 Cycling    No medal  9596
##  7 Rowing     No medal  8392
##  8 Wrestling  No medal  6294
##  9 Football   No medal  6079
## 10 Sailing    No medal  5947
## # ℹ 264 more rows

Interpretation: It tells us which sports have won the most medals and what type of medal (Gold, Silver, Bronze) was most common.