# Libraries used for Analysis

# library(tidyverse)
# library(readxl)
# library(summarytools)
# library(cowplot)
# library(gridExtra)

Data Loading

setwd("D:/JGBS/IV Sem/Data Analysis using R/Assessments/Mid Term/Assigned Dataset/Analysis")

athele_events <- readxl::read_xlsx("athlete_events.xlsx")

str(athele_events)
Classes 'tbl_df', 'tbl' and 'data.frame':   85258 obs. of  16 variables:
 $ ID         : num  12 13 18 23 30 36 36 53 68 68 ...
 $ Name       : chr  "Jyri Tapani Aalto" "Minna Maarit Aalto" "Timo Antero Aaltonen" "Fritz Aanes" ...
 $ Sex        : chr  "M" "F" "M" "M" ...
 $ Age        : num  31 34 31 22 30 25 25 24 23 23 ...
 $ Height(cms): num  172 159 189 187 189 194 194 172 178 178 ...
 $ Weight(Kgs): num  70 55.5 130 89 72 78 78 58 76 76 ...
 $ BMI        : logi  NA NA NA NA NA NA ...
 $ Team       : chr  "Finland" "Finland" "Finland" "Norway" ...
 $ NOC        : chr  "FIN" "FIN" "FIN" "NOR" ...
 $ Games      : chr  "2000 Summer" "2000 Summer" "2000 Summer" "2000 Summer" ...
 $ Year       : num  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
 $ Season     : chr  "Summer" "Summer" "Summer" "Summer" ...
 $ City       : chr  "Sydney" "Sydney" "Sydney" "Sydney" ...
 $ Sport      : chr  "Badminton" "Sailing" "Athletics" "Wrestling" ...
 $ Event      : chr  "Badminton Men's Singles" "Sailing Women's Windsurfer" "Athletics Men's Shot Put" "Wrestling Men's Light-Heavyweight, Greco-Roman" ...
 $ Medal      : chr  "NA" "NA" "NA" "NA" ...
head(athele_events)
# A tibble: 6 x 16
     ID Name  Sex     Age `Height(cms)` `Weight(Kgs)` BMI   Team  NOC  
  <dbl> <chr> <chr> <dbl>         <dbl>         <dbl> <lgl> <chr> <chr>
1    12 Jyri~ M        31           172          70   NA    Finl~ FIN  
2    13 Minn~ F        34           159          55.5 NA    Finl~ FIN  
3    18 Timo~ M        31           189         130   NA    Finl~ FIN  
4    23 Frit~ M        22           187          89   NA    Norw~ NOR  
5    30 Pepi~ M        30           189          72   NA    Neth~ NED  
6    36 Stef~ M        25           194          78   NA    Neth~ NED  
# ... with 7 more variables: Games <chr>, Year <dbl>, Season <chr>,
#   City <chr>, Sport <chr>, Event <chr>, Medal <chr>

Analysis
* The athlete dataset consists of 85,258 rows and 16 attributes/columns.
* The attributes needs to be converted into appropriate class or types to continue analysis and derive insights.
* Attribute Games is divided into Year and Season.
* Need to calculate BMI for each athlete.

Descriptive Statistics

# str(athele_events)
summary(athele_events)
       ID             Name               Sex                 Age       
 Min.   :     2   Length:85258       Length:85258       Min.   :12.00  
 1st Qu.: 35375   Class :character   Class :character   1st Qu.:22.00  
 Median : 69193   Mode  :character   Mode  :character   Median :25.00  
 Mean   : 68988                                         Mean   :25.83  
 3rd Qu.:102782                                         3rd Qu.:29.00  
 Max.   :135571                                         Max.   :71.00  
                                                        NA's   :3      
  Height(cms)     Weight(Kgs)       BMI              Team          
 Min.   :133.0   Min.   : 28.00   Mode:logical   Length:85258      
 1st Qu.:168.0   1st Qu.: 60.00   NA's:85258     Class :character  
 Median :176.0   Median : 70.00                  Mode  :character  
 Mean   :175.8   Mean   : 71.13                                    
 3rd Qu.:183.0   3rd Qu.: 80.00                                    
 Max.   :226.0   Max.   :214.00                                    
 NA's   :711     NA's   :1210                                      
     NOC               Games                Year         Season         
 Length:85258       Length:85258       Min.   :2000   Length:85258      
 Class :character   Class :character   1st Qu.:2004   Class :character  
 Mode  :character   Mode  :character   Median :2008   Mode  :character  
                                       Mean   :2008                     
                                       3rd Qu.:2012                     
                                       Max.   :2016                     
                                                                        
     City              Sport              Event          
 Length:85258       Length:85258       Length:85258      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
                                                         
    Medal          
 Length:85258      
 Class :character  
 Mode  :character  
                   
                   
                   
                   
summarytools::descr(athele_events)
Non-numerical variable(s) ignored: Name, Sex, BMI, Team, NOC, Games, Season, City, Sport, Event, Medal
Descriptive Statistics  
athele_events  
N: 85258  

                         Age   Height(cms)          ID   Weight(Kgs)       Year
----------------- ---------- ------------- ----------- ------------- ----------
             Mean      25.83        175.83    68987.57         71.13    2008.02
          Std.Dev       5.45         10.98    39183.80         15.30       5.46
              Min      12.00        133.00        2.00         28.00    2000.00
               Q1      22.00        168.00    35374.00         60.00    2004.00
           Median      25.00        176.00    69192.50         70.00    2008.00
               Q3      29.00        183.00   102782.00         80.00    2012.00
              Max      71.00        226.00   135571.00        214.00    2016.00
              MAD       4.45         11.86    49940.64         14.83       5.93
              IQR       7.00         15.00    67407.00         20.00       8.00
               CV       0.21          0.06        0.57          0.22       0.00
         Skewness       0.99          0.06       -0.02          0.87      -0.01
      SE.Skewness       0.01          0.01        0.01          0.01       0.01
         Kurtosis       2.37          0.04       -1.18          1.93      -1.27
          N.Valid   85255.00      84547.00    85258.00      84048.00   85258.00
        Pct.Valid     100.00         99.17      100.00         98.58     100.00

Analysis
* Olympics data is available from Year 2000 to 2016.
* Min Age of athlete is 12 years and Max Age is 71. The average age of participitants was 25.8 years.
* The average height of athletes was 175.83 cms and the average weight was 71.13 Kgs.
* The distribution for Height is mesokurtic (almost standard tail shape).
* The distribution for Weight is slightly leptokurtic (fat tailed distribution).

Data Cleaning and Wrangling

Steps
1. Calculating BMI for each athlete.
2. Converting Non Metric attributes to appropriate type, i.e., factors.
3. Quick observance of NA values in each column.
4. Removing rows with NA values in the dataset.
5. Creating a new column called BMI_cat which segments the athletes into one of the seven categories. Source: Wiki.
6. Reordering the factor levels in newly created columns BMI_cat based on logical levels.

# BMI Calculation
athele_events$BMI <- round(athele_events$`Weight(Kgs)` / ((athele_events$`Height(cms)`/100)^2),3)

# Converting apporopriate columns to respective types
non_metric_col <- c("Sex","Team","NOC","Games","Year","Season","City","Sport","Event","Medal")
athele_events[non_metric_col] <- lapply(athele_events[non_metric_col], factor)
# str(athele_events)

# Calculate number of NA's for each column
sapply(athele_events, function(col) sum(is.na(col)))
         ID        Name         Sex         Age Height(cms) Weight(Kgs) 
          0           0           0           3         711        1210 
        BMI        Team         NOC       Games        Year      Season 
       1330           0           0           0           0           0 
       City       Sport       Event       Medal 
          0           0           0           0 
#Removing rows with NA values
athele_events <- athele_events[complete.cases(athele_events),]

# BMI Category
athele_events$BMI_cat <- 0
athele_events$BMI_cat[athele_events$BMI <= 16] <- "Severely underweight"
athele_events$BMI_cat[athele_events$BMI > 16 & athele_events$BMI <= 18.5] <- "Underweight"
athele_events$BMI_cat[athele_events$BMI > 18.5 & athele_events$BMI <= 25] <- "Healthy"
athele_events$BMI_cat[athele_events$BMI > 25 & athele_events$BMI <= 30] <- "Overweight"
athele_events$BMI_cat[athele_events$BMI > 30 & athele_events$BMI <= 35] <- "Moderately obese"
athele_events$BMI_cat[athele_events$BMI > 35 & athele_events$BMI <= 40] <- "Severely obese"
athele_events$BMI_cat[athele_events$BMI > 40] <- "Very Severely obese"

athlete_data <- athele_events

athlete_data$BMI_cat <- factor(athlete_data$BMI_cat, 
                               levels = c("Severely underweight", "Underweight", "Healthy", "Overweight", "Moderately obese", "Severely obese",
                                          "Very Severely obese"))
str(athlete_data)
Classes 'tbl_df', 'tbl' and 'data.frame':   83926 obs. of  17 variables:
 $ ID         : num  12 13 18 23 30 36 36 53 68 68 ...
 $ Name       : chr  "Jyri Tapani Aalto" "Minna Maarit Aalto" "Timo Antero Aaltonen" "Fritz Aanes" ...
 $ Sex        : Factor w/ 2 levels "F","M": 2 1 2 2 2 2 2 2 2 2 ...
 $ Age        : num  31 34 31 22 30 25 25 24 23 23 ...
 $ Height(cms): num  172 159 189 187 189 194 194 172 178 178 ...
 $ Weight(Kgs): num  70 55.5 130 89 72 78 78 58 76 76 ...
 $ BMI        : num  23.7 22 36.4 25.5 20.2 ...
 $ Team       : Factor w/ 331 levels "Afghanistan",..: 102 102 102 216 205 205 205 93 8 8 ...
 $ NOC        : Factor w/ 210 levels "AFG","AHO","ALB",..: 65 65 65 142 138 138 138 58 8 8 ...
 $ Games      : Factor w/ 9 levels "2000 Summer",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Year       : Factor w/ 9 levels "2000","2002",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Season     : Factor w/ 2 levels "Summer","Winter": 1 1 1 1 1 1 1 1 1 1 ...
 $ City       : Factor w/ 9 levels "Athina","Beijing",..: 7 7 7 7 7 7 7 7 7 7 ...
 $ Sport      : Factor w/ 51 levels "Alpine Skiing",..: 4 33 3 51 31 41 41 44 41 41 ...
 $ Event      : Factor w/ 442 levels "Alpine Skiing Men's Combined",..: 63 288 37 428 262 347 351 386 347 355 ...
 $ Medal      : Factor w/ 4 levels "Bronze","Gold",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ BMI_cat    : Factor w/ 7 levels "Severely underweight",..: 3 3 6 4 3 3 3 3 3 3 ...

Quick Comment
* The dataset now looks in a proper format for analysis and is reduced to 83,296 records with 17 columns.

Data Tabulation

Quick Tabulation of all factor columns to have an overview of data before proceeding with analysis.
* Re-leveling Medal columns with respect to NA and medal categories.

# Unique values/levels for all Non-Metric data  
non_metric_col <- c("Sex","Team","NOC","Games","Year","Season","City","Sport","Event","Medal","BMI_cat")

# sapply(athlete_data[,non_metric_col], function(x) unique(x))
# 
# # Tabulation for all levels in non-metric columns
# sapply(athlete_data[,non_metric_col], function(col) table(col))

athlete_data$Medal <- factor(athlete_data$Medal, levels = c(NA, "Bronze", "Silver", "Gold"))

# str(athlete_data)

Analysis
* There are numerous teams with total participitants less than 10.
* Winter games have less participitation than Summer games.
* Athletics is the most popular Sport.
* There are some events with only double digit participitation.
* 71,881 records with No medals among 83,296 total records.
* Almost 70% of athletes seem to be in Healthy BMI category.

Data Exploration

Univariate Plots - Outlier Detection and Data Distrbution Histograms

Steps
* Detecting outliers in Age, Height, Weight and BMI.
* Plotting Box plot to have a quick observance of above metric attributes.
* Histogram plotting for Age and BMI to observe the distribution. Since BMI depends on Height and Weight.

metric_col <- c("Age", "Height(cms)", "Weight(Kgs)", "BMI")


# Age
ggplot(data = athlete_data, aes(y = Age, fill = Sex)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Age Outlier Detection - Boxplot", y = "Age") +
  theme(plot.title = element_text(hjust = 0.5))

# Age Histogram to see distribution
ggplot(data = athlete_data, aes(x = Age, fill = Sex)) +
  geom_histogram(binwidth = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
  labs(title = "Age Distribution Histogram", x = "Age", y = "Count")

# Age Histogram to see distribution
ggplot(data = athlete_data, aes(x = log10(Age), fill = Sex)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
  labs(title = "Normalized Age Distribution Histogram", x = "Age", y = "Count")

# Height
ggplot(data = athlete_data, aes(y = `Height(cms)`, fill = Sex)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Height(cms) Outlier Detection - Boxplot") +
  theme(plot.title = element_text(hjust = 0.5))

# Weight
ggplot(data = athlete_data, aes(y = `Weight(Kgs)`, fill = Sex)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Weight(Kgs) Outlier Detection - Boxplot") +
  theme(plot.title = element_text(hjust = 0.5))

# BMI
ggplot(data = athlete_data, aes(y = BMI, fill = Sex)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "BMI Outlier Detection - Boxplot") +
  theme(plot.title = element_text(hjust = 0.5))

# BMI Histogram to see distribution
ggplot(data = athlete_data, aes(x = BMI, fill = Sex)) +
  geom_histogram(binwidth = 1) +
  theme_bw() +
  labs(title = "BMI Distribution Histogram") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) 

# BMI Histogram to see distribution
ggplot(data = athlete_data, aes(x = log10(BMI), fill = Sex)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw() +
  labs(title = "Normalized BMI Distribution - Histogram") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) 

Analysis
* Male Age has more outliers than Female age.
* Average Age for Males is slightly higher than Female Age.
* To normalize the age distribution, plotted the logarithmic (log10) conversion of Age distribution. It looks close to a normal distribution.
* Male athletes are taller than Female athletes with few more athletes of height more than 200 cms.
* Male athletes are heavier than Female athletes with few more athletes with weight more than 175 kgs.
* Thus, Male athletes have little higher BMI than Female athletes with few more outliers than Female participitants.
* Similarly, as logarithmic Age transformation, observing at logarthimic BMI plot we observe that it is close to a normal distribution.

Bivariate Plots

Age Outlier Exploration

Further Oultier Exploration of Age to better understand the data outliers.
* Plotting Boxplots of Age Vs Sport, Sport with Medal, Year, BMI category

# Age Bivariate Outlier Exploration

# Detailed look at age with type of sport to make sense of Age outliers
ggplot(data = athlete_data, aes(x = Sport, y = Age)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - Age V/s Sports") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# Further look by faceting with Medal
# Maximum outliers in NA Medal category
ggplot(data = athlete_data, aes(x = Sport, y = Age, fill = Medal)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - Age V/s Sports - Wrap by Medal") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
  facet_grid(Medal ~.) 

ggplot(data = athlete_data, aes(x = paste(athlete_data$Games,"\n",athlete_data$City), y = Age)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - Age V/s Olympic Year or City", x = "Year/City") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

ggplot(data = athlete_data, aes(x = BMI_cat, y = Age, fill = Medal)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - Age V/s BMI_category - Wrap by Medal", x = "BMI Category") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~Medal)

# nrow(athlete_data[athlete_data$Age > 40,])

Analysis
* Age V/s Sports reveals few sports have more Age outliers than others.
+ Accounting these outliers to their corrosponding sports, these outliers may be real (upon reading for each sports).
* When faceted by Medal type, we have more Age outliers for players with no medals.
* Summer Games have more Age Outliers than Winter games.
* Maximum number of Age Outliers with No Medals.
* Few Age outliers won other medals with maximum number of player in Healthy BMI category.

Quick tabulation for Age > 45 and Sport,medal

table(athlete_data[athlete_data$Age > 45,]$Sport, athlete_data[athlete_data$Age > 45,]$Age)
                           
                            46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
  Alpine Skiing              0  0  0  0  0  2  0  0  0  1  0  0  0  0  0
  Archery                    5  2  2  0  2  0  2  0  0  0  0  0  0  0  0
  Athletics                  3  1  0  0  0  0  0  0  0  0  0  0  0  0  0
  Badminton                  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Baseball                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Basketball                 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Beach Volleyball           0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Biathlon                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Bobsleigh                  1  0  1  0  0  0  0  0  0  0  0  0  0  0  0
  Boxing                     0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Canoeing                   0  1  2  0  0  0  0  0  0  0  0  0  0  0  0
  Cross Country Skiing       0  2  0  0  0  0  0  0  0  0  0  0  0  0  0
  Curling                    2  1  1  1  1  0  0  0  0  0  0  0  0  0  0
  Cycling                    0  0  0  2  0  0  0  0  0  0  0  0  0  0  0
  Diving                     0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Equestrianism             34 41 34 30 22 20 24 16  8  5  8  3  6  0  8
  Fencing                    0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
  Figure Skating             0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Football                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Freestyle Skiing           0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Golf                       2  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Gymnastics                 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Handball                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Hockey                     0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Ice Hockey                 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Judo                       0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Luge                       0  1  2  0  0  0  1  0  0  0  0  0  0  0  0
  Modern Pentathlon          0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Nordic Combined            0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Rhythmic Gymnastics        0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Rowing                     0  0  1  0  0  2  1  0  0  0  1  0  0  0  0
  Rugby Sevens               0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Sailing                    6  2  1  3  1  0  0  0  1  0  0  1  1  0  0
  Shooting                  29 24 18 18  7  9 11  6  6  2  2  3  0  1  0
  Short Track Speed Skating  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Skeleton                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Ski Jumping                0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Snowboarding               0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Softball                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Speed Skating              0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Swimming                   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Synchronized Swimming      0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Table Tennis               3  0  0  1  1  0  0  1  1  0  0  0  0  0  0
  Taekwondo                  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Tennis                     0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
  Trampolining               0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Triathlon                  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Volleyball                 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Water Polo                 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Weightlifting              0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  Wrestling                  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
                           
                            61 62 63 65 67 71
  Alpine Skiing              0  0  0  0  0  0
  Archery                    0  0  0  0  0  0
  Athletics                  0  0  0  0  0  0
  Badminton                  0  0  0  0  0  0
  Baseball                   0  0  0  0  0  0
  Basketball                 0  0  0  0  0  0
  Beach Volleyball           0  0  0  0  0  0
  Biathlon                   0  0  0  0  0  0
  Bobsleigh                  0  0  0  0  0  0
  Boxing                     0  0  0  0  0  0
  Canoeing                   0  0  0  0  0  0
  Cross Country Skiing       0  0  0  0  0  0
  Curling                    0  0  0  0  0  0
  Cycling                    0  0  0  0  0  0
  Diving                     0  0  0  0  0  0
  Equestrianism              4  1  0  2  2  1
  Fencing                    0  0  0  0  0  0
  Figure Skating             0  0  0  0  0  0
  Football                   0  0  0  0  0  0
  Freestyle Skiing           0  0  0  0  0  0
  Golf                       0  0  0  0  0  0
  Gymnastics                 0  0  0  0  0  0
  Handball                   0  0  0  0  0  0
  Hockey                     0  0  0  0  0  0
  Ice Hockey                 0  0  0  0  0  0
  Judo                       0  0  0  0  0  0
  Luge                       0  0  0  0  0  0
  Modern Pentathlon          0  0  0  0  0  0
  Nordic Combined            0  0  0  0  0  0
  Rhythmic Gymnastics        0  0  0  0  0  0
  Rowing                     0  0  0  0  0  0
  Rugby Sevens               0  0  0  0  0  0
  Sailing                    0  0  0  0  0  0
  Shooting                   2  0  1  1  0  0
  Short Track Speed Skating  0  0  0  0  0  0
  Skeleton                   0  0  0  0  0  0
  Ski Jumping                0  0  0  0  0  0
  Snowboarding               0  0  0  0  0  0
  Softball                   0  0  0  0  0  0
  Speed Skating              0  0  0  0  0  0
  Swimming                   0  0  0  0  0  0
  Synchronized Swimming      0  0  0  0  0  0
  Table Tennis               0  0  0  0  0  0
  Taekwondo                  0  0  0  0  0  0
  Tennis                     0  0  0  0  0  0
  Trampolining               0  0  0  0  0  0
  Triathlon                  0  0  0  0  0  0
  Volleyball                 0  0  0  0  0  0
  Water Polo                 0  0  0  0  0  0
  Weightlifting              0  0  0  0  0  0
  Wrestling                  0  0  0  0  0  0
table(athlete_data[athlete_data$Age > 45,]$Medal, athlete_data[athlete_data$Age > 45,]$Age)
        
         46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 65 67 71
  Bronze  6  4  2  2  1  2  3  0  0  0  1  0  0  0  0  0  0  0  0  0  0
  Silver  9  2  1  1  0  1  2  0  0  1  0  0  0  0  0  1  0  0  0  0  0
  Gold    3  4  0  3  2  1  1  1  2  0  0  0  1  0  0  0  0  0  0  0  0
# table(athlete_data[athlete_data$Age > 45,]$Sport, athlete_data[athlete_data$Age > 45,]$Age, athlete_data[athlete_data$Age > 45,]$Medal)

Analysis
* Shooting and Equestrianism Sport have greater number of players who are older, which is acceptable for these sports.
* Not many older players won medals. Only 3 Medals won overall for Age greater than 55.

BMI Outlier Exploration

Further BMI Outlier detection w.r.t Sport, Sport + Medal, Year, BMI Category

# Detailed look at age with type of sport to make sense of Age outliers
ggplot(data = athlete_data, aes(x = Sport, y = BMI)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - BMI V/s Sport") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) 

# Further look by faceting with Medal
# Maximum outliers in NA Medal category
ggplot(data = athlete_data, aes(x = Sport, y = BMI, fill = Medal)) +
  geom_boxplot() +
  theme_bw() +
  # labs(title = "Outlier Detection - BMI V/s Sport - Wrap by Medal") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
  facet_grid(Medal ~.) 

ggplot(data = athlete_data, aes(x = paste(athlete_data$Games,"\n",athlete_data$City), y = BMI)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - BMI V/s Year/City", x = "Year-City") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) 

ggplot(data = athlete_data, aes(x = BMI_cat, y = BMI, fill = Medal)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Outlier Detection - BMI V/s BMI_category - Wrap by Medal", x = "BMI Category") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~Medal)

Analysis
* Large number of BMI outliers for Athletics, Judo, Weightlifting, Shooting and Rugby Sevens.
+ These makes sense since these specific sports generally have overweight players and (some)sport themselves have a weight category.
* Same distribution of outliers is followed for different medal types also, with NA having most outliers.
* Beijing and London Summer Olympics had a few extreme BMI outlier.
* BMI V/s BMI category outlier detection follows a normal distribution as expected.

Quick Tabulation of BMI(>55) category with Medal

# table(athlete_data[athlete_data$BMI > 55,]$Sport, athlete_data[athlete_data$BMI > 45,]$BMI_cat)
# table(athlete_data[athlete_data$BMI > 45,]$Medal, athlete_data[athlete_data$BMI > 45,]$BMI_cat)
table(athlete_data[athlete_data$BMI > 55,]$Medal, athlete_data[athlete_data$BMI > 55,]$BMI_cat)
        
         Severely underweight Underweight Healthy Overweight
  Bronze                    0           0       0          0
  Silver                    0           0       0          0
  Gold                      0           0       0          0
        
         Moderately obese Severely obese Very Severely obese
  Bronze                0              0                   1
  Silver                0              0                   0
  Gold                  0              0                   0

Analysis
* Only one Very Severly Obese athlete won a Bronze wedal across all years of our dataset.
* Thus, based on above outlier detection we can filter the data with Age < 61 and BMI < 56 to have minimum loss of Winners, while staying true to population representation.

Data Cleaning

Based on above outlier detection and considering the relvance of age and BMI for respective sports, we subset the data with Age less than 61 and BMI less than 56 for further exploration. Overall, there is a loss of only two records with a Medal among 12045 Medal holders.

athlete_clean <- subset(athlete_data, Age < 61 & BMI < 56)
str(athlete_clean)
Classes 'tbl_df', 'tbl' and 'data.frame':   83909 obs. of  17 variables:
 $ ID         : num  12 13 18 23 30 36 36 53 68 68 ...
 $ Name       : chr  "Jyri Tapani Aalto" "Minna Maarit Aalto" "Timo Antero Aaltonen" "Fritz Aanes" ...
 $ Sex        : Factor w/ 2 levels "F","M": 2 1 2 2 2 2 2 2 2 2 ...
 $ Age        : num  31 34 31 22 30 25 25 24 23 23 ...
 $ Height(cms): num  172 159 189 187 189 194 194 172 178 178 ...
 $ Weight(Kgs): num  70 55.5 130 89 72 78 78 58 76 76 ...
 $ BMI        : num  23.7 22 36.4 25.5 20.2 ...
 $ Team       : Factor w/ 331 levels "Afghanistan",..: 102 102 102 216 205 205 205 93 8 8 ...
 $ NOC        : Factor w/ 210 levels "AFG","AHO","ALB",..: 65 65 65 142 138 138 138 58 8 8 ...
 $ Games      : Factor w/ 9 levels "2000 Summer",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Year       : Factor w/ 9 levels "2000","2002",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Season     : Factor w/ 2 levels "Summer","Winter": 1 1 1 1 1 1 1 1 1 1 ...
 $ City       : Factor w/ 9 levels "Athina","Beijing",..: 7 7 7 7 7 7 7 7 7 7 ...
 $ Sport      : Factor w/ 51 levels "Alpine Skiing",..: 4 33 3 51 31 41 41 44 41 41 ...
 $ Event      : Factor w/ 442 levels "Alpine Skiing Men's Combined",..: 63 288 37 428 262 347 351 386 347 355 ...
 $ Medal      : Factor w/ 3 levels "Bronze","Silver",..: NA NA NA NA NA NA NA NA NA NA ...
 $ BMI_cat    : Factor w/ 7 levels "Severely underweight",..: 3 3 6 4 3 3 3 3 3 3 ...

Comment
* Our dataset is now in a proper format with a (supposedly) true representation of the population.
* We now have 83,909 number of records after subsetting/filtering with respect to outlier detection.
* We can now start with Expolatory Data Analysis to derive insteresting insights.

Expolatory Data Analysis

Simple Expolaration w.r.t. BMI Category

# qplot(x = Medal, y = Age, data = athlete_data, geom = "boxplot", fill = Sex )
# qplot(x = Sport, y = Age, data = athlete_data, geom = "boxplot"  )


# Using for loop to find apporopriate bin value
# for (bin_i in c(5,10,15,20,25,30,35,40)) {
#   
# plot <- ggplot(data = athlete_clean, aes(x = Age, fill = BMI_cat)) +
#           geom_density(stat = "bin", bins = bin_i) +
#           theme_bw() +
#           theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + 
#           facet_wrap(~Sex)
# 
# print(plot)
# }


ggplot(data = athlete_clean, aes(x = Age, fill = BMI_cat)) +
  geom_density(stat = "bin", bins = 30) +
  theme_bw() +
  labs(title = "Density Plot for M/F Age Distribution wrt BMI Category") +
  theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5)) + 
  facet_wrap(~Sex)

# labs(title = "Outlier Detection - BMI V/s BMI_category - Wrap by Medal", x = "BMI Category") +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))


ggplot(data = athlete_clean, aes(x = BMI_cat, y = Age, fill = Sex)) +
  geom_violin(scale = "area") +
  theme_bw() +
  labs(title = "Violin plot for Age Distribution wrt BMI Category - M/F", x = "BMI Category") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

ggplot(data = athlete_clean, aes(x = BMI_cat, y = `Height(cms)`, fill = Sex)) +
  geom_violin(scale = "area") +
  theme_bw() +
  labs(title = "Violin plot for Heights Distribution wrt BMI Category - M/F", x = "BMI Category") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

ggplot(data = athlete_clean, aes(x = BMI_cat, y = `Weight(Kgs)`, fill = Sex)) +
  geom_violin(scale = "area") +
  theme_bw() +
  labs(title = "Violin plot for Weight Distribution wrt BMI Category - M/F", x = "BMI Category") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

ggplot(data = athlete_clean, aes(x = BMI_cat, y = BMI, fill = Sex)) +
  geom_violin(scale = "area") +
  theme_bw() +
  labs(title = "BMI Trend V/s BMI Category - M/F", x = "BMI Category") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# ggplot(data = athlete_data, aes(x = Medal, y = Age, fill = Sex)) +
#   geom_violin() +
#   theme_bw()

Analysis
* Age distribution for Male and Female athletes does not follow a true normal distribution for all BMI categories.
+ Maximum athletes belong to Healthy BMI category between the age of 20 to 30.
* Violin plot for Age wrt BMI category
+ Females under the age of 20 are severly underweight.
+ Slightly Normal Distribution for Healthy BMI category for both genders.
+ Maximum Age outliers in Healthy and Overweight BMI Category.
* Violin plot for Heights wrt BMI category
+ More number of males with more height under Severly Underweight category.
+ Males are taller than Females across the distribution.
+ Almost equal Height distribution for both genders.
* Violin plot for Weight and BMI category.
+ There is an increasing trend in weights distribution with each incresing level of BMI category, as expected.
* Same trend is followed for the plot of BMI V/s BMI Category, since there are corelated to each other.

Expolatory Analysis with Data Munging

By Number of Medals Won by Sex, Season and Games

# Total Medals Won by Sex
athlete_clean %>% filter(!is.na(Medal)) %>% 
  group_by(Medal, Sex) %>% 
  summarize(count = n()) %>% 
  mutate(percentage = round(count/sum(count) * 100,2)) %>% 
  ggplot(aes(x = Sex, y = count, fill = Medal)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  facet_wrap(~Medal) +
  labs(title = "Total Medals Won by Sex") +
  geom_text(aes(label = paste(percentage,"%")), vjust = 1.5) +
  theme(plot.title = element_text(hjust = 0.5))

# table(athlete_clean$Sex, athlete_clean$Season, athlete_clean$Medal)

# Total Medals Won by Season and Sex
athlete_clean %>% filter(!is.na(Medal)) %>% 
  group_by(Season, Medal, Sex) %>% 
  summarize(count = n()) %>% 
  mutate(percentage = round(count/sum(count) * 100,2)) %>% 
  ggplot(aes(x = Sex, y = count, fill = Season)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_bw() +
  facet_wrap(~Medal) +
  labs(title = "Total Medals Won by Season and Sex") +
  geom_text(aes(label = paste(percentage,"%")), vjust = 1.5) +
  theme(plot.title = element_text(hjust = 0.5))

# Medals Proportion Distribution Won by Games
athlete_clean %>% filter(!is.na(Medal)) %>% 
  group_by(Games, Medal) %>% 
  summarize(count = n()) %>% 
  mutate(percentage = round(count/sum(count) * 100,2)) %>% 
  ggplot(aes(x = Medal, y = count, fill = Medal)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_bw() +
  facet_wrap(~Games) +
  labs(title = "Proportion of Medals Won by Games") +
  geom_text(aes(label = paste(percentage,"%")), vjust = 1.1, hjust = 0.5, size = 3)+ #position = position_jitter(width = 0, height = 0.9)) +
  theme(plot.title = element_text(hjust = 0.5))

# Medals Proportion Distribution Won by Games and Sex
athlete_clean %>% filter(!is.na(Medal), Season == "Summer") %>% 
  group_by(Games,Medal,Sex) %>% 
  summarize(count = n()) %>% 
  mutate(percentage = round(count/sum(count) * 100,2)) %>% 
  ggplot(aes(x = Medal, y = count, fill = Sex)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_bw() +
  facet_wrap(~Games) +
  labs(title = "Proportion of Medals Won by Games and Sex - SUMMER") +
  geom_text(aes(label = paste(percentage,"%")), size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
  #position = position_jitter(width = 0, height = 0)) +
  theme(plot.title = element_text(hjust = 0.5))

athlete_clean %>% filter(!is.na(Medal), Season == "Winter") %>% 
  group_by(Games,Medal,Sex) %>% 
  summarize(count = n()) %>% 
  mutate(percentage = round(count/sum(count) * 100,2)) %>% 
  ggplot(aes(x = Medal, y = count, fill = Sex)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_bw() +
  facet_wrap(~Games) +
  labs(title = "Proportion of Medals Won by Games and Sex - WINTER") +
  geom_text(aes(label = paste(percentage,"%")), size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
  #position = position_jitter(width = 0, height = 0)) +
  theme(plot.title = element_text(hjust = 0.5))

Analysis
* Male athletes won more number(or proportion) of medals than females, about 55% on average for all medal caegories.
* Same trend is followed for both the seasons, i.e., Summer and Winter games.
+ More number of participitants in Summer games than in Winter.
* Equal proportion(33.3%) distribution of Gold, Silver and Bronze Medals across all the games - from 2000 Summer to 2016 Summer.
* Female participitants performance and Wins increased in the last games of 2016 Summer.
* Similar performance of both Male and Female participitants for all Winter games.

# str(athlete_clean)

By No of participants and Teams

# Base
# athlete_clean %>% group_by(Team,Sex) %>% 
#   summarize(No_of_athlete = n()) %>% 
#   arrange(-No_of_athlete) %>% 
#   mutate(percentage = round(No_of_athlete/sum(No_of_athlete) * 100,0)) %>% 
#   filter(No_of_athlete > 1000 ) %>% 
#   ggplot(aes(x = Team, y = No_of_athlete, fill = Sex)) +
#     geom_bar(stat = "identity") +
#     theme_bw() +
#     # facet_wrap(~Medal) +
#     labs(title = "No of participants(>1000) from Teams") +
#     geom_text(aes(label = paste(percentage,"%")), size = 3, check_overlap = F,  position = position_stack(vjust = 0.5) ) +
#     theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))


np_team <- athlete_clean %>% group_by(Team) %>% summarize(Total_athletes_team = n())

np_team_sex <- athlete_clean %>% group_by(Team,Sex) %>% summarize(No_of_athlete_ts = n()) %>% 
  mutate(percentage_ts = round(No_of_athlete_ts/sum(No_of_athlete_ts) * 100,0))

np_total <- left_join(np_team, np_team_sex, by= "Team" )

# No of participants(>1000) for Teams
np_total %>% filter(Total_athletes_team > 1000 ) %>% 
  ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = Sex)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of participants(>1000) for Teams", y = "Number of Athletes", x = "Teams") +
    geom_text(aes(label = paste(percentage_ts,"\n%",sep = "")), size = 3, check_overlap = F,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# No of participants(>1000) from Teams - MALE
np_total %>% filter(Sex == "M", No_of_athlete_ts > 1000 ) %>% 
  ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of MALE participants(>1000) for Teams", y = "Number of Athletes", x = "Teams") +
    geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.1) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# No of participants(>1000) from Teams - FEMALE
np_total %>% filter(Sex == "F", No_of_athlete_ts > 1000 ) %>% 
  ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of FEMALE participants(>1000) for Teams", y = "Number of Athletes", x = "Teams") +
    geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.1) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# Top 10 Teams with highest participation
np_total %>% arrange(-Total_athletes_team) %>% top_n(20, Total_athletes_team) %>%
  ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = Sex)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 Teams with Highest participation", y = "Number of Athletes", x = "Teams") +
    geom_text(aes(label = paste(percentage_ts,"\n%",sep = "")), size = 3, check_overlap = F,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# Top 10 Teams with highest participation - MALE
np_total %>% filter(Sex == "M") %>% arrange(-No_of_athlete_ts) %>% top_n(10, No_of_athlete_ts) %>%
  ggplot(aes(x = reorder(Team,No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 Teams with highest MALE participation", y = "Number of Athletes", x = "Teams") +
    geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.1) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.2)) +
    coord_flip()

# Top 10 Teams with highest participation - FEMALE
np_total %>% filter(Sex == "F") %>% arrange(-No_of_athlete_ts) %>% top_n(10, No_of_athlete_ts) %>%
  ggplot(aes(x = reorder(Team,No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 Teams with highest FEMALE participation", y = "Number of Athletes", x = "Teams") +
    geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.1) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
    coord_flip()

Analysis
* Plot - No of participitants (>1000) for Teams + 26 out of 331 Total Teams have more than 1000 participitants.
+ China has more Female players than Males.
+ Italy, France, Austria, Kazakhstan, Switzerland, Spain and Germany have more Male players than Females.
* Plots - No of participitants (>1000) for Teams - MALE and FEMALE + United States has the maximum number of Male and Female participitants, followed by Russian and Germany.
+ Only 14 teams have more than 1000 Male participitants.
+ Only 10 teams have more than 1000 Female participitants.
+ Top 3 Teams with Male players (>1000) are United States, Germany and Russia.
+ Top 3 Teams with FeMale players (>1000) are United States, Russia and China.
* Plots - Top 10 Teams with highest MALE and FEMALE participation
+ China was last in this list for Male players, whereas third country for Female participitation.
+ France at 4 position for Male and at 9 position for Female participitation.
+ Canade at 7 position for Male and at 5 position for Female participitation.

By No of participants and Sport

np_sport <- athlete_clean %>% group_by(Sport) %>% summarize(Total_athletes_sport = n())

np_sport_sex <- athlete_clean %>% group_by(Sport,Sex) %>% summarize(No_of_athlete_ss = n()) %>% 
  mutate(percentage_ss = round(No_of_athlete_ss/sum(No_of_athlete_ss) * 100,0))

np_total_sport <- left_join(np_sport, np_sport_sex, by= "Sport" )

# No of participants(>1000) from Teams
np_total_sport %>% filter(Total_athletes_sport > 1000 ) %>% 
  ggplot(aes(x = reorder(Sport,-No_of_athlete_ss), y = No_of_athlete_ss, fill = Sex)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of participants(>1000) for Sports", y = "Number of Athletes", x="Sport") +
    geom_text(aes(label = paste(percentage_ss,sep = "")), size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

np_total_sport %>% filter(Sex == "M", No_of_athlete_ss > 1000 ) %>% 
  ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of MALE participants(>1000) for Sports", y = "Number of Athletes", x="Sport") +
    geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.20) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
    coord_flip()

np_total_sport %>% filter(Sex == "F", No_of_athlete_ss > 1000 ) %>% 
  ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of FEMALE participants(>1000) for Sports", y = "Number of Athletes", x="Sport") +
    geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.20) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") + 
    coord_flip()

# plot_grid(p1,p2)

# Top 10 Sports
np_total_sport %>% filter(Total_athletes_sport > 1000 )  %>% arrange(-Total_athletes_sport) %>% top_n(20, Total_athletes_sport) %>% 
  ggplot(aes(x = reorder(Sport,-No_of_athlete_ss), y = No_of_athlete_ss, fill = Sex)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 - Popular Sports", y = "Number of Athletes", x="Sport") +
    geom_text(aes(label = paste(percentage_ss,"%",sep = "")), size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# Top 10 Sports - MEN
np_total_sport %>% filter(Sex == "M") %>% arrange(-No_of_athlete_ss) %>% top_n(10, No_of_athlete_ss) %>% 
  ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 - Popular Sports - Men", y = "Number of Athletes", x="Sport") +
    geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.20) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
    coord_flip()

# Top 10 Sports - WOMEN
np_total_sport %>% filter(Sex == "F")  %>% arrange(-No_of_athlete_ss) %>% top_n(10, No_of_athlete_ss) %>% 
  ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 - Popular Sports - Female", y = "Number of Athletes", x="Sport") +
    geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.20) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") + 
    coord_flip()

Analysis
* Plot - No of participitants (>1000) for SPORTS
+ Athletics, Swimming and Gymnastics have the highest participitation.
+ Boxing has 95% Male participitants.
+ Archery and Short Track Speed Skating have lowest participitation. (for players>1000)
+ More Male participitants than Females for Boxing, Cycling, Gymnastics, Shooting, Rowing, Football, Ice Hockey and Equestrianism sports.
* Plots - No of participitants (>1000) for SPORTS - MALE and FEMALE
+ 17 Sports with more than 1000 Male participitants.
+ Only 8 Sports with more than 1000 Female participitants.
* Plots - Top 10 SPORTS with highest MALE and FEMALE participation
+ Cross Country Skiing and Alpine Skiing are more popular sports for Females than Males.
+ Canoeing and Wrestling sports are popular among Men but same is not true for Females.

By No participants and Event

np_event <- athlete_clean %>% group_by(Event) %>% summarize(Total_athletes_event = n())

np_event_sex <- athlete_clean %>% group_by(Event,Sex) %>% summarize(No_of_athlete_es = n()) %>% 
  mutate(percentage_es = round(No_of_athlete_es/sum(No_of_athlete_es) * 100,0))

np_total_event <- left_join(np_event, np_event_sex, by= "Event" )

# Top 10 Events Participation - MEN
np_total_event %>% filter(Sex == "M") %>% arrange(-No_of_athlete_es) %>% top_n(10, No_of_athlete_es) %>% 
  ggplot(aes(x = reorder(Event,No_of_athlete_es), y = No_of_athlete_es, fill = No_of_athlete_es)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 - Events - Men", y = "Number of Athletes", x="Events") +
    geom_text(aes(label = paste(No_of_athlete_es)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.20) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
    coord_flip()

# Top 10 Events Participation - WOMEN
np_total_event %>% filter(Sex == "F")  %>% arrange(-No_of_athlete_es) %>% top_n(10, No_of_athlete_es) %>% 
  ggplot(aes(x = reorder(Event,No_of_athlete_es), y = No_of_athlete_es, fill = No_of_athlete_es)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 - Events - Female", y = "Number of Athletes", x="Events") +
    geom_text(aes(label = paste(No_of_athlete_es)), size = 3, check_overlap = F,  position = position_stack(vjust = 1.20) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") + 
    coord_flip()

Analysis
* Plots - Top 10 EVENTS with highest MALE and FEMALE participation
+ Football, Water polo and Ice Hockey events are more popular among Men than Women.
+ Hockey, Athletics and Volleyball events are more popular among Women than Men.

By No participants and City

np_city <- athlete_clean %>% group_by(Year,City) %>% summarize(Total_athletes_city = n())

np_city_sex <- athlete_clean %>% group_by(Year,City,Sex) %>% summarize(No_of_athlete_es = n()) %>% 
  mutate(percentage_es = round(No_of_athlete_es/sum(No_of_athlete_es) * 100,0))

np_total_city <- left_join(np_city, np_city_sex, by= c("Year","City"))

# np_total_city  %>% arrange(-Total_athletes_city) %>% #top_n(20, Total_athletes_city) %>% 
#   ggplot(aes(x = paste(City,"\n",Year), y = No_of_athlete_es, fill = Sex)) +
#     geom_bar(stat = "identity") +
#     theme_bw() +
#     # facet_wrap(~Medal) +
#     labs(title = "No of Participants in City", y = "Number of Athletes") +
#     geom_text(aes(label = paste(percentage_es,"%\n",No_of_athlete_es, sep="")), size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
#     theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

np_total_city  %>% arrange(-Total_athletes_city) %>% #top_n(20, Total_athletes_city) %>% 
  ggplot(aes(x = reorder(paste(City,"\n",Year),No_of_athlete_es), y = No_of_athlete_es, fill = Sex)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "No of Participants in City", y = "Number of Athletes", x = "City and Year") +
    geom_text(aes(label = paste(percentage_es,"%\n",No_of_athlete_es, sep="")), size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

Analysis
* Almost one third participitation for Winter games than Summer games.
* More proportion of Female participitants for Winter games than Summer games.
* Higher participitation by male athletes than Female athletes.
* sydney 2000 had the highest participitation whereas Salt Lake City 2002 had the lowest.

By No of Medals and Sport

nm_medal <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport) %>% summarize(Total_sport_medal = n()) %>% 
  mutate(Total_sport_medal_perc = round((Total_sport_medal/sum(Total_sport_medal)*100),2))

nm_sport <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport,Medal) %>% summarize(Total_medal_type_sport = n()) %>% 
  mutate(Total_medal_type_sport_perc = round((Total_medal_type_sport/sum(Total_medal_type_sport)*100),2))

nm_sport_sex <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport,Medal,Sex) %>% summarize(No_of_athlete_ms = n()) %>% 
  mutate(percentage_ms = round(No_of_athlete_ms/sum(No_of_athlete_ms) * 100,0))

nm_medal_plus_sport <- left_join(nm_medal, nm_sport, by="Sport")

nm_total_sport_sex <- left_join(nm_medal_plus_sport, nm_sport_sex, by= c("Sport","Medal"))

# Top 10 Sports by Medals Won
nm_medal  %>% arrange(-Total_sport_medal) %>% top_n(10, Total_sport_medal) %>% 
  ggplot(aes(x = reorder(Sport,Total_sport_medal), y = Total_sport_medal, fill = Total_sport_medal)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 Sports by Medals Won", y = "Number of Athletes", x = "Sport") +
    geom_text(aes(label = paste(Total_sport_medal_perc,"%",sep = "")), size = 3, check_overlap = T,  position = position_stack(vjust = 1.12) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
    coord_flip()

# Top 10 Sports by Medal Types Won
nm_medal_plus_sport %>% arrange(-Total_sport_medal) %>% top_n(30, Total_sport_medal) %>% 
  ggplot(aes(x = reorder(Sport,-Total_sport_medal), y = Total_medal_type_sport, fill = Medal)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 Sports by Total Medal(Types) Won", y = "Number of Athletes", x = "Sport") +
    geom_text(aes(label = paste(Total_medal_type_sport_perc,"%",sep = "")), size = 3, check_overlap = T,  position = position_stack(vjust = 1.12) ) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))

# Top 10 Sports by Medal Types Won
# Version 2
nm_medal_plus_sport %>% arrange(-Total_sport_medal) %>% top_n(30, Total_sport_medal) %>% 
  ggplot(aes(x = reorder(Sport,Total_sport_medal), y = Total_medal_type_sport, fill = Medal)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    facet_wrap(~Medal) +
    labs(title = "Top 10 Sports by Total Medal(Types) Won_V2", y = "Number of Athletes", x = "Sport") +
    geom_text(aes(label = paste(round(Total_medal_type_sport_perc,0),"%",sep = "")), size = 3, check_overlap = T,  
              position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
    coord_flip()

# Top 10 Sports by Medal Types Won
nm_medal_plus_sport %>% arrange(-Total_medal_type_sport) %>% top_n(10, Total_medal_type_sport) %>% 
  ggplot(aes(x = reorder(paste(Sport,"\n",Medal,sep=""),Total_medal_type_sport), y = Total_medal_type_sport, fill = Medal)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 Sports by Medal(Types) Won", y = "Number of Athletes", x = "Sports + Medal type") +
    geom_text(aes(label = paste(round((Total_medal_type_sport/sum(Total_sport_medal))*100,0),"%",sep = "")), 
              size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
    coord_flip()

# Top 10 MALE Medal Winners Category
nm_total_sport_sex %>% filter(Sex == "M")  %>%  arrange(-No_of_athlete_ms) %>% top_n(10, No_of_athlete_ms) %>% 
  ggplot(aes(x = reorder(paste(Sport,"\n",Medal,sep=""),No_of_athlete_ms), y = No_of_athlete_ms, fill = Medal)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 MALE Medal Winners Category", y = "Number of Athletes", x = "Sports + Medal type") +
    geom_text(aes(label = paste(round((No_of_athlete_ms/sum(Total_sport_medal))*100,0),"%",sep = "")), 
              size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
    coord_flip()

# Top 10 FEMALE Medal Winners Category
nm_total_sport_sex %>% filter(Sex == "F")  %>%  arrange(-No_of_athlete_ms) %>% top_n(10, No_of_athlete_ms) %>% 
  ggplot(aes(x = reorder(paste(Sport,"\n",Medal,sep=""),No_of_athlete_ms), y = No_of_athlete_ms, fill = Medal)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "Top 10 FEMALE Medal Winners Category", y = "Number of Athletes", x = "Sports + Medal type") +
    geom_text(aes(label = paste(round((No_of_athlete_ms/sum(Total_sport_medal))*100,0),"%",sep = "")), 
              size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
    coord_flip()

Analysis
* Swimminng, Athletics and Rowing are the top three sports with most medals won.
* Equal proportion of Medals won among Gold, Silver and Bronze categories.
* Swimming Gold and Athletics Gold are the top two male medal winning sports category.
* Swimming Gold and Swimming Silver are the top two Female medal winning sports category.
* Football Gold and Football Bronze medals won by Female players but not by Male players, in Top 10.

Top 10 Sports participitation by Season

#  count_BMI_team

# athlete_clean %>% group_by(Year,Season, Sport) %>% summarize(Total_player = n())


for (bin_i in levels(athlete_clean$Games)) {

plot_sport_games <- athlete_clean %>% group_by(Games, Sport) %>% summarize(Total_player = n()) %>% 
          filter(Games == bin_i) %>% 
          mutate(percentage_cbmiT = round(Total_player/sum(Total_player) * 100,2)) %>% 
          arrange(-Total_player) %>% top_n(10,Total_player) %>% 
          ggplot(aes(x = reorder(Sport,Total_player), y = Total_player, fill = Sport)) +
            geom_bar(stat = "identity") +
            theme_bw() +
            # facet_wrap(~Medal) +
            labs(title = paste("Top 10 Sports Participitation in \n",bin_i,sep=""), y = "Number of Athletes", x="Sports") +
            geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
                      size = 3, check_overlap = T,  position = position_stack(vjust = 0.5) ) +
            theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
            coord_flip()

print(plot_sport_games)
# gridExtra::grid.arrange(plot_summer_sport, plot_winter_sport, ncol = 2)

}

Analysis
* Summer Sports - Top 10 Sports that saw maximum participitation
+ Athletics, Swimming, Gymnastics and Cycling were the top four sports throughout.
+ Number of participitants for Gymnastics reduced over time.
+ Shooting had alternate rank movement over time.
+ Sailing popularity reduced over time.
* Winter Sports - Top 10 Sports that saw maximum participitation
+ Cross Country Skiing, Biathlon, Alpine Skiing, Ice Hockey and Speed Skating were constantly the top five sports.
+ Bobsleigh gained and lost popularity alternatively.
+ Ski jumping had random popularity rank movement in terms of the number of participitants.

Top 10 Teams with most healthy BMI participitants

np_bmi <- athlete_clean %>% group_by(BMI_cat) %>% summarize(Count_BMI = n())

np_bmi_team <- athlete_clean %>% filter(BMI_cat == "Healthy") %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>% 
  mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2))

# np_bmi_team_city <- left_join(np_city, np_city_sex, by= c("Year","City"))


# BMI Count
np_bmi %>%  
  ggplot(aes(x = BMI_cat, y = Count_BMI, fill = BMI_cat)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    # facet_wrap(~Medal) +
    labs(title = "BMI Count", y = "Number of Athletes", x = "BMI Category") +
    geom_text(aes(label = paste(round((Count_BMI/sum(Count_BMI))*100,2),"%",sep = "")), 
              size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
    coord_flip()

Analysis
* About 78% participitants are under Healthy BMI category. * 16% of participitants are Overweight.

# Using for loop to find apporopriate bin value
# for (bin_i in levels(athlete_clean$BMI_cat)) {
# 
# plot_BMI <- athlete_clean %>% filter(BMI_cat == bin_i) %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>% 
#           mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>% 
#           arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>% 
#           ggplot(aes(x = Team, y = count_BMI_team, fill = Team)) +
#             geom_bar(stat = "identity") +
#             theme_bw() +
#             # facet_wrap(~Medal) +
#             labs(title = paste("Top 10 Participating Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes", 
#                  x = paste("Teams - ",bin_i," BMI Category", sep="")) +
#             geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
#                       size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
#             theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
#             coord_flip()
# 
# print(plot_BMI)
# }


for (bin_i in levels(athlete_clean$BMI_cat)) {

plot_BMI <- athlete_clean %>% filter(BMI_cat == bin_i) %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>% 
          mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>% 
          arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>% 
          ggplot(aes(x = reorder(Team,count_BMI_team), y = count_BMI_team, fill = Team)) +
            geom_bar(stat = "identity") +
            theme_bw() +
            # facet_wrap(~Medal) +
            labs(title = paste("Top 10 Participating Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes", 
                 x = paste("Teams - ",bin_i," BMI Category", sep="")) +
            geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                      size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
            theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
            coord_flip()

plot_BMI_Medal <- athlete_clean %>% 
          filter(!is.na(Medal), BMI_cat == bin_i) %>% 
          group_by(BMI_cat,Team) %>% 
          summarize(count_BMI_team = n()) %>% 
          mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>% 
          arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>% 
          ggplot(aes(x = reorder(Team,count_BMI_team), y = count_BMI_team, fill = Team)) +
            geom_bar(stat = "identity") +
            theme_bw() +
            # facet_wrap(~Medal) +
            labs(title = paste("Top 10 Winning Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes", 
                 x = paste("Teams - ",bin_i," BMI Category", sep="")) +
            geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                      size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
            theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
            coord_flip()

gridExtra::grid.arrange(plot_BMI, plot_BMI_Medal, ncol = 2)

}

Analysis
* United States has the most number of participitants and Wins across (almost) all BMI categories.
* Severely Underweight BMI
+ Spain had the highest participitation but China had the maximum wins.
+ Ukraine did not win any medal even it third highest participitation from this BMI category.
+ Only Seven Teams won medals in this BMI category.
* Underweight BMI
+ Russia and China were the top two teams with maximum participitation and Wins.
+ Japan and Australia did not made it to Top 10 winning ranks even though it had third and fourth highest participitations respectively.
* Healthy BMI
+ USA, Germany and Russia lead the charts here.
+ Australian team ranked fourth even though it was the seventh team in participitation ranking.
* Overweight BMI
+ Poland and Czech Republic did not make it to the Top 10 winning teams.
+ Sweden stood at seventh rank with wins contrast to the observation it was not present in Top 10 participitation ranks.
+ United States and Canada were among top two teams for both participitation and wins.
* Moderately Obese BMI
+ The number of wins for Russia and Poland increased than their participitation proportion/number.
+ Austalia did not make it to the winning ranks here.
* Severely Obese BMI
+ Russia, China, Turkey, Japan and Belarus won the most number of medals, even though they had relevantly less participitation.
+ Proportion of participitation and wins were equal for United States.
+ Australia did not make it to the wining list here.
+ INDIA made it to both the rankings here.
* Very Severely Obese BMI
+ United States topped the charts for both.
+ China and Iran won more proportion of medals than their participitation proportions.

Top 10 Participating V/s Winning Teams - Overall

# Base code
# for (bin_i in levels(athlete_clean$Sport)) {
# 
# plot_Sport <- athlete_clean %>% filter(Sport == bin_i) %>% group_by(Sport,Team) %>% summarize(count_BMI_team = n()) %>% 
#                 mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>% 
#                 arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>% 
#                 ggplot(aes(x = Team, y = count_BMI_team, fill = Team)) +
#                   geom_bar(stat = "identity") +
#                   theme_bw() +
#                   # facet_wrap(~Medal) +
#                   labs(title = paste("Top 10 Teams in ",bin_i," Sports",sep=""), y = "Number of Athletes", 
#                        x = paste("Teams - ",bin_i, sep="")) +
#                   geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
#                             size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
#                   theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
#                   coord_flip()
# 
# print(plot_Sport)
# }


# Efficient Modified Code V2
# for (bin_i in levels(athlete_clean$Sport)) {
#   
#   plot_Sport <- athlete_clean %>% 
#                   filter(Sport == bin_i) %>% 
#                   group_by(Team) %>% 
#                   summarize(count_team = n()) %>% 
#                   mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
#                   arrange(-count_team) %>% 
#                   top_n(10,count_team) %>% 
#                     ggplot(aes(x = Team, y = count_team, fill = Team)) +
#                       geom_bar(stat = "identity") +
#                       theme_bw() +
#                       # facet_wrap(~Medal) +
#                       labs(title = paste("Top 10 Participating Teams in ",bin_i," Sports",sep=""), y = "Number of Athletes", 
#                            x = paste("Teams - ",bin_i, sep="")) +
#                       geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
#                                 size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
#                       theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
#                       coord_flip()
# 
# print(plot_Sport)
# }


# Top 10 Teams participitation V/s Win across sports and Games/Year

plot_Team_p <- athlete_clean %>% group_by(Team) %>% summarize(Count = n()) %>% 
                mutate(perc = round((Count/sum(Count))*100,2))  %>% arrange(-Count) %>% top_n(10,Count) %>% 
                ggplot(aes(x = reorder(Team,Count), y = Count, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      labs(title = paste("Top 10 Participating Teams \n Overall"), y = "Number of Athletes", x = "Team") +
                      geom_text(aes(label = paste(perc,"% ","(",Count,")",sep = "")),
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
        
plot_Team_w <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Team) %>% summarize(Count = n()) %>% 
                mutate(perc = round((Count/sum(Count))*100,2))  %>% arrange(-Count) %>% top_n(10,Count) %>% 
                ggplot(aes(x = reorder(Team,Count), y = Count, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      labs(title = paste("Top 10 Winning Teams \n Overall"), y = "Number of Athletes", x = "Team") +
                      geom_text(aes(label = paste(perc,"% ","(",Count,")",sep = "")),
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()

gridExtra::grid.arrange(plot_Team_p, plot_Team_w, ncol = 2)

Analysis
* Top 5 Teams are constant for both the charts and their proportionate wins increased with respect to their participitation.
* Japan did not made it to the Top 10 Wins ranking. It’s position in Top 10 list was replaced by Netherlands.

Top 10 Participating V/s Winning Teams in every Sport

# Modified Code V3 with comparission
for (bin_i in levels(athlete_clean$Sport)) {
  
  plot_Sport <- athlete_clean %>% 
                  filter(Sport == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 Participating Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()


  plot_Sport_Medal <- athlete_clean %>% 
                  filter(!is.na(Medal), Sport == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 Winning Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
  
  # print(plot_Sport)
  # cowplot::plot_grid(plot_Sport, plot_Sport_Medal, ncol = 2, align = "hv", axis = "t")
  gridExtra::grid.arrange(plot_Sport, plot_Sport_Medal, ncol = 2)
}

Analysis - Top 10 Teams Participitating Vs Winning for all Sports Individually
* Participitants from United States, Sweden, SOuth Korea and China only have won medals for Golf repeatedly.
* Participitants from United States, Australie, and Japan only have won medals for Softball repeatedly.
* Participitants from Cuba, United States, South Korea, Japan and Australia only have won medals for Baseball repeatedly.
* Participitants from China were most awarded athletes for Diving Sports.
* Participitants from Canada were most awarded athletes for Curling Sports.
* Participitants from Germany and Canoeing were most awarded athletes for Canoeing Sports.
* Participitants from Cuba and Russia were most awarded athletes for Boxing. * Participitants from Cuba were most awarded players for Baseball.
* Participitants from Austria were most awarded athletes for Alpine Skiing Sports.
* Participitants from South Korea were most awarded athletes for Archery.
* Participitants from United States were most awarded players for Athletics and Basketball.
* Participitants from China and South Korea-1 were most awarded players for Badminton.
* Participitants from Brazil-1 were most awarded players for Beach Volleyball.
* Even though Germany had lower participitation for Biathlon, they were the most awarded.
* Participitants from United States-1 and Germany-1 were the most awarded players for Bobsleigh sports.
* Participitants from United States-1, Russia and Canada were the most awarded players for Figure Skating.
* Participitants from United States, Great Britain and Germany were the most awarded players for Equestrianism sports.
* Participitants from Great Britain were the most awarded players for Cycling.
* Participitants from Russia, Japan, China and Spain were constatly the most awarded players for Synchronized Swimming.
* Participitants from United States and Australia were the most awarded players for Swimming.
* Participitants from Russia were the most awarded players for Wrestling and Rhythmic Gymnastics.
* Participitants from China and Canada were the most awarded players for Trampolining.
* Participitants from China were the most awarded players for Table Tennis.
* Participitants from Great Britain, Switzerland and Australia were the most awarded players for Triathlons.
* Participitants from Netherlands were the most awarded players for Speed Skating.
* Participitants from Great Britain and Australia were the most awarded players for Sailing.
* Participitants from Japan were the most awarded players for Judo.
* Participitants from Great Britain, Russia and Lithuania were the most awarded players for Modern Pentathlon.
* Participitants from Germany, Austria and Finland were the most awarded players for Nordic Combined Sports.

Top 10 Participating VWinning Teams by Games

for (bin_i in levels(athlete_clean$Games)) {
  
  plot_games <- athlete_clean %>% 
                  filter(Games == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 Participating Teams in \n",bin_i,sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()


  plot_games_Medal <- athlete_clean %>% 
                  filter(!is.na(Medal), Games == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 Winning Teams in \n",bin_i,sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
  
  gridExtra::grid.arrange(plot_games, plot_games_Medal, ncol = 2)
}

Analysis - Top 10 Participitating V/s Winning teams across Games/Year
* Summer Games
+ United States was the top team with maximum wins across the years. It won the most in 2008 Summer.
+ Germany jumped to second spot for winning list in 2016 from its position of mid three in the previous years.
+ Cuba did not make it to top 10 winning list for the last three summer games.
+ Russia’s winning poportion detoriated with time.
+ Canada made it to top 10 winnig list only in the last year of 2016 Summer games.
+ Great Britain’s performance/wins increased substantially from 2008 summer games onwards.
* Winter Games
+ Canadian team won the most medals across time.
+ United States was among the top three constantly.
+ Russian teams win percentage detoriated over time before finally bouncing back to top three in 2014 Winter games.
+ Italy’s performance/wins detoriated over time even though they were constantly in top 10 participitating teams.

Top 10 Participating VWinning Teams by Season

for (bin_i in levels(athlete_clean$Season)) {
  
  plot_games <- athlete_clean %>% 
                  filter(Season == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 Participating Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()


  plot_games_Medal <- athlete_clean %>% 
                  filter(!is.na(Medal), Season == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 Winning Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
  
  gridExtra::grid.arrange(plot_games, plot_games_Medal, ncol = 2)
}

Analysis - Overall Top 10 Participitating V/s Winning teams
* United States, Australia, and Russia were the top three teams in terms of participitation and wins for Summer sports.
* Canada did not perform well in Summer sports but was the top team in Winter sports.
* Italy and China performed well in Summer sports but did not make the cut for Winter sports.
* Even though Japan, Czech Republic and Sweden were among the top ten participitating teams for Winter Sports -
+ Only Sweden made it to the top four(or ten) winning teams.
+ Among them, Japan was the only team making the list of top 10 for Summer Sports for both participitation and Wins.

# LOOP MODIFICATION 4

# Adding sex grouping to above plots

for (bin_i in levels(athlete_clean$Season)) {
  
  plot_games_M <- athlete_clean %>% 
                  filter(Season == bin_i, Sex == "M") %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 M playing Teams in \n",bin_i,sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
  
  
  plot_games_F <- athlete_clean %>% 
                  filter(Season == bin_i, Sex == "F") %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 F playing Teams in \n",bin_i,sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()


  plot_games_Medal_M <- athlete_clean %>% 
                  filter(!is.na(Medal), Sex == "M", Season == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 M Winning Teams in \n",bin_i,sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
  
   plot_games_Medal_F <- athlete_clean %>% 
                  filter(!is.na(Medal), Sex == "F", Season == bin_i) %>% 
                  group_by(Team) %>% 
                  summarize(count_team = n()) %>% 
                  mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>% 
                  arrange(-count_team) %>% 
                  top_n(10,count_team) %>% 
                    ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
                      geom_bar(stat = "identity") +
                      theme_bw() +
                      # facet_wrap(~Medal) +
                      labs(title = paste("Top 10 F Winning Teams in \n",bin_i,sep=""), y = "Number of Athletes", 
                           x = paste("Teams - ",bin_i, sep="")) +
                      geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")), 
                                size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                      coord_flip()
  
  gridExtra::grid.arrange(plot_games_M, plot_games_Medal_M, 
                          plot_games_F, plot_games_Medal_F,
                          ncol = 2, nrow = 2)
}

Analysis - Overall Top 10 Participitating V/s Winning teams Wraped by Sex
* Summer Sports
+ United State Teams for both Male and Female performed best or won the maximum number of medals.
+ Brazil, South Korea, Italy and France Male Team made it to the top 10 wining list but not their female team.
+ Netherlands, Japan, Canada and Romania Female teams performed better than their male counterparts.
* Winter Sports
+ Canadian team was the best performer for Winter sports and their Female team further performed better than their Male team.
+ China, and South Korea female teams performed better than their Male teams.
+ Finland’s Male team performed way better than their Female teams.
+ Switzerland, Germany and Swedens’ Female teams performed better than their Male teams.

Top 10 Medal Winning Players

# athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Games,Sex,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win)

# athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport,Sex,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win)

# Male Across Olympics
athlete_clean %>% filter(!is.na(Medal), Sex=="M") %>% group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>% 
   head(10) %>% 
    ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
      geom_bar(stat = "identity") +
      theme_bw() +
      # facet_wrap(~Medal) +
      labs(title = paste("Top 10 Male Winners across Years"), y = "Number of Medals", x = "Athlete's Name") +
      geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
      coord_flip()

# FeMale Across Olympics
athlete_clean %>% filter(!is.na(Medal), Sex=="F") %>% group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>% 
   head(10) %>% 
    ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
      geom_bar(stat = "identity") +
      theme_bw() +
      # facet_wrap(~Medal) +
      labs(title = paste("Top 10 Female Winners across Years"), y = "Number of Medals", x = "Athlete's Name") +
      geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
      theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
      coord_flip()

Analysis - Top 10 Athlete’s to Win - Male and Female
* Male Athletes
+ Michael Phelps(Swimming) won the most number of medals across all sports categories.
+ Usian Bolt won a total 8 medals for Athelics between 2000 to 2016 games.
+ Ole Einar won a total of 11 medals for Biathlon between 2000 to 2016 games.
* Female Athletes
+ Natalie Anne won the maximum number of medals(12) for Swimming among all participitants.
+ Marit Bjrgen won the second most number of medals(10) for Cross Country Skiing among all.

for (bin_i in levels(athlete_clean$Games)) {
  
  plot_M_year <- athlete_clean %>% filter(!is.na(Medal), Sex=="M", Games == bin_i) %>% 
                  group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>% 
                  head(10) %>% 
                  ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
                    geom_bar(stat = "identity") +
                    theme_bw() +
                    # facet_wrap(~Medal) +
                    labs(title = paste("Top 10 Male Winners in",bin_i), y = "Number of Medals", x = "Athlete's Name") +
                    geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                    coord_flip()
  plot(plot_M_year)
}

for (bin_i in levels(athlete_clean$Games)) {
  plot_F_year <-athlete_clean %>% filter(!is.na(Medal), Sex=="F", Games == bin_i) %>% 
                  group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>% 
                  head(10) %>% 
                  ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
                    geom_bar(stat = "identity") +
                    theme_bw() +
                    # facet_wrap(~Medal) +
                    labs(title = paste("T10 Female Winners in",bin_i), y = "Number of Medals", x = "Athlete's Name") +
                    geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T,  position = position_stack(vjust = 0.7) ) +
                    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
                    coord_flip()
  
  # gridExtra::grid.arrange(plot_M_year, plot_F_year, ncol = 2)
  plot(plot_F_year)
}

Analysis - Top 10 Athlete’s to Win - Male and Female - Summers and Winter Games
* Summer Games - Male Winners
+ Michael Phelps(Swimming) was the top performer since 2004 with almost twice as many medals than other competitors across all the sports.
+ Top performer for 2000 summer Aleksey was his last game where he won 6 medals.
+ Most athletes in top ten won atmost three medals.
* Summer Games - Female Winners
+ Athletes from Swimming and Gymnastics made it top 10 winning list among all participitants for all sports.
+ Maximum number of medals won by any one athlete across the years was five.
+ Almost no consistent trend for a paticular player for games between 2000 to 2016.
* Winter Games - Male Winners
+ Most of the top ten athletes won approximately three medals each year.
+ Victor An(Short Track Speed Skating) held the top spot with 4 wins for years 2006 and 2014.
+ Top three positions were held by athelete’s for the sport of Skating, Skiing and Biathlon across years 2000 to 2016.
* Winter Games - Female Winners
+ The winning list was dominated by athletes from Speed Skating and Alpine Skiing sports for years 2000 to 2016.
+ Most of the participitants in the list won approximately three medals each.
+ The top three positions were held by different athletes each year.

Line Charts(Trend Analysis)

No of participants over time

athlete_clean %>% group_by(Year,Season) %>% 
  summarize(P_Count = n()) %>% 
  ggplot(aes(x = Year, y = P_Count, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("Participitation Trend Line"), y = "Number of Athletes") +
    geom_text(aes(label = P_Count), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

athlete_clean %>% group_by(Year,Season) %>% filter(Sex=="M") %>% 
  summarize(P_Count = n()) %>% 
  ggplot(aes(x = Year, y = P_Count, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("Male Participitation Trend Line"), y = "Number of Athletes") +
    geom_text(aes(label = P_Count), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

athlete_clean %>% group_by(Year,Season) %>% filter(Sex=="F") %>% 
  summarize(P_Count = n()) %>% 
  ggplot(aes(x = Year, y = P_Count, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("Female Participitation Trend Line"), y = "Number of Athletes") +
    geom_text(aes(label = P_Count), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

Analysis - Participitation Trend
* There was a drop in the number of all participitants in Summer sports for the year 2012.
* Winter sports had a gradually increasing trend line.
* Male Summer participitation dropped gradually while the Female participitation increased over the years.
* Male and Female Winter participitation had a constant increasing trend line.

Sex Ratio over time

gMale <- athlete_clean %>% filter(Sex == "M") %>% group_by(Year,Season) %>% 
          summarize(M_Count = n())

gFemale <- athlete_clean %>% filter(Sex == "F") %>% group_by(Year,Season) %>% 
          summarize(F_Count = n())

Grp_M_F <- left_join(gMale, gFemale, by = c("Year","Season"))


Grp_M_F %>% mutate(M_to_F_SexRatio = M_Count/F_Count) %>% 
  ggplot(aes(x = Year, y = M_to_F_SexRatio, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("Sex Ratio Trend Line"), y = "Number of Athletes") +
    geom_text(aes(label = round(M_to_F_SexRatio,2)), size = 3, check_overlap = T,  position = position_stack(vjust = 1.05) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

Analysis - Sex Ratio Trend
* Male to Female Sex ratio declined over the years.
* Winter games had a higer Male to Female Sex ratio than Summer games.

Number of Teams Each Year - Trend

athlete_clean %>% group_by(Year,Season) %>% 
  summarize(P_Count = length(unique(Team))) %>% 
  ggplot(aes(x = Year, y = P_Count, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("No of Teams Each Year - Trend Line"), y = "Number of Teams") +
    geom_text(aes(label = P_Count), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

Analysis
* Number of teams for Winter games were almost constant across the years.
* Number of teams participitation increased for few years and then dropped for the last two games.

Number of Medals won over years

total_medal <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season) %>% summarize(Total_Medal = n())
tm_sex <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season,Sex) %>% summarize(Total_Medal = n())

total_medal %>% 
  ggplot(aes(x = Year, y = Total_Medal, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("No of Medals won each Year - Trend"), y = "Number of Teams") +
    geom_text(aes(label = Total_Medal), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

tm_sex %>%  filter(Sex=="M") %>% 
  ggplot(aes(x = Year, y = Total_Medal, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("No of Medals won by MALE each Year - Trend"), y = "Number of Teams") +
    geom_text(aes(label = Total_Medal), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

tm_sex %>%  filter(Sex=="F") %>% 
  ggplot(aes(x = Year, y = Total_Medal, group = Season)) +
    geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Season, shape = Season), size = 3) +
    theme_bw() +
    labs(title = paste("No of Medals won by FEMALE each Year - Trend"), y = "Number of Teams") +
    geom_text(aes(label = Total_Medal), size = 3, check_overlap = T,  position = position_stack(vjust = 0.93) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

Analysis
* Number of medals awarded were on a constant rise for Winter games.
* Number of medals awarded for Summer games were approximately in the same range with a drop for year 2012.
* Female Medals holders numebr increased constatnly for both Summer and Winter games.
* A sudden drop on the number of Male medal holders for Summer games.

Select Teams performance(Medals Won) over years

total_medal <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season,Team) %>% summarize(Total_Medal = n()) %>% arrange(-Total_Medal)
# tm_sex <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season,Team,Sex) %>% summarize(Total_Medal = n()) %>% arrange(-Total_Medal)

total_medal %>% 
  filter(Season == "Winter",Team %in% c("United States","Russia","Canada","Germany","Italy","France")) %>% 
  ggplot(aes(x = Year, y = Total_Medal, group = Team)) +
    geom_line(aes(color = Team), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Team, shape = Team), size = 3) +
    theme_bw() +
    labs(title = paste("Winter Trend Line for Select Teams - Medals Won"), y = "Number of Teams") +
    # geom_text(aes(label = ""), size = 3, check_overlap = F,  position = position_stack(vjust = 0) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

total_medal %>% 
  filter(Season == "Summer",Team %in% c("United States","Russia","Canada","Germany","Italy","France")) %>% 
  ggplot(aes(x = Year, y = Total_Medal, group = Team)) +
    geom_line(aes(color = Team), size = 1.1, linejoin = "round", linetype = 1 ) + 
    geom_point(aes(color = Team, shape = Team), size = 3) +
    theme_bw() +
    labs(title = paste("Summer Trend Line for Select Teams - Medals Won"), y = "Number of Teams") +
    # geom_text(aes(label = ""), size = 3, check_overlap = F,  position = position_stack(vjust = 0) ) +
    theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")

Analysis
* Winter Games
+ Canada and France saw an increasing trend for the number of medals won in olympics.
+ Germany saw a decreasing trend for the number of medals won.
+ United states and Russia had alternate trend movements.
* Summer Games
+ canada and France saw an increasing trend.
+ Russia saw a decreasing trend.
+ Germany, Italy and United states had a fluctutating trend line for the years 2000 to 2016.

Logistic Regression

Creating a new column called Win based on condition if an athlete won a medal Win = Y else N.

glm_athlete <- athlete_clean

glm_athlete$Win <- ifelse(!is.na(glm_athlete$Medal), "Y", "N" )
glm_athlete$Win <- factor(glm_athlete$Win, levels = c("N","Y"))

# set.seed(123)
# SplitIndex <- sample(x = c("Train", "Test"), size = nrow(glm_athlete),replace = T, prob = c(0.7,0.3))
# 
# #Subset data into a train and test set based on the SplitIndex vector
# Train_Data <- glm_athlete[SplitIndex == "Train", ]
# Test_Data <- glm_athlete[SplitIndex == "Test", ]
# 
# dim(Test_Data)
# dim(Train_Data)
# str(Train_Data)
# Logistic Regression

# glm_model <- glm(formula =  Win ~ . ,data = Train_Data, family = 'binomial')
# summary(glm_model)

# str(glm_athlete)
# Extracting only metric columns for usage
glm_athlete_metric <- glm_athlete[,c("Age","Height(cms)","Weight(Kgs)","BMI","Win")]
str(glm_athlete_metric)
Classes 'tbl_df', 'tbl' and 'data.frame':   83909 obs. of  5 variables:
 $ Age        : num  31 34 31 22 30 25 25 24 23 23 ...
 $ Height(cms): num  172 159 189 187 189 194 194 172 178 178 ...
 $ Weight(Kgs): num  70 55.5 130 89 72 78 78 58 76 76 ...
 $ BMI        : num  23.7 22 36.4 25.5 20.2 ...
 $ Win        : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...

Model 1

  • Since we obtain the log(odds) ratio from logistic regression, we use to categorize it for prediction as Yes or No based on 0.5 decision boundary.
glm_model_1 <- glm(formula =  Win ~ . ,data = glm_athlete_metric, family = 'binomial')
# summary(glm_model_1)


glm_athlete_metric$Predict <- predict(object = glm_model_1, newdata = glm_athlete_metric, type = "response")
glm_athlete_metric$YPredict <- ifelse(glm_athlete_metric$Predict > 0.5, 1, 0)

# Model Summary
summary(glm_model_1)

Call:
glm(formula = Win ~ ., family = "binomial", data = glm_athlete_metric)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9763  -0.5786  -0.5341  -0.4931   2.2309  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -2.226289   0.994665  -2.238  0.02521 *  
Age            0.011620   0.001830   6.349 2.17e-10 ***
`Height(cms)` -0.001141   0.005596  -0.204  0.83843    
`Weight(Kgs)`  0.021782   0.006778   3.213  0.00131 ** 
BMI           -0.053850   0.022054  -2.442  0.01462 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 69025  on 83908  degrees of freedom
Residual deviance: 68507  on 83904  degrees of freedom
AIC: 68517

Number of Fisher Scoring iterations: 4
# Confusion Matrix
table(glm_athlete_metric$Win, glm_athlete_metric$YPredict)
   
        0
  N 71866
  Y 12043

Analysis
* This model performed very bad as is evident from the confusion matrix where it was not able to predict any wins.
* However, Age, weight and BMI have statistical significance.

Model 2

  • Building the model with only Age, weight and Height.
glm_model_2 <- glm(formula =  Win ~ Age + `Weight(Kgs)` + `Height(cms)` ,data = glm_athlete_metric, family = 'binomial')
# summary(glm_model_2)


glm_athlete_metric$Predict <- predict(object = glm_model_1, newdata = glm_athlete_metric, type = "response")
glm_athlete_metric$YPredict <- ifelse(glm_athlete_metric$Predict > 0.5, 1, 0)

# Model Summary
summary(glm_model_2)

Call:
glm(formula = Win ~ Age + `Weight(Kgs)` + `Height(cms)`, family = "binomial", 
    data = glm_athlete_metric)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.8586  -0.5818  -0.5360  -0.4909   2.3231  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -4.604778   0.210135 -21.913  < 2e-16 ***
Age            0.011273   0.001825   6.176 6.56e-10 ***
`Weight(Kgs)`  0.005397   0.001015   5.319 1.04e-07 ***
`Height(cms)`  0.012086   0.001449   8.340  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 69025  on 83908  degrees of freedom
Residual deviance: 68513  on 83905  degrees of freedom
AIC: 68521

Number of Fisher Scoring iterations: 4
# Probability of WinningS
head(glm_athlete_metric$Predict) 
[1] 0.1404309 0.1206924 0.2297364 0.1657656 0.1665300 0.1716221
# Confusion Matrix
table(glm_athlete_metric$Win, glm_athlete_metric$YPredict)
   
        0
  N 71866
  Y 12043

Analysis
* We can conclude that Age, Weight and Height have statistical significance on Wins.
* Model was still really bad at prediciton of a Win based on only these variables.