# Libraries used for Analysis
# library(tidyverse)
# library(readxl)
# library(summarytools)
# library(cowplot)
# library(gridExtra)
setwd("D:/JGBS/IV Sem/Data Analysis using R/Assessments/Mid Term/Assigned Dataset/Analysis")
athele_events <- readxl::read_xlsx("athlete_events.xlsx")
str(athele_events)
Classes 'tbl_df', 'tbl' and 'data.frame': 85258 obs. of 16 variables:
$ ID : num 12 13 18 23 30 36 36 53 68 68 ...
$ Name : chr "Jyri Tapani Aalto" "Minna Maarit Aalto" "Timo Antero Aaltonen" "Fritz Aanes" ...
$ Sex : chr "M" "F" "M" "M" ...
$ Age : num 31 34 31 22 30 25 25 24 23 23 ...
$ Height(cms): num 172 159 189 187 189 194 194 172 178 178 ...
$ Weight(Kgs): num 70 55.5 130 89 72 78 78 58 76 76 ...
$ BMI : logi NA NA NA NA NA NA ...
$ Team : chr "Finland" "Finland" "Finland" "Norway" ...
$ NOC : chr "FIN" "FIN" "FIN" "NOR" ...
$ Games : chr "2000 Summer" "2000 Summer" "2000 Summer" "2000 Summer" ...
$ Year : num 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
$ Season : chr "Summer" "Summer" "Summer" "Summer" ...
$ City : chr "Sydney" "Sydney" "Sydney" "Sydney" ...
$ Sport : chr "Badminton" "Sailing" "Athletics" "Wrestling" ...
$ Event : chr "Badminton Men's Singles" "Sailing Women's Windsurfer" "Athletics Men's Shot Put" "Wrestling Men's Light-Heavyweight, Greco-Roman" ...
$ Medal : chr "NA" "NA" "NA" "NA" ...
head(athele_events)
# A tibble: 6 x 16
ID Name Sex Age `Height(cms)` `Weight(Kgs)` BMI Team NOC
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <lgl> <chr> <chr>
1 12 Jyri~ M 31 172 70 NA Finl~ FIN
2 13 Minn~ F 34 159 55.5 NA Finl~ FIN
3 18 Timo~ M 31 189 130 NA Finl~ FIN
4 23 Frit~ M 22 187 89 NA Norw~ NOR
5 30 Pepi~ M 30 189 72 NA Neth~ NED
6 36 Stef~ M 25 194 78 NA Neth~ NED
# ... with 7 more variables: Games <chr>, Year <dbl>, Season <chr>,
# City <chr>, Sport <chr>, Event <chr>, Medal <chr>
Analysis
* The athlete dataset consists of 85,258 rows and 16 attributes/columns.
* The attributes needs to be converted into appropriate class or types to continue analysis and derive insights.
* Attribute Games is divided into Year and Season.
* Need to calculate BMI for each athlete.
# str(athele_events)
summary(athele_events)
ID Name Sex Age
Min. : 2 Length:85258 Length:85258 Min. :12.00
1st Qu.: 35375 Class :character Class :character 1st Qu.:22.00
Median : 69193 Mode :character Mode :character Median :25.00
Mean : 68988 Mean :25.83
3rd Qu.:102782 3rd Qu.:29.00
Max. :135571 Max. :71.00
NA's :3
Height(cms) Weight(Kgs) BMI Team
Min. :133.0 Min. : 28.00 Mode:logical Length:85258
1st Qu.:168.0 1st Qu.: 60.00 NA's:85258 Class :character
Median :176.0 Median : 70.00 Mode :character
Mean :175.8 Mean : 71.13
3rd Qu.:183.0 3rd Qu.: 80.00
Max. :226.0 Max. :214.00
NA's :711 NA's :1210
NOC Games Year Season
Length:85258 Length:85258 Min. :2000 Length:85258
Class :character Class :character 1st Qu.:2004 Class :character
Mode :character Mode :character Median :2008 Mode :character
Mean :2008
3rd Qu.:2012
Max. :2016
City Sport Event
Length:85258 Length:85258 Length:85258
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Medal
Length:85258
Class :character
Mode :character
summarytools::descr(athele_events)
Non-numerical variable(s) ignored: Name, Sex, BMI, Team, NOC, Games, Season, City, Sport, Event, Medal
Descriptive Statistics
athele_events
N: 85258
Age Height(cms) ID Weight(Kgs) Year
----------------- ---------- ------------- ----------- ------------- ----------
Mean 25.83 175.83 68987.57 71.13 2008.02
Std.Dev 5.45 10.98 39183.80 15.30 5.46
Min 12.00 133.00 2.00 28.00 2000.00
Q1 22.00 168.00 35374.00 60.00 2004.00
Median 25.00 176.00 69192.50 70.00 2008.00
Q3 29.00 183.00 102782.00 80.00 2012.00
Max 71.00 226.00 135571.00 214.00 2016.00
MAD 4.45 11.86 49940.64 14.83 5.93
IQR 7.00 15.00 67407.00 20.00 8.00
CV 0.21 0.06 0.57 0.22 0.00
Skewness 0.99 0.06 -0.02 0.87 -0.01
SE.Skewness 0.01 0.01 0.01 0.01 0.01
Kurtosis 2.37 0.04 -1.18 1.93 -1.27
N.Valid 85255.00 84547.00 85258.00 84048.00 85258.00
Pct.Valid 100.00 99.17 100.00 98.58 100.00
Analysis
* Olympics data is available from Year 2000 to 2016.
* Min Age of athlete is 12 years and Max Age is 71. The average age of participitants was 25.8 years.
* The average height of athletes was 175.83 cms and the average weight was 71.13 Kgs.
* The distribution for Height is mesokurtic (almost standard tail shape).
* The distribution for Weight is slightly leptokurtic (fat tailed distribution).
Steps
1. Calculating BMI for each athlete.
2. Converting Non Metric attributes to appropriate type, i.e., factors.
3. Quick observance of NA values in each column.
4. Removing rows with NA values in the dataset.
5. Creating a new column called BMI_cat which segments the athletes into one of the seven categories. Source: Wiki.
6. Reordering the factor levels in newly created columns BMI_cat based on logical levels.
# BMI Calculation
athele_events$BMI <- round(athele_events$`Weight(Kgs)` / ((athele_events$`Height(cms)`/100)^2),3)
# Converting apporopriate columns to respective types
non_metric_col <- c("Sex","Team","NOC","Games","Year","Season","City","Sport","Event","Medal")
athele_events[non_metric_col] <- lapply(athele_events[non_metric_col], factor)
# str(athele_events)
# Calculate number of NA's for each column
sapply(athele_events, function(col) sum(is.na(col)))
ID Name Sex Age Height(cms) Weight(Kgs)
0 0 0 3 711 1210
BMI Team NOC Games Year Season
1330 0 0 0 0 0
City Sport Event Medal
0 0 0 0
#Removing rows with NA values
athele_events <- athele_events[complete.cases(athele_events),]
# BMI Category
athele_events$BMI_cat <- 0
athele_events$BMI_cat[athele_events$BMI <= 16] <- "Severely underweight"
athele_events$BMI_cat[athele_events$BMI > 16 & athele_events$BMI <= 18.5] <- "Underweight"
athele_events$BMI_cat[athele_events$BMI > 18.5 & athele_events$BMI <= 25] <- "Healthy"
athele_events$BMI_cat[athele_events$BMI > 25 & athele_events$BMI <= 30] <- "Overweight"
athele_events$BMI_cat[athele_events$BMI > 30 & athele_events$BMI <= 35] <- "Moderately obese"
athele_events$BMI_cat[athele_events$BMI > 35 & athele_events$BMI <= 40] <- "Severely obese"
athele_events$BMI_cat[athele_events$BMI > 40] <- "Very Severely obese"
athlete_data <- athele_events
athlete_data$BMI_cat <- factor(athlete_data$BMI_cat,
levels = c("Severely underweight", "Underweight", "Healthy", "Overweight", "Moderately obese", "Severely obese",
"Very Severely obese"))
str(athlete_data)
Classes 'tbl_df', 'tbl' and 'data.frame': 83926 obs. of 17 variables:
$ ID : num 12 13 18 23 30 36 36 53 68 68 ...
$ Name : chr "Jyri Tapani Aalto" "Minna Maarit Aalto" "Timo Antero Aaltonen" "Fritz Aanes" ...
$ Sex : Factor w/ 2 levels "F","M": 2 1 2 2 2 2 2 2 2 2 ...
$ Age : num 31 34 31 22 30 25 25 24 23 23 ...
$ Height(cms): num 172 159 189 187 189 194 194 172 178 178 ...
$ Weight(Kgs): num 70 55.5 130 89 72 78 78 58 76 76 ...
$ BMI : num 23.7 22 36.4 25.5 20.2 ...
$ Team : Factor w/ 331 levels "Afghanistan",..: 102 102 102 216 205 205 205 93 8 8 ...
$ NOC : Factor w/ 210 levels "AFG","AHO","ALB",..: 65 65 65 142 138 138 138 58 8 8 ...
$ Games : Factor w/ 9 levels "2000 Summer",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Year : Factor w/ 9 levels "2000","2002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Season : Factor w/ 2 levels "Summer","Winter": 1 1 1 1 1 1 1 1 1 1 ...
$ City : Factor w/ 9 levels "Athina","Beijing",..: 7 7 7 7 7 7 7 7 7 7 ...
$ Sport : Factor w/ 51 levels "Alpine Skiing",..: 4 33 3 51 31 41 41 44 41 41 ...
$ Event : Factor w/ 442 levels "Alpine Skiing Men's Combined",..: 63 288 37 428 262 347 351 386 347 355 ...
$ Medal : Factor w/ 4 levels "Bronze","Gold",..: 3 3 3 3 3 3 3 3 3 3 ...
$ BMI_cat : Factor w/ 7 levels "Severely underweight",..: 3 3 6 4 3 3 3 3 3 3 ...
Quick Comment
* The dataset now looks in a proper format for analysis and is reduced to 83,296 records with 17 columns.
Quick Tabulation of all factor columns to have an overview of data before proceeding with analysis.
* Re-leveling Medal columns with respect to NA and medal categories.
# Unique values/levels for all Non-Metric data
non_metric_col <- c("Sex","Team","NOC","Games","Year","Season","City","Sport","Event","Medal","BMI_cat")
# sapply(athlete_data[,non_metric_col], function(x) unique(x))
#
# # Tabulation for all levels in non-metric columns
# sapply(athlete_data[,non_metric_col], function(col) table(col))
athlete_data$Medal <- factor(athlete_data$Medal, levels = c(NA, "Bronze", "Silver", "Gold"))
# str(athlete_data)
Analysis
* There are numerous teams with total participitants less than 10.
* Winter games have less participitation than Summer games.
* Athletics is the most popular Sport.
* There are some events with only double digit participitation.
* 71,881 records with No medals among 83,296 total records.
* Almost 70% of athletes seem to be in Healthy BMI category.
Steps
* Detecting outliers in Age, Height, Weight and BMI.
* Plotting Box plot to have a quick observance of above metric attributes.
* Histogram plotting for Age and BMI to observe the distribution. Since BMI depends on Height and Weight.
metric_col <- c("Age", "Height(cms)", "Weight(Kgs)", "BMI")
# Age
ggplot(data = athlete_data, aes(y = Age, fill = Sex)) +
geom_boxplot() +
theme_bw() +
labs(title = "Age Outlier Detection - Boxplot", y = "Age") +
theme(plot.title = element_text(hjust = 0.5))
# Age Histogram to see distribution
ggplot(data = athlete_data, aes(x = Age, fill = Sex)) +
geom_histogram(binwidth = 1) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
labs(title = "Age Distribution Histogram", x = "Age", y = "Count")
# Age Histogram to see distribution
ggplot(data = athlete_data, aes(x = log10(Age), fill = Sex)) +
geom_histogram(binwidth = 0.01) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
labs(title = "Normalized Age Distribution Histogram", x = "Age", y = "Count")
# Height
ggplot(data = athlete_data, aes(y = `Height(cms)`, fill = Sex)) +
geom_boxplot() +
theme_bw() +
labs(title = "Height(cms) Outlier Detection - Boxplot") +
theme(plot.title = element_text(hjust = 0.5))
# Weight
ggplot(data = athlete_data, aes(y = `Weight(Kgs)`, fill = Sex)) +
geom_boxplot() +
theme_bw() +
labs(title = "Weight(Kgs) Outlier Detection - Boxplot") +
theme(plot.title = element_text(hjust = 0.5))
# BMI
ggplot(data = athlete_data, aes(y = BMI, fill = Sex)) +
geom_boxplot() +
theme_bw() +
labs(title = "BMI Outlier Detection - Boxplot") +
theme(plot.title = element_text(hjust = 0.5))
# BMI Histogram to see distribution
ggplot(data = athlete_data, aes(x = BMI, fill = Sex)) +
geom_histogram(binwidth = 1) +
theme_bw() +
labs(title = "BMI Distribution Histogram") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# BMI Histogram to see distribution
ggplot(data = athlete_data, aes(x = log10(BMI), fill = Sex)) +
geom_histogram(binwidth = 0.01) +
theme_bw() +
labs(title = "Normalized BMI Distribution - Histogram") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
Analysis
* Male Age has more outliers than Female age.
* Average Age for Males is slightly higher than Female Age.
* To normalize the age distribution, plotted the logarithmic (log10) conversion of Age distribution. It looks close to a normal distribution.
* Male athletes are taller than Female athletes with few more athletes of height more than 200 cms.
* Male athletes are heavier than Female athletes with few more athletes with weight more than 175 kgs.
* Thus, Male athletes have little higher BMI than Female athletes with few more outliers than Female participitants.
* Similarly, as logarithmic Age transformation, observing at logarthimic BMI plot we observe that it is close to a normal distribution.
Further Oultier Exploration of Age to better understand the data outliers.
* Plotting Boxplots of Age Vs Sport, Sport with Medal, Year, BMI category
# Age Bivariate Outlier Exploration
# Detailed look at age with type of sport to make sense of Age outliers
ggplot(data = athlete_data, aes(x = Sport, y = Age)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - Age V/s Sports") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# Further look by faceting with Medal
# Maximum outliers in NA Medal category
ggplot(data = athlete_data, aes(x = Sport, y = Age, fill = Medal)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - Age V/s Sports - Wrap by Medal") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
facet_grid(Medal ~.)
ggplot(data = athlete_data, aes(x = paste(athlete_data$Games,"\n",athlete_data$City), y = Age)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - Age V/s Olympic Year or City", x = "Year/City") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
ggplot(data = athlete_data, aes(x = BMI_cat, y = Age, fill = Medal)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - Age V/s BMI_category - Wrap by Medal", x = "BMI Category") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
facet_wrap(~Medal)
# nrow(athlete_data[athlete_data$Age > 40,])
Analysis
* Age V/s Sports reveals few sports have more Age outliers than others.
+ Accounting these outliers to their corrosponding sports, these outliers may be real (upon reading for each sports).
* When faceted by Medal type, we have more Age outliers for players with no medals.
* Summer Games have more Age Outliers than Winter games.
* Maximum number of Age Outliers with No Medals.
* Few Age outliers won other medals with maximum number of player in Healthy BMI category.
Quick tabulation for Age > 45 and Sport,medal
table(athlete_data[athlete_data$Age > 45,]$Sport, athlete_data[athlete_data$Age > 45,]$Age)
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
Alpine Skiing 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0
Archery 5 2 2 0 2 0 2 0 0 0 0 0 0 0 0
Athletics 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0
Badminton 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Baseball 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Basketball 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Beach Volleyball 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Biathlon 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Bobsleigh 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
Boxing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Canoeing 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0
Cross Country Skiing 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
Curling 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0
Cycling 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0
Diving 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Equestrianism 34 41 34 30 22 20 24 16 8 5 8 3 6 0 8
Fencing 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
Figure Skating 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Football 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Freestyle Skiing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Golf 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Gymnastics 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Handball 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Hockey 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ice Hockey 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Judo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Luge 0 1 2 0 0 0 1 0 0 0 0 0 0 0 0
Modern Pentathlon 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Nordic Combined 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Rhythmic Gymnastics 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Rowing 0 0 1 0 0 2 1 0 0 0 1 0 0 0 0
Rugby Sevens 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Sailing 6 2 1 3 1 0 0 0 1 0 0 1 1 0 0
Shooting 29 24 18 18 7 9 11 6 6 2 2 3 0 1 0
Short Track Speed Skating 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Skeleton 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ski Jumping 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Snowboarding 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Softball 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Speed Skating 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Swimming 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Synchronized Swimming 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Table Tennis 3 0 0 1 1 0 0 1 1 0 0 0 0 0 0
Taekwondo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Tennis 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
Trampolining 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Triathlon 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Volleyball 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Water Polo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Weightlifting 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Wrestling 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
61 62 63 65 67 71
Alpine Skiing 0 0 0 0 0 0
Archery 0 0 0 0 0 0
Athletics 0 0 0 0 0 0
Badminton 0 0 0 0 0 0
Baseball 0 0 0 0 0 0
Basketball 0 0 0 0 0 0
Beach Volleyball 0 0 0 0 0 0
Biathlon 0 0 0 0 0 0
Bobsleigh 0 0 0 0 0 0
Boxing 0 0 0 0 0 0
Canoeing 0 0 0 0 0 0
Cross Country Skiing 0 0 0 0 0 0
Curling 0 0 0 0 0 0
Cycling 0 0 0 0 0 0
Diving 0 0 0 0 0 0
Equestrianism 4 1 0 2 2 1
Fencing 0 0 0 0 0 0
Figure Skating 0 0 0 0 0 0
Football 0 0 0 0 0 0
Freestyle Skiing 0 0 0 0 0 0
Golf 0 0 0 0 0 0
Gymnastics 0 0 0 0 0 0
Handball 0 0 0 0 0 0
Hockey 0 0 0 0 0 0
Ice Hockey 0 0 0 0 0 0
Judo 0 0 0 0 0 0
Luge 0 0 0 0 0 0
Modern Pentathlon 0 0 0 0 0 0
Nordic Combined 0 0 0 0 0 0
Rhythmic Gymnastics 0 0 0 0 0 0
Rowing 0 0 0 0 0 0
Rugby Sevens 0 0 0 0 0 0
Sailing 0 0 0 0 0 0
Shooting 2 0 1 1 0 0
Short Track Speed Skating 0 0 0 0 0 0
Skeleton 0 0 0 0 0 0
Ski Jumping 0 0 0 0 0 0
Snowboarding 0 0 0 0 0 0
Softball 0 0 0 0 0 0
Speed Skating 0 0 0 0 0 0
Swimming 0 0 0 0 0 0
Synchronized Swimming 0 0 0 0 0 0
Table Tennis 0 0 0 0 0 0
Taekwondo 0 0 0 0 0 0
Tennis 0 0 0 0 0 0
Trampolining 0 0 0 0 0 0
Triathlon 0 0 0 0 0 0
Volleyball 0 0 0 0 0 0
Water Polo 0 0 0 0 0 0
Weightlifting 0 0 0 0 0 0
Wrestling 0 0 0 0 0 0
table(athlete_data[athlete_data$Age > 45,]$Medal, athlete_data[athlete_data$Age > 45,]$Age)
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 65 67 71
Bronze 6 4 2 2 1 2 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0
Silver 9 2 1 1 0 1 2 0 0 1 0 0 0 0 0 1 0 0 0 0 0
Gold 3 4 0 3 2 1 1 1 2 0 0 0 1 0 0 0 0 0 0 0 0
# table(athlete_data[athlete_data$Age > 45,]$Sport, athlete_data[athlete_data$Age > 45,]$Age, athlete_data[athlete_data$Age > 45,]$Medal)
Analysis
* Shooting and Equestrianism Sport have greater number of players who are older, which is acceptable for these sports.
* Not many older players won medals. Only 3 Medals won overall for Age greater than 55.
Further BMI Outlier detection w.r.t Sport, Sport + Medal, Year, BMI Category
# Detailed look at age with type of sport to make sense of Age outliers
ggplot(data = athlete_data, aes(x = Sport, y = BMI)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - BMI V/s Sport") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# Further look by faceting with Medal
# Maximum outliers in NA Medal category
ggplot(data = athlete_data, aes(x = Sport, y = BMI, fill = Medal)) +
geom_boxplot() +
theme_bw() +
# labs(title = "Outlier Detection - BMI V/s Sport - Wrap by Medal") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
facet_grid(Medal ~.)
ggplot(data = athlete_data, aes(x = paste(athlete_data$Games,"\n",athlete_data$City), y = BMI)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - BMI V/s Year/City", x = "Year-City") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
ggplot(data = athlete_data, aes(x = BMI_cat, y = BMI, fill = Medal)) +
geom_boxplot() +
theme_bw() +
labs(title = "Outlier Detection - BMI V/s BMI_category - Wrap by Medal", x = "BMI Category") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
facet_wrap(~Medal)
Analysis
* Large number of BMI outliers for Athletics, Judo, Weightlifting, Shooting and Rugby Sevens.
+ These makes sense since these specific sports generally have overweight players and (some)sport themselves have a weight category.
* Same distribution of outliers is followed for different medal types also, with NA having most outliers.
* Beijing and London Summer Olympics had a few extreme BMI outlier.
* BMI V/s BMI category outlier detection follows a normal distribution as expected.
Quick Tabulation of BMI(>55) category with Medal
# table(athlete_data[athlete_data$BMI > 55,]$Sport, athlete_data[athlete_data$BMI > 45,]$BMI_cat)
# table(athlete_data[athlete_data$BMI > 45,]$Medal, athlete_data[athlete_data$BMI > 45,]$BMI_cat)
table(athlete_data[athlete_data$BMI > 55,]$Medal, athlete_data[athlete_data$BMI > 55,]$BMI_cat)
Severely underweight Underweight Healthy Overweight
Bronze 0 0 0 0
Silver 0 0 0 0
Gold 0 0 0 0
Moderately obese Severely obese Very Severely obese
Bronze 0 0 1
Silver 0 0 0
Gold 0 0 0
Analysis
* Only one Very Severly Obese athlete won a Bronze wedal across all years of our dataset.
* Thus, based on above outlier detection we can filter the data with Age < 61 and BMI < 56 to have minimum loss of Winners, while staying true to population representation.
Based on above outlier detection and considering the relvance of age and BMI for respective sports, we subset the data with Age less than 61 and BMI less than 56 for further exploration. Overall, there is a loss of only two records with a Medal among 12045 Medal holders.
athlete_clean <- subset(athlete_data, Age < 61 & BMI < 56)
str(athlete_clean)
Classes 'tbl_df', 'tbl' and 'data.frame': 83909 obs. of 17 variables:
$ ID : num 12 13 18 23 30 36 36 53 68 68 ...
$ Name : chr "Jyri Tapani Aalto" "Minna Maarit Aalto" "Timo Antero Aaltonen" "Fritz Aanes" ...
$ Sex : Factor w/ 2 levels "F","M": 2 1 2 2 2 2 2 2 2 2 ...
$ Age : num 31 34 31 22 30 25 25 24 23 23 ...
$ Height(cms): num 172 159 189 187 189 194 194 172 178 178 ...
$ Weight(Kgs): num 70 55.5 130 89 72 78 78 58 76 76 ...
$ BMI : num 23.7 22 36.4 25.5 20.2 ...
$ Team : Factor w/ 331 levels "Afghanistan",..: 102 102 102 216 205 205 205 93 8 8 ...
$ NOC : Factor w/ 210 levels "AFG","AHO","ALB",..: 65 65 65 142 138 138 138 58 8 8 ...
$ Games : Factor w/ 9 levels "2000 Summer",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Year : Factor w/ 9 levels "2000","2002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Season : Factor w/ 2 levels "Summer","Winter": 1 1 1 1 1 1 1 1 1 1 ...
$ City : Factor w/ 9 levels "Athina","Beijing",..: 7 7 7 7 7 7 7 7 7 7 ...
$ Sport : Factor w/ 51 levels "Alpine Skiing",..: 4 33 3 51 31 41 41 44 41 41 ...
$ Event : Factor w/ 442 levels "Alpine Skiing Men's Combined",..: 63 288 37 428 262 347 351 386 347 355 ...
$ Medal : Factor w/ 3 levels "Bronze","Silver",..: NA NA NA NA NA NA NA NA NA NA ...
$ BMI_cat : Factor w/ 7 levels "Severely underweight",..: 3 3 6 4 3 3 3 3 3 3 ...
Comment
* Our dataset is now in a proper format with a (supposedly) true representation of the population.
* We now have 83,909 number of records after subsetting/filtering with respect to outlier detection.
* We can now start with Expolatory Data Analysis to derive insteresting insights.
# qplot(x = Medal, y = Age, data = athlete_data, geom = "boxplot", fill = Sex )
# qplot(x = Sport, y = Age, data = athlete_data, geom = "boxplot" )
# Using for loop to find apporopriate bin value
# for (bin_i in c(5,10,15,20,25,30,35,40)) {
#
# plot <- ggplot(data = athlete_clean, aes(x = Age, fill = BMI_cat)) +
# geom_density(stat = "bin", bins = bin_i) +
# theme_bw() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
# facet_wrap(~Sex)
#
# print(plot)
# }
ggplot(data = athlete_clean, aes(x = Age, fill = BMI_cat)) +
geom_density(stat = "bin", bins = 30) +
theme_bw() +
labs(title = "Density Plot for M/F Age Distribution wrt BMI Category") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5)) +
facet_wrap(~Sex)
# labs(title = "Outlier Detection - BMI V/s BMI_category - Wrap by Medal", x = "BMI Category") +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
ggplot(data = athlete_clean, aes(x = BMI_cat, y = Age, fill = Sex)) +
geom_violin(scale = "area") +
theme_bw() +
labs(title = "Violin plot for Age Distribution wrt BMI Category - M/F", x = "BMI Category") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
ggplot(data = athlete_clean, aes(x = BMI_cat, y = `Height(cms)`, fill = Sex)) +
geom_violin(scale = "area") +
theme_bw() +
labs(title = "Violin plot for Heights Distribution wrt BMI Category - M/F", x = "BMI Category") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
ggplot(data = athlete_clean, aes(x = BMI_cat, y = `Weight(Kgs)`, fill = Sex)) +
geom_violin(scale = "area") +
theme_bw() +
labs(title = "Violin plot for Weight Distribution wrt BMI Category - M/F", x = "BMI Category") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
ggplot(data = athlete_clean, aes(x = BMI_cat, y = BMI, fill = Sex)) +
geom_violin(scale = "area") +
theme_bw() +
labs(title = "BMI Trend V/s BMI Category - M/F", x = "BMI Category") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# ggplot(data = athlete_data, aes(x = Medal, y = Age, fill = Sex)) +
# geom_violin() +
# theme_bw()
Analysis
* Age distribution for Male and Female athletes does not follow a true normal distribution for all BMI categories.
+ Maximum athletes belong to Healthy BMI category between the age of 20 to 30.
* Violin plot for Age wrt BMI category
+ Females under the age of 20 are severly underweight.
+ Slightly Normal Distribution for Healthy BMI category for both genders.
+ Maximum Age outliers in Healthy and Overweight BMI Category.
* Violin plot for Heights wrt BMI category
+ More number of males with more height under Severly Underweight category.
+ Males are taller than Females across the distribution.
+ Almost equal Height distribution for both genders.
* Violin plot for Weight and BMI category.
+ There is an increasing trend in weights distribution with each incresing level of BMI category, as expected.
* Same trend is followed for the plot of BMI V/s BMI Category, since there are corelated to each other.
# Total Medals Won by Sex
athlete_clean %>% filter(!is.na(Medal)) %>%
group_by(Medal, Sex) %>%
summarize(count = n()) %>%
mutate(percentage = round(count/sum(count) * 100,2)) %>%
ggplot(aes(x = Sex, y = count, fill = Medal)) +
geom_bar(stat = "identity") +
theme_bw() +
facet_wrap(~Medal) +
labs(title = "Total Medals Won by Sex") +
geom_text(aes(label = paste(percentage,"%")), vjust = 1.5) +
theme(plot.title = element_text(hjust = 0.5))
# table(athlete_clean$Sex, athlete_clean$Season, athlete_clean$Medal)
# Total Medals Won by Season and Sex
athlete_clean %>% filter(!is.na(Medal)) %>%
group_by(Season, Medal, Sex) %>%
summarize(count = n()) %>%
mutate(percentage = round(count/sum(count) * 100,2)) %>%
ggplot(aes(x = Sex, y = count, fill = Season)) +
geom_bar(stat = "identity", position = "stack") +
theme_bw() +
facet_wrap(~Medal) +
labs(title = "Total Medals Won by Season and Sex") +
geom_text(aes(label = paste(percentage,"%")), vjust = 1.5) +
theme(plot.title = element_text(hjust = 0.5))
# Medals Proportion Distribution Won by Games
athlete_clean %>% filter(!is.na(Medal)) %>%
group_by(Games, Medal) %>%
summarize(count = n()) %>%
mutate(percentage = round(count/sum(count) * 100,2)) %>%
ggplot(aes(x = Medal, y = count, fill = Medal)) +
geom_bar(stat = "identity", position = "stack") +
theme_bw() +
facet_wrap(~Games) +
labs(title = "Proportion of Medals Won by Games") +
geom_text(aes(label = paste(percentage,"%")), vjust = 1.1, hjust = 0.5, size = 3)+ #position = position_jitter(width = 0, height = 0.9)) +
theme(plot.title = element_text(hjust = 0.5))
# Medals Proportion Distribution Won by Games and Sex
athlete_clean %>% filter(!is.na(Medal), Season == "Summer") %>%
group_by(Games,Medal,Sex) %>%
summarize(count = n()) %>%
mutate(percentage = round(count/sum(count) * 100,2)) %>%
ggplot(aes(x = Medal, y = count, fill = Sex)) +
geom_bar(stat = "identity", position = "stack") +
theme_bw() +
facet_wrap(~Games) +
labs(title = "Proportion of Medals Won by Games and Sex - SUMMER") +
geom_text(aes(label = paste(percentage,"%")), size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
#position = position_jitter(width = 0, height = 0)) +
theme(plot.title = element_text(hjust = 0.5))
athlete_clean %>% filter(!is.na(Medal), Season == "Winter") %>%
group_by(Games,Medal,Sex) %>%
summarize(count = n()) %>%
mutate(percentage = round(count/sum(count) * 100,2)) %>%
ggplot(aes(x = Medal, y = count, fill = Sex)) +
geom_bar(stat = "identity", position = "stack") +
theme_bw() +
facet_wrap(~Games) +
labs(title = "Proportion of Medals Won by Games and Sex - WINTER") +
geom_text(aes(label = paste(percentage,"%")), size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
#position = position_jitter(width = 0, height = 0)) +
theme(plot.title = element_text(hjust = 0.5))
Analysis
* Male athletes won more number(or proportion) of medals than females, about 55% on average for all medal caegories.
* Same trend is followed for both the seasons, i.e., Summer and Winter games.
+ More number of participitants in Summer games than in Winter.
* Equal proportion(33.3%) distribution of Gold, Silver and Bronze Medals across all the games - from 2000 Summer to 2016 Summer.
* Female participitants performance and Wins increased in the last games of 2016 Summer.
* Similar performance of both Male and Female participitants for all Winter games.
# str(athlete_clean)
# Base
# athlete_clean %>% group_by(Team,Sex) %>%
# summarize(No_of_athlete = n()) %>%
# arrange(-No_of_athlete) %>%
# mutate(percentage = round(No_of_athlete/sum(No_of_athlete) * 100,0)) %>%
# filter(No_of_athlete > 1000 ) %>%
# ggplot(aes(x = Team, y = No_of_athlete, fill = Sex)) +
# geom_bar(stat = "identity") +
# theme_bw() +
# # facet_wrap(~Medal) +
# labs(title = "No of participants(>1000) from Teams") +
# geom_text(aes(label = paste(percentage,"%")), size = 3, check_overlap = F, position = position_stack(vjust = 0.5) ) +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
np_team <- athlete_clean %>% group_by(Team) %>% summarize(Total_athletes_team = n())
np_team_sex <- athlete_clean %>% group_by(Team,Sex) %>% summarize(No_of_athlete_ts = n()) %>%
mutate(percentage_ts = round(No_of_athlete_ts/sum(No_of_athlete_ts) * 100,0))
np_total <- left_join(np_team, np_team_sex, by= "Team" )
# No of participants(>1000) for Teams
np_total %>% filter(Total_athletes_team > 1000 ) %>%
ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = Sex)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of participants(>1000) for Teams", y = "Number of Athletes", x = "Teams") +
geom_text(aes(label = paste(percentage_ts,"\n%",sep = "")), size = 3, check_overlap = F, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# No of participants(>1000) from Teams - MALE
np_total %>% filter(Sex == "M", No_of_athlete_ts > 1000 ) %>%
ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of MALE participants(>1000) for Teams", y = "Number of Athletes", x = "Teams") +
geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F, position = position_stack(vjust = 1.1) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# No of participants(>1000) from Teams - FEMALE
np_total %>% filter(Sex == "F", No_of_athlete_ts > 1000 ) %>%
ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of FEMALE participants(>1000) for Teams", y = "Number of Athletes", x = "Teams") +
geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F, position = position_stack(vjust = 1.1) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# Top 10 Teams with highest participation
np_total %>% arrange(-Total_athletes_team) %>% top_n(20, Total_athletes_team) %>%
ggplot(aes(x = reorder(Team,-No_of_athlete_ts), y = No_of_athlete_ts, fill = Sex)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 Teams with Highest participation", y = "Number of Athletes", x = "Teams") +
geom_text(aes(label = paste(percentage_ts,"\n%",sep = "")), size = 3, check_overlap = F, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# Top 10 Teams with highest participation - MALE
np_total %>% filter(Sex == "M") %>% arrange(-No_of_athlete_ts) %>% top_n(10, No_of_athlete_ts) %>%
ggplot(aes(x = reorder(Team,No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 Teams with highest MALE participation", y = "Number of Athletes", x = "Teams") +
geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F, position = position_stack(vjust = 1.1) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.2)) +
coord_flip()
# Top 10 Teams with highest participation - FEMALE
np_total %>% filter(Sex == "F") %>% arrange(-No_of_athlete_ts) %>% top_n(10, No_of_athlete_ts) %>%
ggplot(aes(x = reorder(Team,No_of_athlete_ts), y = No_of_athlete_ts, fill = No_of_athlete_ts)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 Teams with highest FEMALE participation", y = "Number of Athletes", x = "Teams") +
geom_text(aes(label = paste(No_of_athlete_ts)), size = 3, check_overlap = F, position = position_stack(vjust = 1.1) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
coord_flip()
Analysis
* Plot - No of participitants (>1000) for Teams + 26 out of 331 Total Teams have more than 1000 participitants.
+ China has more Female players than Males.
+ Italy, France, Austria, Kazakhstan, Switzerland, Spain and Germany have more Male players than Females.
* Plots - No of participitants (>1000) for Teams - MALE and FEMALE + United States has the maximum number of Male and Female participitants, followed by Russian and Germany.
+ Only 14 teams have more than 1000 Male participitants.
+ Only 10 teams have more than 1000 Female participitants.
+ Top 3 Teams with Male players (>1000) are United States, Germany and Russia.
+ Top 3 Teams with FeMale players (>1000) are United States, Russia and China.
* Plots - Top 10 Teams with highest MALE and FEMALE participation
+ China was last in this list for Male players, whereas third country for Female participitation.
+ France at 4 position for Male and at 9 position for Female participitation.
+ Canade at 7 position for Male and at 5 position for Female participitation.
np_sport <- athlete_clean %>% group_by(Sport) %>% summarize(Total_athletes_sport = n())
np_sport_sex <- athlete_clean %>% group_by(Sport,Sex) %>% summarize(No_of_athlete_ss = n()) %>%
mutate(percentage_ss = round(No_of_athlete_ss/sum(No_of_athlete_ss) * 100,0))
np_total_sport <- left_join(np_sport, np_sport_sex, by= "Sport" )
# No of participants(>1000) from Teams
np_total_sport %>% filter(Total_athletes_sport > 1000 ) %>%
ggplot(aes(x = reorder(Sport,-No_of_athlete_ss), y = No_of_athlete_ss, fill = Sex)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of participants(>1000) for Sports", y = "Number of Athletes", x="Sport") +
geom_text(aes(label = paste(percentage_ss,sep = "")), size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
np_total_sport %>% filter(Sex == "M", No_of_athlete_ss > 1000 ) %>%
ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of MALE participants(>1000) for Sports", y = "Number of Athletes", x="Sport") +
geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F, position = position_stack(vjust = 1.20) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
np_total_sport %>% filter(Sex == "F", No_of_athlete_ss > 1000 ) %>%
ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of FEMALE participants(>1000) for Sports", y = "Number of Athletes", x="Sport") +
geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F, position = position_stack(vjust = 1.20) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
# plot_grid(p1,p2)
# Top 10 Sports
np_total_sport %>% filter(Total_athletes_sport > 1000 ) %>% arrange(-Total_athletes_sport) %>% top_n(20, Total_athletes_sport) %>%
ggplot(aes(x = reorder(Sport,-No_of_athlete_ss), y = No_of_athlete_ss, fill = Sex)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 - Popular Sports", y = "Number of Athletes", x="Sport") +
geom_text(aes(label = paste(percentage_ss,"%",sep = "")), size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# Top 10 Sports - MEN
np_total_sport %>% filter(Sex == "M") %>% arrange(-No_of_athlete_ss) %>% top_n(10, No_of_athlete_ss) %>%
ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 - Popular Sports - Men", y = "Number of Athletes", x="Sport") +
geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F, position = position_stack(vjust = 1.20) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
# Top 10 Sports - WOMEN
np_total_sport %>% filter(Sex == "F") %>% arrange(-No_of_athlete_ss) %>% top_n(10, No_of_athlete_ss) %>%
ggplot(aes(x = reorder(Sport,No_of_athlete_ss), y = No_of_athlete_ss, fill = No_of_athlete_ss)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 - Popular Sports - Female", y = "Number of Athletes", x="Sport") +
geom_text(aes(label = paste(No_of_athlete_ss)), size = 3, check_overlap = F, position = position_stack(vjust = 1.20) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
Analysis
* Plot - No of participitants (>1000) for SPORTS
+ Athletics, Swimming and Gymnastics have the highest participitation.
+ Boxing has 95% Male participitants.
+ Archery and Short Track Speed Skating have lowest participitation. (for players>1000)
+ More Male participitants than Females for Boxing, Cycling, Gymnastics, Shooting, Rowing, Football, Ice Hockey and Equestrianism sports.
* Plots - No of participitants (>1000) for SPORTS - MALE and FEMALE
+ 17 Sports with more than 1000 Male participitants.
+ Only 8 Sports with more than 1000 Female participitants.
* Plots - Top 10 SPORTS with highest MALE and FEMALE participation
+ Cross Country Skiing and Alpine Skiing are more popular sports for Females than Males.
+ Canoeing and Wrestling sports are popular among Men but same is not true for Females.
np_event <- athlete_clean %>% group_by(Event) %>% summarize(Total_athletes_event = n())
np_event_sex <- athlete_clean %>% group_by(Event,Sex) %>% summarize(No_of_athlete_es = n()) %>%
mutate(percentage_es = round(No_of_athlete_es/sum(No_of_athlete_es) * 100,0))
np_total_event <- left_join(np_event, np_event_sex, by= "Event" )
# Top 10 Events Participation - MEN
np_total_event %>% filter(Sex == "M") %>% arrange(-No_of_athlete_es) %>% top_n(10, No_of_athlete_es) %>%
ggplot(aes(x = reorder(Event,No_of_athlete_es), y = No_of_athlete_es, fill = No_of_athlete_es)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 - Events - Men", y = "Number of Athletes", x="Events") +
geom_text(aes(label = paste(No_of_athlete_es)), size = 3, check_overlap = F, position = position_stack(vjust = 1.20) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
# Top 10 Events Participation - WOMEN
np_total_event %>% filter(Sex == "F") %>% arrange(-No_of_athlete_es) %>% top_n(10, No_of_athlete_es) %>%
ggplot(aes(x = reorder(Event,No_of_athlete_es), y = No_of_athlete_es, fill = No_of_athlete_es)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 - Events - Female", y = "Number of Athletes", x="Events") +
geom_text(aes(label = paste(No_of_athlete_es)), size = 3, check_overlap = F, position = position_stack(vjust = 1.20) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
Analysis
* Plots - Top 10 EVENTS with highest MALE and FEMALE participation
+ Football, Water polo and Ice Hockey events are more popular among Men than Women.
+ Hockey, Athletics and Volleyball events are more popular among Women than Men.
np_city <- athlete_clean %>% group_by(Year,City) %>% summarize(Total_athletes_city = n())
np_city_sex <- athlete_clean %>% group_by(Year,City,Sex) %>% summarize(No_of_athlete_es = n()) %>%
mutate(percentage_es = round(No_of_athlete_es/sum(No_of_athlete_es) * 100,0))
np_total_city <- left_join(np_city, np_city_sex, by= c("Year","City"))
# np_total_city %>% arrange(-Total_athletes_city) %>% #top_n(20, Total_athletes_city) %>%
# ggplot(aes(x = paste(City,"\n",Year), y = No_of_athlete_es, fill = Sex)) +
# geom_bar(stat = "identity") +
# theme_bw() +
# # facet_wrap(~Medal) +
# labs(title = "No of Participants in City", y = "Number of Athletes") +
# geom_text(aes(label = paste(percentage_es,"%\n",No_of_athlete_es, sep="")), size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
np_total_city %>% arrange(-Total_athletes_city) %>% #top_n(20, Total_athletes_city) %>%
ggplot(aes(x = reorder(paste(City,"\n",Year),No_of_athlete_es), y = No_of_athlete_es, fill = Sex)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "No of Participants in City", y = "Number of Athletes", x = "City and Year") +
geom_text(aes(label = paste(percentage_es,"%\n",No_of_athlete_es, sep="")), size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
Analysis
* Almost one third participitation for Winter games than Summer games.
* More proportion of Female participitants for Winter games than Summer games.
* Higher participitation by male athletes than Female athletes.
* sydney 2000 had the highest participitation whereas Salt Lake City 2002 had the lowest.
nm_medal <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport) %>% summarize(Total_sport_medal = n()) %>%
mutate(Total_sport_medal_perc = round((Total_sport_medal/sum(Total_sport_medal)*100),2))
nm_sport <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport,Medal) %>% summarize(Total_medal_type_sport = n()) %>%
mutate(Total_medal_type_sport_perc = round((Total_medal_type_sport/sum(Total_medal_type_sport)*100),2))
nm_sport_sex <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport,Medal,Sex) %>% summarize(No_of_athlete_ms = n()) %>%
mutate(percentage_ms = round(No_of_athlete_ms/sum(No_of_athlete_ms) * 100,0))
nm_medal_plus_sport <- left_join(nm_medal, nm_sport, by="Sport")
nm_total_sport_sex <- left_join(nm_medal_plus_sport, nm_sport_sex, by= c("Sport","Medal"))
# Top 10 Sports by Medals Won
nm_medal %>% arrange(-Total_sport_medal) %>% top_n(10, Total_sport_medal) %>%
ggplot(aes(x = reorder(Sport,Total_sport_medal), y = Total_sport_medal, fill = Total_sport_medal)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 Sports by Medals Won", y = "Number of Athletes", x = "Sport") +
geom_text(aes(label = paste(Total_sport_medal_perc,"%",sep = "")), size = 3, check_overlap = T, position = position_stack(vjust = 1.12) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
coord_flip()
# Top 10 Sports by Medal Types Won
nm_medal_plus_sport %>% arrange(-Total_sport_medal) %>% top_n(30, Total_sport_medal) %>%
ggplot(aes(x = reorder(Sport,-Total_sport_medal), y = Total_medal_type_sport, fill = Medal)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 Sports by Total Medal(Types) Won", y = "Number of Athletes", x = "Sport") +
geom_text(aes(label = paste(Total_medal_type_sport_perc,"%",sep = "")), size = 3, check_overlap = T, position = position_stack(vjust = 1.12) ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5))
# Top 10 Sports by Medal Types Won
# Version 2
nm_medal_plus_sport %>% arrange(-Total_sport_medal) %>% top_n(30, Total_sport_medal) %>%
ggplot(aes(x = reorder(Sport,Total_sport_medal), y = Total_medal_type_sport, fill = Medal)) +
geom_bar(stat = "identity") +
theme_bw() +
facet_wrap(~Medal) +
labs(title = "Top 10 Sports by Total Medal(Types) Won_V2", y = "Number of Athletes", x = "Sport") +
geom_text(aes(label = paste(round(Total_medal_type_sport_perc,0),"%",sep = "")), size = 3, check_overlap = T,
position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
coord_flip()
# Top 10 Sports by Medal Types Won
nm_medal_plus_sport %>% arrange(-Total_medal_type_sport) %>% top_n(10, Total_medal_type_sport) %>%
ggplot(aes(x = reorder(paste(Sport,"\n",Medal,sep=""),Total_medal_type_sport), y = Total_medal_type_sport, fill = Medal)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 Sports by Medal(Types) Won", y = "Number of Athletes", x = "Sports + Medal type") +
geom_text(aes(label = paste(round((Total_medal_type_sport/sum(Total_sport_medal))*100,0),"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
coord_flip()
# Top 10 MALE Medal Winners Category
nm_total_sport_sex %>% filter(Sex == "M") %>% arrange(-No_of_athlete_ms) %>% top_n(10, No_of_athlete_ms) %>%
ggplot(aes(x = reorder(paste(Sport,"\n",Medal,sep=""),No_of_athlete_ms), y = No_of_athlete_ms, fill = Medal)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 MALE Medal Winners Category", y = "Number of Athletes", x = "Sports + Medal type") +
geom_text(aes(label = paste(round((No_of_athlete_ms/sum(Total_sport_medal))*100,0),"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
coord_flip()
# Top 10 FEMALE Medal Winners Category
nm_total_sport_sex %>% filter(Sex == "F") %>% arrange(-No_of_athlete_ms) %>% top_n(10, No_of_athlete_ms) %>%
ggplot(aes(x = reorder(paste(Sport,"\n",Medal,sep=""),No_of_athlete_ms), y = No_of_athlete_ms, fill = Medal)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "Top 10 FEMALE Medal Winners Category", y = "Number of Athletes", x = "Sports + Medal type") +
geom_text(aes(label = paste(round((No_of_athlete_ms/sum(Total_sport_medal))*100,0),"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5)) +
coord_flip()
Analysis
* Swimminng, Athletics and Rowing are the top three sports with most medals won.
* Equal proportion of Medals won among Gold, Silver and Bronze categories.
* Swimming Gold and Athletics Gold are the top two male medal winning sports category.
* Swimming Gold and Swimming Silver are the top two Female medal winning sports category.
* Football Gold and Football Bronze medals won by Female players but not by Male players, in Top 10.
# count_BMI_team
# athlete_clean %>% group_by(Year,Season, Sport) %>% summarize(Total_player = n())
for (bin_i in levels(athlete_clean$Games)) {
plot_sport_games <- athlete_clean %>% group_by(Games, Sport) %>% summarize(Total_player = n()) %>%
filter(Games == bin_i) %>%
mutate(percentage_cbmiT = round(Total_player/sum(Total_player) * 100,2)) %>%
arrange(-Total_player) %>% top_n(10,Total_player) %>%
ggplot(aes(x = reorder(Sport,Total_player), y = Total_player, fill = Sport)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Sports Participitation in \n",bin_i,sep=""), y = "Number of Athletes", x="Sports") +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.5) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
print(plot_sport_games)
# gridExtra::grid.arrange(plot_summer_sport, plot_winter_sport, ncol = 2)
}
Analysis
* Summer Sports - Top 10 Sports that saw maximum participitation
+ Athletics, Swimming, Gymnastics and Cycling were the top four sports throughout.
+ Number of participitants for Gymnastics reduced over time.
+ Shooting had alternate rank movement over time.
+ Sailing popularity reduced over time.
* Winter Sports - Top 10 Sports that saw maximum participitation
+ Cross Country Skiing, Biathlon, Alpine Skiing, Ice Hockey and Speed Skating were constantly the top five sports.
+ Bobsleigh gained and lost popularity alternatively.
+ Ski jumping had random popularity rank movement in terms of the number of participitants.
np_bmi <- athlete_clean %>% group_by(BMI_cat) %>% summarize(Count_BMI = n())
np_bmi_team <- athlete_clean %>% filter(BMI_cat == "Healthy") %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>%
mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2))
# np_bmi_team_city <- left_join(np_city, np_city_sex, by= c("Year","City"))
# BMI Count
np_bmi %>%
ggplot(aes(x = BMI_cat, y = Count_BMI, fill = BMI_cat)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = "BMI Count", y = "Number of Athletes", x = "BMI Category") +
geom_text(aes(label = paste(round((Count_BMI/sum(Count_BMI))*100,2),"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
Analysis
* About 78% participitants are under Healthy BMI category. * 16% of participitants are Overweight.
# Using for loop to find apporopriate bin value
# for (bin_i in levels(athlete_clean$BMI_cat)) {
#
# plot_BMI <- athlete_clean %>% filter(BMI_cat == bin_i) %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>%
# mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>%
# arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>%
# ggplot(aes(x = Team, y = count_BMI_team, fill = Team)) +
# geom_bar(stat = "identity") +
# theme_bw() +
# # facet_wrap(~Medal) +
# labs(title = paste("Top 10 Participating Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes",
# x = paste("Teams - ",bin_i," BMI Category", sep="")) +
# geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
# size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
# theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
# coord_flip()
#
# print(plot_BMI)
# }
for (bin_i in levels(athlete_clean$BMI_cat)) {
plot_BMI <- athlete_clean %>% filter(BMI_cat == bin_i) %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>%
mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>%
arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>%
ggplot(aes(x = reorder(Team,count_BMI_team), y = count_BMI_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Participating Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i," BMI Category", sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_BMI_Medal <- athlete_clean %>%
filter(!is.na(Medal), BMI_cat == bin_i) %>%
group_by(BMI_cat,Team) %>%
summarize(count_BMI_team = n()) %>%
mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>%
arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>%
ggplot(aes(x = reorder(Team,count_BMI_team), y = count_BMI_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Winning Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i," BMI Category", sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
gridExtra::grid.arrange(plot_BMI, plot_BMI_Medal, ncol = 2)
}
Analysis
* United States has the most number of participitants and Wins across (almost) all BMI categories.
* Severely Underweight BMI
+ Spain had the highest participitation but China had the maximum wins.
+ Ukraine did not win any medal even it third highest participitation from this BMI category.
+ Only Seven Teams won medals in this BMI category.
* Underweight BMI
+ Russia and China were the top two teams with maximum participitation and Wins.
+ Japan and Australia did not made it to Top 10 winning ranks even though it had third and fourth highest participitations respectively.
* Healthy BMI
+ USA, Germany and Russia lead the charts here.
+ Australian team ranked fourth even though it was the seventh team in participitation ranking.
* Overweight BMI
+ Poland and Czech Republic did not make it to the Top 10 winning teams.
+ Sweden stood at seventh rank with wins contrast to the observation it was not present in Top 10 participitation ranks.
+ United States and Canada were among top two teams for both participitation and wins.
* Moderately Obese BMI
+ The number of wins for Russia and Poland increased than their participitation proportion/number.
+ Austalia did not make it to the winning ranks here.
* Severely Obese BMI
+ Russia, China, Turkey, Japan and Belarus won the most number of medals, even though they had relevantly less participitation.
+ Proportion of participitation and wins were equal for United States.
+ Australia did not make it to the wining list here.
+ INDIA made it to both the rankings here.
* Very Severely Obese BMI
+ United States topped the charts for both.
+ China and Iran won more proportion of medals than their participitation proportions.
# Base code
# for (bin_i in levels(athlete_clean$Sport)) {
#
# plot_Sport <- athlete_clean %>% filter(Sport == bin_i) %>% group_by(Sport,Team) %>% summarize(count_BMI_team = n()) %>%
# mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>%
# arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>%
# ggplot(aes(x = Team, y = count_BMI_team, fill = Team)) +
# geom_bar(stat = "identity") +
# theme_bw() +
# # facet_wrap(~Medal) +
# labs(title = paste("Top 10 Teams in ",bin_i," Sports",sep=""), y = "Number of Athletes",
# x = paste("Teams - ",bin_i, sep="")) +
# geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
# size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
# theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
# coord_flip()
#
# print(plot_Sport)
# }
# Efficient Modified Code V2
# for (bin_i in levels(athlete_clean$Sport)) {
#
# plot_Sport <- athlete_clean %>%
# filter(Sport == bin_i) %>%
# group_by(Team) %>%
# summarize(count_team = n()) %>%
# mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
# arrange(-count_team) %>%
# top_n(10,count_team) %>%
# ggplot(aes(x = Team, y = count_team, fill = Team)) +
# geom_bar(stat = "identity") +
# theme_bw() +
# # facet_wrap(~Medal) +
# labs(title = paste("Top 10 Participating Teams in ",bin_i," Sports",sep=""), y = "Number of Athletes",
# x = paste("Teams - ",bin_i, sep="")) +
# geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
# size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
# theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
# coord_flip()
#
# print(plot_Sport)
# }
# Top 10 Teams participitation V/s Win across sports and Games/Year
plot_Team_p <- athlete_clean %>% group_by(Team) %>% summarize(Count = n()) %>%
mutate(perc = round((Count/sum(Count))*100,2)) %>% arrange(-Count) %>% top_n(10,Count) %>%
ggplot(aes(x = reorder(Team,Count), y = Count, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
labs(title = paste("Top 10 Participating Teams \n Overall"), y = "Number of Athletes", x = "Team") +
geom_text(aes(label = paste(perc,"% ","(",Count,")",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_Team_w <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Team) %>% summarize(Count = n()) %>%
mutate(perc = round((Count/sum(Count))*100,2)) %>% arrange(-Count) %>% top_n(10,Count) %>%
ggplot(aes(x = reorder(Team,Count), y = Count, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
labs(title = paste("Top 10 Winning Teams \n Overall"), y = "Number of Athletes", x = "Team") +
geom_text(aes(label = paste(perc,"% ","(",Count,")",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
gridExtra::grid.arrange(plot_Team_p, plot_Team_w, ncol = 2)
Analysis
* Top 5 Teams are constant for both the charts and their proportionate wins increased with respect to their participitation.
* Japan did not made it to the Top 10 Wins ranking. It’s position in Top 10 list was replaced by Netherlands.
# Modified Code V3 with comparission
for (bin_i in levels(athlete_clean$Sport)) {
plot_Sport <- athlete_clean %>%
filter(Sport == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Participating Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_Sport_Medal <- athlete_clean %>%
filter(!is.na(Medal), Sport == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Winning Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
# print(plot_Sport)
# cowplot::plot_grid(plot_Sport, plot_Sport_Medal, ncol = 2, align = "hv", axis = "t")
gridExtra::grid.arrange(plot_Sport, plot_Sport_Medal, ncol = 2)
}
Analysis - Top 10 Teams Participitating Vs Winning for all Sports Individually
* Participitants from United States, Sweden, SOuth Korea and China only have won medals for Golf repeatedly.
* Participitants from United States, Australie, and Japan only have won medals for Softball repeatedly.
* Participitants from Cuba, United States, South Korea, Japan and Australia only have won medals for Baseball repeatedly.
* Participitants from China were most awarded athletes for Diving Sports.
* Participitants from Canada were most awarded athletes for Curling Sports.
* Participitants from Germany and Canoeing were most awarded athletes for Canoeing Sports.
* Participitants from Cuba and Russia were most awarded athletes for Boxing. * Participitants from Cuba were most awarded players for Baseball.
* Participitants from Austria were most awarded athletes for Alpine Skiing Sports.
* Participitants from South Korea were most awarded athletes for Archery.
* Participitants from United States were most awarded players for Athletics and Basketball.
* Participitants from China and South Korea-1 were most awarded players for Badminton.
* Participitants from Brazil-1 were most awarded players for Beach Volleyball.
* Even though Germany had lower participitation for Biathlon, they were the most awarded.
* Participitants from United States-1 and Germany-1 were the most awarded players for Bobsleigh sports.
* Participitants from United States-1, Russia and Canada were the most awarded players for Figure Skating.
* Participitants from United States, Great Britain and Germany were the most awarded players for Equestrianism sports.
* Participitants from Great Britain were the most awarded players for Cycling.
* Participitants from Russia, Japan, China and Spain were constatly the most awarded players for Synchronized Swimming.
* Participitants from United States and Australia were the most awarded players for Swimming.
* Participitants from Russia were the most awarded players for Wrestling and Rhythmic Gymnastics.
* Participitants from China and Canada were the most awarded players for Trampolining.
* Participitants from China were the most awarded players for Table Tennis.
* Participitants from Great Britain, Switzerland and Australia were the most awarded players for Triathlons.
* Participitants from Netherlands were the most awarded players for Speed Skating.
* Participitants from Great Britain and Australia were the most awarded players for Sailing.
* Participitants from Japan were the most awarded players for Judo.
* Participitants from Great Britain, Russia and Lithuania were the most awarded players for Modern Pentathlon.
* Participitants from Germany, Austria and Finland were the most awarded players for Nordic Combined Sports.
for (bin_i in levels(athlete_clean$Games)) {
plot_games <- athlete_clean %>%
filter(Games == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Participating Teams in \n",bin_i,sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_games_Medal <- athlete_clean %>%
filter(!is.na(Medal), Games == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Winning Teams in \n",bin_i,sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
gridExtra::grid.arrange(plot_games, plot_games_Medal, ncol = 2)
}
Analysis - Top 10 Participitating V/s Winning teams across Games/Year
* Summer Games
+ United States was the top team with maximum wins across the years. It won the most in 2008 Summer.
+ Germany jumped to second spot for winning list in 2016 from its position of mid three in the previous years.
+ Cuba did not make it to top 10 winning list for the last three summer games.
+ Russia’s winning poportion detoriated with time.
+ Canada made it to top 10 winnig list only in the last year of 2016 Summer games.
+ Great Britain’s performance/wins increased substantially from 2008 summer games onwards.
* Winter Games
+ Canadian team won the most medals across time.
+ United States was among the top three constantly.
+ Russian teams win percentage detoriated over time before finally bouncing back to top three in 2014 Winter games.
+ Italy’s performance/wins detoriated over time even though they were constantly in top 10 participitating teams.
for (bin_i in levels(athlete_clean$Season)) {
plot_games <- athlete_clean %>%
filter(Season == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Participating Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_games_Medal <- athlete_clean %>%
filter(!is.na(Medal), Season == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Winning Teams in \n",bin_i," Sports",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
gridExtra::grid.arrange(plot_games, plot_games_Medal, ncol = 2)
}
Analysis - Overall Top 10 Participitating V/s Winning teams
* United States, Australia, and Russia were the top three teams in terms of participitation and wins for Summer sports.
* Canada did not perform well in Summer sports but was the top team in Winter sports.
* Italy and China performed well in Summer sports but did not make the cut for Winter sports.
* Even though Japan, Czech Republic and Sweden were among the top ten participitating teams for Winter Sports -
+ Only Sweden made it to the top four(or ten) winning teams.
+ Among them, Japan was the only team making the list of top 10 for Summer Sports for both participitation and Wins.
# LOOP MODIFICATION 4
# Adding sex grouping to above plots
for (bin_i in levels(athlete_clean$Season)) {
plot_games_M <- athlete_clean %>%
filter(Season == bin_i, Sex == "M") %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 M playing Teams in \n",bin_i,sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_games_F <- athlete_clean %>%
filter(Season == bin_i, Sex == "F") %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 F playing Teams in \n",bin_i,sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_games_Medal_M <- athlete_clean %>%
filter(!is.na(Medal), Sex == "M", Season == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 M Winning Teams in \n",bin_i,sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_games_Medal_F <- athlete_clean %>%
filter(!is.na(Medal), Sex == "F", Season == bin_i) %>%
group_by(Team) %>%
summarize(count_team = n()) %>%
mutate(percentage_cbmiT = round(count_team/sum(count_team) * 100,2)) %>%
arrange(-count_team) %>%
top_n(10,count_team) %>%
ggplot(aes(x = reorder(Team,count_team), y = count_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 F Winning Teams in \n",bin_i,sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i, sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
gridExtra::grid.arrange(plot_games_M, plot_games_Medal_M,
plot_games_F, plot_games_Medal_F,
ncol = 2, nrow = 2)
}
Analysis - Overall Top 10 Participitating V/s Winning teams Wraped by Sex
* Summer Sports
+ United State Teams for both Male and Female performed best or won the maximum number of medals.
+ Brazil, South Korea, Italy and France Male Team made it to the top 10 wining list but not their female team.
+ Netherlands, Japan, Canada and Romania Female teams performed better than their male counterparts.
* Winter Sports
+ Canadian team was the best performer for Winter sports and their Female team further performed better than their Male team.
+ China, and South Korea female teams performed better than their Male teams.
+ Finland’s Male team performed way better than their Female teams.
+ Switzerland, Germany and Swedens’ Female teams performed better than their Male teams.
# athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Games,Sex,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win)
# athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Sport,Sex,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win)
# Male Across Olympics
athlete_clean %>% filter(!is.na(Medal), Sex=="M") %>% group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>%
head(10) %>%
ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Male Winners across Years"), y = "Number of Medals", x = "Athlete's Name") +
geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
# FeMale Across Olympics
athlete_clean %>% filter(!is.na(Medal), Sex=="F") %>% group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>%
head(10) %>%
ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Female Winners across Years"), y = "Number of Medals", x = "Athlete's Name") +
geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
Analysis - Top 10 Athlete’s to Win - Male and Female
* Male Athletes
+ Michael Phelps(Swimming) won the most number of medals across all sports categories.
+ Usian Bolt won a total 8 medals for Athelics between 2000 to 2016 games.
+ Ole Einar won a total of 11 medals for Biathlon between 2000 to 2016 games.
* Female Athletes
+ Natalie Anne won the maximum number of medals(12) for Swimming among all participitants.
+ Marit Bjrgen won the second most number of medals(10) for Cross Country Skiing among all.
for (bin_i in levels(athlete_clean$Games)) {
plot_M_year <- athlete_clean %>% filter(!is.na(Medal), Sex=="M", Games == bin_i) %>%
group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>%
head(10) %>%
ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Male Winners in",bin_i), y = "Number of Medals", x = "Athlete's Name") +
geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot(plot_M_year)
}
for (bin_i in levels(athlete_clean$Games)) {
plot_F_year <-athlete_clean %>% filter(!is.na(Medal), Sex=="F", Games == bin_i) %>%
group_by(Sport,Name) %>% summarize(P_Medal_win = n()) %>% arrange(-P_Medal_win) %>%
head(10) %>%
ggplot(aes(x = reorder(paste(Name,"\n",Sport,sep = ""),P_Medal_win), y = P_Medal_win, fill = Name)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("T10 Female Winners in",bin_i), y = "Number of Medals", x = "Athlete's Name") +
geom_text(aes(label = P_Medal_win), size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
# gridExtra::grid.arrange(plot_M_year, plot_F_year, ncol = 2)
plot(plot_F_year)
}
Analysis - Top 10 Athlete’s to Win - Male and Female - Summers and Winter Games
* Summer Games - Male Winners
+ Michael Phelps(Swimming) was the top performer since 2004 with almost twice as many medals than other competitors across all the sports.
+ Top performer for 2000 summer Aleksey was his last game where he won 6 medals.
+ Most athletes in top ten won atmost three medals.
* Summer Games - Female Winners
+ Athletes from Swimming and Gymnastics made it top 10 winning list among all participitants for all sports.
+ Maximum number of medals won by any one athlete across the years was five.
+ Almost no consistent trend for a paticular player for games between 2000 to 2016.
* Winter Games - Male Winners
+ Most of the top ten athletes won approximately three medals each year.
+ Victor An(Short Track Speed Skating) held the top spot with 4 wins for years 2006 and 2014.
+ Top three positions were held by athelete’s for the sport of Skating, Skiing and Biathlon across years 2000 to 2016.
* Winter Games - Female Winners
+ The winning list was dominated by athletes from Speed Skating and Alpine Skiing sports for years 2000 to 2016.
+ Most of the participitants in the list won approximately three medals each.
+ The top three positions were held by different athletes each year.
athlete_clean %>% group_by(Year,Season) %>%
summarize(P_Count = n()) %>%
ggplot(aes(x = Year, y = P_Count, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("Participitation Trend Line"), y = "Number of Athletes") +
geom_text(aes(label = P_Count), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
athlete_clean %>% group_by(Year,Season) %>% filter(Sex=="M") %>%
summarize(P_Count = n()) %>%
ggplot(aes(x = Year, y = P_Count, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("Male Participitation Trend Line"), y = "Number of Athletes") +
geom_text(aes(label = P_Count), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
athlete_clean %>% group_by(Year,Season) %>% filter(Sex=="F") %>%
summarize(P_Count = n()) %>%
ggplot(aes(x = Year, y = P_Count, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("Female Participitation Trend Line"), y = "Number of Athletes") +
geom_text(aes(label = P_Count), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
Analysis - Participitation Trend
* There was a drop in the number of all participitants in Summer sports for the year 2012.
* Winter sports had a gradually increasing trend line.
* Male Summer participitation dropped gradually while the Female participitation increased over the years.
* Male and Female Winter participitation had a constant increasing trend line.
gMale <- athlete_clean %>% filter(Sex == "M") %>% group_by(Year,Season) %>%
summarize(M_Count = n())
gFemale <- athlete_clean %>% filter(Sex == "F") %>% group_by(Year,Season) %>%
summarize(F_Count = n())
Grp_M_F <- left_join(gMale, gFemale, by = c("Year","Season"))
Grp_M_F %>% mutate(M_to_F_SexRatio = M_Count/F_Count) %>%
ggplot(aes(x = Year, y = M_to_F_SexRatio, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("Sex Ratio Trend Line"), y = "Number of Athletes") +
geom_text(aes(label = round(M_to_F_SexRatio,2)), size = 3, check_overlap = T, position = position_stack(vjust = 1.05) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
Analysis - Sex Ratio Trend
* Male to Female Sex ratio declined over the years.
* Winter games had a higer Male to Female Sex ratio than Summer games.
athlete_clean %>% group_by(Year,Season) %>%
summarize(P_Count = length(unique(Team))) %>%
ggplot(aes(x = Year, y = P_Count, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("No of Teams Each Year - Trend Line"), y = "Number of Teams") +
geom_text(aes(label = P_Count), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
Analysis
* Number of teams for Winter games were almost constant across the years.
* Number of teams participitation increased for few years and then dropped for the last two games.
total_medal <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season) %>% summarize(Total_Medal = n())
tm_sex <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season,Sex) %>% summarize(Total_Medal = n())
total_medal %>%
ggplot(aes(x = Year, y = Total_Medal, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("No of Medals won each Year - Trend"), y = "Number of Teams") +
geom_text(aes(label = Total_Medal), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
tm_sex %>% filter(Sex=="M") %>%
ggplot(aes(x = Year, y = Total_Medal, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("No of Medals won by MALE each Year - Trend"), y = "Number of Teams") +
geom_text(aes(label = Total_Medal), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
tm_sex %>% filter(Sex=="F") %>%
ggplot(aes(x = Year, y = Total_Medal, group = Season)) +
geom_line(aes(color = Season), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Season, shape = Season), size = 3) +
theme_bw() +
labs(title = paste("No of Medals won by FEMALE each Year - Trend"), y = "Number of Teams") +
geom_text(aes(label = Total_Medal), size = 3, check_overlap = T, position = position_stack(vjust = 0.93) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
Analysis
* Number of medals awarded were on a constant rise for Winter games.
* Number of medals awarded for Summer games were approximately in the same range with a drop for year 2012.
* Female Medals holders numebr increased constatnly for both Summer and Winter games.
* A sudden drop on the number of Male medal holders for Summer games.
total_medal <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season,Team) %>% summarize(Total_Medal = n()) %>% arrange(-Total_Medal)
# tm_sex <- athlete_clean %>% filter(!is.na(Medal)) %>% group_by(Year,Season,Team,Sex) %>% summarize(Total_Medal = n()) %>% arrange(-Total_Medal)
total_medal %>%
filter(Season == "Winter",Team %in% c("United States","Russia","Canada","Germany","Italy","France")) %>%
ggplot(aes(x = Year, y = Total_Medal, group = Team)) +
geom_line(aes(color = Team), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Team, shape = Team), size = 3) +
theme_bw() +
labs(title = paste("Winter Trend Line for Select Teams - Medals Won"), y = "Number of Teams") +
# geom_text(aes(label = ""), size = 3, check_overlap = F, position = position_stack(vjust = 0) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
total_medal %>%
filter(Season == "Summer",Team %in% c("United States","Russia","Canada","Germany","Italy","France")) %>%
ggplot(aes(x = Year, y = Total_Medal, group = Team)) +
geom_line(aes(color = Team), size = 1.1, linejoin = "round", linetype = 1 ) +
geom_point(aes(color = Team, shape = Team), size = 3) +
theme_bw() +
labs(title = paste("Summer Trend Line for Select Teams - Medals Won"), y = "Number of Teams") +
# geom_text(aes(label = ""), size = 3, check_overlap = F, position = position_stack(vjust = 0) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "right")
Analysis
* Winter Games
+ Canada and France saw an increasing trend for the number of medals won in olympics.
+ Germany saw a decreasing trend for the number of medals won.
+ United states and Russia had alternate trend movements.
* Summer Games
+ canada and France saw an increasing trend.
+ Russia saw a decreasing trend.
+ Germany, Italy and United states had a fluctutating trend line for the years 2000 to 2016.
Creating a new column called Win based on condition if an athlete won a medal Win = Y else N.
glm_athlete <- athlete_clean
glm_athlete$Win <- ifelse(!is.na(glm_athlete$Medal), "Y", "N" )
glm_athlete$Win <- factor(glm_athlete$Win, levels = c("N","Y"))
# set.seed(123)
# SplitIndex <- sample(x = c("Train", "Test"), size = nrow(glm_athlete),replace = T, prob = c(0.7,0.3))
#
# #Subset data into a train and test set based on the SplitIndex vector
# Train_Data <- glm_athlete[SplitIndex == "Train", ]
# Test_Data <- glm_athlete[SplitIndex == "Test", ]
#
# dim(Test_Data)
# dim(Train_Data)
# str(Train_Data)
# Logistic Regression
# glm_model <- glm(formula = Win ~ . ,data = Train_Data, family = 'binomial')
# summary(glm_model)
# str(glm_athlete)
# Extracting only metric columns for usage
glm_athlete_metric <- glm_athlete[,c("Age","Height(cms)","Weight(Kgs)","BMI","Win")]
str(glm_athlete_metric)
Classes 'tbl_df', 'tbl' and 'data.frame': 83909 obs. of 5 variables:
$ Age : num 31 34 31 22 30 25 25 24 23 23 ...
$ Height(cms): num 172 159 189 187 189 194 194 172 178 178 ...
$ Weight(Kgs): num 70 55.5 130 89 72 78 78 58 76 76 ...
$ BMI : num 23.7 22 36.4 25.5 20.2 ...
$ Win : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
glm_model_1 <- glm(formula = Win ~ . ,data = glm_athlete_metric, family = 'binomial')
# summary(glm_model_1)
glm_athlete_metric$Predict <- predict(object = glm_model_1, newdata = glm_athlete_metric, type = "response")
glm_athlete_metric$YPredict <- ifelse(glm_athlete_metric$Predict > 0.5, 1, 0)
# Model Summary
summary(glm_model_1)
Call:
glm(formula = Win ~ ., family = "binomial", data = glm_athlete_metric)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9763 -0.5786 -0.5341 -0.4931 2.2309
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.226289 0.994665 -2.238 0.02521 *
Age 0.011620 0.001830 6.349 2.17e-10 ***
`Height(cms)` -0.001141 0.005596 -0.204 0.83843
`Weight(Kgs)` 0.021782 0.006778 3.213 0.00131 **
BMI -0.053850 0.022054 -2.442 0.01462 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 69025 on 83908 degrees of freedom
Residual deviance: 68507 on 83904 degrees of freedom
AIC: 68517
Number of Fisher Scoring iterations: 4
# Confusion Matrix
table(glm_athlete_metric$Win, glm_athlete_metric$YPredict)
0
N 71866
Y 12043
Analysis
* This model performed very bad as is evident from the confusion matrix where it was not able to predict any wins.
* However, Age, weight and BMI have statistical significance.
glm_model_2 <- glm(formula = Win ~ Age + `Weight(Kgs)` + `Height(cms)` ,data = glm_athlete_metric, family = 'binomial')
# summary(glm_model_2)
glm_athlete_metric$Predict <- predict(object = glm_model_1, newdata = glm_athlete_metric, type = "response")
glm_athlete_metric$YPredict <- ifelse(glm_athlete_metric$Predict > 0.5, 1, 0)
# Model Summary
summary(glm_model_2)
Call:
glm(formula = Win ~ Age + `Weight(Kgs)` + `Height(cms)`, family = "binomial",
data = glm_athlete_metric)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.8586 -0.5818 -0.5360 -0.4909 2.3231
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.604778 0.210135 -21.913 < 2e-16 ***
Age 0.011273 0.001825 6.176 6.56e-10 ***
`Weight(Kgs)` 0.005397 0.001015 5.319 1.04e-07 ***
`Height(cms)` 0.012086 0.001449 8.340 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 69025 on 83908 degrees of freedom
Residual deviance: 68513 on 83905 degrees of freedom
AIC: 68521
Number of Fisher Scoring iterations: 4
# Probability of WinningS
head(glm_athlete_metric$Predict)
[1] 0.1404309 0.1206924 0.2297364 0.1657656 0.1665300 0.1716221
# Confusion Matrix
table(glm_athlete_metric$Win, glm_athlete_metric$YPredict)
0
N 71866
Y 12043
Analysis
* We can conclude that Age, Weight and Height have statistical significance on Wins.
* Model was still really bad at prediciton of a Win based on only these variables.