1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Olympics dataset from CSV into a data frame.
# NOTE(review): the path points into one user's home directory, so this line
# only runs on a machine where that file exists — confirm before sharing.
dataset_olympics <- read.csv("~/School/Data 101/dataset_olympics.csv")
2
# Report the number of rows and columns in the dataset.
dim(dataset_olympics)
## [1] 70000 15
# Print a per-column summary: quantiles, mean and NA counts for numeric
# columns; length, class and mode for character columns.
summary(dataset_olympics)
## ID Name Sex Age
## Min. : 1 Length:70000 Length:70000 Min. :11.00
## 1st Qu.: 9326 Class :character Class :character 1st Qu.:21.00
## Median :18032 Mode :character Mode :character Median :25.00
## Mean :18082 Mean :25.64
## 3rd Qu.:26978 3rd Qu.:28.00
## Max. :35658 Max. :88.00
## NA's :2732
## Height Weight Team NOC
## Min. :127.0 Min. : 25.0 Length:70000 Length:70000
## 1st Qu.:168.0 1st Qu.: 61.0 Class :character Class :character
## Median :175.0 Median : 70.0 Mode :character Mode :character
## Mean :175.5 Mean : 70.9
## 3rd Qu.:183.0 3rd Qu.: 79.0
## Max. :223.0 Max. :214.0
## NA's :16254 NA's :17101
## Games Year Season City
## Length:70000 Min. :1896 Length:70000 Length:70000
## Class :character 1st Qu.:1960 Class :character Class :character
## Mode :character Median :1984 Mode :character Mode :character
## Mean :1978
## 3rd Qu.:2002
## Max. :2016
##
## Sport Event Medal
## Length:70000 Length:70000 Length:70000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Show the structure of the data frame: each column's type and first values.
str(dataset_olympics)
## 'data.frame': 70000 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 5 5 5 5 5 ...
## $ Name : chr "A Dijiang" "A Lamusi" "Gunnar Nielsen Aaby" "Edgar Lindenau Aabye" ...
## $ Sex : chr "M" "M" "M" "M" ...
## $ Age : num 24 23 24 34 21 21 25 25 27 27 ...
## $ Height: num 180 170 NA NA 185 185 185 185 185 185 ...
## $ Weight: num 80 60 NA NA 82 82 82 82 82 82 ...
## $ Team : chr "China" "China" "Denmark" "Denmark/Sweden" ...
## $ NOC : chr "CHN" "CHN" "DEN" "DEN" ...
## $ Games : chr "1992 Summer" "2012 Summer" "1920 Summer" "1900 Summer" ...
## $ Year : int 1992 2012 1920 1900 1988 1988 1992 1992 1994 1994 ...
## $ Season: chr "Summer" "Summer" "Summer" "Summer" ...
## $ City : chr "Barcelona" "London" "Antwerpen" "Paris" ...
## $ Sport : chr "Basketball" "Judo" "Football" "Tug-Of-War" ...
## $ Event : chr "Basketball Men's Basketball" "Judo Men's Extra-Lightweight" "Football Men's Football" "Tug-Of-War Men's Tug-Of-War" ...
## $ Medal : chr "" "" "" "Gold" ...
# Count the NA values in each column.
# Empty strings are not NA, so blank Medal entries are not counted here.
colSums(is.na(dataset_olympics))
## ID Name Sex Age Height Weight Team NOC Games Year Season
## 0 0 0 2732 16254 17101 0 0 0 0 0
## City Sport Event Medal
## 0 0 0 0
Missing values: Age has 2,732, Height has 16,254, and Weight has 17,101. No other column contains NA values.
3
# Move City to the front of the column order, then show the first ten rows.
dataset_olympics %>%
  relocate(City) %>%
  slice_head(n = 10)
## City ID Name Sex Age Height Weight Team
## 1 Barcelona 1 A Dijiang M 24 180 80 China
## 2 London 2 A Lamusi M 23 170 60 China
## 3 Antwerpen 3 Gunnar Nielsen Aaby M 24 NA NA Denmark
## 4 Paris 4 Edgar Lindenau Aabye M 34 NA NA Denmark/Sweden
## 5 Calgary 5 Christine Jacoba Aaftink F 21 185 82 Netherlands
## 6 Calgary 5 Christine Jacoba Aaftink F 21 185 82 Netherlands
## 7 Albertville 5 Christine Jacoba Aaftink F 25 185 82 Netherlands
## 8 Albertville 5 Christine Jacoba Aaftink F 25 185 82 Netherlands
## 9 Lillehammer 5 Christine Jacoba Aaftink F 27 185 82 Netherlands
## 10 Lillehammer 5 Christine Jacoba Aaftink F 27 185 82 Netherlands
## NOC Games Year Season Sport Event
## 1 CHN 1992 Summer 1992 Summer Basketball Basketball Men's Basketball
## 2 CHN 2012 Summer 2012 Summer Judo Judo Men's Extra-Lightweight
## 3 DEN 1920 Summer 1920 Summer Football Football Men's Football
## 4 DEN 1900 Summer 1900 Summer Tug-Of-War Tug-Of-War Men's Tug-Of-War
## 5 NED 1988 Winter 1988 Winter Speed Skating Speed Skating Women's 500 metres
## 6 NED 1988 Winter 1988 Winter Speed Skating Speed Skating Women's 1,000 metres
## 7 NED 1992 Winter 1992 Winter Speed Skating Speed Skating Women's 500 metres
## 8 NED 1992 Winter 1992 Winter Speed Skating Speed Skating Women's 1,000 metres
## 9 NED 1994 Winter 1994 Winter Speed Skating Speed Skating Women's 500 metres
## 10 NED 1994 Winter 1994 Winter Speed Skating Speed Skating Women's 1,000 metres
## Medal
## 1
## 2
## 3
## 4 Gold
## 5
## 6
## 7
## 8
## 9
## 10
4
# Keep only the rows from the 1968 Summer Games, then report the size of the
# resulting subset.
summer68 <- filter(
  dataset_olympics,
  Season == "Summer" & Year == 1968
)
dim(summer68)
## [1] 2315 15
5
# Tally the 1968 Summer rows for each team.
# NOTE(review): n() counts rows, and the same ID appears on several rows (one
# per event entry), so Num_Athletes may exceed the number of distinct
# athletes — consider n_distinct(ID) if unique athletes are intended.
ts <- summarise(
  group_by(summer68, Team),
  Num_Athletes = n(),
  .groups = "drop"
)
ts
## # A tibble: 98 × 2
## Team Num_Athletes
## <chr> <int>
## 1 Afghanistan 5
## 2 Argentina 43
## 3 Australia 82
## 4 Austria 11
## 5 Bahamas 3
## 6 Barbados 3
## 7 Belgium 29
## 8 Belize 1
## 9 Bermuda 2
## 10 Bolivia 1
## # ℹ 88 more rows
United States
6
# Overall mean age across the 1968 Summer rows, ignoring missing ages.
ma <- with(summer68, mean(Age, na.rm = TRUE))
ma
## [1] 24.32589
# Mean age for each sex across the 1968 Summer rows, ignoring missing ages.
ma2 <- summarise(
  group_by(summer68, Sex),
  Mean_Age = mean(Age, na.rm = TRUE)
)
ma2
## # A tibble: 2 × 2
## Sex Mean_Age
## <chr> <dbl>
## 1 F 20.4
## 2 M 25.4
Yes, there is about a 5-year difference: male athletes averaged 25.4 years and female athletes averaged 20.4 years.
7
# Recode empty strings as NA. Only character columns can hold "", so the
# recoding is applied to those columns alone.
summer68 <- mutate(
  summer68,
  across(where(is.character), function(col) na_if(col, ""))
)

# List the United States rows that carry a medal, keeping only the athlete
# name, the event, and the medal.
medals <- select(
  filter(summer68, !is.na(Medal), Team == "United States"),
  Name, Event, Medal
)
medals
## Name
## 1 Gary Lee Anderson
## 2 Margaret Ann Bailes (Johnson-)
## 3 John Lee "Johnny" Baldwin
## 4 Catherine Northcutt "Catie" Ball (-Condon)
## 5 Jane Louise Barkman (-Brown)
## 6 Jane Louise Barkman (-Brown)
## 7 Michael Thomas "Mike" Barrett
## 8 Peter Jones Barrett
## 9 Robert "Bob" Beamon
## 10 Donald Ray "Don" Behm
## 11 Ralph Harold Boston
## 12 Gregory Fenton "Greg" Buckingham
## 13 Michael Jay "Mike" Burton
## 14 Michael Jay "Mike" Burton
## 15 John Wesley Carlos
## 16 Edward Julius "Ed" Caruthers, Jr.
## 17 John Richard Clawson
## 18 Eleanor Suzanne "Ellie" Daniel (-Drye)
## 19 Eleanor Suzanne "Ellie" Daniel (-Drye)
## 20 Eleanor Suzanne "Ellie" Daniel (-Drye)
## 21 Wilbur D. "Willie" Davenport
## 22 Donald Francis "Don" Dee
## 23 Joseph Douglas "Joe" Dube, Sr.
## 24 Lee Edward Evans
## 25 Lee Edward Evans
## 26 Thomas Francis "Tom" Farrell
## 27 Barbara Ann Ferrell (-Edmonson)
## 28 Barbara Ann Ferrell (-Edmonson)
## 29 John Edward Ferris
## 30 John Edward Ferris
## Event Medal
## 1 Shooting Mixed Free Rifle, Three Positions, 300 metres Gold
## 2 Athletics Women's 4 x 100 metres Relay Gold
## 3 Boxing Men's Light-Middleweight Bronze
## 4 Swimming Women's 4 x 100 metres Medley Relay Gold
## 5 Swimming Women's 200 metres Freestyle Bronze
## 6 Swimming Women's 4 x 100 metres Freestyle Relay Gold
## 7 Basketball Men's Basketball Gold
## 8 Sailing Mixed Two Person Keelboat Gold
## 9 Athletics Men's Long Jump Gold
## 10 Wrestling Men's Bantamweight, Freestyle Silver
## 11 Athletics Men's Long Jump Bronze
## 12 Swimming Men's 200 metres Individual Medley Silver
## 13 Swimming Men's 400 metres Freestyle Gold
## 14 Swimming Men's 1,500 metres Freestyle Gold
## 15 Athletics Men's 200 metres Bronze
## 16 Athletics Men's High Jump Silver
## 17 Basketball Men's Basketball Gold
## 18 Swimming Women's 100 metres Butterfly Silver
## 19 Swimming Women's 200 metres Butterfly Bronze
## 20 Swimming Women's 4 x 100 metres Medley Relay Gold
## 21 Athletics Men's 110 metres Hurdles Gold
## 22 Basketball Men's Basketball Gold
## 23 Weightlifting Men's Heavyweight Bronze
## 24 Athletics Men's 400 metres Gold
## 25 Athletics Men's 4 x 400 metres Relay Gold
## 26 Athletics Men's 800 metres Bronze
## 27 Athletics Women's 100 metres Silver
## 28 Athletics Women's 4 x 100 metres Relay Gold
## 29 Swimming Men's 200 metres Butterfly Bronze
## 30 Swimming Men's 200 metres Individual Medley Bronze
8
You could use as.factor() to convert it to a factor in order to sort it.
9
# Add a body-mass-index column: Weight divided by the square of Height / 100.
# NOTE(review): presumably Weight is in kilograms and Height in centimetres
# (hence the division by 100) — confirm against the data dictionary.
summer68$BMI <- summer68$Weight / (summer68$Height / 100)^2

# Summary statistics of BMI, skipping rows where BMI is missing.
mean_BMI <- mean(summer68$BMI, na.rm = TRUE)
min_BMI <- min(summer68$BMI, na.rm = TRUE)
max_BMI <- max(summer68$BMI, na.rm = TRUE)
max_BMI
## [1] 43.5964
min_BMI
## [1] 16.56065
mean_BMI
## [1] 22.68064