Winter Olympics Medals over Time

1. Medal Counts over Time

Combine the information in the three spreadsheets athletes_and_events.csv, noc_regions.csv, and gdp_pop.csv. Note, that the noc_regions.csv is the set all NOC regions, while gdp_pop.csv only contains a snapshot of the current set of countries.

library(readr)
ath <- read_csv("data/athletes_and_events.csv")

## Rows: 271116 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl  (5): ID, Age, Height, Weight, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

gdp <- read_csv("data/gdp_pop.csv")

## Rows: 201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Code
## dbl (2): Population, GDP per Capita
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

noc <- read_csv("data/noc_regions.csv")

## Rows: 230 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): NOC, region, notes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#combine all German designations under German and all Russian designation under Russia. 
ath$NOC[ath$NOC %in% c("SAA","EUA","GDR","FRG")] <- "GER"
ath$NOC[ath$NOC %in% c("RUS","RU1","URS","EUN",'ROC')] <- "RUS"
temp_df <- merge(ath, noc, by.x = "NOC", 
             by.y = "NOC", all.x = TRUE, all.y = FALSE)
total_df <- merge(temp_df, noc, by.x = "NOC", 
             by.y = "NOC", all.x = TRUE, all.y = FALSE)

Calculate a summary of
how many winter games each country competed in, and

how many medals of each type the country won. Use that summary to provide a visual comparison of medal count by country.

Please provide (i) one visualization showing an over time comparison and (ii) one visualization in which a total medal count (across all Winter Olympics) is used. Briefly discuss which visualization you recommend to your editor and why.

b- i)How many winter games did each country compete in? The number of winter games each country competed in ranges from 1 to 22, as illustrated in the below data table.

total_df %>% 
  filter(Season == 'Winter') %>% 
  group_by(NOC, Year) %>% 
  summarize(Medal_Count=n()) %>% 
  distinct()

## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.

## # A tibble: 1,021 × 3
## # Groups:   NOC [115]
##    NOC    Year Medal_Count
##    <chr> <dbl>       <int>
##  1 AHO    1988           3
##  2 AHO    1992           2
##  3 ALB    2006           3
##  4 ALB    2010           2
##  5 ALB    2014           2
##  6 ALG    1992           8
##  7 ALG    2006           3
##  8 ALG    2010           1
##  9 AND    1976          11
## 10 AND    1980           9
## # … with 1,011 more rows

Calculate how many medals each country won over time

top10 <- total_df %>%
  filter(Season == "Winter") %>%
  group_by(NOC) %>%
  summarise(count = n_distinct(Sport,Event,Sex,Medal)) %>%
  arrange(desc(count))%>%
  select(NOC) %>%
  head(10)
top10_byyear<- total_df %>%
  filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")
         , NOC %in% top10$NOC)%>%
  group_by(NOC,Year) %>%
  summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, NOC)) %>%
  arrange(desc(Medal_Count))

## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.

Graph medals overtime by Country - Top 10 Countries

Calculating the All-time medal totals by Country

top10 <- total_df %>%
  filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")) %>%
  group_by(NOC) %>%
  summarise(count = n_distinct(Sport,Event,Sex,Medal,Year)) %>%
  arrange(desc(count))%>%
  select(NOC) %>%
  head(10)
totals_all_time<- total_df%>%
  filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")
         , NOC %in% top10$NOC)%>%
  group_by(NOC,Medal) %>%
  summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, NOC)) %>%
  arrange(desc(Medal_Count))

## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.

Graph the all-time medal counts by Country

plot0 <- ggplot(totals_all_time
       , aes(x=reorder(NOC, desc(Medal_Count), sum), y=Medal_Count, fill=factor(Medal, levels=c("Gold", "Silver", "Bronze")))) +
  theme_tufte(base_size = 10) +
  labs(fill = "Medal") +
  labs(x="Country", y="Number of Medals", title="All Time Medals per Country") +
  geom_bar(stat="identity") +
  scale_fill_manual(values=c('#D6AF36','#A7A7AD','#A77044'))+
  theme(plot.title=element_text(hjust = 0.5))

plot0

temp<- total_df %>% 
  filter(Season == 'Winter') %>% 
  group_by(NOC, Year) %>% 
  summarize(n=n()) %>% 
  distinct()

## `summarise()` has grouped output by 'NOC'. You can override using the `.groups`
## argument.

temp1 <- temp[order(-temp$n),] %>% 
  group_by(NOC) %>% 
  summarize(n = sum(n))

temp1 <- temp1[order(-temp1$n),]
temp2 <- temp1[1:10,1]

top10_countries <- temp2[['NOC']]

plot2 <- subset(temp, NOC %in% top10_countries) %>% 
        ggplot( aes( x= Year, y= n, group = NOC, color = NOC)) +
        geom_line() +
        labs(y="Number of Medals", x="Year") +
        ggtitle("Number of Medals per Year") +
        theme_bw()

# medals_count_year_top10 <- temp %>% filter(NOC %in% head(temp, 10)$NOC)
# plot <- ggplot(medals_count_year_top10, aes(x = Year, y = n, group = NOC, color = NOC)) +
#   geom_line() +
#   labs(y="Number of Medals", x="Year") +
#   ggtitle("Number of Medals per Year") +
#   theme_bw()
# 
plot2

plot1<-ggplot(top10_byyear,aes(Year,Medal_Count))+
  #theme_tufte(base_size = 10)+
  geom_point(aes(color=NOC))+
  geom_line(aes(color=NOC))+
  scale_color_brewer(palette="Spectral")+
  #facet_grid(rows = vars(Country))+
  #facet_wrap(~Country, ncol=3)+
  labs(x="Year", y="Number of Medals", title="Number of Medals per Year")+
  theme(legend.position="bottom")+
  theme(plot.title=element_text(hjust = 0.5))

plot1

#count_function:lets you quickly count the unique values of one or more variables: df %>% count(a, b) is roughly equivalent to df %>% group_by(a, b) %>% summarise(n = n()). count() is paired with tally(), a lower-level helper that is equivalent to df %>% summarise(n = n()).

is.winter <- total_df

topcountry.game<-is.winter %>%
  group_by(NOC) %>%
  select(Year) %>%
  unique() %>%
  summarize(count=n()) %>%
  arrange(desc(count),NOC) %>%
  mutate(rank =row_number())

## Adding missing grouping variables: `NOC`

How many medals of each type did each country win?

#how many medals of each type the country won
has.medal <- is.winter %>% drop_na(Medal)

#gold
topcountry.gold <- has.medal%>%
  filter(Medal == "Gold") %>%
  group_by(NOC) %>% 
  summarize(count=n()) %>%
  arrange(desc(count)) %>%
  mutate(rank = row_number())

plot_gold<- head(topcountry.gold,25) %>% top_n(10) %>% 
  ggplot(aes(x=reorder(NOC, -count), y=count, fill=NOC)) +  
  geom_bar(stat="identity", fill='#d4af37')+
  theme_bw()+
  theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
  labs(x="Name Of Countries", y="Medals Count", title="Number Of Gold Medals Won")+
  theme(plot.title=element_text(hjust=0.5, size=8))+
  theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
  guides(fill="none")

## Selecting by rank

#silver

topcountry.silver <- has.medal%>%
  filter(Medal == "Silver") %>%
  group_by(NOC) %>% 
  summarize(count=n()) %>%
  arrange(desc(count)) %>%
  mutate(rank = row_number())


plot_silver<-head(topcountry.silver,25) %>%  top_n(10) %>% 
  ggplot(aes(x=reorder(NOC, -count), y=count,fill=NOC),fill=NOC) +
  geom_bar(stat="identity", fill='#C0C0C0')+
  theme_bw()+
  theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
  labs(x="Name Of Countries", y="Medals Count", title="Number Of Silver Medals Won")+
  theme(plot.title=element_text(hjust=0.5, size=8))+
  theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
  guides(fill="none")

## Selecting by rank

#bronze
topcountry.bronze <- has.medal%>% 
  filter(Medal == "Bronze") %>%
  group_by(NOC) %>% 
  summarize(count=n()) %>%
  arrange(desc(count)) %>%
  mutate(rank = row_number())


plot_bronze<-head(topcountry.bronze,25) %>% top_n(10) %>% 
  ggplot(aes(x=reorder(NOC, -count), y=count, fill=NOC))+
  geom_bar(stat="identity", fill='#CD7F32')+
  theme_bw()+
  theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
  labs(x="Name Of Countries", y="Medals Count", title="Number Of Bronze Medals Won")+
  theme(plot.title=element_text(hjust=0.5, size=8))+
  theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
  guides(fill="none")

## Selecting by rank

print(plot_gold)

print(plot_silver)

print(plot_bronze)

2. Medal Counts adjusted by Population, GDP

There are different ways to calculate “success”. Consider the following variants and choose one (and make sure your choice is clear in the visualization):
- Just consider gold medals.
- Simply add up the number of medals of different types.
- Create an index in which medals are valued differently. (gold=3, silver=2, bronze=1).
- A reasonable other way that you prefer.

Now, adjust the ranking of medal success by (a) GDP per capita and (b) population. You have now three rankings: unadjusted ranking, adjusted by GDP per capita, and adjusted by population.

Visualize how these rankings differ. Try to highlight a specific pattern (e.g. “South Korea – specialization reaps benefits” or “The superpowers losing their grip”).

ath = merge(ath, noc, by.x="NOC", by.y="NOC")
ath = merge(ath, gdp, by.x="NOC", by.y="Code")
medals_count = ath[complete.cases(ath[ , 15]),] %>% filter(Season == 'Winter')%>% count(NOC, sort = TRUE)

medals_count <- medals_count[order(medals_count$n, decreasing=TRUE),]

medal_gdp_df = merge(medals_count, gdp, by.x="NOC", by.y="Code")
medal_gdp_df <- merge(medals_count, gdp, by.x = "NOC", 
             by.y = "Code", all.x = TRUE, all.y = FALSE)
medal_gdp_df <- medal_gdp_df[order(medal_gdp_df$n, decreasing=TRUE),]

pop_new <- medal_gdp_df$n/(medal_gdp_df$Population)
medal_gdp_df_2 <- cbind(medal_gdp_df,pop_new)
medal_gdp_plot <-medal_gdp_df_2 %>%  top_n(10) %>% 
  ggplot(aes(x=reorder(NOC, -pop_new) , y=pop_new , fill=Country))+
  geom_bar(stat="identity",)+ scale_fill_hue(l=40)+ labs(x="Name Of Countries", y="Medals Count", title="Number Of Medals Won Adjusted per Population")+
    theme_bw()+
  theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
    theme(plot.title=element_text(hjust=0.5, size=15))+
  theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
  guides(fill="none")

## Selecting by pop_new

medal_gdp_plot

medal_gdp_df <- na.omit(medal_gdp_df)

gdp_new <- medal_gdp_df$n/(medal_gdp_df$'GDP per Capita')

#adjusted by GDP 

medal_gdp_df_2 <- cbind(medal_gdp_df,gdp_new)
medal_gdp_plot <-medal_gdp_df_2 %>%  top_n(10) %>% 
  ggplot(aes(x=reorder(NOC, -gdp_new) , y=gdp_new , fill=Country))+
  geom_bar(stat="identity",)+ scale_fill_hue(l=40)+ labs(x="Name Of Countries", y="Medals Count", title="Number Of Medals Won Adjusted per GDP per Capita")+
    theme_bw()+
  theme(legend.position="left", legend.title=element_text(size=8), legend.text=element_text(size=8))+
    theme(plot.title=element_text(hjust=0.5, size=15))+
  theme(axis.text.x=element_text(size=9),axis.text.y=element_text(size=8),axis.title.x=element_text(size=10), axis.title.y= element_text(size=10))+
  guides(fill="none")

## Selecting by gdp_new

medal_gdp_plot

3. Host Country Advantage

Until the 2014 Sochi Winter Olympics (our data for Winter Olympics end here), there were 19 host cities. Calculate whether the host nation had an advantage. That is calculate whether the host country did win more medals when the Winter Olympics was in their country compared to other times.

Note, that the 19 host cities are noted in the data but not the countries they are located in. This happens commonly and often Wikipedia has the kind of additional data you want for the task. To save you some time, here is a quick way to get this kind of table from Wikipedia into R.

Provide a visualization of the host country advantage (or absence thereof).

wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
winter_hosts<-hosts %>% filter(Winter != "", Year<='2016') %>%
  rename(Host_City = City) %>%
  rename(Host_Country = Country)%>% 
  select(Host_City, Host_Country, Year)

athletes_events <- read.csv("./data/athletes_and_events.csv")
gdp_pop <- read.csv("./data/gdp_pop.csv")
noc_regions <- read.csv("./data/noc_regions.csv")
#recoding: all russian teams = Russia, all german teams = Germany
athletes_events$NOC[athletes_events$NOC == "URS"] <- "RUS"
athletes_events$NOC[athletes_events$NOC == "FRG"] <- "GER"
athletes_events$NOC[athletes_events$NOC == "GDR"] <- "GER"
athletes_events$NOC[athletes_events$NOC == "EUA"] <- "GER"
#removing duplicate medals for team-events
athletes <- distinct(athletes_events, Sex, Team, NOC, Games, Year, Season, City, Sport, Event, Medal)
merge1 <- left_join(athletes, noc_regions)

## Joining, by = "NOC"

dataset <- left_join(merge1, gdp_pop, by=c("NOC" = "Code"))

wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
hosts <- hosts %>% filter(Winter != "") %>%
  select(City, Country, Year)
hosts <- select(hosts, Year, Country, City)
hosts <- rename(hosts, HostCountry = Country)
hosts <- rename(hosts, HostCity = City)
hosts$HostCountry[hosts$HostCountry == "Russia[h]"] <- "Russia"
hosts$HostCity[hosts$HostCity == "Innsbruck[g]"] <- "Innsbruck"
data_prep <- filter(dataset, Season == "Winter")
host_dataset <- left_join(data_prep, hosts, by=c("Year" = "Year"))

avg_medals <- host_dataset %>%
  filter(Country %in% hosts$HostCountry, Medal =="Gold", Country != "China") %>%
  group_by(Country) %>%
  summarise(avgtotal = sum(Medal=="Gold")/length(unique(Year)))
avg_medals_hosting <- host_dataset %>%
  filter(Country == HostCountry, Medal=="Gold") %>%
  group_by(Country) %>%
  summarise(avgtotal = sum(Medal=="Gold")/length(unique(Year)))
  

#ggplot(host_diff, aes(x=reorder(country, diff), y=diff, fill = country)) +

a <- ggplot(avg_medals_hosting, aes(x=Country, y=avgtotal)) + 
  geom_point(data=avg_medals_hosting, size=3, color="violetred4") + 
  geom_point(data=avg_medals, aes(x=Country, y=avgtotal), size=3, color="grey" ) +
  scale_colour_manual(values=c('Average Gold Medals Earned (Total)'='blue', 'Average Gold Medals Earned (Hosting)'='red')) +
  labs(x = "Country",
       y = "Average Gold Medals",
       title = "Winter Olympics Host Country Advantage (Gold Medals)",caption="Sources: International Olympic Committee, Wikipedia") +
  theme_classic()

a

4. Most successful athletes

Now, let’s look at the most successful athletes. Provide a visual display of the most successful Winte Olympics athletes of all time.

athletes<-read.csv("data/athletes_and_events.csv")
gdp_pop<-read.csv("data/gdp_pop.csv")
noc_regions<-read.csv("data/noc_regions.csv")

athletes$NOC[athletes$NOC %in% c("SAA","EUA","GDR","FRG")] <- "GER"
athletes$NOC[athletes$NOC %in% c("RUS","RU1","URS","EUN",'ROC')] <- "RUS"
colnames(gdp_pop)[colnames(gdp_pop) == 'Code'] <- 'NOC'

all_data <- athletes %>%
  left_join(noc_regions, by = 'NOC') %>%
  left_join(gdp_pop,by='NOC')%>%
  filter(Season=="Winter")

top_athletes <-all_data %>%
  filter(Medal %in% c("Gold",'Silver','Bronze'))%>%
    group_by(ID, Name, Sex) %>%
      summarise(Medal_Count = n_distinct(Sport,Event,Sex,Medal,Year)) %>%
        arrange(desc(Medal_Count))%>%
          head(10)

## `summarise()` has grouped output by 'ID', 'Name'. You can override using the
## `.groups` argument.

ggplot(data=top_athletes
       , aes(x=Medal_Count, y=reorder(reorder(Name, Sex), Medal_Count, sum), fill=factor(Sex))) +
  theme_minimal()+
  labs(fill = "Sex") +
  labs(x="Medal Count", y="Athlete", title="Top 10 Athletes with the Most Medal Count") +
  geom_bar(stat="identity") +
  scale_fill_manual(values=c('#FF6666',' dark blue'))+
  theme(plot.title=element_text(hjust = 0.5))

Surprisingly, the athletes with the most medals count are female.

Chose of the athlete specific dimensions (e.g. gender, height, weight) and visualize an interesting pattern in the data.

athlete_dimensions <- all_data %>%
  filter(Medal %in% c("Gold",'Silver','Bronze')) %>% 
  filter(!is.na(Height))  %>% 
  filter(!is.na(Weight)) %>% 
    distinct(Name, Sport,Sex,Height, Weight)

ave_dimensions<- athlete_dimensions %>%
    group_by(Sport,Sex) %>%
      summarise(Avg_Height = mean(Height)
                , Ave_Weight = mean(Weight))

## `summarise()` has grouped output by 'Sport'. You can override using the
## `.groups` argument.

ggplot(data=ave_dimensions
  , aes(x=Avg_Height, y=Ave_Weight, color =Sex)) +
  theme_minimal()+
  labs(x="Average Height(cm)", y="Average Weight", title="Average Height and Weight of Medal Winners per Sport") +
  geom_point() +
  theme(plot.title=element_text(hjust = 0.5))+
    geom_text_repel(aes(label = Sport),
              color = "gray20", size = 1.9)

Interactivity

5. Make two plots interactive

Choose 2 of the plots you created above and add interactivity. One of the plots needs to be written in plotly rather than just using the ggplotly automation. Briefly describe to the editor why interactivity in these visualization is particularly helpful for a reader.

ggplotly(plot2)

Use plot_ly to make the Average Height and Weight graph interactive

fig = plot_ly(ave_dimensions, x=~Avg_Height, y=~Ave_Weight
              , color=~Sex,colors = c("hotpink", "royalblue")
              , type = "scatter", mode='markers'
              , text = ~paste('Sport: ', Sport)) %>% layout(title = 'Average Height and Weight of Medal Winners per Sport')


fig

6. Data Table

Prepare a selected data set and add a datatable to the output. Make sure the columns are clearly labelled. Select the appropriate options for the data table (e.g. search bar, sorting, column filters etc.). Suggest to the editor which kind of information you would like to provide in a data table in the online version of the article and why.

table_data <- total_df %>%
  filter(Season == "Winter", Medal %in% c("Gold",'Silver','Bronze')) %>%
  group_by(Team ,Medal) %>%
  summarise(Medal_Count = n_distinct(Year, Season, Medal, Event, Team),
            Athlete_Count = n_distinct(Name)
  ) %>%
  arrange((Team))

## `summarise()` has grouped output by 'Team'. You can override using the
## `.groups` argument.

datatable(totals_all_time, rownames = FALSE,
          filter = list(position = "top"),
          options = list(language = list(sSearch = "Filter:")))

Assignment 1: Using ggplot2 for visualization

Shauna Han (shh2145)