# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 6
# Topic: R - Messy Data
# Last updated: 4/12/2017
# Data: summer_winter_olympics.csv
# Describe: all-time olympic medal data
# import data
# read.csv() already returns a data frame, so no conversion step is needed
medals <- read.csv("summer_winter_olympics.csv", sep = ",", header = TRUE)
# check class
class(medals)
## [1] "data.frame"
# stop early if the import did not produce a data frame
stopifnot(is.data.frame(medals))
# number of variables (columns) in the data frame
print(ncol(medals))
## [1] 17
# dimensions of the data frame: rows first, then columns
dim(medals)
## [1] 146 17
# column names exactly as they were imported
colnames(medals)
## [1] "X.9" "Team..IOC.code." "X..Summer"
## [4] "X" "X.1" "X.2"
## [7] "Total" "X..Winter" "X.3"
## [10] "X.4" "X.5" "Total.1"
## [13] "X..Games" "X.6" "X.7"
## [16] "X.8" "Combined.total"
# compact overview: type and first values of every column
str(medals)
## 'data.frame': 146 obs. of 17 variables:
## $ X.9 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Team..IOC.code.: Factor w/ 146 levels " Afghanistan (AFG)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ X..Summer : int 13 12 23 5 2 25 26 5 15 8 ...
## $ X : int 0 5 18 1 3 138 18 6 5 0 ...
## $ X.1 : int 0 2 24 2 4 153 33 5 2 0 ...
## $ X.2 : int 2 8 28 9 5 177 35 15 5 1 ...
## $ Total : int 2 15 70 12 12 468 86 26 12 1 ...
## $ X..Winter : int 0 3 18 6 0 18 22 5 0 0 ...
## $ X.3 : int 0 0 0 0 0 5 59 0 0 0 ...
## $ X.4 : int 0 0 0 0 0 3 78 0 0 0 ...
## $ X.5 : int 0 0 0 0 0 4 81 0 0 0 ...
## $ Total.1 : int 0 0 0 0 0 12 218 0 0 0 ...
## $ X..Games : int 13 15 41 11 2 43 48 10 15 8 ...
## $ X.6 : int 0 5 18 1 3 143 77 6 5 0 ...
## $ X.7 : int 0 2 24 2 4 156 111 5 2 0 ...
## $ X.8 : int 2 8 28 9 5 181 116 15 5 1 ...
## $ Combined.total : int 2 15 70 12 12 480 304 26 12 1 ...
# rename the columns in the data frame
# prefixes: s_ = summer, w_ = winter, sw_ = summer and winter combined
# the first column looks like a running row index (1, 2, 3, ...) -- TODO
# confirm against the CSV; it is called "row_id" instead of "NA" so that it
# cannot be mistaken for a missing value and needs no backticks to refer to
medal_names <- c(
  "row_id", "country",
  "s_games", "s_gold", "s_silver", "s_bronze", "s_total",
  "w_games", "w_gold", "w_silver", "w_bronze", "w_total",
  "sw_games", "sw_gold", "sw_silver", "sw_bronze", "sw_total"
)
# fail fast if the number of names does not match the number of columns
stopifnot(length(medal_names) == ncol(medals))
names(medals) <- medal_names
dim(medals)
## [1] 146 17
names(medals)
## [1] "row_id" "country" "s_games" "s_gold" "s_silver"
## [6] "s_bronze" "s_total" "w_games" "w_gold" "w_silver"
## [11] "w_bronze" "w_total" "sw_games" "sw_gold" "sw_silver"
## [16] "sw_bronze" "sw_total"
# View to see that column names have been changed
#View(medals)
# attach data for easier use
# attach() places a copy of the columns on the search path; running the
# script again would stack another copy on top and produce masking messages,
# so any copy left over from an earlier run is removed first
while ("medals" %in% search()) {
  detach("medals", character.only = TRUE)
}
attach(medals)
# do some summary tables
table(s_games)
## s_games
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 3 2 6 1 17 3 1 7 8 2 7 10 13 5 8 11 4 2 3 5 4 5 3 2 5
## 26 27
## 5 4
# histogram of the number of summer games each country participated in
with(medals, hist(as.numeric(s_games), breaks = 10))

# copy the plot on the current device to a PDF file
dev.copy2pdf(file = "histogram.pdf", width = 7, height = 5)
## quartz_off_screen
## 2
# the same histogram, this time requesting 20 bins instead of 10
with(medals, hist(as.numeric(s_games), breaks = 20))

dev.copy2pdf(file = "histograms_compare.pdf", width = 7, height = 5)
## quartz_off_screen
## 2
# for comparison: the same two histograms for the winter games
with(medals, hist(w_games, breaks = 10))

with(medals, hist(w_games, breaks = 20))

# summer and winter games next to each other (one row, two panels)
with(medals, {
  par(mfrow = c(1, 2))
  hist(as.numeric(s_games), breaks = 20)
  hist(as.numeric(w_games), breaks = 20)
})

# distribution of the total medal count per row of the data,
# summer (left panel) next to winter (right panel)
with(medals, {
  par(mfrow = c(1, 2))
  hist(s_total, breaks = 20)
  hist(w_total, breaks = 20)
})

# back to a single graph per page
par(mfrow = c(1, 1))
# scatter plot of summer vs winter medal totals: do they go together?
with(medals, plot(s_total, w_total, type = "p"))

# same question for the number of games each country competes in:
# summer on the x axis, winter on the y axis
with(medals, plot(s_games, w_games, type = "p"))

# distribution of each medal type by season on a 2 x 3 grid:
# top row summer, bottom row winter; columns gold, silver, bronze
with(medals, {
  par(mfrow = c(2, 3))
  hist(s_gold, breaks = 20)
  hist(s_silver, breaks = 20)
  hist(s_bronze, breaks = 20)
  hist(w_gold, breaks = 20)
  hist(w_silver, breaks = 20)
  hist(w_bronze, breaks = 20)
})

# the same grid again, requesting 10 bins instead of 20
with(medals, {
  par(mfrow = c(2, 3))
  hist(s_gold, breaks = 10)
  hist(s_silver, breaks = 10)
  hist(s_bronze, breaks = 10)
  hist(w_gold, breaks = 10)
  hist(w_silver, breaks = 10)
  hist(w_bronze, breaks = 10)
})

# extra exploration
# relation between number of games competed in and total medals won,
# summer in the left panel and winter in the right panel
par(mfrow = c(1, 2))
with(medals, plot(s_games, s_total, type = "p"))
with(medals, plot(w_games, w_total, type = "p"))
# reset to one graph per page so the two-panel layout does not stay active
# for any plot drawn after this point
par(mfrow = c(1, 1))

# countries with zero winter olympic games
# dim() prints rows then columns, so the first number is the country count
no_winter <- medals[which(medals$w_games == 0), ]
dim(no_winter)
## [1] 45 17
# countries with zero summer olympic games
no_summer <- medals[which(medals$s_games == 0), ]
dim(no_summer)
## [1] 0 17