# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      6
# Topic:        R - Messy Data
# Last updated: 4/12/2017
# Data:         summer_winter_olympics.csv
# Describe:     all-time olympic medal data

# Read the medal table from a comma-separated file whose first row holds the
# column headers. The relative path is resolved against the working directory.
medals <- read.csv(file = "summer_winter_olympics.csv", header = TRUE, sep = ",")

# What kind of object did the import produce?
class(medals)
## [1] "data.frame"
# Coerce to a data frame (the class check above shows it already is one).
medals <- as.data.frame(medals)

# Number of variables: a data frame stores one variable per column.
print(ncol(medals))
## [1] 17
# Size of the data frame as (rows, columns).
dim(medals)
## [1] 146  17
# Column names as they came out of the import.
colnames(medals)
##  [1] "X.9"             "Team..IOC.code." "X..Summer"      
##  [4] "X"               "X.1"             "X.2"            
##  [7] "Total"           "X..Winter"       "X.3"            
## [10] "X.4"             "X.5"             "Total.1"        
## [13] "X..Games"        "X.6"             "X.7"            
## [16] "X.8"             "Combined.total"
# Compact overview: type and first few values of every column.
str(medals)
## 'data.frame':    146 obs. of  17 variables:
##  $ X.9            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Team..IOC.code.: Factor w/ 146 levels " Afghanistan (AFG)",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ X..Summer      : int  13 12 23 5 2 25 26 5 15 8 ...
##  $ X              : int  0 5 18 1 3 138 18 6 5 0 ...
##  $ X.1            : int  0 2 24 2 4 153 33 5 2 0 ...
##  $ X.2            : int  2 8 28 9 5 177 35 15 5 1 ...
##  $ Total          : int  2 15 70 12 12 468 86 26 12 1 ...
##  $ X..Winter      : int  0 3 18 6 0 18 22 5 0 0 ...
##  $ X.3            : int  0 0 0 0 0 5 59 0 0 0 ...
##  $ X.4            : int  0 0 0 0 0 3 78 0 0 0 ...
##  $ X.5            : int  0 0 0 0 0 4 81 0 0 0 ...
##  $ Total.1        : int  0 0 0 0 0 12 218 0 0 0 ...
##  $ X..Games       : int  13 15 41 11 2 43 48 10 15 8 ...
##  $ X.6            : int  0 5 18 1 3 143 77 6 5 0 ...
##  $ X.7            : int  0 2 24 2 4 156 111 5 2 0 ...
##  $ X.8            : int  2 8 28 9 5 181 116 15 5 1 ...
##  $ Combined.total : int  2 15 70 12 12 480 304 26 12 1 ...
# Rename the columns in the data frame.
# Prefixes: s_ = summer, w_ = winter, sw_ = both seasons combined.
# The first column (X.9 in the import, holding 1, 2, 3, ...) is labelled with
# the literal string "NA" -- a name, not a missing value.
names(medals) <- local({
  new_names <- c(
    "NA", "country",
    "s_games", "s_gold", "s_silver", "s_bronze", "s_total",
    "w_games", "w_gold", "w_silver", "w_bronze", "w_total",
    "sw_games", "sw_gold", "sw_silver", "sw_bronze", "sw_total"
  )
  # Supplying fewer names than there are columns would silently leave the
  # extra columns with NA names, so require an exact match before renaming.
  stopifnot(length(new_names) == ncol(medals))
  new_names
})
dim(medals)
## [1] 146  17
names(medals)
##  [1] "NA"        "country"   "s_games"   "s_gold"    "s_silver" 
##  [6] "s_bronze"  "s_total"   "w_games"   "w_gold"    "w_silver" 
## [11] "w_bronze"  "w_total"   "sw_games"  "sw_gold"   "sw_silver"
## [16] "sw_bronze" "sw_total"
# View to see that column names have been changed
#View(medals)

# Exploratory tables and plots.
#
# Columns are reached through with(medals, ...) rather than attach(medals):
# each expression is evaluated against the current contents of `medals`,
# nothing is added to the search path, and the default titles, axis labels
# and table headers still show the bare column names.

# do some summary tables
with(medals, table(s_games))
## s_games
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  3  2  6  1 17  3  1  7  8  2  7 10 13  5  8 11  4  2  3  5  4  5  3  2  5 
## 26 27 
##  5  4
# let's do histogram of how many summer games in which each country participated
with(medals, hist(as.numeric(s_games), breaks = 10))

# copy whatever is currently drawn on the active device into a PDF file
dev.copy2pdf(file = "histogram.pdf", width = 7, height = 5)
## quartz_off_screen 
##                 2
with(medals, hist(as.numeric(s_games), breaks = 20))

# NOTE(review): despite its name, this file receives only the breaks = 20
# histogram that is on the device at this point -- confirm whether a
# side-by-side comparison figure was intended.
dev.copy2pdf(file = "histograms_compare.pdf", width = 7, height = 5)
## quartz_off_screen 
##                 2
# let's compare histogram with winter games
with(medals, hist(w_games, breaks = 10))

with(medals, hist(w_games, breaks = 20))

# let's put the two histograms side by side for easier comparison
par(mfrow = c(1, 2))
with(medals, {
  hist(as.numeric(s_games), breaks = 20)
  hist(as.numeric(w_games), breaks = 20)
})

# let's compare how many total medals are given out in each of the season games
par(mfrow = c(1, 2))
with(medals, {
  hist(s_total, breaks = 20)
  hist(w_total, breaks = 20)
})

# let's reset to one graph per page
par(mfrow = c(1, 1))

# is there a correlation between number of medals given out in winter and summer?
with(medals, plot(s_total, w_total, type = "p"))

# how about number of games each country competes in.
# Is there correlation between winter and summer?
with(medals, plot(s_games, w_games, type = "p"))

# let's look at distribution of each of the types of medals, by season
# (2 x 3 grid: summer medals on the top row, winter medals on the bottom row)
par(mfrow = c(2, 3))
with(medals, {
  hist(s_gold, breaks = 20)
  hist(s_silver, breaks = 20)
  hist(s_bronze, breaks = 20)

  hist(w_gold, breaks = 20)
  hist(w_silver, breaks = 20)
  hist(w_bronze, breaks = 20)
})

# same grid again with coarser bins
par(mfrow = c(2, 3))
with(medals, {
  hist(s_gold, breaks = 10)
  hist(s_silver, breaks = 10)
  hist(s_bronze, breaks = 10)

  hist(w_gold, breaks = 10)
  hist(w_silver, breaks = 10)
  hist(w_bronze, breaks = 10)
})

# extra exploration
# let's look at correlation between number of games competed, and total medals won,
# by summer and winter
par(mfrow = c(1, 2))
with(medals, {
  plot(s_games, s_total, type = "p")
  plot(w_games, w_total, type = "p")
})

# reset to one graph per page so that later plots are not squeezed into the
# 1 x 2 layout left over from the figure above
par(mfrow = c(1, 1))

# Countries with zero winter-games appearances; the first number printed by
# dim() is the count. which() discards NA comparisons, so only rows whose
# value is definitely 0 are kept.
no_winter <- medals[which(medals$w_games == 0), ]
dim(no_winter)
## [1] 45 17
# Countries with zero summer-games appearances, counted the same way.
no_summer <- medals[which(medals$s_games == 0), ]
dim(no_summer)
## [1]  0 17