# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      6
# Topic:        R - Messy Data
# Last updated: 4/17/2014
# Data:         summer_winter_olympics.csv
# Describe:     all-time olympic medal data
# Show the directory R is currently working in.
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Switch to the workshop folder that holds summer_winter_olympics.csv.
# The path is specific to the author's machine, so only change directory when
# it actually exists; otherwise stay where we are and warn. That way the
# script still runs when started from a directory that contains the data file.
workshop_dir <- "~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
if (dir.exists(workshop_dir)) {
  setwd(workshop_dir)
} else {
  warning("Workshop directory not found: ", workshop_dir,
          "; staying in ", getwd(), call. = FALSE)
}

# Source:
# http://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table

# Read the comma-separated medal table; its first line holds the column headers.
medals <- read.csv(
  file = "summer_winter_olympics.csv",
  header = TRUE,
  sep = ","
)

# open and look at data
#View(medals)

# Number of variables = number of columns in the data frame.
print(ncol(medals))
## [1] 17
# Shape of the data frame as (rows, columns).
dim(medals)
## [1] 146  17
# Column names exactly as read.csv generated them from the header row.
colnames(medals)
##  [1] "X.9"             "Team..IOC.code." "X..Summer"      
##  [4] "X"               "X.1"             "X.2"            
##  [7] "Total"           "X..Winter"       "X.3"            
## [10] "X.4"             "X.5"             "Total.1"        
## [13] "X..Games"        "X.6"             "X.7"            
## [16] "X.8"             "Combined.total"
# Compact overview: type and first few values of every column.
str(medals)
## 'data.frame':    146 obs. of  17 variables:
##  $ X.9            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Team..IOC.code.: Factor w/ 146 levels " Afghanistan (AFG)",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ X..Summer      : int  13 12 23 5 2 25 26 5 15 8 ...
##  $ X              : int  0 5 18 1 3 138 18 6 5 0 ...
##  $ X.1            : int  0 2 24 2 4 153 33 5 2 0 ...
##  $ X.2            : int  2 8 28 9 5 177 35 15 5 1 ...
##  $ Total          : int  2 15 70 12 12 468 86 26 12 1 ...
##  $ X..Winter      : int  0 3 18 6 0 18 22 5 0 0 ...
##  $ X.3            : int  0 0 0 0 0 5 59 0 0 0 ...
##  $ X.4            : int  0 0 0 0 0 3 78 0 0 0 ...
##  $ X.5            : int  0 0 0 0 0 4 81 0 0 0 ...
##  $ Total.1        : int  0 0 0 0 0 12 218 0 0 0 ...
##  $ X..Games       : int  13 15 41 11 2 43 48 10 15 8 ...
##  $ X.6            : int  0 5 18 1 3 143 77 6 5 0 ...
##  $ X.7            : int  0 2 24 2 4 156 111 5 2 0 ...
##  $ X.8            : int  2 8 28 9 5 181 116 15 5 1 ...
##  $ Combined.total : int  2 15 70 12 12 480 304 26 12 1 ...
# Rename the columns of the data frame.
# The file has 17 columns and the first one ("X.9") is an unnamed row index,
# so it needs a name of its own. Supplying only 16 names shifts every label
# one column to the left (e.g. "s_games" lands on the team-name column) and
# leaves the last column named NA.
# NOTE: console output recorded further down in this file (the table of
# s_games listing team names, the no_winter / no_summer dimensions) was
# captured with the shifted names and will differ once this block is run.
col_names <- c("row_id", "country",
               "s_games", "s_gold", "s_silver", "s_bronze", "s_total",
               "w_games", "w_gold", "w_silver", "w_bronze", "w_total",
               "sw_games", "sw_gold", "sw_silver", "sw_bronze", "sw_total")
# Fail loudly if the file layout ever stops matching the names above.
stopifnot(length(col_names) == ncol(medals))
names(medals) <- col_names
dim(medals)
## [1] 146  17
names(medals)
##  [1] "row_id"    "country"   "s_games"   "s_gold"    "s_silver" 
##  [6] "s_bronze"  "s_total"   "w_games"   "w_gold"    "w_silver" 
## [11] "w_bronze"  "w_total"   "sw_games"  "sw_gold"   "sw_silver"
## [16] "sw_bronze" "sw_total"
# View to see that column names have been changed
#View(medals)

# attach data for easier use
# attach() places the columns of medals on the search path, so the lines
# below can refer to them by bare name (s_games, w_total, ...).
# NOTE(review): the attached columns are a snapshot taken at this point;
# later changes to medals are not visible through the bare names unless the
# data frame is detached and attached again.
attach(medals)

# do some summary tables
# frequency table: how often each distinct value of s_games occurs
table(s_games)
## s_games
##                       Afghanistan (AFG) 
##                                       1 
##                           Algeria (ALG) 
##                                       1 
##                         Argentina (ARG) 
##                                       1 
##                           Armenia (ARM) 
##                                       1 
##                       Australasia (ANZ) 
##                                       1 
##                         Australia (AUS) 
##                                       1 
##                           Austria (AUT) 
##                                       1 
##                        Azerbaijan (AZE) 
##                                       1 
##                           Bahamas (BAH) 
##                                       1 
##                           Bahrain (BRN) 
##                                       1 
##                          Barbados (BAR) 
##                                       1 
##                           Belarus (BLR) 
##                                       1 
##                           Belgium (BEL) 
##                                       1 
##                           Bermuda (BER) 
##                                       1 
##                           Bohemia (BOH) 
##                                       1 
##                          Botswana (BOT) 
##                                       1 
##                            Brazil (BRA) 
##                                       1 
##               British West Indies (BWI) 
##                                       1 
##                          Bulgaria (BUL) 
##                                       1 
##                           Burundi (BDI) 
##                                       1 
##                          Cameroon (CMR) 
##                                       1 
##                            Canada (CAN) 
##                                       1 
##                             Chile (CHI) 
##                                       1 
##                             China (CHN) 
##                                       1 
##                    Chinese Taipei (TPE) 
##                                       1 
##                          Colombia (COL) 
##                                       1 
##                        Costa Rica (CRC) 
##                                       1 
##                           Croatia (CRO) 
##                                       1 
##                              Cuba (CUB) 
##                                       1 
##                            Cyprus (CYP) 
##                                       1 
##                    Czech Republic (CZE) 
##                                       1 
##                    Czechoslovakia (TCH) 
##                                       1 
##                           Denmark (DEN) 
##                                       1 
##                          Djibouti (DJI) 
##                                       1 
##                Dominican Republic (DOM) 
##                                       1 
##                      East Germany (GDR) 
##                                       1 
##                           Ecuador (ECU) 
##                                       1 
##                             Egypt (EGY) 
##                                       1 
##                           Eritrea (ERI) 
##                                       1 
##                           Estonia (EST) 
##                                       1 
##                          Ethiopia (ETH) 
##                                       1 
##                           Finland (FIN) 
##                                       1 
##                            France (FRA) 
##                                       1 
##                             Gabon (GAB) 
##                                       1 
##                           Georgia (GEO) 
##                                       1 
##                          Germany (GER)  
##                                       1 
##                             Ghana (GHA) 
##                                       1 
##                     Great Britain (GBR) 
##                                       1 
##                            Greece (GRE) 
##                                       1 
##                           Grenada (GRN) 
##                                       1 
##                         Guatemala (GUA) 
##                                       1 
##                            Guyana (GUY) 
##                                       1 
##                             Haiti (HAI) 
##                                       1 
##                        Hong Kong (HKG)  
##                                       1 
##                           Hungary (HUN) 
##                                       1 
##                           Iceland (ISL) 
##                                       1 
##  Independent Olympic Participants (IOP) 
##                                       1 
##                            India (IND)  
##                                       1 
##                         Indonesia (INA) 
##                                       1 
##                              Iran (IRI) 
##                                       1 
##                              Iraq (IRQ) 
##                                       1 
##                           Ireland (IRL) 
##                                       1 
##                            Israel (ISR) 
##                                       1 
##                             Italy (ITA) 
##                                       1 
##                      Ivory Coast (CIV)  
##                                       1 
##                           Jamaica (JAM) 
##                                       1 
##                             Japan (JPN) 
##                                       1 
##                        Kazakhstan (KAZ) 
##                                       1 
##                             Kenya (KEN) 
##                                       1 
##                            Kuwait (KUW) 
##                                       1 
##                        Kyrgyzstan (KGZ) 
##                                       1 
##                            Latvia (LAT) 
##                                       1 
##                           Lebanon (LIB) 
##                                       1 
##                     Liechtenstein (LIE) 
##                                       1 
##                         Lithuania (LTU) 
##                                       1 
##                        Luxembourg (LUX) 
##                                       1 
##                         Macedonia (MKD) 
##                                       1 
##                          Malaysia (MAS) 
##                                       1 
##                         Mauritius (MRI) 
##                                       1 
##                            Mexico (MEX) 
##                                       1 
##                        Mixed team (ZZX) 
##                                       1 
##                           Moldova (MDA) 
##                                       1 
##                          Mongolia (MGL) 
##                                       1 
##                        Montenegro (MNE) 
##                                       1 
##                           Morocco (MAR) 
##                                       1 
##                        Mozambique (MOZ) 
##                                       1 
##                           Namibia (NAM) 
##                                       1 
##                       Netherlands (NED) 
##                                       1 
##              Netherlands Antilles (AHO) 
##                                       1 
##                       New Zealand (NZL) 
##                                       1 
##                             Niger (NIG) 
##                                       1 
##                           Nigeria (NGR) 
##                                       1 
##                       North Korea (PRK) 
##                                       1 
##                            Norway (NOR) 
##                                       1 
##                          Pakistan (PAK) 
##                                       1 
##                            Panama (PAN) 
##                                       1 
##                          Paraguay (PAR) 
##                                       1 
##                              Peru (PER) 
##                                       1 
##                       Philippines (PHI) 
##                                       1 
##                            Poland (POL) 
##                                       1 
##                          Portugal (POR) 
##                                       1 
##                       Puerto Rico (PUR) 
##                                       1 
##                             Qatar (QAT) 
##                                       1 
##                           Romania (ROU) 
##                                       1 
##                            Russia (RUS) 
##                                       1 
##                    Russian Empire (RU1) 
##                                       1 
##                      Saudi Arabia (KSA) 
##                                       1 
##                           Senegal (SEN) 
##                                       1 
##                            Serbia (SRB) 
##                                       1 
##             Serbia and Montenegro (SCG) 
##                                       1 
##                         Singapore (SIN) 
##                                       1 
##                          Slovakia (SVK) 
##                                       1 
##                          Slovenia (SLO) 
##                                       1 
##                      South Africa (RSA) 
##                                       1 
##                       South Korea (KOR) 
##                                       1 
##                     Soviet Union (URS)  
##                                       1 
##                             Spain (ESP) 
##                                       1 
##                         Sri Lanka (SRI) 
##                                       1 
##                             Sudan (SUD) 
##                                       1 
##                          Suriname (SUR) 
##                                       1 
##                            Sweden (SWE) 
##                                       1 
##                       Switzerland (SUI) 
##                                       1 
##                             Syria (SYR) 
##                                       1 
##                        Tajikistan (TJK) 
##                                       1 
##                          Tanzania (TAN) 
##                                       1 
##                          Thailand (THA) 
##                                       1 
##                              Togo (TOG) 
##                                       1 
##                             Tonga (TGA) 
##                                       1 
##               Trinidad and Tobago (TRI) 
##                                       1 
##                           Tunisia (TUN) 
##                                       1 
##                            Turkey (TUR) 
##                                       1 
##                            Uganda (UGA) 
##                                       1 
##                           Ukraine (UKR) 
##                                       1 
##                      Unified Team (EUN) 
##                                       1 
##           Unified Team of Germany (EUA) 
##                                       1 
##              United Arab Emirates (UAE) 
##                                       1 
##                     United States (USA) 
##                                       1 
##                           Uruguay (URU) 
##                                       1 
##                        Uzbekistan (UZB) 
##                                       1 
##                         Venezuela (VEN) 
##                                       1 
##                           Vietnam (VIE) 
##                                       1 
##                    Virgin Islands (ISV) 
##                                       1 
##                      West Germany (FRG) 
##                                       1 
##                        Yugoslavia (YUG) 
##                                       1 
##                            Zambia (ZAM) 
##                                       1 
##                          Zimbabwe (ZIM) 
##                                       1
# Histogram of the number of Summer Games each country took part in,
# drawn with roughly 10 bins and saved as a 7 x 5 inch PDF.
hist(x = as.numeric(s_games), breaks = 10)

dev.copy2pdf(width = 7, height = 5, file = "histogram.pdf")
## quartz_off_screen 
##                 2
# Same variable again with roughly 20 bins, saved to a second PDF.
hist(x = as.numeric(s_games), breaks = 20)

dev.copy2pdf(width = 7, height = 5, file = "histograms_compare.pdf")
## quartz_off_screen 
##                 2
# Winter Games counterpart: draw the histogram once per bin setting
# (about 10 bins first, then about 20).
for (n_bins in c(10, 20)) {
  hist(w_games, breaks = n_bins)
}

# Split the plotting device into one row of two panels so that the summer
# and winter participation histograms sit next to each other.
par(mfrow = c(1, 2))
hist(x = as.numeric(s_games), breaks = 20)
hist(x = as.numeric(w_games), breaks = 20)

# Same two-panel layout, this time comparing the total medal counts
# of the summer and winter games.
par(mfrow = c(1, 2))
hist(x = s_total, breaks = 20)
hist(x = w_total, breaks = 20)

# Back to a single plot per page.
par(mfrow = c(1, 1))

# Scatter plot: do countries with many summer medals also have many
# winter medals?
plot(x = s_total, y = w_total, type = "p")

# Same question for participation: number of Summer Games attended
# plotted against number of Winter Games attended.
plot(x = s_games, y = w_games, type = "p")

# Distribution of each medal type by season on a 2 x 3 grid:
# top row summer (gold, silver, bronze), bottom row winter.
# The grid is drawn twice, first with about 20 bins and then with about 10.
for (n_bins in c(20, 10)) {
  par(mfrow = c(2, 3))

  hist(s_gold, breaks = n_bins)
  hist(s_silver, breaks = n_bins)
  hist(s_bronze, breaks = n_bins)

  hist(w_gold, breaks = n_bins)
  hist(w_silver, breaks = n_bins)
  hist(w_bronze, breaks = n_bins)
}

# Extra exploration:
# relationship between the number of games a country competed in and the
# total medals it won, shown side by side (summer on the left, winter on
# the right).
par(mfrow = c(1, 2))
plot(x = s_games, y = s_total, type = "p")
plot(x = w_games, y = w_total, type = "p")

# Countries with no Winter Games appearances: keep the rows where w_games
# is zero (which() also drops rows where the comparison is NA); the first
# number printed by dim() is the count of such countries.
no_winter <- medals[which(medals$w_games == 0), , drop = FALSE]
dim(no_winter)
## [1]  1 17
# Same selection for countries with no Summer Games appearances.
no_summer <- medals[which(medals$s_games == 0), , drop = FALSE]
dim(no_summer)
## [1]  0 17
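# Should any of the variables be factors?
# Answer: Only the team/country name is categorical (str() above shows it was
# imported as a Factor). All games and medal columns are numeric counts and
# should stay numeric rather than being converted to factors.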
# END