# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 6
# Topic: R - Messy Data
# Last updated: 4/17/2014
# Data: summer_winter_olympics.csv
# Describe: all-time olympic medal data
# Report the directory R is currently working in.
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Switch to the workshop folder only when it exists on this machine.
# An unconditional setwd() stops the script with an error wherever the
# folder is missing; when it is skipped, the data file is looked up relative
# to the current working directory instead.
workshop_dir <- "~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
if (dir.exists(workshop_dir)) {
  setwd(workshop_dir)
}
# Source:
# http://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table
# import data
# The file is comma separated with a header row.
# stringsAsFactors = TRUE keeps text columns as factors, matching the str()
# listing recorded below ("Factor w/ 146 levels"). Since R 4.0 read.csv()
# leaves strings as character by default, which would no longer match that
# listing or the as.numeric() calls applied to the column later on.
medals <- read.csv(
  "summer_winter_olympics.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# open and look at data
#View(medals)
# Number of variables: a data frame has one column per variable.
print(ncol(medals))
## [1] 17
# Rows first, then columns.
dim(medals)
## [1] 146 17
# Column names exactly as read.csv() generated them from the header row.
colnames(medals)
## [1] "X.9" "Team..IOC.code." "X..Summer"
## [4] "X" "X.1" "X.2"
## [7] "Total" "X..Winter" "X.3"
## [10] "X.4" "X.5" "Total.1"
## [13] "X..Games" "X.6" "X.7"
## [16] "X.8" "Combined.total"
# Compact overview: type and first few values of every column.
str(medals)
## 'data.frame': 146 obs. of 17 variables:
## $ X.9 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Team..IOC.code.: Factor w/ 146 levels " Afghanistan (AFG)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ X..Summer : int 13 12 23 5 2 25 26 5 15 8 ...
## $ X : int 0 5 18 1 3 138 18 6 5 0 ...
## $ X.1 : int 0 2 24 2 4 153 33 5 2 0 ...
## $ X.2 : int 2 8 28 9 5 177 35 15 5 1 ...
## $ Total : int 2 15 70 12 12 468 86 26 12 1 ...
## $ X..Winter : int 0 3 18 6 0 18 22 5 0 0 ...
## $ X.3 : int 0 0 0 0 0 5 59 0 0 0 ...
## $ X.4 : int 0 0 0 0 0 3 78 0 0 0 ...
## $ X.5 : int 0 0 0 0 0 4 81 0 0 0 ...
## $ Total.1 : int 0 0 0 0 0 12 218 0 0 0 ...
## $ X..Games : int 13 15 41 11 2 43 48 10 15 8 ...
## $ X.6 : int 0 5 18 1 3 143 77 6 5 0 ...
## $ X.7 : int 0 2 24 2 4 156 111 5 2 0 ...
## $ X.8 : int 2 8 28 9 5 181 116 15 5 1 ...
## $ Combined.total : int 2 15 70 12 12 480 304 26 12 1 ...
# Rename the columns in the data frame.
# The data frame has 17 columns, so 17 names are required. The first column
# (imported as "X.9") holds 1, 2, 3, ... in the str() listing -- presumably a
# running row number; confirm against the CSV. Supplying only 16 names shifts
# every name one column to the left (the team label ends up under "s_games")
# and leaves the last column named NA.
medal_columns <- c(
  "row_id", "country",
  "s_games", "s_gold", "s_silver", "s_bronze", "s_total",
  "w_games", "w_gold", "w_silver", "w_bronze", "w_total",
  "sw_games", "sw_gold", "sw_silver", "sw_bronze", "sw_total"
)
# Fail loudly if the file layout ever stops matching the name list.
stopifnot(length(medal_columns) == ncol(medals))
names(medals) <- medal_columns
dim(medals)
## [1] 146 17
names(medals)
## [1] "row_id" "country" "s_games" "s_gold" "s_silver"
## [6] "s_bronze" "s_total" "w_games" "w_gold" "w_silver"
## [11] "w_bronze" "w_total" "sw_games" "sw_gold" "sw_silver"
## [16] "sw_bronze" "sw_total"
# View to see that column names have been changed
#View(medals)
# attach data for easier use
# (kept because the statements further down refer to the columns by bare name)
attach(medals)
# do some summary tables
# NOTE: the output recorded directly below, and the dim() results recorded
# near the end of the script, were captured while the names were still
# shifted by one column (s_games held the team labels). Re-run to refresh.
table(s_games)
## s_games
##  Afghanistan (AFG)
## 1
##  Algeria (ALG)
## 1
##  Argentina (ARG)
## 1
##  Armenia (ARM)
## 1
##  Australasia (ANZ)
## 1
##  Australia (AUS)
## 1
##  Austria (AUT)
## 1
##  Azerbaijan (AZE)
## 1
##  Bahamas (BAH)
## 1
##  Bahrain (BRN)
## 1
##  Barbados (BAR)
## 1
##  Belarus (BLR)
## 1
##  Belgium (BEL)
## 1
##  Bermuda (BER)
## 1
##  Bohemia (BOH)
## 1
##  Botswana (BOT)
## 1
##  Brazil (BRA)
## 1
##  British West Indies (BWI)
## 1
##  Bulgaria (BUL)
## 1
##  Burundi (BDI)
## 1
##  Cameroon (CMR)
## 1
##  Canada (CAN)
## 1
##  Chile (CHI)
## 1
##  China (CHN)
## 1
##  Chinese Taipei (TPE)
## 1
##  Colombia (COL)
## 1
##  Costa Rica (CRC)
## 1
##  Croatia (CRO)
## 1
##  Cuba (CUB)
## 1
##  Cyprus (CYP)
## 1
##  Czech Republic (CZE)
## 1
##  Czechoslovakia (TCH)
## 1
##  Denmark (DEN)
## 1
##  Djibouti (DJI)
## 1
##  Dominican Republic (DOM)
## 1
##  East Germany (GDR)
## 1
##  Ecuador (ECU)
## 1
##  Egypt (EGY)
## 1
##  Eritrea (ERI)
## 1
##  Estonia (EST)
## 1
##  Ethiopia (ETH)
## 1
##  Finland (FIN)
## 1
##  France (FRA)
## 1
##  Gabon (GAB)
## 1
##  Georgia (GEO)
## 1
##  Germany (GER)
## 1
##  Ghana (GHA)
## 1
##  Great Britain (GBR)
## 1
##  Greece (GRE)
## 1
##  Grenada (GRN)
## 1
##  Guatemala (GUA)
## 1
##  Guyana (GUY)
## 1
##  Haiti (HAI)
## 1
##  Hong Kong (HKG)
## 1
##  Hungary (HUN)
## 1
##  Iceland (ISL)
## 1
##  Independent Olympic Participants (IOP)
## 1
##  India (IND)
## 1
##  Indonesia (INA)
## 1
##  Iran (IRI)
## 1
##  Iraq (IRQ)
## 1
##  Ireland (IRL)
## 1
##  Israel (ISR)
## 1
##  Italy (ITA)
## 1
##  Ivory Coast (CIV)
## 1
##  Jamaica (JAM)
## 1
##  Japan (JPN)
## 1
##  Kazakhstan (KAZ)
## 1
##  Kenya (KEN)
## 1
##  Kuwait (KUW)
## 1
##  Kyrgyzstan (KGZ)
## 1
##  Latvia (LAT)
## 1
##  Lebanon (LIB)
## 1
##  Liechtenstein (LIE)
## 1
##  Lithuania (LTU)
## 1
##  Luxembourg (LUX)
## 1
##  Macedonia (MKD)
## 1
##  Malaysia (MAS)
## 1
##  Mauritius (MRI)
## 1
##  Mexico (MEX)
## 1
##  Mixed team (ZZX)
## 1
##  Moldova (MDA)
## 1
##  Mongolia (MGL)
## 1
##  Montenegro (MNE)
## 1
##  Morocco (MAR)
## 1
##  Mozambique (MOZ)
## 1
##  Namibia (NAM)
## 1
##  Netherlands (NED)
## 1
##  Netherlands Antilles (AHO)
## 1
##  New Zealand (NZL)
## 1
##  Niger (NIG)
## 1
##  Nigeria (NGR)
## 1
##  North Korea (PRK)
## 1
##  Norway (NOR)
## 1
##  Pakistan (PAK)
## 1
##  Panama (PAN)
## 1
##  Paraguay (PAR)
## 1
##  Peru (PER)
## 1
##  Philippines (PHI)
## 1
##  Poland (POL)
## 1
##  Portugal (POR)
## 1
##  Puerto Rico (PUR)
## 1
##  Qatar (QAT)
## 1
##  Romania (ROU)
## 1
##  Russia (RUS)
## 1
##  Russian Empire (RU1)
## 1
##  Saudi Arabia (KSA)
## 1
##  Senegal (SEN)
## 1
##  Serbia (SRB)
## 1
##  Serbia and Montenegro (SCG)
## 1
##  Singapore (SIN)
## 1
##  Slovakia (SVK)
## 1
##  Slovenia (SLO)
## 1
##  South Africa (RSA)
## 1
##  South Korea (KOR)
## 1
##  Soviet Union (URS)
## 1
##  Spain (ESP)
## 1
##  Sri Lanka (SRI)
## 1
##  Sudan (SUD)
## 1
##  Suriname (SUR)
## 1
##  Sweden (SWE)
## 1
##  Switzerland (SUI)
## 1
##  Syria (SYR)
## 1
##  Tajikistan (TJK)
## 1
##  Tanzania (TAN)
## 1
##  Thailand (THA)
## 1
##  Togo (TOG)
## 1
##  Tonga (TGA)
## 1
##  Trinidad and Tobago (TRI)
## 1
##  Tunisia (TUN)
## 1
##  Turkey (TUR)
## 1
##  Uganda (UGA)
## 1
##  Ukraine (UKR)
## 1
##  Unified Team (EUN)
## 1
##  Unified Team of Germany (EUA)
## 1
##  United Arab Emirates (UAE)
## 1
##  United States (USA)
## 1
##  Uruguay (URU)
## 1
##  Uzbekistan (UZB)
## 1
##  Venezuela (VEN)
## 1
##  Vietnam (VIE)
## 1
##  Virgin Islands (ISV)
## 1
##  West Germany (FRG)
## 1
##  Yugoslavia (YUG)
## 1
##  Zambia (ZAM)
## 1
##  Zimbabwe (ZIM)
## 1
# Histogram of the s_games column (summer games per country), coarse binning.
# NOTE(review): the table(s_games) output recorded above lists team names,
# which suggests s_games may not hold game counts at this point -- confirm
# the column naming before interpreting these plots.
with(medals, hist(as.numeric(s_games), breaks = 10))
# Copy the plot on the active device to a 7 x 5 inch PDF.
dev.copy2pdf(file = "histogram.pdf", width = 7, height = 5)
## quartz_off_screen
## 2
# Same column with finer binning, saved under a second file name.
with(medals, hist(as.numeric(s_games), breaks = 20))
dev.copy2pdf(file = "histograms_compare.pdf", width = 7, height = 5)
## quartz_off_screen
## 2
# Winter counterpart: w_games at the same two bin settings.
with(medals, hist(w_games, breaks = 10))
with(medals, hist(w_games, breaks = 20))
# One row, two panels: summer next to winter for easier comparison.
par(mfrow = c(1, 2))
with(medals, {
  hist(as.numeric(s_games), breaks = 20)
  hist(as.numeric(w_games), breaks = 20)
})
# Fresh two-panel page: total medals, summer next to winter.
par(mfrow = c(1, 2))
with(medals, {
  hist(s_total, breaks = 20)
  hist(w_total, breaks = 20)
})
# Back to a single plot per page.
par(mfrow = c(1, 1))
# Summer medal totals against winter medal totals: is there a relationship?
with(medals, plot(s_total, w_total, type = "p"))
# Same question for the number of games each country competed in,
# summer against winter.
with(medals, plot(s_games, w_games, type = "p"))
# Distribution of each medal type by season on a 2 x 3 grid:
# top row summer, bottom row winter; gold, silver, bronze left to right.
# First page: finer binning.
par(mfrow = c(2, 3))
with(medals, {
  hist(s_gold, breaks = 20)
  hist(s_silver, breaks = 20)
  hist(s_bronze, breaks = 20)
  hist(w_gold, breaks = 20)
  hist(w_silver, breaks = 20)
  hist(w_bronze, breaks = 20)
})
# Second page: same layout, coarser binning.
par(mfrow = c(2, 3))
with(medals, {
  hist(s_gold, breaks = 10)
  hist(s_silver, breaks = 10)
  hist(s_bronze, breaks = 10)
  hist(w_gold, breaks = 10)
  hist(w_silver, breaks = 10)
  hist(w_bronze, breaks = 10)
})
# extra exploration
# Games competed in against total medals won, one panel per season
# (summer on the left, winter on the right).
par(mfrow = c(1, 2))
with(medals, {
  plot(s_games, s_total, type = "p")
  plot(w_games, w_total, type = "p")
})
# Countries with no winter games: keep the rows where w_games equals 0.
# which() drops rows where the comparison is NA, as subset() does.
no_winter <- medals[which(medals$w_games == 0), ]
# The first number is the count of such countries.
dim(no_winter)
## [1] 1 17
# Countries with no summer games: rows where s_games equals 0.
no_summer <- medals[which(medals$s_games == 0), ]
dim(no_summer)
## [1] 0 17
# Should any of the variables be factor?
# Answer: Not the games and medal columns - they are numeric counts.
# Only the team label is categorical (str() above lists it as a Factor).
# END