# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      4 (online meeting 2)
# Topic:        Descriptive Statistics
# Last updated: 4/3/2016
# get current working directory
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# set working directory
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")

# import data
data <- read.csv("winter_olympic.csv", sep=",", header=TRUE)

# open and look at data
#View(data)

# how many variables are in the data frame? (number of columns)
print(length(data))
## [1] 7
# what are the names of the columns?
names(data)
## [1] "Rank"   "NOC"    "Gold"   "Silver" "Bronze" "Total"  "Region"
# how many countries (rows) are in the data frame?
dim(data)
## [1] 26  7
#print(dim(data))

# The data frame is currently sorted by rank
# Sort it now by the total medals won and country

# attach data frame
attach(data)

# sort by total medals and country
sort_total <- data[order(Total, NOC),]
sort_total
##    Rank                   NOC Gold Silver Bronze Total    Region
## 25   25         Croatia (CRO)    0      1      0     1    EUROPE
## 26   26      Kazakhstan (KAZ)    0      0      1     1   EURASIA
## 21   21        Slovakia (SVK)    1      0      0     1    EUROPE
## 20   20         Ukraine (UKR)    1      0      1     2   EURASIA
## 24   24       Australia (AUS)    0      2      1     3 AUSTRALIA
## 19   19   Great Britain (GBR)    1      1      2     4    EUROPE
## 23   23          Latvia (LAT)    0      2      2     4   EURASIA
## 18   18         Finland (FIN)    1      3      1     5    EUROPE
## 8     8         Belarus (BLR)    5      0      1     6   EURASIA
## 11   11          Poland (POL)    4      1      1     6    EUROPE
## 15   15  Czech Republic (CZE)    2      4      2     8    EUROPE
## 22   22           Italy (ITA)    0      2      6     8    EUROPE
## 17   17           Japan (JPN)    1      4      3     8      ASIA
## 16   16        Slovenia (SLO)    2      2      4     8    EUROPE
## 13   13     South Korea (KOR)    3      3      2     8      ASIA
## 12   12           China (CHN)    3      4      2     9      ASIA
## 7     7     Switzerland (SUI)    6      3      2    11    EUROPE
## 10   10          France (FRA)    4      4      7    15    EUROPE
## 14   14          Sweden (SWE)    2      7      6    15    EUROPE
## 9     9         Austria (AUT)    4      8      5    17    EUROPE
## 6     6         Germany (GER)    8      6      5    19    EUROPE
## 5     5     Netherlands (NED)    8      7      9    24    EUROPE
## 3     3          Canada (CAN)   10     10      5    25   NORTH_A
## 2     2          Norway (NOR)   11      5     10    26    EUROPE
## 4     4   United States (USA)    9      7     12    28   NORTH_A
## 1     1         Russia (RUS)*   13     11      9    33   EURASIA
# get summary statistics on data
#describe(data) 

# we might get error:  
# Error: could not find function "describe"

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# we get another error
# > library(Hmisc)
# Error in library(Hmisc) : there is no package called ‘Hmisc’

# Go to Packages
# Install:  Hmisc

# Let's try again
library(Hmisc)

# Now, let's try the describe() function again
describe(data)
## data 
## 
##  7  Variables      26  Observations
## ---------------------------------------------------------------------------
## Rank 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      26       0      26       1    13.5    2.25    3.50    7.25   13.50 
##     .75     .90     .95 
##   19.75   23.50   24.75 
## 
## lowest :  1  2  3  4  5, highest: 22 23 24 25 26 
## ---------------------------------------------------------------------------
## NOC 
##       n missing  unique 
##      26       0      26 
## 
## lowest :  Australia (AUS)      Austria (AUT)        Belarus (BLR)        Canada (CAN)         China (CHN)        
## highest:  South Korea (KOR)    Sweden (SWE)         Switzerland (SUI)    Ukraine (UKR)        United States (USA) 
## ---------------------------------------------------------------------------
## Gold 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      26       0      12    0.98   3.808    0.00    0.00    1.00    2.50 
##     .75     .90     .95 
##    5.75    9.50   10.75 
## 
##            0  1  2 3  4 5 6 8 9 10 11 13
## Frequency  5  5  3 2  3 1 1 2 1  1  1  1
## %         19 19 12 8 12 4 4 8 4  4  4  4
## ---------------------------------------------------------------------------
## Silver 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      26       0      11    0.99   3.731    0.00    0.00    1.25    3.00 
##     .75     .90     .95 
##    5.75    7.50    9.50 
## 
##            0  1  2  3  4 5 6  7 8 10 11
## Frequency  4  3  4  3  4 1 1  3 1  1  1
## %         15 12 15 12 15 4 4 12 4  4  4
## ---------------------------------------------------------------------------
## Bronze 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      26       0      11    0.97   3.808    0.25    1.00    1.00    2.00 
##     .75     .90     .95 
##    5.75    9.00    9.75 
## 
##           0  1  2 3 4  5 6 7 9 10 12
## Frequency 2  6  6 1 1  3 2 1 2  1  1
## %         8 23 23 4 4 12 8 4 8  4  4
## ---------------------------------------------------------------------------
## Total 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      26       0      17    0.99   11.35    1.00    1.50    4.25    8.00 
##     .75     .90     .95 
##   16.50   25.50   27.50 
## 
##            1 2 3 4 5 6  8 9 11 15 17 19 24 25 26 28 33
## Frequency  3 1 1 2 1 2  5 1  1  2  1  1  1  1  1  1  1
## %         12 4 4 8 4 8 19 4  4  8  4  4  4  4  4  4  4
## ---------------------------------------------------------------------------
## Region 
##       n missing  unique 
##      26       0       5 
## 
##           ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## Frequency    3         1       5     15       2
## %           12         4      19     58       8
## ---------------------------------------------------------------------------
# What is median of number of gold, silver, bronze and total medals?
median(data$Gold)
## [1] 2.5
median(data$Silver)
## [1] 3
median(data$Bronze)
## [1] 2
median(data$Total)
## [1] 8
"
# from past examples:
# print mean number of gold, silver and bronze medals

mean(data$Gold)
mean(data$Silver)
mean(data$Bronze)

# How many of each of the 3 types of medals were awarded?
sum(data$Gold)
sum(data$Silver)
sum(data$Bronze)

# Print list of countries that did not win each of the 3 medals
# Print number of countries that did not win each of the 3 medals
# Hint:  use subset

medal_subset = subset(x = data, subset = Gold == 0 | Silver == 0 | Bronze == 0)
medal_subset
length(medal_subset)
"
## [1] "\n# from past examples:\n# print mean number of gold, silver and bronze medals\n\nmean(data$Gold)\nmean(data$Silver)\nmean(data$Bronze)\n\n# How many of each of the 3 types of medals were awarded?\nsum(data$Gold)\nsum(data$Silver)\nsum(data$Bronze)\n\n# Print list of countries that did not win each of the 3 medals\n# Print number of countries that did not win each of the 3 medals\n# Hint:  use subset\n\nmedal_subset = subset(x = data, subset = Gold == 0 | Silver == 0 | Bronze == 0)\nmedal_subset\nlength(medal_subset)\n"
# What is IQR of Gold medals won? How about min, max, mean
# What is variance and SD of number of Gold medals won?
IQR(data$Gold)
## [1] 4.75
min(data$Gold)
## [1] 0
max(data$Gold)
## [1] 13
mean(data$Gold)
## [1] 3.807692
var(data$Gold)
## [1] 14.64154
sd(data$Gold)
## [1] 3.826426
# Look at skew of number of Gold medals
#skew(data$Gold)

library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# library not found, so will install it under Packages
library(psych)
skew(data$Gold)
## [1] 0.87898
# look at summary of data
summary(data)
##       Rank                     NOC          Gold            Silver      
##  Min.   : 1.00    Australia (AUS): 1   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 7.25    Austria (AUT)  : 1   1st Qu.: 1.000   1st Qu.: 1.250  
##  Median :13.50    Belarus (BLR)  : 1   Median : 2.500   Median : 3.000  
##  Mean   :13.50    Canada (CAN)   : 1   Mean   : 3.808   Mean   : 3.731  
##  3rd Qu.:19.75    China (CHN)    : 1   3rd Qu.: 5.750   3rd Qu.: 5.750  
##  Max.   :26.00    Croatia (CRO)  : 1   Max.   :13.000   Max.   :11.000  
##                  (Other)         :20                                    
##      Bronze           Total             Region  
##  Min.   : 0.000   Min.   : 1.00   ASIA     : 3  
##  1st Qu.: 1.000   1st Qu.: 4.25   AUSTRALIA: 1  
##  Median : 2.000   Median : 8.00   EURASIA  : 5  
##  Mean   : 3.808   Mean   :11.35   EUROPE   :15  
##  3rd Qu.: 5.750   3rd Qu.:16.50   NORTH_A  : 2  
##  Max.   :12.000   Max.   :33.00                 
## 
describe(data)
##         vars  n  mean   sd median trimmed  mad min max range  skew
## Rank       1 26 13.50 7.65   13.5   13.50 9.64   1  26    25  0.00
## NOC*       2 26 13.50 7.65   13.5   13.50 9.64   1  26    25  0.00
## Gold       3 26  3.81 3.83    2.5    3.41 2.97   0  13    13  0.88
## Silver     4 26  3.73 3.11    3.0    3.45 2.97   0  11    11  0.69
## Bronze     5 26  3.81 3.35    2.0    3.50 2.22   0  12    12  0.87
## Total      6 26 11.35 9.32    8.0   10.55 8.15   1  33    32  0.81
## Region*    7 26  3.46 1.10    4.0    3.55 0.00   1   5     4 -1.11
##         kurtosis   se
## Rank       -1.34 1.50
## NOC*       -1.34 1.50
## Gold       -0.48 0.75
## Silver     -0.51 0.61
## Bronze     -0.41 0.66
## Total      -0.63 1.83
## Region*     0.30 0.22
# look at summary by Geographic Region
describeBy(data, group=data$Region)
## group: ASIA
##         vars n  mean   sd median trimmed   mad min max range  skew
## Rank       1 3 14.00 2.65     13   14.00  1.48  12  17     5  0.32
## NOC*       2 3 13.33 8.50     13   13.33 11.86   5  22    17  0.04
## Gold       3 3  2.33 1.15      3    2.33  0.00   1   3     2 -0.38
## Silver     4 3  3.67 0.58      4    3.67  0.00   3   4     1 -0.38
## Bronze     5 3  2.33 0.58      2    2.33  0.00   2   3     1  0.38
## Total      6 3  8.33 0.58      8    8.33  0.00   8   9     1  0.38
## Region*    7 3  1.00 0.00      1    1.00  0.00   1   1     0   NaN
##         kurtosis   se
## Rank       -2.33 1.53
## NOC*       -2.33 4.91
## Gold       -2.33 0.67
## Silver     -2.33 0.33
## Bronze     -2.33 0.33
## Total      -2.33 0.33
## Region*      NaN 0.00
## -------------------------------------------------------- 
## group: AUSTRALIA
##         vars n mean sd median trimmed mad min max range skew kurtosis se
## Rank       1 1   24 NA     24      24   0  24  24     0   NA       NA NA
## NOC*       2 1    1 NA      1       1   0   1   1     0   NA       NA NA
## Gold       3 1    0 NA      0       0   0   0   0     0   NA       NA NA
## Silver     4 1    2 NA      2       2   0   2   2     0   NA       NA NA
## Bronze     5 1    1 NA      1       1   0   1   1     0   NA       NA NA
## Total      6 1    3 NA      3       3   0   3   3     0   NA       NA NA
## Region*    7 1    2 NA      2       2   0   2   2     0   NA       NA NA
## -------------------------------------------------------- 
## group: EURASIA
##         vars n mean    sd median trimmed  mad min max range  skew kurtosis
## Rank       1 5 15.6 10.64     20    15.6 8.90   1  26    25 -0.32    -2.01
## NOC*       2 5 15.2  8.07     15    15.2 5.93   3  25    22 -0.31    -1.51
## Gold       3 5  3.8  5.54      1     3.8 1.48   0  13    13  0.76    -1.38
## Silver     4 5  2.6  4.77      0     2.6 0.00   0  11    11  0.99    -1.03
## Bronze     5 5  2.8  3.49      1     2.8 0.00   1   9     8  1.03    -0.97
## Total      6 5  9.2 13.44      4     9.2 2.97   1  33    32  1.02    -0.99
## Region*    7 5  3.0  0.00      3     3.0 0.00   3   3     0   NaN      NaN
##           se
## Rank    4.76
## NOC*    3.61
## Gold    2.48
## Silver  2.14
## Bronze  1.56
## Total   6.01
## Region* 0.00
## -------------------------------------------------------- 
## group: EUROPE
##         vars  n  mean   sd median trimmed   mad min max range skew
## Rank       1 15 13.33 6.87     14   13.31  7.41   2  25    23 0.04
## NOC*       2 15 13.60 6.76     12   13.69  7.41   2  24    22 0.03
## Gold       3 15  3.60 3.31      2    3.31  2.97   0  11    11 0.79
## Silver     4 15  3.60 2.53      3    3.54  2.97   0   8     8 0.30
## Bronze     5 15  4.00 3.18      4    3.85  2.97   0  10    10 0.37
## Total      6 15 11.20 7.85      8   10.85 10.38   1  26    25 0.43
## Region*    7 15  4.00 0.00      4    4.00  0.00   4   4     0  NaN
##         kurtosis   se
## Rank       -1.34 1.77
## NOC*       -1.41 1.75
## Gold       -0.63 0.86
## Silver     -1.38 0.65
## Bronze     -1.21 0.82
## Total      -1.12 2.03
## Region*      NaN 0.00
## -------------------------------------------------------- 
## group: NORTH_A
##         vars n mean    sd median trimmed   mad min max range skew kurtosis
## Rank       1 2  3.5  0.71    3.5     3.5  0.74   3   4     1    0    -2.75
## NOC*       2 2 15.0 15.56   15.0    15.0 16.31   4  26    22    0    -2.75
## Gold       3 2  9.5  0.71    9.5     9.5  0.74   9  10     1    0    -2.75
## Silver     4 2  8.5  2.12    8.5     8.5  2.22   7  10     3    0    -2.75
## Bronze     5 2  8.5  4.95    8.5     8.5  5.19   5  12     7    0    -2.75
## Total      6 2 26.5  2.12   26.5    26.5  2.22  25  28     3    0    -2.75
## Region*    7 2  5.0  0.00    5.0     5.0  0.00   5   5     0  NaN      NaN
##           se
## Rank     0.5
## NOC*    11.0
## Gold     0.5
## Silver   1.5
## Bronze   3.5
## Total    1.5
## Region*  0.0
# look at total medals by Geographic Region
describeBy(data$Total, group=data$Region)
## group: ASIA
##   vars n mean   sd median trimmed mad min max range skew kurtosis   se
## 1    1 3 8.33 0.58      8    8.33   0   8   9     1 0.38    -2.33 0.33
## -------------------------------------------------------- 
## group: AUSTRALIA
##   vars n mean sd median trimmed mad min max range skew kurtosis se
## 1    1 1    3 NA      3       3   0   3   3     0   NA       NA NA
## -------------------------------------------------------- 
## group: EURASIA
##   vars n mean    sd median trimmed  mad min max range skew kurtosis   se
## 1    1 5  9.2 13.44      4     9.2 2.97   1  33    32 1.02    -0.99 6.01
## -------------------------------------------------------- 
## group: EUROPE
##   vars  n mean   sd median trimmed   mad min max range skew kurtosis   se
## 1    1 15 11.2 7.85      8   10.85 10.38   1  26    25 0.43    -1.12 2.03
## -------------------------------------------------------- 
## group: NORTH_A
##   vars n mean   sd median trimmed  mad min max range skew kurtosis  se
## 1    1 2 26.5 2.12   26.5    26.5 2.22  25  28     3    0    -2.75 1.5
# Which region won the highest mean total medals?
# How many countries are in this Geographic Region?

north_america <- subset(data, Region == "NORTH_A")
print(north_america)
##   Rank                  NOC Gold Silver Bronze Total  Region
## 3    3         Canada (CAN)   10     10      5    25 NORTH_A
## 4    4  United States (USA)    9      7     12    28 NORTH_A
# How many countries are in the EUROPE group?
europe <- subset(data, Region == "EUROPE")
print(europe)
##    Rank                   NOC Gold Silver Bronze Total Region
## 2     2          Norway (NOR)   11      5     10    26 EUROPE
## 5     5     Netherlands (NED)    8      7      9    24 EUROPE
## 6     6         Germany (GER)    8      6      5    19 EUROPE
## 7     7     Switzerland (SUI)    6      3      2    11 EUROPE
## 9     9         Austria (AUT)    4      8      5    17 EUROPE
## 10   10          France (FRA)    4      4      7    15 EUROPE
## 11   11          Poland (POL)    4      1      1     6 EUROPE
## 14   14          Sweden (SWE)    2      7      6    15 EUROPE
## 15   15  Czech Republic (CZE)    2      4      2     8 EUROPE
## 16   16        Slovenia (SLO)    2      2      4     8 EUROPE
## 18   18         Finland (FIN)    1      3      1     5 EUROPE
## 19   19   Great Britain (GBR)    1      1      2     4 EUROPE
## 21   21        Slovakia (SVK)    1      0      0     1 EUROPE
## 22   22           Italy (ITA)    0      2      6     8 EUROPE
## 25   25         Croatia (CRO)    0      1      0     1 EUROPE
dim(europe)
## [1] 15  7
# What is the max number of medals won?  What country won the max?
max(data$Total)
## [1] 33
subset(data, data$Total == 33)
##   Rank            NOC Gold Silver Bronze Total  Region
## 1    1  Russia (RUS)*   13     11      9    33 EURASIA
table(data$Total, data$Region)
##     
##      ASIA AUSTRALIA EURASIA EUROPE NORTH_A
##   1     0         0       1      2       0
##   2     0         0       1      0       0
##   3     0         1       0      0       0
##   4     0         0       1      1       0
##   5     0         0       0      1       0
##   6     0         0       1      1       0
##   8     2         0       0      3       0
##   9     1         0       0      0       0
##   11    0         0       0      1       0
##   15    0         0       0      2       0
##   17    0         0       0      1       0
##   19    0         0       0      1       0
##   24    0         0       0      1       0
##   25    0         0       0      0       1
##   26    0         0       0      1       0
##   28    0         0       0      0       1
##   33    0         0       1      0       0
# explore correlations between Total medals and number of Gold and Bronze
cor(data$Total,data$Gold)
## [1] 0.9186698
cor(data$Total,data$Bronze)
## [1] 0.8989262
cor(data$Gold,data$Bronze)
## [1] 0.7247493
# What is the correlation between Rank and Total medals?  Is this expected or surprising?
cor(data$Rank, data$Total)
## [1] -0.874864