# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 4 (online meeting 2)
# Topic: Descriptive Statistics
# Last updated: 4/3/2016
# get current working directory
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# set working directory
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")
# import data
data <- read.csv("winter_olympic.csv", sep=",", header=TRUE)
# open and look at data
#View(data)
# how many variables are in the data frame? (number of columns)
print(length(data))
## [1] 7
# what are the names of the columns?
names(data)
## [1] "Rank" "NOC" "Gold" "Silver" "Bronze" "Total" "Region"
# how many countries (rows) are in the data frame?
dim(data)
## [1] 26 7
#print(dim(data))
# The data frame is currently sorted by rank
# Sort it now by the total medals won and country
# attach data frame
attach(data)
# sort by total medals and country
sort_total <- data[order(Total, NOC),]
sort_total
## Rank NOC Gold Silver Bronze Total Region
## 25 25 Croatia (CRO) 0 1 0 1 EUROPE
## 26 26 Kazakhstan (KAZ) 0 0 1 1 EURASIA
## 21 21 Slovakia (SVK) 1 0 0 1 EUROPE
## 20 20 Ukraine (UKR) 1 0 1 2 EURASIA
## 24 24 Australia (AUS) 0 2 1 3 AUSTRALIA
## 19 19 Great Britain (GBR) 1 1 2 4 EUROPE
## 23 23 Latvia (LAT) 0 2 2 4 EURASIA
## 18 18 Finland (FIN) 1 3 1 5 EUROPE
## 8 8 Belarus (BLR) 5 0 1 6 EURASIA
## 11 11 Poland (POL) 4 1 1 6 EUROPE
## 15 15 Czech Republic (CZE) 2 4 2 8 EUROPE
## 22 22 Italy (ITA) 0 2 6 8 EUROPE
## 17 17 Japan (JPN) 1 4 3 8 ASIA
## 16 16 Slovenia (SLO) 2 2 4 8 EUROPE
## 13 13 South Korea (KOR) 3 3 2 8 ASIA
## 12 12 China (CHN) 3 4 2 9 ASIA
## 7 7 Switzerland (SUI) 6 3 2 11 EUROPE
## 10 10 France (FRA) 4 4 7 15 EUROPE
## 14 14 Sweden (SWE) 2 7 6 15 EUROPE
## 9 9 Austria (AUT) 4 8 5 17 EUROPE
## 6 6 Germany (GER) 8 6 5 19 EUROPE
## 5 5 Netherlands (NED) 8 7 9 24 EUROPE
## 3 3 Canada (CAN) 10 10 5 25 NORTH_A
## 2 2 Norway (NOR) 11 5 10 26 EUROPE
## 4 4 United States (USA) 9 7 12 28 NORTH_A
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
# get summary statistics on data
#describe(data)
# we might get error:
# Error: could not find function "describe"
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# we get another error
# > library(Hmisc)
# Error in library(Hmisc) : there is no package called ‘Hmisc’
# Go to Packages
# Install: Hmisc
# Let's try again
library(Hmisc)
# Now, let's try the describe() function again
describe(data)
## data
##
## 7 Variables 26 Observations
## ---------------------------------------------------------------------------
## Rank
## n missing unique Info Mean .05 .10 .25 .50
## 26 0 26 1 13.5 2.25 3.50 7.25 13.50
## .75 .90 .95
## 19.75 23.50 24.75
##
## lowest : 1 2 3 4 5, highest: 22 23 24 25 26
## ---------------------------------------------------------------------------
## NOC
## n missing unique
## 26 0 26
##
## lowest : Australia (AUS) Austria (AUT) Belarus (BLR) Canada (CAN) China (CHN)
## highest: South Korea (KOR) Sweden (SWE) Switzerland (SUI) Ukraine (UKR) United States (USA)
## ---------------------------------------------------------------------------
## Gold
## n missing unique Info Mean .05 .10 .25 .50
## 26 0 12 0.98 3.808 0.00 0.00 1.00 2.50
## .75 .90 .95
## 5.75 9.50 10.75
##
## 0 1 2 3 4 5 6 8 9 10 11 13
## Frequency 5 5 3 2 3 1 1 2 1 1 1 1
## % 19 19 12 8 12 4 4 8 4 4 4 4
## ---------------------------------------------------------------------------
## Silver
## n missing unique Info Mean .05 .10 .25 .50
## 26 0 11 0.99 3.731 0.00 0.00 1.25 3.00
## .75 .90 .95
## 5.75 7.50 9.50
##
## 0 1 2 3 4 5 6 7 8 10 11
## Frequency 4 3 4 3 4 1 1 3 1 1 1
## % 15 12 15 12 15 4 4 12 4 4 4
## ---------------------------------------------------------------------------
## Bronze
## n missing unique Info Mean .05 .10 .25 .50
## 26 0 11 0.97 3.808 0.25 1.00 1.00 2.00
## .75 .90 .95
## 5.75 9.00 9.75
##
## 0 1 2 3 4 5 6 7 9 10 12
## Frequency 2 6 6 1 1 3 2 1 2 1 1
## % 8 23 23 4 4 12 8 4 8 4 4
## ---------------------------------------------------------------------------
## Total
## n missing unique Info Mean .05 .10 .25 .50
## 26 0 17 0.99 11.35 1.00 1.50 4.25 8.00
## .75 .90 .95
## 16.50 25.50 27.50
##
## 1 2 3 4 5 6 8 9 11 15 17 19 24 25 26 28 33
## Frequency 3 1 1 2 1 2 5 1 1 2 1 1 1 1 1 1 1
## % 12 4 4 8 4 8 19 4 4 8 4 4 4 4 4 4 4
## ---------------------------------------------------------------------------
## Region
## n missing unique
## 26 0 5
##
## ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## Frequency 3 1 5 15 2
## % 12 4 19 58 8
## ---------------------------------------------------------------------------
# What is median of number of gold, silver, bronze and total medals?
median(data$Gold)
## [1] 2.5
median(data$Silver)
## [1] 3
median(data$Bronze)
## [1] 2
median(data$Total)
## [1] 8
"
# from past examples:
# print mean number of gold, silver and bronze medals
mean(data$Gold)
mean(data$Silver)
mean(data$Bronze)
# How many of each of the 3 types of medals were awarded?
sum(data$Gold)
sum(data$Silver)
sum(data$Bronze)
# Print list of countries that did not win each of the 3 medals
# Print number of countries that did not win each of the 3 medals
# Hint: use subset
medal_subset = subset(x = data, subset = Gold == 0 | Silver == 0 | Bronze == 0)
medal_subset
length(medal_subset)
"
## [1] "\n# from past examples:\n# print mean number of gold, silver and bronze medals\n\nmean(data$Gold)\nmean(data$Silver)\nmean(data$Bronze)\n\n# How many of each of the 3 types of medals were awarded?\nsum(data$Gold)\nsum(data$Silver)\nsum(data$Bronze)\n\n# Print list of countries that did not win each of the 3 medals\n# Print number of countries that did not win each of the 3 medals\n# Hint: use subset\n\nmedal_subset = subset(x = data, subset = Gold == 0 | Silver == 0 | Bronze == 0)\nmedal_subset\nlength(medal_subset)\n"
# What is IQR of Gold medals won? How about min, max, mean
# What is variance and SD of number of Gold medals won?
IQR(data$Gold)
## [1] 4.75
min(data$Gold)
## [1] 0
max(data$Gold)
## [1] 13
mean(data$Gold)
## [1] 3.807692
var(data$Gold)
## [1] 14.64154
sd(data$Gold)
## [1] 3.826426
# Look at skew of number of Gold medals
#skew(data$Gold)
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# library not found, so will install it under Packages
library(psych)
skew(data$Gold)
## [1] 0.87898
# look at summary of data
summary(data)
## Rank NOC Gold Silver
## Min. : 1.00 Australia (AUS): 1 Min. : 0.000 Min. : 0.000
## 1st Qu.: 7.25 Austria (AUT) : 1 1st Qu.: 1.000 1st Qu.: 1.250
## Median :13.50 Belarus (BLR) : 1 Median : 2.500 Median : 3.000
## Mean :13.50 Canada (CAN) : 1 Mean : 3.808 Mean : 3.731
## 3rd Qu.:19.75 China (CHN) : 1 3rd Qu.: 5.750 3rd Qu.: 5.750
## Max. :26.00 Croatia (CRO) : 1 Max. :13.000 Max. :11.000
## (Other) :20
## Bronze Total Region
## Min. : 0.000 Min. : 1.00 ASIA : 3
## 1st Qu.: 1.000 1st Qu.: 4.25 AUSTRALIA: 1
## Median : 2.000 Median : 8.00 EURASIA : 5
## Mean : 3.808 Mean :11.35 EUROPE :15
## 3rd Qu.: 5.750 3rd Qu.:16.50 NORTH_A : 2
## Max. :12.000 Max. :33.00
##
describe(data)
## vars n mean sd median trimmed mad min max range skew
## Rank 1 26 13.50 7.65 13.5 13.50 9.64 1 26 25 0.00
## NOC* 2 26 13.50 7.65 13.5 13.50 9.64 1 26 25 0.00
## Gold 3 26 3.81 3.83 2.5 3.41 2.97 0 13 13 0.88
## Silver 4 26 3.73 3.11 3.0 3.45 2.97 0 11 11 0.69
## Bronze 5 26 3.81 3.35 2.0 3.50 2.22 0 12 12 0.87
## Total 6 26 11.35 9.32 8.0 10.55 8.15 1 33 32 0.81
## Region* 7 26 3.46 1.10 4.0 3.55 0.00 1 5 4 -1.11
## kurtosis se
## Rank -1.34 1.50
## NOC* -1.34 1.50
## Gold -0.48 0.75
## Silver -0.51 0.61
## Bronze -0.41 0.66
## Total -0.63 1.83
## Region* 0.30 0.22
# look at summary by Geographic Region
describeBy(data, group=data$Region)
## group: ASIA
## vars n mean sd median trimmed mad min max range skew
## Rank 1 3 14.00 2.65 13 14.00 1.48 12 17 5 0.32
## NOC* 2 3 13.33 8.50 13 13.33 11.86 5 22 17 0.04
## Gold 3 3 2.33 1.15 3 2.33 0.00 1 3 2 -0.38
## Silver 4 3 3.67 0.58 4 3.67 0.00 3 4 1 -0.38
## Bronze 5 3 2.33 0.58 2 2.33 0.00 2 3 1 0.38
## Total 6 3 8.33 0.58 8 8.33 0.00 8 9 1 0.38
## Region* 7 3 1.00 0.00 1 1.00 0.00 1 1 0 NaN
## kurtosis se
## Rank -2.33 1.53
## NOC* -2.33 4.91
## Gold -2.33 0.67
## Silver -2.33 0.33
## Bronze -2.33 0.33
## Total -2.33 0.33
## Region* NaN 0.00
## --------------------------------------------------------
## group: AUSTRALIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Rank 1 1 24 NA 24 24 0 24 24 0 NA NA NA
## NOC* 2 1 1 NA 1 1 0 1 1 0 NA NA NA
## Gold 3 1 0 NA 0 0 0 0 0 0 NA NA NA
## Silver 4 1 2 NA 2 2 0 2 2 0 NA NA NA
## Bronze 5 1 1 NA 1 1 0 1 1 0 NA NA NA
## Total 6 1 3 NA 3 3 0 3 3 0 NA NA NA
## Region* 7 1 2 NA 2 2 0 2 2 0 NA NA NA
## --------------------------------------------------------
## group: EURASIA
## vars n mean sd median trimmed mad min max range skew kurtosis
## Rank 1 5 15.6 10.64 20 15.6 8.90 1 26 25 -0.32 -2.01
## NOC* 2 5 15.2 8.07 15 15.2 5.93 3 25 22 -0.31 -1.51
## Gold 3 5 3.8 5.54 1 3.8 1.48 0 13 13 0.76 -1.38
## Silver 4 5 2.6 4.77 0 2.6 0.00 0 11 11 0.99 -1.03
## Bronze 5 5 2.8 3.49 1 2.8 0.00 1 9 8 1.03 -0.97
## Total 6 5 9.2 13.44 4 9.2 2.97 1 33 32 1.02 -0.99
## Region* 7 5 3.0 0.00 3 3.0 0.00 3 3 0 NaN NaN
## se
## Rank 4.76
## NOC* 3.61
## Gold 2.48
## Silver 2.14
## Bronze 1.56
## Total 6.01
## Region* 0.00
## --------------------------------------------------------
## group: EUROPE
## vars n mean sd median trimmed mad min max range skew
## Rank 1 15 13.33 6.87 14 13.31 7.41 2 25 23 0.04
## NOC* 2 15 13.60 6.76 12 13.69 7.41 2 24 22 0.03
## Gold 3 15 3.60 3.31 2 3.31 2.97 0 11 11 0.79
## Silver 4 15 3.60 2.53 3 3.54 2.97 0 8 8 0.30
## Bronze 5 15 4.00 3.18 4 3.85 2.97 0 10 10 0.37
## Total 6 15 11.20 7.85 8 10.85 10.38 1 26 25 0.43
## Region* 7 15 4.00 0.00 4 4.00 0.00 4 4 0 NaN
## kurtosis se
## Rank -1.34 1.77
## NOC* -1.41 1.75
## Gold -0.63 0.86
## Silver -1.38 0.65
## Bronze -1.21 0.82
## Total -1.12 2.03
## Region* NaN 0.00
## --------------------------------------------------------
## group: NORTH_A
## vars n mean sd median trimmed mad min max range skew kurtosis
## Rank 1 2 3.5 0.71 3.5 3.5 0.74 3 4 1 0 -2.75
## NOC* 2 2 15.0 15.56 15.0 15.0 16.31 4 26 22 0 -2.75
## Gold 3 2 9.5 0.71 9.5 9.5 0.74 9 10 1 0 -2.75
## Silver 4 2 8.5 2.12 8.5 8.5 2.22 7 10 3 0 -2.75
## Bronze 5 2 8.5 4.95 8.5 8.5 5.19 5 12 7 0 -2.75
## Total 6 2 26.5 2.12 26.5 26.5 2.22 25 28 3 0 -2.75
## Region* 7 2 5.0 0.00 5.0 5.0 0.00 5 5 0 NaN NaN
## se
## Rank 0.5
## NOC* 11.0
## Gold 0.5
## Silver 1.5
## Bronze 3.5
## Total 1.5
## Region* 0.0
# look at total medals by Geographic Region
describeBy(data$Total, group=data$Region)
## group: ASIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 3 8.33 0.58 8 8.33 0 8 9 1 0.38 -2.33 0.33
## --------------------------------------------------------
## group: AUSTRALIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 1 3 NA 3 3 0 3 3 0 NA NA NA
## --------------------------------------------------------
## group: EURASIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 5 9.2 13.44 4 9.2 2.97 1 33 32 1.02 -0.99 6.01
## --------------------------------------------------------
## group: EUROPE
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 15 11.2 7.85 8 10.85 10.38 1 26 25 0.43 -1.12 2.03
## --------------------------------------------------------
## group: NORTH_A
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 2 26.5 2.12 26.5 26.5 2.22 25 28 3 0 -2.75 1.5
# Which region won the highest mean total medals?
# How many countries are in this Geographic Region?
north_america <- subset(data, Region == "NORTH_A")
print(north_america)
## Rank NOC Gold Silver Bronze Total Region
## 3 3 Canada (CAN) 10 10 5 25 NORTH_A
## 4 4 United States (USA) 9 7 12 28 NORTH_A
# How many countries are in the EUROPE group?
europe <- subset(data, Region == "EUROPE")
print(europe)
## Rank NOC Gold Silver Bronze Total Region
## 2 2 Norway (NOR) 11 5 10 26 EUROPE
## 5 5 Netherlands (NED) 8 7 9 24 EUROPE
## 6 6 Germany (GER) 8 6 5 19 EUROPE
## 7 7 Switzerland (SUI) 6 3 2 11 EUROPE
## 9 9 Austria (AUT) 4 8 5 17 EUROPE
## 10 10 France (FRA) 4 4 7 15 EUROPE
## 11 11 Poland (POL) 4 1 1 6 EUROPE
## 14 14 Sweden (SWE) 2 7 6 15 EUROPE
## 15 15 Czech Republic (CZE) 2 4 2 8 EUROPE
## 16 16 Slovenia (SLO) 2 2 4 8 EUROPE
## 18 18 Finland (FIN) 1 3 1 5 EUROPE
## 19 19 Great Britain (GBR) 1 1 2 4 EUROPE
## 21 21 Slovakia (SVK) 1 0 0 1 EUROPE
## 22 22 Italy (ITA) 0 2 6 8 EUROPE
## 25 25 Croatia (CRO) 0 1 0 1 EUROPE
dim(europe)
## [1] 15 7
# What is the max number of medals won? What country won the max?
max(data$Total)
## [1] 33
subset(data, data$Total == 33)
## Rank NOC Gold Silver Bronze Total Region
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
table(data$Total, data$Region)
##
## ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## 1 0 0 1 2 0
## 2 0 0 1 0 0
## 3 0 1 0 0 0
## 4 0 0 1 1 0
## 5 0 0 0 1 0
## 6 0 0 1 1 0
## 8 2 0 0 3 0
## 9 1 0 0 0 0
## 11 0 0 0 1 0
## 15 0 0 0 2 0
## 17 0 0 0 1 0
## 19 0 0 0 1 0
## 24 0 0 0 1 0
## 25 0 0 0 0 1
## 26 0 0 0 1 0
## 28 0 0 0 0 1
## 33 0 0 1 0 0
# explore correlations between Total medals and number of Gold and Bronze
cor(data$Total,data$Gold)
## [1] 0.9186698
cor(data$Total,data$Bronze)
## [1] 0.8989262
cor(data$Gold,data$Bronze)
## [1] 0.7247493
# What is the correlation between Rank and Total medals? Is this expected or surprising?
cor(data$Rank, data$Total)
## [1] -0.874864