# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 2
# Topic: R - data structures, variables and data types
# Last updated: 4/06/2015
# Show the directory R is currently working in.
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Point R at the exercise folder so the relative file name below resolves.
# NOTE: this path is machine-specific; adjust it to where your copy lives.
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")
# Import the medal table.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns
# as character by default. The factor-based steps in this exercise
# (levels(), the "Factor" lines printed by str(), the integer codes produced
# by cbind()) only reproduce the recorded output when NOC and Region are
# factors.
data <- read.csv(
  "winter_olympic.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Open the data in the spreadsheet-style viewer (uncomment to use).
#View(data)
# Number of variables = number of columns in the data frame.
print(ncol(data))
## [1] 7
# Column (variable) names.
colnames(data)
## [1] "Rank" "NOC" "Gold" "Silver" "Bronze" "Total" "Region"
# dim() returns c(rows, columns); the first value is the number of countries.
dim(data)
## [1] 26 7
print(dim(data))
## [1] 26 7
# First row: give the row index and leave the column index empty.
data[1L, ]
## Rank NOC Gold Silver Bronze Total Region
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
# First column: leave the row index empty and give the column index.
data[, 1L]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
# Rows 1 through 5, all columns.
data[seq_len(5L), ]
## Rank NOC Gold Silver Bronze Total Region
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
## 2 2 Norway (NOR) 11 5 10 26 EUROPE
## 3 3 Canada (CAN) 10 10 5 25 NORTH_A
## 4 4 United States (USA) 9 7 12 28 NORTH_A
## 5 5 Netherlands (NED) 8 7 9 24 EUROPE
# Pull the NOC column out of the data frame as the vector "country_medals".
country_medals <- data[["NOC"]]
country_medals
## [1] Russia (RUS)* Norway (NOR) Canada (CAN)
## [4] United States (USA) Netherlands (NED) Germany (GER)
## [7] Switzerland (SUI) Belarus (BLR) Austria (AUT)
## [10] France (FRA) Poland (POL) China (CHN)
## [13] South Korea (KOR) Sweden (SWE) Czech Republic (CZE)
## [16] Slovenia (SLO) Japan (JPN) Finland (FIN)
## [19] Great Britain (GBR) Ukraine (UKR) Slovakia (SVK)
## [22] Italy (ITA) Latvia (LAT) Australia (AUS)
## [25] Croatia (CRO) Kazakhstan (KAZ)
## 26 Levels: Australia (AUS) Austria (AUT) ... United States (USA)
# Pull the Gold column out of the data frame as the vector "gold".
gold <- data[["Gold"]]
gold
## [1] 13 11 10 9 8 8 6 5 4 4 4 3 3 2 2 2 1 1 1 1 1 0 0
## [24] 0 0 0
# One entry per country.
length(gold)
## [1] 26
# Which data type does "gold" have?
class(gold)
## [1] "integer"
# Build a new data frame with only the rows whose Region is "ASIA".
# which() turns the comparison into row positions and skips any NA result,
# so the selection matches what subset() would return.
asia <- data[which(data$Region == "ASIA"), ]
asia
## Rank NOC Gold Silver Bronze Total Region
## 12 12 China (CHN) 3 4 2 9 ASIA
## 13 13 South Korea (KOR) 3 3 2 8 ASIA
## 17 17 Japan (JPN) 1 4 3 8 ASIA
# dim() returns c(rows, columns); the first value is the number of countries
# in "asia".
dim(asia)
## [1] 3 7
print(dim(asia))
## [1] 3 7
# Goal: build "total_medals" with two columns, country and total.
# Steps:
#   1. extract the vector "country"
#   2. extract the vector "total_medal_ct"
#   3. glue the two vectors together column-wise with cbind()
country <- data[["NOC"]]
country
## [1] Russia (RUS)* Norway (NOR) Canada (CAN)
## [4] United States (USA) Netherlands (NED) Germany (GER)
## [7] Switzerland (SUI) Belarus (BLR) Austria (AUT)
## [10] France (FRA) Poland (POL) China (CHN)
## [13] South Korea (KOR) Sweden (SWE) Czech Republic (CZE)
## [16] Slovenia (SLO) Japan (JPN) Finland (FIN)
## [19] Great Britain (GBR) Ukraine (UKR) Slovakia (SVK)
## [22] Italy (ITA) Latvia (LAT) Australia (AUS)
## [25] Croatia (CRO) Kazakhstan (KAZ)
## 26 Levels: Australia (AUS) Austria (AUT) ... United States (USA)
length(country)
## [1] 26
total_medal_ct <- data[["Total"]]
total_medal_ct
## [1] 33 26 25 28 24 19 11 6 17 15 6 9 8 15 8 8 8 5 4 2 1 8 4
## [24] 3 1 1
length(total_medal_ct)
## [1] 26
# Column names are given explicitly; they match the vector names.
total_medals <- cbind(country = country, total_medal_ct = total_medal_ct)
total_medals
## country total_medal_ct
## [1,] 19 33
## [2,] 17 26
## [3,] 4 25
## [4,] 26 28
## [5,] 16 24
## [6,] 10 19
## [7,] 24 11
## [8,] 3 6
## [9,] 2 17
## [10,] 9 15
## [11,] 18 6
## [12,] 5 9
## [13,] 22 8
## [14,] 23 15
## [15,] 7 8
## [16,] 21 8
## [17,] 13 8
## [18,] 8 5
## [19,] 11 4
## [20,] 25 2
## [21,] 20 1
## [22,] 12 8
## [23,] 15 4
## [24,] 1 3
## [25,] 6 1
## [26,] 14 1
# Check what kind of object cbind() produced.
# (The output below was recorded on an older R; R >= 4.0 prints
# "matrix" "array" because matrices also inherit from class "array".)
class(total_medals)
## [1] "matrix"
# What went wrong? The cbind() result is a matrix, and the country names
# were replaced by numbers (the codes of the factor, as the output shows).
# Combine the two vectors with data.frame() instead, so each column keeps
# its own type.
total_medals <- data.frame(
  country = country,
  total_medal_ct = total_medal_ct
)
total_medals
## country total_medal_ct
## 1 Russia (RUS)* 33
## 2 Norway (NOR) 26
## 3 Canada (CAN) 25
## 4 United States (USA) 28
## 5 Netherlands (NED) 24
## 6 Germany (GER) 19
## 7 Switzerland (SUI) 11
## 8 Belarus (BLR) 6
## 9 Austria (AUT) 17
## 10 France (FRA) 15
## 11 Poland (POL) 6
## 12 China (CHN) 9
## 13 South Korea (KOR) 8
## 14 Sweden (SWE) 15
## 15 Czech Republic (CZE) 8
## 16 Slovenia (SLO) 8
## 17 Japan (JPN) 8
## 18 Finland (FIN) 5
## 19 Great Britain (GBR) 4
## 20 Ukraine (UKR) 2
## 21 Slovakia (SVK) 1
## 22 Italy (ITA) 8
## 23 Latvia (LAT) 4
## 24 Australia (AUS) 3
## 25 Croatia (CRO) 1
## 26 Kazakhstan (KAZ) 1
# Confirm the result is now a data frame.
class(total_medals)
## [1] "data.frame"
# List the distinct categories (levels) of the Region column.
levels(data[["Region"]])
## [1] "ASIA" "AUSTRALIA" "EURASIA" "EUROPE" "NORTH_A"
# Hint: str() prints the type of every column, which makes the factor
# variables easy to spot.
str(data)
## 'data.frame': 26 obs. of 7 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ NOC : Factor w/ 26 levels " Australia (AUS)",..: 19 17 4 26 16 10 24 3 2 9 ...
## $ Gold : int 13 11 10 9 8 8 6 5 4 4 ...
## $ Silver: int 11 5 10 7 7 6 3 0 8 4 ...
## $ Bronze: int 9 10 5 12 9 5 2 1 5 7 ...
## $ Total : int 33 26 25 28 24 19 11 6 17 15 ...
## $ Region: Factor w/ 5 levels "ASIA","AUSTRALIA",..: 3 4 5 5 4 4 4 3 4 4 ...
# Build a data frame of the countries that won no gold medal.
# which() turns the comparison into row positions and skips any NA result,
# so the selection matches what subset() would return.
no_gold <- data[which(data$Gold == 0), ]
no_gold
## Rank NOC Gold Silver Bronze Total Region
## 22 22 Italy (ITA) 0 2 6 8 EUROPE
## 23 23 Latvia (LAT) 0 2 2 4 EURASIA
## 24 24 Australia (AUS) 0 2 1 3 AUSTRALIA
## 25 25 Croatia (CRO) 0 1 0 1 EUROPE
## 26 26 Kazakhstan (KAZ) 0 0 1 1 EURASIA