# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      2
# Topic:        R - data structures, variables and data types
# Last updated: 4/06/2015
# Show the working directory the session starts in.
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Switch to the folder that holds the exercise files.
# NOTE: this path is machine-specific; change it to wherever the CSV lives.
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")

# Import the medal table into the data frame `data`.
# stringsAsFactors = TRUE is spelled out on purpose: since R 4.0.0 the default
# is FALSE, and the rest of this exercise (levels(), the factor printouts, the
# cbind() demonstration) expects the text columns NOC and Region to be factors.
data <- read.csv(
  "winter_olympic.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)

# open and look at data
#View(data)

# how many variables are in the data frame? (number of columns)
# ncol() states the intent directly; length() of a data frame gives the same
# number only because a data frame is a list of columns
print(ncol(data))
## [1] 7
# what are the names of the columns?
names(data)
## [1] "Rank"   "NOC"    "Gold"   "Silver" "Bronze" "Total"  "Region"
# how many countries (rows) are in the data frame?
# nrow() answers the question with a single number
nrow(data)
## [1] 26
# dim() reports both dimensions at once: rows first, then columns
print(dim(data))
## [1] 26  7
# print the first row of data (row index before the comma, all columns)
data[1, ]
##   Rank            NOC Gold Silver Bronze Total  Region
## 1    1  Russia (RUS)*   13     11      9    33 EURASIA
# print the first column of data (column index after the comma, all rows)
data[, 1]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
# print the first 5 rows of data
data[1:5, ]
##   Rank                  NOC Gold Silver Bronze Total  Region
## 1    1        Russia (RUS)*   13     11      9    33 EURASIA
## 2    2         Norway (NOR)   11      5     10    26  EUROPE
## 3    3         Canada (CAN)   10     10      5    25 NORTH_A
## 4    4  United States (USA)    9      7     12    28 NORTH_A
## 5    5    Netherlands (NED)    8      7      9    24  EUROPE
# extract the NOC column from the data frame and keep it as the vector
# "country_medals" (it holds the country names, one per row)
country_medals <- data[["NOC"]]
country_medals
##  [1]  Russia (RUS)*         Norway (NOR)          Canada (CAN)
##  [4]  United States (USA)   Netherlands (NED)     Germany (GER)
##  [7]  Switzerland (SUI)     Belarus (BLR)         Austria (AUT)
## [10]  France (FRA)          Poland (POL)          China (CHN)
## [13]  South Korea (KOR)     Sweden (SWE)          Czech Republic (CZE)
## [16]  Slovenia (SLO)        Japan (JPN)           Finland (FIN)
## [19]  Great Britain (GBR)   Ukraine (UKR)         Slovakia (SVK)
## [22]  Italy (ITA)           Latvia (LAT)          Australia (AUS)
## [25]  Croatia (CRO)         Kazakhstan (KAZ)
## 26 Levels:  Australia (AUS)  Austria (AUT) ...  United States (USA)
# extract the Gold column from the data frame as the vector "gold"
gold <- data[["Gold"]]
gold
##  [1] 13 11 10  9  8  8  6  5  4  4  4  3  3  2  2  2  1  1  1  1  1  0  0
## [24]  0  0  0
# one entry per country
length(gold)
## [1] 26
# which class (variable type) does "gold" have?
class(gold)
## [1] "integer"
# create a new data frame that holds data from the region Asia
# subset() keeps only the rows for which the condition is TRUE
asia <- subset(data, Region == "ASIA")
asia
##    Rank                NOC Gold Silver Bronze Total Region
## 12   12        China (CHN)    3      4      2     9   ASIA
## 13   13  South Korea (KOR)    3      3      2     8   ASIA
## 17   17        Japan (JPN)    1      4      3     8   ASIA
# how many countries are in the "asia" data frame
# nrow() answers the question with a single number
nrow(asia)
## [1] 3
# dim() reports both dimensions at once: rows first, then columns
print(dim(asia))
## [1] 3 7
# goal: build the data frame "total_medals" with two columns (country, total)
#   step 1. create vector "country"
#   step 2. create vector "total_medal_ct"
#   step 3. use cbind() to combine the two vectors

# step 1: country names come from the NOC column
country <- data[["NOC"]]
country
##  [1]  Russia (RUS)*         Norway (NOR)          Canada (CAN)
##  [4]  United States (USA)   Netherlands (NED)     Germany (GER)
##  [7]  Switzerland (SUI)     Belarus (BLR)         Austria (AUT)
## [10]  France (FRA)          Poland (POL)          China (CHN)
## [13]  South Korea (KOR)     Sweden (SWE)          Czech Republic (CZE)
## [16]  Slovenia (SLO)        Japan (JPN)           Finland (FIN)
## [19]  Great Britain (GBR)   Ukraine (UKR)         Slovakia (SVK)
## [22]  Italy (ITA)           Latvia (LAT)          Australia (AUS)
## [25]  Croatia (CRO)         Kazakhstan (KAZ)
## 26 Levels:  Australia (AUS)  Austria (AUT) ...  United States (USA)
length(country)
## [1] 26

# step 2: medal totals come from the Total column
total_medal_ct <- data[["Total"]]
total_medal_ct
##  [1] 33 26 25 28 24 19 11  6 17 15  6  9  8 15  8  8  8  5  4  2  1  8  4
## [24]  3  1  1
# both vectors must have the same length before they are combined
length(total_medal_ct)
## [1] 26
# step 3: bind the two vectors side by side as columns
# (the column names are given explicitly; they match the vector names)
total_medals <- cbind(
  country = country,
  total_medal_ct = total_medal_ct
)
# look closely at the "country" column in the printout below
total_medals
##       country total_medal_ct
##  [1,]      19             33
##  [2,]      17             26
##  [3,]       4             25
##  [4,]      26             28
##  [5,]      16             24
##  [6,]      10             19
##  [7,]      24             11
##  [8,]       3              6
##  [9,]       2             17
## [10,]       9             15
## [11,]      18              6
## [12,]       5              9
## [13,]      22              8
## [14,]      23             15
## [15,]       7              8
## [16,]      21              8
## [17,]      13              8
## [18,]       8              5
## [19,]      11              4
## [20,]      25              2
## [21,]      20              1
## [22,]      12              8
## [23,]      15              4
## [24,]       1              3
## [25,]       6              1
## [26,]      14              1
# cbind() on plain vectors produces a matrix, not a data frame
class(total_medals)
## [1] "matrix"
# (R >= 4.0.0 prints "matrix" "array" here)
# what's the issue? in the matrix printed above, the country column shows
# integer codes instead of country names.
# try data.frame() to combine the two vectors: each column keeps its own type
total_medals <- data.frame(
  country = country,
  total_medal_ct = total_medal_ct
)
total_medals
##                  country total_medal_ct
## 1          Russia (RUS)*             33
## 2           Norway (NOR)             26
## 3           Canada (CAN)             25
## 4    United States (USA)             28
## 5      Netherlands (NED)             24
## 6          Germany (GER)             19
## 7      Switzerland (SUI)             11
## 8          Belarus (BLR)              6
## 9          Austria (AUT)             17
## 10          France (FRA)             15
## 11          Poland (POL)              6
## 12           China (CHN)              9
## 13     South Korea (KOR)              8
## 14          Sweden (SWE)             15
## 15  Czech Republic (CZE)              8
## 16        Slovenia (SLO)              8
## 17           Japan (JPN)              8
## 18         Finland (FIN)              5
## 19   Great Britain (GBR)              4
## 20         Ukraine (UKR)              2
## 21        Slovakia (SVK)              1
## 22           Italy (ITA)              8
## 23          Latvia (LAT)              4
## 24       Australia (AUS)              3
## 25         Croatia (CRO)              1
## 26      Kazakhstan (KAZ)              1
# confirm that the result is now a data frame
class(total_medals)
## [1] "data.frame"
# what are the different levels of Region?
# levels() lists the distinct categories of a factor column
# (it returns NULL when the column is not a factor)
levels(data[["Region"]])
## [1] "ASIA"      "AUSTRALIA" "EURASIA"   "EUROPE"    "NORTH_A"
# Hint:  str() shows the type of every column, so it tells you which
# variables are factor variables
str(data)
## 'data.frame':    26 obs. of  7 variables:
##  $ Rank  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ NOC   : Factor w/ 26 levels " Australia (AUS)",..: 19 17 4 26 16 10 24 3 2 9 ...
##  $ Gold  : int  13 11 10 9 8 8 6 5 4 4 ...
##  $ Silver: int  11 5 10 7 7 6 3 0 8 4 ...
##  $ Bronze: int  9 10 5 12 9 5 2 1 5 7 ...
##  $ Total : int  33 26 25 28 24 19 11 6 17 15 ...
##  $ Region: Factor w/ 5 levels "ASIA","AUSTRALIA",..: 3 4 5 5 4 4 4 3 4 4 ...
# build "no_gold": the rows of `data` for countries whose Gold count is zero
no_gold <- subset(
  data,
  subset = Gold == 0
)
no_gold
##    Rank               NOC Gold Silver Bronze Total    Region
## 22   22       Italy (ITA)    0      2      6     8    EUROPE
## 23   23      Latvia (LAT)    0      2      2     4   EURASIA
## 24   24   Australia (AUS)    0      2      1     3 AUSTRALIA
## 25   25     Croatia (CRO)    0      1      0     1    EUROPE
## 26   26  Kazakhstan (KAZ)    0      0      1     1   EURASIA