# Work from the Lesson 5 assignment folder so the CSV can be opened by its
# bare file name.
# NOTE(review): this is a machine-specific path; the script will stop here on
# any computer where the folder does not exist.
setwd(dir = "~/NYU/classes/2. R/Assignments/Lesson 5")
library("readr")
# Read the Olympic medal table into `medals`.
medals <- read_csv(file = "summer_winter_olympics.csv")
## New names:
## * `` -> ...1
## Rows: 146 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Team..IOC.code.
## dbl (16): ...1, X..Summer, X, X.1, X.2, Total, X..Winter, X.3, X.4, X.5, Tot...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#1b View the data
# Open the table in the data viewer, then report the class vector of the
# object that read_csv() returned.
View(medals)
class(medals)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
#convert to data frame
# Rebuild the table as a plain base data frame and confirm the new class.
medals <- data.frame(medals)
class(medals)
## [1] "data.frame"
#1c look at the column names
# These are the headers as imported, before any renaming.
colnames(medals)
## [1] "...1" "Team..IOC.code." "X..Summer" "X"
## [5] "X.1" "X.2" "Total" "X..Winter"
## [9] "X.3" "X.4" "X.5" "Total.1"
## [13] "X..Games" "X.6" "X.7" "X.8"
## [17] "Combined.total"
#1d look at the dimensions of data
# dim() gives rows then columns.
dim(medals)
## [1] 146 17
# Column count on its own (for a data frame this equals its length).
print(ncol(medals))
## [1] 17
#2 Dealing with data
#2 change the names to more meaningful names, in order
# Prefixes: s_ = summer, w_ = winter, sw_ = summer and winter combined.
# NOTE(review): the first column gets the literal string name "NA"; it looks
# like the unnamed index column of the CSV (imported as `...1`) - confirm, and
# consider a clearer name, since a bare NA can never refer to this column.
names(medals) <- c(
  "NA", "country",
  "s_games", "s_gold", "s_silver", "s_bronze", "s_total",
  "w_games", "w_gold", "w_silver", "w_bronze", "w_total",
  "sw_games", "sw_gold", "sw_silver", "sw_bronze", "sw_total"
)
# Print the headers again to check the renaming.
names(medals)
## [1] "NA" "country" "s_games" "s_gold" "s_silver" "s_bronze"
## [7] "s_total" "w_games" "w_gold" "w_silver" "w_bronze" "w_total"
## [13] "sw_games" "sw_gold" "sw_silver" "sw_bronze" "sw_total"
#3 Summary
#3a - frequency with tables
# Put the columns of medals on the search path so that later steps can refer
# to them by bare name.
attach(medals)
# Frequency table: how many rows have each s_games value.
with(medals, table(s_games))
## s_games
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 3 2 6 1 17 3 1 7 8 2 7 10 13 5 8 11 4 2 3 5 4 5 3 2 5 5
## 27
## 4
#3b - explore the data with other variables
# medals was already attached earlier in the script (step 3a). Attaching it a
# second time only stacks a duplicate copy on the search path, which masks the
# first copy and triggers R's "objects are masked" message, so the repeated
# attach() is dropped. The columns are looked up in the data frame itself via
# with(), which works whether or not medals is attached.
with(medals, table(w_games))
## w_games
## 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 17 18 19 20 22
## 45 11 6 6 4 3 19 7 4 3 4 2 2 1 1 4 2 4 2 4 12
# Same frequency count for the combined summer + winter games column.
with(medals, table(sw_games))
## sw_games
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 2 2 3 2 2 3 1 5 5 4 17 7 8 8 5 5 1 5 8 3 3 1 2 4 2 2
## 28 30 32 33 34 36 37 38 39 40 41 42 43 45 46 47 48 49
## 1 4 3 1 1 1 2 2 1 1 3 1 1 3 2 2 4 3
#4 Graphs
#4a,b - histogram of summer & winter games (total)
# Each call draws the histogram and also keeps the object hist() returns.
# Summer games per country.
hist_s <- with(medals, hist(
  as.numeric(s_games),
  breaks = 10,
  main = "Frequencies of the summer games played by country",
  xlab = "Summer Games",
  col = "Orange",
  labels = TRUE,
  border = "#FFFFFF"
))
# Winter games per country.
hist_w <- with(medals, hist(
  as.numeric(w_games),
  breaks = 10,
  main = "Frequencies of the winter games played by country",
  xlab = "Winter Games",
  col = "blue",
  labels = TRUE,
  border = "#FFFFFF"
))
#4c - put the two histograms on one page
# One row, two columns of plots.
par(mfrow = c(1, 2))
# Left panel: summer games (result not stored).
with(medals, hist(
  as.numeric(s_games),
  breaks = 10,
  main = "summer games ",
  xlab = "Summer Games",
  col = "Orange",
  labels = TRUE,
  border = "#FFFFFF"
))
# Right panel: winter games; the returned object replaces hist_w.
hist_w <- with(medals, hist(
  as.numeric(w_games),
  breaks = 10,
  main = "winter games",
  xlab = "Winter Games",
  col = "blue",
  labels = TRUE,
  border = "#FFFFFF"
))
#4d two histograms on one page: total summer, total winter medals won
# One row, two columns of plots.
par(mfrow = c(1, 2))
# These panels plot medal totals (s_total / w_total), so the titles and axis
# labels say "medals"; previously they were labelled as "games".
# Columns are read from medals directly so the step does not depend on attach().
hist(
  as.numeric(medals$s_total),
  breaks = 10,
  main = "Summer medals",
  xlab = "Total medals",
  col = "Orange",
  labels = TRUE,
  border = "#FFFFFF"
)
# The result is still stored in hist_w, as before, so anything that reads
# hist_w after this step keeps seeing the winter medal-total histogram.
hist_w <- hist(
  as.numeric(medals$w_total),
  breaks = 10,
  main = "Winter medals",
  xlab = "Total medals",
  col = "blue",
  labels = TRUE,
  border = "#FFFFFF"
)
#4e is there a correlation between number of medals given out in winter and summer? (do plot)
#to reset to one graph per page
par(mfrow = c(1, 1))
#the 4e plot
# x is the summer medal total and y is the winter medal total. The axis labels
# now match that order and say "Medals"; previously they were swapped and
# labelled as "Games".
plot(
  medals$s_total, medals$w_total,
  type = "p",
  frame.plot = FALSE,
  xlab = "Total Summer Medals",
  ylab = "Total Winter Medals",
  col = "Blue",
  main = "Relationship btwn total # of medals awarded"
)
#4f : how about number of games each country competes in. Is there correlation between winter and summer?
# x is summer games and y is winter games. The axis labels now match that
# order (they were swapped), and the "witner" typo in the title is fixed.
plot(
  medals$s_games, medals$w_games,
  type = "p",
  frame.plot = FALSE,
  xlab = "Summer Games",
  ylab = "Winter Games",
  col = "Purple",
  main = "Is there a correlation between summer vs. winter olympics?"
)
#4g look at distribution of each of the types of medals, by season (6 histograms on one page)
# Two rows of three panels: summer on top, winter below.
par(mfrow = c(2, 3))
# with() looks the columns up in medals itself, so this step does not depend
# on attach(), and the default titles still read "Histogram of s_gold" etc.
with(medals, {
  hist(s_gold, breaks = 20, xlab = "Summer Gold", col = "Orange", border = "#FFFFFF")
  hist(s_silver, breaks = 20, xlab = "Summer Silver", col = "Orange", border = "#FFFFFF")
  # Fixed label: this panel shows bronze medals but was labelled "Summer Silver".
  hist(s_bronze, breaks = 20, xlab = "Summer Bronze", col = "Orange", border = "#FFFFFF")
  hist(w_gold, breaks = 20, xlab = "Winter Gold", col = "Blue", border = "#FFFFFF")
  hist(w_silver, breaks = 20, xlab = "Winter Silver", col = "Blue", border = "#FFFFFF")
  hist(w_bronze, breaks = 20, xlab = "Winter Bronze", col = "Blue", border = "#FFFFFF")
})
#4h - redo g with different number of bins (10 instead of 20)
# Two rows of three panels: summer on top, winter below.
par(mfrow = c(2, 3))
# with() looks the columns up in medals itself, so this step does not depend
# on attach(), and the default titles still read "Histogram of s_gold" etc.
with(medals, {
  hist(s_gold, breaks = 10, xlab = "Summer Gold", col = "Orange", border = "#FFFFFF")
  hist(s_silver, breaks = 10, xlab = "Summer Silver", col = "Orange", border = "#FFFFFF")
  # Fixed label: this panel shows bronze medals but was labelled "Summer Silver".
  hist(s_bronze, breaks = 10, xlab = "Summer Bronze", col = "Orange", border = "#FFFFFF")
  hist(w_gold, breaks = 10, xlab = "Winter Gold", col = "Blue", border = "#FFFFFF")
  hist(w_silver, breaks = 10, xlab = "Winter Silver", col = "Blue", border = "#FFFFFF")
  hist(w_bronze, breaks = 10, xlab = "Winter Bronze", col = "Blue", border = "#FFFFFF")
})