# Lesson 5: load the Olympic medal table and inspect its structure.
# NOTE(review): setwd() ties the script to one machine's folder layout. It is
# kept so the relative file name below resolves as before; a project-relative
# path would be more portable.
setwd("~/NYU/classes/2. R/Assignments/Lesson 5")
library(readr)
# read_csv() returns a tibble; the message below shows it renamed the blank
# first header to "...1".
medals <- read_csv("summer_winter_olympics.csv")
## New names:
## * `` -> ...1
## Rows: 146 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): Team..IOC.code.
## dbl (16): ...1, X..Summer, X, X.1, X.2, Total, X..Winter, X.3, X.4, X.5, Tot...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#1b View the data
# View() opens a spreadsheet-style viewer, so it is only useful (and only
# safe) when a person is driving the session.
if (interactive()) {
  View(medals)
}
class(medals)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
#convert to data frame
# Drops the tibble classes so `medals` is a plain data.frame from here on.
medals <- data.frame(medals)
class(medals)
## [1] "data.frame"
#1c look at the column names
names(medals)
##  [1] "...1"            "Team..IOC.code." "X..Summer"       "X"              
##  [5] "X.1"             "X.2"             "Total"           "X..Winter"      
##  [9] "X.3"             "X.4"             "X.5"             "Total.1"        
## [13] "X..Games"        "X.6"             "X.7"             "X.8"            
## [17] "Combined.total"
#1d  look at the dimensions of data
dim(medals)
## [1] 146  17
# length() of a data frame is its number of columns.
print(length(medals))
## [1] 17

#2 Dealing with data

#2 Replace the auto-generated headers with descriptive names, in column order.
# Prefixes: s_ = summer, w_ = winter, sw_ = combined summer + winter.
# The first column is named with the literal string "NA"; that is a character
# label here, not a missing value.
names(medals) <- c(
  "NA", "country",
  "s_games", "s_gold", "s_silver", "s_bronze", "s_total",
  "w_games", "w_gold", "w_silver", "w_bronze", "w_total",
  "sw_games", "sw_gold", "sw_silver", "sw_bronze", "sw_total"
)
names(medals)
##  [1] "NA"        "country"   "s_games"   "s_gold"    "s_silver"  "s_bronze" 
##  [7] "s_total"   "w_games"   "w_gold"    "w_silver"  "w_bronze"  "w_total"  
## [13] "sw_games"  "sw_gold"   "sw_silver" "sw_bronze" "sw_total"

#3 Summary

#3a - frequency with tables
# attach() puts the columns of `medals` on the search path so they can be
# referenced by bare name (the plotting sections of this script do so too).
# It is called exactly once: attaching the same data frame a second time adds
# a duplicate copy to the search path and emits "objects are masked" messages.
attach(medals)
# How many countries took part in each number of summer games.
table(s_games)
## s_games
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  3  2  6  1 17  3  1  7  8  2  7 10 13  5  8 11  4  2  3  5  4  5  3  2  5  5 
## 27 
##  4
#3b - explore the data with other variables
# Same frequency count for winter games and for the combined games column.
table(w_games)
## w_games
##  0  1  2  3  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19 20 22 
## 45 11  6  6  4  3 19  7  4  3  4  2  2  1  1  4  2  4  2  4 12
table(sw_games)
## sw_games
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  2  2  3  2  2  3  1  5  5  4 17  7  8  8  5  5  1  5  8  3  3  1  2  4  2  2 
## 28 30 32 33 34 36 37 38 39 40 41 42 43 45 46 47 48 49 
##  1  4  3  1  1  1  2  2  1  1  3  1  1  3  2  2  4  3

#4 Graphs

#4a,b - one histogram each for the number of summer and winter games
# The object returned by hist() is stored in hist_s / hist_w.
# breaks = 10 is a suggested number of cells; labels = TRUE prints the count
# above each bar; a white border separates adjacent bars.
hist_s <- hist(
  as.numeric(s_games),
  breaks = 10,
  main = "Frequencies of the summer games played by country",
  xlab = "Summer Games",
  col = "Orange",
  border = "#FFFFFF",
  labels = TRUE
)

hist_w <- hist(
  as.numeric(w_games),
  breaks = 10,
  main = "Frequencies of the winter games played by country",
  xlab = "Winter Games",
  col = "blue",
  border = "#FFFFFF",
  labels = TRUE
)

#4c - put the two histograms on one page
# mfrow = c(1, 2) splits the plotting device into one row of two panels.
par(mfrow = c(1, 2))
# Left panel: summer games.
hist(
  as.numeric(s_games),
  breaks = 10,
  main = "summer games ",
  xlab = "Summer Games",
  col = "Orange",
  border = "#FFFFFF",
  labels = TRUE
)
# Right panel: winter games; the result overwrites hist_w.
hist_w <- hist(
  as.numeric(w_games),
  breaks = 10,
  main = "winter games",
  xlab = "Winter Games",
  col = "blue",
  border = "#FFFFFF",
  labels = TRUE
)

#4d  two histograms on one page: total summer, total winter medals won
# One row, two panels.
par(mfrow = c(1, 2))
# These panels plot medal totals (s_total / w_total), so the titles and axis
# labels say "medals" rather than "games".
hist(
  as.numeric(s_total),
  breaks = 10,
  main = "summer medals",
  xlab = "Total Medals",
  col = "Orange",
  labels = TRUE,
  border = "#FFFFFF"
)
# hist_w is reassigned here and now holds the winter medal-total histogram.
hist_w <- hist(
  as.numeric(w_total),
  breaks = 10,
  main = "winter medals",
  xlab = "Total Medals",
  col = "blue",
  labels = TRUE,
  border = "#FFFFFF"
)

#4e is there a correlation between number of medals given out in winter and summer? (do plot)

#to reset to one graph per page
par(mfrow = c(1, 1))

#the 4e plot
# Scatter plot of medal totals per country. The first argument (s_total) is
# drawn on the x axis and the second (w_total) on the y axis, so x is labelled
# summer and y winter, and both are labelled as medals.
plot(
  s_total, w_total,
  type = "p",
  frame.plot = FALSE,
  xlab = "Total Summer Medals",
  ylab = "Total Winter Medals",
  col = "Blue",
  main = "Relationship btwn total # of medals awarded"
)

#4f : how about number of games each country competes in. Is there correlation between winter and summer?

# Scatter plot of games attended per country. s_games is the x variable and
# w_games the y variable, so the axis labels follow that order.
plot(
  s_games, w_games,
  type = "p",
  frame.plot = FALSE,
  xlab = "Summer Games",
  ylab = "Winter Games",
  col = "Purple",
  main = "Is there a correlation between summer vs. winter olympics?"
)

#4g look at distribution of each of the types of medals, by season (6 histograms on one page)
# 2 x 3 grid filled row by row: top row summer (orange), bottom row winter (blue).
par(mfrow = c(2, 3))
hist(s_gold, breaks = 20, xlab = "Summer Gold", col = "Orange", border = "#FFFFFF")
hist(s_silver, breaks = 20, xlab = "Summer Silver", col = "Orange", border = "#FFFFFF")
# The bronze panel gets its own label (it previously reused "Summer Silver").
hist(s_bronze, breaks = 20, xlab = "Summer Bronze", col = "Orange", border = "#FFFFFF")
hist(w_gold, breaks = 20, xlab = "Winter Gold", col = "Blue", border = "#FFFFFF")
hist(w_silver, breaks = 20, xlab = "Winter Silver", col = "Blue", border = "#FFFFFF")
hist(w_bronze, breaks = 20, xlab = "Winter Bronze", col = "Blue", border = "#FFFFFF")

#4h - redo g with different number of bins (10 instead of 20)
# Same 2 x 3 layout: top row summer (orange), bottom row winter (blue).
par(mfrow = c(2, 3))
hist(s_gold, breaks = 10, xlab = "Summer Gold", col = "Orange", border = "#FFFFFF")
hist(s_silver, breaks = 10, xlab = "Summer Silver", col = "Orange", border = "#FFFFFF")
# The bronze panel gets its own label (it previously reused "Summer Silver").
hist(s_bronze, breaks = 10, xlab = "Summer Bronze", col = "Orange", border = "#FFFFFF")
hist(w_gold, breaks = 10, xlab = "Winter Gold", col = "Blue", border = "#FFFFFF")
hist(w_silver, breaks = 10, xlab = "Winter Silver", col = "Blue", border = "#FFFFFF")
hist(w_bronze, breaks = 10, xlab = "Winter Bronze", col = "Blue", border = "#FFFFFF")