This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Week 1 Assignment – Loading Data into a Data Frame
# Choose one dataset on this url: https://data.fivethirtyeight.com/
# "American Chess Is Great Again" - Chess transfer data
# Read the chess transfer CSV (expected in the user's Downloads folder) into a data frame
chess_transfer_data <- read.csv(file = "~/Downloads/transfers_csv.csv")
# First things first: inspect the structure of the data frame. The first 3 and last 3 rows are then shown with head() and tail()
str(object = chess_transfer_data)
## 'data.frame': 932 obs. of 5 variables:
## $ url : Factor w/ 18 levels "https://ratings.fide.com/fedchange.phtml?year=2000",..: 1 1 1 1 1 1 2 2 2 2 ...
## $ id : int 2019221 14401754 14401762 2019221 14401754 14401762 6700284 1613782 2600536 2603977 ...
## $ federation : Factor w/ 105 levels "aho","alb","alg",..: 101 14 14 101 14 14 29 9 8 18 ...
## $ form.fed : Factor w/ 101 levels "","aho","alb",..: 70 23 100 70 23 100 40 47 18 100 ...
## $ transfer.date: Factor w/ 545 levels "1/10/14","1/11/07",..: 110 32 32 110 32 32 84 464 102 334 ...
head(x = chess_transfer_data, n = 3L)
## url id federation
## 1 https://ratings.fide.com/fedchange.phtml?year=2000 2019221 usa
## 2 https://ratings.fide.com/fedchange.phtml?year=2000 14401754 bih
## 3 https://ratings.fide.com/fedchange.phtml?year=2000 14401762 bih
## form.fed transfer.date
## 1 phi 12/15/00
## 2 cro 1/31/00
## 3 yug 1/31/00
tail(x = chess_transfer_data, n = 3L)
## url id federation
## 930 https://ratings.fide.com/fedchange.phtml?year=2017 2002515 pol
## 931 https://ratings.fide.com/fedchange.phtml?year=2017 407747 sco
## 932 https://ratings.fide.com/fedchange.phtml?year=2017 13900820 ger
## form.fed transfer.date
## 930 usa 1/9/17
## 931 eng 1/12/17
## 932 fid 3/29/17
# str() reported 'transfer.date' as a factor; parse it as month/day/two-digit-year so the column becomes a Date
chess_transfer_data[["transfer.date"]] <- as.Date(
  chess_transfer_data[["transfer.date"]],
  format = "%m/%d/%y"
)
str(object = chess_transfer_data)
## 'data.frame': 932 obs. of 5 variables:
## $ url : Factor w/ 18 levels "https://ratings.fide.com/fedchange.phtml?year=2000",..: 1 1 1 1 1 1 2 2 2 2 ...
## $ id : int 2019221 14401754 14401762 2019221 14401754 14401762 6700284 1613782 2600536 2603977 ...
## $ federation : Factor w/ 105 levels "aho","alb","alg",..: 101 14 14 101 14 14 29 9 8 18 ...
## $ form.fed : Factor w/ 101 levels "","aho","alb",..: 70 23 100 70 23 100 40 47 18 100 ...
## $ transfer.date: Date, format: "2000-12-15" "2000-01-31" ...
# Reduce the transfer date to its four-digit year; the column is overwritten with the year as a character string
chess_transfer_data[["transfer.date"]] <- format(
  chess_transfer_data[["transfer.date"]],
  "%Y"
)
str(object = chess_transfer_data)
## 'data.frame': 932 obs. of 5 variables:
## $ url : Factor w/ 18 levels "https://ratings.fide.com/fedchange.phtml?year=2000",..: 1 1 1 1 1 1 2 2 2 2 ...
## $ id : int 2019221 14401754 14401762 2019221 14401754 14401762 6700284 1613782 2600536 2603977 ...
## $ federation : Factor w/ 105 levels "aho","alb","alg",..: 101 14 14 101 14 14 29 9 8 18 ...
## $ form.fed : Factor w/ 101 levels "","aho","alb",..: 70 23 100 70 23 100 40 47 18 100 ...
## $ transfer.date: chr "2000" "2000" "2000" "2000" ...
# loading dplyr so I can use the pipe operator - %>%
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# How many transfers happen for each federation each year?
# count() is given the grouping columns directly, so the tally (column `n`)
# comes back without a leftover group_by() grouping attached to the result.
chess_transfers_clean <- chess_transfer_data %>%
  count(federation, transfer.date)
# Which country has the most transfers in total? Add up the yearly counts per federation.
count_of_total_transfers_by_country_table <- chess_transfers_clean %>%
  group_by(federation) %>%
  summarize(sum = sum(n))
# There are probably too many countries to graph, so keep only the top three.
# The ranking column is named explicitly (wt = sum) instead of letting top_n()
# fall back to the last column, which also avoids the "Selecting by sum" message.
# NOTE: per the dplyr documentation, ties can make top_n() return more than three rows.
top_three_countries_total <- count_of_total_transfers_by_country_table %>%
  top_n(n = 3, wt = sum)
# Attach each federation's overall total to its per-year counts so everything is in one table
totals_chess_transfers_table <- merge(
  x = chess_transfers_clean,
  y = count_of_total_transfers_by_country_table,
  by = "federation"
)
# Replace the column headers with descriptive names (assigned by position)
colnames(totals_chess_transfers_table) <- c(
  "Current_Federation",
  "Transfer_Year",
  "Number_of_Tranfers_in_Year",
  "Number_of_Tranfers_Total"
)
# Keep only the rows whose federation is one of the top three by total number of transfers
chess_transfers_top_3 <- filter(
  totals_chess_transfers_table,
  Current_Federation %in% top_three_countries_total$federation
)
# Graph the yearly transfer counts as a dodged bar chart, with one fill colour per federation
library(ggplot2)
ggplot(
  data = chess_transfers_top_3,
  mapping = aes(
    x = Transfer_Year,
    y = Number_of_Tranfers_in_Year,
    fill = Current_Federation
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "International Chess Transfers",
    subtitle = "Top Three Countries",
    caption = "Week 1 Assignment",
    x = "Year",
    y = "Number of Players"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )
# This graph seems to support the article's claim. The United States does appear to be acquiring the most chess players, which is consistent with an effort to improve the overall quality of the USA team. Even the countries with the second- and third-highest numbers of transfers don't acquire nearly as many players as the United States.
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.