In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player's Name, Player's State, Total Number of Points, Player's Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
All of your code should be in an R markdown file (and published to rpubs.com), with your data accessible for the person running the script.
library(stringr)
#read data
raw_data<-read.delim("https://raw.githubusercontent.com/anrcarson/CUNY-MSDA/master/DATA607/Project1/tournamentinfo.txt",header = FALSE, stringsAsFactors = FALSE)
#view data
#raw_data
#remove header
data_no_header<-raw_data[5:196,]
#get name row
data_name_rows<-data_no_header[seq(from=1,to=192, by=3)]
name_df<-
as.data.frame(matrix(unlist(str_split(data_name_rows,"\\|")),nrow=64, byrow=T), stringsAsFactors = FALSE) #credit: http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
#set names
names(name_df)<-c("PairNum","PlayerName","TotalPoints","Round1","Round2","Round3","Round4","Round5","Round6","Round7","Empty")
name_df$Empty<-NULL
#get state row
data_state_rows<-data_no_header[seq(from=2,to=192, by=3)]
state_df<-
as.data.frame(matrix(unlist(str_split(data_state_rows,"\\|")),nrow=64, byrow=T), stringsAsFactors = FALSE) #credit: http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
names(state_df)<-c("PlayerState","IdRating","TotalPoints_2","Round1_2","Round2_2","Round3_2","Round4_2","Round5_2","Round6_2","Round7_2","Empty")
state_df$Empty<-NULL
#combine the two dfs
df<-cbind(name_df,state_df)
#get pre-rating
df$PreRating<-matrix(unlist(str_extract_all(df$IdRating,"[:digit:]{3,}")), nrow=64, byrow=T)[,2]
df$PreRating<-as.numeric(df$PreRating)
#get opponents
df$Round1_Opp<-str_extract(df$Round1,"[:digit:]{1,}")
df$Round2_Opp<-str_extract(df$Round2,"[:digit:]{1,}")
df$Round3_Opp<-str_extract(df$Round3,"[:digit:]{1,}")
df$Round4_Opp<-str_extract(df$Round4,"[:digit:]{1,}")
df$Round5_Opp<-str_extract(df$Round5,"[:digit:]{1,}")
df$Round6_Opp<-str_extract(df$Round6,"[:digit:]{1,}")
df$Round7_Opp<-str_extract(df$Round7,"[:digit:]{1,}")
#trim pairnum
df$PairNum<-str_trim(df$PairNum)
#get average pre-rating of opponents
average_prerating <- c()
for(i in 1:nrow(df)) {
#i<-44
average_prerating<-c(average_prerating,
round(
mean(
c(
df$PreRating[which(df$PairNum == df$Round1_Opp[i])]
,df$PreRating[which(df$PairNum == df$Round2_Opp[i])]
,df$PreRating[which(df$PairNum == df$Round3_Opp[i])]
,df$PreRating[which(df$PairNum == df$Round4_Opp[i])]
,df$PreRating[which(df$PairNum == df$Round5_Opp[i])]
,df$PreRating[which(df$PairNum == df$Round6_Opp[i])]
,df$PreRating[which(df$PairNum == df$Round7_Opp[i])]
)
,na.rm = TRUE, trim=0)
))
}
df$AveragePreRating_Opponents<-average_prerating
#assemble final df
final_df<-data.frame(
cbind(
str_trim(df$PlayerName)
,str_trim(df$PlayerState)
,str_trim(df$TotalPoints)
,str_trim(df$PreRating)
,str_trim(df$AveragePreRating_Opponents)
) #cbind
,stringsAsFactors = FALSE)
names(final_df)<-c("PlayerName","PlayerState","PlayerTotalPoints","PlayerPreRating","AvgPreRating_Opponents")
#write the csv file
write.csv(final_df, file="tournamentinfo_clean.csv", row.names = FALSE)
#get post-rating
final_df$PlayerPostRating<-
matrix(unlist(str_extract_all(df$IdRating,"[:digit:]{3,}")), nrow=64, byrow=T)[,3]
final_df$PlayerPostRating<-as.numeric(final_df$PlayerPostRating)
#make numeric
final_df$PlayerPreRating<-as.numeric(final_df$PlayerPreRating)
final_df$AvgPreRating_Opponents<-as.numeric(final_df$AvgPreRating_Opponents)
#get PostRating - PreRating difference
final_df$PlayerPostPreDiff<-final_df$PlayerPostRating - final_df$PlayerPreRating
#get PlayerPrerating - AvgPreRating_Opponents diff
final_df$PlayerPreRateAvgOppDiff<- final_df$PlayerPreRating - final_df$AvgPreRating_Opponents
Here is the data.
library(DT)
datatable(final_df)
Pre-rating is skewed left. The mean is 1378 with a low of 377 and a max of 1764. The standard deviation is 265.
hist(final_df$PlayerPreRating, xlim = c(0,2000), ylim = c(0,10), breaks = seq(0,2000,50))
summary(final_df$PlayerPreRating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 377 1227 1407 1378 1583 1794
sd(final_df$PlayerPreRating)
## [1] 265.4756
Post-rating is more normal than pre-rating, though still skewed left. The mean is 1398 with a min of 878 and a max of 1817. The standard deviation is 231.
hist(final_df$PlayerPostRating, xlim = c(0,2000), ylim = c(0,10), breaks = seq(0,2000,50))
summary(final_df$PlayerPostRating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 878 1200 1418 1398 1578 1817
sd(final_df$PlayerPostRating)
## [1] 231.2936
Looking at a boxplot for each, the distribution doesn’t change much, except that PostRating no longer has the outlier that PreRating had.
boxplot(final_df$PlayerPreRating, final_df$PlayerPostRating, names = c("PreRating","PostRating"))
Viewing the difference between the pre-rating and the post-rating, we see that it is not normall distributed. It is highly skewed right. The min is -94, the max is 699, and the median is 0 while the mean is 19.67. Interestingly, the same number of players gained in rating (31) as did lose in rating (31), while 2 players had no change.
hist(final_df$PlayerPostPreDiff)
summary(final_df$PlayerPostPreDiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -94.00 -26.50 0.00 19.67 24.00 699.00
sum(final_df$PlayerPostPreDiff < 0)/length(final_df$PlayerPostPreDiff)
## [1] 0.484375
sum(final_df$PlayerPostPreDiff > 0)/length(final_df$PlayerPostPreDiff)
## [1] 0.484375
sum(final_df$PlayerPostPreDiff == 0)/length(final_df$PlayerPostPreDiff)
## [1] 0.03125