library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(dplyr)
# Load the raw game records from the CSV in the working directory and
# print a compact overview of the resulting data frame's columns.
data <- read.csv(file = "games.csv")
str(object = data)
## 'data.frame':    20058 obs. of  16 variables:
##  $ id            : chr  "TZJHLljE" "l1NXvwaE" "mIICvQHh" "kWKvrqYL" ...
##  $ rated         : chr  "FALSE" "TRUE" "TRUE" "TRUE" ...
##  $ created_at    : num  1.5e+12 1.5e+12 1.5e+12 1.5e+12 1.5e+12 ...
##  $ last_move_at  : num  1.5e+12 1.5e+12 1.5e+12 1.5e+12 1.5e+12 ...
##  $ turns         : int  13 16 61 61 95 5 33 9 66 119 ...
##  $ victory_status: chr  "outoftime" "resign" "mate" "mate" ...
##  $ winner        : chr  "white" "black" "white" "white" ...
##  $ increment_code: chr  "15+2" "5+10" "5+10" "20+0" ...
##  $ white_id      : chr  "bourgris" "a-00" "ischia" "daniamurashov" ...
##  $ white_rating  : int  1500 1322 1496 1439 1523 1250 1520 1413 1439 1381 ...
##  $ black_id      : chr  "a-00" "skinnerua" "a-00" "adivanov2009" ...
##  $ black_rating  : int  1191 1261 1500 1454 1469 1002 1423 2108 1392 1209 ...
##  $ moves         : chr  "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4" "d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6 Qe5+ Nxe5 c4 Bb4+" "e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc6 bxc6 Ra6 Nc4 a4 c3 a3 Nxa3 Rxa3 Rxa3 c4 dxc4 d5 cxd5 Qxd5 exd5 "| __truncated__ "d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O-O O-O-O Nb5 Nb4 Rc1 Nxa2 Ra1 Nb4 Nxa7+ Kb8 Nb5 Bxc2 Bxc7+ Kc8 Qd"| __truncated__ ...
##  $ opening_eco   : chr  "D10" "B00" "C20" "D02" ...
##  $ opening_name  : chr  "Slav Defense: Exchange Variation" "Nimzowitsch Defense: Kennedy Variation" "King's Pawn Game: Leonardis Variation" "Queen's Pawn Game: Zukertort Variation" ...
##  $ opening_ply   : int  5 4 3 3 5 4 10 5 6 4 ...
# Add the elapsed time between game creation and the last move.
# NOTE(review): created_at / last_move_at are numeric timestamps; the unit
# (presumably epoch milliseconds) is not shown here -- confirm before
# interpreting game_duration.
data <- data %>%
  mutate(game_duration = last_move_at - created_at)

# Drop rows containing any NA and give the result column a clearer name.
data_cleaned <- na.omit(data) %>%
  rename(outcome = victory_status)

# Keep only rated games and add the mean of the two players' ratings.
# `rated` is read as text (see the str() output), so it is compared as an
# upper-cased string; this also tolerates mixed spellings such as "True".
# The previous `filter(rated_games=TRUE)` passed a named argument, which
# dplyr rejects, and never referenced the `rated` column at all.
rated_games <- data_cleaned %>%
  filter(toupper(rated) == "TRUE") %>%
  mutate(average_rating = (white_rating + black_rating) / 2)
# Central tendency and spread of player ratings within rated_games,
# computed separately for the white and the black side.

# White players
mean_white <- mean(rated_games[["white_rating"]])
median_white <- median(rated_games[["white_rating"]])
sd_white <- sd(rated_games[["white_rating"]])

# Black players
mean_black <- mean(rated_games[["black_rating"]])
median_black <- median(rated_games[["black_rating"]])
sd_black <- sd(rated_games[["black_rating"]])
# Histogram of white-side ratings in rated_games (30 bins, blue fill).
ggplot(data = rated_games, mapping = aes(x = white_rating)) +
  geom_histogram(color = "black", fill = "blue", alpha = 0.6, bins = 30) +
  theme_minimal() +
  labs(x = "Rating", y = "Frequency", title = "Distribution of White Ratings")

# Histogram of black-side ratings in rated_games (30 bins, red fill).
ggplot(data = rated_games, mapping = aes(x = black_rating)) +
  geom_histogram(color = "black", fill = "red", alpha = 0.6, bins = 30) +
  theme_minimal() +
  labs(x = "Rating", y = "Frequency", title = "Distribution of Black Ratings")