# load file
URL <- "https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/Projects/Project1/data/raw_data_input.csv"
input <- RCurl::getURL(URL) %>% textConnection(.) %>% 
        read.table(., stringsAsFactors = F, sep = "|", blank.lines.skip = T, fill = T)
head(input)
# clean up the input file
df <- input %>% 
        dplyr::filter(!grepl(pattern = "[-]+", V1)) %>%
        dplyr::select(V1:V10) %>%
        sapply(., function(x) stringr::str_trim(x, side = "both")) %>%
        as.data.frame
head(df)
# create a key-value pair for each user using the V1, V2 columns 
kyPair <- df %>%
        dplyr::select(V1, V2) %>%
        dplyr::filter(!grepl(pattern = "[[:lower:]]", V2)) %>%  # extract every row except any that contains a lower case letter in V2
        dplyr::mutate(row_id = row_number())

# extract values & break the table into two pieces before merging
table1 <- kyPair %>%
        dplyr::filter(grepl(pattern = "[[:digit:]]", V1)) %>%
        dplyr::rename(player_id = V1, player_name = V2)

table2 <- kyPair %>%
        dplyr::filter(grepl(pattern = "[[:alpha:]]", V1)) %>%
        dplyr::rename(player_state = V1, player_prerating = V2) %>%
        dplyr::mutate(row_id = row_id -1,
                      player_prerating = stringr::str_extract(player_prerating, pattern = "R:[ ]*[[:digit:]]+") %>%
                              gsub(pattern = "R: ", "", .) %>% as.numeric)
                              
# merge the two tables together, overwrite the kyPair table
kyPair <- merge(table1, table2) %>%
        dplyr::select(-row_id)
head(kyPair)
# merge it back with df, and then tidy up the "rounds" using tidyr::gather
dfTidy <- merge(kyPair, df, by.x = "player_id", by.y = "V1") %>%
        dplyr::select(-V2) %>%
        # rename columns
        dplyr::rename(total_pts = V3,
                      round_1 = V4,
                      round_2 = V5,
                      round_3 = V6,
                      round_4 = V7,
                      round_5 = V8,
                      round_6 = V9,
                      round_7 = V10) %>%
        # fix data type from factor to appropriate type
        dplyr::mutate(player_id = player_id %>% as.character %>% as.integer,
                      player_name = player_name %>% as.character,
                      player_state = player_state %>% as.character,
                      total_pts = total_pts %>% as.character %>% as.numeric,
                      round_1 = stringr::str_extract(round_1, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer,
                      round_2 = stringr::str_extract(round_2, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer,
                      round_3 = stringr::str_extract(round_3, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer,
                      round_4 = stringr::str_extract(round_4, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer,
                      round_5 = stringr::str_extract(round_5, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer,
                      round_6 = stringr::str_extract(round_6, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer,
                      round_7 = stringr::str_extract(round_7, pattern = "[[:digit:]]{1,2}") %>% as.character %>% as.integer) %>% 
        # gather the "rounds" together into just one field before calculation
        tidyr::gather(., round, opponent_id, 
                      one_of(c("round_1", "round_2", "round_3", "round_4", "round_5", "round_6", "round_7"))) %>%
        dplyr::filter(opponent_id != 0) %>%
        arrange(player_id, round)

head(dfTidy)
# output - calculation of "avg_pre_chess_opponent_rating" in sql
output <- sqldf::sqldf(
"
select c.player_name
, c.player_state
, c.total_pts
, c.player_prerating
, round(avg(c.opponent_prerating)) as avg_pre_chess_opponent_rating
from (
    select a.player_id
    , a.player_name
    , a.player_state
    , a.player_prerating
    , a.total_pts
    , a.round 
    , a.opponent_id
    , b.player_prerating as opponent_prerating
    from dfTidy a
    join ( select player_id, player_prerating from dfTidy group by 1, 2 ) b on a.opponent_id = b.player_id
) c
group by 1, 2, 3, 4
"
)

output
# save output in csv
# setwd("../")
# setwd("data")
# write.csv(output, "tidy_data_output.csv", append = F, row.names = F)