===================================================================================================

1 Functions: clean up, filter, and subset the data

We have written a few functions in R that could be useful, and can be applied to any similar data frame. The data frame must contain columns, \(player\), \(played\_est\), and \(minute\).

1.1 \(get\_starters()\)

The function \(get\_starters()\) inputs a data frame, filters the data, and will return a data frame with only the starters for each game. We have defined a player as a starter, if \(played\_est = 1\) when \(minute=1\).

get_starters <- function(data) {
    
    n <- length(unique(data$gameId))
    m <- length(unique(data$player))
    my_df <- data.frame()
    
    for (i in 1:n) {
        for (j in 1:m) {
            game <- which(data$gameId == unique(data$gameId)[i])  # vector of unique games
            player <- which(data$player == unique(data$player)[j])  # vector of unique players
            
            if (!(!length(intersect(game, player)))) {
                
                if (data[intersect(game, player), ]$played_est[1] == 1) {
                  starter <- data[intersect(game, player), ]
                  my_df <- rbind(my_df, starter)
                }
                
                
            }
        }
    }
    return(my_df)
    
}

1.2 \(get\_bench()\)

The function \(get\_bench()\) inputs a data frame, filters the data, and will return a data frame with only the bench players for each game. We have defined a player as a bench player, if \(played\_est=0\) when \(minute=1\).

get_bench <- function(data) {
    n <- length(unique(data$gameId))
    m <- length(unique(data$player))
    my_df <- data.frame()
    # pb <- txtProgressBar(min = 0, max = n, style = 3)
    
    count = 0
    
    for (i in 1:n) {
        for (j in 1:m) {
            game <- which(data$gameId == unique(data$gameId)[i])
            player <- which(data$player == unique(data$player)[j])
            if (!(!length(intersect(game, player)))) {
                
                if (data[intersect(game, player), ]$played_est[1] == 0) {
                  bench <- data[intersect(game, player), ]
                  my_df <- rbind(my_df, bench)
                }
                
                
            }
        }
        # setTxtProgressBar(pb, i)
    }
    return(my_df)
}

1.3 \(filter\_absent()\)

The function \(filter\_absent()\) inputs a data frame, and will return a data frame with only the rows when a player has played in that game. If the sum of \(played\_est=0\) for a certain player during an individual game, then the data from that player/game is removed.

filter_absent <- function(data) {
    n <- length(unique(data$gameId))
    m <- length(unique(data$player))
    my_df <- data.frame()
    
    for (i in 1:n) {
        for (j in 1:m) {
            game <- which(data$gameId == unique(data$gameId)[i])  # vector of unique games
            player <- which(data$player == unique(data$player)[j])  # vector of unique players
            
            if (!(!length(intersect(game, player)))) {
                
                if (sum(data[intersect(game, player), ]$played_est) != 0) {
                  non_absent <- data[intersect(game, player), ]
                  my_df <- rbind(my_df, non_absent)
                }
                
                
            }
        }
    }
    return(my_df)
    
}

1.4 \(filter\_player()\)

filter_player <- function(data, player1) {
    
    my_df = data[which(data$player == player1), ]
    
    return(my_df)
}

1.5 \(filter\_minutes()\)

filter_minutes <- function(data, minutes) {
    
    my_df = data[which(data$minute %in% minutes), ]
    
    return(my_df)
}

1.6 \(add\_quarters()\)

add_quarters <- function(data1) {
    
    data1$quarter <- ifelse(data1$minute <= 48, "4", "5")
    data1$quarter <- ifelse(data1$minute <= 36, "3", data1$quarter)
    data1$quarter <- ifelse(data1$minute <= 24, "2", data1$quarter)
    data1$quarter <- ifelse(data1$minute <= 12, "1", data1$quarter)
    return(data1)
}

1.7 \(add\_cum\_min()\)

add_cum_mins <- function(data) {
    
    n <- length(unique(data$gameId))
    m <- length(unique(data$player))
    my_df <- data.frame()
    
    
    for (i in 1:n) {
        for (j in 1:m) {
            
            game <- which(data$gameId == unique(data$gameId)[i])  # vector of unique games
            player <- which(data$player == unique(data$player)[j])
            player_game <- data[intersect(game, player), ]
            
            player_game$cum_min <- cumsum(data[intersect(game, player), ]$mins_round)
            
            
            final <- transform(player_game, consec_min = mins_round * ave(mins_round, 
                c(0L, cumsum(diff(mins_round) != 0)), FUN = seq_along))
            
            my_df <- rbind(my_df, final)
        }
    }
    return(my_df)
}

1.8 \(make\_heat()\)

The function \(make\_heat()\) will aggregate the inputted data frame, taking the mean of the \(heat\) per each group of the \(y\_dim\), then genereate a heat map.

make_heat <- function(data, x_dim, y_dim, heat) {
    
    sub <- data[which(data[x_dim] == data[x_dim][1, 1]), ]
    my_df <- aggregate(sub[heat], by = list(sub[[y_dim]]), mean)
    
    n <- length(unique(data[[x_dim]]))
    
    cbind.fill <- function(...) {
        transpoted <- lapply(list(...), t)
        transpoted_dataframe <- lapply(transpoted, as.data.frame)
        return(data.frame(t(rbind.fill(transpoted_dataframe))))
    }
    
    for (i in (unique(data[x_dim])[1, ] + 1):(unique(data[x_dim])[n, ])) {
        sub <- data[which(data[[x_dim]] == i), ]
        my_df <- cbind.fill(my_df, aggregate(sub[heat], by = list(sub[[y_dim]]), 
            mean)[2])
    }
    
    names(my_df)[1] <- c("Player")
    names(my_df)[2:(n + 1)] <- as.character((unique(data[x_dim])[1, ]):(unique(data[x_dim])[n, 
        ]))
    
    
    my_df_melt <- melt(my_df, id.vars = "Player")
    my_df_melt[is.na(my_df_melt)] <- 0
    my_df_melt[3] <- as.numeric(data.matrix(my_df_melt[3]))
    
    names(my_df_melt) <- c("Player", "Minute", "Probability")
    
    
    base_size <- 8
    p <- ggplot(my_df_melt, aes(Minute, Player))
    p <- p + geom_tile(aes(fill = Probability), colour = "white")
    p <- p + scale_fill_gradient(low = "red", high = "yellow", limits = c(0, 
        1))
    p <- p + theme_grey(base_size = base_size)
    p <- p + labs(x = x_dim, y = "")
    p <- p + scale_x_discrete(expand = c(0, 0))
    p <- p + scale_y_discrete(expand = c(0, 0))
    p <- p + ggtitle("Heat Map: When a player is in the Game")
    p <- p + theme(panel.background = element_rect(fill = "white"), axis.ticks = element_blank(), 
        title = element_text(size = base_size * 1.5, family = "Proxima Nova Bold"), 
        axis.text.x = element_text(size = base_size * 0.8, angle = 360, hjust = 0, 
            colour = "grey50"))
    p
}

2 Understanding player substitutions and playing time

2.1 Explore the probability of a player playing each minute.

We now explore a heat map.

filtered <- filter_absent(cavs)

make_heat2(data = filtered, x_dim = "minute", y_dim = "player", heat = "played_est")

starters <- get_starters(cavs)
make_heat2(data = starters, x_dim = "minute", y_dim = "player", heat = "played_est")

filtered_bench <- get_bench(filtered)
make_heat2(data = filtered_bench, x_dim = "minute", y_dim = "player", heat = "played_est")

2.2 Explore the impact of foul trouble on playing time.

make_heat(data = filtered, x_dim = "fouls", y_dim = "player", heat = "played_est")

make_heat(data = starters, x_dim = "fouls", y_dim = "player", heat = "played_est")

make_heat(data = filtered_bench, x_dim = "fouls", y_dim = "player", heat = "played_est")

LeBron <- filter_player(cavs, "LeBron James")
LeBron <- filter_absent(LeBron)

make_heat3(LeBron, x_dim = "minute", y_dim = "fouls", heat = "played_est")

kyrie <- filter_player(cavs, "Kyrie Irving")
kyrie <- filter_absent(kyrie)

make_heat3(kyrie, x_dim = "minute", y_dim = "fouls", heat = "played_est")

love <- filter_player(cavs, "Kevin Love")
love <- filter_absent(love)

make_heat3(love, x_dim = "minute", y_dim = "fouls", heat = "played_est")

2.3 Explore how the number of prior minutes played and game competitiveness impact playing time.

make_consec_min_plot(cavs)