Lab_4

Loading data

Here we load the NBA player data set and print a summary of each column.

# Read the NBA all-seasons CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file location exists.
player_data <- read.csv(
  file = 'C:/Users/rohan/OneDrive/Desktop/INTRO TO STATISTICS IN R/DATA SETS/Datasets/Data/Nba_all_seasons_1996_2021.csv'
)

# Per-column summary of the loaded data
summary(player_data)

##        X         player_name        team_abbreviation       age       
##  Min.   :    0   Length:12305       Length:12305       Min.   :18.00  
##  1st Qu.: 3076   Class :character   Class :character   1st Qu.:24.00  
##  Median : 6152   Mode  :character   Mode  :character   Median :26.00  
##  Mean   : 6152                                         Mean   :27.08  
##  3rd Qu.: 9228                                         3rd Qu.:30.00  
##  Max.   :12304                                         Max.   :44.00  
##  player_height   player_weight      college            country         
##  Min.   :160.0   Min.   : 60.33   Length:12305       Length:12305      
##  1st Qu.:193.0   1st Qu.: 90.72   Class :character   Class :character  
##  Median :200.7   Median : 99.79   Mode  :character   Mode  :character  
##  Mean   :200.6   Mean   :100.37                                        
##  3rd Qu.:208.3   3rd Qu.:108.86                                        
##  Max.   :231.1   Max.   :163.29                                        
##   draft_year        draft_round        draft_number             gp       
##  Length:12305       Length:12305       Length:12305       Min.   : 1.00  
##  Class :character   Class :character   Class :character   1st Qu.:31.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :57.00  
##                                                           Mean   :51.29  
##                                                           3rd Qu.:73.00  
##                                                           Max.   :85.00  
##       pts              reb              ast           net_rating      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :-250.000  
##  1st Qu.: 3.600   1st Qu.: 1.800   1st Qu.: 0.600   1st Qu.:  -6.400  
##  Median : 6.700   Median : 3.000   Median : 1.200   Median :  -1.300  
##  Mean   : 8.173   Mean   : 3.559   Mean   : 1.814   Mean   :  -2.256  
##  3rd Qu.:11.500   3rd Qu.: 4.700   3rd Qu.: 2.400   3rd Qu.:   3.200  
##  Max.   :36.100   Max.   :16.300   Max.   :11.700   Max.   : 300.000  
##     oreb_pct          dreb_pct        usg_pct           ts_pct      
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.02100   1st Qu.:0.096   1st Qu.:0.1490   1st Qu.:0.4800  
##  Median :0.04100   Median :0.131   Median :0.1810   Median :0.5240  
##  Mean   :0.05447   Mean   :0.141   Mean   :0.1849   Mean   :0.5111  
##  3rd Qu.:0.08400   3rd Qu.:0.180   3rd Qu.:0.2170   3rd Qu.:0.5610  
##  Max.   :1.00000   Max.   :1.000   Max.   :1.0000   Max.   :1.5000  
##     ast_pct          season         
##  Min.   :0.0000   Length:12305      
##  1st Qu.:0.0660   Class :character  
##  Median :0.1030   Mode  :character  
##  Mean   :0.1314                     
##  3rd Qu.:0.1780                     
##  Max.   :1.0000

Extracting samples from the data

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.1.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
# Number of random subsamples to draw (you can change this to 10 if needed)
sub_sample <- 5

# Empty list that will hold the subsamples, keyed "df_1", "df_2", ...
sub_sample_list <- list()

# Size of each subsample: roughly 50% of the rows of the full data set
size_of_sample <- floor(0.5 * nrow(player_data))

# Columns to include in every subsample
selected_columns <- c("player_name", "draft_number", "gp", "pts", "reb", "ast")

# Draw the random subsamples and store them in the list.
# seq_len() is used instead of 1:sub_sample so that a count of 0 produces
# zero iterations rather than iterating over c(1, 0).
for (i in seq_len(sub_sample)) {
  # Sample rows with replacement. The drawn rows are kept in their own
  # variable so the subsample count in `sub_sample` is not overwritten by a
  # data frame while the loop is running.
  current_sample <- player_data %>%
    select(all_of(selected_columns)) %>%
    sample_n(size = size_of_sample, replace = TRUE)

  # Assign a unique name to each subsample
  subsample_name <- paste0("df_", i)

  # Store the subsample in the list under that name
  sub_sample_list[[subsample_name]] <- current_sample
}

# Define the continuous column names
continuous_columns <- c("gp", "pts", "reb", "ast")

# One statistics data frame per continuous column, keyed by column name
list_for_stats <- list()

# Loop through each continuous column
for (c_name in continuous_columns) {

  # Zero-row template: fixes the column layout and is what gets stored for
  # this column if sub_sample_list happens to be empty
  stats_df <- data.frame(
    Subsample = character(0),
    Mean = numeric(0),
    Median = numeric(0),
    SD = numeric(0),
    IQR = numeric(0),
    Lower_Range = numeric(0),
    Upper_Range = numeric(0)
  )

  # One single-row data frame per subsample, preallocated and combined once
  # after the inner loop so that no subsample's row is lost
  stats_rows <- vector("list", length(sub_sample_list))

  # seq_along() yields no iterations for an empty list (1:length() would
  # iterate over c(1, 0))
  for (i in seq_along(sub_sample_list)) {
    # Values of the current column in the current subsample
    values <- sub_sample_list[[i]][[c_name]]

    # names = FALSE drops the "25%" / "75%" labels, which would otherwise
    # end up as the row name of the resulting data frame
    quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
    iqr_value <- quartiles[2] - quartiles[1]

    # Lower_Range / Upper_Range are the 1.5 * IQR fences used for outliers
    stats_rows[[i]] <- data.frame(
      Subsample = paste0("df_", i),
      Mean = mean(values),
      Median = median(values),
      SD = sd(values),
      IQR = iqr_value,
      Lower_Range = quartiles[1] - 1.5 * iqr_value,
      Upper_Range = quartiles[2] + 1.5 * iqr_value
    )
  }

  # Stack the template and all per-subsample rows into one data frame and
  # store it under the column name
  list_for_stats[[c_name]] <- do.call(rbind, c(list(stats_df), stats_rows))
}

# Print the statistics table of each continuous column.
# The loop variable itself is used for both the label and the lookup, so
# every column in continuous_columns is printed once, under its own name.
for (col_name in continuous_columns) {
  print(paste("Statistics for Column:", col_name))
  print(list_for_stats[[col_name]])
}

## [1] "Statistics for Column: ast"
##     Subsample     Mean Median       SD IQR Lower_Range Upper_Range
## 75%      df_5 1.859915    1.3 1.855384 1.9       -2.25        5.35
## [1] "Statistics for Column: ast"
##     Subsample     Mean Median       SD IQR Lower_Range Upper_Range
## 75%      df_5 1.859915    1.3 1.855384 1.9       -2.25        5.35
## [1] "Statistics for Column: ast"
##     Subsample     Mean Median       SD IQR Lower_Range Upper_Range
## 75%      df_5 1.859915    1.3 1.855384 1.9       -2.25        5.35
## [1] "Statistics for Column: ast"
##     Subsample     Mean Median       SD IQR Lower_Range Upper_Range
## 75%      df_5 1.859915    1.3 1.855384 1.9       -2.25        5.35

Lab_4

Rohan Royal

18/09/2023

Loading data

Extracting samples from the data