Load the NBA player data set and print summary statistics for each column.
# Location of the NBA all-seasons (1996-2021) CSV file
data_file <- 'C:/Users/rohan/OneDrive/Desktop/INTRO TO STATISTICS IN R/DATA SETS/Datasets/Data/Nba_all_seasons_1996_2021.csv'
# Read the CSV into a data frame
player_data <- read.csv(file = data_file)
# Show per-column summary statistics
summary(object = player_data)
## X player_name team_abbreviation age
## Min. : 0 Length:12305 Length:12305 Min. :18.00
## 1st Qu.: 3076 Class :character Class :character 1st Qu.:24.00
## Median : 6152 Mode :character Mode :character Median :26.00
## Mean : 6152 Mean :27.08
## 3rd Qu.: 9228 3rd Qu.:30.00
## Max. :12304 Max. :44.00
## player_height player_weight college country
## Min. :160.0 Min. : 60.33 Length:12305 Length:12305
## 1st Qu.:193.0 1st Qu.: 90.72 Class :character Class :character
## Median :200.7 Median : 99.79 Mode :character Mode :character
## Mean :200.6 Mean :100.37
## 3rd Qu.:208.3 3rd Qu.:108.86
## Max. :231.1 Max. :163.29
## draft_year draft_round draft_number gp
## Length:12305 Length:12305 Length:12305 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.:31.00
## Mode :character Mode :character Mode :character Median :57.00
## Mean :51.29
## 3rd Qu.:73.00
## Max. :85.00
## pts reb ast net_rating
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :-250.000
## 1st Qu.: 3.600 1st Qu.: 1.800 1st Qu.: 0.600 1st Qu.: -6.400
## Median : 6.700 Median : 3.000 Median : 1.200 Median : -1.300
## Mean : 8.173 Mean : 3.559 Mean : 1.814 Mean : -2.256
## 3rd Qu.:11.500 3rd Qu.: 4.700 3rd Qu.: 2.400 3rd Qu.: 3.200
## Max. :36.100 Max. :16.300 Max. :11.700 Max. : 300.000
## oreb_pct dreb_pct usg_pct ts_pct
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.02100 1st Qu.:0.096 1st Qu.:0.1490 1st Qu.:0.4800
## Median :0.04100 Median :0.131 Median :0.1810 Median :0.5240
## Mean :0.05447 Mean :0.141 Mean :0.1849 Mean :0.5111
## 3rd Qu.:0.08400 3rd Qu.:0.180 3rd Qu.:0.2170 3rd Qu.:0.5610
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.5000
## ast_pct season
## Min. :0.0000 Length:12305
## 1st Qu.:0.0660 Class :character
## Median :0.1030 Mode :character
## Mean :0.1314
## 3rd Qu.:0.1780
## Max. :1.0000
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Number of random subsamples to draw (you can change this to 10 if needed)
sub_sample <- 5
# Named list that collects the subsamples under the names df_1, df_2, ...
sub_sample_list <- list()
# Number of rows per subsample: roughly 50% of the rows of the full data set
size_of_sample <- floor(0.5 * nrow(player_data))
# Columns to keep in every subsample
selected_columns <- c("player_name", "draft_number", "gp", "pts", "reb", "ast")
# Restrict the data to the selected columns once; this does not change
# between iterations, so it is done outside the loop
selected_data <- player_data %>%
  select(all_of(selected_columns))
# Draw the random subsamples and store them in the list.
# seq_len() is used instead of 1:n so a count of 0 yields no iterations.
for (i in seq_len(sub_sample)) {
  # Sample rows with replacement. The result gets its own name so that the
  # subsample count stored in `sub_sample` is not overwritten by a data frame.
  current_sample <- selected_data %>%
    sample_n(size = size_of_sample, replace = TRUE)
  # Store the subsample under a unique name
  sub_sample_list[[paste0("df_", i)]] <- current_sample
}
# Names of the continuous columns to summarise in every subsample
continuous_columns <- c("gp", "pts", "reb", "ast")

# Build a one-row data frame of summary statistics for one numeric vector.
#   values: numeric vector taken from one subsample column
#   label:  name of the subsample, stored in the Subsample column
# Lower_Range / Upper_Range are the outlier fences Q1 - 1.5 * IQR and
# Q3 + 1.5 * IQR.
summarise_subsample <- function(values, label) {
  # names = FALSE keeps the "25%" / "75%" labels out of the result, which
  # would otherwise end up as the row name of the data frame
  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  iqr_value <- quartiles[2] - quartiles[1]
  data.frame(
    Subsample = label,
    Mean = mean(values),
    Median = median(values),
    SD = sd(values),
    IQR = iqr_value,
    Lower_Range = quartiles[1] - 1.5 * iqr_value,
    Upper_Range = quartiles[2] + 1.5 * iqr_value
  )
}

# Zero-row template so each table has the expected columns even when there
# are no subsamples
stats_df <- data.frame(
  Subsample = character(0),
  Mean = numeric(0),
  Median = numeric(0),
  SD = numeric(0),
  IQR = numeric(0),
  Lower_Range = numeric(0),
  Upper_Range = numeric(0)
)

# One statistics table per continuous column, keyed by column name
list_for_stats <- list()
for (c_name in continuous_columns) {
  # One row per subsample; seq_along() is safe when the list is empty
  rows <- lapply(seq_along(sub_sample_list), function(i) {
    summarise_subsample(sub_sample_list[[i]][[c_name]], paste0("df_", i))
  })
  # Combine all rows in one call so every subsample is kept (previously only
  # the last subsample's row survived because the accumulator was never updated)
  list_for_stats[[c_name]] <- do.call(rbind, c(list(stats_df), rows))
}
# Print the statistics table of every continuous column.
# The loop variable itself is used for both the heading and the lookup, so
# each column is reported once instead of repeating a stale column name.
for (col_name in continuous_columns) {
  print(paste("Statistics for Column:", col_name))
  print(list_for_stats[[col_name]])
}
## [1] "Statistics for Column: ast"
## Subsample Mean Median SD IQR Lower_Range Upper_Range
## 75% df_5 1.859915 1.3 1.855384 1.9 -2.25 5.35
## [1] "Statistics for Column: ast"
## Subsample Mean Median SD IQR Lower_Range Upper_Range
## 75% df_5 1.859915 1.3 1.855384 1.9 -2.25 5.35
## [1] "Statistics for Column: ast"
## Subsample Mean Median SD IQR Lower_Range Upper_Range
## 75% df_5 1.859915 1.3 1.855384 1.9 -2.25 5.35
## [1] "Statistics for Column: ast"
## Subsample Mean Median SD IQR Lower_Range Upper_Range
## 75% df_5 1.859915 1.3 1.855384 1.9 -2.25 5.35