knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(modeest)
## Warning: package 'modeest' was built under R version 4.4.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
#Introduccion Este proyecto analiza datos de jugadores de la NBA desde 1996 hasta 2022. El objetivo es descubrir patrones entre variables como edad, estatura, peso, puntos, rebotes, entre otras.
#Cargar el Dataset
Datos_NBA <- read.csv("C:/MisDatasets/all_seasons.csv")
#Hipótesis ¿Los jugadores más altos tienen más rebotes? ¿El peso tiene relación con los puntos anotados? ¿La edad al momento del draft ha cambiado con el tiempo?
#Estadisticas basicas
# Edad
mean(Datos_NBA$age, na.rm = TRUE)
## [1] 27.04531
median(Datos_NBA$age, na.rm = TRUE)
## [1] 26
mfv(Datos_NBA$age, na_rm = TRUE)
## [1] 24
sd(Datos_NBA$age, na.rm = TRUE)
## [1] 4.339211
# Estatura
mean(Datos_NBA$player_height, na.rm = TRUE)
## [1] 200.5551
median(Datos_NBA$player_height, na.rm = TRUE)
## [1] 200.66
mfv(Datos_NBA$player_height, na_rm = TRUE)
## [1] 205.74
sd(Datos_NBA$player_height, na.rm = TRUE)
## [1] 9.11109
# Peso
mean(Datos_NBA$player_weight, na.rm = TRUE)
## [1] 100.2633
median(Datos_NBA$player_weight, na.rm = TRUE)
## [1] 99.79024
mfv(Datos_NBA$player_weight, na_rm = TRUE)
## [1] 99.79024
sd(Datos_NBA$player_weight, na.rm = TRUE)
## [1] 12.42663
# Puntos
mean(Datos_NBA$pts, na.rm = TRUE)
## [1] 8.212582
median(Datos_NBA$pts, na.rm = TRUE)
## [1] 6.7
mfv(Datos_NBA$pts, na_rm = TRUE)
## [1] 2
sd(Datos_NBA$pts, na.rm = TRUE)
## [1] 6.016573
# Rebotes
mean(Datos_NBA$reb, na.rm = TRUE)
## [1] 3.558486
median(Datos_NBA$reb, na.rm = TRUE)
## [1] 3
mfv(Datos_NBA$reb, na_rm = TRUE)
## [1] 2
sd(Datos_NBA$reb, na.rm = TRUE)
## [1] 2.477885
#Correlaciones y Graficos
# Altura vs Rebotes
cor(Datos_NBA$player_height, Datos_NBA$reb, use = "complete.obs")
## [1] 0.4242205
ggplot(Datos_NBA, aes(x = player_height, y = reb)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula = 'y ~ x'
# Peso vs Puntos
cor(Datos_NBA$player_weight, Datos_NBA$pts, use = "complete.obs")
## [1] -0.0250229
ggplot(Datos_NBA, aes(x = player_weight, y = pts)) +
geom_point(color = "darkgreen") +
geom_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula = 'y ~ x'
# Draft year vs Edad
Datos_NBA <- Datos_NBA %>%
filter(!is.na(draft_year), draft_year != "Undrafted", draft_year != "")
Datos_NBA$draft_year <- as.numeric(as.character(Datos_NBA$draft_year))
cor(Datos_NBA$draft_year, Datos_NBA$age, use = "complete.obs")
## [1] -0.5590364
ggplot(Datos_NBA, aes(x = draft_year, y = age)) +
geom_point(color = "purple") +
geom_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula = 'y ~ x'
#Boxplots y Outliers
# Edad
ggplot(Datos_NBA, aes(y = age)) +
geom_boxplot(fill = "skyblue") +
geom_hline(yintercept = mean(Datos_NBA$age, na.rm = TRUE), linetype = "dashed", color = "red")
# Estatura
ggplot(Datos_NBA, aes(y = player_height)) +
geom_boxplot(fill = "lightgreen") +
geom_hline(yintercept = mean(Datos_NBA$player_height, na.rm = TRUE), linetype = "dashed", color = "red")
# Peso
ggplot(Datos_NBA, aes(y = player_weight)) +
geom_boxplot(fill = "lightblue") +
geom_hline(yintercept = mean(Datos_NBA$player_weight, na.rm = TRUE), linetype = "dashed", color = "red")
# Puntos
ggplot(Datos_NBA, aes(y = pts)) +
geom_boxplot(fill = "orange") +
geom_hline(yintercept = mean(Datos_NBA$pts, na.rm = TRUE), linetype = "dashed", color = "red")
# Rebotes
ggplot(Datos_NBA, aes(y = reb)) +
geom_boxplot(fill = "purple") +
geom_hline(yintercept = mean(Datos_NBA$reb, na.rm = TRUE), linetype = "dashed", color = "red")
#Clustering (K-means)
clustering_data <- Datos_NBA %>%
select(age, player_height, player_weight, pts, reb) %>%
na.omit()
clustering_data_scaled <- scale(clustering_data)
set.seed(123)
kmeans_result <- kmeans(clustering_data_scaled, centers = 3, nstart = 25)
# Ver tamaño de los grupos
table(kmeans_result$cluster)
##
## 1 2 3
## 3982 2453 4051
# Visualización
fviz_cluster(kmeans_result, data = clustering_data_scaled)
#Regresion Lineal
modelo_reg <- lm(reb ~ player_height, data = Datos_NBA)
summary(modelo_reg)
##
## Call:
## lm(formula = reb ~ player_height, data = Datos_NBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4090 -1.5774 -0.2751 1.2226 12.6204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.096426 0.506423 -39.68 <2e-16 ***
## player_height 0.118999 0.002515 47.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.297 on 10484 degrees of freedom
## Multiple R-squared: 0.176, Adjusted R-squared: 0.1759
## F-statistic: 2239 on 1 and 10484 DF, p-value: < 2.2e-16
ggplot(Datos_NBA, aes(x = player_height, y = reb)) +
geom_point(alpha = 0.3, color = "blue") +
geom_smooth(method = "lm", color = "red") +
labs(title = "Regresión Lineal: Estatura vs Rebotes")
## `geom_smooth()` using formula = 'y ~ x'
#Conclusion Este análisis mostró que la estatura de los jugadores se relaciona positivamente con los rebotes, mientras que el peso no se relaciona claramente con los puntos anotados. También se encontró que los jugadores están siendo drafteados a edades cada vez más jóvenes, y que existen valores atípicos (outliers) en variables como puntos, edad y estatura. El clustering agrupó a los jugadores en tres perfiles distintos, y el modelo de regresión lineal confirmó que la estatura ayuda a predecir la cantidad de rebotes, aunque no por sí sola.