# Cargar paquetes necesarios
library(readr)      # Para leer archivos CSV
library(dplyr)      # Para manipulación de nike_sales_2024
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)    # Para visualización de nike_sales_2024
## Warning: package 'ggplot2' was built under R version 4.4.2
library(skimr)      # Para análisis exploratorio rápido
## Warning: package 'skimr' was built under R version 4.4.2
library(readxl)     # Para leer archivos Excel
library(gridExtra)  # Para organizar gráficos
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(moments)    # Para cálculo de simetría y curtosis
library(kableExtra) # Para tablas en RMarkdown
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Load the Nike 2024 sales workbook.
# The location can be overridden with the NIKE_SALES_PATH environment variable
# so the report can be knitted on other machines; the original absolute path
# stays as the default.
nike_sales_path <- Sys.getenv(
  "NIKE_SALES_PATH",
  unset = "D:/Usuario windows/Downloads/nike_sales_2024.xlsx"
)

# Fail early with an explicit message instead of a low-level reader error.
if (!file.exists(nike_sales_path)) {
  stop("No se encontró el archivo de datos: ", nike_sales_path, call. = FALSE)
}

nike_sales_2024 <- read_excel(nike_sales_path)

# Show the structure of the loaded data (column names, types, first values)
str(nike_sales_2024)
## tibble [1,000 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Month                  : chr [1:1000] "November" "January" "October" "December" ...
##  $ Region                 : chr [1:1000] "India" "India" "India" "Greater China" ...
##  $ Main_Category          : chr [1:1000] "Equipment" "Equipment" "Apparel" "Footwear" ...
##  $ Sub_Category           : chr [1:1000] "Bags" "Accessories" "Tops" "Cricket" ...
##  $ Product_Line           : chr [1:1000] "Gym Sack" "Hats" "Tech Fleece" "Vapor Cricket" ...
##  $ Price_Tier             : chr [1:1000] "Budget" "Budget" "Mid-Range" "Premium" ...
##  $ Units_Sold             : num [1:1000] 48356 9842 25079 41404 33569 ...
##  $ Revenue_USD            : num [1:1000] 14506800 2066820 1755530 8694840 5371040 ...
##  $ Online_Sales_Percentage: num [1:1000] 73 50 90 58 53 73 50 55 78 86 ...
##  $ Retail_Price           : num [1:1000] 300 210 70 210 160 140 230 150 150 230 ...
# Per-column summary: length/class/mode for the character columns and
# min, quartiles, mean and max for the numeric columns.
summary(nike_sales_2024)
##     Month              Region          Main_Category      Sub_Category      
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Product_Line        Price_Tier          Units_Sold     Revenue_USD      
##  Length:1000        Length:1000        Min.   : 5028   Min.   :  287400  
##  Class :character   Class :character   1st Qu.:17554   1st Qu.: 2344675  
##  Mode  :character   Mode  :character   Median :28685   Median : 4328020  
##                                        Mean   :28499   Mean   : 5039576  
##                                        3rd Qu.:40026   3rd Qu.: 7264942  
##                                        Max.   :49992   Max.   :14864700  
##  Online_Sales_Percentage  Retail_Price  
##  Min.   :50.00           Min.   : 50.0  
##  1st Qu.:60.00           1st Qu.:110.0  
##  Median :71.00           Median :180.0  
##  Mean   :70.04           Mean   :176.3  
##  3rd Qu.:80.00           3rd Qu.:240.0  
##  Max.   :90.00           Max.   :300.0

Objetivo

Identificar tendencias y patrones de crecimiento: analizar la evolución mensual de las ventas de Nike durante 2024 para detectar patrones de crecimiento, estacionalidad y posibles fluctuaciones en el rendimiento financiero a lo largo del año.

# ---- Variable operationalization table ----
# For every column of nike_sales_2024, record its nature, its measurement
# scale and the descriptive statistics and plots suggested for it.

# Numeric columns are treated as quantitative, everything else as qualitative.
# vapply() pins the result to exactly one logical per column; ifelse() keeps
# the column names on the resulting vector.
tipo_variable <- ifelse(
  vapply(nike_sales_2024, is.numeric, logical(1)),
  "Cuantitativa",
  "Cualitativa"
)

# Qualitative columns that are classified as ordinal instead of nominal.
variables_ordinales <- c("Price_Tier")

# Measurement scale: ratio for quantitative columns, ordinal/nominal otherwise.
escala_medida <- ifelse(
  tipo_variable == "Cuantitativa",
  "Razón",
  ifelse(names(nike_sales_2024) %in% variables_ordinales, "Ordinal", "Nominal")
)

# Suggested summaries and plots, chosen from the variable nature and scale.
frecuencias <- ifelse(tipo_variable == "Cualitativa", "Sí", "Agrupadas")
medidas_localizacion <- ifelse(
  tipo_variable == "Cuantitativa",
  "Media, Mediana, Moda",
  "Moda"
)
medidas_dispersion <- ifelse(
  tipo_variable == "Cuantitativa",
  "Desviación estándar, Rango intercuartílico",
  "N/A"
)
medidas_distribucion <- ifelse(
  tipo_variable == "Cuantitativa",
  "Asimetría, Curtosis",
  "N/A"
)
graficos <- ifelse(
  tipo_variable == "Cuantitativa",
  "Histograma, Boxplot",
  ifelse(
    escala_medida == "Ordinal",
    "Diagrama de barras ordenado",
    "Gráfico de barras"
  )
)

# row.names = NULL: the vectors above carry the column names, and data.frame()
# would otherwise promote them to row names, which kbl() then renders as an
# extra leading column that repeats the Variable column.
tabla_operacionalizacion <- data.frame(
  Variable = names(nike_sales_2024),
  Naturaleza_Variable = tipo_variable,
  Escala_de_Medidas = escala_medida,
  Frecuencias = frecuencias,
  Medidas_Localizacion = medidas_localizacion,
  Medidas_Dispersion = medidas_dispersion,
  Medidas_Distribucion = medidas_distribucion,
  Graficos_Sugeridos = graficos,
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Render the table: first column highlighted, remaining columns shaded.
# The shaded range follows the actual number of columns of the table.
tabla_operacionalizacion %>%
  kbl(caption = "Tabla de Operacionalización de Variables", escape = FALSE) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  ) %>%
  column_spec(1, bold = TRUE, color = "white", background = "#4CAF50") %>%
  column_spec(2:ncol(tabla_operacionalizacion), background = "#f0f0f0")
Tabla de Operacionalización de Variables
Variable Naturaleza_Variable Escala_de_Medidas Frecuencias Medidas_Localizacion Medidas_Dispersion Medidas_Distribucion Graficos_Sugeridos
Month Month Cualitativa Nominal Sí Moda N/A N/A Gráfico de barras
Region Region Cualitativa Nominal Sí Moda N/A N/A Gráfico de barras
Main_Category Main_Category Cualitativa Nominal Sí Moda N/A N/A Gráfico de barras
Sub_Category Sub_Category Cualitativa Nominal Sí Moda N/A N/A Gráfico de barras
Product_Line Product_Line Cualitativa Nominal Sí Moda N/A N/A Gráfico de barras
Price_Tier Price_Tier Cualitativa Ordinal Sí Moda N/A N/A Diagrama de barras ordenado
Units_Sold Units_Sold Cuantitativa Razón Agrupadas Media, Mediana, Moda Desviación estándar, Rango intercuartílico Asimetría, Curtosis Histograma, Boxplot
Revenue_USD Revenue_USD Cuantitativa Razón Agrupadas Media, Mediana, Moda Desviación estándar, Rango intercuartílico Asimetría, Curtosis Histograma, Boxplot
Online_Sales_Percentage Online_Sales_Percentage Cuantitativa Razón Agrupadas Media, Mediana, Moda Desviación estándar, Rango intercuartílico Asimetría, Curtosis Histograma, Boxplot
Retail_Price Retail_Price Cuantitativa Razón Agrupadas Media, Mediana, Moda Desviación estándar, Rango intercuartílico Asimetría, Curtosis Histograma, Boxplot
# Heading for the structure dump.
cat("\nInformación general del dataset:")
## 
## Información general del dataset:
# str() prints the structure itself and returns NULL invisibly, so wrapping it
# in print() only appended a stray "NULL" line after the structure.
str(nike_sales_2024)
## tibble [1,000 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Month                  : chr [1:1000] "November" "January" "October" "December" ...
##  $ Region                 : chr [1:1000] "India" "India" "India" "Greater China" ...
##  $ Main_Category          : chr [1:1000] "Equipment" "Equipment" "Apparel" "Footwear" ...
##  $ Sub_Category           : chr [1:1000] "Bags" "Accessories" "Tops" "Cricket" ...
##  $ Product_Line           : chr [1:1000] "Gym Sack" "Hats" "Tech Fleece" "Vapor Cricket" ...
##  $ Price_Tier             : chr [1:1000] "Budget" "Budget" "Mid-Range" "Premium" ...
##  $ Units_Sold             : num [1:1000] 48356 9842 25079 41404 33569 ...
##  $ Revenue_USD            : num [1:1000] 14506800 2066820 1755530 8694840 5371040 ...
##  $ Online_Sales_Percentage: num [1:1000] 73 50 90 58 53 73 50 55 78 86 ...
##  $ Retail_Price           : num [1:1000] 300 210 70 210 160 140 230 150 150 230 ...
## NULL
# Heading for the statistical summary.
cat("\nResumen estadístico:")
## 
## Resumen estadístico:
# Print the per-column summary table of the dataset.
print(summary(nike_sales_2024))
##     Month              Region          Main_Category      Sub_Category      
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Product_Line        Price_Tier          Units_Sold     Revenue_USD      
##  Length:1000        Length:1000        Min.   : 5028   Min.   :  287400  
##  Class :character   Class :character   1st Qu.:17554   1st Qu.: 2344675  
##  Mode  :character   Mode  :character   Median :28685   Median : 4328020  
##                                        Mean   :28499   Mean   : 5039576  
##                                        3rd Qu.:40026   3rd Qu.: 7264942  
##                                        Max.   :49992   Max.   :14864700  
##  Online_Sales_Percentage  Retail_Price  
##  Min.   :50.00           Min.   : 50.0  
##  1st Qu.:60.00           1st Qu.:110.0  
##  Median :71.00           Median :180.0  
##  Mean   :70.04           Mean   :176.3  
##  3rd Qu.:80.00           3rd Qu.:240.0  
##  Max.   :90.00           Max.   :300.0
# Split the dataset by column type: numeric columns on one side,
# character and factor columns on the other.
numeric_vars <- select(nike_sales_2024, where(is.numeric))
categorical_vars <- select(
  nike_sales_2024,
  where(is.character) | where(is.factor)
)

# --- 1. Histograms of the numeric variables ---
# Build one 30-bin histogram per numeric column and keep them in a list.
histograms <- lapply(names(numeric_vars), function(col_name) {
  plot_title <- paste("Histograma de", col_name)
  base_plot <- ggplot(nike_sales_2024, aes(x = .data[[col_name]]))
  base_plot +
    geom_histogram(bins = 30, fill = "blue", alpha = 0.6, color = "black") +
    theme_minimal() +
    ggtitle(plot_title)
})

# Lay the histograms out on a two-column grid (skipped when there are none).
if (length(histograms) > 0) {
  grid.arrange(grobs = histograms, ncol = 2)
}

# --- 2. Boxplots to spot outliers ---
# Build one boxplot per numeric column and keep them in a list.
boxplots <- lapply(names(numeric_vars), function(col_name) {
  plot_title <- paste("Boxplot de", col_name)
  base_plot <- ggplot(nike_sales_2024, aes(y = .data[[col_name]]))
  base_plot +
    geom_boxplot(fill = "red", alpha = 0.6, outlier.color = "black") +
    theme_minimal() +
    ggtitle(plot_title)
})

# Lay the boxplots out on a two-column grid (skipped when there are none).
if (length(boxplots) > 0) {
  grid.arrange(grobs = boxplots, ncol = 2)
}

# --- 3. Scatter plots for every pair of numeric variables ---
# Only meaningful when at least two numeric columns exist.
if (ncol(numeric_vars) >= 2) {
  # combn() yields each unordered pair of column names exactly once.
  scatter_plots <- lapply(
    combn(names(numeric_vars), 2, simplify = FALSE),
    function(pair) {
      x_name <- pair[1]
      y_name <- pair[2]
      plot_title <- paste("Dispersión entre", x_name, "y", y_name)
      ggplot(
        nike_sales_2024,
        aes(x = .data[[x_name]], y = .data[[y_name]])
      ) +
        geom_point(alpha = 0.6, color = "darkgreen") +
        theme_minimal() +
        ggtitle(plot_title)
    }
  )

  # Lay the scatter plots out on a two-column grid.
  if (length(scatter_plots) > 0) {
    grid.arrange(grobs = scatter_plots, ncol = 2)
  }
}

# --- 4. Bar charts for the categorical variables ---
# Only runs when at least one character/factor column exists.
if (ncol(categorical_vars) > 0) {
  # One count bar chart per categorical column, with tilted x labels so that
  # long category names stay readable.
  barplots <- lapply(names(categorical_vars), function(col_name) {
    plot_title <- paste("Distribución de", col_name)
    tilted_labels <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggplot(nike_sales_2024, aes(x = .data[[col_name]])) +
      geom_bar(fill = "purple", alpha = 0.6) +
      theme_minimal() +
      ggtitle(plot_title) +
      tilted_labels
  })

  # Lay the bar charts out on a two-column grid.
  if (length(barplots) > 0) {
    grid.arrange(grobs = barplots, ncol = 2)
  }
}

# --- 5. Normality test (Shapiro-Wilk) ---
# p-value of the Shapiro-Wilk test for each numeric column.
# vapply() guarantees one numeric value per column. Columns that
# shapiro.test() cannot handle get NA instead of aborting the whole report.
normality_results <- vapply(
  numeric_vars,
  function(x) {
    x <- x[!is.na(x)]
    # shapiro.test() requires between 3 and 5000 non-missing values and
    # rejects samples whose values are all identical.
    if (length(x) < 3 || length(x) > 5000 || length(unique(x)) < 2) {
      return(NA_real_)
    }
    shapiro.test(x)$p.value
  },
  numeric(1)
)

# Results table: a column is labelled "Normal" when p > 0.05; columns that
# could not be tested keep NA in both P_Value and Normality.
normality_df <- data.frame(
  Variable = names(normality_results),
  P_Value = normality_results,
  Normality = ifelse(normality_results > 0.05, "Normal", "No Normal")
)

print(normality_df)
##                                        Variable      P_Value Normality
## Units_Sold                           Units_Sold 3.559201e-17 No Normal
## Revenue_USD                         Revenue_USD 7.832045e-20 No Normal
## Online_Sales_Percentage Online_Sales_Percentage 8.222526e-17 No Normal
## Retail_Price                       Retail_Price 1.708022e-17 No Normal
# --- 6. Symmetry analysis (skewness) ---
# Skewness coefficient of each numeric column. vapply() guarantees one numeric
# value per column; na.rm = TRUE keeps this section consistent with the
# Shapiro-Wilk test, which also ignores missing values.
skewness_results <- vapply(
  numeric_vars,
  function(x) skewness(x, na.rm = TRUE),
  numeric(1)
)

# Results table with one row per numeric column
skewness_df <- data.frame(
  Variable = names(skewness_results),
  Skewness = skewness_results
)

# Label each coefficient. case_when() uses the first matching condition, so
# the thresholds are checked from the most positive to the most negative;
# a missing coefficient matches nothing and keeps an NA interpretation.
skewness_df <- skewness_df %>%
  mutate(Interpretation = case_when(
    Skewness > 1 ~ "Sesgo positivo alto (asimétrica a la derecha)",
    Skewness > 0.5 ~ "Sesgo positivo moderado",
    Skewness >= -0.5 ~ "Simétrica o aproximadamente normal",
    Skewness >= -1 ~ "Sesgo negativo moderado",
    Skewness < -1 ~ "Sesgo negativo alto (asimétrica a la izquierda)"
  ))

print(skewness_df)
##                                        Variable     Skewness
## Units_Sold                           Units_Sold -0.063246745
## Revenue_USD                         Revenue_USD  0.721613314
## Online_Sales_Percentage Online_Sales_Percentage  0.009362956
## Retail_Price                       Retail_Price -0.015083579
##                                             Interpretation
## Units_Sold              Simétrica o aproximadamente normal
## Revenue_USD                        Sesgo positivo moderado
## Online_Sales_Percentage Simétrica o aproximadamente normal
## Retail_Price            Simétrica o aproximadamente normal
# --- 7. Comparison between groups ---
# (renumbered: this header reused the number 5 after sections 5 and 6)
# Number of rows per month; table() lists the month names alphabetically.
table(nike_sales_2024$Month)
## 
##     April    August  December  February   January      July      June     March 
##        68        75        96        96        78        90        91        73 
##       May  November   October September 
##        85        72        85        91
# Compare every numeric variable between two months: one boxplot per variable
# plus Welch's two-sample t-test (var.equal = FALSE) on the two groups.

# Months present in the data, in order of first appearance. Missing labels are
# dropped so that NA can never be picked as one of the two groups.
meses_disponibles <- if ("Month" %in% names(nike_sales_2024)) {
  unique(nike_sales_2024$Month[!is.na(nike_sales_2024$Month)])
} else {
  character(0)
}

if (length(meses_disponibles) > 1) {
  # Use the first two months in order of appearance in the data
  meses <- meses_disponibles[1:2]
  nike_sales_2024_filtered <- nike_sales_2024 %>% filter(Month %in% meses)

  columnas_numericas <- names(nike_sales_2024_filtered)[
    vapply(nike_sales_2024_filtered, is.numeric, logical(1))
  ]

  for (var in columnas_numericas) {
    p <- ggplot(
      nike_sales_2024_filtered,
      aes(x = as.factor(Month), y = .data[[var]])
    ) +
      geom_boxplot(fill = "lightblue") +
      ggtitle(
        paste("Comparación de", var, "entre los meses", meses[1], "y", meses[2])
      )
    print(p)

    # Values of the current variable for each month. [[ ]] looks the column up
    # by the string held in `var`; pull(var) would presumably pick a column
    # literally named "var" first if the data ever had one.
    valores <- nike_sales_2024_filtered[[var]]
    group1 <- valores[nike_sales_2024_filtered$Month == meses[1]]
    group2 <- valores[nike_sales_2024_filtered$Month == meses[2]]

    # t.test() stops with an error when a group has fewer than two usable
    # observations; report it and continue with the next variable instead.
    if (sum(!is.na(group1)) < 2 || sum(!is.na(group2)) < 2) {
      cat(
        "\nComparación de", var, "entre", meses[1], "y", meses[2],
        ": datos insuficientes para la prueba t.\n"
      )
      next
    }

    # Test for a difference in means
    t_test <- t.test(group1, group2, var.equal = FALSE)
    cat("\nComparación de", var, "entre", meses[1], "y", meses[2], ": p-value =", round(t_test$p.value, 4), "\n")
    if (t_test$p.value < 0.05) {
      cat("Diferencia estadísticamente significativa.\n")
    } else {
      cat("No hay diferencia estadísticamente significativa.\n")
    }
  }
} else {
  cat("No hay suficientes meses en la base de nike_sales_2024 para realizar la comparación.\n")
}

## 
## Comparación de Units_Sold entre November y January : p-value = 0.8697 
## No hay diferencia estadísticamente significativa.

## 
## Comparación de Revenue_USD entre November y January : p-value = 0.2074 
## No hay diferencia estadísticamente significativa.

## 
## Comparación de Online_Sales_Percentage entre November y January : p-value = 0.7947 
## No hay diferencia estadísticamente significativa.

## 
## Comparación de Retail_Price entre November y January : p-value = 0.3531 
## No hay diferencia estadísticamente significativa.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Example chunk left over from the R Markdown template: summarises the `cars`
# dataset and does not use the Nike sales data.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.