Directorio de trabajo

knitr::opts_knit$set(root.dir = "~/Desktop/Semestre 4/Ciencia de Datos para la Toma de Decisiones/Reto/Reto_CienciaDatosI")

Bibliotecas

# install.packages('tidyverse')
library('tidyverse')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# install.packages('ggalluvial')
library('ggalluvial')
# install.packages('ggplot2')
library('ggplot2')
# install.packages('dplyr')
library('dplyr')
# install.packages('tidygraph')
library('tidygraph')
## 
## Attaching package: 'tidygraph'
## 
## The following object is masked from 'package:stats':
## 
##     filter
# install.packages('igraph')                                                 
library('igraph')
## 
## Attaching package: 'igraph'
## 
## The following object is masked from 'package:tidygraph':
## 
##     groups
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
# install.packages('ggraph')
library('ggraph')
# install.packages('ggrepel')
library('ggrepel')
# install.packages('GGally')
library('GGally')
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# install.packages('janitor')
library('janitor')
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# install.packages('FactoMineR')
library('FactoMineR') 
# install.packages('factoextra')
library('factoextra')
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# install.packages('readxl')
library('readxl')
# install.packages('p2distance')
library('p2distance')
# install.packages('stratification')
library('stratification')
# install.packages('sf')
library('sf')
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
# install.packages('RColorBrewer')
library('RColorBrewer')
# install.packages('dendextend')
library('dendextend')
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## 
## Attaching package: 'dendextend'
## 
## The following object is masked from 'package:stats':
## 
##     cutree
# install.packages("viridis")
library('viridis')
## Loading required package: viridisLite
# install.packages(ggdendro)
library('ggdendro')
## 
## Attaching package: 'ggdendro'
## 
## The following object is masked from 'package:dendextend':
## 
##     theme_dendro
# install.packages('sysfonts')
library('sysfonts')
# install.packages('showtext')
library('showtext')
## Loading required package: showtextdb

Datos

df <- read.csv('data/economy_foreign_trade_nat_2025-05-28T00_31_14.973Z.csv') %>% 
  clean_names() %>% 
  select(-hs4) %>% 
  mutate(hs4_id = as.character(hs4_id -160000))
  
df_2 <- read.csv('data/economy_foreign_trade_nat_2025-05-28T15_38_42.698Z.csv') %>% 
  clean_names()
  
df_3 <- read.csv('data/economy_foreign_trade_ent_2025-05-28T16_19_50.381Z.csv') %>% 
  clean_names()

df_4 <- read.csv('data/economy_foreign_trade_ent_2025-05-31T08_49_02.497Z.csv') %>% 
  clean_names() %>% 
  select(-hs4) %>% 
  mutate(hs4_id = as.character(hs4_id -160000)) 

mapa <- st_read('data/ne_10m_admin_0_countries')
## Reading layer `ne_10m_admin_0_countries' from data source 
##   `/Users/Diego1/Desktop/Semestre 4/Ciencia de Datos para la Toma de Decisiones/Reto/Reto_CienciaDatosI/data/ne_10m_admin_0_countries' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 258 features and 168 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## Geodetic CRS:  WGS 84
mexico <- st_read('data/entidades') %>% 
  clean_names() %>% 
  select(-cvegeo, -cve_ent)
## Reading layer `00ent' from data source 
##   `/Users/Diego1/Desktop/Semestre 4/Ciencia de Datos para la Toma de Decisiones/Reto/Reto_CienciaDatosI/data/entidades' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 32 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 911292 ymin: 319149.1 xmax: 4082997 ymax: 2349615
## Projected CRS: MEXICO_ITRF_2008_LCC
font_add_google("Montserrat", "Montserrat")

Pregunta de investigación

¿Qué oportunidades comerciales están siendo desaprovechadas en la industria mexicana de semiconductores en función de los patrones de exportación actuales?

Hipótesis

La industria mexicana de semiconductores presenta oportunidades comerciales no aprovechadas hacia países fuera de sus socios tradicionales, las cuales pueden identificarse mediante análisis jerárquico y visualización geográfica de flujos de exportación.

Estratificación Jerárquica (¿Qué oportunidades no se están aprovechando?)

Para encontrar un grupo de productos que representan una oportunidad, se hace una estratificación jerárquica con ward.D2, que se centra en minimizar la varianza dentro de cada cluster. Se resaltan los productos que se encuentran en el mismo cluster. La lógica es que los productos con volúmen de exportaciones similares presentan relaciones ya existentes pero no fortalecidas, oportunidades para la industria de Semiconductores.

distancias = df %>% 
  column_to_rownames(var = 'hs4_id') %>% 
  dist(method = "euclidean")

complete_fit = hclust(distancias, method = "ward.D2") # Se crea una estratificación
# jerárquica con método ward.D2 (que busca minimizar las varianzas dentro de los 
# clusters)

dendograma = as.dendrogram(complete_fit) # Se crea el dendrograma

clusters = cutree(complete_fit, k = 5) %>% # Se establecen cinco 
# clusters a partir de los datos
  as_tibble(rownames = 'hs4_id') %>% 
  rename(cluster = value) %>%         
  mutate(cluster = factor(cluster))

labels_in_cluster <- clusters %>%
  filter(cluster == 1) %>%
  pull(hs4_id)

all_labels_colors <- rep("#C69C6D", length(labels(dendograma)))
names(all_labels_colors) <- labels(dendograma) 

all_labels_colors[labels_in_cluster] <- "#B21B40"

all_labels_colors["8541"] <- "#5E1224"

dendograma_coloreado <- dendograma %>%
  set("labels_colors", all_labels_colors[labels(dendograma)]) %>% 
  set("labels", ifelse(labels(dendograma) == "8541", 
                       expression(bold("8541")), 
                       labels(dendograma)))

colores_para_bordes_rect <- c("#C69C6D", "#C69C6D", "#C69C6D", "#C69C6D", "#B21B40") 

plot( # Se grafica el dendograma con las configuraciones visuales
  x = dendograma_coloreado,
  family = 'Montserrat',
  main = 'Aglomeración por volumen de exportación totales',
  ylab = 'Distancia entre cluster',
  cex.axis = 0.8,
  cex.lab = 0.9,
  cex.main = 1,
  mar = c(12, 4, 4, 2) + 0.1,
  sub = 'Elaboración propia con datos de Data México'
  ) 
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in axis(side = side, at = at, labels = labels, ...): no font could be
## found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
## Warning in title(...): no font could be found for family "Montserrat"
rect.hclust(complete_fit, k = 5, border = colores_para_bordes_rect) # Se 

# grafican los clusters con las configuraciones visuales

Mapa (¿Qué flujos similares se pueden aprovechar para semiconductores?)

Bajo la misma lógica, se buscan identificar los destinos de las rutas que representan una oportunidad para los semiconductores. Para esto, se seleccionan los productos en el cluster del dendograma pasado y se observa el volúmen de exportaciones delcluster entero, para ver qué mercados convendría priorizar. Se eliminan E.E.U.U.y China para resaltar oportunidades externas a posibles tensiones con el T-MEC.

mapa_mundial_limpio <- mapa %>% # Se crea el mapa con datos de los productos 
# seleccionados en el clustering 
  clean_names() %>%
  select(iso_a3, geometry) %>% 
  mutate(iso_a3 = tolower(iso_a3))

robinson_proj_string <- "+proj=robin +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"

mapa_mundial_proyectado <- mapa_mundial_limpio %>%
  st_transform(crs = robinson_proj_string)

mapa_mundial_disuelto <- mapa_mundial_proyectado %>%
  mutate(geometry = st_make_valid(geometry)) %>%
  group_by(iso_a3) %>%
  summarise(geometry = st_union(geometry), .groups = 'drop') %>%
  ungroup()

mapa_final <- mapa_mundial_disuelto %>%
  left_join(df_2, by = c("iso_a3" = "country_id"))  %>% 
  mutate(
    trade_value = ifelse(is.na(trade_value), 0, trade_value),
    trade_value = case_when(
      iso_a3 == "mex" ~ NA,
      iso_a3 == "chn" ~ NA,
      iso_a3 == "usa" ~NA,
      TRUE ~ trade_value
    )
  )

showtext_auto()

ggplot(data = mapa_final) + # Se grafica el mapa con las configuraciones visuales
  geom_sf(aes(fill = log(trade_value)), color = "gray40") +
  scale_fill_gradientn(
    colors = c('antiquewhite', '#E6C79C', '#23675A'),
    na.value = "white",
    name = "Valor de Exportaciones \n(Escala logarítmica)"
  ) +
  labs( 
    caption = 'Elaboración propia con datos de Data México y geometría de Natural Earth Data'
  ) +
  theme_minimal(base_family = "Montserrat") +
  theme(
    plot.background = element_rect(fill = "antiquewhite", color = NA),
    panel.background = element_rect(fill = "antiquewhite", color = NA),
    plot.caption = element_text(face = "italic", hjust = 1),
    legend.text = element_text(face = "bold"),
    legend.title = element_text(face = "bold")
  ) + 
  guides(fill = guide_colorbar(
    frame.colour = "black",  
    frame.linewidth = 0.1      
  ))

Mapa México (¿Qué puntos de oportunidad hay en el país?)

Bajo la misma lógica, se buscan identificar los destinos de las rutas que representan una oportunidad para los semiconductores. Para esto, se seleccionan los productos en el cluster del dendograma pasado y se observa el volúmen de exportaciones del cluster entero, para ver qué mercados convendría priorizar. Se eliminan E.E.U.U.y China para resaltar oportunidades externas a posibles tensiones con el T-MEC.

total_estado <- df_3 %>% 
  group_by(state) %>% 
  summarise(total_trade_value_estado = sum(trade_value, na.rm = TRUE)) %>% 
  ungroup() 

map_mex <- mexico %>% # Se crea el mapa con datos de los productos seleccionados
# en el clustering
  left_join(total_estado, by = c('nomgeo' = 'state'))

setwd("~/Desktop/Semestre 4/Ciencia de Datos para la Toma de Decisiones/Reto/Reto_CienciaDatosI")
ggplot(data = map_mex) + # Se grafica el mapa con las configuraciones visuales
  geom_sf(aes(fill = log(total_trade_value_estado)), color = "gray40", lwd = 0.2) +
  scale_fill_gradientn(
    colors = c('antiquewhite', '#E6C79C', '#23675A'),
    na.value = "antiquewhite",
    name = "Valor de Exportaciones\n(Escala logarítmica)"
  ) +
  theme_minimal(base_family = "Montserrat") +
  theme(
    plot.background = element_rect(fill = "antiquewhite", color = NA),
    panel.background = element_rect(fill = "antiquewhite", color = NA),
    plot.caption = element_text(face = "italic", hjust = 1),
    axis.text = element_text(face = "bold"),
    legend.text = element_text(face = "bold"),
    legend.title = element_text(face = "bold")
  ) + 
  labs(
    caption = 'Elaboración propia con datos de Data México y geometría de INEGI'
  ) +
  guides(fill = guide_colorbar(
    frame.colour = "black",  
    frame.linewidth = 0.1      
  ))

Alluvial (¿Cómo se ven los flujos comerciales de la industria?)

Para identificar los flujos comerciales de la industria, notando las tendencias de el volúmen de exportaciones de los productos, se seleccionan los cinco productos más exportados. De aquí, se seleccionan los cinco estados con mayor volúmen de exportaciones de la sección HS85, así como los cinco países que más importan a México. Se dejan los demás elementos como “Otros”.

top_5_states_names <- df_4 %>% # Se toman los cinco estados con mayor volúmen de
# exportaciones
  group_by(state) %>%
  summarise(total_trade_value = sum(trade_value, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(total_trade_value)) %>%
  head(5) %>%
  pull(state)

top_5_hs4_ids <- df_4 %>% # Se toman los cinco productos con mayor volúmen de
# exportaciones
  group_by(hs4_id) %>%
  summarise(total_trade_value = sum(trade_value, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(total_trade_value)) %>%
  head(5) %>%
  pull(hs4_id)

top_5_countries <- df_4 %>% # # Se toman los cinco países con mayor volúmen de 
# importaciones a México
  group_by(country) %>%
  summarise(total_trade_value = sum(trade_value, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(total_trade_value)) %>%
  head(5) %>%
  pull(country)

df_clasificado_base <- df_4 %>% # Se juntan todas las categorías pasadas y se deja
# como "Otros" a todo registro no incluido en ellas
  mutate(
    state_category = ifelse(
      state %in% top_5_states_names,
      as.character(state),
      "Otras Entidades \nFederativas"
    ),
    hs4_category = ifelse(
      hs4_id %in% top_5_hs4_ids,
      as.character(hs4_id),
      "Otros Subsectores \nde HS85"
    ),
    country_category = ifelse(
      country %in% top_5_countries,
      as.character(country),
      "Otros Países"
    )
  )

state_order <- df_clasificado_base %>%
  group_by(state_category) %>%
  summarise(total_val = sum(trade_value, na.rm = TRUE)) %>%
  arrange(total_val) %>%
  pull(state_category)

hs4_order <- df_clasificado_base %>%
  group_by(hs4_category) %>%
  summarise(total_val = sum(trade_value, na.rm = TRUE)) %>%
  arrange(total_val) %>%
  pull(hs4_category)

country_order <- df_clasificado_base %>%
  group_by(country_category) %>%
  summarise(total_val = sum(trade_value, na.rm = TRUE)) %>%
  arrange(total_val) %>%
  pull(country_category)

df_alluvial_data <- df_clasificado_base %>%
  group_by(state_category, hs4_category, country_category) %>%
  summarise(trade_value_flow = sum(trade_value, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(
    state_category = factor(state_category, levels = state_order),
    hs4_category = factor(hs4_category, levels = hs4_order),
    country_category = factor(country_category, levels = country_order),
    trade_value_flow = (trade_value_flow / sum(trade_value_flow, na.rm = TRUE)) * 100
  ) %>% 
  group_by(state_category, hs4_category, country_category) 
## `summarise()` has grouped output by 'state_category', 'hs4_category'. You can
## override using the `.groups` argument.
my_all_strata_colors <- c(
  # Estados 
  "Ciudad de México" = "gray20",
  "Tamaulipas" = "gray30",
  "Jalisco" = "gray40",
  "Baja California" = "gray50",
  "Chihuahua" = "gray60",
  "Otras Entidades \nFederativas" = "gray70",
  
  # Códigos HS4 
  "8536" = "#E6C79C",
  "8544" = "#C69C6D",
  "8528" = "#B21B40",
  "8542" = "#5E1224",
  "8517" = "#23675A",
  "Otros Subsectores \nde HS85" = "#00332A",
  
  # Países 
  "Japón" = "gray20",
  "Corea del Sur" = "gray30",
  "Malasia" = "gray40",
  "China" = "gray50",
  "Otros Países" = "gray60",
  "Estados Unidos" = "gray70"
)

ggplot(data = df_alluvial_data,
       aes(axis1 = state_category,
           axis2 = hs4_category,
           axis3 = country_category,
           y = trade_value_flow)) +
  geom_alluvium(aes(fill = hs4_category),
                curve_type = "quintic",
                alpha = 0.70
                ) +
  geom_stratum(aes(fill = after_stat(stratum)), color = NA) +
  geom_text(stat = "stratum",
             aes(label = after_stat(stratum)), color = 'white',  
             fontface = 'bold',
             family = 'Montserrat',
             size = 2
             ) +
  theme_minimal() +
  scale_fill_manual(values = my_all_strata_colors) + 
  scale_x_discrete(limits = c("Entidades Federativas", "Subsector", "Países"),
                   expand = c(0.15, 0.05)) +
  labs(
    title = '\nFlujo Comercial por Entidad Federativa',
    y = 'Porcentaje del flujo comercial agregado del sector HS85',
    caption = 'Elaboración propia con datos de Data México'
  ) +
  theme_minimal(base_family = "Montserrat") +
  theme(
    plot.background = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA),
    plot.title = element_text(face = "bold", hjust = 0.5, 
                              family = 'Montserrat'),
    legend.position = 'none',
    axis.text = element_text(face = "bold"), 
    axis.title.y = element_text(face = "bold"),
    axis.title.x = element_blank(),
    plot.caption = element_text(face = "italic", hjust = 1)
  )

ggsave("graf_alluvial.jpg", width = 14, height = 8, dpi = 300)