Introducción

Sin funciones de activación, las redes neuronales serían solo regresiones lineales disfrazadas. Son estas funciones las que permiten que los modelos aprendan patrones complejos y resuelvan problemas del mundo real.

# Cargar librerías necesarias
library(ggplot2)
library(gridExtra)
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
library(kableExtra)

## 
## Adjuntando el paquete: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Definir las funciones de activación y sus derivadas

# 1. Función Sigmoide
# Logistic sigmoid, applied element-wise.
#
# x: numeric vector (or array) of inputs.
# Returns 1 / (1 + exp(-x)), a value in [0, 1] for every element of x.
sigmoid <- function(x) {
  denominator <- 1 + exp(-x)
  1 / denominator
}

# First derivative of the sigmoid, expressed through the sigmoid itself:
# f'(x) = f(x) * (1 - f(x)).
#
# x: numeric vector (or array) of inputs.
# Returns the derivative evaluated element-wise at x.
sigmoid_derivative <- function(x) {
  activation <- sigmoid(x)
  complement <- 1 - activation
  activation * complement
}

# 2. Función Tanh
# Hyperbolic tangent activation; a thin wrapper around base R's tanh().
#
# x: numeric vector (or array) of inputs.
# Returns tanh(x) element-wise.
tanh_func <- function(x) {
  activation <- tanh(x)
  activation
}

# First derivative of tanh: f'(x) = 1 - tanh(x)^2.
#
# x: numeric vector (or array) of inputs.
# Returns the derivative evaluated element-wise at x.
tanh_derivative <- function(x) {
  activation <- tanh(x)
  1 - activation^2
}

# 3. Función ReLU
# Rectified linear unit: max(x, 0), applied element-wise.
#
# x: numeric vector, matrix or array of inputs.
# Returns an object with the same shape and names as x, with negative
# entries replaced by 0.
relu <- function(x) {
  # pmax() copies attributes (dim, names) from its FIRST argument, so x must
  # come first; pmax(0, x) would flatten a matrix into a plain vector.
  pmax(x, 0)
}

# Derivative of ReLU, applied element-wise: 1 where x > 0, otherwise 0.
#
# The mathematical derivative is undefined at x = 0; this implementation
# returns 0 there because the test is a strict x > 0.
#
# x: numeric vector (or array) of inputs.
relu_derivative <- function(x) {
  is_active <- x > 0
  ifelse(is_active, yes = 1, no = 0)
}

# 4. Función Leaky ReLU
# Leaky ReLU: x for positive inputs, alpha * x otherwise (element-wise).
#
# x:     numeric vector (or array) of inputs.
# alpha: slope applied to non-positive inputs (default 0.01).
leaky_relu <- function(x, alpha = 0.01) {
  is_positive <- x > 0
  ifelse(is_positive, yes = x, no = alpha * x)
}

# Derivative of Leaky ReLU, applied element-wise: 1 where x > 0, otherwise alpha.
#
# The mathematical derivative is undefined at x = 0; this implementation
# returns alpha there because the test is a strict x > 0.
#
# x:     numeric vector (or array) of inputs.
# alpha: slope of the non-positive branch (default 0.01).
leaky_relu_derivative <- function(x, alpha = 0.01) {
  is_positive <- x > 0
  ifelse(is_positive, yes = 1, no = alpha)
}

# 5. Función Swish
# Swish activation: x * sigmoid(beta * x), applied element-wise.
#
# x:    numeric vector (or array) of inputs.
# beta: scale applied to x inside the sigmoid gate (default 1).
swish <- function(x, beta = 1) {
  gate <- sigmoid(beta * x)
  x * gate
}

# First derivative of Swish:
# f'(x) = s + x * s * (1 - s) * beta, where s = sigmoid(beta * x).
#
# x:    numeric vector (or array) of inputs.
# beta: scale applied to x inside the sigmoid gate (default 1).
swish_derivative <- function(x, beta = 1) {
  gate <- sigmoid(beta * x)
  # Contribution of the gate's own slope (product rule, second term).
  slope_term <- x * gate * (1 - gate) * beta
  gate + slope_term
}

1. Función Sigmoide

La función sigmoide es una de las funciones de activación más clásicas, mapeando cualquier valor real a un rango entre 0 y 1.

Primera Derivada

\[f'(x) = f(x) \cdot (1 - f(x)) = \frac{e^{-x}}{(1 + e^{-x})^2}\]

# Input grid from -6 to 6 in steps of 0.1; later chunks reuse `x` for their plots
x <- seq(-6, 6, 0.1)

# Plot the sigmoid function; dashed guides mark y = 0 and y = 1
p1 <- ggplot(data.frame(x = x, y = sigmoid(x)), aes(x, y)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "blue", linewidth = 1.2) +
  labs(title = "Función Sigmoide",
       x = "x", y = "f(x)") +
  theme_minimal() +
  geom_hline(yintercept = c(0, 1), linetype = "dashed", alpha = 0.5) +
  ylim(-0.1, 1.1)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Plot the derivative of the sigmoid
p2 <- ggplot(data.frame(x = x, y = sigmoid_derivative(x)), aes(x, y)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "red", linewidth = 1.2) +
  labs(title = "Derivada de Sigmoide",
       x = "x", y = "f'(x)") +
  theme_minimal() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5)

# Show the function and its derivative side by side
grid.arrange(p1, p2, ncol = 2)

Características:

Rango: (0, 1)
Diferenciable: Sí, en todo su dominio
Monotónica: Sí, estrictamente creciente
Problemas: Vanishing gradient para valores extremos

2. Función Tanh (Tangente Hiperbólica)

La función tanh es similar a la sigmoide pero centrada en cero, con rango entre -1 y 1.

Primera Derivada

\[f'(x) = 1 - \tanh^2(x) = 1 - f(x)^2\]

# Plot the tanh function; dashed guides mark y = -1, 0 and 1
p3 <- ggplot(data.frame(x = x, y = tanh_func(x)), aes(x, y)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "green", linewidth = 1.2) +
  labs(title = "Función Tanh",
       x = "x", y = "f(x)") +
  theme_minimal() +
  geom_hline(yintercept = c(-1, 0, 1), linetype = "dashed", alpha = 0.5) +
  ylim(-1.1, 1.1)

# Plot the derivative of tanh
p4 <- ggplot(data.frame(x = x, y = tanh_derivative(x)), aes(x, y)) +
  geom_line(color = "darkgreen", linewidth = 1.2) +
  labs(title = "Derivada de Tanh",
       x = "x", y = "f'(x)") +
  theme_minimal() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5)

# Show the function and its derivative side by side
grid.arrange(p3, p4, ncol = 2)

Características:

Rango: (-1, 1)
Diferenciable: Sí, en todo su dominio
Monotónica: Sí, estrictamente creciente
Ventaja: Salida centrada en cero, lo que suele facilitar la optimización en comparación con la sigmoide

3. Función ReLU

ReLU es la función de activación más popular en deep learning por su simplicidad y efectividad.

Primera Derivada

\[f'(x) = \begin{cases} 0 & \text{si } x < 0 \\ 1 & \text{si } x > 0 \\ \text{indefinida} & \text{en } x = 0 \end{cases}\]

# Plot the ReLU function; dashed guides mark both axes
p5 <- ggplot(data.frame(x = x, y = relu(x)), aes(x, y)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "purple", linewidth = 1.2) +
  labs(title = "Función ReLU",
       x = "x", y = "f(x)") +
  theme_minimal() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5)

# Plot the derivative of ReLU; guides at y = 0 and y = 1 mark its two values
p6 <- ggplot(data.frame(x = x, y = relu_derivative(x)), aes(x, y)) +
  geom_line(color = "darkmagenta", linewidth = 1.2) +
  labs(title = "Derivada de ReLU",
       x = "x", y = "f'(x)") +
  theme_minimal() +
  geom_hline(yintercept = c(0, 1), linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  ylim(-0.1, 1.1)

# Show the function and its derivative side by side
grid.arrange(p5, p6, ncol = 2)

Características:

Rango: [0, +∞)
No diferenciable: En x = 0
No saturante: Para valores positivos
Problema: “Dying ReLU”: el gradiente es cero para entradas negativas, por lo que una neurona que solo recibe valores negativos deja de aprender

4. Función Leaky ReLU

Una variación de ReLU que permite un pequeño gradiente para valores negativos.

Primera Derivada

\[f'(x) = \begin{cases} \alpha & \text{si } x < 0 \\ 1 & \text{si } x > 0 \\ \text{indefinida} & \text{en } x = 0 \end{cases}\]

# Plot the Leaky ReLU function with its default alpha; dashed guides mark both axes
p7 <- ggplot(data.frame(x = x, y = leaky_relu(x)), aes(x, y)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "orange", linewidth = 1.2) +
  labs(title = "Función Leaky ReLU (α = 0.01)",
       x = "x", y = "f(x)") +
  theme_minimal() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5)

# Plot the derivative of Leaky ReLU; guides at y = 0, 0.01 and 1
p8 <- ggplot(data.frame(x = x, y = leaky_relu_derivative(x)), aes(x, y)) +
  geom_line(color = "darkorange", linewidth = 1.2) +
  labs(title = "Derivada de Leaky ReLU",
       x = "x", y = "f'(x)") +
  theme_minimal() +
  geom_hline(yintercept = c(0, 0.01, 1), linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  ylim(-0.1, 1.1)

# Show the function and its derivative side by side
grid.arrange(p7, p8, ncol = 2)

Características:

Rango: (-∞, +∞)
Ventaja: Evita el problema de “dying ReLU”
Parámetro: α (usualmente 0.01)

5. Función Swish

Función de activación propuesta por Google, que combina las ventajas de ReLU y sigmoide.

Primera Derivada

\[f'(x) = \sigma(\beta x) + x \cdot \sigma(\beta x) \cdot (1 - \sigma(\beta x)) \cdot \beta\]

donde \(\sigma(x) = \frac{1}{1 + e^{-x}}\).

# Plot the Swish function with its default beta; dashed guides mark both axes
p9 <- ggplot(data.frame(x = x, y = swish(x)), aes(x, y)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "brown", linewidth = 1.2) +
  labs(title = "Función Swish (β = 1)",
       x = "x", y = "f(x)") +
  theme_minimal() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5)

# Plot the derivative of Swish
p10 <- ggplot(data.frame(x = x, y = swish_derivative(x)), aes(x, y)) +
  geom_line(color = "darkred", linewidth = 1.2) +
  labs(title = "Derivada de Swish",
       x = "x", y = "f'(x)") +
  theme_minimal() +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5)

# Show the function and its derivative side by side
grid.arrange(p9, p10, ncol = 2)

Características:

Rango: aproximadamente [-0.278, +∞) para β = 1 (acotada inferiormente, no acotada superiormente)
Suave: Diferenciable en todo punto
Performance: Mejor que ReLU en muchos casos

Comparación Visual de Todas las Funciones

# Activation-function labels, in the order the curves are stacked below
function_names <- c("Sigmoide", "Tanh", "ReLU", "Leaky ReLU", "Swish")

# Named palette so every function keeps the colour of its individual plot.
# An unnamed vector is matched to the alphabetically sorted levels
# (Leaky ReLU, ReLU, Sigmoide, Swish, Tanh), which scrambles the mapping.
function_colors <- c(
  "Sigmoide" = "blue",
  "Tanh" = "green",
  "ReLU" = "purple",
  "Leaky ReLU" = "orange",
  "Swish" = "brown"
)

# Long-format data frame with every activation function evaluated on x
df_functions <- data.frame(
  x = rep(x, length(function_names)),
  y = c(sigmoid(x), tanh_func(x), relu(x), leaky_relu(x), swish(x)),
  Function = rep(function_names, each = length(x))
)

# Comparison plot of the functions
p_comp1 <- ggplot(df_functions, aes(x, y, color = Function)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(linewidth = 1.2) +
  labs(title = "Comparación de Funciones de Activación",
       x = "x", y = "f(x)") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  # `breaks` lists the legend entries in document order instead of alphabetically
  scale_color_manual(values = function_colors, breaks = function_names)

# Long-format data frame with every derivative evaluated on x
df_derivatives <- data.frame(
  x = rep(x, length(function_names)),
  y = c(sigmoid_derivative(x), tanh_derivative(x), relu_derivative(x),
        leaky_relu_derivative(x), swish_derivative(x)),
  Function = rep(function_names, each = length(x))
)

# Comparison plot of the derivatives
p_comp2 <- ggplot(df_derivatives, aes(x, y, color = Function)) +
  geom_line(linewidth = 1.2) +
  labs(title = "Comparación de Derivadas",
       x = "x", y = "f'(x)") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_color_manual(values = function_colors, breaks = function_names)

# Stack the two comparison plots vertically
grid.arrange(p_comp1, p_comp2, nrow = 2)

Tabla Comparativa

# Build the comparison table: one row per activation function
comparison_data <- data.frame(
  Función = c("Sigmoide", "Tanh", "ReLU", "Leaky ReLU", "Swish"),
  # Swish is bounded below: for beta = 1 its minimum is about -0.278
  # (near x = -1.28), so its range is not the whole real line.
  Rango = c("(0, 1)", "(-1, 1)", "[0, +∞)", "(-∞, +∞)", "[≈ -0.278, +∞)"),
  Pros = c("Interpretación probabilística", "Centrada en cero",
           "Simple, eficiente", "Evita dying neurons",
           "Rendimiento superior"),
  Contras = c("Vanishing gradient", "Vanishing gradient", "Dying ReLU",
              "Parámetro adicional", "Más costosa computacionalmente"),
  Uso_Recomendado = c("Clasificación binaria (salida)",
                      "RNNs, capas ocultas clásicas",
                      "Opción por defecto, CNNs", "Cuando ReLU falla",
                      "Modelos de alta performance")
)

# Render the table: styled rows, bold first column, white-on-blue header row
kable(comparison_data,
      col.names = c("Función", "Rango", "Pros", "Contras", "Uso Recomendado"),
      caption = "Comparación de Funciones de Activación") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#3498db")

Comparación de Funciones de Activación
Función	Rango	Pros	Contras	Uso Recomendado
Sigmoide	(0, 1)	Interpretación probabilística	Vanishing gradient	Clasificación binaria (salida)
Tanh	(-1, 1)	Centrada en cero	Vanishing gradient	RNNs, capas ocultas clásicas
ReLU	[0, +∞)	Simple, eficiente	Dying ReLU	Opción por defecto, CNNs
Leaky ReLU	(-∞, +∞)	Evita dying neurons	Parámetro adicional	Cuando ReLU falla
Swish	[≈ -0.278, +∞)	Rendimiento superior	Más costosa computacionalmente	Modelos de alta performance

Funciones de activación

2025-08-04

Introducción

1. Función Sigmoide

Primera Derivada

2. Función Tanh (Tangente Hiperbólica)

Primera Derivada

3. Función ReLU

Primera Derivada

4. Función Leaky ReLU

Primera Derivada

5. Función Swish

Primera Derivada

Comparación Visual de Todas las Funciones

Tabla Comparativa

Referencias