Se eligió el dataset de automóviles porque contiene variables técnicas y de rendimiento que influyen directamente en el precio. Es ideal para modelar una red neuronal por la combinación de variables numéricas y categóricas.
if (!require("neuralnet")) install.packages("neuralnet")
## Cargando paquete requerido: neuralnet
## Warning: package 'neuralnet' was built under R version 4.4.3
if (!require("corrplot")) install.packages("corrplot")
## Cargando paquete requerido: corrplot
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
if (!require("ggplot2")) install.packages("ggplot2")
## Cargando paquete requerido: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
if (!require("dplyr")) install.packages("dplyr")
## Cargando paquete requerido: dplyr
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Adjuntando el paquete: 'dplyr'
## The following object is masked from 'package:neuralnet':
##
## compute
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(neuralnet)
library(corrplot)
library(ggplot2)
library(dplyr)
# Leer el dataset
df <- read.csv("Automobile_data.csv", na.strings = c("?", "NA"))
# Estructura y primeras filas
dplyr::glimpse(df)
## Rows: 205
## Columns: 26
## $ symboling <int> 3, 3, 1, 2, 2, 2, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0…
## $ normalized.losses <int> NA, NA, NA, 164, 164, NA, 158, NA, 158, NA, 192, 192…
## $ make <chr> "alfa-romero", "alfa-romero", "alfa-romero", "audi",…
## $ fuel.type <chr> "gas", "gas", "gas", "gas", "gas", "gas", "gas", "ga…
## $ aspiration <chr> "std", "std", "std", "std", "std", "std", "std", "st…
## $ num.of.doors <chr> "two", "two", "two", "four", "four", "two", "four", …
## $ body.style <chr> "convertible", "convertible", "hatchback", "sedan", …
## $ drive.wheels <chr> "rwd", "rwd", "rwd", "fwd", "4wd", "fwd", "fwd", "fw…
## $ engine.location <chr> "front", "front", "front", "front", "front", "front"…
## $ wheel.base <dbl> 88.6, 88.6, 94.5, 99.8, 99.4, 99.8, 105.8, 105.8, 10…
## $ length <dbl> 168.8, 168.8, 171.2, 176.6, 176.6, 177.3, 192.7, 192…
## $ width <dbl> 64.1, 64.1, 65.5, 66.2, 66.4, 66.3, 71.4, 71.4, 71.4…
## $ height <dbl> 48.8, 48.8, 52.4, 54.3, 54.3, 53.1, 55.7, 55.7, 55.9…
## $ curb.weight <int> 2548, 2548, 2823, 2337, 2824, 2507, 2844, 2954, 3086…
## $ engine.type <chr> "dohc", "dohc", "ohcv", "ohc", "ohc", "ohc", "ohc", …
## $ num.of.cylinders <chr> "four", "four", "six", "four", "five", "five", "five…
## $ engine.size <int> 130, 130, 152, 109, 136, 136, 136, 136, 131, 131, 10…
## $ fuel.system <chr> "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpfi", "mpf…
## $ bore <dbl> 3.47, 3.47, 2.68, 3.19, 3.19, 3.19, 3.19, 3.19, 3.13…
## $ stroke <dbl> 2.68, 2.68, 3.47, 3.40, 3.40, 3.40, 3.40, 3.40, 3.40…
## $ compression.ratio <dbl> 9.00, 9.00, 9.00, 10.00, 8.00, 8.50, 8.50, 8.50, 8.3…
## $ horsepower <int> 111, 111, 154, 102, 115, 110, 110, 110, 140, 160, 10…
## $ peak.rpm <int> 5000, 5000, 5000, 5500, 5500, 5500, 5500, 5500, 5500…
## $ city.mpg <int> 21, 21, 19, 24, 18, 19, 19, 19, 17, 16, 23, 23, 21, …
## $ highway.mpg <int> 27, 27, 26, 30, 22, 25, 25, 25, 20, 22, 29, 29, 28, …
## $ price <int> 13495, 16500, 16500, 13950, 17450, 15250, 17710, 189…
head(df)
## symboling normalized.losses make fuel.type aspiration num.of.doors
## 1 3 NA alfa-romero gas std two
## 2 3 NA alfa-romero gas std two
## 3 1 NA alfa-romero gas std two
## 4 2 164 audi gas std four
## 5 2 164 audi gas std four
## 6 2 NA audi gas std two
## body.style drive.wheels engine.location wheel.base length width height
## 1 convertible rwd front 88.6 168.8 64.1 48.8
## 2 convertible rwd front 88.6 168.8 64.1 48.8
## 3 hatchback rwd front 94.5 171.2 65.5 52.4
## 4 sedan fwd front 99.8 176.6 66.2 54.3
## 5 sedan 4wd front 99.4 176.6 66.4 54.3
## 6 sedan fwd front 99.8 177.3 66.3 53.1
## curb.weight engine.type num.of.cylinders engine.size fuel.system bore stroke
## 1 2548 dohc four 130 mpfi 3.47 2.68
## 2 2548 dohc four 130 mpfi 3.47 2.68
## 3 2823 ohcv six 152 mpfi 2.68 3.47
## 4 2337 ohc four 109 mpfi 3.19 3.40
## 5 2824 ohc five 136 mpfi 3.19 3.40
## 6 2507 ohc five 136 mpfi 3.19 3.40
## compression.ratio horsepower peak.rpm city.mpg highway.mpg price
## 1 9.0 111 5000 21 27 13495
## 2 9.0 111 5000 21 27 16500
## 3 9.0 154 5000 19 26 16500
## 4 10.0 102 5500 24 30 13950
## 5 8.0 115 5500 18 22 17450
## 6 8.5 110 5500 19 25 15250
Las variables incluyen: - horsepower: potencia del motor - engine.size: tamaño del motor - curb.weight: peso del vehículo - city.mpg: rendimiento en ciudad - highway.mpg: rendimiento en carretera - price: variable objetivo
Variables categóricas incluyen: - make, fuel.type, aspiration, num.of.doors, body.style, etc.
df_clean <- df[!is.na(df$price), ]
numeric_cols <- sapply(df_clean, is.numeric)
df_clean[, numeric_cols] <- lapply(df_clean[, numeric_cols], function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE)
return(x)
})
categorical_cols <- c("make", "fuel.type", "aspiration", "num.of.doors",
"body.style", "drive.wheels", "engine.location",
"engine.type", "num.of.cylinders", "fuel.system")
df_clean[categorical_cols] <- lapply(df_clean[categorical_cols], as.factor)
df_clean$power.to.weight <- df_clean$horsepower / df_clean$`curb.weight`
selected_vars <- c("horsepower", "engine.size", "curb.weight",
"power.to.weight", "city.mpg", "highway.mpg",
"wheel.base", "price")
df_model <- df_clean[, selected_vars]
cor_matrix <- cor(df_model[, -which(names(df_model) == "price")])
corrplot(cor_matrix, method = "circle", type = "upper")
normalize <- function(x) {(x - min(x)) / (max(x) - min(x))}
df_normalized <- as.data.frame(lapply(df_model, normalize))
set.seed(123)
train_indices <- sample(1:nrow(df_normalized), 0.7 * nrow(df_normalized))
train_data <- df_normalized[train_indices, ]
test_data <- df_normalized[-train_indices, ]
Se eligió una arquitectura con 2 capas ocultas usando la función logística. Esto permite modelar relaciones no lineales sin sobreajustar.
formula <- price ~ horsepower + engine.size + curb.weight + power.to.weight
model <- neuralnet(
formula,
data = train_data,
hidden = c(5, 3),
linear.output = FALSE,
act.fct = "logistic",
lifesign = "full",
threshold = 0.01,
stepmax = 1e5
)
## hidden: 5, 3 thresh: 0.01 rep: 1/1 steps: 116 error: 0.35172 time: 0.05 secs
plot(model, rep = "best")
predictions <- predict(model, test_data)
mse <- mean((test_data$price - predictions)^2)
cat("MSE en prueba:", mse, "\n")
## MSE en prueba: 0.01248459
plot_data <- data.frame(
Real = test_data$price,
Predicted = predictions
)
ggplot(plot_data, aes(x = Real, y = Predicted)) +
geom_point(color = "blue") +
geom_abline(intercept = 0, slope = 1, color = "red") +
labs(title = "Predicción vs Real", x = "Precio Real (normalizado)", y = "Precio Predicho") +
theme_minimal()
neuralnet