##COURSE ACTIVITY 26-27 FEB #Librerías
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Cargando paquete requerido: lattice
## Warning: package 'lattice' was built under R version 4.4.3
##
## Adjuntando el paquete: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(readxl)
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Adjuntando el paquete: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Adjuntando el paquete: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
"file.choose()"
## [1] "file.choose()"
islr <- read_excel("C:\\Users\\anton\\Downloads\\dataset_islr.xlsx")
head(islr)
## # A tibble: 6 × 11
## price_competition household_income advertising_exp population price quality
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 111 48 16 260 83 Good
## 2 136 81 15 425 120 Good
## 3 132 110 0 108 124 Medium
## 4 121 78 9 150 100 Bad
## 5 147 74 13 251 131 Good
## 6 121 31 0 292 109 Medium
## # ℹ 5 more variables: pop_age <dbl>, education <dbl>, urban_zone <chr>,
## # national <chr>, sales <dbl>
str(islr)
## tibble [400 × 11] (S3: tbl_df/tbl/data.frame)
## $ price_competition: num [1:400] 111 136 132 121 147 121 107 121 130 117 ...
## $ household_income : num [1:400] 48 81 110 78 74 31 32 41 60 42 ...
## $ advertising_exp : num [1:400] 16 15 0 9 13 0 12 5 0 7 ...
## $ population : num [1:400] 260 425 108 150 251 292 236 412 144 144 ...
## $ price : num [1:400] 83 120 124 100 131 109 137 110 138 111 ...
## $ quality : chr [1:400] "Good" "Good" "Medium" "Bad" ...
## $ pop_age : num [1:400] 65 67 76 26 52 79 64 54 38 62 ...
## $ education : num [1:400] 10 10 10 10 10 10 10 10 10 10 ...
## $ urban_zone : chr [1:400] "Yes" "Yes" "No" "No" ...
## $ national : chr [1:400] "Yes" "Yes" "No" "Yes" ...
## $ sales : num [1:400] 11.22 11.85 6.54 9.01 12.29 ...
summary(islr)
## price_competition household_income advertising_exp population
## Min. : 77 Min. : 21.00 Min. : 0.000 Min. : 10.0
## 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000 1st Qu.:139.0
## Median :125 Median : 69.00 Median : 5.000 Median :272.0
## Mean :125 Mean : 68.66 Mean : 6.635 Mean :264.8
## 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000 3rd Qu.:398.5
## Max. :175 Max. :120.00 Max. :29.000 Max. :509.0
## price quality pop_age education
## Min. : 24.0 Length:400 Min. :25.00 Min. :10.0
## 1st Qu.:100.0 Class :character 1st Qu.:39.75 1st Qu.:12.0
## Median :117.0 Mode :character Median :54.50 Median :14.0
## Mean :115.8 Mean :53.32 Mean :13.9
## 3rd Qu.:131.0 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :191.0 Max. :80.00 Max. :18.0
## urban_zone national sales
## Length:400 Length:400 Min. : 0.000
## Class :character Class :character 1st Qu.: 5.390
## Mode :character Mode :character Median : 7.490
## Mean : 7.496
## 3rd Qu.: 9.320
## Max. :16.270
colSums(is.na(islr))
## price_competition household_income advertising_exp population
## 0 0 0 0
## price quality pop_age education
## 0 0 0 0
## urban_zone national sales
## 0 0 0
nzv <- nearZeroVar(islr)
if(length(nzv) > 0){
islr <- islr[, -nzv]
}
ggplot(islr, aes(x = sales)) +
geom_histogram(bins = 30, fill = "steelblue") +
theme_minimal()
ggplot(islr, aes(x = price, y = sales)) +
geom_point(color = "green") +
theme_minimal()
# Convertimos variables categóricas a factores
islr$quality <- as.factor(islr$quality)
islr$urban_zone <- as.factor(islr$urban_zone)
islr$national <- as.factor(islr$national)
islr <- islr %>%
mutate(
price_ratio = price / price_competition,
income_per_person = household_income / population,
advertising_per_person = advertising_exp / population,
high_quality = ifelse(quality == "Good", 1, 0)
)
numeric_vars <- islr %>%
select(where(is.numeric)) %>%
select(-sales)
preproc <- preProcess(numeric_vars, method = c("center", "scale"))
scaled_data <- predict(preproc, numeric_vars)
islr_scaled <- bind_cols(scaled_data, sales = islr$sales)
islr_scaled$log_advertising <- log(islr$advertising_exp + 1)
cor_matrix <- cor(islr_scaled %>% select(where(is.numeric)))
corrplot::corrplot(cor_matrix, method = "color")
# Modelo linear
lm_model <- lm(sales ~ ., data = islr_scaled)
summary(lm_model)
##
## Call:
## lm(formula = sales ~ ., data = islr_scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2686 -0.8623 0.0862 0.9260 4.0351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.69631 0.24102 31.932 < 2e-16 ***
## price_competition 1.13279 0.33407 3.391 0.000768 ***
## household_income 0.41588 0.06876 6.049 3.45e-09 ***
## advertising_exp 0.87535 0.18628 4.699 3.64e-06 ***
## population -0.02312 0.08580 -0.269 0.787757
## price -1.72991 0.55774 -3.102 0.002065 **
## pop_age -0.69897 0.06543 -10.682 < 2e-16 ***
## education -0.07405 0.06620 -1.119 0.264023
## price_ratio -0.41728 0.45259 -0.922 0.357119
## income_per_person -0.15005 0.09340 -1.606 0.108983
## advertising_per_person 0.15794 0.09021 1.751 0.080791 .
## high_quality 1.42009 0.06533 21.739 < 2e-16 ***
## log_advertising -0.13675 0.15874 -0.861 0.389533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.295 on 387 degrees of freedom
## Multiple R-squared: 0.7959, Adjusted R-squared: 0.7896
## F-statistic: 125.8 on 12 and 387 DF, p-value: < 2.2e-16
library(randomForest)
library(dplyr)
islr <- na.omit(islr)
islr <- islr %>%
mutate(across(where(is.character), as.factor))
set.seed(123)
n <- nrow(islr)
train_index <- sample(1:n, size = 0.8*n)
train <- islr[train_index, ]
test <- islr[-train_index, ]
set.seed(123)
rf_model <- randomForest(
sales ~ .,
data = train,
ntree = 500
)
print(rf_model)
##
## Call:
## randomForest(formula = sales ~ ., data = train, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 2.186221
## % Var explained: 71.97
rf_pred <- predict(rf_model, test)
RMSE(rf_pred, test$sales)
## [1] 1.361225
R2(rf_pred, test$sales)
## [1] 0.8339966
MAE(rf_pred, test$sales)
## [1] 1.084201
varImpPlot(rf_model)
set.seed(123)
control <- trainControl(
method = "cv",
number = 5
)
rf_cv <- train(
sales ~ .,
data = islr,
method = "rf",
trControl = control,
tuneLength = 5,
importance = TRUE
)
print(rf_cv)
## Random Forest
##
## 400 samples
## 14 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 320, 321, 319, 320, 320
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.621794 0.7588956 1.285373
## 5 1.384150 0.7853376 1.105722
## 8 1.350819 0.7864161 1.096519
## 11 1.332514 0.7882025 1.094154
## 15 1.335244 0.7857476 1.100387
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 11.
A partir de la metodología de creación de características y la estimación de un modelo de Random Forest, se encontró que la variable con mayor impacto en las ventas continúa siendo el price_ratio, es decir, el precio relativo frente a la competencia. El modelo confirma que precios más altos en comparación con los competidores reducen significativamente las ventas, especialmente cuando la calidad no es alta. En contraste, una estrategia de precios competitivos combinada con buena calidad y mayor inversión en publicidad incrementa considerablemente el nivel de ventas.
La validación cruzada mostró un R cuadrada aproximado de 0.79 y un RMSE cercano a 1.33, lo que indica que el modelo explica cerca del 79% de la variabilidad en ventas y presenta un error promedio bajo. En comparación con el árbol de decisión individual R cuadrada ≈ 0.39, el Random Forest mejora sustancialmente la capacidad predictiva al reducir la varianza y combinar múltiples árboles. En general, el análisis confirma que la estrategia de precios relativa, junto con la calidad y la publicidad, son factores determinantes clave en el desempeño de ventas.