Introducción:
Este análisis explora el dataset diamantes que contiene información
de 53,940 diamantes con 10 variables relacionadas con sus
características físicas y precio.
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
summary(diamonds)
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
data(diamonds)
diamonds %>%
skim()
Data summary
| Name |
Piped data |
| Number of rows |
53940 |
| Number of columns |
10 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
3 |
| numeric |
7 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| cut |
0 |
1 |
TRUE |
5 |
Ide: 21551, Pre: 13791, Ver: 12082, Goo: 4906 |
| color |
0 |
1 |
TRUE |
7 |
G: 11292, E: 9797, F: 9542, H: 8304 |
| clarity |
0 |
1 |
TRUE |
8 |
SI1: 13065, VS2: 12258, SI2: 9194, VS1: 8171 |
Variable type: numeric
| carat |
0 |
1 |
0.80 |
0.47 |
0.2 |
0.40 |
0.70 |
1.04 |
5.01 |
▇▂▁▁▁ |
| depth |
0 |
1 |
61.75 |
1.43 |
43.0 |
61.00 |
61.80 |
62.50 |
79.00 |
▁▁▇▁▁ |
| table |
0 |
1 |
57.46 |
2.23 |
43.0 |
56.00 |
57.00 |
59.00 |
95.00 |
▁▇▁▁▁ |
| price |
0 |
1 |
3932.80 |
3989.44 |
326.0 |
950.00 |
2401.00 |
5324.25 |
18823.00 |
▇▂▁▁▁ |
| x |
0 |
1 |
5.73 |
1.12 |
0.0 |
4.71 |
5.70 |
6.54 |
10.74 |
▁▁▇▃▁ |
| y |
0 |
1 |
5.73 |
1.14 |
0.0 |
4.72 |
5.71 |
6.54 |
58.90 |
▇▁▁▁▁ |
| z |
0 |
1 |
3.54 |
0.71 |
0.0 |
2.91 |
3.53 |
4.04 |
31.80 |
▇▁▁▁▁ |
Análisis de variables principales
estadisticas_precio <- diamonds %>%
summarise(
Observaciones = n(),
Precio_Promedio = mean(price) %>% round(2),
Precio_Mediano = median(price),
Desviación_Estándar = sd(price) %>% round(2),
Precio_Mínimo = min(price),
Precio_Máximo = max(price),
Rango_Intercuartil = IQR(price)
)
estadisticas_precio %>%
kable() %>%
kable_styling(bootstrap_options = "striped")
|
Observaciones
|
Precio_Promedio
|
Precio_Mediano
|
Desviación_Estándar
|
Precio_Mínimo
|
Precio_Máximo
|
Rango_Intercuartil
|
|
53940
|
3932.8
|
2401
|
3989.44
|
326
|
18823
|
4374.25
|
Distribución del precio
ggplot(diamonds, aes(x = price)) +
geom_histogram(bins = 50, fill = "darkblue", alpha = 0.7, color = "black") +
scale_x_continuous(labels = scales::comma) +
labs(title = "Distribución del Precio de Diamantes",
subtitle = "Se observa distribución asimétrica a la derecha",
x = "Precio (USD)",
y = "Frecuencia") +
theme_minimal()

Análisis de calidad del corte
precio_por_corte <- diamonds %>%
group_by(cut) %>%
summarise(
Cantidad = n(),
Porcentaje = round(n() / nrow(diamonds) * 100, 1),
Precio_Promedio = round(mean(price), 2),
Precio_Mediano = median(price)
) %>%
arrange(desc(Precio_Promedio))
precio_por_corte %>%
kable() %>%
kable_styling() %>%
row_spec(0, bold = TRUE)
|
cut
|
Cantidad
|
Porcentaje
|
Precio_Promedio
|
Precio_Mediano
|
|
Premium
|
13791
|
25.6
|
4584.26
|
3185.0
|
|
Fair
|
1610
|
3.0
|
4358.76
|
3282.0
|
|
Very Good
|
12082
|
22.4
|
3981.76
|
2648.0
|
|
Good
|
4906
|
9.1
|
3928.86
|
3050.5
|
|
Ideal
|
21551
|
40.0
|
3457.54
|
1810.0
|
Precio vs calidad del corte
ggplot(diamonds, aes(x = cut, y = price, fill = cut)) +
geom_boxplot(alpha = 0.7) +
scale_y_log10(labels = scales::comma) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribución del Precio por Calidad del Corte",
subtitle = "Escala logarítmica en el eje Y",
x = "Calidad del Corte",
y = "Precio (USD - escala log)") +
theme_minimal() +
theme(legend.position = "none")

Relación entre quilates y precio
diamonds_quilates <- diamonds %>%
mutate(rango_quilates = cut(carat,
breaks = c(0, 0.5, 1, 1.5, 2, 3, 5),
labels = c("0-0.5", "0.5-1", "1-1.5", "1.5-2", "2-3", "3-5"))) %>%
group_by(rango_quilates) %>%
summarise(
Cantidad = n(),
Precio_Promedio = round(mean(price), 2),
Quilates_Promedio = round(mean(carat), 2)
)
diamonds_quilates %>%
kable() %>%
kable_styling() %>%
row_spec(0, bold = TRUE)
|
rango_quilates
|
Cantidad
|
Precio_Promedio
|
Quilates_Promedio
|
|
0-0.5
|
18932
|
839.72
|
0.35
|
|
0.5-1
|
17506
|
2811.34
|
0.72
|
|
1-1.5
|
12060
|
6513.53
|
1.15
|
|
1.5-2
|
3553
|
11321.77
|
1.62
|
|
2-3
|
1857
|
14946.22
|
2.13
|
|
3-5
|
31
|
15153.48
|
3.30
|
|
NA
|
1
|
18018.00
|
5.01
|
Relación Quilates - precio
set.seed(123) # Para reproducibilidad del sample
diamonds_sample <- diamonds %>% sample_n(2000) # Muestra para mejor visualización
ggplot(diamonds_sample, aes(x = carat, y = price, color = cut)) +
geom_point(alpha = 0.6, size = 1.5) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
scale_y_continuous(labels = scales::comma) +
scale_color_brewer(palette = "Set1") +
labs(title = "Relación entre Quilates y Precio",
subtitle = "Muestra aleatoria de 2,000 diamantes",
x = "Quilates",
y = "Precio (USD)",
color = "Calidad del Corte") +
theme_minimal()

Análisis multivariado
precio_color_claridad <- diamonds %>%
group_by(color, clarity) %>%
summarise(Precio_Promedio = mean(price))
ggplot(precio_color_claridad, aes(x = color, y = clarity, fill = Precio_Promedio)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "darkred",
labels = scales::comma,
name = "Precio Promedio") +
labs(title = "Heatmap: Precio Promedio por Color y Claridad",
x = "Color (D = mejor, J = peor)",
y = "Claridad (IF = mejor, I1 = peor)") +
theme_minimal()

Relación entre produnidad y tabla del diamante
ggplot(diamonds, aes(x = depth, y = table)) +
geom_bin2d(bins = 50) +
scale_fill_viridis_c(option = "plasma") +
labs(title = "Relación entre Profundidad y Tabla del Diamante",
x = "Profundidad (%)",
y = "Tabla (%)",
fill = "Cantidad") +
theme_minimal()

Diamantes más caros
top10_caros <- diamonds %>%
arrange(desc(price)) %>%
head(10) %>%
select(carat, cut, color, clarity, depth, table, price)
top10_caros %>%
kable() %>%
kable_styling() %>%
row_spec(0, bold = TRUE)
|
carat
|
cut
|
color
|
clarity
|
depth
|
table
|
price
|
|
2.29
|
Premium
|
I
|
VS2
|
60.8
|
60
|
18823
|
|
2.00
|
Very Good
|
G
|
SI1
|
63.5
|
56
|
18818
|
|
1.51
|
Ideal
|
G
|
IF
|
61.7
|
55
|
18806
|
|
2.07
|
Ideal
|
G
|
SI2
|
62.5
|
55
|
18804
|
|
2.00
|
Very Good
|
H
|
SI1
|
62.8
|
57
|
18803
|
|
2.29
|
Premium
|
I
|
SI1
|
61.8
|
59
|
18797
|
|
2.04
|
Premium
|
H
|
SI1
|
58.1
|
60
|
18795
|
|
2.00
|
Premium
|
I
|
VS1
|
60.8
|
59
|
18795
|
|
1.71
|
Premium
|
F
|
VS2
|
62.3
|
59
|
18791
|
|
2.15
|
Ideal
|
G
|
SI2
|
62.6
|
54
|
18791
|