library(ggplot2)
library(plotly)
##
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the songs dataset from "data.csv"; character columns are read as factors.
# Uses `<-` for assignment and `TRUE` instead of the reassignable shorthand `T`.
music <- read.csv("data.csv", stringsAsFactors = TRUE)
# Report the number of rows and columns.
dim(music)
## [1] 170653 19
# Preview the first rows of the data.
head(music)
## valence year acousticness
## 1 0.0594 1921 0.982
## 2 0.9630 1921 0.732
## 3 0.0394 1921 0.961
## 4 0.1650 1921 0.967
## 5 0.2530 1921 0.957
## 6 0.1960 1921 0.579
## artists
## 1 ['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']
## 2 ['Dennis Day']
## 3 ['KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat']
## 4 ['Frank Parker']
## 5 ['Phil Regan']
## 6 ['KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat']
## danceability duration_ms energy explicit id
## 1 0.279 831667 0.211 0 4BJqT0PrAfrxzMOxytFOIz
## 2 0.819 180533 0.341 0 7xPhfUan2yNtyFG0cUWkt8
## 3 0.328 500062 0.166 0 1o6I8BglA6ylDMrIELygv1
## 4 0.275 210000 0.309 0 3ftBPsC5vPBKxYSee08FDH
## 5 0.418 166693 0.193 0 4d6HGyGT8e121BsdKmw9v6
## 6 0.697 395076 0.346 0 4pyw9DVHGStUre4J6hPngr
## instrumentalness key liveness loudness mode
## 1 8.78e-01 10 0.665 -20.096 1
## 2 0.00e+00 7 0.160 -12.441 1
## 3 9.13e-01 3 0.101 -14.850 1
## 4 2.77e-05 5 0.381 -9.316 1
## 5 1.68e-06 3 0.229 -10.096 1
## 6 1.68e-01 2 0.130 -12.506 1
## name popularity
## 1 Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve 4
## 2 Clancy Lowered the Boom 5
## 3 Gati Bali 5
## 4 Danny Boy 3
## 5 When Irish Eyes Are Smiling 2
## 6 Gati Mardika 6
## release_date speechiness tempo
## 1 1921 0.0366 80.954
## 2 1921 0.4150 60.936
## 3 1921 0.0339 110.339
## 4 1921 0.0354 100.109
## 5 1921 0.0380 101.665
## 6 1921 0.0700 119.824
Vamos a eliminar las columnas id y release_date: el id de una canción no nos proporciona información valiosa para ningún tipo de análisis, y release_date está incompleta, por lo que la variable year, que nos indica el año de la canción, es más que suficiente.
# Drop the 'release_date' and 'id' columns: keep every column whose name
# does not match either of them.
music <- music[, is.na(match(colnames(music), c("release_date", "id")))]
# Collect the rows that repeat an earlier row exactly.
duplicados <- music[duplicated(music), ]
# Count the duplicated rows.
n_duplicados <- dim(duplicados)[1]
print(sprintf("Número de filas duplicadas: %d", n_duplicados))
## [1] "Número de filas duplicadas: 565"
# Remove duplicated rows, keeping the first occurrence of each.
music <- unique(music)
# Rescale duration_ms in place by dividing by 1000.
# NOTE: the column keeps its `_ms` name, but from here on it holds seconds.
music$duration_ms <- music$duration_ms / 1000
Correlaciones entre diferentes variables de los datos:
# Keep only the numeric columns for the correlation analysis.
# `select_if()` is superseded in dplyr; `select(where(...))` is the current form.
music_numeric <- music %>% select(where(is.numeric))
# Pearson correlation matrix, computed on complete observations only.
cor_matrix <- cor(music_numeric, use = "complete.obs", method = "pearson")
# Reshape the correlation matrix to long format (Var1, Var2, value).
cor_data <- melt(cor_matrix)
# Build the heatmap with ggplot2: one tile per variable pair, labelled with
# the correlation rounded to two decimals.
heatmap_plot <- ggplot(cor_data, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
  scale_fill_gradient2(
    low = "white", mid = "lightgreen", high = "darkgreen",
    midpoint = 0, limits = c(-1, 1), name = "Correlación"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  ) +
  labs(
    title = "Matriz de Correlación",
    x = "Variables",
    y = "Variables"
  )
# Convert the static plot into an interactive plotly widget.
heatmap_interactive <- ggplotly(heatmap_plot)
# Display the interactive plot.
heatmap_interactive
Evolución de diferentes variables a lo largo del tiempo
# 1. Evolution of acousticness
# Average acousticness per year (missing values ignored).
music_yearly <- music %>%
  group_by(year) %>%
  summarise(acousticness_avg = mean(acousticness, na.rm = TRUE))
# Line + points chart of the yearly average.
# `linewidth` replaces the `size` argument for lines, which is deprecated
# as of ggplot2 3.4.0 (both in geom_line() and element_line()).
yearly_plot <- ggplot(music_yearly, aes(x = year, y = acousticness_avg)) +
  geom_line(color = "green", linewidth = 1) +
  geom_point(color = "darkgreen", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Acousticness a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Acousticness"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_line(color = "gray", linewidth = 0.2)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Convert the static ggplot into an interactive plotly widget
interactive_yearly_plot <- ggplotly(yearly_plot)
interactive_yearly_plot
# 2. Evolution of duration
# Average of the duration_ms column per year (missing values ignored).
# The column was divided by 1000 earlier in the script, so despite its name
# the averages are in seconds; the y-axis label states the unit explicitly.
music_duration <- music %>%
  group_by(year) %>%
  summarise(duration_avg = mean(duration_ms, na.rm = TRUE))
# `linewidth` replaces the `size` argument for lines (deprecated in ggplot2 3.4.0).
duration_plot <- ggplot(music_duration, aes(x = year, y = duration_avg)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "darkblue", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Duration_ms a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Duration_ms (en segundos)"
  )
# Convert to an interactive plotly widget and display it.
interactive_duration_plot <- ggplotly(duration_plot)
interactive_duration_plot
# 3. Evolution of danceability
# Average danceability per year (missing values ignored).
music_danceability <- music %>%
  group_by(year) %>%
  summarise(danceability_avg = mean(danceability, na.rm = TRUE))
# `linewidth` replaces the `size` argument for lines (deprecated in ggplot2 3.4.0).
danceability_plot <- ggplot(music_danceability, aes(x = year, y = danceability_avg)) +
  geom_line(color = "orange", linewidth = 1) +
  geom_point(color = "darkorange", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Danceability a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Danceability"
  )
# Convert to an interactive plotly widget and display it.
interactive_danceability_plot <- ggplotly(danceability_plot)
interactive_danceability_plot
# 4. Evolution of energy
# Average energy per year (missing values ignored).
music_energy <- music %>%
  group_by(year) %>%
  summarise(energy_avg = mean(energy, na.rm = TRUE))
# `linewidth` replaces the `size` argument for lines (deprecated in ggplot2 3.4.0).
energy_plot <- ggplot(music_energy, aes(x = year, y = energy_avg)) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "darkred", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Energy a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Energy"
  )
# Convert to an interactive plotly widget and display it.
interactive_energy_plot <- ggplotly(energy_plot)
interactive_energy_plot
# 5. Evolution of loudness
# Average loudness per year (missing values ignored).
music_loudness <- music %>%
  group_by(year) %>%
  summarise(loudness_avg = mean(loudness, na.rm = TRUE))
# `linewidth` replaces the `size` argument for lines (deprecated in ggplot2 3.4.0).
loudness_plot <- ggplot(music_loudness, aes(x = year, y = loudness_avg)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(color = "purple4", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Loudness a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Loudness"
  )
# Convert to an interactive plotly widget and display it.
interactive_loudness_plot <- ggplotly(loudness_plot)
interactive_loudness_plot
# Number of songs per year and explicit flag, with the flag recoded as a
# labelled factor so ggplot treats it as categorical.
explicit_trend <- music %>%
  group_by(year, explicit) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(
    explicit = factor(
      explicit,
      levels = c(0, 1),
      labels = c("Non-Explicit", "Explicit")
    )
  )
# Stacked bar chart of the counts (geom_col draws bars from the y values as given).
ggplot_bar <- ggplot(explicit_trend, aes(x = year, y = count, fill = explicit)) +
  geom_col(position = "stack") +
  scale_fill_manual(
    values = c("Non-Explicit" = "#191414", "Explicit" = "#1db954")
  ) +
  labs(
    title = "Explicit vs. Non-Explicit",
    x = "Year",
    y = "Count",
    fill = "Explicit"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 16)
  )
# Render the chart as an interactive plotly widget.
ggplotly(ggplot_bar)
# Share of explicit / non-explicit songs within each year:
# count per (year, explicit), then divide by the year's total and scale to 100.
explicit_trend <- music %>%
  group_by(year, explicit) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(year) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup() %>%
  # Recode the flag as a labelled factor so ggplot treats it as categorical.
  mutate(
    explicit = factor(
      explicit,
      levels = c(0, 1),
      labels = c("Non-Explicit", "Explicit")
    )
  )
# Stacked bar chart of the yearly percentages.
ggplot_bar <- ggplot(explicit_trend, aes(x = year, y = percentage, fill = explicit)) +
  geom_col(position = "stack") +
  scale_fill_manual(
    values = c("Non-Explicit" = "#191414", "Explicit" = "#1db954")
  ) +
  labs(
    title = "Explicit vs. Non-Explicit (Porcentaje)",
    x = "Year",
    y = "Percentage",
    fill = "Explicit"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 16)
  )
# Render the chart as an interactive plotly widget.
ggplotly(ggplot_bar)
# Add a column labelling each song by whether duration_ms is at most 210 or above it.
music$duration_category <- ifelse(
  music$duration_ms <= 210,
  "≤ 210 segundos",
  "> 210 segundos"
)
# Boxplot of popularity for each duration category; outliers drawn as red dots.
boxplot_duration <- ggplot(
  music,
  aes(x = duration_category, y = popularity, fill = duration_category)
) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, outlier.size = 2) +
  scale_fill_manual(
    values = c("≤ 210 segundos" = "#1DB954", "> 210 segundos" = "#191414")
  ) +
  labs(
    title = "Comparación de Popularidad por Duración de Canción",
    x = "Duración de la Canción",
    y = "Popularidad",
    fill = "Duración"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12)
  )
# Convert the static plot into an interactive plotly widget.
boxplot_interactive <- ggplotly(boxplot_duration)
# Display the interactive plot.
boxplot_interactive
Vamos a realizar un test estadístico para comprobar si existen diferencias significativas en la popularidad media entre los dos grupos.
# Split the songs into two groups at the 210 threshold of duration_ms.
# which() keeps only rows where the comparison is TRUE, so rows with a
# missing duration are left out of both groups.
grupo_1 <- music[which(music$duration_ms > 210), , drop = FALSE] # duration > 210
grupo_2 <- music[which(music$duration_ms <= 210), , drop = FALSE] # duration <= 210
# Two-sample t-test on popularity between the groups
# (with default arguments this is the Welch test).
resultado <- t.test(grupo_1$popularity, grupo_2$popularity)
# Print the test summary.
print(resultado)
##
## Welch Two Sample t-test
##
## data: grupo_1$popularity and grupo_2$popularity
## t = 113.08, df = 169074, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 11.28699 11.68515
## sample estimates:
## mean of x mean of y
## 37.43677 25.95070
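Dado el valor p extremadamente bajo (p < 2.2e-16), podemos concluir que hay una diferencia estadísticamente significativa entre la popularidad media de las canciones más largas (> 210 segundos, media ≈ 37,4) y la de las más cortas (≤ 210 segundos, media ≈ 26,0): en este conjunto de datos, las canciones más largas son, en promedio, más populares.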