library(ggplot2)
library(plotly)
## 
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the dataset from data.csv; character columns are read as factors.
# `<-` and `TRUE` are used instead of `=` and `T` (`T` is an ordinary variable
# that can be reassigned, `TRUE` is a reserved constant).
music <- read.csv("data.csv", stringsAsFactors = TRUE)
# Number of rows and columns that were read.
dim(music)
## [1] 170653     19
# Preview of the first rows.
head(music)
##   valence year acousticness
## 1  0.0594 1921        0.982
## 2  0.9630 1921        0.732
## 3  0.0394 1921        0.961
## 4  0.1650 1921        0.967
## 5  0.2530 1921        0.957
## 6  0.1960 1921        0.579
##                                                              artists
## 1 ['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']
## 2                                                     ['Dennis Day']
## 3            ['KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat']
## 4                                                   ['Frank Parker']
## 5                                                     ['Phil Regan']
## 6            ['KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat']
##   danceability duration_ms energy explicit                     id
## 1        0.279      831667  0.211        0 4BJqT0PrAfrxzMOxytFOIz
## 2        0.819      180533  0.341        0 7xPhfUan2yNtyFG0cUWkt8
## 3        0.328      500062  0.166        0 1o6I8BglA6ylDMrIELygv1
## 4        0.275      210000  0.309        0 3ftBPsC5vPBKxYSee08FDH
## 5        0.418      166693  0.193        0 4d6HGyGT8e121BsdKmw9v6
## 6        0.697      395076  0.346        0 4pyw9DVHGStUre4J6hPngr
##   instrumentalness key liveness loudness mode
## 1         8.78e-01  10    0.665  -20.096    1
## 2         0.00e+00   7    0.160  -12.441    1
## 3         9.13e-01   3    0.101  -14.850    1
## 4         2.77e-05   5    0.381   -9.316    1
## 5         1.68e-06   3    0.229  -10.096    1
## 6         1.68e-01   2    0.130  -12.506    1
##                                                               name popularity
## 1 Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve          4
## 2                                          Clancy Lowered the Boom          5
## 3                                                        Gati Bali          5
## 4                                                        Danny Boy          3
## 5                                      When Irish Eyes Are Smiling          2
## 6                                                     Gati Mardika          6
##   release_date speechiness   tempo
## 1         1921      0.0366  80.954
## 2         1921      0.4150  60.936
## 3         1921      0.0339 110.339
## 4         1921      0.0354 100.109
## 5         1921      0.0380 101.665
## 6         1921      0.0700 119.824

Vamos a eliminar las columnas id y release_date, ya que el id de una canción no nos proporciona información valiosa para ningún tipo de análisis y release_date está incompleta, por lo que la variable year, que nos indica el año de la canción, es más que suficiente.

# Drop the 'release_date' and 'id' columns.
music <- music[, !(colnames(music) %in% c("release_date", "id"))]

# Flag duplicated rows once and reuse the mask for both the report and the
# removal, instead of scanning the whole table twice.
is_duplicate <- duplicated(music)
duplicados <- music[is_duplicate, ]

# Report how many duplicated rows were found.
n_duplicados <- nrow(duplicados)
print(paste("Número de filas duplicadas:", n_duplicados))
## [1] "Número de filas duplicadas: 565"
# Keep only the first occurrence of every row.
music <- music[!is_duplicate, ]

# Rescale the duration column in place by dividing by 1000.
# NOTE: the column keeps the name `duration_ms`, but from here on its values
# are seconds (milliseconds / 1000).
music <- music %>%
  mutate(duration_ms = duration_ms / 1000)

Correlaciones entre diferentes variables de los datos:

# Keep only the numeric columns.
# `select(where(is.numeric))` replaces the superseded `select_if()`.
music_numeric <- music %>% select(where(is.numeric))

# Pearson correlation matrix; rows containing any NA are left out.
cor_matrix <- cor(music_numeric, use = "complete.obs", method = "pearson")

# Reshape the matrix to long format: one row per (Var1, Var2, value) cell.
cor_data <- melt(cor_matrix)

# Build the heatmap with ggplot2.
# Fill scale: -1 maps to white, 0 to light green and +1 to dark green.
heatmap_plot <- ggplot(cor_data, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3) + # cell values
  scale_fill_gradient2(low = "white", mid = "lightgreen", high = "darkgreen", 
                       midpoint = 0, limits = c(-1, 1), name = "Correlación") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  ) +
  labs(
    title = "Matriz de Correlación",
    x = "Variables",
    y = "Variables"
  )

# Convert the static plot into an interactive plotly widget.
heatmap_interactive <- ggplotly(heatmap_plot)

# Display the interactive plot.
heatmap_interactive

Evolución de diferentes variables a lo largo del tiempo

# 1. Acousticness over time
# Mean acousticness for each value of `year`.
music_yearly <- music %>%
  group_by(year) %>%
  summarise(acousticness_avg = mean(acousticness, na.rm = TRUE))

# Line thickness is set with `linewidth`: since ggplot2 3.4.0 `size` is
# deprecated for lines and for `element_line()`, and it raised warnings here.
yearly_plot <- ggplot(music_yearly, aes(x = year, y = acousticness_avg)) +
  geom_line(color = "green", linewidth = 1) + # green line
  geom_point(color = "darkgreen", size = 2) + # one point per year
  theme_minimal() +
  labs(
    title = "Evolución de Acousticness a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Acousticness"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_line(color = "gray", linewidth = 0.2)
  )
# Convert to an interactive plotly widget and display it.
interactive_yearly_plot <- ggplotly(yearly_plot)
interactive_yearly_plot
# 2. Track duration over time
# Mean of the `duration_ms` column for each value of `year`.
# The column was divided by 1000 earlier in the script, so despite its name it
# holds seconds; the title and axis label state that unit.
music_duration <- music %>%
  group_by(year) %>%
  summarise(duration_avg = mean(duration_ms, na.rm = TRUE))

# `linewidth` replaces the deprecated `size` for lines (ggplot2 >= 3.4.0).
duration_plot <- ggplot(music_duration, aes(x = year, y = duration_avg)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "darkblue", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de la Duración (segundos) a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Duración (segundos)"
  )

# Convert to an interactive plotly widget and display it.
interactive_duration_plot <- ggplotly(duration_plot)
interactive_duration_plot
# 3. Danceability over time
# Mean danceability for each value of `year`.
music_danceability <- music %>%
  group_by(year) %>%
  summarise(danceability_avg = mean(danceability, na.rm = TRUE))

# `linewidth` replaces the deprecated `size` for lines (ggplot2 >= 3.4.0).
danceability_plot <- ggplot(music_danceability, aes(x = year, y = danceability_avg)) +
  geom_line(color = "orange", linewidth = 1) +
  geom_point(color = "darkorange", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Danceability a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Danceability"
  )

# Convert to an interactive plotly widget and display it.
interactive_danceability_plot <- ggplotly(danceability_plot)
interactive_danceability_plot
# 4. Energy over time
# Mean energy for each value of `year`.
music_energy <- music %>%
  group_by(year) %>%
  summarise(energy_avg = mean(energy, na.rm = TRUE))

# `linewidth` replaces the deprecated `size` for lines (ggplot2 >= 3.4.0).
energy_plot <- ggplot(music_energy, aes(x = year, y = energy_avg)) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "darkred", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Energy a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Energy"
  )

# Convert to an interactive plotly widget and display it.
interactive_energy_plot <- ggplotly(energy_plot)
interactive_energy_plot
# 5. Loudness over time
# Mean loudness for each value of `year`.
music_loudness <- music %>%
  group_by(year) %>%
  summarise(loudness_avg = mean(loudness, na.rm = TRUE))

# `linewidth` replaces the deprecated `size` for lines (ggplot2 >= 3.4.0).
loudness_plot <- ggplot(music_loudness, aes(x = year, y = loudness_avg)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(color = "purple4", size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución de Loudness a lo Largo de los Años",
    x = "Año",
    y = "Promedio de Loudness"
  )

# Convert to an interactive plotly widget and display it.
interactive_loudness_plot <- ggplotly(loudness_plot)
interactive_loudness_plot
# Number of tracks for every (year, explicit) combination; the 0/1 flag is
# then recoded as a labelled factor so ggplot treats it as categorical.
explicit_trend <- music %>%
  group_by(year, explicit) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(
    explicit = factor(
      explicit,
      levels = c(0, 1),
      labels = c("Non-Explicit", "Explicit")
    )
  )

# Stacked bar chart of the yearly counts, one fill colour per factor level.
ggplot_bar <- ggplot(
  data = explicit_trend,
  mapping = aes(x = year, y = count, fill = explicit)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(
    values = c("Non-Explicit" = "#191414", "Explicit" = "#1db954")
  ) +
  labs(
    title = "Explicit vs. Non-Explicit",
    x = "Year",
    y = "Count",
    fill = "Explicit"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the chart as an interactive plotly widget.
ggplotly(ggplot_bar)
# Share of explicit / non-explicit tracks within each year.
# Step 1: count tracks per (year, explicit) combination.
explicit_counts <- music %>%
  group_by(year, explicit) %>%
  summarise(count = n(), .groups = "drop")

# Step 2: express each count as a percentage of its year's total, then recode
# the 0/1 flag as a labelled factor.
explicit_trend <- explicit_counts %>%
  group_by(year) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup() %>%
  mutate(
    explicit = factor(
      explicit,
      levels = c(0, 1),
      labels = c("Non-Explicit", "Explicit")
    )
  )

# Stacked bar chart of the yearly percentages.
ggplot_bar <- ggplot(
  data = explicit_trend,
  mapping = aes(x = year, y = percentage, fill = explicit)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(
    values = c("Non-Explicit" = "#191414", "Explicit" = "#1db954")
  ) +
  labs(
    title = "Explicit vs. Non-Explicit (Porcentaje)",
    x = "Year",
    y = "Percentage",
    fill = "Explicit"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the chart as an interactive plotly widget.
ggplotly(ggplot_bar)
# Label every track by whether its `duration_ms` value is at most 210 or not
# (the column was divided by 1000 earlier in the script, so 210 is in seconds).
music$duration_category <- ifelse(
  music$duration_ms <= 210,
  "≤ 210 segundos",
  "> 210 segundos"
)

# Popularity distribution per duration category, outliers drawn as red dots.
boxplot_duration <- ggplot(
  data = music,
  mapping = aes(x = duration_category, y = popularity, fill = duration_category)
) +
  geom_boxplot(outlier.size = 2, outlier.shape = 16, outlier.color = "red") +
  scale_fill_manual(
    values = c("≤ 210 segundos" = "#1DB954", "> 210 segundos" = "#191414")
  ) +
  labs(
    title = "Comparación de Popularidad por Duración de Canción",
    x = "Duración de la Canción",
    y = "Popularidad",
    fill = "Duración"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, size = 16)
  )

# Turn the static boxplot into an interactive plotly widget.
boxplot_interactive <- ggplotly(boxplot_duration)

# Display the interactive plot.
boxplot_interactive

Vamos a realizar un test estadístico (prueba t de Welch para dos muestras) para comprobar si existen diferencias significativas en la popularidad media entre los dos grupos.

# Split the tracks at the 210 threshold on `duration_ms` (seconds at this
# point of the script). `which()` drops rows where the comparison is NA.
es_larga <- music$duration_ms > 210
es_corta <- music$duration_ms <= 210
grupo_1 <- music[which(es_larga), , drop = FALSE]  # duration > 210 seconds
grupo_2 <- music[which(es_corta), , drop = FALSE]  # duration <= 210 seconds

# Two-sample t-test on popularity (t.test defaults: Welch, two-sided).
resultado <- t.test(x = grupo_1$popularity, y = grupo_2$popularity)

# Show the test summary.
print(resultado)
## 
##  Welch Two Sample t-test
## 
## data:  grupo_1$popularity and grupo_2$popularity
## t = 113.08, df = 169074, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  11.28699 11.68515
## sample estimates:
## mean of x mean of y 
##  37.43677  25.95070

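Dado el valor p extremadamente bajo (p < 2.2e-16), podemos concluir que hay una diferencia estadísticamente significativa en la popularidad media de los dos grupos: las canciones de más de 210 segundos tienen una popularidad media de 37,4, frente a 26,0 en las canciones de 210 segundos o menos (una diferencia de entre 11,3 y 11,7 puntos con un 95 % de confianza).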