Project 2

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(stringr)
library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

# Load the raw tour table directly from the project's GitHub repository.
# check.names = FALSE keeps the column headers exactly as written in the CSV
# so that they can be normalized explicitly in the tidying step.
raw_data <- read.csv(
  file = "https://raw.githubusercontent.com/Jeovany97/Data-607/refs/heads/main/Project%202/List%20of%20highest%20grossing%20music%20tours%20by%20Women/my_file%20(1).csv",
  check.names = FALSE
)

Tidying the data

# Tidy the raw tour table into long form: one row per tour and valuation type.
#
# Steps:
#   1. Normalize the column headers with janitor::clean_names() and give the
#      money / year columns more explicit names.
#   2. Convert the money columns from formatted text to numbers.
#   3. Pull the first four-digit year out of the `years` text.
#   4. Strip annotation marks from the tour titles.
#   5. Stack the actual and inflation-adjusted gross columns into a
#      `valuation_type` / `gross_amount` pair and drop rows without an amount.
tidy_tours <- raw_data |>
  clean_names() |>
  rename(
    actual_gross_usd = actual_gross,
    adjusted_gross_2022_usd = adjusted_gross_in_2022_dollars,
    years = year_s
  ) |>
  mutate(
    across(
      c(actual_gross_usd, adjusted_gross_2022_usd, average_gross),
      ~ as.numeric(
        as.character(.x) |>
          # Remove any bracketed footnote (e.g. "[4]" or "[a]") as a unit
          # first; stripping only the brackets would leave the digits inside
          # them glued onto the amount ("$1,000[4]" -> 10004).
          str_remove_all("\\[[^\\]]*\\]") |>
          # Then drop currency symbols, thousands separators, stray
          # lowercase letters and annotation marks.
          str_remove_all("[\\$,\\[\\]a-z†‡*]")
      )
    ),
    # The first four digits at the start of the string are the starting year.
    start_year = as.numeric(str_extract(years, "^\\d{4}")),
    # Remove bracketed footnotes and annotation marks so that the same tour
    # always carries the same title, then trim the leftover whitespace.
    tour_title = tour_title |>
      str_remove_all("\\[[^\\]]*\\]") |>
      str_remove_all("[†‡*]") |>
      str_trim()
  ) |>
  pivot_longer(
    cols = c(actual_gross_usd, adjusted_gross_2022_usd),
    names_to = "valuation_type",
    values_to = "gross_amount"
  ) |>
  select(
    artist, tour_title, start_year, shows,
    valuation_type, gross_amount, average_gross
  ) |>
  drop_na(gross_amount)

# Preview the first rows of the tidied table.
head(tidy_tours)

# A tibble: 6 × 7
  artist   tour_title start_year shows valuation_type gross_amount average_gross
  <chr>    <chr>           <dbl> <int> <chr>                 <dbl>         <dbl>
1 Taylor … The Eras …       2023    56 actual_gross_…    780000000      13928571
2 Taylor … The Eras …       2023    56 adjusted_gros…    780000000      13928571
3 Beyoncé  Renaissan…       2023    56 actual_gross_…    579800000      10353571
4 Beyoncé  Renaissan…       2023    56 adjusted_gros…    579800000      10353571
5 Madonna  Sticky & …       2008    85 actual_gross_…    411000000       4835294
6 Madonna  Sticky & …       2008    85 adjusted_gros…    560622615       4835294

Data analysis: finding which artists generate the highest average gross per show

# Per-artist summary: how many distinct tours each artist has in the table
# and the mean of those tours' average-gross-per-show figures, ranked from
# the highest mean to the lowest.
artist_efficiency <- tidy_tours |>
  # pivot_longer() left one row per valuation type, so keep a single row for
  # each (artist, tour) pair before counting and averaging.
  distinct(artist, tour_title, .keep_all = TRUE) |>
  group_by(artist) |>
  summarise(
    number_of_tours = n(),
    mean_avg_gross_per_show = mean(average_gross, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(desc(mean_avg_gross_per_show))

# Visualization: horizontal bar chart of each artist's mean average gross per
# show, expressed in millions of USD and ordered by that mean.
artist_efficiency |>
  mutate(gross_millions = mean_avg_gross_per_show / 1e6) |>
  ggplot(
    aes(
      x = reorder(artist, mean_avg_gross_per_show),
      y = gross_millions
    )
  ) +
  geom_col(fill = "steelblue") +
  # Flip the axes so the artist names run down the vertical axis.
  coord_flip() +
  labs(
    x = "Artist",
    y = "Gross per Show (Millions USD)",
    title = "Mean Average Gross per Show by Artist",
    subtitle = "Calculated across all tours in the dataset"
  ) +
  theme_minimal()