#|label: cleaning 1 box office mojo dataset
# Load the 2022 daily Box Office Mojo export and tidy it into `bom2022`.
#
# Resulting columns:
#   date         <date> "Date" with the year 2022 appended, then parsed
#   top10gross   <dbl>  "Top 10 Gross" with "$" and "," removed
#   num_releases <int>  "Releases"
#   num1gross    <dbl>  "Gross" with "$" and "," removed
#                       (presumably the #1 release's gross -- confirm
#                       against the source table)
bom2022 <- read_csv(
  "data/box_office_mojo_2022.csv",
  skip = 11, # the header row starts after the first 11 lines of the file
  show_col_types = FALSE
) |>
  # Select and rename by header name instead of by position, so a change in
  # the export's column layout raises an error rather than silently picking
  # the wrong columns.
  select(
    date = Date,
    top10gross = `Top 10 Gross`,
    num_releases = Releases,
    num1gross = Gross
  ) |>
  filter(!is.na(top10gross)) |> # filters out empty holiday rows
  mutate(
    # The raw date carries no year, so 2022 is appended before parsing.
    date = dmy(paste(date, 2022)),
    num_releases = as.integer(num_releases),
    # Both gross columns get the same cleaning: drop every "$" and ","
    # (inside [] the "$" is a literal character), then convert to numeric.
    across(
      c(top10gross, num1gross),
      \(x) as.numeric(gsub("[$,]", "", x))
    )
  ) |>
  # glimpse() prints a compact preview and passes the data through, so the
  # assignment above still receives the cleaned data frame.
  glimpse()
Rows: 365
Columns: 4
$ date <date> 2022-12-31, 2022-12-30, 2022-12-29, 2022-12-28, 2022-12-…
$ top10gross <dbl> 27962493, 37900613, 31204428, 31866473, 37343124, 4696327…
$ num_releases <int> 31, 32, 28, 28, 28, 29, 28, 28, 29, 30, 31, 31, 30, 31, 3…
$ num1gross <dbl> 18053159, 24836835, 20117061, 20582014, 24128503, 3227043…