library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data2 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_2.csv")
glimpse(data2)
## Rows: 20
## Columns: 14
## $ Store.ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ Store <chr> "Palisades", "Billings", "Laguardia", "Cheeseburger", "Detro…
## $ Country <chr> "US", "US", "US", "US", "US", "US", "Canada", "Canada", "Can…
## $ January <chr> "$371,700.00", "$97,530.00", "$346,130.00", "$442,010.00", "…
## $ February <chr> "$435,950.00", "$324,140.00", "$157,510.00", "$212,390.00", …
## $ March <chr> "$372,460.00", "$454,480.00", "$288,990.00", "$183,580.00", …
## $ April <chr> "$192,260.00", "$36,810.00", "$358,190.00", "$308,650.00", "…
## $ May <chr> "$157,550.00", "$219,790.00", "$96,860.00", "$184,340.00", "…
## $ June <chr> "$332,550.00", "$210,970.00", "$461,950.00", "$156,540.00", …
## $ July <chr> "$89,630.00", "$84,840.00", "$80,440.00", "$328,180.00", "$3…
## $ August <chr> "$372,090.00", "$175,440.00", "$404,990.00", "$281,430.00", …
## $ September <chr> "$421,670.00", "$283,710.00", "$450,630.00", "$498,150.00", …
## $ October <chr> "$173,010.00", "$275,320.00", "$327,270.00", "$473,150.00", …
## $ November <chr> "$173,220.00", "$401,940.00", "$370,100.00", "$23,740.00", "…
The dataset appears to be list of store ids, their location, their country, and a monetary amount listed by month in a wide format.
To start the data will need to be pivoted longer in order take the months out of the column headers and place them into rows.
The following code pivots the data longer into a column for the months and a column named “dollars”, since it is not clear what the monetary data represents and the only indicator is a dollar symbol.
A note on the money: In reality, this would require an investigation into the dataset and the domain knowledge of what the money represents. Furthermore, all columns have dollar signs, but there are several countries listed, including the US and Canada which both use the dollars but different dollars. There are several more countries that use currencies other than the dollar, yet the values listed in their rows are as dollars. For the purposes of this project, we will use the assumption that the dataset is listed exclusively in US dollars since the source data does not specify.
data2 <- data2 |>
pivot_longer(
cols = !(Store.ID:Country),
names_to = "month",
values_to = "dollars"
)
glimpse(data2)
## Rows: 220
## Columns: 5
## $ Store.ID <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ Store <chr> "Palisades", "Palisades", "Palisades", "Palisades", "Palisade…
## $ Country <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "…
## $ month <chr> "January", "February", "March", "April", "May", "June", "July…
## $ dollars <chr> "$371,700.00", "$435,950.00", "$372,460.00", "$192,260.00", "…
The following code makes an ordinal factor for the months.
month_levels <- c(
"January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December"
)
The following code cleans up the data frame. It: - renames columns - changes character classes of month to factor, and dollars to a double and removes the currency symbol
data2 <- data2 |>
rename(
store_id = Store.ID,
store = Store,
country = Country
) |>
mutate(
month = factor(month, levels = month_levels),
dollars = parse_number(dollars),
store_id = as.character(store_id)
)
glimpse(data2)
## Rows: 220
## Columns: 5
## $ store_id <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "…
## $ store <chr> "Palisades", "Palisades", "Palisades", "Palisades", "Palisade…
## $ country <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "…
## $ month <fct> January, February, March, April, May, June, July, August, Sep…
## $ dollars <dbl> 371700, 435950, 372460, 192260, 157550, 332550, 89630, 372090…
The data frame is now clean and tidy. The data is significantly longer as a result but now it is in a form that will allow it to be analyzed easily and consistently.
The code below totals all of the monetary data by store and plots and sorts it in descending order. It seems store 3 in Laguardia in the US has the highest monetary information while Chelsea in the UK has the lowest.
total_dollars_by_store <- data2 |>
group_by(store_id, store, country) |>
summarise(total_dollars = sum(dollars)) |>
arrange(desc(total_dollars)) |>
print()
## `summarise()` has grouped output by 'store_id', 'store'. You can override using
## the `.groups` argument.
## # A tibble: 20 × 4
## # Groups: store_id, store [20]
## store_id store country total_dollars
## <chr> <chr> <chr> <dbl>
## 1 3 Laguardia US 3343060
## 2 4 Cheeseburger US 3092160
## 3 1 Palisades US 3092090
## 4 5 Detroit US 2832170
## 5 2 Billings US 2564970
## 6 20 Camembert France 1047658
## 7 8 Eh Canada 367044
## 8 9 Maple Canada 362619
## 9 19 Paolo Brazil 330761
## 10 14 Innit UK 315480
## 11 17 Steve China 308982
## 12 16 Fuji Japan 283811
## 13 10 Victoria Canada 264917
## 14 7 Tim Horton's Canada 258845
## 15 15 Hokkaido Japan 244173
## 16 13 Blimey UK 240649
## 17 18 Nanjing China 219850
## 18 6 Towns US 208703
## 19 12 Wimbledon UK 199778
## 20 11 Chelsea UK 191157
ggplot(total_dollars_by_store, aes(x = reorder(store_id, -total_dollars), y = total_dollars)) +
geom_col()
The following code adds up the monetary data by country and sorts it in descending order. It also calculates the average dollars per store by country. From this data it is clear that the US has the highest total dollars. This is to be expected as it has the most stores. However it also has the higher average dollars per store.
A notable insight from this data is that while France has only one store, it has the third highest total dollars and the second highest average dollars per store.
total_dollars_by_country <- data2 |>
group_by(country) |>
summarise(
store_per_country = n()/11,
total_dollars = sum(dollars),
avg_dollars_per_store = (total_dollars / store_per_country)
) |>
arrange(desc(total_dollars)
) |>
print()
## # A tibble: 7 × 4
## country store_per_country total_dollars avg_dollars_per_store
## <chr> <dbl> <dbl> <dbl>
## 1 US 6 15133153 2522192.
## 2 Canada 4 1253425 313356.
## 3 France 1 1047658 1047658
## 4 UK 4 947064 236766
## 5 China 2 528832 264416
## 6 Japan 2 527984 263992
## 7 Brazil 1 330761 330761
ggplot(total_dollars_by_country, aes(x = reorder(country, -total_dollars), y = total_dollars)) +
geom_col()
ggplot(total_dollars_by_country, aes(x = reorder(country, -avg_dollars_per_store), y = avg_dollars_per_store)) +
geom_col()
The following code totals the dollars by month. It seems Spetemeber has the highest dollar total while July has the lowest.
dollars_by_month <- data2 |>
group_by(month) |>
summarise(
total_dollars = sum(dollars)
) |>
arrange(desc(total_dollars)) |>
print()
## # A tibble: 11 × 2
## month total_dollars
## <fct> <dbl>
## 1 September 2568518
## 2 November 2040795
## 3 March 1995192
## 4 June 1993274
## 5 October 1955800
## 6 February 1715421
## 7 January 1694355
## 8 August 1612337
## 9 April 1580057
## 10 May 1368979
## 11 July 1244149
ggplot(dollars_by_month, aes(x = month, y = total_dollars)) +
geom_col()