library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Project 2 - Dataset 2

Tidying

Load the Dataset

data2 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_2.csv")

glimpse(data2)
## Rows: 20
## Columns: 14
## $ Store.ID  <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ Store     <chr> "Palisades", "Billings", "Laguardia", "Cheeseburger", "Detro…
## $ Country   <chr> "US", "US", "US", "US", "US", "US", "Canada", "Canada", "Can…
## $ January   <chr> "$371,700.00", "$97,530.00", "$346,130.00", "$442,010.00", "…
## $ February  <chr> "$435,950.00", "$324,140.00", "$157,510.00", "$212,390.00", …
## $ March     <chr> "$372,460.00", "$454,480.00", "$288,990.00", "$183,580.00", …
## $ April     <chr> "$192,260.00", "$36,810.00", "$358,190.00", "$308,650.00", "…
## $ May       <chr> "$157,550.00", "$219,790.00", "$96,860.00", "$184,340.00", "…
## $ June      <chr> "$332,550.00", "$210,970.00", "$461,950.00", "$156,540.00", …
## $ July      <chr> "$89,630.00", "$84,840.00", "$80,440.00", "$328,180.00", "$3…
## $ August    <chr> "$372,090.00", "$175,440.00", "$404,990.00", "$281,430.00", …
## $ September <chr> "$421,670.00", "$283,710.00", "$450,630.00", "$498,150.00", …
## $ October   <chr> "$173,010.00", "$275,320.00", "$327,270.00", "$473,150.00", …
## $ November  <chr> "$173,220.00", "$401,940.00", "$370,100.00", "$23,740.00", "…

Pivot data longer

The dataset appears to be list of store ids, their location, their country, and a monetary amount listed by month in a wide format.

To start the data will need to be pivoted longer in order take the months out of the column headers and place them into rows.

The following code pivots the data longer into a column for the months and a column named “dollars”, since it is not clear what the monetary data represents and the only indicator is a dollar symbol.

A note on the money: In reality, this would require an investigation into the dataset and the domain knowledge of what the money represents. Furthermore, all columns have dollar signs, but there are several countries listed, including the US and Canada which both use the dollars but different dollars. There are several more countries that use currencies other than the dollar, yet the values listed in their rows are as dollars. For the purposes of this project, we will use the assumption that the dataset is listed exclusively in US dollars since the source data does not specify.

data2 <- data2 |> 
  pivot_longer(
    cols = !(Store.ID:Country),
    names_to = "month",
    values_to = "dollars"
  )

glimpse(data2)
## Rows: 220
## Columns: 5
## $ Store.ID <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ Store    <chr> "Palisades", "Palisades", "Palisades", "Palisades", "Palisade…
## $ Country  <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "…
## $ month    <chr> "January", "February", "March", "April", "May", "June", "July…
## $ dollars  <chr> "$371,700.00", "$435,950.00", "$372,460.00", "$192,260.00", "…

Make a factor for months

The following code makes an ordinal factor for the months.

month_levels <- c(
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
)

Clean up the data frame

The following code cleans up the data frame. It: - renames columns - changes character classes of month to factor, and dollars to a double and removes the currency symbol

data2 <- data2 |> 
  rename(
    store_id = Store.ID,
    store = Store,
    country = Country
  ) |>
  mutate(
    month = factor(month, levels = month_levels),
    dollars = parse_number(dollars),
    store_id = as.character(store_id)
  )

glimpse(data2)
## Rows: 220
## Columns: 5
## $ store_id <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "…
## $ store    <chr> "Palisades", "Palisades", "Palisades", "Palisades", "Palisade…
## $ country  <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "…
## $ month    <fct> January, February, March, April, May, June, July, August, Sep…
## $ dollars  <dbl> 371700, 435950, 372460, 192260, 157550, 332550, 89630, 372090…

Tidy data

The data frame is now clean and tidy. The data is significantly longer as a result but now it is in a form that will allow it to be analyzed easily and consistently.

Analysis

The code below totals all of the monetary data by store and plots and sorts it in descending order. It seems store 3 in Laguardia in the US has the highest monetary information while Chelsea in the UK has the lowest.

total_dollars_by_store <- data2 |> 
  group_by(store_id, store, country) |> 
  summarise(total_dollars = sum(dollars)) |> 
  arrange(desc(total_dollars)) |> 
  print()
## `summarise()` has grouped output by 'store_id', 'store'. You can override using
## the `.groups` argument.
## # A tibble: 20 × 4
## # Groups:   store_id, store [20]
##    store_id store        country total_dollars
##    <chr>    <chr>        <chr>           <dbl>
##  1 3        Laguardia    US            3343060
##  2 4        Cheeseburger US            3092160
##  3 1        Palisades    US            3092090
##  4 5        Detroit      US            2832170
##  5 2        Billings     US            2564970
##  6 20       Camembert    France        1047658
##  7 8        Eh           Canada         367044
##  8 9        Maple        Canada         362619
##  9 19       Paolo        Brazil         330761
## 10 14       Innit        UK             315480
## 11 17       Steve        China          308982
## 12 16       Fuji         Japan          283811
## 13 10       Victoria     Canada         264917
## 14 7        Tim Horton's Canada         258845
## 15 15       Hokkaido     Japan          244173
## 16 13       Blimey       UK             240649
## 17 18       Nanjing      China          219850
## 18 6        Towns        US             208703
## 19 12       Wimbledon    UK             199778
## 20 11       Chelsea      UK             191157
ggplot(total_dollars_by_store, aes(x = reorder(store_id, -total_dollars), y = total_dollars)) +
  geom_col()

The following code adds up the monetary data by country and sorts it in descending order. It also calculates the average dollars per store by country. From this data it is clear that the US has the highest total dollars. This is to be expected as it has the most stores. However it also has the higher average dollars per store.

A notable insight from this data is that while France has only one store, it has the third highest total dollars and the second highest average dollars per store.

total_dollars_by_country <- data2 |> 
  group_by(country) |> 
  summarise(
    store_per_country = n()/11,
    total_dollars = sum(dollars),
    avg_dollars_per_store = (total_dollars / store_per_country)
    ) |> 
  arrange(desc(total_dollars)
  ) |> 
  print()
## # A tibble: 7 × 4
##   country store_per_country total_dollars avg_dollars_per_store
##   <chr>               <dbl>         <dbl>                 <dbl>
## 1 US                      6      15133153              2522192.
## 2 Canada                  4       1253425               313356.
## 3 France                  1       1047658              1047658 
## 4 UK                      4        947064               236766 
## 5 China                   2        528832               264416 
## 6 Japan                   2        527984               263992 
## 7 Brazil                  1        330761               330761
ggplot(total_dollars_by_country, aes(x = reorder(country, -total_dollars), y = total_dollars)) +
  geom_col()

ggplot(total_dollars_by_country, aes(x = reorder(country, -avg_dollars_per_store), y = avg_dollars_per_store)) +
  geom_col()

The following code totals the dollars by month. It seems Spetemeber has the highest dollar total while July has the lowest.

dollars_by_month <- data2 |> 
  group_by(month) |> 
  summarise(
    total_dollars = sum(dollars)
  ) |> 
  arrange(desc(total_dollars)) |> 
  print()
## # A tibble: 11 × 2
##    month     total_dollars
##    <fct>             <dbl>
##  1 September       2568518
##  2 November        2040795
##  3 March           1995192
##  4 June            1993274
##  5 October         1955800
##  6 February        1715421
##  7 January         1694355
##  8 August          1612337
##  9 April           1580057
## 10 May             1368979
## 11 July            1244149
ggplot(dollars_by_month, aes(x = month, y = total_dollars)) +
  geom_col()