Washing Machine Project

1. Machine lifetime & intensity

Key variables:

machine_commissioning
datapoint_time
customer_cycles_amount

What to analyse:

Machine age at service

→ datapoint_time – commissioning date

Usage intensity:

→ cycles per year / per month

Calculation of machine age

Here I created a new data frame just to test it out if the code works. At the end I made a new column in the original joined data frame.

machine.age <- join %>% 
  select(datapoint_time,machine_commissioning, machine_id) %>% 
  group_by(machine_id)

machine.age$dt <- as.vector(machine.age$datapoint_time)
machine.age$mt <- as.vector(machine.age$machine_commissioning)

machine.age <- machine.age %>% 
  group_by(machine_id) %>% 
  mutate(age_year = round((dt - mt)/365.2425), na.rm = TRUE)

join$age_year <- machine.age$age_year

library(knitr)
kable(machine.age[1:5,], caption = "New data frame to calculate age in years")

New data frame to calculate age in years
datapoint_time	machine_commissioning	machine_id	dt	mt	age_year	na.rm
2020-01-02	2014-06-20	201406209290	18263	16241	6	TRUE
2020-01-02	2007-09-14	200709147860	18263	13770	12	TRUE
2020-01-03	2007-06-17	200706173882	18264	13681	13	TRUE
2020-01-06	2012-01-28	201201287469	18267	15367	8	TRUE
2020-01-07	2009-03-20	200903202799	18268	14323	11	TRUE

Interesting questions:

Do failures depend on age or usage?
Are machines failing early or only after heavy use?

Grouped bar chart with an overlaid line

For the next part I made age intervals to more easily see the differences between machine of different use time.

I also converted the service type in order to visualize it later.

join <- join %>% 
  mutate(age_range = cut(age_year, c(0, 5, 10, 15, 20, 25, 100), labels = c("0-5", "5-10", "10-15", "15-20", "20-25", ">25")))

join$service_event_type <- as.factor(join$service_event_type)

I made separate data frames for both so it’s easier and error-free on the original data

bars_age_error <- join %>%
  select(service_event_type, age_range) %>% 
  count(age_range, service_event_type) %>% 
  na.omit()

line_age_cycle <- join %>% 
  group_by(age_range) %>%
  summarise(mean(customer_cycles_amount), .groups = "drop") %>% 
  na.omit() %>% 
  rename(
    "mean_cycle" = 'mean(customer_cycles_amount)'
  )
scale_factor <- max(bars_age_error$n) / max(line_age_cycle$mean_cycle) #This is to illustrate the second y-axis later on in the bar chart

ggplot() +
  # bars
  geom_bar(
    data = bars_age_error,
    aes(x = age_range, y = n, fill = service_event_type),
    stat = "identity", position = "dodge", 
    color = "black", linewidth = 0.3
  ) +
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  # line
  geom_line(
    data = line_age_cycle,
    aes(x = as.numeric(age_range), y = mean_cycle  * scale_factor),
    color = "red", linewidth = 1, group = 1
  ) +
  geom_point(
    data = line_age_cycle,
    aes(x = as.numeric(age_range), y = mean_cycle * scale_factor),
    color = "red", size = 3
  ) +
  # dual y-axis
  scale_y_continuous(
    name = "Number of service events",
    sec.axis = sec_axis(
      transform = ~ . / scale_factor,
      name = "Average total number of cycles"
    )
  ) +
  scale_x_discrete(name = "Machine age (years)") +
  labs(
    title = "Number of service events and average cycles per month by age range",
    fill = NULL) +
  theme_classic() +
  theme(
    legend.position        = "bottom",
    legend.direction       = "horizontal",
    axis.text              = element_text(size = 10),
    axis.title             = element_text(size = 10),
    axis.title.y.right     = element_text(color = "red"),
    axis.text.y.right      = element_text(color = "red")
  )

2. Failure type, warranty status, and repair type

This relationship can be showcased through a heatmap

# Making Heatmap to show relationship between error case, warranty status, and repair outcome

pcd <- join %>% 
  select(customer_warranty_status,repair_type, service_event_type, age_range, machine_id) %>% 
  na.omit()

pcd$customer_warranty_status <- as.factor(pcd$customer_warranty_status)
pcd$repair_type <- as.factor(pcd$repair_type)
min(pcd$cycle_per_month)

## [1] Inf

# Making a separate data frame for the heatmap. 

heatmap_df <- pcd %>%
  group_by(service_event_type, repair_type, customer_warranty_status) %>%
  summarise(count = n(), .groups = "drop") %>%
  complete(service_event_type, repair_type, customer_warranty_status,
           fill = list(count = 0))


# Plot
ggplot(heatmap_df, aes(x = repair_type, y = service_event_type, fill = count)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = ifelse(count == 0, "0", as.character(count))),
            size = 3.5, color = "white") +
  scale_fill_gradient(low = "#A4BED5FF", high = "#023743FF") +
  facet_wrap(~ customer_warranty_status) +        # <-- this is to have 2 maps for 2 warranty statuses
  labs(
    x    = "Repair Type",
    y    = "Failure Type",
    fill = "Service events"
  ) +
  theme_minimal() +
  theme(
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 10),
    axis.text.y     = element_text(size = 10),
    axis.title      = element_text(size = 10),
    panel.grid      = element_blank(),
    legend.position = "right",
    strip.text      = element_text(size = 10, face = "bold")  # styles the facet titles
  )

Some interesting questions we could investigate from this map:

Which failure types consistently lead to disposal rather than repair?
Preventative repair? Where? Which part?

3. Machine use and Business Model

Do Rental and Pay-Per-Use machines get better care than Purchased ones?

Box plot: total cycles at time of service by BM — which model drives more machine use

bm_cycle <- join %>% 
  select(customer_business_model, repair_type, customer_cycles_amount, machine_program_60degrees,machine_program_40degrees,machine_program_eco)
  
ggplot(bm_cycle, aes(x = customer_business_model, y = customer_cycles_amount, fill = customer_business_model))+
  geom_boxplot(colour = "black", median.color = "red")+
  theme_classic()+
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  theme(legend.position = "none",
        axis.text.x     = element_text(angle = 45, hjust = 1, size = 10))+
  labs(
    fill = NULL,
    x = "Business Model",
    y = "Total cycle amount"
  )

ggplot(bm_cycle, aes(x = repair_type, y = customer_cycles_amount, fill = repair_type))+
  geom_boxplot(colour = "black", median.color = "red")+
  theme_classic()+
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  theme(legend.position = "none",
        axis.text.x     = element_text(angle = 45, hjust = 1, size = 10))+
  facet_grid(.~customer_business_model)+
  labs(
    fill = NULL,
    x = "Repair Type",
    y = "Total cycle amount"
  )

4. Service events by canton

Choropleth map

#Switzerland cantons from GISCO (Eurostat). This method requires the giscoR and sf packages!
#So do install.packages and then library first

swiss_cantons <- gisco_get_nuts(
  country    = "CH",
  nuts_level = 3, #This is canton level
  year       = "2021",
  resolution = "03" 
)

After doing this, it’s good to check the names in the swiss_cantons data, and then our data. Cause there can be mismatches such as umlauts or different spellings.

# Check names
unique(swiss_cantons$NAME_LATN)
unique(join$customer_canton)

# Fix canton names & aggregate data
canton_df <- join %>%
  mutate(customer_canton = recode(customer_canton,
                                  "Zurich"      = "Zürich",
                                  "Geneva"      = "Genève",
                                  "Lucerne"     = "Luzern",
                                  "Fribourg"    = "Freiburg",
                                  "Wallis"      = "Valais",
                                  "Basel-City"  = "Basel-Stadt"
  )) %>%
  group_by(customer_canton) %>%
  summarise(service_events = n(), .groups = "drop")
            
# Make abbreviations and join data
swiss_map <- swiss_cantons %>%
  left_join(canton_df, by = c("NAME_LATN" = "customer_canton"))

abbrev_lookup <- data.frame(
  NUTS_ID = c("CH011","CH012","CH013","CH021","CH022","CH023","CH024",
              "CH025","CH031","CH032","CH033","CH040","CH051","CH052",
              "CH053","CH054","CH055","CH056","CH057","CH061","CH062",
              "CH063","CH064","CH065","CH066","CH070"),
  abbr    = c("VD","VS","GE","BE","FR","SO","NE",
              "JU","BS","BL","AG","ZH","GL","SH",
              "AR","AI","SG","GR","TG","LU","UR",
              "SZ","OW","NW","ZG","TI")
)

swiss_map <- swiss_map %>%
  left_join(abbrev_lookup, by = "NUTS_ID")

# --- Step 5: plot
ggplot(swiss_map) +
  geom_sf(aes(fill = service_events), color = "white", linewidth = 0.4) +
  geom_sf_text(aes(label = abbr), size = 2.5, color = "gray20") +
  scale_fill_gradient(low = "#ffffcc", high = "#1a7a3c",
                      na.value = "grey90",
                      name = "Service events") +
  labs(title = "Service Events by Canton", subtitle = "2020–2023") +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title      = element_text(size = 13, face = "bold"),
    plot.subtitle   = element_text(size = 10, color = "gray40")
  )

At this point it is good to cross reference and see if the high number of service events mean the machines here tend to fail more of if it’s purely due to customer volume

# customers per canton per business model
customers_per_canton <- join %>%
  mutate(customer_canton = recode(customer_canton,
                                  "Zurich"      = "Zürich",
                                  "Geneva"      = "Genève",
                                  "Lucerne"     = "Luzern",
                                  "Fribourg"    = "Freiburg",
                                  "Wallis"      = "Valais",
                                  "Basel-City"  = "Basel-Stadt"
  )) %>%
  group_by(customer_canton, customer_business_model) %>%
  summarise(
    service_events   = n(),
    unique_customers = n_distinct(customer_id),
    .groups = "drop"
  ) %>%
  mutate(events_per_customer = service_events / unique_customers)

#rank cantons by total service events
canton_order <- customers_per_canton %>%
  group_by(customer_canton) %>%
  summarise(total = sum(service_events)) %>%
  arrange(desc(total)) %>%
  pull(customer_canton)

customers_per_canton <- customers_per_canton %>%
  mutate(customer_canton = factor(customer_canton, levels = canton_order))

###Plot this data

ggplot(customers_per_canton,
       aes(x = customer_business_model,
           y = events_per_customer,
           fill = customer_business_model)) +
  geom_bar(stat = "identity", color = "black", linewidth = 0.3, width = 0.6) +
  geom_text(aes(label = round(events_per_customer, 1)),
            vjust = -0.5, size = 3) +
  scale_fill_manual(values = c("#ffffcc", "#78c679", "#1a7a3c")) +
  facet_wrap(~ customer_canton) +
  labs(x = NULL, y = "Service events per customer") +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y     = element_text(size = 8),
    strip.text      = element_text(size = 10, face = "bold"),
    panel.spacing   = unit(1, "lines")
  )

What we should pay attention to in this graph:

Jura showing 3 events in just Purchase model: could be one large customer skewing the number, or a data quality issue
Pay-Per-Use and Rental contracts are still concentrated in larger cantons, they haven’t penetrated rural or smaller cantons yet -> A business opportunity.

If failure rates are consistent everywhere, there’s no operational risk in expanding PPU nationally — the service burden per customer won’t increase.

Failure rate is consistent across cantons and business models, the problem isn’t geographic. This points toward product-level interventions — better components, predictive maintenance, or design improvements

Dot Density Graph

As discussed last call, I have completed the dot density graph with the following code

# There are some lat and lon values swapped, so we have to make sure they are correct
join2 <- join %>% 
  filter_out(customer_address_lat < 40)

weird <- join %>% 
  filter(customer_address_lat < 40)
weird <- rename(weird,
                "customer_address_lat" = "customer_address_lon",
                "customer_address_lon" = "customer_address_lat"
)

weird <- weird %>% 
  relocate(customer_address_lat, .before = customer_address_lon)

join2 <- bind_rows(join2, weird)

That was a prepocessing step, now we can move on to making the graph

#After this, we can make the density map
density <- st_as_sf(join2, coords = c("customer_address_lon", "customer_address_lat"), crs = 4269)

#This map is interactive
mapview(density)

#This one is an image
ggplot(density) +
  geom_point(
    data = join2, aes(x = customer_address_lon, y = customer_address_lat),
    fill = "#ffffcc", size = 2, pch=21, color = "black"
  ) +
  geom_sf(data = swiss_cantons, fill = "transparent", color = "black", linewidth = 0.6)+
  geom_sf(
    data = density, fill = "transparent",
    color = "transparent", linewidth = .6
  ) +
  labs(
    title = "Dot Density Map of Service Events",
    subtitle = "2020-2023"
  )+
  theme_void()+
  theme(
    plot.title      = element_text(size = 13, face = "bold"),
    plot.subtitle   = element_text(size = 10, color = "gray40")
  )