Key variables:
machine_commissioning
datapoint_time
customer_cycles_amount
What to analyse:
→ datapoint_time – commissioning date
→ cycles per year / per month
Here I created a new data frame just to test it out if the code works. At the end I made a new column in the original joined data frame.
machine.age <- join %>%
select(datapoint_time,machine_commissioning, machine_id) %>%
group_by(machine_id)
machine.age$dt <- as.vector(machine.age$datapoint_time)
machine.age$mt <- as.vector(machine.age$machine_commissioning)
machine.age <- machine.age %>%
group_by(machine_id) %>%
mutate(age_year = round((dt - mt)/365.2425), na.rm = TRUE)
join$age_year <- machine.age$age_year
library(knitr)
kable(machine.age[1:5,], caption = "New data frame to calculate age in years")
| datapoint_time | machine_commissioning | machine_id | dt | mt | age_year | na.rm |
|---|---|---|---|---|---|---|
| 2020-01-02 | 2014-06-20 | 201406209290 | 18263 | 16241 | 6 | TRUE |
| 2020-01-02 | 2007-09-14 | 200709147860 | 18263 | 13770 | 12 | TRUE |
| 2020-01-03 | 2007-06-17 | 200706173882 | 18264 | 13681 | 13 | TRUE |
| 2020-01-06 | 2012-01-28 | 201201287469 | 18267 | 15367 | 8 | TRUE |
| 2020-01-07 | 2009-03-20 | 200903202799 | 18268 | 14323 | 11 | TRUE |
Interesting questions:
Do failures depend on age or usage?
Are machines failing early or only after heavy use?
For the next part I made age intervals to more easily see the differences between machine of different use time.
I also converted the service type in order to visualize it later.
join <- join %>%
mutate(age_range = cut(age_year, c(0, 5, 10, 15, 20, 25, 100), labels = c("0-5", "5-10", "10-15", "15-20", "20-25", ">25")))
join$service_event_type <- as.factor(join$service_event_type)
I made separate data frames for both so it’s easier and error-free on the original data
bars_age_error <- join %>%
select(service_event_type, age_range) %>%
count(age_range, service_event_type) %>%
na.omit()
line_age_cycle <- join %>%
group_by(age_range) %>%
summarise(mean(customer_cycles_amount), .groups = "drop") %>%
na.omit() %>%
rename(
"mean_cycle" = 'mean(customer_cycles_amount)'
)
scale_factor <- max(bars_age_error$n) / max(line_age_cycle$mean_cycle) #This is to illustrate the second y-axis later on in the bar chart
ggplot() +
# bars
geom_bar(
data = bars_age_error,
aes(x = age_range, y = n, fill = service_event_type),
stat = "identity", position = "dodge",
color = "black", linewidth = 0.3
) +
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
# line
geom_line(
data = line_age_cycle,
aes(x = as.numeric(age_range), y = mean_cycle * scale_factor),
color = "red", linewidth = 1, group = 1
) +
geom_point(
data = line_age_cycle,
aes(x = as.numeric(age_range), y = mean_cycle * scale_factor),
color = "red", size = 3
) +
# dual y-axis
scale_y_continuous(
name = "Number of service events",
sec.axis = sec_axis(
transform = ~ . / scale_factor,
name = "Average total number of cycles"
)
) +
scale_x_discrete(name = "Machine age (years)") +
labs(
title = "Number of service events and average cycles per month by age range",
fill = NULL) +
theme_classic() +
theme(
legend.position = "bottom",
legend.direction = "horizontal",
axis.text = element_text(size = 10),
axis.title = element_text(size = 10),
axis.title.y.right = element_text(color = "red"),
axis.text.y.right = element_text(color = "red")
)
This relationship can be showcased through a heatmap
# Making Heatmap to show relationship between error case, warranty status, and repair outcome
pcd <- join %>%
select(customer_warranty_status,repair_type, service_event_type, age_range, machine_id) %>%
na.omit()
pcd$customer_warranty_status <- as.factor(pcd$customer_warranty_status)
pcd$repair_type <- as.factor(pcd$repair_type)
min(pcd$cycle_per_month)
## [1] Inf
# Making a separate data frame for the heatmap.
heatmap_df <- pcd %>%
group_by(service_event_type, repair_type, customer_warranty_status) %>%
summarise(count = n(), .groups = "drop") %>%
complete(service_event_type, repair_type, customer_warranty_status,
fill = list(count = 0))
# Plot
ggplot(heatmap_df, aes(x = repair_type, y = service_event_type, fill = count)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = ifelse(count == 0, "0", as.character(count))),
size = 3.5, color = "white") +
scale_fill_gradient(low = "#A4BED5FF", high = "#023743FF") +
facet_wrap(~ customer_warranty_status) + # <-- this is to have 2 maps for 2 warranty statuses
labs(
x = "Repair Type",
y = "Failure Type",
fill = "Service events"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.text.y = element_text(size = 10),
axis.title = element_text(size = 10),
panel.grid = element_blank(),
legend.position = "right",
strip.text = element_text(size = 10, face = "bold") # styles the facet titles
)
Some interesting questions we could investigate from this
map:
Which failure types consistently lead to disposal rather than repair?
Preventative repair? Where? Which part?
Do Rental and Pay-Per-Use machines get better care than Purchased ones?
Box plot: total cycles at time of service by BM — which model drives more machine use
bm_cycle <- join %>%
select(customer_business_model, repair_type, customer_cycles_amount, machine_program_60degrees,machine_program_40degrees,machine_program_eco)
ggplot(bm_cycle, aes(x = customer_business_model, y = customer_cycles_amount, fill = customer_business_model))+
geom_boxplot(colour = "black", median.color = "red")+
theme_classic()+
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10))+
labs(
fill = NULL,
x = "Business Model",
y = "Total cycle amount"
)
ggplot(bm_cycle, aes(x = repair_type, y = customer_cycles_amount, fill = repair_type))+
geom_boxplot(colour = "black", median.color = "red")+
theme_classic()+
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10))+
facet_grid(.~customer_business_model)+
labs(
fill = NULL,
x = "Repair Type",
y = "Total cycle amount"
)
#Switzerland cantons from GISCO (Eurostat). This method requires the giscoR and sf packages!
#So do install.packages and then library first
swiss_cantons <- gisco_get_nuts(
country = "CH",
nuts_level = 3, #This is canton level
year = "2021",
resolution = "03"
)
After doing this, it’s good to check the names in the swiss_cantons data, and then our data. Cause there can be mismatches such as umlauts or different spellings.
# Check names
unique(swiss_cantons$NAME_LATN)
unique(join$customer_canton)
# Fix canton names & aggregate data
canton_df <- join %>%
mutate(customer_canton = recode(customer_canton,
"Zurich" = "Zürich",
"Geneva" = "Genève",
"Lucerne" = "Luzern",
"Fribourg" = "Freiburg",
"Wallis" = "Valais",
"Basel-City" = "Basel-Stadt"
)) %>%
group_by(customer_canton) %>%
summarise(service_events = n(), .groups = "drop")
# Make abbreviations and join data
swiss_map <- swiss_cantons %>%
left_join(canton_df, by = c("NAME_LATN" = "customer_canton"))
abbrev_lookup <- data.frame(
NUTS_ID = c("CH011","CH012","CH013","CH021","CH022","CH023","CH024",
"CH025","CH031","CH032","CH033","CH040","CH051","CH052",
"CH053","CH054","CH055","CH056","CH057","CH061","CH062",
"CH063","CH064","CH065","CH066","CH070"),
abbr = c("VD","VS","GE","BE","FR","SO","NE",
"JU","BS","BL","AG","ZH","GL","SH",
"AR","AI","SG","GR","TG","LU","UR",
"SZ","OW","NW","ZG","TI")
)
swiss_map <- swiss_map %>%
left_join(abbrev_lookup, by = "NUTS_ID")
# --- Step 5: plot
ggplot(swiss_map) +
geom_sf(aes(fill = service_events), color = "white", linewidth = 0.4) +
geom_sf_text(aes(label = abbr), size = 2.5, color = "gray20") +
scale_fill_gradient(low = "#ffffcc", high = "#1a7a3c",
na.value = "grey90",
name = "Service events") +
labs(title = "Service Events by Canton", subtitle = "2020–2023") +
theme_void() +
theme(
legend.position = "right",
plot.title = element_text(size = 13, face = "bold"),
plot.subtitle = element_text(size = 10, color = "gray40")
)
At this point it is good to cross reference and see if the high number of service events mean the machines here tend to fail more of if it’s purely due to customer volume
# customers per canton per business model
customers_per_canton <- join %>%
mutate(customer_canton = recode(customer_canton,
"Zurich" = "Zürich",
"Geneva" = "Genève",
"Lucerne" = "Luzern",
"Fribourg" = "Freiburg",
"Wallis" = "Valais",
"Basel-City" = "Basel-Stadt"
)) %>%
group_by(customer_canton, customer_business_model) %>%
summarise(
service_events = n(),
unique_customers = n_distinct(customer_id),
.groups = "drop"
) %>%
mutate(events_per_customer = service_events / unique_customers)
#rank cantons by total service events
canton_order <- customers_per_canton %>%
group_by(customer_canton) %>%
summarise(total = sum(service_events)) %>%
arrange(desc(total)) %>%
pull(customer_canton)
customers_per_canton <- customers_per_canton %>%
mutate(customer_canton = factor(customer_canton, levels = canton_order))
###Plot this data
ggplot(customers_per_canton,
aes(x = customer_business_model,
y = events_per_customer,
fill = customer_business_model)) +
geom_bar(stat = "identity", color = "black", linewidth = 0.3, width = 0.6) +
geom_text(aes(label = round(events_per_customer, 1)),
vjust = -0.5, size = 3) +
scale_fill_manual(values = c("#ffffcc", "#78c679", "#1a7a3c")) +
facet_wrap(~ customer_canton) +
labs(x = NULL, y = "Service events per customer") +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
axis.text.y = element_text(size = 8),
strip.text = element_text(size = 10, face = "bold"),
panel.spacing = unit(1, "lines")
)
What we should pay attention to in this graph:
Jura showing 3 events in just Purchase model: could be one large customer skewing the number, or a data quality issue
Pay-Per-Use and Rental contracts are still concentrated in larger cantons, they haven’t penetrated rural or smaller cantons yet -> A business opportunity.
If failure rates are consistent everywhere, there’s no operational risk in expanding PPU nationally — the service burden per customer won’t increase.
As discussed last call, I have completed the dot density graph with the following code
# There are some lat and lon values swapped, so we have to make sure they are correct
join2 <- join %>%
filter_out(customer_address_lat < 40)
weird <- join %>%
filter(customer_address_lat < 40)
weird <- rename(weird,
"customer_address_lat" = "customer_address_lon",
"customer_address_lon" = "customer_address_lat"
)
weird <- weird %>%
relocate(customer_address_lat, .before = customer_address_lon)
join2 <- bind_rows(join2, weird)
That was a prepocessing step, now we can move on to making the graph
#After this, we can make the density map
density <- st_as_sf(join2, coords = c("customer_address_lon", "customer_address_lat"), crs = 4269)
#This map is interactive
mapview(density)
#This one is an image
ggplot(density) +
geom_point(
data = join2, aes(x = customer_address_lon, y = customer_address_lat),
fill = "#ffffcc", size = 2, pch=21, color = "black"
) +
geom_sf(data = swiss_cantons, fill = "transparent", color = "black", linewidth = 0.6)+
geom_sf(
data = density, fill = "transparent",
color = "transparent", linewidth = .6
) +
labs(
title = "Dot Density Map of Service Events",
subtitle = "2020-2023"
)+
theme_void()+
theme(
plot.title = element_text(size = 13, face = "bold"),
plot.subtitle = element_text(size = 10, color = "gray40")
)