#Libaries to be used
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
library(tibble)
library(paletteer)
library(forcats)
library(giscoR)
library(sf)
library(ggforce)
library(classInt)
library(cartogram)
library(showtext)
library(dots)
library(mapview)Washing Machine Project
Tracking progress
Pre-processing of the data
#data import
df_2020 <- read_delim("group2_df_2020.csv")
df_2021 <- read_delim("group2_df_2021.csv")
df_2022 <- read_delim("group2_df_2022.csv")
df_2023 <- read_delim("group2_df_2023.csv")
#datapoint_time is a character in df_2021 --> into date
df_2021$datapoint_time <- as.Date(df_2021$datapoint_time, format = "%d.%m.%Y")
df_2021$machine_commissioning <- as.Date(df_2021$machine_commissioning, format = "%d.%m.%Y")# Collect column types for all datasets
types <- rbind(
df_2020 = sapply(df_2020, class),
df_2021 = sapply(df_2021, class),
df_2022 = sapply(df_2022, class),
df_2023 = sapply(df_2023, class)
)
#Check if types are consistent across datasets
all_equal_types <- all(apply(types, 2, function(col) length(unique(col)) == 1))#check if all the variables are the same
identical(names(df_2020), names(df_2021))[1] TRUE
identical(names(df_2020), names(df_2022))[1] TRUE
identical(names(df_2020), names(df_2023))[1] TRUE
#ensure that all collums are in order
df_2021 <- df_2021[names(df_2020)]
df_2022 <- df_2022[names(df_2020)]
df_2023 <- df_2023[names(df_2020)]#combining all data
all_data <- bind_rows(df_2020, df_2021, df_2022, df_2023)
#check if all data was merged
nrow(all_data)[1] 1755
nrow(df_2020) + nrow(df_2021) + nrow(df_2022) + nrow(df_2023)[1] 1755
#check for missing values (NA)
anyNA(all_data)[1] TRUE
#how many missing values (NA)
sum(is.na(all_data))[1] 19
#where are the missing values
colSums(is.na(all_data)) datapoint_id datapoint_time service_event_type
0 0 0
customer_warranty_status repair_type machine_id
0 0 0
machine_commissioning customer_id customer_name
19 0 0
customer_household_type customer_address customer_canton
0 0 0
customer_address_lat customer_address_lon customer_business_model
0 0 0
customer_cycles_amount machine_program_60degrees machine_program_40degrees
0 0 0
machine_program_eco
0
#get missing data form machine id
all_data <- all_data %>%
mutate(
machine_commissioning = if_else(
is.na(machine_commissioning),
as.Date(substr(as.character(machine_id), 1, 8), "%Y%m%d"),
machine_commissioning
)
)#are the data enteries unique
sum(duplicated(all_data))[1] 82
#removing duplicates
all_data <- distinct(all_data)#are incident id unique?
sum(duplicated(all_data$datapoint_id))[1] 1
#There is 1 duplicated ID here
dup_id <- all_data$datapoint_id[duplicated(all_data$datapoint_id)][1]
#ID 202206293560
all_data[all_data$datapoint_id == dup_id, ]# A tibble: 2 × 19
datapoint_id datapoint_time service_event_type customer_warranty_sta…¹
<dbl> <date> <chr> <chr>
1 202206293560 2023-01-29 Failure Detergent Feedpump Full Warranty
2 202206293560 2023-01-29 Failure Detergent Feedpump Full Warranty
# ℹ abbreviated name: ¹customer_warranty_status
# ℹ 15 more variables: repair_type <chr>, machine_id <dbl>,
# machine_commissioning <date>, customer_id <chr>, customer_name <chr>,
# customer_household_type <chr>, customer_address <chr>,
# customer_canton <chr>, customer_address_lat <dbl>,
# customer_address_lon <dbl>, customer_business_model <chr>,
# customer_cycles_amount <dbl>, machine_program_60degrees <dbl>, …
print(all_data[all_data$datapoint_id == dup_id, ], width = Inf)# A tibble: 2 × 19
datapoint_id datapoint_time service_event_type
<dbl> <date> <chr>
1 202206293560 2023-01-29 Failure Detergent Feedpump
2 202206293560 2023-01-29 Failure Detergent Feedpump
customer_warranty_status repair_type
<chr> <chr>
1 Full Warranty No, Machine is replaced and old machine is disposed
2 Full Warranty No, Machine is replaced and old machine is disposed
machine_id machine_commissioning customer_id
<dbl> <date> <chr>
1 199901219994 1999-01-21 37828698-da33-4168-9d5d-07715ba7a55f
2 199901219994 1999-01-21 37828698-da33-4168-9d5d-07715ba7a55f
customer_name customer_household_type
<chr> <chr>
1 Manuel Kost Shared Multi Family
2 Manuel Kost Shared Multi Family
customer_address customer_canton
<chr> <chr>
1 16a, Schaufelbergerstrasse Wiedikon, Zurich 8063 Zurich
2 16a, Schaufelbergerstrasse Wiedikon, Zurich 8063 Zurich
customer_address_lat customer_address_lon customer_business_model
<dbl> <dbl> <chr>
1 8.50 47.4 Pay-Per-Use
2 47.4 8.50 Pay-Per-Use
customer_cycles_amount machine_program_60degrees machine_program_40degrees
<dbl> <dbl> <dbl>
1 15441. 0.196 0.76
2 15441. 0.196 0.76
machine_program_eco
<dbl>
1 0.044
2 0.044
#In the case of this ID, the lats and longs are swapped#remove duplicated id where lat & long are swapped
all_data <- all_data %>%
filter(
!(datapoint_id == dup_id &
customer_address_lat < 20)
)#are more lat & long swapped?
all_data2 <- all_data %>%
filter_out(customer_address_lat < 20)
swapped <- all_data %>%
filter(customer_address_lat < 20)
swapped <- rename(swapped,
"customer_address_lat" = "customer_address_lon",
"customer_address_lon" = "customer_address_lat"
)
swapped <- swapped %>%
relocate(customer_address_lat, .before = customer_address_lon)
all_data <- bind_rows(all_data2, swapped)
all_data <- all_data %>%
relocate(customer_address_lon, .before = customer_address_lat)
sum(all_data$customer_address_lat < 20 & all_data$customer_address_lon > 20)[1] 0
#datapoint_id vs. datapoint_time
all_data %>%
mutate(
id_date = as.Date(substr(as.character(datapoint_id), 1, 8), "%Y%m%d")
) %>%
filter(id_date != datapoint_time)# A tibble: 172 × 20
datapoint_id datapoint_time service_event_type customer_warranty_st…¹
<dbl> <date> <chr> <chr>
1 202108057413 2020-01-22 Drain Clogged Full Warranty
2 202101187577 2020-03-11 Bearing Failure Full Warranty
3 202307244818 2020-04-15 Failure Detergent Feedpump No Warranty
4 202107314841 2020-07-22 Other User Damage Full Warranty
5 202010259894 2020-08-01 Bearing Failure No Warranty
6 202011304284 2020-11-08 Drain Clogged No Warranty
7 202111303762 2020-11-11 Other User Damage No Warranty
8 202007213322 2020-11-20 Motor Fault Full Warranty
9 202001218717 2021-01-16 Other User Damage No Warranty
10 202009257681 2021-02-13 Drain Clogged Full Warranty
# ℹ 162 more rows
# ℹ abbreviated name: ¹customer_warranty_status
# ℹ 16 more variables: repair_type <chr>, machine_id <dbl>,
# machine_commissioning <date>, customer_id <chr>, customer_name <chr>,
# customer_household_type <chr>, customer_address <chr>,
# customer_canton <chr>, customer_address_lon <dbl>,
# customer_address_lat <dbl>, customer_business_model <chr>, …
all_data %>%
mutate(
id_date = as.Date(substr(as.character(datapoint_id), 1, 8), "%Y%m%d"),
diff_days = as.numeric(datapoint_time - id_date)
) %>%
count(diff_days)# A tibble: 169 × 2
diff_days n
<dbl> <int>
1 -1195 1
2 -876 1
3 -860 1
4 -793 1
5 -721 1
6 -706 1
7 -674 1
8 -665 1
9 -664 1
10 -639 1
# ℹ 159 more rows
#Although the datapoint_id was described as containing a date component, comparison with datapoint_time revealed substantial inconsistencies. Therefore, the encoded date was not used for further analysis, and datapoint_time was treated as the reliable temporal variable.#machine_id vs. machine commissioning
all_data %>%
mutate(id_date = as.Date(substr(as.character(machine_id), 1, 8), "%Y%m%d")) %>%
filter(id_date != machine_commissioning)# A tibble: 0 × 20
# ℹ 20 variables: datapoint_id <dbl>, datapoint_time <date>,
# service_event_type <chr>, customer_warranty_status <chr>,
# repair_type <chr>, machine_id <dbl>, machine_commissioning <date>,
# customer_id <chr>, customer_name <chr>, customer_household_type <chr>,
# customer_address <chr>, customer_canton <chr>, customer_address_lon <dbl>,
# customer_address_lat <dbl>, customer_business_model <chr>,
# customer_cycles_amount <dbl>, machine_program_60degrees <dbl>, …
#Test: time relationship commissioning vs. service date --> good
sum(all_data$machine_commissioning > all_data$datapoint_time, na.rm = TRUE)[1] 0
#do percentages sum up to 100? --> yes
all_data %>%
mutate(total = machine_program_60degrees +
machine_program_40degrees +
machine_program_eco) %>%
summary() datapoint_id datapoint_time service_event_type
Min. :2.020e+11 Min. :2020-01-02 Length:1672
1st Qu.:2.020e+11 1st Qu.:2021-02-02 Class :character
Median :2.021e+11 Median :2022-02-11 Mode :character
Mean :2.022e+11 Mean :2022-02-04
3rd Qu.:2.023e+11 3rd Qu.:2023-02-16
Max. :2.023e+11 Max. :2023-12-31
customer_warranty_status repair_type machine_id
Length:1672 Length:1672 Min. :1.966e+11
Class :character Class :character 1st Qu.:2.006e+11
Mode :character Mode :character Median :2.011e+11
Mean :2.010e+11
3rd Qu.:2.015e+11
Max. :2.022e+11
machine_commissioning customer_id customer_name
Min. :1966-12-30 Length:1672 Length:1672
1st Qu.:2006-12-12 Class :character Class :character
Median :2011-10-04 Mode :character Mode :character
Mean :2010-03-16
3rd Qu.:2015-05-22
Max. :2022-05-26
customer_household_type customer_address customer_canton
Length:1672 Length:1672 Length:1672
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
customer_address_lon customer_address_lat customer_business_model
Min. :5.974 Min. :46.04 Length:1672
1st Qu.:6.258 1st Qu.:46.34 Class :character
Median :8.482 Median :47.26 Mode :character
Mean :7.860 Mean :46.99
3rd Qu.:8.650 3rd Qu.:47.36
Max. :9.579 Max. :47.64
customer_cycles_amount machine_program_60degrees machine_program_40degrees
Min. : 1000 Min. :0.1420 Min. :0.1400
1st Qu.: 2358 1st Qu.:0.1840 1st Qu.:0.2080
Median : 3708 Median :0.1980 Median :0.7100
Mean : 5411 Mean :0.1984 Mean :0.5165
3rd Qu.: 7744 3rd Qu.:0.2120 3rd Qu.:0.7500
Max. :16246 Max. :0.2620 Max. :0.8120
machine_program_eco total
Min. :0.0220 Min. :1
1st Qu.:0.0500 1st Qu.:1
Median :0.0960 Median :1
Mean :0.2851 Mean :1
3rd Qu.:0.5900 3rd Qu.:1
Max. :0.6720 Max. :1
#check for impossible numeric values
summary(all_data$customer_cycles_amount) Min. 1st Qu. Median Mean 3rd Qu. Max.
1000 2358 3708 5411 7744 16246
#check for typos
unique(all_data$service_event_type)[1] "Other User Damage" "Failure Detergent Feedpump"
[3] "Bearing Failure" "Motor Fault"
[5] "Drain Clogged"
unique(all_data$repair_type)[1] "Yes, at location"
[2] "Yes, at factory"
[3] "No, Machine is replaced and old machine is disposed"
[4] "No, Machine is disposed"
unique(all_data$customer_business_model)[1] "Purchase" "Rental" "Pay-Per-Use"
1. Machine lifetime & intensity
Key variables:
machine_commissioning
datapoint_time
customer_cycles_amount
What to analyse:
- Machine age at service
→ datapoint_time – commissioning date
- Usage intensity:
→ cycles per year / per month
Calculation of machine age
Here I created a new data frame just to test it out if the code works. At the end I made a new column in the original all_dataed data frame.
machine.age <- all_data %>%
select(datapoint_time,machine_commissioning, machine_id) %>%
group_by(machine_id)
machine.age$dt <- as.vector(machine.age$datapoint_time)
machine.age$mt <- as.vector(machine.age$machine_commissioning)
machine.age <- machine.age %>%
group_by(machine_id) %>%
mutate(age_year = round((dt - mt)/365.2425), na.rm = TRUE)
all_data$age_year <- machine.age$age_year
library(knitr)
kable(machine.age[1:5,], caption = "New data frame to calculate age in years")| datapoint_time | machine_commissioning | machine_id | dt | mt | age_year | na.rm |
|---|---|---|---|---|---|---|
| 2020-01-02 | 2014-06-20 | 201406209290 | 18263 | 16241 | 6 | TRUE |
| 2020-01-02 | 2007-09-14 | 200709147860 | 18263 | 13770 | 12 | TRUE |
| 2020-01-03 | 2007-06-17 | 200706173882 | 18264 | 13681 | 13 | TRUE |
| 2020-01-06 | 2012-01-28 | 201201287469 | 18267 | 15367 | 8 | TRUE |
| 2020-01-07 | 2009-03-20 | 200903202799 | 18268 | 14323 | 11 | TRUE |
Interesting questions:
Do failures depend on age or usage?
Are machines failing early or only after heavy use?
Grouped bar chart with an overlaid line
For the next part I made age intervals to more easily see the differences between machine of different use time.
I also converted the service type in order to visualize it later.
all_data <- all_data %>%
mutate(age_range = cut(age_year, c(0, 5, 10, 15, 20, 25, 100), labels = c("0-5", "5-10", "10-15", "15-20", "20-25", ">25")))
all_data$service_event_type <- as.factor(all_data$service_event_type)I made separate data frames for both so it’s easier and error-free on the original data
bars_age_error <- all_data %>%
select(service_event_type, age_range) %>%
count(age_range, service_event_type)
line_age_cycle <- all_data %>%
group_by(age_range) %>%
summarise(mean(customer_cycles_amount), .groups = "drop") %>%
rename(
"mean_cycle" = 'mean(customer_cycles_amount)'
)
scale_factor <- max(bars_age_error$n) / max(line_age_cycle$mean_cycle) #This is to illustrate the second y-axis later on in the bar chartggplot() +
# bars
geom_bar(
data = bars_age_error,
aes(x = age_range, y = n, fill = service_event_type),
stat = "identity", position = "dodge",
color = "black", linewidth = 0.3
) +
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
# line
geom_line(
data = line_age_cycle,
aes(x = as.numeric(age_range), y = mean_cycle * scale_factor),
color = "red", linewidth = 1, group = 1
) +
geom_point(
data = line_age_cycle,
aes(x = as.numeric(age_range), y = mean_cycle * scale_factor),
color = "red", size = 3
) +
# dual y-axis
scale_y_continuous(
name = "Number of service events",
sec.axis = sec_axis(
transform = ~ . / scale_factor,
name = "Average total number of cycles"
)
) +
scale_x_discrete(name = "Machine age (years)") +
labs(
title = "Number of service events and average cycles per month by age range",
fill = NULL) +
theme_classic() +
theme(
legend.position = "bottom",
legend.direction = "horizontal",
axis.text = element_text(size = 10),
axis.title = element_text(size = 10),
axis.title.y.right = element_text(color = "red"),
axis.text.y.right = element_text(color = "red")
)Question — cycles vs machine age A simple linear regression — do more cycles predict earlier breakdown?
model <- lm(machine_age_years ~ customer_cycles_amount +
customer_household_type +
customer_business_model,
data = all_data %>%
mutate(machine_age_years = as.numeric(difftime(
datapoint_time, machine_commissioning, units = "days")) / 365))
summary(model)
Call:
lm(formula = machine_age_years ~ customer_cycles_amount + customer_household_type +
customer_business_model, data = all_data %>% mutate(machine_age_years = as.numeric(difftime(datapoint_time,
machine_commissioning, units = "days"))/365))
Residuals:
Min 1Q Median 3Q Max
-8.866 -4.074 -0.544 2.160 36.640
Coefficients:
Estimate Std. Error t value
(Intercept) 9.425e+00 5.275e-01 17.867
customer_cycles_amount 1.425e-03 5.014e-05 28.422
customer_household_typeShared Multi Family -1.140e+01 5.320e-01 -21.425
customer_household_typeSingle User Household 1.759e+00 4.791e-01 3.672
customer_business_modelPurchase -7.675e-01 4.119e-01 -1.863
customer_business_modelRental -1.134e+00 5.861e-01 -1.935
Pr(>|t|)
(Intercept) < 2e-16 ***
customer_cycles_amount < 2e-16 ***
customer_household_typeShared Multi Family < 2e-16 ***
customer_household_typeSingle User Household 0.000248 ***
customer_business_modelPurchase 0.062589 .
customer_business_modelRental 0.053213 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.988 on 1666 degrees of freedom
Multiple R-squared: 0.3882, Adjusted R-squared: 0.3864
F-statistic: 211.4 on 5 and 1666 DF, p-value: < 2.2e-16
2. Failure type, warranty status, and repair type
This relationship can be showcased through a heatmap
# Making Heatmap to show relationship between error case, warranty status, and repair outcome
pcd <- all_data %>%
select(customer_warranty_status,repair_type, service_event_type, age_range, machine_id)
pcd$customer_warranty_status <- as.factor(pcd$customer_warranty_status)
pcd$repair_type <- as.factor(pcd$repair_type)
min(pcd$cycle_per_month)[1] Inf
# Making a separate data frame for the heatmap.
heatmap_df <- pcd %>%
group_by(service_event_type, repair_type, customer_warranty_status) %>%
summarise(count = n(), .groups = "drop") %>%
complete(service_event_type, repair_type, customer_warranty_status,
fill = list(count = 0))
# Plot
ggplot(heatmap_df, aes(x = repair_type, y = service_event_type, fill = count)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = ifelse(count == 0, "0", as.character(count))),
size = 3.5, color = "white") +
scale_fill_gradient(low = "#A4BED5FF", high = "#023743FF") +
facet_wrap(~ customer_warranty_status) + # <-- this is to have 2 maps for 2 warranty statuses
labs(
x = "Repair Type",
y = "Failure Type",
fill = "Service events"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.text.y = element_text(size = 10),
axis.title = element_text(size = 10),
panel.grid = element_blank(),
legend.position = "right",
strip.text = element_text(size = 10, face = "bold") # styles the facet titles
)Some interesting questions we could investigate from this map:
Which failure types consistently lead to disposal rather than repair?
Preventative repair? Where? Which part?
Statistical tests
Question — business model vs repair type
A chi-square test — does business model significantly affect what kind of repair is needed?
Chi-square — “do these two categories relate to each other?”
Imagine you have two jars of marbles. One jar is business models (Purchase, Rental, PPU) and the other is repair types (factory, on-site, disposal). The chi-square test asks: “does the type of marble you pick from jar 1 affect what you get from jar 2?”
We chose it here because both variables are categories — you can’t calculate an average of “Rental” or “disposal”, you can only count them. Chi-square is specifically designed for counting categories against each other.
chisq.test(table(all_data$customer_business_model, all_data$repair_type))
Pearson's Chi-squared test
data: table(all_data$customer_business_model, all_data$repair_type)
X-squared = 528.15, df = 6, p-value < 2.2e-16
3. Machine use and Business Model
Do Rental and Pay-Per-Use machines get better care than Purchased ones?
Box plot: total cycles at time of service by BM — which model drives more machine use
bm_cycle <- all_data %>%
select(customer_business_model, repair_type, customer_cycles_amount, machine_program_60degrees,machine_program_40degrees,machine_program_eco)
ggplot(bm_cycle, aes(x = customer_business_model, y = customer_cycles_amount, fill = customer_business_model))+
geom_boxplot(colour = "black", median.color = "red")+
theme_classic()+
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10))+
labs(
fill = NULL,
x = "Business Model",
y = "Total cycle amount"
)4.Household type and cycles
Question — does household type predict cycles? One-way ANOVA — do flatshares genuinely use machines more than single households?
aov_result <- aov(customer_cycles_amount ~ customer_household_type, data = all_data)
summary(aov_result) Df Sum Sq Mean Sq F value Pr(>F)
customer_household_type 2 1.190e+10 5.948e+09 695.9 <2e-16 ***
Residuals 1669 1.426e+10 8.547e+06
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ANOVA — “are these group averages actually different from each other?”
Imagine three groups of kids — singles, flatshares, and multi-family — and you measure how many times each group uses the washing machine. ANOVA asks: “are these averages genuinely different, or could the difference just be random luck?”
We chose it here because you have one number (cycles) being compared across multiple groups (household types). It’s essentially a chi-square but for when one of your variables is a number rather than a category.
ggplot(all_data, aes(machine_program_60degrees, customer_cycles_amount, colour = customer_cycles_amount)) +
geom_point()+
scale_color_gradient(low = "blue", high = "red")+
geom_smooth(method = "lm")`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation:
colour.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
ggplot(all_data, aes(machine_program_eco, customer_cycles_amount, colour = customer_cycles_amount)) +
geom_point()+
scale_color_gradient(low = "blue", high = "red")+
geom_smooth(method = "lm")`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation:
colour.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
all_data %>%
mutate(
machine_age_years = as.numeric(difftime(datapoint_time,
machine_commissioning, units = "days")) / 365,
eco_user = ifelse(machine_program_eco > 0.33, "High eco usage",
"Low eco usage")
) %>%
ggplot(aes(x = eco_user, y = machine_age_years, fill = eco_user)) +
geom_boxplot(color = "black", linewidth = 0.3) +
scale_fill_manual(values = c("#ffffcc", "#1a7a3c")) +
labs(x = NULL, y = "Machine age at service (years)") +
theme_classic() +
theme(legend.position = "none")# --- split into two groups
high_eco <- all_data %>%
mutate(machine_age_years = as.numeric(difftime(datapoint_time,
machine_commissioning, units = "days")) / 365) %>%
filter(machine_program_eco > 0.33) %>%
pull(machine_age_years)
low_eco <- all_data %>%
mutate(machine_age_years = as.numeric(difftime(datapoint_time,
machine_commissioning, units = "days")) / 365) %>%
filter(machine_program_eco <= 0.33) %>%
pull(machine_age_years)
# --- t-test
t.test(high_eco, low_eco)
Welch Two Sample t-test
data: high_eco and low_eco
t = 8.1013, df = 1066.1, p-value = 1.479e-15
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
2.477448 4.061149
sample estimates:
mean of x mean of y
13.91612 10.64682
#Does eco program usage correlate with fewer breakdowns? (Regression)
model_eco <- lm(customer_cycles_amount ~ machine_program_eco +
machine_program_60degrees,
data = all_data)
summary(model_eco)
Call:
lm(formula = customer_cycles_amount ~ machine_program_eco + machine_program_60degrees,
data = all_data)
Residuals:
Min 1Q Median 3Q Max
-6745.9 -1627.4 -51.6 1445.6 8591.1
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7627.3 808.0 9.439 <2e-16 ***
machine_program_eco -9758.9 294.7 -33.111 <2e-16 ***
machine_program_60degrees 2853.2 4032.3 0.708 0.479
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3075 on 1669 degrees of freedom
Multiple R-squared: 0.3966, Adjusted R-squared: 0.3959
F-statistic: 548.5 on 2 and 1669 DF, p-value: < 2.2e-16
all_data %>%
mutate(machine_age_years = as.numeric(difftime(datapoint_time,
machine_commissioning, units = "days")) / 365) %>%
summarise(correlation = cor(machine_program_eco, machine_age_years,
use = "complete.obs"))# A tibble: 1 × 1
correlation
<dbl>
1 0.241
all_data %>%
group_by(customer_household_type) %>%
summarise(
eco = mean(machine_program_eco),
deg40 = mean(machine_program_40degrees),
deg60 = mean(machine_program_60degrees)
) %>%
pivot_longer(-customer_household_type,
names_to = "program", values_to = "pct") %>%
ggplot(aes(x = customer_household_type, y = pct, fill = program)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
labs(x = NULL, y = "Program usage share")###Bar chart: service_event_type frequency by customer_household_type
all_data %>%
group_by(customer_household_type) %>%
count(customer_household_type, customer_business_model) %>%
ggplot(aes(customer_household_type, n, fill = customer_business_model))+
geom_bar(stat = "identity", position = "stack",
color = "black", linewidth = 0.3)+
theme_classic()+
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
labs(
x = "Household type",
y = "Total service events",
fill = "Business model"
)5. Returning customers
returning_customers <- all_data %>%
group_by(customer_id) %>%
filter(n() > 1) %>%
ungroup() %>%
arrange(customer_id, datapoint_time)
returning_customers %>%
group_by(customer_id) %>%
summarise(
total_events = n(),
first_service = min(datapoint_time),
last_service = max(datapoint_time),
business_model = first(customer_business_model),
canton = first(customer_canton)
) %>%
arrange(desc(total_events))# A tibble: 172 × 6
customer_id total_events first_service last_service business_model canton
<chr> <int> <date> <date> <chr> <chr>
1 00431f6b-6563-… 2 2020-08-07 2022-09-29 Rental Solot…
2 014d5176-463e-… 2 2021-03-09 2023-01-09 Purchase Geneva
3 04b3fb95-439f-… 2 2020-07-23 2023-04-08 Purchase Zurich
4 06945565-3612-… 2 2020-09-08 2022-11-28 Pay-Per-Use Geneva
5 0708ddcb-49d4-… 2 2022-11-17 2023-01-14 Purchase Bern
6 0912c2cd-e487-… 2 2021-04-30 2023-08-23 Rental Aargau
7 0b4bb0ec-5ad6-… 2 2022-04-13 2023-06-25 Purchase Zurich
8 0c252a9f-f3a7-… 2 2020-11-18 2022-10-20 Pay-Per-Use Geneva
9 0de1e871-2a5c-… 2 2021-06-18 2023-12-14 Purchase Schwyz
10 0f28189c-a0ce-… 2 2022-03-23 2022-04-26 Pay-Per-Use St. G…
# ℹ 162 more rows
A histogram to show the distribution of how long customers stay in the service loop. Are most returning customers coming back within a year, or are they spanning multiple years?
returning_summary <- returning_customers %>%
group_by(customer_id) %>%
summarise(
total_events = n(),
first_service = min(datapoint_time),
last_service = max(datapoint_time),
days_span = as.numeric(difftime(max(datapoint_time),
min(datapoint_time), units = "days")),
business_model = first(customer_business_model)
)
ggplot(returning_summary, aes(x = days_span / 365)) +
geom_histogram(binwidth = 0.5, fill = "#1a7a3c",
color = "white", linewidth = 0.3) +
labs(x = "Years between first and last service event",
y = "Number of customers") +
theme_classic()error_recurrence <- returning_customers %>%
group_by(customer_id, service_event_type) %>%
summarise(occurrences = n(), .groups = "drop") %>%
filter(occurrences > 1)
# --- Step 2: count how many customers experience each recurring error
error_recurrence_summary <- error_recurrence %>%
group_by(service_event_type) %>%
summarise(
customers_affected = n()
) %>%
arrange(desc(customers_affected))
# --- Step 3: plot
ggplot(error_recurrence_summary,
aes(x = reorder(service_event_type, customers_affected),
y = customers_affected)) +
geom_bar(stat = "identity", color = "black", linewidth = 0.3, fill = "#1a7a3c") +
geom_text(aes(label = customers_affected),
hjust = -0.3, size = 3.5) +
coord_flip() +
labs(
x = NULL,
y = "Number of customers experiencing recurring error"
) +
scale_fill_paletteer_d("nationalparkcolors::Acadia")+
theme_classic() +
theme(
axis.text = element_text(size = 10),
axis.title = element_text(size = 11)
)6. Service events by canton
Choropleth map
#Switzerland cantons from GISCO (Eurostat). This method requires the giscoR and sf packages!
#So do install.packages and then library first
swiss_cantons <- gisco_get_nuts(
country = "CH",
nuts_level = 3, #This is canton level
year = "2021",
resolution = "03"
)After doing this, it’s good to check the names in the swiss_cantons data, and then our data. Cause there can be mismatches such as umlauts or different spellings.
# Check names
unique(swiss_cantons$NAME_LATN)
unique(all_data$customer_canton)# Fix canton names & aggregate data
canton_df <- all_data %>%
mutate(customer_canton = recode(customer_canton,
"Zurich" = "Zürich",
"Geneva" = "Genève",
"Lucerne" = "Luzern",
"Fribourg" = "Freiburg",
"Wallis" = "Valais",
"Basel-City" = "Basel-Stadt"
)) %>%
group_by(customer_canton) %>%
summarise(service_events = n(), .groups = "drop")
# Make abbreviations and all_data data
swiss_map <- swiss_cantons %>%
left_join(canton_df, by = c("NAME_LATN" = "customer_canton"))
abbrev_lookup <- data.frame(
NUTS_ID = c("CH011","CH012","CH013","CH021","CH022","CH023","CH024",
"CH025","CH031","CH032","CH033","CH040","CH051","CH052",
"CH053","CH054","CH055","CH056","CH057","CH061","CH062",
"CH063","CH064","CH065","CH066","CH070"),
abbr = c("VD","VS","GE","BE","FR","SO","NE",
"JU","BS","BL","AG","ZH","GL","SH",
"AR","AI","SG","GR","TG","LU","UR",
"SZ","OW","NW","ZG","TI")
)
swiss_map <- swiss_map %>%
left_join(abbrev_lookup, by = "NUTS_ID")
# --- Step 5: plot
ggplot(swiss_map) +
geom_sf(aes(fill = service_events), color = "white", linewidth = 0.4) +
geom_sf_text(aes(label = abbr), size = 2.5, color = "gray20") +
scale_fill_gradient(low = "#ffffcc", high = "#1a7a3c",
na.value = "grey90",
name = "Service events") +
labs(title = "Service Events by Canton", subtitle = "2020–2023") +
theme_void() +
theme(
legend.position = "right",
plot.title = element_text(size = 13, face = "bold"),
plot.subtitle = element_text(size = 10, color = "gray40")
)At this point it is good to cross reference and see if the high number of service events mean the machines here tend to fail more of if it’s purely due to customer volume
# customers per canton per business model
customers_per_canton <- all_data %>%
mutate(customer_canton = recode(customer_canton,
"Zurich" = "Zürich",
"Geneva" = "Genève",
"Lucerne" = "Luzern",
"Fribourg" = "Freiburg",
"Wallis" = "Valais",
"Basel-City" = "Basel-Stadt"
)) %>%
group_by(customer_canton, customer_business_model) %>%
summarise(
service_events = n(),
unique_customers = n_distinct(customer_id),
.groups = "drop"
) %>%
mutate(events_per_customer = service_events / unique_customers)
#rank cantons by total service events
canton_order <- customers_per_canton %>%
group_by(customer_canton) %>%
summarise(total = sum(service_events)) %>%
arrange(desc(total)) %>%
pull(customer_canton)
customers_per_canton <- customers_per_canton %>%
mutate(customer_canton = factor(customer_canton, levels = canton_order))
###Plot this data
ggplot(customers_per_canton,
aes(x = customer_business_model,
y = events_per_customer,
fill = customer_business_model)) +
geom_bar(stat = "identity", color = "black", linewidth = 0.3, width = 0.6) +
geom_text(aes(label = round(events_per_customer, 1)),
vjust = -0.5, size = 3) +
scale_fill_manual(values = c("#ffffcc", "#78c679", "#1a7a3c")) +
facet_wrap(~ customer_canton) +
labs(x = NULL, y = "Service events per customer") +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
axis.text.y = element_text(size = 8),
strip.text = element_text(size = 10, face = "bold"),
panel.spacing = unit(1, "lines")
)What we should pay attention to in this graph:
Jura showing 2 events in just Purchase model: 1 returning customer in whole Jura
Pay-Per-Use and Rental contracts are still concentrated in larger cantons, they haven’t penetrated rural or smaller cantons yet -> A business opportunity.
If failure rates are consistent everywhere, there’s no operational risk in expanding PPU nationally — the service burden per customer won’t increase.
- Failure rate is consistent across cantons and business models, the problem isn’t geographic. This points toward product-level interventions — better components, predictive maintenance, or design improvements
Dot Density Graph
test <- all_data %>%
select(customer_address_lon, customer_address_lat)
test2 <- st_as_sf(test, coords = c("customer_address_lon", "customer_address_lat"), crs = 4326)
test2 <- st_transform(test2, 21781)
library(rstudioapi)
library(magrittr) # pipes
Attache Paket: 'magrittr'
Das folgende Objekt ist maskiert 'package:tidyr':
extract
library(lintr) # code linting
library(sf) # spatial data handling
library(raster) # raster handling (needed for relief)Lade nötiges Paket: sp
Attache Paket: 'raster'
Das folgende Objekt ist maskiert 'package:dplyr':
select
library(viridis) # viridis color scaleLade nötiges Paket: viridisLite
library(cowplot) # stack ggplots
Attache Paket: 'cowplot'
Das folgende Objekt ist maskiert 'package:lubridate':
stamp
canton_geo <- read_sf("g2k15.shp")
# read country borders
country_geo <- read_sf("g2l15.shp")
# read lakes
lake_geo <- read_sf("g2s15.shp")
# read productive area (2324 municipalities)
municipality_prod_geo <- read_sf("gde-1-1-15.shp")
# read in raster of relief
relief <- raster("02-relief-ascii.asc") %>%
# hide relief outside of Switzerland by masking with country borders
mask(country_geo) %>%
as("SpatialPixelsDataFrame") %>%
as.data.frame() %>%
rename(value = `X02.relief.ascii`)
# clean up
rm(country_geo)
#Join Geodata with Thematic Data
municipality_prod_geo %<>%
st_join(test2, by = c("BFS_ID" = "bfs_id"))
class(municipality_prod_geo)[1] "sf" "tbl_df" "tbl" "data.frame"
#Create a theme map
theme_map <- function(...) {
theme_minimal() +
theme(
text = element_text(family = default_font_family,
color = default_font_color),
# remove all axes
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
# add a subtle grid
panel.grid.major = element_line(color = "#dbdbd9", size = 0.2),
panel.grid.minor = element_blank(),
# background colors
plot.background = element_rect(fill = default_background_color,
color = NA),
panel.background = element_rect(fill = default_background_color,
color = NA),
legend.background = element_rect(fill = default_background_color,
color = NA),
# borders and margins
plot.margin = unit(c(.5, .5, .2, .5), "cm"),
panel.border = element_blank(),
panel.spacing = unit(c(-.1, 0.2, .2, 0.2), "cm"),
# titles
legend.title = element_text(size = 11),
legend.text = element_text(size = 9, hjust = 0,
color = default_font_color),
plot.title = element_text(size = 15, hjust = 0.5,
color = default_font_color),
plot.subtitle = element_text(size = 10, hjust = 0.5,
color = default_font_color,
margin = margin(b = -0.1,
t = -0.1,
l = 2,
unit = "cm"),
debug = F),
# captions
plot.caption = element_text(size = 7,
hjust = .5,
margin = margin(t = 0.2,
b = 0,
unit = "cm"),
color = "#939184"),
...
)
}
swiss_map <- st_transform(swiss_map, 21781)
#Make the graph
ggplot(
# define main data source
data = swiss_map
) +
geom_sf(aes(fill = service_events))+
# first: draw the relief
geom_raster(
data = relief,
inherit.aes = FALSE,
aes(
x = x,
y = y,
alpha = value
)
) +
# use the "alpha hack" (as the "fill" aesthetic is already taken)
scale_alpha(name = "",
range = c(0.2, 0),
guide = F) + # suppress legend
# add main fill aesthetic
# use thin white stroke for municipality borders
scale_fill_viridis(
option = "magma",
alpha = 0.8, # make fill a bit brighter
begin = 0.1, # this option seems to be new (compared to 2016):
# with this we can truncate the
# color scale, so that extreme colors (very dark and very bright) are not
# used, which makes the map a bit more aesthetic
end = 0.9,
na.value = "grey") +
# use the Viridis color scale
# use thicker white stroke for cantonal borders
geom_sf(
data = canton_geo,
fill = "transparent",
color = "white",
size = 0.5
) +
# draw lakes in light blue
geom_sf(
data = lake_geo,
fill = "#D6F1FF",
color = "transparent"
)+
geom_sf(
data = test2,
fill = "transparent",
color = "red"
)+
geom_sf_text(aes(label = abbr), size = 2, color = "white")+
theme_void()Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.