Washing Machine Project

Tracking progress

Pre-processing of the data

#Libaries to be used
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
library(tibble)
library(paletteer)
library(forcats)
library(giscoR)
library(sf)
library(ggforce)
library(classInt)
library(cartogram)
library(showtext)
library(dots)
library(mapview)
#data import
df_2020 <- read_delim("group2_df_2020.csv") 
df_2021 <- read_delim("group2_df_2021.csv") 
df_2022 <- read_delim("group2_df_2022.csv") 
df_2023 <- read_delim("group2_df_2023.csv") 

#datapoint_time is a character in df_2021 --> into date
df_2021$datapoint_time <- as.Date(df_2021$datapoint_time, format = "%d.%m.%Y")
df_2021$machine_commissioning <- as.Date(df_2021$machine_commissioning, format = "%d.%m.%Y")
# Collect column types for all datasets
types <- rbind(
  df_2020 = sapply(df_2020, class),
  df_2021 = sapply(df_2021, class),
  df_2022 = sapply(df_2022, class),
  df_2023 = sapply(df_2023, class)
)
#Check if types are consistent across datasets
all_equal_types <- all(apply(types, 2, function(col) length(unique(col)) == 1))
#check if all the variables are the same
identical(names(df_2020), names(df_2021))
[1] TRUE
identical(names(df_2020), names(df_2022))
[1] TRUE
identical(names(df_2020), names(df_2023))
[1] TRUE
#ensure that all collums are in order
df_2021 <- df_2021[names(df_2020)]
df_2022 <- df_2022[names(df_2020)]
df_2023 <- df_2023[names(df_2020)]
#combining all data
all_data <- bind_rows(df_2020, df_2021, df_2022, df_2023)

#check if all data was merged
nrow(all_data)
[1] 1755
nrow(df_2020) + nrow(df_2021) + nrow(df_2022) + nrow(df_2023)
[1] 1755
#check for missing values (NA)
anyNA(all_data)
[1] TRUE
#how many missing values (NA)
sum(is.na(all_data))
[1] 19
#where are the missing values
colSums(is.na(all_data))
             datapoint_id            datapoint_time        service_event_type 
                        0                         0                         0 
 customer_warranty_status               repair_type                machine_id 
                        0                         0                         0 
    machine_commissioning               customer_id             customer_name 
                       19                         0                         0 
  customer_household_type          customer_address           customer_canton 
                        0                         0                         0 
     customer_address_lat      customer_address_lon   customer_business_model 
                        0                         0                         0 
   customer_cycles_amount machine_program_60degrees machine_program_40degrees 
                        0                         0                         0 
      machine_program_eco 
                        0 
#get missing data form machine id
all_data <- all_data %>%
  mutate(
    machine_commissioning = if_else(
      is.na(machine_commissioning),
      as.Date(substr(as.character(machine_id), 1, 8), "%Y%m%d"),
      machine_commissioning
    )
  )
#are the data enteries unique
sum(duplicated(all_data))
[1] 82
#removing duplicates
all_data <- distinct(all_data)
#are incident id unique? 
sum(duplicated(all_data$datapoint_id))
[1] 1
#There is 1 duplicated ID here
dup_id <- all_data$datapoint_id[duplicated(all_data$datapoint_id)][1]

#ID 202206293560 

all_data[all_data$datapoint_id == dup_id, ]
# A tibble: 2 × 19
  datapoint_id datapoint_time service_event_type         customer_warranty_sta…¹
         <dbl> <date>         <chr>                      <chr>                  
1 202206293560 2023-01-29     Failure Detergent Feedpump Full Warranty          
2 202206293560 2023-01-29     Failure Detergent Feedpump Full Warranty          
# ℹ abbreviated name: ¹​customer_warranty_status
# ℹ 15 more variables: repair_type <chr>, machine_id <dbl>,
#   machine_commissioning <date>, customer_id <chr>, customer_name <chr>,
#   customer_household_type <chr>, customer_address <chr>,
#   customer_canton <chr>, customer_address_lat <dbl>,
#   customer_address_lon <dbl>, customer_business_model <chr>,
#   customer_cycles_amount <dbl>, machine_program_60degrees <dbl>, …
print(all_data[all_data$datapoint_id == dup_id, ], width = Inf)
# A tibble: 2 × 19
  datapoint_id datapoint_time service_event_type        
         <dbl> <date>         <chr>                     
1 202206293560 2023-01-29     Failure Detergent Feedpump
2 202206293560 2023-01-29     Failure Detergent Feedpump
  customer_warranty_status repair_type                                        
  <chr>                    <chr>                                              
1 Full Warranty            No, Machine is replaced and old machine is disposed
2 Full Warranty            No, Machine is replaced and old machine is disposed
    machine_id machine_commissioning customer_id                         
         <dbl> <date>                <chr>                               
1 199901219994 1999-01-21            37828698-da33-4168-9d5d-07715ba7a55f
2 199901219994 1999-01-21            37828698-da33-4168-9d5d-07715ba7a55f
  customer_name customer_household_type
  <chr>         <chr>                  
1 Manuel Kost   Shared Multi Family    
2 Manuel Kost   Shared Multi Family    
  customer_address                                 customer_canton
  <chr>                                            <chr>          
1 16a, Schaufelbergerstrasse Wiedikon, Zurich 8063 Zurich         
2 16a, Schaufelbergerstrasse Wiedikon, Zurich 8063 Zurich         
  customer_address_lat customer_address_lon customer_business_model
                 <dbl>                <dbl> <chr>                  
1                 8.50                47.4  Pay-Per-Use            
2                47.4                  8.50 Pay-Per-Use            
  customer_cycles_amount machine_program_60degrees machine_program_40degrees
                   <dbl>                     <dbl>                     <dbl>
1                 15441.                     0.196                      0.76
2                 15441.                     0.196                      0.76
  machine_program_eco
                <dbl>
1               0.044
2               0.044
#In the case of this ID, the lats and longs are swapped
#remove duplicated id where lat & long are swapped
all_data <- all_data %>%
  filter(
    !(datapoint_id == dup_id &
        customer_address_lat < 20)
  )
#are more lat & long swapped?

all_data2 <- all_data %>% 
  filter_out(customer_address_lat < 20)

swapped <- all_data %>% 
  filter(customer_address_lat < 20)
swapped <- rename(swapped,
                "customer_address_lat" = "customer_address_lon",
                "customer_address_lon" = "customer_address_lat"
)

swapped <- swapped %>% 
  relocate(customer_address_lat, .before = customer_address_lon)

all_data <- bind_rows(all_data2, swapped)

all_data <- all_data %>% 
  relocate(customer_address_lon, .before = customer_address_lat)

sum(all_data$customer_address_lat < 20 & all_data$customer_address_lon > 20)
[1] 0
#datapoint_id vs. datapoint_time
all_data %>%
  mutate(
    id_date = as.Date(substr(as.character(datapoint_id), 1, 8), "%Y%m%d")
  ) %>%
  filter(id_date != datapoint_time)
# A tibble: 172 × 20
   datapoint_id datapoint_time service_event_type         customer_warranty_st…¹
          <dbl> <date>         <chr>                      <chr>                 
 1 202108057413 2020-01-22     Drain Clogged              Full Warranty         
 2 202101187577 2020-03-11     Bearing Failure            Full Warranty         
 3 202307244818 2020-04-15     Failure Detergent Feedpump No Warranty           
 4 202107314841 2020-07-22     Other User Damage          Full Warranty         
 5 202010259894 2020-08-01     Bearing Failure            No Warranty           
 6 202011304284 2020-11-08     Drain Clogged              No Warranty           
 7 202111303762 2020-11-11     Other User Damage          No Warranty           
 8 202007213322 2020-11-20     Motor Fault                Full Warranty         
 9 202001218717 2021-01-16     Other User Damage          No Warranty           
10 202009257681 2021-02-13     Drain Clogged              Full Warranty         
# ℹ 162 more rows
# ℹ abbreviated name: ¹​customer_warranty_status
# ℹ 16 more variables: repair_type <chr>, machine_id <dbl>,
#   machine_commissioning <date>, customer_id <chr>, customer_name <chr>,
#   customer_household_type <chr>, customer_address <chr>,
#   customer_canton <chr>, customer_address_lon <dbl>,
#   customer_address_lat <dbl>, customer_business_model <chr>, …
all_data %>%
  mutate(
    id_date = as.Date(substr(as.character(datapoint_id), 1, 8), "%Y%m%d"),
    diff_days = as.numeric(datapoint_time - id_date)
  ) %>%
  count(diff_days)
# A tibble: 169 × 2
   diff_days     n
       <dbl> <int>
 1     -1195     1
 2      -876     1
 3      -860     1
 4      -793     1
 5      -721     1
 6      -706     1
 7      -674     1
 8      -665     1
 9      -664     1
10      -639     1
# ℹ 159 more rows
#Although the datapoint_id was described as containing a date component, comparison with datapoint_time revealed substantial inconsistencies. Therefore, the encoded date was not used for further analysis, and datapoint_time was treated as the reliable temporal variable.
#machine_id vs. machine commissioning
all_data %>%
  mutate(id_date = as.Date(substr(as.character(machine_id), 1, 8), "%Y%m%d")) %>%
  filter(id_date != machine_commissioning)
# A tibble: 0 × 20
# ℹ 20 variables: datapoint_id <dbl>, datapoint_time <date>,
#   service_event_type <chr>, customer_warranty_status <chr>,
#   repair_type <chr>, machine_id <dbl>, machine_commissioning <date>,
#   customer_id <chr>, customer_name <chr>, customer_household_type <chr>,
#   customer_address <chr>, customer_canton <chr>, customer_address_lon <dbl>,
#   customer_address_lat <dbl>, customer_business_model <chr>,
#   customer_cycles_amount <dbl>, machine_program_60degrees <dbl>, …
#Test: time relationship commissioning vs. service date --> good
sum(all_data$machine_commissioning > all_data$datapoint_time, na.rm = TRUE)
[1] 0
#do percentages sum up to 100? --> yes
all_data %>%
  mutate(total = machine_program_60degrees +
           machine_program_40degrees +
           machine_program_eco) %>%
  summary()
  datapoint_id       datapoint_time       service_event_type
 Min.   :2.020e+11   Min.   :2020-01-02   Length:1672       
 1st Qu.:2.020e+11   1st Qu.:2021-02-02   Class :character  
 Median :2.021e+11   Median :2022-02-11   Mode  :character  
 Mean   :2.022e+11   Mean   :2022-02-04                     
 3rd Qu.:2.023e+11   3rd Qu.:2023-02-16                     
 Max.   :2.023e+11   Max.   :2023-12-31                     
 customer_warranty_status repair_type          machine_id       
 Length:1672              Length:1672        Min.   :1.966e+11  
 Class :character         Class :character   1st Qu.:2.006e+11  
 Mode  :character         Mode  :character   Median :2.011e+11  
                                             Mean   :2.010e+11  
                                             3rd Qu.:2.015e+11  
                                             Max.   :2.022e+11  
 machine_commissioning customer_id        customer_name     
 Min.   :1966-12-30    Length:1672        Length:1672       
 1st Qu.:2006-12-12    Class :character   Class :character  
 Median :2011-10-04    Mode  :character   Mode  :character  
 Mean   :2010-03-16                                         
 3rd Qu.:2015-05-22                                         
 Max.   :2022-05-26                                         
 customer_household_type customer_address   customer_canton   
 Length:1672             Length:1672        Length:1672       
 Class :character        Class :character   Class :character  
 Mode  :character        Mode  :character   Mode  :character  
                                                              
                                                              
                                                              
 customer_address_lon customer_address_lat customer_business_model
 Min.   :5.974        Min.   :46.04        Length:1672            
 1st Qu.:6.258        1st Qu.:46.34        Class :character       
 Median :8.482        Median :47.26        Mode  :character       
 Mean   :7.860        Mean   :46.99                               
 3rd Qu.:8.650        3rd Qu.:47.36                               
 Max.   :9.579        Max.   :47.64                               
 customer_cycles_amount machine_program_60degrees machine_program_40degrees
 Min.   : 1000          Min.   :0.1420            Min.   :0.1400           
 1st Qu.: 2358          1st Qu.:0.1840            1st Qu.:0.2080           
 Median : 3708          Median :0.1980            Median :0.7100           
 Mean   : 5411          Mean   :0.1984            Mean   :0.5165           
 3rd Qu.: 7744          3rd Qu.:0.2120            3rd Qu.:0.7500           
 Max.   :16246          Max.   :0.2620            Max.   :0.8120           
 machine_program_eco     total  
 Min.   :0.0220      Min.   :1  
 1st Qu.:0.0500      1st Qu.:1  
 Median :0.0960      Median :1  
 Mean   :0.2851      Mean   :1  
 3rd Qu.:0.5900      3rd Qu.:1  
 Max.   :0.6720      Max.   :1  
#check for impossible numeric values
summary(all_data$customer_cycles_amount)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1000    2358    3708    5411    7744   16246 
#check for typos
unique(all_data$service_event_type)
[1] "Other User Damage"          "Failure Detergent Feedpump"
[3] "Bearing Failure"            "Motor Fault"               
[5] "Drain Clogged"             
unique(all_data$repair_type)
[1] "Yes, at location"                                   
[2] "Yes, at factory"                                    
[3] "No, Machine is replaced and old machine is disposed"
[4] "No, Machine is disposed"                            
unique(all_data$customer_business_model)
[1] "Purchase"    "Rental"      "Pay-Per-Use"

1. Machine lifetime & intensity

Key variables:

  • machine_commissioning

  • datapoint_time

  • customer_cycles_amount

What to analyse:

  • Machine age at service

→ datapoint_time – commissioning date

  • Usage intensity:

→ cycles per year / per month

Calculation of machine age

Here I created a new data frame just to test it out if the code works. At the end I made a new column in the original all_dataed data frame.

machine.age <- all_data %>% 
  select(datapoint_time,machine_commissioning, machine_id) %>% 
  group_by(machine_id)

machine.age$dt <- as.vector(machine.age$datapoint_time)
machine.age$mt <- as.vector(machine.age$machine_commissioning)

machine.age <- machine.age %>% 
  group_by(machine_id) %>% 
  mutate(age_year = round((dt - mt)/365.2425), na.rm = TRUE)

all_data$age_year <- machine.age$age_year

library(knitr)
kable(machine.age[1:5,], caption = "New data frame to calculate age in years")
New data frame to calculate age in years
datapoint_time machine_commissioning machine_id dt mt age_year na.rm
2020-01-02 2014-06-20 201406209290 18263 16241 6 TRUE
2020-01-02 2007-09-14 200709147860 18263 13770 12 TRUE
2020-01-03 2007-06-17 200706173882 18264 13681 13 TRUE
2020-01-06 2012-01-28 201201287469 18267 15367 8 TRUE
2020-01-07 2009-03-20 200903202799 18268 14323 11 TRUE

Interesting questions:

  1. Do failures depend on age or usage?

  2. Are machines failing early or only after heavy use?

Grouped bar chart with an overlaid line

For the next part I made age intervals to more easily see the differences between machine of different use time.

I also converted the service type in order to visualize it later.

all_data <- all_data %>% 
  mutate(age_range = cut(age_year, c(0, 5, 10, 15, 20, 25, 100), labels = c("0-5", "5-10", "10-15", "15-20", "20-25", ">25")))

all_data$service_event_type <- as.factor(all_data$service_event_type)

I made separate data frames for both so it’s easier and error-free on the original data

bars_age_error <- all_data %>%
  select(service_event_type, age_range) %>% 
  count(age_range, service_event_type)

line_age_cycle <- all_data %>% 
  group_by(age_range) %>%
  summarise(mean(customer_cycles_amount), .groups = "drop") %>% 
  rename(
    "mean_cycle" = 'mean(customer_cycles_amount)'
  )
scale_factor <- max(bars_age_error$n) / max(line_age_cycle$mean_cycle) #This is to illustrate the second y-axis later on in the bar chart
ggplot() +
  # bars
  geom_bar(
    data = bars_age_error,
    aes(x = age_range, y = n, fill = service_event_type),
    stat = "identity", position = "dodge", 
    color = "black", linewidth = 0.3
  ) +
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  # line
  geom_line(
    data = line_age_cycle,
    aes(x = as.numeric(age_range), y = mean_cycle  * scale_factor),
    color = "red", linewidth = 1, group = 1
  ) +
  geom_point(
    data = line_age_cycle,
    aes(x = as.numeric(age_range), y = mean_cycle * scale_factor),
    color = "red", size = 3
  ) +
  # dual y-axis
  scale_y_continuous(
    name = "Number of service events",
    sec.axis = sec_axis(
      transform = ~ . / scale_factor,
      name = "Average total number of cycles"
    )
  ) +
  scale_x_discrete(name = "Machine age (years)") +
  labs(
    title = "Number of service events and average cycles per month by age range",
    fill = NULL) +
  theme_classic() +
  theme(
    legend.position        = "bottom",
    legend.direction       = "horizontal",
    axis.text              = element_text(size = 10),
    axis.title             = element_text(size = 10),
    axis.title.y.right     = element_text(color = "red"),
    axis.text.y.right      = element_text(color = "red")
  )

Question — cycles vs machine age A simple linear regression — do more cycles predict earlier breakdown?

model <- lm(machine_age_years ~ customer_cycles_amount + 
                                customer_household_type + 
                                customer_business_model, 
            data = all_data %>%
              mutate(machine_age_years = as.numeric(difftime(
                datapoint_time, machine_commissioning, units = "days")) / 365))
summary(model)

Call:
lm(formula = machine_age_years ~ customer_cycles_amount + customer_household_type + 
    customer_business_model, data = all_data %>% mutate(machine_age_years = as.numeric(difftime(datapoint_time, 
    machine_commissioning, units = "days"))/365))

Residuals:
   Min     1Q Median     3Q    Max 
-8.866 -4.074 -0.544  2.160 36.640 

Coefficients:
                                               Estimate Std. Error t value
(Intercept)                                   9.425e+00  5.275e-01  17.867
customer_cycles_amount                        1.425e-03  5.014e-05  28.422
customer_household_typeShared Multi Family   -1.140e+01  5.320e-01 -21.425
customer_household_typeSingle User Household  1.759e+00  4.791e-01   3.672
customer_business_modelPurchase              -7.675e-01  4.119e-01  -1.863
customer_business_modelRental                -1.134e+00  5.861e-01  -1.935
                                             Pr(>|t|)    
(Intercept)                                   < 2e-16 ***
customer_cycles_amount                        < 2e-16 ***
customer_household_typeShared Multi Family    < 2e-16 ***
customer_household_typeSingle User Household 0.000248 ***
customer_business_modelPurchase              0.062589 .  
customer_business_modelRental                0.053213 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.988 on 1666 degrees of freedom
Multiple R-squared:  0.3882,    Adjusted R-squared:  0.3864 
F-statistic: 211.4 on 5 and 1666 DF,  p-value: < 2.2e-16

2. Failure type, warranty status, and repair type

This relationship can be showcased through a heatmap

# Making Heatmap to show relationship between error case, warranty status, and repair outcome

pcd <- all_data %>% 
  select(customer_warranty_status,repair_type, service_event_type, age_range, machine_id)

pcd$customer_warranty_status <- as.factor(pcd$customer_warranty_status)
pcd$repair_type <- as.factor(pcd$repair_type)
min(pcd$cycle_per_month)
[1] Inf
# Making a separate data frame for the heatmap. 

heatmap_df <- pcd %>%
  group_by(service_event_type, repair_type, customer_warranty_status) %>%
  summarise(count = n(), .groups = "drop") %>%
  complete(service_event_type, repair_type, customer_warranty_status,
           fill = list(count = 0))


# Plot
ggplot(heatmap_df, aes(x = repair_type, y = service_event_type, fill = count)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = ifelse(count == 0, "0", as.character(count))),
            size = 3.5, color = "white") +
  scale_fill_gradient(low = "#A4BED5FF", high = "#023743FF") +
  facet_wrap(~ customer_warranty_status) +        # <-- this is to have 2 maps for 2 warranty statuses
  labs(
    x    = "Repair Type",
    y    = "Failure Type",
    fill = "Service events"
  ) +
  theme_minimal() +
  theme(
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 10),
    axis.text.y     = element_text(size = 10),
    axis.title      = element_text(size = 10),
    panel.grid      = element_blank(),
    legend.position = "right",
    strip.text      = element_text(size = 10, face = "bold")  # styles the facet titles
  )

Some interesting questions we could investigate from this map:

  1. Which failure types consistently lead to disposal rather than repair?

  2. Preventative repair? Where? Which part?

Statistical tests

Question — business model vs repair type

A chi-square test — does business model significantly affect what kind of repair is needed?

Chi-square — “do these two categories relate to each other?”

Imagine you have two jars of marbles. One jar is business models (Purchase, Rental, PPU) and the other is repair types (factory, on-site, disposal). The chi-square test asks: “does the type of marble you pick from jar 1 affect what you get from jar 2?”

We chose it here because both variables are categories — you can’t calculate an average of “Rental” or “disposal”, you can only count them. Chi-square is specifically designed for counting categories against each other.

chisq.test(table(all_data$customer_business_model, all_data$repair_type))

    Pearson's Chi-squared test

data:  table(all_data$customer_business_model, all_data$repair_type)
X-squared = 528.15, df = 6, p-value < 2.2e-16

3. Machine use and Business Model

Do Rental and Pay-Per-Use machines get better care than Purchased ones?

Box plot: total cycles at time of service by BM — which model drives more machine use

bm_cycle <- all_data %>% 
  select(customer_business_model, repair_type, customer_cycles_amount, machine_program_60degrees,machine_program_40degrees,machine_program_eco)
  
ggplot(bm_cycle, aes(x = customer_business_model, y = customer_cycles_amount, fill = customer_business_model))+
  geom_boxplot(colour = "black", median.color = "red")+
  theme_classic()+
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  theme(legend.position = "none",
        axis.text.x     = element_text(angle = 45, hjust = 1, size = 10))+
  labs(
    fill = NULL,
    x = "Business Model",
    y = "Total cycle amount"
  )

4.Household type and cycles

Question — does household type predict cycles? One-way ANOVA — do flatshares genuinely use machines more than single households?

aov_result <- aov(customer_cycles_amount ~ customer_household_type, data = all_data)
summary(aov_result)
                          Df    Sum Sq   Mean Sq F value Pr(>F)    
customer_household_type    2 1.190e+10 5.948e+09   695.9 <2e-16 ***
Residuals               1669 1.426e+10 8.547e+06                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA — “are these group averages actually different from each other?”

Imagine three groups of kids — singles, flatshares, and multi-family — and you measure how many times each group uses the washing machine. ANOVA asks: “are these averages genuinely different, or could the difference just be random luck?”

We chose it here because you have one number (cycles) being compared across multiple groups (household types). It’s essentially a chi-square but for when one of your variables is a number rather than a category.

ggplot(all_data, aes(machine_program_60degrees, customer_cycles_amount, colour = customer_cycles_amount)) +
  geom_point()+
  scale_color_gradient(low = "blue", high = "red")+
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation:
colour.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

ggplot(all_data, aes(machine_program_eco, customer_cycles_amount, colour = customer_cycles_amount)) +
  geom_point()+
  scale_color_gradient(low = "blue", high = "red")+
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation:
colour.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

all_data %>%
  mutate(
    machine_age_years = as.numeric(difftime(datapoint_time,
                                            machine_commissioning, units = "days")) / 365,
    eco_user = ifelse(machine_program_eco > 0.33, "High eco usage", 
                      "Low eco usage")
  ) %>%
  ggplot(aes(x = eco_user, y = machine_age_years, fill = eco_user)) +
  geom_boxplot(color = "black", linewidth = 0.3) +
  scale_fill_manual(values = c("#ffffcc", "#1a7a3c")) +
  labs(x = NULL, y = "Machine age at service (years)") +
  theme_classic() +
  theme(legend.position = "none")

# --- split into two groups
high_eco <- all_data %>%
  mutate(machine_age_years = as.numeric(difftime(datapoint_time,
                                                 machine_commissioning, units = "days")) / 365) %>%
  filter(machine_program_eco > 0.33) %>%
  pull(machine_age_years)

low_eco <- all_data %>%
  mutate(machine_age_years = as.numeric(difftime(datapoint_time,
                                                 machine_commissioning, units = "days")) / 365) %>%
  filter(machine_program_eco <= 0.33) %>%
  pull(machine_age_years)

# --- t-test
t.test(high_eco, low_eco)

    Welch Two Sample t-test

data:  high_eco and low_eco
t = 8.1013, df = 1066.1, p-value = 1.479e-15
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 2.477448 4.061149
sample estimates:
mean of x mean of y 
 13.91612  10.64682 
#Does eco program usage correlate with fewer breakdowns? (Regression)
model_eco <- lm(customer_cycles_amount ~ machine_program_eco + 
                                         machine_program_60degrees,
                data = all_data)
summary(model_eco)

Call:
lm(formula = customer_cycles_amount ~ machine_program_eco + machine_program_60degrees, 
    data = all_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6745.9 -1627.4   -51.6  1445.6  8591.1 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 7627.3      808.0   9.439   <2e-16 ***
machine_program_eco        -9758.9      294.7 -33.111   <2e-16 ***
machine_program_60degrees   2853.2     4032.3   0.708    0.479    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3075 on 1669 degrees of freedom
Multiple R-squared:  0.3966,    Adjusted R-squared:  0.3959 
F-statistic: 548.5 on 2 and 1669 DF,  p-value: < 2.2e-16
all_data %>%
  mutate(machine_age_years = as.numeric(difftime(datapoint_time,
                                                 machine_commissioning, units = "days")) / 365) %>%
  summarise(correlation = cor(machine_program_eco, machine_age_years,
                              use = "complete.obs"))
# A tibble: 1 × 1
  correlation
        <dbl>
1       0.241
all_data %>%
  group_by(customer_household_type) %>%
  summarise(
    eco     = mean(machine_program_eco),
    deg40   = mean(machine_program_40degrees),
    deg60   = mean(machine_program_60degrees)
  ) %>%
  pivot_longer(-customer_household_type, 
               names_to = "program", values_to = "pct") %>%
  ggplot(aes(x = customer_household_type, y = pct, fill = program)) +
  geom_bar(stat = "identity", position = "fill") +
 scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  labs(x = NULL, y = "Program usage share")

###Bar chart: service_event_type frequency by customer_household_type

all_data %>% 
  group_by(customer_household_type) %>% 
  count(customer_household_type, customer_business_model) %>% 
  ggplot(aes(customer_household_type, n, fill = customer_business_model))+
  geom_bar(stat = "identity", position = "stack",
           color = "black", linewidth = 0.3)+
  theme_classic()+
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  labs(
    x = "Household type",
    y = "Total service events",
    fill = "Business model"
  )

5. Returning customers

returning_customers <- all_data %>%
  group_by(customer_id) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(customer_id, datapoint_time)

returning_customers %>%
  group_by(customer_id) %>%
  summarise(
    total_events     = n(),
    first_service    = min(datapoint_time),
    last_service     = max(datapoint_time),
    business_model   = first(customer_business_model),
    canton           = first(customer_canton)
  ) %>%
  arrange(desc(total_events))
# A tibble: 172 × 6
   customer_id     total_events first_service last_service business_model canton
   <chr>                  <int> <date>        <date>       <chr>          <chr> 
 1 00431f6b-6563-…            2 2020-08-07    2022-09-29   Rental         Solot…
 2 014d5176-463e-…            2 2021-03-09    2023-01-09   Purchase       Geneva
 3 04b3fb95-439f-…            2 2020-07-23    2023-04-08   Purchase       Zurich
 4 06945565-3612-…            2 2020-09-08    2022-11-28   Pay-Per-Use    Geneva
 5 0708ddcb-49d4-…            2 2022-11-17    2023-01-14   Purchase       Bern  
 6 0912c2cd-e487-…            2 2021-04-30    2023-08-23   Rental         Aargau
 7 0b4bb0ec-5ad6-…            2 2022-04-13    2023-06-25   Purchase       Zurich
 8 0c252a9f-f3a7-…            2 2020-11-18    2022-10-20   Pay-Per-Use    Geneva
 9 0de1e871-2a5c-…            2 2021-06-18    2023-12-14   Purchase       Schwyz
10 0f28189c-a0ce-…            2 2022-03-23    2022-04-26   Pay-Per-Use    St. G…
# ℹ 162 more rows

A histogram to show the distribution of how long customers stay in the service loop. Are most returning customers coming back within a year, or are they spanning multiple years?

returning_summary <- returning_customers %>%
  group_by(customer_id) %>%
  summarise(
    total_events  = n(),
    first_service = min(datapoint_time),
    last_service  = max(datapoint_time),
    days_span     = as.numeric(difftime(max(datapoint_time), 
                               min(datapoint_time), units = "days")),
    business_model = first(customer_business_model)
  )

ggplot(returning_summary, aes(x = days_span / 365)) +
  geom_histogram(binwidth = 0.5, fill = "#1a7a3c", 
                 color = "white", linewidth = 0.3) +
  labs(x = "Years between first and last service event",
       y = "Number of customers") +
  theme_classic()

error_recurrence <- returning_customers %>%
  group_by(customer_id, service_event_type) %>%
  summarise(occurrences = n(), .groups = "drop") %>%
  filter(occurrences > 1)

# --- Step 2: count how many customers experience each recurring error
error_recurrence_summary <- error_recurrence %>%
  group_by(service_event_type) %>%
  summarise(
    customers_affected = n()
  ) %>%
  arrange(desc(customers_affected))

# --- Step 3: plot
ggplot(error_recurrence_summary, 
       aes(x = reorder(service_event_type, customers_affected),
           y = customers_affected)) +
  geom_bar(stat = "identity", color = "black", linewidth = 0.3, fill = "#1a7a3c") +
  geom_text(aes(label = customers_affected), 
            hjust = -0.3, size = 3.5) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Number of customers experiencing recurring error"
  ) +
  scale_fill_paletteer_d("nationalparkcolors::Acadia")+
  theme_classic() +
  theme(
    axis.text  = element_text(size = 10),
    axis.title = element_text(size = 11)
  )

6. Service events by canton

Choropleth map

#Switzerland cantons from GISCO (Eurostat). This method requires the giscoR and sf packages!
#So do install.packages and then library first

swiss_cantons <- gisco_get_nuts(
  country    = "CH",
  nuts_level = 3, #This is canton level
  year       = "2021",
  resolution = "03" 
)

After doing this, it’s good to check the names in the swiss_cantons data, and then our data. Cause there can be mismatches such as umlauts or different spellings.

# Check names
unique(swiss_cantons$NAME_LATN)
unique(all_data$customer_canton)
# Fix canton names & aggregate data
canton_df <- all_data %>%
  mutate(customer_canton = recode(customer_canton,
                                  "Zurich"      = "Zürich",
                                  "Geneva"      = "Genève",
                                  "Lucerne"     = "Luzern",
                                  "Fribourg"    = "Freiburg",
                                  "Wallis"      = "Valais",
                                  "Basel-City"  = "Basel-Stadt"
  )) %>%
  group_by(customer_canton) %>%
  summarise(service_events = n(), .groups = "drop")
            
# Make abbreviations and all_data data
swiss_map <- swiss_cantons %>%
  left_join(canton_df, by = c("NAME_LATN" = "customer_canton"))

abbrev_lookup <- data.frame(
  NUTS_ID = c("CH011","CH012","CH013","CH021","CH022","CH023","CH024",
              "CH025","CH031","CH032","CH033","CH040","CH051","CH052",
              "CH053","CH054","CH055","CH056","CH057","CH061","CH062",
              "CH063","CH064","CH065","CH066","CH070"),
  abbr    = c("VD","VS","GE","BE","FR","SO","NE",
              "JU","BS","BL","AG","ZH","GL","SH",
              "AR","AI","SG","GR","TG","LU","UR",
              "SZ","OW","NW","ZG","TI")
)

swiss_map <- swiss_map %>%
  left_join(abbrev_lookup, by = "NUTS_ID")

# --- Step 5: plot
ggplot(swiss_map) +
  geom_sf(aes(fill = service_events), color = "white", linewidth = 0.4) +
  geom_sf_text(aes(label = abbr), size = 2.5, color = "gray20") +
  scale_fill_gradient(low = "#ffffcc", high = "#1a7a3c",
                      na.value = "grey90",
                      name = "Service events") +
  labs(title = "Service Events by Canton", subtitle = "2020–2023") +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title      = element_text(size = 13, face = "bold"),
    plot.subtitle   = element_text(size = 10, color = "gray40")
  )

At this point it is good to cross reference and see if the high number of service events mean the machines here tend to fail more of if it’s purely due to customer volume

# customers per canton per business model
customers_per_canton <- all_data %>%
  mutate(customer_canton = recode(customer_canton,
                                  "Zurich"      = "Zürich",
                                  "Geneva"      = "Genève",
                                  "Lucerne"     = "Luzern",
                                  "Fribourg"    = "Freiburg",
                                  "Wallis"      = "Valais",
                                  "Basel-City"  = "Basel-Stadt"
  )) %>%
  group_by(customer_canton, customer_business_model) %>%
  summarise(
    service_events   = n(),
    unique_customers = n_distinct(customer_id),
    .groups = "drop"
  ) %>%
  mutate(events_per_customer = service_events / unique_customers)

#rank cantons by total service events
canton_order <- customers_per_canton %>%
  group_by(customer_canton) %>%
  summarise(total = sum(service_events)) %>%
  arrange(desc(total)) %>%
  pull(customer_canton)

customers_per_canton <- customers_per_canton %>%
  mutate(customer_canton = factor(customer_canton, levels = canton_order))

###Plot this data

ggplot(customers_per_canton,
       aes(x = customer_business_model,
           y = events_per_customer,
           fill = customer_business_model)) +
  geom_bar(stat = "identity", color = "black", linewidth = 0.3, width = 0.6) +
  geom_text(aes(label = round(events_per_customer, 1)),
            vjust = -0.5, size = 3) +
  scale_fill_manual(values = c("#ffffcc", "#78c679", "#1a7a3c")) +
  facet_wrap(~ customer_canton) +
  labs(x = NULL, y = "Service events per customer") +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y     = element_text(size = 8),
    strip.text      = element_text(size = 10, face = "bold"),
    panel.spacing   = unit(1, "lines")
  )

What we should pay attention to in this graph:

  • Jura showing 2 events in just Purchase model: 1 returning customer in whole Jura

  • Pay-Per-Use and Rental contracts are still concentrated in larger cantons, they haven’t penetrated rural or smaller cantons yet -> A business opportunity.

If failure rates are consistent everywhere, there’s no operational risk in expanding PPU nationally — the service burden per customer won’t increase.

  • Failure rate is consistent across cantons and business models, the problem isn’t geographic. This points toward product-level interventions — better components, predictive maintenance, or design improvements

Dot Density Graph

test <- all_data %>% 
  select(customer_address_lon, customer_address_lat)
test2 <- st_as_sf(test, coords = c("customer_address_lon", "customer_address_lat"), crs = 4326)
test2 <- st_transform(test2, 21781)

library(rstudioapi)
library(magrittr) # pipes

Attache Paket: 'magrittr'
Das folgende Objekt ist maskiert 'package:tidyr':

    extract
library(lintr) # code linting
library(sf) # spatial data handling
library(raster) # raster handling (needed for relief)
Lade nötiges Paket: sp

Attache Paket: 'raster'
Das folgende Objekt ist maskiert 'package:dplyr':

    select
library(viridis) # viridis color scale
Lade nötiges Paket: viridisLite
library(cowplot) # stack ggplots

Attache Paket: 'cowplot'
Das folgende Objekt ist maskiert 'package:lubridate':

    stamp
canton_geo <- read_sf("g2k15.shp")

# read country borders
country_geo <- read_sf("g2l15.shp")

# read lakes
lake_geo <- read_sf("g2s15.shp")

# read productive area (2324 municipalities)
municipality_prod_geo <- read_sf("gde-1-1-15.shp")

# read in raster of relief
relief <- raster("02-relief-ascii.asc") %>%
  # hide relief outside of Switzerland by masking with country borders
  mask(country_geo) %>%
  as("SpatialPixelsDataFrame") %>%
  as.data.frame() %>%
  rename(value = `X02.relief.ascii`)

# clean up
rm(country_geo)

#Join Geodata with Thematic Data
municipality_prod_geo %<>%
  st_join(test2, by = c("BFS_ID" = "bfs_id"))
class(municipality_prod_geo)
[1] "sf"         "tbl_df"     "tbl"        "data.frame"
#Create a theme map
theme_map <- function(...) {
  theme_minimal() +
  theme(
    text = element_text(family = default_font_family,
                        color = default_font_color),
    # remove all axes
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    # add a subtle grid
    panel.grid.major = element_line(color = "#dbdbd9", size = 0.2),
    panel.grid.minor = element_blank(),
    # background colors
    plot.background = element_rect(fill = default_background_color,
                                   color = NA),
    panel.background = element_rect(fill = default_background_color,
                                    color = NA),
    legend.background = element_rect(fill = default_background_color,
                                     color = NA),
    # borders and margins
    plot.margin = unit(c(.5, .5, .2, .5), "cm"),
    panel.border = element_blank(),
    panel.spacing = unit(c(-.1, 0.2, .2, 0.2), "cm"),
    # titles
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 9, hjust = 0,
                               color = default_font_color),
    plot.title = element_text(size = 15, hjust = 0.5,
                              color = default_font_color),
    plot.subtitle = element_text(size = 10, hjust = 0.5,
                                 color = default_font_color,
                                 margin = margin(b = -0.1,
                                                 t = -0.1,
                                                 l = 2,
                                                 unit = "cm"),
                                 debug = F),
    # captions
    plot.caption = element_text(size = 7,
                                hjust = .5,
                                margin = margin(t = 0.2,
                                                b = 0,
                                                unit = "cm"),
                                color = "#939184"),
    ...
  )
}

swiss_map <- st_transform(swiss_map, 21781)

#Make the graph
ggplot(
  # define main data source
  data = swiss_map
) +
  geom_sf(aes(fill = service_events))+
  # first: draw the relief
  geom_raster(
    data = relief,
    inherit.aes = FALSE,
    aes(
      x = x,
      y = y,
      alpha = value
    )
  ) +
  # use the "alpha hack" (as the "fill" aesthetic is already taken)
  scale_alpha(name = "",
              range = c(0.2, 0),
              guide = F) + # suppress legend
  # add main fill aesthetic
  # use thin white stroke for municipality borders
scale_fill_viridis(
    option = "magma",
    alpha = 0.8, # make fill a bit brighter
    begin = 0.1, # this option seems to be new (compared to 2016):
    # with this we can truncate the
    # color scale, so that extreme colors (very dark and very bright) are not
    # used, which makes the map a bit more aesthetic
    end = 0.9,
    na.value = "grey") +
  
  # use the Viridis color scale
  # use thicker white stroke for cantonal borders
  geom_sf(
    data = canton_geo,
    fill = "transparent",
    color = "white",
    size = 0.5
  ) +
  # draw lakes in light blue
  geom_sf(
    data = lake_geo,
    fill = "#D6F1FF",
    color = "transparent"
  )+
    geom_sf(
    data = test2,
    fill = "transparent",
    color = "red"
  )+
  geom_sf_text(aes(label = abbr), size = 2, color = "white")+
  theme_void()
Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.