library(readxl) # lecture CSV
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(forcats)
library(psych)
library(knitr)
library(kableExtra)
library(patchwork) # composer plusieurs ggplots
# Palette maison
pal_bleu <- c("#0D1B2A","#1B4F72","#2E86AB","#5DADE2","#AED6F1")
pal_mix <- c("#2E86AB","#F4A261","#27AE60","#C0392B","#8E44AD","#F39C12","#1B4F72")
theme_actuariel <- function() {
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1B4F72"),
plot.subtitle = element_text(size = 11, color = "#5D6D7E"),
plot.caption = element_text(size = 9, color = "#95A5A6", hjust = 1),
axis.title = element_text(color = "#34495E", face = "bold"),
panel.grid.major = element_line(color = "#ECF0F1"),
panel.grid.minor = element_blank(),
legend.title = element_text(face = "bold"),
strip.text = element_text(face = "bold", color = "#1B4F72")
)
}Dimensions de la table contrat : 301437 lignes Ă— 40 colonnes.
describe(contrat) %>%
select(n, mean, sd, min, max, skew, kurtosis) %>%
round(3) %>%
kbl(caption = "Statistiques descriptives — Table Contrat") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE, font_size = 12)| n | mean | sd | min | max | skew | kurtosis | |
|---|---|---|---|---|---|---|---|
| idxCt | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| idxYear | 301437 | 2021.113 | 1.427 | 2019 | 2023 | -0.100 | -1.314 |
| vhImmat | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| sitStartDate | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| sitEndDate | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| sitExpo | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| drv1Age | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| drv1Sex | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| drv1DriveLicenceType | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| drv1DriveLicenceAge | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| vhAge | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| ctFrm | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| ctAssBase | 301437 | 1.000 | 0.000 | 1 | 1 | NaN | NaN |
| ctAss0km | 301437 | 0.118 | 0.323 | 0 | 1 | 2.369 | 3.612 |
| ctAssVHR | 301437 | 0.064 | 0.244 | 0 | 1 | 3.568 | 10.734 |
| vhSegment | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| vhMarque | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| vhEnergy | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| vhWeight | 301437 | 1387.546 | 308.583 | 600 | 2793 | 0.091 | -0.263 |
| vhDIN | 301437 | 132.559 | 47.502 | 50 | 300 | 0.466 | -0.229 |
| vhValue | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| vhGroup | 301437 | 14.481 | 7.020 | 1 | 31 | 0.186 | -0.720 |
| vhClass | 301437 | 2.877 | 0.793 | 1 | 5 | 0.289 | -0.193 |
| ctUsage | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| ctKM | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| ctDeduc | 301437 | 289.779 | 104.144 | 200 | 500 | 0.727 | -0.839 |
| claimsAnt | 301437 | 0.396 | 0.660 | 0 | 2 | 1.410 | 0.653 |
| ctINSEE | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| id1_AssBase | 28581 | NaN | NA | Inf | -Inf | NA | NA |
| id1_Ass0km | 2443 | NaN | NA | Inf | -Inf | NA | NA |
| id1_AssVHR | 965 | NaN | NA | Inf | -Inf | NA | NA |
| id2_AssBase | 1552 | NaN | NA | Inf | -Inf | NA | NA |
| id2_Ass0km | 0 | NaN | NA | Inf | -Inf | NA | NA |
| id2_AssVHR | 0 | NaN | NA | Inf | -Inf | NA | NA |
| id3_AssBase | 68 | NaN | NA | Inf | -Inf | NA | NA |
| id3_Ass0km | 0 | NaN | NA | Inf | -Inf | NA | NA |
| id3_AssVHR | 0 | NaN | NA | Inf | -Inf | NA | NA |
| COT_AssBase | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| COT_Ass0km | 301437 | NaN | NA | Inf | -Inf | NA | NA |
| COT_AssVHR | 301437 | NaN | NA | Inf | -Inf | NA | NA |
# Calcul
miss_df <- data.frame(
variable = names(contrat),
n_manquant = colSums(is.na(contrat)),
pct = colSums(is.na(contrat)) / nrow(contrat) * 100
) %>% filter(n_manquant > 0) %>% arrange(desc(pct))
# Graphique
ggplot(miss_df, aes(x = reorder(variable, pct), y = pct, fill = pct)) +
geom_col(show.legend = FALSE, width = 0.7) +
geom_text(aes(label = sprintf("%.1f %%", pct)), hjust = -0.15,
size = 3.5, fontface = "bold", color = "#2C3E50") +
scale_fill_gradient(low = "#AED6F1", high = "#1B4F72") +
coord_flip(clip = "off") +
scale_y_continuous(expand = expansion(mult = c(0, 0.18)),
labels = label_percent(scale = 1)) +
labs(
title = "Taux de valeurs manquantes par variable",
subtitle = "Table Contrat — les colonnes 'id*' sont des NA structurels (absence de sinistre)",
x = NULL, y = "% de valeurs manquantes",
caption = "Source : prjAct2426-detailContrats"
) +
theme_actuariel()ℹ️ NA structurels — Les colonnes
id1_AssBase, id1_Ass0km,
id1_AssVHR, id2_*, id3_*
contiennent des NA intentionnels : l’absence de valeur
signifie simplement qu’aucun sinistre de ce type n’a été déclaré pour ce
contrat. Ces colonnes seront recodées en
"Non renseigné".
cols_sinistres <- c(
"id1_AssBase","id1_Ass0km","id1_AssVHR",
"id2_AssBase","id2_Ass0km","id2_AssVHR",
"id3_AssBase","id3_Ass0km","id3_AssVHR"
)
contrat[cols_sinistres] <- lapply(
contrat[cols_sinistres],
function(x) ifelse(is.na(x), "Non renseigné", x)
)
cat("âś… Valeurs manquantes restantes :", sum(is.na(contrat)))## âś… Valeurs manquantes restantes : 0
âś… Plus aucune valeur manquante dans la table contrat.
## Nombre de doublons : 0
✅ Aucun doublon détecté dans la table contrat.
num_cols <- contrat %>% select(where(is.numeric)) %>% colnames()
outlier_df <- lapply(num_cols, function(col) {
Q1 <- quantile(contrat[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(contrat[[col]], 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
n_out <- sum(contrat[[col]] < Q1 - 1.5*IQR_val |
contrat[[col]] > Q3 + 1.5*IQR_val, na.rm = TRUE)
data.frame(variable = col,
n_outliers = n_out,
pct_outliers = round(n_out / nrow(contrat) * 100, 2))
}) %>% bind_rows() %>% arrange(desc(pct_outliers))
# Graphique
ggplot(outlier_df %>% filter(pct_outliers > 0),
aes(x = reorder(variable, pct_outliers), y = pct_outliers, fill = pct_outliers)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(pct_outliers, " %")), hjust = -0.1,
size = 3.5, color = "#2C3E50", fontface = "bold") +
scale_fill_gradient(low = "#F9E79F", high = "#C0392B") +
coord_flip(clip = "off") +
scale_y_continuous(expand = expansion(mult = c(0, 0.20))) +
labs(
title = "Pourcentage de valeurs extrêmes par variable (méthode IQR)",
subtitle = "Les outliers ne seront pas supprimés — ils seront analysés dans le Shiny",
x = NULL, y = "% d'outliers",
caption = "Méthode : Q1 - 1.5×IQR / Q3 + 1.5×IQR"
) +
theme_actuariel()⚠️ Certaines variables comme COT_Ass0km présentent des
valeurs extrêmes liées à la structure tarifaire (primes élevées pour
certains profils). Ces observations sont conservées —
elles feront l’objet d’analyses spécifiques dans le tableau de bord
interactif.
p1 <- ggplot(contrat, aes(x = drv1Age)) +
geom_histogram(bins = 40, fill = "#2E86AB", color = "white", alpha = 0.9) +
geom_vline(xintercept = median(contrat$drv1Age, na.rm=TRUE),
linetype = "dashed", color = "#F4A261", linewidth = 1) +
annotate("text", x = median(contrat$drv1Age, na.rm=TRUE)+2,
y = Inf, label = paste0("Médiane : ", round(median(contrat$drv1Age, na.rm=TRUE),1), " ans"),
vjust = 2, hjust = 0, size = 3.5, color = "#F4A261") +
labs(title = "Âge du conducteur principal",
subtitle = "Distribution de la population assurée",
x = "Âge (années)", y = "Effectif") +
theme_actuariel()
p2 <- ggplot(contrat, aes(x = fct_infreq(ctFrm), fill = ctFrm)) +
geom_bar(width = 0.65, show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5,
fontface = "bold", size = 3.5) +
scale_fill_manual(values = pal_mix) +
labs(title = "Formule souscrite (ctFrm)",
subtitle = "Répartition des contrats par formule",
x = "Formule", y = "Nombre de contrats") +
theme_actuariel()
p3 <- ggplot(contrat, aes(x = vhAge)) +
geom_histogram(bins = 35, fill = "#1B4F72", color = "white", alpha = 0.9) +
geom_vline(xintercept = median(contrat$vhAge, na.rm=TRUE),
linetype = "dashed", color = "#F4A261", linewidth = 1) +
labs(title = "Âge du véhicule",
subtitle = "Ancienneté du parc assuré",
x = "Âge véhicule (années)", y = "Effectif") +
theme_actuariel()
p4 <- ggplot(contrat, aes(x = fct_infreq(vhSegment), fill = vhSegment)) +
geom_bar(width = 0.65, show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5,
fontface = "bold", size = 3.5) +
scale_fill_manual(values = pal_bleu) +
labs(title = "Segment du véhicule",
x = "Segment", y = "Nombre de contrats") +
theme_actuariel() +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
(p1 + p2) / (p3 + p4) +
plot_annotation(
title = "Exploration des variables contrat",
theme = theme(plot.title = element_text(size = 17, face = "bold", color = "#0D1B2A"))
)Dimensions de la table sinistre : 72130 lignes Ă— 8 colonnes.
describe(sinistre %>% select(mt_eval, mt_regl)) %>%
select(n, mean, sd, median, min, max, skew) %>%
round(2) %>%
kbl(caption = "Statistiques des montants sinistres") %>%
kable_styling(bootstrap_options = c("striped","hover"), font_size = 12)| n | mean | sd | median | min | max | skew | |
|---|---|---|---|---|---|---|---|
| mt_eval* | 72130 | 6653.81 | 3104.36 | 5943.5 | 1 | 15329 | 0.31 |
| mt_regl* | 72130 | 3876.93 | 4029.61 | 1105.0 | 1 | 15486 | 1.18 |
sinistre <- sinistre %>%
mutate(decl_sin = as.Date(decl_sin, format = "%Y-%m-%d"),
clo_sin = as.Date(clo_sin, format = "%Y-%m-%d"),
gest_sin = as.Date(gest_sin, format = "%Y-%m-%d"),
surv_sin = as.Date(surv_sin, format = "%Y-%m-%d"))miss_sin <- data.frame(
variable = names(sinistre),
n_manquant = colSums(is.na(sinistre)),
pct = round(colSums(is.na(sinistre)) / nrow(sinistre) * 100, 2)
) %>% filter(n_manquant > 0)
miss_sin %>%
kbl(caption = "Valeurs manquantes — Table Sinistre") %>%
kable_styling(bootstrap_options = c("striped","hover"), font_size = 12)| variable | n_manquant | pct | |
|---|---|---|---|
| clo_sin | clo_sin | 39099 | 54.21 |
ℹ️ La colonne clo_sin (date de clôture) contient des NA
car certains sinistres sont encore ouverts. Pour les
sinistres avec plusieurs enregistrements, la date sera propagée via
fill().
# Propagation de la date de clĂ´ture
sinistre <- sinistre %>%
group_by(idx_sin) %>%
tidyr::fill(clo_sin, .direction = "downup") %>%
ungroup()
# Garde la date la plus récente si plusieurs dates différentes
sinistre <- sinistre %>%
group_by(idx_sin) %>%
mutate(clo_sin = if (n_distinct(clo_sin, na.rm = TRUE) > 1)
max(clo_sin, na.rm = TRUE) else clo_sin) %>%
ungroup()
# Seconde propagation après correction
sinistre <- sinistre %>%
group_by(idx_sin) %>%
tidyr::fill(clo_sin, .direction = "downup") %>%
ungroup()
cat("NA restants dans clo_sin (sinistres non clôturés) :",
sum(is.na(sinistre$clo_sin)))## NA restants dans clo_sin (sinistres non clôturés) : 4428
sinistre$sinistre_clos <- ifelse(is.na(sinistre$clo_sin), "Ouvert", "Clos")
sinistre$delai_cloture <- as.numeric(sinistre$clo_sin - sinistre$decl_sin)
sinistre$delai_gestion <- as.numeric(sinistre$clo_sin - sinistre$gest_sin)
sinistre$sinistre_clos <- as.factor(sinistre$sinistre_clos)p5 <- ggplot(sinistre %>% distinct(idx_sin, .keep_all = TRUE),
aes(x = gar_sin, fill = gar_sin)) +
geom_bar(width = 0.6, show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5,
fontface = "bold", size = 4) +
scale_fill_manual(values = c("#2E86AB","#F4A261","#27AE60")) +
labs(title = "Répartition par type de garantie sinistrée",
x = "Garantie", y = "Nombre de sinistres") +
theme_actuariel()
p6 <- ggplot(sinistre %>% distinct(idx_sin, .keep_all = TRUE),
aes(x = sinistre_clos, fill = sinistre_clos)) +
geom_bar(width = 0.5, show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5,
fontface = "bold", size = 4) +
scale_fill_manual(values = c("#27AE60","#C0392B")) +
labs(title = "Sinistres ouverts vs clôturés",
x = "Statut", y = "Nombre de sinistres") +
theme_actuariel()
p7 <- sinistre %>%
filter(!is.na(delai_cloture), delai_cloture >= 0, delai_cloture <= 730) %>%
distinct(idx_sin, .keep_all = TRUE) %>%
ggplot(aes(x = delai_cloture)) +
geom_histogram(bins = 40, fill = "#8E44AD", color = "white", alpha = 0.9) +
geom_vline(aes(xintercept = median(delai_cloture, na.rm = TRUE)),
linetype = "dashed", color = "#F4A261", linewidth = 1) +
labs(title = "Délai de clôture (déclaration → clôture)",
subtitle = "En jours — sinistres clos uniquement (≤ 730 j)",
x = "Délai (jours)", y = "Effectif") +
theme_actuariel()
p8 <- sinistre %>%
# Conversion sécurisée en numérique
mutate(mt_eval = as.numeric(as.character(mt_eval))) %>%
# On enlève les NA potentiels générés par la conversion ou déjà présents
filter(!is.na(mt_eval), mt_eval > 0) %>%
# Calcul du quantile sur une colonne propre
filter(mt_eval < quantile(mt_eval, 0.99, na.rm = TRUE)) %>%
distinct(idx_sin, .keep_all = TRUE) %>%
ggplot(aes(x = mt_eval)) +
geom_histogram(bins = 40, fill = "#E74C3C", color = "white", alpha = 0.9) +
scale_x_continuous(labels = scales::label_comma(big.mark = " ", suffix = " €")) +
labs(title = "Distribution du montant évalué (mt_eval)",
subtitle = "Hors valeurs extrĂŞmes (> P99)",
x = "Montant évalué (€)", y = "Effectif") +
theme_actuariel()
(p5 + p6) / (p7 + p8) +
plot_annotation(
title = "Exploration des variables sinistres",
theme = theme(plot.title = element_text(size = 17, face = "bold", color = "#0D1B2A"))
)contrat_sinistre)contrat_long <- contrat %>%
pivot_longer(
cols = all_of(cols_sinistres),
names_to = "type_sinistre",
values_to = "idx_sin"
) %>%
filter(idx_sin != "Non renseigné")
contrat_sinistre <- contrat_long %>%
inner_join(sinistre, by = "idx_sin")Table contrat_sinistre : 58377 lignes —
soit 22757 contrats distincts ayant déclaré au moins un
sinistre.
contrat_sans_sinistre)contrat_sinistre %>%
distinct(idxCt, ctFrm, gar_sin) %>%
count(ctFrm, gar_sin) %>%
ggplot(aes(x = ctFrm, y = n, fill = gar_sin)) +
geom_col(position = "dodge", width = 0.65) +
geom_text(aes(label = n), position = position_dodge(0.65), vjust = -0.5,
size = 3.5, fontface = "bold") +
scale_fill_manual(values = c("#2E86AB","#F4A261","#27AE60"),
name = "Garantie") +
labs(
title = "Nombre de sinistres par formule et par garantie",
subtitle = "Contrats avec au moins un sinistre",
x = "Formule", y = "Nombre de sinistres",
caption = "Source : jointure contrat Ă— sinistre"
) +
theme_actuariel()library(readr)
write_csv(contrat_sans_sinistre, "contrat_sans_sinistre.csv")
write_csv(sinistre, "sinistre_clean.csv")
write_csv(contrat_sinistre, "contrat_sinistre.csv")
cat("✅ Fichiers exportés avec succès.\n")## ✅ Fichiers exportés avec succès.
✅ 3 fichiers exportés :
contrat_sans_sinistre.csv — tous les contrats, sans les
colonnes d’identifiants de sinistressinistre_clean.csv — table sinistre nettoyée avec
variables calculéescontrat_sinistre.csv — jointure complète contrat ×
sinistre (contrats sinistrés uniquement)