library(readxl)
## Warning: package 'readxl' was built under R version 4.5.1
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.1
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.1
# Load the ICA claims workbook and standardise its column names.
#
# The workbook is addressed by its full path instead of calling setwd(), so
# loading the data does not change the working directory of the R session.
data_dir <- "C:/Users/user/OneDrive/Documents/CAS"
data_path <- file.path(data_dir, "ICA Dataset - Claim Only .xlsx")

# Fail early with a readable message when the workbook is not where expected.
if (!file.exists(data_path)) {
  stop("Claims workbook not found: ", data_path, call. = FALSE)
}

ICA_data <- read_excel(data_path)

# clean_names() rewrites the headers so that they contain only letters,
# digits and "_", so no separate whitespace trimming of the names is needed.
ICA_data <- clean_names(ICA_data)
# Names of the claim-count columns of ICA_data that are stacked into long
# format for the summaries: four domestic lines, then five commercial lines.
# NOTE(review): "commercial_motor" has no "_claims" suffix, unlike the other
# entries - presumably it mirrors the workbook header; confirm.
claim_cols <- c(
  # Domestic lines
  "domestic_building_claims", "domestic_content_claims",
  "domestic_motor_claims", "domestic_other_claims",
  # Commercial lines
  "commercial_property_claims", "commercial_motor",
  "commercial_bi_claims", "commercial_other_claims",
  "commercial_crop_claims"
)
# Average claim count for every (catastrophe type, claim type) combination.

# Step 1: stack the claim columns so that each row holds one claim column
# name (Claim_Type) and its value (Count).
claims_long <- pivot_longer(
  ICA_data,
  cols = all_of(claim_cols),
  names_to = "Claim_Type",
  values_to = "Count"
)

# Step 2: mean of Count per (type, Claim_Type), ignoring missing counts.
# .groups = "drop" returns an ungrouped table.
contingency_norm <- claims_long %>%
  group_by(type, Claim_Type) %>%
  summarise(Avg_Claims = mean(Count, na.rm = TRUE), .groups = "drop")
# Wide table of the average claim counts: one row per catastrophe type and
# one column per claim type. Combinations that do not occur in
# contingency_norm are filled with 0.
contingency_wide <- pivot_wider(
  contingency_norm,
  names_from = Claim_Type,
  values_from = Avg_Claims,
  values_fill = 0
)
# Lookup from claim column name to a single-letter code (A-I). The codes are
# used as compact x-axis labels on the heatmap.
claim_labels <- c(
  domestic_building_claims   = "A",
  domestic_content_claims    = "B",
  domestic_motor_claims      = "C",
  domestic_other_claims      = "D",
  commercial_property_claims = "E",
  commercial_motor           = "F",
  commercial_bi_claims       = "G",
  commercial_other_claims    = "H",
  commercial_crop_claims     = "I"
)
# Add Claim_Label: the single-letter code of each row's Claim_Type, looked up
# by name in claim_labels.
contingency_label <- mutate(
  contingency_norm,
  Claim_Label = claim_labels[Claim_Type]
)
# Heatmap of average claims: claim-type letter codes on the x axis,
# catastrophe types on the y axis, tile colour and printed value both taken
# from Avg_Claims.
ggplot(
  data = contingency_label,
  mapping = aes(x = Claim_Label, y = type, fill = Avg_Claims)
) +
  # White borders separate neighbouring tiles.
  geom_tile(colour = "white") +
  # Print the value in each tile with thousands separators.
  geom_text(
    mapping = aes(label = scales::comma(Avg_Claims)),
    colour = "black",
    size = 3
  ) +
  scale_fill_gradient(low = "lightblue", high = "#0061FE") +
  labs(
    title = "Avg Claims by Catastrophe Type and Claim Type",
    x = "Claim Type",
    y = "Catastrophe Type"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5)
  )

# Chi-square test on total claim counts (catastrophe type x claim type).

# Total claims per (type, Claim_Type), ignoring missing counts.
claim_totals_long <- ICA_data %>%
  pivot_longer(
    cols = all_of(claim_cols),
    names_to = "Claim_Type",
    values_to = "Count"
  ) %>%
  group_by(type, Claim_Type) %>%
  summarise(Total_Claims = sum(Count, na.rm = TRUE), .groups = "drop")

# One row per catastrophe type, one column per claim type; combinations that
# do not occur are filled with 0.
contingency_sum <- pivot_wider(
  claim_totals_long,
  names_from = Claim_Type,
  values_from = Total_Claims,
  values_fill = 0
)

# Numeric matrix of the totals: the `type` column (first column) is dropped
# from the data and used as the row names instead.
contingency_matrix <- as.matrix(select(contingency_sum, -type))
rownames(contingency_matrix) <- contingency_sum[["type"]]

# Run the test on the matrix of totals and show the result.
chi_result <- chisq.test(contingency_matrix)
chi_result
##
## Pearson's Chi-squared test
##
## data: contingency_matrix
## X-squared = 459535, df = 40, p-value < 2.2e-16
# Bias-corrected Cramer's V for the catastrophe type x claim type table.
#
# DescTools::CramerV() enables the bias correction through its `correct`
# argument. `bias.correct` is not one of its parameters: it was absorbed by
# `...` without any warning, so the uncorrected statistic was returned.
cramer_v_value <- CramerV(contingency_matrix, correct = TRUE)
cramer_v_value
# The value below was recorded from the earlier run that passed
# `bias.correct = TRUE` (i.e. without the correction); re-run to refresh it.
## [1] 0.2417976