Correlation

library(readxl)

## Warning: package 'readxl' was built under R version 4.5.1

library(janitor)

## Warning: package 'janitor' was built under R version 4.5.1

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.5.1

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.1

library(DescTools)

## Warning: package 'DescTools' was built under R version 4.5.1

# Load the ICA claims workbook and standardise its column names.
#
# The workbook location is assembled with file.path() instead of calling
# setwd(), so reading the data no longer changes the session's working
# directory as a side effect.
data_dir <- "C:/Users/user/OneDrive/Documents/CAS"

# NOTE(review): the space before ".xlsx" is kept exactly as written in the
# original call - presumably it is part of the real file name; confirm.
ICA_data <- read_excel(file.path(data_dir, "ICA Dataset - Claim Only .xlsx"))

# clean_names() rewrites the headers to snake_case made up only of letters,
# digits and underscores, so no separate whitespace trimming is required.
ICA_data <- clean_names(ICA_data)

# Claim-count columns (post clean_names() spelling) used by every summary
# in this script.
claim_cols <- c(
  "domestic_building_claims", "domestic_content_claims",
  "domestic_motor_claims", "domestic_other_claims",
  "commercial_property_claims", "commercial_motor",
  "commercial_bi_claims", "commercial_other_claims",
  "commercial_crop_claims"
)

# Stack the claim columns: one row per original record and claim type.
claims_long <- pivot_longer(
  ICA_data,
  cols = all_of(claim_cols),
  names_to = "Claim_Type",
  values_to = "Count"
)

# Mean claim count for each catastrophe type / claim type pair; missing
# counts are ignored and the result is returned ungrouped.
contingency_norm <- summarise(
  group_by(claims_long, type, Claim_Type),
  Avg_Claims = mean(Count, na.rm = TRUE),
  .groups = "drop"
)


# Contingency table of the averages: one row per catastrophe type, one
# column per claim type, with absent combinations filled with 0.
# (Not referenced again in the visible part of the script.)
contingency_wide <- pivot_wider(
  contingency_norm,
  names_from = Claim_Type,
  values_from = Avg_Claims,
  values_fill = 0
)


# Single-letter codes (A-I) for the claim columns, used as compact x-axis
# labels on the heatmap. The vector is named by column so it can be indexed
# with the Claim_Type values.
claim_labels <- setNames(
  LETTERS[1:9],
  c(
    "domestic_building_claims",    # A
    "domestic_content_claims",     # B
    "domestic_motor_claims",       # C
    "domestic_other_claims",       # D
    "commercial_property_claims",  # E
    "commercial_motor",            # F
    "commercial_bi_claims",        # G
    "commercial_other_claims",     # H
    "commercial_crop_claims"       # I
  )
)

# Attach the letter code for each row's claim type.
contingency_label <- mutate(
  contingency_norm,
  Claim_Label = claim_labels[Claim_Type]
)

# Heatmap of average claims: claim type (letter code) across, catastrophe
# type down, tile colour and printed value both showing Avg_Claims.
ggplot(
  data = contingency_label,
  mapping = aes(x = Claim_Label, y = type, fill = Avg_Claims)
) +
  geom_tile(color = "white") +
  geom_text(
    aes(label = scales::comma(Avg_Claims)),
    size = 3,
    color = "black"
  ) +
  scale_fill_gradient(low = "lightblue", high = "#0061FE") +
  labs(
    title = "Avg Claims by Catastrophe Type and Claim Type",
    x = "Claim Type",
    y = "Catastrophe Type"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5),
    plot.title = element_text(face = "bold", size = 14)
  )

# Association test input: total (not average) claim counts per catastrophe
# type and claim type.
claim_totals <- ICA_data %>%
  pivot_longer(
    cols = all_of(claim_cols),
    names_to = "Claim_Type",
    values_to = "Count"
  ) %>%
  group_by(type, Claim_Type) %>%
  summarise(Total_Claims = sum(Count, na.rm = TRUE), .groups = "drop")

# One row per catastrophe type, one column per claim type; combinations that
# never occur are filled with 0.
contingency_sum <- pivot_wider(
  claim_totals,
  names_from = Claim_Type,
  values_from = Total_Claims,
  values_fill = 0
)

# Numeric matrix of the counts (the `type` key column is dropped) with the
# catastrophe types as row names.
contingency_matrix <- as.matrix(select(contingency_sum, -type))
rownames(contingency_matrix) <- contingency_sum[["type"]]

# Pearson chi-squared test of independence between catastrophe type (rows)
# and claim type (columns) on the total-count matrix.
# janitor masks stats::chisq.test when it is attached, so the stats version is
# called explicitly to make the test independent of package load order.
chi_result <- stats::chisq.test(contingency_matrix)
chi_result

## 
##  Pearson's Chi-squared test
## 
## data:  contingency_matrix
## X-squared = 459535, df = 40, p-value < 2.2e-16

# Cramer's V as the effect size for the association tested above.
# DescTools::CramerV() names its bias-correction switch `correct`; the
# previously used `bias.correct` is not an argument of this function and was
# absorbed by `...`, so no correction was actually applied for matrix input.
cramer_v_value <- CramerV(contingency_matrix, correct = TRUE)
cramer_v_value

## [1] 0.2417976

Correlation

Hayden

2025-11-17