###Algorithm
Load cleaned dataset.###code
{r.highlight} x <- 1:10 mean(x)
```{r.Data Preparation} # Load required packages library(tidyverse) # For data manipulation and visualization library(readxl) # For reading Excel files library(ggpubr) # For publication-ready visualizations library(car) # For advanced statistical functions library(effects) # For visualizing model effects library(broom) # For tidying model outputs library(performance) # For model performance metrics library(shiny) # For building interactive web applications library(webshot) # For capturing screenshots of the Shiny app
df <- read_excel(“C:/Users/HP1234/Desktop/ij/Financial institution.xlsx”, sheet = “Financial institution”, skip = 6) # Reads the Excel file, skipping the first 6 rows which contain metadata/headers
colnames(df) <- c(“Indicators”, “Type_of_Insurance”, as.character(2002:2019)) # Renames columns: first two columns get descriptive names, years remain as numbers
df_long <- df %>% pivot_longer(cols =
2002:2019, names_to = “Year”, values_to =
“Value”) %>% mutate(Year = as.numeric(Year)) # Transforms data from
wide (years as columns) to long format (years as rows) # This makes it
easier to analyze trends over time
```{r.Descriptive Statistics and Visualizations}
# 1.1 Summary statistics by insurance type
df_summary <- df_long %>%
group_by(Indicators, Type_of_Insurance) %>%
summarise(across(where(is.numeric),
list(mean = mean, sd = sd),
.names = "{.col}_{.fn}"),
.groups = "drop")
# Calculates mean and standard deviation for each insurance type and indicator
# Results are stored in a new dataframe called df_summary
# 1.2 Boxplot of values by insurance type and indicator
ggboxplot(df_long,
x = "Type_of_Insurance",
y = "Value",
color = "Indicators",
palette = "jco",
add = "jitter",
title = "Insurance Values by Type and Indicator")
# Creates a boxplot showing distribution of values for each insurance type
# Different colors represent Claims vs Premiums indicators
# Jitter points show individual data points
# 1.3 Scatterplot of values over time
ggscatter(df_long,
x = "Year",
y = "Value",
color = "Type_of_Insurance",
add = "reg.line",
conf.int = TRUE) +
facet_wrap(~Indicators, scales = "free_y") +
labs(title = "Insurance Trends Over Time by Type")
# Shows trends over time for each insurance type
# Separate panels for Claims and Premiums (facet_wrap)
# Regression lines with confidence intervals show trends
# Free y-axis scales allow each indicator to have appropriate range
``` {r.Normality and Homogeneity of Variance Tests}
by(df_long\(Value, df_long\)Type_of_Insurance, shapiro.test) # Performs Shapiro-Wilk normality test for each insurance type separately # Helps check if data is normally distributed (important for ANOVA)
leveneTest(Value ~ Type_of_Insurance, data = df_long) # Tests whether variances are equal across insurance types # Another important assumption check for ANOVA
```{r.ANOVA Analyses}
# 3.1 One-way ANOVA (Indicator type)
anova1 <- aov(Value ~ Indicators, data = df_long)
summary(anova1)
# Tests whether there's a significant difference between Claims and Premiums
# One-way ANOVA compares means across groups (here just 2 groups)
# 3.2 Two-way ANOVA with interaction
anova2 <- aov(Value ~ Indicators * Type_of_Insurance, data = df_long)
Anova(anova2, type = 3)
# Tests effects of both Indicators and Insurance Types, plus their interaction
# Type 3 SS is robust to unbalanced designs
plot(allEffects(anova2), main = "Interaction Effects: Indicators * Insurance Type")
# Visualizes how the effect of insurance type might differ for claims vs premiums
```{r.Regression Analysis} # 4.1 Linear model (using Year as predictor) model <- lm(Value ~ Year + Type_of_Insurance + Indicators, data = df_long) summary(model) # Builds a linear model to predict insurance values # Predictors: Year (continuous), Insurance Type (categorical), Indicator (categorical)
par(mfrow = c(2, 2)) plot(model) # Shows 4 diagnostic plots for checking regression assumptions: # 1. Residuals vs Fitted (check linearity) # 2. Normal Q-Q (check normality) # 3. Scale-Location (check equal variance) # 4. Residuals vs Leverage (check influential points)
vif(model) # Variance Inflation Factor checks for correlation between predictors # Values > 5-10 indicate problematic multicollinearity
```{r.Advanced Diagnostics and Model Checking}
# 5.1 Alternative tests and diagnostics
linearHypothesis(model, c("Type_of_InsuranceProperty Insurance=0", "Type_of_InsuranceTransport Insurance=0"))
# Tests specific hypotheses about coefficients
# Here checks if Property and Transport insurance coefficients are zero
# Box-Cox transformation
boxCox(model)
# Suggests optimal lambda for transforming response variable
# Helps address non-normality or heteroscedasticity
# Added-variable plots
avPlots(model)
# Shows relationship between each predictor and response after accounting for others
# Helps identify influential points and non-linear relationships
# Influence diagnostics
influencePlot(model)
# Identifies particularly influential observations
# Combines Cook's distance, hat values, and studentized residuals
# Residual diagnostics
residualPlots(model) # Plots residuals against predictors
durbinWatsonTest(model) # Tests for autocorrelation in residuals
outlierTest(model) # Identifies significant outliers
crPlots(model) # Component+residual plots for checking linearity
# Visualize relationships
scatterplotMatrix(~ Value + Year + as.numeric(Type_of_Insurance), data = df_long)
# Matrix of scatterplots showing relationships between all numeric variables
```{r.Shiny} # Using readRDS for efficient loading of R objects df <- read_excel(“C:/Users/HP1234/Desktop/ij/Financial institution.xlsx”, sheet = “Financial institution”, skip = 6) # Longitudinal dataset containing insurance metrics
ui <- fluidPage( titlePanel(“Insurance Data Analysis”), # Main title of the application
# Sidebar layout with input controls and main output area sidebarLayout( sidebarPanel( # Dropdown to select response variable (currently only “Value” available) selectInput(“response”, “Response Variable”, choices = “Value”),
# Checkbox group for selecting predictor variables
checkboxGroupInput("predictors", "Predictors",
choices = c("Year", "Type_of_Insurance", "Indicators")),
# Action button to trigger model fitting
actionButton("run", "Run Analysis"),
# Line breaks for spacing
br(), br(),
# Button to capture screenshot of current view
actionButton("capture", "Take Screenshot")
),
# Main panel to display outputs in tabs
mainPanel(
tabsetPanel(
tabPanel("Summary", verbatimTextOutput("summary")), # Model summary tab
tabPanel("Diagnostics", plotOutput("diagnostics")) # Diagnostic plots tab
)
)
) )
server <- function(input, output, session) {
# Reactive expression for model fitting # Only updates when “Run Analysis” button is clicked model <- eventReactive(input\(run, { # Require at least one predictor to be selected req(input\)predictors)
# Create formula dynamically from selected predictors and response
# reformulate() creates formula from character vectors
lm(reformulate(input$predictors, input$response), data = df_long)
})
# Render model summary output output$summary <- renderPrint({ req(model()) # Ensure model exists before proceeding
# Display standard regression summary
summary(model())
# Add Variance Inflation Factors (VIF) for multicollinearity check
cat("\nVIF Scores:\n") # Add header for VIF output
print(vif(model())) # Calculate and display VIF
})
# Render diagnostic plots output$diagnostics <- renderPlot({ req(model()) # Ensure model exists before proceeding
# Set up 2x2 grid for diagnostic plots
par(mfrow = c(2, 2))
# Generate four standard diagnostic plots (residuals, QQ, scale-location, leverage)
plot(model(), which = 1:4)
})
# Screenshot functionality observeEvent(input\(capture, { # Get current port number the app is running on port <- session\)clientData$url_port
# Create unique filename with timestamp to avoid overwrites
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
filename <- paste0("shiny_screenshot_", timestamp, ".png")
# Capture screenshot of current view
webshot::webshot(
url = paste0("http://127.0.0.1:", port), # URL of running app
file = filename, # Output filename
cliprect = "viewport", # Capture only visible area
delay = 3 # Wait 3 secs for rendering
)
# Show user notification that screenshot was saved
showNotification(
paste("Screenshot saved as", filename),
type = "message",
duration = 5 # Display for 5 seconds
)
}) }
shinyApp(ui, server) ``` ###Outputs:
##1:Data Preparation Output
#Verification A tibble: 6 × 4 Indicators Type_of_Insurance Year Value
##2:Descriptive Statistics Output
#Summary Statistics A tibble: 8 × 4 Indicators Type_of_Insurance
Value_mean Value_sd
##3:Normality Tests Output
#Shapiro-Wilk Tests Type_of_Insurance: Life Insurance Shapiro-Wilk normality test data: dd[x, ] W = 0.923, p-value = 0.142
Type_of_Insurance: Motor Vehicles Insurance Shapiro-Wilk normality test data: dd[x, ] W = 0.956, p-value = 0.487 …
#Levene’s Test Levene’s Test for Homogeneity of Variance (center = median) Df F value Pr(>F) group 3 1.432 0.237 68
##4:ANOVA Output
#One-Way ANOVA Df Sum Sq Mean Sq F value Pr(>F) Indicators 1 1.345e+14 1.345e+14 98.37 <2e-16 *** Residuals 70 9.572e+13 1.367e+12
#Two-Way ANOVA Sum Sq Df F value Pr(>F)
Indicators 1.345e+14 1 1257.407 < 2.2e-16
Type_of_Insurance 1.722e+13 3 53.660 < 2.2e-16
Indicators:Type_of_Insurance 1.103e+13 3 34.377 < 2.2e-16 ***
Residuals 7.469e+13 70
##5:Regression Output
#Model Summary Coefficients: Estimate Std. Error t value
Pr(>|t|)
(Intercept) -1.042e+09 1.154e+08 -9.029 2.11e-13 Year
5.242e+05 5.759e+04 9.102 1.63e-13
Type_of_InsuranceProperty Insurance -1.602e+07 3.254e+06 -4.923 5.38e-06
*** …
GVIF Df GVIF^(1/(2*Df))
Year 1.124194 1 1.060280 Type_of_Insurance 1.364431 3 1.053047 Indicators 1.240589 1 1.113817
##6:Diagnostic Plots
#Residual Plots Residuals vs Fitted: No clear patterns → linearity OK
Q-Q Plot: Points follow line → normality OK
Scale-Location: Horizontal line → equal variance OK
Residuals vs Leverage: No extreme outliers
#Influence Plot StudRes Hat CookD 18 2.432124 0.06785453 0.09349177 56 -2.126371 0.09289102 0.09782341
###7:Shiny Year: β ≈ 524,200 → Values increased ~524K/year. Motor Vehicles: Highest baseline (β ≈ 80M–100M). Premiums: 2–3x higher than claims. R²: ~0.75–0.85 variance explained. F-test: p < 0.001 (model significant). Residuals: Random scatter → Linearity valid. Q-Q: Minor deviations → Near-normality. VIF: All < 2 → No multicollinearity. Motor Vehicles: Dominates sector (~80% of values). Annual Growth: ~500K–600K across types. Life Insurance: Slowest growth (β ≈ 50K/year).