VRU EDA v2

The following data is data given to AppRISE to conduct an Exploratory Data Analysis (EDA). For now, we will keep things simple with some descriptive statistics and plots using `ggplot2` and `plotly`.

library(tidyverse)
library(ggplot2)
library(plotly)
library(kableExtra)
library(DT)
library(data.table)
library(dygraphs)
library(latticeExtra)
library(stringr)
library(hrbrthemes)
library(patchwork)
library(tinytex)

library(readr)
VRU_Clean_Data_v2 <- read_csv("~/Desktop/DataAnalysis_AppRISE_Projects/AppRISE/VRUxAppRISE/Datasets/VRU_Clean_Data_v2.csv", 
    col_types = cols(MasterKey = col_skip(), 
        LandfallDate = col_date(format = "%m/%d/%Y"), 
        PortfolioDate = col_date(format = "%m/%d/%Y"), 
        InceptionDate = col_date(format = "%m/%d/%Y"), 
        ExpirationDate = col_date(format = "%m/%d/%Y")))


# Drop cols, blend certain COBs (more info below code)
VRU_df <- VRU_Clean_Data_v2 %>%
  dplyr::select(-c(Peril, PCS, Year)) %>%
  mutate(across('COB', str_replace, 'Condominiums', 'Condominium')) %>%
  mutate(across('COB', str_replace, 'BarsTavernsCocktailLoungesAndNightclubs', 'Bars')) %>%
  mutate(across('COB', str_replace, 'Healthcare_MedicalDental_InclSuppliers|Healthcare_MedicalDental_InclSuppliers_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'Healthcare_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'HotelMotels', 'HotelsMotels')) %>%
  mutate(across('COB', str_replace, 'GasStations_ConvenienceStores_CarWashes_AutoRepair', 'GasStations')) %>%
  mutate(across('COB', str_replace, 'lightmanufacturing', 'LightManufacturing')) %>%
  mutate(across('COB', str_replace, 'Restaurants', 'Restaurant')) %>%
  mutate(across('COB', str_replace, 'Churches_Nonprofit', 'Church'))


VRU_df <- subset(VRU_df, !(COB == 'Pleasemakeaselection')) 

# This dataset has PL (under LOB) removed, 
# masterkey dropped (ContractID is still retained for reference), 
# PCS dropped (Claims designation - If a storm gets big enough it is assigned a code... all we have are named storms or 'Event', so having Event col with PCS is redundant), 
# Year dropped (the year are in all other date-type columns; this removes more redundancy), 
# and a couple of variables and observations were "cleaned-up" prior to import (less lines of code tbh); Removed Peril Variable (All Perils are Hurricanes)

# NOTE ADJUSTMENTS TO COB
# blend Condominium and Condominiums, 
# blend Bars and BarsTavernsCocktailLoungesAndNightclubs
# blend Healthcare and Healthcare_MedicalDental_InclSuppliers AND Healthcare_MedicalDental_InclSuppliers_Service
# blend HotelMotels and HotelsMotels
# blend GasStations and GasStations_ConvenienceStores_CarWashes_AutoRepair
# blend LightManufacturing and lightmanufacturing
# blend Restaurant and Restaurants 
# remove Pleasemakeaselection

View(VRU_df)

# THIS DATA DROPS NML'S (OR AFTER CONVERSIONS, NA's); WE CANNOT QUANTITATIVELY ANALYZE COLUMNS THAT HAVE 'NML'; NML'S WILL MAKE THE COLUMN CHARACTER COLS AND NOT NUMERIC

# USE THIS DATA WHEN WORKING WITH COLUMNS RMSMP OR AIRMP

VRU_NoNML <- read_csv("~/Desktop/DataAnalysis_AppRISE_Projects/AppRISE/VRUxAppRISE/Datasets/VRU_Clean_Data_v2.csv", 
    col_types = cols(MasterKey = col_skip(), 
        Peril = col_skip(), PCS = col_skip(), 
        Year = col_skip(), LandfallDate = col_date(format = "%m/%d/%Y"), 
        PortfolioDate = col_date(format = "%m/%d/%Y"), 
        InceptionDate = col_date(format = "%m/%d/%Y"), 
        ExpirationDate = col_date(format = "%m/%d/%Y"), 
        AIRMP = col_double(), RMSMP = col_double()))

VRU_NoNML <- VRU_NoNML %>%
  mutate(across('COB', str_replace, 'Condominiums', 'Condominium')) %>%
  mutate(across('COB', str_replace, 'BarsTavernsCocktailLoungesAndNightclubs', 'Bars')) %>%
  mutate(across('COB', str_replace, 'Healthcare_MedicalDental_InclSuppliers|Healthcare_MedicalDental_InclSuppliers_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'Healthcare_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'HotelMotels', 'HotelsMotels')) %>%
  mutate(across('COB', str_replace, 'GasStations_ConvenienceStores_CarWashes_AutoRepair', 'GasStations')) %>%
  mutate(across('COB', str_replace, 'lightmanufacturing', 'LightManufacturing')) %>%
  mutate(across('COB', str_replace, 'Restaurants', 'Restaurant')) %>%
  mutate(across('COB', str_replace, 'Churches_Nonprofit', 'Church'))

VRU_NoNML <- subset(VRU_NoNML, !(COB == 'Pleasemakeaselection')) 

VRU_NoNML <- VRU_NoNML[complete.cases(VRU_NoNML$RMSMP, VRU_NoNML$AIRMP),] #7573 obs (same as spreadsheet filter! There are two less because of the 'Pleasemakeaselection' drop)

# Another messier looking option...
# VRU_NoNML <- VRU_NoNML[complete.cases(VRU_NoNML[, c('AIRMP', 'RMSMP')]),]

View(VRU_NoNML)

View Dataset: filter and export if you’d like

# print the table and add filter; this will allow us view data with a filter and gives us the ability to print the filtered table from the html file.
datatable(VRU_df,
          rownames = FALSE, 
          filter = 'top',
          extensions = 'Buttons',
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          ))

Example Summary Stats

VRU_df %>%
  group_by(Event) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Event") %>%
  kable_classic(full_width = T, html_font = "Cambria")

Summary Statistics: VRU Actual Loss Sustained by Event
Event	n	x_bar	s
Delta	987	12609.76	125139.6
Hanna	568	56515.21	971030.5
Ian	3396	42723.08	299054.4
Ida	2403	165340.15	954165.3
Laura	513	201913.34	1092017.9
Sally	582	269304.06	1424577.9
Zeta	1417	20104.02	220416.1

VRU_df %>%
  group_by(COB) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by COB") %>%
  kable_classic(full_width = T, html_font = "Cambria")

Summary Statistics: VRU Actual Loss Sustained by COB
COB	n	x_bar	s
Apartments	1267	152779.9765	915917.848
ApartmentsOverFourStories	1	27921.8600	NA
ApartmentsWithMercantile	60	45967.5538	119670.098
ArtisanContractors	131	3816.2673	19420.196
AssistedLiving	59	71948.4551	250299.100
Bars	14	5833.0664	13285.295
BedAndBreakfast	15	40035.7267	70764.614
CarWashes_AutoRepair	41	20366.9120	55186.676
Casino	1	0.0000	NA
Church	274	44393.2719	191495.121
Condominium	1028	127624.8579	929114.904
CondominiumWithMercantile	22	6579.9468	24408.935
DryCleaners_LaundryMats	5	24611.5060	55033.000
Education	255	271908.3129	1832664.874
Entertainment	144	27327.6115	182089.362
Entertainment_Auditoriums_Theaters	52	98435.5413	289702.294
GasStations	132	10819.4934	50205.513
GolfCourseBuilding	3	3757.9733	6509.001
Healthcare	555	52689.4884	511566.145
Hospitality	425	251109.9307	1582968.953
HotelsMotels	303	188353.3798	768915.998
HotelsMotelsWithRestaurant	8	121868.7337	230948.374
LightManufacturing	9	625.2689	1875.807
Microbreweries_Wineries_HardCider	3	0.0000	0.000
Municipality	129	120013.4260	681555.515
Office	1572	31422.4020	282314.431
Other	359	196998.8377	918102.691
PublicAndMunicipal	13	82883.4762	164545.169
RealEstate	194	48853.8042	262508.932
Restaurant	881	33977.0223	255936.166
Retail	1116	33840.0572	204845.010
RetailWithRefrigerationEquipment	58	140384.2245	519642.315
Retail_RealEstate	204	130133.0879	593880.725
Schools	29	7599.5428	17452.110
Service	313	12871.2766	58654.809
SingleFamilyDwelling_WithAtLeastTwobldgs	10	0.0000	0.000
Warehouse_SelfStorage	13	28044.1600	87872.578
Warehousing	6	73353.7367	179679.226
Wholesalers	158	8627.1285	38370.855
WholesalersWithFood	1	0.0000	NA
WholesalersWithRefrigerationEquipment	3	231040.2733	348575.611

VRU_df %>%
  group_by(PrimaryState) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Primary State") %>%
  kable_classic(full_width = T, html_font = "Cambria")

Summary Statistics: VRU Actual Loss Sustained by Primary State
PrimaryState	n	x_bar	s
AL	369	155883.6112	1236101.981
AZ	1	12521.8300	NA
CA	150	48129.8811	316975.412
CO	3	0.0000	0.000
CT	2	1320622.6950	1784674.053
DE	5	0.0000	0.000
FL	3985	56692.9435	467209.111
GA	54	5453.8402	36315.221
HI	10	14417.4060	45591.841
IL	4	0.0000	0.000
IN	1	0.0000	NA
KS	2	0.0000	0.000
KY	1	0.0000	NA
LA	3469	143652.8762	900280.625
MA	10	0.0000	0.000
MD	6	31054.4700	76067.606
ME	3	0.0000	0.000
MI	8	98234.8800	276004.570
MN	5	0.0000	0.000
MO	1	0.0000	NA
MS	361	40275.7462	342781.596
NC	17	0.0000	0.000
NJ	16	106059.9200	414806.046
NV	6	378392.7917	923746.286
NY	42	94881.3017	299501.019
OH	1	504160.1800	NA
OR	1	0.0000	NA
PA	6	99190.0150	226261.383
SC	47	60552.6898	222981.551
TN	6	20102.2483	48612.342
TX	1246	45030.4363	704150.531
VA	28	937.7614	4962.167

VRU_df %>%
  group_by(Construction) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Construction") %>%
  kable_classic(full_width = T, html_font = "Cambria")

Summary Statistics: VRU Actual Loss Sustained by Construction
Construction	n	x_bar	s
Braced_Steelframe	4	1114201.0450	2228402.0900
Fire_Resistive	2	0.0000	0.0000
Frame	9	0.0000	0.0000
Heavy_Timber	3	0.0000	0.0000
Joisted_Masonry	2140	39418.6731	305936.4218
Light_Metal	212	33265.6072	228082.5437
Masonry	2282	66108.6681	444184.1032
Masonry_NonCombustible	15	106.2960	411.6826
Masonry_Veneer	22	1218357.0177	2846707.0214
Mobile_Homes	12	391863.0308	1129148.8200
Precast_Concrete	2	2156.1450	3049.2495
RC_MRF_D	14	539567.8943	1955592.5444
RC_MRF_URM	1	0.0000	NA
RC_SW_w_MRF	3	317764.6000	550384.4320
RC_SW_wo_MRF	17	352869.6094	1245575.4931
RM_SW_w_MRF	1	0.0000	NA
RM_SW_wo_MRF	1	162830.2000	NA
Reinforced_Concrete	863	155256.3776	975399.6492
Reinforced_Masonry	63	147527.4584	588918.2649
SF_w_C_SW	2	2592432.4000	3521597.4394
SF_w_URM	1	0.0000	NA
S_MRF_Dist	12	0.0000	0.0000
S_MRF_Perim	10	255.3320	807.4307
Steel	942	96344.4614	931545.9851
Tilt_Up	31	299.7329	1668.8422
URM_BF	1	26289.5000	NA
URM_BW	5	173961.9520	388990.7502
Unknown	26	630.1988	2310.0361
Wind_Resistive	1	186275.3300	NA
Wood_Frame	3169	108082.0059	849985.8376

Summary Stats: Claims, mean losses, distinct accnts, etc.

# find # of distinct accounts (double check to make sure `ContractID` actually are distinct/separate accounts for VRU)
# find mean losses
# find a way to give each account a 'weight' because some accounts have only 1 claim(?) and going completely off of mean will skew results
claims_df <- VRU_df %>% 
  group_by(PrimaryState) %>%
  summarize(tot_claims = n(),
            num_accnts = n_distinct(ContractID),
            tot_loss = sum(VRUInc)) %>%
  ungroup() %>%
  #summarize(prop_of_loss = tot_claims/sum(tot_claims)) %>%
  mutate(mean_loss = tot_loss/tot_claims,
         frac_claims_p_accnt = tot_claims/num_accnts) %>%
  arrange(desc(tot_loss)) 

claims_df %>%
  kbl(caption = "Claims, Accounts, and Losses by State") %>%
  kable_classic(full_width = T, html_font = "Cambria")

Claims, Accounts, and Losses by State
PrimaryState	tot_claims	num_accnts	tot_loss	mean_loss	frac_claims_p_accnt
LA	3469	2334	498331827.44	143652.8762	1.486290
FL	3985	3717	225921379.99	56692.9435	1.072101
AL	369	258	57521052.54	155883.6112	1.430233
TX	1246	962	56107923.59	45030.4363	1.295218
MS	361	222	14539544.39	40275.7462	1.626126
CA	150	104	7219482.16	48129.8811	1.442308
NY	42	33	3985014.67	94881.3017	1.272727
SC	47	44	2845976.42	60552.6898	1.068182
CT	2	2	2641245.39	1320622.6950	1.000000
NV	6	3	2270356.75	378392.7917	2.000000
NJ	16	11	1696958.72	106059.9200	1.454546
MI	8	4	785879.04	98234.8800	2.000000
PA	6	5	595140.09	99190.0150	1.200000
OH	1	1	504160.18	504160.1800	1.000000
GA	54	45	294507.37	5453.8402	1.200000
MD	6	6	186326.82	31054.4700	1.000000
HI	10	6	144174.06	14417.4060	1.666667
TN	6	6	120613.49	20102.2483	1.000000
VA	28	28	26257.32	937.7614	1.000000
AZ	1	1	12521.83	12521.8300	1.000000
CO	3	1	0.00	0.0000	3.000000
DE	5	5	0.00	0.0000	1.000000
IL	4	4	0.00	0.0000	1.000000
IN	1	1	0.00	0.0000	1.000000
KS	2	1	0.00	0.0000	2.000000
KY	1	1	0.00	0.0000	1.000000
MA	10	10	0.00	0.0000	1.000000
ME	3	3	0.00	0.0000	1.000000
MN	5	3	0.00	0.0000	1.666667
MO	1	1	0.00	0.0000	1.000000
NC	17	16	0.00	0.0000	1.062500
OR	1	1	0.00	0.0000	1.000000

# use claims_df to weigh mean losses

claims_df2 <- claims_df %>%
  #select(PrimaryState, tot_claims, num_accnts, tot_loss, mean_loss, frac_claims_p_accnt) %>%
  mutate(weight_tot_claims = tot_claims / sum(tot_claims)) #%>%
        #summarize(weighted_mean_loss = weight_tot_claims * mean_loss) #%>%
  #mutate(claim_weight = tot_claims / sum_tot_claims)

options(scipen = 999) # use this to remove scientific notation

claims_df2 %>%
  transmute(PrimaryState, tot_claims, num_accnts, tot_loss, mean_loss, frac_claims_p_accnt, weight_tot_claims, weighted_mean_loss =  weight_tot_claims * mean_loss) %>%
  arrange(desc(weighted_mean_loss)) %>%
  kbl(caption = "Weighted Losses by State") %>%
  kable_classic(full_width = T, html_font = "Cambria")

Weighted Losses by State
PrimaryState	tot_claims	num_accnts	tot_loss	mean_loss	frac_claims_p_accnt	weight_tot_claims	weighted_mean_loss
LA	3469	2334	498331827.44	143652.8762	1.486290	0.3516116	50510.016971
FL	3985	3717	225921379.99	56692.9435	1.072101	0.4039124	22898.984390
AL	369	258	57521052.54	155883.6112	1.430233	0.0374012	5830.230341
TX	1246	962	56107923.59	45030.4363	1.295218	0.1262923	5686.998134
MS	361	222	14539544.39	40275.7462	1.626126	0.0365903	1473.702046
CA	150	104	7219482.16	48129.8811	1.442308	0.0152037	731.753716
NY	42	33	3985014.67	94881.3017	1.272727	0.0042570	403.913913
SC	47	44	2845976.42	60552.6898	1.068182	0.0047638	288.463047
CT	2	2	2641245.39	1320622.6950	1.000000	0.0002027	267.711878
NV	6	3	2270356.75	378392.7917	2.000000	0.0006081	230.119273
NJ	16	11	1696958.72	106059.9200	1.454546	0.0016217	172.000681
MI	8	4	785879.04	98234.8800	2.000000	0.0008109	79.655285
PA	6	5	595140.09	99190.0150	1.200000	0.0006081	60.322328
OH	1	1	504160.18	504160.1800	1.000000	0.0001014	51.100768
GA	54	45	294507.37	5453.8402	1.200000	0.0054733	29.850737
MD	6	6	186326.82	31054.4700	1.000000	0.0006081	18.885751
HI	10	6	144174.06	14417.4060	1.666667	0.0010136	14.613223
TN	6	6	120613.49	20102.2483	1.000000	0.0006081	12.225166
VA	28	28	26257.32	937.7614	1.000000	0.0028380	2.661395
AZ	1	1	12521.83	12521.8300	1.000000	0.0001014	1.269190
CO	3	1	0.00	0.0000	3.000000	0.0003041	0.000000
DE	5	5	0.00	0.0000	1.000000	0.0005068	0.000000
IL	4	4	0.00	0.0000	1.000000	0.0004054	0.000000
IN	1	1	0.00	0.0000	1.000000	0.0001014	0.000000
KS	2	1	0.00	0.0000	2.000000	0.0002027	0.000000
KY	1	1	0.00	0.0000	1.000000	0.0001014	0.000000
MA	10	10	0.00	0.0000	1.000000	0.0010136	0.000000
ME	3	3	0.00	0.0000	1.000000	0.0003041	0.000000
MN	5	3	0.00	0.0000	1.666667	0.0005068	0.000000
MO	1	1	0.00	0.0000	1.000000	0.0001014	0.000000
NC	17	16	0.00	0.0000	1.062500	0.0017231	0.000000
OR	1	1	0.00	0.0000	1.000000	0.0001014	0.000000

Timeseries examples

ts_ex1 <- ggplot(VRU_df, aes(x = LandfallDate, y = VRUInc)) +
  geom_line() +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#ts_ex1
ggplotly(ts_ex1)

ts_ex2 <- ggplot(VRU_df, aes(x = InceptionDate, y = VRUInc)) +
  geom_line() +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#ts_ex2
ggplotly(ts_ex2)

Practice: Using InceptionDate and VRUInc, select time frame betw 2021-01-01 to 2021-05-01

ts_ex3 <- ggplot(VRU_df, aes(x = InceptionDate, y = VRUInc)) +
  geom_line(color = "purple") +
  geom_point(color = "black") +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_date(limit = c(as.Date("2021-01-01"), as.Date("2021-05-01")))
#ts_ex3
ggplotly(ts_ex3)

IGNORE THESE GRAPHS: xts, zoo, tidyquant?: for timeseries, try to just use PortfolioDate as an example and drop the other dates.

library(tidyquant)
library(xts)

# Remove other date columns (avoids confusion in TS data)
VRU_ts_data_test <- VRU_df %>%
  select(-c(LandfallDate, InceptionDate, ExpirationDate))

# use `relocate()` function from dplyr to move a date to the front of the dataset
VRU_ts_data_test <-  VRU_ts_data_test %>%
  relocate(PortfolioDate)

# Make your adjusted dataset an xts so we can mess with timeseries stuff
VRUxts_df_test <- as.xts(as.data.table(VRU_ts_data_test))

# These are the locations of the last observation of every year of the data
endpoints(VRUxts_df_test, on = "years")

## [1]    0 4067 6470 9866

# Notice how you didn't specify one value... try this again but only select one or two y-values

dygraph((VRUxts_df_test))

# dygraph with selected values to focus on (using VRU_df) 
test1 <- xts(x = VRU_df$TIV, order.by = VRU_df$PortfolioDate)
dygraph(test1)

test2 <- xts(x = VRU_df$AIRMP, order.by = VRU_df$PortfolioDate)
dygraph(test2)

Correlation

library(GGally)

corr_data_test1 <- VRU_df %>%
  select(Attachment:PolicyLimit, TIV:ModeledPrem, AIREst:RMSMP)

ggcorr(corr_data_test1, 
       low = "steelblue", 
       mid = "white", 
       high = "darkred", 
       size = 2, 
       color = "black", # variable label color
       label = TRUE,
       label_size = 2,
       label_color = "white", # corr label color
       layout.exp = 1) # adds another tile or space to the bottom so "attachment" isn't cut off

Adverse Selection Question: Did any clsses of business (COB) perform worse over time?

# starter/ exampl code
# FIGURE OUT a way to make grids larger... much larger
AdvSelect_facetwrap <- ggplot(VRU_df, aes(PortfolioDate, VRUInc)) +
  geom_line(color = "#330099", size = 1) +
  geom_point(color = "black") +
  labs(title = "Possible Adverse Class Selection", subtitle = "COB performance over time", x = "", y = "VRU Loss") + 
  facet_wrap(~ COB)

ggplotly(AdvSelect_facetwrap)

# Businesses (Hospitality: Food and Beverage)
AdvSelect_FoodAndBev <- VRU_df %>%
  filter(COB == c('Bars', 'HotelsMotelsWithRestaurant', 'Microbreweries_Wineries_HardCider', 'Restaurant')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Hospitality: Food and Beverage)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_FoodAndBev)

# Businesses Services (Commercial real estate)
AdvSelect_Dwelling <- VRU_df %>%
  filter(COB == c('Apartments', 'ApartmentsOverFourStories', 'ApartmentsWithMercantile', 'Condominium', 'SingleFamilyDwelling_WithAtLeastTwobldgs')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Commercial Real Estate)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_Dwelling)

# Business Services (Lodging Industry)
AdvSelect_Lodging <- VRU_df %>%
  filter(COB == c('BedAndBreakfast', 'HotelsMotels', 'HotelsMotelsWithRestaurant')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Lodging Industry)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_Lodging)

# Businesses (Hospitality Industry in general)
AdvSelect_HospitalityBus <- VRU_df %>%
  filter(COB == c('Bar', 'BedAndBreakfast', 'Casino',  'Entertainment', 'Entertainment_Auditoriums_Theaters', 'Hospitality', 'HotelsMotels',  'HotelsMotelsWithRestaurant', 'Microbreweries_Wineries_HardCider', 'Restaurant', 'Service')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Hospitality Industry)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_HospitalityBus)

# Businesses: Church and Schools
AdvSelect_ChurchAndSchools <- VRU_df %>%
  filter(COB == c('Church', 'Education', 'Schools')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Churchs and Schools)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold')) 

ggplotly(AdvSelect_ChurchAndSchools)

INVESTIGATE MODEL MISS STUFFF… PS I LOVE YOU. YOU’RE DOING GREAT.

# create a dual y-axis comparison of AIRMP and RMSMP over time (use VRU_NoNML dataset)
# usually dual y-axis comparisons should be avoided, but seeing that both variables have the same scale (0-1), we think it is appropriate.

AIRmiss <- VRU_NoNML %>%
  ggplot(aes(x = LandfallDate, y = AIRMP)) +
  geom_point(color = "skyblue") +
  #geom_jitter() +
  theme_bw()
  
RMSmiss <- VRU_NoNML %>%
  ggplot(aes(x = LandfallDate, y = RMSMP)) +
  geom_point(color = "blue") +
  #geom_jitter() +
  theme_bw()

ggplotly(AIRmiss)

ggplotly(RMSmiss)

VRU EDA v2

Omar Gonzalez

Wednesday, September 27, 2023 @ 08:44 AM

Import Packages

The following data is data given to AppRISE to conduct an Exploratory Data Analysis (EDA). For now, we will keep things simple with some descriptive statistics and plots using `ggplot2` and `plotly`.

View Dataset: filter and export if you’d like

Example Summary Stats

Summary Stats: Claims, mean losses, distinct accnts, etc.

Timeseries examples

Practice: Using InceptionDate and VRUInc, select time frame betw 2021-01-01 to 2021-05-01

IGNORE THESE GRAPHS: xts, zoo, tidyquant?: for timeseries, try to just use PortfolioDate as an example and drop the other dates.

Correlation

Adverse Selection Question: Did any clsses of business (COB) perform worse over time?

INVESTIGATE MODEL MISS STUFFF… PS I LOVE YOU. YOU’RE DOING GREAT.

NOTES:

1) find fixes for things labeled `NA` (maybe use `is.na = TRUE`) in summary stats; ANSWER the NA values in standard devation are those that only have 1 claim.

2) fix `COB` entries (are condominiums and condominium the same thing? Restaurant and Restaurants) and naming conventions in general (use `dplyr` functions to clean up, get rid of commas, slashes, etc.)

3) use `mutate()` to remove scientifc notation and make everything “in millions”

References:

IBHS research center

DT Package

KableExtra Package Rmarkdown CookBook OR Create Awesome HTML Table with knitr::kable and kableExtra

The R Graph Gallery

VRU EDA v2

Omar Gonzalez

Wednesday, September 27, 2023 @ 08:44 AM

Import Packages

The following data is data given to AppRISE to conduct an Exploratory Data Analysis (EDA). For now, we will keep things simple with some descriptive statistics and plots using ggplot2 and plotly.

View Dataset: filter and export if you’d like

Example Summary Stats

Summary Stats: Claims, mean losses, distinct accnts, etc.

Timeseries examples

Practice: Using InceptionDate and VRUInc, select time frame betw 2021-01-01 to 2021-05-01

IGNORE THESE GRAPHS: xts, zoo, tidyquant?: for timeseries, try to just use PortfolioDate as an example and drop the other dates.

Correlation

Adverse Selection Question: Did any clsses of business (COB) perform worse over time?

INVESTIGATE MODEL MISS STUFFF… PS I LOVE YOU. YOU’RE DOING GREAT.

NOTES:

1) find fixes for things labeled NA (maybe use is.na = TRUE) in summary stats; ANSWER the NA values in standard devation are those that only have 1 claim.

2) fix COB entries (are condominiums and condominium the same thing? Restaurant and Restaurants) and naming conventions in general (use dplyr functions to clean up, get rid of commas, slashes, etc.)

3) use mutate() to remove scientifc notation and make everything “in millions”

References:

IBHS research center

DT Package

KableExtra Package Rmarkdown CookBook OR Create Awesome HTML Table with knitr::kable and kableExtra

The R Graph Gallery

The following data is data given to AppRISE to conduct an Exploratory Data Analysis (EDA). For now, we will keep things simple with some descriptive statistics and plots using `ggplot2` and `plotly`.

1) find fixes for things labeled `NA` (maybe use `is.na = TRUE`) in summary stats; ANSWER the NA values in standard devation are those that only have 1 claim.

2) fix `COB` entries (are condominiums and condominium the same thing? Restaurant and Restaurants) and naming conventions in general (use `dplyr` functions to clean up, get rid of commas, slashes, etc.)

3) use `mutate()` to remove scientifc notation and make everything “in millions”