Import Packages

The following data is data given to AppRISE to conduct an Exploratory Data Analysis (EDA). For now, we will keep things simple with some descriptive statistics and plots using ggplot2 and plotly.

library(tidyverse)
library(ggplot2)
library(plotly)
library(kableExtra)
library(DT)
library(data.table)
library(dygraphs)
library(latticeExtra)
library(stringr)
library(hrbrthemes)
library(patchwork)
library(tinytex)
library(readr)
VRU_Clean_Data_v2 <- read_csv("~/Desktop/DataAnalysis_AppRISE_Projects/AppRISE/VRUxAppRISE/Datasets/VRU_Clean_Data_v2.csv", 
    col_types = cols(MasterKey = col_skip(), 
        LandfallDate = col_date(format = "%m/%d/%Y"), 
        PortfolioDate = col_date(format = "%m/%d/%Y"), 
        InceptionDate = col_date(format = "%m/%d/%Y"), 
        ExpirationDate = col_date(format = "%m/%d/%Y")))


# Drop cols, blend certain COBs (more info below code)
VRU_df <- VRU_Clean_Data_v2 %>%
  dplyr::select(-c(Peril, PCS, Year)) %>%
  mutate(across('COB', str_replace, 'Condominiums', 'Condominium')) %>%
  mutate(across('COB', str_replace, 'BarsTavernsCocktailLoungesAndNightclubs', 'Bars')) %>%
  mutate(across('COB', str_replace, 'Healthcare_MedicalDental_InclSuppliers|Healthcare_MedicalDental_InclSuppliers_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'Healthcare_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'HotelMotels', 'HotelsMotels')) %>%
  mutate(across('COB', str_replace, 'GasStations_ConvenienceStores_CarWashes_AutoRepair', 'GasStations')) %>%
  mutate(across('COB', str_replace, 'lightmanufacturing', 'LightManufacturing')) %>%
  mutate(across('COB', str_replace, 'Restaurants', 'Restaurant')) %>%
  mutate(across('COB', str_replace, 'Churches_Nonprofit', 'Church'))


VRU_df <- subset(VRU_df, !(COB == 'Pleasemakeaselection')) 

# This dataset has PL (under LOB) removed, 
# masterkey dropped (ContractID is still retained for reference), 
# PCS dropped (Claims designation - If a storm gets big enough it is assigned a code... all we have are named storms or 'Event', so having Event col with PCS is redundant), 
# Year dropped (the year are in all other date-type columns; this removes more redundancy), 
# and a couple of variables and observations were "cleaned-up" prior to import (less lines of code tbh); Removed Peril Variable (All Perils are Hurricanes)

# NOTE ADJUSTMENTS TO COB
# blend Condominium and Condominiums, 
# blend Bars and BarsTavernsCocktailLoungesAndNightclubs
# blend Healthcare and Healthcare_MedicalDental_InclSuppliers AND Healthcare_MedicalDental_InclSuppliers_Service
# blend HotelMotels and HotelsMotels
# blend GasStations and GasStations_ConvenienceStores_CarWashes_AutoRepair
# blend LightManufacturing and lightmanufacturing
# blend Restaurant and Restaurants 
# remove Pleasemakeaselection

View(VRU_df)
# THIS DATA DROPS NML'S (OR AFTER CONVERSIONS, NA's); WE CANNOT QUANTITATIVELY ANALYZE COLUMNS THAT HAVE 'NML'; NML'S WILL MAKE THE COLUMN CHARACTER COLS AND NOT NUMERIC

# USE THIS DATA WHEN WORKING WITH COLUMNS RMSMP OR AIRMP

VRU_NoNML <- read_csv("~/Desktop/DataAnalysis_AppRISE_Projects/AppRISE/VRUxAppRISE/Datasets/VRU_Clean_Data_v2.csv", 
    col_types = cols(MasterKey = col_skip(), 
        Peril = col_skip(), PCS = col_skip(), 
        Year = col_skip(), LandfallDate = col_date(format = "%m/%d/%Y"), 
        PortfolioDate = col_date(format = "%m/%d/%Y"), 
        InceptionDate = col_date(format = "%m/%d/%Y"), 
        ExpirationDate = col_date(format = "%m/%d/%Y"), 
        AIRMP = col_double(), RMSMP = col_double()))

VRU_NoNML <- VRU_NoNML %>%
  mutate(across('COB', str_replace, 'Condominiums', 'Condominium')) %>%
  mutate(across('COB', str_replace, 'BarsTavernsCocktailLoungesAndNightclubs', 'Bars')) %>%
  mutate(across('COB', str_replace, 'Healthcare_MedicalDental_InclSuppliers|Healthcare_MedicalDental_InclSuppliers_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'Healthcare_Service', 'Healthcare')) %>%
  mutate(across('COB', str_replace, 'HotelMotels', 'HotelsMotels')) %>%
  mutate(across('COB', str_replace, 'GasStations_ConvenienceStores_CarWashes_AutoRepair', 'GasStations')) %>%
  mutate(across('COB', str_replace, 'lightmanufacturing', 'LightManufacturing')) %>%
  mutate(across('COB', str_replace, 'Restaurants', 'Restaurant')) %>%
  mutate(across('COB', str_replace, 'Churches_Nonprofit', 'Church'))

VRU_NoNML <- subset(VRU_NoNML, !(COB == 'Pleasemakeaselection')) 

VRU_NoNML <- VRU_NoNML[complete.cases(VRU_NoNML$RMSMP, VRU_NoNML$AIRMP),] #7573 obs (same as spreadsheet filter! There are two less because of the 'Pleasemakeaselection' drop)

# Another messier looking option...
# VRU_NoNML <- VRU_NoNML[complete.cases(VRU_NoNML[, c('AIRMP', 'RMSMP')]),]

View(VRU_NoNML)

View Dataset: filter and export if you’d like

# print the table and add filter; this will allow us view data with a filter and gives us the ability to print the filtered table from the html file.
datatable(VRU_df,
          rownames = FALSE, 
          filter = 'top',
          extensions = 'Buttons',
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          ))

Example Summary Stats

VRU_df %>%
  group_by(Event) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Event") %>%
  kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by Event
Event n x_bar s
Delta 987 12609.76 125139.6
Hanna 568 56515.21 971030.5
Ian 3396 42723.08 299054.4
Ida 2403 165340.15 954165.3
Laura 513 201913.34 1092017.9
Sally 582 269304.06 1424577.9
Zeta 1417 20104.02 220416.1
VRU_df %>%
  group_by(COB) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by COB") %>%
  kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by COB
COB n x_bar s
Apartments 1267 152779.9765 915917.848
ApartmentsOverFourStories 1 27921.8600 NA
ApartmentsWithMercantile 60 45967.5538 119670.098
ArtisanContractors 131 3816.2673 19420.196
AssistedLiving 59 71948.4551 250299.100
Bars 14 5833.0664 13285.295
BedAndBreakfast 15 40035.7267 70764.614
CarWashes_AutoRepair 41 20366.9120 55186.676
Casino 1 0.0000 NA
Church 274 44393.2719 191495.121
Condominium 1028 127624.8579 929114.904
CondominiumWithMercantile 22 6579.9468 24408.935
DryCleaners_LaundryMats 5 24611.5060 55033.000
Education 255 271908.3129 1832664.874
Entertainment 144 27327.6115 182089.362
Entertainment_Auditoriums_Theaters 52 98435.5413 289702.294
GasStations 132 10819.4934 50205.513
GolfCourseBuilding 3 3757.9733 6509.001
Healthcare 555 52689.4884 511566.145
Hospitality 425 251109.9307 1582968.953
HotelsMotels 303 188353.3798 768915.998
HotelsMotelsWithRestaurant 8 121868.7337 230948.374
LightManufacturing 9 625.2689 1875.807
Microbreweries_Wineries_HardCider 3 0.0000 0.000
Municipality 129 120013.4260 681555.515
Office 1572 31422.4020 282314.431
Other 359 196998.8377 918102.691
PublicAndMunicipal 13 82883.4762 164545.169
RealEstate 194 48853.8042 262508.932
Restaurant 881 33977.0223 255936.166
Retail 1116 33840.0572 204845.010
RetailWithRefrigerationEquipment 58 140384.2245 519642.315
Retail_RealEstate 204 130133.0879 593880.725
Schools 29 7599.5428 17452.110
Service 313 12871.2766 58654.809
SingleFamilyDwelling_WithAtLeastTwobldgs 10 0.0000 0.000
Warehouse_SelfStorage 13 28044.1600 87872.578
Warehousing 6 73353.7367 179679.226
Wholesalers 158 8627.1285 38370.855
WholesalersWithFood 1 0.0000 NA
WholesalersWithRefrigerationEquipment 3 231040.2733 348575.611
VRU_df %>%
  group_by(PrimaryState) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Primary State") %>%
  kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by Primary State
PrimaryState n x_bar s
AL 369 155883.6112 1236101.981
AZ 1 12521.8300 NA
CA 150 48129.8811 316975.412
CO 3 0.0000 0.000
CT 2 1320622.6950 1784674.053
DE 5 0.0000 0.000
FL 3985 56692.9435 467209.111
GA 54 5453.8402 36315.221
HI 10 14417.4060 45591.841
IL 4 0.0000 0.000
IN 1 0.0000 NA
KS 2 0.0000 0.000
KY 1 0.0000 NA
LA 3469 143652.8762 900280.625
MA 10 0.0000 0.000
MD 6 31054.4700 76067.606
ME 3 0.0000 0.000
MI 8 98234.8800 276004.570
MN 5 0.0000 0.000
MO 1 0.0000 NA
MS 361 40275.7462 342781.596
NC 17 0.0000 0.000
NJ 16 106059.9200 414806.046
NV 6 378392.7917 923746.286
NY 42 94881.3017 299501.019
OH 1 504160.1800 NA
OR 1 0.0000 NA
PA 6 99190.0150 226261.383
SC 47 60552.6898 222981.551
TN 6 20102.2483 48612.342
TX 1246 45030.4363 704150.531
VA 28 937.7614 4962.167
VRU_df %>%
  group_by(Construction) %>%
  summarise(n    = n(),
            x_bar = mean(VRUInc),
            s    = sd(VRUInc)) %>%
  kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Construction") %>%
  kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by Construction
Construction n x_bar s
Braced_Steelframe 4 1114201.0450 2228402.0900
Fire_Resistive 2 0.0000 0.0000
Frame 9 0.0000 0.0000
Heavy_Timber 3 0.0000 0.0000
Joisted_Masonry 2140 39418.6731 305936.4218
Light_Metal 212 33265.6072 228082.5437
Masonry 2282 66108.6681 444184.1032
Masonry_NonCombustible 15 106.2960 411.6826
Masonry_Veneer 22 1218357.0177 2846707.0214
Mobile_Homes 12 391863.0308 1129148.8200
Precast_Concrete 2 2156.1450 3049.2495
RC_MRF_D 14 539567.8943 1955592.5444
RC_MRF_URM 1 0.0000 NA
RC_SW_w_MRF 3 317764.6000 550384.4320
RC_SW_wo_MRF 17 352869.6094 1245575.4931
RM_SW_w_MRF 1 0.0000 NA
RM_SW_wo_MRF 1 162830.2000 NA
Reinforced_Concrete 863 155256.3776 975399.6492
Reinforced_Masonry 63 147527.4584 588918.2649
SF_w_C_SW 2 2592432.4000 3521597.4394
SF_w_URM 1 0.0000 NA
S_MRF_Dist 12 0.0000 0.0000
S_MRF_Perim 10 255.3320 807.4307
Steel 942 96344.4614 931545.9851
Tilt_Up 31 299.7329 1668.8422
URM_BF 1 26289.5000 NA
URM_BW 5 173961.9520 388990.7502
Unknown 26 630.1988 2310.0361
Wind_Resistive 1 186275.3300 NA
Wood_Frame 3169 108082.0059 849985.8376

Summary Stats: Claims, mean losses, distinct accnts, etc.

# find # of distinct accounts (double check to make sure `ContractID` actually are distinct/separate accounts for VRU)
# find mean losses
# find a way to give each account a 'weight' because some accounts have only 1 claim(?) and going completely off of mean will skew results
claims_df <- VRU_df %>% 
  group_by(PrimaryState) %>%
  summarize(tot_claims = n(),
            num_accnts = n_distinct(ContractID),
            tot_loss = sum(VRUInc)) %>%
  ungroup() %>%
  #summarize(prop_of_loss = tot_claims/sum(tot_claims)) %>%
  mutate(mean_loss = tot_loss/tot_claims,
         frac_claims_p_accnt = tot_claims/num_accnts) %>%
  arrange(desc(tot_loss)) 

claims_df %>%
  kbl(caption = "Claims, Accounts, and Losses by State") %>%
  kable_classic(full_width = T, html_font = "Cambria")
Claims, Accounts, and Losses by State
PrimaryState tot_claims num_accnts tot_loss mean_loss frac_claims_p_accnt
LA 3469 2334 498331827.44 143652.8762 1.486290
FL 3985 3717 225921379.99 56692.9435 1.072101
AL 369 258 57521052.54 155883.6112 1.430233
TX 1246 962 56107923.59 45030.4363 1.295218
MS 361 222 14539544.39 40275.7462 1.626126
CA 150 104 7219482.16 48129.8811 1.442308
NY 42 33 3985014.67 94881.3017 1.272727
SC 47 44 2845976.42 60552.6898 1.068182
CT 2 2 2641245.39 1320622.6950 1.000000
NV 6 3 2270356.75 378392.7917 2.000000
NJ 16 11 1696958.72 106059.9200 1.454546
MI 8 4 785879.04 98234.8800 2.000000
PA 6 5 595140.09 99190.0150 1.200000
OH 1 1 504160.18 504160.1800 1.000000
GA 54 45 294507.37 5453.8402 1.200000
MD 6 6 186326.82 31054.4700 1.000000
HI 10 6 144174.06 14417.4060 1.666667
TN 6 6 120613.49 20102.2483 1.000000
VA 28 28 26257.32 937.7614 1.000000
AZ 1 1 12521.83 12521.8300 1.000000
CO 3 1 0.00 0.0000 3.000000
DE 5 5 0.00 0.0000 1.000000
IL 4 4 0.00 0.0000 1.000000
IN 1 1 0.00 0.0000 1.000000
KS 2 1 0.00 0.0000 2.000000
KY 1 1 0.00 0.0000 1.000000
MA 10 10 0.00 0.0000 1.000000
ME 3 3 0.00 0.0000 1.000000
MN 5 3 0.00 0.0000 1.666667
MO 1 1 0.00 0.0000 1.000000
NC 17 16 0.00 0.0000 1.062500
OR 1 1 0.00 0.0000 1.000000
# use claims_df to weigh mean losses

claims_df2 <- claims_df %>%
  #select(PrimaryState, tot_claims, num_accnts, tot_loss, mean_loss, frac_claims_p_accnt) %>%
  mutate(weight_tot_claims = tot_claims / sum(tot_claims)) #%>%
        #summarize(weighted_mean_loss = weight_tot_claims * mean_loss) #%>%
  #mutate(claim_weight = tot_claims / sum_tot_claims)

options(scipen = 999) # use this to remove scientific notation

claims_df2 %>%
  transmute(PrimaryState, tot_claims, num_accnts, tot_loss, mean_loss, frac_claims_p_accnt, weight_tot_claims, weighted_mean_loss =  weight_tot_claims * mean_loss) %>%
  arrange(desc(weighted_mean_loss)) %>%
  kbl(caption = "Weighted Losses by State") %>%
  kable_classic(full_width = T, html_font = "Cambria")
Weighted Losses by State
PrimaryState tot_claims num_accnts tot_loss mean_loss frac_claims_p_accnt weight_tot_claims weighted_mean_loss
LA 3469 2334 498331827.44 143652.8762 1.486290 0.3516116 50510.016971
FL 3985 3717 225921379.99 56692.9435 1.072101 0.4039124 22898.984390
AL 369 258 57521052.54 155883.6112 1.430233 0.0374012 5830.230341
TX 1246 962 56107923.59 45030.4363 1.295218 0.1262923 5686.998134
MS 361 222 14539544.39 40275.7462 1.626126 0.0365903 1473.702046
CA 150 104 7219482.16 48129.8811 1.442308 0.0152037 731.753716
NY 42 33 3985014.67 94881.3017 1.272727 0.0042570 403.913913
SC 47 44 2845976.42 60552.6898 1.068182 0.0047638 288.463047
CT 2 2 2641245.39 1320622.6950 1.000000 0.0002027 267.711878
NV 6 3 2270356.75 378392.7917 2.000000 0.0006081 230.119273
NJ 16 11 1696958.72 106059.9200 1.454546 0.0016217 172.000681
MI 8 4 785879.04 98234.8800 2.000000 0.0008109 79.655285
PA 6 5 595140.09 99190.0150 1.200000 0.0006081 60.322328
OH 1 1 504160.18 504160.1800 1.000000 0.0001014 51.100768
GA 54 45 294507.37 5453.8402 1.200000 0.0054733 29.850737
MD 6 6 186326.82 31054.4700 1.000000 0.0006081 18.885751
HI 10 6 144174.06 14417.4060 1.666667 0.0010136 14.613223
TN 6 6 120613.49 20102.2483 1.000000 0.0006081 12.225166
VA 28 28 26257.32 937.7614 1.000000 0.0028380 2.661395
AZ 1 1 12521.83 12521.8300 1.000000 0.0001014 1.269190
CO 3 1 0.00 0.0000 3.000000 0.0003041 0.000000
DE 5 5 0.00 0.0000 1.000000 0.0005068 0.000000
IL 4 4 0.00 0.0000 1.000000 0.0004054 0.000000
IN 1 1 0.00 0.0000 1.000000 0.0001014 0.000000
KS 2 1 0.00 0.0000 2.000000 0.0002027 0.000000
KY 1 1 0.00 0.0000 1.000000 0.0001014 0.000000
MA 10 10 0.00 0.0000 1.000000 0.0010136 0.000000
ME 3 3 0.00 0.0000 1.000000 0.0003041 0.000000
MN 5 3 0.00 0.0000 1.666667 0.0005068 0.000000
MO 1 1 0.00 0.0000 1.000000 0.0001014 0.000000
NC 17 16 0.00 0.0000 1.062500 0.0017231 0.000000
OR 1 1 0.00 0.0000 1.000000 0.0001014 0.000000

Timeseries examples

ts_ex1 <- ggplot(VRU_df, aes(x = LandfallDate, y = VRUInc)) +
  geom_line() +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#ts_ex1
ggplotly(ts_ex1)
ts_ex2 <- ggplot(VRU_df, aes(x = InceptionDate, y = VRUInc)) +
  geom_line() +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#ts_ex2
ggplotly(ts_ex2)
Practice: Using InceptionDate and VRUInc, select time frame betw 2021-01-01 to 2021-05-01
ts_ex3 <- ggplot(VRU_df, aes(x = InceptionDate, y = VRUInc)) +
  geom_line(color = "purple") +
  geom_point(color = "black") +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_date(limit = c(as.Date("2021-01-01"), as.Date("2021-05-01")))
#ts_ex3
ggplotly(ts_ex3)

IGNORE THESE GRAPHS: xts, zoo, tidyquant?: for timeseries, try to just use PortfolioDate as an example and drop the other dates.

library(tidyquant)
library(xts)
# Remove other date columns (avoids confusion in TS data)
VRU_ts_data_test <- VRU_df %>%
  select(-c(LandfallDate, InceptionDate, ExpirationDate))

# use `relocate()` function from dplyr to move a date to the front of the dataset
VRU_ts_data_test <-  VRU_ts_data_test %>%
  relocate(PortfolioDate)
# Make your adjusted dataset an xts so we can mess with timeseries stuff
VRUxts_df_test <- as.xts(as.data.table(VRU_ts_data_test))
# These are the locations of the last observation of every year of the data
endpoints(VRUxts_df_test, on = "years")
## [1]    0 4067 6470 9866
# Notice how you didn't specify one value... try this again but only select one or two y-values

dygraph((VRUxts_df_test))
# dygraph with selected values to focus on (using VRU_df) 
test1 <- xts(x = VRU_df$TIV, order.by = VRU_df$PortfolioDate)
dygraph(test1)
test2 <- xts(x = VRU_df$AIRMP, order.by = VRU_df$PortfolioDate)
dygraph(test2)

Correlation

library(GGally)

corr_data_test1 <- VRU_df %>%
  select(Attachment:PolicyLimit, TIV:ModeledPrem, AIREst:RMSMP)

ggcorr(corr_data_test1, 
       low = "steelblue", 
       mid = "white", 
       high = "darkred", 
       size = 2, 
       color = "black", # variable label color
       label = TRUE,
       label_size = 2,
       label_color = "white", # corr label color
       layout.exp = 1) # adds another tile or space to the bottom so "attachment" isn't cut off

Adverse Selection Question: Did any clsses of business (COB) perform worse over time?

# starter/ exampl code
# FIGURE OUT a way to make grids larger... much larger
AdvSelect_facetwrap <- ggplot(VRU_df, aes(PortfolioDate, VRUInc)) +
  geom_line(color = "#330099", size = 1) +
  geom_point(color = "black") +
  labs(title = "Possible Adverse Class Selection", subtitle = "COB performance over time", x = "", y = "VRU Loss") + 
  facet_wrap(~ COB)

ggplotly(AdvSelect_facetwrap)
# Businesses (Hospitality: Food and Beverage)
AdvSelect_FoodAndBev <- VRU_df %>%
  filter(COB == c('Bars', 'HotelsMotelsWithRestaurant', 'Microbreweries_Wineries_HardCider', 'Restaurant')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Hospitality: Food and Beverage)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_FoodAndBev)
# Businesses Services (Commercial real estate)
AdvSelect_Dwelling <- VRU_df %>%
  filter(COB == c('Apartments', 'ApartmentsOverFourStories', 'ApartmentsWithMercantile', 'Condominium', 'SingleFamilyDwelling_WithAtLeastTwobldgs')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Commercial Real Estate)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_Dwelling)
# Business Services (Lodging Industry)
AdvSelect_Lodging <- VRU_df %>%
  filter(COB == c('BedAndBreakfast', 'HotelsMotels', 'HotelsMotelsWithRestaurant')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Lodging Industry)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_Lodging)
# Businesses (Hospitality Industry in general)
AdvSelect_HospitalityBus <- VRU_df %>%
  filter(COB == c('Bar', 'BedAndBreakfast', 'Casino',  'Entertainment', 'Entertainment_Auditoriums_Theaters', 'Hospitality', 'HotelsMotels',  'HotelsMotelsWithRestaurant', 'Microbreweries_Wineries_HardCider', 'Restaurant', 'Service')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Hospitality Industry)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_HospitalityBus)
# Businesses: Church and Schools
AdvSelect_ChurchAndSchools <- VRU_df %>%
  filter(COB == c('Church', 'Education', 'Schools')) %>%
  ggplot(aes(LandfallDate, VRUInc)) +
  geom_point(color = "#330099", shape = 21) +
  geom_smooth(method = "lm") +
  theme_ipsum_pub() +
  labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Churchs and Schools)') +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold')) 

ggplotly(AdvSelect_ChurchAndSchools)

INVESTIGATE MODEL MISS STUFFF… PS I LOVE YOU. YOU’RE DOING GREAT.

# create a dual y-axis comparison of AIRMP and RMSMP over time (use VRU_NoNML dataset)
# usually dual y-axis comparisons should be avoided, but seeing that both variables have the same scale (0-1), we think it is appropriate.

AIRmiss <- VRU_NoNML %>%
  ggplot(aes(x = LandfallDate, y = AIRMP)) +
  geom_point(color = "skyblue") +
  #geom_jitter() +
  theme_bw()
  
RMSmiss <- VRU_NoNML %>%
  ggplot(aes(x = LandfallDate, y = RMSMP)) +
  geom_point(color = "blue") +
  #geom_jitter() +
  theme_bw()

ggplotly(AIRmiss)
ggplotly(RMSmiss)

NOTES:

1) find fixes for things labeled NA (maybe use is.na = TRUE) in summary stats; ANSWER the NA values in standard devation are those that only have 1 claim.

\(s=\sqrt{\frac{\sum_{i=1}^N(x_i-\bar{x})^2}{N-1}}\)

2) fix COB entries (are condominiums and condominium the same thing? Restaurant and Restaurants) and naming conventions in general (use dplyr functions to clean up, get rid of commas, slashes, etc.)

3) use mutate() to remove scientifc notation and make everything “in millions”

References: