Import Packages
The following data is data given to AppRISE to conduct an
Exploratory Data Analysis (EDA). For now, we will keep things simple
with some descriptive statistics and plots using ggplot2
and plotly.
library(tidyverse)
library(ggplot2)
library(plotly)
library(kableExtra)
library(DT)
library(data.table)
library(dygraphs)
library(latticeExtra)
library(stringr)
library(hrbrthemes)
library(patchwork)
library(tinytex)
library(readr)
VRU_Clean_Data_v2 <- read_csv("~/Desktop/DataAnalysis_AppRISE_Projects/AppRISE/VRUxAppRISE/Datasets/VRU_Clean_Data_v2.csv",
col_types = cols(MasterKey = col_skip(),
LandfallDate = col_date(format = "%m/%d/%Y"),
PortfolioDate = col_date(format = "%m/%d/%Y"),
InceptionDate = col_date(format = "%m/%d/%Y"),
ExpirationDate = col_date(format = "%m/%d/%Y")))
# Drop cols, blend certain COBs (more info below code)
VRU_df <- VRU_Clean_Data_v2 %>%
dplyr::select(-c(Peril, PCS, Year)) %>%
mutate(across('COB', str_replace, 'Condominiums', 'Condominium')) %>%
mutate(across('COB', str_replace, 'BarsTavernsCocktailLoungesAndNightclubs', 'Bars')) %>%
mutate(across('COB', str_replace, 'Healthcare_MedicalDental_InclSuppliers|Healthcare_MedicalDental_InclSuppliers_Service', 'Healthcare')) %>%
mutate(across('COB', str_replace, 'Healthcare_Service', 'Healthcare')) %>%
mutate(across('COB', str_replace, 'HotelMotels', 'HotelsMotels')) %>%
mutate(across('COB', str_replace, 'GasStations_ConvenienceStores_CarWashes_AutoRepair', 'GasStations')) %>%
mutate(across('COB', str_replace, 'lightmanufacturing', 'LightManufacturing')) %>%
mutate(across('COB', str_replace, 'Restaurants', 'Restaurant')) %>%
mutate(across('COB', str_replace, 'Churches_Nonprofit', 'Church'))
VRU_df <- subset(VRU_df, !(COB == 'Pleasemakeaselection'))
# This dataset has PL (under LOB) removed,
# masterkey dropped (ContractID is still retained for reference),
# PCS dropped (Claims designation - If a storm gets big enough it is assigned a code... all we have are named storms or 'Event', so having Event col with PCS is redundant),
# Year dropped (the year are in all other date-type columns; this removes more redundancy),
# and a couple of variables and observations were "cleaned-up" prior to import (less lines of code tbh); Removed Peril Variable (All Perils are Hurricanes)
# NOTE ADJUSTMENTS TO COB
# blend Condominium and Condominiums,
# blend Bars and BarsTavernsCocktailLoungesAndNightclubs
# blend Healthcare and Healthcare_MedicalDental_InclSuppliers AND Healthcare_MedicalDental_InclSuppliers_Service
# blend HotelMotels and HotelsMotels
# blend GasStations and GasStations_ConvenienceStores_CarWashes_AutoRepair
# blend LightManufacturing and lightmanufacturing
# blend Restaurant and Restaurants
# remove Pleasemakeaselection
View(VRU_df)
# THIS DATA DROPS NML'S (OR AFTER CONVERSIONS, NA's); WE CANNOT QUANTITATIVELY ANALYZE COLUMNS THAT HAVE 'NML'; NML'S WILL MAKE THE COLUMN CHARACTER COLS AND NOT NUMERIC
# USE THIS DATA WHEN WORKING WITH COLUMNS RMSMP OR AIRMP
VRU_NoNML <- read_csv("~/Desktop/DataAnalysis_AppRISE_Projects/AppRISE/VRUxAppRISE/Datasets/VRU_Clean_Data_v2.csv",
col_types = cols(MasterKey = col_skip(),
Peril = col_skip(), PCS = col_skip(),
Year = col_skip(), LandfallDate = col_date(format = "%m/%d/%Y"),
PortfolioDate = col_date(format = "%m/%d/%Y"),
InceptionDate = col_date(format = "%m/%d/%Y"),
ExpirationDate = col_date(format = "%m/%d/%Y"),
AIRMP = col_double(), RMSMP = col_double()))
VRU_NoNML <- VRU_NoNML %>%
mutate(across('COB', str_replace, 'Condominiums', 'Condominium')) %>%
mutate(across('COB', str_replace, 'BarsTavernsCocktailLoungesAndNightclubs', 'Bars')) %>%
mutate(across('COB', str_replace, 'Healthcare_MedicalDental_InclSuppliers|Healthcare_MedicalDental_InclSuppliers_Service', 'Healthcare')) %>%
mutate(across('COB', str_replace, 'Healthcare_Service', 'Healthcare')) %>%
mutate(across('COB', str_replace, 'HotelMotels', 'HotelsMotels')) %>%
mutate(across('COB', str_replace, 'GasStations_ConvenienceStores_CarWashes_AutoRepair', 'GasStations')) %>%
mutate(across('COB', str_replace, 'lightmanufacturing', 'LightManufacturing')) %>%
mutate(across('COB', str_replace, 'Restaurants', 'Restaurant')) %>%
mutate(across('COB', str_replace, 'Churches_Nonprofit', 'Church'))
VRU_NoNML <- subset(VRU_NoNML, !(COB == 'Pleasemakeaselection'))
VRU_NoNML <- VRU_NoNML[complete.cases(VRU_NoNML$RMSMP, VRU_NoNML$AIRMP),] #7573 obs (same as spreadsheet filter! There are two less because of the 'Pleasemakeaselection' drop)
# Another messier looking option...
# VRU_NoNML <- VRU_NoNML[complete.cases(VRU_NoNML[, c('AIRMP', 'RMSMP')]),]
View(VRU_NoNML)
View Dataset: filter and export if you’d like
# print the table and add filter; this will allow us view data with a filter and gives us the ability to print the filtered table from the html file.
datatable(VRU_df,
rownames = FALSE,
filter = 'top',
extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
))
Example Summary Stats
VRU_df %>%
group_by(Event) %>%
summarise(n = n(),
x_bar = mean(VRUInc),
s = sd(VRUInc)) %>%
kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Event") %>%
kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by Event
|
Event
|
n
|
x_bar
|
s
|
|
Delta
|
987
|
12609.76
|
125139.6
|
|
Hanna
|
568
|
56515.21
|
971030.5
|
|
Ian
|
3396
|
42723.08
|
299054.4
|
|
Ida
|
2403
|
165340.15
|
954165.3
|
|
Laura
|
513
|
201913.34
|
1092017.9
|
|
Sally
|
582
|
269304.06
|
1424577.9
|
|
Zeta
|
1417
|
20104.02
|
220416.1
|
VRU_df %>%
group_by(COB) %>%
summarise(n = n(),
x_bar = mean(VRUInc),
s = sd(VRUInc)) %>%
kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by COB") %>%
kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by COB
|
COB
|
n
|
x_bar
|
s
|
|
Apartments
|
1267
|
152779.9765
|
915917.848
|
|
ApartmentsOverFourStories
|
1
|
27921.8600
|
NA
|
|
ApartmentsWithMercantile
|
60
|
45967.5538
|
119670.098
|
|
ArtisanContractors
|
131
|
3816.2673
|
19420.196
|
|
AssistedLiving
|
59
|
71948.4551
|
250299.100
|
|
Bars
|
14
|
5833.0664
|
13285.295
|
|
BedAndBreakfast
|
15
|
40035.7267
|
70764.614
|
|
CarWashes_AutoRepair
|
41
|
20366.9120
|
55186.676
|
|
Casino
|
1
|
0.0000
|
NA
|
|
Church
|
274
|
44393.2719
|
191495.121
|
|
Condominium
|
1028
|
127624.8579
|
929114.904
|
|
CondominiumWithMercantile
|
22
|
6579.9468
|
24408.935
|
|
DryCleaners_LaundryMats
|
5
|
24611.5060
|
55033.000
|
|
Education
|
255
|
271908.3129
|
1832664.874
|
|
Entertainment
|
144
|
27327.6115
|
182089.362
|
|
Entertainment_Auditoriums_Theaters
|
52
|
98435.5413
|
289702.294
|
|
GasStations
|
132
|
10819.4934
|
50205.513
|
|
GolfCourseBuilding
|
3
|
3757.9733
|
6509.001
|
|
Healthcare
|
555
|
52689.4884
|
511566.145
|
|
Hospitality
|
425
|
251109.9307
|
1582968.953
|
|
HotelsMotels
|
303
|
188353.3798
|
768915.998
|
|
HotelsMotelsWithRestaurant
|
8
|
121868.7337
|
230948.374
|
|
LightManufacturing
|
9
|
625.2689
|
1875.807
|
|
Microbreweries_Wineries_HardCider
|
3
|
0.0000
|
0.000
|
|
Municipality
|
129
|
120013.4260
|
681555.515
|
|
Office
|
1572
|
31422.4020
|
282314.431
|
|
Other
|
359
|
196998.8377
|
918102.691
|
|
PublicAndMunicipal
|
13
|
82883.4762
|
164545.169
|
|
RealEstate
|
194
|
48853.8042
|
262508.932
|
|
Restaurant
|
881
|
33977.0223
|
255936.166
|
|
Retail
|
1116
|
33840.0572
|
204845.010
|
|
RetailWithRefrigerationEquipment
|
58
|
140384.2245
|
519642.315
|
|
Retail_RealEstate
|
204
|
130133.0879
|
593880.725
|
|
Schools
|
29
|
7599.5428
|
17452.110
|
|
Service
|
313
|
12871.2766
|
58654.809
|
|
SingleFamilyDwelling_WithAtLeastTwobldgs
|
10
|
0.0000
|
0.000
|
|
Warehouse_SelfStorage
|
13
|
28044.1600
|
87872.578
|
|
Warehousing
|
6
|
73353.7367
|
179679.226
|
|
Wholesalers
|
158
|
8627.1285
|
38370.855
|
|
WholesalersWithFood
|
1
|
0.0000
|
NA
|
|
WholesalersWithRefrigerationEquipment
|
3
|
231040.2733
|
348575.611
|
VRU_df %>%
group_by(PrimaryState) %>%
summarise(n = n(),
x_bar = mean(VRUInc),
s = sd(VRUInc)) %>%
kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Primary State") %>%
kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by Primary State
|
PrimaryState
|
n
|
x_bar
|
s
|
|
AL
|
369
|
155883.6112
|
1236101.981
|
|
AZ
|
1
|
12521.8300
|
NA
|
|
CA
|
150
|
48129.8811
|
316975.412
|
|
CO
|
3
|
0.0000
|
0.000
|
|
CT
|
2
|
1320622.6950
|
1784674.053
|
|
DE
|
5
|
0.0000
|
0.000
|
|
FL
|
3985
|
56692.9435
|
467209.111
|
|
GA
|
54
|
5453.8402
|
36315.221
|
|
HI
|
10
|
14417.4060
|
45591.841
|
|
IL
|
4
|
0.0000
|
0.000
|
|
IN
|
1
|
0.0000
|
NA
|
|
KS
|
2
|
0.0000
|
0.000
|
|
KY
|
1
|
0.0000
|
NA
|
|
LA
|
3469
|
143652.8762
|
900280.625
|
|
MA
|
10
|
0.0000
|
0.000
|
|
MD
|
6
|
31054.4700
|
76067.606
|
|
ME
|
3
|
0.0000
|
0.000
|
|
MI
|
8
|
98234.8800
|
276004.570
|
|
MN
|
5
|
0.0000
|
0.000
|
|
MO
|
1
|
0.0000
|
NA
|
|
MS
|
361
|
40275.7462
|
342781.596
|
|
NC
|
17
|
0.0000
|
0.000
|
|
NJ
|
16
|
106059.9200
|
414806.046
|
|
NV
|
6
|
378392.7917
|
923746.286
|
|
NY
|
42
|
94881.3017
|
299501.019
|
|
OH
|
1
|
504160.1800
|
NA
|
|
OR
|
1
|
0.0000
|
NA
|
|
PA
|
6
|
99190.0150
|
226261.383
|
|
SC
|
47
|
60552.6898
|
222981.551
|
|
TN
|
6
|
20102.2483
|
48612.342
|
|
TX
|
1246
|
45030.4363
|
704150.531
|
|
VA
|
28
|
937.7614
|
4962.167
|
VRU_df %>%
group_by(Construction) %>%
summarise(n = n(),
x_bar = mean(VRUInc),
s = sd(VRUInc)) %>%
kbl(caption = "Summary Statistics: VRU Actual Loss Sustained by Construction") %>%
kable_classic(full_width = T, html_font = "Cambria")
Summary Statistics: VRU Actual Loss Sustained by Construction
|
Construction
|
n
|
x_bar
|
s
|
|
Braced_Steelframe
|
4
|
1114201.0450
|
2228402.0900
|
|
Fire_Resistive
|
2
|
0.0000
|
0.0000
|
|
Frame
|
9
|
0.0000
|
0.0000
|
|
Heavy_Timber
|
3
|
0.0000
|
0.0000
|
|
Joisted_Masonry
|
2140
|
39418.6731
|
305936.4218
|
|
Light_Metal
|
212
|
33265.6072
|
228082.5437
|
|
Masonry
|
2282
|
66108.6681
|
444184.1032
|
|
Masonry_NonCombustible
|
15
|
106.2960
|
411.6826
|
|
Masonry_Veneer
|
22
|
1218357.0177
|
2846707.0214
|
|
Mobile_Homes
|
12
|
391863.0308
|
1129148.8200
|
|
Precast_Concrete
|
2
|
2156.1450
|
3049.2495
|
|
RC_MRF_D
|
14
|
539567.8943
|
1955592.5444
|
|
RC_MRF_URM
|
1
|
0.0000
|
NA
|
|
RC_SW_w_MRF
|
3
|
317764.6000
|
550384.4320
|
|
RC_SW_wo_MRF
|
17
|
352869.6094
|
1245575.4931
|
|
RM_SW_w_MRF
|
1
|
0.0000
|
NA
|
|
RM_SW_wo_MRF
|
1
|
162830.2000
|
NA
|
|
Reinforced_Concrete
|
863
|
155256.3776
|
975399.6492
|
|
Reinforced_Masonry
|
63
|
147527.4584
|
588918.2649
|
|
SF_w_C_SW
|
2
|
2592432.4000
|
3521597.4394
|
|
SF_w_URM
|
1
|
0.0000
|
NA
|
|
S_MRF_Dist
|
12
|
0.0000
|
0.0000
|
|
S_MRF_Perim
|
10
|
255.3320
|
807.4307
|
|
Steel
|
942
|
96344.4614
|
931545.9851
|
|
Tilt_Up
|
31
|
299.7329
|
1668.8422
|
|
URM_BF
|
1
|
26289.5000
|
NA
|
|
URM_BW
|
5
|
173961.9520
|
388990.7502
|
|
Unknown
|
26
|
630.1988
|
2310.0361
|
|
Wind_Resistive
|
1
|
186275.3300
|
NA
|
|
Wood_Frame
|
3169
|
108082.0059
|
849985.8376
|
Summary Stats: Claims, mean losses, distinct accnts, etc.
# find # of distinct accounts (double check to make sure `ContractID` actually are distinct/separate accounts for VRU)
# find mean losses
# find a way to give each account a 'weight' because some accounts have only 1 claim(?) and going completely off of mean will skew results
claims_df <- VRU_df %>%
group_by(PrimaryState) %>%
summarize(tot_claims = n(),
num_accnts = n_distinct(ContractID),
tot_loss = sum(VRUInc)) %>%
ungroup() %>%
#summarize(prop_of_loss = tot_claims/sum(tot_claims)) %>%
mutate(mean_loss = tot_loss/tot_claims,
frac_claims_p_accnt = tot_claims/num_accnts) %>%
arrange(desc(tot_loss))
claims_df %>%
kbl(caption = "Claims, Accounts, and Losses by State") %>%
kable_classic(full_width = T, html_font = "Cambria")
Claims, Accounts, and Losses by State
|
PrimaryState
|
tot_claims
|
num_accnts
|
tot_loss
|
mean_loss
|
frac_claims_p_accnt
|
|
LA
|
3469
|
2334
|
498331827.44
|
143652.8762
|
1.486290
|
|
FL
|
3985
|
3717
|
225921379.99
|
56692.9435
|
1.072101
|
|
AL
|
369
|
258
|
57521052.54
|
155883.6112
|
1.430233
|
|
TX
|
1246
|
962
|
56107923.59
|
45030.4363
|
1.295218
|
|
MS
|
361
|
222
|
14539544.39
|
40275.7462
|
1.626126
|
|
CA
|
150
|
104
|
7219482.16
|
48129.8811
|
1.442308
|
|
NY
|
42
|
33
|
3985014.67
|
94881.3017
|
1.272727
|
|
SC
|
47
|
44
|
2845976.42
|
60552.6898
|
1.068182
|
|
CT
|
2
|
2
|
2641245.39
|
1320622.6950
|
1.000000
|
|
NV
|
6
|
3
|
2270356.75
|
378392.7917
|
2.000000
|
|
NJ
|
16
|
11
|
1696958.72
|
106059.9200
|
1.454546
|
|
MI
|
8
|
4
|
785879.04
|
98234.8800
|
2.000000
|
|
PA
|
6
|
5
|
595140.09
|
99190.0150
|
1.200000
|
|
OH
|
1
|
1
|
504160.18
|
504160.1800
|
1.000000
|
|
GA
|
54
|
45
|
294507.37
|
5453.8402
|
1.200000
|
|
MD
|
6
|
6
|
186326.82
|
31054.4700
|
1.000000
|
|
HI
|
10
|
6
|
144174.06
|
14417.4060
|
1.666667
|
|
TN
|
6
|
6
|
120613.49
|
20102.2483
|
1.000000
|
|
VA
|
28
|
28
|
26257.32
|
937.7614
|
1.000000
|
|
AZ
|
1
|
1
|
12521.83
|
12521.8300
|
1.000000
|
|
CO
|
3
|
1
|
0.00
|
0.0000
|
3.000000
|
|
DE
|
5
|
5
|
0.00
|
0.0000
|
1.000000
|
|
IL
|
4
|
4
|
0.00
|
0.0000
|
1.000000
|
|
IN
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
|
KS
|
2
|
1
|
0.00
|
0.0000
|
2.000000
|
|
KY
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
|
MA
|
10
|
10
|
0.00
|
0.0000
|
1.000000
|
|
ME
|
3
|
3
|
0.00
|
0.0000
|
1.000000
|
|
MN
|
5
|
3
|
0.00
|
0.0000
|
1.666667
|
|
MO
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
|
NC
|
17
|
16
|
0.00
|
0.0000
|
1.062500
|
|
OR
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
# use claims_df to weigh mean losses
claims_df2 <- claims_df %>%
#select(PrimaryState, tot_claims, num_accnts, tot_loss, mean_loss, frac_claims_p_accnt) %>%
mutate(weight_tot_claims = tot_claims / sum(tot_claims)) #%>%
#summarize(weighted_mean_loss = weight_tot_claims * mean_loss) #%>%
#mutate(claim_weight = tot_claims / sum_tot_claims)
options(scipen = 999) # use this to remove scientific notation
claims_df2 %>%
transmute(PrimaryState, tot_claims, num_accnts, tot_loss, mean_loss, frac_claims_p_accnt, weight_tot_claims, weighted_mean_loss = weight_tot_claims * mean_loss) %>%
arrange(desc(weighted_mean_loss)) %>%
kbl(caption = "Weighted Losses by State") %>%
kable_classic(full_width = T, html_font = "Cambria")
Weighted Losses by State
|
PrimaryState
|
tot_claims
|
num_accnts
|
tot_loss
|
mean_loss
|
frac_claims_p_accnt
|
weight_tot_claims
|
weighted_mean_loss
|
|
LA
|
3469
|
2334
|
498331827.44
|
143652.8762
|
1.486290
|
0.3516116
|
50510.016971
|
|
FL
|
3985
|
3717
|
225921379.99
|
56692.9435
|
1.072101
|
0.4039124
|
22898.984390
|
|
AL
|
369
|
258
|
57521052.54
|
155883.6112
|
1.430233
|
0.0374012
|
5830.230341
|
|
TX
|
1246
|
962
|
56107923.59
|
45030.4363
|
1.295218
|
0.1262923
|
5686.998134
|
|
MS
|
361
|
222
|
14539544.39
|
40275.7462
|
1.626126
|
0.0365903
|
1473.702046
|
|
CA
|
150
|
104
|
7219482.16
|
48129.8811
|
1.442308
|
0.0152037
|
731.753716
|
|
NY
|
42
|
33
|
3985014.67
|
94881.3017
|
1.272727
|
0.0042570
|
403.913913
|
|
SC
|
47
|
44
|
2845976.42
|
60552.6898
|
1.068182
|
0.0047638
|
288.463047
|
|
CT
|
2
|
2
|
2641245.39
|
1320622.6950
|
1.000000
|
0.0002027
|
267.711878
|
|
NV
|
6
|
3
|
2270356.75
|
378392.7917
|
2.000000
|
0.0006081
|
230.119273
|
|
NJ
|
16
|
11
|
1696958.72
|
106059.9200
|
1.454546
|
0.0016217
|
172.000681
|
|
MI
|
8
|
4
|
785879.04
|
98234.8800
|
2.000000
|
0.0008109
|
79.655285
|
|
PA
|
6
|
5
|
595140.09
|
99190.0150
|
1.200000
|
0.0006081
|
60.322328
|
|
OH
|
1
|
1
|
504160.18
|
504160.1800
|
1.000000
|
0.0001014
|
51.100768
|
|
GA
|
54
|
45
|
294507.37
|
5453.8402
|
1.200000
|
0.0054733
|
29.850737
|
|
MD
|
6
|
6
|
186326.82
|
31054.4700
|
1.000000
|
0.0006081
|
18.885751
|
|
HI
|
10
|
6
|
144174.06
|
14417.4060
|
1.666667
|
0.0010136
|
14.613223
|
|
TN
|
6
|
6
|
120613.49
|
20102.2483
|
1.000000
|
0.0006081
|
12.225166
|
|
VA
|
28
|
28
|
26257.32
|
937.7614
|
1.000000
|
0.0028380
|
2.661395
|
|
AZ
|
1
|
1
|
12521.83
|
12521.8300
|
1.000000
|
0.0001014
|
1.269190
|
|
CO
|
3
|
1
|
0.00
|
0.0000
|
3.000000
|
0.0003041
|
0.000000
|
|
DE
|
5
|
5
|
0.00
|
0.0000
|
1.000000
|
0.0005068
|
0.000000
|
|
IL
|
4
|
4
|
0.00
|
0.0000
|
1.000000
|
0.0004054
|
0.000000
|
|
IN
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
0.0001014
|
0.000000
|
|
KS
|
2
|
1
|
0.00
|
0.0000
|
2.000000
|
0.0002027
|
0.000000
|
|
KY
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
0.0001014
|
0.000000
|
|
MA
|
10
|
10
|
0.00
|
0.0000
|
1.000000
|
0.0010136
|
0.000000
|
|
ME
|
3
|
3
|
0.00
|
0.0000
|
1.000000
|
0.0003041
|
0.000000
|
|
MN
|
5
|
3
|
0.00
|
0.0000
|
1.666667
|
0.0005068
|
0.000000
|
|
MO
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
0.0001014
|
0.000000
|
|
NC
|
17
|
16
|
0.00
|
0.0000
|
1.062500
|
0.0017231
|
0.000000
|
|
OR
|
1
|
1
|
0.00
|
0.0000
|
1.000000
|
0.0001014
|
0.000000
|
Timeseries examples
ts_ex1 <- ggplot(VRU_df, aes(x = LandfallDate, y = VRUInc)) +
geom_line() +
xlab("") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#ts_ex1
ggplotly(ts_ex1)
ts_ex2 <- ggplot(VRU_df, aes(x = InceptionDate, y = VRUInc)) +
geom_line() +
xlab("") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#ts_ex2
ggplotly(ts_ex2)
Practice: Using InceptionDate and VRUInc, select time frame betw
2021-01-01 to 2021-05-01
ts_ex3 <- ggplot(VRU_df, aes(x = InceptionDate, y = VRUInc)) +
geom_line(color = "purple") +
geom_point(color = "black") +
xlab("") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_x_date(limit = c(as.Date("2021-01-01"), as.Date("2021-05-01")))
#ts_ex3
ggplotly(ts_ex3)
IGNORE THESE GRAPHS: xts, zoo, tidyquant?: for timeseries, try to
just use PortfolioDate as an example and drop the other dates.
library(tidyquant)
library(xts)
# Remove other date columns (avoids confusion in TS data)
VRU_ts_data_test <- VRU_df %>%
select(-c(LandfallDate, InceptionDate, ExpirationDate))
# use `relocate()` function from dplyr to move a date to the front of the dataset
VRU_ts_data_test <- VRU_ts_data_test %>%
relocate(PortfolioDate)
# Make your adjusted dataset an xts so we can mess with timeseries stuff
VRUxts_df_test <- as.xts(as.data.table(VRU_ts_data_test))
# These are the locations of the last observation of every year of the data
endpoints(VRUxts_df_test, on = "years")
## [1] 0 4067 6470 9866
# Notice how you didn't specify one value... try this again but only select one or two y-values
dygraph((VRUxts_df_test))
# dygraph with selected values to focus on (using VRU_df)
test1 <- xts(x = VRU_df$TIV, order.by = VRU_df$PortfolioDate)
dygraph(test1)
test2 <- xts(x = VRU_df$AIRMP, order.by = VRU_df$PortfolioDate)
dygraph(test2)
Correlation
library(GGally)
corr_data_test1 <- VRU_df %>%
select(Attachment:PolicyLimit, TIV:ModeledPrem, AIREst:RMSMP)
ggcorr(corr_data_test1,
low = "steelblue",
mid = "white",
high = "darkred",
size = 2,
color = "black", # variable label color
label = TRUE,
label_size = 2,
label_color = "white", # corr label color
layout.exp = 1) # adds another tile or space to the bottom so "attachment" isn't cut off

Adverse Selection Question: Did any clsses of business (COB) perform
worse over time?
# starter/ exampl code
# FIGURE OUT a way to make grids larger... much larger
AdvSelect_facetwrap <- ggplot(VRU_df, aes(PortfolioDate, VRUInc)) +
geom_line(color = "#330099", size = 1) +
geom_point(color = "black") +
labs(title = "Possible Adverse Class Selection", subtitle = "COB performance over time", x = "", y = "VRU Loss") +
facet_wrap(~ COB)
ggplotly(AdvSelect_facetwrap)
# Businesses (Hospitality: Food and Beverage)
AdvSelect_FoodAndBev <- VRU_df %>%
filter(COB == c('Bars', 'HotelsMotelsWithRestaurant', 'Microbreweries_Wineries_HardCider', 'Restaurant')) %>%
ggplot(aes(LandfallDate, VRUInc)) +
geom_point(color = "#330099", shape = 21) +
geom_smooth(method = "lm") +
theme_ipsum_pub() +
labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Hospitality: Food and Beverage)') +
theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_FoodAndBev)
# Businesses Services (Commercial real estate)
AdvSelect_Dwelling <- VRU_df %>%
filter(COB == c('Apartments', 'ApartmentsOverFourStories', 'ApartmentsWithMercantile', 'Condominium', 'SingleFamilyDwelling_WithAtLeastTwobldgs')) %>%
ggplot(aes(LandfallDate, VRUInc)) +
geom_point(color = "#330099", shape = 21) +
geom_smooth(method = "lm") +
theme_ipsum_pub() +
labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Commercial Real Estate)') +
theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_Dwelling)
# Business Services (Lodging Industry)
AdvSelect_Lodging <- VRU_df %>%
filter(COB == c('BedAndBreakfast', 'HotelsMotels', 'HotelsMotelsWithRestaurant')) %>%
ggplot(aes(LandfallDate, VRUInc)) +
geom_point(color = "#330099", shape = 21) +
geom_smooth(method = "lm") +
theme_ipsum_pub() +
labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Lodging Industry)') +
theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_Lodging)
# Businesses (Hospitality Industry in general)
AdvSelect_HospitalityBus <- VRU_df %>%
filter(COB == c('Bar', 'BedAndBreakfast', 'Casino', 'Entertainment', 'Entertainment_Auditoriums_Theaters', 'Hospitality', 'HotelsMotels', 'HotelsMotelsWithRestaurant', 'Microbreweries_Wineries_HardCider', 'Restaurant', 'Service')) %>%
ggplot(aes(LandfallDate, VRUInc)) +
geom_point(color = "#330099", shape = 21) +
geom_smooth(method = "lm") +
theme_ipsum_pub() +
labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Hospitality Industry)') +
theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_HospitalityBus)
# Businesses: Church and Schools
AdvSelect_ChurchAndSchools <- VRU_df %>%
filter(COB == c('Church', 'Education', 'Schools')) %>%
ggplot(aes(LandfallDate, VRUInc)) +
geom_point(color = "#330099", shape = 21) +
geom_smooth(method = "lm") +
theme_ipsum_pub() +
labs(x = 'Landfall Date', y = 'Actual Loss Sustained', title = 'COB Performance Over Time (Churchs and Schools)') +
theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
ggplotly(AdvSelect_ChurchAndSchools)
INVESTIGATE MODEL MISS STUFFF… PS I LOVE YOU. YOU’RE DOING
GREAT.
# create a dual y-axis comparison of AIRMP and RMSMP over time (use VRU_NoNML dataset)
# usually dual y-axis comparisons should be avoided, but seeing that both variables have the same scale (0-1), we think it is appropriate.
AIRmiss <- VRU_NoNML %>%
ggplot(aes(x = LandfallDate, y = AIRMP)) +
geom_point(color = "skyblue") +
#geom_jitter() +
theme_bw()
RMSmiss <- VRU_NoNML %>%
ggplot(aes(x = LandfallDate, y = RMSMP)) +
geom_point(color = "blue") +
#geom_jitter() +
theme_bw()
ggplotly(AIRmiss)
ggplotly(RMSmiss)