incident <- read.csv("../incident.csv")
policy_holder <- read.csv("../policy_holder.csv")
first_table <-sqldf("select i.incident_type, 
                    round(median(i.auto_year),0) median_auto_year,
                    round(median(i.total_claim_amount), 0) median_claim_amount, 
                    round(median(p.age), 0) median_age, 
                    count(i.number_of_vehicles_involved)/1000 Totals 
                    from incident i, policy_holder p 
                    join policy_holder
                    on i.policy_number = p.policy_number 
                    group by i.incident_type")
first_table
##              incident_type median_auto_year median_claim_amount median_age
## 1  Multi-vehicle Collision             2006               61650         39
## 2               Parked Car             2005                5490         37
## 3 Single Vehicle Collision             2005               62640         38
## 4            Vehicle Theft             2005                5495         38
##   Totals
## 1    419
## 2     84
## 3    403
## 4     94

Multi-vehcile collisions had more involvements than the other incident types. However, my only question is regarding multi-vehicle collisions, are all cars involved in the multi-collision around the year of 2006? How was the year for multi-vehicle collisions determined? Or is this year represented of when the accident occurred? It seems a bit difficult to comprehend multi-vehicle collisions auto year since there is more than one car involved, so for the auto_year to be a single number seems a bit off, which is why the question: is the year representing the year the collision happened? Or, is it the year of which the car was made?

second_table <- sqldf("select  
                      case 
                      when i.incident_hour_of_the_day >= 10 and i.incident_hour_of_the_day < 17 
                      then '10a.m.-4:59p.m.' 
                      when i.incident_hour_of_the_day >= 22 and i.incident_hour_of_the_day <= 24
                      then '10p.m.-midnight'
                      when i.incident_hour_of_the_day >= 17 and i.incident_hour_of_the_day < 20
                      then '5p.m.-7:59p.m.'
                      when i.incident_hour_of_the_day >= 6 and i.incident_hour_of_the_day < 10
                      then '6a.m.-9:59a.m.'
                      when i.incident_hour_of_the_day >= 20 and i.incident_hour_of_the_day < 22
                      then '8p.m.-9:59p.m.'
                      else 'midnight-5:59a.m.'
                      end time_frame,
                      round(median(i.auto_year), 0) median_auto_year,
                      round(median(i.total_claim_amount), 0) median_claim_amount,
                      round(median(p.age), 0) median_age,
                      count(i.number_of_vehicles_involved)/1000 Totals 
                      from incident i, policy_holder p 
                      join policy_holder
                      on i.policy_number = p.policy_number 
                      group by time_frame")
second_table
##          time_frame median_auto_year median_claim_amount median_age Totals
## 1   10a.m.-4:59p.m.             2005               61490         38    293
## 2   10p.m.-midnight             2006               62920         39     89
## 3    5p.m.-7:59p.m.             2005               60190         40    135
## 4    6a.m.-9:59a.m.             2005               40320         38    163
## 5    8p.m.-9:59p.m.             2005               60205         39     76
## 6 midnight-5:59a.m.             2005               52590         36    244

The time frame allows us to see what period of time had more accidents. From this table, we can see that the time frame between 10a.m. to 4:59p.m. were more. However, this interval does not make a lot of sense to add, since there is a huge gap with the times, which would makes sense that there would be more accidents that would’ve taken place within the 7 hour gap. We need to lower the interval so that we are in close range with the other time frames. A four hour gap would be reasonable but a 7 hour gap for noting the times gives a lot of room for accidents that may have occured during that time period. So, it does not make much sense to include a 7 hour gap since many accidents would have taken place within that time interval.

third_table <- sqldf("select  
                      case 
                      when i.incident_hour_of_the_day >= 10 and i.incident_hour_of_the_day < 17 
                      then '10a.m.-4:59p.m.' 
                      when i.incident_hour_of_the_day >= 22 and i.incident_hour_of_the_day <= 24
                      then '10p.m.-midnight'
                      when i.incident_hour_of_the_day >= 17 and i.incident_hour_of_the_day < 20
                      then '5p.m.-7:59p.m.'
                      when i.incident_hour_of_the_day >= 6 and i.incident_hour_of_the_day < 10
                      then '6a.m.-9:59a.m.'
                      when i.incident_hour_of_the_day >= 20 and i.incident_hour_of_the_day < 22
                      then '8p.m.-9:59p.m.'
                      else 'midnight-5:59a.m.'
                      end time_frame,
                      i.incident_type,
                      round(median(i.auto_year), 0) median_auto_year,
                      round(median(i.total_claim_amount), 0) median_claim_amount,
                      round(median(p.age), 0) median_age,
                      count(i.number_of_vehicles_involved)/1000 Totals 
                     from incident i, policy_holder p 
                     join policy_holder
                     on i.policy_number = p.policy_number
                     group by time_frame, i.incident_type")
third_table
##           time_frame            incident_type median_auto_year
## 1    10a.m.-4:59p.m.  Multi-vehicle Collision             2007
## 2    10a.m.-4:59p.m.               Parked Car             2004
## 3    10a.m.-4:59p.m. Single Vehicle Collision             2004
## 4    10a.m.-4:59p.m.            Vehicle Theft             2005
## 5    10p.m.-midnight  Multi-vehicle Collision             2008
## 6    10p.m.-midnight               Parked Car             2013
## 7    10p.m.-midnight Single Vehicle Collision             2004
## 8     5p.m.-7:59p.m.  Multi-vehicle Collision             2005
## 9     5p.m.-7:59p.m.               Parked Car             2011
## 10    5p.m.-7:59p.m. Single Vehicle Collision             2005
## 11    5p.m.-7:59p.m.            Vehicle Theft             2000
## 12    6a.m.-9:59a.m.  Multi-vehicle Collision             2004
## 13    6a.m.-9:59a.m.               Parked Car             2006
## 14    6a.m.-9:59a.m. Single Vehicle Collision             2006
## 15    6a.m.-9:59a.m.            Vehicle Theft             2006
## 16    8p.m.-9:59p.m.  Multi-vehicle Collision             2005
## 17    8p.m.-9:59p.m.               Parked Car             2004
## 18    8p.m.-9:59p.m. Single Vehicle Collision             2005
## 19    8p.m.-9:59p.m.            Vehicle Theft             2005
## 20 midnight-5:59a.m.  Multi-vehicle Collision             2006
## 21 midnight-5:59a.m.               Parked Car             2004
## 22 midnight-5:59a.m. Single Vehicle Collision             2005
## 23 midnight-5:59a.m.            Vehicle Theft             2004
##    median_claim_amount median_age Totals
## 1                60750         38    137
## 2                 5900         40      7
## 3                63700         39    138
## 4                 6820         40     11
## 5                64800         40     41
## 6                 5700         41      3
## 7                62640         37     45
## 8                61400         41     65
## 9                 6490         37      5
## 10               61110         41     62
## 11                5100         28      3
## 12               64320         39     41
## 13                5830         39     34
## 14               63685         38     46
## 15                4975         38     42
## 16               61440         39     47
## 17                5005         32      2
## 18               59355         38     22
## 19                6500         35      5
## 20               61625         37     88
## 21                4900         34     33
## 22               60185         36     90
## 23                5220         39     33

The third table is grouped by time frame and incident type which allows us to further gain more information as to during which time period, and which incident types have more claim amounts and more totals. We can use this information to individually look at time frames and incident types to see if there are any correlations as to which types of incidents are frequent during a certain time frame. For example, are more Parked Car incidents occurring at 10p.m. - midnight or 6 a.m.- 9:59a.m.? With this information, we can make better comparisons in the data for which incidents occur frequently within certain time frames.

total <- c("midnight-5:59a.m.", "6a.m.-9:59a.m.", "10a.m.-4:59p.m.", "5p.m.-7:59p.m.", "8p.m.-9:59p.m.", "10p.m.-midnight")
colors <- c("pink", "#FEA889", "#8CDCDA", "#B1D877")
graph <- ggplot(data=third_table, aes(x=time_frame, y=Totals, fill=incident_type))
graph <- graph + geom_bar(stat="identity") +
  scale_x_discrete(limits=total) +
  scale_fill_manual(values=colors) +
  labs(title="Insurance Claims by type of incident and time of day", x="Time", y="Total Claims")
graph

This graph demonstrates the total claims by incident type based off the time frame. This allows us to have a visual representation of time frames when incidents happen, and which incidents are more frequent within a time frame. From first glance, more incidents had higher claims that occurred from 10 a.m. - 4:59 p.m.

second_graph <- ggplot(data=incident, aes(x=auto_year, y=bodily_injuries, fill=incident_type))
second_graph <- second_graph + geom_bar(stat="identity") +
  labs(title="Total Bodily Injuries by Year", x="Year", y="Count of Injuries")
second_graph

This graph allows us to see which year had the most bodily injuries and from which incident. This table is beneficial in noting trends if we were to analyze if there was a year that had more bodily injuries than others. From this table, we see that the year 1995 had more injuries than the rest.

fourth_table <- sqldf("select i.policy_number, p.insured_sex, i.incident_type from incident i, policy_holder p join
                      policy_holder on i.policy_number=p.policy_number group by i.policy_number")
view(fourth_table)


third_graph <- ggplot(data=fourth_table, aes(x=incident_type, fill=insured_sex))
third_graph <- third_graph + geom_bar(position="identity") + labs(title="Male Vs. Female Insured", x="Incident Type", y="Count of Insured")
third_graph

This graph is interesting as it makes us question why very few women are insured? Why is there such a discrepancy between male and females who are insured? From this graph, we will be able to use this information to see if we can find answer to these questions. Since this is an insurance company, maybe we will be able to provide insurance to women to hike those numbers up. This graph will need further exploration as to why very few women are insured. This might also be a good marketing strategy as well to help gain customers to choose this insurance company versus another.