jaws
rm(list = ls(all.names = TRUE)) 

options(warn=-1)

library(ipred)
library(plotly)
library(monmlp)
library(ggthemes)
source('performance_util.R')
source('functions_util.R')
library(pROC)
require(e1071)
library(xgboost)
library(precrec)
library(gbm)
library(mlr)
library(purrr)
library(tibble)
library(tidyr)
library(tm)
library(tidyr)
library(readr)
library(InformationValue)
library(sqldf)
library(dplyr)
#library(dummies)
library(rpart)
library(rpart.plot)
library(caret)
library(ROCR)
library(data.table)
library(mltools)
library(PerformanceAnalytics)
library(Hmisc)
library(corrplot) 
library(skimr)
library(reshape)
library(ggplot2)
library(randomForest)
library(sqldf)
library(ranger)
library(mlrHyperopt)
library(GGally)
library(MlBayesOpt)
library(mlbench)
library(plyr); 
library(parallel)
library(akima)


reset.seed <- function()
{
  set.seed(4200)
}

Se puede predecir cuando un ataque será fatal? Y por qué? Qué opina Martin Brody?

Martin

Carga del Dataset

Exploración inicial del Dataset

Es un dataset bastante sucio, donde todas las variables son de caracter.

head(DataBase)
##    Case.Number        Date Year       Type   Country            Area
## 1   2018.06.25 25-Jun-2018 2018    Boating       USA      California
## 2   2018.06.18 18-Jun-2018 2018 Unprovoked       USA         Georgia
## 3   2018.06.09 09-Jun-2018 2018    Invalid       USA          Hawaii
## 4   2018.06.08 08-Jun-2018 2018 Unprovoked AUSTRALIA New South Wales
## 5   2018.06.04 04-Jun-2018 2018   Provoked    MEXICO          Colima
## 6 2018.06.03.b 03-Jun-2018 2018 Unprovoked AUSTRALIA New South Wales
##                         Location     Activity               Name Sex Age
## 1    Oceanside, San Diego County     Paddling        Julie Wolfe   F  57
## 2 St. Simon Island, Glynn County     Standing Adyson\xa0McNeely    F  11
## 3                   Habush, Oahu      Surfing        John Denges   M  48
## 4             Arrawarra Headland      Surfing               male   M    
## 5                       La Ticla  Free diving     Gustavo Ramos    M    
## 6             Flat Rock, Ballina Kite surfing         Chris \x85   M    
##                                                      Injury Fatal..Y.N.
## 1 No injury to occupant, outrigger canoe and paddle damaged           N
## 2                                Minor injury to left thigh           N
## 3              Injury to left lower leg from surfboard skeg           N
## 4                                 Minor injury to lower leg           N
## 5         Lacerations to leg & hand shark PROVOKED INCIDENT           N
## 6                                   No injury, board bitten           N
##            Time         Species         Investigator.or.Source
## 1         18h00     White shark               R. Collier, GSAF
## 2 14h00  -15h00                 K.McMurray, TrackingSharks.com
## 3         07h45                 K.McMurray, TrackingSharks.com
## 4                     2 m shark                 B. Myatt, GSAF
## 5               Tiger shark, 3m                      A .Kipper
## 6                                    Daily Telegraph, 6/4/2018
##                         pdf
## 1      2018.06.25-Wolfe.pdf
## 2    2018.06.18-McNeely.pdf
## 3     2018.06.09-Denges.pdf
## 4  2018.06.08-Arrawarra.pdf
## 5      2018.06.04-Ramos.pdf
## 6 2018.06.03.b-FlatRock.pdf
##                                                                      href.formula
## 1      http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf
## 2    http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf
## 3     http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf
## 4  http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf
## 5      http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.04-Ramos.pdf
## 6 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.03.b-FlatRock.pdf
##                                                                              href
## 1      http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf
## 2    http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf
## 3     http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf
## 4  http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf
## 5      http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.04-Ramos.pdf
## 6 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.03.b-FlatRock.pdf
##   Case.Number.1 Case.Number.2 original.order X X.1   DateFixed Indice
## 1    2018.06.25    2018.06.25           6303       25-Jun-2018      1
## 2    2018.06.18    2018.06.18           6302       18-Jun-2018      2
## 3    2018.06.09    2018.06.09           6301       09-Jun-2018      3
## 4    2018.06.08    2018.06.08           6300       08-Jun-2018      4
## 5    2018.06.04    2018.06.04           6299       04-Jun-2018      5
## 6  2018.06.03.b  2018.06.03.b           6298       03-Jun-2018      6
str(DataBase)
## 'data.frame':    6301 obs. of  26 variables:
##  $ Case.Number           : chr  "2018.06.25" "2018.06.18" "2018.06.09" "2018.06.08" ...
##  $ Date                  : chr  "25-Jun-2018" "18-Jun-2018" "09-Jun-2018" "08-Jun-2018" ...
##  $ Year                  : num  2018 2018 2018 2018 2018 ...
##  $ Type                  : chr  "Boating" "Unprovoked" "Invalid" "Unprovoked" ...
##  $ Country               : chr  "USA" "USA" "USA" "AUSTRALIA" ...
##  $ Area                  : chr  "California" "Georgia" "Hawaii" "New South Wales" ...
##  $ Location              : chr  "Oceanside, San Diego County" "St. Simon Island, Glynn County" "Habush, Oahu" "Arrawarra Headland" ...
##  $ Activity              : chr  "Paddling" "Standing" "Surfing" "Surfing" ...
##  $ Name                  : chr  "Julie Wolfe" "Adyson\xa0McNeely " "John Denges" "male" ...
##  $ Sex                   : chr  "F" "F" "M" "M" ...
##  $ Age                   : chr  "57" "11" "48" "" ...
##  $ Injury                : chr  "No injury to occupant, outrigger canoe and paddle damaged" "Minor injury to left thigh" "Injury to left lower leg from surfboard skeg" "Minor injury to lower leg" ...
##  $ Fatal..Y.N.           : chr  "N" "N" "N" "N" ...
##  $ Time                  : chr  "18h00" "14h00  -15h00" "07h45" "" ...
##  $ Species               : chr  "White shark" "" "" "2 m shark" ...
##  $ Investigator.or.Source: chr  "R. Collier, GSAF" "K.McMurray, TrackingSharks.com" "K.McMurray, TrackingSharks.com" "B. Myatt, GSAF" ...
##  $ pdf                   : chr  "2018.06.25-Wolfe.pdf" "2018.06.18-McNeely.pdf" "2018.06.09-Denges.pdf" "2018.06.08-Arrawarra.pdf" ...
##  $ href.formula          : chr  "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf" ...
##  $ href                  : chr  "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf" ...
##  $ Case.Number.1         : chr  "2018.06.25" "2018.06.18" "2018.06.09" "2018.06.08" ...
##  $ Case.Number.2         : chr  "2018.06.25" "2018.06.18" "2018.06.09" "2018.06.08" ...
##  $ original.order        : int  6303 6302 6301 6300 6299 6298 6297 6296 6295 6294 ...
##  $ X                     : chr  "" "" "" "" ...
##  $ X.1                   : chr  "" "" "" "" ...
##  $ DateFixed             : chr  "25-Jun-2018" "18-Jun-2018" "09-Jun-2018" "08-Jun-2018" ...
##  $ Indice                : int  1 2 3 4 5 6 7 8 9 10 ...

Limpieza de la variable de clase.

La variable de clase es si el ataque fue fatal o no. Pa En este sentido, se tienen que realizar las siguiente tareas * Estandarizar * Integrar * Inferir las nulas en base a otra variable, en este caso ‘Injuries’

DataBase = DataBase %>% mutate(Fatal = case_when(
                  Fatal..Y.N. %like% '%Y%' ~  'Y',
                  Fatal..Y.N.  %like% '%y%'  ~  'Y',
                  Fatal..Y.N. %like% '%N%' ~  'N',
                  Fatal..Y.N.  %like% '%n%'  ~  'N',
                  TRUE ~ Fatal..Y.N.))

DataBase$Fatal = toupper(DataBase$Fatal )
DataBase$Fatal  = str_trim(DataBase$Fatal)


  
DataBase = DataBase %>% mutate(Fatal = 
                                case_when(
                                  (Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'FATAL' ~ 'Y',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Fatal' ~ 'Y',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Presumed FATAL' ~ 'Y', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Presumed FATAL' ~ 'Y', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Survived' ~ 'N', 
                                  
                                  
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Minor',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Feet injured',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Foot injured',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Laceration to knee',Injury, ignore.case = TRUE)==TRUE ~ 'N',  
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Abrasions',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Calf bitten',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Survived',Injury, ignore.case = TRUE)==TRUE ~ 'N', 

                                  
                                   (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Human remains',Injury, ignore.case = TRUE)==TRUE ~ 'Y',                                 
                                   (Fatal =='' | Fatal == 'UNKNOWN') & grepl('remains found',Injury, ignore.case = TRUE)==TRUE ~ 'Y',                                  
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('body found',Injury, ignore.case = TRUE)==TRUE ~ 'Y',                                  
                                      
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('recovered',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('murdered',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('fatal',Injury, ignore.case = TRUE)==TRUE & grepl('non-fatal',Injury, ignore.case = TRUE)==FALSE ~ 'Y',
                          
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('no injury',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('no attack',Injury, ignore.case = TRUE)==TRUE ~ 'N', 
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('non-fatal',Injury, ignore.case = TRUE)==TRUE ~ 'N',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('minor injuries',Injury, ignore.case = TRUE)==TRUE ~ 'N',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('no injuries',Injury, ignore.case = TRUE)==TRUE ~ 'N',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('injuries',Injury, ignore.case = TRUE)==TRUE ~ 'N',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('presummed fatal',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
                                  
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Human remains',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
                                  (Fatal =='' | Fatal == 'UNKNOWN') & grepl('Death',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
                                  
                                  
                                  TRUE  ~ Fatal)                                 
)

DataBase = DataBase %>% mutate (FatalFixed =
                                  case_when(
                                    Fatal == 'Y'   ~  1,
                                    Fatal == 'N'   ~  0,
                                    TRUE  ~  -1
                                    
                                  )
                                      )

Completar, modificar, integrar y limpiar la variable Fecha

La fecha es un string. Además de ello, no tiene una codificación facilmente transformable. Tareas a realizar: * Pasar los meses a valores enteros * Extraer día y año. * Convertir a formato fecha

DataBase = DataBase %>% mutate(DateFixed = gsub(" +", "", DateFixed))
DataBase = DataBase %>% mutate(YearFixed = ifelse(is.na(Year), right(DateFixed, 4), Year))
DataBase = DataBase %>% mutate(MonthFixed =
                                 case_when(
                                   grepl('jan',DateFixed, ignore.case = TRUE )==TRUE ~ '01',
                                   grepl('feb',DateFixed, ignore.case = TRUE )==TRUE ~ '02',
                                   grepl('mar',DateFixed, ignore.case = TRUE )==TRUE ~ '03',
                                   grepl('apr',DateFixed, ignore.case = TRUE )==TRUE ~ '04',
                                   grepl('may',DateFixed, ignore.case = TRUE )==TRUE ~ '05',
                                   grepl('jun',DateFixed, ignore.case = TRUE )==TRUE ~ '06',
                                   grepl('jul',DateFixed, ignore.case = TRUE )==TRUE ~ '07',
                                   grepl('aug',DateFixed, ignore.case = TRUE )==TRUE ~ '08',
                                   grepl('sep',DateFixed, ignore.case = TRUE )==TRUE ~ '09',
                                   grepl('oct',DateFixed, ignore.case = TRUE )==TRUE ~ '10',
                                   grepl('nov',DateFixed, ignore.case = TRUE )==TRUE ~ '11',
                                   grepl('dec',DateFixed, ignore.case = TRUE )==TRUE ~ '12',
                                   TRUE ~ '00'
                                   
                                 )  
                               )


DataBase = DataBase %>% mutate(DayFixed =case_when
                                  (nchar(DateFixed)>=10 & !is.na(as.numeric(gsub('-','',  left(DateFixed,2)))     ) ~ gsub('-','',  left(DateFixed,2)),
                                    TRUE ~ '00'))


DataBase = DataBase %>% mutate(DateFormated = 
                                  case_when(
                                    YearFixed > 0 & MonthFixed !='00' & DayFixed != '00' ~ paste(YearFixed,MonthFixed,DayFixed, sep='/'),
                                    TRUE ~ ''
                                    
                                  )
                                 
                                 )


DataBase = DataBase %>% mutate(WeekDayNumber = wday(DateFormated))
DataBase = DataBase %>% mutate(WeekDayNumber = ifelse(is.na(WeekDayNumber), 99, WeekDayNumber))

Completar, modificar, integrar y limpiar la variable Tipo de Ataque

DataBase = DataBase %>% mutate(TypeFixed = case_when(
                                          Type %in% c('Boat','Boatomg') ~ 'Boating',
                                          Type =='' ~ 'TYPE_UNKNOWN',
                                          TRUE ~ Type ))

DataBase$TypeFixed = toupper( gsub(' ', '_', DataBase$TypeFixed ))

Completar, modificar, integrar y limpiar la variable Hora de Ataque

DataBase = DataBase %>%  mutate(TimeText = Time)
DataBase = DataBase %>%  mutate(TimeText= gsub('"','',TimeText))
DataBase = DataBase %>%  mutate(TimeText= gsub("'",'',TimeText))

Times = sqldf ("
  SELECT Indice,
  CASE     WHEN TimeText ='Afternoon'               THEN    '1500'         
  WHEN TimeText ='Morning'                 THEN   '1000'     
  WHEN TimeText ='AM' THEN '1100'
  WHEN TimeText ='A.M.' THEN '1100'
  WHEN TimeText =   'Just before dawn'  THEN '600'
  WHEN TimeText =   'Dawn'THEN  '650'
  WHEN TimeText =   'Dusk'THEN '1900'
  WHEN TimeText =   'P.M.' THEN '1600'
  WHEN TimeText =  'Dark' THEN '2100'
  WHEN TimeText =  '<a0>' THEN ''
  WHEN TimeText = '06j00' THEN '0600'

  WHEN TimeText = 'Lunchtime' THEN '1200'

  WHEN TimeText = '--' THEN ''
  WHEN TimeText = 'After dark'           THEN      '2000'  
  WHEN TimeText = 'After lunch'          THEN       '1230'  
  WHEN TimeText ='Morning '              THEN    '1000'         
  WHEN TimeText ='Morning '              THEN    '1000'         
  WHEN TimeText ='Night'                 THEN    '2000'          
  WHEN TimeText ='Evening'                THEN '1900'         
  WHEN TimeText ='Late afternoon'        THEN   '1730'         
  WHEN TimeText ='Early morning'            THEN   '700'       
  WHEN TimeText ='Midday'                 THEN '1200'         
  WHEN TimeText ='Early afternoon'        THEN '1500'         
  WHEN TimeText ='Midnight'              THEN   '2330'         
  WHEN TimeText ='After noon'             THEN'1600'         
  WHEN TimeText ='Just before noon'      THEN   '1300'       
  WHEN TimeText ='Late afternon'            THEN  '1800'       
  WHEN TimeText ='After 04h00'            THEN '1630'       
  WHEN TimeText ='After Dusk'            THEN   '1930'       
  WHEN TimeText ='After dark'            THEN   '2000'       
  WHEN TimeText ='After dusk'             THEN '1930'       
  WHEN TimeText ='After lunch'          THEN  '1200'       
  WHEN TimeText ='After midnight'         THEN '2330'       
  WHEN TimeText ='Before daybreak'        THEN' 700'         
  WHEN TimeText ='Daybreak'               THEN     '800'         
  WHEN TimeText ='Daytime'                THEN    '1200'         
  WHEN TimeText ='Early Morning'              THEN  '900'         
  WHEN TimeText ='Early evening'           THEN   '1800'        
  WHEN TimeText ='Just after 12h00'      THEN    '1230'       
  WHEN TimeText ='Late Afternoon'         THEN   '1700'         
  WHEN TimeText ='Late morning'          THEN    '1100'        
  WHEN TimeText ='Late night'             THEN     '2200'        
  WHEN TimeText ='Mid afternoon'              THEN     '1600'        
  WHEN TimeText ='Mid morning'            THEN     '1000'        
  WHEN TimeText ='Mid-morning'            THEN     '1000'        
  WHEN TimeText ='Midday.'                THEN     '1200'        
  WHEN TimeText ='Morning'                THEN    '1500'        
  WHEN TimeText ='Nightfall'                  THEN    '2030'         
  WHEN TimeText ='Noon'                   THEN    '1200'        
  WHEN TimeText ='Shortly after midnight'  THEN    '2330'        
  WHEN TimeText ='night'  THEN '2200'
  WHEN TimeText LIKE 'sundown' THEN '2000'
  WHEN TimeText like '%SunseT%' THEN '1840'
  WHEN TimeText  = '8:04 pm' THEN '2004'
  WHEN TimeText like '%sundown%' THEN '1930'
  WHEN TimeText like '%dusk%' THEN '1930'
  WHEN TimeText ='30 minutes after 1992.07.08.a' THEN ''
  ELSE  RTRIM(LTRIM(TimeText)) END AS TimeText
  , LENGTH(TimeText) AS TimeLen
  FROM DataBase"

)

stoppers = c('<','>' ,'Before', 'Sometime', 'between' ,'2 hours after Opperman', 'Prior to ', 'Between', 'Shortly', 'Just before', 'Dusk', ')', '(' ,'before', 'just','(Sunset)', 'X' )
Times = Times %>% mutate(TimeText = removeWords(TimeText, stoppers   ))
Times$TimeText = gsub('-', ' - ', Times$TimeText  )
Times$TimeText = trim(Times$TimeText)
Times= Times %>% mutate(TimeFixed = as.integer(left(extract_numeric(left(TimeText,6)),4)))
TimesToDB = Times %>% dplyr::select (Indice, TimeFixed)
DataBase = merge(DataBase, TimesToDB, by ='Indice')
DataBase$TimeFixed = as.integer(DataBase$TimeFixed)

Completar, modificar, integrar y limpiar la variable Paises, Continente y Hemisterio del ataque

CountriesFixed = read.csv('countries_fixed.csv', header = TRUE, sep = ';')
CountriesFixed$CountryFixed = trim( CountriesFixed$CountryFixed  )
CountriesFixed$CountryFixed = gsub(',', '',CountriesFixed$CountryFixed  )
CountriesFixed$CountryFixed = toupper(gsub(' ', '_',CountriesFixed$CountryFixed  ))
CountriesFixed$ContinentFixed = toupper(gsub(' ', '_',CountriesFixed$ContinentFixed  ))

CountriesFixedToDB = CountriesFixed %>%  dplyr::select (Country, CountryFixed, ContinentFixed,  HemisphereFixed)
DataBase = merge(x= DataBase, y = CountriesFixedToDB, by ='Country')

Completar, modificar, integrar y limpiar la variable Especie del tiburón del ataque

Species = sqldf (
  
  "SELECT  Indice,
  Species,
  CASE WHEN Species like '%white%' THEN 'WHITE SHARK'
       WHEN Species like '%BLUE%' THEN 'BLUE SHARK'
       WHEN Species like '%grey%' THEN 'GREY SHARK'
       WHEN Species like '%bull%' THEN 'BULL SHARK' 
       WHEN Species like '%Wobbegong%'  THEN  'WOBBEGONG SHARK'
       WHEN Species like '%Blacktip%'   THEN  'BLACKTIP SHARK'
       WHEN Species like '%Galapagos%'  THEN  'GALAPAGOS SHARK'
       WHEN Species like '%Tiger%'      THEN  'TIGER SHARK'
       WHEN Species like '%cookiecutter%' THEN  'COOKIECUTTER SHARK'
       WHEN Species like '%MAKO%' THEN  'MAKO SHARK'
       WHEN Species like '%Spinner%' THEN  'SPINNER SHARK'
       WHEN Species like '%Porbeagle%' THEN  'PORBEAGLE SHARK'
       WHEN Species like '%COW%' THEN  'COW SHARK'
       WHEN Species like '%Nurse%' THEN  'NURSE SHARK'
       WHEN Species like '%hammerhead%' THEN  'HAMMERHEAD SHARK'
       WHEN Species like '%Lemon%' THEN  'LEMON SHARK'
       WHEN Species like '%SEVEN%' THEN  'SEVENHILL SHARK'
       WHEN Species like '%ANGEL%' THEN  'ANGEL SHARK'
       WHEN Species like '%Silky%' THEN  'SILKY SHARK'
       WHEN Species like '%Goblin%' THEN  'GOBLIN SHARK'
       WHEN Species like '%Sandbar%' THEN  'SANDBAR SHARK'
       WHEN Species like '%Raggedtooth%' THEN  'RAGGEDTOOTH SHARK'
       WHEN Species like '%SALMON%' THEN  'SALMON SHARK'
       WHEN Species like '%reef%' THEN  'REEF SHARK'
       WHEN Species like '%dogfish%' THEN  'DOGFISH SHARK'
       WHEN Species like '%gill%' THEN  'SEVENHILL SHARK'
       WHEN Species like '%Port%Jackson%' THEN  'PORT JACKSON SHARK'
       WHEN Species like '%Zambesi%' THEN  'ZAMBESI SHARK'
       WHEN Species like '%Spurdog%' THEN  'SPURDOG SHARK'
       WHEN Species like '%Dusky%' THEN  'DUSKY SHARK'
       WHEN Species like '%Basking%' THEN  'BASKIN SHARK'
       WHEN Species like '%Sand %' THEN  'SAND SHARK'
       WHEN Species like '%catshark%' THEN  'CATSHARK SHARK'
       WHEN Species like '%copper%' THEN  'COPPER SHARK'
       WHEN Species like '%black%tipped%' THEN  'BLACKTIP SHARK'
       WHEN Species like '%leucas%' THEN  'LEUCAS SHARK'
       WHEN Species like '%whaler%' THEN  'COPPER SHARK'
       WHEN Species like '%bronze%' THEN  'COPPER SHARK'
       WHEN Species like '%Soupfin%' THEN  'SOUPFIN SHARK'
       WHEN Species like '%Leopard%' THEN  'LEOPARD SHARK'
       WHEN Species like '%gaffed%' THEN  'LEOPARD SHARK'
       WHEN Species like '%silvertip%' THEN 'CARCHARHINUS SHARK'
       WHEN Species like '%gray%' THEN 'CARCHARHINUS SHARK'
       WHEN Species like '%maculpinnis%' THEN 'MALCULPINNIS SHARK'
       WHEN Species like '%Whale %' THEN 'WHALE SHARK'
       WHEN Species like '%Carpet%' THEN 'CARPET SHARK'
       WHEN Species like '%albimarginatus%' THEN 'CARCHARHINUS SHARK'
       WHEN Species like '%broadnose%' THEN 'SEVENHILL SHARK'
       WHEN Species like '%stingray%' THEN 'STINGRAY SHARK'
       WHEN Species like '%Rhizoprionodon%' THEN 'RHIZOPRINODON SHARK'
       WHEN Species like '%BANJO%' THEN 'BANJO SHARK'
       WHEN Species like '%Thresher%' THEN 'THRESHER SHARK'
       WHEN Species like '%leucas%' THEN 'ZAMBESI SHARK'
       WHEN Species like '%Shovelnose%' THEN 'SHOVELNOSE SHARK'
       WHEN Species like '%macrurus%' THEN 'DUSKY SHARK'
       WHEN Species like '%Bronze%whaler%' THEN  'COPPER SHARK'
       WHEN Species ='' THEN  'SPECIE_UNKNOWN'
      ELSE 'SPECIE_UNKNOWN'  END AS SpeciesFixed
  FROM DataBase
  "
)
 

Species$SpeciesFixed = gsub(' ', '_',Species$SpeciesFixed )

Completar, modificar, integrar y limpiar la variable Edad de la victima del ataque

DataBase = DataBase %>% mutate(Age = gsub("<bd>","",Age))
DataBase = DataBase %>% mutate(Age = gsub(">","",Age))

DataBase = DataBase %>%  mutate(AgeFixed = as.integer(abs(extract_numeric(Age))))
DataBase = DataBase %>%  mutate(AgeFixed = ifelse(grepl('teen',Age, ignore.case = TRUE )==TRUE          , 14, AgeFixed))
DataBase = DataBase %>%  mutate(AgeFixed = ifelse(grepl('young',Age, ignore.case = TRUE )==TRUE          , 25, AgeFixed))

DataBase = DataBase %>%  mutate(AgeFixed = ifelse(is.na(AgeFixed), -1, AgeFixed))

DataBase = DataBase %>%  mutate(AgeFixed = ifelse(AgeFixed>105, -1, AgeFixed))

Completar, modificar, integrar y limpiar la variable Actividad del ataque

ActivitiesFixed = sqldf(
  
  
 " SELECT DISTINCT Indice
  ,       Activity
  ,       CASE WHEN Activity LIKE '%SURFING%'  THEN  'SURFING'
             WHEN Activity LIKE '%SWIMMING%' THEN  'SWIMMING'
             WHEN Activity LIKE '%FISHING%' THEN  'FISHING'
             WHEN Activity LIKE '%DIVING%' THEN   'DIVING'
             WHEN Activity LIKE '%SPEARFISHING%' THEN  'SPEARFISHING'
             WHEN Activity LIKE '%BATHING%' THEN  'BATHING'
             WHEN Activity LIKE '%STANDING%' THEN  'STANDING'
             WHEN Activity LIKE '%SNORKELING%' THEN  'SNORKELING'
             WHEN Activity LIKE '%KAYAKING%'   THEN  'KAYAKING'
            WHEN Activity LIKE '%Jumped%overboard%'   THEN  'FELL_OVERBOARD'
             WHEN Activity LIKE '%fell%overboard%'   THEN  'FELL_OVERBOARD'
             WHEN Activity LIKE '%overboard%'   THEN  'FELL_OVERBOARD'
             WHEN Activity LIKE '%oveboard%'   THEN  'FELL_OVERBOARD'
             WHEN Activity LIKE '%Wading%'   THEN  'WADING'
             WHEN Activity LIKE '%Fell%into%water%'   THEN  'FELL_INTO_WATER'
             WHEN Activity LIKE '%Canoeing%'   THEN  'CANOEING'
             WHEN Activity LIKE '%Sailing%'   THEN  'SAILING'
             WHEN Activity LIKE '%Rowing%'   THEN  'ROWING'
             WHEN Activity LIKE '%Floating%'   THEN  'FLOATING'
             WHEN Activity LIKE '%boating%'   THEN  'BOATING' 
             WHEN Activity LIKE '%boat%'   THEN  'BOATING' 
             WHEN Activity LIKE '%Angling%'   THEN  'FISHING' 
             WHEN Activity LIKE '%wreck%'   THEN  'SHIPWRECK'
             WHEN Activity LIKE '%Sea%Disaste%'   THEN  'SHIPWRECK'
             WHEN Activity LIKE '%Oystering%'   THEN  'OYSTERING'
             WHEN Activity LIKE '%CLAMMING%'   THEN  'CLAMMING'
             WHEN Activity LIKE '%WASHING%'   THEN  'WASHING'
             WHEN Activity LIKE '%Hunting%'   THEN  'HUNTING'
             WHEN Activity LIKE '%Jumped%out%'   THEN  'FELL_OVERBOARD'
        
             
      ELSE 'ACTIVITY_UNKNOWN' END AS ActivityFixed

  
  FROM DataBase

  "
    
)

 

ActivityToDB = ActivitiesFixed %>% dplyr::select(Indice, ActivityFixed)
DataBase = merge(x=DataBase, y =ActivityToDB, by = "Indice")
DataBase = DataBase %>% mutate(ActivityFixed = ifelse(ActivityFixed=='', 'ACTIVITY_UNKNOWN',ActivityFixed))

Extraer, modificar, integrar, limpiar, validar y estandarizar la variable Largo en metros del tiburón

DataBase= DataBase %>%  mutate (LargoMetros =str_extract(Species, "[:digit:]+[:punct:]?[:digit:]*[:space:]*[Cc]?[Mm]+"))
DataBase= DataBase %>%  mutate (LargoPies =str_extract(Species, "[:digit:]+'"))


DataBase= DataBase  %>% mutate (LargoMetros1 = 
                                  
                                  case_when (
                                    grepl('cm',LargoMetros, ignore.case = TRUE) ~ extract_numeric(LargoMetros)/ 100,
                                    TRUE ~ extract_numeric(gsub(',', '.', LargoMetros))
                                    
                                  )
                                
)
DataBase= DataBase  %>% mutate (LargoMetros2 = extract_numeric(gsub(',', '.', LargoPies)) * 30.48 / 100)
DataBase= DataBase  %>% mutate (LargoMetrosFixed = coalesce2(LargoMetros1,  LargoMetros2 ) )                                 

Hemisferio y estación del año

DataBase = DataBase %>% mutate(SeasonFixed = case_when (
  
  HemisphereFixed == 'South' & MonthFixed %in% c('01','02','03') ~ 'Summer',
  HemisphereFixed == 'South' & MonthFixed %in% c('04','04','06') ~ 'Autumn',
  HemisphereFixed == 'South' & MonthFixed %in% c('07','08','09') ~ 'Winter',
  HemisphereFixed == 'South' & MonthFixed %in% c('10','11','12') ~ 'Spring',
  HemisphereFixed == 'North' & MonthFixed %in% c('01','02','03') ~ 'Winter',
  HemisphereFixed == 'North' & MonthFixed %in% c('04','04','06') ~ 'Spring',
  HemisphereFixed == 'North' & MonthFixed %in% c('07','08','09') ~ 'Summer',
  HemisphereFixed == 'North' & MonthFixed %in% c('10','11','12') ~ 'Autumn',
  TRUE ~ 'Season_Unknown'
  )
)

Dataset limpio para realizar gráficos.

# DataGraficos = sqldf(
# "SELECT A.indice, 
#        yearfixed, 
#        monthfixed, 
#        typefixed,
#        speciesfixed, 
#        dateformated, 
#        weekdaynumber, 
#        hemispherefixed, 
#        agefixed, 
#        sex, 
#        activityfixed, 
#        countryfixed, 
#        continentfixed, 
#        area, 
#        largometrosfixed, 
#        CASE 
#          WHEN largometrosfixed BETWEEN 0 AND 1 THEN 'LARGO_0_1' 
#          WHEN largometrosfixed BETWEEN 1.01 AND 2 THEN 'LARGO_1_2' 
#          WHEN largometrosfixed BETWEEN 2.01 AND 3 THEN 'LARGO_2_3' 
#          WHEN largometrosfixed BETWEEN 3.01 AND 4 THEN 'LARGO_3_4' 
#          WHEN largometrosfixed BETWEEN 4.01 AND 5 THEN 'LARGO_4_5' 
#          WHEN largometrosfixed BETWEEN 5.01 AND 6 THEN 'LARGO_5_6' 
#          WHEN largometrosfixed > 6 THEN 'LARGO_MAS_6' 
#          ELSE 'N/A' 
#        END AS LargoEstandarizado,
#        FatalFixed
# FROM   DataBase A 
#        LEFT JOIN Species B 
#               ON a.indice = B.indice 
# WHERE  fatalfixed >= 0 
#        AND yearfixed >= 1900 
#        AND countryfixed != 'UNKNOWN' ")
#                                    
# write.csv(DataGraficos,'shark_attacks_plots.csv')                                  
#  

Generación de variables dummy

CountryDummy =sqldf("SELECT Indice 
,    CASE WHEN COUNTRYFIXED = 'USA' THEN 1 ELSE 0  END              AS COUNTRY_USA
,    CASE WHEN COUNTRYFIXED = 'AUSTRALIA' THEN 1 ELSE 0 END         AS COUNTRY_AUSTRALIA
,    CASE WHEN COUNTRYFIXED = 'SOUTH_AFRICA' THEN 1 ELSE 0 END      AS COUNTRY_SOUTH_AFRICA
,    CASE WHEN COUNTRYFIXED = 'PAPUA_NEW_GUINEA' THEN 1 ELSE 0 END  AS COUNTRY_PAPUA_GUINEA
,    CASE WHEN COUNTRYFIXED = 'NEW_ZEALAND' THEN 1 ELSE 0 END       AS COUNTRY_NEW_ZELAND
,    CASE WHEN COUNTRYFIXED = 'BRAZIL' THEN 1 ELSE 0  END           AS COUNTRY_BRAZIL
,    CASE WHEN COUNTRYFIXED = 'BAHAMAS' THEN 1 ELSE 0 END           AS COUNTRY_BAHAMAS
,    CASE WHEN COUNTRYFIXED NOT IN ('USA', 'AUSTRALIA', 'SOUTH_AFRICA', 'BAHAMAS',
                                    'PAPUA_NEW_GUINEA', 'NEW_ZEALAND', 'BRAZIL') THEN 1 ELSE 0 END AS COUNTRY_RESTO
FROM DataBase")

AreasDummy = sqldf("SELECT Indice 
,    CASE WHEN AREA = 'Florida' THEN 1 ELSE 0  END                 AS AREA_FLORIDA
,    CASE WHEN AREA = 'New South Wales' THEN 1 ELSE 0  END         AS AREA_SOUTH_WALES
,    CASE WHEN AREA = 'Queensland' THEN 1 ELSE 0   END             AS AREA_QUEENSLAND
,    CASE WHEN AREA = 'Hawaii' THEN 1 ELSE 0  END                  AS AREA_HAWAII
,    CASE WHEN AREA = 'California' THEN 1 ELSE 0   END             AS AREA_CALIFORNIA
,    CASE WHEN AREA = 'KwaZulu-Natal' THEN 1 ELSE 0 END            AS AREA_KWAZULU
,    CASE WHEN AREA = 'Western Australia' THEN 1 ELSE 0 END        AS AREA_WEST_AUSTRALIA
,    CASE WHEN AREA = 'Eastern Cape Province' THEN 1 ELSE 0 END    AS AREA_EAST_CAPE
,    CASE WHEN AREA = 'Western Cape Province' THEN 1 ELSE 0 END    AS AREA_WEST_CAPE

FROM DataBase")




SpeciesDummy = sqldf("SELECT    Indice, 
        CASE WHEN speciesfixed = 'SPECIE_UNKNOWN' THEN 1 ELSE 0 END AS SPECIE_UNKNOWN, 
        CASE WHEN speciesfixed = 'WHITE_SHARK' THEN 1 ELSE 0 END AS SPECIE_WHITE_SHARK, 
        CASE WHEN speciesfixed = 'TIGER_SHARK' THEN 1 ELSE 0 END AS SPECIE_TIGER_SHARK, 
        CASE WHEN speciesfixed = 'BLACKTIP_SHARK' THEN 1 ELSE 0 END AS SPECIE_BLACKTIP_SHARK, 
        CASE WHEN speciesfixed = 'BULL_SHARK' THEN 1 ELSE 0 END AS SPECIE_BULL_SHARK, 
        CASE WHEN speciesfixed = 'COPPER_SHARK' THEN 1 ELSE 0 END AS SPECIE_COPPER_SHARK, 
        CASE WHEN speciesfixed = 'GREY_SHARK' THEN 1 ELSE 0 END AS SPECIE_GREY_SHARK, 
        CASE WHEN speciesfixed = 'NURSE_SHARK' THEN 1 ELSE 0 END AS SPECIE_NURSE_SHARK, 
        CASE WHEN speciesfixed = 'BLUE_SHARK' THEN 1 ELSE 0 END AS SPECIE_BLUE_SHARK, 
        CASE WHEN speciesfixed = 'MAKO_SHARK' THEN 1 ELSE 0 END AS SPECIE_MAKO_SHARK, 
        CASE WHEN speciesfixed = 'WOBBEGONG_SHARK' THEN 1 ELSE 0 END AS SPECIE_WOBBEGONG_SHARK, 
        CASE WHEN speciesfixed = 'HAMMERHEAD_SHARK' THEN 1 ELSE 0 END AS SPECIE_HAMMERHEAD_SHARK, 
        CASE WHEN speciesfixed = 'RAGGEDTOOTH_SHARK' THEN 1 ELSE 0 END AS SPECIE_RAGGEDTOOTH_SHARK, 
        CASE WHEN speciesfixed = 'LEMON_SHARK' THEN 1 ELSE 0 END AS SPECIE_LEMON_SHARK, 
        CASE WHEN speciesfixed = 'REEF_SHARK' THEN 1 ELSE 0 END AS SPECIE_REEF_SHARK, 
        CASE WHEN speciesfixed = 'ZAMBESI_SHARK' THEN 1 ELSE 0 END AS SPECIE_ZAMBESI_SHARK, 
        CASE WHEN speciesfixed = 'SPINNER_SHARK' THEN 1 ELSE 0 END AS SPECIE_SPINNER_SHARK, 
        CASE WHEN speciesfixed = 'SEVENHILL_SHARK' THEN 1 ELSE 0 END AS SPECIE_SEVENHILL_SHARK,
        CASE WHEN speciesfixed NOT IN( 'SPECIE_UNKNOWN', 'WHITE_SHARK', 'TIGER_SHARK', 'BULL_SHARK', 'BLACKTIP_SHARK', 'COPPER_SHARK', 
                                    'GREY_SHARK', 'NURSE_SHARK', 'BLUE_SHARK', 'MAKO_SHARK', 'WOBBEGONG_SHARK' , 
                                        'HAMMERHEAD_SHARK', 'RAGGEDTOOTH_SHARK', 'LEMON_SHARK', 
                                        'REEF_SHARK', 'ZAMBESI_SHARK', 'SPINNER_SHARK', 'SEVENHILL_SHARK' ) THEN 1 ELSE 0 END AS SPECIES_OTHERS 
FROM Species"
  
  
)



SexDummy = sqldf(
  "SELECT DISTINCT Indice
  , CASE WHEN Sex LIKE '%M%' THEN 1 ELSE 0 END AS SEX_M
  , CASE WHEN Sex LIKE '%F%' THEN 1 ELSE 0 END AS SEX_F
  , CASE WHEN Sex NOT LIKE '%M%' AND Sex NOT LIKE '%F%' THEN 1 ELSE 0 END AS SEX_U
  FROM DataBase"
)


AgeDummy = sqldf (
  "SELECT DISTINCT INDICE
  , CASE WHEN AGEFIXED < 0 THEN 1 ELSE 0 END AS AGE_9999
  , CASE WHEN AGEFIXED BETWEEN 0 AND 10 THEN 1 ELSE 0 END AS AGE_0_10
  , CASE WHEN AGEFIXED BETWEEN 11 AND 18 THEN 1 ELSE 0 END AS AGE_11_18
  , CASE WHEN AGEFIXED BETWEEN 19 AND 25 THEN 1 ELSE 0 END AS AGE_19_25
  , CASE WHEN AGEFIXED BETWEEN 26 AND 35 THEN 1 ELSE 0 END AS AGE_26_35
  , CASE WHEN AGEFIXED BETWEEN 36 AND 45 THEN 1 ELSE 0 END AS AGE_36_45
  , CASE WHEN AGEFIXED BETWEEN 46 AND 55 THEN 1 ELSE 0 END AS AGE_46_55
  , CASE WHEN AGEFIXED BETWEEN 56 AND 65 THEN 1 ELSE 0 END AS AGE_56_65
  , CASE WHEN AGEFIXED BETWEEN 66 AND 80 THEN 1 ELSE 0 END AS AGE_66_75
  , CASE WHEN AGEFIXED > 90 THEN 1 ELSE 0 END AS AGE_80_100
FROM DataBase")

LargoDummy = sqldf (
  "SELECT DISTINCT Indice
  , CASE WHEN LargoMetrosFixed BETWEEN 0 AND 1 THEN 1 ELSE 0 END AS LARGO_0_1
  , CASE WHEN LargoMetrosFixed BETWEEN 1.01 AND 2 THEN 1 ELSE 0 END AS LARGO_1_2
  , CASE WHEN LargoMetrosFixed BETWEEN 2.01 AND 3 THEN 1 ELSE 0 END AS LARGO_2_3
  , CASE WHEN LargoMetrosFixed BETWEEN 3.01 AND 4 THEN 1 ELSE 0 END AS LARGO_3_4
  , CASE WHEN LargoMetrosFixed BETWEEN 4.01 AND 5 THEN 1 ELSE 0 END AS LARGO_4_5
  , CASE WHEN LargoMetrosFixed BETWEEN 5.01 AND 6 THEN 1 ELSE 0 END AS LARGO_6_6
  , CASE WHEN LargoMetrosFixed > 6 THEN 1 ELSE 0 END AS LARGO_6_10
  , CASE WHEN LargoMetrosFixed NOT BETWEEN 0 AND 999 THEN 1 ELSE 0 END AS LARGO_9999
FROM DataBase")

TimeDummy = sqldf (
  "SELECT DISTINCT Indice
  
  ,   CASE WHEN TimeFixed BETWEEN 0 AND 400 THEN 1 ELSE 0 END      AS TIME_0_4
  ,   CASE WHEN TimeFixed BETWEEN 401 AND 800 THEN 1 ELSE 0 END    AS TIME_4_8
  ,   CASE WHEN TimeFixed BETWEEN 801 AND 1200 THEN 1 ELSE 0 END   AS TIME_8_12
  ,   CASE WHEN TimeFixed BETWEEN 1201 AND 1600 THEN 1 ELSE 0 END  AS TIME_12_16
  ,   CASE WHEN TimeFixed BETWEEN 1601 AND 2000 THEN 1 ELSE 0 END  AS TIME_16_20
   ,   CASE WHEN TimeFixed BETWEEN 2001 AND 2400 THEN 1 ELSE 0 END AS TIME_20_24
  ,   CASE WHEN TimeFixed NOT BETWEEN 0 AND 2400 THEN 1 ELSE 0 END AS TIME_9999
  FROM DataBase"
)

WeekDayDummy  = sqldf(
  
  "SELECT DISTINCT INDICE, 
    CASE  WHEN WEEKDAYNUMBER = '2' THEN 1   ELSE 0   END   AS WEEKDAY_2 
  , CASE  WHEN WEEKDAYNUMBER = '7' THEN 1   ELSE 0   END AS WEEKDAY_7 
  , CASE  WHEN WEEKDAYNUMBER = '6' THEN 1   ELSE 0   END AS WEEKDAY_6   
  , CASE  WHEN WEEKDAYNUMBER = '1' THEN 1   ELSE 0   END AS WEEKDAY_1 
  , CASE  WHEN WEEKDAYNUMBER = '5' THEN 1   ELSE 0   END AS WEEKDAY_5 
  , CASE  WHEN WEEKDAYNUMBER = '99' THEN 1   ELSE 0   END AS WEEKDAY_9999 
  , CASE  WHEN WEEKDAYNUMBER = '4' THEN 1   ELSE 0   END AS WEEKDAY_4 
  , CASE  WHEN WEEKDAYNUMBER = '3' THEN 1   ELSE 0   END AS WEEKDAY_3 
  FROM   DataBase")



ActivityDummy = sqldf(
  "SELECT  Indice, 
  CASE    WHEN activityfixed = 'ACTIVITY_UNKNOWN' THEN 1   ELSE 0   END   AS ACTIVITY_UNKNOWN
  , CASE   WHEN activityfixed = 'STANDING' THEN 1   ELSE 0   END AS ACTIVITY_STANDING 
  , CASE   WHEN activityfixed = 'SURFING' THEN 1   ELSE 0   END AS ACTIVITY_SURFING 
  , CASE   WHEN activityfixed = 'DIVING' THEN 1   ELSE 0   END AS ACTIVITY_DIVING 
  , CASE   WHEN activityfixed = 'SWIMMING' THEN 1   ELSE 0   END AS ACTIVITY_SWIMMING 
  , CASE   WHEN activityfixed = 'FISHING' THEN 1   ELSE 0   END AS ACTIVITY_FISHING 
  , CASE   WHEN activityfixed = 'WADING' THEN 1   ELSE 0   END AS ACTIVITY_WADING 
  , CASE   WHEN activityfixed = 'SNORKELING' THEN 1   ELSE 0   END AS ACTIVITY_SNORKELING 
  , CASE   WHEN activityfixed = 'BOATING' THEN 1   ELSE 0   END AS ACTIVITY_BOATING 
  , CASE   WHEN activityfixed = 'BATHING' THEN 1   ELSE 0   END AS ACTIVITY_BATHING 
  , CASE   WHEN activityfixed = 'KAYAKING' THEN 1   ELSE 0   END AS ACTIVITY_KAYAKING 
  , CASE   WHEN activityfixed = 'CANOEING' THEN 1   ELSE 0   END AS ACTIVITY_CANOEING 
  , CASE   WHEN activityfixed = 'WASHING' THEN 1   ELSE 0   END AS ACTIVITY_WASHING 
  , CASE   WHEN activityfixed = 'FLOATING' THEN 1   ELSE 0   END AS ACTIVITY_FLOATING 
  , CASE   WHEN activityfixed = 'SHIPWRECK' THEN 1   ELSE 0   END AS ACTIVITY_SHIPWRECK 
  , CASE   WHEN activityfixed = 'ROWING' THEN 1   ELSE 0   END AS ACTIVITY_ROWING 
  , CASE   WHEN activityfixed = 'FELL_INTO_WATER' THEN 1   ELSE 0   END AS ACTIVITY_FELL_INTO_WATER 
  , CASE   WHEN activityfixed = 'SAILING' THEN 1   ELSE 0   END AS ACTIVITY_SAILING 
  , CASE   WHEN activityfixed = 'CLAMMING' THEN 1   ELSE 0   END AS ACTIVITY_CLAMMING 
  , CASE   WHEN activityfixed = 'HUNTING' THEN 1   ELSE 0   END AS ACTIVITY_HUNTING 
  , CASE   WHEN activityfixed = 'FELL_OVERBOARD' THEN 1   ELSE 0   END AS ACTIVITY_FELL_OVERBOARD 
  , CASE   WHEN activityfixed = 'OYSTERING' THEN 1   ELSE 0   END AS ACTIVITY_OYSTERING 
  FROM   DataBase")


SeasonDummy = sqldf ("SELECT DISTINCT INDICE, 
        CASE WHEN SEASONFIXED = 'SPRING' THEN 1 ELSE 0 END AS SEASON_SPRING 
 ,      CASE WHEN SEASONFIXED = 'AUTUMN' THEN 1 ELSE 0 END AS SEASON_AUTUMN 
 ,      CASE WHEN SEASONFIXED = 'SEASON_UNKNOWN' THEN 1 ELSE 0 END AS SEASON_UNKNOW
 ,      CASE WHEN SEASONFIXED = 'WINTER' THEN 1 ELSE 0 END AS SEASON_WINTER 
 ,      CASE WHEN SEASONFIXED = 'SUMMER' THEN 1 ELSE 0 END AS SEASON_SUMMER 
 FROM DataBase")


ContimentDummy= sqldf (
"SELECT Indice, 
    CASE WHEN CONTINENTFIXED='NORTH_AMERICA' THEN 1 ELSE 0 END AS CONTINENT_NORTH_AMERICA, 
    CASE WHEN CONTINENTFIXED='OCEANIA' THEN 1 ELSE 0 END AS CONTINENT_OCEANIA, 
    CASE WHEN CONTINENTFIXED='SOUTH_AMERICA' THEN 1 ELSE 0 END AS CONTINENT_SOUTH_AMERICA, 
    CASE WHEN CONTINENTFIXED='EUROPE' THEN 1 ELSE 0 END AS CONTINENT_EUROPE, 
    CASE WHEN CONTINENTFIXED='AFRICA' THEN 1 ELSE 0 END AS CONTINENT_AFRICA, 
    CASE WHEN CONTINENTFIXED='ASIA' THEN 1 ELSE 0 END AS CONTINENT_ASIA, 
    CASE WHEN CONTINENTFIXED='UNKNOWN' THEN 1 ELSE 0 END AS CONTINENT_UNKNOWN 
FROM DataBase
")


HemisphereDummy =sqldf("SELECT INDICE
,       CASE WHEN HEMISPHEREFIXED = 'NORTH' THEN 1 ELSE 0 END AS HEMISPHERE_NORTH
,       CASE WHEN HEMISPHEREFIXED = 'SOUTH' THEN 1 ELSE 0 END AS HEMISPHERE_SOUTH
,       CASE WHEN HEMISPHEREFIXED = '' THEN 1 ELSE 0 END AS HEMISPHERE_UNKNOWN
FROM DataBase")

Integración de las variables dummies

DataBase = merge(DataBase, WeekDayDummy, by ='Indice')
DataBase = merge(DataBase, ActivityDummy, by ='Indice')
DataBase = merge(DataBase, SeasonDummy, by ='Indice')
DataBase = merge(DataBase, TimeDummy, by ='Indice')
DataBase = merge(DataBase, SexDummy, by ='Indice')
DataBase = merge(DataBase, AgeDummy, by ='Indice')
DataBase = merge(DataBase, LargoDummy, by ='Indice')
DataBase = merge(DataBase, ContimentDummy, by ='Indice')
DataBase = merge(DataBase, HemisphereDummy, by ='Indice') 
DataBase = merge(DataBase, SpeciesDummy, by ='Indice')
#DataBase = merge(DataBase, CountryDummy, by ='Indice')
#DataBase = merge(DataBase, AreasDummy, by ='Indice')

Especies más fatales

FatalDB = read_csv('shark_attacks_plots.csv') 


FatalDB = FatalDB %>%  dplyr::select(SpeciesFixed, FatalFixed)
FatalDB2 = sqldf (
  "SELECT SpeciesFixed
  ,       COUNT(*) AS Cantidad
  ,       SUM(FatalFixed) as Fatalidad
  ,       SUM(FatalFixed) * 1.00 / COUNT(*) as PorcFatalidad
  from  FatalDB
 where SpeciesFixed <> 'SPECIE_UNKNOWN'
  group by SpeciesFixed
having Fatalidad > 0
  "
)





ggplot(FatalDB2, aes(x=Cantidad, y=PorcFatalidad, color = SpeciesFixed) ) +
  geom_point(alpha =0.5, size = 8 ) +
  xlab("Cantidad de Ataques") + ylab("Porcentaje de mortalidad del ataque") +
  geom_vline(xintercept = 320) +
  geom_hline(yintercept = .1859)+
  geom_label(
    label="Inofensivo", 
    x=150,
    y=.13,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.2,
    color = "black",
    alpha = 0.05,
    fill="#c4c4c4"
  ) +
  
  geom_label(
    label="Peligroso", 
    x=150,
    y=.27,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.2,
    color = "black",
    alpha = 0.05,
    fill="#c4c4c4"
  ) +
  
  geom_label(
    label="Agresivo", 
    x=500,
    y=.13,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.2,
    alpha = 0.05,
    color = "black",
    fill="#c4c4c4"
  ) +
  
  geom_label(
    label="Fatal", 
    x=500,
    y=.27,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.2,
    color = "black",
    alpha = 0.05,
    fill="#c4c4c4"
  ) +
  
  labs(title = "Especies de tiburones más fatales", subtitle = "Exluidas las que tienen fatalidad 0") 

Violin Plot de Largo por Especie

Sharks = read_csv('shark_attacks_plots.csv') 
Sharks = Sharks %>%  dplyr::select(SpeciesFixed, LargoMetrosFixed) %>%
     filter(LargoMetrosFixed < 15) %>% 
     filter(SpeciesFixed %in% c('WHITE_SHARK','TIGER_SHARK', 'ZAMBESI_SHARK','BULL_SHARK', 'BLUE_SHARK'))


p <- Sharks %>%
  plot_ly(
    x = ~SpeciesFixed,
    y = ~LargoMetrosFixed,
    split = ~SpeciesFixed,
    type = 'violin',
    box = list(
      visible = T
    ),
    meanline = list(
      visible = T
    )
  ) %>% 
  layout(
    title = "Violin Plot ~ Especie de tiburón en metros",
    xaxis = list(
      title = "Especie"
    ),
    yaxis = list(
      title = "Largo en metros",
      zeroline = F
    )
  )
p

Matriz de correlacion

df_cor = read_csv('shark_attacks_plots.csv') 
df_cor =df_cor %>% dplyr::select (-X1, -DateFormated, -Indice, -LargoEstandarizado )

df_cor = df_cor %>%  mutate(Sex =               as.integer(as.factor(Sex)) ,  
                            HemisphereFixed =   as.integer(as.factor(HemisphereFixed)), 
                            SpeciesFixed =      as.integer(as.factor(SpeciesFixed)), 
                            CountryFixed =      as.integer(as.factor(CountryFixed)), 
                            ContinentFixed =    as.integer(as.factor(ContinentFixed)), 
                            Area =             as.integer(as.factor(Area)), 
                            MonthFixed =        as.integer( MonthFixed),
                            ActivityFixed =        as.integer(as.factor(ActivityFixed)),
                            TypeFixed =        as.integer(as.factor(TypeFixed))
                            
                            )


cor <- cor(df_cor, use = "na.or.complete")
corrplot(cor, method="shade",shade.col=NA, tl.col="black", tl.srt=45)

Creación de un dataset con las variables necesarias para modelar

DataBaseFinal = DataBase %>% filter(FatalFixed >= 0 & YearFixed >= 1900 & CountryFixed != 'UNKNOWN')

DataBaseFinal = DataBaseFinal %>% dplyr::select(-Indice, -SeasonFixed, -WeekDayNumber, -CountryFixed, -LargoMetrosFixed, -Sex, -AgeFixed, -TimeFixed, -Fatal, -YearFixed, -TypeFixed, -Age, -href, -Case.Number.2, -DayFixed, -DateFormated, -DateFixed, -MonthFixed, -TimeText ,   -Date, -Injury, -Case.Number, - Type,  - Location,  - Fatal..Y.N. ,  - Country,  - Location,  - Year,  - Area,  - Name,  - Time,  - pdf,  - Case.Number.1,  - X,  - Investigator.or.Source,  -
         original.order,  - X.1,  - LargoMetros,  - LargoPies,  - LargoMetros2,  - LargoMetros1,  - Species,  - Activity,  - ActivityFixed,  -HemisphereFixed,   - href.formula, -href, -ContinentFixed)

Separacion en Train y Test y fórmula

reset.seed()
sample_size = nrow(DataBaseFinal) * 0.8
idxTrain = sample(seq_len(nrow(DataBaseFinal))  ,sample_size, replace = FALSE)

DataBaseTrain = DataBaseFinal[idxTrain,]
DataBaseTest = DataBaseFinal[-idxTrain,]


variables = colnames(DataBaseTrain %>% dplyr::select(-FatalFixed))
predictors = paste(variables, collapse = ' + ')
fmla = paste('FatalFixed', predictors, sep = ' ~ ')


DataBaseTrainFactor = DataBaseTrain
DataBaseTrainFactor_x = DataBaseTrainFactor  %>%  dplyr::select (-FatalFixed)
DataBaseTrainFactor_y = as.factor(DataBaseTrainFactor$FatalFixed)
DataBaseTrainFactor$FatalFixed = as.factor(DataBaseTrainFactor$FatalFixed)

trainTask <- makeClassifTask(data = DataBaseTrainFactor,target = 'FatalFixed', positive = "1")

x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
y_train = as.numeric(DataBaseTrain$FatalFixed)

x_test = DataBaseTest %>% dplyr::select(-FatalFixed)
y_test = as.numeric(DataBaseTest$FatalFixed)

save.image(file='environmentShark.RData')

Ridge Regression

library(glmnet)

#fit_ridge <- glmnet(x = as.matrix(x_train), y =as.matrix(y_train), alpha = 0, family="gaussian")
fit_ridge = readRDS("rr.rds")
y_pred_rr <- predict(fit_ridge, as.matrix(x_test), s=0.01, type="response")

#file = "rr.rds"
#saveRDS(fit_ridge, file)


precrec_obj <- evalmod(scores = y_pred_rr, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_rr, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_rr)
df_metricas = get_df_metrics()
metrics = get_metrics('Ridge Regression', y_test, y_pred_rr)
df_metricas = rbind(df_metricas, metrics)
df_metricas 
##             modelo  accuracy sensitivity specificity       auc    recall
## 1 Ridge Regression 0.7969495   0.9876999   0.1398305 0.7877265 0.3601554

Tuneo de hiperparámetros para un Decision Tree con Hyperopt (Gridseach)

 # makeatree <- makeLearner("classif.rpart", predict.type = "response")
 # set_cv <- makeResampleDesc("CV",iters = 5L)
 # 
 # gs <- makeParamSet(
 #   makeDiscreteParam("minsplit",values = seq(2,40,2)),
 #   makeDiscreteParam("minbucket", values = seq(2,12,1)),
 #   makeDiscreteParam("cp", values = c(0.0001)),
 #   makeDiscreteParam("maxdepth", values = seq(2,20,1))
 # )
 # 
 # gscontrol <- makeTuneControlGrid()
 # stune <- tuneParams(learner = makeatree, resampling = set_cv, task = trainTask, par.set = gs, control = gscontrol, measures = acc)
 # 
 # 
 # stune$x
 # stune$y

Decision Tree: Train, Fit & Test

#fit_tree = rpart(formula = fmla, data= DataBaseTrain, cp= 0.0001, minsplit=10, minbucket= 10, maxdepth=20, method = 'class')
#fit_tree$variable.importance

#dt_file = "dt.rds"
#saveRDS(fit_tree, dt_file)
fit_tree = readRDS(file = "dt.rds")


pruned.tree <- prune(fit_tree, cp = fit_tree$cptable[which.min(fit_tree$cptable[,"xerror"]),"CP"])
rpart.plot(pruned.tree)

y_pred_dt= predict(fit_tree, DataBaseTest, type = 'prob')[,2]
y_test = DataBaseTest$FatalFixed

precrec_obj <- evalmod(scores = y_pred_dt, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_dt, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_dt)

df_metricas = get_df_metrics()
metrics = get_metrics('Decision Tree', y_test, y_pred_dt)
df_metricas = rbind(df_metricas, metrics)
df_metricas 
##          modelo  accuracy sensitivity specificity       auc    recall
## 1 Decision Tree 0.8198284   0.9237392   0.4618644 0.8277071 0.4219482

KNN Gridsearch con loop

 y_train = DataBaseTrain$FatalFixed

 vector_auc_train = vector()
 vector_auc_test = vector()
 vector_k = seq(from=1, to= 100, by = 1)
 for(k in vector_k){
     fit_knn= knn3(FatalFixed ~ ., data = DataBaseTrain, k = k)
     prediction_train = predict(fit_knn, DataBaseTrain)[,2]
     roc_obj_train <- roc(y_train, prediction_train)
     auc_train = pROC::auc(roc_obj_train)
     vector_auc_train = c(vector_auc_train, auc_train)

     prediction_test = predict(fit_knn, DataBaseTest)[,2]
     roc_obj_test <- roc(y_test, prediction_test)
     auc_test = pROC::auc(roc_obj_test)
     vector_auc_test = c(vector_auc_test, auc_test)


 }

 kvecinos = data.frame(k =vector_k, auc_test =  vector_auc_test, auc_train = vector_auc_train  )


 ggplot(kvecinos, aes(x=k ))+
   geom_line(aes(y = auc_train), color= "red", size=2)+
   geom_line(aes(y = auc_test), color= "blue", size= 2)+
   theme_fivethirtyeight()  +
   labs(title = "Entrenamiento de  KNN", subtitle = "Dependiendo del número de k", x = 'K')

 #kvecinos[kvecinos$auc_test == max(kvecinos$auc_test), ]

KNN:Train, Fit & Test

#fit_knn= knn3(FatalFixed ~ ., data = DataBaseTrain, k = 48)
#knn_file = "knn.rds"
#saveRDS(fit_knn, knn_file)
fit_knn = readRDS(file = "knn.rds")


y_pred_knn = predict(fit_knn, DataBaseTest)[,2]

precrec_obj <- evalmod(scores = y_pred_knn, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_knn, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_knn)
df_metricas = get_df_metrics()
metrics = get_metrics('KNN', y_test, y_pred_knn)
df_metricas = rbind(df_metricas, metrics)
df_metricas 
##   modelo  accuracy sensitivity specificity       auc    recall
## 1    KNN 0.7940896   0.9151292   0.3771186 0.7848912 0.3459128

Tuneo de hiperparámetros para SVM Radial con mlrHyperopt (bayesiano)

# res0 <- svm_cv_opt(data = DataBaseTrain,
#                    label = FatalFixed,
#                    svm_kernel = "radial",
#                    degree_range = c(2L, 10L),
#                    n_folds = 5,
#                    kappa = 5,
#                    init_points = 5,
#                    n_iter = 25)
# 
# res0

SVM: Train, Fit & Test

# x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
# y_train = as.numeric(DataBaseTrain$FatalFixed)
# 
# x_test = DataBaseTest %>% dplyr::select(-FatalFixed)
# y_test = as.numeric(DataBaseTest$FatalFixed)

#fit_svm= svm(x =x_train, y = y_train , kernel = "radial",cost=4.6357, gamma=0.0424)
#svm_file = "svm.rds"
#saveRDS(fit_svm, svm_file)

fit_svm = readRDS(file = "svm.rds")

y_pred_svm = predict(fit_svm, x_test)

precrec_obj <- evalmod(scores = y_pred_svm, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_svm, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_svm)

df_metricas = get_df_metrics()
metrics = get_metrics('SVM', y_test, y_pred_svm)
df_metricas = rbind(df_metricas, metrics)
df_metricas 
##   modelo  accuracy sensitivity specificity      auc    recall
## 1    SVM 0.8455672   0.8745387   0.7457627 0.881838 0.4246226

Tuneo de hiperparámetros para un Random Forest con MLR (GridSearch)

# getParamSet("classif.randomForest")
# rf <- makeLearner("classif.randomForest", predict.type = "response", par.vals = list(ntree = 200, mtry = 3))
# rf$par.vals <- list(
#   importance = TRUE
# )
# 
# rf_param <- makeParamSet(
#   makeDiscreteParam("ntree",  values = c(400,500, 600, 700)),
#   makeDiscreteParam("mtry", values = c(30,40,50,60)),
#   makeDiscreteParam("nodesize", values  = c(2,4,8,10))
# )
# 
# rancontrol <- makeTuneControlRandom(maxit = 50L)
# set_cv <- makeResampleDesc("CV",iters = 3L)
# r_tune <- tuneParams(learner = rf, resampling = set_cv, task = trainTask, par.set = rf_param, control = rancontrol, measures = acc)
# 
# r_tune$x
# r_tune$y

Random Forest: Train, Fit & Test

reset.seed()
#fit_randomf = ranger(FatalFixed ~ ., data = DataBaseTrain, importance = "impurity", num.trees =500, mtry = 12 , min.node.size = 150, probability = TRUE)
#rf_file = "rf.rds"
#saveRDS(fit_randomf, rf_file)

fit_randomf = readRDS(file = "rf.rds")


p = predict(fit_randomf, DataBaseTest)           
y_pred_rf =  p$predictions[,2]

precrec_obj <- evalmod(scores = y_pred_rf, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_rf, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_rf)

df_metricas = get_df_metrics()
metrics = get_metrics('Random Forest', y_test, y_pred_rf)
df_metricas = rbind(df_metricas, metrics)
df_metricas 
##          modelo accuracy sensitivity specificity       auc    recall
## 1 Random Forest 0.828408   0.9114391   0.5423729 0.8616471 0.3889929

Tuneo de hiperparámetros para GBM con hyperopt (bayesiano)

 # reset.seed()
 # res = hyperopt(trainTask, learner = "classif.gbm")
 # res

GBM: Train, Fit & Test

x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
x_test = DataBaseTest %>% dplyr::select(-FatalFixed,)
y_train = as.numeric(DataBaseTrain$FatalFixed)

#gbm1 = gbm(FatalFixed~ ., data = DataBaseTrain, distribution = "bernoulli", bag.fraction = 0.5, n.trees = 1000, interaction.depth =6, shrinkage = 0.1, n.minobsinnode = 10)
#gbm_file = "gbm.rds"
#saveRDS(gbm1, gbm_file)

gbm1 = readRDS(file = "gbm.rds")

y_pred_xg <- predict(object= gbm1, x_test, n.trees=500, type = 'response')

precrec_obj <- evalmod(scores = y_pred_xg, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_xg, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_xg)
df_metricas = get_df_metrics()
metrics = get_metrics('GBM', y_test, y_pred_xg)
df_metricas = rbind(df_metricas, metrics)
df_metricas  
##   modelo  accuracy sensitivity specificity      auc    recall
## 1    GBM 0.8541468   0.9557196   0.5042373 0.899324 0.5107009

Logistic Regression -> Step Backward

lr <- glm(formula = fmla, data=DataBaseTrain, family=binomial(link="logit"))

#step= step(lr,direction="backward",trace= FALSE )
#file = "step.rds"
#saveRDS(step, file)


# aic =step.log.back$aic
# coefficients = step.log.back$coefficients
# aic
# coefficients

Logistic Regression: Train, Fit & Test

#data.frame(summary(lr)$coef[summary(lr)$coef[,4] <= .05, 4])

#lr2 <- glm(formula = fmla2, data=DataBaseTrain, family=binomial(link="logit"))
#lr_file = "lr_file.rds"
#saveRDS(lr2, lr_file)

lr2 = readRDS(file = "lr_file.rds") 
 
y_pred_lr = predict(lr2, DataBaseTest, type="response")

precrec_obj <- evalmod(scores = y_pred_lr, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores =y_pred_lr, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_lr)
df_metricas = get_df_metrics()
metrics = get_metrics('Logistic Regression', y_test, y_pred_lr)
df_metricas = rbind(df_metricas, metrics)
df_metricas  
##                modelo  accuracy sensitivity specificity       auc   recall
## 1 Logistic Regression 0.8007626   0.9717097   0.2118644 0.7989008 0.385087

Multilayer Perceptron: Train, Fit & Test

x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
y_train = as.numeric(DataBaseTrain$FatalFixed)

x_test = DataBaseTest %>% dplyr::select(-FatalFixed)
y_test = as.numeric(DataBaseTest$FatalFixed)


#mod_w <- monmlp.fit(as.matrix(x_train), as.matrix(y_train), hidden1=5, n.ensemble = 15, bag = TRUE,iter.max = 500, silent=TRUE, control = list(trace = 0))
#monmlp_file = "monmlp.rds"
#saveRDS(mod_w, monmlp_file)

mod_w = readRDS(file = "monmlp.rds")

y_pred_mlp <- monmlp.predict(x = as.matrix(x_test), weights = mod_w)


precrec_obj <- evalmod(scores = y_pred_mlp, labels = y_test)
autoplot(precrec_obj)

precrec_obj2 <- evalmod(scores = y_pred_mlp, labels = y_test, mode="basic")
autoplot(precrec_obj2)

auc = auc(y_test,y_pred_mlp)
df_metricas = get_df_metrics()
metrics = get_metrics('MLP', y_test, y_pred_mlp)
df_metricas = rbind(df_metricas, metrics)
df_metricas 
##   modelo  accuracy sensitivity specificity       auc    recall
## 1    MLP 0.8379409   0.9372694   0.4957627 0.8780151 0.4526358

Gráfico de todos los modelos

scores1 <- join_scores(y_pred_dt, y_pred_xg, y_pred_rf, y_pred_lr, y_pred_svm, y_pred_knn, y_pred_mlp, y_pred_rr)
labels1 <-join_labels(y_test, y_test, y_test, y_test, y_test, y_test, y_test, y_test)
msmdat2 <- mmdata(scores1, labels1, modnames = c("Decision Tree", "GBM", "Random Forest", "Logistic Regression", "SVM Radial", "KNN", "MLP", "Ridge"))


write.csv(y_test, "Scores/Y.csv")

write.csv(y_pred_rf, "Scores/prediction_RF.csv")
write.csv(y_pred_svm, "Scores/prediction_SVM.csv")
write.csv(y_pred_xg, "Scores/prediction_GBM.csv")
write.csv(y_pred_mlp, "Scores/prediction_MLP.csv")


mscurves <- evalmod(msmdat2)
autoplot(mscurves)

Métricas de todos los modelos

  Model_Metrics = get_df_metrics()
 

Model_Metrics = rbind(Model_Metrics, get_metrics('Decision Tree', y_test, y_pred_dt))
Model_Metrics = rbind(Model_Metrics, get_metrics('Random Forest', y_test, y_pred_rf))
Model_Metrics = rbind(Model_Metrics, get_metrics('KNN', y_test, y_pred_knn))
Model_Metrics = rbind(Model_Metrics, get_metrics('SVM Radial', y_test, y_pred_svm))
Model_Metrics = rbind(Model_Metrics, get_metrics('GBM', y_test, y_pred_xg))
Model_Metrics = rbind(Model_Metrics, get_metrics('Logist Regresion', y_test, y_pred_lr))
Model_Metrics = rbind(Model_Metrics, get_metrics('MLP', y_test, y_pred_mlp))
Model_Metrics = rbind(Model_Metrics, get_metrics('Ridge Regresion', y_test, y_pred_rr))


Model_Metrics
##             modelo  accuracy sensitivity specificity       auc    recall
## 1    Decision Tree 0.8198284   0.9237392   0.4618644 0.8277071 0.4219482
## 2    Random Forest 0.8284080   0.9114391   0.5423729 0.8616471 0.3889929
## 3              KNN 0.7940896   0.9151292   0.3771186 0.7848912 0.3459128
## 4       SVM Radial 0.8455672   0.8745387   0.7457627 0.8818380 0.4246226
## 5              GBM 0.8541468   0.9557196   0.5042373 0.8993240 0.5107009
## 6 Logist Regresion 0.8007626   0.9717097   0.2118644 0.7989008 0.3850870
## 7              MLP 0.8379409   0.9372694   0.4957627 0.8780151 0.4526358
## 8  Ridge Regresion 0.7969495   0.9876999   0.1398305 0.7877265 0.3601554
write.csv(Model_Metrics,'model_metrics.csv')  

Ensamble Final: Random Forest + GBM + SVM + MLP - Ponderados

 DataBaseTest$XGProba = y_pred_xg
 DataBaseTest$RFProba = y_pred_rf
 DataBaseTest$SVMProba = y_pred_svm
 DataBaseTest$MLProba = y_pred_mlp
 

 
DataBaseEnsamble = DataBaseTest %>% select(FatalFixed, XGProba, RFProba, MLProba, SVMProba   )
 DataBaseEnsamble = DataBaseEnsamble %>% rowwise() %>%  mutate(ENProba =  (XGProba * 0.5 + RFProba *0.1+ SVMProba *0.2+ MLProba *0.2) )
 
 x = DataBaseEnsamble$ENProba
 y = DataBaseEnsamble$FatalFixed
 
 precrec_obj <- evalmod(scores = x, labels = y)
 autoplot(precrec_obj)

 precrec_obj2 <- evalmod(scores =x, labels = y, mode="basic")
 autoplot(precrec_obj2)

 auc = auc(y,x)
 metrics = get_metrics('Ensamble', y, x)
 metrics
##     modelo  accuracy sensitivity specificity       auc    recall
## 1 Ensamble 0.8531935   0.8942189   0.7118644 0.9026153 0.4697014

Variables más explicativas!

df_coef = data.frame(summary(lr2)$coef[summary(lr2)$coef[,4] <= .05, 4])
names(df_coef)[1] <- "Coeficientes"

df_coef
##                         Coeficientes
## (Intercept)             2.495153e-03
## ACTIVITY_UNKNOWN        1.937225e-07
## ACTIVITY_STANDING       3.925666e-07
## ACTIVITY_SURFING        2.382149e-18
## ACTIVITY_DIVING         2.638214e-07
## ACTIVITY_SWIMMING       5.352891e-04
## ACTIVITY_FISHING        3.225815e-11
## ACTIVITY_WADING         9.965068e-07
## ACTIVITY_SNORKELING     8.305828e-06
## ACTIVITY_BOATING        7.766332e-03
## ACTIVITY_BATHING        2.665499e-02
## ACTIVITY_KAYAKING       1.223358e-04
## ACTIVITY_FLOATING       2.091669e-04
## ACTIVITY_ROWING         1.029712e-03
## TIME_4_8                1.567126e-03
## TIME_12_16              8.933454e-03
## TIME_16_20              2.282226e-03
## SEX_M                   2.813902e-03
## AGE_19_25               1.357074e-02
## AGE_66_75               3.941546e-04
## LARGO_1_2               8.362733e-18
## LARGO_2_3               1.857217e-04
## LARGO_3_4               7.995011e-03
## CONTINENT_NORTH_AMERICA 1.139816e-17
## CONTINENT_OCEANIA       1.971077e-02
## SPECIE_UNKNOWN          3.917392e-10
## SPECIE_WHITE_SHARK      3.840064e-10
## SPECIE_TIGER_SHARK      8.825148e-11
## SPECIE_BULL_SHARK       9.855771e-10
## SPECIE_COPPER_SHARK     6.454496e-04
## SPECIE_BLUE_SHARK       1.593039e-04
## SPECIE_ZAMBESI_SHARK    9.385340e-07

Actividades

Fishing ~ Surfing ~ Snorkeling ~ Swiming


Especies

White ~ Tiger ~ Bull ~ Sanbezi


Paises

USA ~ Australia ~ Papau Guinea ~ Filipinas


Miscelaneos

De la madrugada al final de la tarde ~ Hombres jovenes y adultos