rm(list = ls(all.names = TRUE))
options(warn=-1)
library(ipred)
library(plotly)
library(monmlp)
library(ggthemes)
source('performance_util.R')
source('functions_util.R')
library(pROC)
require(e1071)
library(xgboost)
library(precrec)
library(gbm)
library(mlr)
library(purrr)
library(tibble)
library(tidyr)
library(tm)
library(tidyr)
library(readr)
library(InformationValue)
library(sqldf)
library(dplyr)
#library(dummies)
library(rpart)
library(rpart.plot)
library(caret)
library(ROCR)
library(data.table)
library(mltools)
library(PerformanceAnalytics)
library(Hmisc)
library(corrplot)
library(skimr)
library(reshape)
library(ggplot2)
library(randomForest)
library(sqldf)
library(ranger)
library(mlrHyperopt)
library(GGally)
library(MlBayesOpt)
library(mlbench)
library(plyr);
library(parallel)
library(akima)
reset.seed <- function()
{
set.seed(4200)
}
head(DataBase)
## Case.Number Date Year Type Country Area
## 1 2018.06.25 25-Jun-2018 2018 Boating USA California
## 2 2018.06.18 18-Jun-2018 2018 Unprovoked USA Georgia
## 3 2018.06.09 09-Jun-2018 2018 Invalid USA Hawaii
## 4 2018.06.08 08-Jun-2018 2018 Unprovoked AUSTRALIA New South Wales
## 5 2018.06.04 04-Jun-2018 2018 Provoked MEXICO Colima
## 6 2018.06.03.b 03-Jun-2018 2018 Unprovoked AUSTRALIA New South Wales
## Location Activity Name Sex Age
## 1 Oceanside, San Diego County Paddling Julie Wolfe F 57
## 2 St. Simon Island, Glynn County Standing Adyson\xa0McNeely F 11
## 3 Habush, Oahu Surfing John Denges M 48
## 4 Arrawarra Headland Surfing male M
## 5 La Ticla Free diving Gustavo Ramos M
## 6 Flat Rock, Ballina Kite surfing Chris \x85 M
## Injury Fatal..Y.N.
## 1 No injury to occupant, outrigger canoe and paddle damaged N
## 2 Minor injury to left thigh N
## 3 Injury to left lower leg from surfboard skeg N
## 4 Minor injury to lower leg N
## 5 Lacerations to leg & hand shark PROVOKED INCIDENT N
## 6 No injury, board bitten N
## Time Species Investigator.or.Source
## 1 18h00 White shark R. Collier, GSAF
## 2 14h00 -15h00 K.McMurray, TrackingSharks.com
## 3 07h45 K.McMurray, TrackingSharks.com
## 4 2 m shark B. Myatt, GSAF
## 5 Tiger shark, 3m A .Kipper
## 6 Daily Telegraph, 6/4/2018
## pdf
## 1 2018.06.25-Wolfe.pdf
## 2 2018.06.18-McNeely.pdf
## 3 2018.06.09-Denges.pdf
## 4 2018.06.08-Arrawarra.pdf
## 5 2018.06.04-Ramos.pdf
## 6 2018.06.03.b-FlatRock.pdf
## href.formula
## 1 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf
## 2 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf
## 3 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf
## 4 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf
## 5 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.04-Ramos.pdf
## 6 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.03.b-FlatRock.pdf
## href
## 1 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf
## 2 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf
## 3 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf
## 4 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf
## 5 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.04-Ramos.pdf
## 6 http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.03.b-FlatRock.pdf
## Case.Number.1 Case.Number.2 original.order X X.1 DateFixed Indice
## 1 2018.06.25 2018.06.25 6303 25-Jun-2018 1
## 2 2018.06.18 2018.06.18 6302 18-Jun-2018 2
## 3 2018.06.09 2018.06.09 6301 09-Jun-2018 3
## 4 2018.06.08 2018.06.08 6300 08-Jun-2018 4
## 5 2018.06.04 2018.06.04 6299 04-Jun-2018 5
## 6 2018.06.03.b 2018.06.03.b 6298 03-Jun-2018 6
str(DataBase)
## 'data.frame': 6301 obs. of 26 variables:
## $ Case.Number : chr "2018.06.25" "2018.06.18" "2018.06.09" "2018.06.08" ...
## $ Date : chr "25-Jun-2018" "18-Jun-2018" "09-Jun-2018" "08-Jun-2018" ...
## $ Year : num 2018 2018 2018 2018 2018 ...
## $ Type : chr "Boating" "Unprovoked" "Invalid" "Unprovoked" ...
## $ Country : chr "USA" "USA" "USA" "AUSTRALIA" ...
## $ Area : chr "California" "Georgia" "Hawaii" "New South Wales" ...
## $ Location : chr "Oceanside, San Diego County" "St. Simon Island, Glynn County" "Habush, Oahu" "Arrawarra Headland" ...
## $ Activity : chr "Paddling" "Standing" "Surfing" "Surfing" ...
## $ Name : chr "Julie Wolfe" "Adyson\xa0McNeely " "John Denges" "male" ...
## $ Sex : chr "F" "F" "M" "M" ...
## $ Age : chr "57" "11" "48" "" ...
## $ Injury : chr "No injury to occupant, outrigger canoe and paddle damaged" "Minor injury to left thigh" "Injury to left lower leg from surfboard skeg" "Minor injury to lower leg" ...
## $ Fatal..Y.N. : chr "N" "N" "N" "N" ...
## $ Time : chr "18h00" "14h00 -15h00" "07h45" "" ...
## $ Species : chr "White shark" "" "" "2 m shark" ...
## $ Investigator.or.Source: chr "R. Collier, GSAF" "K.McMurray, TrackingSharks.com" "K.McMurray, TrackingSharks.com" "B. Myatt, GSAF" ...
## $ pdf : chr "2018.06.25-Wolfe.pdf" "2018.06.18-McNeely.pdf" "2018.06.09-Denges.pdf" "2018.06.08-Arrawarra.pdf" ...
## $ href.formula : chr "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf" ...
## $ href : chr "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.25-Wolfe.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.18-McNeely.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.09-Denges.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2018.06.08-Arrawarra.pdf" ...
## $ Case.Number.1 : chr "2018.06.25" "2018.06.18" "2018.06.09" "2018.06.08" ...
## $ Case.Number.2 : chr "2018.06.25" "2018.06.18" "2018.06.09" "2018.06.08" ...
## $ original.order : int 6303 6302 6301 6300 6299 6298 6297 6296 6295 6294 ...
## $ X : chr "" "" "" "" ...
## $ X.1 : chr "" "" "" "" ...
## $ DateFixed : chr "25-Jun-2018" "18-Jun-2018" "09-Jun-2018" "08-Jun-2018" ...
## $ Indice : int 1 2 3 4 5 6 7 8 9 10 ...
La variable de clase es si el ataque fue fatal o no. Pa En este sentido, se tienen que realizar las siguiente tareas * Estandarizar * Integrar * Inferir las nulas en base a otra variable, en este caso ‘Injuries’
DataBase = DataBase %>% mutate(Fatal = case_when(
Fatal..Y.N. %like% '%Y%' ~ 'Y',
Fatal..Y.N. %like% '%y%' ~ 'Y',
Fatal..Y.N. %like% '%N%' ~ 'N',
Fatal..Y.N. %like% '%n%' ~ 'N',
TRUE ~ Fatal..Y.N.))
DataBase$Fatal = toupper(DataBase$Fatal )
DataBase$Fatal = str_trim(DataBase$Fatal)
DataBase = DataBase %>% mutate(Fatal =
case_when(
(Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'FATAL' ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Fatal' ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Presumed FATAL' ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Presumed FATAL' ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & Injury == 'Survived' ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Minor',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Feet injured',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Foot injured',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Laceration to knee',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Abrasions',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Calf bitten',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Survived',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Human remains',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('remains found',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('body found',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('recovered',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('murdered',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('fatal',Injury, ignore.case = TRUE)==TRUE & grepl('non-fatal',Injury, ignore.case = TRUE)==FALSE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('no injury',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('no attack',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('non-fatal',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('minor injuries',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('no injuries',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('injuries',Injury, ignore.case = TRUE)==TRUE ~ 'N',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('presummed fatal',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Human remains',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
(Fatal =='' | Fatal == 'UNKNOWN') & grepl('Death',Injury, ignore.case = TRUE)==TRUE ~ 'Y',
TRUE ~ Fatal)
)
DataBase = DataBase %>% mutate (FatalFixed =
case_when(
Fatal == 'Y' ~ 1,
Fatal == 'N' ~ 0,
TRUE ~ -1
)
)
La fecha es un string. Además de ello, no tiene una codificación facilmente transformable. Tareas a realizar: * Pasar los meses a valores enteros * Extraer dÃa y año. * Convertir a formato fecha
DataBase = DataBase %>% mutate(DateFixed = gsub(" +", "", DateFixed))
DataBase = DataBase %>% mutate(YearFixed = ifelse(is.na(Year), right(DateFixed, 4), Year))
DataBase = DataBase %>% mutate(MonthFixed =
case_when(
grepl('jan',DateFixed, ignore.case = TRUE )==TRUE ~ '01',
grepl('feb',DateFixed, ignore.case = TRUE )==TRUE ~ '02',
grepl('mar',DateFixed, ignore.case = TRUE )==TRUE ~ '03',
grepl('apr',DateFixed, ignore.case = TRUE )==TRUE ~ '04',
grepl('may',DateFixed, ignore.case = TRUE )==TRUE ~ '05',
grepl('jun',DateFixed, ignore.case = TRUE )==TRUE ~ '06',
grepl('jul',DateFixed, ignore.case = TRUE )==TRUE ~ '07',
grepl('aug',DateFixed, ignore.case = TRUE )==TRUE ~ '08',
grepl('sep',DateFixed, ignore.case = TRUE )==TRUE ~ '09',
grepl('oct',DateFixed, ignore.case = TRUE )==TRUE ~ '10',
grepl('nov',DateFixed, ignore.case = TRUE )==TRUE ~ '11',
grepl('dec',DateFixed, ignore.case = TRUE )==TRUE ~ '12',
TRUE ~ '00'
)
)
DataBase = DataBase %>% mutate(DayFixed =case_when
(nchar(DateFixed)>=10 & !is.na(as.numeric(gsub('-','', left(DateFixed,2))) ) ~ gsub('-','', left(DateFixed,2)),
TRUE ~ '00'))
DataBase = DataBase %>% mutate(DateFormated =
case_when(
YearFixed > 0 & MonthFixed !='00' & DayFixed != '00' ~ paste(YearFixed,MonthFixed,DayFixed, sep='/'),
TRUE ~ ''
)
)
DataBase = DataBase %>% mutate(WeekDayNumber = wday(DateFormated))
DataBase = DataBase %>% mutate(WeekDayNumber = ifelse(is.na(WeekDayNumber), 99, WeekDayNumber))
DataBase = DataBase %>% mutate(TypeFixed = case_when(
Type %in% c('Boat','Boatomg') ~ 'Boating',
Type =='' ~ 'TYPE_UNKNOWN',
TRUE ~ Type ))
DataBase$TypeFixed = toupper( gsub(' ', '_', DataBase$TypeFixed ))
DataBase = DataBase %>% mutate(TimeText = Time)
DataBase = DataBase %>% mutate(TimeText= gsub('"','',TimeText))
DataBase = DataBase %>% mutate(TimeText= gsub("'",'',TimeText))
Times = sqldf ("
SELECT Indice,
CASE WHEN TimeText ='Afternoon' THEN '1500'
WHEN TimeText ='Morning' THEN '1000'
WHEN TimeText ='AM' THEN '1100'
WHEN TimeText ='A.M.' THEN '1100'
WHEN TimeText = 'Just before dawn' THEN '600'
WHEN TimeText = 'Dawn'THEN '650'
WHEN TimeText = 'Dusk'THEN '1900'
WHEN TimeText = 'P.M.' THEN '1600'
WHEN TimeText = 'Dark' THEN '2100'
WHEN TimeText = '<a0>' THEN ''
WHEN TimeText = '06j00' THEN '0600'
WHEN TimeText = 'Lunchtime' THEN '1200'
WHEN TimeText = '--' THEN ''
WHEN TimeText = 'After dark' THEN '2000'
WHEN TimeText = 'After lunch' THEN '1230'
WHEN TimeText ='Morning ' THEN '1000'
WHEN TimeText ='Morning ' THEN '1000'
WHEN TimeText ='Night' THEN '2000'
WHEN TimeText ='Evening' THEN '1900'
WHEN TimeText ='Late afternoon' THEN '1730'
WHEN TimeText ='Early morning' THEN '700'
WHEN TimeText ='Midday' THEN '1200'
WHEN TimeText ='Early afternoon' THEN '1500'
WHEN TimeText ='Midnight' THEN '2330'
WHEN TimeText ='After noon' THEN'1600'
WHEN TimeText ='Just before noon' THEN '1300'
WHEN TimeText ='Late afternon' THEN '1800'
WHEN TimeText ='After 04h00' THEN '1630'
WHEN TimeText ='After Dusk' THEN '1930'
WHEN TimeText ='After dark' THEN '2000'
WHEN TimeText ='After dusk' THEN '1930'
WHEN TimeText ='After lunch' THEN '1200'
WHEN TimeText ='After midnight' THEN '2330'
WHEN TimeText ='Before daybreak' THEN' 700'
WHEN TimeText ='Daybreak' THEN '800'
WHEN TimeText ='Daytime' THEN '1200'
WHEN TimeText ='Early Morning' THEN '900'
WHEN TimeText ='Early evening' THEN '1800'
WHEN TimeText ='Just after 12h00' THEN '1230'
WHEN TimeText ='Late Afternoon' THEN '1700'
WHEN TimeText ='Late morning' THEN '1100'
WHEN TimeText ='Late night' THEN '2200'
WHEN TimeText ='Mid afternoon' THEN '1600'
WHEN TimeText ='Mid morning' THEN '1000'
WHEN TimeText ='Mid-morning' THEN '1000'
WHEN TimeText ='Midday.' THEN '1200'
WHEN TimeText ='Morning' THEN '1500'
WHEN TimeText ='Nightfall' THEN '2030'
WHEN TimeText ='Noon' THEN '1200'
WHEN TimeText ='Shortly after midnight' THEN '2330'
WHEN TimeText ='night' THEN '2200'
WHEN TimeText LIKE 'sundown' THEN '2000'
WHEN TimeText like '%SunseT%' THEN '1840'
WHEN TimeText = '8:04 pm' THEN '2004'
WHEN TimeText like '%sundown%' THEN '1930'
WHEN TimeText like '%dusk%' THEN '1930'
WHEN TimeText ='30 minutes after 1992.07.08.a' THEN ''
ELSE RTRIM(LTRIM(TimeText)) END AS TimeText
, LENGTH(TimeText) AS TimeLen
FROM DataBase"
)
stoppers = c('<','>' ,'Before', 'Sometime', 'between' ,'2 hours after Opperman', 'Prior to ', 'Between', 'Shortly', 'Just before', 'Dusk', ')', '(' ,'before', 'just','(Sunset)', 'X' )
Times = Times %>% mutate(TimeText = removeWords(TimeText, stoppers ))
Times$TimeText = gsub('-', ' - ', Times$TimeText )
Times$TimeText = trim(Times$TimeText)
Times= Times %>% mutate(TimeFixed = as.integer(left(extract_numeric(left(TimeText,6)),4)))
TimesToDB = Times %>% dplyr::select (Indice, TimeFixed)
DataBase = merge(DataBase, TimesToDB, by ='Indice')
DataBase$TimeFixed = as.integer(DataBase$TimeFixed)
CountriesFixed = read.csv('countries_fixed.csv', header = TRUE, sep = ';')
CountriesFixed$CountryFixed = trim( CountriesFixed$CountryFixed )
CountriesFixed$CountryFixed = gsub(',', '',CountriesFixed$CountryFixed )
CountriesFixed$CountryFixed = toupper(gsub(' ', '_',CountriesFixed$CountryFixed ))
CountriesFixed$ContinentFixed = toupper(gsub(' ', '_',CountriesFixed$ContinentFixed ))
CountriesFixedToDB = CountriesFixed %>% dplyr::select (Country, CountryFixed, ContinentFixed, HemisphereFixed)
DataBase = merge(x= DataBase, y = CountriesFixedToDB, by ='Country')
Species = sqldf (
"SELECT Indice,
Species,
CASE WHEN Species like '%white%' THEN 'WHITE SHARK'
WHEN Species like '%BLUE%' THEN 'BLUE SHARK'
WHEN Species like '%grey%' THEN 'GREY SHARK'
WHEN Species like '%bull%' THEN 'BULL SHARK'
WHEN Species like '%Wobbegong%' THEN 'WOBBEGONG SHARK'
WHEN Species like '%Blacktip%' THEN 'BLACKTIP SHARK'
WHEN Species like '%Galapagos%' THEN 'GALAPAGOS SHARK'
WHEN Species like '%Tiger%' THEN 'TIGER SHARK'
WHEN Species like '%cookiecutter%' THEN 'COOKIECUTTER SHARK'
WHEN Species like '%MAKO%' THEN 'MAKO SHARK'
WHEN Species like '%Spinner%' THEN 'SPINNER SHARK'
WHEN Species like '%Porbeagle%' THEN 'PORBEAGLE SHARK'
WHEN Species like '%COW%' THEN 'COW SHARK'
WHEN Species like '%Nurse%' THEN 'NURSE SHARK'
WHEN Species like '%hammerhead%' THEN 'HAMMERHEAD SHARK'
WHEN Species like '%Lemon%' THEN 'LEMON SHARK'
WHEN Species like '%SEVEN%' THEN 'SEVENHILL SHARK'
WHEN Species like '%ANGEL%' THEN 'ANGEL SHARK'
WHEN Species like '%Silky%' THEN 'SILKY SHARK'
WHEN Species like '%Goblin%' THEN 'GOBLIN SHARK'
WHEN Species like '%Sandbar%' THEN 'SANDBAR SHARK'
WHEN Species like '%Raggedtooth%' THEN 'RAGGEDTOOTH SHARK'
WHEN Species like '%SALMON%' THEN 'SALMON SHARK'
WHEN Species like '%reef%' THEN 'REEF SHARK'
WHEN Species like '%dogfish%' THEN 'DOGFISH SHARK'
WHEN Species like '%gill%' THEN 'SEVENHILL SHARK'
WHEN Species like '%Port%Jackson%' THEN 'PORT JACKSON SHARK'
WHEN Species like '%Zambesi%' THEN 'ZAMBESI SHARK'
WHEN Species like '%Spurdog%' THEN 'SPURDOG SHARK'
WHEN Species like '%Dusky%' THEN 'DUSKY SHARK'
WHEN Species like '%Basking%' THEN 'BASKIN SHARK'
WHEN Species like '%Sand %' THEN 'SAND SHARK'
WHEN Species like '%catshark%' THEN 'CATSHARK SHARK'
WHEN Species like '%copper%' THEN 'COPPER SHARK'
WHEN Species like '%black%tipped%' THEN 'BLACKTIP SHARK'
WHEN Species like '%leucas%' THEN 'LEUCAS SHARK'
WHEN Species like '%whaler%' THEN 'COPPER SHARK'
WHEN Species like '%bronze%' THEN 'COPPER SHARK'
WHEN Species like '%Soupfin%' THEN 'SOUPFIN SHARK'
WHEN Species like '%Leopard%' THEN 'LEOPARD SHARK'
WHEN Species like '%gaffed%' THEN 'LEOPARD SHARK'
WHEN Species like '%silvertip%' THEN 'CARCHARHINUS SHARK'
WHEN Species like '%gray%' THEN 'CARCHARHINUS SHARK'
WHEN Species like '%maculpinnis%' THEN 'MALCULPINNIS SHARK'
WHEN Species like '%Whale %' THEN 'WHALE SHARK'
WHEN Species like '%Carpet%' THEN 'CARPET SHARK'
WHEN Species like '%albimarginatus%' THEN 'CARCHARHINUS SHARK'
WHEN Species like '%broadnose%' THEN 'SEVENHILL SHARK'
WHEN Species like '%stingray%' THEN 'STINGRAY SHARK'
WHEN Species like '%Rhizoprionodon%' THEN 'RHIZOPRINODON SHARK'
WHEN Species like '%BANJO%' THEN 'BANJO SHARK'
WHEN Species like '%Thresher%' THEN 'THRESHER SHARK'
WHEN Species like '%leucas%' THEN 'ZAMBESI SHARK'
WHEN Species like '%Shovelnose%' THEN 'SHOVELNOSE SHARK'
WHEN Species like '%macrurus%' THEN 'DUSKY SHARK'
WHEN Species like '%Bronze%whaler%' THEN 'COPPER SHARK'
WHEN Species ='' THEN 'SPECIE_UNKNOWN'
ELSE 'SPECIE_UNKNOWN' END AS SpeciesFixed
FROM DataBase
"
)
Species$SpeciesFixed = gsub(' ', '_',Species$SpeciesFixed )
DataBase = DataBase %>% mutate(Age = gsub("<bd>","",Age))
DataBase = DataBase %>% mutate(Age = gsub(">","",Age))
DataBase = DataBase %>% mutate(AgeFixed = as.integer(abs(extract_numeric(Age))))
DataBase = DataBase %>% mutate(AgeFixed = ifelse(grepl('teen',Age, ignore.case = TRUE )==TRUE , 14, AgeFixed))
DataBase = DataBase %>% mutate(AgeFixed = ifelse(grepl('young',Age, ignore.case = TRUE )==TRUE , 25, AgeFixed))
DataBase = DataBase %>% mutate(AgeFixed = ifelse(is.na(AgeFixed), -1, AgeFixed))
DataBase = DataBase %>% mutate(AgeFixed = ifelse(AgeFixed>105, -1, AgeFixed))
ActivitiesFixed = sqldf(
" SELECT DISTINCT Indice
, Activity
, CASE WHEN Activity LIKE '%SURFING%' THEN 'SURFING'
WHEN Activity LIKE '%SWIMMING%' THEN 'SWIMMING'
WHEN Activity LIKE '%FISHING%' THEN 'FISHING'
WHEN Activity LIKE '%DIVING%' THEN 'DIVING'
WHEN Activity LIKE '%SPEARFISHING%' THEN 'SPEARFISHING'
WHEN Activity LIKE '%BATHING%' THEN 'BATHING'
WHEN Activity LIKE '%STANDING%' THEN 'STANDING'
WHEN Activity LIKE '%SNORKELING%' THEN 'SNORKELING'
WHEN Activity LIKE '%KAYAKING%' THEN 'KAYAKING'
WHEN Activity LIKE '%Jumped%overboard%' THEN 'FELL_OVERBOARD'
WHEN Activity LIKE '%fell%overboard%' THEN 'FELL_OVERBOARD'
WHEN Activity LIKE '%overboard%' THEN 'FELL_OVERBOARD'
WHEN Activity LIKE '%oveboard%' THEN 'FELL_OVERBOARD'
WHEN Activity LIKE '%Wading%' THEN 'WADING'
WHEN Activity LIKE '%Fell%into%water%' THEN 'FELL_INTO_WATER'
WHEN Activity LIKE '%Canoeing%' THEN 'CANOEING'
WHEN Activity LIKE '%Sailing%' THEN 'SAILING'
WHEN Activity LIKE '%Rowing%' THEN 'ROWING'
WHEN Activity LIKE '%Floating%' THEN 'FLOATING'
WHEN Activity LIKE '%boating%' THEN 'BOATING'
WHEN Activity LIKE '%boat%' THEN 'BOATING'
WHEN Activity LIKE '%Angling%' THEN 'FISHING'
WHEN Activity LIKE '%wreck%' THEN 'SHIPWRECK'
WHEN Activity LIKE '%Sea%Disaste%' THEN 'SHIPWRECK'
WHEN Activity LIKE '%Oystering%' THEN 'OYSTERING'
WHEN Activity LIKE '%CLAMMING%' THEN 'CLAMMING'
WHEN Activity LIKE '%WASHING%' THEN 'WASHING'
WHEN Activity LIKE '%Hunting%' THEN 'HUNTING'
WHEN Activity LIKE '%Jumped%out%' THEN 'FELL_OVERBOARD'
ELSE 'ACTIVITY_UNKNOWN' END AS ActivityFixed
FROM DataBase
"
)
ActivityToDB = ActivitiesFixed %>% dplyr::select(Indice, ActivityFixed)
DataBase = merge(x=DataBase, y =ActivityToDB, by = "Indice")
DataBase = DataBase %>% mutate(ActivityFixed = ifelse(ActivityFixed=='', 'ACTIVITY_UNKNOWN',ActivityFixed))
DataBase= DataBase %>% mutate (LargoMetros =str_extract(Species, "[:digit:]+[:punct:]?[:digit:]*[:space:]*[Cc]?[Mm]+"))
DataBase= DataBase %>% mutate (LargoPies =str_extract(Species, "[:digit:]+'"))
DataBase= DataBase %>% mutate (LargoMetros1 =
case_when (
grepl('cm',LargoMetros, ignore.case = TRUE) ~ extract_numeric(LargoMetros)/ 100,
TRUE ~ extract_numeric(gsub(',', '.', LargoMetros))
)
)
DataBase= DataBase %>% mutate (LargoMetros2 = extract_numeric(gsub(',', '.', LargoPies)) * 30.48 / 100)
DataBase= DataBase %>% mutate (LargoMetrosFixed = coalesce2(LargoMetros1, LargoMetros2 ) )
DataBase = DataBase %>% mutate(SeasonFixed = case_when (
HemisphereFixed == 'South' & MonthFixed %in% c('01','02','03') ~ 'Summer',
HemisphereFixed == 'South' & MonthFixed %in% c('04','04','06') ~ 'Autumn',
HemisphereFixed == 'South' & MonthFixed %in% c('07','08','09') ~ 'Winter',
HemisphereFixed == 'South' & MonthFixed %in% c('10','11','12') ~ 'Spring',
HemisphereFixed == 'North' & MonthFixed %in% c('01','02','03') ~ 'Winter',
HemisphereFixed == 'North' & MonthFixed %in% c('04','04','06') ~ 'Spring',
HemisphereFixed == 'North' & MonthFixed %in% c('07','08','09') ~ 'Summer',
HemisphereFixed == 'North' & MonthFixed %in% c('10','11','12') ~ 'Autumn',
TRUE ~ 'Season_Unknown'
)
)
# DataGraficos = sqldf(
# "SELECT A.indice,
# yearfixed,
# monthfixed,
# typefixed,
# speciesfixed,
# dateformated,
# weekdaynumber,
# hemispherefixed,
# agefixed,
# sex,
# activityfixed,
# countryfixed,
# continentfixed,
# area,
# largometrosfixed,
# CASE
# WHEN largometrosfixed BETWEEN 0 AND 1 THEN 'LARGO_0_1'
# WHEN largometrosfixed BETWEEN 1.01 AND 2 THEN 'LARGO_1_2'
# WHEN largometrosfixed BETWEEN 2.01 AND 3 THEN 'LARGO_2_3'
# WHEN largometrosfixed BETWEEN 3.01 AND 4 THEN 'LARGO_3_4'
# WHEN largometrosfixed BETWEEN 4.01 AND 5 THEN 'LARGO_4_5'
# WHEN largometrosfixed BETWEEN 5.01 AND 6 THEN 'LARGO_5_6'
# WHEN largometrosfixed > 6 THEN 'LARGO_MAS_6'
# ELSE 'N/A'
# END AS LargoEstandarizado,
# FatalFixed
# FROM DataBase A
# LEFT JOIN Species B
# ON a.indice = B.indice
# WHERE fatalfixed >= 0
# AND yearfixed >= 1900
# AND countryfixed != 'UNKNOWN' ")
#
# write.csv(DataGraficos,'shark_attacks_plots.csv')
#
CountryDummy =sqldf("SELECT Indice
, CASE WHEN COUNTRYFIXED = 'USA' THEN 1 ELSE 0 END AS COUNTRY_USA
, CASE WHEN COUNTRYFIXED = 'AUSTRALIA' THEN 1 ELSE 0 END AS COUNTRY_AUSTRALIA
, CASE WHEN COUNTRYFIXED = 'SOUTH_AFRICA' THEN 1 ELSE 0 END AS COUNTRY_SOUTH_AFRICA
, CASE WHEN COUNTRYFIXED = 'PAPUA_NEW_GUINEA' THEN 1 ELSE 0 END AS COUNTRY_PAPUA_GUINEA
, CASE WHEN COUNTRYFIXED = 'NEW_ZEALAND' THEN 1 ELSE 0 END AS COUNTRY_NEW_ZELAND
, CASE WHEN COUNTRYFIXED = 'BRAZIL' THEN 1 ELSE 0 END AS COUNTRY_BRAZIL
, CASE WHEN COUNTRYFIXED = 'BAHAMAS' THEN 1 ELSE 0 END AS COUNTRY_BAHAMAS
, CASE WHEN COUNTRYFIXED NOT IN ('USA', 'AUSTRALIA', 'SOUTH_AFRICA', 'BAHAMAS',
'PAPUA_NEW_GUINEA', 'NEW_ZEALAND', 'BRAZIL') THEN 1 ELSE 0 END AS COUNTRY_RESTO
FROM DataBase")
AreasDummy = sqldf("SELECT Indice
, CASE WHEN AREA = 'Florida' THEN 1 ELSE 0 END AS AREA_FLORIDA
, CASE WHEN AREA = 'New South Wales' THEN 1 ELSE 0 END AS AREA_SOUTH_WALES
, CASE WHEN AREA = 'Queensland' THEN 1 ELSE 0 END AS AREA_QUEENSLAND
, CASE WHEN AREA = 'Hawaii' THEN 1 ELSE 0 END AS AREA_HAWAII
, CASE WHEN AREA = 'California' THEN 1 ELSE 0 END AS AREA_CALIFORNIA
, CASE WHEN AREA = 'KwaZulu-Natal' THEN 1 ELSE 0 END AS AREA_KWAZULU
, CASE WHEN AREA = 'Western Australia' THEN 1 ELSE 0 END AS AREA_WEST_AUSTRALIA
, CASE WHEN AREA = 'Eastern Cape Province' THEN 1 ELSE 0 END AS AREA_EAST_CAPE
, CASE WHEN AREA = 'Western Cape Province' THEN 1 ELSE 0 END AS AREA_WEST_CAPE
FROM DataBase")
SpeciesDummy = sqldf("SELECT Indice,
CASE WHEN speciesfixed = 'SPECIE_UNKNOWN' THEN 1 ELSE 0 END AS SPECIE_UNKNOWN,
CASE WHEN speciesfixed = 'WHITE_SHARK' THEN 1 ELSE 0 END AS SPECIE_WHITE_SHARK,
CASE WHEN speciesfixed = 'TIGER_SHARK' THEN 1 ELSE 0 END AS SPECIE_TIGER_SHARK,
CASE WHEN speciesfixed = 'BLACKTIP_SHARK' THEN 1 ELSE 0 END AS SPECIE_BLACKTIP_SHARK,
CASE WHEN speciesfixed = 'BULL_SHARK' THEN 1 ELSE 0 END AS SPECIE_BULL_SHARK,
CASE WHEN speciesfixed = 'COPPER_SHARK' THEN 1 ELSE 0 END AS SPECIE_COPPER_SHARK,
CASE WHEN speciesfixed = 'GREY_SHARK' THEN 1 ELSE 0 END AS SPECIE_GREY_SHARK,
CASE WHEN speciesfixed = 'NURSE_SHARK' THEN 1 ELSE 0 END AS SPECIE_NURSE_SHARK,
CASE WHEN speciesfixed = 'BLUE_SHARK' THEN 1 ELSE 0 END AS SPECIE_BLUE_SHARK,
CASE WHEN speciesfixed = 'MAKO_SHARK' THEN 1 ELSE 0 END AS SPECIE_MAKO_SHARK,
CASE WHEN speciesfixed = 'WOBBEGONG_SHARK' THEN 1 ELSE 0 END AS SPECIE_WOBBEGONG_SHARK,
CASE WHEN speciesfixed = 'HAMMERHEAD_SHARK' THEN 1 ELSE 0 END AS SPECIE_HAMMERHEAD_SHARK,
CASE WHEN speciesfixed = 'RAGGEDTOOTH_SHARK' THEN 1 ELSE 0 END AS SPECIE_RAGGEDTOOTH_SHARK,
CASE WHEN speciesfixed = 'LEMON_SHARK' THEN 1 ELSE 0 END AS SPECIE_LEMON_SHARK,
CASE WHEN speciesfixed = 'REEF_SHARK' THEN 1 ELSE 0 END AS SPECIE_REEF_SHARK,
CASE WHEN speciesfixed = 'ZAMBESI_SHARK' THEN 1 ELSE 0 END AS SPECIE_ZAMBESI_SHARK,
CASE WHEN speciesfixed = 'SPINNER_SHARK' THEN 1 ELSE 0 END AS SPECIE_SPINNER_SHARK,
CASE WHEN speciesfixed = 'SEVENHILL_SHARK' THEN 1 ELSE 0 END AS SPECIE_SEVENHILL_SHARK,
CASE WHEN speciesfixed NOT IN( 'SPECIE_UNKNOWN', 'WHITE_SHARK', 'TIGER_SHARK', 'BULL_SHARK', 'BLACKTIP_SHARK', 'COPPER_SHARK',
'GREY_SHARK', 'NURSE_SHARK', 'BLUE_SHARK', 'MAKO_SHARK', 'WOBBEGONG_SHARK' ,
'HAMMERHEAD_SHARK', 'RAGGEDTOOTH_SHARK', 'LEMON_SHARK',
'REEF_SHARK', 'ZAMBESI_SHARK', 'SPINNER_SHARK', 'SEVENHILL_SHARK' ) THEN 1 ELSE 0 END AS SPECIES_OTHERS
FROM Species"
)
SexDummy = sqldf(
"SELECT DISTINCT Indice
, CASE WHEN Sex LIKE '%M%' THEN 1 ELSE 0 END AS SEX_M
, CASE WHEN Sex LIKE '%F%' THEN 1 ELSE 0 END AS SEX_F
, CASE WHEN Sex NOT LIKE '%M%' AND Sex NOT LIKE '%F%' THEN 1 ELSE 0 END AS SEX_U
FROM DataBase"
)
AgeDummy = sqldf (
"SELECT DISTINCT INDICE
, CASE WHEN AGEFIXED < 0 THEN 1 ELSE 0 END AS AGE_9999
, CASE WHEN AGEFIXED BETWEEN 0 AND 10 THEN 1 ELSE 0 END AS AGE_0_10
, CASE WHEN AGEFIXED BETWEEN 11 AND 18 THEN 1 ELSE 0 END AS AGE_11_18
, CASE WHEN AGEFIXED BETWEEN 19 AND 25 THEN 1 ELSE 0 END AS AGE_19_25
, CASE WHEN AGEFIXED BETWEEN 26 AND 35 THEN 1 ELSE 0 END AS AGE_26_35
, CASE WHEN AGEFIXED BETWEEN 36 AND 45 THEN 1 ELSE 0 END AS AGE_36_45
, CASE WHEN AGEFIXED BETWEEN 46 AND 55 THEN 1 ELSE 0 END AS AGE_46_55
, CASE WHEN AGEFIXED BETWEEN 56 AND 65 THEN 1 ELSE 0 END AS AGE_56_65
, CASE WHEN AGEFIXED BETWEEN 66 AND 80 THEN 1 ELSE 0 END AS AGE_66_75
, CASE WHEN AGEFIXED > 90 THEN 1 ELSE 0 END AS AGE_80_100
FROM DataBase")
LargoDummy = sqldf (
"SELECT DISTINCT Indice
, CASE WHEN LargoMetrosFixed BETWEEN 0 AND 1 THEN 1 ELSE 0 END AS LARGO_0_1
, CASE WHEN LargoMetrosFixed BETWEEN 1.01 AND 2 THEN 1 ELSE 0 END AS LARGO_1_2
, CASE WHEN LargoMetrosFixed BETWEEN 2.01 AND 3 THEN 1 ELSE 0 END AS LARGO_2_3
, CASE WHEN LargoMetrosFixed BETWEEN 3.01 AND 4 THEN 1 ELSE 0 END AS LARGO_3_4
, CASE WHEN LargoMetrosFixed BETWEEN 4.01 AND 5 THEN 1 ELSE 0 END AS LARGO_4_5
, CASE WHEN LargoMetrosFixed BETWEEN 5.01 AND 6 THEN 1 ELSE 0 END AS LARGO_6_6
, CASE WHEN LargoMetrosFixed > 6 THEN 1 ELSE 0 END AS LARGO_6_10
, CASE WHEN LargoMetrosFixed NOT BETWEEN 0 AND 999 THEN 1 ELSE 0 END AS LARGO_9999
FROM DataBase")
TimeDummy = sqldf (
"SELECT DISTINCT Indice
, CASE WHEN TimeFixed BETWEEN 0 AND 400 THEN 1 ELSE 0 END AS TIME_0_4
, CASE WHEN TimeFixed BETWEEN 401 AND 800 THEN 1 ELSE 0 END AS TIME_4_8
, CASE WHEN TimeFixed BETWEEN 801 AND 1200 THEN 1 ELSE 0 END AS TIME_8_12
, CASE WHEN TimeFixed BETWEEN 1201 AND 1600 THEN 1 ELSE 0 END AS TIME_12_16
, CASE WHEN TimeFixed BETWEEN 1601 AND 2000 THEN 1 ELSE 0 END AS TIME_16_20
, CASE WHEN TimeFixed BETWEEN 2001 AND 2400 THEN 1 ELSE 0 END AS TIME_20_24
, CASE WHEN TimeFixed NOT BETWEEN 0 AND 2400 THEN 1 ELSE 0 END AS TIME_9999
FROM DataBase"
)
WeekDayDummy = sqldf(
"SELECT DISTINCT INDICE,
CASE WHEN WEEKDAYNUMBER = '2' THEN 1 ELSE 0 END AS WEEKDAY_2
, CASE WHEN WEEKDAYNUMBER = '7' THEN 1 ELSE 0 END AS WEEKDAY_7
, CASE WHEN WEEKDAYNUMBER = '6' THEN 1 ELSE 0 END AS WEEKDAY_6
, CASE WHEN WEEKDAYNUMBER = '1' THEN 1 ELSE 0 END AS WEEKDAY_1
, CASE WHEN WEEKDAYNUMBER = '5' THEN 1 ELSE 0 END AS WEEKDAY_5
, CASE WHEN WEEKDAYNUMBER = '99' THEN 1 ELSE 0 END AS WEEKDAY_9999
, CASE WHEN WEEKDAYNUMBER = '4' THEN 1 ELSE 0 END AS WEEKDAY_4
, CASE WHEN WEEKDAYNUMBER = '3' THEN 1 ELSE 0 END AS WEEKDAY_3
FROM DataBase")
ActivityDummy = sqldf(
"SELECT Indice,
CASE WHEN activityfixed = 'ACTIVITY_UNKNOWN' THEN 1 ELSE 0 END AS ACTIVITY_UNKNOWN
, CASE WHEN activityfixed = 'STANDING' THEN 1 ELSE 0 END AS ACTIVITY_STANDING
, CASE WHEN activityfixed = 'SURFING' THEN 1 ELSE 0 END AS ACTIVITY_SURFING
, CASE WHEN activityfixed = 'DIVING' THEN 1 ELSE 0 END AS ACTIVITY_DIVING
, CASE WHEN activityfixed = 'SWIMMING' THEN 1 ELSE 0 END AS ACTIVITY_SWIMMING
, CASE WHEN activityfixed = 'FISHING' THEN 1 ELSE 0 END AS ACTIVITY_FISHING
, CASE WHEN activityfixed = 'WADING' THEN 1 ELSE 0 END AS ACTIVITY_WADING
, CASE WHEN activityfixed = 'SNORKELING' THEN 1 ELSE 0 END AS ACTIVITY_SNORKELING
, CASE WHEN activityfixed = 'BOATING' THEN 1 ELSE 0 END AS ACTIVITY_BOATING
, CASE WHEN activityfixed = 'BATHING' THEN 1 ELSE 0 END AS ACTIVITY_BATHING
, CASE WHEN activityfixed = 'KAYAKING' THEN 1 ELSE 0 END AS ACTIVITY_KAYAKING
, CASE WHEN activityfixed = 'CANOEING' THEN 1 ELSE 0 END AS ACTIVITY_CANOEING
, CASE WHEN activityfixed = 'WASHING' THEN 1 ELSE 0 END AS ACTIVITY_WASHING
, CASE WHEN activityfixed = 'FLOATING' THEN 1 ELSE 0 END AS ACTIVITY_FLOATING
, CASE WHEN activityfixed = 'SHIPWRECK' THEN 1 ELSE 0 END AS ACTIVITY_SHIPWRECK
, CASE WHEN activityfixed = 'ROWING' THEN 1 ELSE 0 END AS ACTIVITY_ROWING
, CASE WHEN activityfixed = 'FELL_INTO_WATER' THEN 1 ELSE 0 END AS ACTIVITY_FELL_INTO_WATER
, CASE WHEN activityfixed = 'SAILING' THEN 1 ELSE 0 END AS ACTIVITY_SAILING
, CASE WHEN activityfixed = 'CLAMMING' THEN 1 ELSE 0 END AS ACTIVITY_CLAMMING
, CASE WHEN activityfixed = 'HUNTING' THEN 1 ELSE 0 END AS ACTIVITY_HUNTING
, CASE WHEN activityfixed = 'FELL_OVERBOARD' THEN 1 ELSE 0 END AS ACTIVITY_FELL_OVERBOARD
, CASE WHEN activityfixed = 'OYSTERING' THEN 1 ELSE 0 END AS ACTIVITY_OYSTERING
FROM DataBase")
SeasonDummy = sqldf ("SELECT DISTINCT INDICE,
CASE WHEN SEASONFIXED = 'SPRING' THEN 1 ELSE 0 END AS SEASON_SPRING
, CASE WHEN SEASONFIXED = 'AUTUMN' THEN 1 ELSE 0 END AS SEASON_AUTUMN
, CASE WHEN SEASONFIXED = 'SEASON_UNKNOWN' THEN 1 ELSE 0 END AS SEASON_UNKNOW
, CASE WHEN SEASONFIXED = 'WINTER' THEN 1 ELSE 0 END AS SEASON_WINTER
, CASE WHEN SEASONFIXED = 'SUMMER' THEN 1 ELSE 0 END AS SEASON_SUMMER
FROM DataBase")
ContimentDummy= sqldf (
"SELECT Indice,
CASE WHEN CONTINENTFIXED='NORTH_AMERICA' THEN 1 ELSE 0 END AS CONTINENT_NORTH_AMERICA,
CASE WHEN CONTINENTFIXED='OCEANIA' THEN 1 ELSE 0 END AS CONTINENT_OCEANIA,
CASE WHEN CONTINENTFIXED='SOUTH_AMERICA' THEN 1 ELSE 0 END AS CONTINENT_SOUTH_AMERICA,
CASE WHEN CONTINENTFIXED='EUROPE' THEN 1 ELSE 0 END AS CONTINENT_EUROPE,
CASE WHEN CONTINENTFIXED='AFRICA' THEN 1 ELSE 0 END AS CONTINENT_AFRICA,
CASE WHEN CONTINENTFIXED='ASIA' THEN 1 ELSE 0 END AS CONTINENT_ASIA,
CASE WHEN CONTINENTFIXED='UNKNOWN' THEN 1 ELSE 0 END AS CONTINENT_UNKNOWN
FROM DataBase
")
HemisphereDummy =sqldf("SELECT INDICE
, CASE WHEN HEMISPHEREFIXED = 'NORTH' THEN 1 ELSE 0 END AS HEMISPHERE_NORTH
, CASE WHEN HEMISPHEREFIXED = 'SOUTH' THEN 1 ELSE 0 END AS HEMISPHERE_SOUTH
, CASE WHEN HEMISPHEREFIXED = '' THEN 1 ELSE 0 END AS HEMISPHERE_UNKNOWN
FROM DataBase")
DataBase = merge(DataBase, WeekDayDummy, by ='Indice')
DataBase = merge(DataBase, ActivityDummy, by ='Indice')
DataBase = merge(DataBase, SeasonDummy, by ='Indice')
DataBase = merge(DataBase, TimeDummy, by ='Indice')
DataBase = merge(DataBase, SexDummy, by ='Indice')
DataBase = merge(DataBase, AgeDummy, by ='Indice')
DataBase = merge(DataBase, LargoDummy, by ='Indice')
DataBase = merge(DataBase, ContimentDummy, by ='Indice')
DataBase = merge(DataBase, HemisphereDummy, by ='Indice')
DataBase = merge(DataBase, SpeciesDummy, by ='Indice')
#DataBase = merge(DataBase, CountryDummy, by ='Indice')
#DataBase = merge(DataBase, AreasDummy, by ='Indice')
FatalDB = read_csv('shark_attacks_plots.csv')
FatalDB = FatalDB %>% dplyr::select(SpeciesFixed, FatalFixed)
FatalDB2 = sqldf (
"SELECT SpeciesFixed
, COUNT(*) AS Cantidad
, SUM(FatalFixed) as Fatalidad
, SUM(FatalFixed) * 1.00 / COUNT(*) as PorcFatalidad
from FatalDB
where SpeciesFixed <> 'SPECIE_UNKNOWN'
group by SpeciesFixed
having Fatalidad > 0
"
)
ggplot(FatalDB2, aes(x=Cantidad, y=PorcFatalidad, color = SpeciesFixed) ) +
geom_point(alpha =0.5, size = 8 ) +
xlab("Cantidad de Ataques") + ylab("Porcentaje de mortalidad del ataque") +
geom_vline(xintercept = 320) +
geom_hline(yintercept = .1859)+
geom_label(
label="Inofensivo",
x=150,
y=.13,
label.padding = unit(0.3, "lines"), # Rectangle size around label
label.size = 0.2,
color = "black",
alpha = 0.05,
fill="#c4c4c4"
) +
geom_label(
label="Peligroso",
x=150,
y=.27,
label.padding = unit(0.3, "lines"), # Rectangle size around label
label.size = 0.2,
color = "black",
alpha = 0.05,
fill="#c4c4c4"
) +
geom_label(
label="Agresivo",
x=500,
y=.13,
label.padding = unit(0.3, "lines"), # Rectangle size around label
label.size = 0.2,
alpha = 0.05,
color = "black",
fill="#c4c4c4"
) +
geom_label(
label="Fatal",
x=500,
y=.27,
label.padding = unit(0.3, "lines"), # Rectangle size around label
label.size = 0.2,
color = "black",
alpha = 0.05,
fill="#c4c4c4"
) +
labs(title = "Especies de tiburones más fatales", subtitle = "Exluidas las que tienen fatalidad 0")
Sharks = read_csv('shark_attacks_plots.csv')
Sharks = Sharks %>% dplyr::select(SpeciesFixed, LargoMetrosFixed) %>%
filter(LargoMetrosFixed < 15) %>%
filter(SpeciesFixed %in% c('WHITE_SHARK','TIGER_SHARK', 'ZAMBESI_SHARK','BULL_SHARK', 'BLUE_SHARK'))
p <- Sharks %>%
plot_ly(
x = ~SpeciesFixed,
y = ~LargoMetrosFixed,
split = ~SpeciesFixed,
type = 'violin',
box = list(
visible = T
),
meanline = list(
visible = T
)
) %>%
layout(
title = "Violin Plot ~ Especie de tiburón en metros",
xaxis = list(
title = "Especie"
),
yaxis = list(
title = "Largo en metros",
zeroline = F
)
)
p
df_cor = read_csv('shark_attacks_plots.csv')
df_cor =df_cor %>% dplyr::select (-X1, -DateFormated, -Indice, -LargoEstandarizado )
df_cor = df_cor %>% mutate(Sex = as.integer(as.factor(Sex)) ,
HemisphereFixed = as.integer(as.factor(HemisphereFixed)),
SpeciesFixed = as.integer(as.factor(SpeciesFixed)),
CountryFixed = as.integer(as.factor(CountryFixed)),
ContinentFixed = as.integer(as.factor(ContinentFixed)),
Area = as.integer(as.factor(Area)),
MonthFixed = as.integer( MonthFixed),
ActivityFixed = as.integer(as.factor(ActivityFixed)),
TypeFixed = as.integer(as.factor(TypeFixed))
)
cor <- cor(df_cor, use = "na.or.complete")
corrplot(cor, method="shade",shade.col=NA, tl.col="black", tl.srt=45)
DataBaseFinal = DataBase %>% filter(FatalFixed >= 0 & YearFixed >= 1900 & CountryFixed != 'UNKNOWN')
DataBaseFinal = DataBaseFinal %>% dplyr::select(-Indice, -SeasonFixed, -WeekDayNumber, -CountryFixed, -LargoMetrosFixed, -Sex, -AgeFixed, -TimeFixed, -Fatal, -YearFixed, -TypeFixed, -Age, -href, -Case.Number.2, -DayFixed, -DateFormated, -DateFixed, -MonthFixed, -TimeText , -Date, -Injury, -Case.Number, - Type, - Location, - Fatal..Y.N. , - Country, - Location, - Year, - Area, - Name, - Time, - pdf, - Case.Number.1, - X, - Investigator.or.Source, -
original.order, - X.1, - LargoMetros, - LargoPies, - LargoMetros2, - LargoMetros1, - Species, - Activity, - ActivityFixed, -HemisphereFixed, - href.formula, -href, -ContinentFixed)
reset.seed()
sample_size = nrow(DataBaseFinal) * 0.8
idxTrain = sample(seq_len(nrow(DataBaseFinal)) ,sample_size, replace = FALSE)
DataBaseTrain = DataBaseFinal[idxTrain,]
DataBaseTest = DataBaseFinal[-idxTrain,]
variables = colnames(DataBaseTrain %>% dplyr::select(-FatalFixed))
predictors = paste(variables, collapse = ' + ')
fmla = paste('FatalFixed', predictors, sep = ' ~ ')
DataBaseTrainFactor = DataBaseTrain
DataBaseTrainFactor_x = DataBaseTrainFactor %>% dplyr::select (-FatalFixed)
DataBaseTrainFactor_y = as.factor(DataBaseTrainFactor$FatalFixed)
DataBaseTrainFactor$FatalFixed = as.factor(DataBaseTrainFactor$FatalFixed)
trainTask <- makeClassifTask(data = DataBaseTrainFactor,target = 'FatalFixed', positive = "1")
x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
y_train = as.numeric(DataBaseTrain$FatalFixed)
x_test = DataBaseTest %>% dplyr::select(-FatalFixed)
y_test = as.numeric(DataBaseTest$FatalFixed)
save.image(file='environmentShark.RData')
library(glmnet)
#fit_ridge <- glmnet(x = as.matrix(x_train), y =as.matrix(y_train), alpha = 0, family="gaussian")
fit_ridge = readRDS("rr.rds")
y_pred_rr <- predict(fit_ridge, as.matrix(x_test), s=0.01, type="response")
#file = "rr.rds"
#saveRDS(fit_ridge, file)
precrec_obj <- evalmod(scores = y_pred_rr, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_rr, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_rr)
df_metricas = get_df_metrics()
metrics = get_metrics('Ridge Regression', y_test, y_pred_rr)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 Ridge Regression 0.7969495 0.9876999 0.1398305 0.7877265 0.3601554
# makeatree <- makeLearner("classif.rpart", predict.type = "response")
# set_cv <- makeResampleDesc("CV",iters = 5L)
#
# gs <- makeParamSet(
# makeDiscreteParam("minsplit",values = seq(2,40,2)),
# makeDiscreteParam("minbucket", values = seq(2,12,1)),
# makeDiscreteParam("cp", values = c(0.0001)),
# makeDiscreteParam("maxdepth", values = seq(2,20,1))
# )
#
# gscontrol <- makeTuneControlGrid()
# stune <- tuneParams(learner = makeatree, resampling = set_cv, task = trainTask, par.set = gs, control = gscontrol, measures = acc)
#
#
# stune$x
# stune$y
#fit_tree = rpart(formula = fmla, data= DataBaseTrain, cp= 0.0001, minsplit=10, minbucket= 10, maxdepth=20, method = 'class')
#fit_tree$variable.importance
#dt_file = "dt.rds"
#saveRDS(fit_tree, dt_file)
fit_tree = readRDS(file = "dt.rds")
pruned.tree <- prune(fit_tree, cp = fit_tree$cptable[which.min(fit_tree$cptable[,"xerror"]),"CP"])
rpart.plot(pruned.tree)
y_pred_dt= predict(fit_tree, DataBaseTest, type = 'prob')[,2]
y_test = DataBaseTest$FatalFixed
precrec_obj <- evalmod(scores = y_pred_dt, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_dt, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_dt)
df_metricas = get_df_metrics()
metrics = get_metrics('Decision Tree', y_test, y_pred_dt)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 Decision Tree 0.8198284 0.9237392 0.4618644 0.8277071 0.4219482
y_train = DataBaseTrain$FatalFixed
vector_auc_train = vector()
vector_auc_test = vector()
vector_k = seq(from=1, to= 100, by = 1)
for(k in vector_k){
fit_knn= knn3(FatalFixed ~ ., data = DataBaseTrain, k = k)
prediction_train = predict(fit_knn, DataBaseTrain)[,2]
roc_obj_train <- roc(y_train, prediction_train)
auc_train = pROC::auc(roc_obj_train)
vector_auc_train = c(vector_auc_train, auc_train)
prediction_test = predict(fit_knn, DataBaseTest)[,2]
roc_obj_test <- roc(y_test, prediction_test)
auc_test = pROC::auc(roc_obj_test)
vector_auc_test = c(vector_auc_test, auc_test)
}
kvecinos = data.frame(k =vector_k, auc_test = vector_auc_test, auc_train = vector_auc_train )
ggplot(kvecinos, aes(x=k ))+
geom_line(aes(y = auc_train), color= "red", size=2)+
geom_line(aes(y = auc_test), color= "blue", size= 2)+
theme_fivethirtyeight() +
labs(title = "Entrenamiento de KNN", subtitle = "Dependiendo del número de k", x = 'K')
#kvecinos[kvecinos$auc_test == max(kvecinos$auc_test), ]
#fit_knn= knn3(FatalFixed ~ ., data = DataBaseTrain, k = 48)
#knn_file = "knn.rds"
#saveRDS(fit_knn, knn_file)
fit_knn = readRDS(file = "knn.rds")
y_pred_knn = predict(fit_knn, DataBaseTest)[,2]
precrec_obj <- evalmod(scores = y_pred_knn, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_knn, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_knn)
df_metricas = get_df_metrics()
metrics = get_metrics('KNN', y_test, y_pred_knn)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 KNN 0.7940896 0.9151292 0.3771186 0.7848912 0.3459128
# res0 <- svm_cv_opt(data = DataBaseTrain,
# label = FatalFixed,
# svm_kernel = "radial",
# degree_range = c(2L, 10L),
# n_folds = 5,
# kappa = 5,
# init_points = 5,
# n_iter = 25)
#
# res0
# x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
# y_train = as.numeric(DataBaseTrain$FatalFixed)
#
# x_test = DataBaseTest %>% dplyr::select(-FatalFixed)
# y_test = as.numeric(DataBaseTest$FatalFixed)
#fit_svm= svm(x =x_train, y = y_train , kernel = "radial",cost=4.6357, gamma=0.0424)
#svm_file = "svm.rds"
#saveRDS(fit_svm, svm_file)
fit_svm = readRDS(file = "svm.rds")
y_pred_svm = predict(fit_svm, x_test)
precrec_obj <- evalmod(scores = y_pred_svm, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_svm, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_svm)
df_metricas = get_df_metrics()
metrics = get_metrics('SVM', y_test, y_pred_svm)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 SVM 0.8455672 0.8745387 0.7457627 0.881838 0.4246226
# getParamSet("classif.randomForest")
# rf <- makeLearner("classif.randomForest", predict.type = "response", par.vals = list(ntree = 200, mtry = 3))
# rf$par.vals <- list(
# importance = TRUE
# )
#
# rf_param <- makeParamSet(
# makeDiscreteParam("ntree", values = c(400,500, 600, 700)),
# makeDiscreteParam("mtry", values = c(30,40,50,60)),
# makeDiscreteParam("nodesize", values = c(2,4,8,10))
# )
#
# rancontrol <- makeTuneControlRandom(maxit = 50L)
# set_cv <- makeResampleDesc("CV",iters = 3L)
# r_tune <- tuneParams(learner = rf, resampling = set_cv, task = trainTask, par.set = rf_param, control = rancontrol, measures = acc)
#
# r_tune$x
# r_tune$y
reset.seed()
#fit_randomf = ranger(FatalFixed ~ ., data = DataBaseTrain, importance = "impurity", num.trees =500, mtry = 12 , min.node.size = 150, probability = TRUE)
#rf_file = "rf.rds"
#saveRDS(fit_randomf, rf_file)
fit_randomf = readRDS(file = "rf.rds")
p = predict(fit_randomf, DataBaseTest)
y_pred_rf = p$predictions[,2]
precrec_obj <- evalmod(scores = y_pred_rf, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_rf, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_rf)
df_metricas = get_df_metrics()
metrics = get_metrics('Random Forest', y_test, y_pred_rf)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 Random Forest 0.828408 0.9114391 0.5423729 0.8616471 0.3889929
# reset.seed()
# res = hyperopt(trainTask, learner = "classif.gbm")
# res
x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
x_test = DataBaseTest %>% dplyr::select(-FatalFixed,)
y_train = as.numeric(DataBaseTrain$FatalFixed)
#gbm1 = gbm(FatalFixed~ ., data = DataBaseTrain, distribution = "bernoulli", bag.fraction = 0.5, n.trees = 1000, interaction.depth =6, shrinkage = 0.1, n.minobsinnode = 10)
#gbm_file = "gbm.rds"
#saveRDS(gbm1, gbm_file)
gbm1 = readRDS(file = "gbm.rds")
y_pred_xg <- predict(object= gbm1, x_test, n.trees=500, type = 'response')
precrec_obj <- evalmod(scores = y_pred_xg, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_xg, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_xg)
df_metricas = get_df_metrics()
metrics = get_metrics('GBM', y_test, y_pred_xg)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 GBM 0.8541468 0.9557196 0.5042373 0.899324 0.5107009
lr <- glm(formula = fmla, data=DataBaseTrain, family=binomial(link="logit"))
#step= step(lr,direction="backward",trace= FALSE )
#file = "step.rds"
#saveRDS(step, file)
# aic =step.log.back$aic
# coefficients = step.log.back$coefficients
# aic
# coefficients
#data.frame(summary(lr)$coef[summary(lr)$coef[,4] <= .05, 4])
#lr2 <- glm(formula = fmla2, data=DataBaseTrain, family=binomial(link="logit"))
#lr_file = "lr_file.rds"
#saveRDS(lr2, lr_file)
lr2 = readRDS(file = "lr_file.rds")
y_pred_lr = predict(lr2, DataBaseTest, type="response")
precrec_obj <- evalmod(scores = y_pred_lr, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores =y_pred_lr, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_lr)
df_metricas = get_df_metrics()
metrics = get_metrics('Logistic Regression', y_test, y_pred_lr)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 Logistic Regression 0.8007626 0.9717097 0.2118644 0.7989008 0.385087
x_train = DataBaseTrain %>% dplyr::select(-FatalFixed,)
y_train = as.numeric(DataBaseTrain$FatalFixed)
x_test = DataBaseTest %>% dplyr::select(-FatalFixed)
y_test = as.numeric(DataBaseTest$FatalFixed)
#mod_w <- monmlp.fit(as.matrix(x_train), as.matrix(y_train), hidden1=5, n.ensemble = 15, bag = TRUE,iter.max = 500, silent=TRUE, control = list(trace = 0))
#monmlp_file = "monmlp.rds"
#saveRDS(mod_w, monmlp_file)
mod_w = readRDS(file = "monmlp.rds")
y_pred_mlp <- monmlp.predict(x = as.matrix(x_test), weights = mod_w)
precrec_obj <- evalmod(scores = y_pred_mlp, labels = y_test)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = y_pred_mlp, labels = y_test, mode="basic")
autoplot(precrec_obj2)
auc = auc(y_test,y_pred_mlp)
df_metricas = get_df_metrics()
metrics = get_metrics('MLP', y_test, y_pred_mlp)
df_metricas = rbind(df_metricas, metrics)
df_metricas
## modelo accuracy sensitivity specificity auc recall
## 1 MLP 0.8379409 0.9372694 0.4957627 0.8780151 0.4526358
scores1 <- join_scores(y_pred_dt, y_pred_xg, y_pred_rf, y_pred_lr, y_pred_svm, y_pred_knn, y_pred_mlp, y_pred_rr)
labels1 <-join_labels(y_test, y_test, y_test, y_test, y_test, y_test, y_test, y_test)
msmdat2 <- mmdata(scores1, labels1, modnames = c("Decision Tree", "GBM", "Random Forest", "Logistic Regression", "SVM Radial", "KNN", "MLP", "Ridge"))
write.csv(y_test, "Scores/Y.csv")
write.csv(y_pred_rf, "Scores/prediction_RF.csv")
write.csv(y_pred_svm, "Scores/prediction_SVM.csv")
write.csv(y_pred_xg, "Scores/prediction_GBM.csv")
write.csv(y_pred_mlp, "Scores/prediction_MLP.csv")
mscurves <- evalmod(msmdat2)
autoplot(mscurves)
Model_Metrics = get_df_metrics()
Model_Metrics = rbind(Model_Metrics, get_metrics('Decision Tree', y_test, y_pred_dt))
Model_Metrics = rbind(Model_Metrics, get_metrics('Random Forest', y_test, y_pred_rf))
Model_Metrics = rbind(Model_Metrics, get_metrics('KNN', y_test, y_pred_knn))
Model_Metrics = rbind(Model_Metrics, get_metrics('SVM Radial', y_test, y_pred_svm))
Model_Metrics = rbind(Model_Metrics, get_metrics('GBM', y_test, y_pred_xg))
Model_Metrics = rbind(Model_Metrics, get_metrics('Logist Regresion', y_test, y_pred_lr))
Model_Metrics = rbind(Model_Metrics, get_metrics('MLP', y_test, y_pred_mlp))
Model_Metrics = rbind(Model_Metrics, get_metrics('Ridge Regresion', y_test, y_pred_rr))
Model_Metrics
## modelo accuracy sensitivity specificity auc recall
## 1 Decision Tree 0.8198284 0.9237392 0.4618644 0.8277071 0.4219482
## 2 Random Forest 0.8284080 0.9114391 0.5423729 0.8616471 0.3889929
## 3 KNN 0.7940896 0.9151292 0.3771186 0.7848912 0.3459128
## 4 SVM Radial 0.8455672 0.8745387 0.7457627 0.8818380 0.4246226
## 5 GBM 0.8541468 0.9557196 0.5042373 0.8993240 0.5107009
## 6 Logist Regresion 0.8007626 0.9717097 0.2118644 0.7989008 0.3850870
## 7 MLP 0.8379409 0.9372694 0.4957627 0.8780151 0.4526358
## 8 Ridge Regresion 0.7969495 0.9876999 0.1398305 0.7877265 0.3601554
write.csv(Model_Metrics,'model_metrics.csv')
DataBaseTest$XGProba = y_pred_xg
DataBaseTest$RFProba = y_pred_rf
DataBaseTest$SVMProba = y_pred_svm
DataBaseTest$MLProba = y_pred_mlp
DataBaseEnsamble = DataBaseTest %>% select(FatalFixed, XGProba, RFProba, MLProba, SVMProba )
DataBaseEnsamble = DataBaseEnsamble %>% rowwise() %>% mutate(ENProba = (XGProba * 0.5 + RFProba *0.1+ SVMProba *0.2+ MLProba *0.2) )
x = DataBaseEnsamble$ENProba
y = DataBaseEnsamble$FatalFixed
precrec_obj <- evalmod(scores = x, labels = y)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores =x, labels = y, mode="basic")
autoplot(precrec_obj2)
auc = auc(y,x)
metrics = get_metrics('Ensamble', y, x)
metrics
## modelo accuracy sensitivity specificity auc recall
## 1 Ensamble 0.8531935 0.8942189 0.7118644 0.9026153 0.4697014
df_coef = data.frame(summary(lr2)$coef[summary(lr2)$coef[,4] <= .05, 4])
names(df_coef)[1] <- "Coeficientes"
df_coef
## Coeficientes
## (Intercept) 2.495153e-03
## ACTIVITY_UNKNOWN 1.937225e-07
## ACTIVITY_STANDING 3.925666e-07
## ACTIVITY_SURFING 2.382149e-18
## ACTIVITY_DIVING 2.638214e-07
## ACTIVITY_SWIMMING 5.352891e-04
## ACTIVITY_FISHING 3.225815e-11
## ACTIVITY_WADING 9.965068e-07
## ACTIVITY_SNORKELING 8.305828e-06
## ACTIVITY_BOATING 7.766332e-03
## ACTIVITY_BATHING 2.665499e-02
## ACTIVITY_KAYAKING 1.223358e-04
## ACTIVITY_FLOATING 2.091669e-04
## ACTIVITY_ROWING 1.029712e-03
## TIME_4_8 1.567126e-03
## TIME_12_16 8.933454e-03
## TIME_16_20 2.282226e-03
## SEX_M 2.813902e-03
## AGE_19_25 1.357074e-02
## AGE_66_75 3.941546e-04
## LARGO_1_2 8.362733e-18
## LARGO_2_3 1.857217e-04
## LARGO_3_4 7.995011e-03
## CONTINENT_NORTH_AMERICA 1.139816e-17
## CONTINENT_OCEANIA 1.971077e-02
## SPECIE_UNKNOWN 3.917392e-10
## SPECIE_WHITE_SHARK 3.840064e-10
## SPECIE_TIGER_SHARK 8.825148e-11
## SPECIE_BULL_SHARK 9.855771e-10
## SPECIE_COPPER_SHARK 6.454496e-04
## SPECIE_BLUE_SHARK 1.593039e-04
## SPECIE_ZAMBESI_SHARK 9.385340e-07