The database comes from the NOAA National Hurricane Center (NHC) and the Central Pacific Hurricane Center: https://www.nhc.noaa.gov/data/#hurdat. According to that page, the data include the positions and certain attributes of storms from 1975 through 2021. Storms from 1979 onward were measured every six hours during the lifetime of the storm. Some storms in earlier years have missing data.
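For context, the raw HURDAT2 file is not a regular CSV: it alternates header lines (storm id, name, and number of observations, recognizable by their three commas) with blocks of data rows. A fragment looks roughly like this (values illustrative, trailing fields elided):

AL011975,                AMY,     32,
19750627, 0000,  , TD, 27.5N,  79.0W,  25, 1013, ...
19750627, 0600,  , TD, 28.5N,  79.0W,  25, 1013, ...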
To prepare the data, we follow the procedure described in https://github.com/tidyverse/dplyr/blob/main/data-raw/storms.R, built on the tidyverse package. The code is described below:
library(tidyverse)
# Creates storms data set from NOAA Atlantic Hurricane data, which is provided
# in an unorthodox format: a csv that alternates between metadata/identifier rows
# and data rows.
# TO UPDATE: get the latest URL from https://www.nhc.noaa.gov/data/#hurdat, and rerun this code
# Read in data set so each line is a character string
storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2021-041922.txt")
storm_strings <- read_lines(storm_file_complete)
# Identify the header lines that have three commas
header_locations <- str_count(storm_strings, "\\,") == 3
header_locations <- (1:length(storm_strings))[header_locations]
# Extract length of each sub-dataset
headers <- as.list(storm_strings[header_locations])
headers_df <- headers %>%
  map(str_sub, start = 1, end = -2) %>% # to remove trailing comma
  map(paste0, "\n") %>% # to trigger literal read
  map_df(read_csv, col_names = c("id", "name", "n_obs"), col_types = "cci") %>%
  mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
  select(id, name, skip, n_obs)
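# Optional peek at the parsed headers (not in the original script; output
# depends on the version of the file downloaded above)
print(head(headers_df))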
column_types <- list(
  date = col_character(),
  time = col_character(),
  record_type = col_character(),
  status = col_character(),
  lat = col_character(),
  long = col_character(),
  wind = col_integer(),
  pressure = col_integer(),
  # wind radii: maximum extent of 34-, 50- and 64-knot winds in each
  # quadrant (NE, SE, SW, NW), in nautical miles
  extent_34_NE = col_integer(),
  extent_34_SE = col_integer(),
  extent_34_SW = col_integer(),
  extent_34_NW = col_integer(),
  extent_50_NE = col_integer(),
  extent_50_SE = col_integer(),
  extent_50_SW = col_integer(),
  extent_50_NW = col_integer(),
  extent_64_NE = col_integer(),
  extent_64_SE = col_integer(),
  extent_64_SW = col_integer(),
  extent_64_NW = col_integer(),
  nas = col_integer()
)
column_names <- names(column_types)
#### Parse each storm as its own sub-dataframe
storm_dataframes <- vector("list", nrow(headers_df))
for (i in 1:nrow(headers_df)) {
  # get this storm's metadata
  row_start = headers_df[i,]$skip + 1
  row_end = headers_df[i,]$n_obs + row_start - 1
  # subset of rows belonging to this storm
  data_subset = storm_strings[row_start:row_end] %>%
    paste(collapse = "\n") %>%
    paste0("\n")
  # read it as a csv
  data_subset = read_csv(
    data_subset,
    col_names = column_names,
    col_types = column_types,
    na = c("", "-99", "-999")
  )
  # surface any parsing problems for this storm's block
  parse_problems <- problems(data_subset)
  if (nrow(parse_problems) > 0) print(parse_problems)
  # name and id at the front
  data_subset$name = headers_df[i,]$name
  data_subset = data_subset %>% relocate(name)
  data_subset$id = headers_df[i,]$id
  data_subset = data_subset %>% relocate(id)
  # add to list of storms
  storm_dataframes[[i]] = data_subset
}
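# Optional sanity check (not in the original script): each header row found
# above should now have a corresponding parsed sub-dataframe
stopifnot(length(storm_dataframes) == nrow(headers_df))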
# Combine and clean the data sets
library(lubridate)
# combine the storms into one dataframe
storms <- storm_dataframes %>%
  bind_rows()
#####################
# format and cleanup
# format the columns
storms <- storms %>%
  mutate(
    date = ymd(date),
    year = year(date),
    month = month(date),
    day = day(date),
    hour = as.numeric(str_sub(time, 1, 2)),
    lat_hemisphere = str_sub(lat, -1),
    lat_sign = if_else(lat_hemisphere == "N", 1, -1),
    lat = as.numeric(str_sub(lat, 1, -2)) * lat_sign,
    long_hemisphere = str_sub(long, -1),
    long_sign = if_else(long_hemisphere == "E", 1, -1),
    long = as.numeric(str_sub(long, 1, -2)) * long_sign,
    # wind = wind * 1.15078, # transforms knots to mph,
    TSradius1 = extent_34_NE + extent_34_SW,
    TSradius2 = extent_34_NW + extent_34_SE,
    tropicalstorm_force_diameter = pmax(TSradius1, TSradius2),
    HUradius1 = extent_64_NE + extent_64_SW,
    HUradius2 = extent_64_NW + extent_64_SE,
    hurricane_force_diameter = pmax(HUradius1, HUradius2)
  )
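# Illustration of the coordinate handling above (assumed example value, not a
# row from the data set): "79.0W" splits into the number 79.0 and the
# hemisphere letter "W", which carries sign -1
as.numeric(str_sub("79.0W", 1, -2)) * if_else(str_sub("79.0W", -1) == "E", 1, -1) # -79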
# drop rows with missing pressure record
storms <- storms %>%
  filter(!is.na(pressure))
# spell out the status abbreviations
storms <- storms %>%
  mutate(
    status = factor(recode(status,
      "HU" = "hurricane",
      "TS" = "tropical storm",
      "TD" = "tropical depression",
      "EX" = "extratropical",
      "SD" = "subtropical depression",
      "SS" = "subtropical storm",
      "LO" = "other low",
      "WV" = "tropical wave",
      "DB" = "disturbance"
    ))
  )
# hurricane category (Saffir-Simpson scale; wind thresholds in knots)
storms <- storms %>%
  mutate(category = case_when(
    status != "hurricane" ~ NA,
    wind >= 137 ~ 5,
    wind >= 113 ~ 4,
    wind >= 96 ~ 3,
    wind >= 83 ~ 2,
    wind >= 64 ~ 1,
    .default = NA
  )) %>%
  relocate(category, .after = status)
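# Optional consistency check (not in the original script): category should be
# NA for every row whose status is not "hurricane"
stopifnot(all(is.na(storms$category[storms$status != "hurricane"])))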
# drop storms without at least one record that is a tropical depression or higher
storms <- storms %>%
  group_by(year, name) %>%
  filter(any(status %in% c("hurricane", "tropical storm", "tropical depression"))) %>%
  ungroup()
# drop all rows that are not at least a depression
# might want to use this filter if the file size is an issue
# storms <- storms %>% filter(status %in% c("hurricane", "tropical storm", "tropical depression"))
# convert storm names to title case
storms <- storms %>%
  mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))
# drop a bad data point (add more if found)
storms <- storms %>%
  filter(!((year == 1969) & (name == "Debbie") & (long < -350)))
# simplify
storms <- storms %>%
  # drop historical data for simplicity and backwards compatibility
  filter(year >= 1975) %>%
  # drop some columns
  select(name, year, month, day, hour, lat, long, status, category, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)
# output for the dplyr package (only needed when rebuilding the package itself)
usethis::use_data(storms, overwrite = TRUE)
# keep only the variables needed for this analysis, as a plain data frame
storms = as.data.frame(storms)
vars = c("name", "year", "month", "day", "hour", "long", "lat", "status", "wind", "pressure")
storms = storms[, vars]
Once the data have been obtained from the server, they are exported to a CSV file:
write.csv(storms, "D:\\George\\Work_files\\Storm.csv", row.names=FALSE)
Importing the CSV file and exploring the variables:
Storm <- read.csv("D:/George/Work_files/Storm.csv")
View(Storm)
head(Storm)
## name year month day hour long lat status wind pressure
## 1 Amy 1975 6 27 0 -79.0 27.5 tropical depression 25 1013
## 2 Amy 1975 6 27 6 -79.0 28.5 tropical depression 25 1013
## 3 Amy 1975 6 27 12 -79.0 29.5 tropical depression 25 1013
## 4 Amy 1975 6 27 18 -79.0 30.5 tropical depression 25 1013
## 5 Amy 1975 6 28 0 -78.8 31.5 tropical depression 25 1012
## 6 Amy 1975 6 28 6 -78.7 32.4 tropical depression 25 1012
tail(Storm)
## name year month day hour long lat status wind pressure
## 19061 Wanda 2021 11 6 18 -38.0 37.1 tropical storm 35 1002
## 19062 Wanda 2021 11 7 0 -37.4 37.4 tropical storm 35 1003
## 19063 Wanda 2021 11 7 6 -36.4 38.1 tropical storm 35 1004
## 19064 Wanda 2021 11 7 12 -34.9 39.2 other low 35 1006
## 19065 Wanda 2021 11 7 18 -32.8 40.9 other low 40 1006
## 19066 Wanda 2021 11 8 0 -29.7 43.2 other low 40 1006
Exploratory analysis of the 19,066 six-hourly records (wind is in knots; pressure in millibars):
summary(Storm)
## name year month day
## Length:19066 Min. :1975 Min. : 1.000 Min. : 1.00
## Class :character 1st Qu.:1993 1st Qu.: 8.000 1st Qu.: 8.00
## Mode :character Median :2004 Median : 9.000 Median :16.00
## Mean :2002 Mean : 8.699 Mean :15.78
## 3rd Qu.:2012 3rd Qu.: 9.000 3rd Qu.:24.00
## Max. :2021 Max. :12.000 Max. :31.00
## hour long lat status
## Min. : 0.000 Min. :-109.30 Min. : 7.00 Length:19066
## 1st Qu.: 5.000 1st Qu.: -78.70 1st Qu.:18.40 Class :character
## Median :12.000 Median : -62.25 Median :26.60 Mode :character
## Mean : 9.094 Mean : -61.52 Mean :26.99
## 3rd Qu.:18.000 3rd Qu.: -45.60 3rd Qu.:33.70
## Max. :23.000 Max. : 13.50 Max. :70.70
## wind pressure
## Min. : 10.00 Min. : 882.0
## 1st Qu.: 30.00 1st Qu.: 987.0
## Median : 45.00 Median :1000.0
## Mean : 50.02 Mean : 993.6
## 3rd Qu.: 65.00 3rd Qu.:1007.0
## Max. :165.00 Max. :1024.0
Scatter plots of pressure against longitude and latitude:
par(mfrow = c(1, 2))
par(mar = c(4, 4, 2, 2))
plot(Storm$long, Storm$pressure, xlab = "Longitude", ylab = "Pressure (mb)")
plot(Storm$lat, Storm$pressure, xlab = "Latitude", ylab = "Pressure (mb)")
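Since the tidyverse was loaded earlier, the same scatter plots can also be drawn with ggplot2. The following is a sketch, not part of the original analysis; it reshapes the two coordinates into one column and facets on them:

Storm %>%
  pivot_longer(c(long, lat), names_to = "coordinate", values_to = "degrees") %>%
  ggplot(aes(degrees, pressure)) +
  geom_point(alpha = 0.2) +
  facet_wrap(~ coordinate, scales = "free_x")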