Ejercicio de la clase

Base de datos

Se toma la base de datos proveniente del Centro Nacional de Huracanes NHC y del centro de Huracanes del pacifico central NOAA . https://www.nhc.noaa.gov/data/#hurdat De acuerdo con la pagina los datos incluye las posiciones y ciertos atributos de las tormentas desde 1975 hasta 2021. Los huracanes desde 1979 fueron medidos cada 6 horas durante el tiempo de vida de la tormenta.. Algunos huracanes en aƱos anteriores tienen datos faltantes.n

Para la adecuacion de los datos se realiza el procedimiento descrito en https://github.com/tidyverse/dplyr/blob/main/data-raw/storms.R mendiante la integracion del paquete tidyverse. El codigo se describe acontinuacion:

library(tidyverse)

# Creates storms data set from NOAA Atlantic Hurricane data, which is provided
# in an unorthodox format: a csv that alternates between metadata/identifier rows
# and data rows.

# TO UPDATE: get the latest URL from https://www.nhc.noaa.gov/data/#hurdat, and rerun this code

# Read in data set so each line is a character string
storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2021-041922.txt")
storm_strings <- read_lines(storm_file_complete)

# Identify the header lines that have three commas
header_locations <- str_count(storm_strings, "\\,") == 3
header_locations <- (1:length(storm_strings))[header_locations]

# Extract length of each sub-dataset
headers <- as.list(storm_strings[header_locations])
headers_df <- headers %>%
  map(str_sub, start = 1, end = -2) %>% # to remove trailing comma
  map(paste0, "\n") %>%                 # to trigger literal read
  map_df(read_csv, col_names = c("id", "name", "n_obs"), col_types = "cci") %>%
  mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
  select(id, name, skip, n_obs)

column_types <- list(
  date = col_character(),
  time = col_character(),
  record_type = col_character(),
  status = col_character(),
  lat = col_character(),
  long = col_character(),
  wind = col_integer(),
  pressure = col_integer(),
  extent_34_NE = col_integer(),
  extent_34_SE = col_integer(),
  extent_34_SW = col_integer(),
  extent_34_NW = col_integer(),
  extent_50_NE = col_integer(),
  extent_50_SE = col_integer(),
  extent_50_SW = col_integer(),
  extent_50_NW = col_integer(),
  extent_64_NE = col_integer(),
  extent_64_SE = col_integer(),
  extent_64_SW = col_integer(),
  extent_64_NW = col_integer(),
  nas = col_integer()
)
column_names <- names(column_types)


#### Parse each storm as its own sub-dataframe
storm_dataframes <- vector("list", nrow(headers_df))
for (i in 1:nrow(headers_df)) {
  # get this storm's metadata
  row_start = headers_df[i,]$skip + 1
  row_end = headers_df[i,]$n_obs + row_start - 1
  # subset of rows belonging to this storm
  data_subset = storm_strings[row_start:row_end] %>%
    paste(collapse = "\n") %>%
    paste0("\n")
  # read it as a csv
  data_subset = read_csv(
    data_subset,
    col_names = column_names,
    col_types = column_types,
    na = c("", "-99", "-999")
  )
  problems()
  # name and id at the front
  data_subset$name = headers_df[i,]$name
  data_subset = data_subset %>% relocate(name)
  data_subset$id = headers_df[i,]$id
  data_subset = data_subset %>% relocate(id)
  # add to list of storms
  storm_dataframes[[i]] = data_subset
}

# Combine and clean the data sets
library(lubridate)

# combine the storms into one dataframe
storms <- storm_dataframes %>%
  bind_rows()


#####################
# format and cleanup


# format the columns
storms <- storms %>%
  mutate(
    date = ymd(date),
    year = year(date),
    month = month(date),
    day = day(date),
    hour = as.numeric(str_sub(time, 1, 2)),
    lat_hemisphere = str_sub(lat, -1),
    lat_sign = if_else(lat_hemisphere == "N", 1, -1),
    lat = as.numeric(str_sub(lat, 1, -2)) * lat_sign,
    long_hemisphere = str_sub(long, -1),
    long_sign = if_else(long_hemisphere == "E", 1, -1),
    long = as.numeric(str_sub(long, 1, -2)) * long_sign,
    # wind = wind * 1.15078, # transforms knots to mph,
    TSradius1 = extent_34_NE + extent_34_SW,
    TSradius2 = extent_34_NW + extent_34_SE,
    tropicalstorm_force_diameter = pmax(TSradius1, TSradius2),
    HUradius1 = extent_64_NE + extent_64_SW,
    HUradius2 = extent_64_NW + extent_64_SE,
    hurricane_force_diameter = pmax(HUradius1, HUradius2)
  )

# drop rows with missing pressure record
storms <- storms %>%
  filter(!is.na(pressure))

# don't abrev.
storms <- storms %>% mutate(
  status = factor(recode(status,
                         "HU" = "hurricane",
                         "TS" = "tropical storm",
                         "TD" = "tropical depression",
                         "EX" = "extratropical",
                         "SD" = "subtropical depression",
                         "SS" = "subtropical storm",
                         "LO" = "other low",
                         "WV" = "tropical wave",
                         "DB" = "disturbance"
  ))
)

# hurricane category
storms <- storms %>%
  mutate(category = case_when(
    status != "hurricane" ~ NA,
    wind >= 137 ~ 5,
    wind >= 113 ~ 4,
    wind >= 96 ~ 3,
    wind >= 83 ~ 2,
    wind >= 64 ~ 1,
    .default = NA
  )) %>%
  relocate(category, .after = status)

# drop storms without at least one record that is a tropical depression or higher
storms <- storms %>%
  group_by(year, name) %>%
  filter(any(status %in% c("hurricane", "tropical storm", "tropical depression"))) %>%
  ungroup()

# drop all rows that are not at least a depression
# might want to use this filter if the file size is an issue
# storms <- storms %>% filter(status %in% c("hurricane", "tropical storm", "tropical depression"))

# make names Title casing
storms <- storms %>%
  mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))

# drop a bad data point (add more if found)
storms <- storms %>%
  filter( !((year == 1969) & (name == "Debbie") & (long < -350)) )

# simplify
storms <- storms %>%
  # drop historical data for simplicity and backwards compatibility
  filter(year >= 1975) %>%
  # drop some columns
  select(name, year, month, day, hour, lat, long, status, category, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)

# output for the package
usethis::use_data(storms, overwrite = TRUE)

storms = as.data.frame(storms)
vars = c("name", "year", "month", "day", "hour", "long", "lat","status","wind","pressure")
storms = storms[, vars]

Una vez obtenidos los datos desde el servidor se exportan a archivo CSV.

write.csv(storms, "D:\\George\\Work_files\\Storm.csv", row.names=FALSE)

Importando el archivo CSV y explorando las variables:

Storm <- read.csv("D:/George/Work_files/Storm.csv")
View(Storm)
head(Storm)
##   name year month day hour  long  lat              status wind pressure
## 1  Amy 1975     6  27    0 -79.0 27.5 tropical depression   25     1013
## 2  Amy 1975     6  27    6 -79.0 28.5 tropical depression   25     1013
## 3  Amy 1975     6  27   12 -79.0 29.5 tropical depression   25     1013
## 4  Amy 1975     6  27   18 -79.0 30.5 tropical depression   25     1013
## 5  Amy 1975     6  28    0 -78.8 31.5 tropical depression   25     1012
## 6  Amy 1975     6  28    6 -78.7 32.4 tropical depression   25     1012
tail(Storm)
##        name year month day hour  long  lat         status wind pressure
## 19061 Wanda 2021    11   6   18 -38.0 37.1 tropical storm   35     1002
## 19062 Wanda 2021    11   7    0 -37.4 37.4 tropical storm   35     1003
## 19063 Wanda 2021    11   7    6 -36.4 38.1 tropical storm   35     1004
## 19064 Wanda 2021    11   7   12 -34.9 39.2      other low   35     1006
## 19065 Wanda 2021    11   7   18 -32.8 40.9      other low   40     1006
## 19066 Wanda 2021    11   8    0 -29.7 43.2      other low   40     1006

Analisis exploratorio:

summary(Storm)
##      name                year          month             day       
##  Length:19066       Min.   :1975   Min.   : 1.000   Min.   : 1.00  
##  Class :character   1st Qu.:1993   1st Qu.: 8.000   1st Qu.: 8.00  
##  Mode  :character   Median :2004   Median : 9.000   Median :16.00  
##                     Mean   :2002   Mean   : 8.699   Mean   :15.78  
##                     3rd Qu.:2012   3rd Qu.: 9.000   3rd Qu.:24.00  
##                     Max.   :2021   Max.   :12.000   Max.   :31.00  
##       hour             long              lat           status         
##  Min.   : 0.000   Min.   :-109.30   Min.   : 7.00   Length:19066      
##  1st Qu.: 5.000   1st Qu.: -78.70   1st Qu.:18.40   Class :character  
##  Median :12.000   Median : -62.25   Median :26.60   Mode  :character  
##  Mean   : 9.094   Mean   : -61.52   Mean   :26.99                     
##  3rd Qu.:18.000   3rd Qu.: -45.60   3rd Qu.:33.70                     
##  Max.   :23.000   Max.   :  13.50   Max.   :70.70                     
##       wind           pressure     
##  Min.   : 10.00   Min.   : 882.0  
##  1st Qu.: 30.00   1st Qu.: 987.0  
##  Median : 45.00   Median :1000.0  
##  Mean   : 50.02   Mean   : 993.6  
##  3rd Qu.: 65.00   3rd Qu.:1007.0  
##  Max.   :165.00   Max.   :1024.0

Diagrama de dispersion:

par(mfrow = c(1, 2))
par( mar= c(4,4,2,2) )
plot(Storm$long,Storm$pressure)
plot(Storm$lat,Storm$pressure)