EDA

# Import data: read the tornado CSV into a tibble named `data`.
# NOTE(review): the path is an absolute location inside one user's Downloads
# folder, so this line only runs on a machine that has the file there.
library(readr)
data <- readr::read_csv(
  file = "/Users/andygarza/Downloads/us_tornado_dataset_1950_2021.csv"
)

## Rows: 67558 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): st
## dbl  (12): yr, mo, dy, mag, inj, fat, slat, slon, elat, elon, len, wid
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Tally the rows of `data` per state code (`st`), largest total first.
# Presumably each row is one recorded tornado, which makes this the
# per-state tornado count.
state_counts <- data %>%
  count(st, name = "tornado_count", sort = TRUE)

# Show the resulting table
print(state_counts)

## # A tibble: 53 × 2
##    st    tornado_count
##    <chr>         <int>
##  1 TX             9149
##  2 KS             4375
##  3 OK             4092
##  4 FL             3497
##  5 NE             2967
##  6 IA             2773
##  7 IL             2682
##  8 MS             2476
##  9 MO             2427
## 10 AL             2358
## # ℹ 43 more rows

# Find the state with the highest tornado count.
# `state_counts` is sorted by descending count, so its first row is the maximum.
state_with_most_tornados <- state_counts[["st"]][1]
max_tornado_count <- state_counts[["tornado_count"]][1]

# Report the result as a single line of text
cat(
  "State with the highest tornado count:",
  state_with_most_tornados,
  "with",
  max_tornado_count,
  "tornadoes.\n"
)

## State with the highest tornado count: TX with 9149 tornadoes.

library(dplyr)
# `state_counts` (per-state totals, sorted by descending count) was already
# built earlier in this document from the same, unmodified `data`, so it is
# reused here rather than recomputed.

# Extract the state codes of the ten highest-count states as a character
# vector. Nothing is printed here; the value is only stored in `top_10`.
top_10 <- state_counts %>%
  slice_head(n = 10) %>%
  pull(st)

library(dplyr)
library(ggplot2)

# `state_counts` (per-state totals, sorted by descending count) was already
# built earlier in this document from the same, unmodified `data`, so it is
# reused here rather than recomputed.

# Keep the ten highest-count rows (state code plus count) and convert the
# result to a plain data.frame for the plotting code.
top_10 <- state_counts %>%
  slice_head(n = 10) %>%
  as.data.frame()

# Bar chart of the states in `top_10`, bars ordered from highest to lowest count
ggplot(data = top_10) +
  geom_col(
    mapping = aes(x = reorder(st, -tornado_count), y = tornado_count),
    fill = "blue"
  ) +
  ggtitle("Tornado Count by State") +
  xlab("State") +
  ylab("Number of Tornadoes") +
  # Rotate the state labels on the x axis by 90 degrees
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

library(dplyr)
library(ggplot2)

# State codes of the rows held in `top_10`
top_10_states <- top_10[["st"]]

# Restrict the full dataset to rows belonging to those states
filtered_data <- filter(data, st %in% top_10_states)

# Number of rows for every state/year combination, returned ungrouped
heatmap_data <- count(filtered_data, st, yr, name = "tornado_count")

# Tile plot: one cell per state/year row of `heatmap_data`, shaded by its count
ggplot(data = heatmap_data, mapping = aes(x = yr, y = st)) +
  geom_tile(mapping = aes(fill = tornado_count)) +
  # Continuous fill scale running from white (low) to red (high)
  scale_fill_gradient(low = "white", high = "red") +
  ggtitle("Heatmap of Tornado Counts for Top 10 States") +
  xlab("Year") +
  ylab("State") +
  labs(fill = "Tornado Count") +
  # theme_minimal() is a complete theme, so the axis-text tweak is added after
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

EDA

Juan Garza

2025-01-30