Import Data
# Read the UFO sightings workbook from the shared data folder and show a
# preview of the resulting table.
ufo_sightings <- read_excel(path = "../00_data/myData_Shoals.xlsx")
print(ufo_sightings)
## # A tibble: 96,429 × 13
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <dttm> <chr> <chr>
## 1 2022-08-29 02:03:00 2022-08-29 02:03:00 2022-09-09 00:00:00 Pinehur… NC
## 2 2022-08-19 21:51:00 2022-08-19 21:51:00 2022-10-08 00:00:00 Rapid C… MI
## 3 2022-08-13 01:30:00 2022-08-13 01:30:00 2022-09-09 00:00:00 Clevela… OH
## 4 2022-08-06 17:00:00 2022-08-06 17:00:00 2022-09-09 00:00:00 Bloomin… IN
## 5 2022-08-04 03:40:00 2022-08-04 03:40:00 2022-09-09 00:00:00 Irvine CA
## 6 2022-07-22 12:00:00 2022-07-22 12:00:00 2022-09-09 00:00:00 Moore OK
## 7 2022-07-19 12:27:00 2022-07-19 12:27:00 2022-09-09 00:00:00 Short P… VA
## 8 2022-07-14 14:56:00 2022-07-14 14:56:00 2022-09-09 00:00:00 Norwalk CT
## 9 2022-07-13 15:40:00 2022-07-13 15:40:00 2022-09-09 00:00:00 Blayney New …
## 10 2022-07-13 00:10:00 2022-07-13 00:10:00 2022-09-09 00:00:00 Greybull WY
## # ℹ 96,419 more rows
## # ℹ 8 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, summary <chr>, has_images <lgl>, day_part <chr>,
## # Time <dttm>
Introduction
Questions
# Is there a connection between location and the frequency of sightings?
# Is there a connection between shape and time of day?
Variation
# Keep sightings with country code CA, US, or MX that lasted under 600
# seconds and whose shape is one of the seven listed below. Conditions passed
# to filter() as separate arguments are combined with AND, and rows where a
# condition evaluates to NA are dropped.
north_america <- ufo_sightings %>%
  filter(
    country_code %in% c("CA", "US", "MX"),
    duration_seconds < 600,
    shape %in% c(
      "formation", "light", "disk", "flash", "fireball", "star", "orb"
    )
  )
print(north_america)
## # A tibble: 27,009 × 13
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <dttm> <chr> <chr>
## 1 2023-05-15 16:35:00 2023-05-15 16:35:00 2023-05-19 00:00:00 Fenton MI
## 2 2023-05-15 00:40:00 2023-05-15 00:40:00 2023-05-19 00:00:00 Cannon … OR
## 3 2023-05-10 21:12:00 2023-05-10 21:12:00 2023-05-19 00:00:00 Henders… NC
## 4 2023-04-23 04:14:00 2023-04-23 04:14:00 2023-05-19 00:00:00 Woodcli… NJ
## 5 2023-04-17 18:58:00 2023-04-17 18:58:00 2023-05-19 00:00:00 Tomball TX
## 6 2023-04-14 17:18:00 2023-04-14 17:18:00 2023-05-19 00:00:00 temple GA
## 7 2023-04-03 12:04:00 2023-04-03 12:04:00 2023-05-19 00:00:00 Glouces… MA
## 8 2023-04-02 07:12:00 2023-04-02 07:12:00 2023-04-09 00:00:00 Leslie MI
## 9 2023-03-25 19:30:00 2023-03-25 19:30:00 2023-04-09 00:00:00 alma AR
## 10 2023-03-21 21:09:00 2023-03-21 21:09:00 2023-04-09 00:00:00 Barnegat NJ
## # ℹ 26,999 more rows
## # ℹ 8 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, summary <chr>, has_images <lgl>, day_part <chr>,
## # Time <dttm>
Visualizing distributions
# Bar chart: number of filtered sightings per country code.
north_america %>%
  ggplot(aes(x = country_code)) +
  geom_bar()

Typical values
# Histogram of sighting duration in one-minute (60-second) bins.
# boundary = 0 pins the bin edges to whole minutes (0-60, 60-120, ...).
# ggplot2's default placement centres bins on multiples of the bin width,
# which would make the first bin span -30 to 30 and leave it cut in half by
# the 0-600 viewing window set below.
ggplot(data = north_america, mapping = aes(x = duration_seconds)) +
  geom_histogram(binwidth = 60, boundary = 0) +
  # coord_cartesian() zooms without discarding data, so bin counts are
  # unaffected by the limits.
  coord_cartesian(xlim = c(0, 600))

Unusual values
Missing Values
# The missing values that cause issues are mostly in the shape column; I can remove those NA values with a filter.
Covariation
# I'm going to compare duration_seconds with other factors, such as shape and day_part. A conclusion may be drawn that astronomical phenomena are often mistaken for UFOs.
A categorical and continuous variable
# Frequency polygons of sighting duration (60-second bins), one coloured
# line per shape, so the duration distributions can be compared by shape.
north_america %>%
  ggplot(aes(x = duration_seconds, colour = shape)) +
  geom_freqpoly(binwidth = 60)

Two categorical variables
# Count plot: point size shows how many sightings fall in each
# shape / country_code combination.
north_america %>%
  ggplot(aes(x = shape, y = country_code)) +
  geom_count()

# Heat map of the same relationship: tally sightings per
# shape / country_code pair, then colour each tile by that tally (n).
north_america %>%
  count(shape, country_code) %>%
  ggplot(aes(x = shape, y = country_code, fill = n)) +
  geom_tile()

Two continuous variables
# 2D binned counts of posting date against sighting duration; each
# rectangle's fill shows how many sightings fall inside it.
north_america %>%
  ggplot(aes(x = posted_date, y = duration_seconds)) +
  geom_bin2d()

# I don't really have two continuous variables.
Patterns and models
library(modelr)
## Warning: package 'modelr' was built under R version 4.4.3
# Similarly, I ran the textbook code but did not get the result shown, because a character (chr) column cannot be log-transformed.