Import Data

# Read the sightings workbook into a tibble, then show it for a quick look.
ufo_sightings <- read_excel(path = "../00_data/myData_Shoals.xlsx")
ufo_sightings
## # A tibble: 96,429 × 13
##    reported_date_time  reported_date_time_utc posted_date         city     state
##    <dttm>              <dttm>                 <dttm>              <chr>    <chr>
##  1 2022-08-29 02:03:00 2022-08-29 02:03:00    2022-09-09 00:00:00 Pinehur… NC   
##  2 2022-08-19 21:51:00 2022-08-19 21:51:00    2022-10-08 00:00:00 Rapid C… MI   
##  3 2022-08-13 01:30:00 2022-08-13 01:30:00    2022-09-09 00:00:00 Clevela… OH   
##  4 2022-08-06 17:00:00 2022-08-06 17:00:00    2022-09-09 00:00:00 Bloomin… IN   
##  5 2022-08-04 03:40:00 2022-08-04 03:40:00    2022-09-09 00:00:00 Irvine   CA   
##  6 2022-07-22 12:00:00 2022-07-22 12:00:00    2022-09-09 00:00:00 Moore    OK   
##  7 2022-07-19 12:27:00 2022-07-19 12:27:00    2022-09-09 00:00:00 Short P… VA   
##  8 2022-07-14 14:56:00 2022-07-14 14:56:00    2022-09-09 00:00:00 Norwalk  CT   
##  9 2022-07-13 15:40:00 2022-07-13 15:40:00    2022-09-09 00:00:00 Blayney  New …
## 10 2022-07-13 00:10:00 2022-07-13 00:10:00    2022-09-09 00:00:00 Greybull WY   
## # ℹ 96,419 more rows
## # ℹ 8 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, summary <chr>, has_images <lgl>, day_part <chr>,
## #   Time <dttm>

Introduction

Questions

# Is there a connection between location and frequency?
# Is there a connection between shape and time of day?

Variation

# Keep sightings whose country code is CA, US, or MX, whose duration_seconds
# is below 600, and whose shape is one of the listed categories.
# Comma-separated conditions in filter() are combined with AND; rows where a
# condition evaluates to NA are dropped as well.
north_america <- filter(
  ufo_sightings,
  country_code %in% c("CA", "US", "MX"),
  duration_seconds < 600,
  shape %in% c(
    "formation", "light", "disk", "flash", "fireball", "star", "orb"
  )
)
north_america
## # A tibble: 27,009 × 13
##    reported_date_time  reported_date_time_utc posted_date         city     state
##    <dttm>              <dttm>                 <dttm>              <chr>    <chr>
##  1 2023-05-15 16:35:00 2023-05-15 16:35:00    2023-05-19 00:00:00 Fenton   MI   
##  2 2023-05-15 00:40:00 2023-05-15 00:40:00    2023-05-19 00:00:00 Cannon … OR   
##  3 2023-05-10 21:12:00 2023-05-10 21:12:00    2023-05-19 00:00:00 Henders… NC   
##  4 2023-04-23 04:14:00 2023-04-23 04:14:00    2023-05-19 00:00:00 Woodcli… NJ   
##  5 2023-04-17 18:58:00 2023-04-17 18:58:00    2023-05-19 00:00:00 Tomball  TX   
##  6 2023-04-14 17:18:00 2023-04-14 17:18:00    2023-05-19 00:00:00 temple   GA   
##  7 2023-04-03 12:04:00 2023-04-03 12:04:00    2023-05-19 00:00:00 Glouces… MA   
##  8 2023-04-02 07:12:00 2023-04-02 07:12:00    2023-04-09 00:00:00 Leslie   MI   
##  9 2023-03-25 19:30:00 2023-03-25 19:30:00    2023-04-09 00:00:00 alma     AR   
## 10 2023-03-21 21:09:00 2023-03-21 21:09:00    2023-04-09 00:00:00 Barnegat NJ   
## # ℹ 26,999 more rows
## # ℹ 8 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, summary <chr>, has_images <lgl>, day_part <chr>,
## #   Time <dttm>

Visualizing distributions

# Bar chart: number of filtered sightings per country code.
ggplot(north_america, aes(x = country_code)) +
  geom_bar()

Typical values

# Histogram of duration_seconds in 60-second bins.
# boundary = 0 makes the bins start at zero (0-60, 60-120, ...). Without it
# the bins are centred on multiples of binwidth, so the first bin would span
# -30 to 30 and be cut in half by the x-axis limits set below.
ggplot(data = north_america, mapping = aes(x = duration_seconds)) +
  geom_histogram(binwidth = 60, boundary = 0) +
  coord_cartesian(xlim = c(0, 600))

Unusual values

Missing Values

# The missing values that cause issues are typically in the shape column; with a filter, I can remove those NA values.

Covariation

# I'm going to compare duration_seconds with other factors, such as shape and day_part. A conclusion may be drawn that astronomical phenomena are often mistaken for UFOs.

A categorical and continuous variable

# Frequency polygons of duration_seconds (60-second bins), one coloured line
# per shape.
north_america %>%
  ggplot(aes(x = duration_seconds, colour = shape)) +
  geom_freqpoly(binwidth = 60)

Two categorical variables

# Count plot: point size reflects how many sightings share each
# shape / country_code combination.
ggplot(north_america, aes(x = shape, y = country_code)) +
  geom_count()

# Tabulate sightings per shape / country_code pair, then draw the counts
# (column n) as the fill of a tile heat map.
north_america %>%
  count(shape, country_code) %>%
  ggplot(aes(x = shape, y = country_code, fill = n)) +
  geom_tile()

Two continuous variables

# 2-D binned counts of duration_seconds against posted_date.
north_america %>%
  ggplot(aes(x = posted_date, y = duration_seconds)) +
  geom_bin2d()

# I don't really have two continuous variables.

Patterns and models

library(modelr)
## Warning: package 'modelr' was built under R version 4.4.3
# Similarly, I ran the code and did not get what was shown in the textbook; I could not take the log of a chr (character) column.