#Cleaning Data 1- demo

From Riinu Pius - Sarah Elliot - Ewen Harrison

The University of Edinburgh [https://media.ed.ac.uk/media/HealthyR+demoA+cleaning+data+1/1_oypymgwo]

Day 10 of HealthyR demo

Cleaning Data 1 - demo

library(tidyverse)
library(lubridate)
library(finalfit)

Load in data

ufo_orig = readr::read_csv("https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/main/2023-08/demo_ufo_sightings_original.csv")

ufo_orig
## # A tibble: 46,194 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <chr>                  <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 22/07/2012 02:15       05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 16/04/2016 05:13       22/04/2016  Goleta         CA   
##  3 2010-07-05 00:00:00 05/07/2010 00:00       06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 11/11/2013 03:00       20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 10/11/2010 22:30       21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 15/09/2009 02:15       12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 02/04/2013 03:15       15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 16/06/2014 01:00       20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 15/02/2020 15:30       25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 26/11/2017 00:40       04/12/2017  Kingman        KS   
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
ufo_orig = ufo_orig %>% 
  mutate(reported_date_time = ymd_hms(reported_date_time),
         reported_date_time_utc = dmy_hm(reported_date_time_utc))
ufo_orig
## # A tibble: 46,194 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>

Check for outliers

summary_glimpse = ufo_orig %>% ff_glimpse()
write_csv(x = summary_glimpse[[2]], file = "summary_glimpse.csv")
ufo_orig %>%
  ggplot(aes(x = reported_date_time_utc)) +
  geom_histogram(bins = 200)

p = ufo_orig %>%
  ggplot(aes(x = reported_date_time_utc, y = 1)) +
  geom_point()
# plotly::ggplotly(p)

Clean data

## I'm surprised this worked:
ufo_orig %>% 
  filter(reported_date_time_utc != "2200-09-07 03:25:00")
## # A tibble: 46,194 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Used to have to be this:
ufo_orig %>% 
  filter(reported_date_time_utc != ymd_hms("2200-09-07 03:25:00"))
## # A tibble: 46,193 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Another option:
ufo_orig %>% 
  filter(reported_date_time_utc <= today())
## # A tibble: 46,193 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>