#Cleaning Data 1- demo
The University of Edinburgh [https://media.ed.ac.uk/media/HealthyR+demoA+cleaning+data+1/1_oypymgwo]
Day 10 of HealthyR demo
Cleaning Data 1 - demo
library(tidyverse)
library(lubridate)
library(finalfit)
ufo_orig = readr::read_csv("https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/main/2023-08/demo_ufo_sightings_original.csv")
ufo_orig
## # A tibble: 46,194 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <chr> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 22/07/2012 02:15 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 16/04/2016 05:13 22/04/2016 Goleta CA
## 3 2010-07-05 00:00:00 05/07/2010 00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 11/11/2013 03:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 10/11/2010 22:30 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 15/09/2009 02:15 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 02/04/2013 03:15 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 16/06/2014 01:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 15/02/2020 15:30 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 26/11/2017 00:40 04/12/2017 Kingman KS
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
ufo_orig = ufo_orig %>%
mutate(reported_date_time = ymd_hms(reported_date_time),
reported_date_time_utc = dmy_hm(reported_date_time_utc))
ufo_orig
## # A tibble: 46,194 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
summary_glimpse = ufo_orig %>% ff_glimpse()
write_csv(x = summary_glimpse[[2]], file = "summary_glimpse.csv")
ufo_orig %>%
ggplot(aes(x = reported_date_time_utc)) +
geom_histogram(bins = 200)
p = ufo_orig %>%
ggplot(aes(x = reported_date_time_utc, y = 1)) +
geom_point()
# plotly::ggplotly(p)
## I'm surprised this worked:
ufo_orig %>%
filter(reported_date_time_utc != "2200-09-07 03:25:00")
## # A tibble: 46,194 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Used to have to be this:
ufo_orig %>%
filter(reported_date_time_utc != ymd_hms("2200-09-07 03:25:00"))
## # A tibble: 46,193 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Another option:
ufo_orig %>%
filter(reported_date_time_utc <= today())
## # A tibble: 46,193 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>