#Cleaning Data II- demo

From Riinu Pius - Sarah Elliot - Ewen Harrison

The University of Edinburgh
[https://media.ed.ac.uk/media/HealthyR+DemoA+Data+Cleaning+2/1_yclggyiq]

Day 12 of HealthyR demo

Cleaning Data 1 - demo

library(tidyverse)
library(lubridate)
library(finalfit)

Load in data

ufo_orig = readr::read_csv("https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/main/2023-08/demo_ufo_sightings_original.csv")

ufo_orig
## # A tibble: 46,194 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <chr>                  <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 22/07/2012 02:15       05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 16/04/2016 05:13       22/04/2016  Goleta         CA   
##  3 2010-07-05 00:00:00 05/07/2010 00:00       06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 11/11/2013 03:00       20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 10/11/2010 22:30       21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 15/09/2009 02:15       12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 02/04/2013 03:15       15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 16/06/2014 01:00       20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 15/02/2020 15:30       25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 26/11/2017 00:40       04/12/2017  Kingman        KS   
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
ufo_orig = ufo_orig %>% 
  mutate(reported_date_time = ymd_hms(reported_date_time),
         reported_date_time_utc = dmy_hm(reported_date_time_utc))
ufo_orig
## # A tibble: 46,194 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>

Check for outliers

summary_glimpse = ufo_orig %>% ff_glimpse()
write_csv(x = summary_glimpse[[2]], file = "summary_glimpse.csv")
ufo_orig %>%
  ggplot(aes(x = reported_date_time_utc)) +
  geom_histogram(bins = 200)

p = ufo_orig %>%
  ggplot(aes(x = reported_date_time_utc, y = 1)) +
  geom_point()
# plotly::ggplotly(p)

Clean data

## I'm surprised this worked:
ufo_orig %>% 
  filter(reported_date_time_utc != "2200-09-07 03:25:00")
## # A tibble: 46,194 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Used to have to be this:
ufo_orig %>% 
  filter(reported_date_time_utc != ymd_hms("2200-09-07 03:25:00"))
## # A tibble: 46,193 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Another option:
ufo_orig %>% 
  filter(reported_date_time_utc <= today())
## # A tibble: 46,193 × 12
##    reported_date_time  reported_date_time_utc posted_date city           state
##    <dttm>              <dttm>                 <chr>       <chr>          <chr>
##  1 2012-07-22 02:15:00 2012-07-22 02:15:00    05/08/2012  Marietta       GA   
##  2 2016-04-16 05:13:00 2016-04-16 05:13:00    22/04/2016  Goleta         CA   
##  3 NA                  2010-07-05 00:00:00    06/07/2010  Louisville     KY   
##  4 2013-11-11 03:00:00 2013-11-11 03:00:00    20/11/2013  Virginia Beach VA   
##  5 2010-11-10 22:30:00 2010-11-10 22:30:00    21/11/2010  Groesbeck      TX   
##  6 2009-09-15 02:15:00 2009-09-15 02:15:00    12/12/2009  Chesapeake     VA   
##  7 2013-04-02 03:15:00 2013-04-02 03:15:00    15/05/2013  San Diego      CA   
##  8 2014-06-16 01:00:00 2014-06-16 01:00:00    20/06/2014  Clearwater     FL   
##  9 2020-02-15 15:30:00 2020-02-15 15:30:00    25/02/2020  Dawsonville    GA   
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00    04/12/2017  Kingman        KS   
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## #   duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>

Extract year and hour values from date

ufo = ufo_orig %>% 
  mutate(year = year(reported_date_time), .after = reported_date_time) %>% 
  mutate(hour = hour(reported_date_time), .after = year)

ufo %>% 
  count(hour, sort = TRUE)
## # A tibble: 25 × 2
##     hour     n
##    <int> <int>
##  1     2  5792
##  2     3  5327
##  3     1  5147
##  4     4  4225
##  5     5  3207
##  6     0  2312
##  7     6  2186
##  8    23  2030
##  9     7  1448
## 10    22  1369
## # ℹ 15 more rows

Check for duplicates

# library(janitor) - recommended but not used in this demo

# ufo %>% 
#   distinct(posted_date, .keep_all = TRUE) %>% 
#   arrange(posted_date)

ufo = ufo %>% 
  distinct()
udo = ufo %>% 
  mutate(shape = na_if(shape, "unknown"))

# %>% 
#   # mutate(shape = if_else(shape == "unknown" & another_column == "remove", NA, shape) %>% 
#   count(shape, sort = TRUE)
library(janitor)

ufo = ufo %>% 
  clean_names()


tibble(`date time` = 1, `animal-measurement` = 1, `Hello this is column name` = 1) %>% 
  clean_names() %>% 
  rename(hello = hello_this_is_column_name)
## # A tibble: 1 × 3
##   date_time animal_measurement hello
##       <dbl>              <dbl> <dbl>
## 1         1                  1     1