#Cleaning Data II- demo
The University of Edinburgh
[https://media.ed.ac.uk/media/HealthyR+DemoA+Data+Cleaning+2/1_yclggyiq]
Day 12 of HealthyR demo
Cleaning Data 1 - demo
library(tidyverse)
library(lubridate)
library(finalfit)
ufo_orig = readr::read_csv("https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/main/2023-08/demo_ufo_sightings_original.csv")
ufo_orig
## # A tibble: 46,194 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <chr> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 22/07/2012 02:15 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 16/04/2016 05:13 22/04/2016 Goleta CA
## 3 2010-07-05 00:00:00 05/07/2010 00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 11/11/2013 03:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 10/11/2010 22:30 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 15/09/2009 02:15 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 02/04/2013 03:15 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 16/06/2014 01:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 15/02/2020 15:30 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 26/11/2017 00:40 04/12/2017 Kingman KS
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
ufo_orig = ufo_orig %>%
mutate(reported_date_time = ymd_hms(reported_date_time),
reported_date_time_utc = dmy_hm(reported_date_time_utc))
ufo_orig
## # A tibble: 46,194 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
summary_glimpse = ufo_orig %>% ff_glimpse()
write_csv(x = summary_glimpse[[2]], file = "summary_glimpse.csv")
ufo_orig %>%
ggplot(aes(x = reported_date_time_utc)) +
geom_histogram(bins = 200)
p = ufo_orig %>%
ggplot(aes(x = reported_date_time_utc, y = 1)) +
geom_point()
# plotly::ggplotly(p)
## I'm surprised this worked:
ufo_orig %>%
filter(reported_date_time_utc != "2200-09-07 03:25:00")
## # A tibble: 46,194 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,184 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Used to have to be this:
ufo_orig %>%
filter(reported_date_time_utc != ymd_hms("2200-09-07 03:25:00"))
## # A tibble: 46,193 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
## Another option:
ufo_orig %>%
filter(reported_date_time_utc <= today())
## # A tibble: 46,193 × 12
## reported_date_time reported_date_time_utc posted_date city state
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2012-07-22 02:15:00 2012-07-22 02:15:00 05/08/2012 Marietta GA
## 2 2016-04-16 05:13:00 2016-04-16 05:13:00 22/04/2016 Goleta CA
## 3 NA 2010-07-05 00:00:00 06/07/2010 Louisville KY
## 4 2013-11-11 03:00:00 2013-11-11 03:00:00 20/11/2013 Virginia Beach VA
## 5 2010-11-10 22:30:00 2010-11-10 22:30:00 21/11/2010 Groesbeck TX
## 6 2009-09-15 02:15:00 2009-09-15 02:15:00 12/12/2009 Chesapeake VA
## 7 2013-04-02 03:15:00 2013-04-02 03:15:00 15/05/2013 San Diego CA
## 8 2014-06-16 01:00:00 2014-06-16 01:00:00 20/06/2014 Clearwater FL
## 9 2020-02-15 15:30:00 2020-02-15 15:30:00 25/02/2020 Dawsonville GA
## 10 2017-11-26 00:40:00 2017-11-26 00:40:00 04/12/2017 Kingman KS
## # ℹ 46,183 more rows
## # ℹ 7 more variables: country_code <chr>, shape <chr>, reported_duration <chr>,
## # duration_seconds <dbl>, Summary <chr>, has_images <lgl>, day_part <chr>
ufo = ufo_orig %>%
mutate(year = year(reported_date_time), .after = reported_date_time) %>%
mutate(hour = hour(reported_date_time), .after = year)
ufo %>%
count(hour, sort = TRUE)
## # A tibble: 25 × 2
## hour n
## <int> <int>
## 1 2 5792
## 2 3 5327
## 3 1 5147
## 4 4 4225
## 5 5 3207
## 6 0 2312
## 7 6 2186
## 8 23 2030
## 9 7 1448
## 10 22 1369
## # ℹ 15 more rows
# library(janitor) - recommended but not used in this demo
# ufo %>%
# distinct(posted_date, .keep_all = TRUE) %>%
# arrange(posted_date)
ufo = ufo %>%
distinct()
udo = ufo %>%
mutate(shape = na_if(shape, "unknown"))
# %>%
# # mutate(shape = if_else(shape == "unknown" & another_column == "remove", NA, shape) %>%
# count(shape, sort = TRUE)
library(janitor)
ufo = ufo %>%
clean_names()
tibble(`date time` = 1, `animal-measurement` = 1, `Hello this is column name` = 1) %>%
clean_names() %>%
rename(hello = hello_this_is_column_name)
## # A tibble: 1 × 3
## date_time animal_measurement hello
## <dbl> <dbl> <dbl>
## 1 1 1 1