Recap

  • clean versus unclean data

  • We can recode column names

  • We can recode data

  • We learned how to subset data

Functions learned

car::recode()     # change the data is stored
colnames()        # change the name of the columns
select()          # allows you to subset the data you want

Class 3 : Cleaning continued AND Wrangling (unorganized) data

Cleaning date column

Get the columns you want

data_clean            <- dplyr::select(data, "Patien's Age", Gender , "Case Type", "Admission Date")
colnames(data_clean)  <- c("age",
                           "gender",
                           "case_type",
                           "adm_date")

Cleaning date column

Change structure of date

head(data_clean$adm_date)
str(data_clean$adm_date)

min(data_clean$adm_date)
min(data_clean$adm_date, na.rm = T)

#Function to change structure of date is as.Date() 
#as.Date(name_of_column, "format_of_date")

Cleaning date column

Change structure of date

data_clean$adm_date <- as.Date(data_clean$adm_date, "%d-%b-%y")

head(data_clean$adm_date)
str(data_clean$adm_date)

min(data_clean$adm_date, na.rm  = T)

Wrangling Data

Totals by month

Unorganized data

 head(data_clean, 10)
## # A tibble: 10 x 4
##      age gender case_type adm_date  
##    <int> <chr>  <chr>     <date>    
##  1    28 Male   TBI       2026-11-17
##  2    26 Female CSDH      2016-08-08
##  3     0 <NA>   TBI       2017-02-27
##  4    24 Male   TBI       2017-02-25
##  5    19 Male   TBI       2017-02-09
##  6     0 Male   TBI       2017-01-27
##  7    33 Male   TBI       2017-01-29
##  8    46 Male   TBI       2017-01-27
##  9    27 Male   TBI       2017-01-24
## 10    23 Male   TBI       2017-01-23

Totals by month

Organized data

 head(data_clean2, 10)
## # A tibble: 10 x 6
##    month       csdh other spine   TBI Tumor
##    <date>     <int> <int> <int> <int> <int>
##  1 2015-12-01     0     0     0     1     0
##  2 2016-07-01     1     0     0    15     0
##  3 2016-08-01     6     0     2    11     1
##  4 2016-09-01    20     0     6    43    22
##  5 2016-10-01     4     0     0    70     4
##  6 2016-11-01     0     0     1    66     3
##  7 2016-12-01     3     0     0    79     1
##  8 2017-01-01     2     0     0    31     1
##  9 2017-02-01     3     0     2    52     0
## 10 2017-03-01     4     0     2    42     2

tidyverse

%>%

tidyverse

  • package with many (many!) useful functions

  • the pipe function (%>%) is one of those

  • when you think of pipe, think "and then"

 data_clean %>%
  select(adm_date, case_type, age) %>%
  head(4)

data_new <-  data_clean %>%
  dplyr::select(adm_date, case_type, age) %>%
  head(4) 

data_new %>%
  summarise(
    total = sum(age)
  )

Lets begin layering

selct the columns you want

data_clean2 <- data_clean %>%
  dplyr::select(
    age,
    gender,
    case_type,
    adm_date
  ) 

Layering cont

make a new column - grouper

data_clean2 <- data_clean %>%
  dplyr::select(
    age,
    gender,
    case_type,
    adm_date
  ) %>%
  mutate(grouper = floor_date(adm_date, "month")) 

Layering cont

group by that new column

data_clean2 <- data_clean %>%
  dplyr::select(
    age,
    gender,
    case_type,
    adm_date
  ) %>%
  mutate(grouper = floor_date(adm_date, "month")) %>%
  group_by(grouper) 

Layering cont

summarise based on grouper variable

data_clean2 <- data_clean %>%
  dplyr::select(
    age,
    gender,
    case_type,
    adm_date
  ) %>%
  mutate(grouper = floor_date(adm_date, "month")) %>%
  group_by(grouper) %>%
  summarise(
    csdh = sum(case_type == "CSDH", na.rm = T)
  )

Layering cont

data_clean2 <- data_clean %>%
  dplyr::select(
    age,
    gender,
    case_type,
    adm_date
  ) %>%
  mutate(grouper = floor_date(adm_date, "month")) %>%
  group_by(grouper) %>%
  summarise(
    csdh = sum(case_type == "CSDH", na.rm = T),
    other = sum(case_type == "Other", na.rm = T),
    spine = sum(case_type == "Spine", na.rm = T),
    TBI = sum(case_type == "TBI", na.rm = T),
    Tumor = sum(case_type == "Tumor", na.rm = T)
  )

A beautiful final product

data_clean2 %>%
kable() 

data_clean2 %>%
    subset(grouper >= "2016-08-01" & grouper <= "2017-11-01") %>%
    kable()

data_clean2 %>%
    subset(grouper >= "2016-08-01" & grouper <= "2017-11-01") %>%
    kable() %>%
    kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F) 

data_clean2 %>%
    subset(grouper >= "2016-08-01" & grouper <= "2017-11-01") %>%
    kable() %>%
    kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F) %>%
  group_rows("2016", 1, 5) %>%
  group_rows("2017", 6, 16)