clean versus unclean data
We can recode column names
We can recode data
We learned how to subset data
car::recode() # change the data is stored colnames() # change the name of the columns select() # allows you to subset the data you want
data_clean <- dplyr::select(data, "Patien's Age", Gender , "Case Type", "Admission Date")
colnames(data_clean) <- c("age",
"gender",
"case_type",
"adm_date")
head(data_clean$adm_date) str(data_clean$adm_date) min(data_clean$adm_date) min(data_clean$adm_date, na.rm = T) #Function to change structure of date is as.Date() #as.Date(name_of_column, "format_of_date")
data_clean$adm_date <- as.Date(data_clean$adm_date, "%d-%b-%y") head(data_clean$adm_date) str(data_clean$adm_date) min(data_clean$adm_date, na.rm = T)
head(data_clean, 10)
## # A tibble: 10 x 4 ## age gender case_type adm_date ## <int> <chr> <chr> <date> ## 1 28 Male TBI 2026-11-17 ## 2 26 Female CSDH 2016-08-08 ## 3 0 <NA> TBI 2017-02-27 ## 4 24 Male TBI 2017-02-25 ## 5 19 Male TBI 2017-02-09 ## 6 0 Male TBI 2017-01-27 ## 7 33 Male TBI 2017-01-29 ## 8 46 Male TBI 2017-01-27 ## 9 27 Male TBI 2017-01-24 ## 10 23 Male TBI 2017-01-23
head(data_clean2, 10)
## # A tibble: 10 x 6 ## month csdh other spine TBI Tumor ## <date> <int> <int> <int> <int> <int> ## 1 2015-12-01 0 0 0 1 0 ## 2 2016-07-01 1 0 0 15 0 ## 3 2016-08-01 6 0 2 11 1 ## 4 2016-09-01 20 0 6 43 22 ## 5 2016-10-01 4 0 0 70 4 ## 6 2016-11-01 0 0 1 66 3 ## 7 2016-12-01 3 0 0 79 1 ## 8 2017-01-01 2 0 0 31 1 ## 9 2017-02-01 3 0 2 52 0 ## 10 2017-03-01 4 0 2 42 2
package with many (many!) useful functions
the pipe function (%>%) is one of those
when you think of pipe, think "and then"
data_clean %>%
select(adm_date, case_type, age) %>%
head(4)
data_new <- data_clean %>%
dplyr::select(adm_date, case_type, age) %>%
head(4)
data_new %>%
summarise(
total = sum(age)
)
data_clean2 <- data_clean %>%
dplyr::select(
age,
gender,
case_type,
adm_date
)
data_clean2 <- data_clean %>%
dplyr::select(
age,
gender,
case_type,
adm_date
) %>%
mutate(grouper = floor_date(adm_date, "month"))
data_clean2 <- data_clean %>%
dplyr::select(
age,
gender,
case_type,
adm_date
) %>%
mutate(grouper = floor_date(adm_date, "month")) %>%
group_by(grouper)
data_clean2 <- data_clean %>%
dplyr::select(
age,
gender,
case_type,
adm_date
) %>%
mutate(grouper = floor_date(adm_date, "month")) %>%
group_by(grouper) %>%
summarise(
csdh = sum(case_type == "CSDH", na.rm = T)
)
data_clean2 <- data_clean %>%
dplyr::select(
age,
gender,
case_type,
adm_date
) %>%
mutate(grouper = floor_date(adm_date, "month")) %>%
group_by(grouper) %>%
summarise(
csdh = sum(case_type == "CSDH", na.rm = T),
other = sum(case_type == "Other", na.rm = T),
spine = sum(case_type == "Spine", na.rm = T),
TBI = sum(case_type == "TBI", na.rm = T),
Tumor = sum(case_type == "Tumor", na.rm = T)
)
data_clean2 %>%
kable()
data_clean2 %>%
subset(grouper >= "2016-08-01" & grouper <= "2017-11-01") %>%
kable()
data_clean2 %>%
subset(grouper >= "2016-08-01" & grouper <= "2017-11-01") %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F)
data_clean2 %>%
subset(grouper >= "2016-08-01" & grouper <= "2017-11-01") %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F) %>%
group_rows("2016", 1, 5) %>%
group_rows("2017", 6, 16)