Data parsing is converting data from one format to another. Widely used for data structuring, it is generally done to make the existing, often unstructured, unreadable data more comprehensible.
d1<-"45345"
d2<-"12/15/2011"
d3<- "15/12/2011"
d4<- "12/15/11"
d<-c(d1,d2,d3,d4)
# as.Date(d1)
# as.Date(d2)
as.Date(d3)
## [1] "15-12-20"
# as.Date(d4)
Sys.Date()
## [1] "2022-10-21"
Best way to convert numbers to date
janitor::convert_to_date(d1)
## [1] "2024-02-23"
janitor::convert_to_date("00000")
## [1] "1899-12-30"
janitor::convert_to_date("44444")
## [1] "2021-09-05"
janitor::convert_to_date(44444)
## [1] "2021-09-05"
class(janitor::convert_to_date("00000"))
## [1] "Date"
library(lubridate)
lubridate::today()
## [1] "2022-10-21"
lubridate::now()
## [1] "2022-10-21 13:48:22 UTC"
d1<-"45345"
d2<-"12/15/2011"
d3<- "15/12/2011"
d4<- "12/15/11"
d<-c(d1,d2,d3,d4)
lubridate::as_date(d1)
## Warning: All formats failed to parse. No formats found.
## [1] NA
lubridate::as_date(d2)
## Warning: All formats failed to parse. No formats found.
## [1] NA
lubridate::as_date(d3)
## Warning: All formats failed to parse. No formats found.
## [1] NA
lubridate::as_date(d4)
## Warning: All formats failed to parse. No formats found.
## [1] NA
mdy(d2)
## [1] "2011-12-15"
mdy(d2)->var
class(var)
## [1] "Date"
dmy(d3)
## [1] "2011-12-15"
mdy(d4)
## [1] "2011-12-15"
parse_date_time2(d, orders=c("mdY", "dmY", "mdy", "dmy"))->vars
class(vars)
## [1] "POSIXct" "POSIXt"
vars
## [1] NA "2011-12-15 UTC" "2011-12-15 UTC" "2011-12-15 UTC"
HL<- read.csv("Hodgkin study/Hodgkin.csv")
colnames(HL)
## [1] "Date.of.Diagnosis" "CR.timing"
## [3] "CR_After_2_cycles" "Age.at.DX"
## [5] "Chemotherapy" "Regimen.at.Dx"
## [7] "Total.number.of.cycles" "Total.Number.after.R.R"
## [9] "BMT" "Radiation"
## [11] "Radiation.with.primary.treatment" "Risk.Group"
## [13] "Stage" "Mets"
## [15] "B.Symptom" "Bulky"
## [17] "Nationality" "Outcome"
## [19] "Relapse.Progression" "Date.of.Last.FU.or.rellapse"
## [21] "Died" "Date.of.death.or.last.FU"
## [23] "Event..date.of.relapse" "Last.Follow.up"
## [25] "Histology" "Weight"
## [27] "Height" "X"
## [29] "X.1"
# exclude empty rows
HL<-HL[1:285,]
# it is very important to have a consistent way to name your columns
janitor::make_clean_names(colnames(HL))->colnames(HL)
colnames(HL)
## [1] "date_of_diagnosis" "cr_timing"
## [3] "cr_after_2_cycles" "age_at_dx"
## [5] "chemotherapy" "regimen_at_dx"
## [7] "total_number_of_cycles" "total_number_after_r_r"
## [9] "bmt" "radiation"
## [11] "radiation_with_primary_treatment" "risk_group"
## [13] "stage" "mets"
## [15] "b_symptom" "bulky"
## [17] "nationality" "outcome"
## [19] "relapse_progression" "date_of_last_fu_or_rellapse"
## [21] "died" "date_of_death_or_last_fu"
## [23] "event_date_of_relapse" "last_follow_up"
## [25] "histology" "weight"
## [27] "height" "x"
## [29] "x_1"
HL$date_of_diagnosis[1:10]
## [1] "1/1/2004" "1/3/2004" "1/6/2004" "1/6/2004" "1/6/2004" "1/7/2004"
## [7] "1/8/2004" "1/12/2004" "1/10/2004" "1/11/2004"
lubridate::as_date(HL$date_of_diagnosis[1:10])
## Warning: All formats failed to parse. No formats found.
## [1] NA NA NA NA NA NA NA NA NA NA
lubridate::parse_date_time2(HL$date_of_diagnosis, orders=c("dmY", "mdY", "Ymd", "Ydm"))->p
p[1:10]
## [1] "2004-01-01 UTC" "2004-03-01 UTC" "2004-06-01 UTC" "2004-06-01 UTC"
## [5] "2004-06-01 UTC" "2004-07-01 UTC" "2004-08-01 UTC" "2004-12-01 UTC"
## [9] "2004-10-01 UTC" "2004-11-01 UTC"
which(is.na(p))
## [1] 184
lubridate::parse_date_time2(d4, orders=c("dmY", "mdY", "mdy", "Ymd", "Ydm"),cutoff_2000 = 10L)
## [1] "1911-12-15 UTC"
Let us make a universal date fixing function
cat("Before applying the function \n\n")
## Before applying the function
lapply(HL[1:10], class)
## $date_of_diagnosis
## [1] "character"
##
## $cr_timing
## [1] "character"
##
## $cr_after_2_cycles
## [1] "integer"
##
## $age_at_dx
## [1] "numeric"
##
## $chemotherapy
## [1] "character"
##
## $regimen_at_dx
## [1] "integer"
##
## $total_number_of_cycles
## [1] "integer"
##
## $total_number_after_r_r
## [1] "integer"
##
## $bmt
## [1] "character"
##
## $radiation
## [1] "character"
# create fix_date function
fix_date<- function(x){
if (str_detect(x, c("/.|/|-"))){
lubridate::parse_date_time2(x, orders=c("mdY", "dmY", "mdy", "dmy", "Ymd", "Ydm"),cutoff_2000 = 23L)
}else{as_datetime(janitor::convert_to_date(x))
}}
# apply fix_date to all columns that start with "date"
HL %>% mutate(across(starts_with("date"), function(x) reduce(map(x, fix_date), c)))->HL
cat("After applying the function \n\n")
## After applying the function
lapply(HL[1:10], class)
## $date_of_diagnosis
## [1] "POSIXct" "POSIXt"
##
## $cr_timing
## [1] "character"
##
## $cr_after_2_cycles
## [1] "integer"
##
## $age_at_dx
## [1] "numeric"
##
## $chemotherapy
## [1] "character"
##
## $regimen_at_dx
## [1] "integer"
##
## $total_number_of_cycles
## [1] "integer"
##
## $total_number_after_r_r
## [1] "integer"
##
## $bmt
## [1] "character"
##
## $radiation
## [1] "character"
HL$OS<- as.numeric(HL$date_of_death_or_last_fu-HL$date_of_diagnosis)/30
HL$died[1:10]
## [1] 0 0 0 0 0 0 0 0 0 0
HL$EFS<- as.numeric(HL$date_of_last_fu_or_rellapse-HL$date_of_diagnosis)/30
HL$event<-ifelse(HL$event_date_of_relapse=="No", 0, 1)
HL$date_of_death_or_last_fu[1:50]<-NA
# HL$Date.of.death.or.last.FU
HL$date_of_death_or_last_fu<- if_else(is.na(HL$date_of_death_or_last_fu), HL$date_of_last_fu_or_rellapse, HL$date_of_death_or_last_fu)
HL$date_of_last_fu_or_rellapse[6]
## [1] NA
sapply(HL, class)
## $date_of_diagnosis
## [1] "POSIXct" "POSIXt"
##
## $cr_timing
## [1] "character"
##
## $cr_after_2_cycles
## [1] "integer"
##
## $age_at_dx
## [1] "numeric"
##
## $chemotherapy
## [1] "character"
##
## $regimen_at_dx
## [1] "integer"
##
## $total_number_of_cycles
## [1] "integer"
##
## $total_number_after_r_r
## [1] "integer"
##
## $bmt
## [1] "character"
##
## $radiation
## [1] "character"
##
## $radiation_with_primary_treatment
## [1] "integer"
##
## $risk_group
## [1] "character"
##
## $stage
## [1] "character"
##
## $mets
## [1] "character"
##
## $b_symptom
## [1] "character"
##
## $bulky
## [1] "character"
##
## $nationality
## [1] "character"
##
## $outcome
## [1] "character"
##
## $relapse_progression
## [1] "character"
##
## $date_of_last_fu_or_rellapse
## [1] "POSIXct" "POSIXt"
##
## $died
## [1] "integer"
##
## $date_of_death_or_last_fu
## [1] "POSIXct" "POSIXt"
##
## $event_date_of_relapse
## [1] "character"
##
## $last_follow_up
## [1] "character"
##
## $histology
## [1] "character"
##
## $weight
## [1] "numeric"
##
## $height
## [1] "numeric"
##
## $x
## [1] "logical"
##
## $x_1
## [1] "integer"
##
## $OS
## [1] "numeric"
##
## $EFS
## [1] "numeric"
##
## $event
## [1] "numeric"
cat("year \n")
## year
year(HL$date_of_diagnosis[1:10])
## [1] 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004
cat("wday \n")
## wday
wday(HL$date_of_diagnosis[1:10])
## [1] 5 7 3 3 3 4 5 2 7 1
cat("week \n")
## week
week(HL$date_of_diagnosis[1:10])
## [1] 1 1 1 1 1 1 2 2 2 2
cat("Month \n")
## Month
month(HL$date_of_diagnosis[1:10])
## [1] 1 1 1 1 1 1 1 1 1 1
cat("quartr \n")
## quartr
quarter(HL$date_of_diagnosis[1:10])
## [1] 1 1 1 1 1 1 1 1 1 1
cat("Are patients diagnosed equalty in the 4 quarters? \n")
## Are patients diagnosed equalty in the 4 quarters?
table(quarter(HL$date_of_diagnosis[1:10]))
##
## 1
## 10
chisq.test(table(quarter(HL$date_of_diagnosis)), p = c(1,1,1,1), rescale.p = TRUE)
##
## Chi-squared test for given probabilities
##
## data: table(quarter(HL$date_of_diagnosis))
## X-squared = 10.282, df = 3, p-value = 0.01632
cat("Approximate to the first day of the week \n")
## Approximate to the first day of the week
floor_date(HL$date_of_diagnosis[1:10], "week", week_start = getOption("lubridate.week.start", 7))->a
class(a)
## [1] "POSIXct" "POSIXt"
wday(a)
## [1] 1 1 1 1 1 1 1 1 1 1
interval(fix_date("2015-01-01"), fix_date("2020-01-01"))
## [1] 2015-01-01 UTC--2020-01-01 UTC
fix_date("2020-01-01")-fix_date("2015-01-01") ->a
as.numeric(a)
## [1] 1826
checks if dates are within an interval
int <- interval(ymd("2015-01-01"), ymd("2020-01-01"))
HL$date_of_diagnosis %within% int %>% table
## .
## FALSE TRUE
## 186 98
fix_date("2015-01-01")>fix_date("2020-01-01")
## [1] FALSE