Parse

Data parsing is converting data from one format to another. Widely used for data structuring, it is generally done to make the existing, often unstructured, unreadable data more comprehensible.

Base R to convert to date: not good

d1<-"45345"
d2<-"12/15/2011"
d3<- "15/12/2011"
d4<- "12/15/11"

d<-c(d1,d2,d3,d4)

# as.Date(d1)
# as.Date(d2)
as.Date(d3)

## [1] "15-12-20"

# as.Date(d4)

Sys.Date()

## [1] "2022-10-21"

janitor::convert_to_date

Best way to convert numbers to date

janitor::convert_to_date(d1)

## [1] "2024-02-23"

janitor::convert_to_date("00000")

## [1] "1899-12-30"

janitor::convert_to_date("44444")

## [1] "2021-09-05"

janitor::convert_to_date(44444)

## [1] "2021-09-05"

class(janitor::convert_to_date("00000"))

## [1] "Date"

lubridate is the way to go

library(lubridate)

lubridate::today()

## [1] "2022-10-21"

lubridate::now()

## [1] "2022-10-21 13:48:22 UTC"

d1<-"45345"
d2<-"12/15/2011"
d3<- "15/12/2011"
d4<- "12/15/11"

d<-c(d1,d2,d3,d4)

lubridate::as_date(d1)

## Warning: All formats failed to parse. No formats found.

## [1] NA

lubridate::as_date(d2)

## Warning: All formats failed to parse. No formats found.

## [1] NA

lubridate::as_date(d3)

## Warning: All formats failed to parse. No formats found.

## [1] NA

lubridate::as_date(d4)

## Warning: All formats failed to parse. No formats found.

## [1] NA

mdy(d2)

## [1] "2011-12-15"

mdy(d2)->var
class(var)

## [1] "Date"

dmy(d3)

## [1] "2011-12-15"

mdy(d4)

## [1] "2011-12-15"

parse_date_time2

parse_date_time2(d, orders=c("mdY", "dmY", "mdy", "dmy"))->vars
class(vars)

## [1] "POSIXct" "POSIXt"

vars

## [1] NA               "2011-12-15 UTC" "2011-12-15 UTC" "2011-12-15 UTC"

import CSV format and excel

HL<- read.csv("Hodgkin study/Hodgkin.csv")
colnames(HL)

##  [1] "Date.of.Diagnosis"                "CR.timing"                       
##  [3] "CR_After_2_cycles"                "Age.at.DX"                       
##  [5] "Chemotherapy"                     "Regimen.at.Dx"                   
##  [7] "Total.number.of.cycles"           "Total.Number.after.R.R"          
##  [9] "BMT"                              "Radiation"                       
## [11] "Radiation.with.primary.treatment" "Risk.Group"                      
## [13] "Stage"                            "Mets"                            
## [15] "B.Symptom"                        "Bulky"                           
## [17] "Nationality"                      "Outcome"                         
## [19] "Relapse.Progression"              "Date.of.Last.FU.or.rellapse"     
## [21] "Died"                             "Date.of.death.or.last.FU"        
## [23] "Event..date.of.relapse"           "Last.Follow.up"                  
## [25] "Histology"                        "Weight"                          
## [27] "Height"                           "X"                               
## [29] "X.1"

# exclude empty rows
HL<-HL[1:285,]

janitor::make_clean_names

# it is very important to have a consistent way to name your columns
janitor::make_clean_names(colnames(HL))->colnames(HL)
colnames(HL)

##  [1] "date_of_diagnosis"                "cr_timing"                       
##  [3] "cr_after_2_cycles"                "age_at_dx"                       
##  [5] "chemotherapy"                     "regimen_at_dx"                   
##  [7] "total_number_of_cycles"           "total_number_after_r_r"          
##  [9] "bmt"                              "radiation"                       
## [11] "radiation_with_primary_treatment" "risk_group"                      
## [13] "stage"                            "mets"                            
## [15] "b_symptom"                        "bulky"                           
## [17] "nationality"                      "outcome"                         
## [19] "relapse_progression"              "date_of_last_fu_or_rellapse"     
## [21] "died"                             "date_of_death_or_last_fu"        
## [23] "event_date_of_relapse"            "last_follow_up"                  
## [25] "histology"                        "weight"                          
## [27] "height"                           "x"                               
## [29] "x_1"

HL$date_of_diagnosis

HL$date_of_diagnosis[1:10]

##  [1] "1/1/2004"  "1/3/2004"  "1/6/2004"  "1/6/2004"  "1/6/2004"  "1/7/2004" 
##  [7] "1/8/2004"  "1/12/2004" "1/10/2004" "1/11/2004"

Bad lubridate::as_date

lubridate::as_date(HL$date_of_diagnosis[1:10])

## Warning: All formats failed to parse. No formats found.

##  [1] NA NA NA NA NA NA NA NA NA NA

Good lubridate::parse_date_time2

lubridate::parse_date_time2(HL$date_of_diagnosis, orders=c("dmY", "mdY", "Ymd", "Ydm"))->p
p[1:10]

##  [1] "2004-01-01 UTC" "2004-03-01 UTC" "2004-06-01 UTC" "2004-06-01 UTC"
##  [5] "2004-06-01 UTC" "2004-07-01 UTC" "2004-08-01 UTC" "2004-12-01 UTC"
##  [9] "2004-10-01 UTC" "2004-11-01 UTC"

which(is.na(p))

## [1] 184

cutoff_2000

lubridate::parse_date_time2(d4, orders=c("dmY", "mdY", "mdy", "Ymd", "Ydm"),cutoff_2000 = 10L)

## [1] "1911-12-15 UTC"

fix_date function

Let us make a universal date fixing function

cat("Before applying the function \n\n")

## Before applying the function

lapply(HL[1:10], class)

## $date_of_diagnosis
## [1] "character"
## 
## $cr_timing
## [1] "character"
## 
## $cr_after_2_cycles
## [1] "integer"
## 
## $age_at_dx
## [1] "numeric"
## 
## $chemotherapy
## [1] "character"
## 
## $regimen_at_dx
## [1] "integer"
## 
## $total_number_of_cycles
## [1] "integer"
## 
## $total_number_after_r_r
## [1] "integer"
## 
## $bmt
## [1] "character"
## 
## $radiation
## [1] "character"

# create fix_date function
fix_date<- function(x){
  if (str_detect(x, c("/.|/|-"))){
    lubridate::parse_date_time2(x, orders=c("mdY", "dmY", "mdy", "dmy", "Ymd", "Ydm"),cutoff_2000 = 23L)
  }else{as_datetime(janitor::convert_to_date(x))
  }}

# apply fix_date to all columns that start with "date" 
HL %>% mutate(across(starts_with("date"), function(x) reduce(map(x, fix_date), c)))->HL

cat("After applying the function \n\n")

## After applying the function

lapply(HL[1:10], class)

## $date_of_diagnosis
## [1] "POSIXct" "POSIXt" 
## 
## $cr_timing
## [1] "character"
## 
## $cr_after_2_cycles
## [1] "integer"
## 
## $age_at_dx
## [1] "numeric"
## 
## $chemotherapy
## [1] "character"
## 
## $regimen_at_dx
## [1] "integer"
## 
## $total_number_of_cycles
## [1] "integer"
## 
## $total_number_after_r_r
## [1] "integer"
## 
## $bmt
## [1] "character"
## 
## $radiation
## [1] "character"

Date difference

HL$OS<- as.numeric(HL$date_of_death_or_last_fu-HL$date_of_diagnosis)/30
HL$died[1:10]

##  [1] 0 0 0 0 0 0 0 0 0 0

HL$EFS<- as.numeric(HL$date_of_last_fu_or_rellapse-HL$date_of_diagnosis)/30
HL$event<-ifelse(HL$event_date_of_relapse=="No", 0, 1)

handling missing dates using if_else (not ifelse)

HL$date_of_death_or_last_fu[1:50]<-NA
# HL$Date.of.death.or.last.FU
HL$date_of_death_or_last_fu<- if_else(is.na(HL$date_of_death_or_last_fu), HL$date_of_last_fu_or_rellapse, HL$date_of_death_or_last_fu)
HL$date_of_last_fu_or_rellapse[6]

## [1] NA

sapply(HL, class)

## $date_of_diagnosis
## [1] "POSIXct" "POSIXt" 
## 
## $cr_timing
## [1] "character"
## 
## $cr_after_2_cycles
## [1] "integer"
## 
## $age_at_dx
## [1] "numeric"
## 
## $chemotherapy
## [1] "character"
## 
## $regimen_at_dx
## [1] "integer"
## 
## $total_number_of_cycles
## [1] "integer"
## 
## $total_number_after_r_r
## [1] "integer"
## 
## $bmt
## [1] "character"
## 
## $radiation
## [1] "character"
## 
## $radiation_with_primary_treatment
## [1] "integer"
## 
## $risk_group
## [1] "character"
## 
## $stage
## [1] "character"
## 
## $mets
## [1] "character"
## 
## $b_symptom
## [1] "character"
## 
## $bulky
## [1] "character"
## 
## $nationality
## [1] "character"
## 
## $outcome
## [1] "character"
## 
## $relapse_progression
## [1] "character"
## 
## $date_of_last_fu_or_rellapse
## [1] "POSIXct" "POSIXt" 
## 
## $died
## [1] "integer"
## 
## $date_of_death_or_last_fu
## [1] "POSIXct" "POSIXt" 
## 
## $event_date_of_relapse
## [1] "character"
## 
## $last_follow_up
## [1] "character"
## 
## $histology
## [1] "character"
## 
## $weight
## [1] "numeric"
## 
## $height
## [1] "numeric"
## 
## $x
## [1] "logical"
## 
## $x_1
## [1] "integer"
## 
## $OS
## [1] "numeric"
## 
## $EFS
## [1] "numeric"
## 
## $event
## [1] "numeric"

getting

cat("year \n")

## year

year(HL$date_of_diagnosis[1:10])

##  [1] 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004

cat("wday \n")

## wday

wday(HL$date_of_diagnosis[1:10])

##  [1] 5 7 3 3 3 4 5 2 7 1

cat("week \n")

## week

week(HL$date_of_diagnosis[1:10])

##  [1] 1 1 1 1 1 1 2 2 2 2

cat("Month \n")

## Month

month(HL$date_of_diagnosis[1:10])

##  [1] 1 1 1 1 1 1 1 1 1 1

cat("quartr \n")

## quartr

quarter(HL$date_of_diagnosis[1:10])

##  [1] 1 1 1 1 1 1 1 1 1 1

cat("Are patients diagnosed equalty in the 4 quarters? \n")

## Are patients diagnosed equalty in the 4 quarters?

table(quarter(HL$date_of_diagnosis[1:10]))

## 
##  1 
## 10

chisq.test(table(quarter(HL$date_of_diagnosis)), p = c(1,1,1,1), rescale.p = TRUE)

## 
##  Chi-squared test for given probabilities
## 
## data:  table(quarter(HL$date_of_diagnosis))
## X-squared = 10.282, df = 3, p-value = 0.01632

cat("Approximate to the first day of the week \n")

## Approximate to the first day of the week

floor_date(HL$date_of_diagnosis[1:10], "week", week_start = getOption("lubridate.week.start", 7))->a
class(a)

## [1] "POSIXct" "POSIXt"

wday(a)

##  [1] 1 1 1 1 1 1 1 1 1 1

interval(fix_date("2015-01-01"), fix_date("2020-01-01"))

## [1] 2015-01-01 UTC--2020-01-01 UTC

fix_date("2020-01-01")-fix_date("2015-01-01") ->a
as.numeric(a)

## [1] 1826

%within%

checks if dates are within an interval

int <- interval(ymd("2015-01-01"), ymd("2020-01-01"))

HL$date_of_diagnosis %within% int %>% table

## .
## FALSE  TRUE 
##   186    98

compare dates

fix_date("2015-01-01")>fix_date("2020-01-01")

## [1] FALSE

Lubridate

2022-10-21