How do I use codedtitles in R? This vignette will go through some of the functionality of this package and scenarios where it may be helpful.
Load the package:
library(devtools)
#> Warning: package 'devtools' was built under R version 4.4.1
#> Loading required package: usethis
#> Warning: package 'usethis' was built under R version 4.4.1
devtools::install_github("drmomalik/codedtitles")
#> Using GitHub PAT from the git credential store.
#> Skipping install of 'codedtitles' from a github remote, the SHA1 (cd1a5482) has not changed since last install.
#> Use `force = TRUE` to force installation
library(codedtitles)
“codedtitles” is a package designed to simplify the pre-processing of variable names before moving forward with analysis.
The motivation for this package was born out of our own clinical research experience. For example, a large surgical database with over 200 variables was provided for analysis. None of the column titles had proper coding and many contained special characters and spaces, which would make them syntactically challenging to deal with in R. Many of these databases are collected by clinicians or other people without statistical software experience, who may not be familiar with the role of variable coding. As such, we have attempted to create a novel, simple solution to speed up this process.
data <- df
colnames(data)
#> [1] "Case_ID" "?&%#%"
#> [3] "?&%#%#" "Age_Group"
#> [5] "Gender" "Region"
#> [7] "Disease" "Symptom_Onset"
#> [9] "Hospitalized" "ICU_Admission"
#> [11] "Vaccination_Status" "Exposure_History"
#> [13] "Primary_Transmission_Route" "Secondary_Cases"
#> [15] "Serotype" "PCR_Result...16"
#> [17] "PCR_Result...17" "ELISA_Result"
#> [19] "Contact_Tracing_Status" "Isolation_Duration"
#> [21] "Recovery_Status" "Mortality_Status"
#> [23] "R0_Estimate" "Incubation_Period"
#> [25] "Attack_Rate" "Attack_Rate_Total"
#> [27] "Reinfection_Status" "Antiviral_Usage"
#> [29] "Outbreak_Cluster" "Environmental_Factors"
#> [31] "Genomic_Sequence" "Reporting_Delay"
#> [33] "Data_Source" "Study_Period"
#> [35] "attack" "Mort$Total%_"
new_names <- codevar(data)
#> Loading required package: NLP
#> [1] "case_id" "X" "X_1"
#> [4] "age_group" "gender" "region"
#> [7] "diseas" "symptom_onset" "hospit"
#> [10] "icu_admiss" "vaccin_status" "exposur_histori"
#> [13] "prima_trans_rout" "secondar_case" "serotyp"
#> [16] "pcr_resul_16" "pcr_resul_17" "elisa_result"
#> [19] "conta_trace_statu" "isol_durat" "recoveri_status"
#> [22] "mortal_status" "r0_estim" "incub_period"
#> [25] "attack_rate" "attac_rate_total" "reinfect_status"
#> [28] "antivir_usag" "outbreak_cluster" "environm_factor"
#> [31] "genom_sequenc" "report_delay" "data_sourc"
#> [34] "studi_period" "attack" "mort_total"
print(coderef)
#> New Original Class
#> 1 case_id Case_ID numeric
#> 2 X ?&%#% numeric
#> 3 X_1 ?&%#%# numeric
#> 4 age_group Age_Group character
#> 5 gender Gender character
#> 6 region Region character
#> 7 diseas Disease character
#> 8 symptom_onset Symptom_Onset Date
#> 9 hospit Hospitalized logical
#> 10 icu_admiss ICU_Admission logical
#> 11 vaccin_status Vaccination_Status character
#> 12 exposur_histori Exposure_History character
#> 13 prima_trans_rout Primary_Transmission_Route character
#> 14 secondar_case Secondary_Cases numeric
#> 15 serotyp Serotype character
#> 16 pcr_resul_16 PCR_Result...16 character
#> 17 pcr_resul_17 PCR_Result...17 character
#> 18 elisa_result ELISA_Result character
#> 19 conta_trace_statu Contact_Tracing_Status character
#> 20 isol_durat Isolation_Duration numeric
#> 21 recoveri_status Recovery_Status character
#> 22 mortal_status Mortality_Status character
#> 23 r0_estim R0_Estimate numeric
#> 24 incub_period Incubation_Period numeric
#> 25 attack_rate Attack_Rate numeric
#> 26 attac_rate_total Attack_Rate_Total numeric
#> 27 reinfect_status Reinfection_Status logical
#> 28 antivir_usag Antiviral_Usage logical
#> 29 outbreak_cluster Outbreak_Cluster character
#> 30 environm_factor Environmental_Factors character
#> 31 genom_sequenc Genomic_Sequence character
#> 32 report_delay Reporting_Delay numeric
#> 33 data_sourc Data_Source character
#> 34 studi_period Study_Period Date
#> 35 attack attack numeric
#> 36 mort_total Mort$Total%_ numeric
new_names <- codevar(data, max_length = 6)
#> [1] "cas_id" "X" "X_1" "age_gro" "gender" "region"
#> [7] "diseas" "sym_ons" "hospit" "icu_adm" "vac_sta" "exp_his"
#> [13] "pr_tr_ro" "sec_cas" "seroty" "pc_re_16" "pc_re_17" "eli_res"
#> [19] "co_tr_st" "iso_dur" "rec_sta" "mor_sta" "r0_est" "inc_per"
#> [25] "att_rat" "at_ra_to" "rei_sta" "ant_usa" "out_clu" "env_fac"
#> [31] "gen_seq" "rep_del" "dat_sou" "stu_per" "attack" "mor_tot"
new_names <- codevar(data, max_length = 3)
#> [1] "ca_i" "X" "X_1" "ag_g" "gen" "reg" "dis"
#> [8] "sy_o" "hos" "ic_a" "va_s" "ex_h" "p_t_r" "se_c"
#> [15] "ser" "p_r_1" "p_r_1_1" "el_r" "c_t_s" "is_d" "re_s"
#> [22] "mo_s" "r0_e" "in_p" "at_r" "a_r_t" "re_s_1" "an_u"
#> [29] "ou_c" "en_f" "ge_s" "re_d" "da_s" "st_p" "att"
#> [36] "mo_t"
new_names <- codevar(data, max_length = 3, split = FALSE)
#> [1] "cas" "X" "X_1" "age" "gen" "reg" "dis" "sym" "hos"
#> [10] "icu" "vac" "exp" "pri" "sec" "ser" "pcr" "pcr_1" "eli"
#> [19] "con" "iso" "rec" "mor" "r0." "inc" "att" "att_1" "rei"
#> [28] "ant" "out" "env" "gen_1" "rep" "dat" "stu" "att_2" "mor_1"
Let's look at our last example, where max_length was 3 and splitting was turned off. On its own, this truncation would give identical new names to variables such as “Attack_Rate”, “Attack_Rate_Total” and “attack”.
The code anticipates this and appends a sequential number tag to duplicated names:
coderef[c(25,26,35),]
#> New Original Class
#> 25 att Attack_Rate numeric
#> 26 att_1 Attack_Rate_Total numeric
#> 35 att_2 attack numeric