# Translate race codes into readable labels.
#
# svalue: race code(s) in any form as.numeric() can convert, e.g. the
#         two-character strings "01", "17", "99". May be a vector.
# Returns a character vector the same length as svalue. Codes without an
# entry below (and values that do not convert to a number) give NA.
findRace <- function(svalue) {
  ival <- as.numeric(svalue)
  # Exact-match codes; the names are the numeric codes.
  raceTable <- c(
    "1"  = "White",
    "2"  = "Black",
    "3"  = "Native American",
    "4"  = "Chinese",
    "5"  = "Japanese",
    "6"  = "Filipino",
    "7"  = "Hawaiian",
    "8"  = "Korean",
    "10" = "Vietnamese",
    "11" = "Laotian",
    "12" = "Hmong",
    "13" = "Kampuchean",
    "14" = "Thai",
    "15" = "Indian",
    "16" = "Indian",
    "17" = "Pakistani",
    "98" = "Other",
    "99" = "Unknown"
  )
  race <- unname(raceTable[match(ival, as.numeric(names(raceTable)))])
  # Every value strictly between 19 and 98 is reported as a single group.
  race[!is.na(ival) & ival > 19 & ival < 98] <- "Pacific Islander"
  race
}
# Translate derived Hispanic-origin codes into readable labels.
#
# svalue: code(s) in any form as.numeric() can convert; may be a vector.
# Returns a character vector the same length as svalue; values with no
# matching rule give NA.
findHispanicOrigin <- function(svalue) {
  code <- as.numeric(svalue)
  # Exact-match codes; the names are the numeric codes.
  originTable <- c(
    "0" = "Not Latino",
    "1" = "Mexican",
    "2" = "Puerto Rican",
    "3" = "Cuban",
    "4" = "South/Central American",
    "8" = "Dominican"
  )
  origin <- unname(originTable[match(code, as.numeric(names(originTable)))])
  # Anything strictly between 4 and 8 is lumped together.
  origin[!is.na(code) & code > 4 & code < 8] <- "Other"
  origin
}
# Translate registry identifiers into locality names.
#
# svalue: identifier(s) in any form as.numeric() can convert (leading zeros
#         are harmless, e.g. "0000001501"); may be a vector.
# Returns a character vector the same length as svalue; identifiers that
# are not listed give NA.
findLocality <- function(svalue) {
  registry <- as.numeric(svalue)
  # The names are the numeric registry identifiers.
  localityTable <- c(
    "1501" = "San Francisco",
    "1502" = "Connecticut",
    "1520" = "Detroit",
    "1521" = "Hawaii",
    "1522" = "Iowa",
    "1523" = "New Mexico",
    "1525" = "Seattle",
    "1526" = "Utah",
    "1527" = "Atlanta",
    "1529" = "Alaska",
    "1531" = "San Jose",
    "1535" = "Los Angeles",
    "1537" = "Rural Georgia",
    "1541" = "Greater California",
    "1542" = "Kentucky",
    "1543" = "Louisiana",
    "1544" = "New Jersey",
    "1547" = "Greater Georgia"
  )
  unname(localityTable[match(registry, as.numeric(names(localityTable)))])
}
# Translate laterality codes into readable labels.
#
# svalue: code(s) in any form as.numeric() can convert; may be a vector.
# Returns a character vector the same length as svalue; unlisted codes
# give NA.
findLaterality <- function(svalue) {
  code <- as.numeric(svalue)
  # The names are the numeric codes.
  lateralityTable <- c(
    "0" = "Not paired",
    "1" = "Right",
    "2" = "Left",
    "3" = "Unspecified",
    "4" = "Bilateral",
    "5" = "Paired: midline tumor",
    "9" = "Paired: no information"
  )
  unname(lateralityTable[match(code, as.numeric(names(lateralityTable)))])
}
# Translate behavior codes into readable labels.
#
# svalue: code(s) in any form as.numeric() can convert; may be a vector.
# Returns a character vector the same length as svalue; unlisted codes
# give NA.
findBehaviorCode <- function(svalue) {
  code <- as.numeric(svalue)
  # The names are the numeric codes.
  behaviorTable <- c(
    "0" = "Benign",
    "1" = "Malignant potential",
    "2" = "Noninvasive",
    "3" = "Malignant"
  )
  unname(behaviorTable[match(code, as.numeric(names(behaviorTable)))])
}
# Translate grade codes into readable labels.
#
# svalue: code(s) in any form as.numeric() can convert; may be a vector.
# Returns a character vector the same length as svalue; unlisted codes
# give NA.
findGrade <- function(svalue) {
  code <- as.numeric(svalue)
  # The names are the numeric codes.
  gradeTable <- c(
    "1" = "I",
    "2" = "II",
    "3" = "III",
    "4" = "IV",
    "5" = "T-cell",
    "6" = "B-cell",
    "7" = "Null cell",
    "8" = "NK cell",
    "9" = "undetermined"
  )
  unname(gradeTable[match(code, as.numeric(names(gradeTable)))])
}
# Translate diagnostic-confirmation codes into readable labels.
#
# svalue: code(s) in any form as.numeric() can convert; may be a vector.
# Returns a character vector the same length as svalue; unlisted codes
# give NA.
findDiagnosticConfirmation <- function(svalue) {
  code <- as.numeric(svalue)
  # The names are the numeric codes.
  confirmationTable <- c(
    "1" = "Positive histology",
    "2" = "Positive cytology",
    "3" = "Positive histology PLUS",
    "4" = "Positive microscopic confirmation",
    "5" = "Positive laboratory test",
    "6" = "Direct visualization",
    "7" = "Radiology",
    "8" = "Clinical diagnosis"
  )
  unname(
    confirmationTable[match(code, as.numeric(names(confirmationTable)))]
  )
}
# Translate reporting-source codes into readable labels.
#
# svalue: code(s) in any form as.numeric() can convert; may be a vector.
# Returns a character vector the same length as svalue; unlisted codes
# give NA.
findReportingType <- function(svalue) {
  code <- as.numeric(svalue)
  # The names are the numeric codes.
  sourceTable <- c(
    "1" = "Hospital inpatient",
    "2" = "Radiation treatment",
    "3" = "Laboratory only",
    "4" = "Personal Physician",
    "5" = "Nursing Home",
    "6" = "Autopsy only",
    "7" = "Death certificate only",
    "8" = "Other"
  )
  unname(sourceTable[match(code, as.numeric(names(sourceTable)))])
}
# File stems, one per cancer-site group. The trailing 'test' entry was for
# debugging; the loop below only visits the first nine entries.
cancers <- c("breast", "digothr", "malegen", "femgen", "other", "respir",
             "colrect", "lymyleuk", "urinary", "test")

# Create new csv files: each output file is (re)created holding only the
# header row, so the data rows can be appended afterwards.
columnNames <- c(
  "personID",
  "locality",
  "maritalStatus",
  "race",
  "derivedHispanicOrigin",
  "sex",
  "ageDiagnosis",
  "birthYear",
  "sequenceNumber",
  "monthDiagnosis",
  "yearDiagnosis",
  "primarySite",
  "laterality",
  "histology",
  "behavior",
  "histologicType",
  "behaviorCode",
  "grade",
  "diagnosticConfirmation",
  "reportingSourceType",
  "survivalMonths"
)
y <- paste(columnNames, collapse = ",")
for (site in cancers[1:9]) {
  oname <- paste0("c:/SEER/output/", site, ".csv")
  # Given a path, writeLines() opens the file for writing (truncating it),
  # writes the line and closes the file again.
  writeLines(y, con = oname)
}
# Convert one fixed-width incidence file and append its records, one CSV row
# per input line, to an existing CSV file.
#
# iname:     path of the fixed-width text file to read.
# oname:     path of the CSV file to append to (header already written).
# chunkSize: number of input lines converted per pass. substr(), paste() and
#            the find* helpers are all vectorized, so working on chunks
#            avoids one round of helper calls per input line while writing
#            exactly the same rows in the same order.
#
# Both connections are closed when the function exits, including when an
# error interrupts the conversion.
convertSeerFile <- function(iname, oname, chunkSize = 10000L) {
  fid <- file(iname, "r")
  on.exit(close(fid), add = TRUE)
  fid2 <- file(oname, "a")
  on.exit(close(fid2), add = TRUE)

  # Marital-status codes with an explicit label; see the fallback below.
  maritalCodes <- c("2", "3", "4", "5", "6", "9")
  maritalLabels <- c("married", "separated", "divorced", "widowed",
                     "domestic", "unknown")

  repeat {
    s <- readLines(fid, n = chunkSize, ok = TRUE)
    if (length(s) == 0) break

    # Slice the fixed-width columns.
    personID <- substr(s, 1, 8)
    sregistrationID <- substr(s, 9, 18)
    sms <- substr(s, 19, 19)
    srace <- substr(s, 20, 21)
    sderivedHispanicOrigin <- substr(s, 23, 23)
    ssex <- substr(s, 24, 24)
    ageDiagnosis <- substr(s, 25, 27)
    birthYear <- substr(s, 28, 31)
    sequenceNumber <- substr(s, 35, 36)
    monthDiagnosis <- substr(s, 37, 38)
    yearDiagnosis <- substr(s, 39, 42)
    primarySite <- substr(s, 43, 46)
    slaterality <- substr(s, 47, 47)
    histology <- substr(s, 48, 51)
    behavior <- substr(s, 52, 52)
    histologicType <- substr(s, 53, 56)
    sbehaviorCode <- substr(s, 57, 57)
    sgrade <- substr(s, 58, 58)
    sdiagnosticConfirmation <- substr(s, 59, 59)
    sreportingSourceType <- substr(s, 60, 60)
    # Converted to a number so the value is written without leading zeros.
    survivalMonths <- as.numeric(substr(s, 301, 304))

    # Data cleaning: replace codes with readable labels.
    locale <- findLocality(sregistrationID)
    race <- findRace(srace)
    derivedHispanicOrigin <- findHispanicOrigin(sderivedHispanicOrigin)
    laterality <- findLaterality(slaterality)
    behaviorCode <- findBehaviorCode(sbehaviorCode)
    grade <- findGrade(sgrade)
    diagnosticConfirmation <- findDiagnosticConfirmation(sdiagnosticConfirmation)
    reportingSourceType <- findReportingType(sreportingSourceType)

    # Convert marital status.
    # NOTE(review): any code not listed above (including blanks) falls back
    # to "single", exactly as before - confirm that this is intended.
    maritalIndex <- match(sms, maritalCodes)
    maritalStatus <- maritalLabels[maritalIndex]
    maritalStatus[is.na(maritalIndex)] <- "single"

    # Convert sex attributes.
    # NOTE(review): everything other than "2" is written as "M" - confirm.
    sex <- ifelse(ssex == "2", "F", "M")

    y <- paste(
      personID,
      locale,
      maritalStatus,
      race,
      derivedHispanicOrigin,
      sex,
      ageDiagnosis,
      birthYear,
      sequenceNumber,
      monthDiagnosis,
      yearDiagnosis,
      primarySite,
      laterality,
      histology,
      behavior,
      histologicType,
      behaviorCode,
      grade,
      diagnosticConfirmation,
      reportingSourceType,
      survivalMonths,
      sep = ",")
    writeLines(y, con = fid2)
  }
  invisible(NULL)
}

# Walk every registry directory and, within it, the first nine cancer-site
# files, appending the converted rows to the matching output CSV.
for (pre in c('yr1992_2015.sj_la_rg_ak/','yr1973_2015.seer9/','yr2000_2015.ca_ky_lo_nj_ga/','yr2005.lo_2nd_half/')) {
  for (k in 1:9) {
    iname <- paste("c:/SEER/SEER_1973_2015_TEXTDATA/incidence/", pre, cancers[k], '.txt', sep = "")
    print(iname)
    oname <- paste("c:/SEER/output/", cancers[k], '.csv', sep = "")
    convertSeerFile(iname, oname)
    print("CSV save job completed!")
  }
}
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/urinary.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/urinary.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/urinary.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/urinary.txt"
## [1] "CSV save job completed!"
## [1] "Finished MongoDB load of CSV files!"