library(RODBC)
library(mongolite)
library(knitr)
library(kableExtra)
library(stringr)
library(dplyr)
library(tidyr)
library(scales)
library(ggplot2)
library(plotly)
library(maps)
library(mapdata)
library(ggrepel) # Not used at the moment, but it gives the option to add labels.
# Translate race code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric. Values are coerced with
#   as.numeric(), so zero-padded strings such as "01" are accepted.
# Returns a character vector with one label per input element. Codes that
#   have no entry (e.g. 9, 18, 19, or text that is not a number) yield NA.
# NOTE(review): the code list looks like the SEER race/ethnicity recode --
#   confirm against the data dictionary before adding or renaming labels.
findRace <- function(svalue) {
  ival <- as.numeric(svalue)

  # Exact code -> label pairs. Codes 15 and 16 intentionally share "Indian".
  raceCodes <- c(1, 2, 3, 4, 5, 6, 7, 8,
                 10, 11, 12, 13, 14,
                 15, 16, 17,
                 98, 99)
  raceLabels <- c("White", "Black", "Native American", "Chinese",
                  "Japanese", "Filipino", "Hawaiian", "Korean",
                  "Vietnamese", "Laotian", "Hmong", "Kampuchean", "Thai",
                  "Indian", "Indian", "Pakistani",
                  "Other", "Unknown")
  race <- raceLabels[match(ival, raceCodes)]

  # Every value strictly between 19 and 98 falls into a single bucket. None
  # of the exact codes above lie in that range, so nothing is overwritten.
  race[!is.na(ival) & ival > 19 & ival < 98] <- "Pacific Islander"
  race
}

# Translate Hispanic-origin code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric (coerced with as.numeric()).
# Returns a character vector with one label per input element; codes without
#   an entry (e.g. 9 or non-numeric text) yield NA.
findHispanicOrigin <- function(svalue) {
  code <- as.numeric(svalue)

  # Codes that map one-to-one onto a label.
  exactCodes <- c(0, 1, 2, 3, 4, 8)
  exactLabels <- c("Not Latino", "Mexican", "Puerto Rican", "Cuban",
                   "South/Central American", "Dominican")
  origin <- exactLabels[match(code, exactCodes)]

  # Anything strictly between 4 and 8 is lumped together.
  origin[!is.na(code) & code > 4 & code < 8] <- "Other"
  origin
}

# Translate registry code(s) into the name of the reporting locality.
#
# svalue: one or more codes, character or numeric. Values are coerced with
#   as.numeric(), so a zero-padded field such as "0000001501" is accepted.
# Returns a character vector with one locality per input element; codes that
#   are not listed yield NA.
findLocality <- function(svalue) {
  # Lookup table keyed by the numeric registry code.
  registries <- c(
    "1501" = "San Francisco",
    "1502" = "Connecticut",
    "1520" = "Detroit",
    "1521" = "Hawaii",
    "1522" = "Iowa",
    "1523" = "New Mexico",
    "1525" = "Seattle",
    "1526" = "Utah",
    "1527" = "Atlanta",
    "1529" = "Alaska",
    "1531" = "San Jose",
    "1535" = "Los Angeles",
    "1537" = "Rural Georgia",
    "1541" = "Greater California",
    "1542" = "Kentucky",
    "1543" = "Louisiana",
    "1544" = "New Jersey",
    "1547" = "Greater Georgia"
  )
  code <- as.numeric(svalue)
  position <- match(code, as.numeric(names(registries)))
  # Drop the table's names so callers get a plain character vector.
  unname(registries[position])
}

# Translate laterality code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric (coerced with as.numeric()).
# Returns a character vector with one label per input element; codes without
#   an entry (e.g. 6, 7, 8) yield NA.
findLaterality <- function(svalue) {
  sideCodes <- c(0, 1, 2, 3, 4, 5, 9)
  sideLabels <- c("Not paired",
                  "Right",
                  "Left",
                  "Unspecified",
                  "Bilateral",
                  "Paired: midline tumor",
                  "Paired: no information")
  sideLabels[match(as.numeric(svalue), sideCodes)]
}

# Translate behavior code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric (coerced with as.numeric()).
# Returns a character vector with one label per input element; any code other
#   than 0, 1, 2 or 3 yields NA.
findBehaviorCode <- function(svalue) {
  behaviorLabels <- c("Benign", "Malignant potential", "Noninvasive", "Malignant")
  # Labels are listed in code order, starting at code 0.
  behaviorLabels[match(as.numeric(svalue), 0:3)]
}

# Translate grade code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric (coerced with as.numeric()).
# Returns a character vector with one label per input element; any code
#   outside 1..9 yields NA.
findGrade <- function(svalue) {
  # Position in this vector equals the code it describes (1 through 9).
  gradeLabels <- c("I", "II", "III", "IV",
                   "T-cell", "B-cell", "Null cell", "NK cell",
                   "undetermined")
  code <- as.numeric(svalue)
  gradeLabels[match(code, seq_along(gradeLabels))]
}

# Translate diagnostic-confirmation code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric (coerced with as.numeric()).
# Returns a character vector with one label per input element; any code
#   outside 1..8 (including 9) yields NA.
findDiagnosticConfirmation <- function(svalue) {
  confirmation <- c(
    "1" = "Positive histology",
    "2" = "Positive cytology",
    "3" = "Positive histology PLUS",
    "4" = "Positive microscopic confirmation",
    "5" = "Positive laboratory test",
    "6" = "Direct visualization",
    "7" = "Radiology",
    "8" = "Clinical diagnosis"
  )
  code <- as.numeric(svalue)
  # Match on the numeric value, then strip the table's names from the result.
  unname(confirmation[match(code, as.numeric(names(confirmation)))])
}

# Translate reporting-source code(s) into a descriptive label.
#
# svalue: one or more codes, character or numeric (coerced with as.numeric()).
# Returns a character vector with one label per input element; any code
#   outside 1..8 yields NA.
findReportingType <- function(svalue) {
  # Labels are listed in code order, starting at code 1.
  sourceLabels <- c("Hospital inpatient",
                    "Radiation treatment",
                    "Laboratory only",
                    "Personal Physician",
                    "Nursing Home",
                    "Autopsy only",
                    "Death certificate only",
                    "Other")
  sourceLabels[match(as.numeric(svalue), 1:8)]
}
#mbreast <- mongo("breast")
#mdigothr <- mongo("digothr")
#mmalegen <- mongo("malegen")
#mfemgen <- mongo("femgent")
#mother <- mongo("other")
#mrespir <- mongo("respir")
#mcolrect <- mongo("colrect")
#mlymyleuk <- mongo("lymyleuk")
#murinary <- mongo("urinary")

#mbreast$drop()
#mdigothr$drop()
#mmalegen$drop()
#mfemgen$drop()
#mother$drop()
#mrespir$drop()
#mcolrect$drop()
#mlymyleuk$drop()
#murinary$drop()
# Base names of the per-site input/output files. The trailing "test" entry was
# only used for debugging and is skipped by the loops that index 1..9.
cancers <- c("breast", "digothr", "malegen", "femgen", "other",
             "respir", "colrect", "lymyleuk", "urinary", "test")

# Create new csv files
# The header row is identical for every file, so build it once up front.
csvHeader <- paste(
  c("personID", "locality", "maritalStatus", "race",
    "derivedHispanicOrigin", "sex", "ageDiagnosis", "birthYear",
    "sequenceNumber", "monthDiagnosis", "yearDiagnosis", "primarySite",
    "laterality", "histology", "behavior", "histologicType",
    "behaviorCode", "grade", "diagnosticConfirmation",
    "reportingSourceType", "survivalMonths"),
  collapse = ","
)

# Writing to a path opens the file in write mode, so any existing file of the
# same name is replaced and ends up containing just the header line.
for (site in cancers[seq_len(9)]) {
  headerPath <- paste0("c:/SEER/output/", site, ".csv")
  writeLines(csvHeader, con = headerPath)
}

# Convert a batch of fixed-width incidence records into CSV rows.
#
# lines: character vector, one raw record per element.
# Returns a character vector of the same length. Each element holds 21
#   comma-separated fields: personID, locality, maritalStatus, race,
#   derivedHispanicOrigin, sex, ageDiagnosis, birthYear, sequenceNumber,
#   monthDiagnosis, yearDiagnosis, primarySite, laterality, histology,
#   behavior, histologicType, behaviorCode, grade, diagnosticConfirmation,
#   reportingSourceType, survivalMonths.
# Relies on the find*() lookup helpers defined earlier in this file, all of
# which accept vectors.
seerLinesToCsv <- function(lines) {
  # Fields copied through verbatim from their fixed column positions.
  personID <- substr(lines, 1, 8)
  ageDiagnosis <- substr(lines, 25, 27)
  birthYear <- substr(lines, 28, 31)
  sequenceNumber <- substr(lines, 35, 36)
  monthDiagnosis <- substr(lines, 37, 38)
  yearDiagnosis <- substr(lines, 39, 42)
  primarySite <- substr(lines, 43, 46)
  histology <- substr(lines, 48, 51)
  behavior <- substr(lines, 52, 52)
  histologicType <- substr(lines, 53, 56)

  # Written out as a number, so padding is not preserved and text that is not
  # a number becomes NA.
  survivalMonths <- as.numeric(substr(lines, 301, 304))

  # Coded fields translated to labels (NA when the code is not recognised).
  locale <- findLocality(substr(lines, 9, 18))
  race <- findRace(substr(lines, 20, 21))
  derivedHispanicOrigin <- findHispanicOrigin(substr(lines, 23, 23))
  laterality <- findLaterality(substr(lines, 47, 47))
  behaviorCode <- findBehaviorCode(substr(lines, 57, 57))
  grade <- findGrade(substr(lines, 58, 58))
  diagnosticConfirmation <- findDiagnosticConfirmation(substr(lines, 59, 59))
  reportingSourceType <- findReportingType(substr(lines, 60, 60))

  ## convert marital status
  # Any code not listed here (including "1" and blanks) is reported as
  # "single".
  maritalCodes <- c("2", "3", "4", "5", "6", "9")
  maritalLabels <- c("married", "separated", "divorced",
                     "widowed", "domestic", "unknown")
  maritalStatus <- maritalLabels[match(substr(lines, 19, 19), maritalCodes)]
  maritalStatus[is.na(maritalStatus)] <- "single"

  ## Convert sex attributes
  # Only code "2" is reported as "F"; every other value is reported as "M".
  sex <- rep("M", length(lines))
  sex[substr(lines, 24, 24) == "2"] <- "F"

  paste(
    personID,
    locale,
    maritalStatus,
    race,
    derivedHispanicOrigin,
    sex,
    ageDiagnosis,
    birthYear,
    sequenceNumber,
    monthDiagnosis,
    yearDiagnosis,
    primarySite,
    laterality,
    histology,
    behavior,
    histologicType,
    behaviorCode,
    grade,
    diagnosticConfirmation,
    reportingSourceType,
    survivalMonths,
    sep = ","
  )
}

# Append every record of one fixed-width input file to a CSV file.
#
# iname: path of the text file to read.
# oname: path of the CSV file to append to (opened in append mode, so rows
#   already present are kept).
# chunkSize: number of lines read and converted per iteration; bounds memory
#   use while avoiding a separate read/convert/write cycle for every record.
# Both connections are closed when the function exits, including when an
# error interrupts the conversion.
appendSeerFile <- function(iname, oname, chunkSize = 10000L) {
  fid <- file(iname, "r")
  on.exit(close(fid), add = TRUE)
  fid2 <- file(oname, "a")
  on.exit(close(fid2), add = TRUE)

  repeat {
    s <- readLines(fid, n = chunkSize, ok = TRUE)
    if (length(s) == 0) break
    writeLines(seerLinesToCsv(s), con = fid2)
  }
  invisible(NULL)
}

# Walk every registry directory and, within it, every per-site file, appending
# the converted records to the matching output CSV.
for (pre in c("yr1992_2015.sj_la_rg_ak/", "yr1973_2015.seer9/",
              "yr2000_2015.ca_ky_lo_nj_ga/", "yr2005.lo_2nd_half/")) {
  # Only the first nine entries are real data sets; the tenth is "test".
  for (site in cancers[seq_len(9)]) {
    iname <- paste0("c:/SEER/SEER_1973_2015_TEXTDATA/incidence/", pre, site, ".txt")
    print(iname)
    oname <- paste0("c:/SEER/output/", site, ".csv")

    appendSeerFile(iname, oname)

    print("CSV save job completed!")
  }
}
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1992_2015.sj_la_rg_ak/urinary.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/urinary.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2000_2015.ca_ky_lo_nj_ga/urinary.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/breast.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/digothr.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/malegen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/femgen.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/other.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/respir.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/colrect.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/lymyleuk.txt"
## [1] "CSV save job completed!"
## [1] "c:/SEER/SEER_1973_2015_TEXTDATA/incidence/yr2005.lo_2nd_half/urinary.txt"
## [1] "CSV save job completed!"
# Open up each CSV file and store in mongodb

#dfbreast = read.csv("c:/SEER/output/breast.csv")
#mbreast$insert(dfbreast)

#dfdigothr = read.csv("c:/SEER/output/digothr.csv")
#mdigothr$insert(dfdigothr)

#dfmalegen = read.csv("c:/SEER/output/malegen.csv")
#mmalegen$insert(dfmalegen)

#dffemgen = read.csv("c:/SEER/output/femgen.csv")
#mfemgen$insert(dffemgen)

#dfother = read.csv("c:/SEER/output/other.csv")
#mother$insert(dfother)

#dfrespir = read.csv("c:/SEER/output/respir.csv")
#mrespir$insert(dfrespir)

#dfcolrect = read.csv("c:/SEER/output/colrect.csv")
#mcolrect$insert(dfcolrect)

#dflymyleuk = read.csv("c:/SEER/output/lymyleuk.csv")
#mlymyleuk$insert(dflymyleuk)

#dfurinary = read.csv("c:/SEER/output/urinary.csv")
#murinary$insert(dfurinary)

# NOTE(review): every read.csv()/$insert() statement above is commented out, so
# this message is printed even though nothing is loaded into MongoDB here.
print("Finished MongoDB load of CSV files!")
## [1] "Finished MongoDB load of CSV files!"