Contact with the DataBase

The database is located at the 138.100.82.241 server:

mydb = dbConnect(MySQL(), user='adif', password='adif', 
                 dbname='ADIF', host='138.100.82.241')

Select Lines and Sublines

You can also embed plots, for example:

rs = dbSendQuery(mydb, "select * from line")
lindat = fetch(rs, n=-1)
lindat[,"name"]=unlist(lapply(lindat[,"name"],iconv,"latin1","UTF-8"))
rs = dbSendQuery(mydb, "select * from subline")
sublindat = fetch(rs, n=-1)
sublindat[,"name"]=unlist(lapply(sublindat[,"name"],iconv,"latin1","UTF-8"))
rs = dbSendQuery(mydb, "select * from contract_type")
ctypedat = fetch(rs, n=-1)
ctypedat[,"name"]=unlist(lapply(ctypedat[,"name"],iconv,"latin1","UTF-8"))
rs = dbSendQuery(mydb, "select * from contract")
contdat = fetch(rs, n=-1)
contdat[,"title"]=unlist(lapply(contdat[,"title"],iconv,"latin1","UTF-8"))
rs = dbSendQuery(mydb, "select * from line_subline_rel")
lsubrel = fetch(rs, n=-1)
cadena = paste0("SELECT a.lineid as lineid, a.sublineid as sublineid,",
       "c.contract_id as contractid ",
       "from line_subline_rel a, ",
       "     contract_subline_rel c ",
       "where a.sublineid = c.sublineid " ,
       "order by a.lineid, a.sublineid ")
rs = dbSendQuery(mydb, cadena)
sublinrel = fetch(rs, n=-1) 
rs = dbSendQuery(mydb,"select * from contract_contract_type_rel")
ctyperel  = fetch(rs, n=-1)
#
linesh = sublinrel[,c("lineid","sublineid","contractid")]
linesh[,"LineName"] = lindat[linesh[,"lineid"],"name"]
linesh[,"SubLineName"]= sublindat[linesh[,"sublineid"],"name"]
linesh[,"ContractTitle"]= contdat[linesh[,"contractid"],"title"]
linesh[,"ContractStart"]=as.Date(contdat[linesh[,"contractid"],
                                         "contract_date"])
linesh[,"ContractEnd"]=as.Date(contdat[linesh[,"contractid"],
                                      "date_final_certificate_received"])
linesh[,"ContractType"]= ctypedat[ctyperel[linesh[,"contractid"],
                                  "contract_type_id"],"name"]
linesh[,"ContractEffectiveCost"]= contdat[linesh[,"contractid"],
                                           "effective_cost"]
linesh[,"ContractAwardedAmount"]= contdat[linesh[,"contractid"],
                                           "awarded_amount"]
linesh[,"ContractStatus"]= contdat[linesh[,"contractid"],"status2"]
linesh[,"ContractEstDuration"]= contdat[linesh[,"contractid"],
                                  "estimated_contract_duration"]
linesh[,"ContractCostOverrun"]= contdat[linesh[,"contractid"],
                                           "cost_overrun"]
linesh[,"ContractTimeOverrun"]= contdat[linesh[,"contractid"],
                                           "schedule_overrun"]
#

Incident analysis

We are recovering the Incidents and Certificatios per contract.

rs = dbSendQuery(mydb, "select * from incidents")
incdat = fetch(rs, n=-1)
incdat[,"incident_type"]=unlist(lapply(incdat[,"incident_type"],
                                       iconv,"latin1","UTF-8"))
incdat[,"description"]=unlist(lapply(incdat[,"description"],
                                       iconv,"latin1","UTF-8"))
incdat[,"status"]=unlist(lapply(incdat[,"status"],
                                       iconv,"latin1","UTF-8"))
incdat[,"date_approval"]=as.Date(incdat[,"date_approval"])
#
rs = dbSendQuery(mydb, "select * from certifications")
cerdat = fetch(rs, n=-1)
cerdat[,"certificate_type"]=unlist(lapply(cerdat[,"certificate_type"],
                                       iconv,"latin1","UTF-8"))
cerdat[,"cert_date"]=as.Date(cerdat[,"cert_date"])
#
for (i in 1:nrow(linesh)) {
  cid = linesh[i,"contractid"]
  linesh[i,"ContractNumCert"]= length(which(cerdat$contract_id==i))
  linesh[i,"ContractNumInc"]= length(which(incdat$contract_id==i))  
}
dbDisconnect(mydb)
## [1] TRUE
rm(rs)
detach("package:RMySQL", unload=TRUE)
#

Main statistical figures

Statistics at level of subline

cons1 = paste0("select lineid, sublineid, min(ContractStart) as MIN__Date, ",
                       "max(ContractEnd) as MAX__Date, ",
                       "sum(ContractEffectiveCost)/1000000. as PlanCost,",
                       "sum(ContractAwardedAmount)/1000000. as RealCost, ",
                       "sum(ContractNumCert) as NumCert, ",
                       "sum(ContractNumInc) as NumInc, ", 
                       "count(ContractStart) as NumContracts ",
                       "from linesh group by lineid, sublineid ",
                       "order by lineid, sublineid")
dcons = sqldf(cons1,method = "name__class",stringsAsFactors=FALSE)

Function preparation for data extraction

extractsubline = function(idline,idsubline) {
    cons2 = paste0("select contractid,ContractStart as Start__Date, ",
               "ContractEnd as End__Date, ",
               "ContractType, ContractAwardedAmount,ContractEffectiveCost, ",
               "ContractTitle from linesh where lineid=",idline," ",
               "and sublineid=",idsubline," ",
               "and strftime('%Y',ContractStart) > 1000 ",
               "order by ContractStart asc")
    dcons = sqldf(cons2,method = "name__class",stringsAsFactors=FALSE)
    return(dcons)
}
#
HSL=list()
for (i in 1:nrow(lsubrel)) {
    line = lsubrel[i,"lineid"]
    naml = lindat[line,"name"]
    if (! naml %in% ls(HSL)) {
      HSL[[naml]] = list()
    }
    subl = lsubrel[i,"sublineid"]
    snml = sublindat[subl,"name"]
    if (! snml %in% ls(HSL[[naml]])) {
      dsl  = extractsubline(line,subl) 
      HSL[[naml]][[snml]] = dsl    
    } else {
      cat(paste0("Error: ",snml," was already defined !"))
    }
}
#

Text mining. Se extraen términos significativos

#
findex = function(x,idx){
  j = which(x==TRUE)
  if (length(j) > 1 ) {
    return (idx[j[1]])
  } else {
    if (length(j) == 1 ) {
      return(idx[j])
    } else {
      return(NA)
    }
  }
}
finditm = function(x,nextel) {
  res = NA
  res = try(strsplit(strsplit(x,nextel)[[1]][2],'[.,]')[[1]][1],TRUE)
  return(res)
}
refs=data.frame(keys=c("REDACCION","GEOTÉCNICOS","GEOTECNICOS",
                       "COORDINACION",
                       "RESTITUCION","EXPROPIACION","CONSTRUCCION",
                       "FUNCIONAL","SEGURIDAD Y SALUD",
                       "DIRECCION FACULTATIVA","COMPLEMENTARIO",
                       "SUMINISTRO Y TRANSPORTE DE BALASTO",
                       "ACOMETIDA","REDACCIÓN","MODIFICACIONES",
                       "REPOSICION","REPOSICIÓN",
                       "EJECUCIÓN","EJECUCION","APOYO AL SEGUIMIENTO",
                       "CONSULTORÍA","CYA.","INSTALACIONES",
                       "APOYO A LA DIRECCI","ASISTENCIA TECNICA"),
                cod=c("RP","EG","EG",
                      "C",
                      "REST","EXP","CONS",
                      "DF","SSL",
                      "DF","COMPL",
                      "BALAS",
                      "AC","RP","MOD",
                      "REPO","REPO",
                      "CONS","CONS","DF",
                      "CyA","CyA","INST",                      
                      "DP","DF"),
              stringsAsFactors = FALSE  
            )
#
for (i in ls(HSL)) {
  for (j in ls(HSL[[i]])) {
    cdt = HSL[[i]][[j]]
    if ( nrow(cdt) > 0) {
      if( nrow(cdt)==1) {
        mat = t(as.data.frame(sapply(refs$keys,grepl,cdt$ContractTitle)))
      }else {
        mat = sapply(refs$keys,grepl,cdt$ContractTitle)
      }
      cdt[,"Type"] = apply(mat,1,findex,as.character(refs$cod))
      cdt[,"Tramo"]= sapply(cdt$ContractTitle,finditm,
                          '[[:blank:]|[:punct:]]TRAMO[[:blank:]|[:punct:]]')
      cdt[,"SubTramo"]= sapply(cdt$ContractTitle,finditm,
                       '[[:blank:]|[:punct:]]SUBTRAMO[[:blank:]|[:punct:]]')
      HSL[[i]][[j]] = cdt
    }
  }
}
#
save(HSL,file="HSL.RData")
rm(cdt)
#

Date analysis

#
DateRef = data.frame(lineid=NA, sublineid = NA,
                     Start=as.Date("2018-01-01"),
                     End=as.Date("2018-01-01"),
                     FirstConsStart=as.Date("2018-01-01"),
                     FirstConsEnd=as.Date("2018-01-01"),
                     NumModConsAntesPrimCons = NA,
                     NumConCons = NA,
                     FirstBalasStart = as.Date("2018-01-01"),
                     NumConMod = NA,
                 stringsAsFactors=FALSE)
k=0
for (i in ls(HSL)) {
  for (j in ls(HSL[[i]])) {
    k=k+1
    cc = HSL[[i]][[j]]
    if ( nrow(cc) > 0) {
      DateRef[k,"lineid"] = lindat[which(lindat[,"name"]==i),"id"]
      DateRef[k,"sublineid"] = sublindat[which(sublindat[,"name"]==j),"id"]
      DateRef[k,"Start"] = as.Date(min(cc[,"Start"],na.rm=TRUE))
      DateRef[k,"End"] =   as.Date(max(cc[,"End"],na.rm=TRUE))
      ss=subset(cc,Type=="CONS")
      ss = ss[order(ss$Start),]
      DateRef[k,"FirstConsStart"] = as.Date(ss[1,"Start"])
      DateRef[k,"FirstConsEnd"]   = as.Date(ss[1,"End"])  
      DateRef[k,"NumConCons"]   = nrow(ss)
      ss=subset(cc,Type=="BALAS")
      ss = ss[order(ss$Start),] 
      DateRef[k,"FirstBalasStart"] = as.Date(ss[1,"Start"]) 
      ss=subset(cc,Type=="MOD")
      ss = ss[order(ss$Start),] 
      DateRef[k,"NumConMod"] = nrow(ss)
      DateRef[k,"NumModConsAntesPrimCons"] = length(which(ss$Start < 
                                             DateRef[k,"FirstConsStart"]))
      rm(cc,ss)
    } else {
      DateRef[k,"Start"] = NA
      DateRef[k,"End"] =   NA
      DateRef[k,"FirstConsStart"] = NA
      DateRef[k,"FirstConsEnd"]   = NA
      DateRef[k,"NumConCons"]   = NA
      DateRef[k,"FirstBalasStart"] = NA
      DateRef[k,"NumConMod"] = NA
      DateRef[k,"NumModConsAntesPrimCons"] = NA
    }
  }
}
#
dcons = merge(dcons,DateRef,by=c("lineid","sublineid"))
rm(cadena,cid,cons1,i,j,k,mat,mydb,line,naml,snml,subl,DateRef)
#

Now, pending to check the created features testing FCM relationships, or, maybe, to still create additional features.

#
# Created Objects:
#
#   lindat    : Stores name of lines
#   sublindat : Stores name of sublines
#   incdat    : Stores Incident events
#   lsubrel   : Relationshp Line Sublines
#   linesh    : Information about Contracts per subline
#   sublinrel : Relationshp Line Subline ContractID
#   cerdat    : Certification info related to contractID
#   contdat   : Contract info
#   ctypedat  : Typology of Contracts
#   ctyperel  : Type of used contracts
#   refs      : Type of keys for text-mining
#   dcons     : Statistical data at subline level
#   HSL       : Details of contracts per line - subline

save(lindat,sublindat,incdat,lsubrel,linesh,sublinrel,
     cerdat,contdat,ctypedat,ctyperel,refs,dcons,HSL,
     file="FeaturesADIF.RData")