Scraping ATIH GHM manual volume 2
    Copyright (C) 2014  Antoine Filipovic Pierucci

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see http://www.gnu.org/licenses.

Sélectionner le texte du manuel dans un lecteur PDF (testé avec Adobe Acrobat Reader 11.0.5 et Sumatra PDF 2.5.2), le coller dans un éditeur de texte (Notepad par exemple), puis sauvegarder au format txt.

# Read the saved text of the manual into a character vector. With scan()'s
# default separator the file is split into whitespace-delimited tokens.
textGuide <- scan(
    file = "C:\\Documents and Settings\\afilipovicpierucci\\Bureau\\ccam.txt",
    what = character()
)

Détecter la position des actes, des GHM et des listes d’actes.

# Regular expressions for the three kinds of codes searched for in the text.
# Act code: four upper-case letters followed by three digits.
patternAct <- "[[:upper:]]{4}[[:digit:]]{3}"
# GHM code: two digits, one letter among C/K/M/Z/H, two digits and an
# optional one-character suffix.
patternGHM <- "[[:digit:]]{2}[CKMZH]{1}[[:digit:]]{2}[1234ABCDZTEJ]?"
# Act-list identifier: "A-" followed by three digits.
patternListes <- "A-[[:digit:]]{3}"

# Positions in textGuide of the elements that contain each kind of code.
indexAct <- grep(patternAct, textGuide)
indexGHM <- grep(patternGHM, textGuide)
indexListes <- grep(patternListes, textGuide)

On nettoie les données : chaque élément repéré est réduit au seul code (acte, GHM ou liste) qu’il contient.

# Extract the first match of `pattern` from every element of `x`.
#
# Arguments:
#   x        character vector to search.
#   pattern  regular expression handed to regexpr().
#
# Returns a character vector of the same length as `x` holding, for each
# element, the text of its first match.
#
# Stops with an error when `x` is empty or when any element of `x` has no
# match. regmatches() silently drops unmatched elements, which would make the
# result shorter than `x` and misalign a replacement such as
# `v[idx] <- formatRegexpr(v[idx], pattern)`.
formatRegexpr <- function(x, pattern) {
    if (length(x) == 0L) {
        stop("no element to extract '", pattern, "' from", call. = FALSE)
    }

    found <- regexpr(pattern, x)
    # regexpr() yields -1 for "no match" and NA for NA input.
    unmatched <- is.na(found) | found == -1L
    if (any(unmatched)) {
        stop("no match for '", pattern, "' in: ",
             paste(x[unmatched], collapse = ", "),
             call. = FALSE)
    }

    regmatches(x, found)
}


# Overwrite every located element with the bare code it contains, one kind of
# code at a time (acts, then GHM, then act lists).
textGuide <- replace(textGuide, indexAct,
                     formatRegexpr(textGuide[indexAct], patternAct))
textGuide <- replace(textGuide, indexGHM,
                     formatRegexpr(textGuide[indexGHM], patternGHM))
textGuide <- replace(textGuide, indexListes,
                     formatRegexpr(textGuide[indexListes], patternListes))

On crée un vecteur avec les valeurs A, G et L. Les noms du vecteur correspondent à l’index de la position de la chaîne de caractères dans le fichier.

# One label per located element: "A" for acts, "G" for GHM, "L" for act
# lists. The names of each vector are the positions of the elements in
# textGuide.
vecA <- setNames(rep("A", times = length(indexAct)), indexAct)
vecG <- setNames(rep("G", times = length(indexGHM)), indexGHM)
vecL <- setNames(rep("L", times = length(indexListes)), indexListes)

# Merge the three label vectors and put them back in reading order, i.e.
# sorted by their position in textGuide.
res <- c(vecA, vecG, vecL)
res <- res[order(as.integer(names(res)))]

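Une difficulté est que les listes d’actes apparaissent en deux endroits :

- à la suite d’un GHM, comme simple référence à une liste qui le concerne ;
- en tête d’une liste, comme titre immédiatement suivi des actes qu’elle contient.

En lisant le fichier il faut différencier ces deux situations : dans le code ci-dessous, un nom de liste immédiatement suivi d’un acte est considéré comme un titre et recodé « LT ».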

# Copy of res in which every "L" immediately followed by an "A" is relabelled
# "LT"; all other labels are kept as they are.
res_2 <- res

# Compare each label (all but the last) with its successor (all but the
# first) in one vectorised step. head()/tail() with a negative n return empty
# vectors when res has fewer than two elements, so an empty or one-element
# res is handled without error. which() is needed because a logical index
# shorter than res_2 would be recycled.
res_2[which(head(res, -1L) == "L" & tail(res, -1L) == "A")] <- "LT"

On fait une liste avec, pour chaque GHM, les noms des listes qui le concernent.

# Named list: one entry per GHM code, holding the act-list identifiers
# labelled "L" that are met after that GHM and before the next one.
listGHM <- list()

for (i in seq_along(res_2)) {
    # Labels other than "G" and "L" match no branch and are skipped.
    switch(res_2[[i]],
           G = {
               # A GHM code becomes the current key.
               nomGHM <- textGuide[as.integer(names(res_2)[i])]
           },
           L = {
               # A list reference is appended to the current GHM.
               listGHM[[nomGHM]] <- c(listGHM[[nomGHM]],
                                      textGuide[as.integer(names(res_2)[i])])
           })
}

head(listGHM)
## $`01C03`
## [1] "A-002"
## 
## $`01C04`
## [1] "A-002"
## 
## $`01C05`
## [1] "A-003"
## 
## $`01C06`
## [1] "A-004"
## 
## $`01C08`
## [1] "A-006"
## 
## $`01C09`
## [1] "A-195"

On fait une liste avec, pour chaque liste d’actes, les actes qui la composent.

# Named list: one entry per act-list title (label "LT"), holding the act
# codes met after that title and before the next one.
listAct <- list()

for (i in seq_along(res_2)) {
    # Labels other than "LT" and "A" match no branch and are skipped.
    switch(res_2[[i]],
           LT = {
               # A list title becomes the current key.
               nomAct <- textGuide[as.integer(names(res_2)[i])]
           },
           A = {
               # An act code is appended to the current list.
               listAct[[nomAct]] <- c(listAct[[nomAct]],
                                      textGuide[as.integer(names(res_2)[i])])
           })
}

head(listAct)
## $`A-002`
##   [1] "AAFA001" "AAFA002" "AAFA003" "AAFA004" "AAFA005" "AAFA006" "AAFA007"
##   [8] "AAFA008" "AAGA900" "AAJA001" "AAJA002" "AAJA003" "AAJA004" "AAJA005"
##  [15] "AAJA006" "AAJH001" "AAJH002" "AAJH003" "AAJH004" "AALA002" "AALB002"
##  [22] "AAPA900" "ABCA001" "ABCA002" "ABCA003" "ABCA004" "ABCB001" "ABCC001"
##  [29] "ABFA001" "ABFA002" "ABFA003" "ABFA004" "ABFA005" "ABFA006" "ABFA007"
##  [36] "ABFA008" "ABFA009" "ABFA010" "ABFC001" "ABFC002" "ABGA001" "ABGA003"
##  [43] "ABJA001" "ABJA002" "ABJA003" "ABJA004" "ABJA005" "ABJA006" "ABJA007"
##  [50] "ABJA008" "ABJC900" "ABLA001" "ABLB001" "ABLB002" "ABLB003" "ABMA003"
##  [57] "ABSA001" "ABSA002" "ABSA003" "ABSA004" "ABSA005" "ABSA006" "ABSA007"
##  [64] "ABSA008" "ABSA009" "ABSA010" "ABSA011" "ABSA012" "ACFA001" "ACFA002"
##  [71] "ACFA003" "ACFA004" "ACFA005" "ACFA006" "ACFA007" "ACFA008" "ACFA009"
##  [78] "ACFA010" "ACFA011" "ACFA012" "ACFA013" "ACFA014" "ACFA015" "ACFA016"
##  [85] "ACFA018" "ACFA019" "ACFA020" "ACFA022" "ACFA023" "ACFA024" "ACFA025"
##  [92] "ACFA026" "ACFA027" "ACFA028" "ACFA029" "ACHA001" "ACHA002" "ACHA003"
##  [99] "ACHB001" "ACHH001" "ACHJ001" "ACPA001" "ACQC001" "ADCA001" "ADCA003"
## [106] "ADCA006" "ADCA007" "ADEA001" "ADEA003" "ADEA004" "ADEA005" "ADFA005"
## [113] "ADPA001" "ADPA005" "ADPA007" "ADPA009" "ADPA010" "ADPA011" "ADPA012"
## [120] "ADPA013" "ADPA017" "ADPC900" "EABA001" "EACA001" "EACA002" "EACA003"
## [127] "EACA004" "EACA005" "EACA006" "EACA007" "EAFA001" "EAFA002" "EAFA003"
## [134] "EAFA004" "EAFA005" "EAFA006" "EAFA007" "EAFA008" "EAFA009" "EASA001"
## [141] "EASA002" "EBCA010" "EBCA011" "GAFA010" "GBBA001" "GBFA004" "GBFA006"
## [148] "KAFA001" "KAFA002" "KAFE900" "KANB001" "KBFA001" "LACA005" "LACA009"
## [155] "LACA012" "LACA016" "LACA017" "LACA020" "LAEA002" "LAEA004" "LAEA006"
## [162] "LAEA009" "LAFA003" "LAFA004" "LAFA005" "LAFA011" "LAFA900" "LAGA007"
## [169] "LAMA001" "LAMA002" "LAMA006" "LAMA009" "LAMA011" "LAPA005" "LAPA006"
## [176] "LAPA008" "LAPA016" "LARA003" "LARA004"
## 
## $`A-003`
##   [1] "AEFA001" "AEFA002" "AEFA003" "AEGA001" "AEJA001" "AEJA002" "AEJA003"
##   [8] "AEJA004" "AEJA005" "AENA001" "AEPA001" "AEPA002" "AEPA004" "AEPB001"
##  [15] "AFCA001" "AFCA002" "AFCA003" "AFCA004" "AFFA001" "AFFA002" "AFFA003"
##  [22] "AFFA004" "AFFA005" "AFFA006" "AFFA007" "AFFA008" "AFFA009" "AFFA010"
##  [29] "AFFA011" "AFGA001" "AFGA002" "AFJA001" "AFJA002" "AFJA003" "AFJA004"
##  [36] "AFJA005" "AFKB001" "AFKB002" "AFLA003" "AFLB008" "AFPA001" "AFSA001"
##  [43] "AFSA002" "AFSA003" "AHFA003" "AHFA007" "AHFA012" "AHPA025" "ENCA001"
##  [50] "ENFA007" "LDAA001" "LDAA002" "LDCA001" "LDCA002" "LDCA003" "LDCA004"
##  [57] "LDCA005" "LDCA006" "LDCA007" "LDCA008" "LDCA009" "LDCA010" "LDCA011"
##  [64] "LDCA012" "LDCA013" "LDDA001" "LDFA001" "LDFA002" "LDFA003" "LDFA004"
##  [71] "LDFA005" "LDFA006" "LDFA007" "LDFA008" "LDFA009" "LDFA010" "LDFA011"
##  [78] "LDFA012" "LDGA001" "LDGA002" "LDHA001" "LDHA002" "LDKA900" "LDPA001"
##  [85] "LDPA002" "LDPA003" "LDPA004" "LDPA005" "LDPA006" "LDPA007" "LDPA008"
##  [92] "LDPA009" "LDPA010" "LECA001" "LECA002" "LECA003" "LECA004" "LECA005"
##  [99] "LECA006" "LECC001" "LEFA001" "LEFA002" "LEFA003" "LEFA004" "LEFA005"
## [106] "LEFA006" "LEFA007" "LEFA008" "LEFA009" "LEFA010" "LEFA011" "LEFA012"
## [113] "LEFA013" "LEFA014" "LEFC002" "LEGA001" "LEGA002" "LEHA001" "LEHA002"
## [120] "LEHA003" "LEHC001" "LEMA001" "LEMA002" "LEMA003" "LEMA004" "LEPA001"
## [127] "LEPA002" "LEPA003" "LEPA004" "LEPA005" "LEPA006" "LEPA007" "LEPA008"
## [134] "LEPA009" "LFAA001" "LFAA002" "LFCA001" "LFCA002" "LFCA003" "LFCA004"
## [141] "LFCA005" "LFCC001" "LFDA001" "LFDA002" "LFDA003" "LFDA004" "LFDA005"
## [148] "LFDA006" "LFDA007" "LFDA008" "LFDA009" "LFDA010" "LFDA011" "LFDA012"
## [155] "LFDA013" "LFDA014" "LFFA001" "LFFA002" "LFFA003" "LFFA004" "LFFA005"
## [162] "LFFA006" "LFFA007" "LFFA008" "LFFA009" "LFFA010" "LFFA011" "LFFA012"
## [169] "LFFA013" "LFFA014" "LFFC002" "LFGA001" "LFHA001" "LFHC001" "LFKA001"
## [176] "LFMA001" "LFPA001" "LFPA002" "LFPA003" "LGCA001" "LGFA001" "LGFA002"
## [183] "LGFA003" "LGFA004" "LGFA006" "LHCA001" "LHCA002" "LHCA010" "LHCA011"
## [190] "LHCA016" "LHDA001" "LHDA002" "LHFA001" "LHFA003" "LHFA013" "LHFA016"
## [197] "LHFA019" "LHFA024" "LHFA025" "LHFA027" "LHFA028" "LHFA029" "LHFA031"
## [204] "LHGA004" "LHGA006" "LHGA007" "LHHA006" "LHHA007" "LHMA003" "LHMA004"
## [211] "LHMA006" "LHMA007" "LHMA008" "LHMA011" "LHMA013" "LHMA014" "LHMA015"
## [218] "LHMA016" "LHPA003" "LHPA004" "LHPA006" "LHPA010" "QZKA001" "QZKA007"
## 
## $`A-004`
##  [1] "DGCA032" "EBAA002" "EBCA001" "EBCA002" "EBCA003" "EBCA004" "EBCA005"
##  [8] "EBCA006" "EBCA007" "EBCA008" "EBCA009" "EBCA012" "EBCA013" "EBCA014"
## [15] "EBCA015" "EBCA016" "EBCA017" "EBEA001" "EBEA002" "EBEA003" "EBEA004"
## [22] "EBEA005" "EBFA001" "EBFA002" "EBFA003" "EBFA004" "EBFA005" "EBFA006"
## [29] "EBFA007" "EBFA008" "EBFA009" "EBFA010" "EBFA011" "EBFA012" "EBFA013"
## [36] "EBFA014" "EBFA015" "EBFA016" "EBFA017" "EBFA018" "EBFA019" "EBFA020"
## [43] "EBFA021" "EBKA001" "EBKA002" "EBKA003" "EBKA004" "EBPA003" "EBSA002"
## [50] "EBSA003" "EBSA005" "EBSA006" "EBSA007" "EBSA010" "ECCA007" "ECCA009"
## 
## $`A-005`
## [1] "AHPA009" "AHPA028" "AHPC001"
## 
## $`A-006`
##   [1] "AANB001" "ABGA002" "ABMA002" "ADCA002" "ADCA004" "ADCA005" "ADEA002"
##   [8] "ADFA002" "ADFA003" "ADFA004" "ADFA006" "ADGA001" "ADPA003" "ADPA004"
##  [15] "ADPA008" "ADPA015" "ADPA016" "ADPA020" "ADPA021" "ADPA023" "AHCA001"
##  [22] "AHCA002" "AHCA003" "AHCA004" "AHCA005" "AHCA006" "AHCA007" "AHCA008"
##  [29] "AHCA009" "AHCA010" "AHCA011" "AHCA012" "AHCA013" "AHCA014" "AHCA015"
##  [36] "AHCA016" "AHCA017" "AHCA018" "AHCA019" "AHCA020" "AHCA021" "AHCA022"
##  [43] "AHCA023" "AHEA001" "AHEA002" "AHEA003" "AHEA004" "AHEA005" "AHEA006"
##  [50] "AHEA007" "AHEA008" "AHEA009" "AHEA010" "AHEA011" "AHEA012" "AHEA013"
##  [57] "AHEA014" "AHEA015" "AHEA016" "AHEA017" "AHEA018" "AHFA001" "AHFA002"
##  [64] "AHFA004" "AHFA005" "AHFA008" "AHFA010" "AHFA011" "AHGA001" "AHGA002"
##  [71] "AHHA001" "AHHA002" "AHPA002" "AHPA003" "AHPA004" "AHPA005" "AHPA006"
##  [78] "AHPA007" "AHPA015" "AHPA018" "AHPA020" "AHPA026" "AJFA001" "AJFA002"
##  [85] "AJFA003" "AJFC001" "AJFC002" "AJNA001" "AJNC001" "AJNC002" "AZGA001"
##  [92] "BAFA001" "EBLA001" "EBSA012" "ECMA001" "EFFA001" "EPFA001" "EPFA004"
##  [99] "FBFA001" "FBFA003" "FBFA900" "FBFC900" "GHFA002" "GHFA003" "GHFA004"
## [106] "HABA001" "HAFA030" "HAMA015" "HCFA010" "HFCA002" "HFCC002" "HGLA001"
## [113] "HPMA002" "HPMA003" "JFFA006" "JFFA010" "JFFA021" "JFFC002" "KFFA001"
## [120] "LACA004" "LACA014" "LACA015" "LACA019" "LAHA001" "LANC001" "LAPA001"
## [127] "LAPA015" "LCFA001" "LCFA002" "LCMA001" "LCPA002" "LGFA005" "LJFA002"
## [134] "LJFA003" "LJFA006" "LJFA007" "LJFA010" "MDPA001" "MDPA004" "MEDA001"
## [141] "MEPA001" "MEPC001" "MFDA001" "MFPA001" "MFPA002" "MFPA003" "MFPC001"
## [148] "MGCA001" "MGDA002" "MGFA002" "MGFA003" "MGFA005" "MGMA004" "MGPA001"
## [155] "MHFA001" "MHFA002" "MHFA003" "MHPA001" "MHPA002" "MHPA003" "MHPA004"
## [162] "MJAA001" "MJAA002" "MJCA003" "MJCA005" "MJCA006" "MJCA007" "MJCA008"
## [169] "MJCA010" "MJDA001" "MJEA001" "MJEA002" "MJEA003" "MJEA005" "MJEA007"
## [176] "MJEA008" "MJEA009" "MJEA011" "MJEA012" "MJEA013" "MJEA014" "MJEA015"
## [183] "MJEA016" "MJEA018" "MJEA020" "MJEA021" "MJFA002" "MJFA004" "MJFA006"
## [190] "MJFA007" "MJFA010" "MJFA012" "MJFA013" "MJFA014" "MJFA015" "MJFA016"
## [197] "MJFA018" "MJMA006" "MJMA014" "MJMA015" "MJMA016" "MJPA002" "MJPA003"
## [204] "MJPA004" "MJPA005" "MJPA006" "MJPA007" "MJPA008" "MJPA011" "MJPA012"
## [211] "MJPB001" "MZMA001" "MZMA004" "NBPA002" "NBPA008" "NBPA012" "NBPA015"
## [218] "NCPA004" "NCPA012" "NDPA002" "NDPA005" "NDPA007" "NDPA008" "NDPA009"
## [225] "NDPA011" "NDPA013" "NDPA014" "NEPA001" "NFPA001" "NFPA003" "NFPC002"
## [232] "NGDA003" "NGPA001" "NGPA002" "NGPA003" "NGPC001" "NHDA003" "NHDA009"
## [239] "NHDA010" "NHPA001" "NHPA002" "NHPA004" "NHPA005" "NHPA006" "NJAA001"
## [246] "NJAA002" "NJAA003" "NJAA004" "NJBA001" "NJBA002" "NJEA001" "NJEA004"
## [253] "NJEA006" "NJEA008" "NJEA009" "NJEA010" "NJEA011" "NJEA012" "NJFA005"
## [260] "NJMA001" "NJMA002" "NJMA003" "NJPA001" "NJPA002" "NJPA003" "NJPA004"
## [267] "NJPA005" "NJPA006" "NJPA007" "NJPA008" "NJPA009" "NJPA010" "NJPA011"
## [274] "NJPA012" "NJPA013" "NJPA015" "NJPA016" "NJPA017" "NJPA018" "NJPA019"
## [281] "NJPA020" "NJPA021" "NJPA022" "NJPA023" "NJPA024" "NJPA025" "NJPA026"
## [288] "NJPA027" "NJPA028" "NJPA029" "NJPA030" "NJPA031" "NJPA032" "NJPA033"
## [295] "NJPA034" "NJPA035" "NJPA036" "PCDA001" "PCEA001" "PCEA003" "PCEA004"
## [302] "PCMA001" "PCPA001" "PCPA002" "PCPA003" "PCPA005" "PZMA001" "PZMA003"
## [309] "PZMA004" "PZMA005" "QAEA014" "QAEA015" "QAMA002" "QAMA003" "QAMA004"
## [316] "QAMA005" "QAMA008" "QAMA012" "QAMA013" "QAMA015" "QZEA005" "QZEA006"
## [323] "QZEA019" "QZEA020" "QZEA024" "QZEA026" "QZEA027" "QZEA031" "QZEA032"
## [330] "QZEA036" "QZEA039" "QZMA001" "QZMA003" "QZMA004" "QZMA005" "QZMA006"
## [337] "QZMA007" "QZMA009" "QZPA008" "ZAQA001"
## 
## $`A-195`
## [1] "AAKA001" "AALA004" "AALA900" "AALB001"

On fait une table avec en face de chaque acte le nom des listes auxquelles il appartient.

# Long-format table: one row per (act, act list) pair. Each list name is
# repeated as many times as the list has acts, so both columns line up.
tabAct <- data.frame(
    cdc_act = unlist(listAct, use.names = FALSE),
    lis_act = rep(names(listAct), times = vapply(listAct, length, integer(1))),
    stringsAsFactors = FALSE
)
head(tabAct)
##   cdc_act lis_act
## 1 AAFA001   A-002
## 2 AAFA002   A-002
## 3 AAFA003   A-002
## 4 AAFA004   A-002
## 5 AAFA005   A-002
## 6 AAFA006   A-002

On fait une table avec en face de chaque GHM les listes qui le concernent.

# Long-format table: one row per (act list, GHM) pair. Each GHM code is
# repeated as many times as it has lists, so both columns line up.
tabGHM <- data.frame(
    lis_act = unlist(listGHM, use.names = FALSE),
    grg_ghm = rep(names(listGHM), times = vapply(listGHM, length, integer(1))),
    stringsAsFactors = FALSE
)
head(tabGHM)
##   lis_act grg_ghm
## 1   A-002   01C03
## 2   A-002   01C04
## 3   A-003   01C05
## 4   A-004   01C06
## 5   A-006   01C08
## 6   A-195   01C09

En fusionnant les deux on a toutes les combinaisons GHM / Liste d’acte / Actes.

# Join the two tables on the act-list identifier, their only shared column.
# all = TRUE also keeps lists that appear in only one of the two tables.
tab <- merge(x = tabGHM, y = tabAct, by = "lis_act", all = TRUE)
head(tab)
##   lis_act grg_ghm cdc_act
## 1   A-002   01C11 AAFA007
## 2   A-002   01C11 AAFA008
## 3   A-002   01C11 AAJA001
## 4   A-002   01C11 AAJA002
## 5   A-002   01C11 AAJH001
## 6   A-002   01C11 AAJH002

Attention : cela ne fonctionne pas pour les listes “sauf”, qui sont comptées ici comme classant dans le GHM. Il faudrait soit les traiter à la main, puisqu’il y en a peu, soit affiner le scraping. Pour l’usage que je fais de la liste cela ne me dérange pas, donc pour l’instant je n’y touche pas.