Estimation de la production de vélo’v en se basant sur le numéro de série des vélos de façon similaire au problème du char d’assaut allemand (Ruggles et Brodie, 1947). Ceci implique une numérotation continue et une distribution uniforme. Selon l’approche fréquentiste (Goodman, 1952), le nombre total (\(N\)) est fonction du nombre d’échantillons (\(k\)) et de la valeur de l’échantillon le plus élevé (\(m\)) selon la relation suivante : \[N \approx m - 1 + \frac{m}{k}\]

library(dplyr)
library(tidyr)
library(ggplot2)

1 Google image

Les 2 premières pages de google image sur “velov” (11*6 images, 29 images exploitables)

velov_gi_brut <- c(10,9446,7352,6746,7792,10463,3013,3004,1518,15095,729,4706,12573,11181,9169,7352,
                                     7840,9770,11704,9004,9342,10182,9676,8752,2691,2405,2966,10782,7677, 10267,5174,6123)
production_gi <- data.frame(date = '2016-04-23', m = max(velov_gi_brut), k = length(unique(velov_gi_brut))) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

Les 3 premières pages de google image période 1/01/2016 - 23/04/2016 sur “velov” (17*6 images, 8 images exploitables)

velov_gi_brut_2 <- c(14903,6298,10,15095,15094,6158,2454,12573,6746,4866,2300,1759,2300,5113,5040,8396)
production_gi_2 <- data.frame(date = '2016-04-23', m = max(velov_gi_brut_2), k = length(unique(velov_gi_brut_2))) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

quels vélo’v en doublon ?

velov_gi_brut %>% intersect(velov_gi_brut_2)
## [1]    10  6746 15095 12573

union des deux échantillons google image

velov_gi_all <- velov_gi_brut %>% 
    union(velov_gi_brut_2)

production_gi_all <- data.frame(source = "Google images", date = '2016-04-23', m = max(velov_gi_all), k = length(velov_gi_all)) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

2 Twitter

https://twitter.com/search?q=velov&src=typd

velov_twitter_brut <- unique(c(13070,7591,14493,14179,15410,3020,10336,15094,15095,4706,14624,15242,1,13905,15271,
                                                15701,11465,14332,12279,14625,15517,15095,14625,15095))
production_twitter <- data.frame(source = "Twitter", date = '2016-04-23', m = max(velov_twitter_brut), k = length(unique(velov_twitter_brut))) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

3 Flickr

https://www.flickr.com/search/?text=velov&sort=date-taken-desc 1/1/2015 - 23/04/2016 - 12 photos exploitables sur 45

velov_flickr_brut <- unique(c(14403,12309,14408,12431,13395,13235,12543,12410,13071,13499,11909,11957,12820,12037,14042,11579))
production_flickr <- data.frame(source = "Flickr", date = '2016-04-23', m = max(velov_flickr_brut), k = length(unique(velov_flickr_brut))) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

4 Échantillon autour de la Part Dieu

2016-04-25

velov_lpd_brut <- unique(c(14680,14075,13157,11774,14658,2284,14496,14033,4969,11955,13842,5669,13390,14145,14255,13907,15442,8730,14152,15952,15524,13435,15296,13215,7059,14447,13238,15387,14286,14636,13562,1552,2254,14990,2399,13728,10083,14193,15291,15338,13236,15811,13569,13339,11005,14995,12298,10973,13205,13773,9498,9592,14908,10732,4466,15337,14951,14006,13025,14626,11178,15483,13440,14847,13320,12852,13368,16044,14067,15800,15641,13340,14025,13761,11322,10826,15372,14987,12040,14740,14607,11138,16056,9999,14790,13925,15678,15584,11206,15044,15952,14911))
production_lpd <- data.frame(source = "Stations LPD 1", date = '2016-04-25', m = max(velov_lpd_brut), k = length(unique(velov_lpd_brut))) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

velov_lpd_brut2 <- unique(c(13736,12340,14338,13385,13144,14125,13238,10851,12059,14955,10651,12349,2362,172,7677,3917,12991,14192,13930,14150,12424,13730,15534,10293,15465,13689,14527,11255,12921,14766,15333,13614,13232,2302,13143,15584,14056,15565,15446,13394,12420,13391,13835,15497,16077,9356,12640,12528,15006,14329,14846,15699,15510,13325,15938,13912,10053,14587,14622,15325,15962,13440,12898,13434,13752,12742,15048,11956,12056,15422,9755,5397,13507,14299,15713,15401,15079,7048,13248,14185,7106,11310,14753,13729,15932,12950,11170,13255,11322,15144,8249,14331,13051,13456,15236,15167,4162,13186,13134,12276,8971,16000,11711,12401,6886,12973,14348,13041,14229,9743,12342,14738,15066,13878,14664,5397))
production_lpd2 <- data.frame(source = "Stations LPD 2", date = '2016-04-29', m = max(velov_lpd_brut2), k = length(unique(velov_lpd_brut2))) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

5 Synthèse

Préparation du graphique : assemblage des résultats

velov_synthese <- velov_gi_all %>% 
    union(velov_twitter_brut) %>% 
    union(velov_flickr_brut) %>% 
    union(velov_lpd_brut) %>% 
    union(velov_lpd_brut2) 

production_synthese <- data.frame(source = "Synthèse", date = NA, m = max(velov_synthese), k = length(velov_synthese)) %>% 
    mutate(n = round(m - 1 + m / k), 
                 ic95_min = m,
                 ic95_max = round(m  * 20 ^ (1 / k)))

production_synthese <- production_lpd %>% 
    bind_rows(production_lpd2, production_gi_all, production_twitter, production_flickr, production_synthese)

production_synthese[production_synthese$source == "Synthèse", "date"] <- max(production_synthese$date)
kable(production_synthese)
source date m k n ic95_min ic95_max
Stations LPD 1 2016-04-25 16056 91 16231 16056 16593
Stations LPD 2 2016-04-29 16077 115 16216 16077 16501
Google images 2016-04-23 15095 42 15453 15095 16211
Twitter 2016-04-23 15701 21 16448 15701 18108
Flickr 2016-04-23 14408 16 15308 14408 17375
Synthèse NA 16077 277 16134 16077 16252
ggplot(production_synthese, aes(source, n, color = source)) +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = ic95_min, ymax = ic95_max), width = .1) +
    guides(color = FALSE) +
    geom_text(aes(y = 18200), label = production_synthese$k) +
    annotate("text", x = 0.8, y = 18200,  label = "n = ") +
    scale_x_discrete(limits = c("Stations LPD 1", "Stations LPD 2", "Google images", "Twitter", "Flickr", "Synthèse")) +
    ggtitle(paste("Nombre de Vélov' produits -", max(production_synthese$date))) +
    xlab("échantillon") +
    ylab("nombre estimé") #+

    # geom_segment(x = 0.9, y = 18270, xend = 4.1, yend = 18270, color = "gray") +
    # geom_segment(x = 2.5, y = 18340, xend = 5, yend = 18340, color = "gray") +
    # geom_segment(x = 2.5, y = 18270, xend = 2.5, yend = 18340, color = "gray") +
    # geom_segment(x = 5, y = 18270, xend = 5, yend = 18340, color = "gray")

5.1 Vérification uniformité

df <- data.frame(id = velov_synthese)
ggplot(df, aes(id)) +
    geom_histogram(binwidth = 250, boundary = 0)

chisq.test(df)
## 
##  Chi-squared test for given probabilities
## 
## data:  df
## X-squared = 348680, df = 276, p-value < 2.2e-16

p < 0.05 : on rejette l’hypothèse de distribution uniforme. Donc, ou notre échantillon n’est pas représentatif, ou il y a des trous dans la série (la numérotation n’est pas continue), ou il y a de nombreux vélos qui ont été retirés (les plus anciens)…

5.2 Test CMR

Une autre question peut se poser : combien de vélo’v circulent actuellement ? Il est possible d’y répondre par CMR (Laake, Johnson, et Conn, 2013).

# remplace toutes les fréquences > 1 par 1.
raz <- function(x) {
    as.numeric(x > 0)
}

Tableau des histoires de capture

velov_cmr <- data.frame(source = rep("gi", length(velov_gi_all)), velov = velov_gi_all) %>% 
    bind_rows(data.frame(source = rep("fl", length(velov_flickr_brut)), velov = velov_flickr_brut)) %>%
    bind_rows(data.frame(source = rep("tw", length(velov_twitter_brut)), velov = velov_twitter_brut)) %>%
    bind_rows(data.frame(source = rep("lpd", length(velov_lpd_brut)), velov = velov_lpd_brut)) %>%
    bind_rows(data.frame(source = rep("lpd2", length(velov_lpd_brut2)), velov = velov_lpd_brut2)) %>% 
    xtabs(formula = ~ velov + source, data = .) %>% 
    as.data.frame() %>% 
    spread(source, Freq) %>% 
    mutate_each(funs(raz), 2:ncol(.)) %>% 
    unite(ch, 2:ncol(.), sep = "") 

head(velov_cmr)
##   velov    ch
## 1     1 00001
## 2    10 01000
## 3   172 00010
## 4   729 01000
## 5  1518 01000
## 6  1552 00100
library(marked)
velov_proc <- process.data(velov_cmr, model = "js")
velov_ddl <- make.design.data(velov_proc)
mod_phidot_pdot <- crm(velov_proc, velov_ddl)
## Starting optimization 4  parameters
## 
 Number of evaluations:  100  -2lnl: -1589.627042
 Number of evaluations:  200  -2lnl: -1630.662869
 Number of evaluations:  300  -2lnl: -1581.676929
## Elapsed time in minutes:  0.0168
(ic <- js.hessian(mod_phidot_pdot))
## 
## crm Model Summary
## 
## Npar :  4
## -2lnL:  204.9848
## AIC  :  212.9848
## 
## Beta
##                    Estimate        se        lcl        ucl
## Phi.(Intercept)  -0.2822198 0.3675675 -1.0026521  0.4382124
## p.(Intercept)    -2.8239098 0.4465532 -3.6991541 -1.9486655
## pent.(Intercept)  1.0154897 0.3007150  0.4260883  1.6048911
## N.(Intercept)     8.0679734 0.4086529  7.2670138  8.8689331

Il y a donc 8 Vélo’v en circulation (en fait pas assez de captures pour conclure)…

Bibliographie

GOODMAN L. A. « Serial number analysis ». Journal of the American Statistical Association [En ligne]. 1 décembre 1952. Vol. 47, n°260, p. 622‑634. Disponible sur : < http://dx.doi.org/10.1080/01621459.1952.10483442 >

LAAKE J. L., JOHNSON D. S., CONN P. B. « marked: an R package for maximum likelihood and Markov Chain Monte Carlo analysis of capture–recapture data ». Methods in Ecology and Evolution [En ligne]. septembre 2013. Vol. 4, n°9, p. 885‑890. Disponible sur : < http://dx.doi.org/10.1111/2041-210X.12065 >

RUGGLES R., BRODIE H. « An empirical approach to economic intelligence in world war II ». Journal of the American Statistical Association [En ligne]. 1947. Vol. 42, n°237, p. 72‑91. Disponible sur : < http://dx.doi.org/10.1080/01621459.1947.10501915 >