Estimating Vélo'v production from the bikes' serial numbers, along the lines of the German tank problem (Ruggles and Brodie, 1947). This assumes that numbering is continuous and that the sample is drawn uniformly. Following the frequentist approach (Goodman, 1952), the total count (\(N\)) is estimated from the sample size (\(k\)) and the highest observed serial number (\(m\)) as follows: \[N \approx m - 1 + \frac{m}{k}\]
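The one-sided 95% confidence interval used below runs from \(m\) (the total is at least the largest serial seen) up to \(m \cdot 20^{1/k}\), since setting \(P(\max \le m) = (m/N)^k\) equal to \(\alpha = 0.05\) gives \(N = m \cdot (1/\alpha)^{1/k}\). A minimal helper gathering the point estimate and this interval (the name serial_estimate is ours):

# German tank estimator: point estimate and one-sided 95% CI (hypothetical helper)
serial_estimate <- function(m, k, alpha = 0.05) {
  data.frame(n        = round(m - 1 + m / k),            # point estimate
             ic95_min = m,                               # N is at least m
             ic95_max = round(m * (1 / alpha)^(1 / k)))  # from (m/N)^k = alpha
}
serial_estimate(m = 60, k = 4)  # textbook values: n = 74, ic95_max = 127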
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)  # for kable()
The first two pages of Google Images results for “velov” (11 × 6 images, 29 usable images):
velov_gi_brut <- c(10,9446,7352,6746,7792,10463,3013,3004,1518,15095,729,4706,12573,11181,9169,7352,
7840,9770,11704,9004,9342,10182,9676,8752,2691,2405,2966,10782,7677, 10267,5174,6123)
production_gi <- data.frame(date = '2016-04-23', m = max(velov_gi_brut), k = length(unique(velov_gi_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
The first three pages of Google Images results for “velov” over 2016-01-01 to 2016-04-23 (17 × 6 images, 8 usable images):
velov_gi_brut_2 <- c(14903,6298,10,15095,15094,6158,2454,12573,6746,4866,2300,1759,2300,5113,5040,8396)
production_gi_2 <- data.frame(date = '2016-04-23', m = max(velov_gi_brut_2), k = length(unique(velov_gi_brut_2))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
Which Vélo'v appear in both samples?
velov_gi_brut %>% intersect(velov_gi_brut_2)
## [1] 10 6746 15095 12573
Union of the two Google Images samples:
velov_gi_all <- velov_gi_brut %>%
union(velov_gi_brut_2)
production_gi_all <- data.frame(source = "Google images", date = '2016-04-23', m = max(velov_gi_all), k = length(velov_gi_all)) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
https://twitter.com/search?q=velov&src=typd
velov_twitter_brut <- unique(c(13070,7591,14493,14179,15410,3020,10336,15094,15095,4706,14624,15242,1,13905,15271,
15701,11465,14332,12279,14625,15517,15095,14625,15095))
production_twitter <- data.frame(source = "Twitter", date = '2016-04-23', m = max(velov_twitter_brut), k = length(unique(velov_twitter_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
https://www.flickr.com/search/?text=velov&sort=date-taken-desc over 2015-01-01 to 2016-04-23: 12 usable photos out of 45.
velov_flickr_brut <- unique(c(14403,12309,14408,12431,13395,13235,12543,12410,13071,13499,11909,11957,12820,12037,14042,11579))
production_flickr <- data.frame(source = "Flickr", date = '2016-04-23', m = max(velov_flickr_brut), k = length(unique(velov_flickr_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
Bikes observed at the LPD stations, 2016-04-25:
velov_lpd_brut <- unique(c(14680,14075,13157,11774,14658,2284,14496,14033,4969,11955,13842,5669,13390,14145,14255,13907,15442,8730,14152,15952,15524,13435,15296,13215,7059,14447,13238,15387,14286,14636,13562,1552,2254,14990,2399,13728,10083,14193,15291,15338,13236,15811,13569,13339,11005,14995,12298,10973,13205,13773,9498,9592,14908,10732,4466,15337,14951,14006,13025,14626,11178,15483,13440,14847,13320,12852,13368,16044,14067,15800,15641,13340,14025,13761,11322,10826,15372,14987,12040,14740,14607,11138,16056,9999,14790,13925,15678,15584,11206,15044,15952,14911))
production_lpd <- data.frame(source = "Stations LPD 1", date = '2016-04-25', m = max(velov_lpd_brut), k = length(unique(velov_lpd_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
velov_lpd_brut2 <- unique(c(13736,12340,14338,13385,13144,14125,13238,10851,12059,14955,10651,12349,2362,172,7677,3917,12991,14192,13930,14150,12424,13730,15534,10293,15465,13689,14527,11255,12921,14766,15333,13614,13232,2302,13143,15584,14056,15565,15446,13394,12420,13391,13835,15497,16077,9356,12640,12528,15006,14329,14846,15699,15510,13325,15938,13912,10053,14587,14622,15325,15962,13440,12898,13434,13752,12742,15048,11956,12056,15422,9755,5397,13507,14299,15713,15401,15079,7048,13248,14185,7106,11310,14753,13729,15932,12950,11170,13255,11322,15144,8249,14331,13051,13456,15236,15167,4162,13186,13134,12276,8971,16000,11711,12401,6886,12973,14348,13041,14229,9743,12342,14738,15066,13878,14664,5397))
production_lpd2 <- data.frame(source = "Stations LPD 2", date = '2016-04-29', m = max(velov_lpd_brut2), k = length(unique(velov_lpd_brut2))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
Preparing the plot: assembling the results.
velov_synthese <- velov_gi_all %>%
union(velov_twitter_brut) %>%
union(velov_flickr_brut) %>%
union(velov_lpd_brut) %>%
union(velov_lpd_brut2)
production_synthese <- data.frame(source = "Synthèse", date = NA, m = max(velov_synthese), k = length(velov_synthese)) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
production_synthese <- production_lpd %>%
bind_rows(production_lpd2, production_gi_all, production_twitter, production_flickr, production_synthese)
production_synthese[production_synthese$source == "Synthèse", "date"] <- max(production_synthese$date, na.rm = TRUE)
kable(production_synthese)
| source | date | m | k | n | ic95_min | ic95_max |
|---|---|---|---|---|---|---|
| Stations LPD 1 | 2016-04-25 | 16056 | 91 | 16231 | 16056 | 16593 |
| Stations LPD 2 | 2016-04-29 | 16077 | 115 | 16216 | 16077 | 16501 |
| Google images | 2016-04-23 | 15095 | 42 | 15453 | 15095 | 16211 |
| Twitter | 2016-04-23 | 15701 | 21 | 16448 | 15701 | 18108 |
| Flickr | 2016-04-23 | 14408 | 16 | 15308 | 14408 | 17375 |
| Synthèse | 2016-04-29 | 16077 | 277 | 16134 | 16077 | 16252 |
ggplot(production_synthese, aes(source, n, color = source)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = ic95_min, ymax = ic95_max), width = .1) +
guides(color = FALSE) +
geom_text(aes(y = 18200), label = production_synthese$k) +
annotate("text", x = 0.8, y = 18200, label = "n = ") +
scale_x_discrete(limits = c("Stations LPD 1", "Stations LPD 2", "Google images", "Twitter", "Flickr", "Synthèse")) +
ggtitle(paste("Number of Vélo'v produced -", max(production_synthese$date))) +
xlab("sample") +
ylab("estimated count") #+
# geom_segment(x = 0.9, y = 18270, xend = 4.1, yend = 18270, color = "gray") +
# geom_segment(x = 2.5, y = 18340, xend = 5, yend = 18340, color = "gray") +
# geom_segment(x = 2.5, y = 18270, xend = 2.5, yend = 18340, color = "gray") +
# geom_segment(x = 5, y = 18270, xend = 5, yend = 18340, color = "gray")
df <- data.frame(id = velov_synthese)
ggplot(df, aes(id)) +
geom_histogram(binwidth = 250, boundary = 0)
chisq.test(df)
##
## Chi-squared test for given probabilities
##
## data: df
## X-squared = 348680, df = 276, p-value < 2.2e-16
p < 0.05: we reject the hypothesis of a uniform distribution. So either our sample is not representative, or there are gaps in the series (the numbering is not continuous), or many bikes (the oldest ones) have been withdrawn…
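Note that chisq.test(df) treats the serial numbers themselves as the cell counts of a goodness-of-fit test. To check whether the IDs are drawn uniformly over their range, one can instead test the binned counts (reusing the histogram's 250-wide bins) or compare the IDs to a uniform distribution; a sketch:

# Alternative uniformity checks (a sketch, not part of the original analysis)
bins <- cut(df$id, breaks = seq(0, ceiling(max(df$id) / 250) * 250, by = 250))
chisq.test(table(bins))                             # goodness of fit on bin counts
ks.test(df$id, "punif", min = 1, max = max(df$id))  # Kolmogorov-Smirnov against uniform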
Another question arises: how many Vélo'v are currently in circulation? This can be answered with capture-mark-recapture (CMR) methods (Laake, Johnson, and Conn, 2013).
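As a rough first answer, the two-occasion Lincoln-Petersen estimator \(\hat{N} = n_1 n_2 / m_{12}\), where \(m_{12}\) counts the individuals seen on both occasions, gives an order of magnitude. A sketch on the two station samples (our addition, assuming the fleet is closed between the two visits):

# Lincoln-Petersen sketch (not part of the original analysis)
n1  <- length(velov_lpd_brut)                              # bikes seen on 2016-04-25
n2  <- length(velov_lpd_brut2)                             # bikes seen on 2016-04-29
m12 <- length(intersect(velov_lpd_brut, velov_lpd_brut2))  # bikes seen on both days
n1 * n2 / m12                                              # estimated fleet in circulation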
# replace every frequency > 1 with 1 (presence/absence)
raz <- function(x) {
  as.numeric(x > 0)
}
Capture-history table: for each bike, the ch string concatenates its presence (1) or absence (0) in each source, with one column per source in alphabetical order (fl, gi, lpd, lpd2, tw).
velov_cmr <- data.frame(source = rep("gi", length(velov_gi_all)), velov = velov_gi_all) %>%
  bind_rows(data.frame(source = rep("fl", length(velov_flickr_brut)), velov = velov_flickr_brut)) %>%
  bind_rows(data.frame(source = rep("tw", length(velov_twitter_brut)), velov = velov_twitter_brut)) %>%
  bind_rows(data.frame(source = rep("lpd", length(velov_lpd_brut)), velov = velov_lpd_brut)) %>%
  bind_rows(data.frame(source = rep("lpd2", length(velov_lpd_brut2)), velov = velov_lpd_brut2)) %>%
  xtabs(formula = ~ velov + source, data = .) %>%  # count each bike in each source
  as.data.frame() %>%
  spread(source, Freq) %>%                         # one column per source
  mutate_each(funs(raz), 2:ncol(.)) %>%            # counts -> presence/absence
  unite(ch, 2:ncol(.), sep = "")                   # concatenate into the ch string
head(velov_cmr)
## velov ch
## 1 1 00001
## 2 10 01000
## 3 172 00010
## 4 729 01000
## 5 1518 01000
## 6 1552 00100
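As a spot check, bike 15095 appears in the Google Images and Twitter samples only, so its history should read 01001:

subset(velov_cmr, velov == 15095)  # expected ch: "01001"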
library(marked)
velov_proc <- process.data(velov_cmr, model = "js")  # Jolly-Seber open-population model
velov_ddl <- make.design.data(velov_proc)
mod_phidot_pdot <- crm(velov_proc, velov_ddl)        # constant Phi, p, pent and N
## Starting optimization 4 parameters
## Number of evaluations: 100 -2lnl: -1589.627042
## Number of evaluations: 200 -2lnl: -1630.662869
## Number of evaluations: 300 -2lnl: -1581.676929
## Elapsed time in minutes: 0.0168
(ic <- js.hessian(mod_phidot_pdot))
##
## crm Model Summary
##
## Npar : 4
## -2lnL: 204.9848
## AIC : 212.9848
##
## Beta
## Estimate se lcl ucl
## Phi.(Intercept) -0.2822198 0.3675675 -1.0026521 0.4382124
## p.(Intercept) -2.8239098 0.4465532 -3.6991541 -1.9486655
## pent.(Intercept) 1.0154897 0.3007150 0.4260883 1.6048911
## N.(Intercept) 8.0679734 0.4086529 7.2670138 8.8689331
So there are 8 Vélo'v in circulation (in reality, there are too few captures to conclude anything)…
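Keep in mind that the Beta estimates above are on the link scale, not the real scale; real-scale values would come from marked's predict method (a sketch, using the fitted model above):

predict(mod_phidot_pdot)  # back-transformed estimates of Phi, p, pent and N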
GOODMAN L. A. "Serial number analysis". Journal of the American Statistical Association, December 1952, Vol. 47, No. 260, p. 622-634. http://dx.doi.org/10.1080/01621459.1952.10483442
LAAKE J. L., JOHNSON D. S., CONN P. B. "marked: an R package for maximum likelihood and Markov Chain Monte Carlo analysis of capture-recapture data". Methods in Ecology and Evolution, September 2013, Vol. 4, No. 9, p. 885-890. http://dx.doi.org/10.1111/2041-210X.12065
RUGGLES R., BRODIE H. "An empirical approach to economic intelligence in World War II". Journal of the American Statistical Association, 1947, Vol. 42, No. 237, p. 72-91. http://dx.doi.org/10.1080/01621459.1947.10501915