Estimating Vélo'v production from the bikes' serial numbers, along the lines of the German tank problem (Ruggles and Brodie, 1947). This assumes that numbering is continuous and that the sample is drawn uniformly. Following the frequentist approach (Goodman, 1952), the total count (\(N\)) is estimated from the sample size (\(k\)) and the highest observed serial number (\(m\)) as follows: \[N \approx m - 1 + \frac{m}{k}\]
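The one-sided 95% confidence interval used below runs from \(m\) (the total is at least the largest serial seen) up to \(m \cdot 20^{1/k}\), since setting \(P(\max \le m) = (m/N)^k\) equal to \(\alpha = 0.05\) gives \(N = m \cdot (1/\alpha)^{1/k}\). A minimal helper gathering the point estimate and this interval (the name serial_estimate is ours):

# German tank estimator: point estimate and one-sided 95% CI (hypothetical helper)
serial_estimate <- function(m, k, alpha = 0.05) {
  data.frame(n        = round(m - 1 + m / k),            # point estimate
             ic95_min = m,                               # N is at least m
             ic95_max = round(m * (1 / alpha)^(1 / k)))  # from (m/N)^k = alpha
}
serial_estimate(m = 60, k = 4)  # textbook values: n = 74, ic95_max = 127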
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)  # for kable()
The first two pages of Google Images results for “velov” (11 × 6 images, 29 usable images):
velov_gi_brut <- c(10,9446,7352,6746,7792,10463,3013,3004,1518,15095,729,4706,12573,11181,9169,7352,
7840,9770,11704,9004,9342,10182,9676,8752,2691,2405,2966,10782,7677, 10267,5174,6123)
production_gi <- data.frame(date = '2016-04-23', m = max(velov_gi_brut), k = length(unique(velov_gi_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
The first three pages of Google Images results for “velov” over 2016-01-01 to 2016-04-23 (17 × 6 images, 8 usable images):
velov_gi_brut_2 <- c(14903,6298,10,15095,15094,6158,2454,12573,6746,4866,2300,1759,2300,5113,5040,8396)
production_gi_2 <- data.frame(date = '2016-04-23', m = max(velov_gi_brut_2), k = length(unique(velov_gi_brut_2))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
Which Vélo'v appear in both samples?
velov_gi_brut %>% intersect(velov_gi_brut_2)
## [1] 10 6746 15095 12573
Union of the two Google Images samples:
velov_gi_all <- velov_gi_brut %>%
union(velov_gi_brut_2)
production_gi_all <- data.frame(source = "Google images", date = '2016-04-23', m = max(velov_gi_all), k = length(velov_gi_all)) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
https://twitter.com/search?q=velov&src=typd
velov_twitter_brut <- unique(c(13070,7591,14493,14179,15410,3020,10336,15094,15095,4706,14624,15242,1,13905,15271,
15701,11465,14332,12279,14625,15517,15095,14625,15095))
production_twitter <- data.frame(source = "Twitter", date = '2016-04-23', m = max(velov_twitter_brut), k = length(unique(velov_twitter_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
https://www.flickr.com/search/?text=velov&sort=date-taken-desc over 2015-01-01 to 2016-04-23: 12 usable photos out of 45.
velov_flickr_brut <- unique(c(14403,12309,14408,12431,13395,13235,12543,12410,13071,13499,11909,11957,12820,12037,14042,11579))
production_flickr <- data.frame(source = "Flickr", date = '2016-04-23', m = max(velov_flickr_brut), k = length(unique(velov_flickr_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
Bikes observed at the LPD stations, 2016-04-25:
velov_lpd_brut <- unique(c(14680,14075,13157,11774,14658,2284,14496,14033,4969,11955,13842,5669,13390,14145,14255,13907,15442,8730,14152,15952,15524,13435,15296,13215,7059,14447,13238,15387,14286,14636,13562,1552,2254,14990,2399,13728,10083,14193,15291,15338,13236,15811,13569,13339,11005,14995,12298,10973,13205,13773,9498,9592,14908,10732,4466,15337,14951,14006,13025,14626,11178,15483,13440,14847,13320,12852,13368,16044,14067,15800,15641,13340,14025,13761,11322,10826,15372,14987,12040,14740,14607,11138,16056,9999,14790,13925,15678,15584,11206,15044,15952,14911))
production_lpd <- data.frame(source = "Stations LPD 1", date = '2016-04-25', m = max(velov_lpd_brut), k = length(unique(velov_lpd_brut))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
velov_lpd_brut2 <- unique(c(13736,12340,14338,13385,13144,14125,13238,10851,12059,14955,10651,12349,2362,172,7677,3917,12991,14192,13930,14150,12424,13730,15534,10293,15465,13689,14527,11255,12921,14766,15333,13614,13232,2302,13143,15584,14056,15565,15446,13394,12420,13391,13835,15497,16077,9356,12640,12528,15006,14329,14846,15699,15510,13325,15938,13912,10053,14587,14622,15325,15962,13440,12898,13434,13752,12742,15048,11956,12056,15422,9755,5397,13507,14299,15713,15401,15079,7048,13248,14185,7106,11310,14753,13729,15932,12950,11170,13255,11322,15144,8249,14331,13051,13456,15236,15167,4162,13186,13134,12276,8971,16000,11711,12401,6886,12973,14348,13041,14229,9743,12342,14738,15066,13878,14664,5397))
production_lpd2 <- data.frame(source = "Stations LPD 2", date = '2016-04-29', m = max(velov_lpd_brut2), k = length(unique(velov_lpd_brut2))) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
Preparing the plot: assembling the results.
velov_synthese <- velov_gi_all %>%
union(velov_twitter_brut) %>%
union(velov_flickr_brut) %>%
union(velov_lpd_brut) %>%
union(velov_lpd_brut2)
production_synthese <- data.frame(source = "Synthèse", date = NA, m = max(velov_synthese), k = length(velov_synthese)) %>%
mutate(n = round(m - 1 + m / k),
ic95_min = m,
ic95_max = round(m * 20 ^ (1 / k)))
production_synthese <- production_lpd %>%
bind_rows(production_lpd2, production_gi_all, production_twitter, production_flickr, production_synthese)
production_synthese[production_synthese$source == "Synthèse", "date"] <- max(production_synthese$date, na.rm = TRUE)
kable(production_synthese)
| source | date | m | k | n | ic95_min | ic95_max |
|---|---|---|---|---|---|---|
| Stations LPD 1 | 2016-04-25 | 16056 | 91 | 16231 | 16056 | 16593 |
| Stations LPD 2 | 2016-04-29 | 16077 | 115 | 16216 | 16077 | 16501 |
| Google images | 2016-04-23 | 15095 | 42 | 15453 | 15095 | 16211 |
| Twitter | 2016-04-23 | 15701 | 21 | 16448 | 15701 | 18108 |
| Flickr | 2016-04-23 | 14408 | 16 | 15308 | 14408 | 17375 |
| Synthèse | 2016-04-29 | 16077 | 277 | 16134 | 16077 | 16252 |
ggplot(production_synthese, aes(source, n, color = source)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = ic95_min, ymax = ic95_max), width = .1) +
guides(color = FALSE) +
geom_text(aes(y = 18200), label = production_synthese$k) +
annotate("text", x = 0.8, y = 18200, label = "n = ") +
scale_x_discrete(limits = c("Stations LPD 1", "Stations LPD 2", "Google images", "Twitter", "Flickr", "Synthèse")) +
ggtitle(paste("Number of Vélo'v produced -", max(production_synthese$date))) +
xlab("sample") +
ylab("estimated count") #+
# geom_segment(x = 0.9, y = 18270, xend = 4.1, yend = 18270, color = "gray") +
# geom_segment(x = 2.5, y = 18340, xend = 5, yend = 18340, color = "gray") +
# geom_segment(x = 2.5, y = 18270, xend = 2.5, yend = 18340, color = "gray") +
# geom_segment(x = 5, y = 18270, xend = 5, yend = 18340, color = "gray")
df <- data.frame(id = velov_synthese)
ggplot(df, aes(id)) +
geom_histogram(binwidth = 250, boundary = 0)
chisq.test(df)
##
## Chi-squared test for given probabilities
##
## data: df
## X-squared = 348680, df = 276, p-value < 2.2e-16
p < 0.05: we reject the hypothesis of a uniform distribution. So either our sample is not representative, or there are gaps in the series (the numbering is not continuous), or many bikes (the oldest ones) have been withdrawn…
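Note that chisq.test(df) treats the serial numbers themselves as the cell counts of a goodness-of-fit test. To check whether the IDs are drawn uniformly over their range, one can instead test the binned counts (reusing the histogram's 250-wide bins) or compare the IDs to a uniform distribution; a sketch:

# Alternative uniformity checks (a sketch, not part of the original analysis)
bins <- cut(df$id, breaks = seq(0, ceiling(max(df$id) / 250) * 250, by = 250))
chisq.test(table(bins))                             # goodness of fit on bin counts
ks.test(df$id, "punif", min = 1, max = max(df$id))  # Kolmogorov-Smirnov against uniform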
Another question arises: how many Vélo'v are currently in circulation? This can be answered with capture-mark-recapture (CMR) methods (Laake, Johnson, and Conn, 2013).
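As a rough first answer, the two-occasion Lincoln-Petersen estimator \(\hat{N} = n_1 n_2 / m_{12}\), where \(m_{12}\) counts the individuals seen on both occasions, gives an order of magnitude. A sketch on the two station samples (our addition, assuming the fleet is closed between the two visits):

# Lincoln-Petersen sketch (not part of the original analysis)
n1  <- length(velov_lpd_brut)                              # bikes seen on 2016-04-25
n2  <- length(velov_lpd_brut2)                             # bikes seen on 2016-04-29
m12 <- length(intersect(velov_lpd_brut, velov_lpd_brut2))  # bikes seen on both days
n1 * n2 / m12                                              # estimated fleet in circulation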
# replace every frequency > 1 with 1 (presence/absence)
raz <- function(x) {
  as.numeric(x > 0)
}
Capture-history table: for each bike, the ch string concatenates its presence (1) or absence (0) in each source, with one column per source in alphabetical order (fl, gi, lpd, lpd2, tw).
velov_cmr <- data.frame(source = rep("gi", length(velov_gi_all)), velov = velov_gi_all) %>%
  bind_rows(data.frame(source = rep("fl", length(velov_flickr_brut)), velov = velov_flickr_brut)) %>%
  bind_rows(data.frame(source = rep("tw", length(velov_twitter_brut)), velov = velov_twitter_brut)) %>%
  bind_rows(data.frame(source = rep("lpd", length(velov_lpd_brut)), velov = velov_lpd_brut)) %>%
  bind_rows(data.frame(source = rep("lpd2", length(velov_lpd_brut2)), velov = velov_lpd_brut2)) %>%
  xtabs(formula = ~ velov + source, data = .) %>%  # count each bike in each source
  as.data.frame() %>%
  spread(source, Freq) %>%                         # one column per source
  mutate_each(funs(raz), 2:ncol(.)) %>%            # counts -> presence/absence
  unite(ch, 2:ncol(.), sep = "")                   # concatenate into the ch string
head(velov_cmr)
## velov ch
## 1 1 00001
## 2 10 01000
## 3 172 00010
## 4 729 01000
## 5 1518 01000
## 6 1552 00100
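As a spot check, bike 15095 appears in the Google Images and Twitter samples only, so its history should read 01001:

subset(velov_cmr, velov == 15095)  # expected ch: "01001"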
library(marked)
velov_proc <- process.data(velov_cmr, model = "js")  # Jolly-Seber open-population model
velov_ddl <- make.design.data(velov_proc)
mod_phidot_pdot <- crm(velov_proc, velov_ddl)        # constant Phi, p, pent and N
## Starting optimization 4 parameters
## Number of evaluations: 100 -2lnl: -1589.627042
## Number of evaluations: 200 -2lnl: -1630.662869
## Number of evaluations: 300 -2lnl: -1581.676929
## Elapsed time in minutes: 0.0168
(ic <- js.hessian(mod_phidot_pdot))
##
## crm Model Summary
##
## Npar : 4
## -2lnL: 204.9848
## AIC : 212.9848
##
## Beta
## Estimate se lcl ucl
## Phi.(Intercept) -0.2822198 0.3675675 -1.0026521 0.4382124
## p.(Intercept) -2.8239098 0.4465532 -3.6991541 -1.9486655
## pent.(Intercept) 1.0154897 0.3007150 0.4260883 1.6048911
## N.(Intercept) 8.0679734 0.4086529 7.2670138 8.8689331
So there are 8 Vélo'v in circulation (in reality, there are too few captures to conclude anything)…
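Keep in mind that the Beta estimates above are on the link scale, not the real scale; real-scale values would come from marked's predict method (a sketch, using the fitted model above):

predict(mod_phidot_pdot)  # back-transformed estimates of Phi, p, pent and N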
GOODMAN L. A. "Serial number analysis". Journal of the American Statistical Association, December 1952, Vol. 47, No. 260, p. 622-634. http://dx.doi.org/10.1080/01621459.1952.10483442
LAAKE J. L., JOHNSON D. S., CONN P. B. "marked: an R package for maximum likelihood and Markov Chain Monte Carlo analysis of capture-recapture data". Methods in Ecology and Evolution, September 2013, Vol. 4, No. 9, p. 885-890. http://dx.doi.org/10.1111/2041-210X.12065
RUGGLES R., BRODIE H. "An empirical approach to economic intelligence in World War II". Journal of the American Statistical Association, 1947, Vol. 42, No. 237, p. 72-91. http://dx.doi.org/10.1080/01621459.1947.10501915