Les données proviennent de la ville de Paris, dans le cadre de sa démarche de transparence, de communication et d’efficience permise par le Big Data. Voici le lien de téléchargement : https://opendata.paris.fr/page/home/
# Mapping library used for the interactive maps further down.
library(leaflet)
## Warning: package 'leaflet' was built under R version 3.3.2

# The CSV is looked up relative to the project directory.
setwd("~/Desktop/projet")

# Semicolon-separated file; the first column is discarded right away.
df <- read.csv(file = "cofee.csv", sep = ";")[, -1]

# Preview of the first rows (recorded console output follows).
head(df)
## Nom.du.café Adresse Arrondissement
## 1 Le Café Livres 10 rue Saint Martin 75004
## 2 Le Bosquet 46 avenue Bosquet 75007
## 3 Le Chaumontois 12 rue Armand Carrel 75018
## 4 Le Kleemend's 34 avenue Pierre Mendès-France 75013
## 5 Café Pierre 202 rue du faubourg st antoine 75012
## 6 Les Arcades 61 rue de Ponthieu 75008
## Prix.comptoir prix.salle prix.terasse Geoloc
## 1 1 - - 48.857728, 2.349641
## 2 1 - - 48.856003, 2.30457
## 3 1 - - 48.889426, 2.332954
## 4 1 - - 48.838521, 2.370478
## 5 1 - - 48.849861, 2.385342
## 6 1 - - 48.872202, 2.304624
# Keep only the rows that carry a non-empty "latitude, longitude" string.
df <- df[!(is.na(df$Geoloc) | df$Geoloc == ""), ]

# Split Geoloc on the comma into a two-column character matrix, then store
# each part as a numeric column. as.character() covers the case where Geoloc
# was read as a factor; as.numeric() accepts the blank that follows the comma.
coords <- do.call(rbind, strsplit(as.character(df$Geoloc), ",", fixed = TRUE))
df$latitude <- as.numeric(coords[, 1])
df$longitude <- as.numeric(coords[, 2])

# Map 1: one default marker per café.
m <- leaflet(df) %>%
  addTiles() %>%
  addMarkers(lng = ~longitude, lat = ~latitude)
m

# Map 2: one small circle per café. These three settings are reused by the
# later maps.
rd <- 0.5      # radius passed to addCircles()
op <- 0.8      # stroke opacity
clr <- "blue"  # stroke colour
m <- leaflet(df) %>%
  addTiles() %>%
  addCircles(df$longitude, df$latitude,
             radius = rd, opacity = op, color = clr)
m
library(leaflet)
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(MASS)

# 2-D binned kernel density estimate of the café coordinates
# (column 1 = longitude, column 2 = latitude). Each axis gets its own
# bandwidth from bw.ucv().
X <- cbind(df$longitude, df$latitude)
kde2d <- bkde2D(X, bandwidth = c(bw.ucv(X[, 1]), bw.ucv(X[, 2])))
## Warning in bw.ucv(X[, 1]): minimum occurred at one end of the range
x <- kde2d$x1
y <- kde2d$x2
z <- kde2d$fhat

# Contour lines of the estimated density; each element has $x and $y vectors.
CL <- contourLines(x, y, z)

# Indices into CL of the contours drawn as filled regions / as outlines.
poly_idx <- c(1, 3, 5, 7, 9)
line_idx <- c(1, 5, 8)

# Add the contours `idx` of `contours` to `map` as red filled polygons.
add_contour_polygons <- function(map, contours, idx) {
  for (i in idx) {
    map <- addPolygons(map, contours[[i]]$x, contours[[i]]$y,
                       fillColor = "red", stroke = FALSE)
  }
  map
}

# Add the contours `idx` of `contours` to `map` as red outlines.
add_contour_lines <- function(map, contours, idx) {
  for (i in idx) {
    map <- addPolylines(map, contours[[i]]$x, contours[[i]]$y, color = "red")
  }
  map
}

# Map 3: café circles with filled density regions.
m <- leaflet(df) %>% addTiles()
m %>%
  addCircles(df$longitude, df$latitude,
             radius = rd, opacity = op, color = clr) %>%
  add_contour_polygons(CL, poly_idx)

# Map 4: café circles with contour outlines only.
m <- leaflet(df) %>% addTiles()
m %>%
  addCircles(df$longitude, df$latitude,
             radius = rd, opacity = op, color = clr) %>%
  add_contour_lines(CL, line_idx)

# Map 5: café circles with both filled regions and outlines.
m <- leaflet(df) %>% addTiles()
m %>%
  addCircles(df$longitude, df$latitude,
             radius = rd, opacity = op, color = clr) %>%
  add_contour_polygons(CL, poly_idx) %>%
  add_contour_lines(CL, line_idx)

# The district code is used as a grouping variable from here on.
df$Arrondissement <- as.factor(df$Arrondissement)
library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Number of listed cafés per arrondissement.
df2 <- df %>%
  group_by(Arrondissement) %>%
  summarise(n())
## Warning in grouped_df_impl(data, unname(vars), drop): '.Random.seed' n'est
## pas un vecteur d'entiers, mais est de type 'NULL', et sera donc ignoré
## Warning: package 'bindrcpp' was built under R version 3.3.2
colnames(df2) <- c("district", "nombre_de_cafés")

# Three-way label: more than 12 cafés, fewer than 5, or anything in between.
df2$comparison <- ifelse(
  df2$nombre_de_cafés > 12, "Quartier_Raisonnable",
  ifelse(df2$nombre_de_cafés < 5, "Quartier_très_Cher", "Quartier_médium")
)

# Colours are tied to their label by name so the mapping does not depend on
# the locale's sort order or on all three labels being present.
comparison_colours <- c(
  "Quartier_médium" = "darkgoldenrod1",
  "Quartier_Raisonnable" = "chartreuse3",
  "Quartier_très_Cher" = "brown2"
)

# Horizontal bar chart of café counts, coloured by label.
p <- ggplot(data = df2,
            aes(x = district, y = nombre_de_cafés, fill = comparison)) +
  geom_bar(stat = "identity") +
  theme_minimal()
p <- p + scale_fill_manual(values = comparison_colours)
p <- p + coord_flip()
p
Rapporté aux comptoirs offrant un café à 1 euro, il apparaît que les arrondissements du : - 11ème - 12ème - 15ème - 18ème … sont des quartiers relativement peu chers. Au contraire, les arrondissements du - 6ème (quartier latin) - 16ème - 19ème apparaissent comme des quartiers chers.
Si l’on compare au prix du m2 dans Paris :
Notre index « coffee » offre des résultats mitigés en ce sens que : - il apparaît évident que le 16ème et le 6ème arrondissement offrent des prix au mètre carré largement supérieurs aux autres quartiers. Cela se ressent dans l’offre des cafés proposant un café inférieur à 1 euro. - Mais le 19ème s’avère être un des arrondissements les moins chers de Paris alors qu’il apparaît comme un quartier cher dans notre étude.
# Sort by district so that the rows line up with the price vector below.
df2 <- df2[order(as.factor(df2$district)), ]

# One price per district, matched to the sorted rows by position.
# NOTE(review): presumably the price per m² listed from the 1st to the 20th
# arrondissement; confirm that the sorted district codes follow that order.
immo_prices <- c(11163, 10266, 10368, 10916, 11417, 13201, 12465, 10999,
                 9263, 8586, 8072, 7608, 8630, 9519, 8957, 9526, 8916,
                 7337, 5715, 7731)
# Fail with a clear message if a district is missing, rather than with an
# obscure replacement-length error.
stopifnot(nrow(df2) == length(immo_prices))
df2$immo <- immo_prices

df2 <- as.data.frame(df2)
colnames(df2) <- c("district", "nb_café", "compare", "immo")
df2$immo <- as.numeric(df2$immo)
df2$nb_café <- as.numeric(df2$nb_café)
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
df2 <- as.data.frame(df2)

# Settings for the second y axis: drawn on the right, overlaid on the first.
ay <- list(
  tickfont = list(color = "red"),
  overlaying = "y",
  side = "right",
  title = "Café vs. Immobilier"
)

# Two line series over the districts: café count on the first y axis and
# price on the second ("y2").
p <- plot_ly() %>%
  add_lines(x = ~df2$district, y = ~df2$nb_café,
            name = "nombre de cafés") %>%
  add_lines(x = ~df2$district, y = ~df2$immo,
            name = "prix du m2", yaxis = "y2") %>%
  layout(
    title = "Café à 1€ vs. immo", yaxis2 = ay,
    xaxis = list(title = "x")
  )
p
Calcul du prix de l’immobilier par arrondissement par rapport au nombre de cafés offrant de l’or noir à 1€ par arrondissement
x <- df2$immo
y <- df2$nb_café

# Scatter plot with main and axis titles, filled points (pch = 19) and no
# frame around the plot.
plot(x, y,
     main = "Prix de l'immobilier vs. café à 1€",
     xlab = "Prix de l'immobilier", ylab = "café à 1€",
     pch = 19, frame.plot = FALSE)

# Pearson correlation test between price and café count.
res <- cor.test(df2$immo, df2$nb_café, method = "pearson")
res
##
## Pearson's product-moment correlation
##
## data: df2$immo and df2$nb_café
## t = -1.3601, df = 18, p-value = 0.1906
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6587975 0.1586837
## sample estimates:
## cor
## -0.3052737
The p-value of the test is 0.1906, which is greater than the significance level alpha = 0.05. We cannot conclude that the property price (immo) and the number of 1€ cafés (nb_café) are significantly correlated: the correlation coefficient is -0.3052737, with a p-value of 0.1906.
Is the covariation linear? Not clearly: in the plot above, the scatter plot shows a curved pattern, which suggests a nonlinear association between the two variables.
Do the data from each of the 2 variables (x, y) follow a normal distribution? Look at the normality plot —> R function: ggpubr::ggqqplot()
The Shapiro-Wilk test can be performed as follows. Null hypothesis: the data are normally distributed. Alternative hypothesis: the data are not normally distributed.
# Keep columns 2 and 4 of df2, the two variables under study.
df3 <- df2[c(2, 4)]
y <- df3$nb_café
x <- df3$immo

# Shapiro-Wilk normality test on the property prices.
shapiro.test(df3$immo)
##
## Shapiro-Wilk normality test
##
## data: df3$immo
## W = 0.98853, p-value = 0.9957

# Same test on the café counts.
shapiro.test(df3$nb_café)
##
## Shapiro-Wilk normality test
##
## data: df3$nb_café
## W = 0.97542, p-value = 0.8625
From the output, the two p-values are greater than the significance level 0.05, implying that the distributions of the data are not significantly different from the normal distribution. In other words, we can assume normality.
Visual inspection of the data normality using Q-Q plots (quantile-quantile plots). Q-Q plot draws the correlation between a given sample and the normal distribution.
library("ggpubr")
## Warning: package 'ggpubr' was built under R version 3.3.2
## Loading required package: magrittr

# Q-Q plot of each variable against the normal distribution.
ggqqplot(df3$immo, ylab = "Prix de l'immobilier par arrondissement")
ggqqplot(df3$nb_café, ylab = "Nombre de café à 1€")
From the normality plots, we conclude that both populations may come from normal distributions.
# Base-graphics scatter plot of café count against price, without a frame.
plot(x, y,
     main = "Prix de l'immobilier vs. café à 1€",
     xlab = "Prix de l'immobilier", ylab = "nombre de café à 1€",
     pch = 19, frame.plot = FALSE)
# Least-squares line. The model names the df3 columns so that `data = df3`
# is what supplies the variables, not the global x and y.
abline(lm(nb_café ~ immo, data = df3), col = "blue")

library("ggpubr")
# Same relationship with ggpubr: regression line, confidence band and the
# Pearson coefficient printed on the plot.
ggscatter(df3, x = "immo", y = "nb_café",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "Prix de l'immobilier", ylab = "nombre de café à 1€")
• -1 indicates a strong negative correlation : this means that every time x increases, y decreases
• 0 means that there is no association between the two variables (x and y)
• 1 indicates a strong positive correlation : this means that y increases with x