This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(sf)
library(tidyverse)
library(broom)
#library(ggplot2)
devtools::install_github("tidyverse/ggplot2")
require(ggplot2)
We need these libraries to continue. geom_sf is only found in the newer version of ggplot2 that we install from github.
munis <- st_read("gem_2016.geojson", crs = 28992)
munis <- munis %>%
filter(WATER == 'NEE') %>%
select(GM_CODE, GM_NAAM)
munis.orig <- munis
provinces <- st_read("provinces.geojson") %>%
st_transform(crs=28992)
utrecht <- provinces %>%
filter(name=="Utrecht")
munis <- munis %>%
filter(st_intersects(utrecht, st_centroid(munis), sparse=F))
st_intersect filters geometrically intersecting shapes. We can reduce our huge dataset into a smaller and more focused dataset.
commute = read_csv("commuting.csv") %>%
select(source, sink, weight) %>%
rename(interaction = weight)
commute.orig <- commute
# This creates a field in munis that has ID of its GM_CODE without "GM".
# When this is done, we are able to match these id values to the ones in commute.
munis <- munis%>%
mutate(id= as.numeric(str_replace(GM_CODE, "GM", "")))
commute <- commute %>%
filter(source %in% munis$id & sink %in% munis$id)
munis.centroid <-
st_centroid(munis) %>%
select(id)
commute <- commute %>%
left_join(munis.centroid, by=c("source" = "id")) %>%
left_join(munis.centroid, by=c("sink" = "id")) %>%
rowwise() %>%
mutate(geometry = st_combine(c(geometry.x, geometry.y)) %>%
st_cast("LINESTRING")) %>%
select(-geometry.x, -geometry.y) %>% st_as_sf(crs = 28992)
# this is the plot without the background polygons.
# ggplot(commute) + geom_sf()
ggplot(munis) + geom_sf() + geom_sf(data=commute)

commute <- commute %>%
filter(sink != source) %>%
filter(interaction > 20) %>%
mutate(lineWidth = interaction/ max(interaction) * 10)
ggplot(commute) + geom_sf(aes(size=lineWidth)) + scale_size_identity()

The commutes seem to be focused onto one area. It should have a single centre.
residents <- read_csv("residents.csv") %>%
select(id, weight)
jobs <- read_csv("jobs.csv") %>%
select(id, weight)
dist <- st_distance(x = munis.centroid, y=munis.centroid)
# solution from https://github.com/tidyverse/tidyr/issues/437
rownames(dist) <- munis.centroid$id
colnames(dist) <- munis.centroid$id
dist <- list(source = rownames(dist)[row(dist)] %||% row(dist),
sink = colnames(dist)[col(dist)] %||% col(dist),
distance = dist) %>%
map_dfc(as.vector) %>%
mutate(source = as.numeric(source)) %>%
mutate(sink = as.numeric(sink))
commute <- commute %>%
left_join(dist) %>%
left_join(residents, by = c('source' = 'id')) %>%
rename(residents = weight) %>%
left_join(jobs, by=c('sink' = 'id')) %>%
rename(jobs = weight)
commute
Simple feature collection with 519 features and 7 fields
geometry type: LINESTRING
dimension: XY
bbox: xmin: 119109.2 ymin: 442367.9 xmax: 166907.8 ymax: 473542.3
epsg (SRID): 28992
proj4string: +proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs
ggplot(commute, aes(interaction, distance)) + geom_point()

ggplot(commute, aes(log(interaction), log(distance))) + geom_point()

The log plot seem to have a more linear relation than the normal plot. A log-log relationship would better describe the data points than a linear one.
lm(data = commute, formula = interaction ~ residents + jobs + distance) %>%
summary()
Call:
lm(formula = interaction ~ residents + jobs + distance, data = commute)
Residuals:
Min 1Q Median 3Q Max
-15770 -1150 -309 922 40734
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.553e+03 5.241e+02 4.872 1.47e-06 ***
residents 3.234e-02 3.423e-03 9.449 < 2e-16 ***
jobs 9.666e-02 4.960e-03 19.486 < 2e-16 ***
distance -1.965e-01 2.255e-02 -8.713 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4632 on 515 degrees of freedom
Multiple R-squared: 0.5032, Adjusted R-squared: 0.5003
F-statistic: 173.9 on 3 and 515 DF, p-value: < 2.2e-16
The model does not fit extremely well, but a certain level of correlation can be seen from the R value.
ggplot(commute, aes(interaction)) + geom_histogram()

This seems to be well estimated by a poisson distribution.
model <- glm(data = commute, formula = interaction ~ log(residents) + log(jobs) + log(distance),
family = poisson())
r2 <- function(empirical, fitted) {
return(cor(empirical, fitted)^2)
}
r2(commute$interaction, fitted(model))
[1] 0.8639754
tidy(model)
The low p values for all rows show that these results are statistically significant. The estimates show how much each value would change when another value changes. For example, for every increase in resident, an estimated 0.85853 count would increase.
commute <- commute %>%
mutate(fitted = fitted(model)) %>%
mutate(residual = interaction - fitted) %>%
mutate(residualSign = sign(residual))
commute %>%
mutate(lineWidth = fitted/ max(fitted)*10) %>%
ggplot() + geom_sf(aes(size = lineWidth)) + scale_size_identity()

commute %>%
mutate(lineWidth = abs(residual) / max(residual) * 10) %>%
ggplot() + geom_sf(aes(size=lineWidth)) + scale_size_identity()

commute %>%
mutate(lineWidth = abs(residual) / max(residual) *10) %>%
ggplot() + geom_sf(aes(color = factor(residualSign), size=lineWidth)) + scale_size_identity()

The plots look similar as the previous one.

commute.orig.sinks <- commute.orig %>%
group_by(sink) %>%
summarise(sum(interaction))
commute.orig.sinks <-
commute.orig.sinks[order(commute.orig.sinks$"sum(interaction)", decreasing = TRUE),] %>%
left_join(munis.orig.centroid.nogeom, by=c("sink"="id"))
commute.orig.sinks
Simple feature collection with 350 features and 3 fields
geometry type: GEOMETRY
dimension: XY
bbox: xmin: 24537.79 ymin: 309545.2 xmax: 271400.9 ymax: 604580
epsg (SRID): 28992
proj4string: +proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs
The top 3 destinations in Netherlands are Amsterdam, Rotterdam and ’s-Gravenhage.
dist <- st_distance(x = munis.orig.centroid, y=munis.orig.centroid)
# solution from https://github.com/tidyverse/tidyr/issues/437
rownames(dist) <- munis.orig.centroid$id
colnames(dist) <- munis.orig.centroid$id
dist <- list(source = rownames(dist)[row(dist)] %||% row(dist),
sink = colnames(dist)[col(dist)] %||% col(dist),
distance = dist) %>%
map_dfc(as.vector) %>%
mutate(source = as.numeric(source)) %>%
mutate(sink = as.numeric(sink))
commute.orig <- commute.orig %>%
left_join(dist) %>%
left_join(residents, by = c('source' = 'id')) %>%
rename(residents = weight) %>%
left_join(jobs, by=c('sink' = 'id')) %>%
rename(jobs = weight)
Joining, by = c("source", "sink", "distance")
commute.orig
Simple feature collection with 2594 features and 9 fields
geometry type: LINESTRING
dimension: XY
bbox: xmin: 24537.79 ymin: 309545.2 xmax: 271400.9 ymax: 604580
epsg (SRID): 28992
proj4string: +proj=sterea +lat_0=52.15616055555555 +lon_0=5.38763888888889 +k=0.9999079 +x_0=155000 +y_0=463000 +ellps=bessel +towgs84=565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812 +units=m +no_defs
ggplot(commute.orig, aes(interaction, distance)) + geom_point()

ggplot(commute.orig, aes(log(interaction), log(distance))) + geom_point()

lm(data = commute.orig, formula = interaction ~ residents + jobs + distance) %>%
summary()
Call:
lm(formula = interaction ~ residents + jobs + distance, data = commute.orig)
Residuals:
Min 1Q Median 3Q Max
-21961 -3370 -1490 1508 105061
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.347e+03 2.684e+02 27.37 <2e-16 ***
residents 2.217e-02 1.412e-03 15.70 <2e-16 ***
jobs 5.148e-02 1.989e-03 25.88 <2e-16 ***
distance -2.581e-01 1.220e-02 -21.16 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8618 on 2590 degrees of freedom
Multiple R-squared: 0.2417, Adjusted R-squared: 0.2408
F-statistic: 275.2 on 3 and 2590 DF, p-value: < 2.2e-16
commute.orig <- commute.orig[commute.orig$distance != 0,]
model <- glm(data = commute.orig, formula = interaction ~ log(residents) + log(jobs) + log(distance),
family = poisson())
r2 <- function(empirical, fitted) {
return(cor(empirical, fitted)^2)
}
r2(commute.orig$interaction, fitted(model))
[1] 0.3696744
tidy(model)
The Rsquared value is very low. Even on the log log graph, which showed a relatively correlated graph for only Utrecht, did not have any prominent insights when plotted with data of the whole Netherlands. This shows that the distance does not have a close correlation with the amount of interactions a location have for the whole of Netherlands. This should be because the places are not interchangeable, if one’s job was in Amsterdam one would have to go to Amsterdam to work even if the distance is long.
It still, however, seems to fit poisson quite nicely.
---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

```{r}
library(sf)
library(tidyverse)
library(broom)
#library(ggplot2)
devtools::install_github("tidyverse/ggplot2")
require(ggplot2)
```
We need these libraries to continue.
geom_sf is only found in the newer version of ggplot2 that we install from github.


```{r}
munis <- st_read("gem_2016.geojson", crs = 28992)

munis <- munis %>%
  filter(WATER == 'NEE') %>%
  select(GM_CODE, GM_NAAM)

munis.orig <- munis

provinces <- st_read("provinces.geojson") %>%
  st_transform(crs=28992) 

utrecht <- provinces %>%
  filter(name=="Utrecht") 

munis <- munis %>%
  filter(st_intersects(utrecht, st_centroid(munis), sparse=F))
```
st_intersect filters geometrically intersecting shapes.
We can reduce our huge dataset into a smaller and more focused dataset.

```{r}

commute = read_csv("commuting.csv") %>%
  select(source, sink, weight) %>%
  rename(interaction = weight)

commute.orig <- commute

# This creates a field in munis that has ID of its GM_CODE without "GM".
# When this is done, we are able to match these id values to the ones in commute.
munis <- munis%>%
  mutate(id= as.numeric(str_replace(GM_CODE, "GM", "")))

commute <- commute %>%
  filter(source %in% munis$id & sink %in% munis$id)

munis.centroid <- 
  st_centroid(munis) %>%
  select(id)

commute <- commute %>% 
  left_join(munis.centroid, by=c("source" = "id")) %>%
  left_join(munis.centroid, by=c("sink" = "id")) %>%
  rowwise() %>%
  mutate(geometry = st_combine(c(geometry.x, geometry.y)) %>% 
           st_cast("LINESTRING")) %>%
  select(-geometry.x, -geometry.y) %>% st_as_sf(crs = 28992)

# this is the plot without the background polygons.
# ggplot(commute) + geom_sf() 
```

```{r}
ggplot(munis) + geom_sf() + geom_sf(data=commute)

```

```{r}
commute <- commute %>%
  filter(sink != source) %>%
  filter(interaction > 20) %>%
  mutate(lineWidth = interaction/ max(interaction) * 10)

ggplot(commute) + geom_sf(aes(size=lineWidth)) + scale_size_identity()

```
The commutes seem to be focused onto one area. It should have a single centre. 


```{r}
residents <- read_csv("residents.csv") %>%
  select(id, weight)
jobs <- read_csv("jobs.csv") %>%
  select(id, weight)
dist <- st_distance(x = munis.centroid, y=munis.centroid)
# solution from https://github.com/tidyverse/tidyr/issues/437
rownames(dist) <- munis.centroid$id
colnames(dist) <- munis.centroid$id
dist <- list(source = rownames(dist)[row(dist)] %||% row(dist),
             sink = colnames(dist)[col(dist)] %||% col(dist), 
                             distance = dist) %>%
  map_dfc(as.vector) %>%
  mutate(source = as.numeric(source)) %>%
  mutate(sink = as.numeric(sink))

commute <- commute %>%
  left_join(dist) %>%
  left_join(residents, by = c('source' = 'id')) %>%
  rename(residents = weight) %>%
  left_join(jobs, by=c('sink' = 'id')) %>%
  rename(jobs = weight)
```

```{r}
commute
```

```{r}
ggplot(commute, aes(interaction, distance)) + geom_point()

ggplot(commute, aes(log(interaction), log(distance))) + geom_point()
```
The log plot seem to have a more linear relation than the normal plot.
A log-log relationship would better describe the data points than a linear one.

```{r}
lm(data = commute, formula = interaction ~ residents + jobs + distance) %>% 
  summary()
```
The model does not fit extremely well, but a certain level of correlation can be seen from the R value.

```{r}
ggplot(commute, aes(interaction)) + geom_histogram()
```
This seems to be well estimated by a poisson distribution. 

```{r}

model <- glm(data = commute, formula = interaction ~ log(residents) + log(jobs) + log(distance),
             family = poisson())
r2 <- function(empirical, fitted) {
  return(cor(empirical, fitted)^2)
}
r2(commute$interaction, fitted(model))

tidy(model)
```
The low p values for all rows show that these results are statistically significant.
The estimates show how much each value would change when another value changes. 
For example, for every increase in resident, an estimated 0.85853 count would increase.

```{r}
commute <- commute %>%
  mutate(fitted = fitted(model)) %>%
  mutate(residual = interaction - fitted) %>%
  mutate(residualSign = sign(residual))

commute %>%
  mutate(lineWidth = fitted/ max(fitted)*10) %>%
  ggplot() + geom_sf(aes(size = lineWidth)) + scale_size_identity()

commute %>%
  mutate(lineWidth = abs(residual) / max(residual) * 10) %>%
  ggplot() + geom_sf(aes(size=lineWidth)) + scale_size_identity()

commute %>%
  mutate(lineWidth = abs(residual) / max(residual) *10) %>%
  ggplot() + geom_sf(aes(color = factor(residualSign), size=lineWidth)) + scale_size_identity()

```
The plots look similar as the previous one.

```{r}
commute.orig <- commute.orig %>%
  filter(interaction > 2000) %>%
  mutate(lineWidth = interaction/ max(interaction) * 10)

munis.orig <- munis.orig %>%
  mutate(id= as.numeric(str_replace(GM_CODE, "GM", "")))

munis.orig.centroid <- 
  st_centroid(munis.orig) %>%
  select(id, GM_NAAM)

munis.orig.centroid.nogeom <- munis.orig.centroid
st_geometry(munis.orig.centroid.nogeom) <- NULL

commute.orig <- commute.orig %>%
  left_join(munis.orig.centroid, by=c("source" = "id")) %>%
  left_join(munis.orig.centroid, by=c("sink" = "id")) %>%
  rowwise() %>%
  mutate(geometry = st_combine(c(geometry.x,  geometry.y)) %>% 
           st_cast("LINESTRING")) %>%
  select(-geometry.x, -geometry.y) %>% st_as_sf(crs = 28992)

ggplot(commute.orig) + geom_sf(aes(size=lineWidth)) + scale_size_identity()
```

```{r}
commute.orig.sinks <- commute.orig %>%
  group_by(sink) %>%
  summarise(sum(interaction)) 


commute.orig.sinks <- 
commute.orig.sinks[order(commute.orig.sinks$"sum(interaction)", decreasing = TRUE),] %>%
  left_join(munis.orig.centroid.nogeom, by=c("sink"="id"))


commute.orig.sinks
```
The top 3 destinations in Netherlands are Amsterdam, Rotterdam and 's-Gravenhage.

```{r}
dist <- st_distance(x = munis.orig.centroid, y=munis.orig.centroid)
# solution from https://github.com/tidyverse/tidyr/issues/437
rownames(dist) <- munis.orig.centroid$id
colnames(dist) <- munis.orig.centroid$id
dist <- list(source = rownames(dist)[row(dist)] %||% row(dist),
             sink = colnames(dist)[col(dist)] %||% col(dist), 
                             distance = dist) %>%
  map_dfc(as.vector) %>%
  mutate(source = as.numeric(source)) %>%
  mutate(sink = as.numeric(sink))

commute.orig <- commute.orig %>%
  left_join(dist) %>%
  left_join(residents, by = c('source' = 'id')) %>%
  rename(residents = weight) %>%
  left_join(jobs, by=c('sink' = 'id')) %>%
  rename(jobs = weight)

commute.orig
```
```{r}

ggplot(commute.orig, aes(interaction, distance)) + geom_point()

ggplot(commute.orig, aes(log(interaction), log(distance))) + geom_point()

```
```{r}
lm(data = commute.orig, formula = interaction ~ residents + jobs + distance) %>% 
  summary()
```
```{r}
 commute.orig <- commute.orig[commute.orig$distance != 0,]

model <- glm(data = commute.orig, formula = interaction ~ log(residents) + log(jobs) + log(distance),
             family = poisson())
r2 <- function(empirical, fitted) {
  return(cor(empirical, fitted)^2)
}
r2(commute.orig$interaction, fitted(model))

tidy(model)

```


The Rsquared value is very low.
Even on the log log graph, which showed a relatively correlated graph for only Utrecht, did not have any prominent insights when plotted with data of the whole Netherlands. This shows that the distance does not have a close correlation with the amount of interactions a location have for the whole of Netherlands. This should be because the places are not interchangeable, if one's job was in Amsterdam one would have to go to Amsterdam to work even if the distance is long. 

It still, however, seems to fit poisson quite nicely.


