lus = read.table("data.csv", sep=";", fill=NA, header=T, stringsAsFactors = F)
locations = read.table("locatie_specificaties.csv", sep=";", fill=NA, header=T, stringsAsFactors = F)
head(locations)
## measurementSiteReference measurementSiteVersion measurementType
## 1 PZH01_MST_0694_00 7 trafficFlow
## 2 PZH01_MST_0694_00 7 trafficSpeed
## 3 PZH01_MST_0687_01 10 trafficFlow
## 4 PZH01_MST_0687_01 10 trafficSpeed
## 5 PZH01_MST_0013_02 6 trafficFlow
## 6 PZH01_MST_0013_02 6 trafficSpeed
## carriageway alertCDirectionCoded specificLocation offsetDistance
## 1 mainCarriageway negative 22390 1973
## 2 mainCarriageway negative 22390 1973
## 3 mainCarriageway negative 15781 302
## 4 mainCarriageway negative 15781 302
## 5 mainCarriageway negative 22352 1059
## 6 mainCarriageway negative 22352 1059
## nrOfLanes latitude longitude
## 1 1 52.02154 4.619844
## 2 1 52.02154 4.619844
## 3 1 52.23551 4.501682
## 4 1 52.23551 4.501682
## 5 3 52.19071 4.420255
## 6 3 52.19071 4.420255
table(locations$measurementType)
##
## trafficFlow trafficSpeed travelTimeInformation
## 17121 17103 38736
length(unique(locations$measurementSiteReference))
## [1] 55857
hist(lus$trafficFlow_anyVehicle, breaks=50)
Labeled as problem:
table(lus$probleem_)
##
## 0 1
## 492590 99696
head(lus[lus$probleem_==1,])
## time_ id_ specificLane_ probleem_
## 2766 2019-02-27 23:00:00 GEO2A__R_RWSTI363109 lane2 1
## 2778 2019-02-27 23:01:00 GEO2A__R_RWSTI363109 lane2 1
## 2790 2019-02-27 23:02:00 GEO2A__R_RWSTI363109 lane2 1
## 2802 2019-02-27 23:03:00 GEO2A__R_RWSTI363109 lane2 1
## 2814 2019-02-27 23:04:00 GEO2A__R_RWSTI363109 lane2 1
## 2826 2019-02-27 23:05:00 GEO2A__R_RWSTI363109 lane2 1
## trafficFlow_anyVehicle
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficFlow_greaterThan.1.85.and.lessThanOrEqualTo.2.40
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficFlow_greaterThan.11.50.and.lessThanOrEqualTo.12.20
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficFlow_greaterThan.12.20
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficFlow_greaterThan.2.40.and.lessThanOrEqualTo.5.60
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.11.50
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.12.20
## 2766 NA
## 2778 NA
## 2790 NA
## 2802 NA
## 2814 NA
## 2826 NA
## trafficFlow_lessThanOrEqualTo.5.60 trafficSpeed_anyVehicle
## 2766 NA -2
## 2778 NA -2
## 2790 NA -2
## 2802 NA -2
## 2814 NA -2
## 2826 NA -2
## trafficSpeed_greaterThan.1.85.and.lessThanOrEqualTo.2.40
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficSpeed_greaterThan.11.50.and.lessThanOrEqualTo.12.20
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficSpeed_greaterThan.12.20
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficSpeed_greaterThan.2.40.and.lessThanOrEqualTo.5.60
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.11.50
## 2766 -2
## 2778 -2
## 2790 -2
## 2802 -2
## 2814 -2
## 2826 -2
## trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.12.20
## 2766 NA
## 2778 NA
## 2790 NA
## 2802 NA
## 2814 NA
## 2826 NA
## trafficSpeed_lessThanOrEqualTo.5.60
## 2766 NA
## 2778 NA
## 2790 NA
## 2802 NA
## 2814 NA
## 2826 NA
summary(lus[lus$probleem_==1,])
## time_ id_ specificLane_ probleem_
## Length:99696 Length:99696 Length:99696 Min. :1
## Class :character Class :character Class :character 1st Qu.:1
## Mode :character Mode :character Mode :character Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
##
## trafficFlow_anyVehicle
## Min. : -2.0
## 1st Qu.: 0.0
## Median : 60.0
## Mean : 240.8
## 3rd Qu.: 360.0
## Max. :7680.0
##
## trafficFlow_greaterThan.1.85.and.lessThanOrEqualTo.2.40
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 1.59
## 3rd Qu.: 0.00
## Max. :180.00
## NA's :31910
## trafficFlow_greaterThan.11.50.and.lessThanOrEqualTo.12.20
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 0.66
## 3rd Qu.: 0.00
## Max. :300.00
## NA's :31910
## trafficFlow_greaterThan.12.20
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 14.94
## 3rd Qu.: 0.00
## Max. :1260.00
## NA's :8541
## trafficFlow_greaterThan.2.40.and.lessThanOrEqualTo.5.60
## Min. : -2.0
## 1st Qu.: 0.0
## Median : 60.0
## Mean : 221.6
## 3rd Qu.: 360.0
## Max. :3480.0
## NA's :31910
## trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.11.50
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 16.21
## 3rd Qu.: 0.00
## Max. :480.00
## NA's :31910
## trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.12.20
## Min. : -2.00
## 1st Qu.: -2.00
## Median : -2.00
## Mean : 0.53
## 3rd Qu.: 0.00
## Max. :120.00
## NA's :76327
## trafficFlow_lessThanOrEqualTo.5.60 trafficSpeed_anyVehicle
## Min. : -2.00 Min. : -2.00
## 1st Qu.: -2.00 1st Qu.: -1.00
## Median : -2.00 Median : 0.00
## Mean : 8.63 Mean : 36.57
## 3rd Qu.: 0.00 3rd Qu.: 78.00
## Max. :780.00 Max. :428.00
## NA's :76327
## trafficSpeed_greaterThan.1.85.and.lessThanOrEqualTo.2.40
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 2.42
## 3rd Qu.: 0.00
## Max. :398.00
## NA's :31910
## trafficSpeed_greaterThan.11.50.and.lessThanOrEqualTo.12.20
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 0.63
## 3rd Qu.: 0.00
## Max. :130.00
## NA's :31910
## trafficSpeed_greaterThan.12.20
## Min. : -2.000
## 1st Qu.: -2.000
## Median : 0.000
## Mean : 9.223
## 3rd Qu.: 0.000
## Max. :576.000
## NA's :8541
## trafficSpeed_greaterThan.2.40.and.lessThanOrEqualTo.5.60
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 60.00
## Mean : 51.23
## 3rd Qu.: 90.75
## Max. :222.00
## NA's :31910
## trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.11.50
## Min. : -2.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 14.93
## 3rd Qu.: 0.00
## Max. :428.00
## NA's :31910
## trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.12.20
## Min. : -2.00
## 1st Qu.: -2.00
## Median : -2.00
## Mean : 2.76
## 3rd Qu.: -1.00
## Max. :196.00
## NA's :76327
## trafficSpeed_lessThanOrEqualTo.5.60
## Min. : -2.00
## 1st Qu.: -2.00
## Median : -2.00
## Mean : 5.41
## 3rd Qu.: -1.00
## Max. :104.00
## NA's :76327
Seem to all be negative speed. Can be removed.
lus = subset(lus, probleem_ == 0)
It seems almost all traffic flows are divisible by 60. How is this possible?
sort(table(lus$trafficFlow_anyVehicle))
##
## 3300 3360 3420 3480 3240 3180 3120 3000 3060 2940
## 1 1 1 1 4 6 7 8 14 15
## 2880 2820 2700 2760 2640 2520 2580 2400 2460 2340
## 18 23 28 29 32 47 47 61 61 63
## 2280 2220 2160 2100 -2 2040 1980 1920 1860 1800
## 85 90 120 142 150 159 204 232 263 319
## 1740 1680 1620 1560 1500 1440 1380 1320 1260 1200
## 385 472 542 748 906 1123 1384 1788 2192 2767
## 1140 1080 1020 960 900 840 780 720 660 600
## 3542 4084 4950 5674 6420 7152 8136 9238 10321 11826
## 540 480 420 360 300 240 180 120 60 0
## 13350 15724 18326 21235 24955 29583 36085 46400 66180 134871
divided_by_60 = sapply(lus$trafficFlow_anyVehicle, function(x){ x %% 60 == 0})
table(divided_by_60)
## divided_by_60
## FALSE TRUE
## 150 492440
From the data specification: “trafficFlow kolommen / 60 is het aantal voertuigen wat er heeft gereden per voertuigcategorie”
So divide by 60:
hist(lus$trafficSpeed_anyVehicle)
hist(scale(lus$trafficSpeed_anyVehicle[lus$trafficSpeed_anyVehicle > 10]))
sum( scale(lus$trafficSpeed_anyVehicle[lus$trafficSpeed_anyVehicle > 10]) > 3 )
## [1] 577
library(ggplot2)
ggplot(lus, aes(x=trafficSpeed_anyVehicle, trafficFlow_anyVehicle/60)) +
geom_bin2d(bins = 30, color = "white")+
scale_fill_gradient(low = "#00AFBB", high = "#FC4E07")+
theme_minimal()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
lus$time_parsed = ymd_hms(lus$time_)
theme_bw()
## List of 65
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ size : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ size : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0pt 0pt 0pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75pt 0pt 0pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 0pt 2.75pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 2.75pt 0pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 0pt 0pt 2.75pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2pt 0pt 0pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 0pt 2.2pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 2.2pt 0pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 0pt 0pt 2.2pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks :List of 6
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ axis.ticks.length : 'unit' num 2.75pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom: NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.y : NULL
## $ legend.background :List of 5
## ..$ fill : NULL
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ legend.margin : 'margin' num [1:4] 5.5pt 5.5pt 5.5pt 5.5pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ legend.spacing : 'unit' num 11pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key :List of 5
## ..$ fill : chr "white"
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ legend.key.size : 'unit' num 1.2lines
## ..- attr(*, "valid.unit")= int 3
## ..- attr(*, "unit")= chr "lines"
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.align : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.align : NULL
## $ legend.position : chr "right"
## $ legend.direction : NULL
## $ legend.justification : chr "center"
## $ legend.box : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "valid.unit")= int 1
## ..- attr(*, "unit")= chr "cm"
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'unit' num 11pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ panel.background :List of 5
## ..$ fill : chr "white"
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ panel.border :List of 5
## ..$ fill : logi NA
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ panel.spacing : 'unit' num 5.5pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ panel.spacing.x : NULL
## $ panel.spacing.y : NULL
## $ panel.grid :List of 6
## ..$ colour : chr "grey92"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.grid.minor :List of 6
## ..$ colour : NULL
## ..$ size : 'rel' num 0.5
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.ontop : logi FALSE
## $ plot.background :List of 5
## ..$ fill : NULL
## ..$ colour : chr "white"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ plot.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 1.2
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 0pt 5.5pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.subtitle :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0pt 0pt 5.5pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.caption :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : num 1
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 5.5pt 0pt 0pt 0pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.tag :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 1.2
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.tag.position : chr "topleft"
## $ plot.margin : 'margin' num [1:4] 5.5pt 5.5pt 5.5pt 5.5pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ strip.background :List of 5
## ..$ fill : chr "grey85"
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ strip.placement : chr "inside"
## $ strip.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey10"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 4.4pt 4.4pt 4.4pt 4.4pt
## .. ..- attr(*, "valid.unit")= int 8
## .. ..- attr(*, "unit")= chr "pt"
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.x : NULL
## $ strip.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.switch.pad.grid : 'unit' num 2.75pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## $ strip.switch.pad.wrap : 'unit' num 2.75pt
## ..- attr(*, "valid.unit")= int 8
## ..- attr(*, "unit")= chr "pt"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
p=ggplot(subset(lus, id_ %in% unique(lus$id_)[1:5] & specificLane_=="lane1"),
aes(x=time_parsed, y=trafficFlow_anyVehicle, col=id_)) +
geom_line()+
# geom_smooth()+
scale_color_manual(values=c("red","black","green","orange","brown"))
plot(p)
So each measurement time has only one randomly chosen day, so it is impossible to make a time dependent model
Goal: detect outliers based on relative speed vs flow at one spot
Possible additional predictors:
lus$SumFlowLongVehicles = apply(lus[,c("trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.11.50","trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.12.20")],1, function(x){sum(x,na.rm = T)})
hist(lus$SumFlowLongVehicles)
lus$fractionFlowLongVehicles = lus$SumFlowLongVehicles / lus$trafficFlow_anyVehicle
lus$fractionFlowLongVehicles[!is.finite(lus$fractionFlowLongVehicles)] = 0
hist(lus$fractionFlowLongVehicles)
# to do
daylight = read.table("SunriseSunset.txt", sep="\t", header=T)
head(daylight)
## Month Date Sunrise Sunset
## 1 1 01 Tue 8:47:58 16:38:12
## 2 1 02 Wed 8:47:50 16:39:17
## 3 1 03 Thu 8:47:39 16:40:24
## 4 1 04 Fri 8:47:25 16:41:35
## 5 1 05 Sat 8:47:07 16:42:47
## 6 1 06 Sun 8:46:46 16:44:03
daylight$DayOfWeek = sapply(daylight$Date, function(x) {substr(x, start=4, stop=6) })
daylight$DayOfMonth = sapply(daylight$Date, function(x) {as.numeric(substr(x, start=1, stop=2)) })
daylight$Date = apply(daylight[,c("Month","DayOfMonth")],1, function(x){paste0("2019-",x[1],"-",x[2])})
daylight$Date = ymd(daylight$Date)
# to be continued
weather conditions: clear, rain, snow
what lane is it
features = c("id_","specificLane_","trafficFlow_anyVehicle","trafficSpeed_anyVehicle","fractionFlowLongVehicles")
row.names(lus) = paste0("m",1:nrow(lus))
train_set = lus[,features]
for(i in c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle","fractionFlowLongVehicles")){
train_set[,i]=as.numeric(train_set[,i])
}
train_set$trafficFlow_anyVehicle = train_set$trafficFlow_anyVehicle / 60
# train_set = subset(train_set, !is.nan(fractionFlowLongVehicles) & trafficFlow_anyVehicle>=0)
hist(train_set$trafficFlow_anyVehicle)
# lane counts start at the leftmost lane, this doesn't make any sense, so reverse them:
train_set$lane = sapply(train_set$specificLane_, function(x){as.numeric(substr(x,5,5)) })
train_set$RightOrientedLane = NA
train_set$NumberOfLanes = NA
for(id in unique(lus$id_)){
train_set[train_set$id_==id,]$NumberOfLanes = max(train_set[train_set$id_==id,]$lane, na.rm = T)
train_set[train_set$id_==id,]$RightOrientedLane = max(train_set[train_set$id_==id,]$lane, na.rm = T) - train_set[train_set$id_==id,]$lane + 1
}
# install.packages("OutlierDetection")
library(OutlierDetection)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
# show bi-variate data
set.seed(12345)
id = unique(train_set$id_)[sample(1:239,size=1)]
test = nn(train_set[train_set$id_ == id,c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle")], k=500, cutoff=0.995)
plot(test$`Scatter plot`)
Run on the whole dataset, split it into folds to make it computationally faster (otherwise it needs to make a matrix of 492590 by 492590)
First, use purely the speed and flow:
folds = caret::createFolds(train_set$id_, k=250)
all.equal(length(unlist(folds)), nrow(train_set))
## [1] TRUE
features_to_use = c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle")
train_set$outlier = F
for(fold in folds){
tryCatch({
outliers = nnk(train_set[fold,features_to_use], k=length(folds), cutoff=0.995)
if(length(outliers$`Location of Outlier`)==0){next()}
train_set[fold,][outliers$`Location of Outlier`,"outlier"] = TRUE
}, error=function(e){message("ERROR :",conditionMessage(e), "\n")})
}
table(train_set$outlier)
##
## FALSE TRUE
## 491628 962
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following object is masked from 'package:GeneralFunctions':
##
## last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary = train_set %>% group_by(id_) %>% summarize(sumOutliers = sum(outlier, na.rm=T)) %>% arrange(-sumOutliers) %>% as.data.frame()
head(summary)
## id_ sumOutliers
## 1 GEO0K_TIID_976 112
## 2 GEO0C_Z_RWSTI220 93
## 3 GEO2A__R_RWSTI363409 51
## 4 GEO0C_Z_RWSTI2010 50
## 5 GEO2A__R_RWSTI363412 44
## 6 GEO0B_R_RWSTI362568 41
Plot of the loops with the highest number of outliers:
par(mfrow=c(3,3))
for(id in summary$id_[1:27]){
tmp = subset(train_set, id_ == id)
plot(x = tmp$trafficFlow_anyVehicle, y = tmp$trafficSpeed_anyVehicle,
col=ifelse(tmp$outlier,"red",transparent("black",50)),
pch = ifelse(tmp$outlier,20,1),
ylab="speed (km/h)", xlab="cars / minute", main=id)
}
Now, also use some extra predictors:
features_to_use = c("RightOrientedLane", "NumberOfLanes","trafficFlow_anyVehicle",
"trafficSpeed_anyVehicle","fractionFlowLongVehicles")
train_set$outlier = F
for(fold in folds){
tryCatch({
outliers = nnk(train_set[fold,features_to_use], k=length(folds), cutoff=0.995)
if(length(outliers$`Location of Outlier`)==0){next()}
train_set[fold,][outliers$`Location of Outlier`,"outlier"] = TRUE
}, error=function(e){message("ERROR :",conditionMessage(e), "\n")})
}
table(train_set$outlier)
##
## FALSE TRUE
## 491616 974
library(dplyr)
summary = train_set %>% group_by(id_) %>% summarize(sumOutliers = sum(outlier, na.rm=T)) %>% arrange(-sumOutliers) %>% as.data.frame()
head(summary)
## id_ sumOutliers
## 1 GEO0K_TIID_976 105
## 2 GEO0C_Z_RWSTI220 93
## 3 GEO2A__R_RWSTI363409 50
## 4 GEO0C_Z_RWSTI2010 49
## 5 GEO2A__R_RWSTI363412 46
## 6 GEO0B_R_RWSTI362568 41
Plot of the loops with the highest number of outliers:
par(mfrow=c(3,3))
for(id in summary$id_[1:27]){
tmp = subset(train_set, id_ == id)
plot(x = tmp$trafficFlow_anyVehicle, y = tmp$trafficSpeed_anyVehicle,
col=ifelse(tmp$outlier,"red",transparent("black",50)),
pch = ifelse(tmp$outlier,20,1),
ylab="speed (km/h)", xlab="cars / minute", main=id)
}
Looks interesting already, but now high speed is very influential due to its numeric range. I should probably do some scaling:
train_scaled = train_set
for(i in c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle",
"fractionFlowLongVehicles", "lane", "RightOrientedLane","NumberOfLanes")){
train_scaled[,paste0(i,"_scaled")] = norm_01(train_scaled[,i])
}
for(fold in folds){
tryCatch({
outliers = nnk(train_scaled[fold,paste0(features_to_use,"_scaled")], k=length(folds), cutoff=0.995)
if(length(outliers$`Location of Outlier`)==0){next()}
train_scaled[fold,][outliers$`Location of Outlier`,"outlier"] = TRUE
}, error=function(e){message("ERROR :",conditionMessage(e), "\n")})
}
table(train_scaled$outlier)
##
## FALSE TRUE
## 490860 1730
library(dplyr)
summary = train_scaled %>% group_by(id_) %>% summarize(sumOutliers = sum(outlier, na.rm=T)) %>% arrange(-sumOutliers) %>% as.data.frame()
head(summary)
## id_ sumOutliers
## 1 GEO0K_TIID_976 107
## 2 GEO0C_Z_RWSTI220 95
## 3 GEO0C_Z_RWSTI356716 71
## 4 PNH02_TI1007-R 62
## 5 GEO2A__R_RWSTI602 61
## 6 GEO0B_R_RWSTI363369 54
Plot of the loops with the highest number of outliers:
par(mfrow=c(3,3))
for(id in summary$id_[1:18]){
tmp = subset(train_scaled, id_ == id)
plot(x = tmp$trafficFlow_anyVehicle, y = tmp$trafficSpeed_anyVehicle,
col=ifelse(tmp$outlier,"red",transparent("black",50)),
pch = ifelse(tmp$outlier,20,1),
ylab="speed (km/h)", xlab="cars / minute", main=id)
}