Read Data

lus = read.table("data.csv", sep=";", fill=NA, header=T, stringsAsFactors = F)
locations = read.table("locatie_specificaties.csv", sep=";", fill=NA, header=T, stringsAsFactors = F)

Locations

head(locations)
##   measurementSiteReference measurementSiteVersion measurementType
## 1        PZH01_MST_0694_00                      7     trafficFlow
## 2        PZH01_MST_0694_00                      7    trafficSpeed
## 3        PZH01_MST_0687_01                     10     trafficFlow
## 4        PZH01_MST_0687_01                     10    trafficSpeed
## 5        PZH01_MST_0013_02                      6     trafficFlow
## 6        PZH01_MST_0013_02                      6    trafficSpeed
##       carriageway alertCDirectionCoded specificLocation offsetDistance
## 1 mainCarriageway             negative            22390           1973
## 2 mainCarriageway             negative            22390           1973
## 3 mainCarriageway             negative            15781            302
## 4 mainCarriageway             negative            15781            302
## 5 mainCarriageway             negative            22352           1059
## 6 mainCarriageway             negative            22352           1059
##   nrOfLanes latitude longitude
## 1         1 52.02154  4.619844
## 2         1 52.02154  4.619844
## 3         1 52.23551  4.501682
## 4         1 52.23551  4.501682
## 5         3 52.19071  4.420255
## 6         3 52.19071  4.420255
table(locations$measurementType)
## 
##           trafficFlow          trafficSpeed travelTimeInformation 
##                 17121                 17103                 38736
length(unique(locations$measurementSiteReference))
## [1] 55857

Measurements

hist(lus$trafficFlow_anyVehicle, breaks=50)

Labeled as problem:

table(lus$probleem_)
## 
##      0      1 
## 492590  99696
head(lus[lus$probleem_==1,])
##                    time_                  id_ specificLane_ probleem_
## 2766 2019-02-27 23:00:00 GEO2A__R_RWSTI363109         lane2         1
## 2778 2019-02-27 23:01:00 GEO2A__R_RWSTI363109         lane2         1
## 2790 2019-02-27 23:02:00 GEO2A__R_RWSTI363109         lane2         1
## 2802 2019-02-27 23:03:00 GEO2A__R_RWSTI363109         lane2         1
## 2814 2019-02-27 23:04:00 GEO2A__R_RWSTI363109         lane2         1
## 2826 2019-02-27 23:05:00 GEO2A__R_RWSTI363109         lane2         1
##      trafficFlow_anyVehicle
## 2766                     -2
## 2778                     -2
## 2790                     -2
## 2802                     -2
## 2814                     -2
## 2826                     -2
##      trafficFlow_greaterThan.1.85.and.lessThanOrEqualTo.2.40
## 2766                                                      -2
## 2778                                                      -2
## 2790                                                      -2
## 2802                                                      -2
## 2814                                                      -2
## 2826                                                      -2
##      trafficFlow_greaterThan.11.50.and.lessThanOrEqualTo.12.20
## 2766                                                        -2
## 2778                                                        -2
## 2790                                                        -2
## 2802                                                        -2
## 2814                                                        -2
## 2826                                                        -2
##      trafficFlow_greaterThan.12.20
## 2766                            -2
## 2778                            -2
## 2790                            -2
## 2802                            -2
## 2814                            -2
## 2826                            -2
##      trafficFlow_greaterThan.2.40.and.lessThanOrEqualTo.5.60
## 2766                                                      -2
## 2778                                                      -2
## 2790                                                      -2
## 2802                                                      -2
## 2814                                                      -2
## 2826                                                      -2
##      trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.11.50
## 2766                                                       -2
## 2778                                                       -2
## 2790                                                       -2
## 2802                                                       -2
## 2814                                                       -2
## 2826                                                       -2
##      trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.12.20
## 2766                                                       NA
## 2778                                                       NA
## 2790                                                       NA
## 2802                                                       NA
## 2814                                                       NA
## 2826                                                       NA
##      trafficFlow_lessThanOrEqualTo.5.60 trafficSpeed_anyVehicle
## 2766                                 NA                      -2
## 2778                                 NA                      -2
## 2790                                 NA                      -2
## 2802                                 NA                      -2
## 2814                                 NA                      -2
## 2826                                 NA                      -2
##      trafficSpeed_greaterThan.1.85.and.lessThanOrEqualTo.2.40
## 2766                                                       -2
## 2778                                                       -2
## 2790                                                       -2
## 2802                                                       -2
## 2814                                                       -2
## 2826                                                       -2
##      trafficSpeed_greaterThan.11.50.and.lessThanOrEqualTo.12.20
## 2766                                                         -2
## 2778                                                         -2
## 2790                                                         -2
## 2802                                                         -2
## 2814                                                         -2
## 2826                                                         -2
##      trafficSpeed_greaterThan.12.20
## 2766                             -2
## 2778                             -2
## 2790                             -2
## 2802                             -2
## 2814                             -2
## 2826                             -2
##      trafficSpeed_greaterThan.2.40.and.lessThanOrEqualTo.5.60
## 2766                                                       -2
## 2778                                                       -2
## 2790                                                       -2
## 2802                                                       -2
## 2814                                                       -2
## 2826                                                       -2
##      trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.11.50
## 2766                                                        -2
## 2778                                                        -2
## 2790                                                        -2
## 2802                                                        -2
## 2814                                                        -2
## 2826                                                        -2
##      trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.12.20
## 2766                                                        NA
## 2778                                                        NA
## 2790                                                        NA
## 2802                                                        NA
## 2814                                                        NA
## 2826                                                        NA
##      trafficSpeed_lessThanOrEqualTo.5.60
## 2766                                  NA
## 2778                                  NA
## 2790                                  NA
## 2802                                  NA
## 2814                                  NA
## 2826                                  NA
summary(lus[lus$probleem_==1,])
##     time_               id_            specificLane_        probleem_
##  Length:99696       Length:99696       Length:99696       Min.   :1  
##  Class :character   Class :character   Class :character   1st Qu.:1  
##  Mode  :character   Mode  :character   Mode  :character   Median :1  
##                                                           Mean   :1  
##                                                           3rd Qu.:1  
##                                                           Max.   :1  
##                                                                      
##  trafficFlow_anyVehicle
##  Min.   :  -2.0        
##  1st Qu.:   0.0        
##  Median :  60.0        
##  Mean   : 240.8        
##  3rd Qu.: 360.0        
##  Max.   :7680.0        
##                        
##  trafficFlow_greaterThan.1.85.and.lessThanOrEqualTo.2.40
##  Min.   : -2.00                                         
##  1st Qu.:  0.00                                         
##  Median :  0.00                                         
##  Mean   :  1.59                                         
##  3rd Qu.:  0.00                                         
##  Max.   :180.00                                         
##  NA's   :31910                                          
##  trafficFlow_greaterThan.11.50.and.lessThanOrEqualTo.12.20
##  Min.   : -2.00                                           
##  1st Qu.:  0.00                                           
##  Median :  0.00                                           
##  Mean   :  0.66                                           
##  3rd Qu.:  0.00                                           
##  Max.   :300.00                                           
##  NA's   :31910                                            
##  trafficFlow_greaterThan.12.20
##  Min.   :  -2.00              
##  1st Qu.:   0.00              
##  Median :   0.00              
##  Mean   :  14.94              
##  3rd Qu.:   0.00              
##  Max.   :1260.00              
##  NA's   :8541                 
##  trafficFlow_greaterThan.2.40.and.lessThanOrEqualTo.5.60
##  Min.   :  -2.0                                         
##  1st Qu.:   0.0                                         
##  Median :  60.0                                         
##  Mean   : 221.6                                         
##  3rd Qu.: 360.0                                         
##  Max.   :3480.0                                         
##  NA's   :31910                                          
##  trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.11.50
##  Min.   : -2.00                                          
##  1st Qu.:  0.00                                          
##  Median :  0.00                                          
##  Mean   : 16.21                                          
##  3rd Qu.:  0.00                                          
##  Max.   :480.00                                          
##  NA's   :31910                                           
##  trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.12.20
##  Min.   : -2.00                                          
##  1st Qu.: -2.00                                          
##  Median : -2.00                                          
##  Mean   :  0.53                                          
##  3rd Qu.:  0.00                                          
##  Max.   :120.00                                          
##  NA's   :76327                                           
##  trafficFlow_lessThanOrEqualTo.5.60 trafficSpeed_anyVehicle
##  Min.   : -2.00                     Min.   : -2.00         
##  1st Qu.: -2.00                     1st Qu.: -1.00         
##  Median : -2.00                     Median :  0.00         
##  Mean   :  8.63                     Mean   : 36.57         
##  3rd Qu.:  0.00                     3rd Qu.: 78.00         
##  Max.   :780.00                     Max.   :428.00         
##  NA's   :76327                                             
##  trafficSpeed_greaterThan.1.85.and.lessThanOrEqualTo.2.40
##  Min.   : -2.00                                          
##  1st Qu.:  0.00                                          
##  Median :  0.00                                          
##  Mean   :  2.42                                          
##  3rd Qu.:  0.00                                          
##  Max.   :398.00                                          
##  NA's   :31910                                           
##  trafficSpeed_greaterThan.11.50.and.lessThanOrEqualTo.12.20
##  Min.   : -2.00                                            
##  1st Qu.:  0.00                                            
##  Median :  0.00                                            
##  Mean   :  0.63                                            
##  3rd Qu.:  0.00                                            
##  Max.   :130.00                                            
##  NA's   :31910                                             
##  trafficSpeed_greaterThan.12.20
##  Min.   : -2.000               
##  1st Qu.: -2.000               
##  Median :  0.000               
##  Mean   :  9.223               
##  3rd Qu.:  0.000               
##  Max.   :576.000               
##  NA's   :8541                  
##  trafficSpeed_greaterThan.2.40.and.lessThanOrEqualTo.5.60
##  Min.   : -2.00                                          
##  1st Qu.:  0.00                                          
##  Median : 60.00                                          
##  Mean   : 51.23                                          
##  3rd Qu.: 90.75                                          
##  Max.   :222.00                                          
##  NA's   :31910                                           
##  trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.11.50
##  Min.   : -2.00                                           
##  1st Qu.:  0.00                                           
##  Median :  0.00                                           
##  Mean   : 14.93                                           
##  3rd Qu.:  0.00                                           
##  Max.   :428.00                                           
##  NA's   :31910                                            
##  trafficSpeed_greaterThan.5.60.and.lessThanOrEqualTo.12.20
##  Min.   : -2.00                                           
##  1st Qu.: -2.00                                           
##  Median : -2.00                                           
##  Mean   :  2.76                                           
##  3rd Qu.: -1.00                                           
##  Max.   :196.00                                           
##  NA's   :76327                                            
##  trafficSpeed_lessThanOrEqualTo.5.60
##  Min.   : -2.00                     
##  1st Qu.: -2.00                     
##  Median : -2.00                     
##  Mean   :  5.41                     
##  3rd Qu.: -1.00                     
##  Max.   :104.00                     
##  NA's   :76327

Seem to all be negative speed. Can be removed.

lus = subset(lus, probleem_ == 0)

Traffic flow

It seems almost all traffic flows are divisible by 60. How is this possible?

sort(table(lus$trafficFlow_anyVehicle))
## 
##   3300   3360   3420   3480   3240   3180   3120   3000   3060   2940 
##      1      1      1      1      4      6      7      8     14     15 
##   2880   2820   2700   2760   2640   2520   2580   2400   2460   2340 
##     18     23     28     29     32     47     47     61     61     63 
##   2280   2220   2160   2100     -2   2040   1980   1920   1860   1800 
##     85     90    120    142    150    159    204    232    263    319 
##   1740   1680   1620   1560   1500   1440   1380   1320   1260   1200 
##    385    472    542    748    906   1123   1384   1788   2192   2767 
##   1140   1080   1020    960    900    840    780    720    660    600 
##   3542   4084   4950   5674   6420   7152   8136   9238  10321  11826 
##    540    480    420    360    300    240    180    120     60      0 
##  13350  15724  18326  21235  24955  29583  36085  46400  66180 134871
divided_by_60 = sapply(lus$trafficFlow_anyVehicle, function(x){ x %% 60 == 0})
table(divided_by_60)
## divided_by_60
##  FALSE   TRUE 
##    150 492440

From the data specification: “trafficFlow kolommen / 60 is het aantal voertuigen wat er heeft gereden per voertuigcategorie”

So divide by 60:

Average Speed

hist(lus$trafficSpeed_anyVehicle)

hist(scale(lus$trafficSpeed_anyVehicle[lus$trafficSpeed_anyVehicle > 10]))

sum( scale(lus$trafficSpeed_anyVehicle[lus$trafficSpeed_anyVehicle > 10]) > 3 )
## [1] 577

Speed vs Flow

library(ggplot2)

ggplot(lus, aes(x=trafficSpeed_anyVehicle, trafficFlow_anyVehicle/60)) +
  geom_bin2d(bins = 30, color = "white")+
  scale_fill_gradient(low =  "#00AFBB", high = "#FC4E07")+
  theme_minimal()

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
lus$time_parsed = ymd_hms(lus$time_)

theme_bw()
## List of 65
##  $ line                      :List of 6
##   ..$ colour       : chr "black"
##   ..$ size         : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                      :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ size         : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                      :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 0pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75pt 0pt 0pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 2.75pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 2.75pt 0pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.right        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 0pt 2.75pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2pt 0pt 0pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 2.2pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 2.2pt 0pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.right         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 0pt 2.2pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                :List of 6
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ axis.ticks.length         : 'unit' num 2.75pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ axis.ticks.length.x       : NULL
##  $ axis.ticks.length.x.top   : NULL
##  $ axis.ticks.length.x.bottom: NULL
##  $ axis.ticks.length.y       : NULL
##  $ axis.ticks.length.y.left  : NULL
##  $ axis.ticks.length.y.right : NULL
##  $ axis.line                 : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x               : NULL
##  $ axis.line.y               : NULL
##  $ legend.background         :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.margin             : 'margin' num [1:4] 5.5pt 5.5pt 5.5pt 5.5pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ legend.spacing            : 'unit' num 11pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ legend.spacing.x          : NULL
##  $ legend.spacing.y          : NULL
##  $ legend.key                :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.key.size           : 'unit' num 1.2lines
##   ..- attr(*, "valid.unit")= int 3
##   ..- attr(*, "unit")= chr "lines"
##  $ legend.key.height         : NULL
##  $ legend.key.width          : NULL
##  $ legend.text               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align         : NULL
##  $ legend.title              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align        : NULL
##  $ legend.position           : chr "right"
##  $ legend.direction          : NULL
##  $ legend.justification      : chr "center"
##  $ legend.box                : NULL
##  $ legend.box.margin         : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "valid.unit")= int 1
##   ..- attr(*, "unit")= chr "cm"
##  $ legend.box.background     : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing        : 'unit' num 11pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ panel.background          :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.border              :List of 5
##   ..$ fill         : logi NA
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.spacing             : 'unit' num 5.5pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ panel.spacing.x           : NULL
##  $ panel.spacing.y           : NULL
##  $ panel.grid                :List of 6
##   ..$ colour       : chr "grey92"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.minor          :List of 6
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.ontop               : logi FALSE
##  $ plot.background           :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : chr "white"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ plot.title                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 5.5pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.subtitle             :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0pt 0pt 5.5pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 5.5pt 0pt 0pt 0pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag                  :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag.position         : chr "topleft"
##  $ plot.margin               : 'margin' num [1:4] 5.5pt 5.5pt 5.5pt 5.5pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ strip.background          :List of 5
##   ..$ fill         : chr "grey85"
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ strip.placement           : chr "inside"
##  $ strip.text                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey10"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 4.4pt 4.4pt 4.4pt 4.4pt
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x              : NULL
##  $ strip.text.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.switch.pad.grid     : 'unit' num 2.75pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  $ strip.switch.pad.wrap     : 'unit' num 2.75pt
##   ..- attr(*, "valid.unit")= int 8
##   ..- attr(*, "unit")= chr "pt"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE
p=ggplot(subset(lus, id_ %in% unique(lus$id_)[1:5] & specificLane_=="lane1"), 
         aes(x=time_parsed, y=trafficFlow_anyVehicle, col=id_))  +
  geom_line()+
  # geom_smooth()+
  scale_color_manual(values=c("red","black","green","orange","brown"))

plot(p)

So each measurement time has only one randomly chosen day, so it is impossible to make a time dependent model

Speed vs Flow

Goal: detect outliers based on relative speed vs flow at one spot

Possible additional predictors:

lus$SumFlowLongVehicles = apply(lus[,c("trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.11.50","trafficFlow_greaterThan.5.60.and.lessThanOrEqualTo.12.20")],1, function(x){sum(x,na.rm = T)})
hist(lus$SumFlowLongVehicles)

lus$fractionFlowLongVehicles = lus$SumFlowLongVehicles / lus$trafficFlow_anyVehicle
lus$fractionFlowLongVehicles[!is.finite(lus$fractionFlowLongVehicles)] = 0

hist(lus$fractionFlowLongVehicles)

# to do
daylight = read.table("SunriseSunset.txt", sep="\t", header=T)
head(daylight)
##   Month   Date Sunrise   Sunset
## 1     1 01 Tue 8:47:58 16:38:12
## 2     1 02 Wed 8:47:50 16:39:17
## 3     1 03 Thu 8:47:39 16:40:24
## 4     1 04 Fri 8:47:25 16:41:35
## 5     1 05 Sat 8:47:07 16:42:47
## 6     1 06 Sun 8:46:46 16:44:03
daylight$DayOfWeek  = sapply(daylight$Date, function(x) {substr(x, start=4, stop=6) })
daylight$DayOfMonth = sapply(daylight$Date, function(x) {as.numeric(substr(x, start=1, stop=2)) })

daylight$Date = apply(daylight[,c("Month","DayOfMonth")],1, function(x){paste0("2019-",x[1],"-",x[2])})
daylight$Date = ymd(daylight$Date)

# to be continued

make a training set

features = c("id_","specificLane_","trafficFlow_anyVehicle","trafficSpeed_anyVehicle","fractionFlowLongVehicles")
row.names(lus) = paste0("m",1:nrow(lus))
train_set = lus[,features]
for(i in c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle","fractionFlowLongVehicles")){
  train_set[,i]=as.numeric(train_set[,i])
}
train_set$trafficFlow_anyVehicle = train_set$trafficFlow_anyVehicle / 60

# train_set = subset(train_set, !is.nan(fractionFlowLongVehicles) & trafficFlow_anyVehicle>=0)
hist(train_set$trafficFlow_anyVehicle)

# lane counts start at the leftmost lane, this doesn't make any sense, so reverse them:
train_set$lane = sapply(train_set$specificLane_, function(x){as.numeric(substr(x,5,5)) })
train_set$RightOrientedLane = NA
train_set$NumberOfLanes = NA
for(id in unique(lus$id_)){
  train_set[train_set$id_==id,]$NumberOfLanes = max(train_set[train_set$id_==id,]$lane, na.rm = T)
  train_set[train_set$id_==id,]$RightOrientedLane = max(train_set[train_set$id_==id,]$lane, na.rm = T) - train_set[train_set$id_==id,]$lane + 1
}

Nearest Neighbours approach

# install.packages("OutlierDetection")
library(OutlierDetection)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
# show bi-variate data
set.seed(12345)
id = unique(train_set$id_)[sample(1:239,size=1)]
test = nn(train_set[train_set$id_ == id,c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle")], k=500, cutoff=0.995)
plot(test$`Scatter plot`)

Run on the whole dataset, split it into folds to make it computationally faster (otherwise it needs to make a matrix of 492590 by 492590)

First, use purely the speed and flow:

folds = caret::createFolds(train_set$id_, k=250)
all.equal(length(unlist(folds)), nrow(train_set))
## [1] TRUE
features_to_use = c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle")

train_set$outlier = F
for(fold in folds){
  tryCatch({
    outliers = nnk(train_set[fold,features_to_use], k=length(folds), cutoff=0.995)
    if(length(outliers$`Location of Outlier`)==0){next()}
    train_set[fold,][outliers$`Location of Outlier`,"outlier"] = TRUE
  }, error=function(e){message("ERROR :",conditionMessage(e), "\n")})
}

table(train_set$outlier)
## 
##  FALSE   TRUE 
## 491628    962
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GeneralFunctions':
## 
##     last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
summary = train_set %>% group_by(id_) %>% summarize(sumOutliers = sum(outlier, na.rm=T)) %>% arrange(-sumOutliers) %>% as.data.frame()

head(summary)
##                    id_ sumOutliers
## 1       GEO0K_TIID_976         112
## 2     GEO0C_Z_RWSTI220          93
## 3 GEO2A__R_RWSTI363409          51
## 4    GEO0C_Z_RWSTI2010          50
## 5 GEO2A__R_RWSTI363412          44
## 6  GEO0B_R_RWSTI362568          41

Plot of the loops with the highest number of outliers:

par(mfrow=c(3,3))
for(id in summary$id_[1:27]){
  tmp = subset(train_set, id_ == id)
  plot(x = tmp$trafficFlow_anyVehicle, y = tmp$trafficSpeed_anyVehicle, 
       col=ifelse(tmp$outlier,"red",transparent("black",50)),
       pch = ifelse(tmp$outlier,20,1),
       ylab="speed (km/h)", xlab="cars / minute", main=id)
}

Now, also use some extra predictors:

features_to_use = c("RightOrientedLane", "NumberOfLanes","trafficFlow_anyVehicle",
                    "trafficSpeed_anyVehicle","fractionFlowLongVehicles")

train_set$outlier = F
for(fold in folds){
  tryCatch({
    outliers = nnk(train_set[fold,features_to_use], k=length(folds), cutoff=0.995)
    if(length(outliers$`Location of Outlier`)==0){next()}
    train_set[fold,][outliers$`Location of Outlier`,"outlier"] = TRUE
  }, error=function(e){message("ERROR :",conditionMessage(e), "\n")})
}

table(train_set$outlier)
## 
##  FALSE   TRUE 
## 491616    974
library(dplyr)
summary = train_set %>% group_by(id_) %>% summarize(sumOutliers = sum(outlier, na.rm=T)) %>% arrange(-sumOutliers) %>% as.data.frame()

head(summary)
##                    id_ sumOutliers
## 1       GEO0K_TIID_976         105
## 2     GEO0C_Z_RWSTI220          93
## 3 GEO2A__R_RWSTI363409          50
## 4    GEO0C_Z_RWSTI2010          49
## 5 GEO2A__R_RWSTI363412          46
## 6  GEO0B_R_RWSTI362568          41

Plot of the loops with the highest number of outliers:

par(mfrow=c(3,3))
for(id in summary$id_[1:27]){
  tmp = subset(train_set, id_ == id)
  plot(x = tmp$trafficFlow_anyVehicle, y = tmp$trafficSpeed_anyVehicle, 
       col=ifelse(tmp$outlier,"red",transparent("black",50)),
       pch = ifelse(tmp$outlier,20,1),
       ylab="speed (km/h)", xlab="cars / minute", main=id)
}

Looks interesting already, but now high speed is very influential due to its numeric range. I should probably do some scaling:

train_scaled = train_set
for(i in c("trafficFlow_anyVehicle","trafficSpeed_anyVehicle",
           "fractionFlowLongVehicles", "lane", "RightOrientedLane","NumberOfLanes")){
  train_scaled[,paste0(i,"_scaled")] = norm_01(train_scaled[,i])
}

for(fold in folds){
  tryCatch({
    outliers = nnk(train_scaled[fold,paste0(features_to_use,"_scaled")], k=length(folds), cutoff=0.995)
    if(length(outliers$`Location of Outlier`)==0){next()}
    train_scaled[fold,][outliers$`Location of Outlier`,"outlier"] = TRUE
  }, error=function(e){message("ERROR :",conditionMessage(e), "\n")})
}

table(train_scaled$outlier)
## 
##  FALSE   TRUE 
## 490860   1730
library(dplyr)
summary = train_scaled %>% group_by(id_) %>% summarize(sumOutliers = sum(outlier, na.rm=T)) %>% arrange(-sumOutliers) %>% as.data.frame()

head(summary)
##                   id_ sumOutliers
## 1      GEO0K_TIID_976         107
## 2    GEO0C_Z_RWSTI220          95
## 3 GEO0C_Z_RWSTI356716          71
## 4      PNH02_TI1007-R          62
## 5   GEO2A__R_RWSTI602          61
## 6 GEO0B_R_RWSTI363369          54

Plot of the loops with the highest number of outliers:

par(mfrow=c(3,3))
for(id in summary$id_[1:18]){
  tmp = subset(train_scaled, id_ == id)
  plot(x = tmp$trafficFlow_anyVehicle, y = tmp$trafficSpeed_anyVehicle, 
       col=ifelse(tmp$outlier,"red",transparent("black",50)),
       pch = ifelse(tmp$outlier,20,1),
       ylab="speed (km/h)", xlab="cars / minute", main=id)
}