library(data.table); library(magrittr); library(dplyr); library(ggplot2); library(lubridate)
# Load the hourly weather observations and keep only three columns, selected
# by position (1, 6 and 25) -- presumably STATION, DATE and HOURLYPrecip, since
# those are the names used below; TODO confirm against the CSV header.
wx <- fread("981266.csv")
wx <- wx[, c(1, 6, 25)]

# Coerce precipitation to numeric. Entries that do not parse as numbers become
# NA, and every NA is then counted as zero rainfall.
wx$HOURLYPrecip <- as.numeric(wx$HOURLYPrecip)
wx$HOURLYPrecip[is.na(wx$HOURLYPrecip)] <- 0

# Parse the observation timestamps into date-times.
wx$DATE <- as_datetime(wx$DATE)

# One table per station, each with a running precipitation total in SUM.
# The running total follows the row order of the input file.
GUAM <- filter(wx, STATION == "WBAN:41415")
GUAM$SUM <- cumsum(GUAM$HOURLYPrecip)

PHX <- filter(wx, STATION == "WBAN:23183")
PHX$SUM <- cumsum(PHX$HOURLYPrecip)

MRY <- filter(wx, STATION == "WBAN:23259")
MRY$SUM <- cumsum(MRY$HOURLYPrecip)
Plot of the cumulative rainfall for the three stations (Guam, Phoenix, and Monterey):
# Cumulative rainfall over time, one line per station table:
# PHX in red, GUAM in green, MRY in blue.
# Every layer shares the same x/y mapping, so it is declared once here and
# inherited by each geom_line().
ggplot(mapping = aes(x = DATE, y = SUM)) +
  geom_line(data = PHX, color = "red") +
  geom_line(data = GUAM, color = "green") +
  geom_line(data = MRY, color = "blue") +
  labs(x = "Date", y = "Cumulative Rainfall")
Use approximation functions to compute a distance metric:
# code modified from https://stackoverflow.com/questions/24742677/how-to-measure-area-between-2-distribution-curves-in-r-ggplot2

# Re-express a time vector as the numeric offset from its own earliest value.
# Assumes `when` is the date-time column produced by as_datetime() earlier in
# the script, in which case as.numeric() yields seconds -- TODO confirm.
seconds_since_start <- function(when) {
  secs <- as.numeric(when)
  secs - min(secs)
}

# Distance between two interpolated curves: the square root of the integral of
# their squared difference over the x-range both curves cover, with the
# integral approximated by the trapezoid rule on `n` evenly spaced points.
#
# f, g      Functions of x (here built with approxfun()).
# x_f, x_g  The x values each function was built from; they define the
#           overlapping range that is integrated over.
# n         Number of grid points (default 5000).
#
# Returns a single numeric value.
curve_distance <- function(f, g, x_f, x_g, n = 5000) {
  overlap <- c(max(min(x_f), min(x_g)), min(max(x_f), max(x_g)))
  grid <- seq(min(overlap), max(overlap), length.out = n)
  sq_gap <- (f(grid) - g(grid))^2
  # Trapezoid rule: mean of adjacent heights times the width of each interval.
  sqrt(sum((sq_gap[-1] + sq_gap[-length(sq_gap)]) / 2 * diff(grid)))
}

# Put each station on a numeric time axis that starts at 0.
# NOTE(review): every station is shifted by its OWN first timestamp, so the
# curves are compared by elapsed time rather than by calendar time -- confirm
# this is intended if the stations' records start at different moments.
GUAM$DATE <- seconds_since_start(GUAM$DATE)
PHX$DATE <- seconds_since_start(PHX$DATE)
MRY$DATE <- seconds_since_start(MRY$DATE)

# Linear interpolators of cumulative rainfall against elapsed time.
aguam <- approxfun(GUAM$DATE, GUAM$SUM)
aphx <- approxfun(PHX$DATE, PHX$SUM)
amry <- approxfun(MRY$DATE, MRY$SUM)

# Distance between the Guam and Phoenix curves.
area <- curve_distance(aguam, aphx, GUAM$DATE, PHX$DATE)
# area %>% print()

# Distance between the Monterey and Phoenix curves.
area2 <- curve_distance(amry, aphx, MRY$DATE, PHX$DATE)
# area2 %>% print()
The distance between Guam and Phoenix is $1.6254458 \times 10^{4}$. The distance between Monterey and Phoenix is 1811.1176442.