Weather webscrape

Taking the temperature

dataLog

#   call in required packages
library(tidyverse)   # str_extract
library(qdapRegex)   # ex_bracket
library(rvest)       # read_html
library(anytime)     # anytime
library(ggplot2)
library(dplyr)
library(hrbrthemes)
library(viridis)

Data collected from inside flat
I collected this data using an Elitech RC-5 data logger.
The data logger is left in the lounge area by a north-west facing window.

# ---- Inside temperatures: data logger export ----
#
# The cleaned logger CSV is read straight from its absolute path, so the
# working directory is left alone. Earlier approach, kept for reference:
#   cDat <- "C:/Users/roman/Documents/MawsonWeather/data/CleanedData"
#   setwd(cDat)
dfl <- read.csv(
  "C:/Users/roman/Documents/MawsonWeather/data/CleanedData/datalogger2023-12-18.csv"
)

# Parse the day/month/year, 12-hour (AM/PM) timestamps into POSIXct and snap
# each one to the nearest half hour in a single step.
dfl$Datetime <- round_date(
  as.POSIXct(dfl$Datetime, format = "%d/%m/%Y %I:%M:%S %p"),
  unit = "30 mins"
)

# Collapse to one row per half hour: the mean of the readings that fell in it.
dfl <- aggregate(Temperature_C ~ Datetime, data = dfl, FUN = mean)

Take a look at the data
We have two fields:
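[Datetime] - The logger records every 15 minutes; above, readings are rounded to the nearest half hour and averaged, so there is one row per 30 minutes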
[Temperature_C] - The recorded temperature in Celsius

head(dfl)

##              Datetime Temperature_C
## 1 2023-04-15 16:30:00         24.20
## 2 2023-04-15 17:00:00         20.05
## 3 2023-04-15 17:30:00         19.20
## 4 2023-04-15 18:00:00         18.80
## 5 2023-04-15 18:30:00         18.50
## 6 2023-04-15 19:00:00         18.15

Data collected outside the flat
To get hold of hourly temperature records for Canberra I had to find a website that keeps a record of the hourly temp (not BOM). The website I am using is www.timeanddate.com .

Web scrape data from each month
Now to scrape and clean this data for every month of this year to date. To do this, I step through each month's page in turn, scraping and formatting the data.

# ---- Outside temperatures: scrape timeanddate.com month by month ----

# Parameters: every month of the current year, up to the current month.
max_mth <- month(Sys.Date())
year <- year(Sys.Date())

# Scrape one month's history page and return its observations.
#
# month, year : the page to request (pasted straight into the URL).
# tz          : time zone of the weather station (Canberra observes
#               Australia/Sydney time).
#
# Returns a data frame with columns Datetime (POSIXct) and Temperature_C
# (numeric), one row per observation found on the page.
#
# Time handling: the sample output in this document shows the epoch value
# 16725321E5 (ms) corresponding to 00:15 local time on 2023-01-01, i.e. the
# site stores the station's wall-clock time as if it were UTC. The epoch is
# therefore read as UTC and then re-labelled with the station's zone via
# force_tz(), which applies the correct standard/daylight offset for every
# date. The previous fixed "10 h after 2023-04-03, else 11 h" rule was an
# hour out once daylight saving resumed in October, was wrong for any other
# year, and only worked on a machine set to Canberra time.
scrape_month <- function(month, year, tz = "Australia/Sydney") {
  link <- paste0(
    "https://www.timeanddate.com/weather/australia/canberra/historic?month=",
    month, "&year=", year
  )
  text <- html_text(read_html(link))

  # Keep the `var data=... detail` assignment, then peel off the square and
  # curly brackets to get one string per observation.
  top_tail <- str_extract(str_extract(text, 'var data=.*'), '.*detail')
  ex.sq <- ex_bracket(top_tail, pattern = "square")
  obs <- ex_bracket(ex.sq, pattern = "curly")[[1]]

  # Epoch milliseconds (written like 16725321E5) -> wall-clock time.
  date_rw <- str_extract(obs, "\\d{5,9}E\\d")
  wall_clock <- as.POSIXct(
    as.numeric(date_rw) / 1000,
    origin = "1970-01-01", tz = "UTC"
  )

  # Value of the "temp" key, with the leading colon removed.
  temp_rw <- str_extract(str_extract(obs, "temp.*"), ":.*")

  data.frame(
    # The extra second is retained so results match the original output; it
    # does not affect the half-hour rounding below.
    Datetime = force_tz(wall_clock, tzone = tz) + 1,
    Temperature_C = as.numeric(sub(":", "", temp_rw))
  )
}

# Fetch each month once and bind the pieces in a single step instead of
# growing a data frame inside a loop.
df_total <- do.call(
  rbind,
  lapply(seq_len(max_mth), scrape_month, year = year)
)

# Drop incomplete rows, snap to the nearest half hour and average within each
# half hour so the web data lines up with the data logger series.
dfw <- na.omit(df_total)
dfw$Datetime <- round_date(dfw$Datetime, "30 mins")
dfw <- aggregate(Temperature_C ~ Datetime, data = dfw, FUN = mean)

data check head(full web scrape)

head(dfw)

##              Datetime Temperature_C
## 1 2023-01-01 00:30:00          17.5
## 2 2023-01-01 01:00:00          17.0
## 3 2023-01-01 01:30:00          18.0
## 4 2023-01-01 02:00:00          17.0
## 5 2023-01-01 02:30:00          18.0
## 6 2023-01-01 03:00:00          17.0

data check tail(full web scrape)

tail(dfw)

##                  Datetime Temperature_C
## 14831 2023-12-17 09:00:00          17.0
## 14832 2023-12-17 09:30:00          17.5
## 14833 2023-12-17 10:00:00          18.0
## 14834 2023-12-17 10:30:00          19.0
## 14835 2023-12-17 11:00:00          20.0
## 14836 2023-12-17 11:30:00          21.0

Match web data to data logger dates
subset web data to match data logger

# Keep only the web observations strictly inside the period covered by the
# data logger (both end points excluded).
logger_span <- range(dfl$Datetime)
in_logger_span <- dfw$Datetime > logger_span[1] & dfw$Datetime < logger_span[2]
dfw_s <- dfw[which(in_logger_span), ]

# Pair outside (web) and inside (logger) readings on their shared half-hour
# timestamp; the suffixes name the two temperature columns directly.
dfa <- merge(
  dfw_s, dfl,
  by = "Datetime",
  suffixes = c(".Outside", ".Inside")
)

# Absolute gap between inside and outside temperature at each timestamp.
dfa$Difference <- abs(dfa$Temperature_C.Inside - dfa$Temperature_C.Outside)

EDA all time covered

outside

summary(dfw_s$Temperature_C)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -6.00    6.00   10.00   10.95   15.00   35.00

inside

summary(dfl$Temperature_C)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.90   12.50   15.55   16.47   19.95   38.20

# Single-month slices of the merged data: June (dfhm) and May (dfaway).
# They feed the "home" / "away" box plots that are currently commented out.
obs_month <- month(dfa$Datetime)
dfhm <- dfa[which(obs_month == 6), ]
dfaway <- dfa[which(obs_month == 5), ]

Box plots

EDA monthly

# Outside temperature, one box per calendar month.
boxplot(
  Temperature_C ~ month(Datetime),
  data = dfw_s,
  col = "skyblue",
  main = "Outside temperature by month",
  xlab = "Month",
  ylab = "Temperature C",
  ylim = c(-10, 40),
  horizontal = FALSE
)

# Inside temperature, one box per calendar month (same y range as above so
# the two plots can be compared directly).
boxplot(
  Temperature_C ~ month(Datetime),
  data = dfl,
  col = "red",
  main = "Inside temperature by month",
  xlab = "Month",
  ylab = "Temperature C",
  ylim = c(-10, 40),
  horizontal = FALSE
)

# Absolute inside/outside difference, one box per hour of the day.
boxplot(
  Difference ~ hour(Datetime),
  data = dfa,
  col = "purple",
  main = "Difference in temperature (inside vs outside) by hour of day",
  xlab = "Hour",
  ylab = "Temperature C",
  ylim = c(0, max(dfa$Difference)),
  horizontal = FALSE
)

# boxplot(dfhm$Difference ~ hour(dfhm$Datetime)
#         , col = "pink"
#         , main = "Difference in temperature (inside vs outside) by hour of day - home"
#         , xlab = "Hour"
#         , ylab = "Temperature C"
#         , ylim = c(0, max(dfhm$Difference) )
#         , horizontal=FALSE)
# 
# boxplot(dfaway$Difference ~ hour(dfaway$Datetime)
#         , col = "green"
#         , main = "Difference in temperature (inside vs outside) by hour of day - away"
#         , xlab = "Hour"
#         , ylab = "Temperature C"
#         , ylim = c(0, max(dfaway$Difference) )
#         , horizontal=FALSE)

Violin plots

# Draw a violin plot with an overlaid box plot of Temperature_C per month.
#
# data  : data frame with a factor column `mth` (month number) and a numeric
#         column `Temperature_C`.
# title : plot title.
#
# Returns the ggplot object; called at top level it is printed automatically.
plot_monthly_violin <- function(data, title) {
  # Number of observations per month, shown under each axis label.
  sample_size <- data %>%
    group_by(mth) %>%
    summarize(num = n())

  data %>%
    left_join(sample_size, by = "mth") %>%
    mutate(
      myaxis = paste0(mth, "\n", "n=", num),
      # Order the labels by month. Left as plain text they sort
      # alphabetically, which puts months 10, 11 and 12 ahead of month 4.
      myaxis = factor(myaxis, levels = unique(myaxis[order(mth)]))
    ) %>%
    ggplot(aes(x = myaxis, y = Temperature_C, fill = mth)) +
    geom_violin(width = 1.4) +
    geom_boxplot(width = 0.1, color = "grey", alpha = 0.2) +
    scale_fill_viridis(discrete = TRUE) +
    theme_ipsum() +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 11)
    ) +
    ggtitle(title) +
    xlab("")
}

# Outside temperatures (web data restricted to the logger period).
dfw_s$mth <- as.factor(month(dfw_s$Datetime))
plot_monthly_violin(dfw_s, "Outside by month- using violin and box plot")

# Inside temperatures (data logger).
dfl$mth <- as.factor(month(dfl$Datetime))
plot_monthly_violin(dfl, "Inside by month- using violin and box plot")

Line plots

# ---- Line plot: inside and outside temperature over the logging period ----

# Plot parameters: line colours (c1, c2), legend labels (d1, d2) and titles.
# Note: `xlab` and `ylab` below are plain strings that share their names with
# the ggplot2 functions; R still finds the functions when they are called.
c1 <- "skyblue"
c2 <- "red"
d1 <- "Outside"
d2 <- "Inside"
Title <- "Inside Vs. Outside"
xlab <- "Date"
ylab <- "Temperature"  # spelling corrected (was "Temperture")

p <- ggplot() +
  # Mapping colour to a constant label gives each series its legend entry.
  geom_line(data = dfw_s, aes(x = Datetime, y = Temperature_C, colour = d1)) +
  geom_line(data = dfl, aes(x = Datetime, y = Temperature_C, colour = d2)) +
  ggtitle(Title) +
  xlab(xlab) +
  ylab(ylab) +
  scale_colour_manual(
    "",
    breaks = c(d1, d2),
    # Named values tie each colour to its series explicitly rather than
    # relying on position.
    values = setNames(c(c1, c2), c(d1, d2))
  ) +
  theme(legend.position = "top")
p

# ---- Line plot: inside temperature only ----

# Plot parameters. c1, c2 and d1 are only needed if the commented-out
# outside series and manual colour scale below are re-enabled.
c1 <- "skyblue"
c2 <- "red"
d1 <- "Outside"
d2 <- "Inside"
Title <- "Inside Vs. Outside"
xlab <- "Date"
ylab <- "Temperature"  # spelling corrected (was "Temperture")

p <- ggplot() +
  # geom_line(data = dfw_s, aes(x = Datetime, y = Temperature_C, colour = d1)) +
  geom_line(data = dfl, aes(x = Datetime, y = Temperature_C, colour = d2)) +
  ggtitle(Title) +
  xlab(xlab) +
  ylab(ylab) +
  # scale_colour_manual("",
  #                     breaks = c(d1, d2),
  #                     values = c(c1, c2)) +
  theme(legend.position = "top")
p

Webscrape breakdown

Hourly weather data
To get hold of hourly temperature records for Canberra I had to find a website that keeps a record of the hourly temp (not BOM). The website I am using is www.timeanddate.com . Below I am going through each step of scraping this data and cleaning it for my purposes here

# Worked example: the January 2023 history page for Canberra.
link <- paste0(
  "https://www.timeanddate.com/weather/australia/canberra/historic",
  "?month=1&year=2023"
)

# Download the page and parse its HTML.
page <- read_html(link)

data check (raw scrape)

From the output we can see this is xml format data

  str(page)

## List of 2
##  $ node:<externalptr> 
##  $ doc :<externalptr> 
##  - attr(*, "class")= chr [1:2] "xml_document" "xml_node"

Convert html to text string

  text <- html_text(page)

data check (raw - text)

Now we see the data as one long character string

  str(text)

##  chr "Weather in January 2023 in Canberra, Australian Capital Territory, Australia\n@font-face{font-family:iconfont;s"| __truncated__

Top and tail

From looking at the page source in the browser, I can see that the data I want is stored in a variable called ‘data’ and that it ends with the word ‘detail’. So I will use these two markers to extract the bit I want.

  # First keep everything from the `var data=` assignment onwards, then trim
  # that so it ends at the word "detail".
  data_onwards <- str_extract(text, 'var data=.*')
  top_tail <- str_extract(data_onwards, '.*detail')

data check (top and tail)

str(top_tail)

##  chr "var data={\"copyright\":\"Contents are strictly for use by timeanddate.com\",\"units\":{\"temp\":\"°C\",\"prec\"| __truncated__

str_sub(top_tail,-20,-1)

## [1] ",\"temp\":18}],\"detail"

Grab out text contained in square brackets

From playing with the data I’ve found the data I want is contained in square brackets

  # Pull out the text that sits between square brackets.
  ex.sq <- top_tail %>%
    ex_bracket(pattern = "square")

data check (square brackets)

Using a function that employs regex to find things in brackets, we keep only what is contained in square brackets in our text string above.

  str(ex.sq)

## List of 1
##  $ : chr "{\"date\":16725321E5,\"temp\":18},{\"date\":1672533E6,\"temp\":17},{\"date\":16725348E5,\"temp\":17},{\"date\":"| __truncated__
##  - attr(*, "class")= chr [1:2] "extracted" "list"

Grab out text contained in curly brackets

Using a function that employs regex to find things in brackets, we keep only what is contained in curly brackets in our text string above.

  # Pull out the text between each pair of curly brackets, then take the
  # first (only) list element: a character vector with one string per
  # observation.
  curly_matches <- ex_bracket(ex.sq, pattern = "curly")
  ex.cur <- curly_matches[[1]]

data check (curly brackets)

Not a huge change - but now we are bracket free

  str(ex.cur)

##  chr [1:1351] "\"date\":16725321E5,\"temp\":18" ...

convert to a dataframe

Now if we convert this character string to a dataframe we get an observation (date and temp) per row

  # One row per observation string, in a single column named `ex.cur`.
  df <- data.frame(ex.cur = ex.cur)

data check (dataframe)

  head(df)

##                        ex.cur
## 1 "date":16725321E5,"temp":18
## 2  "date":1672533E6,"temp":17
## 3 "date":16725348E5,"temp":17
## 4 "date":16725366E5,"temp":18
## 5 "date":16725384E5,"temp":17
## 6 "date":16725402E5,"temp":18

Extract datetime from obs

keep only string parts that have a datetime in UNIX epoch format

  # The timestamp is written in exponent form (e.g. 16725321E5): 5-9 digits,
  # an "E", then one digit. Keep just that piece of each observation.
  df[["date_rw"]] <- str_extract(df[["ex.cur"]], pattern = "\\d{5,9}E\\d")

data check (Extract datetime)

  head(df)

##                        ex.cur    date_rw
## 1 "date":16725321E5,"temp":18 16725321E5
## 2  "date":1672533E6,"temp":17  1672533E6
## 3 "date":16725348E5,"temp":17 16725348E5
## 4 "date":16725366E5,"temp":18 16725366E5
## 5 "date":16725384E5,"temp":17 16725384E5
## 6 "date":16725402E5,"temp":18 16725402E5

UNIX epoch to EST

Convert the datetime from UNIX epoch to a standard datetime, correcting for the offset between UTC and Canberra local time (EST: 10 hours, or 11 hours during daylight saving), and lastly add 1 second to keep from losing the time part at midnight and noon.

  # The sample values show the site storing the station's wall-clock time as
  # if it were UTC (16725321E5 ms is 00:15 on 2023-01-01). Read the epoch as
  # UTC, then re-label it with Canberra's time zone so the standard/daylight
  # offset is chosen correctly for every date. The previous fixed cutoff
  # ('2023-04-03') was an hour out once daylight saving resumed in October
  # and relied on the machine's own time zone.
  wall_clock <- as.POSIXct(
    as.numeric(df$date_rw) / 1000,
    origin = "1970-01-01", tz = "UTC"
  )
  # The extra second is kept so the values match the output shown below.
  df$Datetime <- force_tz(wall_clock, tzone = "Australia/Sydney") + 1

data check (Datetime)

  head(df)

##                        ex.cur    date_rw            Datetime
## 1 "date":16725321E5,"temp":18 16725321E5 2023-01-01 00:15:01
## 2  "date":1672533E6,"temp":17  1672533E6 2023-01-01 00:30:01
## 3 "date":16725348E5,"temp":17 16725348E5 2023-01-01 01:00:01
## 4 "date":16725366E5,"temp":18 16725366E5 2023-01-01 01:30:01
## 5 "date":16725384E5,"temp":17 16725384E5 2023-01-01 02:00:01
## 6 "date":16725402E5,"temp":18 16725402E5 2023-01-01 02:30:01

Extract temperature from obs

keep only string parts that have a temperature (json key : temp)

  # Keep everything from the "temp" key onwards, then everything from the
  # colon that follows it (e.g. ":18").
  from_temp_key <- str_extract(df$ex.cur, "temp.*")
  df$temp_rw <- str_extract(from_temp_key, ":.*")

data check (Extract temperature)

  head(df)

##                        ex.cur    date_rw            Datetime temp_rw
## 1 "date":16725321E5,"temp":18 16725321E5 2023-01-01 00:15:01     :18
## 2  "date":1672533E6,"temp":17  1672533E6 2023-01-01 00:30:01     :17
## 3 "date":16725348E5,"temp":17 16725348E5 2023-01-01 01:00:01     :17
## 4 "date":16725366E5,"temp":18 16725366E5 2023-01-01 01:30:01     :18
## 5 "date":16725384E5,"temp":17 16725384E5 2023-01-01 02:00:01     :17
## 6 "date":16725402E5,"temp":18 16725402E5 2023-01-01 02:30:01     :18

Tidy up temperature

Create new field for temperature , remove colon and convert to numeric

  # Strip the first colon and convert what is left to a number.
  temp_digits <- sub(':', "", df$temp_rw)
  df$Temperature_C <- as.numeric(temp_digits)

data check (Temperature_C)

  head(df)

##                        ex.cur    date_rw            Datetime temp_rw
## 1 "date":16725321E5,"temp":18 16725321E5 2023-01-01 00:15:01     :18
## 2  "date":1672533E6,"temp":17  1672533E6 2023-01-01 00:30:01     :17
## 3 "date":16725348E5,"temp":17 16725348E5 2023-01-01 01:00:01     :17
## 4 "date":16725366E5,"temp":18 16725366E5 2023-01-01 01:30:01     :18
## 5 "date":16725384E5,"temp":17 16725384E5 2023-01-01 02:00:01     :17
## 6 "date":16725402E5,"temp":18 16725402E5 2023-01-01 02:30:01     :18
##   Temperature_C
## 1            18
## 2            17
## 3            17
## 4            18
## 5            17
## 6            18