# call in required packages
library(tidyverse) # str_extract
library(qdapRegex) # ex_bracket
library(rvest) # read_html
library(anytime) # anytime
library(ggplot2)
library(dplyr)
library(hrbrthemes)
library(viridis)
Data collected from inside flat
I collected this data with an Elitech RC-5 data logger.
The data logger sits in the lounge area next to a north-west facing
window.
# Load the cleaned data logger export (the inside temperature readings).
# The CSV is read by its absolute path, so the working directory is untouched.
dfl <- read.csv(
  "C:/Users/roman/Documents/MawsonWeather/data/CleanedData/datalogger2023-12-18.csv"
)

# Parse the 12-hour "dd/mm/yyyy hh:mm:ss AM/PM" text stamps into POSIXct and
# snap each reading to its nearest half hour in a single step.
dfl$Datetime <- round_date(
  as.POSIXct(dfl$Datetime, format = "%d/%m/%Y %I:%M:%S %p"),
  "30 mins"
)

# Collapse to one row per half hour holding the mean of the readings in it.
dfl <- aggregate(Temperature_C ~ Datetime, data = dfl, FUN = mean)
Take a look at the data
We have two fields:
[Datetime] - The logger records every 15 minutes; the readings have been averaged into 30-minute periods above
[Temperature_C] - The recorded temperature in Celsius
head(dfl)
## Datetime Temperature_C
## 1 2023-04-15 16:30:00 24.20
## 2 2023-04-15 17:00:00 20.05
## 3 2023-04-15 17:30:00 19.20
## 4 2023-04-15 18:00:00 18.80
## 5 2023-04-15 18:30:00 18.50
## 6 2023-04-15 19:00:00 18.15
Data collected outside the flat
To get hold of hourly temperature records for Canberra I had to find a
website that keeps a record of the hourly temp (not BOM). The website I
am using is www.timeanddate.com .
Web scrape data from each month
Now to scrape this data and clean it for every month of this year to
date. To do this I’ve built a simple loop that opens each page, then
scrapes and formats the data.
# Scrape this year's Canberra temperature history from timeanddate.com, one
# page per month from January up to the current month, then average the
# readings into half-hour periods (dfw).
#
# Set parameters
max_mth <- month(Sys.Date())
year <- year(Sys.Date())
#
# One slot per month. The monthly frames are bound together once after the
# loop instead of growing df_total with rbind() on every pass.
monthly_dat <- vector("list", max_mth)
#
# Run loop to scrape and format one page per month
for (i in seq_len(max_mth)) {
  month <- i
  link <- paste0(
    "https://www.timeanddate.com/weather/australia/canberra/historic?month=",
    month, "&year=", year
  )
  page <- read_html(link)
  text <- html_text(page)
  # keep the span that starts at 'var data=' and ends at 'detail'
  top_tail <- str_extract(str_extract(text, "var data=.*"), ".*detail")
  # pull out the square-bracket content, then each curly-bracket piece
  ex.sq <- ex_bracket(top_tail, pattern = "square")
  ex.cur <- ex_bracket(ex.sq, pattern = "curly")
  # Name the column explicitly: as.data.frame(ex.cur[[1]]) labelled it
  # "ex.cur[[1]]", so df$ex.cur only resolved through `$` partial matching.
  df <- data.frame(ex.cur = ex.cur[[1]])
  # epoch stamp in milliseconds, written in scientific notation
  df$date_rw <- str_extract(df$ex.cur, "\\d{5,9}E\\d")
  # NOTE(review): the stamps appear to hold Canberra wall-clock time written
  # as if it were UTC (16725321E5 is 2023-01-01 00:15 UTC, the first reading
  # of the January page) - confirm against the site.
  # Read the stamp as UTC and re-label that same clock time in the session
  # time zone, which is how the logger stamps are parsed. This replaces the
  # fixed "> 2023-04-03 ? 10h : 11h" offset, which put every reading taken
  # after daylight saving resumed in October one hour late and was tied to
  # the year 2023 and to the machine's time zone.
  wall_clock <- as.POSIXct(
    as.numeric(df$date_rw) / 1000,
    origin = "1970-01-01", tz = "UTC"
  )
  # + 1 second is kept from the original so the later half-hour rounding of
  # :15 and :45 readings is unchanged.
  df$Datetime <- as.POSIXct(
    format(wall_clock, "%Y-%m-%d %H:%M:%S", tz = "UTC"),
    format = "%Y-%m-%d %H:%M:%S"
  ) + 1
  # text after the "temp" key, from its colon onwards, then strip the colon
  df$temp_rw <- str_extract(str_extract(df$ex.cur, "temp.*"), ":.*")
  df$Temperature_C <- as.numeric(sub(":", "", df$temp_rw))
  temperature_dat <- df[, c("Datetime", "Temperature_C")]
  #
  # store this month's readings
  monthly_dat[[i]] <- temperature_dat
}
#
# stack all months into one data frame
df_total <- bind_rows(monthly_dat)
#
# read in web dat (drop NAs)
dfw <- na.omit(df_total)
# round to the nearest half hour and average within each period
dfw$Datetime <- round_date(dfw$Datetime, "30 mins")
dfw <- aggregate(Temperature_C ~ Datetime, dfw, mean)
data check head(full web scrape)
head(dfw)
## Datetime Temperature_C
## 1 2023-01-01 00:30:00 17.5
## 2 2023-01-01 01:00:00 17.0
## 3 2023-01-01 01:30:00 18.0
## 4 2023-01-01 02:00:00 17.0
## 5 2023-01-01 02:30:00 18.0
## 6 2023-01-01 03:00:00 17.0
data check tail(full web scrape)
tail(dfw)
## Datetime Temperature_C
## 14831 2023-12-17 09:00:00 17.0
## 14832 2023-12-17 09:30:00 17.5
## 14833 2023-12-17 10:00:00 18.0
## 14834 2023-12-17 10:30:00 19.0
## 14835 2023-12-17 11:00:00 20.0
## 14836 2023-12-17 11:30:00 21.0
Match web data to data logger dates
subset web data to match data logger
# Web readings that fall strictly between the logger's first and last stamps.
dfw_s <- subset(
  dfw,
  Datetime > min(dfl$Datetime) & Datetime < max(dfl$Datetime)
)

# Pair outside (web) and inside (logger) readings that share a timestamp; the
# suffixes label the two Temperature_C columns by where they were measured.
dfa <- merge(
  dfw_s, dfl,
  by = "Datetime",
  suffixes = c(".Outside", ".Inside")
)

# Absolute gap between the inside and outside temperature.
dfa$Difference <- with(dfa, abs(Temperature_C.Inside - Temperature_C.Outside))
EDA all time covered
outside
summary(dfw_s$Temperature_C)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.00 6.00 10.00 10.95 15.00 35.00
inside
summary(dfl$Temperature_C)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.90 12.50 15.55 16.47 19.95 38.20
# June and May slices of the merged readings; these feed the "home" and
# "away" box plots that are currently commented out.
dfhm <- dfa[which(month(dfa$Datetime) == 6), ]
dfaway <- dfa[which(month(dfa$Datetime) == 5), ]
EDA monthly
# Outside readings grouped by calendar month.
boxplot(
  Temperature_C ~ month(Datetime),
  data = dfw_s,
  col = "skyblue",
  main = "Outside temperature by month",
  xlab = "Month",
  ylab = "Temperature C",
  ylim = c(-10, 40),
  horizontal = FALSE
)

# Inside readings grouped by calendar month, on the same y range.
boxplot(
  Temperature_C ~ month(Datetime),
  data = dfl,
  col = "red",
  main = "Inside temperature by month",
  xlab = "Month",
  ylab = "Temperature C",
  ylim = c(-10, 40),
  horizontal = FALSE
)

# Inside/outside gap grouped by hour of day; the y axis runs from 0 to the
# largest gap observed.
boxplot(
  Difference ~ hour(Datetime),
  data = dfa,
  col = "purple",
  main = "Difference in temperature (inside vs outside) by hour of day",
  xlab = "Hour",
  ylab = "Temperature C",
  ylim = c(0, max(dfa$Difference)),
  horizontal = FALSE
)

# Disabled: the same hourly view restricted to the June ("home") slice.
# boxplot(dfhm$Difference ~ hour(dfhm$Datetime)
# , col = "pink"
# , main = "Difference in temperature (inside vs outside) by hour of day - home"
# , xlab = "Hour"
# , ylab = "Temperature C"
# , ylim = c(0, max(dfhm$Difference) )
# , horizontal=FALSE)
#
# Disabled: the same hourly view restricted to the May ("away") slice.
# boxplot(dfaway$Difference ~ hour(dfaway$Datetime)
# , col = "green"
# , main = "Difference in temperature (inside vs outside) by hour of day - away"
# , xlab = "Hour"
# , ylab = "Temperature C"
# , ylim = c(0, max(dfaway$Difference) )
# , horizontal=FALSE)
# Violin + box plot of Temperature_C for each month, one violin per month.
#
# data   data frame with a factor column `mth` and a numeric `Temperature_C`
# counts one row per `mth` with the number of readings in `num`
# title  plot title
#
# Returns the ggplot object (it prints when called at top level).
plot_monthly_violin <- function(data, counts, title) {
  data %>%
    left_join(counts, by = "mth") %>%
    mutate(
      myaxis = paste0(mth, "\n", "n=", num),
      # A plain character axis sorts as text, which puts months 10-12 ahead
      # of 4-9. Fix the label order to follow the month factor instead.
      myaxis = factor(myaxis, levels = unique(myaxis[order(mth)]))
    ) %>%
    ggplot(aes(x = myaxis, y = Temperature_C, fill = mth)) +
    geom_violin(width = 1.4) +
    geom_boxplot(width = 0.1, color = "grey", alpha = 0.2) +
    scale_fill_viridis(discrete = TRUE) +
    theme_ipsum() +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 11)
    ) +
    ggtitle(title) +
    xlab("")
}

# Outside: tag each reading with its month and count the readings per month
dfw_s$mth <- as.factor(month(dfw_s$Datetime))
sample_size <- dfw_s %>%
  group_by(mth) %>%
  summarize(num = n())
# Plot
plot_monthly_violin(
  dfw_s, sample_size,
  "Outside by month- using violin and box plot"
)

# Inside: same steps for the logger readings
dfl$mth <- as.factor(month(dfl$Datetime))
sample_size <- dfl %>%
  group_by(mth) %>%
  summarize(num = n())
# Plot
plot_monthly_violin(
  dfl, sample_size,
  "Inside by month- using violin and box plot"
)
# Time-series view of the two temperature records.
#
# set parameters (shared by both plots below)
c1 <- "skyblue"
c2 <- "red"
d1 <- "Outside"
d2 <- "Inside"
Title <- "Inside Vs. Outside"
xlab <- "Date"
ylab <- "Temperature" # was misspelt "Temperture" on the axis
# Name each colour by its series label so the label-to-colour pairing is
# explicit rather than depending on how unnamed values are matched to breaks.
series_cols <- setNames(c(c1, c2), c(d1, d2))

# Plot 1: outside and inside readings overlaid.
# colour = d1 / d2 maps a constant label, which creates the legend entries.
p <- ggplot() +
  geom_line(data = dfw_s, aes(x = Datetime, y = Temperature_C, colour = d1)) +
  geom_line(data = dfl, aes(x = Datetime, y = Temperature_C, colour = d2)) +
  ggtitle(Title) +
  xlab(xlab) +
  ylab(ylab) +
  scale_colour_manual("", breaks = c(d1, d2), values = series_cols) +
  theme(legend.position = "top")
p

# Plot 2: inside readings only. The manual scale is limited to the inside
# series so it keeps the same colour and untitled legend as in plot 1.
p <- ggplot() +
  geom_line(data = dfl, aes(x = Datetime, y = Temperature_C, colour = d2)) +
  ggtitle(Title) +
  xlab(xlab) +
  ylab(ylab) +
  scale_colour_manual("", breaks = d2, values = series_cols[d2]) +
  theme(legend.position = "top")
p
Hourly weather data
To get hold of hourly temperature records for Canberra I had to find a
website that keeps a record of the hourly temp (not BOM). The website I
am using is www.timeanddate.com . Below I am going through each step of
scraping this data and cleaning it for my purposes here
# History page for January 2023, used for the step-by-step walkthrough below
link <- "https://www.timeanddate.com/weather/australia/canberra/historic?month=1&year=2023"
#
# read in html from link (fetches the page over the network)
page <- read_html(link)
data check (raw scrape)
From the output we can see this is xml format data
str(page)
## List of 2
## $ node:<externalptr>
## $ doc :<externalptr>
## - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
Convert html to text string
# Flatten the parsed page into its text content
text <- page %>% html_text()
data check (raw - text)
Now we see the data as one long character string
str(text)
## chr "Weather in January 2023 in Canberra, Australian Capital Territory, Australia\n@font-face{font-family:iconfont;s"| __truncated__
Top and tail: from looking at the page source in the browser I can see that the data I want is stored in a variable called ‘data’ and that it ends with the word ‘detail’. So I will use these two markers to extract the bit I want
# Keep from 'var data=' onwards, then cut back to the last 'detail' (greedy)
top_tail <- text %>%
  str_extract("var data=.*") %>%
  str_extract(".*detail")
data check (top and tail)
str(top_tail)
## chr "var data={\"copyright\":\"Contents are strictly for use by timeanddate.com\",\"units\":{\"temp\":\"°C\",\"prec\"| __truncated__
str_sub(top_tail,-20,-1)
## [1] ",\"temp\":18}],\"detail"
Grab out text contained in square brackets
From playing with the data I’ve found the data I want is contained in square brackets
# Extract the text found between square brackets (returned as a list)
ex.sq <- ex_bracket(top_tail, pattern = "square")
data check (square brackets)
Using a function that employs regex to find things in brackets, we keep only what is contained in square brackets from our text string above
str(ex.sq)
## List of 1
## $ : chr "{\"date\":16725321E5,\"temp\":18},{\"date\":1672533E6,\"temp\":17},{\"date\":16725348E5,\"temp\":17},{\"date\":"| __truncated__
## - attr(*, "class")= chr [1:2] "extracted" "list"
Grab out text contained in curly brackets
Using a function that employs regex to find things in brackets, we keep only what is contained in curly brackets from our text string above
# Extract each curly-bracket piece; [[1]] takes the character vector out of
# the list that ex_bracket returns
ex.cur <- ex_bracket(ex.sq, pattern = "curly")[[1]]
data check (curly brackets)
Not a huge change - but now we are bracket free
str(ex.cur)
## chr [1:1351] "\"date\":16725321E5,\"temp\":18" ...
convert to a dataframe
Now if we convert this character string to a dataframe we get an observation (date and temp) per row
# One row per extracted piece, held in a single column named ex.cur
df <- data.frame(ex.cur = ex.cur)
data check (dataframe)
head(df)
## ex.cur
## 1 "date":16725321E5,"temp":18
## 2 "date":1672533E6,"temp":17
## 3 "date":16725348E5,"temp":17
## 4 "date":16725366E5,"temp":18
## 5 "date":16725384E5,"temp":17
## 6 "date":16725402E5,"temp":18
Extract datetime from obs
keep only string parts that have a datetime in UNIX epoch format
# Match the stamp: 5-9 digits, a literal E, then one exponent digit
# (for example 16725321E5)
df$date_rw <- str_extract(df$ex.cur, "\\d{5,9}E\\d")
data check (Extract datetime)
head(df)
## ex.cur date_rw
## 1 "date":16725321E5,"temp":18 16725321E5
## 2 "date":1672533E6,"temp":17 1672533E6
## 3 "date":16725348E5,"temp":17 16725348E5
## 4 "date":16725366E5,"temp":18 16725366E5
## 5 "date":16725384E5,"temp":17 16725384E5
## 6 "date":16725402E5,"temp":18 16725402E5
UNIX epoch to EST
Convert datetime from UNIX epoch to standard datetime format, subtracting 10 hours for the offset from AEST (11 hours during daylight saving), and lastly add 1 second to keep from losing the time part at midnight and noon
# Millisecond stamps -> seconds -> date-time, shifted back by the hour offset
# that applies on each side of the 2023-04-03 cut-off, plus one second.
df$Datetime <- local({
  stamp <- anytime(as.numeric(df$date_rw) / 1000)
  # 10 hours after the cut-off, 11 hours up to and including it
  offset_hours <- ifelse(stamp > "2023-04-03", 10, 11)
  anytime(as.numeric(stamp) - (offset_hours * 3600) + 1)
})
data check (Datetime)
head(df)
## ex.cur date_rw Datetime
## 1 "date":16725321E5,"temp":18 16725321E5 2023-01-01 00:15:01
## 2 "date":1672533E6,"temp":17 1672533E6 2023-01-01 00:30:01
## 3 "date":16725348E5,"temp":17 16725348E5 2023-01-01 01:00:01
## 4 "date":16725366E5,"temp":18 16725366E5 2023-01-01 01:30:01
## 5 "date":16725384E5,"temp":17 16725384E5 2023-01-01 02:00:01
## 6 "date":16725402E5,"temp":18 16725402E5 2023-01-01 02:30:01
Extract temperature from obs
keep only string parts that have a temperature (json key : temp)
# Keep from the "temp" key onwards, then from its colon onwards (e.g. ":18")
df$temp_rw <- df$ex.cur %>%
  str_extract("temp.*") %>%
  str_extract(":.*")
data check (Extract temperature)
head(df)
## ex.cur date_rw Datetime temp_rw
## 1 "date":16725321E5,"temp":18 16725321E5 2023-01-01 00:15:01 :18
## 2 "date":1672533E6,"temp":17 1672533E6 2023-01-01 00:30:01 :17
## 3 "date":16725348E5,"temp":17 16725348E5 2023-01-01 01:00:01 :17
## 4 "date":16725366E5,"temp":18 16725366E5 2023-01-01 01:30:01 :18
## 5 "date":16725384E5,"temp":17 16725384E5 2023-01-01 02:00:01 :17
## 6 "date":16725402E5,"temp":18 16725402E5 2023-01-01 02:30:01 :18
Tidy up temperature
Create new field for temperature , remove colon and convert to numeric
# Drop the leading colon and store the reading as a number
df$Temperature_C <- as.numeric(
  sub(":", "", df$temp_rw)
)
data check (Temperature_C)
head(df)
## ex.cur date_rw Datetime temp_rw
## 1 "date":16725321E5,"temp":18 16725321E5 2023-01-01 00:15:01 :18
## 2 "date":1672533E6,"temp":17 1672533E6 2023-01-01 00:30:01 :17
## 3 "date":16725348E5,"temp":17 16725348E5 2023-01-01 01:00:01 :17
## 4 "date":16725366E5,"temp":18 16725366E5 2023-01-01 01:30:01 :18
## 5 "date":16725384E5,"temp":17 16725384E5 2023-01-01 02:00:01 :17
## 6 "date":16725402E5,"temp":18 16725402E5 2023-01-01 02:30:01 :18
## Temperature_C
## 1 18
## 2 17
## 3 17
## 4 18
## 5 17
## 6 18