WORK IN PROGRESS
interesting links and sources:
- https://www.edwardtufte.com/bboard/q-and-a-fetch-msg?msg_id=00014g
- http://academic.udayton.edu/kissock/http/Weather/
- …
TEXT
A good text is still missing here. However, a short summary of the project of mine is as follows:
- analysing the weather data
- recreating the graph you can see below - building shiney app
- main features: You can choose the city & You can choose the “years of interest”
- visualize all relevant cities on world map
- click on city to get this graph
Original graph from the New York Times (Weather Chart from January 2004, p. A-15)
pkgsName <- c("readr","magrittr","ggplot2","dplyr", "gridExtra")
# (pkgs <- package(pkgsName))
lapply(pkgsName, require, character.only = TRUE)
#> Loading required package: readr
#> Loading required package: magrittr
#> Loading required package: ggplot2
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#> Loading required package: gridExtra
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#>
#> combine
#> [[1]]
#> [1] TRUE
#>
#> [[2]]
#> [1] TRUE
#>
#> [[3]]
#> [1] TRUE
#>
#> [[4]]
#> [1] TRUE
#>
#> [[5]]
#> [1] TRUETEXT
Three functions which make it easy to load the files:
- make_filename()
- get_FileInfo()
- read_and_load()
make_filename <- function(CityABBR) {
filePathSep <- "/"
fileNamesep <- "."
fileExt <- "txt"
baseURL <- "http://academic.udayton.edu/kissock/http/Weather/gsod95-current"
filename <- paste(CityABBR, fileExt, sep = fileNamesep)
finalURL <- paste(baseURL, filename, sep = filePathSep)
} # END make_filename()
get_FileInfo <- function(CityFile, CountryABBR, City){
# start with an empty data frame:
# not really needed if only one file is looked at
# df <- data.frame(name = c(), size = c())
fileInfo <- object.size(CityFile)
fileSizeInMb <- paste(round(fileInfo / 1024 / 1024, 2), "MB")
df <- data.frame(name = paste(CountryABBR, City), size = fileSizeInMb)
} #END get_FileInfo
read_and_load <- function(finalURL){
ext_tracks_colnames <- c("Month", "Day", "Year", "TempInF")
ext_tracks_widths <- c(8,9,17,17)
# data <- readr::read_fwf(finalURL) #col_names = FALSE
data <- readr::read_fwf(finalURL,
fwf_widths(ext_tracks_widths,
ext_tracks_colnames)
)
return(data)
}So, let’s start with the city Cairo in Egypt:
- define input parameters
- load the file with the functions you saw above
CountryABBR <- "EG"; City <- "CAIRO" #ensure: captial letters
CityABBR <- paste0(CountryABBR, City)
# make file name
name <- make_filename(CityABBR)
# read
testfile <- read_and_load(name)
(get_FileInfo(testfile, CountryABBR, City))
#> name size
#> 1 EG CAIRO 0.45 MB
head(testfile) # --simple as it can be
#> # A tibble: 6 x 4
#> Month Day Year TempInF
#> <int> <int> <int> <dbl>
#> 1 1 1 1995 59.2
#> 2 1 2 1995 57.5
#> 3 1 3 1995 57.4
#> 4 1 4 1995 59.3
#> 5 1 5 1995 58.8
#> 6 1 6 1995 55.7TEXT
I defined the following functions:
- get_YearData(data, YearsOfInterest)
- get_YearPastExtremes(PastData)
- get_ExtremesForYearX(PastYearExtremes, DataYearX)
They help to summarize the data given by the *.csv files. They are defined as follows:
get_YearData <- function(data, YearsOfInterest){
Past <- data %>%
dplyr::group_by(Year) %>%
dplyr::mutate(seqDay = seq(1, length(Day))) %>%
dplyr::ungroup() %>%
dplyr::filter(TempInF != -99 & Year %in% YearsOfInterest) %>% # missing values = -99
dplyr::group_by(seqDay) %>%
dplyr::mutate(TempInC = (TempInF - 32) / 1.8,
upper = max(TempInC),
lower = min(TempInC),
avg = mean(TempInC),
se = sd(TempInC) / sqrt(length(TempInC)),
avg_upper = avg + (2.101 * se),
avg_lower = avg - (2.101 * se)) %>%
ungroup()
}
get_YearPastExtremes <- function(PastData){
Low <- PastData %>%
dplyr::group_by(seqDay) %>%
dplyr::summarise(PastLow = min(TempInC),
PastHigh = max(TempInC),
PastAvg = mean(TempInC),
PastSe = sd(TempInC) / sqrt(length(TempInC)),
PastAvgLow = PastAvg - (2.101 * PastSe),
PastAvgHigh = PastAvg + (2.101 * PastSe))
}
get_ExtremesForYearX <- function(PastYearExtremes, DataYearX){
Low_YearX <- DataYearX %>%
dplyr::left_join(PastYearExtremes) %>%
mutate(recordLow = ifelse(TempInC < PastLow, "Y", "N"),
recordHigh = ifelse(TempInC >= PastHigh, "Y", "N"))
}So the routine looks like this:
YearToday <- 2014L
YearPastMin <- min(testfile$Year)
YearPastMax <- YearToday - 1
PastData <- get_YearData(testfile, YearPastMin:YearPastMax)
#> Warning: package 'bindrcpp' was built under R version 3.3.3
DataYearX <- get_YearData(testfile, YearToday)
PastExtremes <- get_YearPastExtremes(PastData)
head(PastExtremes)
#> # A tibble: 6 x 7
#> seqDay PastLow PastHigh PastAvg PastSe PastAvgLow PastAvgHigh
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 12.61111 19.33333 15.26901 0.3567744 14.51942 16.01859
#> 2 2 12.50000 20.44444 15.50585 0.4304761 14.60142 16.41028
#> 3 3 11.77778 18.50000 15.03216 0.4338677 14.12061 15.94372
#> 4 4 12.00000 18.61111 14.93275 0.3712363 14.15278 15.71272
#> 5 5 11.16667 19.16667 14.62865 0.4309403 13.72325 15.53406
#> 6 6 10.05556 19.44444 14.42982 0.4841701 13.41258 15.44707
tail(PastExtremes)
#> # A tibble: 6 x 7
#> seqDay PastLow PastHigh PastAvg PastSe PastAvgLow PastAvgHigh
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 361 11.05556 21.05556 15.16959 0.5581563 13.99690 16.34228
#> 2 362 11.16667 19.83333 15.25731 0.5298165 14.14417 16.37045
#> 3 363 11.61111 18.38889 14.96784 0.3705729 14.18926 15.74641
#> 4 364 12.77778 17.88889 15.32407 0.3421212 14.60528 16.04287
#> 5 365 13.05556 18.22222 15.27778 0.3537958 14.53445 16.02110
#> 6 366 13.94444 18.33333 15.61111 0.8136566 13.90162 17.32060
YearXExtremes <- get_ExtremesForYearX(PastExtremes, DataYearX)
#> Joining, by = "seqDay"TEXT
This is not yet the final product (obvious…duah :)
Work in Progress
Keep in mind, this is the Weather of Cairo.
Here is an evolution of the graph
evo2 <- create_base_plot(City, PastExtremes,
DataYearX,
YearXExtremes,
YearPastMin,
YearPastMax,
YearToday)
evo3 <- evo2 %>%
add_line_yearX(DataYearX)
evo4 <- evo3 %>%
add_formatting(feb_days = 29, PastExtremes)
evo5 <- evo4 %>%
add_ExtremePoints(YearXExtremes)
evo6 <- evo5 +
theme_dayton()
grid.arrange(evo2,evo3,evo4,evo5,evo6, layout_matrix = rbind(c(1,2),c(3,4), c(5,5), c(5,5), c(5,5)))Here you can find some of the functions used for plotting
## ---
theme_dayton <- function(base_size = 11, base_family = 'sans'){
dayton <- ggplot2::theme_minimal(base_size = base_size,
base_family = base_family) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(face = "bold",
hjust = 0.012,
vjust = 0.8,
color = "#3C3C3C",
size = 20)
)
dayton
}
## ---
create_base_plot <- function(City, df_PastExtremes,
df_DataYearX,
df_YearXExtremes,
YearPastMin,
YearPastMax,
YearToday){
year_current <- max(df_DataYearX$Year) # Year to look at ("current year")
y_maxValue <- plyr::round_any(range(df_PastExtremes$PastHigh)[2], 10, f = ceiling)
temp_value <- plyr::round_any(range(df_PastExtremes$PastHigh)[1], 10, f = floor)
y_minValue <- ifelse(temp_value > 0, 0, temp_value)
p1 <- ggplot() +
geom_linerange(df_PastExtremes, mapping = aes(x = seqDay,
ymin = PastLow,
ymax = PastHigh),
colour = "wheat2",
alpha = 1) +
geom_linerange(df_PastExtremes, mapping = aes(x = seqDay,
ymin = PastAvgLow,
ymax = PastAvgHigh),
colour = "wheat4")
return(p1)
}
## ---
add_line_yearX <- function(gg_obj, df_DataYearX){
gg_obj +
geom_line(df_DataYearX,
mapping = aes(x = seqDay,
y = TempInC,
group = 1))
}
## ---
add_formatting <- function(gg_obj, feb_days, df_PastExtremes){
# pre-calculation
y_maxValue <- plyr::round_any(range(df_PastExtremes$PastHigh)[2], 10, f = ceiling)
temp_value <- plyr::round_any(range(df_PastExtremes$PastHigh)[1], 10, f = floor)
y_minValue <- ifelse(temp_value > 0, 0, temp_value)
gg_obj +
geom_vline(xintercept = 0, colour = "wheat4", linetype=1, size=1) +
geom_hline(yintercept = 0, colour = "white", linetype=1) +
geom_hline(yintercept = 5, colour = "white", linetype=1) +
geom_hline(yintercept = 10, colour = "white", linetype=1) +
geom_hline(yintercept = 15, colour = "white", linetype=1) +
geom_hline(yintercept = 20, colour = "white", linetype=1) +
geom_hline(yintercept = 25, colour = "white", linetype=1) +
geom_hline(yintercept = 30, colour = "white", linetype=1) +
geom_hline(yintercept = 35, colour = "white", linetype=1) +
geom_hline(yintercept = 40, colour = "white", linetype=1) +
geom_vline(xintercept = 31, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 31 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 2*31 + 0*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 2*31 + 1*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 3*31 + 1*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 3*31 + 2*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 4*31 + 2*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 4*31 + 3*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 5*31 + 3*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 5*31 + 4*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 6*31 + 4*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
geom_vline(xintercept = 6*31 + 5*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
scale_x_continuous(expand = c(0,0),
breaks = c(15, 45, 75, 105, 135, 165, 198, 228, 258, 288, 320, 350),
position = "top",
labels = c("January", "February", "March", "April",
"May", "June", "July", "August", "September",
"October", "November", "December")) +
ylim(y_minValue, y_maxValue) +
labs(x = "", y = expression("Temperature in "*~degree*C)) +
ggtitle(paste0(City, "'s Weather in ", YearToday))
}
## ---
add_ExtremePoints <- function(gg_obj, df_YearXExtremes){
gg_obj +
geom_point(data = df_YearXExtremes[df_YearXExtremes$recordLow == "Y", ],
aes(x = seqDay, y = TempInC), color = "blue3") +
geom_point(data = df_YearXExtremes[df_YearXExtremes$recordHigh == "Y", ],
aes(x = seqDay, y = TempInC), color = "firebrick3")
}
## ---
# not in use
add_subtext <- function(){
gridText <- grid.text("Temperature.....", x = 0, y = 1,
rot = 0,
just = "left", vjust = 1)
grid::grid.draw(gridText)
}
## ---
# not in use
plot.title = function(plot = NULL, text.1 = NULL, text.2 = NULL,
size.1 = 12, size.2 = 12,
col.1 = "black", col.2 = "black",
face.1 = "plain", face.2 = "plain") {
require(gtable)
require(grid)
gt = ggplotGrob(plot)
text.grob1 = textGrob(text.1, y = unit(.45, "npc"),
gp = gpar(fontsize = size.1, col = col.1, fontface = face.1))
text.grob2 = textGrob(text.2, y = unit(.65, "npc"),
gp = gpar(fontsize = size.2, col = col.2, fontface = face.2))
text = matrix(list(text.grob1, text.grob2), nrow = 2)
text = gtable_matrix(name = "title", grobs = text,
widths = unit(1, "null"),
heights = unit.c(unit(1.1, "grobheight", text.grob1) + unit(0.5, "lines"), unit(1.1, "grobheight", text.grob2) + unit(0.5, "lines")))
gt = gtable_add_grob(gt, text, t = 2, l = 4)
gt$heights[2] = sum(text$heights)
class(gt) = c("Title", class(gt))
gt
}