WORK IN PROGRESS

Introduction

interesting links and sources:
- https://www.edwardtufte.com/bboard/q-and-a-fetch-msg?msg_id=00014g
- http://academic.udayton.edu/kissock/http/Weather/
- …

TEXT
A good text is still missing here. However, a short summary of the project of mine is as follows:
- analysing the weather data
- recreating the graph you can see below - building shiney app
- main features: You can choose the city & You can choose the “years of interest”
- visualize all relevant cities on world map
- click on city to get this graph

Original graph from the New York Times (Weather Chart from January 2004, p. A-15) alt text

Initiating

pkgsName <- c("readr","magrittr","ggplot2","dplyr", "gridExtra")
# (pkgs <- package(pkgsName))
lapply(pkgsName, require, character.only = TRUE)
#> Loading required package: readr
#> Loading required package: magrittr
#> Loading required package: ggplot2
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
#> Loading required package: gridExtra
#> 
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#> 
#>     combine
#> [[1]]
#> [1] TRUE
#> 
#> [[2]]
#> [1] TRUE
#> 
#> [[3]]
#> [1] TRUE
#> 
#> [[4]]
#> [1] TRUE
#> 
#> [[5]]
#> [1] TRUE

Data - Exploring

TEXT
Three functions which make it easy to load the files:
- make_filename()
- get_FileInfo()
- read_and_load()


make_filename <- function(CityABBR) {
        filePathSep <- "/"
        fileNamesep <- "."
        fileExt <- "txt"
        baseURL <- "http://academic.udayton.edu/kissock/http/Weather/gsod95-current"
        
        filename <- paste(CityABBR, fileExt, sep = fileNamesep)
        finalURL <- paste(baseURL, filename, sep = filePathSep)
        
} # END make_filename()

get_FileInfo <- function(CityFile, CountryABBR, City){
        # start with an empty data frame:
        #     not really needed if only one file is looked at
        # df <- data.frame(name = c(), size = c()) 
        fileInfo <- object.size(CityFile)
        fileSizeInMb <- paste(round(fileInfo / 1024 / 1024, 2), "MB")
        df <- data.frame(name = paste(CountryABBR, City), size = fileSizeInMb)
} #END get_FileInfo


read_and_load <- function(finalURL){
        ext_tracks_colnames <- c("Month", "Day", "Year", "TempInF")
        ext_tracks_widths <- c(8,9,17,17)
        # data <- readr::read_fwf(finalURL) #col_names = FALSE
        data <- readr::read_fwf(finalURL, 
                                fwf_widths(ext_tracks_widths, 
                                           ext_tracks_colnames)
                                )
        return(data)
}

So, let’s start with the city Cairo in Egypt:
- define input parameters
- load the file with the functions you saw above

CountryABBR <- "EG"; City <- "CAIRO" #ensure: captial letters
CityABBR <- paste0(CountryABBR, City)

# make file name
name <- make_filename(CityABBR)

# read
testfile <- read_and_load(name)
(get_FileInfo(testfile, CountryABBR, City))
#>       name    size
#> 1 EG CAIRO 0.45 MB
head(testfile) # --simple as it can be
#> # A tibble: 6 x 4
#>   Month   Day  Year TempInF
#>   <int> <int> <int>   <dbl>
#> 1     1     1  1995    59.2
#> 2     1     2  1995    57.5
#> 3     1     3  1995    57.4
#> 4     1     4  1995    59.3
#> 5     1     5  1995    58.8
#> 6     1     6  1995    55.7

Preparing the data

TEXT

I defined the following functions:
- get_YearData(data, YearsOfInterest)
- get_YearPastExtremes(PastData)
- get_ExtremesForYearX(PastYearExtremes, DataYearX)

They help to summarize the data given by the *.csv files. They are defined as follows:

get_YearData <- function(data, YearsOfInterest){

        Past <- data %>%
                dplyr::group_by(Year) %>%
                dplyr::mutate(seqDay = seq(1, length(Day))) %>%
                dplyr::ungroup() %>%
                dplyr::filter(TempInF != -99 & Year %in% YearsOfInterest) %>% # missing values = -99
                dplyr::group_by(seqDay) %>%
                dplyr::mutate(TempInC = (TempInF - 32) / 1.8,
                              upper = max(TempInC), 
                              lower = min(TempInC),
                              avg = mean(TempInC),
                              se = sd(TempInC) / sqrt(length(TempInC)),
                              avg_upper = avg + (2.101 * se),
                              avg_lower = avg - (2.101 * se)) %>%
                ungroup()
}

get_YearPastExtremes <- function(PastData){
        Low <- PastData %>%
                dplyr::group_by(seqDay) %>%
                dplyr::summarise(PastLow = min(TempInC),
                                 PastHigh = max(TempInC),
                                 PastAvg = mean(TempInC),
                                 PastSe = sd(TempInC) / sqrt(length(TempInC)),
                                 PastAvgLow = PastAvg - (2.101 * PastSe),
                                 PastAvgHigh = PastAvg + (2.101 * PastSe))
}


get_ExtremesForYearX <- function(PastYearExtremes, DataYearX){
      
        
        Low_YearX <- DataYearX %>%
                dplyr::left_join(PastYearExtremes) %>%
                mutate(recordLow = ifelse(TempInC < PastLow, "Y", "N"),
                       recordHigh = ifelse(TempInC >= PastHigh, "Y", "N"))
                
}

So the routine looks like this:

YearToday   <- 2014L
YearPastMin <- min(testfile$Year)
YearPastMax <- YearToday - 1

PastData  <- get_YearData(testfile, YearPastMin:YearPastMax)
#> Warning: package 'bindrcpp' was built under R version 3.3.3
DataYearX <- get_YearData(testfile, YearToday)

PastExtremes <- get_YearPastExtremes(PastData)

head(PastExtremes)
#> # A tibble: 6 x 7
#>   seqDay  PastLow PastHigh  PastAvg    PastSe PastAvgLow PastAvgHigh
#>    <int>    <dbl>    <dbl>    <dbl>     <dbl>      <dbl>       <dbl>
#> 1      1 12.61111 19.33333 15.26901 0.3567744   14.51942    16.01859
#> 2      2 12.50000 20.44444 15.50585 0.4304761   14.60142    16.41028
#> 3      3 11.77778 18.50000 15.03216 0.4338677   14.12061    15.94372
#> 4      4 12.00000 18.61111 14.93275 0.3712363   14.15278    15.71272
#> 5      5 11.16667 19.16667 14.62865 0.4309403   13.72325    15.53406
#> 6      6 10.05556 19.44444 14.42982 0.4841701   13.41258    15.44707
tail(PastExtremes)
#> # A tibble: 6 x 7
#>   seqDay  PastLow PastHigh  PastAvg    PastSe PastAvgLow PastAvgHigh
#>    <int>    <dbl>    <dbl>    <dbl>     <dbl>      <dbl>       <dbl>
#> 1    361 11.05556 21.05556 15.16959 0.5581563   13.99690    16.34228
#> 2    362 11.16667 19.83333 15.25731 0.5298165   14.14417    16.37045
#> 3    363 11.61111 18.38889 14.96784 0.3705729   14.18926    15.74641
#> 4    364 12.77778 17.88889 15.32407 0.3421212   14.60528    16.04287
#> 5    365 13.05556 18.22222 15.27778 0.3537958   14.53445    16.02110
#> 6    366 13.94444 18.33333 15.61111 0.8136566   13.90162    17.32060

YearXExtremes <- get_ExtremesForYearX(PastExtremes, DataYearX)
#> Joining, by = "seqDay"

Evolution of Plots

TEXT

This is not yet the final product (obvious…duah :)

Work in Progress

Keep in mind, this is the Weather of Cairo.
Here is an evolution of the graph

evo2 <- create_base_plot(City, PastExtremes, 
                       DataYearX, 
                       YearXExtremes, 
                       YearPastMin, 
                       YearPastMax, 
                       YearToday)

evo3 <- evo2 %>%
        add_line_yearX(DataYearX) 

evo4 <- evo3 %>%
        add_formatting(feb_days = 29, PastExtremes)

evo5 <- evo4 %>%
        add_ExtremePoints(YearXExtremes)

evo6 <- evo5 +
        theme_dayton()

grid.arrange(evo2,evo3,evo4,evo5,evo6, layout_matrix = rbind(c(1,2),c(3,4), c(5,5), c(5,5), c(5,5)))

Appendix

Here you can find some of the functions used for plotting

## ---
theme_dayton <- function(base_size = 11, base_family = 'sans'){
        dayton <- ggplot2::theme_minimal(base_size = base_size, 
                                           base_family = base_family) +
                                                  theme(panel.grid.major = element_blank(),
                                                        panel.grid.minor = element_blank(),
                                                        axis.ticks = element_blank(),
                                                        plot.title = element_text(face = "bold",
                                                                                  hjust = 0.012,
                                                                                  vjust = 0.8,
                                                                                  color = "#3C3C3C",
                                                                                  size = 20)
                                                  )
                   dayton
}




## ---
create_base_plot <- function(City, df_PastExtremes, 
                             df_DataYearX, 
                             df_YearXExtremes, 
                             YearPastMin, 
                             YearPastMax, 
                             YearToday){
        
        year_current <- max(df_DataYearX$Year) # Year to look at ("current year")
        y_maxValue <- plyr::round_any(range(df_PastExtremes$PastHigh)[2], 10, f = ceiling)
        temp_value <- plyr::round_any(range(df_PastExtremes$PastHigh)[1], 10, f = floor)
        y_minValue <- ifelse(temp_value > 0, 0, temp_value)
        
        
        p1 <- ggplot() + 
                geom_linerange(df_PastExtremes, mapping = aes(x = seqDay, 
                                                           ymin = PastLow, 
                                                           ymax = PastHigh), 
                               colour = "wheat2", 
                               alpha = 1) +
                geom_linerange(df_PastExtremes, mapping = aes(x = seqDay,
                                                              ymin = PastAvgLow, 
                                                              ymax = PastAvgHigh),
                               colour = "wheat4")
        return(p1)
}

## ---
add_line_yearX <- function(gg_obj, df_DataYearX){
        gg_obj + 
                geom_line(df_DataYearX, 
                          mapping = aes(x = seqDay, 
                                        y = TempInC, 
                                        group = 1))
}


## ---
add_formatting <- function(gg_obj, feb_days, df_PastExtremes){
        
        # pre-calculation
        y_maxValue <- plyr::round_any(range(df_PastExtremes$PastHigh)[2], 10, f = ceiling)
        temp_value <- plyr::round_any(range(df_PastExtremes$PastHigh)[1], 10, f = floor)
        y_minValue <- ifelse(temp_value > 0, 0, temp_value)
        
        gg_obj + 
                geom_vline(xintercept = 0, colour = "wheat4", linetype=1, size=1) +
                
                geom_hline(yintercept = 0, colour = "white", linetype=1) +
                geom_hline(yintercept = 5, colour = "white", linetype=1) +
                geom_hline(yintercept = 10, colour = "white", linetype=1) +
                geom_hline(yintercept = 15, colour = "white", linetype=1) +
                geom_hline(yintercept = 20, colour = "white", linetype=1) +
                geom_hline(yintercept = 25, colour = "white", linetype=1) +
                geom_hline(yintercept = 30, colour = "white", linetype=1) +
                geom_hline(yintercept = 35, colour = "white", linetype=1) +
                geom_hline(yintercept = 40, colour = "white", linetype=1) +
                
                geom_vline(xintercept = 31, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 31 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 2*31 + 0*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 2*31 + 1*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 3*31 + 1*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 3*31 + 2*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 4*31 + 2*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 4*31 + 3*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 5*31 + 3*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 5*31 + 4*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 6*31 + 4*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                geom_vline(xintercept = 6*31 + 5*30 + feb_days, colour = "wheat4", linetype=3, size=.5) +
                
                scale_x_continuous(expand = c(0,0),
                                   breaks = c(15, 45, 75, 105, 135, 165, 198, 228, 258, 288, 320, 350),
                                   position = "top",
                                   labels = c("January", "February", "March", "April",
                                              "May", "June", "July", "August", "September",
                                              "October", "November", "December")) +
                ylim(y_minValue, y_maxValue) +
                labs(x = "", y = expression("Temperature in "*~degree*C)) +
                ggtitle(paste0(City, "'s Weather in ", YearToday))
}


## ---
add_ExtremePoints <- function(gg_obj, df_YearXExtremes){
        gg_obj + 
                geom_point(data = df_YearXExtremes[df_YearXExtremes$recordLow == "Y", ], 
                           aes(x = seqDay, y = TempInC), color = "blue3") +
                geom_point(data = df_YearXExtremes[df_YearXExtremes$recordHigh == "Y", ], 
                           aes(x = seqDay, y = TempInC), color = "firebrick3")
}

## ---
# not in use
add_subtext <- function(){
        
        gridText <- grid.text("Temperature.....", x = 0, y = 1, 
                              rot = 0, 
                              just = "left", vjust = 1)
        grid::grid.draw(gridText)
}


## ---
# not in use
plot.title = function(plot = NULL, text.1 = NULL, text.2 = NULL, 
                      size.1 = 12,  size.2 = 12,
                      col.1 = "black", col.2 = "black", 
                      face.1 = "plain",  face.2 = "plain") {
        
        require(gtable)
        require(grid)
        
        gt = ggplotGrob(plot)
        
        text.grob1 = textGrob(text.1, y = unit(.45, "npc"), 
                              gp = gpar(fontsize = size.1, col = col.1, fontface = face.1))
        text.grob2 = textGrob(text.2,  y = unit(.65, "npc"), 
                              gp = gpar(fontsize = size.2, col = col.2, fontface = face.2))
        
        text = matrix(list(text.grob1, text.grob2), nrow = 2)
        text = gtable_matrix(name = "title", grobs = text, 
                             widths = unit(1, "null"), 
                             heights = unit.c(unit(1.1, "grobheight", text.grob1) + unit(0.5, "lines"), unit(1.1,  "grobheight", text.grob2) + unit(0.5, "lines")))
        
        gt = gtable_add_grob(gt, text, t = 2, l = 4)
        gt$heights[2] = sum(text$heights)
        
        class(gt) =  c("Title", class(gt))
        
        gt
}