library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(readr)

Issue Description

Describe the general area or issue you want to investigate in your data analysis project.

My original intention was to compare the weather in two locations that I’m considering retiring to: Otavalo, Ecuador and Oaxaca, Mexico. Standard weather data for these cities (or neighboring areas) is available from NOAA. I was able to identify the appropriate weather stations, download datasets and load the data to RStudio.

After performing some initial prep of the data, I found that I was unable to get any results when attempting to plot the data. I ended up swapping out the Ecuador data and substituting Olympia data but still was not seeing results. I eventually swapped out the Mexico data as well, substituting data from Houston, Texas.

In lieu of comparing data from Ecuador and Mexico, my project compares data from Olympia, Washington, to data from Houston, Texas.

Questions

Define at least two specific questions you would like to attempt to answer.

Using my substitute locations, Olympia, Washington and Houston, Texas:
1. Compare daily temperature for the two locations (both min and max)
2. Compare daily precipitation for the two locations

I would like to have taken this comparison further by looking at precipitation and temperature together (as an indication of high humidity or dry, dusty conditions) but I wasn’t able to include that in my project.

Data Source

Identify the data source(s) you used for your analysis. Provide a URL if possible.

My project will use the following data source: NOAA - National Centers for Environmental Information, National Oceanic Atmospheric Administration.

https://www.ncdc.noaa.gov/cdo-web/datasets

I used the “Daily Summaries” dataset and downloaded my datasets in CSV format.

Documentation

Provide a link to the documentation for the data or the documentation itself. Is there a data dictionary?

The following is a link to the documentation for my data source: https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf

Description of the Data

Use the tools in R such as str() and summary() to describe the original dataset you imported.

# Load the saved Olympia weather data from the .rdata file
load(file = "~/CSC360/olywthr.rdata")

# Show the structure (columns and types) of the Olympia, WA weather data
str(olywthr)
## Classes 'tbl_df', 'tbl' and 'data.frame':    49316 obs. of  9 variables:
##  $ STATION_NAME: chr  "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" ...
##  $ DATE        : Date, format: "1877-07-01" "1877-07-02" ...
##  $ PRCP        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TMAX        : int  63 63 67 67 71 74 80 88 81 70 ...
##  $ TMIN        : int  48 53 45 45 43 49 49 50 57 57 ...
##  $ yr          : num  1877 1877 1877 1877 1877 ...
##  $ mo          : num  7 7 7 7 7 7 7 7 7 7 ...
##  $ dy          : int  1 2 3 4 5 6 7 8 9 10 ...
# Per-column summary statistics for the Olympia data
summary(olywthr)
##  STATION_NAME            DATE                 PRCP       
##  Length:49316       Min.   :1877-07-01   Min.   :0.0000  
##  Class :character   1st Qu.:1913-05-10   1st Qu.:0.0000  
##  Mode  :character   Median :1949-12-24   Median :0.0000  
##                     Mean   :1948-11-12   Mean   :0.1409  
##                     3rd Qu.:1983-09-26   3rd Qu.:0.1400  
##                     Max.   :2017-07-11   Max.   :4.8200  
##       SNOW               TMAX             TMIN             yr      
##  Min.   : 0.00000   Min.   : 15.00   Min.   :-8.00   Min.   :1877  
##  1st Qu.: 0.00000   1st Qu.: 50.00   1st Qu.:34.00   1st Qu.:1913  
##  Median : 0.00000   Median : 59.00   Median :41.00   Median :1949  
##  Mean   : 0.02647   Mean   : 60.64   Mean   :40.42   Mean   :1948  
##  3rd Qu.: 0.00000   3rd Qu.: 71.00   3rd Qu.:47.00   3rd Qu.:1983  
##  Max.   :14.20000   Max.   :104.00   Max.   :76.00   Max.   :2017  
##        mo               dy       
##  Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 4.000   1st Qu.: 8.00  
##  Median : 7.000   Median :16.00  
##  Mean   : 6.516   Mean   :15.74  
##  3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :12.000   Max.   :31.00
# Read the Houston daily-summary CSV, parsing DATE as a year-month-day date
Houston1331074 <- read_csv(
  file = "~/CSC360/Houston1331074.csv",
  col_types = cols(DATE = col_date(format = "%Y-%m-%d"))
)

# Show the structure (columns and types) of the Houston, TX weather data
str(Houston1331074)
## Classes 'tbl_df', 'tbl' and 'data.frame':    17648 obs. of  7 variables:
##  $ STATION: chr  "USW00012960" "USW00012960" "USW00012960" "USW00012960" ...
##  $ NAME   : chr  "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" ...
##  $ DATE   : Date, format: "1970-01-01" "1970-01-02" ...
##  $ PRCP   : num  0 0.19 0 0.07 0.78 0 0 0 0.03 0.45 ...
##  $ TAVG   : chr  NA NA NA NA ...
##  $ TMAX   : int  56 45 51 53 50 40 44 48 40 57 ...
##  $ TMIN   : int  25 32 29 26 40 26 19 20 32 38 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 7
##   .. ..$ STATION: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ NAME   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ DATE   :List of 1
##   .. .. ..$ format: chr "%Y-%m-%d"
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ PRCP   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ TAVG   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ TMAX   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ TMIN   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Per-column summary statistics for the Houston data
summary(Houston1331074)
##    STATION              NAME                DATE           
##  Length:17648       Length:17648       Min.   :1970-01-01  
##  Class :character   Class :character   1st Qu.:1982-01-29  
##  Mode  :character   Mode  :character   Median :1994-02-27  
##                                        Mean   :1994-02-27  
##                                        3rd Qu.:2006-03-28  
##                                        Max.   :2018-04-26  
##                                                            
##       PRCP             TAVG                TMAX             TMIN      
##  Min.   : 0.0000   Length:17648       Min.   : 27.00   Min.   : 7.00  
##  1st Qu.: 0.0000   Class :character   1st Qu.: 71.00   1st Qu.:48.00  
##  Median : 0.0000   Mode  :character   Median : 82.00   Median :62.00  
##  Mean   : 0.1364                      Mean   : 79.55   Mean   :58.97  
##  3rd Qu.: 0.0200                      3rd Qu.: 91.00   3rd Qu.:72.00  
##  Max.   :16.0700                      Max.   :109.00   Max.   :83.00  
##  NA's   :1                            NA's   :1        NA's   :1

Cleaning and Preparation

Describe the steps you took to get from your original dataset to the final dataset you used for your analysis. Include the R code in chunks.

Prepare and combine the datasets for the two locations

# Keep only the columns used in the analysis and derive calendar parts
# (year, month, day of month) from DATE for later grouping.
olywthr <- olywthr %>%
  select(STATION_NAME, DATE, PRCP, TMAX, TMIN) %>%
  mutate(yr = year(DATE), mo = month(DATE), dy = day(DATE))

# The Houston file calls the station-name column NAME; rename it by name
# while selecting (rather than by column position afterwards) so both data
# frames share the STATION_NAME column regardless of column order.
houwthr <- Houston1331074 %>%
  select(STATION_NAME = NAME, DATE, PRCP, TMAX, TMIN) %>%
  mutate(yr = year(DATE), mo = month(DATE), dy = day(DATE))

# Confirm that both dataframes now have the same structure
glimpse(olywthr)
## Observations: 49,316
## Variables: 8
## $ STATION_NAME <chr> "OLYMPIA PRIEST PT PA WA US", "OLYMPIA PRIEST PT ...
## $ DATE         <date> 1877-07-01, 1877-07-02, 1877-07-03, 1877-07-04, ...
## $ PRCP         <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0...
## $ TMAX         <int> 63, 63, 67, 67, 71, 74, 80, 88, 81, 70, 71, 75, 7...
## $ TMIN         <int> 48, 53, 45, 45, 43, 49, 49, 50, 57, 57, 45, 47, 4...
## $ yr           <dbl> 1877, 1877, 1877, 1877, 1877, 1877, 1877, 1877, 1...
## $ mo           <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ dy           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
# Same structure check for the Houston data frame
glimpse(houwthr)
## Observations: 17,648
## Variables: 8
## $ STATION_NAME <chr> "HOUSTON INTERCONTINENTAL AIRPORT, TX US", "HOUST...
## $ DATE         <date> 1970-01-01, 1970-01-02, 1970-01-03, 1970-01-04, ...
## $ PRCP         <dbl> 0.00, 0.19, 0.00, 0.07, 0.78, 0.00, 0.00, 0.00, 0...
## $ TMAX         <int> 56, 45, 51, 53, 50, 40, 44, 48, 40, 57, 70, 53, 4...
## $ TMIN         <int> 25, 32, 29, 26, 40, 26, 19, 20, 32, 38, 46, 39, 3...
## $ yr           <dbl> 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1...
## $ mo           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dy           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
# Create shorter location names for plot legends.
# Only rows from the "OLYMPIA AIRPORT WA US" station get the airport code
# "OLM"; rows from any other Olympia station are left as NA. if_else() with
# NA_character_ keeps the column character-typed even if no row matches.
olywthr <- olywthr %>%
  mutate(
    abbr = "Oly",
    airport = if_else(STATION_NAME == "OLYMPIA AIRPORT WA US",
                      "OLM", NA_character_)
  )

houwthr <- houwthr %>%
  mutate(abbr = "Hou", airport = "IAH")

# Confirm that my two dataframes now have the same structure
str(olywthr)
## Classes 'tbl_df', 'tbl' and 'data.frame':    49316 obs. of  10 variables:
##  $ STATION_NAME: chr  "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" ...
##  $ DATE        : Date, format: "1877-07-01" "1877-07-02" ...
##  $ PRCP        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TMAX        : int  63 63 67 67 71 74 80 88 81 70 ...
##  $ TMIN        : int  48 53 45 45 43 49 49 50 57 57 ...
##  $ yr          : num  1877 1877 1877 1877 1877 ...
##  $ mo          : num  7 7 7 7 7 7 7 7 7 7 ...
##  $ dy          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ abbr        : chr  "Oly" "Oly" "Oly" "Oly" ...
##  $ airport     : chr  NA NA NA NA ...
# Same structure check for the Houston data frame
str(houwthr)
## Classes 'tbl_df', 'tbl' and 'data.frame':    17648 obs. of  10 variables:
##  $ STATION_NAME: chr  "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" ...
##  $ DATE        : Date, format: "1970-01-01" "1970-01-02" ...
##  $ PRCP        : num  0 0.19 0 0.07 0.78 0 0 0 0.03 0.45 ...
##  $ TMAX        : int  56 45 51 53 50 40 44 48 40 57 ...
##  $ TMIN        : int  25 32 29 26 40 26 19 20 32 38 ...
##  $ yr          : num  1970 1970 1970 1970 1970 1970 1970 1970 1970 1970 ...
##  $ mo          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ dy          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ abbr        : chr  "Hou" "Hou" "Hou" "Hou" ...
##  $ airport     : chr  "IAH" "IAH" "IAH" "IAH" ...
# Stack the Olympia and Houston rows into one data frame (columns match by name)
allwthr <- bind_rows(olywthr, houwthr)

# Per-column summary statistics for the combined data
summary(allwthr)
##  STATION_NAME            DATE                 PRCP        
##  Length:66964       Min.   :1877-07-01   Min.   : 0.0000  
##  Class :character   1st Qu.:1928-03-14   1st Qu.: 0.0000  
##  Mode  :character   Median :1972-01-26   Median : 0.0000  
##                     Mean   :1960-10-20   Mean   : 0.1397  
##                     3rd Qu.:1994-12-27   3rd Qu.: 0.1000  
##                     Max.   :2018-04-26   Max.   :16.0700  
##                                          NA's   :1        
##       TMAX             TMIN             yr             mo        
##  Min.   : 15.00   Min.   :-8.00   Min.   :1877   Min.   : 1.000  
##  1st Qu.: 53.00   1st Qu.:36.00   1st Qu.:1928   1st Qu.: 4.000  
##  Median : 65.00   Median :44.00   Median :1972   Median : 7.000  
##  Mean   : 65.63   Mean   :45.31   Mean   :1960   Mean   : 6.511  
##  3rd Qu.: 78.00   3rd Qu.:52.00   3rd Qu.:1994   3rd Qu.:10.000  
##  Max.   :109.00   Max.   :83.00   Max.   :2018   Max.   :12.000  
##  NA's   :1        NA's   :1                                      
##        dy            abbr             airport         
##  Min.   : 1.00   Length:66964       Length:66964      
##  1st Qu.: 8.00   Class :character   Class :character  
##  Median :16.00   Mode  :character   Mode  :character  
##  Mean   :15.74                                        
##  3rd Qu.:23.00                                        
##  Max.   :31.00                                        
## 
# Get the first and last observation date recorded for each station
StationDateRange <- allwthr %>%
  group_by(STATION_NAME) %>%
  summarize(min_date = min(DATE),
            max_date = max(DATE)) %>%
  ungroup()

StationDateRange

# Limit the data to overlapping dates: 1970-01-01 is the earliest Houston
# date and 2017-07-11 is the latest Olympia date in the summaries above.
# Rows without an airport code (Olympia stations other than the airport) are
# dropped so they cannot form a separate NA group in the per-airport
# summaries and plots that follow.
allwthr2 <- allwthr %>%
  filter(!is.na(airport),
         DATE >= as.Date("1970-01-01"),
         DATE <= as.Date("2017-07-11"))

Summarize data for reporting on min & max temperature

# Get the avg max temp & avg min temp for each airport/year/mo combination.
# na.rm = TRUE so that a single missing reading does not turn the whole
# month's average into NA.
allwthr3 <- allwthr2 %>%
  group_by(airport, yr, mo) %>%
  summarize(mmax = mean(TMAX, na.rm = TRUE),
            mmin = mean(TMIN, na.rm = TRUE)) %>%
  ungroup()

Summarize data for reporting on precipitation

# Get the number of rainy days (PRCP > 0) for each airport/year/mo
# combination. Counting with sum() instead of filtering the rows first keeps
# months with no rain at all as nrain = 0, so they are not silently dropped
# from any average computed from this table.
allwthr4 <- allwthr2 %>%
  group_by(airport, yr, mo) %>%
  summarize(nrain = sum(PRCP > 0, na.rm = TRUE)) %>%
  ungroup()

# Average the monthly rainy-day counts across years, giving one value per
# airport and calendar month
allwthr6 <- allwthr4 %>%
  group_by(airport, mo) %>%
  summarise(mnrain = mean(nrain)) %>%
  ungroup()

# Get avg amount of rain for the days with rain for each airport/mo
# combination. PRCP is subset to rainy days (PRCP > 0) before averaging;
# averaging the logical `PRCP > 0` itself would instead give the fraction of
# days with rain. Missing PRCP values are ignored via na.rm = TRUE.
allwthr5 <- allwthr2 %>%
  group_by(airport, mo) %>%
  summarize(mrain = mean(PRCP[PRCP > 0], na.rm = TRUE)) %>%
  ungroup()

Final Results

Show how you approached the questions you posed at the beginning. Describe how much you were able to accomplish. There should be both graphical and numerical results produced by R code included in chunks. Explain what you did and what it means.

I looked at temperature and precipitation data from month to month for each location, primarily using a faceted view of the data. The graphical results provide an easy way to see trends in the data for each location as well as how the two locations compare to one another. The numerical results provide a way to see specific data values, like: what is the average maximum temperature in Houston in August or what is the average rainfall per day in Olympia in December?

Seeing the comparison between the two locations is helpful in terms of getting a general understanding of the climate of each individual area. Deciding which location has the “better” climate requires a subjective evaluation. If some baseline of “ideal” values could be established, then presumably I would be able to do some specific comparison between the locations and the ideal.

To get a true measure of climate, I think it would be necessary to consider temperature and precipitation together instead of evaluating them separately. For example, if the temperature is high at a time when the precipitation is high, this would seem to indicate uncomfortable humidity. On the other hand, if precipitation is low for an extended period of time, particularly if the temperatures are high, this could be an indication that the area is likely to be dry and dusty.

# Scatter plot of the average monthly maximum temperature by year,
# one panel per calendar month, colored by airport
ggplot(allwthr3, aes(x = yr, y = mmax, color = airport)) +
  geom_point() +
  scale_x_continuous(name = "Year", breaks = c(1970, 1990, 2010)) +
  scale_y_continuous(name = "Temperature (°F)") +
  facet_wrap(~ mo) +
  labs(title = "Average Monthly Maximum Temperature (P5)")

# Split the monthly temperature averages into one table per airport
allwthr3_OLM <- filter(allwthr3, airport == "OLM")
allwthr3_IAH <- filter(allwthr3, airport == "IAH")

# Label the Olympia summary output that follows
print("OLYMPIA AIRPORT WA US")
## [1] "OLYMPIA AIRPORT WA US"
# Summary of the monthly average max temperature across years, per calendar month (OLM)
tapply(allwthr3_OLM$mmax,allwthr3_OLM$mo,summary)
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   39.84   43.58   44.66   45.48   47.65   50.26 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   42.18   47.57   48.96   49.42   51.34   55.43 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.48   51.27   53.90   53.94   56.33   60.39 
## 
## $`4`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   52.73   57.36   58.88   59.02   60.50   66.30 
## 
## $`5`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   60.42   63.73   65.53   65.68   67.74   72.65 
## 
## $`6`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   64.93   68.46   71.03   70.95   73.48   79.90 
## 
## $`7`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   71.06   75.37   77.23   77.21   78.94   84.35 
## 
## $`8`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   70.71   75.68   78.13   77.68   79.47   83.94 
## 
## $`9`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   65.77   69.47   72.03   71.82   74.20   79.63 
## 
## $`10`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   56.77   58.69   60.42   60.54   61.77   67.71 
## 
## $`11`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   41.13   49.00   50.17   50.35   51.47   55.00 
## 
## $`12`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   38.90   42.69   45.10   44.50   46.10   49.39
# Label the Houston summary output that follows
print("HOUSTON INTERCONTINENTAL AIRPORT, TX US")
## [1] "HOUSTON INTERCONTINENTAL AIRPORT, TX US"
# Summary of the monthly average max temperature across years, per calendar month (IAH)
tapply(allwthr3_IAH$mmax,allwthr3_IAH$mo,summary)
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.19   59.19   63.10   62.48   66.12   70.19 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   55.18   63.58   66.59   66.39   69.02   76.75 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.23   71.73   72.58   73.09   75.01   78.35 
## 
## $`4`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   74.27   78.31   79.13   79.46   80.65   85.47 
## 
## $`5`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   81.19   84.51   85.73   85.79   86.98   90.29 
## 
## $`6`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   87.60   90.01   90.92   91.36   92.41   97.40 
## 
## $`7`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   89.71   91.83   93.71   93.71   95.65   99.42 
## 
## $`8`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   89.42   92.56   94.00   94.05   95.52  102.03 
## 
## $`9`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   82.93   87.18   89.47   89.37   91.22   95.50 
## 
## $`10`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   72.16   80.71   82.13   81.93   83.27   86.61 
## 
## $`11`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   62.80   69.68   72.57   72.04   74.83   79.10 
## 
## $`12`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   55.32   63.94   65.23   64.79   66.52   71.84
# Scatter plot of the average monthly minimum temperature by year,
# one panel per calendar month, colored by airport
ggplot(allwthr3, aes(x = yr, y = mmin, color = airport)) +
  geom_point() +
  scale_x_continuous(name = "Year", breaks = c(1970, 1990, 2010)) +
  scale_y_continuous(name = "Temperature (°F)") +
  facet_wrap(~ mo) +
  labs(title = "Average Monthly Minimum Temperature (P6)")

# Line chart of the average number of rainy days per month, one line per airport
ggplot(allwthr6, aes(x = mo, y = mnrain, color = airport)) +
  geom_line(size = 1.5) +
  scale_x_continuous(name = "Month", breaks = allwthr6$mo) +
  scale_y_continuous(name = "Number of Rainy Days") +
  labs(title = "Average Number of Days with Rain (P11)")

# Distribution of the twelve monthly averages, per airport
tapply(allwthr6$mnrain, allwthr6$airport, summary)
## $IAH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.000   8.214   8.606   8.622   9.174   9.979 
## 
## $OLM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.778   9.089  14.761  13.740  18.922  20.362
# Bar chart of the mrain value for each airport, one panel per calendar month
ggplot(allwthr5, aes(x = airport, y = mrain, fill = airport)) +
  geom_col() +
  scale_y_continuous(name = "Inches of Rain") +
  facet_wrap(~ mo) +
  labs(title = "Average Amount of Rain on Days with Rain (P8)")

# Distribution of the monthly mrain values, per airport
tapply(allwthr5$mrain, allwthr5$airport, summary)
## $IAH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2333  0.2692  0.2901  0.2836  0.2981  0.3219 
## 
## $OLM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1466  0.2884  0.4854  0.4497  0.6104  0.6589