library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readr)
Describe the general area or issue you want to investigate in your data analysis project.
My original intention was to compare the weather in two locations that I’m considering retiring to: Otavalo, Ecuador and Oaxaca, Mexico. Standard weather data for these cities (or neighboring areas) is available from NOAA. I was able to identify the appropriate weather stations, download datasets and load the data to RStudio.
After performing some initial prep of the data, I found that I was unable to get any results when attempting to plot the data. I ended up swapping out the Ecuador data and substituting Olympia data but still was not seeing results. I eventually swapped out the Mexico data as well, substituting data from Houston, Texas.
In lieu of comparing data from Ecuador and Mexico, my project compares data from Olympia, Washington to data from Houston, Texas.
Define at least two specific questions you would like to attempt to answer.
Using my substitute locations, Olympia, Washington and Houston, Texas:
1. Compare daily temperature for locations (both min and max) 2. Compare daily precipitation for locations
I would like to have taken this comparison further by looking at precipitation and temperature together (as an indication of high humidity or dry, dusty conditions) but I wasn’t able to include that in my project.
Identify the data source(s) you used for your analysis. Provide a URL if possible.
My project will use the following data source: NOAA - National Centers for Environmental Information, National Oceanic Atmospheric Administration.
https://www.ncdc.noaa.gov/cdo-web/datasets
I used the “Daily Summaries” dataset and downloaded my datasets in CSV format.
Provide a link to the documentation for the data or the documentation itself. Is there a data dictionary?
The following is a link to the documentation for my data source: https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf
Use the tools in R such as str() and summary() to describe the original dataset you imported.
# Import Olympia weather data
# load() restores the objects saved in the .rdata file into the workspace;
# the calls below rely on it providing a data frame named `olywthr`.
load("~/CSC360/olywthr.rdata")
# Description of Weather data for Olympia, WA
# str() lists every column with its type and first few values.
str(olywthr)
## Classes 'tbl_df', 'tbl' and 'data.frame': 49316 obs. of 9 variables:
## $ STATION_NAME: chr "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" ...
## $ DATE : Date, format: "1877-07-01" "1877-07-02" ...
## $ PRCP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TMAX : int 63 63 67 67 71 74 80 88 81 70 ...
## $ TMIN : int 48 53 45 45 43 49 49 50 57 57 ...
## $ yr : num 1877 1877 1877 1877 1877 ...
## $ mo : num 7 7 7 7 7 7 7 7 7 7 ...
## $ dy : int 1 2 3 4 5 6 7 8 9 10 ...
# Per-column summary statistics (range, quartiles, mean) for the Olympia data
summary(olywthr)
## STATION_NAME DATE PRCP
## Length:49316 Min. :1877-07-01 Min. :0.0000
## Class :character 1st Qu.:1913-05-10 1st Qu.:0.0000
## Mode :character Median :1949-12-24 Median :0.0000
## Mean :1948-11-12 Mean :0.1409
## 3rd Qu.:1983-09-26 3rd Qu.:0.1400
## Max. :2017-07-11 Max. :4.8200
## SNOW TMAX TMIN yr
## Min. : 0.00000 Min. : 15.00 Min. :-8.00 Min. :1877
## 1st Qu.: 0.00000 1st Qu.: 50.00 1st Qu.:34.00 1st Qu.:1913
## Median : 0.00000 Median : 59.00 Median :41.00 Median :1949
## Mean : 0.02647 Mean : 60.64 Mean :40.42 Mean :1948
## 3rd Qu.: 0.00000 3rd Qu.: 71.00 3rd Qu.:47.00 3rd Qu.:1983
## Max. :14.20000 Max. :104.00 Max. :76.00 Max. :2017
## mo dy
## Min. : 1.000 Min. : 1.00
## 1st Qu.: 4.000 1st Qu.: 8.00
## Median : 7.000 Median :16.00
## Mean : 6.516 Mean :15.74
## 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :12.000 Max. :31.00
# Import Houston weather data
# DATE is parsed explicitly as a year-month-day date; the types of all other
# columns are left to readr's guessing.
Houston1331074 <- read_csv(
  file = "~/CSC360/Houston1331074.csv",
  col_types = cols(
    DATE = col_date(format = "%Y-%m-%d")
  )
)
# Description of weather data for Houston, TX
str(Houston1331074)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17648 obs. of 7 variables:
## $ STATION: chr "USW00012960" "USW00012960" "USW00012960" "USW00012960" ...
## $ NAME : chr "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" ...
## $ DATE : Date, format: "1970-01-01" "1970-01-02" ...
## $ PRCP : num 0 0.19 0 0.07 0.78 0 0 0 0.03 0.45 ...
## $ TAVG : chr NA NA NA NA ...
## $ TMAX : int 56 45 51 53 50 40 44 48 40 57 ...
## $ TMIN : int 25 32 29 26 40 26 19 20 32 38 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 7
## .. ..$ STATION: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ NAME : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ DATE :List of 1
## .. .. ..$ format: chr "%Y-%m-%d"
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ PRCP : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ TAVG : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ TMAX : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ TMIN : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Per-column summary statistics for the Houston data, including NA counts
summary(Houston1331074)
## STATION NAME DATE
## Length:17648 Length:17648 Min. :1970-01-01
## Class :character Class :character 1st Qu.:1982-01-29
## Mode :character Mode :character Median :1994-02-27
## Mean :1994-02-27
## 3rd Qu.:2006-03-28
## Max. :2018-04-26
##
## PRCP TAVG TMAX TMIN
## Min. : 0.0000 Length:17648 Min. : 27.00 Min. : 7.00
## 1st Qu.: 0.0000 Class :character 1st Qu.: 71.00 1st Qu.:48.00
## Median : 0.0000 Mode :character Median : 82.00 Median :62.00
## Mean : 0.1364 Mean : 79.55 Mean :58.97
## 3rd Qu.: 0.0200 3rd Qu.: 91.00 3rd Qu.:72.00
## Max. :16.0700 Max. :109.00 Max. :83.00
## NA's :1 NA's :1 NA's :1
Describe the steps you took to get from your original dataset to the final dataset you used for your analysis. Include the R code in chunks.
Prepare and combine the datasets for the two locations
# Reduce each station table to the columns used in the analysis and derive
# the calendar parts (year, month, day of month) from DATE with lubridate.
olywthr <- olywthr %>%
  select(STATION_NAME, DATE, PRCP, TMAX, TMIN) %>%
  mutate(
    yr = year(DATE),
    mo = month(DATE),
    dy = day(DATE)
  )
# The Houston file calls its station-name column NAME; rename it while
# selecting so that both tables end up with identical column names.
houwthr <- Houston1331074 %>%
  select(STATION_NAME = NAME, DATE, PRCP, TMAX, TMIN) %>%
  mutate(
    yr = year(DATE),
    mo = month(DATE),
    dy = day(DATE)
  )
# Confirm that both dataframes now have the same structure
glimpse(olywthr)
## Observations: 49,316
## Variables: 8
## $ STATION_NAME <chr> "OLYMPIA PRIEST PT PA WA US", "OLYMPIA PRIEST PT ...
## $ DATE <date> 1877-07-01, 1877-07-02, 1877-07-03, 1877-07-04, ...
## $ PRCP <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0...
## $ TMAX <int> 63, 63, 67, 67, 71, 74, 80, 88, 81, 70, 71, 75, 7...
## $ TMIN <int> 48, 53, 45, 45, 43, 49, 49, 50, 57, 57, 45, 47, 4...
## $ yr <dbl> 1877, 1877, 1877, 1877, 1877, 1877, 1877, 1877, 1...
## $ mo <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ dy <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
# Column names, types and leading values of the prepared Houston table
glimpse(houwthr)
## Observations: 17,648
## Variables: 8
## $ STATION_NAME <chr> "HOUSTON INTERCONTINENTAL AIRPORT, TX US", "HOUST...
## $ DATE <date> 1970-01-01, 1970-01-02, 1970-01-03, 1970-01-04, ...
## $ PRCP <dbl> 0.00, 0.19, 0.00, 0.07, 0.78, 0.00, 0.00, 0.00, 0...
## $ TMAX <int> 56, 45, 51, 53, 50, 40, 44, 48, 40, 57, 70, 53, 4...
## $ TMIN <int> 25, 32, 29, 26, 40, 26, 19, 20, 32, 38, 46, 39, 3...
## $ yr <dbl> 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1...
## $ mo <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dy <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
# Create shorter location names for plot legends
# `abbr` is a short city label and `airport` an airport code. Olympia rows
# whose station is not "OLYMPIA AIRPORT WA US" get a missing airport code.
olywthr <- olywthr %>%
  mutate(
    abbr = "Oly",
    airport = ifelse(STATION_NAME == "OLYMPIA AIRPORT WA US", "OLM", NA)
  )
houwthr <- houwthr %>%
  mutate(
    abbr = "Hou",
    airport = "IAH"
  )
# Confirm that my two dataframes now have the same structure
str(olywthr)
## Classes 'tbl_df', 'tbl' and 'data.frame': 49316 obs. of 10 variables:
## $ STATION_NAME: chr "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" ...
## $ DATE : Date, format: "1877-07-01" "1877-07-02" ...
## $ PRCP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TMAX : int 63 63 67 67 71 74 80 88 81 70 ...
## $ TMIN : int 48 53 45 45 43 49 49 50 57 57 ...
## $ yr : num 1877 1877 1877 1877 1877 ...
## $ mo : num 7 7 7 7 7 7 7 7 7 7 ...
## $ dy : int 1 2 3 4 5 6 7 8 9 10 ...
## $ abbr : chr "Oly" "Oly" "Oly" "Oly" ...
## $ airport : chr NA NA NA NA ...
# Structure of the labelled Houston table (columns, types, first values)
str(houwthr)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17648 obs. of 10 variables:
## $ STATION_NAME: chr "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" "HOUSTON INTERCONTINENTAL AIRPORT, TX US" ...
## $ DATE : Date, format: "1970-01-01" "1970-01-02" ...
## $ PRCP : num 0 0.19 0 0.07 0.78 0 0 0 0.03 0.45 ...
## $ TMAX : int 56 45 51 53 50 40 44 48 40 57 ...
## $ TMIN : int 25 32 29 26 40 26 19 20 32 38 ...
## $ yr : num 1970 1970 1970 1970 1970 1970 1970 1970 1970 1970 ...
## $ mo : num 1 1 1 1 1 1 1 1 1 1 ...
## $ dy : int 1 2 3 4 5 6 7 8 9 10 ...
## $ abbr : chr "Hou" "Hou" "Hou" "Hou" ...
## $ airport : chr "IAH" "IAH" "IAH" "IAH" ...
# Combine Olympia and Houston into a single dataframe
# The two tables share the same columns, so their rows are stacked by
# matching column names.
allwthr <- bind_rows(olywthr, houwthr)
summary(allwthr)
## STATION_NAME DATE PRCP
## Length:66964 Min. :1877-07-01 Min. : 0.0000
## Class :character 1st Qu.:1928-03-14 1st Qu.: 0.0000
## Mode :character Median :1972-01-26 Median : 0.0000
## Mean :1960-10-20 Mean : 0.1397
## 3rd Qu.:1994-12-27 3rd Qu.: 0.1000
## Max. :2018-04-26 Max. :16.0700
## NA's :1
## TMAX TMIN yr mo
## Min. : 15.00 Min. :-8.00 Min. :1877 Min. : 1.000
## 1st Qu.: 53.00 1st Qu.:36.00 1st Qu.:1928 1st Qu.: 4.000
## Median : 65.00 Median :44.00 Median :1972 Median : 7.000
## Mean : 65.63 Mean :45.31 Mean :1960 Mean : 6.511
## 3rd Qu.: 78.00 3rd Qu.:52.00 3rd Qu.:1994 3rd Qu.:10.000
## Max. :109.00 Max. :83.00 Max. :2018 Max. :12.000
## NA's :1 NA's :1
## dy abbr airport
## Min. : 1.00 Length:66964 Length:66964
## 1st Qu.: 8.00 Class :character Class :character
## Median :16.00 Mode :character Mode :character
## Mean :15.74
## 3rd Qu.:23.00
## Max. :31.00
##
# Get the date range of data for each station
# One row per station name with its earliest and latest observation date.
StationDateRange <- allwthr %>%
  group_by(STATION_NAME) %>%
  summarize(
    min_date = min(DATE),
    max_date = max(DATE)
  ) %>%
  ungroup()
StationDateRange
# Limit the data to overlapping dates
# Keep observations dated 1970-01-01 through 2017-07-11, both ends inclusive.
allwthr2 <- allwthr %>%
  filter(
    DATE >= as.Date("1970-01-01"),
    DATE <= as.Date("2017-07-11")
  )
Summarize data for reporting on min & max temperature
# Get the avg max temp & avg min temp for each airport/year/mo combination
# na.rm = TRUE: without it a single missing daily reading turns that whole
# month's average into NA (the Houston file contains missing TMAX/TMIN values).
allwthr3 <- allwthr2 %>%
  group_by(airport, yr, mo) %>%
  summarize(
    mmax = mean(TMAX, na.rm = TRUE),
    mmin = mean(TMIN, na.rm = TRUE)
  ) %>%
  ungroup()
Summarize data for reporting on precipitation
# Get the number of rainy days for each airport/year/mo combination
# The count is taken inside summarize() instead of filtering to PRCP > 0
# first: filtering removes every month that had no rain at all, so those
# months never contribute a 0 and the monthly averages built from this table
# come out too high.
# NOTE(review): a day with a missing PRCP value is counted as a dry day here.
allwthr4 <- allwthr2 %>%
  group_by(airport, yr, mo) %>%
  summarize(nrain = sum(PRCP > 0, na.rm = TRUE)) %>%
  ungroup()
# Get the avg number of rainy days for each airport/mo combination
# Averages the per-year counts in allwthr4 across years, separately for each
# airport and calendar month.
allwthr6 <- allwthr4 %>%
  group_by(airport, mo) %>%
  summarize(
    mnrain = mean(nrain)
  ) %>%
  ungroup()
# Get avg amount of rain for the days with rain for each airport/mo combination
# Restrict to rainy days first, then average the precipitation itself.
# The earlier mean(PRCP > 0) averaged a TRUE/FALSE vector, i.e. it returned the
# share of days with rain rather than an amount of rain.
# filter() also drops rows whose PRCP is missing, so no na.rm is needed.
allwthr5 <- allwthr2 %>%
  filter(PRCP > 0) %>%
  group_by(airport, mo) %>%
  summarize(mrain = mean(PRCP)) %>%
  ungroup()
Show how you approached the questions you posed at the beginning. Describe how much you were able to accomplish. There should be both graphical and numerical results produced by R code included in chunks. Explain what you did and what it means.
I looked at temperature and precipitation data from month to month for each location, primarily using a faceted view of the data. The graphical results provide an easy way to see trends in the data for each location as well as how the two locations compare to one another. The numerical results provide a way to see specific data values, like: what is the average maximum temperature in Houston in August or what is the average rainfall per day in Olympia in December?
Seeing the comparison between the two locations is helpful in terms of getting a general understanding of the climate of each individual area. Deciding which location has the “better” climate requires a subjective evaluation. If some baseline of “ideal” values could be established, then presumably I would be able to do some specific comparison between the locations and the ideal.
To get a true measure of climate, I think it would be necessary to consider temperature and precipitation together instead of evaluating them separately. For example, if the temperature is high at a time when the precipitation is high, this would seem to indicate uncomfortable humidity. On the other hand, if precipitation is low for an extended period of time, particularly if the temperatures are high, this could be an indication that the area is likely to be dry and dusty.
# Monthly mean of the daily maximum temperature by year: one panel per
# calendar month, points coloured by airport.
ggplot(data = allwthr3, mapping = aes(x = yr, y = mmax, color = airport)) +
  geom_point() +
  facet_wrap(~mo) +
  scale_x_continuous(name = "Year", breaks = c(1970, 1990, 2010)) +
  scale_y_continuous(name = "Temperature (°F)") +
  ggtitle("Average Monthly Maximum Temperature (P5)")
# Split the monthly temperature averages by airport code so each location can
# be tabulated separately, then print a label for the Olympia table.
allwthr3_OLM <- filter(allwthr3, airport == "OLM")
allwthr3_IAH <- filter(allwthr3, airport == "IAH")
print("OLYMPIA AIRPORT WA US")
## [1] "OLYMPIA AIRPORT WA US"
# For each calendar month, summarize the monthly average maximum temperature
# across years (Olympia subset).
with(allwthr3_OLM, tapply(mmax, mo, summary))
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39.84 43.58 44.66 45.48 47.65 50.26
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 42.18 47.57 48.96 49.42 51.34 55.43
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.48 51.27 53.90 53.94 56.33 60.39
##
## $`4`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 52.73 57.36 58.88 59.02 60.50 66.30
##
## $`5`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60.42 63.73 65.53 65.68 67.74 72.65
##
## $`6`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 64.93 68.46 71.03 70.95 73.48 79.90
##
## $`7`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71.06 75.37 77.23 77.21 78.94 84.35
##
## $`8`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 70.71 75.68 78.13 77.68 79.47 83.94
##
## $`9`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65.77 69.47 72.03 71.82 74.20 79.63
##
## $`10`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 56.77 58.69 60.42 60.54 61.77 67.71
##
## $`11`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.13 49.00 50.17 50.35 51.47 55.00
##
## $`12`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.90 42.69 45.10 44.50 46.10 49.39
# Print a label identifying the Houston station ahead of its summary table
print("HOUSTON INTERCONTINENTAL AIRPORT, TX US")
## [1] "HOUSTON INTERCONTINENTAL AIRPORT, TX US"
# For each calendar month, summarize the monthly average maximum temperature
# across years (Houston subset).
with(allwthr3_IAH, tapply(mmax, mo, summary))
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.19 59.19 63.10 62.48 66.12 70.19
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 55.18 63.58 66.59 66.39 69.02 76.75
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 67.23 71.73 72.58 73.09 75.01 78.35
##
## $`4`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.27 78.31 79.13 79.46 80.65 85.47
##
## $`5`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81.19 84.51 85.73 85.79 86.98 90.29
##
## $`6`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 87.60 90.01 90.92 91.36 92.41 97.40
##
## $`7`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 89.71 91.83 93.71 93.71 95.65 99.42
##
## $`8`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 89.42 92.56 94.00 94.05 95.52 102.03
##
## $`9`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 82.93 87.18 89.47 89.37 91.22 95.50
##
## $`10`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 72.16 80.71 82.13 81.93 83.27 86.61
##
## $`11`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 62.80 69.68 72.57 72.04 74.83 79.10
##
## $`12`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 55.32 63.94 65.23 64.79 66.52 71.84
# Monthly mean of the daily minimum temperature by year: one panel per
# calendar month, points coloured by airport.
ggplot(data = allwthr3, mapping = aes(x = yr, y = mmin, color = airport)) +
  geom_point() +
  facet_wrap(~mo) +
  scale_x_continuous(name = "Year", breaks = c(1970, 1990, 2010)) +
  scale_y_continuous(name = "Temperature (°F)") +
  ggtitle("Average Monthly Minimum Temperature (P6)")
# Average number of rainy days per calendar month, one line per airport.
# The x-axis breaks are the twelve month numbers; passing allwthr6$mo supplied
# every month once per airport, i.e. a vector of duplicated break values.
allwthr6 %>%
  ggplot(aes(x = mo, y = mnrain, color = airport)) +
  geom_line(size = 1.5) +
  scale_x_continuous("Month", breaks = 1:12) +
  scale_y_continuous("Number of Rainy Days") +
  ggtitle("Average Number of Days with Rain (P11)")
# Spread of the monthly averages for each airport
tapply(allwthr6$mnrain, allwthr6$airport, summary)
## $IAH
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.000 8.214 8.606 8.622 9.174 9.979
##
## $OLM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.778 9.089 14.761 13.740 18.922 20.362
# Bar per airport of allwthr5$mrain, one panel per calendar month.
# NOTE(review): the axis label and title assume mrain is an average rainfall
# depth in inches on rainy days -- confirm that the summary producing allwthr5
# really computes that quantity.
ggplot(data = allwthr5, mapping = aes(x = airport, y = mrain, fill = airport)) +
  geom_col() +
  facet_wrap(~mo) +
  scale_y_continuous(name = "Inches of Rain") +
  ggtitle("Average Amount of Rain on Days with Rain (P8)")
# Spread of the monthly values for each airport
with(allwthr5, tapply(mrain, airport, summary))
## $IAH
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2333 0.2692 0.2901 0.2836 0.2981 0.3219
##
## $OLM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1466 0.2884 0.4854 0.4497 0.6104 0.6589