knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(stringr)
library(ggplot2)

# Raw GitHub URL of the monthly-precipitation CSV that this analysis reads.
urlfile <- "https://raw.githubusercontent.com/Nhodgkinson/DATA-607-P2/main/cdec-monthly-precipitation-san-joaquin-1913-2014.csv"

# read_csv() downloads http(s) paths itself, so no explicit url() connection
# is needed.
calidata <- read_csv(urlfile)
## Rows: 102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Region
## dbl (14): WY, Oct, Nov, Dec, Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Total
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Tidying

I initially began gathering the data on the assumption that I would want a month-over-month view. After seeing how much data there was, I felt that a year-over-year view, together with the average rainfall by month, would be more insightful.

# Print the raw table as loaded, before any reshaping.
calidata
## # A tibble: 102 × 15
##    Region         WY   Oct   Nov   Dec   Jan   Feb   Mar   Apr   May   Jun   Jul
##    <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 San Joaquin  1913  1.01  2.8   1.26  5.4   1.74  2.96  2.11  2.09  2.77  1.12
##  2 San Joaquin  1914  0.08  4.55  7.91 21.3   5.17  0.94  5.18  1.05  1.07  0.02
##  3 San Joaquin  1915  1.71  0.61  3.92  7.68  9.2   2.73  3.53  6.94  0     0   
##  4 San Joaquin  1916  0     1.47  5.87 20     5.87  6.31  0.84  0.67  0     0   
##  5 San Joaquin  1917  6.77  1.64  8.19  1.69 12.5   2.78  1.46  1.71  0     0.24
##  6 San Joaquin  1918  0     1.53  1.79  0.97 11.0  11.7   0.42  1.84  0.15  0.2 
##  7 San Joaquin  1919  1.69  5.39  2.18  1.45 10.3   5.69  0.53  1.12  0     0   
##  8 San Joaquin  1920  0.95  0.78  6.34  1.45  4.29  9.55  4.86  0.15  0.57  0.02
##  9 San Joaquin  1921  5.41  4.36  6.25  9.69  3.33  4.53  0.7   2.25  0.3   0   
## 10 San Joaquin  1922  0.58  0.59 13.1   5.52  8.11  6.05  0.85  2.63  0.42  0.53
## # … with 92 more rows, and 3 more variables: Aug <dbl>, Sep <dbl>, Total <dbl>
# Year-over-year table: one row per year with that year's Total, plus the
# overall mean of Total repeated on every row as `Avg Rain`.
cyeardf <- calidata %>%
  # Pick columns by name rather than by position (1, 2, 15) so the result
  # does not depend on the column order of the CSV; WY is renamed to Year.
  select(Region, Year = WY, Total) %>%
  mutate(`Avg Rain` = mean(Total))

# Month averages: reshape the twelve month columns (Oct..Sep) into long form,
# one row per Region/Year/Month, with the precipitation value in `n`.
cmdf <- calidata %>%
  # pivot_longer() supersedes gather(); columns are chosen by name, not 3:14.
  pivot_longer(cols = Oct:Sep, names_to = "Month", values_to = "n") %>%
  # pivot_longer() emits rows year by year; re-sort into month blocks
  # (Oct, Nov, ..., Sep) so the row order matches what gather() produced.
  # Rows within a month keep their existing (file) order.
  arrange(match(Month, unique(Month))) %>%
  rename(Year = WY) %>%
  group_by(Month) %>%
  # Per-month mean across all years, repeated on every row of that month.
  mutate(`Avg Rain` = mean(n)) %>%
  # Drop the yearly Total by name (previously dropped as positional column 3).
  select(-Total)

# NOTE: cmdf is still grouped by Month at this point, as in the original
# pipeline; call ungroup() before any summary that should ignore Month.

# cmdf <- cmdf[!duplicated(cmdf), ]

cmdf
## # A tibble: 1,224 × 5
## # Groups:   Month [12]
##    Region       Year Month     n `Avg Rain`
##    <chr>       <dbl> <chr> <dbl>      <dbl>
##  1 San Joaquin  1913 Oct    1.01       2.06
##  2 San Joaquin  1914 Oct    0.08       2.06
##  3 San Joaquin  1915 Oct    1.71       2.06
##  4 San Joaquin  1916 Oct    0          2.06
##  5 San Joaquin  1917 Oct    6.77       2.06
##  6 San Joaquin  1918 Oct    0          2.06
##  7 San Joaquin  1919 Oct    1.69       2.06
##  8 San Joaquin  1920 Oct    0.95       2.06
##  9 San Joaquin  1921 Oct    5.41       2.06
## 10 San Joaquin  1922 Oct    0.58       2.06
## # … with 1,214 more rows
# Print the year-over-year table (Region, Year, Total, Avg Rain).
cyeardf
## # A tibble: 102 × 4
##    Region       Year Total `Avg Rain`
##    <chr>       <dbl> <dbl>      <dbl>
##  1 San Joaquin  1913  24.6       38.6
##  2 San Joaquin  1914  48.0       38.6
##  3 San Joaquin  1915  36.4       38.6
##  4 San Joaquin  1916  43         38.6
##  5 San Joaquin  1917  37.1       38.6
##  6 San Joaquin  1918  32.4       38.6
##  7 San Joaquin  1919  30.2       38.6
##  8 San Joaquin  1920  30.7       38.6
##  9 San Joaquin  1921  37.4       38.6
## 10 San Joaquin  1922  38.5       38.6
## # … with 92 more rows

Analysis

I have two tables: one that includes the month and another that only has year-over-year totals. I know the average rainfall by month and the average rainfall by year. I can now create two graphs to see how the monthly precipitation compares to the monthly average, and how the yearly total compares to the yearly average.

# For the month view, look only at January. Over 100 years is a lot of data
# for one visual, so restrict it to the most recent years: 1994 through 2014.
Jandf <- cmdf %>%
  filter(Month == "Jan", Year > 1993)

# `Avg Rain` holds the same January mean on every row; collapse it with
# unique() so a single reference line is drawn instead of one per row.
ggplot(Jandf, aes(Year, n)) +
  geom_point() +
  geom_hline(yintercept = unique(Jandf$`Avg Rain`), color = "red")

# Repeat the steps above for the year-over-year view against the overall
# average, this time for 1984 through 2004.
Ydf <- cyeardf %>%
  filter(Year > 1983, Year < 2005)

# `Avg Rain` is the same overall mean on every row; collapse it with unique()
# so a single reference line is drawn instead of one per row.
ggplot(Ydf, aes(Year, Total)) +
  geom_point() +
  geom_hline(yintercept = unique(Ydf$`Avg Rain`), color = "red")

Observation

The month graph for January 1994 to January 2014 shows 1995 and 1997 having much higher rainfall, at around 20 inches for the month, compared to the average January rainfall of 6.8 inches.

When looking at the year graph, we see that 1995 has very high rainfall at 70 inches for the year, against a yearly average of 38.55 inches. As seen in the month graph, 1995 also had high rainfall in January, at around 20 inches. 1997 is the second-highest year for rainfall between 1984 and 2004, with 65 inches. It also showed very high monthly rainfall.