## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: lubridate
## 
## 
## Attaching package: 'lubridate'
## 
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Reading in the data

file <- read.csv("Module2ExamInputFile.csv")
head(file)
##   X YYYY.MM.DD.DOY.    yield irrigation_demand county_name       crop
## 1 1 1980-08-27(240) 15147.19          421.8288    Okanogan Corn_grain
## 2 2 1981-08-21(233) 15926.39          427.0740    Okanogan Corn_grain
## 3 3 1982-08-16(228) 13489.79          363.3833    Okanogan Corn_grain
## 4 4 1983-08-19(231) 15263.74          382.2921    Okanogan Corn_grain
## 5 5 1984-08-22(235) 13779.91          391.8027    Okanogan Corn_grain
## 6 6 1985-08-04(216) 13920.45          445.9654    Okanogan Corn_grain
##               lat_lon
## 1 48.15625N119.71875W
## 2 48.15625N119.71875W
## 3 48.15625N119.71875W
## 4 48.15625N119.71875W
## 5 48.15625N119.71875W
## 6 48.15625N119.71875W
#since there are two columns having row numbers, one will be removed.
file <- select(file,-X)
head(file)
##   YYYY.MM.DD.DOY.    yield irrigation_demand county_name       crop
## 1 1980-08-27(240) 15147.19          421.8288    Okanogan Corn_grain
## 2 1981-08-21(233) 15926.39          427.0740    Okanogan Corn_grain
## 3 1982-08-16(228) 13489.79          363.3833    Okanogan Corn_grain
## 4 1983-08-19(231) 15263.74          382.2921    Okanogan Corn_grain
## 5 1984-08-22(235) 13779.91          391.8027    Okanogan Corn_grain
## 6 1985-08-04(216) 13920.45          445.9654    Okanogan Corn_grain
##               lat_lon
## 1 48.15625N119.71875W
## 2 48.15625N119.71875W
## 3 48.15625N119.71875W
## 4 48.15625N119.71875W
## 5 48.15625N119.71875W
## 6 48.15625N119.71875W

1) Adding a year column using lubridate

file_with_year <- mutate(file, year = year(str_sub(YYYY.MM.DD.DOY. , 1, 10)))
head(file_with_year)
##   YYYY.MM.DD.DOY.    yield irrigation_demand county_name       crop
## 1 1980-08-27(240) 15147.19          421.8288    Okanogan Corn_grain
## 2 1981-08-21(233) 15926.39          427.0740    Okanogan Corn_grain
## 3 1982-08-16(228) 13489.79          363.3833    Okanogan Corn_grain
## 4 1983-08-19(231) 15263.74          382.2921    Okanogan Corn_grain
## 5 1984-08-22(235) 13779.91          391.8027    Okanogan Corn_grain
## 6 1985-08-04(216) 13920.45          445.9654    Okanogan Corn_grain
##               lat_lon year
## 1 48.15625N119.71875W 1980
## 2 48.15625N119.71875W 1981
## 3 48.15625N119.71875W 1982
## 4 48.15625N119.71875W 1983
## 5 48.15625N119.71875W 1984
## 6 48.15625N119.71875W 1985

2) a data set with annual irrigation >90th percentile

#calculating the 90th percentile based on joint crop and county name

perc_90th <- file_with_year %>% 
  group_by(county_name, crop) %>% 
  summarise(percentile_90th = quantile(irrigation_demand, probs = 0.9), count = sum(county_name == "Okanogan", county_name == "WallaWalla", county_name == "Yakima"))
## `summarise()` has grouped output by 'county_name'. You can override using the
## `.groups` argument.
perc_90th
## # A tibble: 6 × 4
## # Groups:   county_name [3]
##   county_name crop         percentile_90th count
##   <chr>       <chr>                  <dbl> <int>
## 1 Okanogan    Corn_grain              424.    80
## 2 Okanogan    Winter_Wheat            450.    78
## 3 WallaWalla  Corn_grain              437.    80
## 4 WallaWalla  Winter_Wheat            460.    78
## 5 Yakima      Corn_grain              600     80
## 6 Yakima      Winter_Wheat            600     78
file_with_p90th <- cbind(file_with_year, p90th = rep(perc_90th$percentile_90th, perc_90th$count))
file_filtered <- filter(file_with_p90th, irrigation_demand < p90th)
head(file_filtered)
##   YYYY.MM.DD.DOY.    yield irrigation_demand county_name       crop
## 1 1980-08-27(240) 15147.19          421.8288    Okanogan Corn_grain
## 2 1982-08-16(228) 13489.79          363.3833    Okanogan Corn_grain
## 3 1983-08-19(231) 15263.74          382.2921    Okanogan Corn_grain
## 4 1984-08-22(235) 13779.91          391.8027    Okanogan Corn_grain
## 5 1986-08-10(222) 13722.79          405.9091    Okanogan Corn_grain
## 6 1987-08-13(225) 14043.32          393.0089    Okanogan Corn_grain
##               lat_lon year    p90th
## 1 48.15625N119.71875W 1980 423.5128
## 2 48.15625N119.71875W 1982 423.5128
## 3 48.15625N119.71875W 1983 423.5128
## 4 48.15625N119.71875W 1984 423.5128
## 5 48.15625N119.71875W 1986 423.5128
## 6 48.15625N119.71875W 1987 423.5128

3) Creating annual irrigation and yield csv files

irrigation <- select(file_filtered, irrigation_demand, county_name, crop, lat_lon)
write.csv(irrigation, "irrigation.csv", row.names = F)
yield <- select(file_filtered, yield, county_name, crop, lat_lon)
write.csv(yield, "yield.csv", row.names = F)

4) Joining

#reading the two csv files
irrigationr <- cbind(Num = seq(1:dim(irrigation)[1]), read.csv("irrigation.csv"))
yieldr <- cbind(Num = seq(1:dim(irrigation)[1]), read.csv("yield.csv"))

# The Num column is created since neither the county, crop nor the lat_lon are unique rows resulting in erroneous joining.  

full.join <- full_join(irrigationr, yieldr, by = c("Num", "county_name", "crop", "lat_lon"))  
dim(full.join)
## [1] 416   6
left.join <- left_join(irrigationr, yieldr, by = c("Num", "county_name", "crop", "lat_lon"))
dim(left.join)
## [1] 416   6
right.join <- right_join(irrigationr, yieldr, by = c("Num", "county_name", "crop", "lat_lon"))
dim(right.join)
## [1] 416   6
# The rest of the joining parameters ("county_name", "crop", "lat_lon") are added to avoid repetition of columns (.x, .y). Only using the joining parameter "Num" will give us a joined data frame. 

Full, right and left joins result in the same number of rows. Because both the irrigation and yield files were created from the same data set that was created in step 2, thus they have exactly the same rows of county, crop and lat_long. All rows are common. So in this case whether we use left, right or full, the result is the same.

5) Missing data

dim(full.join)
## [1] 416   6
dim(file_with_year)
## [1] 474   7
dim(file_with_year)[1] - dim(full.join)[1]
## [1] 58

As can be seen they are of different dimensions. This is expected because the full.join data frame was created out of a filtered file. So the full.join has some years missing. But they are implicit. There are 58 rows missing.

full.join <- cbind(full.join, year = file_filtered$year)
missing <- left_join(file_with_p90th, full.join, by = c("crop", "year", "county_name", "lat_lon"))
head(missing)
##   YYYY.MM.DD.DOY.  yield.x irrigation_demand.x county_name       crop
## 1 1980-08-27(240) 15147.19            421.8288    Okanogan Corn_grain
## 2 1981-08-21(233) 15926.39            427.0740    Okanogan Corn_grain
## 3 1982-08-16(228) 13489.79            363.3833    Okanogan Corn_grain
## 4 1983-08-19(231) 15263.74            382.2921    Okanogan Corn_grain
## 5 1984-08-22(235) 13779.91            391.8027    Okanogan Corn_grain
## 6 1985-08-04(216) 13920.45            445.9654    Okanogan Corn_grain
##               lat_lon year    p90th Num irrigation_demand.y  yield.y
## 1 48.15625N119.71875W 1980 423.5128   1            421.8288 15147.19
## 2 48.15625N119.71875W 1981 423.5128  NA                  NA       NA
## 3 48.15625N119.71875W 1982 423.5128   2            363.3833 13489.79
## 4 48.15625N119.71875W 1983 423.5128   3            382.2921 15263.74
## 5 48.15625N119.71875W 1984 423.5128   4            391.8027 13779.91
## 6 48.15625N119.71875W 1985 423.5128  NA                  NA       NA
avg_irr <- missing %>% 
  group_by(county_name,crop) %>% 
  summarise(avg_irrigation = mean(irrigation_demand.x), count = sum(county_name == "Okanogan", county_name == "WallaWalla", county_name == "Yakima"))
## `summarise()` has grouped output by 'county_name'. You can override using the
## `.groups` argument.
temp <- cbind(missing, avrg = rep(avg_irr$avg_irrigation, avg_irr$count))

temp$irrigation_demand.y <- ifelse (is.na(temp$irrigation_demand.y), temp$avrg, temp$irrigation_demand.y)
  
complete <- select (temp, year, county_name, crop, lat_lon, irrigation = irrigation_demand.y, yield = yield.x)
head(complete)
##   year county_name       crop             lat_lon irrigation    yield
## 1 1980    Okanogan Corn_grain 48.15625N119.71875W   421.8288 15147.19
## 2 1981    Okanogan Corn_grain 48.15625N119.71875W   381.5832 15926.39
## 3 1982    Okanogan Corn_grain 48.15625N119.71875W   363.3833 13489.79
## 4 1983    Okanogan Corn_grain 48.15625N119.71875W   382.2921 15263.74
## 5 1984    Okanogan Corn_grain 48.15625N119.71875W   391.8027 13779.91
## 6 1985    Okanogan Corn_grain 48.15625N119.71875W   381.5832 13920.45

6) Graphing the NA rows

na_rows <- missing[rowSums(is.na(missing)) > 0, ] 
temp1 <- left_join(na_rows, file_with_year, by = c("year", "crop", "county_name", "lat_lon"))
temp2 <- left_join(na_rows, complete, by = c("year", "crop", "county_name", "lat_lon"))
graph <- cbind(select(na_rows, year, county_name, crop, lat_lon), irrig_orig = temp1$irrigation_demand, irrig_fill = temp2$irrigation)
head(graph)
##    year county_name       crop             lat_lon irrig_orig irrig_fill
## 2  1981    Okanogan Corn_grain 48.15625N119.71875W   427.0740   381.5832
## 6  1985    Okanogan Corn_grain 48.15625N119.71875W   445.9654   381.5832
## 9  1988    Okanogan Corn_grain 48.15625N119.71875W   424.1820   381.5832
## 29 2008    Okanogan Corn_grain 48.15625N119.71875W   434.8880   381.5832
## 32 2011    Okanogan Corn_grain 48.15625N119.71875W   426.7880   381.5832
## 39 2018    Okanogan Corn_grain 48.15625N119.71875W   448.9120   381.5832

Plotting

ggplot(data = graph) +
  geom_point(mapping = aes(x = year, y = irrig_orig, color = county_name, shape = crop)) +
  geom_line(mapping = aes(x = year, y = irrig_fill, color = county_name))

The filled in values always lie below the original irrigation values. This is to be expected since the irrigation values here are the ones above the 90th percentile in their perspective county and crop, while the filled in values are the average in their perspective county and crop.