## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: lubridate
##
##
## Attaching package: 'lubridate'
##
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Reading in the data
file <- read.csv("Module2ExamInputFile.csv")
head(file)
## X YYYY.MM.DD.DOY. yield irrigation_demand county_name crop
## 1 1 1980-08-27(240) 15147.19 421.8288 Okanogan Corn_grain
## 2 2 1981-08-21(233) 15926.39 427.0740 Okanogan Corn_grain
## 3 3 1982-08-16(228) 13489.79 363.3833 Okanogan Corn_grain
## 4 4 1983-08-19(231) 15263.74 382.2921 Okanogan Corn_grain
## 5 5 1984-08-22(235) 13779.91 391.8027 Okanogan Corn_grain
## 6 6 1985-08-04(216) 13920.45 445.9654 Okanogan Corn_grain
## lat_lon
## 1 48.15625N119.71875W
## 2 48.15625N119.71875W
## 3 48.15625N119.71875W
## 4 48.15625N119.71875W
## 5 48.15625N119.71875W
## 6 48.15625N119.71875W
#since there are two columns having row numbers, one will be removed.
file <- select(file,-X)
head(file)
## YYYY.MM.DD.DOY. yield irrigation_demand county_name crop
## 1 1980-08-27(240) 15147.19 421.8288 Okanogan Corn_grain
## 2 1981-08-21(233) 15926.39 427.0740 Okanogan Corn_grain
## 3 1982-08-16(228) 13489.79 363.3833 Okanogan Corn_grain
## 4 1983-08-19(231) 15263.74 382.2921 Okanogan Corn_grain
## 5 1984-08-22(235) 13779.91 391.8027 Okanogan Corn_grain
## 6 1985-08-04(216) 13920.45 445.9654 Okanogan Corn_grain
## lat_lon
## 1 48.15625N119.71875W
## 2 48.15625N119.71875W
## 3 48.15625N119.71875W
## 4 48.15625N119.71875W
## 5 48.15625N119.71875W
## 6 48.15625N119.71875W
file_with_year <- mutate(file, year = year(str_sub(YYYY.MM.DD.DOY. , 1, 10)))
head(file_with_year)
## YYYY.MM.DD.DOY. yield irrigation_demand county_name crop
## 1 1980-08-27(240) 15147.19 421.8288 Okanogan Corn_grain
## 2 1981-08-21(233) 15926.39 427.0740 Okanogan Corn_grain
## 3 1982-08-16(228) 13489.79 363.3833 Okanogan Corn_grain
## 4 1983-08-19(231) 15263.74 382.2921 Okanogan Corn_grain
## 5 1984-08-22(235) 13779.91 391.8027 Okanogan Corn_grain
## 6 1985-08-04(216) 13920.45 445.9654 Okanogan Corn_grain
## lat_lon year
## 1 48.15625N119.71875W 1980
## 2 48.15625N119.71875W 1981
## 3 48.15625N119.71875W 1982
## 4 48.15625N119.71875W 1983
## 5 48.15625N119.71875W 1984
## 6 48.15625N119.71875W 1985
#calculating the 90th percentile based on joint crop and county name
perc_90th <- file_with_year %>%
group_by(county_name, crop) %>%
summarise(percentile_90th = quantile(irrigation_demand, probs = 0.9), count = sum(county_name == "Okanogan", county_name == "WallaWalla", county_name == "Yakima"))
## `summarise()` has grouped output by 'county_name'. You can override using the
## `.groups` argument.
perc_90th
## # A tibble: 6 × 4
## # Groups: county_name [3]
## county_name crop percentile_90th count
## <chr> <chr> <dbl> <int>
## 1 Okanogan Corn_grain 424. 80
## 2 Okanogan Winter_Wheat 450. 78
## 3 WallaWalla Corn_grain 437. 80
## 4 WallaWalla Winter_Wheat 460. 78
## 5 Yakima Corn_grain 600 80
## 6 Yakima Winter_Wheat 600 78
file_with_p90th <- cbind(file_with_year, p90th = rep(perc_90th$percentile_90th, perc_90th$count))
file_filtered <- filter(file_with_p90th, irrigation_demand < p90th)
head(file_filtered)
## YYYY.MM.DD.DOY. yield irrigation_demand county_name crop
## 1 1980-08-27(240) 15147.19 421.8288 Okanogan Corn_grain
## 2 1982-08-16(228) 13489.79 363.3833 Okanogan Corn_grain
## 3 1983-08-19(231) 15263.74 382.2921 Okanogan Corn_grain
## 4 1984-08-22(235) 13779.91 391.8027 Okanogan Corn_grain
## 5 1986-08-10(222) 13722.79 405.9091 Okanogan Corn_grain
## 6 1987-08-13(225) 14043.32 393.0089 Okanogan Corn_grain
## lat_lon year p90th
## 1 48.15625N119.71875W 1980 423.5128
## 2 48.15625N119.71875W 1982 423.5128
## 3 48.15625N119.71875W 1983 423.5128
## 4 48.15625N119.71875W 1984 423.5128
## 5 48.15625N119.71875W 1986 423.5128
## 6 48.15625N119.71875W 1987 423.5128
irrigation <- select(file_filtered, irrigation_demand, county_name, crop, lat_lon)
write.csv(irrigation, "irrigation.csv", row.names = F)
yield <- select(file_filtered, yield, county_name, crop, lat_lon)
write.csv(yield, "yield.csv", row.names = F)
#reading the two csv files
irrigationr <- cbind(Num = seq(1:dim(irrigation)[1]), read.csv("irrigation.csv"))
yieldr <- cbind(Num = seq(1:dim(irrigation)[1]), read.csv("yield.csv"))
# The Num column is created since neither the county, crop nor the lat_lon are unique rows resulting in erroneous joining.
full.join <- full_join(irrigationr, yieldr, by = c("Num", "county_name", "crop", "lat_lon"))
dim(full.join)
## [1] 416 6
left.join <- left_join(irrigationr, yieldr, by = c("Num", "county_name", "crop", "lat_lon"))
dim(left.join)
## [1] 416 6
right.join <- right_join(irrigationr, yieldr, by = c("Num", "county_name", "crop", "lat_lon"))
dim(right.join)
## [1] 416 6
# The rest of the joining parameters ("county_name", "crop", "lat_lon") are added to avoid repetition of columns (.x, .y). Only using the joining parameter "Num" will give us a joined data frame.
Full, right and left joins result in the same number of rows. Because both the irrigation and yield files were created from the same data set that was created in step 2, thus they have exactly the same rows of county, crop and lat_long. All rows are common. So in this case whether we use left, right or full, the result is the same.
dim(full.join)
## [1] 416 6
dim(file_with_year)
## [1] 474 7
dim(file_with_year)[1] - dim(full.join)[1]
## [1] 58
As can be seen they are of different dimensions. This is expected because the full.join data frame was created out of a filtered file. So the full.join has some years missing. But they are implicit. There are 58 rows missing.
full.join <- cbind(full.join, year = file_filtered$year)
missing <- left_join(file_with_p90th, full.join, by = c("crop", "year", "county_name", "lat_lon"))
head(missing)
## YYYY.MM.DD.DOY. yield.x irrigation_demand.x county_name crop
## 1 1980-08-27(240) 15147.19 421.8288 Okanogan Corn_grain
## 2 1981-08-21(233) 15926.39 427.0740 Okanogan Corn_grain
## 3 1982-08-16(228) 13489.79 363.3833 Okanogan Corn_grain
## 4 1983-08-19(231) 15263.74 382.2921 Okanogan Corn_grain
## 5 1984-08-22(235) 13779.91 391.8027 Okanogan Corn_grain
## 6 1985-08-04(216) 13920.45 445.9654 Okanogan Corn_grain
## lat_lon year p90th Num irrigation_demand.y yield.y
## 1 48.15625N119.71875W 1980 423.5128 1 421.8288 15147.19
## 2 48.15625N119.71875W 1981 423.5128 NA NA NA
## 3 48.15625N119.71875W 1982 423.5128 2 363.3833 13489.79
## 4 48.15625N119.71875W 1983 423.5128 3 382.2921 15263.74
## 5 48.15625N119.71875W 1984 423.5128 4 391.8027 13779.91
## 6 48.15625N119.71875W 1985 423.5128 NA NA NA
avg_irr <- missing %>%
group_by(county_name,crop) %>%
summarise(avg_irrigation = mean(irrigation_demand.x), count = sum(county_name == "Okanogan", county_name == "WallaWalla", county_name == "Yakima"))
## `summarise()` has grouped output by 'county_name'. You can override using the
## `.groups` argument.
temp <- cbind(missing, avrg = rep(avg_irr$avg_irrigation, avg_irr$count))
temp$irrigation_demand.y <- ifelse (is.na(temp$irrigation_demand.y), temp$avrg, temp$irrigation_demand.y)
complete <- select (temp, year, county_name, crop, lat_lon, irrigation = irrigation_demand.y, yield = yield.x)
head(complete)
## year county_name crop lat_lon irrigation yield
## 1 1980 Okanogan Corn_grain 48.15625N119.71875W 421.8288 15147.19
## 2 1981 Okanogan Corn_grain 48.15625N119.71875W 381.5832 15926.39
## 3 1982 Okanogan Corn_grain 48.15625N119.71875W 363.3833 13489.79
## 4 1983 Okanogan Corn_grain 48.15625N119.71875W 382.2921 15263.74
## 5 1984 Okanogan Corn_grain 48.15625N119.71875W 391.8027 13779.91
## 6 1985 Okanogan Corn_grain 48.15625N119.71875W 381.5832 13920.45
na_rows <- missing[rowSums(is.na(missing)) > 0, ]
temp1 <- left_join(na_rows, file_with_year, by = c("year", "crop", "county_name", "lat_lon"))
temp2 <- left_join(na_rows, complete, by = c("year", "crop", "county_name", "lat_lon"))
graph <- cbind(select(na_rows, year, county_name, crop, lat_lon), irrig_orig = temp1$irrigation_demand, irrig_fill = temp2$irrigation)
head(graph)
## year county_name crop lat_lon irrig_orig irrig_fill
## 2 1981 Okanogan Corn_grain 48.15625N119.71875W 427.0740 381.5832
## 6 1985 Okanogan Corn_grain 48.15625N119.71875W 445.9654 381.5832
## 9 1988 Okanogan Corn_grain 48.15625N119.71875W 424.1820 381.5832
## 29 2008 Okanogan Corn_grain 48.15625N119.71875W 434.8880 381.5832
## 32 2011 Okanogan Corn_grain 48.15625N119.71875W 426.7880 381.5832
## 39 2018 Okanogan Corn_grain 48.15625N119.71875W 448.9120 381.5832
Plotting
ggplot(data = graph) +
geom_point(mapping = aes(x = year, y = irrig_orig, color = county_name, shape = crop)) +
geom_line(mapping = aes(x = year, y = irrig_fill, color = county_name))
The filled in values always lie below the original irrigation values. This is to be expected since the irrigation values here are the ones above the 90th percentile in their perspective county and crop, while the filled in values are the average in their perspective county and crop.