The data are imported with the read.csv function, which also flags the NA items; na.omit is then applied to ensure the remaining data set has complete entries for all 10 variables.
# ---- Fetch the raw beach water quality export ----
URL <- "https://data.cityofchicago.org/api/views/qmqz-2xku/rows.csv?accessType=DOWNLOAD"
download.file(URL, destfile = "bwq.csv", method = "curl")

# ---- Import ----
# Empty fields, the text "NA" and zero readings are all read in as missing.
# The trailing numeric 0 is coerced to the string "0" by c(), so it simply
# repeats the "0" entry.
dat <- read.csv(
  file = "bwq.csv",
  header = TRUE,
  na.strings = c("", "NA", "0", 0)
)

# Convert the timestamp text into date-times (month/day/year hour:min:sec).
# NOTE(review): the order string has no AM/PM (`p`) component -- if the source
# uses a 12-hour clock, confirm afternoon readings keep their PM hours.
dat$Measurement.Timestamp <- parse_date_time(
  dat$Measurement.Timestamp,
  orders = "mdy_HMS"
)

# ---- Shorten the beach names (drop the trailing " Beach") ----
beach <- recode(
  dat$Beach.Name,
  "63rd Street Beach" = "63rd Street",
  "Calumet Beach" = "Calumet",
  "Montrose Beach" = "Montrose",
  "Ohio Street Beach" = "Ohio Street",
  "Osterman Beach" = "Osterman",
  "Rainbow Beach" = "Rainbow"
)
dat$Beach.Name <- beach

# ---- Derive year / month labels and trim the table ----
# Only rows with a positive wave height are kept, the Transducer.Depth column
# is dropped, and year / month are stored as character so they behave as
# discrete labels. Rows end up in chronological order.
dat <- dat %>%
  filter(Wave.Height > 0) %>%
  select(-Transducer.Depth) %>%
  mutate(
    year = as.character(year(Measurement.Timestamp)),
    month = as.character(month(Measurement.Timestamp))
  ) %>%
  arrange(Measurement.Timestamp)

# ---- Keep complete rows only ----
tidy <- na.omit(dat)
This graph uses ggplotly to show the distribution (median and quartile range) of Ohio Street Beach’s water temperature from 2014 to 2021. The x axis shows the year, while the y axis shows the water temperature in degrees Celsius.
The following code chunk illustrates how I generated my data set for graph illustration.
# Subset for graph 1: Ohio Street readings, leaving out the year 2013
data1 <- filter(tidy, Beach.Name == "Ohio Street", year != "2013")

# Print the subset as a tibble to inspect its columns and types
as_tibble(data1)
## # A tibble: 14,945 x 11
## Beach.Name Measurement.Timestamp Water.Temperature Turbidity Wave.Height
## <chr> <dttm> <dbl> <dbl> <dbl>
## 1 Ohio Street 2014-06-05 12:00:00 16.9 1.6 0.159
## 2 Ohio Street 2014-06-06 02:00:00 18.8 0.7 0.135
## 3 Ohio Street 2014-06-06 05:00:00 19.8 0.78 0.162
## 4 Ohio Street 2014-06-06 06:00:00 19.7 0.77 0.13
## 5 Ohio Street 2014-06-06 07:00:00 19.2 0.8 0.137
## 6 Ohio Street 2014-06-06 08:00:00 18.7 1.03 0.147
## 7 Ohio Street 2014-06-06 09:00:00 18.1 0.83 0.144
## 8 Ohio Street 2014-06-06 10:00:00 17.3 1 0.133
## 9 Ohio Street 2014-06-06 11:00:00 17.1 1.12 0.119
## 10 Ohio Street 2014-06-07 01:00:00 16.6 0.97 0.104
## # ... with 14,935 more rows, and 6 more variables: Wave.Period <int>,
## # Battery.Life <dbl>, Measurement.Timestamp.Label <chr>,
## # Measurement.ID <chr>, year <chr>, month <chr>
# Graph 1: one box per year of Ohio Street water temperature.
# The static ggplot is stored in graph1 first, then handed to ggplotly()
# to render the interactive version.
graph1 <- ggplot(
  data1,
  aes(x = year, y = Water.Temperature, fill = year)
) +
  geom_boxplot() +
  labs(
    x = "Year",
    y = "Temperature (Celsius)",
    title = "Water Temperature for Ohio Street Beach by Years"
  ) +
  theme(
    title = element_text(size = 10, face = "bold"),
    axis.title = element_text(size = 8, face = "bold"),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key.size = unit(0.5, "cm"),
    legend.key.width = unit(0.5, "cm"),
    legend.key.height = unit(0.5, "cm")
  )

ggplotly(graph1)
This graph uses ggplotly to show the density distribution of wave height at each beach in 2015. The x axis shows the individual beaches, while the y axis shows the wave height in meters.
The following code chunk illustrates how I generated my data set for graph illustration.
# Subset for graph 2: every beach, readings labelled with year 2015 only
data2 <- filter(tidy, year == "2015")

# Print the subset as a tibble to inspect its columns and types
as_tibble(data2)
## # A tibble: 13,716 x 11
## Beach.Name Measurement.Timestamp Water.Temperature Turbidity Wave.Height
## <chr> <dttm> <dbl> <dbl> <dbl>
## 1 63rd Street 2015-05-19 01:00:00 11.3 1.22 0.194
## 2 Calumet 2015-05-19 01:00:00 12.9 1.3 0.147
## 3 Montrose 2015-05-19 01:00:00 12 0.5 0.25
## 4 Osterman 2015-05-19 01:00:00 12.6 0.94 0.232
## 5 63rd Street 2015-05-19 02:00:00 11.3 1.27 0.237
## 6 Calumet 2015-05-19 02:00:00 12.9 1.38 0.165
## 7 Montrose 2015-05-19 02:00:00 12 0.54 0.291
## 8 Osterman 2015-05-19 02:00:00 12.6 1.17 0.277
## 9 63rd Street 2015-05-19 03:00:00 11.3 1.35 0.253
## 10 Calumet 2015-05-19 03:00:00 12.9 1.24 0.221
## # ... with 13,706 more rows, and 6 more variables: Wave.Period <int>,
## # Battery.Life <dbl>, Measurement.Timestamp.Label <chr>,
## # Measurement.ID <chr>, year <chr>, month <chr>
# Graph 2: violin plot of 2015 wave height, one violin per beach.
# The ggplot object is built and stored in graph2 on its own statement
# (rather than assigned inside the ggplotly() call), then converted to an
# interactive plotly widget.
graph2 <- data2 %>%
  ggplot(aes(x = Beach.Name, y = Wave.Height, fill = Beach.Name)) +
  geom_violin() +
  theme(
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8),
    axis.title = element_text(size = 8, face = "bold"),
    title = element_text(size = 10, face = "bold"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key.width = unit(0.5, "cm"),
    legend.key.height = unit(0.5, "cm"),
    legend.key.size = unit(0.5, "cm")
  ) +
  labs(
    x = "Chicago Beach Location",
    y = "Wave Height (m)",
    # Title spelling corrected ("Coparison" -> "Comparison")
    title = "Chicago's 2015 Wave Height Distribution Comparison"
  )

ggplotly(graph2)
This graph uses ggplotly to show the average turbidity level at each beach in 2015. The x axis shows the turbidity level in Nephelometric Turbidity Units (NTU), while the y axis shows the individual beaches.
The following code chunk illustrates how I generated my data set for graph illustration.
# Subset for graph 3: every beach, readings labelled with year 2015 only
data3 <- filter(tidy, year == "2015")

# Print the subset as a tibble to inspect its columns and types
as_tibble(data3)
## # A tibble: 13,716 x 11
## Beach.Name Measurement.Timestamp Water.Temperature Turbidity Wave.Height
## <chr> <dttm> <dbl> <dbl> <dbl>
## 1 63rd Street 2015-05-19 01:00:00 11.3 1.22 0.194
## 2 Calumet 2015-05-19 01:00:00 12.9 1.3 0.147
## 3 Montrose 2015-05-19 01:00:00 12 0.5 0.25
## 4 Osterman 2015-05-19 01:00:00 12.6 0.94 0.232
## 5 63rd Street 2015-05-19 02:00:00 11.3 1.27 0.237
## 6 Calumet 2015-05-19 02:00:00 12.9 1.38 0.165
## 7 Montrose 2015-05-19 02:00:00 12 0.54 0.291
## 8 Osterman 2015-05-19 02:00:00 12.6 1.17 0.277
## 9 63rd Street 2015-05-19 03:00:00 11.3 1.35 0.253
## 10 Calumet 2015-05-19 03:00:00 12.9 1.24 0.221
## # ... with 13,706 more rows, and 6 more variables: Wave.Period <int>,
## # Battery.Life <dbl>, Measurement.Timestamp.Label <chr>,
## # Measurement.ID <chr>, year <chr>, month <chr>
# Graph 3: lollipop chart of the mean 2015 turbidity per beach.
# Each beach gets a blue stem running from 0 to its mean and a red dot at
# the mean. The ggplot is stored in graph3, then rendered through ggplotly().
graph3 <- data3 %>%
  select(Turbidity, Beach.Name) %>%
  group_by(Beach.Name) %>%
  summarise(avg_turbidity = mean(Turbidity)) %>%
  ggplot(aes(x = avg_turbidity, y = Beach.Name)) +
  geom_point(color = "red", size = 2) +
  geom_segment(
    aes(x = 0, xend = avg_turbidity, y = Beach.Name, yend = Beach.Name),
    color = "blue"
  ) +
  labs(
    y = "Chicago Beach Location",
    x = "Average Turbidity (NTU)",
    title = "Average Turbidity Level Calculated in 2015"
  ) +
  theme(
    title = element_text(size = 10, face = "bold"),
    axis.title = element_text(size = 8, face = "bold"),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )

ggplotly(graph3)