Data Visualization Capstone: Data Review

Data Import

The data are read with the read.csv function, which flags the missing-value markers as NA; na.omit is then applied so that the remaining data set has complete entries for all 11 variables.

# Fetch the raw CSV from the City of Chicago data portal into the working
# directory as "bwq.csv" (uses the system curl binary)
URL <- "https://data.cityofchicago.org/api/views/qmqz-2xku/rows.csv?accessType=DOWNLOAD"
download.file(url = URL, destfile = "bwq.csv", method = "curl")

# Read the downloaded CSV. Blank fields, "NA" and "0" are all read as missing
# (NA) in every column, so zero readings are dropped later by na.omit().
dat <- read.csv(
  file = "bwq.csv",
  header = TRUE,
  na.strings = c("", "NA", "0")
)

# Convert the timestamp text (month/day/year hour:minute:second) to date-times.
# NOTE(review): the order "mdy_HMS" has no AM/PM ("p") component; if the raw
# timestamps carry an AM/PM suffix, confirm afternoon hours parse correctly.
dat$Measurement.Timestamp <- parse_date_time(
  x = dat$Measurement.Timestamp,
  orders = "mdy_HMS"
)

# Shorten the six listed beach names by removing the " Beach" suffix
beach <- recode(
  dat$Beach.Name,
  "63rd Street Beach" = "63rd Street",
  "Calumet Beach" = "Calumet",
  "Montrose Beach" = "Montrose",
  "Ohio Street Beach" = "Ohio Street",
  "Osterman Beach" = "Osterman",
  "Rainbow Beach" = "Rainbow"
)

# Write the shortened names back into the data frame
dat$Beach.Name <- beach

# Drop the transducer depth column, keep rows with a positive wave height,
# add year and month (stored as character) derived from the timestamp,
# and sort the rows chronologically
dat <- dat %>%
  select(-Transducer.Depth) %>%
  filter(Wave.Height > 0) %>%
  mutate(
    year = as.character(year(Measurement.Timestamp)),
    month = as.character(month(Measurement.Timestamp))
  ) %>%
  arrange(Measurement.Timestamp)

# Drop every row that still contains an NA in any column, so that `tidy`
# holds only complete observations
tidy <- na.omit(dat)

Graph 1

This graph uses ggplotly to show the distribution (median and quartile range) of Ohio Street Beach’s water temperature for each year from 2014 to 2021. The x axis shows the year, while the y axis shows the water temperature in degrees Celsius.

The following code chunk illustrates how I generated my data set for graph illustration.

# Build the Graph 1 data set: Ohio Street readings only, excluding 2013.
# `year` is stored as character, so it is compared against the string "2013".
data1 <- tidy %>%
  filter(Beach.Name == "Ohio Street") %>%
  filter(year != "2013")

# Examine Data Structure
as_tibble(data1)

## # A tibble: 14,945 x 11
##    Beach.Name  Measurement.Timestamp Water.Temperature Turbidity Wave.Height
##    <chr>       <dttm>                            <dbl>     <dbl>       <dbl>
##  1 Ohio Street 2014-06-05 12:00:00                16.9      1.6        0.159
##  2 Ohio Street 2014-06-06 02:00:00                18.8      0.7        0.135
##  3 Ohio Street 2014-06-06 05:00:00                19.8      0.78       0.162
##  4 Ohio Street 2014-06-06 06:00:00                19.7      0.77       0.13 
##  5 Ohio Street 2014-06-06 07:00:00                19.2      0.8        0.137
##  6 Ohio Street 2014-06-06 08:00:00                18.7      1.03       0.147
##  7 Ohio Street 2014-06-06 09:00:00                18.1      0.83       0.144
##  8 Ohio Street 2014-06-06 10:00:00                17.3      1          0.133
##  9 Ohio Street 2014-06-06 11:00:00                17.1      1.12       0.119
## 10 Ohio Street 2014-06-07 01:00:00                16.6      0.97       0.104
## # ... with 14,935 more rows, and 6 more variables: Wave.Period <int>,
## #   Battery.Life <dbl>, Measurement.Timestamp.Label <chr>,
## #   Measurement.ID <chr>, year <chr>, month <chr>

# Box plot of water temperature with one box (and fill colour) per year
graph1 <- ggplot(data1, aes(x = year, y = Water.Temperature, fill = year)) +
  geom_boxplot() +
  labs(
    x = "Year",
    y = "Temperature (Celsius)",
    title = "Water Temperature for Ohio Street Beach by Years"
  ) +
  theme(
    title = element_text(size = 10, face = "bold"),
    axis.title = element_text(size = 8, face = "bold"),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key.size = unit(0.5, "cm"),
    legend.key.width = unit(0.5, "cm"),
    legend.key.height = unit(0.5, "cm")
  )

# Render the ggplot object through plotly
ggplotly(graph1)

Graph 2

This graph uses ggplotly to show the density distribution of wave height at each beach in 2015. The x axis shows the individual beaches, while the y axis shows the wave height in meters.

The following code chunk illustrates how I generated my data set for graph illustration.

# Build the Graph 2 data set: keep only the 2015 readings
# (`year` is stored as character, hence the string comparison)
data2 <- filter(tidy, year == "2015")

# Preview the subset as a tibble
as_tibble(data2)

## # A tibble: 13,716 x 11
##    Beach.Name  Measurement.Timestamp Water.Temperature Turbidity Wave.Height
##    <chr>       <dttm>                            <dbl>     <dbl>       <dbl>
##  1 63rd Street 2015-05-19 01:00:00                11.3      1.22       0.194
##  2 Calumet     2015-05-19 01:00:00                12.9      1.3        0.147
##  3 Montrose    2015-05-19 01:00:00                12        0.5        0.25 
##  4 Osterman    2015-05-19 01:00:00                12.6      0.94       0.232
##  5 63rd Street 2015-05-19 02:00:00                11.3      1.27       0.237
##  6 Calumet     2015-05-19 02:00:00                12.9      1.38       0.165
##  7 Montrose    2015-05-19 02:00:00                12        0.54       0.291
##  8 Osterman    2015-05-19 02:00:00                12.6      1.17       0.277
##  9 63rd Street 2015-05-19 03:00:00                11.3      1.35       0.253
## 10 Calumet     2015-05-19 03:00:00                12.9      1.24       0.221
## # ... with 13,706 more rows, and 6 more variables: Wave.Period <int>,
## #   Battery.Life <dbl>, Measurement.Timestamp.Label <chr>,
## #   Measurement.ID <chr>, year <chr>, month <chr>

# Violin plot of wave height with one violin (and fill colour) per beach
graph2 <- data2 %>%
  ggplot(aes(x = Beach.Name, y = Wave.Height, fill = Beach.Name)) +
  geom_violin() +
  theme(
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8),
    axis.title = element_text(size = 8, face = "bold"),
    title = element_text(size = 10, face = "bold"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key.width = unit(0.5, "cm"),
    legend.key.height = unit(0.5, "cm"),
    legend.key.size = unit(0.5, "cm")
  ) +
  labs(
    x = "Chicago Beach Location",
    y = "Wave Height (m)",
    title = "Chicago's 2015 Wave Height Distribution Comparison"
  )

# Render the ggplot object through plotly
ggplotly(graph2)

Graph 3

This graph uses ggplotly to show the average turbidity level at each beach in 2015. The x axis shows the turbidity level in Nephelometric Turbidity Units (NTU), while the y axis shows the individual beaches.

The following code chunk illustrates how I generated my data set for graph illustration.

# Build the Graph 3 data set: keep only the 2015 readings
# (`year` is stored as character, hence the string comparison)
data3 <- filter(tidy, year == "2015")

# Preview the subset as a tibble
as_tibble(data3)

## # A tibble: 13,716 x 11
##    Beach.Name  Measurement.Timestamp Water.Temperature Turbidity Wave.Height
##    <chr>       <dttm>                            <dbl>     <dbl>       <dbl>
##  1 63rd Street 2015-05-19 01:00:00                11.3      1.22       0.194
##  2 Calumet     2015-05-19 01:00:00                12.9      1.3        0.147
##  3 Montrose    2015-05-19 01:00:00                12        0.5        0.25 
##  4 Osterman    2015-05-19 01:00:00                12.6      0.94       0.232
##  5 63rd Street 2015-05-19 02:00:00                11.3      1.27       0.237
##  6 Calumet     2015-05-19 02:00:00                12.9      1.38       0.165
##  7 Montrose    2015-05-19 02:00:00                12        0.54       0.291
##  8 Osterman    2015-05-19 02:00:00                12.6      1.17       0.277
##  9 63rd Street 2015-05-19 03:00:00                11.3      1.35       0.253
## 10 Calumet     2015-05-19 03:00:00                12.9      1.24       0.221
## # ... with 13,706 more rows, and 6 more variables: Wave.Period <int>,
## #   Battery.Life <dbl>, Measurement.Timestamp.Label <chr>,
## #   Measurement.ID <chr>, year <chr>, month <chr>

# Lollipop chart: mean turbidity per beach, drawn as a red point with a blue
# segment running from zero to the mean
graph3 <- data3 %>%
  select(Turbidity, Beach.Name) %>%
  group_by(Beach.Name) %>%
  summarize(avg_turbidity = mean(Turbidity)) %>%
  ggplot(aes(x = avg_turbidity, y = Beach.Name)) +
  geom_point(color = "red", size = 2) +
  geom_segment(
    aes(x = 0, xend = avg_turbidity, y = Beach.Name, yend = Beach.Name),
    color = "blue"
  ) +
  labs(
    y = "Chicago Beach Location",
    x = "Average Turbidity (NTU)",
    title = "Average Turbidity Level Calculated in 2015"
  ) +
  theme(
    title = element_text(size = 10, face = "bold"),
    axis.title = element_text(size = 8, face = "bold"),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )

# Render the ggplot object through plotly
ggplotly(graph3)

Data Visualization Capstone: Data Review

Hsin Chih Chen

2022/4/2

Data Import

Graph 1

Graph 2

Graph 3