R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Install packages in the console:

#install.packages("tidyverse")
#install.packages("dummies")
# install.packages("knitr")

# Load packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  2.0.1     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)


# Print the current working directory to check where R will look for files.
getwd()
## [1] "/Users/wendywong/Downloads"
# Set the working directory to the folder that holds the CSV file.
# If you paste a Windows path here, change every backslash (\) to a
# forward slash (/).
# NOTE(review): this absolute path only exists on the author's machine --
# replace it with your own folder before running.
setwd("/Users/wendywong/Downloads")


# Read the CSV file.
# The file must already be in the working directory set above. Alternatively,
# use the Import Dataset function once the file is in the working directory.
# The data holds enterococci levels at Sydney beaches from 2013 to 2018,
# together with rainfall and temperature. To learn more about enterococci, see
# https://rladiessydney.org/post/2018/11/05/basicbasics-3/.

library(readr)
rain_temp_beachbugs <- read_csv("rain_temp_beachbugs.csv")
## Parsed with column specification:
## cols(
##   council = col_character(),
##   long = col_double(),
##   lat = col_double(),
##   date = col_date(format = ""),
##   site = col_character(),
##   beachbugs = col_double(),
##   id = col_double(),
##   region = col_character(),
##   rain_mm = col_double(),
##   temp_airport = col_double()
## )
# Bind the imported tibble to the shorter name `data`, which the rest of the
# script uses.
data <- rain_temp_beachbugs

# Preview the data: glimpse() prints one line per column showing its type and
# the first few values.
glimpse(data)
## Observations: 3,690
## Variables: 10
## $ council      <chr> "Randwick Council", "Randwick Council", "Randwick C…
## $ long         <dbl> 151.2675, 151.2675, 151.2675, 151.2675, 151.2675, 1…
## $ lat          <dbl> -33.91449, -33.91449, -33.91449, -33.91449, -33.914…
## $ date         <date> 2013-01-02, 2013-01-06, 2013-01-12, 2013-01-18, 20…
## $ site         <chr> "Clovelly Beach", "Clovelly Beach", "Clovelly Beach…
## $ beachbugs    <dbl> 19, 3, 2, 13, 8, 7, 11, 97, 3, 0, 6, 0, 1, 8, 3, 5,…
## $ id           <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,…
## $ region       <chr> "Sydney City Ocean Beaches", "Sydney City Ocean Bea…
## $ rain_mm      <dbl> 0.0, 0.0, 0.0, 0.0, 0.6, 0.1, 8.0, 7.2, 0.0, 0.0, 0…
## $ temp_airport <dbl> 23.4, 30.3, 31.4, 46.4, 26.6, 25.7, 22.2, 24.8, 29.…
# Show the structure of the data: its classes, its dimensions, and the type of
# each column.
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3690 obs. of  10 variables:
##  $ council     : chr  "Randwick Council" "Randwick Council" "Randwick Council" "Randwick Council" ...
##  $ long        : num  151 151 151 151 151 ...
##  $ lat         : num  -33.9 -33.9 -33.9 -33.9 -33.9 ...
##  $ date        : Date, format: "2013-01-02" "2013-01-06" ...
##  $ site        : chr  "Clovelly Beach" "Clovelly Beach" "Clovelly Beach" "Clovelly Beach" ...
##  $ beachbugs   : num  19 3 2 13 8 7 11 97 3 0 ...
##  $ id          : num  25 25 25 25 25 25 25 25 25 25 ...
##  $ region      : chr  "Sydney City Ocean Beaches" "Sydney City Ocean Beaches" "Sydney City Ocean Beaches" "Sydney City Ocean Beaches" ...
##  $ rain_mm     : num  0 0 0 0 0.6 0.1 8 7.2 0 0 ...
##  $ temp_airport: num  23.4 30.3 31.4 46.4 26.6 25.7 22.2 24.8 29.1 25.8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   council = col_character(),
##   ..   long = col_double(),
##   ..   lat = col_double(),
##   ..   date = col_date(format = ""),
##   ..   site = col_character(),
##   ..   beachbugs = col_double(),
##   ..   id = col_double(),
##   ..   region = col_character(),
##   ..   rain_mm = col_double(),
##   ..   temp_airport = col_double()
##   .. )
# Inspect the first 6 rows (head() returns 6 rows by default).
head(data)
## # A tibble: 6 x 10
##   council  long   lat date       site  beachbugs    id region rain_mm
##   <chr>   <dbl> <dbl> <date>     <chr>     <dbl> <dbl> <chr>    <dbl>
## 1 Randwi…  151. -33.9 2013-01-02 Clov…        19    25 Sydne…     0  
## 2 Randwi…  151. -33.9 2013-01-06 Clov…         3    25 Sydne…     0  
## 3 Randwi…  151. -33.9 2013-01-12 Clov…         2    25 Sydne…     0  
## 4 Randwi…  151. -33.9 2013-01-18 Clov…        13    25 Sydne…     0  
## 5 Randwi…  151. -33.9 2013-01-30 Clov…         8    25 Sydne…     0.6
## 6 Randwi…  151. -33.9 2013-02-05 Clov…         7    25 Sydne…     0.1
## # … with 1 more variable: temp_airport <dbl>
# Pre-process and clean the data
library(dummies)
## dummies-1.5.6 provided by Decision Patterns

# Convert the character columns to factors.
data[c("council", "site", "region")] <- lapply(
  data[c("council", "site", "region")],
  as.factor
)

# Make sure the weather measurements are stored as numeric.
data[c("temp_airport", "rain_mm")] <- lapply(
  data[c("temp_airport", "rain_mm")],
  as.numeric
)

# Check the cleaned data: council, site and region should now be factors.
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3690 obs. of  10 variables:
##  $ council     : Factor w/ 2 levels "Randwick Council",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ long        : num  151 151 151 151 151 ...
##  $ lat         : num  -33.9 -33.9 -33.9 -33.9 -33.9 ...
##  $ date        : Date, format: "2013-01-02" "2013-01-06" ...
##  $ site        : Factor w/ 11 levels "Bondi Beach",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ beachbugs   : num  19 3 2 13 8 7 11 97 3 0 ...
##  $ id          : num  25 25 25 25 25 25 25 25 25 25 ...
##  $ region      : Factor w/ 1 level "Sydney City Ocean Beaches": 1 1 1 1 1 1 1 1 1 1 ...
##  $ rain_mm     : num  0 0 0 0 0.6 0.1 8 7.2 0 0 ...
##  $ temp_airport: num  23.4 30.3 31.4 46.4 26.6 25.7 22.2 24.8 29.1 25.8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   council = col_character(),
##   ..   long = col_double(),
##   ..   lat = col_double(),
##   ..   date = col_date(format = ""),
##   ..   site = col_character(),
##   ..   beachbugs = col_double(),
##   ..   id = col_double(),
##   ..   region = col_character(),
##   ..   rain_mm = col_double(),
##   ..   temp_airport = col_double()
##   .. )
# Descriptive statistics for every column, including the number of missing
# values (NA's) in the numeric columns that have any.
summary(data)
##              council          long            lat        
##  Randwick Council:2677   Min.   :151.3   Min.   :-33.98  
##  Waverley Council:1013   1st Qu.:151.3   1st Qu.:-33.95  
##                          Median :151.3   Median :-33.92  
##                          Mean   :151.3   Mean   :-33.93  
##                          3rd Qu.:151.3   3rd Qu.:-33.90  
##                          Max.   :151.3   Max.   :-33.89  
##                                                          
##       date                          site        beachbugs      
##  Min.   :2013-01-02   Malabar Beach   : 343   Min.   :   0.00  
##  1st Qu.:2014-08-01   Coogee Beach    : 342   1st Qu.:   1.00  
##  Median :2016-01-22   Bondi Beach     : 338   Median :   5.00  
##  Mean   :2015-12-29   Bronte Beach    : 338   Mean   :  33.92  
##  3rd Qu.:2017-06-14   Clovelly Beach  : 338   3rd Qu.:  17.00  
##  Max.   :2018-10-16   Little Bay Beach: 338   Max.   :4900.00  
##                       (Other)         :1653   NA's   :29       
##        id                              region        rain_mm      
##  Min.   :22.00   Sydney City Ocean Beaches:3690   Min.   : 0.000  
##  1st Qu.:24.00                                    1st Qu.: 0.000  
##  Median :26.00                                    Median : 0.000  
##  Mean   :25.87                                    Mean   : 4.175  
##  3rd Qu.:27.40                                    3rd Qu.: 2.800  
##  Max.   :29.00                                    Max.   :62.200  
##                                                   NA's   :22      
##   temp_airport  
##  Min.   :13.10  
##  1st Qu.:19.60  
##  Median :23.30  
##  Mean   :23.62  
##  3rd Qu.:26.80  
##  Max.   :46.40  
## 
######  ggplot2 ######
# https://ggplot2.tidyverse.org/reference/geom_bar.html#examples
library(RColorBrewer)  #add colour palette
library(ggplot2)


# 1. Scatter plot of two continuous variables (airport temperature vs.
# rainfall) with a fitted linear regression line to highlight the trend.
a <- ggplot(data = data, mapping = aes(x = temp_airport, y = rain_mm))
a +
  geom_point() +
  geom_smooth(method = "lm")
## Warning: Removed 22 rows containing non-finite values (stat_smooth).
## Warning: Removed 22 rows containing missing values (geom_point).

# 2. Horizontal bar chart of the number of observations per site.
# Each bar is filled with its own colour and the legend sits above the plot.
b <- ggplot(data = data, mapping = aes(x = site))
b +
  geom_bar(
    mapping = aes(fill = site),
    position = position_stack(reverse = TRUE)
  ) +
  coord_flip() +
  theme(legend.position = "top")

# Scatter plot of enterococci counts (beachbugs) for each site.
# Large, semi-transparent points make overlapping observations easier to see.
# NOTE(review): `c` is also the name of base R's combine function; the name is
# kept here so any later code that refers to this plot object still works.
c <- ggplot(data = data, mapping = aes(x = beachbugs, y = site))
c + geom_point(colour = "steelblue", size = 4, alpha = 0.5)
## Warning: Removed 29 rows containing missing values (geom_point).

 # Line plot of rainfall (rain_mm) against date.
# NOTE(review): the original heading said "compare date and beachbugs", but
# the y aesthetic is rain_mm -- confirm which variable was intended.
# NOTE(review): ylim(-3, 3) restricts the y scale to [-3, 3]; rain_mm reaches
# 62.2 in summary(data), so larger readings are not drawn -- confirm intended.
f <- ggplot(data = data, mapping = aes(x = date, y = rain_mm))
f +
  geom_line() +
  ylim(-3, 3)
## Warning: Removed 33 rows containing missing values (geom_path).

# Summarise R's built-in `cars` dataset (speed and stopping distance).
# NOTE(review): unrelated to the beach data -- presumably left over from the
# default R Markdown template; confirm whether it should stay.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.