This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Install packages once, from the console (left commented out so they are not re-installed on every run):
# install.packages("tidyverse")
# install.packages("dummies")
# install.packages("knitr")
# Load packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
# Show the working directory. The relative file path used below is resolved against it
# (when knitting, knitr uses the folder that contains the document by default).
getwd()
## [1] "/Users/wendywong/Downloads"
# Do not call setwd() with a machine-specific absolute path: it only works on one computer.
# Instead keep rain_temp_beachbugs.csv in the working directory shown above (next to this document).
# Read csv file.
# The data is enterococci levels at Sydney beaches over the last 5 years, as well as rainfall and temperature. Check out this post if you want to learn about enterococci https://rladiessydney.org/post/2018/11/05/basicbasics-3/.
library(readr)
rain_temp_beachbugs <- read_csv("rain_temp_beachbugs.csv")
## Parsed with column specification:
## cols(
## council = col_character(),
## long = col_double(),
## lat = col_double(),
## date = col_date(format = ""),
## site = col_character(),
## beachbugs = col_double(),
## id = col_double(),
## region = col_character(),
## rain_mm = col_double(),
## temp_airport = col_double()
## )
# Work on a copy called `data`; later changes to `data` do not alter the imported object
data <- rain_temp_beachbugs
# Compact preview: one line per column with its type and first values
data %>% glimpse()
## Observations: 3,690
## Variables: 10
## $ council <chr> "Randwick Council", "Randwick Council", "Randwick C…
## $ long <dbl> 151.2675, 151.2675, 151.2675, 151.2675, 151.2675, 1…
## $ lat <dbl> -33.91449, -33.91449, -33.91449, -33.91449, -33.914…
## $ date <date> 2013-01-02, 2013-01-06, 2013-01-12, 2013-01-18, 20…
## $ site <chr> "Clovelly Beach", "Clovelly Beach", "Clovelly Beach…
## $ beachbugs <dbl> 19, 3, 2, 13, 8, 7, 11, 97, 3, 0, 6, 0, 1, 8, 3, 5,…
## $ id <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,…
## $ region <chr> "Sydney City Ocean Beaches", "Sydney City Ocean Bea…
## $ rain_mm <dbl> 0.0, 0.0, 0.0, 0.0, 0.6, 0.1, 8.0, 7.2, 0.0, 0.0, 0…
## $ temp_airport <dbl> 23.4, 30.3, 31.4, 46.4, 26.6, 25.7, 22.2, 24.8, 29.…
# Show the structure of the data: classes, dimensions and the type of each column
utils::str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3690 obs. of 10 variables:
## $ council : chr "Randwick Council" "Randwick Council" "Randwick Council" "Randwick Council" ...
## $ long : num 151 151 151 151 151 ...
## $ lat : num -33.9 -33.9 -33.9 -33.9 -33.9 ...
## $ date : Date, format: "2013-01-02" "2013-01-06" ...
## $ site : chr "Clovelly Beach" "Clovelly Beach" "Clovelly Beach" "Clovelly Beach" ...
## $ beachbugs : num 19 3 2 13 8 7 11 97 3 0 ...
## $ id : num 25 25 25 25 25 25 25 25 25 25 ...
## $ region : chr "Sydney City Ocean Beaches" "Sydney City Ocean Beaches" "Sydney City Ocean Beaches" "Sydney City Ocean Beaches" ...
## $ rain_mm : num 0 0 0 0 0.6 0.1 8 7.2 0 0 ...
## $ temp_airport: num 23.4 30.3 31.4 46.4 26.6 25.7 22.2 24.8 29.1 25.8 ...
## - attr(*, "spec")=
## .. cols(
## .. council = col_character(),
## .. long = col_double(),
## .. lat = col_double(),
## .. date = col_date(format = ""),
## .. site = col_character(),
## .. beachbugs = col_double(),
## .. id = col_double(),
## .. region = col_character(),
## .. rain_mm = col_double(),
## .. temp_airport = col_double()
## .. )
# Inspect the first 6 rows (6 is the default for head())
head(data, n = 6L)
## # A tibble: 6 x 10
## council long lat date site beachbugs id region rain_mm
## <chr> <dbl> <dbl> <date> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Randwi… 151. -33.9 2013-01-02 Clov… 19 25 Sydne… 0
## 2 Randwi… 151. -33.9 2013-01-06 Clov… 3 25 Sydne… 0
## 3 Randwi… 151. -33.9 2013-01-12 Clov… 2 25 Sydne… 0
## 4 Randwi… 151. -33.9 2013-01-18 Clov… 13 25 Sydne… 0
## 5 Randwi… 151. -33.9 2013-01-30 Clov… 8 25 Sydne… 0.6
## 6 Randwi… 151. -33.9 2013-02-05 Clov… 7 25 Sydne… 0.1
## # … with 1 more variable: temp_airport <dbl>
# Pre-process and clean the data
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
# Convert the character columns into factors
data[["council"]] <- as.factor(data[["council"]])
data[["site"]] <- as.factor(data[["site"]])
data[["region"]] <- as.factor(data[["region"]])
# Store the weather measurements as numeric
# (read_csv already parsed both columns as doubles, so this leaves the values unchanged)
data[["rain_mm"]] <- as.numeric(data[["rain_mm"]])
data[["temp_airport"]] <- as.numeric(data[["temp_airport"]])
# Check the structure of the cleaned data
utils::str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3690 obs. of 10 variables:
## $ council : Factor w/ 2 levels "Randwick Council",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ long : num 151 151 151 151 151 ...
## $ lat : num -33.9 -33.9 -33.9 -33.9 -33.9 ...
## $ date : Date, format: "2013-01-02" "2013-01-06" ...
## $ site : Factor w/ 11 levels "Bondi Beach",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ beachbugs : num 19 3 2 13 8 7 11 97 3 0 ...
## $ id : num 25 25 25 25 25 25 25 25 25 25 ...
## $ region : Factor w/ 1 level "Sydney City Ocean Beaches": 1 1 1 1 1 1 1 1 1 1 ...
## $ rain_mm : num 0 0 0 0 0.6 0.1 8 7.2 0 0 ...
## $ temp_airport: num 23.4 30.3 31.4 46.4 26.6 25.7 22.2 24.8 29.1 25.8 ...
## - attr(*, "spec")=
## .. cols(
## .. council = col_character(),
## .. long = col_double(),
## .. lat = col_double(),
## .. date = col_date(format = ""),
## .. site = col_character(),
## .. beachbugs = col_double(),
## .. id = col_double(),
## .. region = col_character(),
## .. rain_mm = col_double(),
## .. temp_airport = col_double()
## .. )
# Descriptive statistics for every column (quantiles for numeric columns, counts for factors)
data %>% summary()
## council long lat
## Randwick Council:2677 Min. :151.3 Min. :-33.98
## Waverley Council:1013 1st Qu.:151.3 1st Qu.:-33.95
## Median :151.3 Median :-33.92
## Mean :151.3 Mean :-33.93
## 3rd Qu.:151.3 3rd Qu.:-33.90
## Max. :151.3 Max. :-33.89
##
## date site beachbugs
## Min. :2013-01-02 Malabar Beach : 343 Min. : 0.00
## 1st Qu.:2014-08-01 Coogee Beach : 342 1st Qu.: 1.00
## Median :2016-01-22 Bondi Beach : 338 Median : 5.00
## Mean :2015-12-29 Bronte Beach : 338 Mean : 33.92
## 3rd Qu.:2017-06-14 Clovelly Beach : 338 3rd Qu.: 17.00
## Max. :2018-10-16 Little Bay Beach: 338 Max. :4900.00
## (Other) :1653 NA's :29
## id region rain_mm
## Min. :22.00 Sydney City Ocean Beaches:3690 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 0.000
## Median :26.00 Median : 0.000
## Mean :25.87 Mean : 4.175
## 3rd Qu.:27.40 3rd Qu.: 2.800
## Max. :29.00 Max. :62.200
## NA's :22
## temp_airport
## Min. :13.10
## 1st Qu.:19.60
## Median :23.30
## Mean :23.62
## 3rd Qu.:26.80
## Max. :46.40
##
###### ggplot2 ######
# Reference: https://ggplot2.tidyverse.org/reference/geom_bar.html#examples
library(RColorBrewer) # colour palettes
library(ggplot2)
# 1. Scatter plot of two continuous variables (airport temperature vs rainfall),
#    with a fitted linear regression line to highlight the trend
a <- ggplot(data = data, mapping = aes(x = temp_airport, y = rain_mm))
a +
  geom_point() +
  geom_smooth(method = "lm")
## Warning: Removed 22 rows containing non-finite values (stat_smooth).
## Warning: Removed 22 rows containing missing values (geom_point).
# 2. Bar chart - number of rows per site, each site filled with its own colour
b <- ggplot(data = data, mapping = aes(x = site))
b +
  geom_bar(
    mapping = aes(fill = site),
    position = position_stack(reverse = TRUE)
  ) +
  coord_flip() + # swap the axes so the bars run horizontally
  theme(legend.position = "top") # legend above the plot
# Scatter plot - beachbugs value (x axis) of every sample, one row of points per site (y axis)
# NOTE(review): the object name `c` shadows base::c(); the name is kept unchanged here, consider renaming it
c <- ggplot(data = data, mapping = aes(x = beachbugs, y = site))
# Large, half-transparent points so overlapping samples remain visible
c + geom_point(colour = "steelblue", size = 4, alpha = 0.5)
## Warning: Removed 29 rows containing missing values (geom_point).
# Line plot - rainfall (rain_mm) over time (date)
# No y-axis limits are set on purpose: ylim() replaces every value outside the
# limits with NA before drawing, and rain_mm reaches 62.2 in this data, so
# ylim(-3, 3) dropped most of the series. To zoom in without discarding data,
# use coord_cartesian(ylim = ...) instead.
f <- ggplot(data, aes(x = date, y = rain_mm))
f + geom_line()
## Warning: Removed 33 rows containing missing values (geom_path).
# Example on R's built-in `cars` dataset (speed and stopping distance); unrelated to the beach data above
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.