Download and install:
Set up a GitHub account + Version control system + This can be organisational or personal; public or private
Learn how to:
dplyr, ggplot2
readr::read_csv()
data.table::fread()
- fast read for large files
readxl::read_excel()
readODS::read_ods()
haven::read_sav() / haven::read_spss()
haven::read_dta()
.rds, which is fast and highly compressed
library(pacman)
# Load the tidyverse through pacman (p_load installs a package first if it is
# not already available).
p_load(tidyverse)

# Read England's rolling case-positivity and people-tested series from the
# coronavirus dashboard API as CSV. show_col_types = FALSE suppresses the
# column-specification message (spelled out as FALSE because F can be
# reassigned).
# NOTE(review): confirm this endpoint is still being served before relying on it.
data <- read_csv(
  "https://coronavirus.data.gov.uk/api/v1/data?filters=areaType=nation;areaName=England&structure=%7B%22areaType%22:%22areaType%22,%22areaName%22:%22areaName%22,%22areaCode%22:%22areaCode%22,%22date%22:%22date%22,%22uniqueCasePositivityBySpecimenDateRollingSum%22:%22uniqueCasePositivityBySpecimenDateRollingSum%22,%22uniquePeopleTestedBySpecimenDateRollingSum%22:%22uniquePeopleTestedBySpecimenDateRollingSum%22%7D&format=csv",
  show_col_types = FALSE
)

# Preview the first rows of the downloaded table.
head(data)
## # A tibble: 6 × 6
## areaType areaName areaCode date uniqueCasePositivityB… uniquePeopleTes…
## <chr> <chr> <chr> <date> <dbl> <dbl>
## 1 nation England E92000001 2022-01-21 16.9 2122387
## 2 nation England E92000001 2022-01-20 16.9 2148550
## 3 nation England E92000001 2022-01-19 17.2 2171736
## 4 nation England E92000001 2022-01-18 17.5 2192894
## 5 nation England E92000001 2022-01-17 18.1 2257634
## 6 nation England E92000001 2022-01-16 19 2375188
dplyr
Data wrangling package with a set of key functions (verbs)
filter
- operations to select rows on the basis of specified criteria
data %>%
filter(date == "2022-01-01")
## # A tibble: 1 × 6
## areaType areaName areaCode date uniqueCasePositivityB… uniquePeopleTes…
## <chr> <chr> <chr> <date> <dbl> <dbl>
## 1 nation England E92000001 2022-01-01 31.2 3306581
select
- select columns
data %>%
select(date, uniqueCasePositivityBySpecimenDateRollingSum) %>%
filter(date >= "2022-01-01")
## # A tibble: 21 × 2
## date uniqueCasePositivityBySpecimenDateRollingSum
## <date> <dbl>
## 1 2022-01-21 16.9
## 2 2022-01-20 16.9
## 3 2022-01-19 17.2
## 4 2022-01-18 17.5
## 5 2022-01-17 18.1
## 6 2022-01-16 19
## 7 2022-01-15 19.8
## 8 2022-01-14 20.7
## 9 2022-01-13 22
## 10 2022-01-12 23.8
## # … with 11 more rows
mutate
- add a new column
data %>%
mutate(cases = uniqueCasePositivityBySpecimenDateRollingSum / uniqueCasePositivityBySpecimenDateRollingSum[1]) %>%
select(-areaType) %>%
head()
## # A tibble: 6 × 6
## areaName areaCode date uniqueCasePositivityBySp… uniquePeopleTes… cases
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 England E92000001 2022-01-21 16.9 2122387 1
## 2 England E92000001 2022-01-20 16.9 2148550 1
## 3 England E92000001 2022-01-19 17.2 2171736 1.02
## 4 England E92000001 2022-01-18 17.5 2192894 1.04
## 5 England E92000001 2022-01-17 18.1 2257634 1.07
## 6 England E92000001 2022-01-16 19 2375188 1.12
ggplot2
# Express positivity relative to the value in the first row of `data`, then
# plot it against date: the raw series as a dotted line, overlaid with a GAM
# smooth drawn without its confidence band.
ggplot(
  mutate(
    data,
    cases = uniqueCasePositivityBySpecimenDateRollingSum /
      uniqueCasePositivityBySpecimenDateRollingSum[1]
  ),
  aes(x = date, y = cases)
) +
  geom_line(linetype = "dotted") +
  geom_smooth(se = FALSE, method = "gam") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
# Install the fingertipsR package from the ropensci/fingertipsR GitHub repository.
devtools::install_github("ropensci/fingertipsR")
## Skipping install of 'fingertipsR' from a github remote, the SHA1 (3d4ea2a7) has not changed since last install.
## Use `force = TRUE` to force installation
library(fingertipsR)
library(tidyverse)

# fingertipsR::select_indicators()

# Download the eight listed indicators for area type 401.
smoking <- fingertips_data(
  AreaTypeID = 401,
  IndicatorID = c(
    92443, 93085, 93553, 93579,
    93573, 93798, 91547, 92304
  )
)

# Keep only the rows whose parent area code is E12000001.
smoking_ne <- filter(smoking, ParentCode == "E12000001")

# Plot the prevalence indicators for persons aged 18+ against year, one panel
# per area, with vertical ranges taken from the LowerCI95.0limit and
# UpperCI95.0limit columns.
smoking_ne %>%
  filter(
    str_detect(IndicatorName, "[Pp]revalence"),
    Age == "18+ yrs",
    Sex == "Persons"
  ) %>%
  ggplot(
    aes(
      x = round(TimeperiodSortable / 1e4),
      y = Value,
      colour = IndicatorName
    )
  ) +
  geom_line(aes(group = IndicatorName)) +
  geom_point() +
  geom_linerange(aes(ymin = LowerCI95.0limit, ymax = UpperCI95.0limit)) +
  facet_wrap(~AreaName, nrow = 3) +
  scale_x_continuous(n.breaks = 6) +
  labs(x = "Year", y = "Estimate (%)") +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 0, hjust = 1)
  )