Import data
# excel file
Coaster <- read_excel("../00_data/MyData.xlsx")
data
## function (..., list = character(), package = NULL, lib.loc = NULL,
## verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
## {
## fileExt <- function(x) {
## db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
## ans <- sub(".*\\.", "", x)
## ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2",
## x[db])
## ans
## }
## my_read_table <- function(...) {
## lcc <- Sys.getlocale("LC_COLLATE")
## on.exit(Sys.setlocale("LC_COLLATE", lcc))
## Sys.setlocale("LC_COLLATE", "C")
## read.table(...)
## }
## stopifnot(is.character(list))
## names <- c(as.character(substitute(list(...))[-1L]), list)
## if (!is.null(package)) {
## if (!is.character(package))
## stop("'package' must be a character vector or NULL")
## }
## paths <- find.package(package, lib.loc, verbose = verbose)
## if (is.null(lib.loc))
## paths <- c(path.package(package, TRUE), if (!length(package)) getwd(),
## paths)
## paths <- unique(normalizePath(paths[file.exists(paths)]))
## paths <- paths[dir.exists(file.path(paths, "data"))]
## dataExts <- tools:::.make_file_exts("data")
## if (length(names) == 0L) {
## db <- matrix(character(), nrow = 0L, ncol = 4L)
## for (path in paths) {
## entries <- NULL
## packageName <- if (file_test("-f", file.path(path,
## "DESCRIPTION")))
## basename(path)
## else "."
## if (file_test("-f", INDEX <- file.path(path, "Meta",
## "data.rds"))) {
## entries <- readRDS(INDEX)
## }
## else {
## dataDir <- file.path(path, "data")
## entries <- tools::list_files_with_type(dataDir,
## "data")
## if (length(entries)) {
## entries <- unique(tools::file_path_sans_ext(basename(entries)))
## entries <- cbind(entries, "")
## }
## }
## if (NROW(entries)) {
## if (is.matrix(entries) && ncol(entries) == 2L)
## db <- rbind(db, cbind(packageName, dirname(path),
## entries))
## else warning(gettextf("data index for package %s is invalid and will be ignored",
## sQuote(packageName)), domain = NA, call. = FALSE)
## }
## }
## colnames(db) <- c("Package", "LibPath", "Item", "Title")
## footer <- if (missing(package))
## paste0("Use ", sQuote(paste("data(package =", ".packages(all.available = TRUE))")),
## "\n", "to list the data sets in all *available* packages.")
## else NULL
## y <- list(title = "Data sets", header = NULL, results = db,
## footer = footer)
## class(y) <- "packageIQR"
## return(y)
## }
## paths <- file.path(paths, "data")
## for (name in names) {
## found <- FALSE
## for (p in paths) {
## tmp_env <- if (overwrite)
## envir
## else new.env()
## if (file_test("-f", file.path(p, "Rdata.rds"))) {
## rds <- readRDS(file.path(p, "Rdata.rds"))
## if (name %in% names(rds)) {
## found <- TRUE
## if (verbose)
## message(sprintf("name=%s:\t found in Rdata.rds",
## name), domain = NA)
## thispkg <- sub(".*/([^/]*)/data$", "\\1", p)
## thispkg <- sub("_.*$", "", thispkg)
## thispkg <- paste0("package:", thispkg)
## objs <- rds[[name]]
## lazyLoad(file.path(p, "Rdata"), envir = tmp_env,
## filter = function(x) x %in% objs)
## break
## }
## else if (verbose)
## message(sprintf("name=%s:\t NOT found in names() of Rdata.rds, i.e.,\n\t%s\n",
## name, paste(names(rds), collapse = ",")),
## domain = NA)
## }
## if (file_test("-f", file.path(p, "Rdata.zip"))) {
## warning("zipped data found for package ", sQuote(basename(dirname(p))),
## ".\nThat is defunct, so please re-install the package.",
## domain = NA)
## if (file_test("-f", fp <- file.path(p, "filelist")))
## files <- file.path(p, scan(fp, what = "", quiet = TRUE))
## else {
## warning(gettextf("file 'filelist' is missing for directory %s",
## sQuote(p)), domain = NA)
## next
## }
## }
## else {
## files <- list.files(p, full.names = TRUE)
## }
## files <- files[grep(name, files, fixed = TRUE)]
## if (length(files) > 1L) {
## o <- match(fileExt(files), dataExts, nomatch = 100L)
## paths0 <- dirname(files)
## paths0 <- factor(paths0, levels = unique(paths0))
## files <- files[order(paths0, o)]
## }
## if (length(files)) {
## for (file in files) {
## if (verbose)
## message("name=", name, ":\t file= ...", .Platform$file.sep,
## basename(file), "::\t", appendLF = FALSE,
## domain = NA)
## ext <- fileExt(file)
## if (basename(file) != paste0(name, ".", ext))
## found <- FALSE
## else {
## found <- TRUE
## zfile <- file
## zipname <- file.path(dirname(file), "Rdata.zip")
## if (file.exists(zipname)) {
## Rdatadir <- tempfile("Rdata")
## dir.create(Rdatadir, showWarnings = FALSE)
## topic <- basename(file)
## rc <- .External(C_unzip, zipname, topic,
## Rdatadir, FALSE, TRUE, FALSE, FALSE)
## if (rc == 0L)
## zfile <- file.path(Rdatadir, topic)
## }
## if (zfile != file)
## on.exit(unlink(zfile))
## switch(ext, R = , r = {
## library("utils")
## sys.source(zfile, chdir = TRUE, envir = tmp_env)
## }, RData = , rdata = , rda = load(zfile,
## envir = tmp_env), TXT = , txt = , tab = ,
## tab.gz = , tab.bz2 = , tab.xz = , txt.gz = ,
## txt.bz2 = , txt.xz = assign(name, my_read_table(zfile,
## header = TRUE, as.is = FALSE), envir = tmp_env),
## CSV = , csv = , csv.gz = , csv.bz2 = ,
## csv.xz = assign(name, my_read_table(zfile,
## header = TRUE, sep = ";", as.is = FALSE),
## envir = tmp_env), found <- FALSE)
## }
## if (found)
## break
## }
## if (verbose)
## message(if (!found)
## "*NOT* ", "found", domain = NA)
## }
## if (found)
## break
## }
## if (!found) {
## warning(gettextf("data set %s not found", sQuote(name)),
## domain = NA)
## }
## else if (!overwrite) {
## for (o in ls(envir = tmp_env, all.names = TRUE)) {
## if (exists(o, envir = envir, inherits = FALSE))
## warning(gettextf("an object named %s already exists and will not be overwritten",
## sQuote(o)))
## else assign(o, get(o, envir = tmp_env, inherits = FALSE),
## envir = envir)
## }
## rm(tmp_env)
## }
## }
## invisible(names)
## }
## <bytecode: 0x000001ab21e9bd28>
## <environment: namespace:utils>
Apply the following dplyr verbs to your data
Filter rows
filter(Coaster,acc_state == "NH", gender == "F")
## # A tibble: 102 × 23
## acc_id acc_date acc_state acc_city fix_port source bus_type
## <dbl> <dttm> <chr> <chr> <chr> <chr> <chr>
## 1 1001560 2011-01-09 00:00:00 NH Bartlett F New Hamps… Mountai…
## 2 1001509 2011-02-17 00:00:00 NH Nashua F New Hamps… Family …
## 3 1001608 2011-07-01 00:00:00 NH Bartlett F New Hamps… Mountai…
## 4 1001532 2011-07-10 00:00:00 NH Portsmouth F New Hamps… Water p…
## 5 1001520 2011-07-12 00:00:00 NH Glen F New Hamps… Amuseme…
## 6 1001530 2011-07-16 00:00:00 NH Portsmouth F New Hamps… Water p…
## 7 1001566 2011-07-18 00:00:00 NH Bartlett F New Hamps… Mountai…
## 8 1001616 2011-07-20 00:00:00 NH Bartlett F New Hamps… Mountai…
## 9 1001567 2011-07-27 00:00:00 NH Bartlett F New Hamps… Mountai…
## 10 1001568 2011-08-04 00:00:00 NH Bartlett F New Hamps… Mountai…
## # ℹ 92 more rows
## # ℹ 16 more variables: industry_sector <chr>, device_category <chr>,
## # device_type <chr>, tradename_or_generic <chr>, manufacturer <chr>,
## # num_injured <dbl>, age_youngest <dbl>, gender <chr>, acc_desc <chr>,
## # injury_desc <chr>, report <chr>, category <chr>, mechanical <dbl>,
## # op_error <dbl>, employee <dbl>, notes <chr>
filter(Coaster,acc_state == "NH", gender == "M")
## # A tibble: 101 × 23
## acc_id acc_date acc_state acc_city fix_port source bus_type
## <dbl> <dttm> <chr> <chr> <chr> <chr> <chr>
## 1 1001656 2011-06-09 00:00:00 NH Salem F New Ham… Amuseme…
## 2 1001564 2011-06-13 00:00:00 NH Bartlett F New Ham… Mountai…
## 3 1001562 2011-07-07 00:00:00 NH Bartlett F New Ham… Mountai…
## 4 1001561 2011-07-11 00:00:00 NH Bartlett F New Ham… Mountai…
## 5 1001565 2011-07-12 00:00:00 NH Bartlett F New Ham… Mountai…
## 6 1001659 2011-07-17 00:00:00 NH Hampton F New Ham… Pool wa…
## 7 1001591 2011-07-24 00:00:00 NH Bartlett F New Ham… Mountai…
## 8 1001663 2011-08-01 00:00:00 NH North Conway F New Ham… Mountai…
## 9 1001592 2011-08-08 00:00:00 NH Bartlett F New Ham… Mountai…
## 10 1001569 2011-08-09 00:00:00 NH Bartlett F New Ham… Mountai…
## # ℹ 91 more rows
## # ℹ 16 more variables: industry_sector <chr>, device_category <chr>,
## # device_type <chr>, tradename_or_generic <chr>, manufacturer <chr>,
## # num_injured <dbl>, age_youngest <dbl>, gender <chr>, acc_desc <chr>,
## # injury_desc <chr>, report <chr>, category <chr>, mechanical <dbl>,
## # op_error <dbl>, employee <dbl>, notes <chr>
Arrange rows
arrange(Coaster, desc(acc_date))
## # A tibble: 8,351 × 23
## acc_id acc_date acc_state acc_city fix_port source bus_type
## <dbl> <dttm> <chr> <chr> <chr> <chr> <chr>
## 1 1009800 2017-07-26 00:00:00 OH Columbus P United… Carniva…
## 2 1001736 2017-06-30 00:00:00 OH Sandusky F Ohio D… Amuseme…
## 3 1005814 2017-06-07 00:00:00 PA Stroudsburg F Media … Go kart…
## 4 1004577 2017-05-18 00:00:00 WA Port Townsend P Washin… Carniva…
## 5 1002532 2017-03-31 00:00:00 TN Pigeon Forge F Tennes… Trampol…
## 6 1000054 2017-03-16 00:00:00 AZ Chandler F Arizon… Trampol…
## 7 1002531 2017-03-11 00:00:00 TN Sevierville F Tennes… Trampol…
## 8 1000074 2017-02-14 00:00:00 AZ Glendale F Arizon… Trampol…
## 9 1002530 2017-02-11 00:00:00 TN Sevierville F Tennes… Trampol…
## 10 1002529 2017-02-05 00:00:00 TN Sevierville F Tennes… Trampol…
## # ℹ 8,341 more rows
## # ℹ 16 more variables: industry_sector <chr>, device_category <chr>,
## # device_type <chr>, tradename_or_generic <chr>, manufacturer <chr>,
## # num_injured <dbl>, age_youngest <dbl>, gender <chr>, acc_desc <chr>,
## # injury_desc <chr>, report <chr>, category <chr>, mechanical <dbl>,
## # op_error <dbl>, employee <dbl>, notes <chr>
Select columns
select(Coaster, acc_date, acc_state, gender, age_youngest)
## # A tibble: 8,351 × 4
## acc_date acc_state gender age_youngest
## <dttm> <chr> <chr> <dbl>
## 1 2010-06-12 00:00:00 OH <NA> NA
## 2 2010-06-12 00:00:00 OH M 54
## 3 2010-07-10 00:00:00 CA F 37
## 4 2010-07-10 00:00:00 CA F 37
## 5 2010-07-29 00:00:00 CO M NA
## 6 2010-07-30 00:00:00 WI F 12
## 7 2010-08-05 00:00:00 WI F 16
## 8 2010-08-11 00:00:00 CO <NA> NA
## 9 2010-08-15 00:00:00 WI M 14
## 10 2010-09-05 00:00:00 ME <NA> NA
## # ℹ 8,341 more rows
Add columns
# difference in age of youngest injured vs the average age of injured
mutate( Coaster, age_above_avg = age_youngest - 24.6) %>%
select(acc_date,age_youngest, age_above_avg)
## # A tibble: 8,351 × 3
## acc_date age_youngest age_above_avg
## <dttm> <dbl> <dbl>
## 1 2010-06-12 00:00:00 NA NA
## 2 2010-06-12 00:00:00 54 29.4
## 3 2010-07-10 00:00:00 37 12.4
## 4 2010-07-10 00:00:00 37 12.4
## 5 2010-07-29 00:00:00 NA NA
## 6 2010-07-30 00:00:00 12 -12.6
## 7 2010-08-05 00:00:00 16 -8.6
## 8 2010-08-11 00:00:00 NA NA
## 9 2010-08-15 00:00:00 14 -10.6
## 10 2010-09-05 00:00:00 NA NA
## # ℹ 8,341 more rows
# got 24.6 from the next section summarise
Summarize by groups
# Average age of people injured
summarise(Coaster, avg_age = mean(age_youngest, na.rm = TRUE))
## # A tibble: 1 × 1
## avg_age
## <dbl>
## 1 24.6