# 2 - readr-datatable
# install.packages("readr")
# Load the readr package
library(readr)
##########
# get data
##########
# impact of storage period and cooking on potatoes' flavor.
# Fetch the comma-separated potatoes data set into the working directory.
# The first row of this file contains the column names.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/potatoes.csv",
  destfile = "potatoes.csv"
)
# -----------------------------------
# read_csv() is a wrapper function around read_delim():
# https://www.rdocumentation.org/packages/readr/versions/1.0.0/topics/read_delim
# Signature (abridged): read_csv(file, col_names = TRUE, col_types = NULL, ...)
# With the default col_names = TRUE the header is taken from the first row.
potatoes <- read_csv(file = "potatoes.csv")
## Parsed with column specification:
## cols(
## area = col_integer(),
## temp = col_integer(),
## size = col_integer(),
## storage = col_integer(),
## method = col_integer(),
## texture = col_double(),
## flavor = col_double(),
## moistness = col_double()
## )
##########
# get data
##########
# Fetch the tab-delimited version of the potatoes data.
# This file does not contain column names in the first row.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/potatoes.txt",
  destfile = "potatoes.txt"
)
# -----------------------------------
# read_tsv()
# Signature (abridged): read_tsv(file, col_names = TRUE, col_types = NULL, ...)
# Because the file has no header row, the column names are supplied explicitly.
properties <- c(
  "area", "temp", "size", "storage", "method",
  "texture", "flavor", "moistness"
)
potatoes <- read_tsv(file = "potatoes.txt", col_names = properties)
## Parsed with column specification:
## cols(
## area = col_integer(),
## temp = col_integer(),
## size = col_integer(),
## storage = col_integer(),
## method = col_integer(),
## texture = col_double(),
## flavor = col_double(),
## moistness = col_double()
## )
# Preview the first six rows of the imported tibble
head(potatoes, n = 6L)
## # A tibble: 6 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3 1.7
# -----------------------------------
# read_delim()
# Signature (abridged): read_delim(file, delim, ...)
# delim = the character that separates the values in the data file
# (a tab for potatoes.txt)
potatoes <- read_delim(
  file = "potatoes.txt",
  delim = "\t",
  col_names = properties
)
## Parsed with column specification:
## cols(
## area = col_integer(),
## temp = col_integer(),
## size = col_integer(),
## storage = col_integer(),
## method = col_integer(),
## texture = col_double(),
## flavor = col_double(),
## moistness = col_double()
## )
# Print the imported tibble (the console shows the first 10 of its 160 rows)
potatoes
## # A tibble: 160 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3 1.7
## 7 1 1 1 2 2 2.6 3.1 2.4
## 8 1 1 1 2 3 3 3 2.9
## 9 1 1 1 2 4 2.2 3.2 2.5
## 10 1 1 1 2 5 2 2.8 1.9
## # ... with 150 more rows
# -----------------------------------
# read_tsv() with skip and n_max
# skip  = number of lines to ignore at the top of the file
#         (e.g. to skip a first line that contains column names)
# n_max = number of lines to actually import
#
# Example: skip = 2 combined with n_max = 3
# reads in lines 3, 4 and 5 of the file.
#
# Import 5 observations: lines 7, 8, 9, 10 and 11
potatoes_fragment <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  skip = 6,
  n_max = 5
)
## Parsed with column specification:
## cols(
## area = col_integer(),
## temp = col_integer(),
## size = col_integer(),
## storage = col_integer(),
## method = col_integer(),
## texture = col_double(),
## flavor = col_double(),
## moistness = col_double()
## )
# -----------------------------------
# read_tsv() with the default col_types = NULL:
# the functions from the readr package
# try to find the correct column types themselves
potatoes_char <- read_tsv(file = "potatoes.txt", col_names = properties)
## Parsed with column specification:
## cols(
## area = col_integer(),
## temp = col_integer(),
## size = col_integer(),
## storage = col_integer(),
## method = col_integer(),
## texture = col_double(),
## flavor = col_double(),
## moistness = col_double()
## )
# Inspect the structure: the guessed column types are integer and double
str(object = potatoes_char)
## Classes 'tbl_df', 'tbl' and 'data.frame': 160 obs. of 8 variables:
## $ area : int 1 1 1 1 1 1 1 1 1 1 ...
## $ temp : int 1 1 1 1 1 1 1 1 1 1 ...
## $ size : int 1 1 1 1 1 1 1 1 1 1 ...
## $ storage : int 1 1 1 1 1 2 2 2 2 2 ...
## $ method : int 1 2 3 4 5 1 2 3 4 5 ...
## $ texture : num 2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
## $ flavor : num 3.2 2.5 2.8 2.9 2.8 3 3.1 3 3.2 2.8 ...
## $ moistness: num 3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 8
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ temp : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ size : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ storage : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ method : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ texture : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ flavor : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ moistness: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# col_types as a compact string:
# each character denotes the class of one column, in column order
potatoes_char <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  col_types = "iiiiiddd"
)
str(object = potatoes_char)
## Classes 'tbl_df', 'tbl' and 'data.frame': 160 obs. of 8 variables:
## $ area : int 1 1 1 1 1 1 1 1 1 1 ...
## $ temp : int 1 1 1 1 1 1 1 1 1 1 ...
## $ size : int 1 1 1 1 1 1 1 1 1 1 ...
## $ storage : int 1 1 1 1 1 2 2 2 2 2 ...
## $ method : int 1 2 3 4 5 1 2 3 4 5 ...
## $ texture : num 2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
## $ flavor : num 3.2 2.5 2.8 2.9 2.8 3 3.1 3 3.2 2.8 ...
## $ moistness: num 3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 8
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ temp : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ size : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ storage : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ method : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ texture : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ flavor : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ moistness: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Short codes used in a col_types string:
#   c = character
#   d = double
#   i = integer
#   l = logical
#   _ = skips the whole column
# Force all eight columns to be character ("c")
potatoes_char <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  col_types = "cccccccc"
)
str(object = potatoes_char)
## Classes 'tbl_df', 'tbl' and 'data.frame': 160 obs. of 8 variables:
## $ area : chr "1" "1" "1" "1" ...
## $ temp : chr "1" "1" "1" "1" ...
## $ size : chr "1" "1" "1" "1" ...
## $ storage : chr "1" "1" "1" "1" ...
## $ method : chr "1" "2" "3" "4" ...
## $ texture : chr "2.9" "2.3" "2.5" "2.1" ...
## $ flavor : chr "3.2" "2.5" "2.8" "2.9" ...
## $ moistness: chr "3.0" "2.6" "2.8" "2.4" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 8
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ temp : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ size : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ storage : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ method : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ texture : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ flavor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ moistness: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# ----------------------------------------
# Collector functions can be passed in a list()
# to the col_types argument
# of read_ functions
# to tell them how to interpret values in a column.
# Collector function: col_integer()
# column should be interpreted as
# an integer.
# Collector function: col_factor(levels, ordered = FALSE)
# column should be interpreted as
# a factor with levels.
##########
# get data
##########
# hotdogs.txt is a tab-delimited file without column names in the first row.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/hotdogs.txt",
  destfile = "hotdogs.txt"
)
# Import without col_types, so the column types are guessed
hotdogs <- read_tsv(
  file = "hotdogs.txt",
  col_names = c("type", "calories", "sodium")
)
## Parsed with column specification:
## cols(
## type = col_character(),
## calories = col_integer(),
## sodium = col_integer()
## )
# Summarise the import; type was read as character here
summary(object = hotdogs)
## type calories sodium
## Length:54 Min. : 86.0 Min. :144.0
## Class :character 1st Qu.:132.0 1st Qu.:362.5
## Mode :character Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# Collectors describing how each hotdogs column should be interpreted
fac <- col_factor(levels = c("Beef", "Meat", "Poultry"))
int <- col_integer()
# Starting point, with the column types left to readr:
# hotdogs_factor <- read_tsv("hotdogs.txt",
#                            col_names = c("type", "calories", "sodium"),
#                            col_types = NULL)
# Edited version: pass the collectors in a list(), one per column.
# Note: col_types = c(fac, int, int) gives an error; use list() instead.
hotdogs_factor <- read_tsv(
  file = "hotdogs.txt",
  col_names = c("type", "calories", "sodium"),
  col_types = list(fac, int, int)
)
# Display hotdogs_factor
hotdogs_factor
## # A tibble: 54 x 3
## type calories sodium
## <fct> <int> <int>
## 1 Beef 186 495
## 2 Beef 181 477
## 3 Beef 176 425
## 4 Beef 149 322
## 5 Beef 184 482
## 6 Beef 190 587
## 7 Beef 158 370
## 8 Beef 139 322
## 9 Beef 175 479
## 10 Beef 148 375
## # ... with 44 more rows
# Display the summary of hotdogs_factor; type is a factor, so its levels are counted
summary(object = hotdogs_factor)
## type calories sodium
## Beef :20 Min. : 86.0 Min. :144.0
## Meat :17 1st Qu.:132.0 1st Qu.:362.5
## Poultry:17 Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# For comparison: the summary of the import where type is plain character
summary(object = hotdogs)
## type calories sodium
## Length:54 Min. : 86.0 Min. :144.0
## Class :character 1st Qu.:132.0 1st Qu.:362.5
## Mode :character Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# The summary of hotdogs_factor contains more interesting information for the type column.
# ---------------------------------------
# load the data.table package
library(data.table)
# potatoes.csv: fields are delimited by commas and
# the first line contains the column names.
# This script already downloads the same URL to the same destination in its
# readr section, so only fetch the file again when it is not present in the
# working directory (avoids a redundant network request).
if (!file.exists("potatoes.csv")) {
  download.file(
    url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/potatoes.csv",
    destfile = "potatoes.csv"
  )
}
# ---------------------------------------
# fread() = faster than read.table()
potatoes <- fread("potatoes.csv")
# Print the resulting data.table
potatoes
## area temp size storage method texture flavor moistness
## 1: 1 1 1 1 1 2.9 3.2 3.0
## 2: 1 1 1 1 2 2.3 2.5 2.6
## 3: 1 1 1 1 3 2.5 2.8 2.8
## 4: 1 1 1 1 4 2.1 2.9 2.4
## 5: 1 1 1 1 5 1.9 2.8 2.2
## ---
## 156: 2 2 2 4 1 2.7 3.3 2.6
## 157: 2 2 2 4 2 2.6 2.8 2.3
## 158: 2 2 2 4 3 2.5 3.1 2.6
## 159: 2 2 2 4 4 3.4 3.3 3.0
## 160: 2 2 2 4 5 2.5 2.8 2.3
# Inspect the structure: fread() returned a data.table (also a data.frame)
str(object = potatoes)
## Classes 'data.table' and 'data.frame': 160 obs. of 8 variables:
## $ area : int 1 1 1 1 1 1 1 1 1 1 ...
## $ temp : int 1 1 1 1 1 1 1 1 1 1 ...
## $ size : int 1 1 1 1 1 1 1 1 1 1 ...
## $ storage : int 1 1 1 1 1 2 2 2 2 2 ...
## $ method : int 1 2 3 4 5 1 2 3 4 5 ...
## $ texture : num 2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
## $ flavor : num 3.2 2.5 2.8 2.9 2.8 3 3.1 3 3.2 2.8 ...
## $ moistness: num 3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Example: a dataset contains 5 variables and you
# want to keep the first and fifth variable, named "a" and "e".
# Columns can be dropped or selected, by position or by name:
# fread("path/to/file.txt", drop = 2:4)
# fread("path/to/file.txt", select = c(1, 5))
# fread("path/to/file.txt", drop = c("b", "c", "d"))
# fread("path/to/file.txt", select = c("a", "e"))
# Import only the texture and moistness columns
# (columns 6 and 8 of potatoes.csv)
potatoes <- fread("potatoes.csv", select = c("texture", "moistness"))
str(object = potatoes)
## Classes 'data.table' and 'data.frame': 160 obs. of 2 variables:
## $ texture : num 2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
## $ moistness: num 3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Scatter plot of moistness (y axis) against texture (x axis)
plot(x = potatoes$texture, y = potatoes$moistness)
