# 2 - readr-datatable

# install.packages("readr")
# Load the readr package
library(readr)

##########
# get data
##########
# impact of storage period and cooking on potatoes' flavor.
# Fetch the comma-separated potatoes data set into the working directory.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/potatoes.csv",
  destfile = "potatoes.csv"
)
# The file contains column names in the first row.

# -----------------------------------
# read_csv() = wrapper function around read_delim()
# https://www.rdocumentation.org/packages/readr/versions/1.0.0/topics/read_delim
# read_csv(file, col_names = TRUE, col_types = NULL, ...)

# The header row supplies the column names; column types are guessed by readr.
potatoes <- read_csv(file = "potatoes.csv")
## Parsed with column specification:
## cols(
##   area = col_integer(),
##   temp = col_integer(),
##   size = col_integer(),
##   storage = col_integer(),
##   method = col_integer(),
##   texture = col_double(),
##   flavor = col_double(),
##   moistness = col_double()
## )

##########
# get data
##########
# Fetch the tab-delimited potatoes data set into the working directory.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/potatoes.txt",
  destfile = "potatoes.txt"
)
# The file does not contain column names in the first row.

# -----------------------------------
# read_tsv()
# read_tsv(file, col_names = TRUE, col_types = NULL, ...)

# Names to assign to the eight columns, in file order
properties <- c(
  "area", "temp", "size", "storage",
  "method", "texture", "flavor", "moistness"
)

# potatoes.txt has no header row, so the names are supplied explicitly
potatoes <- read_tsv(file = "potatoes.txt", col_names = properties)
## Parsed with column specification:
## cols(
##   area = col_integer(),
##   temp = col_integer(),
##   size = col_integer(),
##   storage = col_integer(),
##   method = col_integer(),
##   texture = col_double(),
##   flavor = col_double(),
##   moistness = col_double()
## )

# Show the first six observations
head(x = potatoes, n = 6L)
## # A tibble: 6 x 8
##    area  temp  size storage method texture flavor moistness
##   <int> <int> <int>   <int>  <int>   <dbl>  <dbl>     <dbl>
## 1     1     1     1       1      1     2.9    3.2       3  
## 2     1     1     1       1      2     2.3    2.5       2.6
## 3     1     1     1       1      3     2.5    2.8       2.8
## 4     1     1     1       1      4     2.1    2.9       2.4
## 5     1     1     1       1      5     1.9    2.8       2.2
## 6     1     1     1       2      1     1.8    3         1.7


# -----------------------------------
# read_delim()
# read_delim(file, delim,
#                 delim = character that
#                         separates the values in the data file

# Same import as read_tsv(), with the tab delimiter spelled out explicitly
potatoes <- read_delim(
  file = "potatoes.txt",
  delim = "\t",
  col_names = properties
)
## Parsed with column specification:
## cols(
##   area = col_integer(),
##   temp = col_integer(),
##   size = col_integer(),
##   storage = col_integer(),
##   method = col_integer(),
##   texture = col_double(),
##   flavor = col_double(),
##   moistness = col_double()
## )
potatoes
## # A tibble: 160 x 8
##     area  temp  size storage method texture flavor moistness
##    <int> <int> <int>   <int>  <int>   <dbl>  <dbl>     <dbl>
##  1     1     1     1       1      1     2.9    3.2       3  
##  2     1     1     1       1      2     2.3    2.5       2.6
##  3     1     1     1       1      3     2.5    2.8       2.8
##  4     1     1     1       1      4     2.1    2.9       2.4
##  5     1     1     1       1      5     1.9    2.8       2.2
##  6     1     1     1       2      1     1.8    3         1.7
##  7     1     1     1       2      2     2.6    3.1       2.4
##  8     1     1     1       2      3     3      3         2.9
##  9     1     1     1       2      4     2.2    3.2       2.5
## 10     1     1     1       2      5     2      2.8       1.9
## # ... with 150 more rows

# -----------------------------------
# read_tsv()
# skip = number of lines to ignore at the start of the file (e.g. a first line that contains column names)
# n_max = maximum number of lines to import after the skipped ones
# 
# for example:
# skip = 2 
# n_max = 3
# reading in lines 3, 4 and 5 of the file.

# Read observations 7 through 11: the first six lines are skipped and the
# next five are imported.
potatoes_fragment <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  skip = 6,
  n_max = 5
)
## Parsed with column specification:
## cols(
##   area = col_integer(),
##   temp = col_integer(),
##   size = col_integer(),
##   storage = col_integer(),
##   method = col_integer(),
##   texture = col_double(),
##   flavor = col_double(),
##   moistness = col_double()
## )


# -----------------------------------
# read_tsv()
# col_types = NULL => functions from the readr package 
#                     will try to find the correct types themselves
# col_types = NULL is the default: readr guesses each column's type
potatoes_char <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  col_types = NULL
)
## Parsed with column specification:
## cols(
##   area = col_integer(),
##   temp = col_integer(),
##   size = col_integer(),
##   storage = col_integer(),
##   method = col_integer(),
##   texture = col_double(),
##   flavor = col_double(),
##   moistness = col_double()
## )
str(object = potatoes_char)
## Classes 'tbl_df', 'tbl' and 'data.frame':    160 obs. of  8 variables:
##  $ area     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ temp     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ size     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ storage  : int  1 1 1 1 1 2 2 2 2 2 ...
##  $ method   : int  1 2 3 4 5 1 2 3 4 5 ...
##  $ texture  : num  2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
##  $ flavor   : num  3.2 2.5 2.8 2.9 2.8 3 3.1 3 3.2 2.8 ...
##  $ moistness: num  3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 8
##   .. ..$ area     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ temp     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ size     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ storage  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ method   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ texture  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ flavor   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ moistness: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

# One letter per column: five integer columns followed by three doubles
potatoes_char <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  col_types = "iiiiiddd"
)
str(object = potatoes_char)
## Classes 'tbl_df', 'tbl' and 'data.frame':    160 obs. of  8 variables:
##  $ area     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ temp     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ size     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ storage  : int  1 1 1 1 1 2 2 2 2 2 ...
##  $ method   : int  1 2 3 4 5 1 2 3 4 5 ...
##  $ texture  : num  2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
##  $ flavor   : num  3.2 2.5 2.8 2.9 2.8 3 3.1 3 3.2 2.8 ...
##  $ moistness: num  3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 8
##   .. ..$ area     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ temp     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ size     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ storage  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ method   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ texture  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ flavor   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ moistness: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

# c haracter
# d ouble
# i nteger
# l ogical
# _ skips whole column

# Import every one of the eight columns as character (c)
potatoes_char <- read_tsv(
  file = "potatoes.txt",
  col_names = properties,
  col_types = "cccccccc"
)
str(object = potatoes_char)
## Classes 'tbl_df', 'tbl' and 'data.frame':    160 obs. of  8 variables:
##  $ area     : chr  "1" "1" "1" "1" ...
##  $ temp     : chr  "1" "1" "1" "1" ...
##  $ size     : chr  "1" "1" "1" "1" ...
##  $ storage  : chr  "1" "1" "1" "1" ...
##  $ method   : chr  "1" "2" "3" "4" ...
##  $ texture  : chr  "2.9" "2.3" "2.5" "2.1" ...
##  $ flavor   : chr  "3.2" "2.5" "2.8" "2.9" ...
##  $ moistness: chr  "3.0" "2.6" "2.8" "2.4" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 8
##   .. ..$ area     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ temp     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ size     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ storage  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ method   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ texture  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ flavor   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ moistness: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"


# ----------------------------------------
# Collector functions can be passed in a list() 
# to the col_types argument 
# of read_ functions
# to tell them how to interpret values in a column.
# Collector function: col_integer()
#                     column should be interpreted as 
#                     an integer.
# Collector function: col_factor(levels, ordered = FALSE)
#                     column should be interpreted as 
#                     a factor with levels.

##########
# get data
##########
# tab-delimited file without column names in the first row.
# Fetch the hotdogs data set into the working directory.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/hotdogs.txt",
  destfile = "hotdogs.txt"
)

# Let readr guess the column types (no col_types supplied)
hotdogs <- read_tsv(
  file = "hotdogs.txt",
  col_names = c("type", "calories", "sodium")
)
## Parsed with column specification:
## cols(
##   type = col_character(),
##   calories = col_integer(),
##   sodium = col_integer()
## )
summary(object = hotdogs)
##      type              calories         sodium     
##  Length:54          Min.   : 86.0   Min.   :144.0  
##  Class :character   1st Qu.:132.0   1st Qu.:362.5  
##  Mode  :character   Median :145.0   Median :405.0  
##                     Mean   :145.4   Mean   :424.8  
##                     3rd Qu.:172.8   3rd Qu.:503.5  
##                     Max.   :195.0   Max.   :645.0


# Collectors used for the typed import below:
# an unordered factor with three levels, and an integer collector
fac <- col_factor(
  levels = c("Beef", "Meat", "Poultry"),
  ordered = FALSE
)
int <- col_integer()

# Edit the col_types argument

# hotdogs_factor <- read_tsv("hotdogs.txt",
#                           col_names = c("type", "calories", "sodium"),
#                           col_types = NULL)

# Parse type as a factor and calories/sodium as integers by passing the
# collectors, in column order, inside a list().
hotdogs_factor <- read_tsv(
  file = "hotdogs.txt",
  col_types = list(fac, int, int),
  col_names = c("type", "calories", "sodium")
)
# Note: col_types = c(fac, int, int) raises an error; use list() instead.

# Print hotdogs_factor
hotdogs_factor
## # A tibble: 54 x 3
##    type  calories sodium
##    <fct>    <int>  <int>
##  1 Beef       186    495
##  2 Beef       181    477
##  3 Beef       176    425
##  4 Beef       149    322
##  5 Beef       184    482
##  6 Beef       190    587
##  7 Beef       158    370
##  8 Beef       139    322
##  9 Beef       175    479
## 10 Beef       148    375
## # ... with 44 more rows

# Summary of the import where type is a factor
summary(object = hotdogs_factor)
##       type       calories         sodium     
##  Beef   :20   Min.   : 86.0   Min.   :144.0  
##  Meat   :17   1st Qu.:132.0   1st Qu.:362.5  
##  Poultry:17   Median :145.0   Median :405.0  
##               Mean   :145.4   Mean   :424.8  
##               3rd Qu.:172.8   3rd Qu.:503.5  
##               Max.   :195.0   Max.   :645.0

# Summary of the import where type is a character column, for comparison
summary(object = hotdogs)
##      type              calories         sodium     
##  Length:54          Min.   : 86.0   Min.   :144.0  
##  Class :character   1st Qu.:132.0   1st Qu.:362.5  
##  Mode  :character   Median :145.0   Median :405.0  
##                     Mean   :145.4   Mean   :424.8  
##                     3rd Qu.:172.8   3rd Qu.:503.5  
##                     Max.   :195.0   Max.   :645.0

# The summary of hotdogs_factor contains more interesting information for the type column.


# ---------------------------------------
# load the data.table package
library(data.table)

# Fields are delimited by commas
# first line contains the column names.
# potatoes.csv is already fetched earlier in this script; only hit the
# network again when the local copy is missing.
if (!file.exists("potatoes.csv")) {
  download.file(
    "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/potatoes.csv",
    "potatoes.csv"
  )
}

# ---------------------------------------
# fread() is a faster alternative to read.table()
potatoes <- fread(input = "potatoes.csv")
potatoes
##      area temp size storage method texture flavor moistness
##   1:    1    1    1       1      1     2.9    3.2       3.0
##   2:    1    1    1       1      2     2.3    2.5       2.6
##   3:    1    1    1       1      3     2.5    2.8       2.8
##   4:    1    1    1       1      4     2.1    2.9       2.4
##   5:    1    1    1       1      5     1.9    2.8       2.2
##  ---                                                       
## 156:    2    2    2       4      1     2.7    3.3       2.6
## 157:    2    2    2       4      2     2.6    2.8       2.3
## 158:    2    2    2       4      3     2.5    3.1       2.6
## 159:    2    2    2       4      4     3.4    3.3       3.0
## 160:    2    2    2       4      5     2.5    2.8       2.3
# Inspect the structure of the imported data.table
str(object = potatoes)
## Classes 'data.table' and 'data.frame':   160 obs. of  8 variables:
##  $ area     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ temp     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ size     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ storage  : int  1 1 1 1 1 2 2 2 2 2 ...
##  $ method   : int  1 2 3 4 5 1 2 3 4 5 ...
##  $ texture  : num  2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
##  $ flavor   : num  3.2 2.5 2.8 2.9 2.8 3 3.1 3 3.2 2.8 ...
##  $ moistness: num  3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# dataset that contains 5 variables 
# want to keep the first and fifth variable, named "a" and "e". 
# fread("path/to/file.txt", drop = 2:4)
# fread("path/to/file.txt", select = c(1, 5))
# fread("path/to/file.txt", drop = c("b", "c", "d"))
# fread("path/to/file.txt", select = c("a", "e"))

# Import only the texture and moistness columns (columns 6 and 8 of the file).
# Selecting by name states the intent directly and keeps working if the
# column order in the file ever changes.
potatoes <- fread("potatoes.csv", select = c("texture", "moistness"))
str(potatoes)
## Classes 'data.table' and 'data.frame':   160 obs. of  2 variables:
##  $ texture  : num  2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
##  $ moistness: num  3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# Scatter plot of moistness (y) against texture (x)
plot(potatoes$texture, potatoes$moistness)
# 2 - readr-datatable

# Kunal Haira

# 2018-05-09

# 2 - readr-datatable