# load the cacao ratings file into a base data.frame
# NOTE(review): Bean.Type shows up as "Â " in the str() output further down,
# which looks like an encoding mismatch on read -- confirm the file's encoding
cacao.csv <- read.csv(file = "flavors_of_cacao.csv")
Preview the data
# first and last six rows (six is the default for both functions)
head(cacao.csv, n = 6L)
tail(cacao.csv, n = 6L)
Describe the data
# compact structure: dimensions, column types, and the leading values
str(object = cacao.csv)
## 'data.frame': 1795 obs. of 9 variables:
## $ Company...Maker.if.known. : chr "A. Morin" "A. Morin" "A. Morin" "A. Morin" ...
## $ Specific.Bean.Origin.or.Bar.Name: chr "Agua Grande" "Kpime" "Atsane" "Akata" ...
## $ REF : int 1876 1676 1676 1680 1704 1315 1315 1315 1319 1319 ...
## $ Review.Date : int 2016 2015 2015 2015 2015 2014 2014 2014 2014 2014 ...
## $ Cocoa.Percent : chr "63%" "70%" "70%" "70%" ...
## $ Company.Location : chr "France" "France" "France" "France" ...
## $ Rating : num 3.75 2.75 3 3.5 3.5 2.75 3.5 3.5 3.75 4 ...
## $ Bean.Type : chr "Â " "Â " "Â " "Â " ...
## $ Broad.Bean.Origin : chr "Sao Tome" "Togo" "Togo" "Togo" ...
# per-column summary: quantiles for numeric columns, length/class for text
summary(object = cacao.csv)
## Company...Maker.if.known. Specific.Bean.Origin.or.Bar.Name REF
## Length:1795 Length:1795 Min. : 5
## Class :character Class :character 1st Qu.: 576
## Mode :character Mode :character Median :1069
## Mean :1036
## 3rd Qu.:1502
## Max. :1952
## Review.Date Cocoa.Percent Company.Location Rating
## Min. :2006 Length:1795 Length:1795 Min. :1.000
## 1st Qu.:2010 Class :character Class :character 1st Qu.:2.875
## Median :2013 Mode :character Mode :character Median :3.250
## Mean :2012 Mean :3.186
## 3rd Qu.:2015 3rd Qu.:3.500
## Max. :2017 Max. :5.000
## Bean.Type Broad.Bean.Origin
## Length:1795 Length:1795
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Create a contingency table
# number of rows per company location, most common first
# NOTE(review): the counts expose misspelled locations -- "Eucador" and
# "Niacragua" are counted apart from "Ecuador" and "Nicaragua", and
# "Domincan Republic" is misspelled as well
sort(table(cacao.csv[["Company.Location"]]), decreasing = TRUE)
##
## U.S.A. France Canada U.K.
## 764 156 125 96
## Italy Ecuador Australia Belgium
## 63 54 49 40
## Switzerland Germany Austria Spain
## 38 35 26 25
## Colombia Hungary Venezuela Brazil
## 23 22 20 17
## Japan Madagascar New Zealand Peru
## 17 17 17 17
## Denmark Vietnam Guatemala Scotland
## 15 11 10 10
## Argentina Costa Rica Israel Poland
## 9 9 9 8
## Honduras Lithuania Domincan Republic Nicaragua
## 6 6 5 5
## South Korea Sweden Amsterdam Fiji
## 5 5 4 4
## Ireland Mexico Netherlands Puerto Rico
## 4 4 4 4
## Sao Tome Grenada Iceland Portugal
## 4 3 3 3
## Singapore South Africa Bolivia Chile
## 3 3 2 2
## Finland St. Lucia Czech Republic Eucador
## 2 2 1 1
## Ghana India Martinique Niacragua
## 1 1 1 1
## Philippines Russia Suriname Wales
## 1 1 1 1
Select the data using indexing
# row 8 with every column, then the leading values of column 9
cacao.csv[8, ]
head(cacao.csv[[9]])
## [1] "Sao Tome" "Togo" "Togo" "Togo" "Peru" "Venezuela"
# the single cell at row 8, column 9
cacao.csv[8, 9]
## [1] "Venezuela"
Subset the data using subset()
# find all chocolate bars with a rating of 4 or higher
# NOTE(review): an earlier note here said only the first few rows are shown,
# but nothing truncates the result; wrap the call in head() if that is wanted
subset(cacao.csv, Rating >= 4)
# or use a logical test as the row index
cacao.csv[cacao.csv$Rating >= 4, ]
# or use which() to turn the logical test into row positions
cacao.csv[which(cacao.csv$Rating >= 4), ]
# find all chocolate bars whose company is located in Italy or the USA
# the data spells the country "U.S.A." (see the Company.Location counts above);
# the literal "USA" appears nowhere in that column and would match no rows
subset(cacao.csv, Company.Location %in% c("Italy", "U.S.A."))
# find all chocolate bars that meet both criteria above
subset(
  cacao.csv,
  (Rating >= 4) & (Company.Location %in% c("Italy", "U.S.A."))
)
Import data from flat files using readr
library(readr)
# read the same CSV with readr; the guessed column types are reported below
cacao_csv <- read_csv(file = "flavors_of_cacao.csv")
## Rows: 1795 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Company
## (Maker-if known), Specific Bean Origin
## or Bar Name, Cocoa
## ...
## dbl (3): REF, Review
## Date, Rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# display the readr result
cacao_csv
# read the tab-delimited text file with the matching readr function
cacao_tsv <- read_tsv(file = "flavors_of_cacao.txt")
## Rows: 1795 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Company
## (Maker-if known), Specific Bean Origin
## or Bar Name, Cocoa
## ...
## dbl (3): REF, Review
## Date, Rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Import data from flat files using data.table
library(data.table)
# read the same CSV with data.table's fread() and display the result
cacaof <- fread(input = "flavors_of_cacao.csv")
cacaof
Import data from spreadsheets using readxl
library(readxl)
# list the worksheet names in the workbook
excel_sheets(path = "penguins.xlsx")
## [1] "Torgersen Island" "Biscoe Island" "Dream Island"
# read each island's sheet into its own object, treating the text "NA" as
# a missing value, and display it
torgersen <- read_excel(
  path = "penguins.xlsx",
  sheet = "Torgersen Island",
  na = "NA"
)
torgersen
biscoe <- read_excel(
  path = "penguins.xlsx",
  sheet = "Biscoe Island",
  na = "NA"
)
biscoe
dream <- read_excel(
  path = "penguins.xlsx",
  sheet = "Dream Island",
  na = "NA"
)
dream
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# stack the three island tables row-wise into one data frame and display it
penguins <- bind_rows(list(torgersen, biscoe, dream))
penguins
# cross-tabulate species against island to see where each species was recorded
table(penguins[["species"]], penguins[["island"]])
##
## Biscoe Dream Torgersen
## Adelie 44 56 52
## Chinstrap 0 68 0
## Gentoo 124 0 0