1 - importing-data-from-flat-files-with-utils
# The utils package,
# (which is automatically loaded in your R session on startup)
# -----------------------------------------------------
# read.csv() ----------------------------------------
# read.csv(file, header = TRUE, sep = ",",
# stringsAsFactors = TRUE
# all 3 ways to use read.csv via url
pools <- read.csv(url("https://www.data.brisbane.qld.gov.au/data/dataset/ccf67d3e-cfaf-4d30-8b78-a794c783af9f/resource/c09546c8-9526-4358-a1eb-81dbb224cdca/download/pool-location-and-information-30nov17.csv"))
pools <- read.csv("https://www.data.brisbane.qld.gov.au/data/dataset/ccf67d3e-cfaf-4d30-8b78-a794c783af9f/resource/c09546c8-9526-4358-a1eb-81dbb224cdca/download/pool-location-and-information-30nov17.csv", header = FALSE)
pools <- read.csv("https://www.data.brisbane.qld.gov.au/data/dataset/ccf67d3e-cfaf-4d30-8b78-a794c783af9f/resource/c09546c8-9526-4358-a1eb-81dbb224cdca/download/pool-location-and-information-30nov17.csv")
# alternatively, download it
download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/swimming_pools.csv",
destfile = "swimming_pools.csv")
# show found file:
dir()
## [1] "1 - importing-data-from-flat-files-with-utils.R"
## [2] "1 - importing-data-from-flat-files-with-utils.Rmd"
## [3] "1_-_importing-data-from-flat-files-with-utils.Rmd"
## [4] "2 - readr-datatable.R"
## [5] "3 - importing-excel-data.R"
## [6] "4 - reproducible-excel-work-with-xlconnect.R"
## [7] "data"
## [8] "hotdogs.txt"
## [9] "importing_data_in_r_1_ch1.pdf"
## [10] "importing_data_in_r_1_ch2.pdf"
## [11] "importing_data_in_r_1_ch3.pdf"
## [12] "importing_data_in_r_1_ch4.pdf"
## [13] "potatoes.csv"
## [14] "potatoes.txt"
## [15] "swimming_pools.csv"
pools <- read.csv("swimming_pools.csv")
# For all importing functions in the utils package
# stringsAsFactors = TRUE
# great iFF strings represent categorical variables
# before stringsAsFactors = TRUE by default
str(pools)
## 'data.frame': 20 obs. of 4 variables:
## $ Name : Factor w/ 20 levels "Acacia Ridge Leisure Centre",..: 1 2 3 4 5 6 19 7 8 9 ...
## $ Address : Factor w/ 20 levels "1 Fairlead Crescent, Manly",..: 5 20 18 10 9 11 6 15 12 17 ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
# convert strings in the flat file to factors
pools <- read.csv("swimming_pools.csv", stringsAsFactors = FALSE)
# now stringsAsFactors = FALSE
str(pools) # Name and Address should NOT be factors
## 'data.frame': 20 obs. of 4 variables:
## $ Name : chr "Acacia Ridge Leisure Centre" "Bellbowrie Pool" "Carole Park" "Centenary Pool (inner City)" ...
## $ Address : chr "1391 Beaudesert Road, Acacia Ridge" "Sugarwood Street, Bellbowrie" "Cnr Boundary Road and Waterford Road Wacol" "400 Gregory Terrace, Spring Hill" ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
# -----------------------------------------------------
# read.delim() ------------------------------------
# read.delim(file, header = TRUE, sep = "\t",
# *.txt files
# http://wiki.stat.ucla.edu/socr/index.php/SOCR_012708_ID_Data_HotDogs
download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/hotdogs.txt",
"hotdogs.txt")
hotdogs <- read.delim("hotdogs.txt") # header = TRUE
str(hotdogs)
## 'data.frame': 53 obs. of 3 variables:
## $ Beef: Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X186: int 181 176 149 184 190 158 139 175 148 152 ...
## $ X495: int 477 425 322 482 587 370 322 479 375 330 ...
hotdogs <- read.delim("hotdogs.txt", header = FALSE)
str(hotdogs)
## 'data.frame': 54 obs. of 3 variables:
## $ V1: Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ V2: int 186 181 176 149 184 190 158 139 175 148 ...
## $ V3: int 495 477 425 322 482 587 370 322 479 375 ...
summary(hotdogs)
## V1 V2 V3
## Beef :20 Min. : 86.0 Min. :144.0
## Meat :17 1st Qu.:132.0 1st Qu.:362.5
## Poultry:17 Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# -----------------------------------------------------
# read.table() ------------------------------------
# read.table(file, header = FALSE, sep = "",
dir.create("data")
## Warning in dir.create("data"): 'data' already exists
download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/hotdogs.txt",
"data/hotdogs.txt")
# Path to the hotdogs.txt file: path
path <- file.path("data", "hotdogs.txt")
# Import the hotdogs.txt file: hotdogs
hotdogs <- read.table(path,
sep = '\t',
col.names = c("type", "calories", "sodium"))
# Call head() on hotdogs
head(hotdogs)
## type calories sodium
## 1 Beef 186 495
## 2 Beef 181 477
## 3 Beef 176 425
## 4 Beef 149 322
## 5 Beef 184 482
## 6 Beef 190 587
# -----------------------------------------------------
# read.delim() ------------------------------------
# Finish the read.delim() call
# hotdogs <- read.delim("hotdogs.txt", header = ___, col.names = c("type", "calories", ___))
hotdogs <- read.delim("hotdogs.txt", header = FALSE, col.names = c("type", "calories", "sodium"))
# Select the hot dog with the least calories: lily
# which.min() => returns the index the smallest value in a vector.
lily <- hotdogs[which.min(hotdogs$calories), ]
lily
## type calories sodium
## 50 Poultry 86 358
# Select the observation with the most sodium: tom
tom <- hotdogs[which.max(hotdogs$sodium), ]
tom
## type calories sodium
## 15 Beef 190 645
# -----------------------------------------------------
# read.delim() ------------------------------------
# specify the column types
# some columns = factors
# others = characters
# => can't use: stringsAsFactors
# => colClasses argument:
#
# read.delim("my_file.txt",
# colClasses = c("character",
# "numeric",
# "logical"))
# Display structure of hotdogs
str(hotdogs)
## 'data.frame': 54 obs. of 3 variables:
## $ type : Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ calories: int 186 181 176 149 184 190 158 139 175 148 ...
## $ sodium : int 495 477 425 322 482 587 370 322 479 375 ...
# Edit the colClasses argument to import the data correctly: hotdogs2
hotdogs2 <- read.delim("hotdogs.txt", header = FALSE,
col.names = c("type", "calories", "sodium"),
colClasses = NA)
hotdogs2 <- read.delim("hotdogs.txt", header = FALSE,
col.names = c("type", "calories", "sodium"),
colClasses = c("factor", "NULL", "numeric"))
# NB:
# colClasses = NULL
# => column will be skipped
# & not be loaded into the data frame.
# Display structure of hotdogs2
str(hotdogs2)
## 'data.frame': 54 obs. of 2 variables:
## $ type : Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sodium: num 495 477 425 322 482 587 370 322 479 375 ...