# 1 - importing-data-from-flat-files-with-utils

# The utils package is automatically loaded in your R session on startup,
# so none of the functions below need a library() call.

# -----------------------------------------------------
 # read.csv() ----------------------------------------
 # read.csv(file, header = TRUE, sep = ",", ...)
 # stringsAsFactors = TRUE was the default before R 4.0.0 (FALSE since)

# Three ways to call read.csv() on a remote file.
# The URL is defined once so the three calls cannot drift apart.
pools_url <- paste0(
  "https://www.data.brisbane.qld.gov.au/data/dataset/",
  "ccf67d3e-cfaf-4d30-8b78-a794c783af9f/resource/",
  "c09546c8-9526-4358-a1eb-81dbb224cdca/download/",
  "pool-location-and-information-30nov17.csv"
)

# 1. Through an explicit url() connection.
pools <- read.csv(url(pools_url))
# 2. Passing the URL string directly with header = FALSE: the first line of
#    the file is read as data and the columns get the names V1, V2, ...
pools <- read.csv(pools_url, header = FALSE)
# 3. Passing the URL string directly with the defaults (header = TRUE).
#    Each call overwrites `pools`, so this is the result that is kept.
pools <- read.csv(pools_url)

# Alternatively, fetch a local copy of the dataset first.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/swimming_pools.csv",
  destfile = "swimming_pools.csv"
)

# List the working directory to confirm the file is there:
list.files()
##  [1] "1 - importing-data-from-flat-files-with-utils.R"  
##  [2] "1 - importing-data-from-flat-files-with-utils.Rmd"
##  [3] "1_-_importing-data-from-flat-files-with-utils.Rmd"
##  [4] "2 - readr-datatable.R"                            
##  [5] "3 - importing-excel-data.R"                       
##  [6] "4 - reproducible-excel-work-with-xlconnect.R"     
##  [7] "data"                                             
##  [8] "hotdogs.txt"                                      
##  [9] "importing_data_in_r_1_ch1.pdf"                    
## [10] "importing_data_in_r_1_ch2.pdf"                    
## [11] "importing_data_in_r_1_ch3.pdf"                    
## [12] "importing_data_in_r_1_ch4.pdf"                    
## [13] "potatoes.csv"                                     
## [14] "potatoes.txt"                                     
## [15] "swimming_pools.csv"

# Import the local copy. stringsAsFactors is spelled out because its default
# changed from TRUE to FALSE in R 4.0.0; the str() output recorded below
# (Name and Address as factors) requires TRUE.
pools <- read.csv("swimming_pools.csv", stringsAsFactors = TRUE)

# For all importing functions in the utils package:
# stringsAsFactors = TRUE converts character columns to factors, which is
# great if and only if the strings represent categorical variables.

# with stringsAsFactors = TRUE
str(pools)
## 'data.frame':    20 obs. of  4 variables:
##  $ Name     : Factor w/ 20 levels "Acacia Ridge Leisure Centre",..: 1 2 3 4 5 6 19 7 8 9 ...
##  $ Address  : Factor w/ 20 levels "1 Fairlead Crescent, Manly",..: 5 20 18 10 9 11 6 15 12 17 ...
##  $ Latitude : num  -27.6 -27.6 -27.6 -27.5 -27.4 ...
##  $ Longitude: num  153 153 153 153 153 ...

# keep the strings in the flat file as character vectors (no factors)
pools <- read.csv("swimming_pools.csv", stringsAsFactors = FALSE)

# now stringsAsFactors = FALSE
str(pools) # Name and Address should NOT be factors
## 'data.frame':    20 obs. of  4 variables:
##  $ Name     : chr  "Acacia Ridge Leisure Centre" "Bellbowrie Pool" "Carole Park" "Centenary Pool (inner City)" ...
##  $ Address  : chr  "1391 Beaudesert Road, Acacia Ridge" "Sugarwood Street, Bellbowrie" "Cnr Boundary Road and Waterford Road Wacol" "400 Gregory Terrace, Spring Hill" ...
##  $ Latitude : num  -27.6 -27.6 -27.6 -27.5 -27.4 ...
##  $ Longitude: num  153 153 153 153 153 ...

# -----------------------------------------------------
 # read.delim() ------------------------------------
 # read.delim(file, header = TRUE, sep = "\t", ...)

# *.txt files
# http://wiki.stat.ucla.edu/socr/index.php/SOCR_012708_ID_Data_HotDogs
download.file(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/hotdogs.txt",
  destfile = "hotdogs.txt"
)

# header = TRUE is the read.delim() default. The output below shows the
# pitfall: the first record is consumed as column names (Beef, X186, X495)
# and only 53 of the 54 rows remain.
# stringsAsFactors = TRUE is explicit because the default became FALSE in
# R 4.0.0 and the recorded output shows a factor column.
hotdogs <- read.delim("hotdogs.txt", stringsAsFactors = TRUE)
str(hotdogs)
## 'data.frame':    53 obs. of  3 variables:
##  $ Beef: Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X186: int  181 176 149 184 190 158 139 175 148 152 ...
##  $ X495: int  477 425 322 482 587 370 322 479 375 330 ...
# header = FALSE keeps all 54 records; columns are auto-named V1, V2, V3.
hotdogs <- read.delim("hotdogs.txt", header = FALSE, stringsAsFactors = TRUE)
str(hotdogs)
## 'data.frame':    54 obs. of  3 variables:
##  $ V1: Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ V2: int  186 181 176 149 184 190 158 139 175 148 ...
##  $ V3: int  495 477 425 322 482 587 370 322 479 375 ...
summary(hotdogs)
##        V1           V2              V3       
##  Beef   :20   Min.   : 86.0   Min.   :144.0  
##  Meat   :17   1st Qu.:132.0   1st Qu.:362.5  
##  Poultry:17   Median :145.0   Median :405.0  
##               Mean   :145.4   Mean   :424.8  
##               3rd Qu.:172.8   3rd Qu.:503.5  
##               Max.   :195.0   Max.   :645.0


# -----------------------------------------------------
 # read.table() ------------------------------------
 # read.table(file, header = FALSE, sep = "", ...)
 
# Create the data/ directory only when it is missing, so re-running the
# script does not raise the "'data' already exists" warning.
if (!dir.exists("data")) {
  dir.create("data")
}

# Path to the hotdogs.txt file: path
# Defined once and reused for both the download and the import.
path <- file.path("data", "hotdogs.txt")

download.file(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/hotdogs.txt",
  destfile = path
)

# Import the hotdogs.txt file: hotdogs
# read.table() defaults to header = FALSE, so only sep and the column
# names have to be supplied.
hotdogs <- read.table(
  path,
  sep = "\t",
  col.names = c("type", "calories", "sodium")
)

# Call head() on hotdogs
head(hotdogs)
##   type calories sodium
## 1 Beef      186    495
## 2 Beef      181    477
## 3 Beef      176    425
## 4 Beef      149    322
## 5 Beef      184    482
## 6 Beef      190    587


# -----------------------------------------------------
  # read.delim() ------------------------------------

# Finish the read.delim() call
# hotdogs <- read.delim("hotdogs.txt", header = ___, col.names = c("type", "calories", ___))
# stringsAsFactors = TRUE is explicit (the default became FALSE in R 4.0.0)
# so that `type` is a factor, as the str(hotdogs) output recorded later shows.
hotdogs <- read.delim(
  "hotdogs.txt",
  header = FALSE,
  col.names = c("type", "calories", "sodium"),
  stringsAsFactors = TRUE
)

# Select the hot dog with the least calories: lily
  # which.min() => returns the index of the smallest value in a vector.
lily <- hotdogs[which.min(hotdogs$calories), ]
lily
##       type calories sodium
## 50 Poultry       86    358

# Select the observation with the most sodium: tom
  # which.max() => returns the index of the largest value in a vector.
tom <- hotdogs[which.max(hotdogs$sodium), ]
tom
##    type calories sodium
## 15 Beef      190    645

# -----------------------------------------------------
  # read.delim() ------------------------------------
  # specify the column types

  # some columns = factors
  # others = characters
  # => can't use: stringsAsFactors
  # => colClasses argument:
  # 
  # read.delim("my_file.txt", 
  #            colClasses = c("character",
  #                           "numeric",
  #                           "logical"))

# Display structure of hotdogs
str(hotdogs)
## 'data.frame':    54 obs. of  3 variables:
##  $ type    : Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ calories: int  186 181 176 149 184 190 158 139 175 148 ...
##  $ sodium  : int  495 477 425 322 482 587 370 322 479 375 ...

# Edit the colClasses argument to import the data correctly: hotdogs2
# Exercise template, kept as a comment because its result was overwritten
# immediately (colClasses = NA is the default: column types are guessed):
# hotdogs2 <- read.delim("hotdogs.txt", header = FALSE,
#                        col.names = c("type", "calories", "sodium"),
#                        colClasses = NA)

# Completed call: type as factor, calories skipped, sodium as numeric.
hotdogs2 <- read.delim(
  "hotdogs.txt",
  header = FALSE,
  col.names = c("type", "calories", "sodium"),
  colClasses = c("factor", "NULL", "numeric")
)
# NB:
# a colClasses entry of "NULL" (the quoted string, not the NULL object)
# => column will be skipped
#    & not be loaded into the data frame.

# Display structure of hotdogs2
str(hotdogs2)
## 'data.frame':    54 obs. of  2 variables:
##  $ type  : Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sodium: num  495 477 425 322 482 587 370 322 479 375 ...
# 1 - importing-data-from-flat-files-with-utils

# Kunal Haira

# 2018-05-09

# 1 - importing-data-from-flat-files-with-utils