Course notes for Importing Data in R (Part 1)
read.csv, read.delim, read.tablestringsAsFactors, column classes, other argumentsreadr & data.table
read_csv, read_tsv, read_delimreadxl - listing and importing sheets, column names ans skipgdata - read.xls
dput in the course workspace to get the code to create the dataframe here then saved it with write_csv## dir functions lets us see what is in the workinig directory
dir('data/')## [1] "swimming_pools.csv"
# Import swimming_pools.csv: pools
pools <- read.csv('data/swimming_pools.csv')
# Print the structure of pools
str(pools)## 'data.frame': 20 obs. of 4 variables:
## $ Name : Factor w/ 20 levels "Acacia Ridge Leisure Centre",..: 1 2 3 4 5 6 19 7 8 9 ...
## $ Address : Factor w/ 20 levels "1 Fairlead Crescent, Manly",..: 5 20 18 10 9 11 6 15 12 17 ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
read.csv or other read functions
# Import swimming_pools.csv correctly: pools
pools <- read.csv("data/swimming_pools.csv", stringsAsFactors=F)
# Check the structure of pools
str(pools)## 'data.frame': 20 obs. of 4 variables:
## $ Name : chr "Acacia Ridge Leisure Centre" "Bellbowrie Pool" "Carole Park" "Centenary Pool (inner City)" ...
## $ Address : chr "1391 Beaudesert Road, Acacia Ridge" "Sugarwood Street, Bellbowrie" "Cnr Boundary Road and Waterford Road Wacol" "400 Gregory Terrace, Spring Hill" ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
## How many variables in the resulting pools data frame have different types if you specify the stringsAsFactors argument differently?
# Option A
pools <- read.csv("data/swimming_pools.csv", stringsAsFactors = TRUE)
str(pools)## 'data.frame': 20 obs. of 4 variables:
## $ Name : Factor w/ 20 levels "Acacia Ridge Leisure Centre",..: 1 2 3 4 5 6 19 7 8 9 ...
## $ Address : Factor w/ 20 levels "1 Fairlead Crescent, Manly",..: 5 20 18 10 9 11 6 15 12 17 ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
# Option B
pools <- read.csv("data/swimming_pools.csv", stringsAsFactors = FALSE)
str(pools)## 'data.frame': 20 obs. of 4 variables:
## $ Name : chr "Acacia Ridge Leisure Centre" "Bellbowrie Pool" "Carole Park" "Centenary Pool (inner City)" ...
## $ Address : chr "1391 Beaudesert Road, Acacia Ridge" "Sugarwood Street, Bellbowrie" "Cnr Boundary Road and Waterford Road Wacol" "400 Gregory Terrace, Spring Hill" ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
stingsAsFactors will impact any field that has strings.
# Import hotdogs.txt: hotdogs
hotdogs <- read.delim("data/hotdogs.txt", header=F)
# Summarize hotdogs
summary(hotdogs)## V1 V2 V3
## Beef :20 Min. : 86.0 Min. :144.0
## Meat :17 1st Qu.:132.0 1st Qu.:362.5
## Poultry:17 Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# Path to the hotdogs.txt file: path
path <- file.path("data", "hotdogs.txt")
# Import the hotdogs.txt file: hotdogs
hotdogs <- read.table(path,
sep = "\t",
col.names = c("type", "calories", "sodium"))
# Call head() on hotdogs
head(hotdogs)## type calories sodium
## 1 Beef 186 495
## 2 Beef 181 477
## 3 Beef 176 425
## 4 Beef 149 322
## 5 Beef 184 482
## 6 Beef 190 587
# Finish the read.delim() call
hotdogs <- read.delim("data/hotdogs.txt", header = F, col.names = c("type", "calories", "sodium"))
head(hotdogs)## type calories sodium
## 1 Beef 186 495
## 2 Beef 181 477
## 3 Beef 176 425
## 4 Beef 149 322
## 5 Beef 184 482
## 6 Beef 190 587
# Select the hot dog with the least calories: lily
lily <- hotdogs[which.min(hotdogs$calories), ]
# Select the observation with the most sodium: tom
tom <- hotdogs[which.max(hotdogs$sodium), ]
# Print lily and tom
lily## type calories sodium
## 50 Poultry 86 358
tom## type calories sodium
## 15 Beef 190 645
NULL in the colClasses vector will skip that column# Previous call to import hotdogs.txt
hotdogs <- read.delim("data/hotdogs.txt", header = FALSE, col.names = c("type", "calories", "sodium"))
# Display structure of hotdogs
str(hotdogs)## 'data.frame': 54 obs. of 3 variables:
## $ type : Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ calories: int 186 181 176 149 184 190 158 139 175 148 ...
## $ sodium : int 495 477 425 322 482 587 370 322 479 375 ...
# Edit the colClasses argument to import the data correctly: hotdogs2
hotdogs2 <- read.delim("data/hotdogs.txt", header = FALSE,
col.names = c("type", "calories", "sodium"),
colClasses = c("factor", "NULL","numeric"))
# Display structure of hotdogs2
str(hotdogs2)## 'data.frame': 54 obs. of 2 variables:
## $ type : Factor w/ 3 levels "Beef","Meat",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sodium: num 495 477 425 322 482 587 370 322 479 375 ...
## readr is already loaded
# Import potatoes.csv with read_csv(): potatoes
potatoes <- read_csv("data/potatoes.csv")
potatoes## # A tibble: 160 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3.0
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3.0 1.7
## 7 1 1 1 2 2 2.6 3.1 2.4
## 8 1 1 1 2 3 3.0 3.0 2.9
## 9 1 1 1 2 4 2.2 3.2 2.5
## 10 1 1 1 2 5 2.0 2.8 1.9
## # ... with 150 more rows
# Column names
col_names_potatoes <- c("area", "temp", "size", "storage", "method",
"texture", "flavor", "moistness")
# Import potatoes.txt: potatoes
potatoes <- read_tsv("data/potatoes.txt", col_names = col_names_potatoes)
# Call head() on potatoes
head(potatoes)## # A tibble: 6 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3.0
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3.0 1.7
read_delim
# Column names
col_names_potatoes <- c("area", "temp", "size", "storage", "method",
"texture", "flavor", "moistness")
# Import potatoes.txt using read_delim(): potatoes
potatoes <- read_delim("data/potatoes.txt", delim="\t", col_names = col_names_potatoes)
# Print out potatoes
potatoes## # A tibble: 160 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3.0
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3.0 1.7
## 7 1 1 1 2 2 2.6 3.1 2.4
## 8 1 1 1 2 3 3.0 3.0 2.9
## 9 1 1 1 2 4 2.2 3.2 2.5
## 10 1 1 1 2 5 2.0 2.8 1.9
## # ... with 150 more rows
# Column names
col_names_potatoes <- c("area", "temp", "size", "storage", "method",
"texture", "flavor", "moistness")
# Import 5 observations from potatoes.txt: potatoes_fragment
potatoes_fragment <- read_tsv("data/potatoes.txt", skip = 6, n_max = 5, col_names = col_names_potatoes)
potatoes_fragment## # A tibble: 5 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 2 2 2.6 3.1 2.4
## 2 1 1 1 2 3 3.0 3.0 2.9
## 3 1 1 1 2 4 2.2 3.2 2.5
## 4 1 1 1 2 5 2.0 2.8 1.9
## 5 1 1 1 3 1 1.8 2.6 1.5
# Column names
col_names_potatoes <- c("area", "temp", "size", "storage", "method",
"texture", "flavor", "moistness")
# Import all data, but force all columns to be character: potatoes_char
potatoes_char <- read_tsv("data/potatoes.txt", col_types = "cccccccc", col_names = col_names_potatoes)
# Print out structure of potatoes_char
str(potatoes_char)## Classes 'tbl_df', 'tbl' and 'data.frame': 160 obs. of 8 variables:
## $ area : chr "1" "1" "1" "1" ...
## $ temp : chr "1" "1" "1" "1" ...
## $ size : chr "1" "1" "1" "1" ...
## $ storage : chr "1" "1" "1" "1" ...
## $ method : chr "1" "2" "3" "4" ...
## $ texture : chr "2.9" "2.3" "2.5" "2.1" ...
## $ flavor : chr "3.2" "2.5" "2.8" "2.9" ...
## $ moistness: chr "3" "2.6" "2.8" "2.4" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 8
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ temp : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ size : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ storage : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ method : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ texture : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ flavor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ moistness: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Import without col_types
hotdogs <- read_tsv("data/hotdogs.txt", col_names = c("type", "calories", "sodium"))
# Display the summary of hotdogs
summary(hotdogs)## type calories sodium
## Length:54 Min. : 86.0 Min. :144.0
## Class :character 1st Qu.:132.0 1st Qu.:362.5
## Mode :character Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# The collectors you will need to import the data
fac <- col_factor(levels = c("Beef", "Meat", "Poultry"))
int <- col_integer()
# Edit the col_types argument to import the data correctly: hotdogs_factor
hotdogs_factor <- read_tsv("data/hotdogs.txt",
col_names = c("type", "calories", "sodium"),
col_types = list(fac, int, int))
# Display the summary of hotdogs_factor
summary(hotdogs_factor)## type calories sodium
## Beef :20 Min. : 86.0 Min. :144.0
## Meat :17 1st Qu.:132.0 1st Qu.:362.5
## Poultry:17 Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
hotdogs2 <- read_tsv("data/hotdogs.txt", col_names = c("type", "calories", "sodium")) %>%
mutate(type = as.factor(type))
summary(hotdogs2)## type calories sodium
## Beef :20 Min. : 86.0 Min. :144.0
## Meat :17 1st Qu.:132.0 1st Qu.:362.5
## Poultry:17 Median :145.0 Median :405.0
## Mean :145.4 Mean :424.8
## 3rd Qu.:172.8 3rd Qu.:503.5
## Max. :195.0 Max. :645.0
# data.table package is already loaded
# Import potatoes.csv with fread(): potatoes
potatoes <- fread("data/potatoes.csv")
# Print out potatoes
potatoes## area temp size storage method texture flavor moistness
## 1: 1 1 1 1 1 2.9 3.2 3.0
## 2: 1 1 1 1 2 2.3 2.5 2.6
## 3: 1 1 1 1 3 2.5 2.8 2.8
## 4: 1 1 1 1 4 2.1 2.9 2.4
## 5: 1 1 1 1 5 1.9 2.8 2.2
## ---
## 156: 2 2 2 4 1 2.7 3.3 2.6
## 157: 2 2 2 4 2 2.6 2.8 2.3
## 158: 2 2 2 4 3 2.5 3.1 2.6
## 159: 2 2 2 4 4 3.4 3.3 3.0
## 160: 2 2 2 4 5 2.5 2.8 2.3
# Import columns 6 and 8 of potatoes.csv: potatoes
potatoes <- fread("data/potatoes.csv", select = c(6,8))
str(potatoes)## Classes 'data.table' and 'data.frame': 160 obs. of 2 variables:
## $ texture : num 2.9 2.3 2.5 2.1 1.9 1.8 2.6 3 2.2 2 ...
## $ moistness: num 3 2.6 2.8 2.4 2.2 1.7 2.4 2.9 2.5 1.9 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Plot texture (x) and moistness (y) of potatoes
plot(potatoes$texture, potatoes$moistness)potatoes <- fread("data/potatoes.csv")
class(potatoes)## [1] "data.table" "data.frame"
potatoes <- read_csv("data/potatoes.csv")
class(potatoes)## [1] "tbl_df" "tbl" "data.frame"
data.table and readr are different
gdata, readxl, and XLConnect
gdata can load files directly from the web, which is nice, but only xls. The gaminder file I need is xlsxreadxl could work but requires I download the file first. This is fine, but ultimately I need to create worksheets and with portions of the data and save it off. This package is just for reading data in.XLConnect is the package I really need to connect and change the worksheet I think.XLConnect libraryThen, I got this error when loading the library…
Error: package or namespace load failed for ‘XLConnectJars’:
.onLoad failed in loadNamespace() for 'rJava', details:
call: dyn.load(file, DLLpath = DLLpath, ...)
error: unable to load shared object '/Library/Frameworks/R.framework/Versions/3.4/Resources/library/rJava/libs/rJava.so':
dlopen(/Library/Frameworks/R.framework/Versions/3.4/Resources/library/rJava/libs/rJava.so, 6): Library not loaded: @rpath/libjvm.dylib
Referenced from: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/rJava/libs/rJava.so
Reason: image not foundsudo ln -f -s $(/usr/libexec/java_home)/jre/lib/server/libjvm.dylib /usr/local/libThis seems pretty gross, but for now it will have to do, so I can move forward with the real learning
dyn.load('/Library/Java/JavaVirtualMachines/jdk1.8.0[YOURVERSIONHERE].jdk/Contents/Home/jre/lib/server/libjvm.dylib')## Download and load urban pop excel file
url_urban_pop <- "http://docs.google.com/spreadsheet/pub?key=pyj6tScZqmEfH89V6UQhpZA&output=xlsx"
download.file(url_urban_pop, "data/urban_pop_orig.xlsx")my_book <- loadWorkbook('data/urban_pop_orig.xlsx')
# List the sheets in my_book
getSheets(my_book)## [1] "Data" "About" "Footnotes" "Settings" "Download" "v"
## These are the worksheets I want
## "1960-1966" "1967-1974" "1975-2011"
# Get the data sheet with all the ... data
data <- readWorksheet(my_book, sheet='Data')
# I like to use stringr instead of gsub functions
col_names <- colnames(data) %>%
str_replace("Urban.population","country") %>%
str_replace("^X","")
col_names## [1] "country" "1960" "1961" "1962" "1963" "1964" "1965"
## [8] "1966" "1967" "1968" "1969" "1970" "1971" "1972"
## [15] "1973" "1974" "1975" "1976" "1977" "1978" "1979"
## [22] "1980" "1981" "1982" "1983" "1984" "1985" "1986"
## [29] "1987" "1988" "1989" "1990" "1991" "1992" "1993"
## [36] "1994" "1995" "1996" "1997" "1998" "1999" "2000"
## [43] "2001" "2002" "2003" "2004" "2005" "2006" "2007"
## [50] "2008" "2009" "2010" "2011"
# Set the cleaned column names
colnames(data) <- col_names
data1 <- data[,c("country",c(1960:1966))]
data2 <- data[,c("country",c(1967:1974))]
data3 <- data[,c("country",c(1975:2011))]
### Save new urban pop xlsx file with and without column names
## Now create the 3 sheets and add the data in each
createSheet(my_book,"1960-1966")
createSheet(my_book,"1967-1974")
createSheet(my_book,"1975-2011")
writeWorksheet(my_book, data1, "1960-1966")
writeWorksheet(my_book, data2, "1967-1974")
writeWorksheet(my_book, data3, "1975-2011")
getSheets(my_book)## [1] "Data" "About" "Footnotes" "Settings" "Download" "v"
## [7] "1960-1966" "1967-1974" "1975-2011"
## Get rid of the other sheets
removeSheet(my_book, sheet=c("Data","About","Footnotes","Settings","Download","v"))
getSheets(my_book)## [1] "1960-1966" "1967-1974" "1975-2011"
## save this off as the file just like in the course
saveWorkbook(my_book, "data/urbanpop.xlsx")
## Do it again with no column names
writeWorksheet(my_book, data1, "1960-1966", header=F)
writeWorksheet(my_book, data2, "1967-1974", header=F)
writeWorksheet(my_book, data3, "1975-2011", header=F)
saveWorkbook(my_book, "data/urbanpop_nonames.xlsx")
dir('data/')## [1] "hotdogs.txt" "potatoes.csv" "potatoes.txt"
## [4] "swimming_pools.csv" "urban_pop_orig.xlsx" "urbanpop_nonames.xlsx"
## [7] "urbanpop.xlsx"
## I can repeat the process to create the urbanpop.xls file
## But I need to start from scratch and create the workbook as an xls file to begin with
## If I try to use the xlsx workbbok it will be corrupted and not load correctly later
my_book <- loadWorkbook('data/urbanpop.xls', create=T)
createSheet(my_book,"1960-1966")
createSheet(my_book,"1967-1974")
createSheet(my_book,"1975-2011")
writeWorksheet(my_book, data1, "1960-1966")
writeWorksheet(my_book, data2, "1967-1974")
writeWorksheet(my_book, data3, "1975-2011")
saveWorkbook(my_book, "data/urbanpop.xls")
dir('data/')## [1] "hotdogs.txt" "potatoes.csv" "potatoes.txt"
## [4] "swimming_pools.csv" "urban_pop_orig.xlsx" "urbanpop_nonames.xlsx"
## [7] "urbanpop.xls" "urbanpop.xlsx"
# the readxl package is already loaded
# Print out the names of both spreadsheets
excel_sheets("data/urbanpop.xlsx")## [1] "1960-1966" "1967-1974" "1975-2011"
# Read the sheets, one by one
pop_1 <- read_excel("data/urbanpop.xlsx", sheet = 1)
pop_2 <- read_excel("data/urbanpop.xlsx", sheet = 2)
pop_3 <- read_excel("data/urbanpop.xlsx", sheet = 3)
# Put pop_1, pop_2 and pop_3 in a list: pop_list
pop_list <- list(pop_1, pop_2, pop_3)
# Display the structure of pop_list
str(pop_list)## List of 3
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 273 obs. of 8 variables:
## ..$ country: chr [1:273] "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## ..$ 1960 : num [1:273] NA 769308 NA 494443 3293999 ...
## ..$ 1961 : num [1:273] NA 814923 NA 511803 3515148 ...
## ..$ 1962 : num [1:273] NA 858522 NA 529439 3739963 ...
## ..$ 1963 : num [1:273] NA 903914 NA 547377 3973289 ...
## ..$ 1964 : num [1:273] NA 951226 NA 565572 4220987 ...
## ..$ 1965 : num [1:273] NA 1000582 NA 583983 4488176 ...
## ..$ 1966 : num [1:273] NA 1058743 NA 602512 4649105 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 273 obs. of 9 variables:
## ..$ country: chr [1:273] "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## ..$ 1967 : num [1:273] NA 1119067 NA 621180 4826104 ...
## ..$ 1968 : num [1:273] NA 1182159 NA 639964 5017299 ...
## ..$ 1969 : num [1:273] NA 1248901 NA 658853 5219332 ...
## ..$ 1970 : num [1:273] NA 1319849 NA 677839 5429743 ...
## ..$ 1971 : num [1:273] NA 1409001 NA 698932 5619042 ...
## ..$ 1972 : num [1:273] NA 1502402 NA 720207 5815734 ...
## ..$ 1973 : num [1:273] NA 1598835 NA 741681 6020647 ...
## ..$ 1974 : num [1:273] NA 1696445 NA 763385 6235114 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 273 obs. of 38 variables:
## ..$ country: chr [1:273] "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## ..$ 1975 : num [1:273] NA 1793266 NA 785350 6460138 ...
## ..$ 1976 : num [1:273] NA 1905033 NA 807990 6774099 ...
## ..$ 1977 : num [1:273] NA 2021308 NA 830959 7102902 ...
## ..$ 1978 : num [1:273] NA 2142248 NA 854262 7447728 ...
## ..$ 1979 : num [1:273] NA 2268015 NA 877898 7810073 ...
## ..$ 1980 : num [1:273] NA 2398775 NA 901884 8190772 ...
## ..$ 1981 : num [1:273] NA 2493265 NA 927224 8637724 ...
## ..$ 1982 : num [1:273] NA 2590846 NA 952447 9105820 ...
## ..$ 1983 : num [1:273] NA 2691612 NA 978476 9591900 ...
## ..$ 1984 : num [1:273] NA 2795656 NA 1006613 10091289 ...
## ..$ 1985 : num [1:273] NA 2903078 NA 1037541 10600112 ...
## ..$ 1986 : num [1:273] NA 3006983 NA 1072365 11101757 ...
## ..$ 1987 : num [1:273] NA 3113957 NA 1109954 11609104 ...
## ..$ 1988 : num [1:273] NA 3224082 NA 1146633 12122941 ...
## ..$ 1989 : num [1:273] NA 3337444 NA 1177286 12645263 ...
## ..$ 1990 : num [1:273] NA 3454129 NA 1198293 13177079 ...
## ..$ 1991 : num [1:273] NA 3617842 NA 1215445 13708813 ...
## ..$ 1992 : num [1:273] NA 3788685 NA 1222544 14248297 ...
## ..$ 1993 : num [1:273] NA 3966956 NA 1222812 14789176 ...
## ..$ 1994 : num [1:273] NA 4152960 NA 1221364 15322651 ...
## ..$ 1995 : num [1:273] NA 4347018 NA 1222234 15842442 ...
## ..$ 1996 : num [1:273] NA 4531285 NA 1228760 16395553 ...
## ..$ 1997 : num [1:273] NA 4722603 NA 1238090 16935451 ...
## ..$ 1998 : num [1:273] NA 4921227 NA 1250366 17469200 ...
## ..$ 1999 : num [1:273] NA 5127421 NA 1265195 18007937 ...
## ..$ 2000 : num [1:273] NA 5341456 NA 1282223 18560597 ...
## ..$ 2001 : num [1:273] NA 5564492 NA 1315690 19198872 ...
## ..$ 2002 : num [1:273] NA 5795940 NA 1352278 19854835 ...
## ..$ 2003 : num [1:273] NA 6036100 NA 1391143 20529356 ...
## ..$ 2004 : num [1:273] NA 6285281 NA 1430918 21222198 ...
## ..$ 2005 : num [1:273] NA 6543804 NA 1470488 21932978 ...
## ..$ 2006 : num [1:273] NA 6812538 NA 1512255 22625052 ...
## ..$ 2007 : num [1:273] NA 7091245 NA 1553491 23335543 ...
## ..$ 2008 : num [1:273] NA 7380272 NA 1594351 24061749 ...
## ..$ 2009 : num [1:273] NA 7679982 NA 1635262 24799591 ...
## ..$ 2010 : num [1:273] NA 7990746 NA 1676545 25545622 ...
## ..$ 2011 : num [1:273] NA 8316976 NA 1716842 26216968 ...
# Read all Excel sheets with lapply(): pop_list
pop_list <- lapply(
excel_sheets("data/urbanpop.xlsx"),
read_excel,
path = "data/urbanpop.xlsx"
)
# Display the structure of pop_list
str(pop_list)## List of 3
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 273 obs. of 8 variables:
## ..$ country: chr [1:273] "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## ..$ 1960 : num [1:273] NA 769308 NA 494443 3293999 ...
## ..$ 1961 : num [1:273] NA 814923 NA 511803 3515148 ...
## ..$ 1962 : num [1:273] NA 858522 NA 529439 3739963 ...
## ..$ 1963 : num [1:273] NA 903914 NA 547377 3973289 ...
## ..$ 1964 : num [1:273] NA 951226 NA 565572 4220987 ...
## ..$ 1965 : num [1:273] NA 1000582 NA 583983 4488176 ...
## ..$ 1966 : num [1:273] NA 1058743 NA 602512 4649105 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 273 obs. of 9 variables:
## ..$ country: chr [1:273] "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## ..$ 1967 : num [1:273] NA 1119067 NA 621180 4826104 ...
## ..$ 1968 : num [1:273] NA 1182159 NA 639964 5017299 ...
## ..$ 1969 : num [1:273] NA 1248901 NA 658853 5219332 ...
## ..$ 1970 : num [1:273] NA 1319849 NA 677839 5429743 ...
## ..$ 1971 : num [1:273] NA 1409001 NA 698932 5619042 ...
## ..$ 1972 : num [1:273] NA 1502402 NA 720207 5815734 ...
## ..$ 1973 : num [1:273] NA 1598835 NA 741681 6020647 ...
## ..$ 1974 : num [1:273] NA 1696445 NA 763385 6235114 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 273 obs. of 38 variables:
## ..$ country: chr [1:273] "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## ..$ 1975 : num [1:273] NA 1793266 NA 785350 6460138 ...
## ..$ 1976 : num [1:273] NA 1905033 NA 807990 6774099 ...
## ..$ 1977 : num [1:273] NA 2021308 NA 830959 7102902 ...
## ..$ 1978 : num [1:273] NA 2142248 NA 854262 7447728 ...
## ..$ 1979 : num [1:273] NA 2268015 NA 877898 7810073 ...
## ..$ 1980 : num [1:273] NA 2398775 NA 901884 8190772 ...
## ..$ 1981 : num [1:273] NA 2493265 NA 927224 8637724 ...
## ..$ 1982 : num [1:273] NA 2590846 NA 952447 9105820 ...
## ..$ 1983 : num [1:273] NA 2691612 NA 978476 9591900 ...
## ..$ 1984 : num [1:273] NA 2795656 NA 1006613 10091289 ...
## ..$ 1985 : num [1:273] NA 2903078 NA 1037541 10600112 ...
## ..$ 1986 : num [1:273] NA 3006983 NA 1072365 11101757 ...
## ..$ 1987 : num [1:273] NA 3113957 NA 1109954 11609104 ...
## ..$ 1988 : num [1:273] NA 3224082 NA 1146633 12122941 ...
## ..$ 1989 : num [1:273] NA 3337444 NA 1177286 12645263 ...
## ..$ 1990 : num [1:273] NA 3454129 NA 1198293 13177079 ...
## ..$ 1991 : num [1:273] NA 3617842 NA 1215445 13708813 ...
## ..$ 1992 : num [1:273] NA 3788685 NA 1222544 14248297 ...
## ..$ 1993 : num [1:273] NA 3966956 NA 1222812 14789176 ...
## ..$ 1994 : num [1:273] NA 4152960 NA 1221364 15322651 ...
## ..$ 1995 : num [1:273] NA 4347018 NA 1222234 15842442 ...
## ..$ 1996 : num [1:273] NA 4531285 NA 1228760 16395553 ...
## ..$ 1997 : num [1:273] NA 4722603 NA 1238090 16935451 ...
## ..$ 1998 : num [1:273] NA 4921227 NA 1250366 17469200 ...
## ..$ 1999 : num [1:273] NA 5127421 NA 1265195 18007937 ...
## ..$ 2000 : num [1:273] NA 5341456 NA 1282223 18560597 ...
## ..$ 2001 : num [1:273] NA 5564492 NA 1315690 19198872 ...
## ..$ 2002 : num [1:273] NA 5795940 NA 1352278 19854835 ...
## ..$ 2003 : num [1:273] NA 6036100 NA 1391143 20529356 ...
## ..$ 2004 : num [1:273] NA 6285281 NA 1430918 21222198 ...
## ..$ 2005 : num [1:273] NA 6543804 NA 1470488 21932978 ...
## ..$ 2006 : num [1:273] NA 6812538 NA 1512255 22625052 ...
## ..$ 2007 : num [1:273] NA 7091245 NA 1553491 23335543 ...
## ..$ 2008 : num [1:273] NA 7380272 NA 1594351 24061749 ...
## ..$ 2009 : num [1:273] NA 7679982 NA 1635262 24799591 ...
## ..$ 2010 : num [1:273] NA 7990746 NA 1676545 25545622 ...
## ..$ 2011 : num [1:273] NA 8316976 NA 1716842 26216968 ...
# Import the the first Excel sheet of urbanpop_nonames.xlsx (R gives names): pop_a
pop_a <- read_excel("data/urbanpop_nonames.xlsx", sheet=1, col_names = F)
# Import the the first Excel sheet of urbanpop_nonames.xlsx (specify col_names): pop_b
cols <- c("country", paste0("year_", 1960:1966))
pop_b <- read_excel("data/urbanpop_nonames.xlsx", sheet=1, col_names=cols)
# Print the summary of pop_a
summary(pop_a)## X__1 X__2 X__3
## Length:274 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 90220 1st Qu.: 70644
## Mode :character Median : 561636 Median : 570159
## Mean : 4963530 Mean : 4991613
## 3rd Qu.: 3024372 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :75 NA's :65
## X__4 X__5 X__6
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
## NA's :65 NA's :65 NA's :65
## X__7 X__8
## Min. : 1281 Min. : 1349
## 1st Qu.: 88633 1st Qu.: 93638
## Median : 679109 Median : 735139
## Mean : 5637394 Mean : 5790281
## 3rd Qu.: 3317422 3rd Qu.: 3418036
## Max. :139663053 Max. :141962708
## NA's :65 NA's :65
# Print the summary of pop_b
summary(pop_b)## country year_1960 year_1961
## Length:274 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 90220 1st Qu.: 70644
## Mode :character Median : 561636 Median : 570159
## Mean : 4963530 Mean : 4991613
## 3rd Qu.: 3024372 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :75 NA's :65
## year_1962 year_1963 year_1964
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
## NA's :65 NA's :65 NA's :65
## year_1965 year_1966
## Min. : 1281 Min. : 1349
## 1st Qu.: 88633 1st Qu.: 93638
## Median : 679109 Median : 735139
## Mean : 5637394 Mean : 5790281
## 3rd Qu.: 3317422 3rd Qu.: 3418036
## Max. :139663053 Max. :141962708
## NA's :65 NA's :65
# Import the second sheet of urbanpop.xlsx, skipping the first 21 rows: urbanpop_sel
urbanpop_sel <- read_excel("data/urbanpop.xlsx", sheet=2, skip=21, col_names=F)
# Print out the first observation from urbanpop_sel
# I'm showing 5. We can see that we are on the countries starting with B
urbanpop_sel[1:5,]## # A tibble: 5 x 9
## X__1 X__2 X__3 X__4 X__5 X__6 X__7
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Belarus 3556447.88 3696854.38 3838002.9 3978504.00 4132163.53 4286800.60
## 2 Belgium 8950504.31 8999365.83 9038505.6 9061056.85 9089909.02 9137945.57
## 3 Belize 58790.24 59711.73 60492.2 61141.33 61839.91 62403.29
## 4 Benin 382022.12 411859.45 443013.1 475611.38 515819.53 557937.60
## 5 Bermuda 52000.00 53000.00 54000.0 55000.00 54600.00 54200.00
## # ... with 2 more variables: X__8 <dbl>, X__9 <dbl>
read.table and makes all of its arguments available# gdata library is already loaded
# Import the second sheet of urbanpop.xls: urban_pop
urban_pop <- read.xls("data/urbanpop.xls", sheet=2)
# Print the first 11 observations using head()
head(urban_pop,11)## country X1967 X1968 X1969 X1970
## 1 Abkhazia NA NA NA NA
## 2 Afghanistan 1119067.20 1182159.06 1248900.79 1319848.78
## 3 Akrotiri and Dhekelia NA NA NA NA
## 4 Albania 621179.85 639964.46 658853.12 677839.12
## 5 Algeria 4826104.22 5017298.60 5219331.87 5429743.08
## 6 American Samoa 17348.66 17995.51 18618.68 19206.39
## 7 Andorra 15439.62 16726.99 18088.32 19528.96
## 8 Angola 757496.32 798459.26 841261.96 886401.63
## 9 Anguilla NA NA NA NA
## 10 Antigua and Barbuda 22086.25 22149.39 22182.92 22180.87
## 11 Argentina 17753280.98 18124103.64 18510462.30 18918072.79
## X1971 X1972 X1973 X1974
## 1 NA NA NA NA
## 2 1409001.09 1502401.79 1598835.45 1696444.83
## 3 NA NA NA NA
## 4 698932.25 720206.57 741681.04 763385.45
## 5 5619041.53 5815734.49 6020647.35 6235114.38
## 6 19752.02 20262.67 20741.97 21194.38
## 7 20928.73 22405.84 23937.05 25481.98
## 8 955010.09 1027397.35 1103829.78 1184486.23
## 9 NA NA NA NA
## 10 22560.87 22907.76 23221.29 23502.92
## 11 19329718.16 19763078.00 20211424.85 20664728.90
# Column names for urban_pop
columns <- c("country", paste0("year_", 1967:1974))
# Finish the read.xls call
urban_pop <- read.xls("data/urbanpop.xls", sheet = 2,
skip = 50, header = F, stringsAsFactors = F,
col.names = columns)
# Print first 10 observation of urban_pop
head(urban_pop,10)## country year_1967 year_1968 year_1969 year_1970 year_1971
## 1 Congo, Dem. Rep. 5161472.5 5475208.0 5802068.8 6140903.7 6282833.7
## 2 Congo, Rep. 450669.8 473335.2 497210.7 522406.6 549789.4
## 3 Cook Is NA NA NA NA NA
## 4 Costa Rica 621785.8 649916.4 678253.9 706798.6 733545.9
## 5 Cote d'Ivoire 1243350.1 1330719.3 1424438.5 1525425.2 1638738.0
## 6 Croatia 1608233.0 1663050.6 1717607.0 1773045.6 1826421.7
## 7 Cuba 4927341.2 5032013.6 5137260.2 5244278.6 5407254.3
## 8 Cyprus 231929.7 237831.4 243983.3 250164.5 261213.2
## 9 Czech Republic 6204409.9 6266304.5 6326369.0 6348794.9 6437055.2
## 10 Czechoslovakia NA NA NA NA NA
## year_1972 year_1973 year_1974
## 1 6425372.3 6570538.5 6721175.0
## 2 578639.8 608850.4 640236.4
## 3 NA NA NA
## 4 760430.8 787918.3 816658.8
## 5 1760508.3 1891241.0 2031395.3
## 6 1879427.8 1932436.1 1984975.8
## 7 5572975.4 5738230.5 5898512.3
## 8 272408.0 283774.9 295379.8
## 9 6572632.3 6718465.5 6873458.2
## 10 NA NA NA
# Add code to import data from all three sheets in urbanpop.xls
path <- "data/urbanpop.xls"
urban_sheet1 <- read.xls(path, sheet = 1, stringsAsFactors = FALSE)
urban_sheet2 <- read.xls(path, sheet = 2, stringsAsFactors = FALSE)
urban_sheet3 <- read.xls(path, sheet = 3, stringsAsFactors = FALSE)
# Extend the cbind() call to include urban_sheet3: urban
urban <- cbind(urban_sheet1, urban_sheet2[-1], urban_sheet3[-1])
# Remove all rows with NAs from urban: urban_clean
urban_clean <- na.omit(urban)
# Print out a summary of urban_clean
# Shortening the output to the first 5 columns
summary(urban_clean[, 1:5])## country X1960 X1961
## Length:197 Min. : 3378 Min. : 3433
## Class :character 1st Qu.: 87735 1st Qu.: 92905
## Mode :character Median : 599714 Median : 630788
## Mean : 5012388 Mean : 5282488
## 3rd Qu.: 3130085 3rd Qu.: 3155370
## Max. :126469700 Max. :129268133
## X1962 X1963
## Min. : 3481 Min. : 3532
## 1st Qu.: 98331 1st Qu.: 104988
## Median : 659464 Median : 704989
## Mean : 5440972 Mean : 5612312
## 3rd Qu.: 3250211 3rd Qu.: 3416490
## Max. :131974143 Max. :134599886
# The XLConnect package is already loaded
# Build connection to urbanpop.xlsx: my_book
my_book <- loadWorkbook('data/urbanpop.xlsx')
# Print out the class of my_book
class(my_book)## [1] "workbook"
## attr(,"package")
## [1] "XLConnect"
# Build connection to urbanpop.xlsx
my_book <- loadWorkbook("data/urbanpop.xlsx")
# List the sheets in my_book
getSheets(my_book)## [1] "1960-1966" "1967-1974" "1975-2011"
# Import the second sheet in my_book
head(readWorksheet(my_book, sheet=2),10)## country X1967 X1968 X1969 X1970
## 1 Abkhazia NA NA NA NA
## 2 Afghanistan 1119067.20 1182159.06 1248900.79 1319848.78
## 3 Akrotiri and Dhekelia NA NA NA NA
## 4 Albania 621179.85 639964.46 658853.12 677839.12
## 5 Algeria 4826104.22 5017298.60 5219331.87 5429743.08
## 6 American Samoa 17348.66 17995.51 18618.68 19206.39
## 7 Andorra 15439.62 16726.99 18088.32 19528.96
## 8 Angola 757496.32 798459.26 841261.96 886401.63
## 9 Anguilla NA NA NA NA
## 10 Antigua and Barbuda 22086.25 22149.39 22182.92 22180.87
## X1971 X1972 X1973 X1974
## 1 NA NA NA NA
## 2 1409001.09 1502401.79 1598835.45 1696444.83
## 3 NA NA NA NA
## 4 698932.25 720206.57 741681.04 763385.45
## 5 5619041.53 5815734.49 6020647.35 6235114.38
## 6 19752.02 20262.67 20741.97 21194.38
## 7 20928.73 22405.84 23937.05 25481.98
## 8 955010.09 1027397.35 1103829.78 1184486.23
## 9 NA NA NA NA
## 10 22560.87 22907.76 23221.29 23502.92
# Build connection to urbanpop.xlsx
my_book <- loadWorkbook("data/urbanpop.xlsx")
# Import columns 3, 4, and 5 from second sheet in my_book: urbanpop_sel
urbanpop_sel <- readWorksheet(my_book, sheet = 2, startCol=3, endCol=5)
# Import first column from second sheet in my_book: countries
countries <- readWorksheet(my_book, sheet = 2, startCol=1, endCol=1)
## For some reason these have different number of rows
str(urbanpop_sel)## 'data.frame': 260 obs. of 3 variables:
## $ X1968: num NA 1182159 NA 639964 5017299 ...
## $ X1969: num NA 1248901 NA 658853 5219332 ...
## $ X1970: num NA 1319849 NA 677839 5429743 ...
str(countries)## 'data.frame': 273 obs. of 1 variable:
## $ country: chr "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## Welp, lets add some rows so we can cbind
extra_rows <- rep(NA, nrow(countries) - nrow(urbanpop_sel))
df_extra_rows <- data.frame(x1 = extra_rows,
x2 = extra_rows,
x3 = extra_rows)
colnames(df_extra_rows) <- colnames(urbanpop_sel)
urbanpop_sel_2 <- rbind(urbanpop_sel,df_extra_rows)
str(urbanpop_sel_2)## 'data.frame': 273 obs. of 3 variables:
## $ X1968: num NA 1182159 NA 639964 5017299 ...
## $ X1969: num NA 1248901 NA 658853 5219332 ...
## $ X1970: num NA 1319849 NA 677839 5429743 ...
# cbind() urbanpop_sel and countries together: selection
selection <- cbind(countries, urbanpop_sel_2)
str(selection)## 'data.frame': 273 obs. of 4 variables:
## $ country: chr "Abkhazia" "Afghanistan" "Akrotiri and Dhekelia" "Albania" ...
## $ X1968 : num NA 1182159 NA 639964 5017299 ...
## $ X1969 : num NA 1248901 NA 658853 5219332 ...
## $ X1970 : num NA 1319849 NA 677839 5429743 ...
# Build connection to urbanpop.xlsx
my_book <- loadWorkbook("data/urbanpop.xlsx")
# Add a worksheet to my_book, named "data_summary"
createSheet(my_book,"data_summary")
# Use getSheets() on my_book
getSheets(my_book)## [1] "1960-1966" "1967-1974" "1975-2011" "data_summary"
# Build connection to urbanpop.xlsx
my_book <- loadWorkbook("data/urbanpop.xlsx")
# Add a worksheet to my_book, named "data_summary"
createSheet(my_book, "data_summary")
# Create data frame: summ
sheets <- getSheets(my_book)[1:3]
dims <- sapply(sheets, function(x) dim(readWorksheet(my_book, sheet = x)), USE.NAMES = FALSE)
summ <- data.frame(sheets = sheets,
nrows = dims[1, ],
ncols = dims[2, ])
# Add data in summ to "data_summary" sheet
writeWorksheet(my_book, summ, "data_summary")
# Save workbook as summary.xlsx
saveWorkbook(my_book, "data/summary.xlsx")
dir('data/')## [1] "hotdogs.txt" "potatoes.csv" "potatoes.txt"
## [4] "summary.xlsx" "swimming_pools.csv" "urban_pop_orig.xlsx"
## [7] "urbanpop_nonames.xlsx" "urbanpop.xls" "urbanpop.xlsx"
# Rename "data_summary" sheet to "summary"
renameSheet(my_book, sheet="data_summary", newName="summary")
# Print out sheets of my_book
getSheets(my_book)## [1] "1960-1966" "1967-1974" "1975-2011" "summary"
# Save workbook to "renamed.xlsx"
saveWorkbook(my_book, "data/renamed.xlsx")
dir('data/')## [1] "hotdogs.txt" "potatoes.csv"
## [3] "potatoes.txt" "renamed.xlsx"
## [5] "summary.xlsx" "swimming_pools.csv"
## [7] "urban_pop_orig.xlsx" "urbanpop_nonames.xlsx"
## [9] "urbanpop.xls" "urbanpop.xlsx"
# Build connection to renamed.xlsx: my_book
my_book <- loadWorkbook("data/renamed.xlsx")
# Remove the fourth sheet
removeSheet(my_book, sheet="summary")
# Save workbook to "clean.xlsx"
saveWorkbook(my_book, "data/clean.xlsx")
dir('data/')## [1] "clean.xlsx" "hotdogs.txt"
## [3] "potatoes.csv" "potatoes.txt"
## [5] "renamed.xlsx" "summary.xlsx"
## [7] "swimming_pools.csv" "urban_pop_orig.xlsx"
## [9] "urbanpop_nonames.xlsx" "urbanpop.xls"
## [11] "urbanpop.xlsx"