3 - importing-excel-data

# -----------------------------------
# Get the file used in exercise from internet

url_urban_pop <- "http://docs.google.com/spreadsheet/pub?key=pyj6tScZqmEfH89V6UQhpZA&output=xlsx"
download.file(url_urban_pop, "data/urban_pop_orig.xlsx")
# this file does not open
# went to the url
# downloaded as data/urban_pop_orig.xlsx
# this needs formatting to get in state for the course

# alternative link, also did not work with the download function:
# http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/urbanpop.xlsx
# go to url
# save file as: "data/urbanpop.xlsx"

dir()
##  [1] "1 - importing-data-from-flat-files-with-utils.R"   
##  [2] "1 - importing-data-from-flat-files-with-utils.Rmd" 
##  [3] "1_-_importing-data-from-flat-files-with-utils.html"
##  [4] "2 - readr-datatable.R"                             
##  [5] "2 - readr-datatable.Rmd"                           
##  [6] "2_-_readr-datatable.html"                          
##  [7] "3 - importing-excel-data.R"                        
##  [8] "3 - importing-excel-data.Rmd"                      
##  [9] "3_-_importing-excel-data.Rmd"                      
## [10] "4 - reproducible-excel-work-with-xlconnect.R"      
## [11] "data"                                              
## [12] "hotdogs.txt"                                       
## [13] "importing_data_in_r_1_ch1.pdf"                     
## [14] "importing_data_in_r_1_ch2.pdf"                     
## [15] "importing_data_in_r_1_ch3.pdf"                     
## [16] "importing_data_in_r_1_ch4.pdf"                     
## [17] "potatoes.csv"                                      
## [18] "potatoes.txt"                                      
## [19] "rsconnect"                                         
## [20] "swimming_pools.csv"
dir("data")
## [1] "hotdogs.txt"           "urban_pop_orig.xlsx"   "urbanpop.xls"         
## [4] "urbanpop.xlsx"         "urbanpop_nonames.xlsx"

# -----------------------------------

# http://www.rpubs.com/williamsurles/290389
# Tries to re create urbanpop.xlsx from urban_pop_orig.xlsx

# # https://stackoverflow.com/questions/28133360/rjava-is-not-picking-up-the-correct-java-version/32962637#32962637
# install.packages("rJava",,"http://cran.r-project.org",type="source")
# library(rJava)
# .jinit()
# .jcall("java/lang/System", "S", "getProperty", "java.runtime.version")

# install.packages('XLConnect')
# library(XLConnect)
# my_book <- loadWorkbook('data/urban_pop_orig.xlsx')
# List the sheets in my_book
# getSheets(my_book)

# did not work for me :(

# -----------------------------------

# urbanpop.xlsx

# urban population metrics 
# for practically all countries in the world
# throughout time (Source: Gapminder) www.gapminder.org
# three sheets for three different time periods
# In each sheet, the first row contains the column names.

# -----------------------------------
# Load the readxl package
library(readxl)
# https://cran.r-project.org/web/packages/readxl/readxl.pdf

# Print out the names of both spreadsheets
# excel_sheets("urbanpop.xlsx")
excel_sheets('data/urbanpop.xlsx')
## [1] "1960-1966" "1967-1974" "1975-2011"
path <- "data/urbanpop.xlsx"
excel_sheets(path = path)
## [1] "1960-1966" "1967-1974" "1975-2011"

# -----------------------------------

# Read the sheets, one by one
pop_1 <- read_excel("data/urbanpop.xlsx", sheet = 1)
pop_2 <- read_excel("data/urbanpop.xlsx", sheet = "1967-1974")
pop_3 <- read_excel("data/urbanpop.xlsx", sheet = 3)

# Put pop_1, pop_2 and pop_3 in a list: pop_list
pop_list <- list(pop_1, pop_2, pop_3)

# Display the structure of pop_list
str(pop_list)
## List of 3
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    209 obs. of  8 variables:
##   ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##   ..$ 1960   : num [1:209] 769308 494443 3293999 NA NA ...
##   ..$ 1961   : num [1:209] 814923 511803 3515148 13660 8724 ...
##   ..$ 1962   : num [1:209] 858522 529439 3739963 14166 9700 ...
##   ..$ 1963   : num [1:209] 903914 547377 3973289 14759 10748 ...
##   ..$ 1964   : num [1:209] 951226 565572 4220987 15396 11866 ...
##   ..$ 1965   : num [1:209] 1000582 583983 4488176 16045 13053 ...
##   ..$ 1966   : num [1:209] 1058743 602512 4649105 16693 14217 ...
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    209 obs. of  9 variables:
##   ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##   ..$ 1967   : num [1:209] 1119067 621180 4826104 17349 15440 ...
##   ..$ 1968   : num [1:209] 1182159 639964 5017299 17996 16727 ...
##   ..$ 1969   : num [1:209] 1248901 658853 5219332 18619 18088 ...
##   ..$ 1970   : num [1:209] 1319849 677839 5429743 19206 19529 ...
##   ..$ 1971   : num [1:209] 1409001 698932 5619042 19752 20929 ...
##   ..$ 1972   : num [1:209] 1502402 720207 5815734 20263 22406 ...
##   ..$ 1973   : num [1:209] 1598835 741681 6020647 20742 23937 ...
##   ..$ 1974   : num [1:209] 1696445 763385 6235114 21194 25482 ...
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    209 obs. of  38 variables:
##   ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##   ..$ 1975   : num [1:209] 1793266 785350 6460138 21632 27019 ...
##   ..$ 1976   : num [1:209] 1905033 807990 6774099 22047 28366 ...
##   ..$ 1977   : num [1:209] 2021308 830959 7102902 22452 29677 ...
##   ..$ 1978   : num [1:209] 2142248 854262 7447728 22899 31037 ...
##   ..$ 1979   : num [1:209] 2268015 877898 7810073 23457 32572 ...
##   ..$ 1980   : num [1:209] 2398775 901884 8190772 24177 34366 ...
##   ..$ 1981   : num [1:209] 2493265 927224 8637724 25173 36356 ...
##   ..$ 1982   : num [1:209] 2590846 952447 9105820 26342 38618 ...
##   ..$ 1983   : num [1:209] 2691612 978476 9591900 27655 40983 ...
##   ..$ 1984   : num [1:209] 2795656 1006613 10091289 29062 43207 ...
##   ..$ 1985   : num [1:209] 2903078 1037541 10600112 30524 45119 ...
##   ..$ 1986   : num [1:209] 3006983 1072365 11101757 32014 46254 ...
##   ..$ 1987   : num [1:209] 3113957 1109954 11609104 33548 47019 ...
##   ..$ 1988   : num [1:209] 3224082 1146633 12122941 35095 47669 ...
##   ..$ 1989   : num [1:209] 3337444 1177286 12645263 36618 48577 ...
##   ..$ 1990   : num [1:209] 3454129 1198293 13177079 38088 49982 ...
##   ..$ 1991   : num [1:209] 3617842 1215445 13708813 39600 51972 ...
##   ..$ 1992   : num [1:209] 3788685 1222544 14248297 41049 54469 ...
##   ..$ 1993   : num [1:209] 3966956 1222812 14789176 42443 57079 ...
##   ..$ 1994   : num [1:209] 4152960 1221364 15322651 43798 59243 ...
##   ..$ 1995   : num [1:209] 4347018 1222234 15842442 45129 60598 ...
##   ..$ 1996   : num [1:209] 4531285 1228760 16395553 46343 60927 ...
##   ..$ 1997   : num [1:209] 4722603 1238090 16935451 47527 60462 ...
##   ..$ 1998   : num [1:209] 4921227 1250366 17469200 48705 59685 ...
##   ..$ 1999   : num [1:209] 5127421 1265195 18007937 49906 59281 ...
##   ..$ 2000   : num [1:209] 5341456 1282223 18560597 51151 59719 ...
##   ..$ 2001   : num [1:209] 5564492 1315690 19198872 52341 61062 ...
##   ..$ 2002   : num [1:209] 5795940 1352278 19854835 53583 63212 ...
##   ..$ 2003   : num [1:209] 6036100 1391143 20529356 54864 65802 ...
##   ..$ 2004   : num [1:209] 6285281 1430918 21222198 56166 68301 ...
##   ..$ 2005   : num [1:209] 6543804 1470488 21932978 57474 70329 ...
##   ..$ 2006   : num [1:209] 6812538 1512255 22625052 58679 71726 ...
##   ..$ 2007   : num [1:209] 7091245 1553491 23335543 59894 72684 ...
##   ..$ 2008   : num [1:209] 7380272 1594351 24061749 61118 73335 ...
##   ..$ 2009   : num [1:209] 7679982 1635262 24799591 62357 73897 ...
##   ..$ 2010   : num [1:209] 7990746 1676545 25545622 63616 74525 ...
##   ..$ 2011   : num [1:209] 8316976 1716842 26216968 64817 75207 ...

# -----------------------------------

# loading in every sheet manually and then merging them in a list can be quite tedious

# Read all Excel sheets with lapply()
pop_list <- lapply(excel_sheets("data/urbanpop.xlsx"), read_excel, path = "data/urbanpop.xlsx")
# each sheet is loaded in one after the other

# Display the structure of pop_list
str(pop_list) # list of data frames, each data frame representing one of the sheets
## List of 3
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    209 obs. of  8 variables:
##   ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##   ..$ 1960   : num [1:209] 769308 494443 3293999 NA NA ...
##   ..$ 1961   : num [1:209] 814923 511803 3515148 13660 8724 ...
##   ..$ 1962   : num [1:209] 858522 529439 3739963 14166 9700 ...
##   ..$ 1963   : num [1:209] 903914 547377 3973289 14759 10748 ...
##   ..$ 1964   : num [1:209] 951226 565572 4220987 15396 11866 ...
##   ..$ 1965   : num [1:209] 1000582 583983 4488176 16045 13053 ...
##   ..$ 1966   : num [1:209] 1058743 602512 4649105 16693 14217 ...
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    209 obs. of  9 variables:
##   ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##   ..$ 1967   : num [1:209] 1119067 621180 4826104 17349 15440 ...
##   ..$ 1968   : num [1:209] 1182159 639964 5017299 17996 16727 ...
##   ..$ 1969   : num [1:209] 1248901 658853 5219332 18619 18088 ...
##   ..$ 1970   : num [1:209] 1319849 677839 5429743 19206 19529 ...
##   ..$ 1971   : num [1:209] 1409001 698932 5619042 19752 20929 ...
##   ..$ 1972   : num [1:209] 1502402 720207 5815734 20263 22406 ...
##   ..$ 1973   : num [1:209] 1598835 741681 6020647 20742 23937 ...
##   ..$ 1974   : num [1:209] 1696445 763385 6235114 21194 25482 ...
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    209 obs. of  38 variables:
##   ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##   ..$ 1975   : num [1:209] 1793266 785350 6460138 21632 27019 ...
##   ..$ 1976   : num [1:209] 1905033 807990 6774099 22047 28366 ...
##   ..$ 1977   : num [1:209] 2021308 830959 7102902 22452 29677 ...
##   ..$ 1978   : num [1:209] 2142248 854262 7447728 22899 31037 ...
##   ..$ 1979   : num [1:209] 2268015 877898 7810073 23457 32572 ...
##   ..$ 1980   : num [1:209] 2398775 901884 8190772 24177 34366 ...
##   ..$ 1981   : num [1:209] 2493265 927224 8637724 25173 36356 ...
##   ..$ 1982   : num [1:209] 2590846 952447 9105820 26342 38618 ...
##   ..$ 1983   : num [1:209] 2691612 978476 9591900 27655 40983 ...
##   ..$ 1984   : num [1:209] 2795656 1006613 10091289 29062 43207 ...
##   ..$ 1985   : num [1:209] 2903078 1037541 10600112 30524 45119 ...
##   ..$ 1986   : num [1:209] 3006983 1072365 11101757 32014 46254 ...
##   ..$ 1987   : num [1:209] 3113957 1109954 11609104 33548 47019 ...
##   ..$ 1988   : num [1:209] 3224082 1146633 12122941 35095 47669 ...
##   ..$ 1989   : num [1:209] 3337444 1177286 12645263 36618 48577 ...
##   ..$ 1990   : num [1:209] 3454129 1198293 13177079 38088 49982 ...
##   ..$ 1991   : num [1:209] 3617842 1215445 13708813 39600 51972 ...
##   ..$ 1992   : num [1:209] 3788685 1222544 14248297 41049 54469 ...
##   ..$ 1993   : num [1:209] 3966956 1222812 14789176 42443 57079 ...
##   ..$ 1994   : num [1:209] 4152960 1221364 15322651 43798 59243 ...
##   ..$ 1995   : num [1:209] 4347018 1222234 15842442 45129 60598 ...
##   ..$ 1996   : num [1:209] 4531285 1228760 16395553 46343 60927 ...
##   ..$ 1997   : num [1:209] 4722603 1238090 16935451 47527 60462 ...
##   ..$ 1998   : num [1:209] 4921227 1250366 17469200 48705 59685 ...
##   ..$ 1999   : num [1:209] 5127421 1265195 18007937 49906 59281 ...
##   ..$ 2000   : num [1:209] 5341456 1282223 18560597 51151 59719 ...
##   ..$ 2001   : num [1:209] 5564492 1315690 19198872 52341 61062 ...
##   ..$ 2002   : num [1:209] 5795940 1352278 19854835 53583 63212 ...
##   ..$ 2003   : num [1:209] 6036100 1391143 20529356 54864 65802 ...
##   ..$ 2004   : num [1:209] 6285281 1430918 21222198 56166 68301 ...
##   ..$ 2005   : num [1:209] 6543804 1470488 21932978 57474 70329 ...
##   ..$ 2006   : num [1:209] 6812538 1512255 22625052 58679 71726 ...
##   ..$ 2007   : num [1:209] 7091245 1553491 23335543 59894 72684 ...
##   ..$ 2008   : num [1:209] 7380272 1594351 24061749 61118 73335 ...
##   ..$ 2009   : num [1:209] 7679982 1635262 24799591 62357 73897 ...
##   ..$ 2010   : num [1:209] 7990746 1676545 25545622 63616 74525 ...
##   ..$ 2011   : num [1:209] 8316976 1716842 26216968 64817 75207 ...

# -----------------------------------

# download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/urbanpop_nonames.xlsx",
#               "data/urbanpop_nonames.xlsx")
# had to go the url and save this file

# Import the the first Excel sheet of urbanpop_nonames.xlsx
# col_names argument of read_excel()
# default is TRUE
# R will choose column names for you
pop_a <- read_excel("data/urbanpop_nonames.xlsx", sheet = 1, col_names = FALSE)
head(pop_a)
## # A tibble: 6 x 8
##   X__1              X__2     X__3     X__4     X__5     X__6   X__7   X__8
##   <chr>            <dbl>    <dbl>    <dbl>    <dbl>    <dbl>  <dbl>  <dbl>
## 1 Afghanistan     769308  814923.  858522.  903914.  951226. 1.00e6 1.06e6
## 2 Albania         494443  511803.  529439.  547377.  565572. 5.84e5 6.03e5
## 3 Algeria        3293999 3515148. 3739963. 3973289. 4220987. 4.49e6 4.65e6
## 4 American Samoa      NA   13660.   14166.   14759.   15396. 1.60e4 1.67e4
## 5 Andorra             NA    8724.    9700.   10748.   11866. 1.31e4 1.42e4
## 6 Angola          521205  548265.  579695.  612087.  645262. 6.79e5 7.18e5

# Import the the first Excel sheet of urbanpop_nonames.xlsx (specify col_names)
cols <- c("country", paste0("year_", 1960:1966))
# (can also choose to set col_names to a character vector with names for each column)
pop_b <- read_excel("data/urbanpop_nonames.xlsx", sheet = 1, col_names = cols)
head(pop_b)
## # A tibble: 6 x 8
##   country      year_1960 year_1961 year_1962 year_1963 year_1964 year_1965
##   <chr>            <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 Afghanistan     769308   814923.   858522.   903914.   951226.  1000582.
## 2 Albania         494443   511803.   529439.   547377.   565572.   583983.
## 3 Algeria        3293999  3515148.  3739963.  3973289.  4220987.  4488176.
## 4 American Sa~        NA    13660.    14166.    14759.    15396.    16045.
## 5 Andorra             NA     8724.     9700.    10748.    11866.    13053.
## 6 Angola          521205   548265.   579695.   612087.   645262.   679109.
## # ... with 1 more variable: year_1966 <dbl>

# Print the summary of pop_a
summary(pop_a)
##      X__1                X__2                X__3          
##  Length:209         Min.   :     3378   Min.   :     1028  
##  Class :character   1st Qu.:    88978   1st Qu.:    70644  
##  Mode  :character   Median :   580675   Median :   570159  
##                     Mean   :  4988124   Mean   :  4991613  
##                     3rd Qu.:  3077228   3rd Qu.:  2807280  
##                     Max.   :126469700   Max.   :129268133  
##                     NA's   :11                             
##       X__4                X__5                X__6          
##  Min.   :     1090   Min.   :     1154   Min.   :     1218  
##  1st Qu.:    74974   1st Qu.:    81870   1st Qu.:    84953  
##  Median :   593968   Median :   619331   Median :   645262  
##  Mean   :  5141592   Mean   :  5303711   Mean   :  5468966  
##  3rd Qu.:  2948396   3rd Qu.:  3148941   3rd Qu.:  3296444  
##  Max.   :131974143   Max.   :134599886   Max.   :137205240  
##                                                             
##       X__7                X__8          
##  Min.   :     1281   Min.   :     1349  
##  1st Qu.:    88633   1st Qu.:    93638  
##  Median :   679109   Median :   735139  
##  Mean   :  5637394   Mean   :  5790281  
##  3rd Qu.:  3317422   3rd Qu.:  3418036  
##  Max.   :139663053   Max.   :141962708  
## 

# Print the summary of pop_b
summary(pop_b)
##    country            year_1960           year_1961        
##  Length:209         Min.   :     3378   Min.   :     1028  
##  Class :character   1st Qu.:    88978   1st Qu.:    70644  
##  Mode  :character   Median :   580675   Median :   570159  
##                     Mean   :  4988124   Mean   :  4991613  
##                     3rd Qu.:  3077228   3rd Qu.:  2807280  
##                     Max.   :126469700   Max.   :129268133  
##                     NA's   :11                             
##    year_1962           year_1963           year_1964        
##  Min.   :     1090   Min.   :     1154   Min.   :     1218  
##  1st Qu.:    74974   1st Qu.:    81870   1st Qu.:    84953  
##  Median :   593968   Median :   619331   Median :   645262  
##  Mean   :  5141592   Mean   :  5303711   Mean   :  5468966  
##  3rd Qu.:  2948396   3rd Qu.:  3148941   3rd Qu.:  3296444  
##  Max.   :131974143   Max.   :134599886   Max.   :137205240  
##                                                             
##    year_1965           year_1966        
##  Min.   :     1281   Min.   :     1349  
##  1st Qu.:    88633   1st Qu.:    93638  
##  Median :   679109   Median :   735139  
##  Mean   :  5637394   Mean   :  5790281  
##  3rd Qu.:  3317422   3rd Qu.:  3418036  
##  Max.   :139663053   Max.   :141962708  
## 

# -----------------------------------

# skip
# ignore a specified number of rows inside the Excel sheets 

# Import the second sheet of urbanpop.xlsx
# skipping the first 21 rows
urbanpop_sel <- read_excel("data/urbanpop.xlsx", sheet = 2, skip = 21, col_names = FALSE)

# Print out the first observation from urbanpop_sel
str(urbanpop_sel)
## Classes 'tbl_df', 'tbl' and 'data.frame':    189 obs. of  9 variables:
##  $ X__1: chr  "Benin" "Bermuda" "Bhutan" "Bolivia" ...
##  $ X__2: num  382022 52000 14379 1527065 851692 ...
##  $ X__3: num  411859 53000 15617 1575177 890270 ...
##  $ X__4: num  443013 54000 16946 1625173 929450 ...
##  $ X__5: num  475611 55000 18381 1677184 969549 ...
##  $ X__6: num  515820 54600 20173 1731437 1008630 ...
##  $ X__7: num  557938 54200 22100 1787719 1048738 ...
##  $ X__8: num  602093 53800 24160 1845894 1089648 ...
##  $ X__9: num  648410 53400 26343 1905749 1130966 ...

# urbanpop_sel[1,1]
urbanpop_sel[1,] # first observation - i.e. first row
## # A tibble: 1 x 9
##   X__1     X__2    X__3    X__4    X__5    X__6    X__7    X__8    X__9
##   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 Benin 382022. 411859. 443013. 475611. 515820. 557938. 602093. 648410.

# -----------------------------------

# Load the gdata package
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
# written in pearl
# wraps read.table and makes all of its arguments available
# Its pretty slow. It converts everything to csv then reads it in

# download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/urbanpop.xls",
#               "data/urbanpop.xls")

# go to link above
# download
# open
# save

# read.xls does not work due to xls2sep & perl issues?



# Import the second sheet of urbanpop.xls
urban_pop <- read.xls("data/urbanpop.xls", "1967-1974")

# Print the first 11 observations using head()
head(urban_pop, 11)
##                country       X1967       X1968       X1969       X1970
## 1          Afghanistan  1119067.20  1182159.06  1248900.79  1319848.78
## 2              Albania   621179.85   639964.46   658853.12   677839.12
## 3              Algeria  4826104.22  5017298.60  5219331.87  5429743.08
## 4       American Samoa    17348.66    17995.51    18618.68    19206.39
## 5              Andorra    15439.62    16726.99    18088.32    19528.96
## 6               Angola   757496.32   798459.26   841261.96   886401.63
## 7  Antigua and Barbuda    22086.25    22149.39    22182.92    22180.87
## 8            Argentina 17753280.98 18124103.64 18510462.30 18918072.79
## 9              Armenia  1337032.09  1392892.13  1449641.49  1507619.77
## 10               Aruba    29414.72    29576.09    29737.87    29901.57
## 11           Australia  9934404.03 10153969.77 10412390.67 10664093.55
##          X1971       X1972       X1973       X1974
## 1   1409001.09  1502401.79  1598835.45  1696444.83
## 2    698932.25   720206.57   741681.04   763385.45
## 3   5619041.53  5815734.49  6020647.35  6235114.38
## 4     19752.02    20262.67    20741.97    21194.38
## 5     20928.73    22405.84    23937.05    25481.98
## 6    955010.09  1027397.35  1103829.78  1184486.23
## 7     22560.87    22907.76    23221.29    23502.92
## 8  19329718.16 19763078.00 20211424.85 20664728.90
## 9   1564367.60  1622103.53  1680497.75  1739063.02
## 10    30081.36    30279.76    30467.42    30602.87
## 11 11047706.39 11269945.50 11461120.68 11772934.25

# -----------------------------------

# read.xls() wraps around read.table()

# converting the Excel file to a .csv file using a Perl script, 
# and then reading that .csv file
# with the read.csv() function 
# that is loaded by default in R, through the utils package.

# Column names for urban_pop
columns <- c("country", paste0("year_", 1967:1974))

# Finish the read.xls call
urban_pop <- read.xls("data/urbanpop.xls", sheet = 2,
                      skip = 50, header = FALSE, stringsAsFactors = FALSE,
                      col.names = columns)

# Print first 10 observation of urban_pop
head(urban_pop,10)
##               country   year_1967   year_1968   year_1969   year_1970
## 1              Cyprus   231929.74   237831.38   243983.34   250164.52
## 2      Czech Republic  6204409.91  6266304.50  6326368.97  6348794.89
## 3             Denmark  3777552.62  3826785.08  3874313.99  3930042.97
## 4            Djibouti    77788.04    84694.35    92045.77    99845.22
## 5            Dominica    27550.36    29527.32    31475.62    33328.25
## 6  Dominican Republic  1535485.43  1625455.76  1718315.40  1814060.00
## 7             Ecuador  2059355.12  2151395.14  2246890.79  2345864.41
## 8               Egypt 13798171.00 14248342.19 14703858.22 15162858.52
## 9         El Salvador  1345528.98  1387218.33  1429378.98  1472181.26
## 10  Equatorial Guinea    75364.50    77295.03    78445.74    78411.07
##      year_1971   year_1972   year_1973   year_1974
## 1    261213.21   272407.99   283774.90   295379.83
## 2   6437055.17  6572632.32  6718465.53  6873458.18
## 3   3981360.12  4028247.92  4076867.28  4120201.43
## 4    107799.69   116098.23   125391.58   136606.25
## 5     34761.52    36049.99    37260.05    38501.47
## 6   1915590.38  2020157.01  2127714.45  2238203.87
## 7   2453817.78  2565644.81  2681525.25  2801692.62
## 8  15603661.36 16047814.69 16498633.27 16960827.93
## 9   1527985.34  1584758.18  1642098.95  1699470.87
## 10    77055.29    74596.06    71438.96    68179.26


# import data from all three sheets in urbanpop.xls
path <- "data/urbanpop.xls"
urban_sheet1 <- read.xls(path, sheet = 1, stringsAsFactors = FALSE)
urban_sheet2 <- read.xls(path, sheet = 2, stringsAsFactors = FALSE)
urban_sheet3 <- read.xls(path, sheet = 3, stringsAsFactors = FALSE)


# Extend the cbind() call to include urban_sheet3
# first column of urban_sheet2 and urban_sheet3 are removed,
# so you don't have duplicate columns.
urban <- cbind(urban_sheet1, urban_sheet2[-1], urban_sheet3[-1])

# Remove all rows with NAs from urban
summary(urban)
##    country              X1960               X1961          
##  Length:209         Min.   :     3378   Min.   :     1028  
##  Class :character   1st Qu.:    88978   1st Qu.:    70644  
##  Mode  :character   Median :   580675   Median :   570159  
##                     Mean   :  4988124   Mean   :  4991613  
##                     3rd Qu.:  3077228   3rd Qu.:  2807280  
##                     Max.   :126469700   Max.   :129268133  
##                     NA's   :11                             
##      X1962               X1963               X1964          
##  Min.   :     1090   Min.   :     1154   Min.   :     1218  
##  1st Qu.:    74974   1st Qu.:    81870   1st Qu.:    84953  
##  Median :   593968   Median :   619331   Median :   645262  
##  Mean   :  5141592   Mean   :  5303711   Mean   :  5468966  
##  3rd Qu.:  2948396   3rd Qu.:  3148941   3rd Qu.:  3296444  
##  Max.   :131974143   Max.   :134599886   Max.   :137205240  
##                                                             
##      X1965               X1966               X1967          
##  Min.   :     1281   Min.   :     1349   Min.   :     1415  
##  1st Qu.:    88633   1st Qu.:    93638   1st Qu.:    98684  
##  Median :   679109   Median :   735139   Median :   765601  
##  Mean   :  5637394   Mean   :  5790281   Mean   :  5944033  
##  3rd Qu.:  3317422   3rd Qu.:  3418036   3rd Qu.:  3556448  
##  Max.   :139663053   Max.   :141962708   Max.   :144201722  
##                                                             
##      X1968               X1969               X1970          
##  Min.   :     1480   Min.   :     1545   Min.   :     1611  
##  1st Qu.:   103670   1st Qu.:   108473   1st Qu.:   113010  
##  Median :   798459   Median :   866258   Median :   903905  
##  Mean   :  6098019   Mean   :  6262333   Mean   :  6427281  
##  3rd Qu.:  3696854   3rd Qu.:  3838003   3rd Qu.:  3930043  
##  Max.   :146340364   Max.   :148475901   Max.   :150922373  
##                                                             
##      X1971               X1972               X1973          
##  Min.   :     1684   Min.   :     1757   Min.   :     1831  
##  1st Qu.:   115124   1st Qu.:   116098   1st Qu.:   124928  
##  Median :   953577   Median :   976446   Median :   999667  
##  Mean   :  6599259   Mean   :  6773754   Mean   :  6947953  
##  3rd Qu.:  3981360   3rd Qu.:  4028248   3rd Qu.:  4224277  
##  Max.   :152863831   Max.   :154530473   Max.   :156034106  
##                                                             
##      X1974               X1975               X1976          
##  Min.   :     1905   Min.   :     1980   Min.   :     2060  
##  1st Qu.:   135168   1st Qu.:   144775   1st Qu.:   153214  
##  Median :  1028372   Median :  1072796   Median :  1135501  
##  Mean   :  7128645   Mean   :  7309852   Mean   :  7503259  
##  3rd Qu.:  4427442   3rd Qu.:  4635991   3rd Qu.:  4854943  
##  Max.   :157488074   Max.   :159452730   Max.   :165583752  
##                                                             
##      X1977               X1978               X1979          
##  Min.   :     2140   Min.   :     2223   Min.   :     2309  
##  1st Qu.:   160948   1st Qu.:   169616   1st Qu.:   181477  
##  Median :  1180706   Median :  1287154   Median :  1316892  
##  Mean   :  7701487   Mean   :  7905258   Mean   :  8115040  
##  3rd Qu.:  5012324   3rd Qu.:  5157254   3rd Qu.:  5301984  
##  Max.   :171550310   Max.   :177605736   Max.   :183785364  
##                                                             
##      X1980               X1981               X1982          
##  Min.   :     2401   Min.   :     2506   Min.   :     2617  
##  1st Qu.:   198023   1st Qu.:   203581   1st Qu.:   207630  
##  Median :  1346902   Median :  1397894   Median :  1453904  
##  Mean   :  8328473   Mean   :  8555840   Mean   :  8788551  
##  3rd Qu.:  5448102   3rd Qu.:  5565661   3rd Qu.:  5626559  
##  Max.   :189947471   Max.   :199385258   Max.   :209435968  
##                                                             
##      X1983               X1984               X1985          
##  Min.   :     2732   Min.   :     2848   Min.   :     2962  
##  1st Qu.:   221136   1st Qu.:   235938   1st Qu.:   243791  
##  Median :  1522872   Median :  1592240   Median :  1663681  
##  Mean   :  9024799   Mean   :  9264563   Mean   :  9510547  
##  3rd Qu.:  5685137   3rd Qu.:  5743078   3rd Qu.:  5787617  
##  Max.   :219680098   Max.   :229872397   Max.   :240414890  
##                                                             
##      X1986               X1987               X1988          
##  Min.   :     3106   Min.   :     3247   Min.   :     3386  
##  1st Qu.:   252033   1st Qu.:   261211   1st Qu.:   271192  
##  Median :  1740216   Median :  1808035   Median :  1830180  
##  Mean   :  9761453   Mean   : 10018768   Mean   : 10280947  
##  3rd Qu.:  5818099   3rd Qu.:  5858728   3rd Qu.:  5889922  
##  Max.   :251630158   Max.   :263433513   Max.   :275570541  
##                                                             
##      X1989               X1990               X1991          
##  Min.   :     3524   Min.   :     3661   Min.   :     3742  
##  1st Qu.:   281694   1st Qu.:   292457   1st Qu.:   303034  
##  Median :  1916007   Median :  1978215   Median :  2018359  
##  Mean   : 10546895   Mean   : 10815109   Mean   : 11085209  
##  3rd Qu.:  5860541   3rd Qu.:  6030313   3rd Qu.:  6239205  
##  Max.   :287810747   Max.   :300165618   Max.   :314689997  
##                                                             
##      X1992               X1993               X1994          
##  Min.   :     3822   Min.   :     3901   Min.   :     3979  
##  1st Qu.:   315154   1st Qu.:   325923   1st Qu.:   333766  
##  Median :  2090284   Median :  2147831   Median :  2195782  
##  Mean   : 11399986   Mean   : 11671192   Mean   : 11942346  
##  3rd Qu.:  6660368   3rd Qu.:  6803893   3rd Qu.:  6891732  
##  Max.   :329099365   Max.   :343555327   Max.   :358232230  
##  NA's   :1           NA's   :1           NA's   :1          
##      X1995               X1996               X1997          
##  Min.   :     4057   Min.   :     4111   Min.   :     4164  
##  1st Qu.:   338722   1st Qu.:   344075   1st Qu.:   349136  
##  Median :  2229853   Median :  2324907   Median :  2401255  
##  Mean   : 12165147   Mean   : 12442902   Mean   : 12723964  
##  3rd Qu.:  6924376   3rd Qu.:  7128895   3rd Qu.:  7422271  
##  Max.   :373035157   Max.   :388936607   Max.   :405031716  
##                                                             
##      X1998               X1999               X2000          
##  Min.   :     4219   Min.   :     4275   Min.   :     4334  
##  1st Qu.:   356719   1st Qu.:   365573   1st Qu.:   370353  
##  Median :  2381800   Median :  2365285   Median :  2459957  
##  Mean   : 13006498   Mean   : 13290484   Mean   : 13579489  
##  3rd Qu.:  7429968   3rd Qu.:  7439339   3rd Qu.:  7593573  
##  Max.   :421147610   Max.   :437126845   Max.   :452999147  
##                                                             
##      X2001               X2002               X2003          
##  Min.   :     4397   Min.   :     4464   Min.   :     4532  
##  1st Qu.:   373611   1st Qu.:   377197   1st Qu.:   381128  
##  Median :  2539746   Median :  2637810   Median :  2745377  
##  Mean   : 13897284   Mean   : 14216175   Mean   : 14538311  
##  3rd Qu.:  7651796   3rd Qu.:  7867562   3rd Qu.:  8089415  
##  Max.   :473204511   Max.   :493402140   Max.   :513607776  
##                                                             
##      X2004               X2005               X2006          
##  Min.   :     4598   Min.   :     4661   Min.   :     4719  
##  1st Qu.:   385355   1st Qu.:   390398   1st Qu.:   397864  
##  Median :  2868044   Median :  2969664   Median :  3000504  
##  Mean   : 14864880   Mean   : 15195447   Mean   : 15531476  
##  3rd Qu.:  8314375   3rd Qu.:  8520688   3rd Qu.:  8764631  
##  Max.   :533892175   Max.   :554367818   Max.   :575050081  
##                                                             
##      X2007               X2008               X2009          
##  Min.   :     4774   Min.   :     4827   Min.   :     4877  
##  1st Qu.:   405264   1st Qu.:   413803   1st Qu.:   422789  
##  Median :  3024406   Median :  3047794   Median :  3079402  
##  Mean   : 15873312   Mean   : 16220693   Mean   : 16569652  
##  3rd Qu.:  9010932   3rd Qu.:  9316659   3rd Qu.:  9638790  
##  Max.   :595731464   Max.   :616552722   Max.   :637533976  
##                                                             
##      X2010               X2011          
##  Min.   :     4928   Min.   :     4979  
##  1st Qu.:   431853   1st Qu.:   441587  
##  Median :  3103263   Median :  3117433  
##  Mean   : 16920124   Mean   : 17271161  
##  3rd Qu.:  9978840   3rd Qu.: 10334364  
##  Max.   :658557734   Max.   :678796403  
## 
urban_clean <- na.omit(urban)

# Print out a summary of urban_clean
summary(urban_clean) # assert that there are no more NA values.
##    country              X1960               X1961          
##  Length:197         Min.   :     3378   Min.   :     3433  
##  Class :character   1st Qu.:    87735   1st Qu.:    92905  
##  Mode  :character   Median :   599714   Median :   630788  
##                     Mean   :  5012388   Mean   :  5282488  
##                     3rd Qu.:  3130085   3rd Qu.:  3155370  
##                     Max.   :126469700   Max.   :129268133  
##      X1962               X1963               X1964          
##  Min.   :     3481   Min.   :     3532   Min.   :     3586  
##  1st Qu.:    98331   1st Qu.:   104988   1st Qu.:   112084  
##  Median :   659464   Median :   704989   Median :   740609  
##  Mean   :  5440972   Mean   :  5612312   Mean   :  5786961  
##  3rd Qu.:  3250211   3rd Qu.:  3416490   3rd Qu.:  3585464  
##  Max.   :131974143   Max.   :134599886   Max.   :137205240  
##      X1965               X1966               X1967          
##  Min.   :     3644   Min.   :     3706   Min.   :     3771  
##  1st Qu.:   119322   1st Qu.:   128565   1st Qu.:   138024  
##  Median :   774957   Median :   809768   Median :   838449  
##  Mean   :  5964970   Mean   :  6126413   Mean   :  6288771  
##  3rd Qu.:  3666724   3rd Qu.:  3871757   3rd Qu.:  4019906  
##  Max.   :139663053   Max.   :141962708   Max.   :144201722  
##      X1968               X1969               X1970          
##  Min.   :     3835   Min.   :     3893   Min.   :     3941  
##  1st Qu.:   147846   1st Qu.:   158252   1st Qu.:   171063  
##  Median :   890270   Median :   929450   Median :   976471  
##  Mean   :  6451367   Mean   :  6624909   Mean   :  6799110  
##  3rd Qu.:  4158186   3rd Qu.:  4300669   3rd Qu.:  4440047  
##  Max.   :146340364   Max.   :148475901   Max.   :150922373  
##      X1971               X1972               X1973          
##  Min.   :     4017   Min.   :     4084   Min.   :     4146  
##  1st Qu.:   181483   1st Qu.:   189492   1st Qu.:   197792  
##  Median :  1008630   Median :  1048738   Median :  1097293  
##  Mean   :  6980895   Mean   :  7165338   Mean   :  7349454  
##  3rd Qu.:  4595966   3rd Qu.:  4766545   3rd Qu.:  4838297  
##  Max.   :152863831   Max.   :154530473   Max.   :156034106  
##      X1974               X1975               X1976          
##  Min.   :     4206   Min.   :     4267   Min.   :     4334  
##  1st Qu.:   205410   1st Qu.:   211746   1st Qu.:   216991  
##  Median :  1159402   Median :  1223146   Median :  1249829  
##  Mean   :  7540446   Mean   :  7731973   Mean   :  7936401  
##  3rd Qu.:  4906384   3rd Qu.:  5003370   3rd Qu.:  5121118  
##  Max.   :157488074   Max.   :159452730   Max.   :165583752  
##      X1977               X1978               X1979          
##  Min.   :     4402   Min.   :     4470   Min.   :     4539  
##  1st Qu.:   222209   1st Qu.:   227605   1st Qu.:   233461  
##  Median :  1311276   Median :  1340811   Median :  1448185  
##  Mean   :  8145945   Mean   :  8361360   Mean   :  8583138  
##  3rd Qu.:  5227677   3rd Qu.:  5352746   3rd Qu.:  5558850  
##  Max.   :171550310   Max.   :177605736   Max.   :183785364  
##      X1980               X1981               X1982          
##  Min.   :     4607   Min.   :     4645   Min.   :     4681  
##  1st Qu.:   242583   1st Qu.:   248948   1st Qu.:   257944  
##  Median :  1592397   Median :  1673079   Median :  1713060  
##  Mean   :  8808772   Mean   :  9049163   Mean   :  9295226  
##  3rd Qu.:  5815772   3rd Qu.:  6070457   3rd Qu.:  6337995  
##  Max.   :189947471   Max.   :199385258   Max.   :209435968  
##      X1983               X1984               X1985          
##  Min.   :     4716   Min.   :     4750   Min.   :     4782  
##  1st Qu.:   274139   1st Qu.:   284939   1st Qu.:   300928  
##  Median :  1730626   Median :  1749033   Median :  1786125  
##  Mean   :  9545035   Mean   :  9798559   Mean   : 10058661  
##  3rd Qu.:  6619987   3rd Qu.:  6918261   3rd Qu.:  6931780  
##  Max.   :219680098   Max.   :229872397   Max.   :240414890  
##      X1986               X1987               X1988          
##  Min.   :     4809   Min.   :     4835   Min.   :     4859  
##  1st Qu.:   307699   1st Qu.:   321125   1st Qu.:   334616  
##  Median :  1850910   Median :  1953694   Median :  1997011  
##  Mean   : 10323839   Mean   : 10595817   Mean   : 10873041  
##  3rd Qu.:  6935763   3rd Qu.:  6939905   3rd Qu.:  6945022  
##  Max.   :251630158   Max.   :263433513   Max.   :275570541  
##      X1989               X1990               X1991          
##  Min.   :     4883   Min.   :     4907   Min.   :     4946  
##  1st Qu.:   347348   1st Qu.:   370152   1st Qu.:   394611  
##  Median :  1993544   Median :  2066505   Median :  2150230  
##  Mean   : 11154458   Mean   : 11438543   Mean   : 11725076  
##  3rd Qu.:  6885378   3rd Qu.:  6830026   3rd Qu.:  6816589  
##  Max.   :287810747   Max.   :300165618   Max.   :314689997  
##      X1992               X1993               X1994          
##  Min.   :     4985   Min.   :     5024   Min.   :     5062  
##  1st Qu.:   418788   1st Qu.:   427457   1st Qu.:   435959  
##  Median :  2237405   Median :  2322158   Median :  2410297  
##  Mean   : 12010922   Mean   : 12296949   Mean   : 12582930  
##  3rd Qu.:  6820099   3rd Qu.:  7139656   3rd Qu.:  7499901  
##  Max.   :329099365   Max.   :343555327   Max.   :358232230  
##      X1995               X1996               X1997          
##  Min.   :     5100   Min.   :     5079   Min.   :     5055  
##  1st Qu.:   461993   1st Qu.:   488136   1st Qu.:   494203  
##  Median :  2482393   Median :  2522460   Median :  2606125  
##  Mean   : 12871480   Mean   : 13165924   Mean   : 13463675  
##  3rd Qu.:  7708571   3rd Qu.:  7686092   3rd Qu.:  7664316  
##  Max.   :373035157   Max.   :388936607   Max.   :405031716  
##      X1998               X1999               X2000          
##  Min.   :     5029   Min.   :     5001   Min.   :     4971  
##  1st Qu.:   498002   1st Qu.:   505144   1st Qu.:   525629  
##  Median :  2664983   Median :  2737809   Median :  2826647  
##  Mean   : 13762861   Mean   : 14063387   Mean   : 14369278  
##  3rd Qu.:  7784056   3rd Qu.:  8083488   3rd Qu.:  8305564  
##  Max.   :421147610   Max.   :437126845   Max.   :452999147  
##      X2001               X2002               X2003          
##  Min.   :     5003   Min.   :     5034   Min.   :     5064  
##  1st Qu.:   550638   1st Qu.:   567531   1st Qu.:   572094  
##  Median :  2925851   Median :  2928252   Median :  2944934  
##  Mean   : 14705743   Mean   : 15043381   Mean   : 15384513  
##  3rd Qu.:  8421967   3rd Qu.:  8448628   3rd Qu.:  8622732  
##  Max.   :473204511   Max.   :493402140   Max.   :513607776  
##      X2004               X2005               X2006          
##  Min.   :     5090   Min.   :     5111   Min.   :     5135  
##  1st Qu.:   593900   1st Qu.:   620511   1st Qu.:   632659  
##  Median :  2994356   Median :  3057923   Median :  3269963  
##  Mean   : 15730299   Mean   : 16080262   Mean   : 16435872  
##  3rd Qu.:  8999112   3rd Qu.:  9394001   3rd Qu.:  9689807  
##  Max.   :533892175   Max.   :554367818   Max.   :575050081  
##      X2007               X2008               X2009          
##  Min.   :     5155   Min.   :     5172   Min.   :     5189  
##  1st Qu.:   645172   1st Qu.:   658017   1st Qu.:   671085  
##  Median :  3432024   Median :  3589395   Median :  3652338  
##  Mean   : 16797484   Mean   : 17164898   Mean   : 17533997  
##  3rd Qu.:  9803381   3rd Qu.: 10210317   3rd Qu.: 10518289  
##  Max.   :595731464   Max.   :616552722   Max.   :637533976  
##      X2010               X2011          
##  Min.   :     5206   Min.   :     5233  
##  1st Qu.:   684302   1st Qu.:   698009  
##  Median :  3676309   Median :  3664664  
##  Mean   : 17904811   Mean   : 18276297  
##  3rd Qu.: 10618596   3rd Qu.: 10731193  
##  Max.   :658557734   Max.   :678796403