3 - importing-excel-data
# -----------------------------------
# Get the file used in exercise from internet
url_urban_pop <- "http://docs.google.com/spreadsheet/pub?key=pyj6tScZqmEfH89V6UQhpZA&output=xlsx"
download.file(url_urban_pop, "data/urban_pop_orig.xlsx")
# this file does not open
# went to the url
# downloaded as data/urban_pop_orig.xlsx
# this needs formatting to get in state for the course
# alternative link, also did not work with the download function:
# http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/urbanpop.xlsx
# go to url
# save file as: "data/urbanpop.xlsx"
dir()
## [1] "1 - importing-data-from-flat-files-with-utils.R"
## [2] "1 - importing-data-from-flat-files-with-utils.Rmd"
## [3] "1_-_importing-data-from-flat-files-with-utils.html"
## [4] "2 - readr-datatable.R"
## [5] "2 - readr-datatable.Rmd"
## [6] "2_-_readr-datatable.html"
## [7] "3 - importing-excel-data.R"
## [8] "3 - importing-excel-data.Rmd"
## [9] "3_-_importing-excel-data.Rmd"
## [10] "4 - reproducible-excel-work-with-xlconnect.R"
## [11] "data"
## [12] "hotdogs.txt"
## [13] "importing_data_in_r_1_ch1.pdf"
## [14] "importing_data_in_r_1_ch2.pdf"
## [15] "importing_data_in_r_1_ch3.pdf"
## [16] "importing_data_in_r_1_ch4.pdf"
## [17] "potatoes.csv"
## [18] "potatoes.txt"
## [19] "rsconnect"
## [20] "swimming_pools.csv"
dir("data")
## [1] "hotdogs.txt" "urban_pop_orig.xlsx" "urbanpop.xls"
## [4] "urbanpop.xlsx" "urbanpop_nonames.xlsx"
# -----------------------------------
# http://www.rpubs.com/williamsurles/290389
# Tries to re create urbanpop.xlsx from urban_pop_orig.xlsx
# # https://stackoverflow.com/questions/28133360/rjava-is-not-picking-up-the-correct-java-version/32962637#32962637
# install.packages("rJava",,"http://cran.r-project.org",type="source")
# library(rJava)
# .jinit()
# .jcall("java/lang/System", "S", "getProperty", "java.runtime.version")
# install.packages('XLConnect')
# library(XLConnect)
# my_book <- loadWorkbook('data/urban_pop_orig.xlsx')
# List the sheets in my_book
# getSheets(my_book)
# did not work for me :(
# -----------------------------------
# urbanpop.xlsx
# urban population metrics
# for practically all countries in the world
# throughout time (Source: Gapminder) www.gapminder.org
# three sheets for three different time periods
# In each sheet, the first row contains the column names.
# -----------------------------------
# Load the readxl package
library(readxl)
# https://cran.r-project.org/web/packages/readxl/readxl.pdf
# Print out the names of both spreadsheets
# excel_sheets("urbanpop.xlsx")
excel_sheets('data/urbanpop.xlsx')
## [1] "1960-1966" "1967-1974" "1975-2011"
path <- "data/urbanpop.xlsx"
excel_sheets(path = path)
## [1] "1960-1966" "1967-1974" "1975-2011"
# -----------------------------------
# Read the sheets, one by one
pop_1 <- read_excel("data/urbanpop.xlsx", sheet = 1)
pop_2 <- read_excel("data/urbanpop.xlsx", sheet = "1967-1974")
pop_3 <- read_excel("data/urbanpop.xlsx", sheet = 3)
# Put pop_1, pop_2 and pop_3 in a list: pop_list
pop_list <- list(pop_1, pop_2, pop_3)
# Display the structure of pop_list
str(pop_list)
## List of 3
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 209 obs. of 8 variables:
## ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## ..$ 1960 : num [1:209] 769308 494443 3293999 NA NA ...
## ..$ 1961 : num [1:209] 814923 511803 3515148 13660 8724 ...
## ..$ 1962 : num [1:209] 858522 529439 3739963 14166 9700 ...
## ..$ 1963 : num [1:209] 903914 547377 3973289 14759 10748 ...
## ..$ 1964 : num [1:209] 951226 565572 4220987 15396 11866 ...
## ..$ 1965 : num [1:209] 1000582 583983 4488176 16045 13053 ...
## ..$ 1966 : num [1:209] 1058743 602512 4649105 16693 14217 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 209 obs. of 9 variables:
## ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## ..$ 1967 : num [1:209] 1119067 621180 4826104 17349 15440 ...
## ..$ 1968 : num [1:209] 1182159 639964 5017299 17996 16727 ...
## ..$ 1969 : num [1:209] 1248901 658853 5219332 18619 18088 ...
## ..$ 1970 : num [1:209] 1319849 677839 5429743 19206 19529 ...
## ..$ 1971 : num [1:209] 1409001 698932 5619042 19752 20929 ...
## ..$ 1972 : num [1:209] 1502402 720207 5815734 20263 22406 ...
## ..$ 1973 : num [1:209] 1598835 741681 6020647 20742 23937 ...
## ..$ 1974 : num [1:209] 1696445 763385 6235114 21194 25482 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 209 obs. of 38 variables:
## ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## ..$ 1975 : num [1:209] 1793266 785350 6460138 21632 27019 ...
## ..$ 1976 : num [1:209] 1905033 807990 6774099 22047 28366 ...
## ..$ 1977 : num [1:209] 2021308 830959 7102902 22452 29677 ...
## ..$ 1978 : num [1:209] 2142248 854262 7447728 22899 31037 ...
## ..$ 1979 : num [1:209] 2268015 877898 7810073 23457 32572 ...
## ..$ 1980 : num [1:209] 2398775 901884 8190772 24177 34366 ...
## ..$ 1981 : num [1:209] 2493265 927224 8637724 25173 36356 ...
## ..$ 1982 : num [1:209] 2590846 952447 9105820 26342 38618 ...
## ..$ 1983 : num [1:209] 2691612 978476 9591900 27655 40983 ...
## ..$ 1984 : num [1:209] 2795656 1006613 10091289 29062 43207 ...
## ..$ 1985 : num [1:209] 2903078 1037541 10600112 30524 45119 ...
## ..$ 1986 : num [1:209] 3006983 1072365 11101757 32014 46254 ...
## ..$ 1987 : num [1:209] 3113957 1109954 11609104 33548 47019 ...
## ..$ 1988 : num [1:209] 3224082 1146633 12122941 35095 47669 ...
## ..$ 1989 : num [1:209] 3337444 1177286 12645263 36618 48577 ...
## ..$ 1990 : num [1:209] 3454129 1198293 13177079 38088 49982 ...
## ..$ 1991 : num [1:209] 3617842 1215445 13708813 39600 51972 ...
## ..$ 1992 : num [1:209] 3788685 1222544 14248297 41049 54469 ...
## ..$ 1993 : num [1:209] 3966956 1222812 14789176 42443 57079 ...
## ..$ 1994 : num [1:209] 4152960 1221364 15322651 43798 59243 ...
## ..$ 1995 : num [1:209] 4347018 1222234 15842442 45129 60598 ...
## ..$ 1996 : num [1:209] 4531285 1228760 16395553 46343 60927 ...
## ..$ 1997 : num [1:209] 4722603 1238090 16935451 47527 60462 ...
## ..$ 1998 : num [1:209] 4921227 1250366 17469200 48705 59685 ...
## ..$ 1999 : num [1:209] 5127421 1265195 18007937 49906 59281 ...
## ..$ 2000 : num [1:209] 5341456 1282223 18560597 51151 59719 ...
## ..$ 2001 : num [1:209] 5564492 1315690 19198872 52341 61062 ...
## ..$ 2002 : num [1:209] 5795940 1352278 19854835 53583 63212 ...
## ..$ 2003 : num [1:209] 6036100 1391143 20529356 54864 65802 ...
## ..$ 2004 : num [1:209] 6285281 1430918 21222198 56166 68301 ...
## ..$ 2005 : num [1:209] 6543804 1470488 21932978 57474 70329 ...
## ..$ 2006 : num [1:209] 6812538 1512255 22625052 58679 71726 ...
## ..$ 2007 : num [1:209] 7091245 1553491 23335543 59894 72684 ...
## ..$ 2008 : num [1:209] 7380272 1594351 24061749 61118 73335 ...
## ..$ 2009 : num [1:209] 7679982 1635262 24799591 62357 73897 ...
## ..$ 2010 : num [1:209] 7990746 1676545 25545622 63616 74525 ...
## ..$ 2011 : num [1:209] 8316976 1716842 26216968 64817 75207 ...
# -----------------------------------
# loading in every sheet manually and then merging them in a list can be quite tedious
# Read all Excel sheets with lapply()
pop_list <- lapply(excel_sheets("data/urbanpop.xlsx"), read_excel, path = "data/urbanpop.xlsx")
# each sheet is loaded in one after the other
# Display the structure of pop_list
str(pop_list) # list of data frames, each data frame representing one of the sheets
## List of 3
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 209 obs. of 8 variables:
## ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## ..$ 1960 : num [1:209] 769308 494443 3293999 NA NA ...
## ..$ 1961 : num [1:209] 814923 511803 3515148 13660 8724 ...
## ..$ 1962 : num [1:209] 858522 529439 3739963 14166 9700 ...
## ..$ 1963 : num [1:209] 903914 547377 3973289 14759 10748 ...
## ..$ 1964 : num [1:209] 951226 565572 4220987 15396 11866 ...
## ..$ 1965 : num [1:209] 1000582 583983 4488176 16045 13053 ...
## ..$ 1966 : num [1:209] 1058743 602512 4649105 16693 14217 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 209 obs. of 9 variables:
## ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## ..$ 1967 : num [1:209] 1119067 621180 4826104 17349 15440 ...
## ..$ 1968 : num [1:209] 1182159 639964 5017299 17996 16727 ...
## ..$ 1969 : num [1:209] 1248901 658853 5219332 18619 18088 ...
## ..$ 1970 : num [1:209] 1319849 677839 5429743 19206 19529 ...
## ..$ 1971 : num [1:209] 1409001 698932 5619042 19752 20929 ...
## ..$ 1972 : num [1:209] 1502402 720207 5815734 20263 22406 ...
## ..$ 1973 : num [1:209] 1598835 741681 6020647 20742 23937 ...
## ..$ 1974 : num [1:209] 1696445 763385 6235114 21194 25482 ...
## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 209 obs. of 38 variables:
## ..$ country: chr [1:209] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## ..$ 1975 : num [1:209] 1793266 785350 6460138 21632 27019 ...
## ..$ 1976 : num [1:209] 1905033 807990 6774099 22047 28366 ...
## ..$ 1977 : num [1:209] 2021308 830959 7102902 22452 29677 ...
## ..$ 1978 : num [1:209] 2142248 854262 7447728 22899 31037 ...
## ..$ 1979 : num [1:209] 2268015 877898 7810073 23457 32572 ...
## ..$ 1980 : num [1:209] 2398775 901884 8190772 24177 34366 ...
## ..$ 1981 : num [1:209] 2493265 927224 8637724 25173 36356 ...
## ..$ 1982 : num [1:209] 2590846 952447 9105820 26342 38618 ...
## ..$ 1983 : num [1:209] 2691612 978476 9591900 27655 40983 ...
## ..$ 1984 : num [1:209] 2795656 1006613 10091289 29062 43207 ...
## ..$ 1985 : num [1:209] 2903078 1037541 10600112 30524 45119 ...
## ..$ 1986 : num [1:209] 3006983 1072365 11101757 32014 46254 ...
## ..$ 1987 : num [1:209] 3113957 1109954 11609104 33548 47019 ...
## ..$ 1988 : num [1:209] 3224082 1146633 12122941 35095 47669 ...
## ..$ 1989 : num [1:209] 3337444 1177286 12645263 36618 48577 ...
## ..$ 1990 : num [1:209] 3454129 1198293 13177079 38088 49982 ...
## ..$ 1991 : num [1:209] 3617842 1215445 13708813 39600 51972 ...
## ..$ 1992 : num [1:209] 3788685 1222544 14248297 41049 54469 ...
## ..$ 1993 : num [1:209] 3966956 1222812 14789176 42443 57079 ...
## ..$ 1994 : num [1:209] 4152960 1221364 15322651 43798 59243 ...
## ..$ 1995 : num [1:209] 4347018 1222234 15842442 45129 60598 ...
## ..$ 1996 : num [1:209] 4531285 1228760 16395553 46343 60927 ...
## ..$ 1997 : num [1:209] 4722603 1238090 16935451 47527 60462 ...
## ..$ 1998 : num [1:209] 4921227 1250366 17469200 48705 59685 ...
## ..$ 1999 : num [1:209] 5127421 1265195 18007937 49906 59281 ...
## ..$ 2000 : num [1:209] 5341456 1282223 18560597 51151 59719 ...
## ..$ 2001 : num [1:209] 5564492 1315690 19198872 52341 61062 ...
## ..$ 2002 : num [1:209] 5795940 1352278 19854835 53583 63212 ...
## ..$ 2003 : num [1:209] 6036100 1391143 20529356 54864 65802 ...
## ..$ 2004 : num [1:209] 6285281 1430918 21222198 56166 68301 ...
## ..$ 2005 : num [1:209] 6543804 1470488 21932978 57474 70329 ...
## ..$ 2006 : num [1:209] 6812538 1512255 22625052 58679 71726 ...
## ..$ 2007 : num [1:209] 7091245 1553491 23335543 59894 72684 ...
## ..$ 2008 : num [1:209] 7380272 1594351 24061749 61118 73335 ...
## ..$ 2009 : num [1:209] 7679982 1635262 24799591 62357 73897 ...
## ..$ 2010 : num [1:209] 7990746 1676545 25545622 63616 74525 ...
## ..$ 2011 : num [1:209] 8316976 1716842 26216968 64817 75207 ...
# -----------------------------------
# download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/urbanpop_nonames.xlsx",
# "data/urbanpop_nonames.xlsx")
# had to go the url and save this file
# Import the the first Excel sheet of urbanpop_nonames.xlsx
# col_names argument of read_excel()
# default is TRUE
# R will choose column names for you
pop_a <- read_excel("data/urbanpop_nonames.xlsx", sheet = 1, col_names = FALSE)
head(pop_a)
## # A tibble: 6 x 8
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 769308 814923. 858522. 903914. 951226. 1.00e6 1.06e6
## 2 Albania 494443 511803. 529439. 547377. 565572. 5.84e5 6.03e5
## 3 Algeria 3293999 3515148. 3739963. 3973289. 4220987. 4.49e6 4.65e6
## 4 American Samoa NA 13660. 14166. 14759. 15396. 1.60e4 1.67e4
## 5 Andorra NA 8724. 9700. 10748. 11866. 1.31e4 1.42e4
## 6 Angola 521205 548265. 579695. 612087. 645262. 6.79e5 7.18e5
# Import the the first Excel sheet of urbanpop_nonames.xlsx (specify col_names)
cols <- c("country", paste0("year_", 1960:1966))
# (can also choose to set col_names to a character vector with names for each column)
pop_b <- read_excel("data/urbanpop_nonames.xlsx", sheet = 1, col_names = cols)
head(pop_b)
## # A tibble: 6 x 8
## country year_1960 year_1961 year_1962 year_1963 year_1964 year_1965
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 769308 814923. 858522. 903914. 951226. 1000582.
## 2 Albania 494443 511803. 529439. 547377. 565572. 583983.
## 3 Algeria 3293999 3515148. 3739963. 3973289. 4220987. 4488176.
## 4 American Sa~ NA 13660. 14166. 14759. 15396. 16045.
## 5 Andorra NA 8724. 9700. 10748. 11866. 13053.
## 6 Angola 521205 548265. 579695. 612087. 645262. 679109.
## # ... with 1 more variable: year_1966 <dbl>
# Print the summary of pop_a
summary(pop_a)
## X__1 X__2 X__3
## Length:209 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 88978 1st Qu.: 70644
## Mode :character Median : 580675 Median : 570159
## Mean : 4988124 Mean : 4991613
## 3rd Qu.: 3077228 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :11
## X__4 X__5 X__6
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
##
## X__7 X__8
## Min. : 1281 Min. : 1349
## 1st Qu.: 88633 1st Qu.: 93638
## Median : 679109 Median : 735139
## Mean : 5637394 Mean : 5790281
## 3rd Qu.: 3317422 3rd Qu.: 3418036
## Max. :139663053 Max. :141962708
##
# Print the summary of pop_b
summary(pop_b)
## country year_1960 year_1961
## Length:209 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 88978 1st Qu.: 70644
## Mode :character Median : 580675 Median : 570159
## Mean : 4988124 Mean : 4991613
## 3rd Qu.: 3077228 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :11
## year_1962 year_1963 year_1964
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
##
## year_1965 year_1966
## Min. : 1281 Min. : 1349
## 1st Qu.: 88633 1st Qu.: 93638
## Median : 679109 Median : 735139
## Mean : 5637394 Mean : 5790281
## 3rd Qu.: 3317422 3rd Qu.: 3418036
## Max. :139663053 Max. :141962708
##
# -----------------------------------
# skip
# ignore a specified number of rows inside the Excel sheets
# Import the second sheet of urbanpop.xlsx
# skipping the first 21 rows
urbanpop_sel <- read_excel("data/urbanpop.xlsx", sheet = 2, skip = 21, col_names = FALSE)
# Print out the first observation from urbanpop_sel
str(urbanpop_sel)
## Classes 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 9 variables:
## $ X__1: chr "Benin" "Bermuda" "Bhutan" "Bolivia" ...
## $ X__2: num 382022 52000 14379 1527065 851692 ...
## $ X__3: num 411859 53000 15617 1575177 890270 ...
## $ X__4: num 443013 54000 16946 1625173 929450 ...
## $ X__5: num 475611 55000 18381 1677184 969549 ...
## $ X__6: num 515820 54600 20173 1731437 1008630 ...
## $ X__7: num 557938 54200 22100 1787719 1048738 ...
## $ X__8: num 602093 53800 24160 1845894 1089648 ...
## $ X__9: num 648410 53400 26343 1905749 1130966 ...
# urbanpop_sel[1,1]
urbanpop_sel[1,] # first observation - i.e. first row
## # A tibble: 1 x 9
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Benin 382022. 411859. 443013. 475611. 515820. 557938. 602093. 648410.
# -----------------------------------
# Load the gdata package
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
# written in pearl
# wraps read.table and makes all of its arguments available
# Its pretty slow. It converts everything to csv then reads it in
# download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1477/datasets/urbanpop.xls",
# "data/urbanpop.xls")
# go to link above
# download
# open
# save
# read.xls does not work due to xls2sep & perl issues?
# Import the second sheet of urbanpop.xls
urban_pop <- read.xls("data/urbanpop.xls", "1967-1974")
# Print the first 11 observations using head()
head(urban_pop, 11)
## country X1967 X1968 X1969 X1970
## 1 Afghanistan 1119067.20 1182159.06 1248900.79 1319848.78
## 2 Albania 621179.85 639964.46 658853.12 677839.12
## 3 Algeria 4826104.22 5017298.60 5219331.87 5429743.08
## 4 American Samoa 17348.66 17995.51 18618.68 19206.39
## 5 Andorra 15439.62 16726.99 18088.32 19528.96
## 6 Angola 757496.32 798459.26 841261.96 886401.63
## 7 Antigua and Barbuda 22086.25 22149.39 22182.92 22180.87
## 8 Argentina 17753280.98 18124103.64 18510462.30 18918072.79
## 9 Armenia 1337032.09 1392892.13 1449641.49 1507619.77
## 10 Aruba 29414.72 29576.09 29737.87 29901.57
## 11 Australia 9934404.03 10153969.77 10412390.67 10664093.55
## X1971 X1972 X1973 X1974
## 1 1409001.09 1502401.79 1598835.45 1696444.83
## 2 698932.25 720206.57 741681.04 763385.45
## 3 5619041.53 5815734.49 6020647.35 6235114.38
## 4 19752.02 20262.67 20741.97 21194.38
## 5 20928.73 22405.84 23937.05 25481.98
## 6 955010.09 1027397.35 1103829.78 1184486.23
## 7 22560.87 22907.76 23221.29 23502.92
## 8 19329718.16 19763078.00 20211424.85 20664728.90
## 9 1564367.60 1622103.53 1680497.75 1739063.02
## 10 30081.36 30279.76 30467.42 30602.87
## 11 11047706.39 11269945.50 11461120.68 11772934.25
# -----------------------------------
# read.xls() wraps around read.table()
# converting the Excel file to a .csv file using a Perl script,
# and then reading that .csv file
# with the read.csv() function
# that is loaded by default in R, through the utils package.
# Column names for urban_pop
columns <- c("country", paste0("year_", 1967:1974))
# Finish the read.xls call
urban_pop <- read.xls("data/urbanpop.xls", sheet = 2,
skip = 50, header = FALSE, stringsAsFactors = FALSE,
col.names = columns)
# Print first 10 observation of urban_pop
head(urban_pop,10)
## country year_1967 year_1968 year_1969 year_1970
## 1 Cyprus 231929.74 237831.38 243983.34 250164.52
## 2 Czech Republic 6204409.91 6266304.50 6326368.97 6348794.89
## 3 Denmark 3777552.62 3826785.08 3874313.99 3930042.97
## 4 Djibouti 77788.04 84694.35 92045.77 99845.22
## 5 Dominica 27550.36 29527.32 31475.62 33328.25
## 6 Dominican Republic 1535485.43 1625455.76 1718315.40 1814060.00
## 7 Ecuador 2059355.12 2151395.14 2246890.79 2345864.41
## 8 Egypt 13798171.00 14248342.19 14703858.22 15162858.52
## 9 El Salvador 1345528.98 1387218.33 1429378.98 1472181.26
## 10 Equatorial Guinea 75364.50 77295.03 78445.74 78411.07
## year_1971 year_1972 year_1973 year_1974
## 1 261213.21 272407.99 283774.90 295379.83
## 2 6437055.17 6572632.32 6718465.53 6873458.18
## 3 3981360.12 4028247.92 4076867.28 4120201.43
## 4 107799.69 116098.23 125391.58 136606.25
## 5 34761.52 36049.99 37260.05 38501.47
## 6 1915590.38 2020157.01 2127714.45 2238203.87
## 7 2453817.78 2565644.81 2681525.25 2801692.62
## 8 15603661.36 16047814.69 16498633.27 16960827.93
## 9 1527985.34 1584758.18 1642098.95 1699470.87
## 10 77055.29 74596.06 71438.96 68179.26
# import data from all three sheets in urbanpop.xls
path <- "data/urbanpop.xls"
urban_sheet1 <- read.xls(path, sheet = 1, stringsAsFactors = FALSE)
urban_sheet2 <- read.xls(path, sheet = 2, stringsAsFactors = FALSE)
urban_sheet3 <- read.xls(path, sheet = 3, stringsAsFactors = FALSE)
# Extend the cbind() call to include urban_sheet3
# first column of urban_sheet2 and urban_sheet3 are removed,
# so you don't have duplicate columns.
urban <- cbind(urban_sheet1, urban_sheet2[-1], urban_sheet3[-1])
# Remove all rows with NAs from urban
summary(urban)
## country X1960 X1961
## Length:209 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 88978 1st Qu.: 70644
## Mode :character Median : 580675 Median : 570159
## Mean : 4988124 Mean : 4991613
## 3rd Qu.: 3077228 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :11
## X1962 X1963 X1964
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
##
## X1965 X1966 X1967
## Min. : 1281 Min. : 1349 Min. : 1415
## 1st Qu.: 88633 1st Qu.: 93638 1st Qu.: 98684
## Median : 679109 Median : 735139 Median : 765601
## Mean : 5637394 Mean : 5790281 Mean : 5944033
## 3rd Qu.: 3317422 3rd Qu.: 3418036 3rd Qu.: 3556448
## Max. :139663053 Max. :141962708 Max. :144201722
##
## X1968 X1969 X1970
## Min. : 1480 Min. : 1545 Min. : 1611
## 1st Qu.: 103670 1st Qu.: 108473 1st Qu.: 113010
## Median : 798459 Median : 866258 Median : 903905
## Mean : 6098019 Mean : 6262333 Mean : 6427281
## 3rd Qu.: 3696854 3rd Qu.: 3838003 3rd Qu.: 3930043
## Max. :146340364 Max. :148475901 Max. :150922373
##
## X1971 X1972 X1973
## Min. : 1684 Min. : 1757 Min. : 1831
## 1st Qu.: 115124 1st Qu.: 116098 1st Qu.: 124928
## Median : 953577 Median : 976446 Median : 999667
## Mean : 6599259 Mean : 6773754 Mean : 6947953
## 3rd Qu.: 3981360 3rd Qu.: 4028248 3rd Qu.: 4224277
## Max. :152863831 Max. :154530473 Max. :156034106
##
## X1974 X1975 X1976
## Min. : 1905 Min. : 1980 Min. : 2060
## 1st Qu.: 135168 1st Qu.: 144775 1st Qu.: 153214
## Median : 1028372 Median : 1072796 Median : 1135501
## Mean : 7128645 Mean : 7309852 Mean : 7503259
## 3rd Qu.: 4427442 3rd Qu.: 4635991 3rd Qu.: 4854943
## Max. :157488074 Max. :159452730 Max. :165583752
##
## X1977 X1978 X1979
## Min. : 2140 Min. : 2223 Min. : 2309
## 1st Qu.: 160948 1st Qu.: 169616 1st Qu.: 181477
## Median : 1180706 Median : 1287154 Median : 1316892
## Mean : 7701487 Mean : 7905258 Mean : 8115040
## 3rd Qu.: 5012324 3rd Qu.: 5157254 3rd Qu.: 5301984
## Max. :171550310 Max. :177605736 Max. :183785364
##
## X1980 X1981 X1982
## Min. : 2401 Min. : 2506 Min. : 2617
## 1st Qu.: 198023 1st Qu.: 203581 1st Qu.: 207630
## Median : 1346902 Median : 1397894 Median : 1453904
## Mean : 8328473 Mean : 8555840 Mean : 8788551
## 3rd Qu.: 5448102 3rd Qu.: 5565661 3rd Qu.: 5626559
## Max. :189947471 Max. :199385258 Max. :209435968
##
## X1983 X1984 X1985
## Min. : 2732 Min. : 2848 Min. : 2962
## 1st Qu.: 221136 1st Qu.: 235938 1st Qu.: 243791
## Median : 1522872 Median : 1592240 Median : 1663681
## Mean : 9024799 Mean : 9264563 Mean : 9510547
## 3rd Qu.: 5685137 3rd Qu.: 5743078 3rd Qu.: 5787617
## Max. :219680098 Max. :229872397 Max. :240414890
##
## X1986 X1987 X1988
## Min. : 3106 Min. : 3247 Min. : 3386
## 1st Qu.: 252033 1st Qu.: 261211 1st Qu.: 271192
## Median : 1740216 Median : 1808035 Median : 1830180
## Mean : 9761453 Mean : 10018768 Mean : 10280947
## 3rd Qu.: 5818099 3rd Qu.: 5858728 3rd Qu.: 5889922
## Max. :251630158 Max. :263433513 Max. :275570541
##
## X1989 X1990 X1991
## Min. : 3524 Min. : 3661 Min. : 3742
## 1st Qu.: 281694 1st Qu.: 292457 1st Qu.: 303034
## Median : 1916007 Median : 1978215 Median : 2018359
## Mean : 10546895 Mean : 10815109 Mean : 11085209
## 3rd Qu.: 5860541 3rd Qu.: 6030313 3rd Qu.: 6239205
## Max. :287810747 Max. :300165618 Max. :314689997
##
## X1992 X1993 X1994
## Min. : 3822 Min. : 3901 Min. : 3979
## 1st Qu.: 315154 1st Qu.: 325923 1st Qu.: 333766
## Median : 2090284 Median : 2147831 Median : 2195782
## Mean : 11399986 Mean : 11671192 Mean : 11942346
## 3rd Qu.: 6660368 3rd Qu.: 6803893 3rd Qu.: 6891732
## Max. :329099365 Max. :343555327 Max. :358232230
## NA's :1 NA's :1 NA's :1
## X1995 X1996 X1997
## Min. : 4057 Min. : 4111 Min. : 4164
## 1st Qu.: 338722 1st Qu.: 344075 1st Qu.: 349136
## Median : 2229853 Median : 2324907 Median : 2401255
## Mean : 12165147 Mean : 12442902 Mean : 12723964
## 3rd Qu.: 6924376 3rd Qu.: 7128895 3rd Qu.: 7422271
## Max. :373035157 Max. :388936607 Max. :405031716
##
## X1998 X1999 X2000
## Min. : 4219 Min. : 4275 Min. : 4334
## 1st Qu.: 356719 1st Qu.: 365573 1st Qu.: 370353
## Median : 2381800 Median : 2365285 Median : 2459957
## Mean : 13006498 Mean : 13290484 Mean : 13579489
## 3rd Qu.: 7429968 3rd Qu.: 7439339 3rd Qu.: 7593573
## Max. :421147610 Max. :437126845 Max. :452999147
##
## X2001 X2002 X2003
## Min. : 4397 Min. : 4464 Min. : 4532
## 1st Qu.: 373611 1st Qu.: 377197 1st Qu.: 381128
## Median : 2539746 Median : 2637810 Median : 2745377
## Mean : 13897284 Mean : 14216175 Mean : 14538311
## 3rd Qu.: 7651796 3rd Qu.: 7867562 3rd Qu.: 8089415
## Max. :473204511 Max. :493402140 Max. :513607776
##
## X2004 X2005 X2006
## Min. : 4598 Min. : 4661 Min. : 4719
## 1st Qu.: 385355 1st Qu.: 390398 1st Qu.: 397864
## Median : 2868044 Median : 2969664 Median : 3000504
## Mean : 14864880 Mean : 15195447 Mean : 15531476
## 3rd Qu.: 8314375 3rd Qu.: 8520688 3rd Qu.: 8764631
## Max. :533892175 Max. :554367818 Max. :575050081
##
## X2007 X2008 X2009
## Min. : 4774 Min. : 4827 Min. : 4877
## 1st Qu.: 405264 1st Qu.: 413803 1st Qu.: 422789
## Median : 3024406 Median : 3047794 Median : 3079402
## Mean : 15873312 Mean : 16220693 Mean : 16569652
## 3rd Qu.: 9010932 3rd Qu.: 9316659 3rd Qu.: 9638790
## Max. :595731464 Max. :616552722 Max. :637533976
##
## X2010 X2011
## Min. : 4928 Min. : 4979
## 1st Qu.: 431853 1st Qu.: 441587
## Median : 3103263 Median : 3117433
## Mean : 16920124 Mean : 17271161
## 3rd Qu.: 9978840 3rd Qu.: 10334364
## Max. :658557734 Max. :678796403
##
urban_clean <- na.omit(urban)
# Print out a summary of urban_clean
summary(urban_clean) # assert that there are no more NA values.
## country X1960 X1961
## Length:197 Min. : 3378 Min. : 3433
## Class :character 1st Qu.: 87735 1st Qu.: 92905
## Mode :character Median : 599714 Median : 630788
## Mean : 5012388 Mean : 5282488
## 3rd Qu.: 3130085 3rd Qu.: 3155370
## Max. :126469700 Max. :129268133
## X1962 X1963 X1964
## Min. : 3481 Min. : 3532 Min. : 3586
## 1st Qu.: 98331 1st Qu.: 104988 1st Qu.: 112084
## Median : 659464 Median : 704989 Median : 740609
## Mean : 5440972 Mean : 5612312 Mean : 5786961
## 3rd Qu.: 3250211 3rd Qu.: 3416490 3rd Qu.: 3585464
## Max. :131974143 Max. :134599886 Max. :137205240
## X1965 X1966 X1967
## Min. : 3644 Min. : 3706 Min. : 3771
## 1st Qu.: 119322 1st Qu.: 128565 1st Qu.: 138024
## Median : 774957 Median : 809768 Median : 838449
## Mean : 5964970 Mean : 6126413 Mean : 6288771
## 3rd Qu.: 3666724 3rd Qu.: 3871757 3rd Qu.: 4019906
## Max. :139663053 Max. :141962708 Max. :144201722
## X1968 X1969 X1970
## Min. : 3835 Min. : 3893 Min. : 3941
## 1st Qu.: 147846 1st Qu.: 158252 1st Qu.: 171063
## Median : 890270 Median : 929450 Median : 976471
## Mean : 6451367 Mean : 6624909 Mean : 6799110
## 3rd Qu.: 4158186 3rd Qu.: 4300669 3rd Qu.: 4440047
## Max. :146340364 Max. :148475901 Max. :150922373
## X1971 X1972 X1973
## Min. : 4017 Min. : 4084 Min. : 4146
## 1st Qu.: 181483 1st Qu.: 189492 1st Qu.: 197792
## Median : 1008630 Median : 1048738 Median : 1097293
## Mean : 6980895 Mean : 7165338 Mean : 7349454
## 3rd Qu.: 4595966 3rd Qu.: 4766545 3rd Qu.: 4838297
## Max. :152863831 Max. :154530473 Max. :156034106
## X1974 X1975 X1976
## Min. : 4206 Min. : 4267 Min. : 4334
## 1st Qu.: 205410 1st Qu.: 211746 1st Qu.: 216991
## Median : 1159402 Median : 1223146 Median : 1249829
## Mean : 7540446 Mean : 7731973 Mean : 7936401
## 3rd Qu.: 4906384 3rd Qu.: 5003370 3rd Qu.: 5121118
## Max. :157488074 Max. :159452730 Max. :165583752
## X1977 X1978 X1979
## Min. : 4402 Min. : 4470 Min. : 4539
## 1st Qu.: 222209 1st Qu.: 227605 1st Qu.: 233461
## Median : 1311276 Median : 1340811 Median : 1448185
## Mean : 8145945 Mean : 8361360 Mean : 8583138
## 3rd Qu.: 5227677 3rd Qu.: 5352746 3rd Qu.: 5558850
## Max. :171550310 Max. :177605736 Max. :183785364
## X1980 X1981 X1982
## Min. : 4607 Min. : 4645 Min. : 4681
## 1st Qu.: 242583 1st Qu.: 248948 1st Qu.: 257944
## Median : 1592397 Median : 1673079 Median : 1713060
## Mean : 8808772 Mean : 9049163 Mean : 9295226
## 3rd Qu.: 5815772 3rd Qu.: 6070457 3rd Qu.: 6337995
## Max. :189947471 Max. :199385258 Max. :209435968
## X1983 X1984 X1985
## Min. : 4716 Min. : 4750 Min. : 4782
## 1st Qu.: 274139 1st Qu.: 284939 1st Qu.: 300928
## Median : 1730626 Median : 1749033 Median : 1786125
## Mean : 9545035 Mean : 9798559 Mean : 10058661
## 3rd Qu.: 6619987 3rd Qu.: 6918261 3rd Qu.: 6931780
## Max. :219680098 Max. :229872397 Max. :240414890
## X1986 X1987 X1988
## Min. : 4809 Min. : 4835 Min. : 4859
## 1st Qu.: 307699 1st Qu.: 321125 1st Qu.: 334616
## Median : 1850910 Median : 1953694 Median : 1997011
## Mean : 10323839 Mean : 10595817 Mean : 10873041
## 3rd Qu.: 6935763 3rd Qu.: 6939905 3rd Qu.: 6945022
## Max. :251630158 Max. :263433513 Max. :275570541
## X1989 X1990 X1991
## Min. : 4883 Min. : 4907 Min. : 4946
## 1st Qu.: 347348 1st Qu.: 370152 1st Qu.: 394611
## Median : 1993544 Median : 2066505 Median : 2150230
## Mean : 11154458 Mean : 11438543 Mean : 11725076
## 3rd Qu.: 6885378 3rd Qu.: 6830026 3rd Qu.: 6816589
## Max. :287810747 Max. :300165618 Max. :314689997
## X1992 X1993 X1994
## Min. : 4985 Min. : 5024 Min. : 5062
## 1st Qu.: 418788 1st Qu.: 427457 1st Qu.: 435959
## Median : 2237405 Median : 2322158 Median : 2410297
## Mean : 12010922 Mean : 12296949 Mean : 12582930
## 3rd Qu.: 6820099 3rd Qu.: 7139656 3rd Qu.: 7499901
## Max. :329099365 Max. :343555327 Max. :358232230
## X1995 X1996 X1997
## Min. : 5100 Min. : 5079 Min. : 5055
## 1st Qu.: 461993 1st Qu.: 488136 1st Qu.: 494203
## Median : 2482393 Median : 2522460 Median : 2606125
## Mean : 12871480 Mean : 13165924 Mean : 13463675
## 3rd Qu.: 7708571 3rd Qu.: 7686092 3rd Qu.: 7664316
## Max. :373035157 Max. :388936607 Max. :405031716
## X1998 X1999 X2000
## Min. : 5029 Min. : 5001 Min. : 4971
## 1st Qu.: 498002 1st Qu.: 505144 1st Qu.: 525629
## Median : 2664983 Median : 2737809 Median : 2826647
## Mean : 13762861 Mean : 14063387 Mean : 14369278
## 3rd Qu.: 7784056 3rd Qu.: 8083488 3rd Qu.: 8305564
## Max. :421147610 Max. :437126845 Max. :452999147
## X2001 X2002 X2003
## Min. : 5003 Min. : 5034 Min. : 5064
## 1st Qu.: 550638 1st Qu.: 567531 1st Qu.: 572094
## Median : 2925851 Median : 2928252 Median : 2944934
## Mean : 14705743 Mean : 15043381 Mean : 15384513
## 3rd Qu.: 8421967 3rd Qu.: 8448628 3rd Qu.: 8622732
## Max. :473204511 Max. :493402140 Max. :513607776
## X2004 X2005 X2006
## Min. : 5090 Min. : 5111 Min. : 5135
## 1st Qu.: 593900 1st Qu.: 620511 1st Qu.: 632659
## Median : 2994356 Median : 3057923 Median : 3269963
## Mean : 15730299 Mean : 16080262 Mean : 16435872
## 3rd Qu.: 8999112 3rd Qu.: 9394001 3rd Qu.: 9689807
## Max. :533892175 Max. :554367818 Max. :575050081
## X2007 X2008 X2009
## Min. : 5155 Min. : 5172 Min. : 5189
## 1st Qu.: 645172 1st Qu.: 658017 1st Qu.: 671085
## Median : 3432024 Median : 3589395 Median : 3652338
## Mean : 16797484 Mean : 17164898 Mean : 17533997
## 3rd Qu.: 9803381 3rd Qu.: 10210317 3rd Qu.: 10518289
## Max. :595731464 Max. :616552722 Max. :637533976
## X2010 X2011
## Min. : 5206 Min. : 5233
## 1st Qu.: 684302 1st Qu.: 698009
## Median : 3676309 Median : 3664664
## Mean : 17904811 Mean : 18276297
## 3rd Qu.: 10618596 3rd Qu.: 10731193
## Max. :658557734 Max. :678796403