Load packages
suppressWarnings(library(readr))
suppressWarnings(library(ggplot2))
suppressMessages(library(cowplot))
suppressMessages(library(tidyverse))
Check and set working directory
getwd() # report the directory R is currently working in
## [1] "/Users/codethedral/Google Drive/GitHub/R-intro-divvy/analysis"
# NOTE(review): the paths below are absolute and machine-specific, so the
# setwd() call fails on any computer without the same directory layout.
#setwd("C:/Users/jorge/Google Drive/GitHub/R-intro-divvy/analysis")#surface (disabled alternative path)
setwd("/Users/codethedral/Google Drive/GitHub/R-intro-divvy/analysis")#mac
Version of R
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.5 purrr_0.2.4
## [5] tidyr_0.8.1 tibble_1.4.2 tidyverse_1.2.1 cowplot_0.9.3
## [9] ggplot2_2.2.1 readr_1.1.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.17 cellranger_1.1.0 pillar_1.2.2 compiler_3.5.1
## [5] plyr_1.8.4 bindr_0.1.1 tools_3.5.1 digest_0.6.15
## [9] lubridate_1.7.4 jsonlite_1.5 lattice_0.20-35 evaluate_0.10.1
## [13] gtable_0.2.0 nlme_3.1-137 pkgconfig_2.0.1 rlang_0.2.0
## [17] psych_1.8.4 cli_1.0.0 rstudioapi_0.7 parallel_3.5.1
## [21] yaml_2.1.19 haven_1.1.2 bindrcpp_0.2.2 xml2_1.2.0
## [25] httr_1.3.1 knitr_1.20 hms_0.4.2 rprojroot_1.3-2
## [29] grid_3.5.1 tidyselect_0.2.4 glue_1.2.0 R6_2.2.2
## [33] readxl_1.1.0 foreign_0.8-70 rmarkdown_1.9 modelr_0.1.2
## [37] reshape2_1.4.3 magrittr_1.5 backports_1.1.2 scales_0.5.0
## [41] htmltools_0.3.6 rvest_0.3.2 mnormt_1.5-5 assertthat_0.2.0
## [45] colorspace_1.3-2 stringi_1.2.2 lazyeval_0.2.1 munsell_0.4.3
## [49] broom_0.4.5 crayon_1.3.4
ls()
## character(0)
Clean environment.
# Remove every object that ls() reports, i.e. empty the current workspace.
rm(list = ls())
# List the files in the working directory whose names end in "csv".
Sys.glob("*csv")
## [1] "Divvy_Stations_2016_Q1Q2.csv" "Divvy_Stations_2016_Q3.csv"
## [3] "Divvy_Stations_2016_Q4.csv" "Divvy_Stations_2017_Q1Q2.csv"
## [5] "Divvy_Stations_2017_Q3Q4.csv" "Divvy_Trips_2016_04.csv"
## [7] "Divvy_Trips_2016_05.csv" "Divvy_Trips_2016_06.csv"
## [9] "Divvy_Trips_2016_Q1.csv" "Divvy_Trips_2016_Q3.csv"
## [11] "Divvy_Trips_2016_Q4.csv" "Divvy_Trips_2017_Q1.csv"
## [13] "Divvy_Trips_2017_Q2.csv" "Divvy_Trips_2017_Q3.csv"
## [15] "Divvy_Trips_2017_Q4.csv"
Read one of the csv files
# Preview the columns of the 2017 Q1-Q2 station file without keeping the
# data in the workspace.
read.csv("Divvy_Stations_2017_Q1Q2.csv") %>%
  glimpse()
## Observations: 582
## Variables: 7
## $ id <int> 456, 101, 109, 21, 80, 346, 341, 480, 444, 511, 37...
## $ name <fct> 2112 W Peterson Ave, 63rd St Beach, 900 W Harrison...
## $ city <fct> Chicago, Chicago, Chicago, Chicago, Chicago, Chica...
## $ latitude <dbl> 41.99118, 41.78102, 41.87468, 41.87773, 41.88157, ...
## $ longitude <dbl> -87.68359, -87.57612, -87.65002, -87.65479, -87.65...
## $ dpcapacity <int> 15, 23, 19, 15, 19, 15, 35, 15, 11, 15, 35, 15, 15...
## $ online_date <fct> 2/10/2015 14:04:42, 7/16/2013 01:27:50, 7/18/2013 ...
Import station data
# Load the 2017 Q3-Q4 station table, keeping text columns as character
# vectors rather than factors.
stations <- read.csv(
  file = "Divvy_Stations_2017_Q3Q4.csv",
  stringsAsFactors = FALSE
)
ls()
## [1] "stations"
class(stations)
## [1] "data.frame"
Inspect station data.
nrow(stations)
## [1] 585
ncol(stations)
## [1] 8
head(stations)
## id name city latitude longitude dpcapacity
## 1 2 Buckingham Fountain Chicago 41.87639 -87.62033 27
## 2 3 Shedd Aquarium Chicago 41.86723 -87.61536 55
## 3 4 Burnham Harbor Chicago 41.85627 -87.61335 23
## 4 5 State St & Harrison St Chicago 41.87405 -87.62772 23
## 5 6 Dusable Harbor Chicago 41.88504 -87.61279 39
## 6 7 Field Blvd & South Water St Chicago 41.88635 -87.61752 19
## online_date X
## 1 6/10/2013 10:43 NA
## 2 6/10/2013 10:44 NA
## 3 6/10/2013 10:46 NA
## 4 6/10/2013 10:46 NA
## 5 6/10/2013 11:18 NA
## 6 6/19/2013 10:00 NA
summary(stations)
## id name city latitude
## Min. : 2.0 Length:585 Length:585 Min. :41.74
## 1st Qu.:163.0 Class :character Class :character 1st Qu.:41.85
## Median :315.0 Mode :character Mode :character Median :41.89
## Mean :317.3 Mean :41.89
## 3rd Qu.:475.0 3rd Qu.:41.93
## Max. :626.0 Max. :42.06
## longitude dpcapacity online_date X
## Min. :-87.80 Min. : 0.00 Length:585 Mode:logical
## 1st Qu.:-87.68 1st Qu.:15.00 Class :character NA's:585
## Median :-87.65 Median :15.00 Mode :character
## Mean :-87.66 Mean :17.46
## 3rd Qu.:-87.63 3rd Qu.:19.00
## Max. :-87.55 Max. :55.00
Inspect the data in more detail:
# Class of each column in the station table.
sapply(X = stations, FUN = class)
## id name city latitude longitude dpcapacity
## "integer" "character" "character" "numeric" "numeric" "integer"
## online_date X
## "character" "logical"
# Memory used by the table, first in bytes and then in kilobytes.
object.size(x = stations)
## 108264 bytes
print(x = object.size(x = stations), units = "Kb")
## 105.7 Kb
Examining dpcapacity
# Pull the dpcapacity column out as a plain vector and summarise it.
x <- stations[["dpcapacity"]]
class(x)
## [1] "integer"
length(x)
## [1] 585
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 15.00 15.00 17.46 19.00 55.00
# Frequency of each distinct dpcapacity value.
table(x)
## x
## 0 11 12 15 16 19 20 23 27 28 31 35 39 43 47 55
## 2 106 1 256 7 106 2 47 20 1 14 4 12 1 4 2
stations$name[1:4]
## [1] "Buckingham Fountain" "Shedd Aquarium"
## [3] "Burnham Harbor" "State St & Harrison St"
Select first 4 rows of “name” column:
stations[1:4, 2] # column picked by position
## [1] "Buckingham Fountain" "Shedd Aquarium"
## [3] "Burnham Harbor" "State St & Harrison St"
stations[1:4, "name"] # same selection as the line above, but by column name
## [1] "Buckingham Fountain" "Shedd Aquarium"
## [3] "Burnham Harbor" "State St & Harrison St"
Select first 4 rows and multiple columns:
stations[1:4, c(2, 3, 4)] # columns picked by position
## name city latitude
## 1 Buckingham Fountain Chicago 41.87639
## 2 Shedd Aquarium Chicago 41.86723
## 3 Burnham Harbor Chicago 41.85627
## 4 State St & Harrison St Chicago 41.87405
stations[1:4, c("name","city","latitude")] # different syntax (names instead of positions), same output
## name city latitude
## 1 Buckingham Fountain Chicago 41.87639
## 2 Shedd Aquarium Chicago 41.86723
## 3 Burnham Harbor Chicago 41.85627
## 4 State St & Harrison St Chicago 41.87405
Getting the row and column names:
colnames(stations)
## [1] "id" "name" "city" "latitude" "longitude"
## [6] "dpcapacity" "online_date" "X"
#rownames(stations)
Take an even closer look at “dpcapacity”
# Rows whose dpcapacity is exactly 0.
subset(stations,dpcapacity == 0)
## id name city latitude longitude dpcapacity
## 541 581 Commercial Ave & 83rd St Chicago 41.74461 -87.55120 0
## 542 582 Phillips Ave & 83rd St Chicago 41.74469 -87.56607 0
## online_date X
## 541 5/27/2016 11:33 NA
## 542 5/27/2016 11:34 NA
# Rows whose dpcapacity is 40 or more.
subset(stations,dpcapacity >= 40)
## id name city latitude longitude dpcapacity
## 2 3 Shedd Aquarium Chicago 41.86723 -87.61536 55
## 32 35 Streeter Dr & Grand Ave Chicago 41.89228 -87.61204 47
## 40 43 Michigan Ave & Washington St Chicago 41.88389 -87.62465 43
## 79 90 Millennium Park Chicago 41.88103 -87.62408 47
## 86 97 Field Museum Chicago 41.86531 -87.61787 55
## 174 192 Canal St & Adams St Chicago 41.87926 -87.63990 47
## 177 195 Columbus Dr & Randolph St Chicago 41.88473 -87.61952 47
## online_date X
## 2 6/10/2013 10:44 NA
## 32 6/22/2013 21:12 NA
## 40 6/25/2013 10:57 NA
## 79 6/26/2013 19:51 NA
## 86 6/30/2013 13:25 NA
## 174 8/6/2013 13:27 NA
## 177 8/7/2013 14:11 NA
filter(stations, dpcapacity==0) # same rows as subset(), via dplyr's filter(); the row names are renumbered from 1
## id name city latitude longitude dpcapacity
## 1 581 Commercial Ave & 83rd St Chicago 41.74461 -87.55120 0
## 2 582 Phillips Ave & 83rd St Chicago 41.74469 -87.56607 0
## online_date X
## 1 5/27/2016 11:33 NA
## 2 5/27/2016 11:34 NA
filter
## function (.data, ...)
## {
## UseMethod("filter")
## }
## <bytecode: 0x7ff6b05f20c8>
## <environment: namespace:dplyr>
filter(stations, dpcapacity >= 40)
## id name city latitude longitude dpcapacity
## 1 3 Shedd Aquarium Chicago 41.86723 -87.61536 55
## 2 35 Streeter Dr & Grand Ave Chicago 41.89228 -87.61204 47
## 3 43 Michigan Ave & Washington St Chicago 41.88389 -87.62465 43
## 4 90 Millennium Park Chicago 41.88103 -87.62408 47
## 5 97 Field Museum Chicago 41.86531 -87.61787 55
## 6 192 Canal St & Adams St Chicago 41.87926 -87.63990 47
## 7 195 Columbus Dr & Randolph St Chicago 41.88473 -87.61952 47
## online_date X
## 1 6/10/2013 10:44 NA
## 2 6/22/2013 21:12 NA
## 3 6/25/2013 10:57 NA
## 4 6/26/2013 19:51 NA
## 5 6/30/2013 13:25 NA
## 6 8/6/2013 13:27 NA
## 7 8/7/2013 14:11 NA
Let’s order the data set from largest to smallest dpcapacity
# Row positions sorted by dpcapacity, largest first; indexing with them
# yields a reordered copy of the table.
rows <- order(stations[["dpcapacity"]], decreasing = TRUE)
stations2 <- stations[rows, ]
head(stations2)
## id name city latitude longitude dpcapacity
## 2 3 Shedd Aquarium Chicago 41.86723 -87.61536 55
## 86 97 Field Museum Chicago 41.86531 -87.61787 55
## 32 35 Streeter Dr & Grand Ave Chicago 41.89228 -87.61204 47
## 79 90 Millennium Park Chicago 41.88103 -87.62408 47
## 174 192 Canal St & Adams St Chicago 41.87926 -87.63990 47
## 177 195 Columbus Dr & Randolph St Chicago 41.88473 -87.61952 47
## online_date X
## 2 6/10/2013 10:44 NA
## 86 6/30/2013 13:25 NA
## 32 6/22/2013 21:12 NA
## 79 6/26/2013 19:51 NA
## 174 8/6/2013 13:27 NA
## 177 8/7/2013 14:11 NA
tail(stations2)
## id name city latitude longitude
## 553 593 Halsted St & 59th St Chicago 41.78754 -87.64487
## 567 607 Cuyler Ave & Augusta St Oak Park 41.89817 -87.78306
## 568 608 Humphrey Ave & Ontario St Oak Park 41.89025 -87.77828
## 573 613 Wisconsin Ave & Madison St (Temp) Oak Park 41.87990 -87.80391
## 541 581 Commercial Ave & 83rd St Chicago 41.74461 -87.55120
## 542 582 Phillips Ave & 83rd St Chicago 41.74469 -87.56607
## dpcapacity online_date X
## 553 11 5/27/2016 11:41 NA
## 567 11 6/23/2016 12:22 NA
## 568 11 6/23/2016 12:23 NA
## 573 11 6/23/2016 12:26 NA
## 541 0 5/27/2016 11:33 NA
## 542 0 5/27/2016 11:34 NA
Take a closer look at the “city” column
# As read from the CSV, city is plain text, so summary() only reports its
# length and type.
x <- stations[["city"]]
class(x)
## [1] "character"
summary(x)
## Length Class Mode
## 585 character character
# Converting to a factor makes summary() count the rows per distinct value.
x <- factor(stations[["city"]])
class(x)
## [1] "factor"
summary(x)
## Chicago Chicago Evanston Oak Park
## 520 41 11 13
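Chicago shows up twice, even as a factor, because some entries are stored with a trailing space (“Chicago ”), which R treats as a different value from “Chicago”.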
Fix up.
# Select the rows that need to be fixed: any city value that differs from
# its whitespace-trimmed form (in this file, the "Chicago " entries).
# Comparing against trimws() avoids hard-coding one specific misspelling,
# so other values with stray leading/trailing whitespace are repaired too.
rows <- which(stations$city != trimws(stations$city))
# Overwrite only the affected rows with their trimmed values.
stations[rows, "city"] <- trimws(stations[rows, "city"])
summary(stations$city)
## Length Class Mode
## 585 character character
Now all the entries under “Chicago” are the same.
# Store city as a factor now that its values have been cleaned up.
stations[["city"]] <- factor(stations[["city"]])
summary(stations$city)
## Chicago Evanston Oak Park
## 561 11 13
What’s a factor?
x <- stations$city
attributes(x)
## $levels
## [1] "Chicago" "Evanston" "Oak Park"
##
## $class
## [1] "factor"
unclass(x)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [456] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [561] 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 2 1
## attr(,"levels")
## [1] "Chicago" "Evanston" "Oak Park"
Save your environment:
# Write every object in the workspace to a single file.
save.image(file = "divvy_analysis.RData")
# Restore those objects later with:
load(file = "divvy_analysis.RData")
Load the Divvy trip data:
# Load the 2017 Q4 trip records with base R, keeping text columns as
# character vectors.
trips <- read.csv(
  file = "Divvy_Trips_2017_Q4.csv",
  stringsAsFactors = FALSE
)
nrow(trips)
## [1] 669239
ncol(trips)
## [1] 12
print(object.size(trips), units = "Mb")
## 59.2 Mb
Let’s read the CSV file with the read_csv function instead; it is faster than read.csv.
# Re-read the same trip file with read_csv(); the result replaces the
# data frame loaded above.
trips <- read_csv(file = "Divvy_Trips_2017_Q4.csv")
## Parsed with column specification:
## cols(
## trip_id = col_integer(),
## start_time = col_character(),
## end_time = col_character(),
## bikeid = col_integer(),
## tripduration = col_integer(),
## from_station_id = col_integer(),
## from_station_name = col_character(),
## to_station_id = col_integer(),
## to_station_name = col_character(),
## usertype = col_character(),
## gender = col_character(),
## birthyear = col_integer()
## )
class(trips)
## [1] "tbl_df" "tbl" "data.frame"
Converting the tibble trips data set into a data frame.
# Convert the tibble returned by read_csv() into a plain data frame.
# as.data.frame() is the supported conversion; overwriting the class
# attribute by hand bypasses the method and depends on tibble internals.
trips <- as.data.frame(trips)
vignette(package = "readr")
vignette("readr")
## starting httpd help server ... done
How do I know which packages are already installed?
rownames(installed.packages())
## [1] "abind" "assertthat" "backports" "base"
## [5] "base64enc" "BH" "bindr" "bindrcpp"
## [9] "bitops" "boot" "broom" "callr"
## [13] "caret" "caTools" "cellranger" "class"
## [17] "cli" "clipr" "cluster" "codetools"
## [21] "colorspace" "compiler" "cowplot" "crayon"
## [25] "curl" "CVST" "data.table" "DataExplorer"
## [29] "datasets" "DBI" "dbplyr" "ddalpha"
## [33] "DEoptimR" "dichromat" "digest" "dimRed"
## [37] "DMwR" "doParallel" "dplyr" "DRR"
## [41] "evaluate" "expsmooth" "extraDistr" "fma"
## [45] "forcats" "foreach" "forecast" "foreign"
## [49] "formatR" "fpp" "fracdiff" "gains"
## [53] "gdata" "geometry" "geosphere" "GGally"
## [57] "ggfortify" "ggmap" "ggplot2" "ggrepel"
## [61] "glue" "gower" "gplots" "graphics"
## [65] "grDevices" "grid" "gridBase" "gridExtra"
## [69] "gtable" "gtools" "haven" "highr"
## [73] "hms" "htmltools" "htmlwidgets" "hts"
## [77] "httpuv" "httr" "igraph" "imputeTS"
## [81] "inline" "ipred" "ISLR" "iterators"
## [85] "jpeg" "jsonlite" "kernlab" "KernSmooth"
## [89] "knitr" "labeling" "later" "lattice"
## [93] "lava" "lazyeval" "leaps" "lmtest"
## [97] "locfit" "lubridate" "magic" "magrittr"
## [101] "mapdata" "mapproj" "maps" "markdown"
## [105] "MASS" "Matrix" "matrixcalc" "methods"
## [109] "mgcv" "mime" "mnormt" "ModelMetrics"
## [113] "modelr" "MuMIn" "munsell" "NbClust"
## [117] "NCmisc" "networkD3" "nlme" "nnet"
## [121] "numDeriv" "openssl" "packrat" "parallel"
## [125] "pillar" "pkgconfig" "PKI" "plogr"
## [129] "pls" "plyr" "png" "poLCA"
## [133] "praise" "prettyunits" "pROC" "processx"
## [137] "prodlim" "proftools" "progress" "promises"
## [141] "prophet" "proto" "psych" "purrr"
## [145] "qqplotr" "quadprog" "quantmod" "R6"
## [149] "RColorBrewer" "Rcpp" "RcppEigen" "RcppRoll"
## [153] "RCurl" "reader" "readr" "readxl"
## [157] "recipes" "rematch" "reprex" "reshape"
## [161] "reshape2" "RgoogleMaps" "rJava" "rjson"
## [165] "RJSONIO" "rlang" "rmarkdown" "robustbase"
## [169] "ROCR" "rpart" "rprojroot" "rsconnect"
## [173] "rstan" "rstudioapi" "rvest" "scales"
## [177] "scatterplot3d" "selectr" "sfsmisc" "shiny"
## [181] "sourcetools" "sp" "SparseM" "spatial"
## [185] "splines" "SQUAREM" "StanHeaders" "stats"
## [189] "stats4" "stinepack" "stringi" "stringr"
## [193] "survival" "tcltk" "testthat" "tibble"
## [197] "tidyr" "tidyselect" "tidyverse" "timeDate"
## [201] "tools" "treemap" "TSA" "tseries"
## [205] "TTR" "urca" "uroot" "utf8"
## [209] "utils" "varhandle" "viridisLite" "whisker"
## [213] "withr" "xlsx" "xlsxjars" "xml2"
## [217] "xtable" "xts" "yaml" "zoo"
Where do the packages live?
.libPaths()
## [1] "/Library/Frameworks/R.framework/Versions/3.5/Resources/library"
Reviewing the trip data:
nrow(trips)
## [1] 669239
ncol(trips)
## [1] 12
head(trips)
## trip_id start_time end_time bikeid tripduration
## 1 17536701 12/31/2017 23:58 1/1/2018 0:03 3304 284
## 2 17536700 12/31/2017 23:54 1/1/2018 0:18 5975 1402
## 3 17536699 12/31/2017 23:54 1/1/2018 0:18 4906 1441
## 4 17536698 12/31/2017 23:48 12/31/2017 23:53 5667 315
## 5 17536697 12/31/2017 23:42 12/31/2017 23:47 5353 272
## 6 17536696 12/31/2017 23:41 12/31/2017 23:51 5840 589
## from_station_id from_station_name to_station_id
## 1 159 Claremont Ave & Hirsch St 69
## 2 145 Mies van der Rohe Way & Chestnut St 145
## 3 145 Mies van der Rohe Way & Chestnut St 145
## 4 340 Clark St & Wrightwood Ave 143
## 5 240 Sheridan Rd & Irving Park Rd 245
## 6 93 Sheffield Ave & Willow St 343
## to_station_name usertype gender birthyear
## 1 Damen Ave & Pierce Ave Subscriber Male 1988
## 2 Mies van der Rohe Way & Chestnut St Customer <NA> NA
## 3 Mies van der Rohe Way & Chestnut St Customer <NA> NA
## 4 Sedgwick St & Webster Ave Subscriber Male 1963
## 5 Clarendon Ave & Junior Ter Subscriber Male 1977
## 6 Racine Ave & Wrightwood Ave Subscriber Male 1988
summary(trips)
## trip_id start_time end_time bikeid
## Min. :16734066 Length:669239 Length:669239 Min. : 1
## 1st Qu.:16932824 Class :character Class :character 1st Qu.:1966
## Median :17130688 Mode :character Mode :character Median :3905
## Mean :17132520 Mean :3717
## 3rd Qu.:17334366 3rd Qu.:5590
## Max. :17536701 Max. :6471
##
## tripduration from_station_id from_station_name to_station_id
## Min. : 60.0 Min. : 2 Length:669239 Min. : 2.0
## 1st Qu.: 347.0 1st Qu.: 76 Class :character 1st Qu.: 76.0
## Median : 567.0 Median :161 Mode :character Median :157.0
## Mean : 779.4 Mean :184 Mean :183.3
## 3rd Qu.: 940.0 3rd Qu.:280 3rd Qu.:275.0
## Max. :85466.0 Max. :626 Max. :626.0
##
## to_station_name usertype gender birthyear
## Length:669239 Length:669239 Length:669239 Min. :1918
## Class :character Class :character Class :character 1st Qu.:1975
## Mode :character Mode :character Mode :character Median :1985
## Mean :1982
## 3rd Qu.:1990
## Max. :2004
## NA's :78827