January 29, 2016
This presentation assumes that you know, generally, what R is, and that you have done the following…
Packages can contain…
# Example of functions and data in a package.
# Each statement sits on its own line so that the leading comment no longer
# swallows the calls that follow it.
library(MASS)  # Functions and datasets for 'Modern Applied Statistics with S'
corresp(caith) # Correspondence analysis function and eye color data
## First canonical correlation(s): 0.4463684 ## ## Row scores: ## blue light medium dark ## -0.89679252 -0.98731818 0.07530627 1.57434710 ## ## Column scores: ## fair red medium dark black ## -1.21871379 -0.52257500 -0.09414671 1.31888486 2.45176017
ggplot2 (visualization package)
options("repos"="http://cran.stat.sfu.ca/") # Set source (optional)
install.packages('ggplot2') # Choose package to install
library(ggplot2) # Load package
# Other useful packages to install from CRAN.
# One call per line: when collapsed onto a single line, every call after the
# first `#` becomes part of a comment and never runs.
install.packages("dplyr")     # Data manipulation
install.packages("devtools")  # Good for interaction with GitHub
install.packages("rmarkdown") # Authoring dynamic documents in R
install.packages("lubridate") # Working with dates and times
install.packages("foreign")   # Reading in SAS, SPSS, STATA data sets
install.packages("swirl")     # Interactive learning in R
# Install Bioconductor core and recommended packages.
# The installer script is fetched over https in both snippets so the code that
# gets sourced cannot be tampered with in transit.
# NOTE(review): biocLite() has since been replaced by BiocManager::install()
# on current Bioconductor releases -- confirm which installer applies to the
# R version in use.
source("https://bioconductor.org/biocLite.R")
biocLite()

# Install additional packages
source("https://bioconductor.org/biocLite.R")
biocLite("CausalR") # Causal reasoning on Biological Networks
library(devtools) # Load devtools library

# Installing packages straight from GitHub, given as "user/repository".
# One call per line so the trailing comments do not swallow the next call.
install_github("ramnathv/slidify")          # Install Slidify, the `PowerPoint` of R
install_github("ramnathv/slidifyLibraries") # Slidify themes and supporting files
install_github("quandl/quandl-r")           # Open Data package
library(xlsx)

# Read in first worksheet from workbook 'spreadsheet.xlsx'.
# read.xlsx() needs either a sheet index or a sheet name; without one it stops
# with an error, so the first sheet is requested explicitly.
df <- read.xlsx('spreadsheet.xlsx', sheetIndex = 1)

# Read in worksheet named 'second'
df <- read.xlsx('spreadsheet.xlsx', sheetName = 'second')
library(foreign) # Package to import Stata formatted data

# Read the Stata file 'data.dta' into a data frame.
# Kept on its own line so the comment above does not swallow the call.
df <- read.dta('data.dta')
library(Hmisc) # Package with SAS and SPSS import functions

# One import per line so that each call actually runs instead of being part of
# the preceding comment.
df.sas <- sasxport.get('data.xpt') # Importing from SAS transport format
df.spss <- spss.get('data.por')    # Importing from SPSS transport format
# Line chart of FB closing prices from the Quandl 'WIKI/FB' dataset.
# Quandl() is called through its namespace because the package is installed
# earlier in this walkthrough but never attached with library().
# Columns 1 and 5 are kept; aes() below refers to them as Date and Close.
ggplot(
  data = Quandl::Quandl('WIKI/FB', start_date = '2005-01-01')[, c(1, 5)],
  aes(x = Date, y = Close)
) +
  geom_line(color = '#FAB521') +
  xlab('Date') +
  ylab('Closing Price') +
  ggtitle('FB Closing Prices')
char
to date
)dplyr
tidyr
lubridate
data.table
magrittr
*dplyr
ggplot2
, devtools
, and more)dplyr
covers…
filter: for subsetting rows
select: for subsetting variables (columns)
arrange: for re-ordering rows
mutate: for adding new columns
summarise or summarize: for reducing each group to a smaller number of summary statistics
dplyr
dplyr helps us apply the split-apply-combine method with ease
dplyr
# Two separate statements: written on a single line without a separator they
# do not parse.
install.packages("nycflights13") # Install the data package (needed only once)
library(nycflights13)            # Load the package
Package nycflights13 has data about all flights that departed NYC in 2013, comprising 5 datasets:
flights: Flights data
airlines: Airline names
airports: Airport metadata
planes: Plane metadata
weather: Hourly weather data
dplyr
dim(flights)
## [1] 336776 16
colSums(is.na(flights)) # How many NA per variable?
## year month day dep_time dep_delay arr_time arr_delay ## 0 0 0 8255 8255 8713 9430 ## carrier tailnum flight origin dest air_time distance ## 0 0 0 0 0 9430 0 ## hour minute ## 8255 8255
head(flights)
## Source: local data frame [6 x 16] ## ## year month day dep_time dep_delay arr_time arr_delay carrier tailnum ## (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) ## 1 2013 1 1 517 2 830 11 UA N14228 ## 2 2013 1 1 533 4 850 20 UA N24211 ## 3 2013 1 1 542 2 923 33 AA N619AA ## 4 2013 1 1 544 -1 1004 -18 B6 N804JB ## 5 2013 1 1 554 -6 812 -25 DL N668DN ## 6 2013 1 1 554 -4 740 12 UA N39463 ## Variables not shown: flight (int), origin (chr), dest (chr), air_time ## (dbl), distance (dbl), hour (dbl), minute (dbl)
dplyr
: filter
filter(flights, carrier == 'UA')
## Source: local data frame [58,665 x 16] ## ## year month day dep_time dep_delay arr_time arr_delay carrier tailnum ## (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) ## 1 2013 1 1 517 2 830 11 UA N14228 ## 2 2013 1 1 533 4 850 20 UA N24211 ## 3 2013 1 1 554 -4 740 12 UA N39463 ## 4 2013 1 1 558 -2 924 7 UA N29129 ## 5 2013 1 1 558 -2 923 -14 UA N53441 ## 6 2013 1 1 559 -1 854 -8 UA N76515 ## 7 2013 1 1 607 0 858 -17 UA N53442 ## 8 2013 1 1 611 11 945 14 UA N532UA ## 9 2013 1 1 623 -4 933 1 UA N459UA ## 10 2013 1 1 628 -2 1016 29 UA N33289 ## .. ... ... ... ... ... ... ... ... ... ## Variables not shown: flight (int), origin (chr), dest (chr), air_time ## (dbl), distance (dbl), hour (dbl), minute (dbl)
dplyr
: filter
filter(flights, arr_delay > 5)
## Source: local data frame [109,831 x 16] ## ## year month day dep_time dep_delay arr_time arr_delay carrier tailnum ## (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) ## 1 2013 1 1 517 2 830 11 UA N14228 ## 2 2013 1 1 533 4 850 20 UA N24211 ## 3 2013 1 1 542 2 923 33 AA N619AA ## 4 2013 1 1 554 -4 740 12 UA N39463 ## 5 2013 1 1 555 -5 913 19 B6 N516JB ## 6 2013 1 1 558 -2 753 8 AA N3ALAA ## 7 2013 1 1 558 -2 924 7 UA N29129 ## 8 2013 1 1 559 -1 941 31 AA N3DUAA ## 9 2013 1 1 600 0 837 12 MQ N542MQ ## 10 2013 1 1 602 -3 821 16 MQ N730MQ ## .. ... ... ... ... ... ... ... ... ... ## Variables not shown: flight (int), origin (chr), dest (chr), air_time ## (dbl), distance (dbl), hour (dbl), minute (dbl)
dplyr
: select
select(flights, year:carrier)
## Source: local data frame [336,776 x 8] ## ## year month day dep_time dep_delay arr_time arr_delay carrier ## (int) (int) (int) (int) (dbl) (int) (dbl) (chr) ## 1 2013 1 1 517 2 830 11 UA ## 2 2013 1 1 533 4 850 20 UA ## 3 2013 1 1 542 2 923 33 AA ## 4 2013 1 1 544 -1 1004 -18 B6 ## 5 2013 1 1 554 -6 812 -25 DL ## 6 2013 1 1 554 -4 740 12 UA ## 7 2013 1 1 555 -5 913 19 B6 ## 8 2013 1 1 557 -3 709 -14 EV ## 9 2013 1 1 557 -3 838 -8 B6 ## 10 2013 1 1 558 -2 753 8 AA ## .. ... ... ... ... ... ... ... ...
dplyr
: arrange
arrange(flights, desc(arr_delay))
## Source: local data frame [336,776 x 16] ## ## year month day dep_time dep_delay arr_time arr_delay carrier tailnum ## (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) ## 1 2013 1 9 641 1301 1242 1272 HA N384HA ## 2 2013 6 15 1432 1137 1607 1127 MQ N504MQ ## 3 2013 1 10 1121 1126 1239 1109 MQ N517MQ ## 4 2013 9 20 1139 1014 1457 1007 AA N338AA ## 5 2013 7 22 845 1005 1044 989 MQ N665MQ ## 6 2013 4 10 1100 960 1342 931 DL N959DL ## 7 2013 3 17 2321 911 135 915 DL N927DA ## 8 2013 7 22 2257 898 121 895 DL N6716C ## 9 2013 12 5 756 896 1058 878 AA N5DMAA ## 10 2013 5 3 1133 878 1250 875 MQ N523MQ ## .. ... ... ... ... ... ... ... ... ... ## Variables not shown: flight (int), origin (chr), dest (chr), air_time ## (dbl), distance (dbl), hour (dbl), minute (dbl)
dplyr
: mutate
mutate(flights, arr_status = ifelse(arr_delay > 0, 1, 0))
## Source: local data frame [336,776 x 17] ## ## year month day dep_time dep_delay arr_time arr_delay carrier tailnum ## (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) ## 1 2013 1 1 517 2 830 11 UA N14228 ## 2 2013 1 1 533 4 850 20 UA N24211 ## 3 2013 1 1 542 2 923 33 AA N619AA ## 4 2013 1 1 544 -1 1004 -18 B6 N804JB ## 5 2013 1 1 554 -6 812 -25 DL N668DN ## 6 2013 1 1 554 -4 740 12 UA N39463 ## 7 2013 1 1 555 -5 913 19 B6 N516JB ## 8 2013 1 1 557 -3 709 -14 EV N829AS ## 9 2013 1 1 557 -3 838 -8 B6 N593JB ## 10 2013 1 1 558 -2 753 8 AA N3ALAA ## .. ... ... ... ... ... ... ... ... ... ## Variables not shown: flight (int), origin (chr), dest (chr), air_time ## (dbl), distance (dbl), hour (dbl), minute (dbl), arr_status (dbl)
dplyr
: summarise
# Mean arrival delay for each origin airport; flights with a missing
# arr_delay are left out of the mean via na.rm.
flights %>%
  group_by(origin) %>%
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE)
  )
## Source: local data frame [3 x 2] ## ## origin avg_delay ## (chr) (dbl) ## 1 EWR 9.107055 ## 2 JFK 5.551481 ## 3 LGA 5.783488
dplyr: Chaining
df <- flights
del_percent <- df %>%
  na.omit() %>%
  mutate(dep_status = ifelse(dep_delay > 0, 1, 0)) %>%
  mutate(arr_status = ifelse(arr_delay > 0, 1, 0)) %>%
  group_by(carrier) %>%
  summarise(per_dep_delay = mean(dep_status), per_arr_delay = mean(arr_status)) %>%
  arrange(desc(per_dep_delay))
dplyr: Chaining
del_percent
## Source: local data frame [16 x 3] ## ## carrier per_dep_delay per_arr_delay ## (chr) (dbl) (dbl) ## 1 WN 0.5425938 0.4403853 ## 2 FL 0.5187402 0.5968504 ## 3 F9 0.4992658 0.5756241 ## 4 UA 0.4694368 0.3845834 ## 5 EV 0.4495578 0.4790639 ## 6 VX 0.4331509 0.3412823 ## 7 YV 0.4264706 0.4742647 ## 8 9E 0.4036082 0.3837747 ## 9 B6 0.3954190 0.4368073 ## 10 DL 0.3186453 0.3443913 ## 11 MQ 0.3181691 0.4670288 ## 12 AS 0.3173484 0.2665726 ## 13 AA 0.3163051 0.3351175 ## 14 OO 0.3103448 0.3448276 ## 15 US 0.2401291 0.3705814 ## 16 HA 0.2017544 0.2836257
"Using R is a bit akin to smoking. The beginning is difficult, one may get headaches and even gag the first few times. But in the long run, it becomes pleasurable and even addictive. Yet, deep down, for those willing to be honest, there is something not fully healthy in it."