Importing Data
TidyR
StringR
Lubridate
Data Structures
dplyr
Joins
Visualization (ggplot2)
Control Statements & Iteration
Function
Apply Family
Miscellaneous
library(DT)
library(tidyverse)
library(tidyr)
library(stringr)
library(lubridate)
library(nycflights13)
library(GGally)
baseR
df <- read.csv("data/flights.csv")
readr
library(readr)
df <- read_csv("data/flights.csv")
data.table
library(data.table)
df <- fread("data/flights.csv")
library(readxl)
excel_sheets("data/aircraft.xlsx")
## [1] "Bombers" "Fighters" "Trainers"
## [4] "UAV_Drones" "Tankers_Transporters"
df <- read_excel("data/aircraft.xlsx", sheet = "Bombers")
cases <- fread("data/cases.csv", header = TRUE)
datatable(cases)
gather
library(tidyr)
# gather() reshapes wide data into long (key/value) format:
# Year - name of the new key column that holds the old column names
# population - name of the new value column that holds the cell values
# 2:4 - positions of the columns to collapse into key/value pairs
df <-
cases %>%
gather(Year, population, 2:4)
datatable(df)
# Alternate Code
df <- cases %>% gather(Year, population, '2011':'2013')
df <- cases %>% gather(Year, population, '2011', '2012', '2013')
df <- cases %>% gather(Year, population, 2:4)
df <- cases %>% gather(Year, population, -country)
spread
# Year - column to use as column names
# population - column to use as column values
df %>%
spread(Year, population)
## country 2011 2012 2013
## 1 DE 5800 6000 6200
## 2 FR 7000 6900 7000
## 3 US 15000 14000 13000
separate
df <- data.frame(x = c("a.b", "a.d", "b.c"))
datatable(df)
# x - column to split into multiple columns
# c("A", "B") - names of new columns
df <-
df %>%
separate(x, c("A", "B"))
datatable(df)
unite
# unite() pastes several columns together into a single column:
# x - name of the new merged column
# A,B - columns to merge
# sep="." - separator placed between the merged values
df %>%
unite(x,A,B,sep=".")
## x
## 1 a.b
## 2 a.d
## 3 b.c
example
df <- read_rds("data/bomber_mess.rds")
datatable(df)
df <-
df %>%
unite(MD, prefix, number, sep="-") %>%
separate(Metric, c("FY","Output"), sep="_") %>%
spread(Output, Value)
datatable(df)
str_sub
# x - string
# 1 - start position
# 3 - end position
x <- "akshay"
str_sub(x,1,3)
## [1] "aks"
str_pad
x <- "abc"
str_pad(x, 5,"left") # pad to width 5 on the left (the default side)
## [1] "  abc"
str_pad(x, 5,"right") # pad to width 5 on the right
## [1] "abc  "
str_trim
x <- c(" a ", "b ", " c")
str_trim(x, "left")
## [1] "a " "b " "c"
str_trim(x)
## [1] "a" "b" "c"
str_to_upper, str_to_title, str_to_lower
x <- "I like horses."
str_to_upper(x)
## [1] "I LIKE HORSES."
str_to_title(x)
## [1] "I Like Horses."
str_to_lower(x)
## [1] "i like horses."
str_detect
# fruits - string
# "apple" - pattern to match
fruits <- c(
"apple",
"orange",
"banana",
"grapes",
"grapesgrapes"
)
str_detect(fruits, "apple")
## [1] TRUE FALSE FALSE FALSE FALSE
str_replace
# fruits - string
# "grapes" - pattern to replace
# "guava" - pattern to replace with
str_replace(fruits, "grapes", "guava")
## [1] "apple" "orange" "banana" "guava" "guavagrapes"
str_replace_all(fruits, "grapes", "guava")
## [1] "apple" "orange" "banana" "guava" "guavaguava"
ymd, mdy, dmy
library(lubridate)
ymd("20110604")
## [1] "2011-06-04"
mdy("06-04-2011")
## [1] "2011-06-04"
dmy("04/06/2011")
## [1] "2011-06-04"
ymd_hms
arrive <- ymd_hms("2011-06-04 12:00:00")
arrive
## [1] "2011-06-04 12:00:00 UTC"
leave <- ymd_hms("2011-08-10 14:00:00")
leave
## [1] "2011-08-10 14:00:00 UTC"
second, minute, hour, day, wday, yday, week, month, year
wday(arrive)
## [1] 7
vectors
state.name
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
state.name[1:5]
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California"
state.name[c(23, 6, 15, 34, 35)]
## [1] "Minnesota" "Colorado" "Iowa" "North Dakota"
## [5] "Ohio"
matrix
VADeaths
## Rural Male Rural Female Urban Male Urban Female
## 50-54 11.7 8.7 15.4 8.4
## 55-59 18.1 11.7 24.3 13.6
## 60-64 26.9 20.3 37.0 19.3
## 65-69 41.0 30.9 54.6 35.1
## 70-74 66.0 54.3 71.1 50.0
VADeaths[1:3,]
## Rural Male Rural Female Urban Male Urban Female
## 50-54 11.7 8.7 15.4 8.4
## 55-59 18.1 11.7 24.3 13.6
## 60-64 26.9 20.3 37.0 19.3
VADeaths[, c(2, 4)]
## Rural Female Urban Female
## 50-54 8.7 8.4
## 55-59 11.7 13.6
## 60-64 20.3 19.3
## 65-69 30.9 35.1
## 70-74 54.3 50.0
VADeaths[1:3, c(2, 4)]
## Rural Female Urban Female
## 50-54 8.7 8.4
## 55-59 11.7 13.6
## 60-64 20.3 19.3
dataframe
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
mtcars[1:10,]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
mtcars[, c("mpg", "cyl", "vs")]
## mpg cyl vs
## Mazda RX4 21.0 6 0
## Mazda RX4 Wag 21.0 6 0
## Datsun 710 22.8 4 1
## Hornet 4 Drive 21.4 6 1
## Hornet Sportabout 18.7 8 0
## Valiant 18.1 6 1
## Duster 360 14.3 8 0
## Merc 240D 24.4 4 1
## Merc 230 22.8 4 1
## Merc 280 19.2 6 1
## Merc 280C 17.8 6 1
## Merc 450SE 16.4 8 0
## Merc 450SL 17.3 8 0
## Merc 450SLC 15.2 8 0
## Cadillac Fleetwood 10.4 8 0
## Lincoln Continental 10.4 8 0
## Chrysler Imperial 14.7 8 0
## Fiat 128 32.4 4 1
## Honda Civic 30.4 4 1
## Toyota Corolla 33.9 4 1
## Toyota Corona 21.5 4 1
## Dodge Challenger 15.5 8 0
## AMC Javelin 15.2 8 0
## Camaro Z28 13.3 8 0
## Pontiac Firebird 19.2 8 0
## Fiat X1-9 27.3 4 1
## Porsche 914-2 26.0 4 0
## Lotus Europa 30.4 4 1
## Ford Pantera L 15.8 8 0
## Ferrari Dino 19.7 6 0
## Maserati Bora 15.0 8 0
## Volvo 142E 21.4 4 1
mtcars[1:10, c("mpg", "cyl", "vs")]
## mpg cyl vs
## Mazda RX4 21.0 6 0
## Mazda RX4 Wag 21.0 6 0
## Datsun 710 22.8 4 1
## Hornet 4 Drive 21.4 6 1
## Hornet Sportabout 18.7 8 0
## Valiant 18.1 6 1
## Duster 360 14.3 8 0
## Merc 240D 24.4 4 1
## Merc 230 22.8 4 1
## Merc 280 19.2 6 1
list
# Named list with two elements: an integer vector and a 3 x 3 integer matrix
l1 <- list(
  item1 = seq_len(3),
  item2 = matrix(seq_len(9), nrow = 3)
)
l1
## $item1
## [1] 1 2 3
##
## $item2
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
l1[2] # list containing matrix
## $item2
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
l1[[2]] # matrix
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
l1$item2 # matrix
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
l1$item2[3, 1] # element of a matrix
## [1] 3
filter: pick observations based on filters
library(dplyr)
library(nycflights13)
filter(flights, month == 1, day == 1, dep_delay > 0)
## # A tibble: 352 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 601 600 1 844
## 5 2013 1 1 608 600 8 807
## 6 2013 1 1 611 600 11 945
## 7 2013 1 1 613 610 3 925
## 8 2013 1 1 623 610 13 920
## 9 2013 1 1 632 608 24 740
## 10 2013 1 1 644 636 8 931
## # ... with 342 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month == 12)
## # A tibble: 28,135 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 1 13 2359 14 446
## 2 2013 12 1 17 2359 18 443
## 3 2013 12 1 453 500 -7 636
## 4 2013 12 1 520 515 5 749
## 5 2013 12 1 536 540 -4 845
## 6 2013 12 1 540 550 -10 1005
## 7 2013 12 1 541 545 -4 734
## 8 2013 12 1 546 545 1 826
## 9 2013 12 1 549 600 -11 648
## 10 2013 12 1 550 600 -10 825
## # ... with 28,125 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month != 12)
## # A tibble: 308,641 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 308,631 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 5 2359 6 352
## 2 2013 11 1 35 2250 105 123
## 3 2013 11 1 455 500 -5 641
## 4 2013 11 1 539 545 -6 856
## 5 2013 11 1 542 545 -3 831
## 6 2013 11 1 549 600 -11 912
## 7 2013 11 1 550 600 -10 705
## 8 2013 11 1 554 600 -6 659
## 9 2013 11 1 554 600 -6 826
## 10 2013 11 1 554 600 -6 749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, arr_delay <= 120)
## # A tibble: 317,312 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 317,302 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, !(arr_delay <= 120))
## # A tibble: 10,034 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101 1047
## 2 2013 1 1 848 1835 853 1001
## 3 2013 1 1 957 733 144 1056
## 4 2013 1 1 1114 900 134 1447
## 5 2013 1 1 1505 1310 115 1638
## 6 2013 1 1 1525 1340 105 1831
## 7 2013 1 1 1549 1445 64 1912
## 8 2013 1 1 1558 1359 119 1718
## 9 2013 1 1 1732 1630 62 2028
## 10 2013 1 1 1803 1620 103 2008
## # ... with 10,024 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, is.na(tailnum))
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 2 NA 1545 NA NA
## 2 2013 1 2 NA 1601 NA NA
## 3 2013 1 3 NA 857 NA NA
## 4 2013 1 3 NA 645 NA NA
## 5 2013 1 4 NA 845 NA NA
## 6 2013 1 4 NA 1830 NA NA
## 7 2013 1 5 NA 840 NA NA
## 8 2013 1 7 NA 820 NA NA
## 9 2013 1 8 NA 1645 NA NA
## 10 2013 1 9 NA 755 NA NA
## # ... with 2,502 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Using a comma is the same as using &
filter(flights, month == 12, day == 25)
## # A tibble: 719 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 25 456 500 -4 649
## 2 2013 12 25 524 515 9 805
## 3 2013 12 25 542 540 2 832
## 4 2013 12 25 546 550 -4 1022
## 5 2013 12 25 556 600 -4 730
## 6 2013 12 25 557 600 -3 743
## 7 2013 12 25 557 600 -3 818
## 8 2013 12 25 559 600 -1 855
## 9 2013 12 25 559 600 -1 849
## 10 2013 12 25 600 600 0 850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month == 12 & day == 25)
## # A tibble: 719 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 25 456 500 -4 649
## 2 2013 12 25 524 515 9 805
## 3 2013 12 25 542 540 2 832
## 4 2013 12 25 546 550 -4 1022
## 5 2013 12 25 556 600 -4 730
## 6 2013 12 25 557 600 -3 743
## 7 2013 12 25 557 600 -3 818
## 8 2013 12 25 559 600 -1 855
## 9 2013 12 25 559 600 -1 849
## 10 2013 12 25 600 600 0 850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Use %in% as a shortcut for |
filter(flights, month == 11 | month == 12)
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 5 2359 6 352
## 2 2013 11 1 35 2250 105 123
## 3 2013 11 1 455 500 -5 641
## 4 2013 11 1 539 545 -6 856
## 5 2013 11 1 542 545 -3 831
## 6 2013 11 1 549 600 -11 912
## 7 2013 11 1 550 600 -10 705
## 8 2013 11 1 554 600 -6 659
## 9 2013 11 1 554 600 -6 826
## 10 2013 11 1 554 600 -6 749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 5 2359 6 352
## 2 2013 11 1 35 2250 105 123
## 3 2013 11 1 455 500 -5 641
## 4 2013 11 1 539 545 -6 856
## 5 2013 11 1 542 545 -3 831
## 6 2013 11 1 549 600 -11 912
## 7 2013 11 1 550 600 -10 705
## 8 2013 11 1 554 600 -6 659
## 9 2013 11 1 554 600 -6 826
## 10 2013 11 1 554 600 -6 749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange: reorder data
arrange(flights, dep_delay, arr_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43 40
## 2 2013 2 3 2022 2055 -33 2240
## 3 2013 11 10 1408 1440 -32 1549
## 4 2013 1 11 1900 1930 -30 2233
## 5 2013 1 29 1703 1730 -27 1947
## 6 2013 8 9 729 755 -26 1002
## 7 2013 3 30 2030 2055 -25 2213
## 8 2013 10 23 1907 1932 -25 2143
## 9 2013 5 5 934 958 -24 1225
## 10 2013 9 18 1631 1655 -24 1812
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
select: pick variables
select(flights, year, month, day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
select(flights, year:day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
select(flights, -(year:day))
## # A tibble: 336,776 x 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <dbl> <int> <int> <dbl>
## 1 517 515 2 830 819 11
## 2 533 529 4 850 830 20
## 3 542 540 2 923 850 33
## 4 544 545 -1 1004 1022 -18
## 5 554 600 -6 812 837 -25
## 6 554 558 -4 740 728 12
## 7 555 600 -5 913 854 19
## 8 557 600 -3 709 723 -14
## 9 557 600 -3 838 846 -8
## 10 558 600 -2 753 745 8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights, ends_with("time"))
## # A tibble: 336,776 x 5
## dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <dbl>
## 1 517 515 830 819 227
## 2 533 529 850 830 227
## 3 542 540 923 850 160
## 4 544 545 1004 1022 183
## 5 554 600 812 837 116
## 6 554 558 740 728 150
## 7 555 600 913 854 158
## 8 557 600 709 723 53
## 9 557 600 838 846 140
## 10 558 600 753 745 138
## # ... with 336,766 more rows
select(flights, c(carrier, ends_with("time"), contains("delay")))
## # A tibble: 336,776 x 8
## carrier dep_time sched_dep_time arr_time sched_arr_time air_time
## <chr> <int> <int> <int> <int> <dbl>
## 1 UA 517 515 830 819 227
## 2 UA 533 529 850 830 227
## 3 AA 542 540 923 850 160
## 4 B6 544 545 1004 1022 183
## 5 DL 554 600 812 837 116
## 6 UA 554 558 740 728 150
## 7 B6 555 600 913 854 158
## 8 EV 557 600 709 723 53
## 9 B6 557 600 838 846 140
## 10 AA 558 600 753 745 138
## # ... with 336,766 more rows, and 2 more variables: dep_delay <dbl>,
## # arr_delay <dbl>
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 x 19
## time_hour air_time year month day dep_time sched_dep_time
## <dttm> <dbl> <int> <int> <int> <int> <int>
## 1 2013-01-01 05:00:00 227 2013 1 1 517 515
## 2 2013-01-01 05:00:00 227 2013 1 1 533 529
## 3 2013-01-01 05:00:00 160 2013 1 1 542 540
## 4 2013-01-01 05:00:00 183 2013 1 1 544 545
## 5 2013-01-01 06:00:00 116 2013 1 1 554 600
## 6 2013-01-01 05:00:00 150 2013 1 1 554 558
## 7 2013-01-01 06:00:00 158 2013 1 1 555 600
## 8 2013-01-01 06:00:00 53 2013 1 1 557 600
## 9 2013-01-01 06:00:00 140 2013 1 1 557 600
## 10 2013-01-01 06:00:00 138 2013 1 1 558 600
## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## # hour <dbl>, minute <dbl>
rename(flights, departure_delay = dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time departure_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
mutate: create new variables
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
flights_sml
## # A tibble: 336,776 x 7
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227
## 2 2013 1 1 4 20 1416 227
## 3 2013 1 1 2 33 1089 160
## 4 2013 1 1 -1 -18 1576 183
## 5 2013 1 1 -6 -25 762 116
## 6 2013 1 1 -4 12 719 150
## 7 2013 1 1 -5 19 1065 158
## 8 2013 1 1 -3 -14 229 53
## 9 2013 1 1 -3 -8 944 140
## 10 2013 1 1 -2 8 733 138
## # ... with 336,766 more rows
mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 10
## year month day dep_delay arr_delay distance air_time gain hours
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 3.78
## 2 2013 1 1 4 20 1416 227 16 3.78
## 3 2013 1 1 2 33 1089 160 31 2.67
## 4 2013 1 1 -1 -18 1576 183 -17 3.05
## 5 2013 1 1 -6 -25 762 116 -19 1.93
## 6 2013 1 1 -4 12 719 150 16 2.5
## 7 2013 1 1 -5 19 1065 158 24 2.63
## 8 2013 1 1 -3 -14 229 53 -11 0.883
## 9 2013 1 1 -3 -8 944 140 -5 2.33
## 10 2013 1 1 -2 8 733 138 10 2.3
## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 9 3.78 2.38
## 2 16 3.78 4.23
## 3 31 2.67 11.6
## 4 -17 3.05 -5.57
## 5 -19 1.93 -9.83
## 6 16 2.5 6.4
## 7 24 2.63 9.11
## 8 -11 0.883 -12.5
## 9 -5 2.33 -2.14
## 10 10 2.3 4.35
## # ... with 336,766 more rows
transmute(flights,
normalized_delay = dep_delay / (mean(dep_delay, na.rm = TRUE)))
## # A tibble: 336,776 x 1
## normalized_delay
## <dbl>
## 1 0.158
## 2 0.316
## 3 0.158
## 4 -0.0791
## 5 -0.475
## 6 -0.316
## 7 -0.396
## 8 -0.237
## 9 -0.237
## 10 -0.158
## # ... with 336,766 more rows
transmute(flights,
log_air_time = log2(air_time),
exp_delay = exp(dep_delay))
## # A tibble: 336,776 x 2
## log_air_time exp_delay
## <dbl> <dbl>
## 1 7.83 7.39
## 2 7.83 54.6
## 3 7.32 7.39
## 4 7.52 0.368
## 5 6.86 0.00248
## 6 7.23 0.0183
## 7 7.30 0.00674
## 8 5.73 0.0498
## 9 7.13 0.0498
## 10 7.11 0.135
## # ... with 336,766 more rows
transmute(flights,
arr_delay = arr_delay,
bucket = ntile(arr_delay, 10))
## # A tibble: 336,776 x 2
## arr_delay bucket
## <dbl> <int>
## 1 11 8
## 2 20 8
## 3 33 9
## 4 -18 3
## 5 -25 2
## 6 12 8
## 7 19 8
## 8 -14 3
## 9 -8 5
## 10 8 7
## # ... with 336,766 more rows
summarize: summarize data by functions of choice
summarize(flights,
dep_delay_mean = mean(dep_delay, na.rm = TRUE),
dep_delay_sd = sd(dep_delay, na.rm = TRUE),
n = n())
## # A tibble: 1 x 3
## dep_delay_mean dep_delay_sd n
## <dbl> <dbl> <int>
## 1 12.6 40.2 336776
group_by: group data by categorical levels
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
# Which carrier had the largest mean departure delay? Smallest?
by_carrier <- group_by(flights, carrier)
summarise(by_carrier, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 16 x 2
## carrier delay
## <chr> <dbl>
## 1 9E 16.7
## 2 AA 8.59
## 3 AS 5.80
## 4 B6 13.0
## 5 DL 9.26
## 6 EV 20.0
## 7 F9 20.2
## 8 FL 18.7
## 9 HA 4.90
## 10 MQ 10.6
## 11 OO 12.6
## 12 UA 12.1
## 13 US 3.78
## 14 VX 12.9
## 15 WN 17.7
## 16 YV 19.0
# Which carrier had the largest difference between their max and min departure delay?
summarise(by_carrier,
max = max(dep_delay, na.rm = TRUE),
min = min(dep_delay, na.rm = TRUE),
delta = max - min)
## # A tibble: 16 x 4
## carrier max min delta
## <chr> <dbl> <dbl> <dbl>
## 1 9E 747 -24 771
## 2 AA 1014 -24 1038
## 3 AS 225 -21 246
## 4 B6 502 -43 545
## 5 DL 960 -33 993
## 6 EV 548 -32 580
## 7 F9 853 -27 880
## 8 FL 602 -22 624
## 9 HA 1301 -16 1317
## 10 MQ 1137 -26 1163
## 11 OO 154 -14 168
## 12 UA 483 -20 503
## 13 US 500 -19 519
## 14 VX 653 -20 673
## 15 WN 471 -13 484
## 16 YV 387 -16 403
# Which month has the largest spread in arrival delays?
# (spread is measured with the standard deviation, sd(), ignoring NAs)
by_month <- group_by(flights, month)
summarise(by_month, delay = sd(arr_delay, na.rm = TRUE))
## # A tibble: 12 x 2
## month delay
## <int> <dbl>
## 1 1 40.4
## 2 2 39.5
## 3 3 44.1
## 4 4 47.5
## 5 5 44.2
## 6 6 56.1
## 7 7 57.1
## 8 8 42.6
## 9 9 39.7
## 10 10 32.6
## 11 11 31.4
## 12 12 46.1
%>% (Pipe) Operator
flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE),
n = n()) %>%
arrange(desc(delay))
## # A tibble: 4,043 x 3
## tailnum delay n
## <chr> <dbl> <int>
## 1 N844MH 320 1
## 2 N911DA 294 1
## 3 N922EV 276 1
## 4 N587NW 264 1
## 5 N851NW 219 1
## 6 N928DN 201 1
## 7 N7715E 188 1
## 8 N654UA 185 1
## 9 N665MQ 175. 6
## 10 N427SW 157 1
## # ... with 4,033 more rows
# Rank each carrier's months by their worst arrival delay, then average
# those ranks per month across carriers.
flights %>%
group_by(carrier, month) %>%
# largest arrival delay for each carrier/month pair
summarise(max_delay = max(arr_delay, na.rm = TRUE)) %>%
# summarise() drops the last grouping level, so the data is still grouped
# by carrier here: rank() runs within each carrier, and rank 1 is the
# month with that carrier's largest maximum delay
mutate(rank_delay = rank(desc(max_delay))) %>%
group_by(month) %>%
summarize(avg_rank = mean(rank_delay)) %>%
arrange(desc(avg_rank))
## # A tibble: 12 x 2
## month avg_rank
## <int> <dbl>
## 1 10 9.2
## 2 2 7.8
## 3 11 7.56
## 4 8 6.72
## 5 9 6.59
## 6 12 6.47
## 7 3 6.33
## 8 5 6.27
## 9 1 6.06
## 10 4 5.8
## 11 6 4.44
## 12 7 3.67
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
x
## # A tibble: 3 x 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
## 3 3 x3
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
y
## # A tibble: 3 x 2
## key val_y
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
## 3 4 y3
Inner Join
x %>%
inner_join(y, by="key")
## # A tibble: 2 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
Left Join
x %>%
left_join(y, by="key")
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
Right Join
x %>%
right_join(y, by="key")
## # A tibble: 3 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
Full Join
x %>%
full_join(y, by="key")
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
What if key names do not match?
x <- tribble(
~key1, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key2, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
x %>%
inner_join(y, by = c("key1" = "key2"))
## # A tibble: 2 x 3
## key1 val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
semi join: keeps all observations in x that have a match in y
x %>%
semi_join(y, by = c("key1" = "key2"))
## # A tibble: 2 x 2
## key1 val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
anti join: drops all observations in x that have a match in y
x %>%
anti_join(y, by = c("key1" = "key2"))
## # A tibble: 1 x 2
## key1 val_x
## <dbl> <chr>
## 1 3 x3
Drawing the Canvas
library(ggplot2)
ggplot(data = mpg)
ggplot(data = mpg, aes(x = displ, y = hwy))
Geoms:
#aes - aesthetics
ggplot(data = mpg, aes(x = hwy)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = mpg, aes(x = hwy)) +
geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = mpg, aes(x = hwy)) +
geom_density()
ggplot(data = mpg, aes(x = class)) +
geom_bar()
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point()
ggplot(data = mpg, aes(x = class, y = hwy)) +
geom_boxplot()
ggplot(data = mpg, aes(x = class, y = hwy)) +
geom_violin()
Non-Mapping Aesthetics - color, size, shape, opacity
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point(color = "blue", size = 2, shape = 17, alpha = .5)
color inside geom_point - colors all the points blue
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point(color = "blue")
color inside aesthetic - makes class variable as legend
ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) +
geom_point()
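A common error - color = "blue" inside aesthetic: aes() treats "blue" as a data value rather than a color, so the points are not drawn in blue and a legend entry labelled "blue" appears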
ggplot(data = mpg, aes(x = displ, y = hwy, color = "blue")) +
geom_point()
Facets
facet_wrap: primarily used to create small multiples based on a single variable
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_wrap(~ class, nrow = 2)
facet_grid: primarily used to create a small multiples grid based on two variables
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_grid(drv ~ cyl)
Overplotting - Multiple Geoms
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point() +
geom_smooth()
aes inside geom_point - ‘color = class’ applicable to only the geom_point
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point(aes(color = class)) +
geom_smooth()
aes inside ggplot - ‘color = class’ applicable to both geoms
ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) +
geom_point() +
geom_smooth()
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point(aes(color = class == "2seater")) +
geom_smooth(data = filter(mpg, class == "2seater"), se = FALSE) +
geom_smooth(data = filter(mpg, class != "2seater"), se = FALSE)
Positioning - Bar chart
ggplot(data = mpg, aes(class, color = factor(year))) +
geom_bar()
ggplot(data = mpg, aes(class, fill = factor(year))) +
geom_bar()
ggplot(data = mpg, aes(class, fill = factor(year))) +
geom_bar(position = "fill")
ggplot(data = mpg, aes(class, fill = factor(year))) +
geom_bar(position = "dodge")
Coordinate System
Flipping the Coordinate System
# top
ggplot(data = mpg, aes(x = class, y = hwy)) +
geom_boxplot()
# bottom
ggplot(data = mpg, aes(x = class, y = hwy)) +
geom_boxplot() +
coord_flip()
Zooming in or out
ggplot(data = mpg, aes(x = displ, y = cty)) +
geom_jitter() +
coord_cartesian(xlim = c(4, 7), ylim = c(10, 20))
Formatting Axes and labels
ggplot(data = txhousing, aes(x = volume, y = median)) +
geom_point(alpha = .25) +
scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
ggtitle("Texas Housing Sales",
subtitle = "Sales data from 2000-2010 provided by the TAMU real estate center")
Pairwise Plots
# library(GGally)
ggpairs(mtcars)
if-else
# Classify a single reading: >= 10 is too high, 0 up to (not including) 10 is
# acceptable, anything else is negative.
x <- 7
# `if` expects exactly one TRUE/FALSE, so scalar tests are combined with the
# short-circuiting `&&` rather than the element-wise `&`.
if (x >= 10) {
  print("x exceeds acceptable tolerance levels")
} else if (x >= 0 && x < 10) {
  print("x is within acceptable tolerance levels")
} else {
  print("x is negative")
}
## [1] "x is within acceptable tolerance levels"
for loop
# Print one message per year, walking the vector by position.
years <- 2010:2017
for (idx in seq_along(years)) {
  print(paste("The year is", years[idx]))
}
## [1] "The year is 2010"
## [1] "The year is 2011"
## [1] "The year is 2012"
## [1] "The year is 2013"
## [1] "The year is 2014"
## [1] "The year is 2015"
## [1] "The year is 2016"
## [1] "The year is 2017"
# Same messages as above, but stored instead of printed: allocate the
# character vector once, then fill it slot by slot.
result <- character(length(years))
for (idx in seq_along(years)) {
  result[idx] <- paste("The year is", years[idx])
}
result
## [1] "The year is 2010" "The year is 2011" "The year is 2012"
## [4] "The year is 2013" "The year is 2014" "The year is 2015"
## [7] "The year is 2016" "The year is 2017"
# Label every reading in x with its tolerance category.
x <- c(-1, 7, 8, 11)
# pre-allocate one slot per reading
tolerance <- vector(mode = "character",
                    length = length(x))
for (i in seq_along(x)) {
  # x[i] is a single value, so the scalar, short-circuiting `&&` is the
  # right operator inside `if` (not the element-wise `&`)
  if (x[i] >= 10) {
    value <- "x exceeds acceptable tolerance levels"
  } else if (x[i] >= 0 && x[i] < 10) {
    value <- "x is within acceptable tolerance levels"
  } else {
    value <- "x is negative"
  }
  tolerance[i] <- value
}
tolerance
## [1] "x is negative"
## [2] "x is within acceptable tolerance levels"
## [3] "x is within acceptable tolerance levels"
## [4] "x exceeds acceptable tolerance levels"
break
# `break` leaves the loop entirely: once val reaches 3 nothing more is printed.
x <- 1:5
for (val in x) {
  if (val == 3) break
  print(val)
}
## [1] 1
## [1] 2
next
# `next` skips only the current iteration: 3 is not printed, the loop goes on.
x <- 1:5
for (val in x) {
  if (val == 3) next
  print(val)
}
## [1] 1
## [1] 2
## [1] 4
## [1] 5
# Present value of a future amount.
#
# FV: future value
# r:  rate per period
# n:  number of periods
#
# Returns FV / (1 + r)^n rounded to 2 decimal places.
pv <- function(FV, r, n) {
  discount_factor <- (1 + r)^n
  round(FV / discount_factor, 2)
}
pv(FV = 1000, r = .08, n = 5)
## [1] 680.58
# Present value of a future amount, with input validation.
#
# FV: future value(s); must be an atomic numeric vector
# r:  rate per period (numeric)
# n:  number of periods (numeric, defaults to 5)
#
# Returns FV / (1 + r)^n rounded to 2 decimal places.
# Stops with an error if FV is not atomic, or if any of FV, r, n is not
# numeric (the message lists the class of each argument).
pv <- function(FV, r, n = 5) {
  if (!is.atomic(FV)) {
    stop('FV must be an atomic vector')
  }
  # each is.numeric() call yields a single logical, so the scalar,
  # short-circuiting `||` is used instead of the element-wise `|`
  if (!is.numeric(FV) || !is.numeric(r) || !is.numeric(n)) {
    # class() can return several names (e.g. "POSIXct" "POSIXt"); join them
    # with "/" so they are not run together in the message
    describe_class <- function(obj) paste(class(obj), collapse = "/")
    stop('This function only works for numeric inputs!\n',
         'You have provided objects of the following classes:\n',
         'FV: ', describe_class(FV), '\n',
         'r: ', describe_class(r), '\n',
         'n: ', describe_class(n))
  }
  present_value <- FV / (1 + r)^n
  round(present_value, 2)
}
# pv(FV = "1000", .08, n = 5)
# Error in pv(FV = "1000", 0.08, n = 5) :
#This function only works for numeric inputs!
# You have provided objects of the following classes:
# FV: character
# r: numeric
# n: numeric
apply - can be used to apply a function to a matrix.
data <- matrix(c(1:10, 21:30), nrow = 5, ncol = 4)
data
## [,1] [,2] [,3] [,4]
## [1,] 1 6 21 26
## [2,] 2 7 22 27
## [3,] 3 8 23 28
## [4,] 4 9 24 29
## [5,] 5 10 25 30
# applies function to row (1) or column (2) of the matrix
apply(data, 1, mean)
## [1] 13.5 14.5 15.5 16.5 17.5
custom function in apply
apply(mtcars[,1:2], 2, function(x)(x-mean(x))/sd(x))
## mpg cyl
## Mazda RX4 0.15088482 -0.1049878
## Mazda RX4 Wag 0.15088482 -0.1049878
## Datsun 710 0.44954345 -1.2248578
## Hornet 4 Drive 0.21725341 -0.1049878
## Hornet Sportabout -0.23073453 1.0148821
## Valiant -0.33028740 -0.1049878
## Duster 360 -0.96078893 1.0148821
## Merc 240D 0.71501778 -1.2248578
## Merc 230 0.44954345 -1.2248578
## Merc 280 -0.14777380 -0.1049878
## Merc 280C -0.38006384 -0.1049878
## Merc 450SE -0.61235388 1.0148821
## Merc 450SL -0.46302456 1.0148821
## Merc 450SLC -0.81145962 1.0148821
## Cadillac Fleetwood -1.60788262 1.0148821
## Lincoln Continental -1.60788262 1.0148821
## Chrysler Imperial -0.89442035 1.0148821
## Fiat 128 2.04238943 -1.2248578
## Honda Civic 1.71054652 -1.2248578
## Toyota Corolla 2.29127162 -1.2248578
## Toyota Corona 0.23384555 -1.2248578
## Dodge Challenger -0.76168319 1.0148821
## AMC Javelin -0.81145962 1.0148821
## Camaro Z28 -1.12671039 1.0148821
## Pontiac Firebird -0.14777380 1.0148821
## Fiat X1-9 1.19619000 -1.2248578
## Porsche 914-2 0.98049211 -1.2248578
## Lotus Europa 1.71054652 -1.2248578
## Ford Pantera L -0.71190675 1.0148821
## Ferrari Dino -0.06481307 -0.1049878
## Maserati Bora -0.84464392 1.0148821
## Volvo 142E 0.21725341 -1.2248578
lapply - similar to apply, but it takes a list as an input, and returns a list as the output.
data <- list(x = 1:5, y = 6:10, z = 11:15)
data
## $x
## [1] 1 2 3 4 5
##
## $y
## [1] 6 7 8 9 10
##
## $z
## [1] 11 12 13 14 15
# applies a function to each element in the list
lapply(data, FUN = median)
## $x
## [1] 3
##
## $y
## [1] 8
##
## $z
## [1] 13
sapply - same as lapply, but simplifies the result to a vector (or matrix) when possible instead of returning a list.
sapply(data, FUN = median)
## x y z
## 3 8 13
tapply - splits the array based on specified data, usually factor levels and then applies the function to it.
# group by cyl, then find mean wt in each group
tapply(mtcars$wt, mtcars$cyl, mean)
## 4 6 8
## 2.285727 3.117143 3.999214
bind_rows()
one <- mtcars[1:4, 1:3]
two <- mtcars[11:14, 1:2]
print(one)
## mpg cyl disp
## Mazda RX4 21.0 6 160
## Mazda RX4 Wag 21.0 6 160
## Datsun 710 22.8 4 108
## Hornet 4 Drive 21.4 6 258
print(two)
## mpg cyl
## Merc 280C 17.8 6
## Merc 450SE 16.4 8
## Merc 450SL 17.3 8
## Merc 450SLC 15.2 8
# bind_rows() stacks data frames by row even when their columns differ; columns missing from one input are filled with NA
bind_rows(one, two)
## mpg cyl disp
## 1 21.0 6 160
## 2 21.0 6 160
## 3 22.8 4 108
## 4 21.4 6 258
## 5 17.8 6 NA
## 6 16.4 8 NA
## 7 17.3 8 NA
## 8 15.2 8 NA
bind_cols()
one <- mtcars[1:5, 1:2]
two <- mtcars[1:5, 3:4]
print(one)
## mpg cyl
## Mazda RX4 21.0 6
## Mazda RX4 Wag 21.0 6
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 6
## Hornet Sportabout 18.7 8
print(two)
## disp hp
## Mazda RX4 160 110
## Mazda RX4 Wag 160 110
## Datsun 710 108 93
## Hornet 4 Drive 258 110
## Hornet Sportabout 360 175
# You can bind dataframes by columns
bind_cols(one, two)
## mpg cyl disp hp
## 1 21.0 6 160 110
## 2 21.0 6 160 110
## 3 22.8 4 108 93
## 4 21.4 6 258 110
## 5 18.7 8 360 175
remove duplicates
one <- mtcars[1:4, 1:7]
df <- bind_rows(one,one)
df[!duplicated(df), ]
## mpg cyl disp hp drat wt qsec
## 1 21.0 6 160 110 3.90 2.620 16.46
## 2 21.0 6 160 110 3.90 2.875 17.02
## 3 22.8 4 108 93 3.85 2.320 18.61
## 4 21.4 6 258 110 3.08 3.215 19.44
“.” operator usage
mtcars %>%
mutate(cars = rownames(.))
## mpg cyl disp hp drat wt qsec vs am gear carb cars
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Datsun 710
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 Valiant
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Duster 360
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 Merc 230
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 Merc 280
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 Merc 280C
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 Merc 450SE
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 Merc 450SL
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 Merc 450SLC
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 Cadillac Fleetwood
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 Lincoln Continental
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 Chrysler Imperial
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 Fiat 128
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 Honda Civic
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 Toyota Corolla
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 Toyota Corona
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 Dodge Challenger
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 AMC Javelin
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 Camaro Z28
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 Pontiac Firebird
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 Fiat X1-9
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 Porsche 914-2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 Lotus Europa
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 Ford Pantera L
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 Ferrari Dino
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 Maserati Bora
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 Volvo 142E
accessing columns having special characters using backticks (``)
model <- lm(mpg~disp,mtcars)
coeff <- as.data.frame(summary(model)$coefficients)
coeff
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.59985476 1.229719515 24.070411 3.576586e-21
## disp -0.04121512 0.004711833 -8.747152 9.380327e-10
coeff$`Pr(>|t|)`
## [1] 3.576586e-21 9.380327e-10
prevent a single column of a dataframe from becoming a vector (use drop = FALSE)
df1 <- mtcars[,1]
print(class(df1))
## [1] "numeric"
print(df1)
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
## [29] 15.8 19.7 15.0 21.4
df2 <- mtcars[,1,drop=FALSE]
print(class(df2))
## [1] "data.frame"
print(df2)
## mpg
## Mazda RX4 21.0
## Mazda RX4 Wag 21.0
## Datsun 710 22.8
## Hornet 4 Drive 21.4
## Hornet Sportabout 18.7
## Valiant 18.1
## Duster 360 14.3
## Merc 240D 24.4
## Merc 230 22.8
## Merc 280 19.2
## Merc 280C 17.8
## Merc 450SE 16.4
## Merc 450SL 17.3
## Merc 450SLC 15.2
## Cadillac Fleetwood 10.4
## Lincoln Continental 10.4
## Chrysler Imperial 14.7
## Fiat 128 32.4
## Honda Civic 30.4
## Toyota Corolla 33.9
## Toyota Corona 21.5
## Dodge Challenger 15.5
## AMC Javelin 15.2
## Camaro Z28 13.3
## Pontiac Firebird 19.2
## Fiat X1-9 27.3
## Porsche 914-2 26.0
## Lotus Europa 30.4
## Ford Pantera L 15.8
## Ferrari Dino 19.7
## Maserati Bora 15.0
## Volvo 142E 21.4