library(dplyr)
library(ggplot2)1 + 2 # addition## [1] 3
5 - 3 # subtraction## [1] 2
4 / 1 # division## [1] 4
3 ^ 2 # exponential## [1] 9
7 %% 2 # Modulo## [1] 1
(50*2) + (50/2) # BODMAS## [1] 125
1 > 2 # will return boolean (false) ## [1] FALSE
7 > 4 # will return boolean (true)## [1] TRUE
5>=5 # greater than equal to## [1] TRUE
1>=2## [1] FALSE
3<=2 # less than equal to ## [1] FALSE
2<=2## [1] TRUE
5 != 2 # Not equal to## [1] TRUE
7 != 7 ## [1] FALSE
5 == 5 # Equal to## [1] TRUE
2 == 3 ## [1] FALSE
v1 <- c(1,2,3) # creates a numeric vector
v2 <- c(11,22,33) # creates a numeric vector
v1 < v2 # returns a boolean vector## [1] TRUE TRUE TRUE
help(vector) #help function is good for any help and documentation
?data.frame # performs the same as help functionn <- 3.14 # numeric class (floating point values)
i <- 10 # integer class
t <- TRUE # Logical class
f <- FALSE # Logical class
ch <- "R for Data Science" # character class (can also use single quotes)
class(n) # class function is used to know the data type class## [1] "numeric"
class(i)## [1] "numeric"
class(t)## [1] "logical"
class(f)## [1] "logical"
class(ch)## [1] "character"
nvec <- c(10,20,30,40) # creates a numeric vector
cvec <- c('A','B','C') # creates a character vector
lvec <- c(TRUE,FALSE) # creates a logical vector
class(nvec) # check the classes## [1] "numeric"
class(cvec)## [1] "character"
class(lvec)## [1] "logical"
temps <- c(55,67,74,68,65,77,81)
names(temps) <- c('Mon','Tue','Wed','Thu','Fri','Sat','Sun') # names function is used to give names to the vector
temps## Mon Tue Wed Thu Fri Sat Sun
## 55 67 74 68 65 77 81
v1 <- c(100,200,300)
v2 <- c('a','b','c')
v1[2] # get the second element## [1] 200
v2[2]## [1] "b"
v1[c(1,2)] # get multiple elements## [1] 100 200
v2[c(1,3)]## [1] "a" "c"
v <- c(1,2,3,4,5,6,7,8,9,10) # slicing example
v[2:4] # get the 2nd,3rd and 4th element ## [1] 2 3 4
v[5:8] # get 5-8 elements## [1] 5 6 7 8
names(v) <- c('a','b','c','d','e','f','g','h','i','j') # indexing with names
v['c'] # get the value corresponding to c## c
## 3
v['i'] # get the value corresponding to i## i
## 9
v[v>2] # indexing using comparison operations ## c d e f g h i j
## 3 4 5 6 7 8 9 10
v1 <- c(10,20,30) # create sample vectos
v2 <- c(50,60,70)
# vector arithmetic
v1 + v2 # addition## [1] 60 80 100
v2 - v1 # subtraction## [1] 40 40 40
v1 * v2 # multiplication## [1] 500 1200 2100
v2 / v1 # division## [1] 5.000000 3.000000 2.333333
# functions
sum(v1) # add all the elements on the vector## [1] 60
sd(v2) # standard deviation## [1] 10
var(v2) # variance## [1] 100
max(v1) # maximum value in the vector## [1] 30
min(v2) # minimum value in the vector## [1] 50
prod(v1) # product of all the elements in the vector## [1] 6000
empty_dataframe <- data.frame() # creates an empty data frame
c1 <- 1:10
c2 <- 20:29
df <- data.frame(c1,c2)
df## c1 c2
## 1 1 20
## 2 2 21
## 3 3 22
## 4 4 23
## 5 5 24
## 6 6 25
## 7 7 26
## 8 8 27
## 9 9 28
## 10 10 29
# the following code reads a csv file using the function read.csv() and store it as a dataframe.
#df2 <- read.csv('sample.csv',header = T,stringsAsFactors = F)
# the following code reads an excel file using the function read_excel() and store it as a dataframe. To run this code first you also need to import the readxl package.
library(readxl)
#df3 <- read_excel('sample.xlsx',sheet = 'Sheet 1')nrow(df) # number of rows in dataframe## [1] 10
ncol(df) # number of cols in dataframe## [1] 2
colnames(df) # column names of dataframe## [1] "c1" "c2"
rownames(df) # row names of dataframe## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
head(df,n=5) # shows the first few rows## c1 c2
## 1 1 20
## 2 2 21
## 3 3 22
## 4 4 23
## 5 5 24
tail(df,n=5) # shows the last few rows ## c1 c2
## 6 6 25
## 7 7 26
## 8 8 27
## 9 9 28
## 10 10 29
str(df) # structure of dataframe## 'data.frame': 10 obs. of 2 variables:
## $ c1: int 1 2 3 4 5 6 7 8 9 10
## $ c2: int 20 21 22 23 24 25 26 27 28 29
summary(df) # gives summary statistic## c1 c2
## Min. : 1.00 Min. :20.00
## 1st Qu.: 3.25 1st Qu.:22.25
## Median : 5.50 Median :24.50
## Mean : 5.50 Mean :24.50
## 3rd Qu.: 7.75 3rd Qu.:26.75
## Max. :10.00 Max. :29.00
new_row <- c(11,30)
df <- rbind(df,new_row) # use rbind to add new rows to existing dataframe
df## c1 c2
## 1 1 20
## 2 2 21
## 3 3 22
## 4 4 23
## 5 5 24
## 6 6 25
## 7 7 26
## 8 8 27
## 9 9 28
## 10 10 29
## 11 11 30
df_new <- df
new_col <- 41:51
df_new <- cbind(df_new,new_col) # use cbind to add new columns to existing dataframe
df_new## c1 c2 new_col
## 1 1 20 41
## 2 2 21 42
## 3 3 22 43
## 4 4 23 44
## 5 5 24 45
## 6 6 25 46
## 7 7 26 47
## 8 8 27 48
## 9 9 28 49
## 10 10 29 50
## 11 11 30 51
any(is.na(df)) # to find if there are na values in the dataframe## [1] FALSE
any(is.na(df$c1)) # any na values in column c1## [1] FALSE
na.omit(df) # deletes all the na values## c1 c2
## 1 1 20
## 2 2 21
## 3 3 22
## 4 4 23
## 5 5 24
## 6 6 25
## 7 7 26
## 8 8 27
## 9 9 28
## 10 10 29
## 11 11 30
Why to use Dplyr ?
library(dplyr) # load the packages
library(hflights)We will be using the hflights dataset which has the data for the flights departing from two Houston airports in 2011.
data(hflights) # load the data
head(hflights) # looks at first few rows## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
#tail(hlights) # looks at the last few rows
str(hflights) # displays the structure of the dataframe## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
Dplyr Verbs and functions :-
flights <- tbl_df(hflights) # creates a local dataframe that prints nicely.
flights## # A tibble: 227,496 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## * <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(flights, Month == 1 & DayofMonth == 1) # filter those rows where Month is 1 and DayofMonth is 1## # A tibble: 552 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 1 6 728 840 AA
## 3 2011 1 1 6 1631 1736 AA
## 4 2011 1 1 6 1756 2112 AA
## 5 2011 1 1 6 1012 1347 AA
## 6 2011 1 1 6 1211 1325 AA
## 7 2011 1 1 6 557 906 AA
## 8 2011 1 1 6 1824 2106 AS
## 9 2011 1 1 6 654 1124 B6
## 10 2011 1 1 6 1639 2110 B6
## # ... with 542 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# filter those rows where UniqueCarrier is "AA" or "UA"
filter(flights, UniqueCarrier == "AA" | UniqueCarrier == "UA")## # A tibble: 5,316 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 5,306 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
select(flights,DepTime,ArrTime,FlightNum) # select only the specified columns## # A tibble: 227,496 x 3
## DepTime ArrTime FlightNum
## * <int> <int> <int>
## 1 1400 1500 428
## 2 1401 1501 428
## 3 1352 1502 428
## 4 1403 1513 428
## 5 1405 1507 428
## 6 1359 1503 428
## 7 1359 1509 428
## 8 1355 1454 428
## 9 1443 1554 428
## 10 1443 1553 428
## # ... with 227,486 more rows
select(flights,Year:DayofMonth,contains("Taxi"),contains("Delay")) # use contains if column names are big## # A tibble: 227,496 x 7
## Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## * <int> <int> <int> <int> <int> <int> <int>
## 1 2011 1 1 7 13 -10 0
## 2 2011 1 2 6 9 -9 1
## 3 2011 1 3 5 17 -8 -8
## 4 2011 1 4 9 22 3 3
## 5 2011 1 5 9 9 -3 5
## 6 2011 1 6 6 13 -7 -1
## 7 2011 1 7 12 15 -1 -1
## 8 2011 1 8 7 12 -16 -5
## 9 2011 1 9 8 22 44 43
## 10 2011 1 10 6 19 43 43
## # ... with 227,486 more rows
# chaining example where first you select the specified columns and then filter only those rows based on the speicfied condition
flights %>% select(UniqueCarrier,DepDelay) %>% filter(DepDelay > 60) ## # A tibble: 10,242 x 2
## UniqueCarrier DepDelay
## <chr> <int>
## 1 AA 90
## 2 AA 67
## 3 AA 74
## 4 AA 125
## 5 AA 82
## 6 AA 99
## 7 AA 70
## 8 AA 61
## 9 AA 74
## 10 AS 73
## # ... with 10,232 more rows
pipe_vector1 <- c(1:5) # chaining can be used in general R programming as well
pipe_vector2 <- c(2:6)
(pipe_vector1-pipe_vector2)^2 %>% sum() %>% sqrt()## [1] 2.236068
# use arrange to order the dataframe in ascending order
flights %>% select(UniqueCarrier,DepDelay) %>% arrange(DepDelay)## # A tibble: 227,496 x 2
## UniqueCarrier DepDelay
## <chr> <int>
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
## 7 XE -17
## 8 CO -17
## 9 XE -17
## 10 MQ -17
## # ... with 227,486 more rows
# use desc within arrange to order the dataframe in descending order
#flights %>% select(UniqueCarrier,DepDelay) %>% arrange(desc(DepDelay))# use mutate to create a new column called Speed from the existing columns
flights %>% select(Distance,AirTime) %>% mutate(Speed = Distance/AirTime*60) ## # A tibble: 227,496 x 3
## Distance AirTime Speed
## <int> <int> <dbl>
## 1 224 40 336.0000
## 2 224 45 298.6667
## 3 224 48 280.0000
## 4 224 39 344.6154
## 5 224 44 305.4545
## 6 224 45 298.6667
## 7 224 43 312.5581
## 8 224 40 336.0000
## 9 224 41 327.8049
## 10 224 45 298.6667
## # ... with 227,486 more rows
#flights <- flights %>% mutate(Speed = Distance/AirTime*60) # save the new column to the dataframe# group by Dest and then calculate the mean avg_delay for each particular group
flights %>% group_by(Dest) %>% summarise(avg_delay = mean(ArrDelay,na.rm=TRUE)) ## # A tibble: 116 x 2
## Dest avg_delay
## <chr> <dbl>
## 1 ABQ 7.226259
## 2 AEX 5.839437
## 3 AGS 4.000000
## 4 AMA 6.840095
## 5 ANC 26.080645
## 6 ASE 6.794643
## 7 ATL 8.233251
## 8 AUS 7.448718
## 9 AVL 9.973988
## 10 BFL -13.198807
## # ... with 106 more rows
# same as above but this time we are caluculating the mean on two columns (Cancelled and Diverted)
flights %>% group_by(UniqueCarrier) %>% summarise_each(funs(mean),Cancelled,Diverted) ## # A tibble: 15 x 3
## UniqueCarrier Cancelled Diverted
## <chr> <dbl> <dbl>
## 1 AA 0.018495684 0.001849568
## 2 AS 0.000000000 0.002739726
## 3 B6 0.025899281 0.005755396
## 4 CO 0.006782614 0.002627370
## 5 DL 0.015903067 0.003029156
## 6 EV 0.034482759 0.003176044
## 7 F9 0.007159905 0.000000000
## 8 FL 0.009817672 0.003272557
## 9 MQ 0.029044750 0.001936317
## 10 OO 0.013946828 0.003486707
## 11 UA 0.016409266 0.002413127
## 12 US 0.011268986 0.001469868
## 13 WN 0.015504047 0.002293629
## 14 XE 0.015495599 0.003449550
## 15 YV 0.012658228 0.000000000
# can also use matches to match the column names
flights %>% group_by(UniqueCarrier) %>% summarise_each(funs(min(.,na.rm=T),max(.,na.rm=T)),matches("Delay")) ## # A tibble: 15 x 5
## UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
## <chr> <int> <int> <int> <int>
## 1 AA -39 -15 978 970
## 2 AS -43 -15 183 172
## 3 B6 -44 -14 335 310
## 4 CO -55 -18 957 981
## 5 DL -32 -17 701 730
## 6 EV -40 -18 469 479
## 7 F9 -24 -15 277 275
## 8 FL -30 -14 500 507
## 9 MQ -38 -23 918 931
## 10 OO -57 -33 380 360
## 11 UA -47 -11 861 869
## 12 US -42 -17 433 425
## 13 WN -44 -10 499 548
## 14 XE -70 -19 634 628
## 15 YV -32 -11 72 54
# n() is used to count the number of rows within each groups
flights %>% group_by(Month,DayofMonth) %>% summarise(flight_count = n()) %>% arrange(desc(flight_count)) ## # A tibble: 365 x 3
## # Groups: Month [12]
## Month DayofMonth flight_count
## <int> <int> <int>
## 1 8 4 706
## 2 8 11 706
## 3 8 12 706
## 4 8 5 705
## 5 8 3 704
## 6 8 10 704
## 7 1 3 702
## 8 7 7 702
## 9 7 14 702
## 10 7 28 701
## # ... with 355 more rows
# n_distinct() gives you unique or distinct values within each group
flights %>% group_by(Dest) %>% summarise(flight_count = n(), plane_count = n_distinct(TailNum)) ## # A tibble: 116 x 3
## Dest flight_count plane_count
## <chr> <int> <int>
## 1 ABQ 2812 716
## 2 AEX 724 215
## 3 AGS 1 1
## 4 AMA 1297 158
## 5 ANC 125 38
## 6 ASE 125 60
## 7 ATL 7886 983
## 8 AUS 5022 1015
## 9 AVL 350 142
## 10 BFL 504 70
## # ... with 106 more rows
glimpse(flights) # same as str() function but much better looking## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
flights %>% sample_n(10) # randomly selects 10 rows from the dataframe## # A tibble: 10 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 3 30 3 925 1045 OO
## 2 2011 5 16 1 1805 2044 CO
## 3 2011 9 19 1 2109 2202 XE
## 4 2011 9 14 3 1733 2154 CO
## 5 2011 12 31 6 1900 2049 XE
## 6 2011 12 25 7 1531 1743 WN
## 7 2011 5 20 5 744 1149 CO
## 8 2011 6 6 1 1034 1356 CO
## 9 2011 6 9 4 1150 1316 CO
## 10 2011 5 2 1 1247 1405 XE
## # ... with 14 more variables: FlightNum <int>, TailNum <chr>,
## # ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
This is the dplyr tutorial which explains data manipulation using dplyr in R. https://www.youtube.com/watch?v=jWjqLW-u3hc
This is the Rpubs document which explains the dplyr tutorial along with comparing it with base R functions. https://rpubs.com/justmarkham/dplyr-tutorial