Jying-Nan Wang
2016/11/12
其中,第2步驟通常會花掉分析者超過一半以上的時間!! (According to my experience, step 2 always spends more than 50% time.)
了解資料結構 (understand the data structure)
觀察部分原始資料 (observe part of data)
畫圖了解數據 (plot)
This famous (Fisher’s or Anderson’s) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.
class(iris)## [1] "data.frame"
dim(iris)## [1] 150 5
names(iris)## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
str(iris)## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(dplyr)
glimpse(iris)## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species <fctr> setosa, setosa, setosa, setosa, setosa, setosa, ...
summary(iris)## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
head(iris,10)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
tail(iris,10)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
hist(iris$Sepal.Length)plot(iris$Sepal.Width, type="l")plot(iris$Sepal.Width, iris$Sepal.Length)Package: hflights
library(hflights)
class(hflights)## [1] "data.frame"
dim(hflights)## [1] 227496 21
names(hflights)## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
str(hflights)## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
library(dplyr)
glimpse(hflights)## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
summary(hflights)## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
head(hflights,10)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## 5430 2011 1 7 5 1359 1509 AA
## 5431 2011 1 8 6 1355 1454 AA
## 5432 2011 1 9 7 1443 1554 AA
## 5433 2011 1 10 1 1443 1553 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## 5430 428 N493AA 70 43 -1 -1 IAH
## 5431 428 N477AA 59 40 -16 -5 IAH
## 5432 428 N476AA 71 41 44 43 IAH
## 5433 428 N504AA 70 45 43 43 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
## 5430 DFW 224 12 15 0 0
## 5431 DFW 224 7 12 0 0
## 5432 DFW 224 8 22 0 0
## 5433 DFW 224 6 19 0 0
tail(hflights,10)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 6083250 2011 12 6 2 556 745 WN
## 6083251 2011 12 6 2 1026 1208 WN
## 6083252 2011 12 6 2 1611 1746 WN
## 6083253 2011 12 6 2 758 1051 WN
## 6083254 2011 12 6 2 1307 1600 WN
## 6083255 2011 12 6 2 1818 2111 WN
## 6083256 2011 12 6 2 2047 2334 WN
## 6083257 2011 12 6 2 912 1031 WN
## 6083258 2011 12 6 2 656 812 WN
## 6083259 2011 12 6 2 1600 1713 WN
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay
## 6083250 280 N728SW 109 87 -10 -4
## 6083251 782 N476WN 102 86 -12 1
## 6083252 1050 N655WN 95 83 -9 16
## 6083253 201 N903WN 113 100 -4 -2
## 6083254 471 N632SW 113 98 0 7
## 6083255 1191 N284WN 113 97 -9 8
## 6083256 1674 N366SW 107 94 4 7
## 6083257 127 N777QC 79 61 -4 -3
## 6083258 621 N727SW 76 64 -13 -4
## 6083259 1597 N745SW 73 59 -12 0
## Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode
## 6083250 HOU STL 687 13 9 0
## 6083251 HOU STL 687 4 12 0
## 6083252 HOU STL 687 3 9 0
## 6083253 HOU TPA 781 3 10 0
## 6083254 HOU TPA 781 5 10 0
## 6083255 HOU TPA 781 5 11 0
## 6083256 HOU TPA 781 4 9 0
## 6083257 HOU TUL 453 4 14 0
## 6083258 HOU TUL 453 3 9 0
## 6083259 HOU TUL 453 3 11 0
## Diverted
## 6083250 0
## 6083251 0
## 6083252 0
## 6083253 0
## 6083254 0
## 6083255 0
## 6083256 0
## 6083257 0
## 6083258 0
## 6083259 0
# 資料量太大只取2000筆資料
hdat <- hflights[1:2000,]
hist(hdat$Distance)plot(hdat$AirTime, type="h")plot(x=hdat$AirTime, y=hdat$Distance)hdat <- hflights[181:200,1:5]
any(is.na(hdat))## [1] TRUE
hdat## Year Month DayofMonth DayOfWeek DepTime
## 33060 2011 1 10 1 1038
## 33061 2011 1 11 2 1134
## 33062 2011 1 12 3 1019
## 33063 2011 1 13 4 1018
## 33064 2011 1 14 5 1024
## 33065 2011 1 15 6 1022
## 33066 2011 1 16 7 1021
## 33067 2011 1 17 1 1019
## 33068 2011 1 18 2 1017
## 33069 2011 1 19 3 1016
## 33070 2011 1 20 4 1015
## 33071 2011 1 21 5 1017
## 33072 2011 1 22 6 1026
## 33073 2011 1 23 7 1012
## 33074 2011 1 24 1 NA
## 33075 2011 1 25 2 1040
## 33076 2011 1 26 3 1015
## 33077 2011 1 27 4 1017
## 33078 2011 1 28 5 1020
## 33079 2011 1 29 6 1015
complete.cases(hdat)## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
na.omit(hdat)## Year Month DayofMonth DayOfWeek DepTime
## 33060 2011 1 10 1 1038
## 33061 2011 1 11 2 1134
## 33062 2011 1 12 3 1019
## 33063 2011 1 13 4 1018
## 33064 2011 1 14 5 1024
## 33065 2011 1 15 6 1022
## 33066 2011 1 16 7 1021
## 33067 2011 1 17 1 1019
## 33068 2011 1 18 2 1017
## 33069 2011 1 19 3 1016
## 33070 2011 1 20 4 1015
## 33071 2011 1 21 5 1017
## 33072 2011 1 22 6 1026
## 33073 2011 1 23 7 1012
## 33075 2011 1 25 2 1040
## 33076 2011 1 26 3 1015
## 33077 2011 1 27 4 1017
## 33078 2011 1 28 5 1020
## 33079 2011 1 29 6 1015
# try: hdat[complete.cases(hdat),]Package: tidyr
Package: dplyr
piping operator “%>%”
gather(wide_df, my_key, my_val, -col)library(tidyr)
iris_long <- gather(iris,measure,val,-Species)
head(iris_long)## Species measure val
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Length 4.9
## 3 setosa Sepal.Length 4.7
## 4 setosa Sepal.Length 4.6
## 5 setosa Sepal.Length 5.0
## 6 setosa Sepal.Length 5.4
The opposite of gather() is spread(), which takes key-values pairs and spreads them across multiple columns.
spread(long_df, my_key, my_val)stocks <- data.frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 0, 1),
Y = rnorm(10, 0, 2),
Z = rnorm(10, 0, 4)
)
stocksm <- gather(stocks, stock, price, -time)
stocksm## time stock price
## 1 2009-01-01 X 1.1431482
## 2 2009-01-02 X 0.3947804
## 3 2009-01-03 X -0.4696662
## 4 2009-01-04 X 0.1154922
## 5 2009-01-05 X -0.5865716
## 6 2009-01-06 X 0.1262386
## 7 2009-01-07 X 0.2057747
## 8 2009-01-08 X -1.2179181
## 9 2009-01-09 X 1.3067953
## 10 2009-01-10 X 0.3910988
## 11 2009-01-01 Y -0.9863420
## 12 2009-01-02 Y -1.0409016
## 13 2009-01-03 Y 0.1394411
## 14 2009-01-04 Y 1.1976775
## 15 2009-01-05 Y 1.4603047
## 16 2009-01-06 Y -0.9554024
## 17 2009-01-07 Y -2.1406907
## 18 2009-01-08 Y 0.8740856
## 19 2009-01-09 Y 0.3567158
## 20 2009-01-10 Y 2.2240829
## 21 2009-01-01 Z 5.5719442
## 22 2009-01-02 Z 2.4844178
## 23 2009-01-03 Z 0.7518040
## 24 2009-01-04 Z 3.0963480
## 25 2009-01-05 Z 4.0063066
## 26 2009-01-06 Z -0.7690407
## 27 2009-01-07 Z -3.5785023
## 28 2009-01-08 Z -4.2355729
## 29 2009-01-09 Z -0.2205083
## 30 2009-01-10 Z 0.4557198
stocksm1 <- spread(stocksm, stock, price)
stocksm1## time X Y Z
## 1 2009-01-01 1.1431482 -0.9863420 5.5719442
## 2 2009-01-02 0.3947804 -1.0409016 2.4844178
## 3 2009-01-03 -0.4696662 0.1394411 0.7518040
## 4 2009-01-04 0.1154922 1.1976775 3.0963480
## 5 2009-01-05 -0.5865716 1.4603047 4.0063066
## 6 2009-01-06 0.1262386 -0.9554024 -0.7690407
## 7 2009-01-07 0.2057747 -2.1406907 -3.5785023
## 8 2009-01-08 -1.2179181 0.8740856 -4.2355729
## 9 2009-01-09 1.3067953 0.3567158 -0.2205083
## 10 2009-01-10 0.3910988 2.2240829 0.4557198
stocksm2 <- spread(stocksm, time, price)
stocksm2## stock 2009-01-01 2009-01-02 2009-01-03 2009-01-04 2009-01-05 2009-01-06
## 1 X 1.143148 0.3947804 -0.4696662 0.1154922 -0.5865716 0.1262386
## 2 Y -0.986342 -1.0409016 0.1394411 1.1976775 1.4603047 -0.9554024
## 3 Z 5.571944 2.4844178 0.7518040 3.0963480 4.0063066 -0.7690407
## 2009-01-07 2009-01-08 2009-01-09 2009-01-10
## 1 0.2057747 -1.2179181 1.3067953 0.3910988
## 2 -2.1406907 0.8740856 0.3567158 2.2240829
## 3 -3.5785023 -4.2355729 -0.2205083 0.4557198
separate(treatments, year_mo, c("year", "month"))iris_long2 <- separate(iris_long, measure, c("type", "LW"),sep="\\.")
head(iris_long2)## Species type LW val
## 1 setosa Sepal Length 5.1
## 2 setosa Sepal Length 4.9
## 3 setosa Sepal Length 4.7
## 4 setosa Sepal Length 4.6
## 5 setosa Sepal Length 5.0
## 6 setosa Sepal Length 5.4
unite(data, col, from, sep = "_", remove = TRUE)範例
iris_long3 <- unite(iris_long2, "measure", c(Species, type, LW), sep="-")
head(iris_long3)## measure val
## 1 setosa-Sepal-Length 5.1
## 2 setosa-Sepal-Length 4.9
## 3 setosa-Sepal-Length 4.7
## 4 setosa-Sepal-Length 4.6
## 5 setosa-Sepal-Length 5.0
## 6 setosa-Sepal-Length 5.4
另一個範例
hdat <- hflights[1:20, 1:8]
hdat## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## 5430 2011 1 7 5 1359 1509 AA
## 5431 2011 1 8 6 1355 1454 AA
## 5432 2011 1 9 7 1443 1554 AA
## 5433 2011 1 10 1 1443 1553 AA
## 5434 2011 1 11 2 1429 1539 AA
## 5435 2011 1 12 3 1419 1515 AA
## 5436 2011 1 13 4 1358 1501 AA
## 5437 2011 1 14 5 1357 1504 AA
## 5438 2011 1 15 6 1359 1459 AA
## 5439 2011 1 16 7 1359 1509 AA
## 5440 2011 1 17 1 1530 1634 AA
## 5441 2011 1 18 2 1408 1508 AA
## 5442 2011 1 19 3 1356 1503 AA
## 5443 2011 1 20 4 1507 1622 AA
## FlightNum
## 5424 428
## 5425 428
## 5426 428
## 5427 428
## 5428 428
## 5429 428
## 5430 428
## 5431 428
## 5432 428
## 5433 428
## 5434 428
## 5435 428
## 5436 428
## 5437 428
## 5438 428
## 5439 428
## 5440 428
## 5441 428
## 5442 428
## 5443 428
# 合併年月日資料
hdat1 <- unite(hdat, "Date",c(Year,Month,DayofMonth), sep="-")
hdat1## Date DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011-1-1 6 1400 1500 AA 428
## 5425 2011-1-2 7 1401 1501 AA 428
## 5426 2011-1-3 1 1352 1502 AA 428
## 5427 2011-1-4 2 1403 1513 AA 428
## 5428 2011-1-5 3 1405 1507 AA 428
## 5429 2011-1-6 4 1359 1503 AA 428
## 5430 2011-1-7 5 1359 1509 AA 428
## 5431 2011-1-8 6 1355 1454 AA 428
## 5432 2011-1-9 7 1443 1554 AA 428
## 5433 2011-1-10 1 1443 1553 AA 428
## 5434 2011-1-11 2 1429 1539 AA 428
## 5435 2011-1-12 3 1419 1515 AA 428
## 5436 2011-1-13 4 1358 1501 AA 428
## 5437 2011-1-14 5 1357 1504 AA 428
## 5438 2011-1-15 6 1359 1459 AA 428
## 5439 2011-1-16 7 1359 1509 AA 428
## 5440 2011-1-17 1 1530 1634 AA 428
## 5441 2011-1-18 2 1408 1508 AA 428
## 5442 2011-1-19 3 1356 1503 AA 428
## 5443 2011-1-20 4 1507 1622 AA 428
# 合併年月日資料: 調整格式
hdat$Month <- formatC(hdat$Month, width=2, flag="0")
hdat$DayofMonth <- formatC(hdat$DayofMonth, width=2, flag="0")
hdat2 <- unite(hdat, "Date",c(Year,Month,DayofMonth), sep="-")
hdat2## Date DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011-01-01 6 1400 1500 AA 428
## 5425 2011-01-02 7 1401 1501 AA 428
## 5426 2011-01-03 1 1352 1502 AA 428
## 5427 2011-01-04 2 1403 1513 AA 428
## 5428 2011-01-05 3 1405 1507 AA 428
## 5429 2011-01-06 4 1359 1503 AA 428
## 5430 2011-01-07 5 1359 1509 AA 428
## 5431 2011-01-08 6 1355 1454 AA 428
## 5432 2011-01-09 7 1443 1554 AA 428
## 5433 2011-01-10 1 1443 1553 AA 428
## 5434 2011-01-11 2 1429 1539 AA 428
## 5435 2011-01-12 3 1419 1515 AA 428
## 5436 2011-01-13 4 1358 1501 AA 428
## 5437 2011-01-14 5 1357 1504 AA 428
## 5438 2011-01-15 6 1359 1459 AA 428
## 5439 2011-01-16 7 1359 1509 AA 428
## 5440 2011-01-17 1 1530 1634 AA 428
## 5441 2011-01-18 2 1408 1508 AA 428
## 5442 2011-01-19 3 1356 1503 AA 428
## 5443 2011-01-20 4 1507 1622 AA 428
As an example, take the following call, that selects the variables var1 and var2 from the data frame df.
select(df, var1, var2)# hflights is pre-loaded as a tbl, together with the necessary libraries.
# Print out a tbl with the four columns of hflights related to delay
s1 <- select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
head(s1)## ActualElapsedTime AirTime ArrDelay DepDelay
## 5424 60 40 -10 0
## 5425 60 45 -9 1
## 5426 70 48 -8 -8
## 5427 70 39 3 3
## 5428 62 44 -3 5
## 5429 64 45 -7 -1
# Print out the columns Origin up to Cancelled of hflights
s2 <- select(hflights,Origin:Cancelled)
head(s2)## Origin Dest Distance TaxiIn TaxiOut Cancelled
## 5424 IAH DFW 224 7 13 0
## 5425 IAH DFW 224 6 9 0
## 5426 IAH DFW 224 5 17 0
## 5427 IAH DFW 224 9 22 0
## 5428 IAH DFW 224 9 9 0
## 5429 IAH DFW 224 6 13 0
# Answer to last question: be concise!
s3 <- select(hflights,Year:DayOfWeek,ArrDelay:Diverted)
head(s3)## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## 5424 2011 1 1 6 -10 0 IAH DFW
## 5425 2011 1 2 7 -9 1 IAH DFW
## 5426 2011 1 3 1 -8 -8 IAH DFW
## 5427 2011 1 4 2 3 3 IAH DFW
## 5428 2011 1 5 3 -3 5 IAH DFW
## 5429 2011 1 6 4 -7 -1 IAH DFW
## Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 224 7 13 0 0
## 5425 224 6 9 0 0
## 5426 224 5 17 0 0
## 5427 224 9 22 0 0
## 5428 224 9 9 0 0
## 5429 224 6 13 0 0
Take this example that adds a new column, z, which is the element-wise sum of the columns x and y, to the data frame df:
mutate(df, z = x + y)# hflights and dplyr are loaded and ready to serve you.
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights,ActualGroundTime=ActualElapsedTime-AirTime)
head(g1)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted ActualGroundTime
## 1 7 13 0 0 20
## 2 6 9 0 0 15
## 3 5 17 0 0 22
## 4 9 22 0 0 31
## 5 9 9 0 0 18
## 6 6 13 0 0 19
# Add the new variable GroundTime to g1. Save the result as g2.
g2 <- mutate(g1, GroundTime=TaxiIn+ TaxiOut)
head(g2)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted ActualGroundTime
## 1 7 13 0 0 20
## 2 6 9 0 0 15
## 3 5 17 0 0 22
## 4 9 22 0 0 31
## 5 9 9 0 0 18
## 6 6 13 0 0 19
## GroundTime
## 1 20
## 2 15
## 3 22
## 4 31
## 5 18
## 6 19
# Add the new variable AverageSpeed to g2. Save the result as g3.
g3 <- mutate(g2, AverageSpeed=Distance / AirTime * 60)
head(g3)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted ActualGroundTime
## 1 7 13 0 0 20
## 2 6 9 0 0 15
## 3 5 17 0 0 22
## 4 9 22 0 0 31
## 5 9 9 0 0 18
## 6 6 13 0 0 19
## GroundTime AverageSpeed
## 1 20 336.0000
## 2 15 298.6667
## 3 22 280.0000
## 4 31 344.6154
## 5 18 305.4545
## 6 19 298.6667
# Add a variable tdelay to the dataset: g4
g4 <- mutate(hflights, tdelay = ArrDelay + DepDelay)
head(g4)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted tdelay
## 1 7 13 0 0 -10
## 2 6 9 0 0 -8
## 3 5 17 0 0 -16
## 4 9 22 0 0 6
## 5 9 9 0 0 2
## 6 6 13 0 0 -8
THe following example filters df such that only the observations for which a is positive, are kept:
filter(df, a > 0)# All flights that traveled 3000 miles or more
f1 <- filter(hflights, Distance>=3000)
head(f1)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 31 1 924 1413 CO 1
## 2 2011 1 30 7 925 1410 CO 1
## 3 2011 1 29 6 1045 1445 CO 1
## 4 2011 1 28 5 1516 1916 CO 1
## 5 2011 1 27 4 950 1344 CO 1
## 6 2011 1 26 3 944 1350 CO 1
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N69063 529 492 23 -1 IAH HNL 3904
## 2 N76064 525 493 20 0 IAH HNL 3904
## 3 N69063 480 459 55 80 IAH HNL 3904
## 4 N77066 480 463 326 351 IAH HNL 3904
## 5 N76055 474 455 -6 25 IAH HNL 3904
## 6 N76065 486 471 0 19 IAH HNL 3904
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 6 31 0 0
## 2 13 19 0 0
## 3 4 17 0 0
## 4 7 10 0 0
## 5 4 15 0 0
## 6 5 10 0 0
# All flights flown by one of JetBlue, Southwest, or Delta
f2 <- filter(hflights, UniqueCarrier %in% c("JetBlue", "Southwest", "Delta"))
head(f2)## [1] Year Month DayofMonth
## [4] DayOfWeek DepTime ArrTime
## [7] UniqueCarrier FlightNum TailNum
## [10] ActualElapsedTime AirTime ArrDelay
## [13] DepDelay Origin Dest
## [16] Distance TaxiIn TaxiOut
## [19] Cancelled CancellationCode Diverted
## <0 rows> (or 0-length row.names)
# All flights where taxiing took longer than flying
f3 <- filter(hflights, TaxiOut+TaxiIn>AirTime)
head(f3)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 24 1 731 904 AA 460
## 2 2011 1 30 7 1959 2132 AA 533
## 3 2011 1 24 1 1621 1749 AA 1121
## 4 2011 1 10 1 941 1113 AA 1436
## 5 2011 1 31 1 1301 1356 CO 241
## 6 2011 1 31 1 2113 2215 CO 1533
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N545AA 93 42 29 11 IAH DFW 224
## 2 N455AA 93 43 12 -6 IAH DFW 224
## 3 N484AA 88 43 4 -9 IAH DFW 224
## 4 N591AA 92 45 48 31 IAH DFW 224
## 5 N14629 55 27 -2 -4 IAH AUS 140
## 6 N72405 62 30 20 13 IAH AUS 140
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 14 37 0 0
## 2 10 40 0 0
## 3 10 35 0 0
## 4 27 20 0 0
## 5 5 23 0 0
## 6 7 25 0 0
# All flights that departed before 5am or arrived after 10pm
f4 <- filter(hflights, DepTime<500 | ArrTime > 2200)
head(f4)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 4 2 2100 2207 AA 533
## 2 2011 1 14 5 2119 2229 AA 533
## 3 2011 1 10 1 1934 2235 AA 1294
## 4 2011 1 26 3 1905 2211 AA 1294
## 5 2011 1 30 7 1856 2209 AA 1294
## 6 2011 1 9 7 1938 2228 AS 731
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N4XGAA 67 42 47 55 IAH DFW 224
## 2 N549AA 70 45 69 74 IAH DFW 224
## 3 N3BXAA 121 107 80 99 IAH MIA 964
## 4 N3BXAA 126 111 56 70 IAH MIA 964
## 5 N3CPAA 133 108 54 61 IAH MIA 964
## 6 N609AS 290 253 78 73 IAH SEA 1874
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 3 22 0 0
## 2 5 20 0 0
## 3 3 11 0 0
## 4 5 10 0 0
## 5 7 18 0 0
## 6 5 32 0 0
# All flights that departed late but arrived ahead of schedule
f5 <- filter(hflights, DepDelay>0 & ArrDelay<0)
head(f5)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 2 7 1401 1501 AA 428
## 2 2011 1 5 3 1405 1507 AA 428
## 3 2011 1 18 2 1408 1508 AA 428
## 4 2011 1 18 2 721 827 AA 460
## 5 2011 1 12 3 2015 2113 AA 533
## 6 2011 1 13 4 2020 2116 AA 533
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N557AA 60 45 -9 1 IAH DFW 224
## 2 N492AA 62 44 -3 5 IAH DFW 224
## 3 N507AA 60 42 -2 8 IAH DFW 224
## 4 N558AA 66 46 -8 1 IAH DFW 224
## 5 N555AA 58 39 -7 10 IAH DFW 224
## 6 N4XCAA 56 44 -4 15 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 6 9 0 0
## 2 9 9 0 0
## 3 7 11 0 0
## 4 7 13 0 0
## 5 9 10 0 0
## 6 4 8 0 0
# All flights that were cancelled after being delayed
f6 <- filter(hflights, Cancelled == 1 & DepDelay>0)
head(f6)## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 26 3 1926 NA CO 310
## 2 2011 1 11 2 1100 NA US 944
## 3 2011 1 19 3 1811 NA XE 2376
## 4 2011 1 7 5 2028 NA XE 3050
## 5 2011 2 4 5 1638 NA AA 1121
## 6 2011 2 8 2 1057 NA CO 408
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N77865 NA NA NA 26 IAH EWR 1400
## 2 N452UW NA NA NA 135 IAH CLT 913
## 3 N15932 NA NA NA 6 IAH ICT 542
## 4 N15912 NA NA NA 73 IAH JAX 817
## 5 N537AA NA NA NA 8 IAH DFW 224
## 6 N11641 NA NA NA 187 IAH EWR 1400
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 NA NA 1 B 0
## 2 NA NA 1 B 0
## 3 NA NA 1 B 0
## 4 NA 19 1 A 0
## 5 NA 19 1 A 0
## 6 NA NA 1 A 0
arrange() can be used to rearrange rows according to any type of data
# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
# Arrange dtc by departure delays
a1 <- arrange(dtc,DepDelay)
# Arrange dtc so that cancellation reasons are grouped
a2 <- arrange(dtc,CancellationCode)
# Arrange dtc according to carrier and departure delays
a3 <- arrange(dtc,UniqueCarrier,DepDelay)
# Arrange according to carrier and decreasing departure delays
a4 <- arrange(hflights,UniqueCarrier,desc(DepDelay))
# Arrange flights by total delay (normal order).
a5 <- arrange(hflights,DepDelay+ArrDelay)# Print out a summary with variables min_dist and max_dist
summarise(hflights,min_dist=min(Distance), max_dist=max(Distance))## min_dist max_dist
## 1 79 3904
# Print out a summary with variable max_div
summarise(filter(hflights, Diverted==1),max_div=max(Distance))## max_div
## 1 3904
# Generate summarizing statistics for hflights
summarise(hflights,
n_obs = n(),
n_carrier = n_distinct(UniqueCarrier),
n_dest = n_distinct(Dest))## n_obs n_carrier n_dest
## 1 227496 15 116
# Make an ordered per-carrier summary of hflights
hflights %>%
group_by(UniqueCarrier) %>%
summarise(p_canc = mean(Cancelled==1)*100,
avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
arrange(avg_delay, p_canc)## # A tibble: 15 × 3
## UniqueCarrier p_canc avg_delay
## <chr> <dbl> <dbl>
## 1 US 1.1268986 -0.6307692
## 2 AA 1.8495684 0.8917558
## 3 FL 0.9817672 1.8536239
## 4 AS 0.0000000 3.1923077
## 5 YV 1.2658228 4.0128205
## 6 DL 1.5903067 6.0841374
## 7 CO 0.6782614 6.0986983
## 8 MQ 2.9044750 7.1529751
## 9 EV 3.4482759 7.2569543
## 10 WN 1.5504047 7.5871430
## 11 F9 0.7159905 7.6682692
## 12 XE 1.5495599 8.1865242
## 13 OO 1.3946828 8.6934922
## 14 B6 2.5899281 9.8588410
## 15 UA 1.6409266 10.4628628
# Ordered overview of average arrival delays per carrier
hflights %>%
filter(!is.na(ArrDelay) & ArrDelay>0) %>%
group_by(UniqueCarrier) %>%
summarise(avg=mean(ArrDelay)) %>%
mutate(rank=rank(avg)) %>%
arrange(rank)## # A tibble: 15 × 3
## UniqueCarrier avg rank
## <chr> <dbl> <dbl>
## 1 YV 18.67568 1
## 2 F9 18.68683 2
## 3 US 20.70235 3
## 4 CO 22.13374 4
## 5 AS 22.91195 5
## 6 OO 24.14663 6
## 7 XE 24.19337 7
## 8 WN 25.27750 8
## 9 FL 27.85693 9
## 10 AA 28.49740 10
## 11 DL 32.12463 11
## 12 UA 32.48067 12
## 13 MQ 38.75135 13
## 14 EV 40.24231 14
## 15 B6 45.47744 15