# Show the directory the session starts in.
getwd()
## [1] "/Users/mac/bigdata"
# Switch to the project directory only when it exists on this machine.
# An unconditional setwd() on a user-specific absolute path stops the script
# with "cannot change working directory" on every other computer.
project_dir <- "/Users/mac/bigdata"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# Confirm which directory is in effect from here on.
getwd()
## [1] "/Users/mac/bigdata"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the `cars` data set, then look at its column types and at the
# per-column summary statistics.
data(list = "cars")
cars %>% glimpse()
## Rows: 50
## Columns: 2
## $ speed <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13…
## $ dist  <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34…
cars %>% summary()
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
# Label every row "fast" when its speed is above 15.4 (the mean speed reported
# by summary(cars)) and "slow" otherwise.
# The comparison uses the data-masked column `speed`, not `cars$speed`, so it
# always evaluates against the rows flowing through the pipe rather than
# against the global `cars` object.
cars1 <- cars %>%
  mutate(violation = ifelse(speed > 15.4, "fast", "slow"))
# Print the labelled data.
cars1
##    speed dist violation
## 1      4    2      slow
## 2      4   10      slow
## 3      7    4      slow
## 4      7   22      slow
## 5      8   16      slow
## 6      9   10      slow
## 7     10   18      slow
## 8     10   26      slow
## 9     10   34      slow
## 10    11   17      slow
## 11    11   28      slow
## 12    12   14      slow
## 13    12   20      slow
## 14    12   24      slow
## 15    12   28      slow
## 16    13   26      slow
## 17    13   34      slow
## 18    13   34      slow
## 19    13   46      slow
## 20    14   26      slow
## 21    14   36      slow
## 22    14   60      slow
## 23    14   80      slow
## 24    15   20      slow
## 25    15   26      slow
## 26    15   54      slow
## 27    16   32      fast
## 28    16   40      fast
## 29    17   32      fast
## 30    17   40      fast
## 31    17   50      fast
## 32    18   42      fast
## 33    18   56      fast
## 34    18   76      fast
## 35    18   84      fast
## 36    19   36      fast
## 37    19   46      fast
## 38    19   68      fast
## 39    20   32      fast
## 40    20   48      fast
## 41    20   52      fast
## 42    20   56      fast
## 43    20   64      fast
## 44    22   66      fast
## 45    23   54      fast
## 46    24   70      fast
## 47    24   92      fast
## 48    24   93      fast
## 49    24  120      fast
## 50    25   85      fast
# Column types before the conversion: `violation` is still character.
cars1 %>% glimpse()
## Rows: 50
## Columns: 3
## $ speed     <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13…
## $ dist      <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26…
## $ violation <chr> "slow", "slow", "slow", "slow", "slow", "slow", "slow", "slo…
# Turn the labels into a factor, then check the column types again.
cars1 <- cars1 %>% mutate(violation = factor(violation))
cars1 %>% glimpse()
## Rows: 50
## Columns: 3
## $ speed     <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13…
## $ dist      <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26…
## $ violation <fct> slow, slow, slow, slow, slow, slow, slow, slow, slow, slow, …
# Mean speed within each violation class.
cars1 %>%
  group_by(violation) %>%
  summarise(p = mean(speed))
## # A tibble: 2 × 2
##   violation     p
##   <fct>     <dbl>
## 1 fast       19.9
## 2 slow       11.2
library(hflights)
# Per-column summary statistics (including NA counts) for hflights.
hflights %>% summary()
##       Year          Month          DayofMonth      DayOfWeek        DepTime    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400  
##                                                                  NA's   :2905  
##     ArrTime     UniqueCarrier        FlightNum      TailNum         
##  Min.   :   1   Length:227496      Min.   :   1   Length:227496     
##  1st Qu.:1215   Class :character   1st Qu.: 855   Class :character  
##  Median :1617   Mode  :character   Median :1696   Mode  :character  
##  Mean   :1578                      Mean   :1962                     
##  3rd Qu.:1953                      3rd Qu.:2755                     
##  Max.   :2400                      Max.   :7290                     
##  NA's   :3066                                                       
##  ActualElapsedTime    AirTime         ArrDelay          DepDelay      
##  Min.   : 34.0     Min.   : 11.0   Min.   :-70.000   Min.   :-33.000  
##  1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000   1st Qu.: -3.000  
##  Median :128.0     Median :107.0   Median :  0.000   Median :  0.000  
##  Mean   :129.3     Mean   :108.1   Mean   :  7.094   Mean   :  9.445  
##  3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000   3rd Qu.:  9.000  
##  Max.   :575.0     Max.   :549.0   Max.   :978.000   Max.   :981.000  
##  NA's   :3622      NA's   :3622    NA's   :3622      NA's   :2905     
##     Origin              Dest              Distance          TaxiIn       
##  Length:227496      Length:227496      Min.   :  79.0   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.: 376.0   1st Qu.:  4.000  
##  Mode  :character   Mode  :character   Median : 809.0   Median :  5.000  
##                                        Mean   : 787.8   Mean   :  6.099  
##                                        3rd Qu.:1042.0   3rd Qu.:  7.000  
##                                        Max.   :3904.0   Max.   :165.000  
##                                                         NA's   :3066     
##     TaxiOut         Cancelled       CancellationCode      Diverted       
##  Min.   :  1.00   Min.   :0.00000   Length:227496      Min.   :0.000000  
##  1st Qu.: 10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000  
##  Median : 14.00   Median :0.00000   Mode  :character   Median :0.000000  
##  Mean   : 15.09   Mean   :0.01307                      Mean   :0.002853  
##  3rd Qu.: 18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000  
##  Max.   :163.00   Max.   :1.00000                      Max.   :1.000000  
##  NA's   :2947
# Column names, types and leading values of hflights.
hflights %>% glimpse()
## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Number of rows recorded for each month.
hflights %>%
  count(Month)
##    Month     n
## 1      1 18910
## 2      2 17128
## 3      3 19470
## 4      4 18593
## 5      5 19172
## 6      6 19600
## 7      7 20548
## 8      8 20176
## 9      9 18065
## 10    10 18696
## 11    11 18021
## 12    12 19117
# Dimensions (rows, columns) of the subset whose Month is 2 or 8.
hflights %>%
  filter(Month %in% c(2, 8)) %>%
  dim()
## [1] 37304    21
# Look at the hflights columns again before counting by destination.
hflights %>% glimpse()
## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Number of rows for each destination code.
hflights %>%
  count(Dest)
##     Dest    n
## 1    ABQ 2812
## 2    AEX  724
## 3    AGS    1
## 4    AMA 1297
## 5    ANC  125
## 6    ASE  125
## 7    ATL 7886
## 8    AUS 5022
## 9    AVL  350
## 10   BFL  504
## 11   BHM 2736
## 12   BKG  110
## 13   BNA 3481
## 14   BOS 1752
## 15   BPT    3
## 16   BRO 1692
## 17   BTR 1762
## 18   BWI 2551
## 19   CAE  561
## 20   CHS 1200
## 21   CID  410
## 22   CLE 2140
## 23   CLT 4735
## 24   CMH 1348
## 25   COS 1657
## 26   CRP 4813
## 27   CRW  357
## 28   CVG 1535
## 29   DAL 9820
## 30   DAY  451
## 31   DCA 2699
## 32   DEN 5920
## 33   DFW 6653
## 34   DSM  647
## 35   DTW 2601
## 36   ECP  729
## 37   EGE  110
## 38   ELP 3036
## 39   EWR 4314
## 40   FLL 2462
## 41   GJT  403
## 42   GPT 1618
## 43   GRK   42
## 44   GRR  677
## 45   GSO  630
## 46   GSP 1123
## 47   GUC   86
## 48   HDN  110
## 49   HNL  402
## 50   HOB  309
## 51   HRL 3983
## 52   HSV  923
## 53   IAD 1980
## 54   ICT 1517
## 55   IND 1750
## 56   JAN 2011
## 57   JAX 2135
## 58   JFK  695
## 59   LAS 4082
## 60   LAX 6064
## 61   LBB 1333
## 62   LCH  364
## 63   LEX  584
## 64   LFT 2313
## 65   LGA 2730
## 66   LIT 1579
## 67   LRD 1188
## 68   MAF 2306
## 69   MCI 3174
## 70   MCO 3687
## 71   MDW 2094
## 72   MEM 2399
## 73   MFE 1128
## 74   MIA 2463
## 75   MKE 1588
## 76   MLU  292
## 77   MOB 1674
## 78   MSP 2010
## 79   MSY 6823
## 80   MTJ  164
## 81   OAK  690
## 82   OKC 3170
## 83   OMA 2044
## 84   ONT  952
## 85   ORD 5748
## 86   ORF  717
## 87   PBI 1253
## 88   PDX 1235
## 89   PHL 2367
## 90   PHX 5096
## 91   PIT 1664
## 92   PNS 1539
## 93   PSP  106
## 94   RDU 1740
## 95   RIC  900
## 96   RNO  243
## 97   RSW  948
## 98   SAN 2936
## 99   SAT 4893
## 100  SAV  863
## 101  SDF 1279
## 102  SEA 2615
## 103  SFO 2818
## 104  SHV  787
## 105  SJC  885
## 106  SJU  391
## 107  SLC 2033
## 108  SMF 1014
## 109  SNA 1661
## 110  STL 2509
## 111  TPA 3085
## 112  TUL 2924
## 113  TUS 1565
## 114  TYS 1210
## 115  VPS  880
## 116  XNA 1172
# The six destinations with the fewest rows, smallest count first.
hflights %>%
  count(Dest) %>%
  arrange(n) %>%
  head(n = 6)
##   Dest   n
## 1  AGS   1
## 2  BPT   3
## 3  GRK  42
## 4  GUC  86
## 5  PSP 106
## 6  BKG 110
# The six destinations with the most rows, largest count first.
hflights %>%
  count(Dest, sort = TRUE) %>%
  head(n = 6)
##   Dest    n
## 1  DAL 9820
## 2  ATL 7886
## 3  MSY 6823
## 4  DFW 6653
## 5  LAX 6064
## 6  DEN 5920
library(gapminder)
# Column names, types and leading values of gapminder.
gapminder %>% glimpse()
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Number of rows per country.
count(gapminder, country)
## # A tibble: 142 × 2
##    country         n
##    <fct>       <int>
##  1 Afghanistan    12
##  2 Albania        12
##  3 Algeria        12
##  4 Angola         12
##  5 Argentina      12
##  6 Australia      12
##  7 Austria        12
##  8 Bahrain        12
##  9 Bangladesh     12
## 10 Belgium        12
## # ℹ 132 more rows
# Number of rows per continent.
count(gapminder, continent)
## # A tibble: 5 × 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
# Mean life expectancy for each continent.
gapminder %>%
  group_by(continent) %>%
  summarise(m = mean(lifeExp))
## # A tibble: 5 × 2
##   continent     m
##   <fct>     <dbl>
## 1 Africa     48.9
## 2 Americas   64.7
## 3 Asia       60.1
## 4 Europe     71.9
## 5 Oceania    74.3
# Mean life expectancy for each continent, lowest mean first.
gapminder %>%
  group_by(continent) %>%
  summarise(m = mean(lifeExp)) %>%
  arrange(m)
## # A tibble: 5 × 2
##   continent     m
##   <fct>     <dbl>
## 1 Africa     48.9
## 2 Asia       60.1
## 3 Americas   64.7
## 4 Europe     71.9
## 5 Oceania    74.3
# Mean life expectancy for each continent, highest mean first.
gapminder %>%
  group_by(continent) %>%
  summarise(m = mean(lifeExp)) %>%
  arrange(desc(m))
## # A tibble: 5 × 2
##   continent     m
##   <fct>     <dbl>
## 1 Oceania    74.3
## 2 Europe     71.9
## 3 Americas   64.7
## 4 Asia       60.1
## 5 Africa     48.9
# Download the tips CSV from the seaborn-data repository and inspect its
# columns.
tips <- read.csv(
  file = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
)
tips %>% glimpse()
## Rows: 244
## Columns: 7
## $ total_bill <dbl> 16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.77, 26.88, 15.0…
## $ tip        <dbl> 1.01, 1.66, 3.50, 3.31, 3.61, 4.71, 2.00, 3.12, 1.96, 3.23,…
## $ sex        <chr> "Female", "Male", "Male", "Male", "Female", "Male", "Male",…
## $ smoker     <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",…
## $ day        <chr> "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Su…
## $ time       <chr> "Dinner", "Dinner", "Dinner", "Dinner", "Dinner", "Dinner",…
## $ size       <int> 2, 3, 3, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 3, 3, 3, 3,…
# First six rows of tips.
tips %>% head()
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
library(dplyr)
library(ggplot2)

# Distribution of the `size` column. `size` holds small whole numbers, so one
# bar per value (binwidth = 1) replaces the default 30 bins, most of which
# would be empty; setting it also silences ggplot2's
# "Pick better value with `binwidth`" message.
tips %>%
  ggplot(aes(x = size)) +
  geom_histogram(binwidth = 1)

# Tip against total bill, drawn three ways: plain points, points coloured by
# day, and larger points coloured by day with the marker shape set by sex.
tips %>%
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point()

tips %>%
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point(aes(colour = day))

tips %>%
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point(aes(colour = day, shape = sex), size = 3)

# Look at the gapminder columns again before plotting.
gapminder %>% glimpse()
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Life expectancy against year with one colour per continent: faint points
# plus a smoothed trend line (geom_smooth defaults) for each continent.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, colour = continent)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Keep only the 1952 rows of gapminder.
x <- filter(gapminder, year == 1952)

# Base-graphics histogram of 1952 life expectancy. The argument is written as
# `x$lifeExp` because hist() builds its default x-axis label from that
# expression.
hist(x$lifeExp, main = "Histogram of lifeExp in 1952")

# Box plot of 1952 life expectancy, one box per continent.
ggplot(x, aes(x = continent, y = lifeExp)) +
  geom_boxplot()