R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. You can check out my Online Portfolio.

Load Packages

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Check out the diamonds dataset in ggplot2

Subset by rows with filter

# Keep only the rows whose cut is "Ideal"
filter(diamonds, cut == "Ideal")
## # A tibble: 21,551 x 10
##    carat cut   color clarity depth table price     x     y     z
##    <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46
##  3  0.31 Ideal J     SI2      62.2    54   344  4.35  4.37  2.71
##  4  0.3  Ideal I     SI2      62      54   348  4.31  4.34  2.68
##  5  0.33 Ideal I     SI2      61.8    55   403  4.49  4.51  2.78
##  6  0.33 Ideal I     SI2      61.2    56   403  4.49  4.5   2.75
##  7  0.33 Ideal J     SI1      61.1    56   403  4.49  4.55  2.76
##  8  0.23 Ideal G     VS1      61.9    54   404  3.93  3.95  2.44
##  9  0.32 Ideal I     SI1      60.9    55   404  4.45  4.48  2.72
## 10  0.3  Ideal I     SI2      61      59   405  4.3   4.33  2.63
## # ... with 21,541 more rows
# Keep only the rows priced above 10,000
filter(diamonds, price > 10000)
## # A tibble: 5,222 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  1.7  Ideal     J     VS2      60.5    58 10002  7.73  7.74  4.68
##  2  1.03 Ideal     E     VVS2     60.6    59 10003  6.5   6.53  3.95
##  3  1.23 Very Good G     VVS2     60.6    55 10004  6.93  7.02  4.23
##  4  1.25 Ideal     F     VS2      61.6    55 10006  6.93  6.96  4.28
##  5  2.01 Very Good I     SI2      61.4    63 10009  8.19  7.96  4.96
##  6  1.21 Very Good F     VS1      62.3    58 10009  6.76  6.85  4.24
##  7  1.51 Premium   I     VS2      59.9    60 10010  7.42  7.36  4.43
##  8  1.01 Fair      D     SI2      64.6    58 10011  6.25  6.2   4.02
##  9  1.05 Ideal     F     VVS2     60.5    55 10011  6.67  6.58  4.01
## 10  1.6  Ideal     J     VS1      62      53 10011  7.57  7.56  4.69
## # ... with 5,212 more rows
# Comma-separated conditions are combined with AND:
# keep rows that are both "Ideal" cut and priced above 10,000.
# (The stray trailing comma after the last condition has been removed.)
filter(
  diamonds,
  cut == "Ideal",
  price > 10000
)
## # A tibble: 1,770 x 10
##    carat cut   color clarity depth table price     x     y     z
##    <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  1.7  Ideal J     VS2      60.5    58 10002  7.73  7.74  4.68
##  2  1.03 Ideal E     VVS2     60.6    59 10003  6.5   6.53  3.95
##  3  1.25 Ideal F     VS2      61.6    55 10006  6.93  6.96  4.28
##  4  1.05 Ideal F     VVS2     60.5    55 10011  6.67  6.58  4.01
##  5  1.6  Ideal J     VS1      62      53 10011  7.57  7.56  4.69
##  6  1.51 Ideal H     SI1      61.3    56 10012  7.44  7.4   4.55
##  7  1.13 Ideal F     VS1      60.9    57 10016  6.73  6.76  4.11
##  8  1.04 Ideal E     VVS2     62.9    55 10019  6.47  6.51  4.08
##  9  1.22 Ideal G     VVS2     62.3    56 10038  6.81  6.84  4.25
## 10  1.3  Ideal G     VS1      62      55 10038  6.98  7.02  4.34
## # ... with 1,760 more rows
# `|` is an element-wise OR:
# keep rows that are "Ideal" cut, priced above 10,000, or both
filter(diamonds, cut == "Ideal" | price > 10000)
## # A tibble: 25,003 x 10
##    carat cut   color clarity depth table price     x     y     z
##    <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46
##  3  0.31 Ideal J     SI2      62.2    54   344  4.35  4.37  2.71
##  4  0.3  Ideal I     SI2      62      54   348  4.31  4.34  2.68
##  5  0.33 Ideal I     SI2      61.8    55   403  4.49  4.51  2.78
##  6  0.33 Ideal I     SI2      61.2    56   403  4.49  4.5   2.75
##  7  0.33 Ideal J     SI1      61.1    56   403  4.49  4.55  2.76
##  8  0.23 Ideal G     VS1      61.9    54   404  3.93  3.95  2.44
##  9  0.32 Ideal I     SI1      60.9    55   404  4.45  4.48  2.72
## 10  0.3  Ideal I     SI2      61      59   405  4.3   4.33  2.63
## # ... with 24,993 more rows

Subset by columns with select

# Keep just the cut and color columns
select(diamonds, cut, color)
## # A tibble: 53,940 x 2
##    cut       color
##    <ord>     <ord>
##  1 Ideal     E    
##  2 Premium   E    
##  3 Good      E    
##  4 Premium   I    
##  5 Good      J    
##  6 Very Good J    
##  7 Very Good I    
##  8 Very Good H    
##  9 Fair      E    
## 10 Very Good H    
## # ... with 53,930 more rows
# Keep the first four columns, chosen by position
select(diamonds, 1:4)
## # A tibble: 53,940 x 4
##    carat cut       color clarity
##    <dbl> <ord>     <ord> <ord>  
##  1  0.23 Ideal     E     SI2    
##  2  0.21 Premium   E     SI1    
##  3  0.23 Good      E     VS1    
##  4  0.29 Premium   I     VS2    
##  5  0.31 Good      J     SI2    
##  6  0.24 Very Good J     VVS2   
##  7  0.24 Very Good I     VVS1   
##  8  0.26 Very Good H     SI1    
##  9  0.22 Fair      E     VS2    
## 10  0.23 Very Good H     VS1    
## # ... with 53,930 more rows
# Keep every column whose name begins with "c"
select(diamonds, starts_with("c"))
## # A tibble: 53,940 x 4
##    carat cut       color clarity
##    <dbl> <ord>     <ord> <ord>  
##  1  0.23 Ideal     E     SI2    
##  2  0.21 Premium   E     SI1    
##  3  0.23 Good      E     VS1    
##  4  0.29 Premium   I     VS2    
##  5  0.31 Good      J     SI2    
##  6  0.24 Very Good J     VVS2   
##  7  0.24 Very Good I     VVS1   
##  8  0.26 Very Good H     SI1    
##  9  0.22 Fair      E     VS2    
## 10  0.23 Very Good H     VS1    
## # ... with 53,930 more rows
# Keep every column whose name has a "c" anywhere in it
select(diamonds, contains("c"))
## # A tibble: 53,940 x 5
##    carat cut       color clarity price
##    <dbl> <ord>     <ord> <ord>   <int>
##  1  0.23 Ideal     E     SI2       326
##  2  0.21 Premium   E     SI1       326
##  3  0.23 Good      E     VS1       327
##  4  0.29 Premium   I     VS2       334
##  5  0.31 Good      J     SI2       335
##  6  0.24 Very Good J     VVS2      336
##  7  0.24 Very Good I     VVS1      336
##  8  0.26 Very Good H     SI1       337
##  9  0.22 Fair      E     VS2       337
## 10  0.23 Very Good H     VS1       338
## # ... with 53,930 more rows
# Move price to the front; everything() supplies the remaining columns
select(diamonds, price, everything())
## # A tibble: 53,940 x 10
##    price carat cut       color clarity depth table     x     y     z
##    <int> <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1   326  0.23 Ideal     E     SI2      61.5    55  3.95  3.98  2.43
##  2   326  0.21 Premium   E     SI1      59.8    61  3.89  3.84  2.31
##  3   327  0.23 Good      E     VS1      56.9    65  4.05  4.07  2.31
##  4   334  0.29 Premium   I     VS2      62.4    58  4.2   4.23  2.63
##  5   335  0.31 Good      J     SI2      63.3    58  4.34  4.35  2.75
##  6   336  0.24 Very Good J     VVS2     62.8    57  3.94  3.96  2.48
##  7   336  0.24 Very Good I     VVS1     62.3    57  3.95  3.98  2.47
##  8   337  0.26 Very Good H     SI1      61.9    55  4.07  4.11  2.53
##  9   337  0.22 Fair      E     VS2      65.1    61  3.87  3.78  2.49
## 10   338  0.23 Very Good H     VS1      59.4    61  4     4.05  2.39
## # ... with 53,930 more rows
# Drop the price column and keep all the others
select(diamonds, -price)
## # A tibble: 53,940 x 9
##    carat cut       color clarity depth table     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61  4     4.05  2.39
## # ... with 53,930 more rows

Using the Pipe %>% operator

# Pipe diamonds into select() to keep every column except price
diamonds %>% select(-price)
## # A tibble: 53,940 x 9
##    carat cut       color clarity depth table     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61  4     4.05  2.39
## # ... with 53,930 more rows
# Reorder rows with arrange(): sort by color
diamonds %>% arrange(color)
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Very Good D     VS2      60.5    61   357  3.96  3.97  2.4 
##  2  0.23 Very Good D     VS1      61.9    58   402  3.92  3.96  2.44
##  3  0.26 Very Good D     VS2      60.8    59   403  4.13  4.16  2.52
##  4  0.26 Good      D     VS2      65.2    56   403  3.99  4.02  2.61
##  5  0.26 Good      D     VS1      58.4    63   403  4.19  4.24  2.46
##  6  0.22 Premium   D     VS2      59.3    62   404  3.91  3.88  2.31
##  7  0.3  Premium   D     SI1      62.6    59   552  4.23  4.27  2.66
##  8  0.3  Ideal     D     SI1      62.5    57   552  4.29  4.32  2.69
##  9  0.3  Ideal     D     SI1      62.1    56   552  4.3   4.33  2.68
## 10  0.24 Very Good D     VVS1     61.5    60   553  3.97  4     2.45
## # ... with 53,930 more rows
# Sort by carat first, breaking ties by color
diamonds %>% arrange(carat, color)
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1   0.2 Ideal     D     VS2      61.5    57   367  3.81  3.77  2.33
##  2   0.2 Premium   D     VS2      62.3    60   367  3.73  3.68  2.31
##  3   0.2 Premium   D     VS2      61.7    60   367  3.77  3.72  2.31
##  4   0.2 Premium   E     SI2      60.2    62   345  3.79  3.75  2.27
##  5   0.2 Premium   E     VS2      59.8    62   367  3.79  3.77  2.26
##  6   0.2 Premium   E     VS2      59      60   367  3.81  3.78  2.24
##  7   0.2 Premium   E     VS2      61.1    59   367  3.81  3.78  2.32
##  8   0.2 Premium   E     VS2      59.7    62   367  3.84  3.8   2.28
##  9   0.2 Ideal     E     VS2      59.7    55   367  3.86  3.84  2.3 
## 10   0.2 Very Good E     VS2      63.4    59   367  3.74  3.71  2.36
## # ... with 53,930 more rows
# Sort from the largest carat down to the smallest
diamonds %>% arrange(desc(carat))
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  5.01 Fair      J     I1       65.5    59 18018 10.7  10.5   6.98
##  2  4.5  Fair      J     I1       65.8    58 18531 10.2  10.2   6.72
##  3  4.13 Fair      H     I1       64.8    61 17329 10     9.85  6.43
##  4  4.01 Premium   I     I1       61      61 15223 10.1  10.1   6.17
##  5  4.01 Premium   J     I1       62.5    62 15223 10.0   9.94  6.24
##  6  4    Very Good I     I1       63.3    58 15984 10.0   9.94  6.31
##  7  3.67 Premium   I     I1       62.4    56 16193  9.86  9.81  6.13
##  8  3.65 Fair      H     I1       67.1    53 11668  9.53  9.48  6.38
##  9  3.51 Premium   J     VS2      62.5    59 18701  9.66  9.63  6.03
## 10  3.5  Ideal     H     I1       62.8    57 12587  9.65  9.59  6.03
## # ... with 53,930 more rows

Let's Glimpse

# Compact overview: one line per column showing its type and first values
diamonds %>% glimpse()
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.~
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver~
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,~
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, ~
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64~
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58~
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34~
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.~
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.~
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.~

Mutate or modify records

# Add three derived columns and overwrite cut, all in one mutate() call
diamonds %>%
  mutate(
    # carat converted to grams at 0.20 g per carat
    mass_g = 0.20 * carat,
    price_per_carat = price / carat,
    # replaces the ordered factor with lower-case character values
    cut = tolower(cut),
    # TRUE when the price exceeds 10,000
    expensive_TF = price > 10000
  )
## # A tibble: 53,940 x 13
##    carat cut       color clarity depth table price     x     y     z mass_g
##    <dbl> <chr>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>  <dbl>
##  1  0.23 ideal     E     SI2      61.5    55   326  3.95  3.98  2.43  0.046
##  2  0.21 premium   E     SI1      59.8    61   326  3.89  3.84  2.31  0.042
##  3  0.23 good      E     VS1      56.9    65   327  4.05  4.07  2.31  0.046
##  4  0.29 premium   I     VS2      62.4    58   334  4.2   4.23  2.63  0.058
##  5  0.31 good      J     SI2      63.3    58   335  4.34  4.35  2.75  0.062
##  6  0.24 very good J     VVS2     62.8    57   336  3.94  3.96  2.48  0.048
##  7  0.24 very good I     VVS1     62.3    57   336  3.95  3.98  2.47  0.048
##  8  0.26 very good H     SI1      61.9    55   337  4.07  4.11  2.53  0.052
##  9  0.22 fair      E     VS2      65.1    61   337  3.87  3.78  2.49  0.044
## 10  0.23 very good H     VS1      59.4    61   338  4     4.05  2.39  0.046
## # ... with 53,930 more rows, and 2 more variables: price_per_carat <dbl>,
## #   expensive_TF <lgl>

Other smaller verbs

See the help pages for: ?slice_max, ?bind_rows, ?left_join, ?rename, ?case_when

Group by

# Mean and standard deviation of price within each cut
diamonds %>%
  group_by(cut) %>%
  summarize(
    avg_price = mean(price),
    sd_price = sd(price)
  )
## # A tibble: 5 x 3
##   cut       avg_price sd_price
##   <ord>         <dbl>    <dbl>
## 1 Fair          4359.    3560.
## 2 Good          3929.    3682.
## 3 Very Good     3982.    3936.
## 4 Premium       4584.    4349.
## 5 Ideal         3458.    3808.
# Price summary and row count for every cut/color combination.
# .groups = "drop_last" spells out the default regrouping (the result stays
# grouped by cut), so summarize() no longer prints its grouping message.
diamonds %>%
  group_by(cut, color) %>%
  summarize(
    avg_price = mean(price),
    sd_price = sd(price),
    count = n(),
    .groups = "drop_last"
  )
## # A tibble: 35 x 5
## # Groups:   cut [5]
##    cut   color avg_price sd_price count
##    <ord> <ord>     <dbl>    <dbl> <int>
##  1 Fair  D         4291.    3286.   163
##  2 Fair  E         3682.    2977.   224
##  3 Fair  F         3827.    3223.   312
##  4 Fair  G         4239.    3610.   314
##  5 Fair  H         5136.    3886.   303
##  6 Fair  I         4685.    3730.   175
##  7 Fair  J         4976.    4050.   119
##  8 Good  D         3405.    3175.   662
##  9 Good  E         3424.    3331.   933
## 10 Good  F         3496.    3202.   909
## # ... with 25 more rows
# count() tallies the rows in each cut/color combination (counts only)
diamonds %>% count(cut, color)
## # A tibble: 35 x 3
##    cut   color     n
##    <ord> <ord> <int>
##  1 Fair  D       163
##  2 Fair  E       224
##  3 Fair  F       312
##  4 Fair  G       314
##  5 Fair  H       303
##  6 Fair  I       175
##  7 Fair  J       119
##  8 Good  D       662
##  9 Good  E       933
## 10 Good  F       909
## # ... with 25 more rows
# Group on a computed flag (price above 10,000) and summarize each group
diamonds %>%
  group_by(expensive = price > 10000) %>%
  summarize(
    avg_price = mean(price),
    sd_price = sd(price),
    count = n()
  )
## # A tibble: 2 x 4
##   expensive avg_price sd_price count
##   <lgl>         <dbl>    <dbl> <int>
## 1 FALSE         2892.    2400. 48718
## 2 TRUE         13640.    2518.  5222