This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. You can check out my Online Portfolio.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the rows whose cut is "Ideal".
diamonds %>%
  filter(cut == "Ideal")
## # A tibble: 21,551 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.9 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.3 Ideal I SI2 62 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.5 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.3 Ideal I SI2 61 59 405 4.3 4.33 2.63
## # ... with 21,541 more rows
# Keep only the rows priced above 10000.
diamonds %>%
  filter(price > 10000)
## # A tibble: 5,222 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1.7 Ideal J VS2 60.5 58 10002 7.73 7.74 4.68
## 2 1.03 Ideal E VVS2 60.6 59 10003 6.5 6.53 3.95
## 3 1.23 Very Good G VVS2 60.6 55 10004 6.93 7.02 4.23
## 4 1.25 Ideal F VS2 61.6 55 10006 6.93 6.96 4.28
## 5 2.01 Very Good I SI2 61.4 63 10009 8.19 7.96 4.96
## 6 1.21 Very Good F VS1 62.3 58 10009 6.76 6.85 4.24
## 7 1.51 Premium I VS2 59.9 60 10010 7.42 7.36 4.43
## 8 1.01 Fair D SI2 64.6 58 10011 6.25 6.2 4.02
## 9 1.05 Ideal F VVS2 60.5 55 10011 6.67 6.58 4.01
## 10 1.6 Ideal J VS1 62 53 10011 7.57 7.56 4.69
## # ... with 5,212 more rows
# Keep rows that satisfy BOTH conditions: comma-separated predicates in
# filter() are combined with a logical AND.
# (The stray trailing comma after the last predicate has been removed; it
# only worked because dplyr ignores an empty final argument.)
filter(
  diamonds,
  cut == "Ideal",
  price > 10000
)
## # A tibble: 1,770 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1.7 Ideal J VS2 60.5 58 10002 7.73 7.74 4.68
## 2 1.03 Ideal E VVS2 60.6 59 10003 6.5 6.53 3.95
## 3 1.25 Ideal F VS2 61.6 55 10006 6.93 6.96 4.28
## 4 1.05 Ideal F VVS2 60.5 55 10011 6.67 6.58 4.01
## 5 1.6 Ideal J VS1 62 53 10011 7.57 7.56 4.69
## 6 1.51 Ideal H SI1 61.3 56 10012 7.44 7.4 4.55
## 7 1.13 Ideal F VS1 60.9 57 10016 6.73 6.76 4.11
## 8 1.04 Ideal E VVS2 62.9 55 10019 6.47 6.51 4.08
## 9 1.22 Ideal G VVS2 62.3 56 10038 6.81 6.84 4.25
## 10 1.3 Ideal G VS1 62 55 10038 6.98 7.02 4.34
## # ... with 1,760 more rows
# Keep rows that satisfy EITHER condition (elementwise OR).
diamonds %>%
  filter(cut == "Ideal" | price > 10000)
## # A tibble: 25,003 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.9 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.3 Ideal I SI2 62 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.5 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.3 Ideal I SI2 61 59 405 4.3 4.33 2.63
## # ... with 24,993 more rows
# Pick columns by name: only cut and color are returned.
diamonds %>%
  select(cut, color)
## # A tibble: 53,940 x 2
## cut color
## <ord> <ord>
## 1 Ideal E
## 2 Premium E
## 3 Good E
## 4 Premium I
## 5 Good J
## 6 Very Good J
## 7 Very Good I
## 8 Very Good H
## 9 Fair E
## 10 Very Good H
## # ... with 53,930 more rows
# Pick columns by position: the first four columns.
diamonds %>%
  select(1:4)
## # A tibble: 53,940 x 4
## carat cut color clarity
## <dbl> <ord> <ord> <ord>
## 1 0.23 Ideal E SI2
## 2 0.21 Premium E SI1
## 3 0.23 Good E VS1
## 4 0.29 Premium I VS2
## 5 0.31 Good J SI2
## 6 0.24 Very Good J VVS2
## 7 0.24 Very Good I VVS1
## 8 0.26 Very Good H SI1
## 9 0.22 Fair E VS2
## 10 0.23 Very Good H VS1
## # ... with 53,930 more rows
# Pick every column whose name begins with "c".
diamonds %>%
  select(starts_with("c"))
## # A tibble: 53,940 x 4
## carat cut color clarity
## <dbl> <ord> <ord> <ord>
## 1 0.23 Ideal E SI2
## 2 0.21 Premium E SI1
## 3 0.23 Good E VS1
## 4 0.29 Premium I VS2
## 5 0.31 Good J SI2
## 6 0.24 Very Good J VVS2
## 7 0.24 Very Good I VVS1
## 8 0.26 Very Good H SI1
## 9 0.22 Fair E VS2
## 10 0.23 Very Good H VS1
## # ... with 53,930 more rows
# Pick every column whose name has a "c" anywhere in it.
diamonds %>%
  select(contains("c"))
## # A tibble: 53,940 x 5
## carat cut color clarity price
## <dbl> <ord> <ord> <ord> <int>
## 1 0.23 Ideal E SI2 326
## 2 0.21 Premium E SI1 326
## 3 0.23 Good E VS1 327
## 4 0.29 Premium I VS2 334
## 5 0.31 Good J SI2 335
## 6 0.24 Very Good J VVS2 336
## 7 0.24 Very Good I VVS1 336
## 8 0.26 Very Good H SI1 337
## 9 0.22 Fair E VS2 337
## 10 0.23 Very Good H VS1 338
## # ... with 53,930 more rows
# Move price to the front; everything() appends the remaining columns
# in their existing order.
diamonds %>%
  select(price, everything())
## # A tibble: 53,940 x 10
## price carat cut color clarity depth table x y z
## <int> <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 326 0.23 Ideal E SI2 61.5 55 3.95 3.98 2.43
## 2 326 0.21 Premium E SI1 59.8 61 3.89 3.84 2.31
## 3 327 0.23 Good E VS1 56.9 65 4.05 4.07 2.31
## 4 334 0.29 Premium I VS2 62.4 58 4.2 4.23 2.63
## 5 335 0.31 Good J SI2 63.3 58 4.34 4.35 2.75
## 6 336 0.24 Very Good J VVS2 62.8 57 3.94 3.96 2.48
## 7 336 0.24 Very Good I VVS1 62.3 57 3.95 3.98 2.47
## 8 337 0.26 Very Good H SI1 61.9 55 4.07 4.11 2.53
## 9 337 0.22 Fair E VS2 65.1 61 3.87 3.78 2.49
## 10 338 0.23 Very Good H VS1 59.4 61 4 4.05 2.39
## # ... with 53,930 more rows
# Drop a single column: a leading minus excludes price from the result.
select(
  diamonds,
  -price
)
## # A tibble: 53,940 x 9
## carat cut color clarity depth table x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 4 4.05 2.39
## # ... with 53,930 more rows
# Same column drop as a pipeline: diamonds is passed to select() as its
# first argument, and every column except price is kept.
diamonds %>% select(-price)
## # A tibble: 53,940 x 9
## carat cut color clarity depth table x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 4 4.05 2.39
## # ... with 53,930 more rows
# Sort the rows by color (ascending is arrange()'s default).
diamonds %>%
  arrange(color)
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Very Good D VS2 60.5 61 357 3.96 3.97 2.4
## 2 0.23 Very Good D VS1 61.9 58 402 3.92 3.96 2.44
## 3 0.26 Very Good D VS2 60.8 59 403 4.13 4.16 2.52
## 4 0.26 Good D VS2 65.2 56 403 3.99 4.02 2.61
## 5 0.26 Good D VS1 58.4 63 403 4.19 4.24 2.46
## 6 0.22 Premium D VS2 59.3 62 404 3.91 3.88 2.31
## 7 0.3 Premium D SI1 62.6 59 552 4.23 4.27 2.66
## 8 0.3 Ideal D SI1 62.5 57 552 4.29 4.32 2.69
## 9 0.3 Ideal D SI1 62.1 56 552 4.3 4.33 2.68
## 10 0.24 Very Good D VVS1 61.5 60 553 3.97 4 2.45
## # ... with 53,930 more rows
# Sort by carat first; color breaks ties between equal carat values.
diamonds %>%
  arrange(carat, color)
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.2 Ideal D VS2 61.5 57 367 3.81 3.77 2.33
## 2 0.2 Premium D VS2 62.3 60 367 3.73 3.68 2.31
## 3 0.2 Premium D VS2 61.7 60 367 3.77 3.72 2.31
## 4 0.2 Premium E SI2 60.2 62 345 3.79 3.75 2.27
## 5 0.2 Premium E VS2 59.8 62 367 3.79 3.77 2.26
## 6 0.2 Premium E VS2 59 60 367 3.81 3.78 2.24
## 7 0.2 Premium E VS2 61.1 59 367 3.81 3.78 2.32
## 8 0.2 Premium E VS2 59.7 62 367 3.84 3.8 2.28
## 9 0.2 Ideal E VS2 59.7 55 367 3.86 3.84 2.3
## 10 0.2 Very Good E VS2 63.4 59 367 3.74 3.71 2.36
## # ... with 53,930 more rows
# Sort from the largest carat to the smallest.
diamonds %>%
  arrange(desc(carat))
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 5.01 Fair J I1 65.5 59 18018 10.7 10.5 6.98
## 2 4.5 Fair J I1 65.8 58 18531 10.2 10.2 6.72
## 3 4.13 Fair H I1 64.8 61 17329 10 9.85 6.43
## 4 4.01 Premium I I1 61 61 15223 10.1 10.1 6.17
## 5 4.01 Premium J I1 62.5 62 15223 10.0 9.94 6.24
## 6 4 Very Good I I1 63.3 58 15984 10.0 9.94 6.31
## 7 3.67 Premium I I1 62.4 56 16193 9.86 9.81 6.13
## 8 3.65 Fair H I1 67.1 53 11668 9.53 9.48 6.38
## 9 3.51 Premium J VS2 62.5 59 18701 9.66 9.63 6.03
## 10 3.5 Ideal H I1 62.8 57 12587 9.65 9.59 6.03
## # ... with 53,930 more rows
# Compact overview: one line per column with its type and first values.
diamonds %>%
  glimpse()
## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.~
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver~
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,~
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, ~
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64~
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58~
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34~
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.~
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.~
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.~
# Add derived columns and overwrite one existing column:
#   mass_g          - carat scaled by 0.20 (name suggests grams)
#   price_per_carat - price divided by carat
#   cut             - replaced by its lower-case text, so it becomes character
#   expensive_TF    - TRUE where price exceeds 10000
diamonds %>%
  mutate(
    mass_g = .20 * carat,
    price_per_carat = price / carat,
    cut = tolower(cut),
    expensive_TF = price > 10000
  )
## # A tibble: 53,940 x 13
## carat cut color clarity depth table price x y z mass_g
## <dbl> <chr> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 ideal E SI2 61.5 55 326 3.95 3.98 2.43 0.046
## 2 0.21 premium E SI1 59.8 61 326 3.89 3.84 2.31 0.042
## 3 0.23 good E VS1 56.9 65 327 4.05 4.07 2.31 0.046
## 4 0.29 premium I VS2 62.4 58 334 4.2 4.23 2.63 0.058
## 5 0.31 good J SI2 63.3 58 335 4.34 4.35 2.75 0.062
## 6 0.24 very good J VVS2 62.8 57 336 3.94 3.96 2.48 0.048
## 7 0.24 very good I VVS1 62.3 57 336 3.95 3.98 2.47 0.048
## 8 0.26 very good H SI1 61.9 55 337 4.07 4.11 2.53 0.052
## 9 0.22 fair E VS2 65.1 61 337 3.87 3.78 2.49 0.044
## 10 0.23 very good H VS1 59.4 61 338 4 4.05 2.39 0.046
## # ... with 53,930 more rows, and 2 more variables: price_per_carat <dbl>,
## # expensive_TF <lgl>
# Open the help pages for further dplyr verbs worth exploring.
# Each lookup sits on its own line: chained on a single line, R parses the
# `?` operators as one nested expression instead of five separate lookups.
?slice_max
?bind_rows
?left_join
?rename
?case_when
# Per-cut summary: mean and standard deviation of price for each cut level.
diamonds %>%
  group_by(cut) %>%
  summarize(
    avg_price = mean(price),
    sd_price = sd(price)
  )
## # A tibble: 5 x 3
## cut avg_price sd_price
## <ord> <dbl> <dbl>
## 1 Fair 4359. 3560.
## 2 Good 3929. 3682.
## 3 Very Good 3982. 3936.
## 4 Premium 4584. 4349.
## 5 Ideal 3458. 3808.
# Mean price, price spread and row count for every cut/color combination.
# `.groups = "drop"` returns a plain, ungrouped tibble: without it the result
# stays grouped by `cut` (so later verbs would silently run per cut) and
# summarise() prints a message about the leftover grouping.
diamonds %>%
  group_by(cut, color) %>%
  summarize(
    avg_price = mean(price),
    sd_price = sd(price),
    count = n(),
    .groups = "drop"
  )
## # A tibble: 35 x 5
## cut color avg_price sd_price count
## <ord> <ord> <dbl> <dbl> <int>
## 1 Fair D 4291. 3286. 163
## 2 Fair E 3682. 2977. 224
## 3 Fair F 3827. 3223. 312
## 4 Fair G 4239. 3610. 314
## 5 Fair H 5136. 3886. 303
## 6 Fair I 4685. 3730. 175
## 7 Fair J 4976. 4050. 119
## 8 Good D 3405. 3175. 662
## 9 Good E 3424. 3331. 933
## 10 Good F 3496. 3202. 909
## # ... with 25 more rows
# Row count only for each cut/color combination.
count(diamonds, cut, color)
## # A tibble: 35 x 3
## cut color n
## <ord> <ord> <int>
## 1 Fair D 163
## 2 Fair E 224
## 3 Fair F 312
## 4 Fair G 314
## 5 Fair H 303
## 6 Fair I 175
## 7 Fair J 119
## 8 Good D 662
## 9 Good E 933
## 10 Good F 909
## # ... with 25 more rows
# Group on a computed flag (price above 10000), then summarise each side:
# mean price, standard deviation of price, and number of rows.
diamonds %>%
  group_by(expensive = price > 10000) %>%
  summarize(
    avg_price = mean(price),
    sd_price = sd(price),
    count = n()
  )
## # A tibble: 2 x 4
## expensive avg_price sd_price count
## <lgl> <dbl> <dbl> <int>
## 1 FALSE 2892. 2400. 48718
## 2 TRUE 13640. 2518. 5222