rm(list=ls())
getwd()
## [1] "C:/Users/LEEDAEJOON/Documents"
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(readxl)
data("diamonds")
diamonds1<-diamonds %>% rename(c=clarity, p=price)
head(diamonds1, 3)
## # A tibble: 3 x 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31
count(diamonds, cut)
## # A tibble: 5 x 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
table(diamonds$cut)
## 
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
class(count(diamonds, cut))
## [1] "tbl_df"     "tbl"        "data.frame"
class(table(diamonds$cut))
## [1] "table"
df1<-diamonds %>% select(carat, price)
head(df1, 3)
## # A tibble: 3 x 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327
df2<-diamonds %>% 
  select(-carat, -price)
head(df2, 3)
## # A tibble: 3 x 8
##   cut     color clarity depth table     x     y     z
##   <ord>   <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ideal   E     SI2      61.5    55  3.95  3.98  2.43
## 2 Premium E     SI1      59.8    61  3.89  3.84  2.31
## 3 Good    E     VS1      56.9    65  4.05  4.07  2.31
head(diamonds)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
diamonds %>% 
  slice(1:5)
## # A tibble: 5 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good    J     SI2      63.3    58   335  4.34  4.35  2.75
diamonds %>% 
  slice(-1:-100)
## # A tibble: 53,840 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.75 Very Good D     SI1      63.2    56  2760  5.8   5.75  3.65
##  2  0.75 Premium   E     SI1      59.9    54  2760  6     5.96  3.58
##  3  0.74 Ideal     G     SI1      61.6    55  2760  5.8   5.85  3.59
##  4  0.75 Premium   G     VS2      61.7    58  2760  5.85  5.79  3.59
##  5  0.8  Ideal     I     VS1      62.9    56  2760  5.94  5.87  3.72
##  6  0.75 Ideal     G     SI1      62.2    55  2760  5.87  5.8   3.63
##  7  0.8  Premium   G     SI1      63      59  2760  5.9   5.81  3.69
##  8  0.74 Ideal     I     VVS2     62.3    55  2761  5.77  5.81  3.61
##  9  0.81 Ideal     F     SI2      58.8    57  2761  6.14  6.11  3.6 
## 10  0.59 Ideal     E     VVS2     62      55  2761  5.38  5.43  3.35
## # ... with 53,830 more rows
diamonds %>% 
  filter(cut=="Good") %>% head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
max(diamonds$price)
## [1] 18823
diamonds %>% 
  filter(price==max(price))
## # A tibble: 1 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16
diamonds %>% 
  filter(price==18823)
## # A tibble: 1 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16
diamonds %>% 
  filter(cut!="Preminum")%>% head(3)
## # A tibble: 3 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
diamonds %>% 
  filter(price!=1000) %>% head(3)
## # A tibble: 3 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
diamonds %>% 
  filter(price>=1000) %>% head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.7  Ideal E     SI1      62.5    57  2757  5.7   5.72  3.57
## 2  0.86 Fair  E     SI2      55.1    69  2757  6.45  6.33  3.52
## 3  0.7  Ideal G     VS2      61.6    56  2757  5.7   5.67  3.5
diamonds %>% filter(price!=1000 & cut=="Ideal") %>% head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46
## 3  0.31 Ideal J     SI2      62.2    54   344  4.35  4.37  2.71
diamonds %>% filter(price!=1000 & cut=="Ideal" & color=="E") %>% head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.26 Ideal E     VVS2     62.9    58   554  4.02  4.06  2.54
## 3  0.7  Ideal E     SI1      62.5    57  2757  5.7   5.72  3.57
diamonds %>% filter(carat < 1 | carat > 5) %>% head(3)
## # A tibble: 3 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
diamonds %>% filter(cut%in%c("Ideal", "Good")) %>% head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 3  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
diamonds %>% select(carat, depth, price) %>%
  filter(depth==max(depth)|price==min(price))
## # A tibble: 4 x 3
##   carat depth price
##   <dbl> <dbl> <int>
## 1  0.23  61.5   326
## 2  0.21  59.8   326
## 3  0.5   79    2579
## 4  0.5   79    2579
diamonds %>% mutate(Ratio=price/carat, Double=Ratio*2) %>% head(3)
## # A tibble: 3 x 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.
diamonds %>% summarize(mean(price))
## # A tibble: 1 x 1
##   `mean(price)`
##           <dbl>
## 1         3933.
diamonds %>% summarize(AvgPrice=mean(price),
                       MedianPrice=median(price),
                       AvgCarat=mean(carat))
## # A tibble: 1 x 3
##   AvgPrice MedianPrice AvgCarat
##      <dbl>       <dbl>    <dbl>
## 1    3933.        2401    0.798
diamonds %>% group_by(cut) %>% 
  summarize(AvgPrice=mean(price),
            SumCarat=sum(carat))
## # A tibble: 5 x 3
##   cut       AvgPrice SumCarat
##   <ord>        <dbl>    <dbl>
## 1 Fair         4359.    1684.
## 2 Good         3929.    4166.
## 3 Very Good    3982.    9743.
## 4 Premium      4584.   12301.
## 5 Ideal        3458.   15147.
diamonds %>% group_by(cut) %>% 
  summarize(n=n()) %>% 
  mutate(total=sum(n),
         pct=n/total*100)
## # A tibble: 5 x 4
##   cut           n total   pct
##   <ord>     <int> <int> <dbl>
## 1 Fair       1610 53940  2.98
## 2 Good       4906 53940  9.10
## 3 Very Good 12082 53940 22.4 
## 4 Premium   13791 53940 25.6 
## 5 Ideal     21551 53940 40.0
quantile(diamonds$price)
##       0%      25%      50%      75%     100% 
##   326.00   950.00  2401.00  5324.25 18823.00
diamonds1<-diamonds %>% 
  mutate(price_class=
           ifelse(price>=5324.25, "best",
                  ifelse(price>=2401, "good",
                         ifelse(price>=950, "normal", "bad"))))
table(diamonds1$price_class)
## 
##    bad   best   good normal 
##  13483  13485  13496  13476
diamonds %>% group_by(cut) %>% 
  summarize(AvgPrice=mean(price)) %>% 
  arrange(desc(AvgPrice))
## # A tibble: 5 x 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Premium      4584.
## 2 Fair         4359.
## 3 Very Good    3982.
## 4 Good         3929.
## 5 Ideal        3458.
ott1<-data.frame(id=c(1,2,3), 
                 car=c("bmw", "bmw", "bmw"),
                 fe=c(20, 22, 24))
ott1
##   id car fe
## 1  1 bmw 20
## 2  2 bmw 22
## 3  3 bmw 24
ott2<-data.frame(id=c(1,4,5), fe1=c(30,34,35))
ott2
##   id fe1
## 1  1  30
## 2  4  34
## 3  5  35
left_join(ott1,ott2,by="id")
##   id car fe fe1
## 1  1 bmw 20  30
## 2  2 bmw 22  NA
## 3  3 bmw 24  NA