#09 mutate() + ifelse() : 연속형 데이터로 범주별 변수 만들기
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
quantile(diamonds$price)
##       0%      25%      50%      75%     100% 
##   326.00   950.00  2401.00  5324.25 18823.00
diamonds1<-diamonds%>%
  mutate(price_class=ifelse(price>=5324.25,"best",
                            ifelse(price>=2401,"good",
                                   ifelse(price>=950,"normal","bad"))))
table(diamonds1$price_class)
## 
##    bad   best   good normal 
##  13483  13485  13496  13476
glimpse(diamonds1)
## Rows: 53,940
## Columns: 11
## $ carat       <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23…
## $ cut         <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good,…
## $ color       <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J…
## $ clarity     <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, V…
## $ depth       <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4…
## $ table       <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62…
## $ price       <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340…
## $ x           <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00…
## $ y           <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05…
## $ z           <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39…
## $ price_class <chr> "bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad", "b…
#10 arrange() : 데이터 정렬하기
diamonds%>%group_by(cut)%>%summarize(AvgPrice=mean(price))%>%
  arrange(desc(AvgPrice))
## # A tibble: 5 × 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Premium      4584.
## 2 Fair         4359.
## 3 Very Good    3982.
## 4 Good         3929.
## 5 Ideal        3458.
diamonds%>%group_by(cut)%>%summarize(AvgPrice=mean(price))%>%
  arrange(AvgPrice)
## # A tibble: 5 × 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Ideal        3458.
## 2 Good         3929.
## 3 Very Good    3982.
## 4 Fair         4359.
## 5 Premium      4584.
#11 데이터 결합하기
#1) 열 결합
ott1<-data.frame(id=c(1,2,3),car=c("bmw","bmw","bmw"),fe=c(20,22,24))
ott2<-data.frame(id=c(1,4,5),fe1=c(30,34,35))
ott1
##   id car fe
## 1  1 bmw 20
## 2  2 bmw 22
## 3  3 bmw 24
ott2
##   id fe1
## 1  1  30
## 2  4  34
## 3  5  35
left_join(ott1,ott2,by="id")
##   id car fe fe1
## 1  1 bmw 20  30
## 2  2 bmw 22  NA
## 3  3 bmw 24  NA
inner_join(ott1,ott2,by="id")
##   id car fe fe1
## 1  1 bmw 20  30
full_join(ott1,ott2,by="id")
##   id  car fe fe1
## 1  1  bmw 20  30
## 2  2  bmw 22  NA
## 3  3  bmw 24  NA
## 4  4 <NA> NA  34
## 5  5 <NA> NA  35
#2) 키워드 형식으로 결합하기

#3) 행 결합
ott5<-data.frame(car=c("bmw","bmw","bmw"),fe=c(20,22,24))
ott6<-data.frame(car=c("audi","audi","audi"),fe1=c(20,22,24))
library(dplyr)
bind_rows(ott5,ott6)
##    car fe fe1
## 1  bmw 20  NA
## 2  bmw 22  NA
## 3  bmw 24  NA
## 4 audi NA  20
## 5 audi NA  22
## 6 audi NA  24
#12 실기 시험에 알아두면 유용한 함수
q1<-c("nike","polo","adidas","wilson","yonex")
class(q1)
## [1] "character"
q1_factor<-as.factor(q1)
class(q1_factor)
## [1] "factor"
q1_factor
## [1] nike   polo   adidas wilson yonex 
## Levels: adidas nike polo wilson yonex
as.numeric(q1_factor)
## [1] 2 3 1 4 5
levels(q1_factor)
## [1] "adidas" "nike"   "polo"   "wilson" "yonex"
levels(q1_factor)[1]
## [1] "adidas"
factor(x=c("high school","college","masters"),
       levels=c("high school","college","masters"),
       (ordered = TRUE))
## [1] TRUE1 TRUE2 TRUE3
## Levels: TRUE1 TRUE2 TRUE3
#2) 문자변수에서 일부를 추출
economics<-ggplot2::economics
library(ggplot2)
data("economics")
head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
glimpse(economics)
## Rows: 574
## Columns: 6
## $ date     <date> 1967-07-01, 1967-08-01, 1967-09-01, 1967-10-01, 1967-11-01, …
## $ pce      <dbl> 506.7, 509.8, 515.6, 512.2, 517.4, 525.1, 530.9, 533.6, 544.3…
## $ pop      <dbl> 198712, 198911, 199113, 199311, 199498, 199657, 199808, 19992…
## $ psavert  <dbl> 12.6, 12.6, 11.9, 12.9, 12.8, 11.8, 11.7, 12.3, 11.7, 12.3, 1…
## $ uempmed  <dbl> 4.5, 4.7, 4.6, 4.9, 4.7, 4.8, 5.1, 4.5, 4.1, 4.6, 4.4, 4.4, 4…
## $ unemploy <dbl> 2944, 2945, 2958, 3143, 3066, 3018, 2878, 3001, 2877, 2709, 2…
economics<-economics%>%mutate(year=substr(economics$date,1,4))
economics%>%group_by(year)%>%summarize(m=mean(psavert))%>%
  arrange(desc(m))%>%head(5)
## # A tibble: 5 × 2
##   year      m
##   <chr> <dbl>
## 1 1971   13.5
## 2 1973   13.4
## 3 1975   13.4
## 4 1974   13.3
## 5 1970   12.8
#3) 시계열 데이터 가공
as.Date("2021-05-01")
## [1] "2021-05-01"
#lubridate 패키지 + dplyr 패키지 활용

library(lubridate)
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
data(lakers)
glimpse(lakers)
## Rows: 34,624
## Columns: 13
## $ date      <int> 20081028, 20081028, 20081028, 20081028, 20081028, 20081028, …
## $ opponent  <chr> "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR…
## $ game_type <chr> "home", "home", "home", "home", "home", "home", "home", "hom…
## $ time      <chr> "12:00", "11:39", "11:37", "11:25", "11:23", "11:22", "11:22…
## $ period    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ etype     <chr> "jump ball", "shot", "rebound", "shot", "rebound", "shot", "…
## $ team      <chr> "OFF", "LAL", "LAL", "LAL", "LAL", "LAL", "POR", "LAL", "LAL…
## $ player    <chr> "", "Pau Gasol", "Vladimir Radmanovic", "Derek Fisher", "Pau…
## $ result    <chr> "", "missed", "", "missed", "", "made", "", "made", "", "mad…
## $ points    <int> 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, …
## $ type      <chr> "", "hook", "off", "layup", "off", "hook", "shooting", "", "…
## $ x         <int> NA, 23, NA, 25, NA, 25, NA, NA, NA, 36, 30, 34, NA, 15, 46, …
## $ y         <int> NA, 13, NA, 6, NA, 10, NA, NA, NA, 21, 21, 10, NA, 17, 9, 10…
lakers<-lakers%>%as_tibble
lakers%>%select(date,time)
## # A tibble: 34,624 × 2
##        date time 
##       <int> <chr>
##  1 20081028 12:00
##  2 20081028 11:39
##  3 20081028 11:37
##  4 20081028 11:25
##  5 20081028 11:23
##  6 20081028 11:22
##  7 20081028 11:22
##  8 20081028 11:22
##  9 20081028 11:00
## 10 20081028 10:53
## # ℹ 34,614 more rows
lakers<-lakers%>%mutate(date=paste(date,time)%>%ymd_hm) %>% 
  rename(time_index=date)
head(lakers)
## # A tibble: 6 × 13
##   time_index          opponent game_type time  period etype  team  player result
##   <dttm>              <chr>    <chr>     <chr>  <int> <chr>  <chr> <chr>  <chr> 
## 1 2008-10-28 12:00:00 POR      home      12:00      1 jump … OFF   ""     ""    
## 2 2008-10-28 11:39:00 POR      home      11:39      1 shot   LAL   "Pau … "miss…
## 3 2008-10-28 11:37:00 POR      home      11:37      1 rebou… LAL   "Vlad… ""    
## 4 2008-10-28 11:25:00 POR      home      11:25      1 shot   LAL   "Dere… "miss…
## 5 2008-10-28 11:23:00 POR      home      11:23      1 rebou… LAL   "Pau … ""    
## 6 2008-10-28 11:22:00 POR      home      11:22      1 shot   LAL   "Pau … "made"
## # ℹ 4 more variables: points <int>, type <chr>, x <int>, y <int>
glimpse(lakers)
## Rows: 34,624
## Columns: 13
## $ time_index <dttm> 2008-10-28 12:00:00, 2008-10-28 11:39:00, 2008-10-28 11:37…
## $ opponent   <chr> "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR", "PO…
## $ game_type  <chr> "home", "home", "home", "home", "home", "home", "home", "ho…
## $ time       <chr> "12:00", "11:39", "11:37", "11:25", "11:23", "11:22", "11:2…
## $ period     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ etype      <chr> "jump ball", "shot", "rebound", "shot", "rebound", "shot", …
## $ team       <chr> "OFF", "LAL", "LAL", "LAL", "LAL", "LAL", "POR", "LAL", "LA…
## $ player     <chr> "", "Pau Gasol", "Vladimir Radmanovic", "Derek Fisher", "Pa…
## $ result     <chr> "", "missed", "", "missed", "", "made", "", "made", "", "ma…
## $ points     <int> 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0,…
## $ type       <chr> "", "hook", "off", "layup", "off", "hook", "shooting", "", …
## $ x          <int> NA, 23, NA, 25, NA, 25, NA, NA, NA, 36, 30, 34, NA, 15, 46,…
## $ y          <int> NA, 13, NA, 6, NA, 10, NA, NA, NA, 21, 21, 10, NA, 17, 9, 1…
lakers%>%group_by(month(time_index))%>%
  summarize(mean_x=mean(x,na.rm=TRUE),mean_y=mean(y,na.rm=TRUE))
## # A tibble: 7 × 3
##   `month(time_index)` mean_x mean_y
##                 <dbl>  <dbl>  <dbl>
## 1                   1   25.5   13.9
## 2                   2   25.0   13.2
## 3                   3   25.5   13.2
## 4                   4   25.4   13.5
## 5                  10   24.9   13.1
## 6                  11   25.5   13.4
## 7                  12   25.1   13.5
lakers%>%group_by(year(time_index))%>%
  summarize(mean_x=mean(x,na.rm=TRUE),mean_y=mean(y,na.rm=TRUE))
## # A tibble: 2 × 3
##   `year(time_index)` mean_x mean_y
##                <dbl>  <dbl>  <dbl>
## 1               2008   25.2   13.4
## 2               2009   25.4   13.4