#09 mutate() + ifelse() : 연속형 데이터로 범주별 변수 만들기
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
quantile(diamonds$price)
## 0% 25% 50% 75% 100%
## 326.00 950.00 2401.00 5324.25 18823.00
diamonds1<-diamonds%>%
mutate(price_class=ifelse(price>=5324.25,"best",
ifelse(price>=2401,"good",
ifelse(price>=950,"normal","bad"))))
table(diamonds1$price_class)
##
## bad best good normal
## 13483 13485 13496 13476
glimpse(diamonds1)
## Rows: 53,940
## Columns: 11
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good,…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, V…
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39…
## $ price_class <chr> "bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad", "b…
#10 arrange() : 데이터 정렬하기
diamonds%>%group_by(cut)%>%summarize(AvgPrice=mean(price))%>%
arrange(desc(AvgPrice))
## # A tibble: 5 × 2
## cut AvgPrice
## <ord> <dbl>
## 1 Premium 4584.
## 2 Fair 4359.
## 3 Very Good 3982.
## 4 Good 3929.
## 5 Ideal 3458.
diamonds%>%group_by(cut)%>%summarize(AvgPrice=mean(price))%>%
arrange(AvgPrice)
## # A tibble: 5 × 2
## cut AvgPrice
## <ord> <dbl>
## 1 Ideal 3458.
## 2 Good 3929.
## 3 Very Good 3982.
## 4 Fair 4359.
## 5 Premium 4584.
#11 데이터 결합하기
#1) 열 결합
ott1<-data.frame(id=c(1,2,3),car=c("bmw","bmw","bmw"),fe=c(20,22,24))
ott2<-data.frame(id=c(1,4,5),fe1=c(30,34,35))
ott1
## id car fe
## 1 1 bmw 20
## 2 2 bmw 22
## 3 3 bmw 24
ott2
## id fe1
## 1 1 30
## 2 4 34
## 3 5 35
left_join(ott1,ott2,by="id")
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
inner_join(ott1,ott2,by="id")
## id car fe fe1
## 1 1 bmw 20 30
full_join(ott1,ott2,by="id")
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
## 4 4 <NA> NA 34
## 5 5 <NA> NA 35
#2) 키워드 형식으로 결합하기
#3) 행 결합
ott5<-data.frame(car=c("bmw","bmw","bmw"),fe=c(20,22,24))
ott6<-data.frame(car=c("audi","audi","audi"),fe1=c(20,22,24))
library(dplyr)
bind_rows(ott5,ott6)
## car fe fe1
## 1 bmw 20 NA
## 2 bmw 22 NA
## 3 bmw 24 NA
## 4 audi NA 20
## 5 audi NA 22
## 6 audi NA 24
#12 실기 시험에 알아두면 유용한 함수
q1<-c("nike","polo","adidas","wilson","yonex")
class(q1)
## [1] "character"
q1_factor<-as.factor(q1)
class(q1_factor)
## [1] "factor"
q1_factor
## [1] nike polo adidas wilson yonex
## Levels: adidas nike polo wilson yonex
as.numeric(q1_factor)
## [1] 2 3 1 4 5
levels(q1_factor)
## [1] "adidas" "nike" "polo" "wilson" "yonex"
levels(q1_factor)[1]
## [1] "adidas"
factor(x=c("high school","college","masters"),
levels=c("high school","college","masters"),
(ordered = TRUE))
## [1] TRUE1 TRUE2 TRUE3
## Levels: TRUE1 TRUE2 TRUE3
#2) 문자변수에서 일부를 추출
economics<-ggplot2::economics
library(ggplot2)
data("economics")
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
glimpse(economics)
## Rows: 574
## Columns: 6
## $ date <date> 1967-07-01, 1967-08-01, 1967-09-01, 1967-10-01, 1967-11-01, …
## $ pce <dbl> 506.7, 509.8, 515.6, 512.2, 517.4, 525.1, 530.9, 533.6, 544.3…
## $ pop <dbl> 198712, 198911, 199113, 199311, 199498, 199657, 199808, 19992…
## $ psavert <dbl> 12.6, 12.6, 11.9, 12.9, 12.8, 11.8, 11.7, 12.3, 11.7, 12.3, 1…
## $ uempmed <dbl> 4.5, 4.7, 4.6, 4.9, 4.7, 4.8, 5.1, 4.5, 4.1, 4.6, 4.4, 4.4, 4…
## $ unemploy <dbl> 2944, 2945, 2958, 3143, 3066, 3018, 2878, 3001, 2877, 2709, 2…
economics<-economics%>%mutate(year=substr(economics$date,1,4))
economics%>%group_by(year)%>%summarize(m=mean(psavert))%>%
arrange(desc(m))%>%head(5)
## # A tibble: 5 × 2
## year m
## <chr> <dbl>
## 1 1971 13.5
## 2 1973 13.4
## 3 1975 13.4
## 4 1974 13.3
## 5 1970 12.8
#3) 시계열 데이터 가공
as.Date("2021-05-01")
## [1] "2021-05-01"
#lubridate 패키지 + dplyr 패키지 활용
library(lubridate)
##
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
data(lakers)
glimpse(lakers)
## Rows: 34,624
## Columns: 13
## $ date <int> 20081028, 20081028, 20081028, 20081028, 20081028, 20081028, …
## $ opponent <chr> "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR…
## $ game_type <chr> "home", "home", "home", "home", "home", "home", "home", "hom…
## $ time <chr> "12:00", "11:39", "11:37", "11:25", "11:23", "11:22", "11:22…
## $ period <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ etype <chr> "jump ball", "shot", "rebound", "shot", "rebound", "shot", "…
## $ team <chr> "OFF", "LAL", "LAL", "LAL", "LAL", "LAL", "POR", "LAL", "LAL…
## $ player <chr> "", "Pau Gasol", "Vladimir Radmanovic", "Derek Fisher", "Pau…
## $ result <chr> "", "missed", "", "missed", "", "made", "", "made", "", "mad…
## $ points <int> 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, …
## $ type <chr> "", "hook", "off", "layup", "off", "hook", "shooting", "", "…
## $ x <int> NA, 23, NA, 25, NA, 25, NA, NA, NA, 36, 30, 34, NA, 15, 46, …
## $ y <int> NA, 13, NA, 6, NA, 10, NA, NA, NA, 21, 21, 10, NA, 17, 9, 10…
lakers<-lakers%>%as_tibble
lakers%>%select(date,time)
## # A tibble: 34,624 × 2
## date time
## <int> <chr>
## 1 20081028 12:00
## 2 20081028 11:39
## 3 20081028 11:37
## 4 20081028 11:25
## 5 20081028 11:23
## 6 20081028 11:22
## 7 20081028 11:22
## 8 20081028 11:22
## 9 20081028 11:00
## 10 20081028 10:53
## # ℹ 34,614 more rows
lakers<-lakers%>%mutate(date=paste(date,time)%>%ymd_hm) %>%
rename(time_index=date)
head(lakers)
## # A tibble: 6 × 13
## time_index opponent game_type time period etype team player result
## <dttm> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
## 1 2008-10-28 12:00:00 POR home 12:00 1 jump … OFF "" ""
## 2 2008-10-28 11:39:00 POR home 11:39 1 shot LAL "Pau … "miss…
## 3 2008-10-28 11:37:00 POR home 11:37 1 rebou… LAL "Vlad… ""
## 4 2008-10-28 11:25:00 POR home 11:25 1 shot LAL "Dere… "miss…
## 5 2008-10-28 11:23:00 POR home 11:23 1 rebou… LAL "Pau … ""
## 6 2008-10-28 11:22:00 POR home 11:22 1 shot LAL "Pau … "made"
## # ℹ 4 more variables: points <int>, type <chr>, x <int>, y <int>
glimpse(lakers)
## Rows: 34,624
## Columns: 13
## $ time_index <dttm> 2008-10-28 12:00:00, 2008-10-28 11:39:00, 2008-10-28 11:37…
## $ opponent <chr> "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR", "PO…
## $ game_type <chr> "home", "home", "home", "home", "home", "home", "home", "ho…
## $ time <chr> "12:00", "11:39", "11:37", "11:25", "11:23", "11:22", "11:2…
## $ period <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ etype <chr> "jump ball", "shot", "rebound", "shot", "rebound", "shot", …
## $ team <chr> "OFF", "LAL", "LAL", "LAL", "LAL", "LAL", "POR", "LAL", "LA…
## $ player <chr> "", "Pau Gasol", "Vladimir Radmanovic", "Derek Fisher", "Pa…
## $ result <chr> "", "missed", "", "missed", "", "made", "", "made", "", "ma…
## $ points <int> 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0,…
## $ type <chr> "", "hook", "off", "layup", "off", "hook", "shooting", "", …
## $ x <int> NA, 23, NA, 25, NA, 25, NA, NA, NA, 36, 30, 34, NA, 15, 46,…
## $ y <int> NA, 13, NA, 6, NA, 10, NA, NA, NA, 21, 21, 10, NA, 17, 9, 1…
lakers%>%group_by(month(time_index))%>%
summarize(mean_x=mean(x,na.rm=TRUE),mean_y=mean(y,na.rm=TRUE))
## # A tibble: 7 × 3
## `month(time_index)` mean_x mean_y
## <dbl> <dbl> <dbl>
## 1 1 25.5 13.9
## 2 2 25.0 13.2
## 3 3 25.5 13.2
## 4 4 25.4 13.5
## 5 10 24.9 13.1
## 6 11 25.5 13.4
## 7 12 25.1 13.5
lakers%>%group_by(year(time_index))%>%
summarize(mean_x=mean(x,na.rm=TRUE),mean_y=mean(y,na.rm=TRUE))
## # A tibble: 2 × 3
## `year(time_index)` mean_x mean_y
## <dbl> <dbl> <dbl>
## 1 2008 25.2 13.4
## 2 2009 25.4 13.4