rm(list=ls()) # 1 - clear every object stored in the workspace so far
library(dplyr) #2
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
##
## 다음의 패키지를 부착합니다: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(hflights)
# NOTE: plyr is attached after dplyr here. As the startup message below lists,
# plyr's arrange/count/desc/mutate/rename/summarise/summarize now mask the
# dplyr versions, so grouped pipelines must call dplyr::summarise() etc.
# explicitly (or plyr must be loaded before dplyr).
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## 다음의 패키지를 부착합니다: 'plyr'
##
## The following object is masked from 'package:purrr':
##
## compact
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Switch the session's working directory and print it to confirm the change.
# NOTE(review): hard-coded absolute Windows path; setwd() errors on any
# machine where C:/data does not exist.
setwd("C:/data")
getwd()
## [1] "C:/data"
library(caret)
## 필요한 패키지를 로딩중입니다: lattice
##
## 다음의 패키지를 부착합니다: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Two small tables that share the key column `id`; only id 1 appears in both.
ott1 <- data.frame(
  id = c(1, 2, 3),
  car = rep("bmw", 3),
  fe = c(20, 22, 24)
)
ott2 <- data.frame(
  id = c(1, 4, 5),
  fe1 = c(30, 34, 35)
)
ott1
## id car fe
## 1 1 bmw 20
## 2 2 bmw 22
## 3 3 bmw 24
ott2
## id fe1
## 1 1 30
## 2 4 34
## 3 5 35
# Left join: keep every row of ott1, fill fe1 with NA where ott2 has no match.
ott1 %>% left_join(ott2, by = "id")
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
# Inner join: keep only ids present in both tables.
ott1 %>% inner_join(ott2, by = "id")
## id car fe fe1
## 1 1 bmw 20 30
# Full join: keep ids from either table, NA-filling whichever side is missing.
ott1 %>% full_join(ott2, by = "id")
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
## 4 4 <NA> NA 34
## 5 5 <NA> NA 35
# Lookup table mapping a numeric nation code to a nation name.
ott3 <- data.frame(
  nation_code = c(1, 2, 3, 4),
  nation = c("korea", "japan", "china", "germany")
)
# Cars tagged with a nation code; codes may repeat.
ott4 <- data.frame(
  car = c("bmw", "toyota", "kia"),
  nation_code = c(3, 3, 2)
)
ott3
## nation_code nation
## 1 1 korea
## 2 2 japan
## 3 3 china
## 4 4 germany
ott4
## car nation_code
## 1 bmw 3
## 2 toyota 3
## 3 kia 2
# Attach the nation name to each car through the shared nation_code column.
ott4 %>% left_join(ott3, by = "nation_code")
## car nation_code nation
## 1 bmw 3 china
## 2 toyota 3 china
## 3 kia 2 japan
# Two tables whose second column has a different name (fec vs fel).
ott5 <- data.frame(
  car = rep("bmw", 3),
  fec = c(20, 22, 24)
)
ott6 <- data.frame(
  car = rep("audi", 3),
  fel = c(20, 22, 24)
)
# Stack the rows; columns missing from one input are filled with NA.
ott5 %>% bind_rows(ott6)
## car fec fel
## 1 bmw 20 NA
## 2 bmw 22 NA
## 3 bmw 24 NA
## 4 audi NA 20
## 5 audi NA 22
## 6 audi NA 24
# Load the economics data set and inspect its columns.
data("economics")
glimpse(economics)
## Rows: 574
## Columns: 6
## $ date <date> 1967-07-01, 1967-08-01, 1967-09-01, 1967-10-01, 1967-11-01, …
## $ pce <dbl> 506.7, 509.8, 515.6, 512.2, 517.4, 525.1, 530.9, 533.6, 544.3…
## $ pop <dbl> 198712, 198911, 199113, 199311, 199498, 199657, 199808, 19992…
## $ psavert <dbl> 12.6, 12.6, 11.9, 12.9, 12.8, 11.8, 11.7, 12.3, 11.7, 12.3, 1…
## $ uempmed <dbl> 4.5, 4.7, 4.6, 4.9, 4.7, 4.8, 5.1, 4.5, 4.1, 4.6, 4.4, 4.4, 4…
## $ unemploy <dbl> 2944, 2945, 2958, 3143, 3066, 3018, 2878, 3001, 2877, 2709, 2…
# Add the calendar year as a character column taken from the first four
# characters of the date. dplyr is called explicitly because plyr is attached
# after dplyr in this script and masks mutate/summarise/arrange/desc.
economics <- economics %>%
  dplyr::mutate(year = substr(date, 1, 4))
# Five years with the highest mean psavert. The unqualified summarise()
# resolved to plyr::summarise(), which ignores group_by() and collapsed the
# result to a single overall mean (8.567247); dplyr::summarise() returns one
# row per year.
economics %>%
  dplyr::group_by(year) %>%
  dplyr::summarise(m = mean(psavert)) %>%
  dplyr::arrange(dplyr::desc(m)) %>%
  head(5)
## (prints a 5-row tibble with columns year and m, highest mean first)
# Time series
library(caret)
# Convert a "yyyy-mm-dd" string to a Date value.
as.Date("2021-05-01")
## [1] "2021-05-01"
library(dplyr)
library(lubridate)
# Load the lakers play-by-play data and convert it to a tibble for printing.
data(lakers)
lakers <- as_tibble(lakers)
glimpse(lakers)
## Rows: 34,624
## Columns: 13
## $ date <int> 20081028, 20081028, 20081028, 20081028, 20081028, 20081028, …
## $ opponent <chr> "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR…
## $ game_type <chr> "home", "home", "home", "home", "home", "home", "home", "hom…
## $ time <chr> "12:00", "11:39", "11:37", "11:25", "11:23", "11:22", "11:22…
## $ period <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ etype <chr> "jump ball", "shot", "rebound", "shot", "rebound", "shot", "…
## $ team <chr> "OFF", "LAL", "LAL", "LAL", "LAL", "LAL", "POR", "LAL", "LAL…
## $ player <chr> "", "Pau Gasol", "Vladimir Radmanovic", "Derek Fisher", "Pau…
## $ result <chr> "", "missed", "", "missed", "", "made", "", "made", "", "mad…
## $ points <int> 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, …
## $ type <chr> "", "hook", "off", "layup", "off", "hook", "shooting", "", "…
## $ x <int> NA, 23, NA, 25, NA, 25, NA, NA, NA, 36, 30, 34, NA, 15, 46, …
## $ y <int> NA, 13, NA, 6, NA, 10, NA, NA, NA, 21, 21, 10, NA, 17, 9, 10…
# Show only the two columns that together describe when each event happened:
# an integer yyyymmdd date and an "mm:ss"-looking character time.
select(lakers, date, time)
## # A tibble: 34,624 × 2
## date time
## <int> <chr>
## 1 20081028 12:00
## 2 20081028 11:39
## 3 20081028 11:37
## 4 20081028 11:25
## 5 20081028 11:23
## 6 20081028 11:22
## 7 20081028 11:22
## 8 20081028 11:22
## 9 20081028 11:00
## 10 20081028 10:53
## # ℹ 34,614 more rows
# Keep rename/select bound to the dplyr versions for the rest of the script
# (plyr, attached after dplyr, masks rename).
rename <- dplyr::rename
select <- dplyr::select
# Combine the integer date and the character time into one date-time column
# named time_index, then drop the now-redundant time column. All three verbs
# are namespace-qualified: the unqualified mutate() silently resolved to
# plyr::mutate() because plyr masks it as well.
# NOTE(review): the time values count down within a period (12:00, 11:39, ...),
# so they look like a game clock rather than a time of day - confirm that
# parsing them as hour:minute is intended.
lakers <- lakers %>%
  dplyr::mutate(date = ymd_hm(paste(date, time))) %>%
  dplyr::rename(time_index = date) %>%
  dplyr::select(-time)
head(lakers)
## # A tibble: 6 × 12
## time_index opponent game_type period etype team player result points
## <dttm> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 2008-10-28 12:00:00 POR home 1 jump… OFF "" "" 0
## 2 2008-10-28 11:39:00 POR home 1 shot LAL "Pau … "miss… 0
## 3 2008-10-28 11:37:00 POR home 1 rebo… LAL "Vlad… "" 0
## 4 2008-10-28 11:25:00 POR home 1 shot LAL "Dere… "miss… 0
## 5 2008-10-28 11:23:00 POR home 1 rebo… LAL "Pau … "" 0
## 6 2008-10-28 11:22:00 POR home 1 shot LAL "Pau … "made" 2
## # ℹ 3 more variables: type <chr>, x <int>, y <int>
# Per-column summary of lakers: range and quartiles for time_index and the
# numeric columns, length/class for character columns, and NA counts for x, y.
summary(lakers)
## time_index opponent game_type
## Min. :2008-10-28 00:00:00.0 Length:34624 Length:34624
## 1st Qu.:2008-12-10 00:19:30.0 Class :character Class :character
## Median :2009-01-21 10:52:00.0 Mode :character Mode :character
## Mean :2009-01-22 20:08:18.4
## 3rd Qu.:2009-03-09 00:33:00.0
## Max. :2009-04-14 12:00:00.0
##
## period etype team player
## Min. :1.000 Length:34624 Length:34624 Length:34624
## 1st Qu.:2.000 Class :character Class :character Class :character
## Median :3.000 Mode :character Mode :character Mode :character
## Mean :2.536
## 3rd Qu.:4.000
## Max. :5.000
##
## result points type x
## Length:34624 Min. :0.0000 Length:34624 Min. : 0.00
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.:20.00
## Mode :character Median :0.0000 Mode :character Median :25.00
## Mean :0.4627 Mean :25.32
## 3rd Qu.:1.0000 3rd Qu.:31.00
## Max. :3.0000 Max. :51.00
## NA's :21557
## y
## Min. : 3.00
## 1st Qu.: 6.00
## Median :10.00
## Mean :13.43
## 3rd Qu.:20.00
## Max. :90.00
## NA's :21557
# Mean of x and y (NAs dropped) for each calendar month of time_index.
# dplyr::summarise() is called explicitly: plyr is attached after dplyr and
# masks summarize() with a version that ignores group_by(), which is why the
# month and year summaries both printed the same single overall row
# (25.31767, 13.43055).
lakers %>%
  dplyr::group_by(month = month(time_index)) %>%
  dplyr::summarise(
    mean_x = mean(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE)
  )
## (prints one row per month with columns month, mean_x, mean_y)
# Same summary for each calendar year of time_index.
lakers %>%
  dplyr::group_by(year = year(time_index)) %>%
  dplyr::summarise(
    mean_x = mean(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE)
  )
## (prints one row per year with columns year, mean_x, mean_y)
# First three events stamped at or before 2008-10-28 12:00:00.
head(
  filter(lakers, time_index <= ymd_hms("2008-10-28 12:00:00")),
  3
)
## # A tibble: 3 × 12
## time_index opponent game_type period etype team player result points
## <dttm> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 2008-10-28 12:00:00 POR home 1 jump… OFF "" "" 0
## 2 2008-10-28 11:39:00 POR home 1 shot LAL "Pau … "miss… 0
## 3 2008-10-28 11:37:00 POR home 1 rebo… LAL "Vlad… "" 0
## # ℹ 3 more variables: type <chr>, x <int>, y <int>
# Check for missing values and count how often they occur.
data("airquality")
# Number of NA cells in each column.
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
library(dplyr)
# Five-number summary per column; NA counts are reported for Ozone and Solar.R.
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Lower-case every column name (Ozone -> ozone, Solar.R -> solar.r, ...).
names(airquality) <- tolower(names(airquality))
# Per-row missing flag for ozone. The column has to be referenced by its new
# lower-case name: after the rename airquality$Ozone is NULL, so
# is.na(airquality$Ozone) returned logical(0) instead of the flags.
is.na(airquality$ozone)
## (prints 153 logical values, 37 of them TRUE)
# Missing vs. present cells over the whole table.
table(is.na(airquality))
##
## FALSE TRUE
## 874 44
# Missing vs. present values in the ozone column only.
table(is.na(airquality$ozone))
##
## FALSE TRUE
## 116 37
# Missing vs. present counts broken down by column.
summary(is.na(airquality))
## ozone solar.r wind temp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:116 FALSE:146 FALSE:153 FALSE:153
## TRUE :37 TRUE :7
## month day
## Mode :logical Mode :logical
## FALSE:153 FALSE:153
##
# Drop every row that contains an NA in any column, then confirm none remain.
airquality <- na.omit(airquality)
colSums(is.na(airquality))
## ozone solar.r wind temp month day
## 0 0 0 0 0 0
library(dplyr)
# First three rows whose ozone value is present.
head(filter(airquality, !is.na(ozone)), 3)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
# Mean ozone over the remaining rows.
mean(airquality$ozone, na.rm = TRUE)
## [1] 42.0991
# Only FALSE is left: no missing ozone values among the 111 remaining rows.
table(is.na(airquality$ozone))
##
## FALSE
## 111