7월 17일 실습

rm(list = ls())  # 1 - remove every object created so far in this session
library(dplyr) #2

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(psych)
library(ggplot2)

## 
## 다음의 패키지를 부착합니다: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(hflights)
library(plyr)

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## 다음의 패키지를 부착합니다: 'plyr'
## 
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

# Make C:/data the working directory, then print it to confirm the change.
# NOTE(review): hard-coded absolute Windows path; no file reads or writes are
# visible in this script, so confirm whether the directory change is needed.
setwd("C:/data")
getwd()

## [1] "C:/data"

library(caret)

## 필요한 패키지를 로딩중입니다: lattice
## 
## 다음의 패키지를 부착합니다: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

# Two small demo tables sharing the key column `id`; only id 1 occurs in both.
ott1 <- data.frame(
  id = c(1, 2, 3),
  car = rep("bmw", 3),
  fe = c(20, 22, 24)
)
ott2 <- data.frame(
  id = c(1, 4, 5),
  fe1 = c(30, 34, 35)
)
print(ott1)

##   id car fe
## 1  1 bmw 20
## 2  2 bmw 22
## 3  3 bmw 24

# Show the right-hand table used in the join examples.
print(ott2)

##   id fe1
## 1  1  30
## 2  4  34
## 3  5  35

# Left join: keep every row of ott1; ids without a match in ott2 get NA in fe1.
ott1 %>% left_join(ott2, by = "id")

##   id car fe fe1
## 1  1 bmw 20  30
## 2  2 bmw 22  NA
## 3  3 bmw 24  NA

# Inner join: keep only the ids present in both tables.
ott1 %>% inner_join(ott2, by = "id")

##   id car fe fe1
## 1  1 bmw 20  30

# Full join: keep the ids of both tables, filling the missing side with NA.
ott1 %>% full_join(ott2, by = "id")

##   id  car fe fe1
## 1  1  bmw 20  30
## 2  2  bmw 22  NA
## 3  3  bmw 24  NA
## 4  4 <NA> NA  34
## 5  5 <NA> NA  35

# Lookup table (nation_code -> nation) and a car table that refers to it.
ott3 <- data.frame(
  nation_code = c(1, 2, 3, 4),
  nation = c("korea", "japan", "china", "germany")
)
ott4 <- data.frame(
  car = c("bmw", "toyota", "kia"),
  nation_code = c(3, 3, 2)
)
print(ott3)

##   nation_code  nation
## 1           1   korea
## 2           2   japan
## 3           3   china
## 4           4 germany

# Show the car table before attaching nation names.
print(ott4)

##      car nation_code
## 1    bmw           3
## 2 toyota           3
## 3    kia           2

# Attach the nation name to each car by matching on nation_code.
ott4 %>% left_join(ott3, by = "nation_code")

##      car nation_code nation
## 1    bmw           3  china
## 2 toyota           3  china
## 3    kia           2  japan

# Stack two tables row-wise. The value columns are named differently
# (`fec` vs `fel`), so bind_rows() keeps both and pads the other table's
# rows with NA.
ott5 <- data.frame(
  car = rep("bmw", 3),
  fec = c(20, 22, 24)
)
ott6 <- data.frame(
  car = rep("audi", 3),
  fel = c(20, 22, 24)
)
ott5 %>% bind_rows(ott6)

##    car fec fel
## 1  bmw  20  NA
## 2  bmw  22  NA
## 3  bmw  24  NA
## 4 audi  NA  20
## 5 audi  NA  22
## 6 audi  NA  24

# Load the `economics` data set and inspect its columns and types.
data("economics")
economics %>% glimpse()

## Rows: 574
## Columns: 6
## $ date     <date> 1967-07-01, 1967-08-01, 1967-09-01, 1967-10-01, 1967-11-01, …
## $ pce      <dbl> 506.7, 509.8, 515.6, 512.2, 517.4, 525.1, 530.9, 533.6, 544.3…
## $ pop      <dbl> 198712, 198911, 199113, 199311, 199498, 199657, 199808, 19992…
## $ psavert  <dbl> 12.6, 12.6, 11.9, 12.9, 12.8, 11.8, 11.7, 12.3, 11.7, 12.3, 1…
## $ uempmed  <dbl> 4.5, 4.7, 4.6, 4.9, 4.7, 4.8, 5.1, 4.5, 4.1, 4.6, 4.4, 4.4, 4…
## $ unemploy <dbl> 2944, 2945, 2958, 3143, 3066, 3018, 2878, 3001, 2877, 2709, 2…

# Mean personal savings rate (psavert) per calendar year, five highest years.
# plyr is attached after dplyr in this script, so the bare names mutate(),
# summarise(), arrange() and desc() resolve to plyr's versions, and
# plyr::summarise() ignores group_by() and collapses the data to one row.
# The verbs are qualified with dplyr:: so the grouping is honoured.
economics <- economics %>%
  dplyr::mutate(year = substr(date, 1, 4))  # first four characters = year
economics %>%
  group_by(year) %>%
  dplyr::summarise(m = mean(psavert)) %>%
  dplyr::arrange(dplyr::desc(m)) %>%
  head(5)

##          m
## 1 8.567247

# Time series
library(caret)
# Parse an ISO-formatted string into a Date.
as.Date("2021-05-01", format = "%Y-%m-%d")

## [1] "2021-05-01"

library(dplyr)
library(lubridate)
# Load the `lakers` play-by-play data, convert it to a tibble and inspect it.
data(lakers)
lakers <- as_tibble(lakers)
lakers %>% glimpse()

## Rows: 34,624
## Columns: 13
## $ date      <int> 20081028, 20081028, 20081028, 20081028, 20081028, 20081028, …
## $ opponent  <chr> "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR", "POR…
## $ game_type <chr> "home", "home", "home", "home", "home", "home", "home", "hom…
## $ time      <chr> "12:00", "11:39", "11:37", "11:25", "11:23", "11:22", "11:22…
## $ period    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ etype     <chr> "jump ball", "shot", "rebound", "shot", "rebound", "shot", "…
## $ team      <chr> "OFF", "LAL", "LAL", "LAL", "LAL", "LAL", "POR", "LAL", "LAL…
## $ player    <chr> "", "Pau Gasol", "Vladimir Radmanovic", "Derek Fisher", "Pau…
## $ result    <chr> "", "missed", "", "missed", "", "made", "", "made", "", "mad…
## $ points    <int> 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, …
## $ type      <chr> "", "hook", "off", "layup", "off", "hook", "shooting", "", "…
## $ x         <int> NA, 23, NA, 25, NA, 25, NA, NA, NA, 36, 30, 34, NA, 15, 46, …
## $ y         <int> NA, 13, NA, 6, NA, 10, NA, NA, NA, 21, 21, 10, NA, 17, 9, 10…

# Look at the two raw columns that are later merged into one timestamp.
select(lakers, date, time)

## # A tibble: 34,624 × 2
##        date time 
##       <int> <chr>
##  1 20081028 12:00
##  2 20081028 11:39
##  3 20081028 11:37
##  4 20081028 11:25
##  5 20081028 11:23
##  6 20081028 11:22
##  7 20081028 11:22
##  8 20081028 11:22
##  9 20081028 11:00
## 10 20081028 10:53
## # ℹ 34,614 more rows

# Bind rename()/select() to dplyr's implementations so the pipeline below does
# not pick up a same-named function from another attached package.
rename <- dplyr::rename
select <- dplyr::select

# Paste the integer date and the "MM:SS"-style time string together, parse the
# result with ymd_hm(), store it as `time_index` and drop the `time` column.
lakers <- lakers %>%
  mutate(date = ymd_hm(paste(date, time))) %>%
  rename(time_index = date) %>%
  select(-time)
head(lakers)

## # A tibble: 6 × 12
##   time_index          opponent game_type period etype team  player result points
##   <dttm>              <chr>    <chr>      <int> <chr> <chr> <chr>  <chr>   <int>
## 1 2008-10-28 12:00:00 POR      home           1 jump… OFF   ""     ""          0
## 2 2008-10-28 11:39:00 POR      home           1 shot  LAL   "Pau … "miss…      0
## 3 2008-10-28 11:37:00 POR      home           1 rebo… LAL   "Vlad… ""          0
## 4 2008-10-28 11:25:00 POR      home           1 shot  LAL   "Dere… "miss…      0
## 5 2008-10-28 11:23:00 POR      home           1 rebo… LAL   "Pau … ""          0
## 6 2008-10-28 11:22:00 POR      home           1 shot  LAL   "Pau … "made"      2
## # ℹ 3 more variables: type <chr>, x <int>, y <int>

# Column-wise summary of the reshaped lakers data.
print(summary(lakers))

##    time_index                      opponent          game_type        
##  Min.   :2008-10-28 00:00:00.0   Length:34624       Length:34624      
##  1st Qu.:2008-12-10 00:19:30.0   Class :character   Class :character  
##  Median :2009-01-21 10:52:00.0   Mode  :character   Mode  :character  
##  Mean   :2009-01-22 20:08:18.4                                        
##  3rd Qu.:2009-03-09 00:33:00.0                                        
##  Max.   :2009-04-14 12:00:00.0                                        
##                                                                       
##      period         etype               team              player         
##  Min.   :1.000   Length:34624       Length:34624       Length:34624      
##  1st Qu.:2.000   Class :character   Class :character   Class :character  
##  Median :3.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2.536                                                           
##  3rd Qu.:4.000                                                           
##  Max.   :5.000                                                           
##                                                                          
##     result              points           type                 x        
##  Length:34624       Min.   :0.0000   Length:34624       Min.   : 0.00  
##  Class :character   1st Qu.:0.0000   Class :character   1st Qu.:20.00  
##  Mode  :character   Median :0.0000   Mode  :character   Median :25.00  
##                     Mean   :0.4627                      Mean   :25.32  
##                     3rd Qu.:1.0000                      3rd Qu.:31.00  
##                     Max.   :3.0000                      Max.   :51.00  
##                                                         NA's   :21557  
##        y        
##  Min.   : 3.00  
##  1st Qu.: 6.00  
##  Median :10.00  
##  Mean   :13.43  
##  3rd Qu.:20.00  
##  Max.   :90.00  
##  NA's   :21557

# Mean shot coordinates per calendar month.
# plyr is attached after dplyr in this script, so a bare summarize() resolves
# to plyr::summarize(), which ignores group_by() and returns one overall row.
# Calling dplyr::summarize() explicitly yields one row per month.
lakers %>%
  group_by(month(time_index)) %>%
  dplyr::summarize(
    mean_x = mean(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE)
  )

##     mean_x   mean_y
## 1 25.31767 13.43055

# Mean shot coordinates per calendar year.
# plyr is attached after dplyr in this script, so a bare summarize() resolves
# to plyr::summarize(), which ignores group_by() and returns one overall row.
# Calling dplyr::summarize() explicitly yields one row per year.
lakers %>%
  group_by(year(time_index)) %>%
  dplyr::summarize(
    mean_x = mean(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE)
  )

##     mean_x   mean_y
## 1 25.31767 13.43055

# First three events whose timestamp is at or before 2008-10-28 12:00:00.
head(
  filter(lakers, time_index <= ymd_hms("2008-10-28 12:00:00")),
  3
)

## # A tibble: 3 × 12
##   time_index          opponent game_type period etype team  player result points
##   <dttm>              <chr>    <chr>      <int> <chr> <chr> <chr>  <chr>   <int>
## 1 2008-10-28 12:00:00 POR      home           1 jump… OFF   ""     ""          0
## 2 2008-10-28 11:39:00 POR      home           1 shot  LAL   "Pau … "miss…      0
## 3 2008-10-28 11:37:00 POR      home           1 rebo… LAL   "Vlad… ""          0
## # ℹ 3 more variables: type <chr>, x <int>, y <int>

# Check for missing values and count them per column.
data("airquality")
vapply(airquality, function(column) sum(is.na(column)), numeric(1))

##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0

library(dplyr)
# Per-column summary; the NA's row shows which columns have missing values.
print(summary(airquality))

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

# Lower-case every column name, then flag the missing ozone readings.
# After the rename the column is `ozone`; `airquality$Ozone` no longer exists
# (it evaluates to NULL, and is.na(NULL) is logical(0)), so the lower-case
# name must be used here.
names(airquality) <- tolower(names(airquality))
is.na(airquality$ozone)

## logical(0)

# Count missing vs. non-missing cells over the whole data frame.
print(table(is.na(airquality)))

## 
## FALSE  TRUE 
##   874    44

# Count missing vs. non-missing values in the ozone column only.
table(is.na(airquality[["ozone"]]))

## 
## FALSE  TRUE 
##   116    37

# Per-column tally of TRUE/FALSE in the missing-value mask.
print(summary(is.na(airquality)))

##    ozone          solar.r           wind            temp        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:116       FALSE:146       FALSE:153       FALSE:153      
##  TRUE :37        TRUE :7                                        
##    month            day         
##  Mode :logical   Mode :logical  
##  FALSE:153       FALSE:153      
##

# Drop every row that contains at least one NA, then re-count the missing
# values per column to confirm none remain.
airquality <- na.omit(airquality)
vapply(airquality, function(column) sum(is.na(column)), numeric(1))

##   ozone solar.r    wind    temp   month     day 
##       0       0       0       0       0       0

library(dplyr)
# First three rows whose ozone reading is not missing.
head(filter(airquality, !is.na(ozone)), 3)

##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3

# Mean ozone reading, skipping any missing values.
mean(airquality[["ozone"]], na.rm = TRUE)

## [1] 42.0991

# Re-tabulate the ozone missing-value flags.
table(is.na(airquality[["ozone"]]))

## 
## FALSE 
##   111

7월 17일 실습

방지원

2023-07-17