Data 전처리의 기본 내용은 다음과 같다.
dplyr를 쓰기 위해서는 dplyr/readxl 패키지를 설치하고, 일반적인 dataframe 파일의 경우에는 “as_tibble(df)”과 같이 tibble dataframe으로 변환한 후에 사용해야 한다. 그리고 비행관련데이터인 ’hflights’을 사용하려면 hflights 패키지를 설치해야 한다.
# install.packages(c('dplyr','hflights','readxl'))
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
library(readxl)
# hflights를 tibble dataframe인 hflights_df로 변환
hflights_df <- tibble::as_tibble(hflights)
# 1월 1일에 해당하는 자료만 선택 (filter): 552 cases
filter(hflights_df, Month == 1, DayofMonth == 1)
## # A tibble: 552 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## 7 2011 1 1 6 557 906 AA 1994
## 8 2011 1 1 6 1824 2106 AS 731
## 9 2011 1 1 6 654 1124 B6 620
## 10 2011 1 1 6 1639 2110 B6 622
## # ℹ 542 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## # ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# 1월과 2월달에 해당하는 자료만 선택 (filter): 30,638 cases
filter(hflights_df, Month == 1 | Month == 2)
## # A tibble: 36,038 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## # ℹ 36,028 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## # ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# ArrDelay, 월, 년 별로 정렬 (arrange)
arrange(hflights_df, ArrDelay, Month, Year)
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 7 3 7 1914 2039 XE 2804
## 2 2011 12 25 7 741 926 OO 4591
## 3 2011 8 21 7 935 1039 OO 2001
## 4 2011 8 31 3 934 1039 OO 2040
## 5 2011 8 26 5 2107 2205 OO 2003
## 6 2011 12 24 6 2129 2337 CO 1552
## 7 2011 8 28 7 2059 2206 OO 2003
## 8 2011 8 29 1 935 1041 OO 2040
## 9 2011 8 18 4 939 1043 OO 2001
## 10 2011 12 24 6 2117 2258 CO 1712
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## # ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# 월별로 내림차순으로(descending) 정렬 (arrange)
arrange(hflights_df, desc(Month))
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 12 15 4 2113 2217 AA 426
## 2 2011 12 16 5 2004 2128 AA 426
## 3 2011 12 18 7 2007 2113 AA 426
## 4 2011 12 19 1 2108 2223 AA 426
## 5 2011 12 20 2 2008 2107 AA 426
## 6 2011 12 21 3 2025 2124 AA 426
## 7 2011 12 22 4 2021 2118 AA 426
## 8 2011 12 23 5 2015 2118 AA 426
## 9 2011 12 26 1 2013 2118 AA 426
## 10 2011 12 27 2 2007 2123 AA 426
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## # ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# gain과 gain_per_hour라는 변수를 생성(mutate)
mutate(hflights_df,gain=ArrDelay-DepDelay, gain_per_hour = gain/(AirTime/60))
## # A tibble: 227,496 × 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## # ℹ 227,486 more rows
## # ℹ 15 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## # ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, gain <int>, gain_per_hour <dbl>
# DepDelay의 평균을 구하는 것(summarise), 단 missing케이스는 계산에서 제외
summarise(hflights_df, delay = mean(DepDelay, na.rm = TRUE) )
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 9.44
# DepDelay의 평균을 구하는 것(summarise), colMeans를 써도 가능함
summarise(hflights_df, delay = colMeans(select(hflights_df,DepDelay),na.rm=TRUE))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 9.44
# 데이터를 TailNum변수별로 grouping을 하여 planes라는 파일에 저장
# Tailnum별로 사례수(count), 비행거리(dist)의 평균, 도착시연착(delay)평균을 계산하여 delay에 저장
planes <- group_by(hflights_df, TailNum)
delay <- summarise(planes, count = n(),
dist = mean(Distance, na.rm = TRUE),
delay = mean(ArrDelay, na.rm = TRUE))
# delay파일에서 count가 20이 넘고 비행거리가 2000이하인 것만 select (filter)한 후에 출력함.
delay <- filter(delay, count > 20, dist < 2000)
delay
## # A tibble: 1,526 × 4
## TailNum count dist delay
## <chr> <int> <dbl> <dbl>
## 1 "" 795 939. NaN
## 2 "N0EGMQ" 40 1095. 1.92
## 3 "N10156" 317 802. 8.20
## 4 "N10575" 94 632. 18.1
## 5 "N11106" 308 775. 10.1
## 6 "N11107" 345 768. 8.05
## 7 "N11109" 331 772. 10.3
## 8 "N11113" 282 773. 4.06
## 9 "N11119" 130 790. 7.40
## 10 "N11121" 333 775. 6.74
## # ℹ 1,516 more rows
# 또 다른 예를 들어본다.
a1 <- group_by(hflights_df, Year, Month, DayofMonth)
a2 <- select(a1, Year:DayofMonth, ArrDelay, DepDelay)
a3 <- summarise(a2,
arr = mean(ArrDelay, na.rm = TRUE),
dep = mean(DepDelay, na.rm = TRUE))
## `summarise()` has grouped output by 'Year', 'Month'. You can override using the
## `.groups` argument.
a4 <- filter(a3, arr > 30 | dep > 30)
a4
## # A tibble: 14 × 5
## # Groups: Year, Month [10]
## Year Month DayofMonth arr dep
## <int> <int> <int> <dbl> <dbl>
## 1 2011 2 4 44.1 47.2
## 2 2011 3 3 35.1 38.2
## 3 2011 3 14 46.6 36.1
## 4 2011 4 4 38.7 27.9
## 5 2011 4 25 37.8 22.3
## 6 2011 5 12 69.5 64.5
## 7 2011 5 20 37.0 26.6
## 8 2011 6 22 65.5 62.3
## 9 2011 7 29 29.6 31.9
## 10 2011 9 29 39.2 32.5
## 11 2011 10 9 61.9 59.5
## 12 2011 11 15 43.7 39.2
## 13 2011 12 29 26.3 30.8
## 14 2011 12 31 46.5 54.2
# 즉 위의 a1, a2, a3, a4의 네가지 다른 명령을 하나로 표현하면 다음과 같다.
hflights_df %>%
group_by(Year, Month, DayofMonth) %>%
summarise(arr = mean(ArrDelay, na.rm = TRUE),
dep = mean(DepDelay, na.rm = TRUE)) %>%
filter(arr > 30 | dep > 30)
## `summarise()` has grouped output by 'Year', 'Month'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 5
## # Groups: Year, Month [10]
## Year Month DayofMonth arr dep
## <int> <int> <int> <dbl> <dbl>
## 1 2011 2 4 44.1 47.2
## 2 2011 3 3 35.1 38.2
## 3 2011 3 14 46.6 36.1
## 4 2011 4 4 38.7 27.9
## 5 2011 4 25 37.8 22.3
## 6 2011 5 12 69.5 64.5
## 7 2011 5 20 37.0 26.6
## 8 2011 6 22 65.5 62.3
## 9 2011 7 29 29.6 31.9
## 10 2011 9 29 39.2 32.5
## 11 2011 10 9 61.9 59.5
## 12 2011 11 15 43.7 39.2
## 13 2011 12 29 26.3 30.8
## 14 2011 12 31 46.5 54.2
data.table을 쓰기 위해서는 data.table 패키지를 설치하고, df1과 같은 일반적인 dataframe 파일의 경우에는 “as.data.table(df1)”과 같이 data.table로 변환한 후에 사용해야 한다.
기본적인 data.table 명령어는 다음과 같다.
DT[(행), (열), (by)] [order]
즉, 첫번째 (행)은 행(case)을 선택하거나 조작하는 것이고,
두번째 (열)은 열(column)을 선택하거나 조작하는 것이고, 세번째 (by)은
grouping하는데 사용하는 것이고, 네번째 (order)는 결과를 정렬하는 데
사용하는 것이다.
library(data.table)
##
## 다음의 패키지를 부착합니다: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(openxlsx)
df1 = read.xlsx("anova.xlsx")
df1 = as.data.table(df1)
df2 = fread("tempbig.csv")
df3 = as.data.table(read.xlsx("twoway.xlsx"))
# df1: 근무만족도가 5~8점 사이의 case만 출력
df1[(근무만족도>=5 & 근무만족도 <= 8)]
## 근무만족도 상사의유형
## 1: 7 1
## 2: 7 1
## 3: 8 1
## 4: 8 1
## 5: 8 1
## 6: 7 2
## 7: 8 2
## 8: 8 2
## 9: 8 2
## 10: 8 3
## 11: 8 3
# df1: 근무만족도 변수만 출력
df1[,.(근무만족도)]
## 근무만족도
## 1: 7
## 2: 7
## 3: 8
## 4: 8
## 5: 8
## 6: 10
## 7: 7
## 8: 8
## 9: 8
## 10: 8
## 11: 9
## 12: 10
## 13: 8
## 14: 8
## 15: 9
## 16: 12
## 17: 15
## 18: 16
# df1: 상사의유형별로 평균, 표준편차, 사례수를 구하는 명령임.
# 결과를 상사의유형별로 정렬
stat = df1[, .(Mean=mean(근무만족도), SD=sd(근무만족도), N=sum(!is.na(근무만족도))),
by = .(상사의유형)][order(상사의유형)]
stat
## 상사의유형 Mean SD N
## 1: 1 8.000000 1.095445 6
## 2: 2 8.333333 1.032796 6
## 3: 3 11.333333 3.559026 6
# df2: x1과 x2별로 평균, 표준편차, 사례수를 구하는 명령임
# 결과를 x1, x2 별로 정렬
df2[, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by=.(x1,x2)][order(x1,x2)]
## x1 x2 Mean SD N
## 1: 1 1 60.22331 29.10055 1021910
## 2: 1 2 62.80916 27.61108 1395690
## 3: 1 3 60.82691 28.03215 1246580
## 4: 1 4 54.75922 26.60266 789890
## 5: 1 5 51.32877 26.42764 489860
## 6: 1 6 47.39352 25.75961 354670
## 7: 2 1 63.30948 26.89169 1843800
## 8: 2 2 66.91873 26.11230 1671400
## 9: 2 3 66.28037 25.14893 677430
## 10: 2 4 58.39985 26.92955 275400
## 11: 2 5 53.88967 28.43449 145660
## 12: 2 6 51.27192 24.85099 87710
# df2: x2가 4보다 같거나 작은 case만 선택하여, x1과 x2별로 평균, 표준편차, 사례수를 구하는 명령임
# 결과를 x1, x2 별로 정렬
df2[x2 <= 4, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by = .(x1,x2)][order(x1,x2)]
## x1 x2 Mean SD N
## 1: 1 1 60.22331 29.10055 1021910
## 2: 1 2 62.80916 27.61108 1395690
## 3: 1 3 60.82691 28.03215 1246580
## 4: 1 4 54.75922 26.60266 789890
## 5: 2 1 63.30948 26.89169 1843800
## 6: 2 2 66.91873 26.11230 1671400
## 7: 2 3 66.28037 25.14893 677430
## 8: 2 4 58.39985 26.92955 275400
# df2: x2가 4보다 같거나 작고, x1이 1인 case만 선택하여, x1과 x2별로 평균/표준편차/사례수를 구하는 명령임
# 결과를 x1, x2 별로 정렬
df2[x2 <= 4 & x1 == 1, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by = .(x1,x2)][order(x1,x2)]
## x1 x2 Mean SD N
## 1: 1 1 60.22331 29.10055 1021910
## 2: 1 2 62.80916 27.61108 1395690
## 3: 1 3 60.82691 28.03215 1246580
## 4: 1 4 54.75922 26.60266 789890
# df3: 상사의 유형과 보상방법별로 근무만족도의 평균/표준편차/사례수를 구하는 명령임.
# 결과를 상사의 유형, 보상방법별로 정렬
stat1 = df3[ , .(Mean=mean(근무만족도), SD=sd(근무만족도), N = sum(!is.na(근무만족도))),
by = .(상사의유형, 보상방법)][order(상사의유형, 보상방법)]
stat1
## 상사의유형 보상방법 Mean SD N
## 1: 민주형 사탕 8.333333 0.5773503 3
## 2: 민주형 칭찬 14.333333 2.0816660 3
## 3: 자유방임형 사탕 7.666667 0.5773503 3
## 4: 자유방임형 칭찬 9.000000 1.0000000 3
## 5: 전제형 사탕 7.333333 0.5773503 3
## 6: 전제형 칭찬 8.666667 1.1547005 3
# 국가간 A매치 축구시합결과 파일: worldcup.csv
library(data.table)
library(openxlsx)
# Rbase의 read.csv와 data.table의 fread의 속도비교
system.time(read.csv("tempbig.csv"))
## 사용자 시스템 elapsed
## 5.17 0.15 6.68
system.time(fread("tempbig.csv"))
## 사용자 시스템 elapsed
## 0.14 0.03 0.11
print("fread로 읽는 것이 60배 정도 빠름.")
## [1] "fread로 읽는 것이 60배 정도 빠름."
df = fread("worldcup.csv")
# 새로운 변수 (year, 결과) 생성
df = df[, year := year(date)]
df = df[, 결과 := ifelse(team_score>opponent_score, '승',
ifelse(team_score==opponent_score, '무', '패'))]
df
# 행(case) 선택 (team =="Korea Republic")
df[team=='Korea Republic']
## date team opponent team_score opponent_score
## 1: 1949-01-02 Korea Republic China 2 3
## 2: 1949-01-16 Korea Republic Vietnam 3 3
## 3: 1949-01-25 Korea Republic Macau 5 1
## 4: 1950-04-16 Korea Republic Taiwan 3 1
## 5: 1953-04-05 Korea Republic Hong Kong 0 4
## ---
## 833: 2018-06-07 Korea Republic Bolivia 0 0
## 834: 2018-06-11 Korea Republic Senegal 0 2
## 835: 2018-06-18 Korea Republic Sweden 0 1
## 836: 2018-06-23 Korea Republic Mexico 1 2
## 837: 2018-06-27 Korea Republic Germany 2 0
## tournament year 결과
## 1: Friendly 1949 패
## 2: Friendly 1949 무
## 3: Friendly 1949 승
## 4: Friendly 1950 승
## 5: Friendly 1953 패
## ---
## 833: Friendly 2018 무
## 834: Friendly 2018 패
## 835: FIFA World Cup 2018 패
## 836: FIFA World Cup 2018 패
## 837: FIFA World Cup 2018 승
# 행(case) 선택 (1980~2000년)
df[year >= 1980 & year <= 2000]
## date team opponent team_score opponent_score
## 1: 1980-01-06 Sierra Leone Ghana 2 4
## 2: 1980-01-06 Ghana Sierra Leone 4 2
## 3: 1980-01-16 Cyprus Greece 1 1
## 4: 1980-01-16 Greece Cyprus 1 1
## 5: 1980-01-19 Congo Ivory Coast 2 0
## ---
## 24680: 2000-12-16 Liberia South Africa 1 2
## 24681: 2000-12-20 Japan Korea Republic 1 1
## 24682: 2000-12-20 Mexico Argentina 0 2
## 24683: 2000-12-20 Korea Republic Japan 1 1
## 24684: 2000-12-20 Argentina Mexico 2 0
## tournament year 결과
## 1: Friendly 1980 패
## 2: Friendly 1980 승
## 3: Friendly 1980 무
## 4: Friendly 1980 무
## 5: Friendly 1980 승
## ---
## 24680: African Cup of Nations qualification 2000 패
## 24681: Friendly 2000 무
## 24682: Friendly 2000 패
## 24683: Friendly 2000 무
## 24684: Friendly 2000 승
# team별 "승리"한 횟수 계산
df[, .(승=sum(결과=='승')), by=.(team)]
## team 승
## 1: Scotland 361
## 2: England 557
## 3: Wales 197
## 4: Northern Ireland 158
## 5: USA 281
## ---
## 290: Kárpátalja 5
## 291: Barawa 2
## 292: Cascadia 3
## 293: Matabeleland 2
## 294: Kabylia 1
# team별 "승리"와 "무승부" 횟수 계산
df[, .(승=sum(결과=='승'),무=sum(결과=='무')), by=.(team)]
## team 승 무
## 1: Scotland 361 165
## 2: England 557 239
## 3: Wales 197 139
## 4: Northern Ireland 158 145
## 5: USA 281 141
## ---
## 290: Kárpátalja 5 4
## 291: Barawa 2 0
## 292: Cascadia 3 1
## 293: Matabeleland 2 1
## 294: Kabylia 1 2
# team별 "승리"/"무승부"/"패배' 횟수 계산
df[, .(승=sum(결과=='승'),무=sum(결과=='무'),패=sum(결과=='패')), by=.(team)]
## team 승 무 패
## 1: Scotland 361 165 235
## 2: England 557 239 186
## 3: Wales 197 139 301
## 4: Northern Ireland 158 145 323
## 5: USA 281 141 240
## ---
## 290: Kárpátalja 5 4 1
## 291: Barawa 2 0 4
## 292: Cascadia 3 1 2
## 293: Matabeleland 2 1 2
## 294: Kabylia 1 2 3
# 한국과 일본의 경기만 선택함
df[team=='Korea Republic' & opponent=='Japan',]
## date team opponent team_score opponent_score
## 1: 1954-03-07 Korea Republic Japan 5 1
## 2: 1954-03-14 Korea Republic Japan 2 2
## 3: 1959-09-05 Korea Republic Japan 0 0
## 4: 1959-09-06 Korea Republic Japan 3 1
## 5: 1960-11-06 Korea Republic Japan 2 1
## 6: 1961-06-11 Korea Republic Japan 2 0
## 7: 1963-08-13 Korea Republic Japan 1 1
## 8: 1967-08-01 Korea Republic Japan 1 2
## 9: 1969-10-12 Korea Republic Japan 2 2
## 10: 1969-10-18 Korea Republic Japan 2 0
## 11: 1970-08-02 Korea Republic Japan 1 1
## 12: 1972-07-26 Korea Republic Japan 3 0
## 13: 1972-09-14 Korea Republic Japan 2 2
## 14: 1973-06-23 Korea Republic Japan 2 0
## 15: 1974-09-28 Korea Republic Japan 1 4
## 16: 1975-08-09 Korea Republic Japan 3 1
## 17: 1975-09-08 Korea Republic Japan 3 0
## 18: 1976-08-18 Korea Republic Japan 0 0
## 19: 1976-12-04 Korea Republic Japan 2 1
## 20: 1977-03-26 Korea Republic Japan 0 0
## 21: 1977-04-03 Korea Republic Japan 1 0
## 22: 1977-06-15 Korea Republic Japan 2 1
## 23: 1978-07-19 Korea Republic Japan 4 0
## 24: 1979-03-04 Korea Republic Japan 1 2
## 25: 1979-06-16 Korea Republic Japan 4 1
## 26: 1981-03-08 Korea Republic Japan 1 0
## 27: 1981-06-21 Korea Republic Japan 2 0
## 28: 1982-03-21 Korea Republic Japan 3 0
## 29: 1983-03-06 Korea Republic Japan 1 1
## 30: 1984-09-30 Korea Republic Japan 1 2
## 31: 1985-10-26 Korea Republic Japan 2 1
## 32: 1985-11-03 Korea Republic Japan 1 0
## 33: 1988-10-26 Korea Republic Japan 1 0
## 34: 1988-12-06 Korea Republic Japan 2 0
## 35: 1989-05-05 Korea Republic Japan 1 0
## 36: 1990-07-27 Korea Republic Japan 2 0
## 37: 1991-07-27 Korea Republic Japan 1 0
## 38: 1992-08-22 Korea Republic Japan 0 0
## 39: 1992-08-29 Korea Republic Japan 2 2
## 40: 1993-10-25 Korea Republic Japan 0 1
## 41: 1995-02-21 Korea Republic Japan 1 1
## 42: 1995-02-26 Korea Republic Japan 2 2
## 43: 1997-05-21 Korea Republic Japan 1 1
## 44: 1997-09-28 Korea Republic Japan 2 1
## 45: 1997-11-01 Korea Republic Japan 0 2
## 46: 1998-03-01 Korea Republic Japan 1 2
## 47: 1998-04-01 Korea Republic Japan 2 1
## 48: 2000-04-26 Korea Republic Japan 1 0
## 49: 2000-12-20 Korea Republic Japan 1 1
## 50: 2003-04-16 Korea Republic Japan 0 1
## 51: 2003-05-31 Korea Republic Japan 1 0
## 52: 2003-12-10 Korea Republic Japan 0 0
## 53: 2005-08-07 Korea Republic Japan 0 1
## 54: 2007-07-28 Korea Republic Japan 0 0
## 55: 2008-02-23 Korea Republic Japan 1 1
## 56: 2010-02-14 Korea Republic Japan 3 1
## 57: 2010-05-24 Korea Republic Japan 2 0
## 58: 2010-10-12 Korea Republic Japan 0 0
## 59: 2011-01-25 Korea Republic Japan 2 2
## 60: 2011-08-10 Korea Republic Japan 0 3
## 61: 2013-07-28 Korea Republic Japan 1 2
## 62: 2015-08-05 Korea Republic Japan 1 1
## 63: 2017-12-16 Korea Republic Japan 4 1
## date team opponent team_score opponent_score
## tournament year 결과
## 1: FIFA World Cup qualification 1954 승
## 2: FIFA World Cup qualification 1954 무
## 3: Merdeka Tournament 1959 무
## 4: Merdeka Tournament 1959 승
## 5: FIFA World Cup qualification 1960 승
## 6: FIFA World Cup qualification 1961 승
## 7: Merdeka Tournament 1963 무
## 8: AFC Asian Cup qualification 1967 패
## 9: FIFA World Cup qualification 1969 무
## 10: FIFA World Cup qualification 1969 승
## 11: Merdeka Tournament 1970 무
## 12: Merdeka Tournament 1972 승
## 13: Friendly 1972 무
## 14: Friendly 1973 승
## 15: Friendly 1974 패
## 16: Merdeka Tournament 1975 승
## 17: Friendly 1975 승
## 18: Merdeka Tournament 1976 무
## 19: Friendly 1976 승
## 20: FIFA World Cup qualification 1977 무
## 21: FIFA World Cup qualification 1977 승
## 22: Friendly 1977 승
## 23: Merdeka Tournament 1978 승
## 24: Friendly 1979 패
## 25: Friendly 1979 승
## 26: Friendly 1981 승
## 27: Korea Cup 1981 승
## 28: Friendly 1982 승
## 29: Friendly 1983 무
## 30: Friendly 1984 패
## 31: FIFA World Cup qualification 1985 승
## 32: FIFA World Cup qualification 1985 승
## 33: Friendly 1988 승
## 34: AFC Asian Cup 1988 승
## 35: Friendly 1989 승
## 36: Dynasty Cup 1990 승
## 37: Friendly 1991 승
## 38: Dynasty Cup 1992 무
## 39: Dynasty Cup 1992 무
## 40: FIFA World Cup qualification 1993 패
## 41: Dynasty Cup 1995 무
## 42: Dynasty Cup 1995 무
## 43: Friendly 1997 무
## 44: FIFA World Cup qualification 1997 승
## 45: FIFA World Cup qualification 1997 패
## 46: Dynasty Cup 1998 패
## 47: Friendly 1998 승
## 48: Friendly 2000 승
## 49: Friendly 2000 무
## 50: Friendly 2003 패
## 51: Friendly 2003 승
## 52: EAFF Championship 2003 무
## 53: EAFF Championship 2005 패
## 54: AFC Asian Cup 2007 무
## 55: EAFF Championship 2008 무
## 56: EAFF Championship 2010 승
## 57: Friendly 2010 승
## 58: Friendly 2010 무
## 59: AFC Asian Cup 2011 무
## 60: Friendly 2011 패
## 61: EAFF Championship 2013 패
## 62: EAFF Championship 2015 무
## 63: EAFF Championship 2017 승
## tournament year 결과
# 한국과 일본의 경기중 '승리'한 횟수 계산
df[team=='Korea Republic' & opponent=='Japan', .(승=sum(결과=='승')), by=.(team)]
## team 승
## 1: Korea Republic 31
# 한국과 일본의 경기중 '승리'/'무승부'/'패배' 횟수 계산
df[team=='Korea Republic' & opponent=='Japan', .(승=sum(결과=='승'),무=sum(결과=='무'),패=sum(결과=='패')), by=.(team)]
## team 승 무 패
## 1: Korea Republic 31 21 11
# 한국팀의 평균 득점과 평균 실점을 계산하여 내림차순으로 정리
df[team=='Korea Republic', .(평균득점=mean(team_score), 평균실점=mean(opponent_score)), by=.(opponent)][order(평균득점, decreasing = TRUE)]
## opponent 평균득점 평균실점
## 1: Guam 9.0 0.0
## 2: Sudan 8.0 0.0
## 3: Yemen 6.0 0.0
## 4: Mongolia 6.0 0.0
## 5: Laos 5.6 0.0
## ---
## 113: Tunisia 0.0 0.5
## 114: Chile 0.0 1.0
## 115: Denmark 0.0 0.0
## 116: Belarus 0.0 1.0
## 117: Peru 0.0 0.0
# 한국팀의 평균 득점과 평균 실점을 계산하여 내림차순으로 정리 (order를 사용)
df[team=='Korea Republic', .(평균득점=mean(team_score), 평균실점=mean(opponent_score)), by=.(opponent)][order(-평균득점)]
## opponent 평균득점 평균실점
## 1: Guam 9.0 0.0
## 2: Sudan 8.0 0.0
## 3: Yemen 6.0 0.0
## 4: Mongolia 6.0 0.0
## 5: Laos 5.6 0.0
## ---
## 113: Tunisia 0.0 0.5
## 114: Chile 0.0 1.0
## 115: Denmark 0.0 0.0
## 116: Belarus 0.0 1.0
## 117: Peru 0.0 0.0
df4 = read.xlsx("ClassExample.xlsx")
# 막대그래프 barplot()
barplot(table(df4$sliptime))
barplot(table(df4$sliptime),
main = "Sleep Time 빈도",
xlab = "Sleep Time 정도",
ylab = "빈도",
col = 'red')
# 박스그래프 barplot()
df1 = read.xlsx("anova.xlsx")
boxplot(df1$근무만족도 ~ df1$상사의유형)
boxplot(df1$근무만족도 ~ df1$상사의유형,
main = "상사의 유형별 근무만족도",
xlab = "상사의 유형",
ylab = "근무만족도",
col = 'yellow')
# 산점도 plot() scatterplot
df3 = read.xlsx("regress.xlsx")
plot(df3$근무만족도,df3$근무평정,
main = "근무만족도-근무평정 산점도",
xlab = "근무만족도",
ylab = '근무평정',
col = 'red',
pch = 20)
pairs(df3,
main = "Multi Plots",
pch = 20,
col = 'red')
# 파이차트 pie()
pie(table(df4$sliptime))
df6 = table(df4$sliptime)
df6 = data.frame(freq = df6,
lab =c("1~3 hr", "3~5 hr", "6~7 hr",
"7~8 hr", "8~9 hr", "10~ hr"))
pie (df6[,2], labels=df6[,3])
# 선그래프 line chart
df7 = aggregate(df4$weight ~ df4$sliptime, data=df4, mean)
plot(df7[,1], df7[,2])
plot(df7[,1], df7[,2],
main = "Weight by Sleeptime",
xlab = "Sleep Time",
ylab = "Weight",
type = 'o')
ggplot의 기본구조
library(ggplot2)
ggplot(data=df, aes(x = x1, y = x2) +
geom_xx() +
geom_yy() +)
library(ggplot2)
# ggplot: 막대그래프 barplot()
month = c(1, 2, 3, 4, 5, 6)
rain = c(55, 50, 45, 50, 60, 70)
df = data.frame(month, rain)
ggplot(df, aes(x = month, y = rain)) +
geom_bar(stat = 'identity',
width = 0.7,
fill = 'steelblue')
# flipped bar
ggplot(df, aes(x = month, y = rain)) +
geom_bar(stat = 'identity',
width = 0.7,
fill = 'steelblue') +
ggtitle("월별강수량") +
theme(plot.title=element_text(size=25,
face="bold",color="steelblue")) +
labs(x = '월', y = '강수량') + coord_flip()
# 막대그래프 histogram()
library(ggplot2)
ggplot(iris, aes(x = Petal.Length)) +
geom_histogram(binwidth = 0.5, fill='red', color='black')
ggplot(iris, aes(x = Sepal.Width, fill = Species, color = Species)) +
geom_histogram(binwidth = 0.5, position="dodge")+theme(legend.position="top")
# 산점도 point() Scatterplot
ggplot(iris, aes(x=Petal.Length, y=Petal.Width)) +
geom_point()
# 산점도 with Colored Points
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=Species)) +
geom_point(size=3) +
ggtitle("꽃잎의 길이와 폭") +
theme(plot.title=element_text(size=25,face="bold",color="steelblue"))
# 박스그래프 boxplot()
ggplot(data=iris, aes(y=Petal.Length, fill=Species)) +
geom_boxplot()
# 선그래프 line()
year = 1937:1960
cnt = as.vector(airmiles)
df = data.frame(year,cnt)
ggplot(data=df, aes(x=year, y=cnt)) +
geom_line(col='red')
# scatter plot + smooth
ggplot() +
geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# scatter plot + multi smooth
ggplot() +
geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg) +
geom_point(mapping=aes(x=displ, y=cty), data=mpg, col='red', shape=1) +
geom_smooth(mapping=aes(x=displ,y=cty), data=mpg, linetype=2, col='red')
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# polor barplot
ggplot(mpg, aes(class, fill=drv)) +
geom_bar(position='fill') + coord_polar()