Statistics with R (제6강)

R강의: 제6강 데이터의 전처리

Data 전처리의 기본 내용은 다음과 같다.

새로운 변수(column)를 생성하고
새로운 케이스(row)를 삽입하고
변수를 선택하고
케이스를 선택하고
행을 정렬하고
변수이름을 바꾸고
데이터를 집단별로 분류하고
데이터를 요약하는 것이다.

6-1. dplyr

dplyr를 쓰기 위해서는 dplyr/readxl 패키지를 설치하고, 일반적인 dataframe 파일의 경우에는 “as_tibble(df)”과 같이 tibble dataframe으로 변환한 후에 사용해야 한다. 그리고 비행관련데이터인 ’hflights’을 사용하려면 hflights 패키지를 설치해야 한다.

# install.packages(c('dplyr','hflights','readxl'))

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(hflights)  
library(readxl)  

# hflights를 tibble dataframe인 hflights_df로 변환
hflights_df <- tibble::as_tibble(hflights)

# 1월 1일에 해당하는 자료만 선택 (filter): 552 cases
filter(hflights_df, Month == 1, DayofMonth == 1)

## # A tibble: 552 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     1          1         6    1400    1500 AA                  428
##  2  2011     1          1         6     728     840 AA                  460
##  3  2011     1          1         6    1631    1736 AA                 1121
##  4  2011     1          1         6    1756    2112 AA                 1294
##  5  2011     1          1         6    1012    1347 AA                 1700
##  6  2011     1          1         6    1211    1325 AA                 1820
##  7  2011     1          1         6     557     906 AA                 1994
##  8  2011     1          1         6    1824    2106 AS                  731
##  9  2011     1          1         6     654    1124 B6                  620
## 10  2011     1          1         6    1639    2110 B6                  622
## # ℹ 542 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

# 1월과 2월달에 해당하는 자료만 선택 (filter): 30,638 cases
filter(hflights_df, Month == 1 | Month == 2)

## # A tibble: 36,038 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     1          1         6    1400    1500 AA                  428
##  2  2011     1          2         7    1401    1501 AA                  428
##  3  2011     1          3         1    1352    1502 AA                  428
##  4  2011     1          4         2    1403    1513 AA                  428
##  5  2011     1          5         3    1405    1507 AA                  428
##  6  2011     1          6         4    1359    1503 AA                  428
##  7  2011     1          7         5    1359    1509 AA                  428
##  8  2011     1          8         6    1355    1454 AA                  428
##  9  2011     1          9         7    1443    1554 AA                  428
## 10  2011     1         10         1    1443    1553 AA                  428
## # ℹ 36,028 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

# ArrDelay, 월, 년 별로 정렬 (arrange)
arrange(hflights_df, ArrDelay, Month, Year)

## # A tibble: 227,496 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     7          3         7    1914    2039 XE                 2804
##  2  2011    12         25         7     741     926 OO                 4591
##  3  2011     8         21         7     935    1039 OO                 2001
##  4  2011     8         31         3     934    1039 OO                 2040
##  5  2011     8         26         5    2107    2205 OO                 2003
##  6  2011    12         24         6    2129    2337 CO                 1552
##  7  2011     8         28         7    2059    2206 OO                 2003
##  8  2011     8         29         1     935    1041 OO                 2040
##  9  2011     8         18         4     939    1043 OO                 2001
## 10  2011    12         24         6    2117    2258 CO                 1712
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

# 월별로 내림차순으로(descending) 정렬 (arrange) 
arrange(hflights_df, desc(Month))

## # A tibble: 227,496 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011    12         15         4    2113    2217 AA                  426
##  2  2011    12         16         5    2004    2128 AA                  426
##  3  2011    12         18         7    2007    2113 AA                  426
##  4  2011    12         19         1    2108    2223 AA                  426
##  5  2011    12         20         2    2008    2107 AA                  426
##  6  2011    12         21         3    2025    2124 AA                  426
##  7  2011    12         22         4    2021    2118 AA                  426
##  8  2011    12         23         5    2015    2118 AA                  426
##  9  2011    12         26         1    2013    2118 AA                  426
## 10  2011    12         27         2    2007    2123 AA                  426
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

# gain과 gain_per_hour라는 변수를 생성(mutate)
mutate(hflights_df,gain=ArrDelay-DepDelay, gain_per_hour = gain/(AirTime/60))

## # A tibble: 227,496 × 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     1          1         6    1400    1500 AA                  428
##  2  2011     1          2         7    1401    1501 AA                  428
##  3  2011     1          3         1    1352    1502 AA                  428
##  4  2011     1          4         2    1403    1513 AA                  428
##  5  2011     1          5         3    1405    1507 AA                  428
##  6  2011     1          6         4    1359    1503 AA                  428
##  7  2011     1          7         5    1359    1509 AA                  428
##  8  2011     1          8         6    1355    1454 AA                  428
##  9  2011     1          9         7    1443    1554 AA                  428
## 10  2011     1         10         1    1443    1553 AA                  428
## # ℹ 227,486 more rows
## # ℹ 15 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, gain <int>, gain_per_hour <dbl>

# DepDelay의 평균을 구하는 것(summarise), 단 missing케이스는 계산에서 제외
summarise(hflights_df, delay = mean(DepDelay, na.rm = TRUE) )

## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  9.44

# DepDelay의 평균을 구하는 것(summarise), colMeans를 써도 가능함
summarise(hflights_df, delay = colMeans(select(hflights_df,DepDelay),na.rm=TRUE))

## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  9.44

# 데이터를 TailNum변수별로 grouping을 하여 planes라는 파일에 저장
# Tailnum별로 사례수(count), 비행거리(dist)의 평균, 도착시연착(delay)평균을 계산하여 delay에 저장 

planes <- group_by(hflights_df, TailNum)

delay <- summarise(planes, count = n(), 
                    dist = mean(Distance, na.rm = TRUE), 
                    delay = mean(ArrDelay, na.rm = TRUE))

# delay파일에서 count가 20이 넘고 비행거리가 2000이하인 것만 select (filter)한 후에 출력함.
delay <- filter(delay, count > 20, dist < 2000)
delay

## # A tibble: 1,526 × 4
##    TailNum  count  dist  delay
##    <chr>    <int> <dbl>  <dbl>
##  1 ""         795  939. NaN   
##  2 "N0EGMQ"    40 1095.   1.92
##  3 "N10156"   317  802.   8.20
##  4 "N10575"    94  632.  18.1 
##  5 "N11106"   308  775.  10.1 
##  6 "N11107"   345  768.   8.05
##  7 "N11109"   331  772.  10.3 
##  8 "N11113"   282  773.   4.06
##  9 "N11119"   130  790.   7.40
## 10 "N11121"   333  775.   6.74
## # ℹ 1,516 more rows

# 또 다른 예를 들어본다. 

a1 <- group_by(hflights_df, Year, Month, DayofMonth)
a2 <- select(a1, Year:DayofMonth, ArrDelay, DepDelay)
a3 <- summarise(a2, 
                arr = mean(ArrDelay, na.rm = TRUE), 
                dep = mean(DepDelay, na.rm = TRUE))

## `summarise()` has grouped output by 'Year', 'Month'. You can override using the
## `.groups` argument.

a4 <- filter(a3, arr > 30 | dep > 30)
a4

## # A tibble: 14 × 5
## # Groups:   Year, Month [10]
##     Year Month DayofMonth   arr   dep
##    <int> <int>      <int> <dbl> <dbl>
##  1  2011     2          4  44.1  47.2
##  2  2011     3          3  35.1  38.2
##  3  2011     3         14  46.6  36.1
##  4  2011     4          4  38.7  27.9
##  5  2011     4         25  37.8  22.3
##  6  2011     5         12  69.5  64.5
##  7  2011     5         20  37.0  26.6
##  8  2011     6         22  65.5  62.3
##  9  2011     7         29  29.6  31.9
## 10  2011     9         29  39.2  32.5
## 11  2011    10          9  61.9  59.5
## 12  2011    11         15  43.7  39.2
## 13  2011    12         29  26.3  30.8
## 14  2011    12         31  46.5  54.2

# 즉 위의 a1, a2, a3, a4의 네가지 다른 명령을 하나로 표현하면 다음과 같다. 

hflights_df %>% 
  group_by(Year, Month, DayofMonth) %>%    
  summarise(arr = mean(ArrDelay, na.rm = TRUE), 
            dep = mean(DepDelay, na.rm = TRUE)) %>% 
  filter(arr > 30 |  dep > 30)

## `summarise()` has grouped output by 'Year', 'Month'. You can override using the
## `.groups` argument.

## # A tibble: 14 × 5
## # Groups:   Year, Month [10]
##     Year Month DayofMonth   arr   dep
##    <int> <int>      <int> <dbl> <dbl>
##  1  2011     2          4  44.1  47.2
##  2  2011     3          3  35.1  38.2
##  3  2011     3         14  46.6  36.1
##  4  2011     4          4  38.7  27.9
##  5  2011     4         25  37.8  22.3
##  6  2011     5         12  69.5  64.5
##  7  2011     5         20  37.0  26.6
##  8  2011     6         22  65.5  62.3
##  9  2011     7         29  29.6  31.9
## 10  2011     9         29  39.2  32.5
## 11  2011    10          9  61.9  59.5
## 12  2011    11         15  43.7  39.2
## 13  2011    12         29  26.3  30.8
## 14  2011    12         31  46.5  54.2

6-2. data.table

data.table을 쓰기 위해서는 data.table 패키지를 설치하고, df1과 같은 일반적인 dataframe 파일의 경우에는 “as.data.table(df1)”과 같이 data.table로 변환한 후에 사용해야 한다.

기본적인 data.table 명령어는 다음과 같다.

 DT[(행), (열), (by)] [order]

즉, 첫번째 (행)은 행(case)을 선택하거나 조작하는 것이고,
두번째 (열)은 열(column)을 선택하거나 조작하는 것이고, 세번째 (by)은 grouping하는데 사용하는 것이고, 네번째 (order)는 결과를 정렬하는 데 사용하는 것이다.

library(data.table)

## 
## 다음의 패키지를 부착합니다: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(openxlsx)

df1 = read.xlsx("anova.xlsx")
df1 = as.data.table(df1)
df2 = fread("tempbig.csv")
df3 = as.data.table(read.xlsx("twoway.xlsx"))

# df1: 근무만족도가 5~8점 사이의 case만 출력
df1[(근무만족도>=5 & 근무만족도 <= 8)]

##     근무만족도 상사의유형
##  1:          7          1
##  2:          7          1
##  3:          8          1
##  4:          8          1
##  5:          8          1
##  6:          7          2
##  7:          8          2
##  8:          8          2
##  9:          8          2
## 10:          8          3
## 11:          8          3

# df1: 근무만족도 변수만 출력
df1[,.(근무만족도)]

##     근무만족도
##  1:          7
##  2:          7
##  3:          8
##  4:          8
##  5:          8
##  6:         10
##  7:          7
##  8:          8
##  9:          8
## 10:          8
## 11:          9
## 12:         10
## 13:          8
## 14:          8
## 15:          9
## 16:         12
## 17:         15
## 18:         16

# df1: 상사의유형별로 평균, 표준편차, 사례수를 구하는 명령임.
#      결과를 상사의유형별로 정렬
stat = df1[, .(Mean=mean(근무만족도), SD=sd(근무만족도), N=sum(!is.na(근무만족도))),        
          by = .(상사의유형)][order(상사의유형)]
stat

##    상사의유형      Mean       SD N
## 1:          1  8.000000 1.095445 6
## 2:          2  8.333333 1.032796 6
## 3:          3 11.333333 3.559026 6

# df2: x1과 x2별로 평균, 표준편차, 사례수를 구하는 명령임
#      결과를 x1, x2 별로 정렬
df2[, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by=.(x1,x2)][order(x1,x2)]

##     x1 x2     Mean       SD       N
##  1:  1  1 60.22331 29.10055 1021910
##  2:  1  2 62.80916 27.61108 1395690
##  3:  1  3 60.82691 28.03215 1246580
##  4:  1  4 54.75922 26.60266  789890
##  5:  1  5 51.32877 26.42764  489860
##  6:  1  6 47.39352 25.75961  354670
##  7:  2  1 63.30948 26.89169 1843800
##  8:  2  2 66.91873 26.11230 1671400
##  9:  2  3 66.28037 25.14893  677430
## 10:  2  4 58.39985 26.92955  275400
## 11:  2  5 53.88967 28.43449  145660
## 12:  2  6 51.27192 24.85099   87710

# df2: x2가 4보다 같거나 작은 case만 선택하여, x1과 x2별로 평균, 표준편차, 사례수를 구하는 명령임
#      결과를 x1, x2 별로 정렬

df2[x2 <= 4, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by = .(x1,x2)][order(x1,x2)]

##    x1 x2     Mean       SD       N
## 1:  1  1 60.22331 29.10055 1021910
## 2:  1  2 62.80916 27.61108 1395690
## 3:  1  3 60.82691 28.03215 1246580
## 4:  1  4 54.75922 26.60266  789890
## 5:  2  1 63.30948 26.89169 1843800
## 6:  2  2 66.91873 26.11230 1671400
## 7:  2  3 66.28037 25.14893  677430
## 8:  2  4 58.39985 26.92955  275400

# df2: x2가 4보다 같거나 작고, x1이 1인 case만 선택하여, x1과 x2별로 평균/표준편차/사례수를 구하는 명령임
#      결과를 x1, x2 별로 정렬
df2[x2 <= 4 & x1 == 1, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by = .(x1,x2)][order(x1,x2)]

##    x1 x2     Mean       SD       N
## 1:  1  1 60.22331 29.10055 1021910
## 2:  1  2 62.80916 27.61108 1395690
## 3:  1  3 60.82691 28.03215 1246580
## 4:  1  4 54.75922 26.60266  789890

# df3: 상사의 유형과 보상방법별로 근무만족도의 평균/표준편차/사례수를 구하는 명령임.
#      결과를 상사의 유형, 보상방법별로 정렬
stat1 = df3[ , .(Mean=mean(근무만족도), SD=sd(근무만족도), N = sum(!is.na(근무만족도))),             
            by = .(상사의유형, 보상방법)][order(상사의유형, 보상방법)]
stat1

##    상사의유형 보상방법      Mean        SD N
## 1:     민주형     사탕  8.333333 0.5773503 3
## 2:     민주형     칭찬 14.333333 2.0816660 3
## 3: 자유방임형     사탕  7.666667 0.5773503 3
## 4: 자유방임형     칭찬  9.000000 1.0000000 3
## 5:     전제형     사탕  7.333333 0.5773503 3
## 6:     전제형     칭찬  8.666667 1.1547005 3

# 국가간 A매치 축구시합결과 파일: worldcup.csv
library(data.table)
library(openxlsx)

# Rbase의 read.csv와 data.table의 fread의 속도비교 
system.time(read.csv("tempbig.csv"))

##  사용자  시스템 elapsed 
##    5.17    0.15    6.68

system.time(fread("tempbig.csv"))

##  사용자  시스템 elapsed 
##    0.14    0.03    0.11

print("fread로 읽는 것이 60배 정도 빠름.")

## [1] "fread로 읽는 것이 60배 정도 빠름."

df = fread("worldcup.csv")

# 새로운 변수 (year, 결과) 생성 
df = df[, year := year(date)]
df = df[, 결과 := ifelse(team_score>opponent_score, '승', 
                         ifelse(team_score==opponent_score, '무', '패'))]
df

# 행(case) 선택 (team =="Korea Republic")          
df[team=='Korea Republic']

##            date           team  opponent team_score opponent_score
##   1: 1949-01-02 Korea Republic     China          2              3
##   2: 1949-01-16 Korea Republic   Vietnam          3              3
##   3: 1949-01-25 Korea Republic     Macau          5              1
##   4: 1950-04-16 Korea Republic    Taiwan          3              1
##   5: 1953-04-05 Korea Republic Hong Kong          0              4
##  ---                                                              
## 833: 2018-06-07 Korea Republic   Bolivia          0              0
## 834: 2018-06-11 Korea Republic   Senegal          0              2
## 835: 2018-06-18 Korea Republic    Sweden          0              1
## 836: 2018-06-23 Korea Republic    Mexico          1              2
## 837: 2018-06-27 Korea Republic   Germany          2              0
##          tournament year 결과
##   1:       Friendly 1949   패
##   2:       Friendly 1949   무
##   3:       Friendly 1949   승
##   4:       Friendly 1950   승
##   5:       Friendly 1953   패
##  ---                         
## 833:       Friendly 2018   무
## 834:       Friendly 2018   패
## 835: FIFA World Cup 2018   패
## 836: FIFA World Cup 2018   패
## 837: FIFA World Cup 2018   승

# 행(case) 선택 (1980~2000년)  
df[year >= 1980 & year <= 2000]

##              date           team       opponent team_score opponent_score
##     1: 1980-01-06   Sierra Leone          Ghana          2              4
##     2: 1980-01-06          Ghana   Sierra Leone          4              2
##     3: 1980-01-16         Cyprus         Greece          1              1
##     4: 1980-01-16         Greece         Cyprus          1              1
##     5: 1980-01-19          Congo    Ivory Coast          2              0
##    ---                                                                   
## 24680: 2000-12-16        Liberia   South Africa          1              2
## 24681: 2000-12-20          Japan Korea Republic          1              1
## 24682: 2000-12-20         Mexico      Argentina          0              2
## 24683: 2000-12-20 Korea Republic          Japan          1              1
## 24684: 2000-12-20      Argentina         Mexico          2              0
##                                  tournament year 결과
##     1:                             Friendly 1980   패
##     2:                             Friendly 1980   승
##     3:                             Friendly 1980   무
##     4:                             Friendly 1980   무
##     5:                             Friendly 1980   승
##    ---                                               
## 24680: African Cup of Nations qualification 2000   패
## 24681:                             Friendly 2000   무
## 24682:                             Friendly 2000   패
## 24683:                             Friendly 2000   무
## 24684:                             Friendly 2000   승

# team별 "승리"한 횟수 계산
df[, .(승=sum(결과=='승')), by=.(team)]

##                  team  승
##   1:         Scotland 361
##   2:          England 557
##   3:            Wales 197
##   4: Northern Ireland 158
##   5:              USA 281
##  ---                     
## 290:       Kárpátalja   5
## 291:           Barawa   2
## 292:         Cascadia   3
## 293:     Matabeleland   2
## 294:          Kabylia   1

# team별 "승리"와 "무승부" 횟수 계산
df[, .(승=sum(결과=='승'),무=sum(결과=='무')), by=.(team)]

##                  team  승  무
##   1:         Scotland 361 165
##   2:          England 557 239
##   3:            Wales 197 139
##   4: Northern Ireland 158 145
##   5:              USA 281 141
##  ---                         
## 290:       Kárpátalja   5   4
## 291:           Barawa   2   0
## 292:         Cascadia   3   1
## 293:     Matabeleland   2   1
## 294:          Kabylia   1   2

# team별 "승리"/"무승부"/"패배' 횟수 계산
df[, .(승=sum(결과=='승'),무=sum(결과=='무'),패=sum(결과=='패')), by=.(team)]

##                  team  승  무  패
##   1:         Scotland 361 165 235
##   2:          England 557 239 186
##   3:            Wales 197 139 301
##   4: Northern Ireland 158 145 323
##   5:              USA 281 141 240
##  ---                             
## 290:       Kárpátalja   5   4   1
## 291:           Barawa   2   0   4
## 292:         Cascadia   3   1   2
## 293:     Matabeleland   2   1   2
## 294:          Kabylia   1   2   3

# 한국과 일본의 경기만 선택함
df[team=='Korea Republic' & opponent=='Japan',]

##           date           team opponent team_score opponent_score
##  1: 1954-03-07 Korea Republic    Japan          5              1
##  2: 1954-03-14 Korea Republic    Japan          2              2
##  3: 1959-09-05 Korea Republic    Japan          0              0
##  4: 1959-09-06 Korea Republic    Japan          3              1
##  5: 1960-11-06 Korea Republic    Japan          2              1
##  6: 1961-06-11 Korea Republic    Japan          2              0
##  7: 1963-08-13 Korea Republic    Japan          1              1
##  8: 1967-08-01 Korea Republic    Japan          1              2
##  9: 1969-10-12 Korea Republic    Japan          2              2
## 10: 1969-10-18 Korea Republic    Japan          2              0
## 11: 1970-08-02 Korea Republic    Japan          1              1
## 12: 1972-07-26 Korea Republic    Japan          3              0
## 13: 1972-09-14 Korea Republic    Japan          2              2
## 14: 1973-06-23 Korea Republic    Japan          2              0
## 15: 1974-09-28 Korea Republic    Japan          1              4
## 16: 1975-08-09 Korea Republic    Japan          3              1
## 17: 1975-09-08 Korea Republic    Japan          3              0
## 18: 1976-08-18 Korea Republic    Japan          0              0
## 19: 1976-12-04 Korea Republic    Japan          2              1
## 20: 1977-03-26 Korea Republic    Japan          0              0
## 21: 1977-04-03 Korea Republic    Japan          1              0
## 22: 1977-06-15 Korea Republic    Japan          2              1
## 23: 1978-07-19 Korea Republic    Japan          4              0
## 24: 1979-03-04 Korea Republic    Japan          1              2
## 25: 1979-06-16 Korea Republic    Japan          4              1
## 26: 1981-03-08 Korea Republic    Japan          1              0
## 27: 1981-06-21 Korea Republic    Japan          2              0
## 28: 1982-03-21 Korea Republic    Japan          3              0
## 29: 1983-03-06 Korea Republic    Japan          1              1
## 30: 1984-09-30 Korea Republic    Japan          1              2
## 31: 1985-10-26 Korea Republic    Japan          2              1
## 32: 1985-11-03 Korea Republic    Japan          1              0
## 33: 1988-10-26 Korea Republic    Japan          1              0
## 34: 1988-12-06 Korea Republic    Japan          2              0
## 35: 1989-05-05 Korea Republic    Japan          1              0
## 36: 1990-07-27 Korea Republic    Japan          2              0
## 37: 1991-07-27 Korea Republic    Japan          1              0
## 38: 1992-08-22 Korea Republic    Japan          0              0
## 39: 1992-08-29 Korea Republic    Japan          2              2
## 40: 1993-10-25 Korea Republic    Japan          0              1
## 41: 1995-02-21 Korea Republic    Japan          1              1
## 42: 1995-02-26 Korea Republic    Japan          2              2
## 43: 1997-05-21 Korea Republic    Japan          1              1
## 44: 1997-09-28 Korea Republic    Japan          2              1
## 45: 1997-11-01 Korea Republic    Japan          0              2
## 46: 1998-03-01 Korea Republic    Japan          1              2
## 47: 1998-04-01 Korea Republic    Japan          2              1
## 48: 2000-04-26 Korea Republic    Japan          1              0
## 49: 2000-12-20 Korea Republic    Japan          1              1
## 50: 2003-04-16 Korea Republic    Japan          0              1
## 51: 2003-05-31 Korea Republic    Japan          1              0
## 52: 2003-12-10 Korea Republic    Japan          0              0
## 53: 2005-08-07 Korea Republic    Japan          0              1
## 54: 2007-07-28 Korea Republic    Japan          0              0
## 55: 2008-02-23 Korea Republic    Japan          1              1
## 56: 2010-02-14 Korea Republic    Japan          3              1
## 57: 2010-05-24 Korea Republic    Japan          2              0
## 58: 2010-10-12 Korea Republic    Japan          0              0
## 59: 2011-01-25 Korea Republic    Japan          2              2
## 60: 2011-08-10 Korea Republic    Japan          0              3
## 61: 2013-07-28 Korea Republic    Japan          1              2
## 62: 2015-08-05 Korea Republic    Japan          1              1
## 63: 2017-12-16 Korea Republic    Japan          4              1
##           date           team opponent team_score opponent_score
##                       tournament year 결과
##  1: FIFA World Cup qualification 1954   승
##  2: FIFA World Cup qualification 1954   무
##  3:           Merdeka Tournament 1959   무
##  4:           Merdeka Tournament 1959   승
##  5: FIFA World Cup qualification 1960   승
##  6: FIFA World Cup qualification 1961   승
##  7:           Merdeka Tournament 1963   무
##  8:  AFC Asian Cup qualification 1967   패
##  9: FIFA World Cup qualification 1969   무
## 10: FIFA World Cup qualification 1969   승
## 11:           Merdeka Tournament 1970   무
## 12:           Merdeka Tournament 1972   승
## 13:                     Friendly 1972   무
## 14:                     Friendly 1973   승
## 15:                     Friendly 1974   패
## 16:           Merdeka Tournament 1975   승
## 17:                     Friendly 1975   승
## 18:           Merdeka Tournament 1976   무
## 19:                     Friendly 1976   승
## 20: FIFA World Cup qualification 1977   무
## 21: FIFA World Cup qualification 1977   승
## 22:                     Friendly 1977   승
## 23:           Merdeka Tournament 1978   승
## 24:                     Friendly 1979   패
## 25:                     Friendly 1979   승
## 26:                     Friendly 1981   승
## 27:                    Korea Cup 1981   승
## 28:                     Friendly 1982   승
## 29:                     Friendly 1983   무
## 30:                     Friendly 1984   패
## 31: FIFA World Cup qualification 1985   승
## 32: FIFA World Cup qualification 1985   승
## 33:                     Friendly 1988   승
## 34:                AFC Asian Cup 1988   승
## 35:                     Friendly 1989   승
## 36:                  Dynasty Cup 1990   승
## 37:                     Friendly 1991   승
## 38:                  Dynasty Cup 1992   무
## 39:                  Dynasty Cup 1992   무
## 40: FIFA World Cup qualification 1993   패
## 41:                  Dynasty Cup 1995   무
## 42:                  Dynasty Cup 1995   무
## 43:                     Friendly 1997   무
## 44: FIFA World Cup qualification 1997   승
## 45: FIFA World Cup qualification 1997   패
## 46:                  Dynasty Cup 1998   패
## 47:                     Friendly 1998   승
## 48:                     Friendly 2000   승
## 49:                     Friendly 2000   무
## 50:                     Friendly 2003   패
## 51:                     Friendly 2003   승
## 52:            EAFF Championship 2003   무
## 53:            EAFF Championship 2005   패
## 54:                AFC Asian Cup 2007   무
## 55:            EAFF Championship 2008   무
## 56:            EAFF Championship 2010   승
## 57:                     Friendly 2010   승
## 58:                     Friendly 2010   무
## 59:                AFC Asian Cup 2011   무
## 60:                     Friendly 2011   패
## 61:            EAFF Championship 2013   패
## 62:            EAFF Championship 2015   무
## 63:            EAFF Championship 2017   승
##                       tournament year 결과

# 한국과 일본의 경기중 '승리'한 횟수 계산
df[team=='Korea Republic' & opponent=='Japan', .(승=sum(결과=='승')), by=.(team)]

##              team 승
## 1: Korea Republic 31

# 한국과 일본의 경기중 '승리'/'무승부'/'패배' 횟수 계산 
df[team=='Korea Republic' & opponent=='Japan', .(승=sum(결과=='승'),무=sum(결과=='무'),패=sum(결과=='패')), by=.(team)]

##              team 승 무 패
## 1: Korea Republic 31 21 11

# 한국팀의 평균 득점과 평균 실점을 계산하여 내림차순으로 정리        
df[team=='Korea Republic', .(평균득점=mean(team_score), 평균실점=mean(opponent_score)), by=.(opponent)][order(평균득점, decreasing = TRUE)]

##      opponent 평균득점 평균실점
##   1:     Guam      9.0      0.0
##   2:    Sudan      8.0      0.0
##   3:    Yemen      6.0      0.0
##   4: Mongolia      6.0      0.0
##   5:     Laos      5.6      0.0
##  ---                           
## 113:  Tunisia      0.0      0.5
## 114:    Chile      0.0      1.0
## 115:  Denmark      0.0      0.0
## 116:  Belarus      0.0      1.0
## 117:     Peru      0.0      0.0

# 한국팀의 평균 득점과 평균 실점을 계산하여 내림차순으로 정리 (order를 사용)  
df[team=='Korea Republic', .(평균득점=mean(team_score), 평균실점=mean(opponent_score)), by=.(opponent)][order(-평균득점)]

##      opponent 평균득점 평균실점
##   1:     Guam      9.0      0.0
##   2:    Sudan      8.0      0.0
##   3:    Yemen      6.0      0.0
##   4: Mongolia      6.0      0.0
##   5:     Laos      5.6      0.0
##  ---                           
## 113:  Tunisia      0.0      0.5
## 114:    Chile      0.0      1.0
## 115:  Denmark      0.0      0.0
## 116:  Belarus      0.0      1.0
## 117:     Peru      0.0      0.0

6-3. Rbase 그래프

6-3-1. Rbase:막대그래프

df4 = read.xlsx("ClassExample.xlsx")

# 막대그래프 barplot()
barplot(table(df4$sliptime))

barplot(table(df4$sliptime),
        main = "Sleep Time 빈도",
        xlab = "Sleep Time 정도",
        ylab = "빈도",
        col = 'red')

6-3-2. Rbase:박스그래프

# 박스그래프 barplot()
df1 = read.xlsx("anova.xlsx")
boxplot(df1$근무만족도 ~ df1$상사의유형)

boxplot(df1$근무만족도 ~ df1$상사의유형,
        main = "상사의 유형별 근무만족도",
        xlab = "상사의 유형",
        ylab = "근무만족도",
        col = 'yellow')

6-3-3. Rbase:산점도

# 산점도 plot() scatterplot
df3 = read.xlsx("regress.xlsx")
plot(df3$근무만족도,df3$근무평정,
     main = "근무만족도-근무평정 산점도",
     xlab = "근무만족도",
     ylab = '근무평정',
     col  = 'red',
     pch  = 20)

pairs(df3,
      main = "Multi Plots",
      pch = 20,
      col = 'red')

6-3-4. Rbase:파이차트

# 파이차트 pie()
pie(table(df4$sliptime))

df6 = table(df4$sliptime)
df6 = data.frame(freq = df6,
                 lab =c("1~3 hr", "3~5 hr", "6~7 hr", 
                        "7~8 hr", "8~9 hr", "10~ hr"))
pie (df6[,2], labels=df6[,3])

6-3-5. Rbase: 선그래프

# 선그래프 line chart
df7 = aggregate(df4$weight ~ df4$sliptime, data=df4, mean)
plot(df7[,1], df7[,2])

plot(df7[,1], df7[,2],
     main = "Weight by Sleeptime",
     xlab = "Sleep Time",
     ylab = "Weight",
     type = 'o')

6.4. ggplot 그래프

ggplot의 기본구조

library(ggplot2)

ggplot(data=df, aes(x = x1, y = x2) +
geom_xx() +
geom_yy() +)

ggplot2 library를 불러온다.
첫 줄에 data, x축, y축에 해당하는 것을 지정
“+” 기호로 ggplot함수와 여러 개의 geom_xx()를 연결
각각의 geom 함수를 정의
예, geom_bar(.., .., …) +
geom_line(.., .., ..)

6-4-1. ggplot: 막대그래프

library(ggplot2)

# ggplot: 막대그래프 barplot()
month = c(1, 2, 3, 4, 5, 6)
rain  = c(55, 50, 45, 50, 60, 70)
df = data.frame(month, rain)

ggplot(df, aes(x = month, y = rain)) +
  geom_bar(stat = 'identity',
           width = 0.7,
           fill = 'steelblue')

# flipped bar
ggplot(df, aes(x = month, y = rain)) +
  geom_bar(stat = 'identity',
           width = 0.7,
           fill = 'steelblue') +
  ggtitle("월별강수량") +
  theme(plot.title=element_text(size=25,
                                face="bold",color="steelblue")) +
  labs(x = '월', y = '강수량') + coord_flip()

6-4-2. ggplot: 막대그래프

# 막대그래프 histogram()
library(ggplot2)
ggplot(iris, aes(x = Petal.Length)) +
  geom_histogram(binwidth = 0.5, fill='red', color='black')

ggplot(iris, aes(x = Sepal.Width, fill = Species, color = Species)) +
  geom_histogram(binwidth = 0.5, position="dodge")+theme(legend.position="top")

6-4-3. ggplot: 산점도

# 산점도 point() Scatterplot
ggplot(iris, aes(x=Petal.Length, y=Petal.Width)) +
  geom_point()

# 산점도 with Colored Points
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=Species)) +
  geom_point(size=3) +
  ggtitle("꽃잎의 길이와 폭") +
  theme(plot.title=element_text(size=25,face="bold",color="steelblue"))

6-4-4. ggplot: 박스그래프

# 박스그래프  boxplot()
ggplot(data=iris, aes(y=Petal.Length, fill=Species)) +
  geom_boxplot()

6-4-5. ggplot: 선그래프

# 선그래프 line()
year = 1937:1960
cnt = as.vector(airmiles)
df = data.frame(year,cnt)

ggplot(data=df, aes(x=year, y=cnt)) +
  geom_line(col='red')

6-4-6. ggplot: 산점도+smooth

# scatter plot + smooth

ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

6-4-7. ggplot: 산점도+multi smooth

# scatter plot + multi smooth

ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_point(mapping=aes(x=displ, y=cty), data=mpg, col='red', shape=1) +
  geom_smooth(mapping=aes(x=displ,y=cty), data=mpg, linetype=2, col='red')

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

6-4-8. ggplot: polar barplot

# polor barplot

ggplot(mpg, aes(class, fill=drv)) +
  geom_bar(position='fill') + coord_polar()

6-5. ggplot2 그래프 예시

https://kilhwan.github.io/rprogramming/ch-visualization.html
https://kkokkilkon.tistory.com/17
https://bioinformaticsandme.tistory.com/255
https://rpubs.com/kimwoohyung/ggplot2/