R강의: 제6강 데이터의 전처리

Data 전처리의 기본 내용은 다음과 같다.

  1. 새로운 변수(column)를 생성하고
  2. 새로운 케이스(row)를 삽입하고
  3. 변수를 선택하고
  4. 케이스를 선택하고
  5. 행을 정렬하고
  6. 변수이름을 바꾸고
  7. 데이터를 집단별로 분류하고
  8. 데이터를 요약하는 것이다.

6-1. dplyr

dplyr를 쓰기 위해서는 dplyr/readxl 패키지를 설치하고, 일반적인 dataframe 파일의 경우에는 “as_tibble(df)”과 같이 tibble dataframe으로 변환한 후에 사용해야 한다. 그리고 비행관련데이터인 ’hflights’을 사용하려면 hflights 패키지를 설치해야 한다.

# install.packages(c('dplyr','hflights','readxl'))

library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hflights)  
library(readxl)  

# hflights를 tibble dataframe인 hflights_df로 변환
hflights_df <- tibble::as_tibble(hflights)

# 1월 1일에 해당하는 자료만 선택 (filter): 552 cases
filter(hflights_df, Month == 1, DayofMonth == 1)
## # A tibble: 552 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     1          1         6    1400    1500 AA                  428
##  2  2011     1          1         6     728     840 AA                  460
##  3  2011     1          1         6    1631    1736 AA                 1121
##  4  2011     1          1         6    1756    2112 AA                 1294
##  5  2011     1          1         6    1012    1347 AA                 1700
##  6  2011     1          1         6    1211    1325 AA                 1820
##  7  2011     1          1         6     557     906 AA                 1994
##  8  2011     1          1         6    1824    2106 AS                  731
##  9  2011     1          1         6     654    1124 B6                  620
## 10  2011     1          1         6    1639    2110 B6                  622
## # ℹ 542 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>
# 1월과 2월달에 해당하는 자료만 선택 (filter): 30,638 cases
filter(hflights_df, Month == 1 | Month == 2)
## # A tibble: 36,038 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     1          1         6    1400    1500 AA                  428
##  2  2011     1          2         7    1401    1501 AA                  428
##  3  2011     1          3         1    1352    1502 AA                  428
##  4  2011     1          4         2    1403    1513 AA                  428
##  5  2011     1          5         3    1405    1507 AA                  428
##  6  2011     1          6         4    1359    1503 AA                  428
##  7  2011     1          7         5    1359    1509 AA                  428
##  8  2011     1          8         6    1355    1454 AA                  428
##  9  2011     1          9         7    1443    1554 AA                  428
## 10  2011     1         10         1    1443    1553 AA                  428
## # ℹ 36,028 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>
# ArrDelay, 월, 년 별로 정렬 (arrange)
arrange(hflights_df, ArrDelay, Month, Year)
## # A tibble: 227,496 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     7          3         7    1914    2039 XE                 2804
##  2  2011    12         25         7     741     926 OO                 4591
##  3  2011     8         21         7     935    1039 OO                 2001
##  4  2011     8         31         3     934    1039 OO                 2040
##  5  2011     8         26         5    2107    2205 OO                 2003
##  6  2011    12         24         6    2129    2337 CO                 1552
##  7  2011     8         28         7    2059    2206 OO                 2003
##  8  2011     8         29         1     935    1041 OO                 2040
##  9  2011     8         18         4     939    1043 OO                 2001
## 10  2011    12         24         6    2117    2258 CO                 1712
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>
# 월별로 내림차순으로(descending) 정렬 (arrange) 
arrange(hflights_df, desc(Month))
## # A tibble: 227,496 × 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011    12         15         4    2113    2217 AA                  426
##  2  2011    12         16         5    2004    2128 AA                  426
##  3  2011    12         18         7    2007    2113 AA                  426
##  4  2011    12         19         1    2108    2223 AA                  426
##  5  2011    12         20         2    2008    2107 AA                  426
##  6  2011    12         21         3    2025    2124 AA                  426
##  7  2011    12         22         4    2021    2118 AA                  426
##  8  2011    12         23         5    2015    2118 AA                  426
##  9  2011    12         26         1    2013    2118 AA                  426
## 10  2011    12         27         2    2007    2123 AA                  426
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>
# gain과 gain_per_hour라는 변수를 생성(mutate)
mutate(hflights_df,gain=ArrDelay-DepDelay, gain_per_hour = gain/(AirTime/60))
## # A tibble: 227,496 × 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##    <int> <int>      <int>     <int>   <int>   <int> <chr>             <int>
##  1  2011     1          1         6    1400    1500 AA                  428
##  2  2011     1          2         7    1401    1501 AA                  428
##  3  2011     1          3         1    1352    1502 AA                  428
##  4  2011     1          4         2    1403    1513 AA                  428
##  5  2011     1          5         3    1405    1507 AA                  428
##  6  2011     1          6         4    1359    1503 AA                  428
##  7  2011     1          7         5    1359    1509 AA                  428
##  8  2011     1          8         6    1355    1454 AA                  428
##  9  2011     1          9         7    1443    1554 AA                  428
## 10  2011     1         10         1    1443    1553 AA                  428
## # ℹ 227,486 more rows
## # ℹ 15 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## #   ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, gain <int>, gain_per_hour <dbl>
# DepDelay의 평균을 구하는 것(summarise), 단 missing케이스는 계산에서 제외
summarise(hflights_df, delay = mean(DepDelay, na.rm = TRUE) )
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  9.44
# DepDelay의 평균을 구하는 것(summarise), colMeans를 써도 가능함
summarise(hflights_df, delay = colMeans(select(hflights_df,DepDelay),na.rm=TRUE))
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  9.44
# 데이터를 TailNum변수별로 grouping을 하여 planes라는 파일에 저장
# Tailnum별로 사례수(count), 비행거리(dist)의 평균, 도착시연착(delay)평균을 계산하여 delay에 저장 

planes <- group_by(hflights_df, TailNum)

delay <- summarise(planes, count = n(), 
                    dist = mean(Distance, na.rm = TRUE), 
                    delay = mean(ArrDelay, na.rm = TRUE))

# delay파일에서 count가 20이 넘고 비행거리가 2000이하인 것만 select (filter)한 후에 출력함.
delay <- filter(delay, count > 20, dist < 2000)
delay
## # A tibble: 1,526 × 4
##    TailNum  count  dist  delay
##    <chr>    <int> <dbl>  <dbl>
##  1 ""         795  939. NaN   
##  2 "N0EGMQ"    40 1095.   1.92
##  3 "N10156"   317  802.   8.20
##  4 "N10575"    94  632.  18.1 
##  5 "N11106"   308  775.  10.1 
##  6 "N11107"   345  768.   8.05
##  7 "N11109"   331  772.  10.3 
##  8 "N11113"   282  773.   4.06
##  9 "N11119"   130  790.   7.40
## 10 "N11121"   333  775.   6.74
## # ℹ 1,516 more rows
# 또 다른 예를 들어본다. 

a1 <- group_by(hflights_df, Year, Month, DayofMonth)
a2 <- select(a1, Year:DayofMonth, ArrDelay, DepDelay)
a3 <- summarise(a2, 
                arr = mean(ArrDelay, na.rm = TRUE), 
                dep = mean(DepDelay, na.rm = TRUE))
## `summarise()` has grouped output by 'Year', 'Month'. You can override using the
## `.groups` argument.
a4 <- filter(a3, arr > 30 | dep > 30)
a4
## # A tibble: 14 × 5
## # Groups:   Year, Month [10]
##     Year Month DayofMonth   arr   dep
##    <int> <int>      <int> <dbl> <dbl>
##  1  2011     2          4  44.1  47.2
##  2  2011     3          3  35.1  38.2
##  3  2011     3         14  46.6  36.1
##  4  2011     4          4  38.7  27.9
##  5  2011     4         25  37.8  22.3
##  6  2011     5         12  69.5  64.5
##  7  2011     5         20  37.0  26.6
##  8  2011     6         22  65.5  62.3
##  9  2011     7         29  29.6  31.9
## 10  2011     9         29  39.2  32.5
## 11  2011    10          9  61.9  59.5
## 12  2011    11         15  43.7  39.2
## 13  2011    12         29  26.3  30.8
## 14  2011    12         31  46.5  54.2
# 즉 위의 a1, a2, a3, a4의 네가지 다른 명령을 하나로 표현하면 다음과 같다. 

hflights_df %>% 
  group_by(Year, Month, DayofMonth) %>%    
  summarise(arr = mean(ArrDelay, na.rm = TRUE), 
            dep = mean(DepDelay, na.rm = TRUE)) %>% 
  filter(arr > 30 |  dep > 30)
## `summarise()` has grouped output by 'Year', 'Month'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 5
## # Groups:   Year, Month [10]
##     Year Month DayofMonth   arr   dep
##    <int> <int>      <int> <dbl> <dbl>
##  1  2011     2          4  44.1  47.2
##  2  2011     3          3  35.1  38.2
##  3  2011     3         14  46.6  36.1
##  4  2011     4          4  38.7  27.9
##  5  2011     4         25  37.8  22.3
##  6  2011     5         12  69.5  64.5
##  7  2011     5         20  37.0  26.6
##  8  2011     6         22  65.5  62.3
##  9  2011     7         29  29.6  31.9
## 10  2011     9         29  39.2  32.5
## 11  2011    10          9  61.9  59.5
## 12  2011    11         15  43.7  39.2
## 13  2011    12         29  26.3  30.8
## 14  2011    12         31  46.5  54.2

6-2. data.table

data.table을 쓰기 위해서는 data.table 패키지를 설치하고, df1과 같은 일반적인 dataframe 파일의 경우에는 “as.data.table(df1)”과 같이 data.table로 변환한 후에 사용해야 한다.

기본적인 data.table 명령어는 다음과 같다.

 DT[(행), (열), (by)] [order]

즉, 첫번째 (행)은 행(case)을 선택하거나 조작하는 것이고,
두번째 (열)은 열(column)을 선택하거나 조작하는 것이고, 세번째 (by)은 grouping하는데 사용하는 것이고, 네번째 (order)는 결과를 정렬하는 데 사용하는 것이다.

library(data.table)
## 
## 다음의 패키지를 부착합니다: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(openxlsx)

df1 = read.xlsx("anova.xlsx")
df1 = as.data.table(df1)
df2 = fread("tempbig.csv")
df3 = as.data.table(read.xlsx("twoway.xlsx"))

# df1: 근무만족도가 5~8점 사이의 case만 출력
df1[(근무만족도>=5 & 근무만족도 <= 8)]
##     근무만족도 상사의유형
##  1:          7          1
##  2:          7          1
##  3:          8          1
##  4:          8          1
##  5:          8          1
##  6:          7          2
##  7:          8          2
##  8:          8          2
##  9:          8          2
## 10:          8          3
## 11:          8          3
# df1: 근무만족도 변수만 출력
df1[,.(근무만족도)]
##     근무만족도
##  1:          7
##  2:          7
##  3:          8
##  4:          8
##  5:          8
##  6:         10
##  7:          7
##  8:          8
##  9:          8
## 10:          8
## 11:          9
## 12:         10
## 13:          8
## 14:          8
## 15:          9
## 16:         12
## 17:         15
## 18:         16
# df1: 상사의유형별로 평균, 표준편차, 사례수를 구하는 명령임.
#      결과를 상사의유형별로 정렬
stat = df1[, .(Mean=mean(근무만족도), SD=sd(근무만족도), N=sum(!is.na(근무만족도))),        
          by = .(상사의유형)][order(상사의유형)]
stat
##    상사의유형      Mean       SD N
## 1:          1  8.000000 1.095445 6
## 2:          2  8.333333 1.032796 6
## 3:          3 11.333333 3.559026 6
# df2: x1과 x2별로 평균, 표준편차, 사례수를 구하는 명령임
#      결과를 x1, x2 별로 정렬
df2[, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by=.(x1,x2)][order(x1,x2)]
##     x1 x2     Mean       SD       N
##  1:  1  1 60.22331 29.10055 1021910
##  2:  1  2 62.80916 27.61108 1395690
##  3:  1  3 60.82691 28.03215 1246580
##  4:  1  4 54.75922 26.60266  789890
##  5:  1  5 51.32877 26.42764  489860
##  6:  1  6 47.39352 25.75961  354670
##  7:  2  1 63.30948 26.89169 1843800
##  8:  2  2 66.91873 26.11230 1671400
##  9:  2  3 66.28037 25.14893  677430
## 10:  2  4 58.39985 26.92955  275400
## 11:  2  5 53.88967 28.43449  145660
## 12:  2  6 51.27192 24.85099   87710
# df2: x2가 4보다 같거나 작은 case만 선택하여, x1과 x2별로 평균, 표준편차, 사례수를 구하는 명령임
#      결과를 x1, x2 별로 정렬

df2[x2 <= 4, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by = .(x1,x2)][order(x1,x2)]
##    x1 x2     Mean       SD       N
## 1:  1  1 60.22331 29.10055 1021910
## 2:  1  2 62.80916 27.61108 1395690
## 3:  1  3 60.82691 28.03215 1246580
## 4:  1  4 54.75922 26.60266  789890
## 5:  2  1 63.30948 26.89169 1843800
## 6:  2  2 66.91873 26.11230 1671400
## 7:  2  3 66.28037 25.14893  677430
## 8:  2  4 58.39985 26.92955  275400
# df2: x2가 4보다 같거나 작고, x1이 1인 case만 선택하여, x1과 x2별로 평균/표준편차/사례수를 구하는 명령임
#      결과를 x1, x2 별로 정렬
df2[x2 <= 4 & x1 == 1, .(Mean=mean(y), SD=sd(y), N=sum(!is.na(y))), by = .(x1,x2)][order(x1,x2)]
##    x1 x2     Mean       SD       N
## 1:  1  1 60.22331 29.10055 1021910
## 2:  1  2 62.80916 27.61108 1395690
## 3:  1  3 60.82691 28.03215 1246580
## 4:  1  4 54.75922 26.60266  789890
# df3: 상사의 유형과 보상방법별로 근무만족도의 평균/표준편차/사례수를 구하는 명령임.
#      결과를 상사의 유형, 보상방법별로 정렬
stat1 = df3[ , .(Mean=mean(근무만족도), SD=sd(근무만족도), N = sum(!is.na(근무만족도))),             
            by = .(상사의유형, 보상방법)][order(상사의유형, 보상방법)]
stat1
##    상사의유형 보상방법      Mean        SD N
## 1:     민주형     사탕  8.333333 0.5773503 3
## 2:     민주형     칭찬 14.333333 2.0816660 3
## 3: 자유방임형     사탕  7.666667 0.5773503 3
## 4: 자유방임형     칭찬  9.000000 1.0000000 3
## 5:     전제형     사탕  7.333333 0.5773503 3
## 6:     전제형     칭찬  8.666667 1.1547005 3
# 국가간 A매치 축구시합결과 파일: worldcup.csv
library(data.table)
library(openxlsx)

# Rbase의 read.csv와 data.table의 fread의 속도비교 
system.time(read.csv("tempbig.csv"))
##  사용자  시스템 elapsed 
##    5.17    0.15    6.68
system.time(fread("tempbig.csv"))
##  사용자  시스템 elapsed 
##    0.14    0.03    0.11
print("fread로 읽는 것이 60배 정도 빠름.")
## [1] "fread로 읽는 것이 60배 정도 빠름."
df = fread("worldcup.csv")

# 새로운 변수 (year, 결과) 생성 
df = df[, year := year(date)]
df = df[, 결과 := ifelse(team_score>opponent_score, '승', 
                         ifelse(team_score==opponent_score, '무', '패'))]
df

# 행(case) 선택 (team =="Korea Republic")          
df[team=='Korea Republic']
##            date           team  opponent team_score opponent_score
##   1: 1949-01-02 Korea Republic     China          2              3
##   2: 1949-01-16 Korea Republic   Vietnam          3              3
##   3: 1949-01-25 Korea Republic     Macau          5              1
##   4: 1950-04-16 Korea Republic    Taiwan          3              1
##   5: 1953-04-05 Korea Republic Hong Kong          0              4
##  ---                                                              
## 833: 2018-06-07 Korea Republic   Bolivia          0              0
## 834: 2018-06-11 Korea Republic   Senegal          0              2
## 835: 2018-06-18 Korea Republic    Sweden          0              1
## 836: 2018-06-23 Korea Republic    Mexico          1              2
## 837: 2018-06-27 Korea Republic   Germany          2              0
##          tournament year 결과
##   1:       Friendly 1949   패
##   2:       Friendly 1949   무
##   3:       Friendly 1949   승
##   4:       Friendly 1950   승
##   5:       Friendly 1953   패
##  ---                         
## 833:       Friendly 2018   무
## 834:       Friendly 2018   패
## 835: FIFA World Cup 2018   패
## 836: FIFA World Cup 2018   패
## 837: FIFA World Cup 2018   승
# 행(case) 선택 (1980~2000년)  
df[year >= 1980 & year <= 2000]
##              date           team       opponent team_score opponent_score
##     1: 1980-01-06   Sierra Leone          Ghana          2              4
##     2: 1980-01-06          Ghana   Sierra Leone          4              2
##     3: 1980-01-16         Cyprus         Greece          1              1
##     4: 1980-01-16         Greece         Cyprus          1              1
##     5: 1980-01-19          Congo    Ivory Coast          2              0
##    ---                                                                   
## 24680: 2000-12-16        Liberia   South Africa          1              2
## 24681: 2000-12-20          Japan Korea Republic          1              1
## 24682: 2000-12-20         Mexico      Argentina          0              2
## 24683: 2000-12-20 Korea Republic          Japan          1              1
## 24684: 2000-12-20      Argentina         Mexico          2              0
##                                  tournament year 결과
##     1:                             Friendly 1980   패
##     2:                             Friendly 1980   승
##     3:                             Friendly 1980   무
##     4:                             Friendly 1980   무
##     5:                             Friendly 1980   승
##    ---                                               
## 24680: African Cup of Nations qualification 2000   패
## 24681:                             Friendly 2000   무
## 24682:                             Friendly 2000   패
## 24683:                             Friendly 2000   무
## 24684:                             Friendly 2000   승
# team별 "승리"한 횟수 계산
df[, .(승=sum(결과=='승')), by=.(team)] 
##                  team  승
##   1:         Scotland 361
##   2:          England 557
##   3:            Wales 197
##   4: Northern Ireland 158
##   5:              USA 281
##  ---                     
## 290:       Kárpátalja   5
## 291:           Barawa   2
## 292:         Cascadia   3
## 293:     Matabeleland   2
## 294:          Kabylia   1
# team별 "승리"와 "무승부" 횟수 계산
df[, .(승=sum(결과=='승'),무=sum(결과=='무')), by=.(team)] 
##                  team  승  무
##   1:         Scotland 361 165
##   2:          England 557 239
##   3:            Wales 197 139
##   4: Northern Ireland 158 145
##   5:              USA 281 141
##  ---                         
## 290:       Kárpátalja   5   4
## 291:           Barawa   2   0
## 292:         Cascadia   3   1
## 293:     Matabeleland   2   1
## 294:          Kabylia   1   2
# team별 "승리"/"무승부"/"패배' 횟수 계산
df[, .(승=sum(결과=='승'),무=sum(결과=='무'),패=sum(결과=='패')), by=.(team)] 
##                  team  승  무  패
##   1:         Scotland 361 165 235
##   2:          England 557 239 186
##   3:            Wales 197 139 301
##   4: Northern Ireland 158 145 323
##   5:              USA 281 141 240
##  ---                             
## 290:       Kárpátalja   5   4   1
## 291:           Barawa   2   0   4
## 292:         Cascadia   3   1   2
## 293:     Matabeleland   2   1   2
## 294:          Kabylia   1   2   3
# 한국과 일본의 경기만 선택함
df[team=='Korea Republic' & opponent=='Japan',]   
##           date           team opponent team_score opponent_score
##  1: 1954-03-07 Korea Republic    Japan          5              1
##  2: 1954-03-14 Korea Republic    Japan          2              2
##  3: 1959-09-05 Korea Republic    Japan          0              0
##  4: 1959-09-06 Korea Republic    Japan          3              1
##  5: 1960-11-06 Korea Republic    Japan          2              1
##  6: 1961-06-11 Korea Republic    Japan          2              0
##  7: 1963-08-13 Korea Republic    Japan          1              1
##  8: 1967-08-01 Korea Republic    Japan          1              2
##  9: 1969-10-12 Korea Republic    Japan          2              2
## 10: 1969-10-18 Korea Republic    Japan          2              0
## 11: 1970-08-02 Korea Republic    Japan          1              1
## 12: 1972-07-26 Korea Republic    Japan          3              0
## 13: 1972-09-14 Korea Republic    Japan          2              2
## 14: 1973-06-23 Korea Republic    Japan          2              0
## 15: 1974-09-28 Korea Republic    Japan          1              4
## 16: 1975-08-09 Korea Republic    Japan          3              1
## 17: 1975-09-08 Korea Republic    Japan          3              0
## 18: 1976-08-18 Korea Republic    Japan          0              0
## 19: 1976-12-04 Korea Republic    Japan          2              1
## 20: 1977-03-26 Korea Republic    Japan          0              0
## 21: 1977-04-03 Korea Republic    Japan          1              0
## 22: 1977-06-15 Korea Republic    Japan          2              1
## 23: 1978-07-19 Korea Republic    Japan          4              0
## 24: 1979-03-04 Korea Republic    Japan          1              2
## 25: 1979-06-16 Korea Republic    Japan          4              1
## 26: 1981-03-08 Korea Republic    Japan          1              0
## 27: 1981-06-21 Korea Republic    Japan          2              0
## 28: 1982-03-21 Korea Republic    Japan          3              0
## 29: 1983-03-06 Korea Republic    Japan          1              1
## 30: 1984-09-30 Korea Republic    Japan          1              2
## 31: 1985-10-26 Korea Republic    Japan          2              1
## 32: 1985-11-03 Korea Republic    Japan          1              0
## 33: 1988-10-26 Korea Republic    Japan          1              0
## 34: 1988-12-06 Korea Republic    Japan          2              0
## 35: 1989-05-05 Korea Republic    Japan          1              0
## 36: 1990-07-27 Korea Republic    Japan          2              0
## 37: 1991-07-27 Korea Republic    Japan          1              0
## 38: 1992-08-22 Korea Republic    Japan          0              0
## 39: 1992-08-29 Korea Republic    Japan          2              2
## 40: 1993-10-25 Korea Republic    Japan          0              1
## 41: 1995-02-21 Korea Republic    Japan          1              1
## 42: 1995-02-26 Korea Republic    Japan          2              2
## 43: 1997-05-21 Korea Republic    Japan          1              1
## 44: 1997-09-28 Korea Republic    Japan          2              1
## 45: 1997-11-01 Korea Republic    Japan          0              2
## 46: 1998-03-01 Korea Republic    Japan          1              2
## 47: 1998-04-01 Korea Republic    Japan          2              1
## 48: 2000-04-26 Korea Republic    Japan          1              0
## 49: 2000-12-20 Korea Republic    Japan          1              1
## 50: 2003-04-16 Korea Republic    Japan          0              1
## 51: 2003-05-31 Korea Republic    Japan          1              0
## 52: 2003-12-10 Korea Republic    Japan          0              0
## 53: 2005-08-07 Korea Republic    Japan          0              1
## 54: 2007-07-28 Korea Republic    Japan          0              0
## 55: 2008-02-23 Korea Republic    Japan          1              1
## 56: 2010-02-14 Korea Republic    Japan          3              1
## 57: 2010-05-24 Korea Republic    Japan          2              0
## 58: 2010-10-12 Korea Republic    Japan          0              0
## 59: 2011-01-25 Korea Republic    Japan          2              2
## 60: 2011-08-10 Korea Republic    Japan          0              3
## 61: 2013-07-28 Korea Republic    Japan          1              2
## 62: 2015-08-05 Korea Republic    Japan          1              1
## 63: 2017-12-16 Korea Republic    Japan          4              1
##           date           team opponent team_score opponent_score
##                       tournament year 결과
##  1: FIFA World Cup qualification 1954   승
##  2: FIFA World Cup qualification 1954   무
##  3:           Merdeka Tournament 1959   무
##  4:           Merdeka Tournament 1959   승
##  5: FIFA World Cup qualification 1960   승
##  6: FIFA World Cup qualification 1961   승
##  7:           Merdeka Tournament 1963   무
##  8:  AFC Asian Cup qualification 1967   패
##  9: FIFA World Cup qualification 1969   무
## 10: FIFA World Cup qualification 1969   승
## 11:           Merdeka Tournament 1970   무
## 12:           Merdeka Tournament 1972   승
## 13:                     Friendly 1972   무
## 14:                     Friendly 1973   승
## 15:                     Friendly 1974   패
## 16:           Merdeka Tournament 1975   승
## 17:                     Friendly 1975   승
## 18:           Merdeka Tournament 1976   무
## 19:                     Friendly 1976   승
## 20: FIFA World Cup qualification 1977   무
## 21: FIFA World Cup qualification 1977   승
## 22:                     Friendly 1977   승
## 23:           Merdeka Tournament 1978   승
## 24:                     Friendly 1979   패
## 25:                     Friendly 1979   승
## 26:                     Friendly 1981   승
## 27:                    Korea Cup 1981   승
## 28:                     Friendly 1982   승
## 29:                     Friendly 1983   무
## 30:                     Friendly 1984   패
## 31: FIFA World Cup qualification 1985   승
## 32: FIFA World Cup qualification 1985   승
## 33:                     Friendly 1988   승
## 34:                AFC Asian Cup 1988   승
## 35:                     Friendly 1989   승
## 36:                  Dynasty Cup 1990   승
## 37:                     Friendly 1991   승
## 38:                  Dynasty Cup 1992   무
## 39:                  Dynasty Cup 1992   무
## 40: FIFA World Cup qualification 1993   패
## 41:                  Dynasty Cup 1995   무
## 42:                  Dynasty Cup 1995   무
## 43:                     Friendly 1997   무
## 44: FIFA World Cup qualification 1997   승
## 45: FIFA World Cup qualification 1997   패
## 46:                  Dynasty Cup 1998   패
## 47:                     Friendly 1998   승
## 48:                     Friendly 2000   승
## 49:                     Friendly 2000   무
## 50:                     Friendly 2003   패
## 51:                     Friendly 2003   승
## 52:            EAFF Championship 2003   무
## 53:            EAFF Championship 2005   패
## 54:                AFC Asian Cup 2007   무
## 55:            EAFF Championship 2008   무
## 56:            EAFF Championship 2010   승
## 57:                     Friendly 2010   승
## 58:                     Friendly 2010   무
## 59:                AFC Asian Cup 2011   무
## 60:                     Friendly 2011   패
## 61:            EAFF Championship 2013   패
## 62:            EAFF Championship 2015   무
## 63:            EAFF Championship 2017   승
##                       tournament year 결과
# 한국과 일본의 경기중 '승리'한 횟수 계산
df[team=='Korea Republic' & opponent=='Japan', .(승=sum(결과=='승')), by=.(team)]
##              team 승
## 1: Korea Republic 31
# 한국과 일본의 경기중 '승리'/'무승부'/'패배' 횟수 계산 
df[team=='Korea Republic' & opponent=='Japan', .(승=sum(결과=='승'),무=sum(결과=='무'),패=sum(결과=='패')), by=.(team)] 
##              team 승 무 패
## 1: Korea Republic 31 21 11
# 한국팀의 평균 득점과 평균 실점을 계산하여 내림차순으로 정리        
df[team=='Korea Republic', .(평균득점=mean(team_score), 평균실점=mean(opponent_score)), by=.(opponent)][order(평균득점, decreasing = TRUE)]
##      opponent 평균득점 평균실점
##   1:     Guam      9.0      0.0
##   2:    Sudan      8.0      0.0
##   3:    Yemen      6.0      0.0
##   4: Mongolia      6.0      0.0
##   5:     Laos      5.6      0.0
##  ---                           
## 113:  Tunisia      0.0      0.5
## 114:    Chile      0.0      1.0
## 115:  Denmark      0.0      0.0
## 116:  Belarus      0.0      1.0
## 117:     Peru      0.0      0.0
# 한국팀의 평균 득점과 평균 실점을 계산하여 내림차순으로 정리 (order를 사용)  
df[team=='Korea Republic', .(평균득점=mean(team_score), 평균실점=mean(opponent_score)), by=.(opponent)][order(-평균득점)]
##      opponent 평균득점 평균실점
##   1:     Guam      9.0      0.0
##   2:    Sudan      8.0      0.0
##   3:    Yemen      6.0      0.0
##   4: Mongolia      6.0      0.0
##   5:     Laos      5.6      0.0
##  ---                           
## 113:  Tunisia      0.0      0.5
## 114:    Chile      0.0      1.0
## 115:  Denmark      0.0      0.0
## 116:  Belarus      0.0      1.0
## 117:     Peru      0.0      0.0

6-3. Rbase 그래프

6-3-1. Rbase:막대그래프

df4 = read.xlsx("ClassExample.xlsx")

# 막대그래프 barplot()
barplot(table(df4$sliptime))

barplot(table(df4$sliptime),
        main = "Sleep Time 빈도",
        xlab = "Sleep Time 정도",
        ylab = "빈도",
        col = 'red')

6-3-2. Rbase:박스그래프

# 박스그래프 barplot()
df1 = read.xlsx("anova.xlsx")
boxplot(df1$근무만족도 ~ df1$상사의유형)

boxplot(df1$근무만족도 ~ df1$상사의유형,
        main = "상사의 유형별 근무만족도",
        xlab = "상사의 유형",
        ylab = "근무만족도",
        col = 'yellow')

6-3-3. Rbase:산점도

# 산점도 plot() scatterplot
df3 = read.xlsx("regress.xlsx")
plot(df3$근무만족도,df3$근무평정,
     main = "근무만족도-근무평정 산점도",
     xlab = "근무만족도",
     ylab = '근무평정',
     col  = 'red',
     pch  = 20)

pairs(df3,
      main = "Multi Plots",
      pch = 20,
      col = 'red')

6-3-4. Rbase:파이차트

# 파이차트 pie()
pie(table(df4$sliptime))

df6 = table(df4$sliptime)
df6 = data.frame(freq = df6,
                 lab =c("1~3 hr", "3~5 hr", "6~7 hr", 
                        "7~8 hr", "8~9 hr", "10~ hr"))
pie (df6[,2], labels=df6[,3])

6-3-5. Rbase: 선그래프

# 선그래프 line chart
df7 = aggregate(df4$weight ~ df4$sliptime, data=df4, mean)
plot(df7[,1], df7[,2])

plot(df7[,1], df7[,2],
     main = "Weight by Sleeptime",
     xlab = "Sleep Time",
     ylab = "Weight",
     type = 'o')

6.4. ggplot 그래프

ggplot의 기본구조

library(ggplot2)

ggplot(data=df, aes(x = x1, y = x2) +
geom_xx() +
geom_yy() +)

  1. ggplot2 library를 불러온다.
  2. 첫 줄에 data, x축, y축에 해당하는 것을 지정
  3. “+” 기호로 ggplot함수와 여러 개의 geom_xx()를 연결
  4. 각각의 geom 함수를 정의
    예, geom_bar(.., .., …) +
    geom_line(.., .., ..)

6-4-1. ggplot: 막대그래프

library(ggplot2)

# ggplot: 막대그래프 barplot()
month = c(1, 2, 3, 4, 5, 6)
rain  = c(55, 50, 45, 50, 60, 70)
df = data.frame(month, rain)

ggplot(df, aes(x = month, y = rain)) +
  geom_bar(stat = 'identity',
           width = 0.7,
           fill = 'steelblue')

# flipped bar
ggplot(df, aes(x = month, y = rain)) +
  geom_bar(stat = 'identity',
           width = 0.7,
           fill = 'steelblue') +
  ggtitle("월별강수량") +
  theme(plot.title=element_text(size=25,
                                face="bold",color="steelblue")) +
  labs(x = '월', y = '강수량') + coord_flip()

6-4-2. ggplot: 막대그래프

# 막대그래프 histogram()
library(ggplot2)
ggplot(iris, aes(x = Petal.Length)) +
  geom_histogram(binwidth = 0.5, fill='red', color='black')

ggplot(iris, aes(x = Sepal.Width, fill = Species, color = Species)) +
  geom_histogram(binwidth = 0.5, position="dodge")+theme(legend.position="top")

6-4-3. ggplot: 산점도

# 산점도 point() Scatterplot
ggplot(iris, aes(x=Petal.Length, y=Petal.Width)) +
  geom_point()

# 산점도 with Colored Points
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=Species)) +
  geom_point(size=3) +
  ggtitle("꽃잎의 길이와 폭") +
  theme(plot.title=element_text(size=25,face="bold",color="steelblue"))

6-4-4. ggplot: 박스그래프

# 박스그래프  boxplot()
ggplot(data=iris, aes(y=Petal.Length, fill=Species)) +
  geom_boxplot()

6-4-5. ggplot: 선그래프

# 선그래프 line()
year = 1937:1960
cnt = as.vector(airmiles)
df = data.frame(year,cnt)

ggplot(data=df, aes(x=year, y=cnt)) +
  geom_line(col='red')

6-4-6. ggplot: 산점도+smooth

# scatter plot + smooth

ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

6-4-7. ggplot: 산점도+multi smooth

# scatter plot + multi smooth

ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_point(mapping=aes(x=displ, y=cty), data=mpg, col='red', shape=1) +
  geom_smooth(mapping=aes(x=displ,y=cty), data=mpg, linetype=2, col='red')
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

6-4-8. ggplot: polar barplot

# polor barplot

ggplot(mpg, aes(class, fill=drv)) +
  geom_bar(position='fill') + coord_polar()