# Resolve every relative file path below against this folder.
# NOTE(review): hard-coded, machine-specific working directory.
setwd("c:/data")
library(readxl)
# Read the Excel workbook, then preview its first and last rows.
data1 <- read_excel(path = "Data1.xls")
head(data1)
tail(data1)
## # A tibble: 6 × 26
##      Q1    Q2    Q3    Q4    Q5    Q6    Q7    Q8    Q9   Q10   Q11   Q12   Q13
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     4     4     2     3     4     2     2     4     4     4     4     4     4
## 2     4     4     4     4     4     3     2     4     4     4     4     4     4
## 3     4     4     4     4     2     4     4     4     4     2     4     4     4
## 4     5     4     4     4     4     4     4     4     4     4     4     4     4
## 5     4     4     4     4     4     4     4     4     2     4     4     4     4
## 6     4     4     4     4     4     4     4     4     4     4     4     4     4
## # … with 13 more variables: Q14 <dbl>, Q15 <dbl>, Q16 <dbl>, Q17 <dbl>,
## #   Q18 <dbl>, Q19 <dbl>, Q20 <dbl>, Gender <dbl>, EDU <dbl>, BF <dbl>,
## #   BM <dbl>, Happiness <dbl>, Peace <dbl>
## # A tibble: 6 × 26
##      Q1    Q2    Q3    Q4    Q5    Q6    Q7    Q8    Q9   Q10   Q11   Q12   Q13
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     4     4     3     4     4     2     2     3     4     2     2     4     3
## 2     2     2     2     1     2     2     2     2     2     2     1     3     2
## 3     3     2     2     2     3     1     1     1     1     1     3     3     3
## 4     5     4     4     4     4     2     2     2     2     3     3     4     3
## 5     4     4     4     2     2     4     2     4     4     3     3     2     3
## 6     3     3     1     1     2     1     1     1     1     1     4     4     3
## # … with 13 more variables: Q14 <dbl>, Q15 <dbl>, Q16 <dbl>, Q17 <dbl>,
## #   Q18 <dbl>, Q19 <dbl>, Q20 <dbl>, Gender <dbl>, EDU <dbl>, BF <dbl>,
## #   BM <dbl>, Happiness <dbl>, Peace <dbl>
# Load a text file; its first line supplies the column names
data2 <- read.table(file = "Data1.txt", header = TRUE)
head(data2)
##   Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15 Q16 Q17 Q18 Q19 Q20 Gender
## 1  4  4  2  3  4  2  2  4  4   4   4   4   4   4   4   4   4   4   4   4      0
## 2  4  4  4  4  4  3  2  4  4   4   4   4   4   4   4   4   3   4   2   1      0
## 3  4  4  4  4  2  4  4  4  4   2   4   4   4   4   3   4   4   4   4   3      0
## 4  5  4  4  4  4  4  4  4  4   4   4   4   4   4   4   4   4   4   4   4      0
## 5  4  4  4  4  4  4  4  4  2   4   4   4   4   4   4   4   4   4   4   4      0
## 6  4  4  4  4  4  4  4  4  4   4   4   4   4   4   4   4   4   4   4   4      0
##   EDU  BF  BM Happiness Peace
## 1   1 3.4 3.2       4.0   4.0
## 2   1 4.0 3.4       4.0   2.8
## 3   2 3.6 3.6       3.8   3.8
## 4   1 4.2 4.0       4.0   4.0
## 5   2 4.0 3.6       4.0   4.0
## 6   1 4.0 4.0       4.0   4.0
# Save a built-in dataset to a CSV file
data("mtcars")
write.csv(x = mtcars, file = "mtcars.csv")
# head(): preview the first rows (six by default)
data("iris")
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Print only the first 3 rows of iris
head(iris, n = 3)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
# tail(): preview the last rows (six by default)
tail(iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
tail(iris, n = 3)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
# str(): compact overview of the object's structure
str(object = iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Open the whole dataset in a separate viewer page
View(x = iris)
# Number of rows and columns
dim(iris)
## [1] 150   5
# Length: the column count for a data frame, the element count for a column
length(iris)
## [1] 5
length(iris[["Sepal.Length"]])
## [1] 150
# Class of the data frame and of individual columns
class(iris)
## [1] "data.frame"
class(iris[["Sepal.Length"]])
## [1] "numeric"
class(iris[["Species"]])
## [1] "factor"
# Variable (column) names of the data frame, in column order.
# names() replaces ls() here: ls() treats the data frame as an environment and
# returns the names sorted alphabetically rather than in column order.
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
# Descriptive statistics: mean, variance, standard deviation, sum, range,
# maximum, minimum, quartiles and interquartile range
mean(mtcars[["mpg"]])
## [1] 20.09062
var(mtcars[["mpg"]])
## [1] 36.3241
sd(mtcars[["mpg"]])
## [1] 6.026948
sum(mtcars[["mpg"]])
## [1] 642.9
range(mtcars[["mpg"]])
## [1] 10.4 33.9
max(mtcars[["mpg"]])
## [1] 33.9
min(mtcars[["mpg"]])
## [1] 10.4
quantile(mtcars[["mpg"]])
##     0%    25%    50%    75%   100% 
## 10.400 15.425 19.200 22.800 33.900
IQR(mtcars[["mpg"]])
## [1] 7.375
# Summary statistics for every column at once
summary(object = iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Frequency analysis prep: Gender and EDU are stored as integer codes, so
# convert them to categorical variables (factors) and check their levels
df <- read.csv(file = "Data1.csv")
df[["Gender"]] <- factor(df[["Gender"]])
df[["EDU"]] <- factor(df[["EDU"]])
levels(df[["Gender"]])
## [1] "0" "1"
levels(df[["EDU"]])
## [1] "1" "2" "3" "4"
View(x = df)
# Relabel the factor levels with plyr's revalue(), then count frequencies
library(plyr)
df[["Gender"]] <- revalue(
  df[["Gender"]],
  replace = c("0" = "female", "1" = "male")
)
df[["EDU"]] <- revalue(
  df[["EDU"]],
  replace = c(
    "1" = "high", "2" = "university", "3" = "graduate", "4" = "phd"
  )
)
table(df[["Gender"]])
## 
## female   male 
##   1136    789
table(df[["EDU"]])
## 
##       high university   graduate        phd 
##        233        472       1022        198
# Relative frequencies (proportions)
a <- table(df[["Gender"]])
b <- table(df[["EDU"]])
prop.table(a)
## 
##    female      male 
## 0.5901299 0.4098701
prop.table(b)
## 
##       high university   graduate        phd 
##  0.1210390  0.2451948  0.5309091  0.1028571
# Two-way table of Gender by EDU, expressed as shares of the grand total
e <- table(df[["Gender"]], df[["EDU"]])
prop.table(e)
##         
##                high university   graduate        phd
##   female 0.09454545 0.14389610 0.30233766 0.04935065
##   male   0.02649351 0.10129870 0.22857143 0.05350649
# Proportions within rows and within columns
prop.table(e, margin = 1) # each row sums to 1
##         
##                high university   graduate        phd
##   female 0.16021127 0.24383803 0.51232394 0.08362676
##   male   0.06463878 0.24714829 0.55766793 0.13054499
prop.table(e, margin = 2) # each column sums to 1
##         
##               high university  graduate       phd
##   female 0.7811159  0.5868644 0.5694716 0.4797980
##   male   0.2188841  0.4131356 0.4305284 0.5202020
# Round to a fixed number of decimal places
round(0.37573473, digits = 2)
## [1] 0.38
round(prop.table(e), digits = 2)
##         
##          high university graduate  phd
##   female 0.09       0.14     0.30 0.05
##   male   0.03       0.10     0.23 0.05
#pipes 연산자
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
data("diamonds")
# Chain calls with the pipe: take the head, then report its dimensions
diamonds %>%
  head() %>%
  dim()
## [1]  6 10
# i.e. the data consists of 10 columns
View(x = diamonds)
# Rename columns: clarity -> c, price -> p
diamonds1 <- diamonds %>%
  rename(c = clarity, p = price)
head(diamonds1, n = 3)
## # A tibble: 3 × 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31
# Frequency analysis with count()
diamonds %>%
  count(cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
# The same frequencies with table()
table(diamonds[["cut"]])
## 
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
# Keep only the columns that are needed
df1 <- diamonds %>%
  select(carat, price)
head(df1, n = 3)
## # A tibble: 3 × 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327
# Keep everything except the columns that are not needed
df2 <- diamonds %>%
  select(-c(carat, price))
head(df2, n = 3)
## # A tibble: 3 × 8
##   cut     color clarity depth table     x     y     z
##   <ord>   <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ideal   E     SI2      61.5    55  3.95  3.98  2.43
## 2 Premium E     SI1      59.8    61  3.89  3.84  2.31
## 3 Good    E     VS1      56.9    65  4.05  4.07  2.31
# Rows 1 through 5
diamonds %>%
  slice(seq_len(5))
## # A tibble: 5 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good    J     SI2      63.3    58   335  4.34  4.35  2.75
# A negative index drops that row and returns the rest
diamonds %>%
  slice(-1)
## # A tibble: 53,939 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  2  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  3  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  4  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  5  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  6  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  7  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
##  9  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## 10  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
## # … with 53,929 more rows
# Rows whose cut is "Good", first 3 only
diamonds %>%
  filter(cut == "Good") %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
max(diamonds[["price"]])
## [1] 18823
# Rows whose price equals the maximum price
diamonds %>%
  filter(price == max(price))
## # A tibble: 1 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16
diamonds %>%
  filter(price == 18823)
## # A tibble: 1 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16
# Rows whose cut is anything other than "Premium"
diamonds %>%
  filter(cut != "Premium") %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 3  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
# Price comparisons: at least 1000, not equal to 1000, exactly 1000
diamonds %>%
  filter(price >= 1000) %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.7  Ideal E     SI1      62.5    57  2757  5.7   5.72  3.57
## 2  0.86 Fair  E     SI2      55.1    69  2757  6.45  6.33  3.52
## 3  0.7  Ideal G     VS2      61.6    56  2757  5.7   5.67  3.5
diamonds %>%
  filter(price != 1000) %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
diamonds %>%
  filter(price == 1000) %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.38 Very Good E     VVS2     61.8    56  1000  4.66  4.68  2.88
## 2  0.39 Very Good F     VS1      57.1    61  1000  4.86  4.91  2.79
## 3  0.38 Very Good E     VS1      61.5    58  1000  4.64  4.69  2.87
# Rows whose price is not 1000 and whose cut is "Ideal"
diamonds %>%
  filter(price != 1000, cut == "Ideal") %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46
## 3  0.31 Ideal J     SI2      62.2    54   344  4.35  4.37  2.71
# Rows lighter than 1 carat or heavier than 5 carats
diamonds %>%
  filter(carat < 1 | carat > 5) %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
# Rows whose cut is either "Ideal" or "Good"
diamonds %>%
  filter(cut %in% c("Ideal", "Good")) %>%
  head(n = 3)
## # A tibble: 3 × 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 3  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
# Create derived variables; Double is built from the Ratio column defined
# in the same mutate() call
diamonds %>%
  mutate(
    Ratio = price / carat,
    Double = Ratio * 2
  ) %>%
  head(n = 3)
## # A tibble: 3 × 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.
# Summary statistics: overall first, then per group
diamonds %>%
  summarise(mean(price))
## # A tibble: 1 × 1
##   `mean(price)`
##           <dbl>
## 1         3933.
diamonds %>%
  summarise(
    AvgPrice = mean(price),
    MedianPrice = median(price),
    AvgCarat = mean(carat)
  )
## # A tibble: 1 × 3
##   AvgPrice MedianPrice AvgCarat
##      <dbl>       <dbl>    <dbl>
## 1    3933.        2401    0.798
diamonds %>%
  group_by(cut) %>%
  summarise(
    AvgPrice = mean(price),
    SumCarat = sum(carat)
  )
## # A tibble: 5 × 3
##   cut       AvgPrice SumCarat
##   <ord>        <dbl>    <dbl>
## 1 Fair         4359.    1684.
## 2 Good         3929.    4166.
## 3 Very Good    3982.    9743.
## 4 Premium      4584.   12301.
## 5 Ideal        3458.   15147.
# Count the rows in each level of cut, then add the grand total and each
# level's percentage share as derived variables
diamonds %>%
  group_by(cut) %>%
  summarise(n = n()) %>% # rows per group
  mutate(
    total = sum(n),
    pct = n / total * 100
  )
## # A tibble: 5 × 4
##   cut           n total   pct
##   <ord>     <int> <int> <dbl>
## 1 Fair       1610 53940  2.98
## 2 Good       4906 53940  9.10
## 3 Very Good 12082 53940 22.4 
## 4 Premium   13791 53940 25.6 
## 5 Ideal     21551 53940 40.0
# Inspect the price quartiles, then label each diamond by the quartile band
# its price falls in: >= 3rd quartile "best", >= median "good",
# >= 1st quartile "normal", otherwise "bad"
quantile(diamonds$price)
##       0%      25%      50%      75%     100% 
##   326.00   950.00  2401.00  5324.25 18823.00
# Take the cut-offs from quantile() instead of re-typing the printed values,
# so the classes stay correct if the data changes.
price_q <- quantile(diamonds$price)
diamonds1 <- diamonds %>%
  mutate(
    price_class = case_when(
      price >= price_q[["75%"]] ~ "best",
      price >= price_q[["50%"]] ~ "good",
      price >= price_q[["25%"]] ~ "normal",
      TRUE ~ "bad"
    )
  )
table(diamonds1$price_class)
## 
##    bad   best   good normal 
##  13483  13485  13496  13476
# Average price per cut, sorted in descending order
diamonds %>%
  group_by(cut) %>%
  summarise(AvgPrice = mean(price)) %>%
  arrange(desc(AvgPrice))
## # A tibble: 5 × 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Premium      4584.
## 2 Fair         4359.
## 3 Very Good    3982.
## 4 Good         3929.
## 5 Ideal        3458.
# Same summary, sorted in ascending order
diamonds %>%
  group_by(cut) %>%
  summarise(AvgPrice = mean(price)) %>%
  arrange(AvgPrice)
## # A tibble: 5 × 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Ideal        3458.
## 2 Good         3929.
## 3 Very Good    3982.
## 4 Fair         4359.
## 5 Premium      4584.
# Column-wise combination (joins)
ott1 <- data.frame(
  id = c(1, 2, 3),
  car = rep("bmw", 3),
  fe = c(20, 22, 24)
)
ott2 <- data.frame(
  id = c(1, 4, 5),
  fe1 = c(30, 34, 35)
)
ott1
##   id car fe
## 1  1 bmw 20
## 2  2 bmw 22
## 3  3 bmw 24
ott2
##   id fe1
## 1  1  30
## 2  4  34
## 3  5  35
# Left join: keep every row of ott1; fe1 is filled only where id matches
left_join(x = ott1, y = ott2, by = "id")
##   id car fe fe1
## 1  1 bmw 20  30
## 2  2 bmw 22  NA
## 3  3 bmw 24  NA
# Inner join: only the rows whose id appears in both tables
inner_join(x = ott1, y = ott2, by = "id")
##   id car fe fe1
## 1  1 bmw 20  30
# Full join: every row from both tables
full_join(x = ott1, y = ott2, by = "id")
##   id  car fe fe1
## 1  1  bmw 20  30
## 2  2  bmw 22  NA
## 3  3  bmw 24  NA
## 4  4 <NA> NA  34
## 5  5 <NA> NA  35
# Join through a key column (code-to-name lookup)
ott3 <- data.frame(
  nation_code = c(1, 2, 3, 4),
  nation = c("korea", "japan", "china", "germany")
)
ott4 <- data.frame(
  car = c("bmw", "toyota", "kia"),
  nation_code = c(3, 3, 2)
)
ott3
##   nation_code  nation
## 1           1   korea
## 2           2   japan
## 3           3   china
## 4           4 germany
ott4
##      car nation_code
## 1    bmw           3
## 2 toyota           3
## 3    kia           2
left_join(x = ott4, y = ott3, by = "nation_code")
##      car nation_code nation
## 1    bmw           3  china
## 2 toyota           3  china
## 3    kia           2  japan
# Row-wise combination; columns missing from one input are filled with NA
ott5 <- data.frame(
  car = rep("bmw", 3),
  fe = c(20, 22, 24)
)
ott6 <- data.frame(
  car = rep("audi", 3),
  fe1 = c(20, 22, 24)
)
bind_rows(ott5, ott6)
##    car fe fe1
## 1  bmw 20  NA
## 2  bmw 22  NA
## 3  bmw 24  NA
## 4 audi NA  20
## 5 audi NA  22
## 6 audi NA  24
# Convert a character vector to a factor (levels default to alphabetical order)
q1 <- c("nike", "polo", "adidas", "wilson", "yonex")
q1_factor <- as.factor(q1)
q1_factor
## [1] nike   polo   adidas wilson yonex 
## Levels: adidas nike polo wilson yonex
as.numeric(q1_factor)
## [1] 2 3 1 4 5
# When the levels have a meaningful order (e.g. education level), list the
# levels explicitly and make the factor ordered
factor(
  x = c("high school", "colleage", "masters"),
  levels = c("high school", "colleage", "masters"),
  ordered = TRUE
)
## [1] high school colleage    masters    
## Levels: high school < colleage < masters
# Take a working copy of ggplot2's economics data and preview it
economics <- ggplot2::economics
head(economics, n = 6)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
# Dates look like 1967-07-01: take the first four characters as a derived
# `year` variable, group by year, average psavert (personal savings rate),
# sort descending and show the top 5 rows.
# `date` is referenced as a column inside mutate() rather than through
# `economics$date`, so the step always uses the data flowing through the pipe.
economics <- economics %>%
  mutate(year = substr(date, 1, 4))
economics %>%
  group_by(year) %>%
  summarise(m = mean(psavert)) %>%
  arrange(desc(m)) %>%
  head(5)
## # A tibble: 5 × 2
##   year      m
##   <chr> <dbl>
## 1 1971   13.5
## 2 1973   13.4
## 3 1975   13.4
## 4 1974   13.3
## 5 1970   12.8
# Convert a character string to a Date (a string without separators, such as
# 20221224, does not work with this default call)
as.Date(x = "2022/12/24")
## [1] "2022-12-24"
library(lubridate)
## 필요한 패키지를 로딩중입니다: timechange
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Load the lakers data, convert it to a tibble and look at the raw
# date and time columns
data("lakers")
lakers <- as_tibble(lakers)
lakers %>%
  select(date, time)
## # A tibble: 34,624 × 2
##        date time 
##       <int> <chr>
##  1 20081028 12:00
##  2 20081028 11:39
##  3 20081028 11:37
##  4 20081028 11:25
##  5 20081028 11:23
##  6 20081028 11:22
##  7 20081028 11:22
##  8 20081028 11:22
##  9 20081028 11:00
## 10 20081028 10:53
## # … with 34,614 more rows
# Paste the date and time columns together and parse the result with
# ymd_hm(); store it as time_index and drop the now-redundant time column
lakers <- lakers %>%
  mutate(date = ymd_hm(paste(date, time))) %>%
  rename(time_index = date) %>%
  select(-time)
head(lakers, n = 6)
## # A tibble: 6 × 12
##   time_index          opponent game_type period etype team  player result points
##   <dttm>              <chr>    <chr>      <int> <chr> <chr> <chr>  <chr>   <int>
## 1 2008-10-28 12:00:00 POR      home           1 jump… OFF   ""     ""          0
## 2 2008-10-28 11:39:00 POR      home           1 shot  LAL   "Pau … "miss…      0
## 3 2008-10-28 11:37:00 POR      home           1 rebo… LAL   "Vlad… ""          0
## 4 2008-10-28 11:25:00 POR      home           1 shot  LAL   "Dere… "miss…      0
## 5 2008-10-28 11:23:00 POR      home           1 rebo… LAL   "Pau … ""          0
## 6 2008-10-28 11:22:00 POR      home           1 shot  LAL   "Pau … "made"      2
## # … with 3 more variables: type <chr>, x <int>, y <int>
# Column-by-column summary of the converted data
lakers %>%
  summary()
##    time_index                      opponent          game_type        
##  Min.   :2008-10-28 00:00:00.0   Length:34624       Length:34624      
##  1st Qu.:2008-12-10 00:19:30.0   Class :character   Class :character  
##  Median :2009-01-21 10:52:00.0   Mode  :character   Mode  :character  
##  Mean   :2009-01-22 20:08:18.4                                        
##  3rd Qu.:2009-03-09 00:33:00.0                                        
##  Max.   :2009-04-14 12:00:00.0                                        
##                                                                       
##      period         etype               team              player         
##  Min.   :1.000   Length:34624       Length:34624       Length:34624      
##  1st Qu.:2.000   Class :character   Class :character   Class :character  
##  Median :3.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2.536                                                           
##  3rd Qu.:4.000                                                           
##  Max.   :5.000                                                           
##                                                                          
##     result              points           type                 x        
##  Length:34624       Min.   :0.0000   Length:34624       Min.   : 0.00  
##  Class :character   1st Qu.:0.0000   Class :character   1st Qu.:20.00  
##  Mode  :character   Median :0.0000   Mode  :character   Median :25.00  
##                     Mean   :0.4627                      Mean   :25.32  
##                     3rd Qu.:1.0000                      3rd Qu.:31.00  
##                     Max.   :3.0000                      Max.   :51.00  
##                                                         NA's   :21557  
##        y        
##  Min.   : 3.00  
##  1st Qu.: 6.00  
##  Median :10.00  
##  Mean   :13.43  
##  3rd Qu.:20.00  
##  Max.   :90.00  
##  NA's   :21557
# Mean x and y (ignoring NA) per calendar month of time_index
lakers %>%
  group_by(month(time_index)) %>%
  summarise(
    mean_x = mean(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE)
  )
## # A tibble: 7 × 3
##   `month(time_index)` mean_x mean_y
##                 <dbl>  <dbl>  <dbl>
## 1                   1   25.5   13.9
## 2                   2   25.0   13.2
## 3                   3   25.5   13.2
## 4                   4   25.4   13.5
## 5                  10   24.9   13.1
## 6                  11   25.5   13.4
## 7                  12   25.1   13.5
# Same means per year of time_index
lakers %>%
  group_by(year(time_index)) %>%
  summarise(
    mean_x = mean(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE)
  )
## # A tibble: 2 × 3
##   `year(time_index)` mean_x mean_y
##                <dbl>  <dbl>  <dbl>
## 1               2008   25.2   13.4
## 2               2009   25.4   13.4
# Rows at or before the given date-time, first 3 only
lakers %>%
  filter(time_index <= ymd_hms("2008-10-28 12:00:00")) %>%
  head(n = 3)
## # A tibble: 3 × 12
##   time_index          opponent game_type period etype team  player result points
##   <dttm>              <chr>    <chr>      <int> <chr> <chr> <chr>  <chr>   <int>
## 1 2008-10-28 12:00:00 POR      home           1 jump… OFF   ""     ""          0
## 2 2008-10-28 11:39:00 POR      home           1 shot  LAL   "Pau … "miss…      0
## 3 2008-10-28 11:37:00 POR      home           1 rebo… LAL   "Vlad… ""          0
## # … with 3 more variables: type <chr>, x <int>, y <int>
# Keep rows inside a time window: at or after the first date-time AND at or
# before the second. The first comparison previously used `<=` as well, which
# made the second bound redundant and turned this into a single upper-bound
# filter.
lakers %>%
  filter(
    time_index >= ymd_hms("2008-10-28 12:00:00"),
    time_index <= ymd_hms("2009-03-09 00:33:00")
  ) %>%
  head(3)
# NOTE: the echoed output that follows was captured with the old condition;
# re-run this statement to refresh it.
## # A tibble: 3 × 12
##   time_index          opponent game_type period etype team  player result points
##   <dttm>              <chr>    <chr>      <int> <chr> <chr> <chr>  <chr>   <int>
## 1 2008-10-28 12:00:00 POR      home           1 jump… OFF   ""     ""          0
## 2 2008-10-28 11:39:00 POR      home           1 shot  LAL   "Pau … "miss…      0
## 3 2008-10-28 11:37:00 POR      home           1 rebo… LAL   "Vlad… ""          0
## # … with 3 more variables: type <chr>, x <int>, y <int>
# Bin a numeric variable into intervals: equal-width bins first, then
# explicitly specified break points
airquality[["grade_Wind1"]] <- cut(airquality[["Wind"]], breaks = 3)
airquality[["grade_Wind2"]] <- cut(
  airquality[["Wind"]],
  breaks = 3,
  labels = c("s", "m", "w")
)
head(airquality, n = 3)
##   Ozone Solar.R Wind Temp Month Day grade_Wind1 grade_Wind2
## 1    41     190  7.4   67     5   1 (1.68,8.03]           s
## 2    36     118  8.0   72     5   2 (1.68,8.03]           s
## 3    12     149 12.6   74     5   3 (8.03,14.4]           m
mean(airquality[["Wind"]])
## [1] 9.957516
range(airquality[["Wind"]])
## [1]  1.7 20.7
# Two bins with break points at the minimum, 9.96 and the maximum;
# include.lowest = TRUE keeps the minimum value inside the first bin
airquality[["grade_Wind3"]] <- cut(
  airquality[["Wind"]],
  breaks = c(1.7, 9.96, 20.7),
  labels = c("low", "high"),
  include.lowest = TRUE
)
head(airquality, n = 3)
##   Ozone Solar.R Wind Temp Month Day grade_Wind1 grade_Wind2 grade_Wind3
## 1    41     190  7.4   67     5   1 (1.68,8.03]           s         low
## 2    36     118  8.0   72     5   2 (1.68,8.03]           s         low
## 3    12     149 12.6   74     5   3 (8.03,14.4]           m        high