setwd("c:/data")
library(readxl)
data1<-read_excel("Data1.xls")
head(data1);tail(data1)
## # A tibble: 6 × 26
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 4 2 3 4 2 2 4 4 4 4 4 4
## 2 4 4 4 4 4 3 2 4 4 4 4 4 4
## 3 4 4 4 4 2 4 4 4 4 2 4 4 4
## 4 5 4 4 4 4 4 4 4 4 4 4 4 4
## 5 4 4 4 4 4 4 4 4 2 4 4 4 4
## 6 4 4 4 4 4 4 4 4 4 4 4 4 4
## # … with 13 more variables: Q14 <dbl>, Q15 <dbl>, Q16 <dbl>, Q17 <dbl>,
## # Q18 <dbl>, Q19 <dbl>, Q20 <dbl>, Gender <dbl>, EDU <dbl>, BF <dbl>,
## # BM <dbl>, Happiness <dbl>, Peace <dbl>
## # A tibble: 6 × 26
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 4 3 4 4 2 2 3 4 2 2 4 3
## 2 2 2 2 1 2 2 2 2 2 2 1 3 2
## 3 3 2 2 2 3 1 1 1 1 1 3 3 3
## 4 5 4 4 4 4 2 2 2 2 3 3 4 3
## 5 4 4 4 2 2 4 2 4 4 3 3 2 3
## 6 3 3 1 1 2 1 1 1 1 1 4 4 3
## # … with 13 more variables: Q14 <dbl>, Q15 <dbl>, Q16 <dbl>, Q17 <dbl>,
## # Q18 <dbl>, Q19 <dbl>, Q20 <dbl>, Gender <dbl>, EDU <dbl>, BF <dbl>,
## # BM <dbl>, Happiness <dbl>, Peace <dbl>
#txt파일 불러오기
data2<-read.table("Data1.txt",header = TRUE)
head(data2)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15 Q16 Q17 Q18 Q19 Q20 Gender
## 1 4 4 2 3 4 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 0
## 2 4 4 4 4 4 3 2 4 4 4 4 4 4 4 4 4 3 4 2 1 0
## 3 4 4 4 4 2 4 4 4 4 2 4 4 4 4 3 4 4 4 4 3 0
## 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0
## 5 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 0
## 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0
## EDU BF BM Happiness Peace
## 1 1 3.4 3.2 4.0 4.0
## 2 1 4.0 3.4 4.0 2.8
## 3 2 3.6 3.6 3.8 3.8
## 4 1 4.2 4.0 4.0 4.0
## 5 2 4.0 3.6 4.0 4.0
## 6 1 4.0 4.0 4.0 4.0
#재장된 데이터를 csv파일로 저장하기
data(mtcars)
write.csv(mtcars,file="mtcars.csv")
#head함수
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#iris데이터에서 3번째 행까지 출력
head(iris,3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
#tail함수
tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
tail(iris,3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
#구조파악 함수
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#새 페이지를 민들어 데이터 전체를 보여줌
View(iris)
#행과 열 개수
dim(iris)
## [1] 150 5
#데이터의 길이
length(iris)
## [1] 5
length(iris$Sepal.Length)
## [1] 150
#데이터 유형 파악
class(iris)
## [1] "data.frame"
class(iris$Sepal.Length)
## [1] "numeric"
class(iris$Species)
## [1] "factor"
#변수이름 파악
ls(iris)
## [1] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width" "Species"
#통계함수(평균,분산,표준편차,합,범위,최댓값,최솟값,사분위수,사분위수범위)
mean(mtcars$mpg)
## [1] 20.09062
var(mtcars$mpg)
## [1] 36.3241
sd(mtcars$mpg)
## [1] 6.026948
sum(mtcars$mpg)
## [1] 642.9
range(mtcars$mpg)
## [1] 10.4 33.9
max(mtcars$mpg)
## [1] 33.9
min(mtcars$mpg)
## [1] 10.4
quantile(mtcars$mpg)
## 0% 25% 50% 75% 100%
## 10.400 15.425 19.200 22.800 33.900
IQR(mtcars$mpg)
## [1] 7.375
#전체 통계요약
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
#빈도분석=정수형태로 저장되어있는 성별과 학력을 범주형변수로 변환하고 레벨의 값을 확인
df<-read.csv("Data1.csv")
df$Gender<-factor(df$Gender)
df$EDU<-factor(df$EDU)
levels(df$Gender)
## [1] "0" "1"
levels(df$EDU)
## [1] "1" "2" "3" "4"
View(df)
#plyr의 revalue함수를 이용하여 레벨의 이름을 변경후 빈도분석
library(plyr)
df$Gender<-revalue(df$Gender,replace = c("0"="female","1"="male"))
df$EDU<-revalue(df$EDU,replace = c("1"="high","2"="university","3"="graduate","4"="phd"))
table(df$Gender)
##
## female male
## 1136 789
table(df$EDU)
##
## high university graduate phd
## 233 472 1022 198
#빈도비율
a<-table(df$Gender)
b<-table(df$EDU)
prop.table(a)
##
## female male
## 0.5901299 0.4098701
prop.table(b)
##
## high university graduate phd
## 0.1210390 0.2451948 0.5309091 0.1028571
e<-table(df$Gender,df$EDU)
prop.table(e)
##
## high university graduate phd
## female 0.09454545 0.14389610 0.30233766 0.04935065
## male 0.02649351 0.10129870 0.22857143 0.05350649
#행과 열의 비율 형식 맞추기
prop.table(e,1) #행별로 합이 1
##
## high university graduate phd
## female 0.16021127 0.24383803 0.51232394 0.08362676
## male 0.06463878 0.24714829 0.55766793 0.13054499
prop.table(e,2) #열별로 합이 1
##
## high university graduate phd
## female 0.7811159 0.5868644 0.5694716 0.4797980
## male 0.2188841 0.4131356 0.4305284 0.5202020
#소수점 자르기
round(0.37573473,2)
## [1] 0.38
round(prop.table(e),2)
##
## high university graduate phd
## female 0.09 0.14 0.30 0.05
## male 0.03 0.10 0.23 0.05
#pipes 연산자
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("diamonds")
diamonds %>% head %>% dim
## [1] 6 10
#10개의 열로 구성되어있음
View(diamonds)
diamonds1<-diamonds %>% rename(c=clarity,p=price) #변수이름 바꾸기
head(diamonds1,3)
## # A tibble: 3 × 10
## carat cut color c depth table p x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
count(diamonds,cut)#count를 이용한 빈도분석
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
table(diamonds$cut)#table을 이용한 빈도분석
##
## Fair Good Very Good Premium Ideal
## 1610 4906 12082 13791 21551
df1<-diamonds %>% select(carat,price) #필요한 열만 추출
head(df1,3)
## # A tibble: 3 × 2
## carat price
## <dbl> <int>
## 1 0.23 326
## 2 0.21 326
## 3 0.23 327
df2<-diamonds %>% select(-carat,-price) #필요없는열빼고 추출
head(df2,3)
## # A tibble: 3 × 8
## cut color clarity depth table x y z
## <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ideal E SI2 61.5 55 3.95 3.98 2.43
## 2 Premium E SI1 59.8 61 3.89 3.84 2.31
## 3 Good E VS1 56.9 65 4.05 4.07 2.31
diamonds %>% slice(1:5) #1번째 행부터 5번째 행까지 출력
## # A tibble: 5 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
diamonds %>% slice(-1) #음수가 붙으면 해당 행 제외하고 출력
## # A tibble: 53,939 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 3 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 4 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 5 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 6 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 7 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 9 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## 10 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## # … with 53,929 more rows
diamonds %>% filter(cut=="Good") %>% head(3) #cut이 good인경우만 3번째행까지 추출
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 2 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
max(diamonds$price)
## [1] 18823
diamonds %>% filter(price==max(diamonds$price)) #price가 최댓값인 행 출력
## # A tibble: 1 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
diamonds %>% filter(price==18823)
## # A tibble: 1 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
diamonds %>% filter(cut!="Premium") %>% head(3) #cut변수중 Premium이 아닌 행들 추출
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 3 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
diamonds %>% filter(price>=1000) %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.7 Ideal E SI1 62.5 57 2757 5.7 5.72 3.57
## 2 0.86 Fair E SI2 55.1 69 2757 6.45 6.33 3.52
## 3 0.7 Ideal G VS2 61.6 56 2757 5.7 5.67 3.5
diamonds %>% filter(price!=1000) %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
diamonds %>% filter(price==1000) %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.38 Very Good E VVS2 61.8 56 1000 4.66 4.68 2.88
## 2 0.39 Very Good F VS1 57.1 61 1000 4.86 4.91 2.79
## 3 0.38 Very Good E VS1 61.5 58 1000 4.64 4.69 2.87
#가격이 1000이 아니고 절삭형태가 ideal인 행 추출
diamonds %>% filter(price!=1000&cut=="Ideal") %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.9 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
#1캐럿보다 작거나 5캐럿보다 큰 행 추출
diamonds %>% filter(carat<1|carat>5) %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
#cut변수에서 ideal과 good인 행 추출
diamonds %>% filter(cut%in%c("Ideal","Good")) %>% head(3)
## # A tibble: 3 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 3 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
#파생변수 만들기
diamonds %>% mutate(Ratio=price/carat,Double=Ratio*2) %>% head(3)
## # A tibble: 3 × 12
## carat cut color clarity depth table price x y z Ratio Double
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 1417. 2835.
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1552. 3105.
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 1422. 2843.
#집단별 통계량 구하기
diamonds %>% summarise(mean(price))
## # A tibble: 1 × 1
## `mean(price)`
## <dbl>
## 1 3933.
diamonds %>% summarise(AvgPrice=mean(price),
MedianPrice=median(price),
AvgCarat=mean(carat))
## # A tibble: 1 × 3
## AvgPrice MedianPrice AvgCarat
## <dbl> <dbl> <dbl>
## 1 3933. 2401 0.798
diamonds %>% group_by(cut) %>%
summarise(AvgPrice=mean(price),SumCarat=sum(carat))
## # A tibble: 5 × 3
## cut AvgPrice SumCarat
## <ord> <dbl> <dbl>
## 1 Fair 4359. 1684.
## 2 Good 3929. 4166.
## 3 Very Good 3982. 9743.
## 4 Premium 4584. 12301.
## 5 Ideal 3458. 15147.
#cut의 레벨별로 집단의 개수를 구하고 파생변수를 만들어 출력
diamonds %>% group_by(cut) %>%
summarise(n=n()) %>% #집단의 개수를 구함
mutate(total=sum(n),pct=n/total*100)
## # A tibble: 5 × 4
## cut n total pct
## <ord> <int> <int> <dbl>
## 1 Fair 1610 53940 2.98
## 2 Good 4906 53940 9.10
## 3 Very Good 12082 53940 22.4
## 4 Premium 13791 53940 25.6
## 5 Ideal 21551 53940 40.0
#4분위수를 알아본 후 3사분위수 이상은 베스트,2사분위수 이상은 굿,...으로 분류후 확인
quantile(diamonds$price)
## 0% 25% 50% 75% 100%
## 326.00 950.00 2401.00 5324.25 18823.00
diamonds1<-diamonds %>% mutate(price_class=ifelse(price>=5324.25,"best",
ifelse(price>=2401.00,"good",
ifelse(price>=950,"normal","bad"))))
table(diamonds1$price_class)
##
## bad best good normal
## 13483 13485 13496 13476
diamonds %>% group_by(cut) %>%
summarise(AvgPrice=mean(price)) %>%
arrange(desc(AvgPrice)) #내림차순 정렬
## # A tibble: 5 × 2
## cut AvgPrice
## <ord> <dbl>
## 1 Premium 4584.
## 2 Fair 4359.
## 3 Very Good 3982.
## 4 Good 3929.
## 5 Ideal 3458.
diamonds %>% group_by(cut) %>%
summarise(AvgPrice=mean(price)) %>%
arrange(AvgPrice) #오름차순 정렬
## # A tibble: 5 × 2
## cut AvgPrice
## <ord> <dbl>
## 1 Ideal 3458.
## 2 Good 3929.
## 3 Very Good 3982.
## 4 Fair 4359.
## 5 Premium 4584.
#열결합
ott1<-data.frame(id=c(1,2,3),car=c("bmw","bmw","bmw"),
fe=c(20,22,24))
ott2<-data.frame(id=c(1,4,5),fe1=c(30,34,35))
ott1
## id car fe
## 1 1 bmw 20
## 2 2 bmw 22
## 3 3 bmw 24
ott2
## id fe1
## 1 1 30
## 2 4 34
## 3 5 35
left_join(ott1,ott2,by="id") #ott1을 기준으로 id가 1,2,3과 일치하는 데이터만 결합
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
inner_join(ott1,ott2,by="id") #ott1 기준 id값 교집합인 행 출력
## id car fe fe1
## 1 1 bmw 20 30
full_join(ott1,ott2,by="id") #모든행 결합
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
## 4 4 <NA> NA 34
## 5 5 <NA> NA 35
#키워드형식으로 결합
ott3<-data.frame(nation_code=c(1,2,3,4),nation=c("korea","japan",
"china","germany"))
ott4<-data.frame(car=c("bmw","toyota","kia"),nation_code=c(3,3,2))
ott3
## nation_code nation
## 1 1 korea
## 2 2 japan
## 3 3 china
## 4 4 germany
ott4
## car nation_code
## 1 bmw 3
## 2 toyota 3
## 3 kia 2
left_join(ott4,ott3,by="nation_code")
## car nation_code nation
## 1 bmw 3 china
## 2 toyota 3 china
## 3 kia 2 japan
#행결합
ott5<-data.frame(car=c("bmw","bmw","bmw"),fe=c(20,22,24))
ott6<-data.frame(car=c("audi","audi","audi"),fe1=c(20,22,24))
bind_rows(ott5,ott6)
## car fe fe1
## 1 bmw 20 NA
## 2 bmw 22 NA
## 3 bmw 24 NA
## 4 audi NA 20
## 5 audi NA 22
## 6 audi NA 24
#데이터 유형 변경(기본 세팅은 알파벳 순서)
q1<-c("nike","polo","adidas","wilson","yonex")
q1_factor<-as.factor(q1)
q1_factor
## [1] nike polo adidas wilson yonex
## Levels: adidas nike polo wilson yonex
as.numeric(q1_factor)
## [1] 2 3 1 4 5
#교육수준과 같은 레벨의 순서를 정하고 싶을 때
factor(x=c("high school","colleage","masters"),
levels=c("high school","colleage","masters"),
ordered = TRUE)
## [1] high school colleage masters
## Levels: high school < colleage < masters
economics<-ggplot2::economics
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
#1967-07-01과 같이 되어있는 데이터를 연도 부분만 따로 빼내어 year이라는 파생변수를 만들고
#같은 연도끼리 group_by를 이용하여 묶은 뒤 summarise함수를 이용해 psavert(개인저축률)의 평균을 구하고
#내림차순으로 정렬 후 위에서 5개의 행만 출력
economics<-economics %>% mutate(year=substr(economics$date,1,4))
economics %>% group_by(year) %>% summarise(m=mean(psavert)) %>%
arrange(desc(m)) %>% head(5)
## # A tibble: 5 × 2
## year m
## <chr> <dbl>
## 1 1971 13.5
## 2 1973 13.4
## 3 1975 13.4
## 4 1974 13.3
## 5 1970 12.8
#문자열 데이터를 날짜형 데이터로 변환(20221224와 같이 구분기호가없는 문자열은 실행안됨)
as.Date("2022/12/24")
## [1] "2022-12-24"
library(lubridate)
## 필요한 패키지를 로딩중입니다: timechange
##
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data(lakers)
lakers<-lakers %>% as_tibble
lakers %>% select(date,time)
## # A tibble: 34,624 × 2
## date time
## <int> <chr>
## 1 20081028 12:00
## 2 20081028 11:39
## 3 20081028 11:37
## 4 20081028 11:25
## 5 20081028 11:23
## 6 20081028 11:22
## 7 20081028 11:22
## 8 20081028 11:22
## 9 20081028 11:00
## 10 20081028 10:53
## # … with 34,614 more rows
lakers<-lakers %>%
mutate(date=paste(date,time) %>% ymd_hm) %>%
rename(time_index=date) %>%
select(-time)
head(lakers)
## # A tibble: 6 × 12
## time_index opponent game_type period etype team player result points
## <dttm> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 2008-10-28 12:00:00 POR home 1 jump… OFF "" "" 0
## 2 2008-10-28 11:39:00 POR home 1 shot LAL "Pau … "miss… 0
## 3 2008-10-28 11:37:00 POR home 1 rebo… LAL "Vlad… "" 0
## 4 2008-10-28 11:25:00 POR home 1 shot LAL "Dere… "miss… 0
## 5 2008-10-28 11:23:00 POR home 1 rebo… LAL "Pau … "" 0
## 6 2008-10-28 11:22:00 POR home 1 shot LAL "Pau … "made" 2
## # … with 3 more variables: type <chr>, x <int>, y <int>
summary(lakers)
## time_index opponent game_type
## Min. :2008-10-28 00:00:00.0 Length:34624 Length:34624
## 1st Qu.:2008-12-10 00:19:30.0 Class :character Class :character
## Median :2009-01-21 10:52:00.0 Mode :character Mode :character
## Mean :2009-01-22 20:08:18.4
## 3rd Qu.:2009-03-09 00:33:00.0
## Max. :2009-04-14 12:00:00.0
##
## period etype team player
## Min. :1.000 Length:34624 Length:34624 Length:34624
## 1st Qu.:2.000 Class :character Class :character Class :character
## Median :3.000 Mode :character Mode :character Mode :character
## Mean :2.536
## 3rd Qu.:4.000
## Max. :5.000
##
## result points type x
## Length:34624 Min. :0.0000 Length:34624 Min. : 0.00
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.:20.00
## Mode :character Median :0.0000 Mode :character Median :25.00
## Mean :0.4627 Mean :25.32
## 3rd Qu.:1.0000 3rd Qu.:31.00
## Max. :3.0000 Max. :51.00
## NA's :21557
## y
## Min. : 3.00
## 1st Qu.: 6.00
## Median :10.00
## Mean :13.43
## 3rd Qu.:20.00
## Max. :90.00
## NA's :21557
lakers %>% group_by(month(time_index)) %>%
summarise(mean_x=mean(x,na.rm = TRUE),mean_y=mean(y,na.rm = TRUE))
## # A tibble: 7 × 3
## `month(time_index)` mean_x mean_y
## <dbl> <dbl> <dbl>
## 1 1 25.5 13.9
## 2 2 25.0 13.2
## 3 3 25.5 13.2
## 4 4 25.4 13.5
## 5 10 24.9 13.1
## 6 11 25.5 13.4
## 7 12 25.1 13.5
lakers %>% group_by(year(time_index)) %>%
summarise(mean_x=mean(x,na.rm = TRUE),mean_y=mean(y,na.rm = TRUE))
## # A tibble: 2 × 3
## `year(time_index)` mean_x mean_y
## <dbl> <dbl> <dbl>
## 1 2008 25.2 13.4
## 2 2009 25.4 13.4
lakers %>% filter(time_index<=ymd_hms("2008-10-28 12:00:00")) %>% head(3)
## # A tibble: 3 × 12
## time_index opponent game_type period etype team player result points
## <dttm> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 2008-10-28 12:00:00 POR home 1 jump… OFF "" "" 0
## 2 2008-10-28 11:39:00 POR home 1 shot LAL "Pau … "miss… 0
## 3 2008-10-28 11:37:00 POR home 1 rebo… LAL "Vlad… "" 0
## # … with 3 more variables: type <chr>, x <int>, y <int>
lakers%>%filter(time_index<=ymd_hms("2008-10-28 12:00:00"),
time_index<=ymd_hms("2009-03-09 00:33:00")) %>% head(3)
## # A tibble: 3 × 12
## time_index opponent game_type period etype team player result points
## <dttm> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 2008-10-28 12:00:00 POR home 1 jump… OFF "" "" 0
## 2 2008-10-28 11:39:00 POR home 1 shot LAL "Pau … "miss… 0
## 3 2008-10-28 11:37:00 POR home 1 rebo… LAL "Vlad… "" 0
## # … with 3 more variables: type <chr>, x <int>, y <int>
#수치형데이터를 구간으로 나누기(같은 간격구분,간격지정 구분)
airquality$grade_Wind1<-cut(airquality$Wind,breaks = 3)
airquality$grade_Wind2<-cut(airquality$Wind,breaks = 3,
labels = c("s","m","w"))
head(airquality,3)
## Ozone Solar.R Wind Temp Month Day grade_Wind1 grade_Wind2
## 1 41 190 7.4 67 5 1 (1.68,8.03] s
## 2 36 118 8.0 72 5 2 (1.68,8.03] s
## 3 12 149 12.6 74 5 3 (8.03,14.4] m
mean(airquality$Wind)
## [1] 9.957516
range(airquality$Wind)
## [1] 1.7 20.7
airquality$grade_Wind3<-cut(airquality$Wind,
breaks=c(1.7,9.96,20.7),
labels=c("low","high"),
include.lowest=TRUE)
head(airquality,3)
## Ozone Solar.R Wind Temp Month Day grade_Wind1 grade_Wind2 grade_Wind3
## 1 41 190 7.4 67 5 1 (1.68,8.03] s low
## 2 36 118 8.0 72 5 2 (1.68,8.03] s low
## 3 12 149 12.6 74 5 3 (8.03,14.4] m high