# Area of a circle of radius r.
r <- 2
circle <- pi * r^2

# Average computed from a total and the number of observations.
total <- 100
n <- 10
average <- total / n
5**2
[1] 25
(1+2)*3
[1] 9
변수에 데이타를 넣는 방법
변수에 데이타 할당 ; =, <-
combine 사용
sequence 연산자 사용 (:)
sequence 함수 사용 (seq)
repeat 함수 사용 (rep)
x=1
y<-2
a=c(1,2,3)
a
[1] 1 2 3
a[2]
[1] 2
b=1:10
b[9]
[1] 9
c=seq(5)
d=seq(1,3,0.25)
e=c(a,b)
f=rep(a,3)
f
[1] 1 2 3 1 2 3 1 2 3
숫자형(numeric) 12, 4, 0.45
논리형(logical) TRUE, FALSE (약어 T, F; 숫자 1, 0은 논리형으로 변환하면 각각 TRUE, FALSE가 된다)
복소수형(complex) 3+2i
문자형(character) "St.Vincent's Hospital", "123", '3.14'
벡터(vector)
행렬(matrix)
배열(array)
데이타프레임(dataframe)
리스트(list)
범주형자료(categorical variable)
시계열(Time series)
a=1:5
a=c(a,101,102)
b=c(a,103)
b
[1] 1 2 3 4 5 101 102 103
# Heights and weights of five subjects (presumably cm and kg, given the
# division by 100 below -- confirm units).
Height <- c(168, 173, 160, 145, 180)
Weight <- c(80, 65, 92, 53, 76)

# Body mass index: weight over squared height, with height rescaled by 1/100.
# The arithmetic is vectorized, so one BMI is produced per subject.
BMI <- Weight / (Height / 100)^2
BMI
[1] 28.34467 21.71807 35.93750 25.20809 23.45679
a=1:10
b=c(1,-1)
a+b
[1] 2 1 4 3 6 5 8 7 10 9
b=10
a=c=2
b^2+c(1,-1)*4*a*c
[1] 116 84
a=matrix(1:12,ncol=3)
a
[,1] [,2] [,3]
[1,] 1 5 9
[2,] 2 6 10
[3,] 3 7 11
[4,] 4 8 12
b=LETTERS[1:12]
b
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L"
b=matrix(b,ncol=4)
b
[,1] [,2] [,3] [,4]
[1,] "A" "D" "G" "J"
[2,] "B" "E" "H" "K"
[3,] "C" "F" "I" "L"
b[3,2]
[1] "F"
b[2,4]
[1] "K"
b[2,]
[1] "B" "E" "H" "K"
b[,3]
[1] "G" "H" "I"
length(b)
[1] 12
sex=c("Male","Female","Female","Male","Male")
sex=factor(sex)
sex
[1] Male Female Female Male Male
Levels: Female Male
str(sex)
Factor w/ 2 levels "Female","Male": 2 1 1 2 2
levels(sex)
[1] "Female" "Male"
length(sex)
[1] 5
# Smoking status recorded as codes 1/2/3 and stored as a factor whose levels
# carry descriptive labels (1 = none, 2 = ex-smoker, 3 = smoker).
smoking <- factor(
  c(1, 1, 2, 3, 1),
  levels = c(1, 2, 3),
  labels = c("none", "ex-smoker", "smoker")
)
smoking
[1] none none ex-smoker smoker none
Levels: none ex-smoker smoker
mydata=data.frame(height=Height,weight=Weight,sex=sex,smoking=smoking)
mydata
height weight sex smoking
1 168 80 Male none
2 173 65 Female none
3 160 92 Female ex-smoker
4 145 53 Male smoker
5 180 76 Male none
mydata[3,]
height weight sex smoking
3 160 92 Female ex-smoker
mydata[,1]
[1] 168 173 160 145 180
mydata[c(1,2)]
height weight
1 168 80
2 173 65
3 160 92
4 145 53
5 180 76
mydata[,c(1,2)]
height weight
1 168 80
2 173 65
3 160 92
4 145 53
5 180 76
mydata[1]
height
1 168
2 173
3 160
4 145
5 180
mydata[[1]]
[1] 168 173 160 145 180
mydata$height
[1] 168 173 160 145 180
mydata[["height"]]
[1] 168 173 160 145 180
mydata$BMI=mydata$weight*10000/(mydata$height)^2
mydata
height weight sex smoking BMI
1 168 80 Male none 28.34467
2 173 65 Female none 21.71807
3 160 92 Female ex-smoker 35.93750
4 145 53 Male smoker 25.20809
5 180 76 Male none 23.45679
str(mydata)
'data.frame': 5 obs. of 5 variables:
$ height : num 168 173 160 145 180
$ weight : num 80 65 92 53 76
$ sex : Factor w/ 2 levels "Female","Male": 2 1 1 2 2
$ smoking: Factor w/ 3 levels "none","ex-smoker",..: 1 1 2 3 1
$ BMI : num 28.3 21.7 35.9 25.2 23.5
summary(mydata)
height weight sex smoking BMI
Min. :145.0 Min. :53.0 Female:2 none :3 Min. :21.72
1st Qu.:160.0 1st Qu.:65.0 Male :3 ex-smoker:1 1st Qu.:23.46
Median :168.0 Median :76.0 smoker :1 Median :25.21
Mean :165.2 Mean :73.2 Mean :26.93
3rd Qu.:173.0 3rd Qu.:80.0 3rd Qu.:28.34
Max. :180.0 Max. :92.0 Max. :35.94
plot(mydata)
data(mtcars)
head(mtcars,10)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
str(mtcars)
'data.frame': 32 obs. of 11 variables:
$ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
$ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
$ disp: num 160 160 108 258 360 ...
$ hp : num 110 110 93 110 175 105 245 62 95 123 ...
$ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
$ wt : num 2.62 2.88 2.32 3.21 3.44 ...
$ qsec: num 16.5 17 18.6 19.4 17 ...
$ vs : num 0 0 1 1 0 1 0 1 1 1 ...
$ am : num 1 1 1 0 0 0 0 0 0 0 ...
$ gear: num 4 4 4 3 3 3 3 4 4 4 ...
$ carb: num 4 4 1 1 2 1 4 2 2 4 ...
summary(mtcars)
mpg cyl disp hp
Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
Median :19.20 Median :6.000 Median :196.3 Median :123.0
Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
drat wt qsec vs
Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
Median :3.695 Median :3.325 Median :17.71 Median :0.0000
Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
am gear carb
Min. :0.0000 Min. :3.000 Min. :1.000
1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
Median :0.0000 Median :4.000 Median :2.000
Mean :0.4062 Mean :3.688 Mean :2.812
3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :1.0000 Max. :5.000 Max. :8.000
mtcars$mpg
[1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
[15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
[29] 15.8 19.7 15.0 21.4
stem(mtcars$mpg)
The decimal point is at the |
10 | 44
12 | 3
14 | 3702258
16 | 438
18 | 17227
20 | 00445
22 | 88
24 | 4
26 | 03
28 |
30 | 44
32 | 49
hist(mtcars$mpg)
boxplot(mtcars$mpg)
fivenum(mtcars$mpg)
[1] 10.40 15.35 19.20 22.80 33.90
quantile(mtcars$mpg)
0% 25% 50% 75% 100%
10.400 15.425 19.200 22.800 33.900
order(mtcars$mpg)
[1] 15 16 24 7 17 31 14 23 22 29 12 13 11 6 5 10 25 30 1 2 4 32 21
[24] 3 9 8 27 26 19 28 18 20
mtcars=mtcars[order(mtcars$mpg),]
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Cadillac Fleetwood 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
Camaro Z28 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
Maserati Bora 15.0 8 301 335 3.54 3.570 14.60 0 1 5 8
rownames(mtcars)
[1] "Cadillac Fleetwood" "Lincoln Continental" "Camaro Z28"
[4] "Duster 360" "Chrysler Imperial" "Maserati Bora"
[7] "Merc 450SLC" "AMC Javelin" "Dodge Challenger"
[10] "Ford Pantera L" "Merc 450SE" "Merc 450SL"
[13] "Merc 280C" "Valiant" "Hornet Sportabout"
[16] "Merc 280" "Pontiac Firebird" "Ferrari Dino"
[19] "Mazda RX4" "Mazda RX4 Wag" "Hornet 4 Drive"
[22] "Volvo 142E" "Toyota Corona" "Datsun 710"
[25] "Merc 230" "Merc 240D" "Porsche 914-2"
[28] "Fiat X1-9" "Honda Civic" "Lotus Europa"
[31] "Fiat 128" "Toyota Corolla"
order(rownames(mtcars))
[1] 8 1 3 5 24 9 4 18 31 28 10 29 21 15 2 30 6 19 20 25 26 16 13
[24] 11 12 7 17 27 32 23 14 22
mtcars=mtcars[order(rownames(mtcars)),]
mtcars=mtcars[order(mtcars$mpg,mtcars$wt),]
# 4기통, 6기통, 8기통 중 4,6기통 만 선택
table(mtcars$cyl)
4 6 8
11 7 14
mtcars$cyl<7
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
[23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
mtcars1=mtcars[mtcars$cyl<7,]
table(mtcars1$cyl)
4 6
11 7
# subset(data.frame, subset(행), select(열))
mtcars1=subset(mtcars,subset=cyl<7)
mtcars1
mpg cyl disp hp drat wt qsec vs am gear carb
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
mtcars2=subset(mtcars1,select=c(mpg,cyl))
mtcars2
mpg cyl
Merc 280C 17.8 6
Valiant 18.1 6
Merc 280 19.2 6
Ferrari Dino 19.7 6
Mazda RX4 21.0 6
Mazda RX4 Wag 21.0 6
Volvo 142E 21.4 4
Hornet 4 Drive 21.4 6
Toyota Corona 21.5 4
Datsun 710 22.8 4
Merc 230 22.8 4
Merc 240D 24.4 4
Porsche 914-2 26.0 4
Fiat X1-9 27.3 4
Lotus Europa 30.4 4
Honda Civic 30.4 4
Fiat 128 32.4 4
Toyota Corolla 33.9 4
mtcars3=subset(mtcars1,subset=cyl==4, select=mpg)
mtcars3
mpg
Volvo 142E 21.4
Toyota Corona 21.5
Datsun 710 22.8
Merc 230 22.8
Merc 240D 24.4
Porsche 914-2 26.0
Fiat X1-9 27.3
Lotus Europa 30.4
Honda Civic 30.4
Fiat 128 32.4
Toyota Corolla 33.9
mtcars4=subset(mtcars1,subset=cyl==4, select=cyl)
mtcars4
cyl
Volvo 142E 4
Toyota Corona 4
Datsun 710 4
Merc 230 4
Merc 240D 4
Porsche 914-2 4
Fiat X1-9 4
Lotus Europa 4
Honda Civic 4
Fiat 128 4
Toyota Corolla 4
cbind(mtcars3,mtcars4)
mpg cyl
Volvo 142E 21.4 4
Toyota Corona 21.5 4
Datsun 710 22.8 4
Merc 230 22.8 4
Merc 240D 24.4 4
Porsche 914-2 26.0 4
Fiat X1-9 27.3 4
Lotus Europa 30.4 4
Honda Civic 30.4 4
Fiat 128 32.4 4
Toyota Corolla 33.9 4
mtcars5=subset(mtcars2,mpg<21)
mtcars5
mpg cyl
Merc 280C 17.8 6
Valiant 18.1 6
Merc 280 19.2 6
Ferrari Dino 19.7 6
mtcars6=mtcars2[mtcars2$mpg>21,]
mtcars6
mpg cyl
Volvo 142E 21.4 4
Hornet 4 Drive 21.4 6
Toyota Corona 21.5 4
Datsun 710 22.8 4
Merc 230 22.8 4
Merc 240D 24.4 4
Porsche 914-2 26.0 4
Fiat X1-9 27.3 4
Lotus Europa 30.4 4
Honda Civic 30.4 4
Fiat 128 32.4 4
Toyota Corolla 33.9 4
rbind(mtcars5,mtcars6)
mpg cyl
Merc 280C 17.8 6
Valiant 18.1 6
Merc 280 19.2 6
Ferrari Dino 19.7 6
Volvo 142E 21.4 4
Hornet 4 Drive 21.4 6
Toyota Corona 21.5 4
Datsun 710 22.8 4
Merc 230 22.8 4
Merc 240D 24.4 4
Porsche 914-2 26.0 4
Fiat X1-9 27.3 4
Lotus Europa 30.4 4
Honda Civic 30.4 4
Fiat 128 32.4 4
Toyota Corolla 33.9 4
name=c("김철수","이영희","홍길동")
국어=c(90,95,80)
수학=c(100,95,85)
성적1=data.frame(name,국어)
성적1
name 국어
1 김철수 90
2 이영희 95
3 홍길동 80
성적2=data.frame(name,수학)
성적2
name 수학
1 김철수 100
2 이영희 95
3 홍길동 85
성적=merge(성적1,성적2,by="name")
성적
name 국어 수학
1 김철수 90 100
2 이영희 95 95
3 홍길동 80 85
name=c(name,"문건웅")
과학=c(80,85,90,100)
성적3=data.frame(name,과학)
성적3
name 과학
1 김철수 80
2 이영희 85
3 홍길동 90
4 문건웅 100
cbind(성적,성적3) ##데이타의 길이가 차이가 나면 합쳐지지 않는다.
Error in data.frame(..., check.names = FALSE): arguments imply differing number of rows: 3, 4
merge(성적,성적3,by="name")
name 국어 수학 과학
1 김철수 90 100 80
2 이영희 95 95 85
3 홍길동 80 85 90
merge(성적,성적3,by="name",all=TRUE)
name 국어 수학 과학
1 김철수 90 100 80
2 이영희 95 95 85
3 홍길동 80 85 90
4 문건웅 NA NA 100
table(mtcars$cyl)
4 6 8
11 7 14
help(mtcars)
table(mtcars$cyl,mtcars$am)
0 1
4 3 8
6 4 3
8 12 2
mtcars$tm=factor(mtcars$am,labels=c("automatic","manual"))
# mtcars$tm=ifelse(mtcars$am==0,"automatic","manual")
str(mtcars)
'data.frame': 32 obs. of 12 variables:
$ mpg : num 10.4 10.4 13.3 14.3 14.7 15 15.2 15.2 15.5 15.8 ...
$ cyl : num 8 8 8 8 8 8 8 8 8 8 ...
$ disp: num 472 460 350 360 440 ...
$ hp : num 205 215 245 245 230 335 150 180 150 264 ...
$ drat: num 2.93 3 3.73 3.21 3.23 3.54 3.15 3.07 2.76 4.22 ...
$ wt : num 5.25 5.42 3.84 3.57 5.34 ...
$ qsec: num 18 17.8 15.4 15.8 17.4 ...
$ vs : num 0 0 0 0 0 0 0 0 0 0 ...
$ am : num 0 0 0 0 0 1 0 0 0 1 ...
$ gear: num 3 3 3 3 3 5 3 3 3 5 ...
$ carb: num 4 4 4 4 4 8 2 3 2 4 ...
$ tm : Factor w/ 2 levels "automatic","manual": 1 1 1 1 1 2 1 1 1 2 ...
result=table(mtcars$cyl,mtcars$tm)
result
automatic manual
4 3 8
6 4 3
8 12 2
chisq.test(result)
Warning in chisq.test(result): Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: result
X-squared = 8.7407, df = 2, p-value = 0.01265
plot(result)
barplot(result,legend=paste(rownames(result),"cyl"))
#xtabs(도수~가로+세로)
result1=xtabs(~cyl+tm,data=mtcars)
result1
tm
cyl automatic manual
4 3 8
6 4 3
8 12 2
addmargins(result1)
tm
cyl automatic manual Sum
4 3 8 11
6 4 3 7
8 12 2 14
Sum 19 13 32
chisq.test(result1)
Warning in chisq.test(result1): Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: result1
X-squared = 8.7407, df = 2, p-value = 0.01265
#fisher.test(result1)
plot(mtcars)
# 실린더 수(cyl)에 따른 연비 평균
tapply(mtcars$mpg,mtcars$cyl,mean)
4 6 8
26.66364 19.74286 15.10000
aggregate(mpg~cyl,data=mtcars,mean)
cyl mpg
1 4 26.66364
2 6 19.74286
3 8 15.10000
aggregate(mpg~cyl+am,data=mtcars,mean)
cyl am mpg
1 4 0 22.90000
2 6 0 19.12500
3 8 0 15.05000
4 4 1 28.07500
5 6 1 20.56667
6 8 1 15.40000
# 실린더 수(cyl)에 따른 엔진 출력(마력) 평균
tapply(mtcars$hp,mtcars$cyl,mean)
4 6 8
82.63636 122.28571 209.21429
plot(mpg~cyl,data=mtcars)
boxplot(mpg~cyl,data=mtcars)
out=lm(mpg~factor(cyl),data=mtcars)
anova(out)
Analysis of Variance Table
Response: mpg
Df Sum Sq Mean Sq F value Pr(>F)
factor(cyl) 2 824.78 412.39 39.697 4.979e-09 ***
Residuals 29 301.26 10.39
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Height=c(168,173,160,145,NA,180)
mean(Height)
[1] NA
!is.na(Height)
[1] TRUE TRUE TRUE TRUE FALSE TRUE
mean(Height[!is.na(Height)])
[1] 165.2
mean(Height,na.rm=TRUE)
[1] 165.2
# 마력과 연비
cor.test(mtcars$mpg,mtcars$hp)
Pearson's product-moment correlation
data: mtcars$mpg and mtcars$hp
t = -6.7424, df = 30, p-value = 1.788e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.8852686 -0.5860994
sample estimates:
cor
-0.7761684
with(mtcars,cor.test(mpg,hp))
Pearson's product-moment correlation
data: mpg and hp
t = -6.7424, df = 30, p-value = 1.788e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.8852686 -0.5860994
sample estimates:
cor
-0.7761684
plot(mpg~hp,data=mtcars)
out1=lm(mpg~hp,data=mtcars)
summary(out1)
Call:
lm(formula = mpg ~ hp, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-5.7121 -2.1122 -0.8854 1.5819 8.2360
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 30.09886 1.63392 18.421 < 2e-16 ***
hp -0.06823 0.01012 -6.742 1.79e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.863 on 30 degrees of freedom
Multiple R-squared: 0.6024, Adjusted R-squared: 0.5892
F-statistic: 45.46 on 1 and 30 DF, p-value: 1.788e-07
abline(out1,col="red")
자료를 다루다 보면 연속형 자료에서 새로운 범주형 자료를 만들어야 할 때가 있다. ggplot2 패키지에 있는 diamonds 자료를 예로 들어 보면 다음과 같다.
library(ggplot2)
data(diamonds)
str(diamonds)
'data.frame': 53940 obs. of 10 variables:
$ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num 55 61 65 58 58 57 57 55 61 61 ...
$ price : int 326 326 327 334 335 336 336 337 337 338 ...
$ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
summary(diamonds)
carat cut color clarity
Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
Max. :5.0100 I: 5422 VVS1 : 3655
J: 2808 (Other): 2531
depth table price x
Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
Median :61.80 Median :57.00 Median : 2401 Median : 5.700
Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
y z
Min. : 0.000 Min. : 0.000
1st Qu.: 4.720 1st Qu.: 2.910
Median : 5.710 Median : 3.530
Mean : 5.735 Mean : 3.539
3rd Qu.: 6.540 3rd Qu.: 4.040
Max. :58.900 Max. :31.800
다이아몬드 가격이 제일 싼 것은 326불 제일 비싼 것은 18823불이다.
diamonds 데이타에 PriceGroup이라는 새로운 변수를 만들고 1000불 미만은 1, 1000불 이상 5000불 미만은 2, 5000불 이상은 3으로 바꾸려면 다음과 같이 한다.
diamonds$PriceGroup=1
diamonds$PriceGroup[diamonds$price>=1000]=2
diamonds$PriceGroup[diamonds$price>=5000]=3
table(diamonds$PriceGroup)
1 2 3
14499 24714 14727
diamonds$PriceGroup=ifelse(diamonds$price<1000,1,ifelse(diamonds$price<5000,2,3))
table(diamonds$PriceGroup)
1 2 3
14499 24714 14727
diamonds$PriceGroup=cut(diamonds$price,breaks=c(0,999,4999,99999),labels=c(1,2,3))
table(diamonds$PriceGroup)
1 2 3
14499 24714 14727
예를 들어 전체 다이아몬드 가격을 1등부터 53940등까지 순위를 매기고, 이를 같은 개수만큼 k개의 군으로 나누고 싶다면 어떻게 할까? 다음과 같은 함수를 만들어 보았다. rank2group 함수는 y라는 벡터와 군의 수 k(기본값 4)를 인자로 받아, 순위에 따라 k개의 군으로 나눈 군 번호를 담은 새로운 벡터를 반환한다. 사용법은 다음과 같다.
# Split the values of `y` into `k` rank-based groups of (nearly) equal size.
#
# Arguments:
#   y: vector to be ranked (anything accepted by rank()).
#   k: number of groups to form; a single number >= 1 (default 4).
#
# Returns a numeric vector of the same length as `y` holding group numbers,
# 1 for the lowest ranks up to at most `k` for the highest. Tied values share
# the lowest rank of the tie (ties.method = "min"), so they always fall into
# the same group and group sizes can differ slightly. Missing values in `y`
# yield NA rather than being ranked last and pushed into the top group.
rank2group <- function (y,k=4){
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 1)

  # Rank only the observed values; NA inputs keep an NA rank.
  z <- rank(y, ties.method = "min", na.last = "keep")
  count <- sum(!is.na(z))

  # Equivalent to floor((z - 1) / (count / k)) + 1, but multiplying before the
  # integer division avoids the rounding error of a non-terminating count / k.
  ((z - 1) * k) %/% count + 1
}
diamonds$PriceGroup=rank2group(diamonds$price,4)
table(diamonds$PriceGroup)
1 2 3 4
13490 13495 13470 13485
aggregate(price~PriceGroup,data=diamonds,range)
PriceGroup price.1 price.2
1 1 326 950
2 2 951 2401
3 3 2402 5324
4 4 5325 18823
가격이 겹치는 데이타(즉, 순위가 같은 데이타)가 있어 네 군의 n수가 서로 조금씩 다르기는 하지만 우리가 원하는 대로 작동한다. 세 군, 다섯 군으로 나누려면 다음과 같이 하면 된다.
diamonds$PriceGroup3=rank2group(diamonds$price,3)
table(diamonds$PriceGroup3)
1 2 3
17996 17964 17980
aggregate(price~PriceGroup3,data=diamonds,range)
PriceGroup3 price.1 price.2
1 1 326 1240
2 2 1241 4287
3 3 4288 18823
diamonds$PriceGroup5=rank2group(diamonds$price,5)
table(diamonds$PriceGroup5)
1 2 3 4 5
10796 10784 10789 10783 10788
aggregate(price~PriceGroup5,data=diamonds,range)
PriceGroup5 price.1 price.2
1 1 326 837
2 2 838 1698
3 3 1699 3465
4 4 3466 6301
5 5 6302 18823