# Area of a circle with radius r.
r <- 2
circle <- pi * r^2

# Average of a total spread over n items.
total <- 100
n <- 10
average <- total / n

# R as a calculator: exponentiation and parenthesised arithmetic.
5^2
## [1] 25
3 * (1 + 2)
## [1] 9
변수에 데이터를 넣는 방법
변수에 데이터 할당: =, <-
combine 함수 사용 (c)
sequence 연산자 사용 (:)
sequence 함수 사용 (seq)
repeat 함수 사용 (rep)
# Plain assignment: both `=` and `<-` work at top level.
x = 1
y <- 2

# Build a vector with combine, print it, then pick its second element.
a <- c(1, 2, 3)
a
## [1] 1 2 3
a[2]
## [1] 2

# The colon operator generates the integer sequence 1, 2, ..., 10.
b <- 1:10
b[9]
## [1] 9

# seq(): one argument counts 1..5; three arguments step from 1 to 3 by 0.25.
c <- seq(5)
d <- seq(from = 1, to = 3, by = 0.25)

# Concatenate two vectors, and repeat a vector three times.
e <- c(a, b)
f <- rep(a, times = 3)
f
## [1] 1 2 3 1 2 3 1 2 3
숫자형(numeric) 12, 4, 0.45
논리형(logical) TRUE, FALSE (축약형 T, F; 숫자 1, 0은 논리형으로 변환하면 TRUE, FALSE가 된다)
복소수형(complex) 3+2i
문자형(character) "St.Vincent's Hospital", "123", '3.14'
벡터(vector)
행렬(matrix)
배열(array)
데이터프레임(data frame)
리스트(list)
범주형자료(categorical variable)
시계열(Time series)
# Grow a vector by combining it with extra elements.
a <- c(1:5, 101, 102)
b <- c(a, 103)
b
## [1] 1 2 3 4 5 101 102 103

# Element-wise arithmetic: BMI from Weight and Height (Height is divided by
# 100 before squaring).
Height <- c(168, 173, 160, 145, 180)
Weight <- c(80, 65, 92, 53, 76)
BMI <- Weight / (Height / 100)^2
BMI
## [1] 28.34 21.72 35.94 25.21 23.46

# Recycling: the shorter vector c(1, -1) is reused along the longer one.
a <- 1:10
b <- c(1, -1)
a + b
## [1] 2 1 4 3 6 5 8 7 10 9

# One vectorised expression evaluated for both signs at once.
b <- 10
a <- c <- 2
b^2 + c(1, -1) * 4 * a * c
## [1] 116 84
# A 4 x 3 integer matrix; values fill column by column.
a <- matrix(1:12, ncol = 3)
a
## [,1] [,2] [,3]
## [1,] 1 5 9
## [2,] 2 6 10
## [3,] 3 7 11
## [4,] 4 8 12

# First twelve capital letters, first as a vector and then as a 3 x 4 matrix.
b <- LETTERS[1:12]
b
## [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L"
b <- matrix(b, ncol = 4)
b
## [,1] [,2] [,3] [,4]
## [1,] "A" "D" "G" "J"
## [2,] "B" "E" "H" "K"
## [3,] "C" "F" "I" "L"

# Indexing is [row, column]; leaving one side empty selects a whole row/column.
b[3, 2]
## [1] "F"
b[2, 4]
## [1] "K"
b[2, ]
## [1] "B" "E" "H" "K"
b[, 3]
## [1] "G" "H" "I"

# length() of a matrix counts all of its cells.
length(b)
## [1] 12
# A factor built from character data; levels are sorted alphabetically.
sex <- factor(c("Male", "Female", "Female", "Male", "Male"))
sex
## [1] Male Female Female Male Male
## Levels: Female Male
str(sex)
## Factor w/ 2 levels "Female","Male": 2 1 1 2 2
levels(sex)
## [1] "Female" "Male"
length(sex)
## [1] 5

# A factor built from numeric codes 1, 2, 3 with descriptive level labels.
smoking <- factor(c(1, 1, 2, 3, 1), labels = c("none", "ex-smoker", "smoker"))
smoking
## [1] none none ex-smoker smoker none
## Levels: none ex-smoker smoker
# Assemble the vectors and factors defined earlier into one data frame.
mydata <- data.frame(
  height = Height,
  weight = Weight,
  sex = sex,
  smoking = smoking
)
mydata
## height weight sex smoking
## 1 168 80 Male none
## 2 173 65 Female none
## 3 160 92 Female ex-smoker
## 4 145 53 Male smoker
## 5 180 76 Male none

# Row 3, then two equivalent ways to pull out the height column.
mydata[3, ]
## height weight sex smoking
## 3 160 92 Female ex-smoker
mydata[, 1]
## [1] 168 173 160 145 180
mydata$height
## [1] 168 173 160 145 180

# Add a derived BMI column computed from the existing columns.
mydata$BMI <- with(mydata, weight * 10000 / height^2)
mydata
## height weight sex smoking BMI
## 1 168 80 Male none 28.34
## 2 173 65 Female none 21.72
## 3 160 92 Female ex-smoker 35.94
## 4 145 53 Male smoker 25.21
## 5 180 76 Male none 23.46
# Compact structure listing: one line per column with its type and first values.
str(mydata)
## 'data.frame': 5 obs. of 5 variables:
## $ height : num 168 173 160 145 180
## $ weight : num 80 65 92 53 76
## $ sex : Factor w/ 2 levels "Female","Male": 2 1 1 2 2
## $ smoking: Factor w/ 3 levels "none","ex-smoker",..: 1 1 2 3 1
## $ BMI : num 28.3 21.7 35.9 25.2 23.5
# Per-column summary: quartiles and mean for numeric columns, counts per level
# for factor columns.
summary(mydata)
## height weight sex smoking BMI
## Min. :145 Min. :53.0 Female:2 none :3 Min. :21.7
## 1st Qu.:160 1st Qu.:65.0 Male :3 ex-smoker:1 1st Qu.:23.5
## Median :168 Median :76.0 smoker :1 Median :25.2
## Mean :165 Mean :73.2 Mean :26.9
## 3rd Qu.:173 3rd Qu.:80.0 3rd Qu.:28.3
## Max. :180 Max. :92.0 Max. :35.9
# Default plot of the whole data frame (a scatterplot matrix of its columns).
plot(mydata)
# Load the built-in mtcars data set and preview its first ten rows.
data("mtcars")
head(mtcars, n = 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# Structure of mtcars: one line per column with its type and leading values.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Minimum, quartiles, mean and maximum for every column of mtcars.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.4 Min. :4.00 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.4 1st Qu.:4.00 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.2 Median :6.00 Median :196.3 Median :123.0
## Mean :20.1 Mean :6.19 Mean :230.7 Mean :146.7
## 3rd Qu.:22.8 3rd Qu.:8.00 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.9 Max. :8.00 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.76 Min. :1.51 Min. :14.5 Min. :0.000
## 1st Qu.:3.08 1st Qu.:2.58 1st Qu.:16.9 1st Qu.:0.000
## Median :3.69 Median :3.33 Median :17.7 Median :0.000
## Mean :3.60 Mean :3.22 Mean :17.8 Mean :0.438
## 3rd Qu.:3.92 3rd Qu.:3.61 3rd Qu.:18.9 3rd Qu.:1.000
## Max. :4.93 Max. :5.42 Max. :22.9 Max. :1.000
## am gear carb
## Min. :0.000 Min. :3.00 Min. :1.00
## 1st Qu.:0.000 1st Qu.:3.00 1st Qu.:2.00
## Median :0.000 Median :4.00 Median :2.00
## Mean :0.406 Mean :3.69 Mean :2.81
## 3rd Qu.:1.000 3rd Qu.:4.00 3rd Qu.:4.00
## Max. :1.000 Max. :5.00 Max. :8.00
# Print the raw mpg column, then draw a text stem-and-leaf display of it.
mtcars[["mpg"]]
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
## [29] 15.8 19.7 15.0 21.4
stem(mtcars[["mpg"]])
##
## The decimal point is at the |
##
## 10 | 44
## 12 | 3
## 14 | 3702258
## 16 | 438
## 18 | 17227
## 20 | 00445
## 22 | 88
## 24 | 4
## 26 | 03
## 28 |
## 30 | 44
## 32 | 49
# Graphical summaries of the mpg distribution.
hist(mtcars$mpg)
boxplot(mtcars$mpg)

# Numerical summaries: Tukey's five numbers and the default quantiles.
fivenum(mtcars$mpg)
## [1] 10.40 15.35 19.20 22.80 33.90
quantile(mtcars$mpg)
## 0% 25% 50% 75% 100%
## 10.40 15.43 19.20 22.80 33.90

# order() returns the row positions that would sort mpg ascending ...
order(mtcars$mpg)
## [1] 15 16 24 7 17 31 14 23 22 29 12 13 11 6 5 10 25 30 1 2 4 32 21
## [24] 3 9 8 27 26 19 28 18 20
# ... and indexing the rows with that permutation sorts the data frame.
mtcars <- mtcars[with(mtcars, order(mpg)), ]
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Cadillac Fleetwood 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
## Camaro Z28 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
## Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
## Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
## Maserati Bora 15.0 8 301 335 3.54 3.570 14.60 0 1 5 8
# The car names are stored as the data frame's row names.
rownames(mtcars)
## [1] "Cadillac Fleetwood" "Lincoln Continental" "Camaro Z28"
## [4] "Duster 360" "Chrysler Imperial" "Maserati Bora"
## [7] "Merc 450SLC" "AMC Javelin" "Dodge Challenger"
## [10] "Ford Pantera L" "Merc 450SE" "Merc 450SL"
## [13] "Merc 280C" "Valiant" "Hornet Sportabout"
## [16] "Merc 280" "Pontiac Firebird" "Ferrari Dino"
## [19] "Mazda RX4" "Mazda RX4 Wag" "Hornet 4 Drive"
## [22] "Volvo 142E" "Toyota Corona" "Datsun 710"
## [25] "Merc 230" "Merc 240D" "Porsche 914-2"
## [28] "Fiat X1-9" "Honda Civic" "Lotus Europa"
## [31] "Fiat 128" "Toyota Corolla"
# Row positions that would sort the car names alphabetically.
order(rownames(mtcars))
## [1] 8 1 3 5 24 9 4 18 31 28 10 29 21 15 2 30 6 19 20 25 26 16 13
## [24] 11 12 7 17 27 32 23 14 22
# Sort by car name, then re-sort by mpg with wt breaking ties.
mtcars <- mtcars[order(rownames(mtcars)), ]
mtcars <- mtcars[with(mtcars, order(mpg, wt)), ]
# Out of the 4-, 6- and 8-cylinder cars, keep only the 4- and 6-cylinder ones.
table(mtcars$cyl)
##
## 4 6 8
## 11 7 14
# A logical mask: TRUE for rows with fewer than 7 cylinders.
mtcars$cyl < 7
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
mtcars1 <- mtcars[mtcars$cyl < 7, ]
table(mtcars1$cyl)
##
## 4 6
## 11 7
# subset(data.frame, subset = <row condition>, select = <columns>)
mtcars1 <- subset(mtcars, subset = cyl < 7)
mtcars2 <- subset(mtcars1, select = c(mpg, cyl))
# The original data frame is left unchanged by the subsetting above.
table(mtcars$cyl)
##
## 4 6 8
## 11 7 14
# Open the help page for the mtcars data set.
help(mtcars)
# Cross-tabulate cylinder count (rows) against the am code (columns).
table(mtcars$cyl, mtcars$am)
##
## 0 1
## 4 3 8
## 6 4 3
## 8 12 2
# Add a labelled transmission factor. The levels are spelled out so that the
# code-to-label mapping (0 -> "automatic", 1 -> "manual") does not depend on
# which codes happen to be present in the data.
mtcars$tm <- factor(
  mtcars$am,
  levels = c(0, 1),
  labels = c("automatic", "manual")
)
# Character (non-factor) alternative:
# mtcars$tm <- ifelse(mtcars$am == 0, "automatic", "manual")
str(mtcars)
## 'data.frame': 32 obs. of 12 variables:
## $ mpg : num 10.4 10.4 13.3 14.3 14.7 15 15.2 15.2 15.5 15.8 ...
## $ cyl : num 8 8 8 8 8 8 8 8 8 8 ...
## $ disp: num 472 460 350 360 440 ...
## $ hp : num 205 215 245 245 230 335 150 180 150 264 ...
## $ drat: num 2.93 3 3.73 3.21 3.23 3.54 3.15 3.07 2.76 4.22 ...
## $ wt : num 5.25 5.42 3.84 3.57 5.34 ...
## $ qsec: num 18 17.8 15.4 15.8 17.4 ...
## $ vs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ am : num 0 0 0 0 0 1 0 0 0 1 ...
## $ gear: num 3 3 3 3 3 5 3 3 3 5 ...
## $ carb: num 4 4 4 4 4 8 2 3 2 4 ...
## $ tm : Factor w/ 2 levels "automatic","manual": 1 1 1 1 1 2 1 1 1 2 ...
# Contingency table of cylinder count (rows) by transmission type (columns).
result <- table(mtcars$cyl, mtcars$tm)
result
##
## automatic manual
## 4 3 8
## 6 4 3
## 8 12 2
# Pearson chi-squared test on that table.
chisq.test(result)
## Warning: Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: result
## X-squared = 8.741, df = 2, p-value = 0.01265
# Plot the two-way table (the default plot for a table is a mosaic plot).
plot(result)
# Bar plot with one bar per transmission type and a legend naming each
# cylinder group. `legend.text` is written in full instead of relying on
# partial matching of `legend`.
barplot(result, legend.text = paste(rownames(result), "cyl"))
# xtabs(count ~ row variable + column variable); with an empty left-hand side
# the rows of `data` are counted.
result1 <- xtabs(~ cyl + tm, data = mtcars)
result1
## tm
## cyl automatic manual
## 4 3 8
## 6 4 3
## 8 12 2
# Append row and column totals ("Sum") to the contingency table.
addmargins(result1)
## tm
## cyl automatic manual Sum
## 4 3 8 11
## 6 4 3 7
## 8 12 2 14
## Sum 19 13 32
# Pearson chi-squared test on the xtabs table.
chisq.test(result1)
## Warning: Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: result1
## X-squared = 8.741, df = 2, p-value = 0.01265
# Alternative left disabled here: Fisher's exact test on the same table.
#fisher.test(result1)
# Default plot of the whole mtcars data frame.
plot(mtcars)
# Mean fuel economy (mpg) by number of cylinders.
with(mtcars, tapply(mpg, cyl, mean))
## 4 6 8
## 26.66 19.74 15.10
# Same group means via the formula interface, returned as a data frame.
aggregate(mpg ~ cyl, data = mtcars, FUN = mean)
## cyl mpg
## 1 4 26.66
## 2 6 19.74
## 3 8 15.10
# Group means for every combination of cyl and am.
aggregate(mpg ~ cyl + am, data = mtcars, FUN = mean)
## cyl am mpg
## 1 4 0 22.90
## 2 6 0 19.12
## 3 8 0 15.05
## 4 4 1 28.07
## 5 6 1 20.57
## 6 8 1 15.40
# Mean engine output (horsepower) by number of cylinders.
with(mtcars, tapply(hp, cyl, mean))
## 4 6 8
## 82.64 122.29 209.21
# mpg against cyl, as a scatter plot and as per-group box plots.
plot(mpg ~ cyl, data = mtcars)
boxplot(mpg ~ cyl, data = mtcars)
# One-way ANOVA of mpg across cylinder groups, fitted as a linear model with
# cyl treated as a factor.
out <- lm(mpg ~ factor(cyl), data = mtcars)
anova(out)
## Analysis of Variance Table
##
## Response: mpg
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(cyl) 2 825 412 39.7 5e-09 ***
## Residuals 29 301 10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# A vector with one missing value.
Height <- c(168, 173, 160, 145, NA, 180)

# Any NA makes the plain mean NA.
mean(Height)
## [1] NA

# Mask of the non-missing entries.
!is.na(Height)
## [1] TRUE TRUE TRUE TRUE FALSE TRUE

# Two equivalent ways to average only the observed values.
mean(Height[!is.na(Height)])
## [1] 165.2
mean(Height, na.rm = TRUE)
## [1] 165.2
# Horsepower versus fuel economy: Pearson correlation test.
cor.test(mtcars$mpg, mtcars$hp)
##
## Pearson's product-moment correlation
##
## data: mtcars$mpg and mtcars$hp
## t = -6.742, df = 30, p-value = 1.788e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8853 -0.5861
## sample estimates:
## cor
## -0.7762
# Same test, with the column names resolved inside mtcars via with().
with(mtcars, cor.test(mpg, hp))
##
## Pearson's product-moment correlation
##
## data: mpg and hp
## t = -6.742, df = 30, p-value = 1.788e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8853 -0.5861
## sample estimates:
## cor
## -0.7762
# Scatter plot of mpg against hp, then a simple linear regression of the same.
plot(mpg ~ hp, data = mtcars)
out1 <- lm(mpg ~ hp, data = mtcars)
summary(out1)
##
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.712 -2.112 -0.885 1.582 8.236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.0989 1.6339 18.42 < 2e-16 ***
## hp -0.0682 0.0101 -6.74 1.8e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.86 on 30 degrees of freedom
## Multiple R-squared: 0.602, Adjusted R-squared: 0.589
## F-statistic: 45.5 on 1 and 30 DF, p-value: 1.79e-07
# Overlay the fitted regression line, in red, on the current plot.
abline(out1, col = "red")
자료를 다루다 보면 연속형 자료에서 새로운 범주형 자료를 만들어야 할 때가 있다. ggplot2 패키지에 있는 diamonds 자료를 예로 들어 보자.
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked _by_ '.GlobalEnv':
##
## diamonds
# Load the diamonds data set and list its columns with their types.
data(diamonds)
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Per-column summary of diamonds (numeric summaries and factor level counts).
summary(diamonds)
## carat cut color clarity
## Min. :0.200 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.400 Good : 4906 E: 9797 VS2 :12258
## Median :0.700 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.798 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.040 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.010 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.0 Min. :43.0 Min. : 326 Min. : 0.00
## 1st Qu.:61.0 1st Qu.:56.0 1st Qu.: 950 1st Qu.: 4.71
## Median :61.8 Median :57.0 Median : 2401 Median : 5.70
## Mean :61.8 Mean :57.5 Mean : 3933 Mean : 5.73
## 3rd Qu.:62.5 3rd Qu.:59.0 3rd Qu.: 5324 3rd Qu.: 6.54
## Max. :79.0 Max. :95.0 Max. :18823 Max. :10.74
##
## y z
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 4.72 1st Qu.: 2.91
## Median : 5.71 Median : 3.53
## Mean : 5.73 Mean : 3.54
## 3rd Qu.: 6.54 3rd Qu.: 4.04
## Max. :58.90 Max. :31.80
##
다이아몬드 가격이 제일 싼 것은 326불, 제일 비싼 것은 18823불이다.
diamonds 데이터에 PriceGroup이라는 새로운 변수를 만들고 1000불 미만은 1, 1000불 이상 5000불 미만은 2, 5000불 이상은 3으로 지정하려면 다음 세 가지 방법 중 하나를 쓰면 된다.
# Three ways to bin price into PriceGroup:
#   1 = below 1000, 2 = from 1000 up to (not including) 5000, 3 = 5000 and above.

# (1) Start every row in group 1, then overwrite by logical indexing.
diamonds$PriceGroup <- 1
diamonds$PriceGroup[diamonds$price >= 1000] <- 2
diamonds$PriceGroup[diamonds$price >= 5000] <- 3
table(diamonds$PriceGroup)
##
## 1 2 3
## 14499 24714 14727

# (2) Nested, vectorised ifelse().
diamonds$PriceGroup <- ifelse(
  diamonds$price < 1000, 1,
  ifelse(diamonds$price < 5000, 2, 3)
)
table(diamonds$PriceGroup)
##
## 1 2 3
## 14499 24714 14727

# (3) cut(), which returns a factor rather than a number. right = FALSE makes
# the intervals [0, 1000), [1000, 5000) and [5000, Inf), so the boundaries
# match methods (1) and (2) for any non-negative price, not only for integer
# prices below 100000.
diamonds$PriceGroup <- cut(
  diamonds$price,
  breaks = c(0, 1000, 5000, Inf),
  labels = c(1, 2, 3),
  right = FALSE
)
table(diamonds$PriceGroup)
##
## 1 2 3
## 14499 24714 14727
예를 들어 전체 다이아몬드 가격을 1등부터 53940등까지 순위를 매기고 이를 같은 개수만큼 k개의 군으로 나누고 싶다면 어떻게 할까? 다음과 같은 함수를 만들어 보았다. rank2group 함수는 y라는 벡터를 인자로 받아 순위에 따라 k개의 군으로 나눈 새로운 벡터를 반환한다. 사용법은 다음과 같다.
# Split a vector into k rank-based groups of (nearly) equal size.
#
# Values are ranked in ascending order with ties sharing the lowest rank
# (ties.method = "min"), so equal values always land in the same group. This
# is also why group sizes can differ slightly when ties straddle a boundary.
#
# Args:
#   y: vector to be grouped (anything rank() accepts).
#   k: number of groups; a single number >= 1. Defaults to 4.
#
# Returns:
#   A numeric vector the same length as y holding group numbers 1..k
#   (1 = smallest values). Missing values in y yield NA and are not counted
#   when sizing the groups.
rank2group <- function(y, k = 4) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 1)
  # na.last = "keep" leaves missing values as NA instead of ranking them last,
  # which would silently push them into the top group.
  z <- rank(y, ties.method = "min", na.last = "keep")
  count <- sum(!is.na(z))
  # Multiply before dividing so the quotient is formed from exact integers.
  floor((z - 1) * k / count) + 1
}
# Split the prices into four rank-based groups.
diamonds$PriceGroup <- rank2group(diamonds$price, k = 4)
table(diamonds$PriceGroup)
##
## 1 2 3 4
## 13490 13495 13470 13485
# Lowest and highest price within each group.
aggregate(price ~ PriceGroup, data = diamonds, FUN = range)
## PriceGroup price.1 price.2
## 1 1 326 950
## 2 2 951 2401
## 3 3 2402 5324
## 4 4 5325 18823
가격이 겹치는 데이터(즉, 순위가 같은 데이터)가 있어 네 군의 n 수가 조금씩 다르기는 하지만 우리가 원하는 대로 작동한다. 세 군, 다섯 군으로 나누려면 다음과 같이 하면 된다.
# Three rank-based groups, with the price range covered by each.
diamonds$PriceGroup3 <- rank2group(diamonds$price, k = 3)
table(diamonds$PriceGroup3)
##
## 1 2 3
## 17996 17964 17980
aggregate(price ~ PriceGroup3, data = diamonds, FUN = range)
## PriceGroup3 price.1 price.2
## 1 1 326 1240
## 2 2 1241 4287
## 3 3 4288 18823

# Five rank-based groups, with the price range covered by each.
diamonds$PriceGroup5 <- rank2group(diamonds$price, k = 5)
table(diamonds$PriceGroup5)
##
## 1 2 3 4 5
## 10796 10784 10789 10783 10788
aggregate(price ~ PriceGroup5, data = diamonds, FUN = range)
## PriceGroup5 price.1 price.2
## 1 1 326 837
## 2 2 838 1698
## 3 3 1699 3465
## 4 4 3466 6301
## 5 5 6302 18823