7/25 (수) 43차시

R데이터 자료형

  1. vector :같은 데이터 타입을 갖는 1차원 배열
  2. list :서로 다른 데이터 타입을 갖는 1차원 배열, 중첩 가능
  3. matrix :같은 데이터 타입을 갖는 2차원 배열
  4. array :같은 데이터 타입을 갖는 다차원(n차원) 배열 (예: 3차원)
  5. factor :목록, 범주형 데이터
  6. data.frame :서로 다른 데이터 타입을 갖는 컬럼으로 이루어진 2차원 배열
  7. data.table :data.frame과 동일한 구조를 가지며 속도가 빠르다 (data.table 패키지)
# Build one sample object per data type; the `; name` after each assignment echoes the new value.
# Numeric vector of length 2.
a<-c(1,2) ; a
## [1] 1 2
# One-element list holding a character vector: c() coerces 100 to "100", as the echo below shows.
b<-list(c('king', 100)); b
## [[1]]
## [1] "king" "100"
# matrix() called without dimensions yields a single column (2 x 1 here).
# NOTE(review): the variable `c` has the same name as the base function c(); the c(...) calls
# on the following lines still work, but the name is easy to misread.
c<-matrix(c(1,2)); c
##      [,1]
## [1,]    1
## [2,]    2
# 2 x 2 x 3 array filled from 1:12, column by column within each slice.
d<-array(1:12, dim=c(2,2,3)); d
## , , 1
## 
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    5    7
## [2,]    6    8
## 
## , , 3
## 
##      [,1] [,2]
## [1,]    9   11
## [2,]   10   12
# Factor with two levels; the levels are listed alphabetically (female, male), not in input order.
e<-factor(c('male', 'female')); e
## [1] male   female
## Levels: female male
# Data frame with a single numeric column x and two rows.
f<-data.frame(x=c(1,2)); f
##   x
## 1 1
## 2 2
# class(): the high-level type of each sample object.
# NOTE(review): the recorded output predates R 4.0.0; from 4.0.0 on, class() of a matrix
# returns c("matrix", "array") — confirm against the R version in use.
class(a); class(b); class(c); class(d); class(e); class(f)
## [1] "numeric"
## [1] "list"
## [1] "matrix"
## [1] "array"
## [1] "factor"
## [1] "data.frame"
# mode(): the underlying storage mode. The matrix, array and factor report "numeric";
# the data frame reports "list".
mode(a); mode(b); mode(c); mode(d); mode(e); mode(f)
## [1] "numeric"
## [1] "list"
## [1] "numeric"
## [1] "numeric"
## [1] "numeric"
## [1] "list"
# str(): compact summary of each object's type, dimensions and first values.
str(a); str(b); str(c); str(d); str(e); str(f)
##  num [1:2] 1 2
## List of 1
##  $ : chr [1:2] "king" "100"
##  num [1:2, 1] 1 2
##  int [1:2, 1:2, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
##  Factor w/ 2 levels "female","male": 2 1
## 'data.frame':    2 obs. of  1 variable:
##  $ x: num  1 2
# a is stored as num (see str above), so it is numeric but neither character nor integer.
is.numeric(a); is.character(a); is.integer(a)
## [1] TRUE
## [1] FALSE
## [1] FALSE
# One type predicate per object; each returns TRUE.
is.factor(e); is.matrix(c); is.array(d); is.data.frame(f); is.list(b)
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE

read.csv() / read.csv(choose.files(), header= )

-csv파일을 데이터 프레임으로 읽어들이는 함수
-getwd( )로 디렉토리 위치 확인
-setwd( )로 물리적인 디렉토리 위치 지정

# Read a CSV picked through the file dialog into a data frame; the first line supplies the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
emp <- read.csv(choose.files(), header = TRUE)
# Inspect the column types. In the recorded output the text columns (LAST_NAME, EMAIL, ...) are
# factors, i.e. stored as level codes, which can cause problems when they are handled as strings.
# NOTE(review): that output reflects stringsAsFactors defaulting to TRUE (R < 4.0.0).
str(emp)
## 'data.frame':    107 obs. of  11 variables:
##  $ EMPLOYEE_ID   : int  198 199 200 201 202 203 204 205 206 100 ...
##  $ FIRST_NAME    : Factor w/ 91 levels "Adam","Alana",..: 21 22 39 59 66 83 31 76 90 80 ...
##  $ LAST_NAME     : Factor w/ 102 levels "Abel","Ande",..: 69 37 101 41 28 63 5 42 36 50 ...
##  $ EMAIL         : Factor w/ 107 levels "ABANDA","ABULL",..: 24 20 52 67 77 93 32 89 105 90 ...
##  $ PHONE_NUMBER  : Factor w/ 107 levels "011.44.1343.329268",..: 98 99 36 40 62 41 44 42 43 37 ...
##  $ HIRE_DATE     : int  20070621 20080113 20030917 20040217 20050817 20020607 20020607 20020607 20020607 20030617 ...
##  $ JOB_ID        : Factor w/ 19 levels "AC_ACCOUNT","AC_MGR",..: 17 17 3 10 11 8 12 2 1 4 ...
##  $ SALARY        : int  2600 2600 4400 13000 6000 6500 10000 12008 8300 29040 ...
##  $ COMMISSION_PCT: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MANAGER_ID    : int  124 124 101 100 201 101 101 101 205 NA ...
##  $ DEPARTMENT_ID : int  50 50 10 20 20 40 70 110 110 90 ...
# Read the CSV with header = FALSE: the first line is treated as a data row and the columns
# are auto-named V1, V2, ... FALSE is spelled out because F is a reassignable variable.
emp <- read.csv(choose.files(), header = FALSE)
# In the recorded output there is one extra row (108 obs.) and every column, including the
# numeric ones, comes in as a factor because the header text is mixed into each column.
str(emp)
## 'data.frame':    108 obs. of  11 variables:
##  $ V1 : Factor w/ 108 levels "100","101","102",..: 108 99 100 101 102 103 104 105 106 107 ...
##  $ V2 : Factor w/ 92 levels "Adam","Alana",..: 26 21 22 40 60 67 84 32 77 91 ...
##  $ V3 : Factor w/ 103 levels "Abel","Ande",..: 55 70 37 102 41 28 64 5 42 36 ...
##  $ V4 : Factor w/ 108 levels "ABANDA","ABULL",..: 28 24 20 53 68 78 94 33 90 106 ...
##  $ V5 : Factor w/ 108 levels "011.44.1343.329268",..: 108 98 99 36 40 62 41 44 42 43 ...
##  $ V6 : Factor w/ 99 levels "20010113","20020607",..: 99 81 90 10 15 38 2 2 2 2 ...
##  $ V7 : Factor w/ 20 levels "AC_ACCOUNT","AC_MGR",..: 10 18 18 3 11 12 8 13 2 1 ...
##  $ V8 : Factor w/ 59 levels "10000","10500",..: 59 15 15 32 7 35 39 1 6 52 ...
##  $ V9 : Factor w/ 9 levels "","0.1","0.15",..: 9 1 1 1 1 1 1 1 1 1 ...
##  $ V10: Factor w/ 20 levels "","100","101",..: 20 12 12 3 2 18 3 3 3 19 ...
##  $ V11: Factor w/ 13 levels "","10","100",..: 13 8 8 2 5 5 7 10 4 4 ...
# Read the CSV with a header row and keep text columns as character vectors instead of factors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
emp <- read.csv(choose.files(), header = TRUE, stringsAsFactors = FALSE)
# The text columns now show up as chr in the structure summary.
str(emp)
## 'data.frame':    107 obs. of  11 variables:
##  $ EMPLOYEE_ID   : int  198 199 200 201 202 203 204 205 206 100 ...
##  $ FIRST_NAME    : chr  "Donald" "Douglas" "Jennifer" "Michael" ...
##  $ LAST_NAME     : chr  "OConnell" "Grant" "Whalen" "Hartstein" ...
##  $ EMAIL         : chr  "DOCONNEL" "DGRANT" "JWHALEN" "MHARTSTE" ...
##  $ PHONE_NUMBER  : chr  "650.507.9833" "650.507.9844" "515.123.4444" "515.123.5555" ...
##  $ HIRE_DATE     : int  20070621 20080113 20030917 20040217 20050817 20020607 20020607 20020607 20020607 20030617 ...
##  $ JOB_ID        : chr  "SH_CLERK" "SH_CLERK" "AD_ASST" "MK_MAN" ...
##  $ SALARY        : int  2600 2600 4400 13000 6000 6500 10000 12008 8300 29040 ...
##  $ COMMISSION_PCT: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MANAGER_ID    : int  124 124 101 100 201 101 101 101 205 NA ...
##  $ DEPARTMENT_ID : int  50 50 10 20 20 40 70 110 110 90 ...
# Print the column names, then the full EMPLOYEE_ID vector, then a logical mask
# that is TRUE only where the id equals 100.
names(emp)
emp$EMPLOYEE_ID
emp$EMPLOYEE_ID == 100
##  [1] "EMPLOYEE_ID"    "FIRST_NAME"     "LAST_NAME"      "EMAIL"         
##  [5] "PHONE_NUMBER"   "HIRE_DATE"      "JOB_ID"         "SALARY"        
##  [9] "COMMISSION_PCT" "MANAGER_ID"     "DEPARTMENT_ID"
##   [1] 198 199 200 201 202 203 204 205 206 100 101 102 103 104 105 106 107
##  [18] 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
##  [35] 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
##  [52] 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
##  [69] 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
##  [86] 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
## [103] 193 194 195 196 197
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Use the logical mask as a row selector: first every column of the matching row,
# then only the LAST_NAME and SALARY columns.
emp[emp$EMPLOYEE_ID == 100, ]
emp[emp$EMPLOYEE_ID == 100, c('LAST_NAME', 'SALARY')]
##    EMPLOYEE_ID FIRST_NAME LAST_NAME EMAIL PHONE_NUMBER HIRE_DATE  JOB_ID
## 10         100     Steven      King SKING 515.123.4567  20030617 AD_PRES
##    SALARY COMMISSION_PCT MANAGER_ID DEPARTMENT_ID
## 10  29040             NA         NA            90
##    LAST_NAME SALARY
## 10      King  29040

grep함수 : 문자 패턴을 찾을 때 사용되는 함수

-^ : 문자열의 시작
-$ : 문자열의 끝
- . : 임의의 한 문자
- * : 바로 앞 문자가 0번 이상 반복 (SQL의 %에 해당하는 것은 .*)

# Rows whose LAST_NAME contains "aa"; grep() returns the matching positions, which select the rows.
emp[grep("aa", emp$LAST_NAME),c("LAST_NAME","SALARY")]
##    LAST_NAME SALARY
## 12   De Haan  17000
# Rows whose LAST_NAME contains x, y or z anywhere, in upper or lower case.
emp[grep("[x-z]", emp$LAST_NAME, ignore.case = TRUE),c("LAST_NAME","SALARY")]
##     LAST_NAME SALARY
## 5         Fay   6000
## 9       Gietz   8300
## 17    Lorentz   4200
## 24   Raphaely  11000
## 35      Nayer   3200
## 37     Landry   2400
## 57  Errazuriz  12000
## 59    Zlotkey  10500
## 67      Sully   9500
## 72    Vishney  10500
## 78       Ozer  11500
## 80        Fox   9600
## 86     Taylor   8600
## 90     Taylor   3200
## 99      Dilly   3600
## 107    Feeney   3000
# ignore.case = TRUE : matching is case-insensitive.
# ignore.case = FALSE : matching is case-sensitive.

문자 함수

1. nchar :문자 수를 리턴하는 함수 (SQL의 LENGTH와 유사; R의 length()는 벡터의 원소 개수를 리턴하므로 다름)

# nchar() with no type argument counts characters.
nchar('R Developer')
## [1] 11
nchar('R Developer', type="chars")  # count in characters; same result as the call above
## [1] 11
nchar('R Developer', type='bytes')  # count in bytes; equals the character count for this ASCII string
## [1] 11
# For Hangul the byte count differs from the character count: 4 characters, 8 bytes in the recorded run.
# NOTE(review): 2 bytes per syllable suggests a CP949/EUC-KR session; under UTF-8 each syllable
# takes 3 bytes — confirm for the encoding in use.
nchar('빅데이터'); nchar('빅데이터', type="chars"); nchar('빅데이터', type='bytes') # Hangul = 2 bytes each in this session
## [1] 4
## [1] 4
## [1] 8

2. strsplit :부분 문자로 분리하는 함수

# strsplit('R Developer') without a split argument is an error.
# An empty split (character(0)) and a vector of three empty strings (character(3)) both
# break the string into single characters, as the two identical outputs show.
strsplit('R Developer', split=character(0)); strsplit('R Developer', split=character(3))
## [[1]]
##  [1] "R" " " "D" "e" "v" "e" "l" "o" "p" "e" "r"
## [[1]]
##  [1] "R" " " "D" "e" "v" "e" "l" "o" "p" "e" "r"
strsplit('R Developer', split=' ')  # split on the space character
## [[1]]
## [1] "R"         "Developer"
strsplit('R Developer', split=',')  # split on commas; the string has none, so it comes back whole
## [[1]]
## [1] "R Developer"
# The result is a list with one element per input string.
str(strsplit('R Developer', split=','))
## List of 1
##  $ : chr "R Developer"
str(unlist(strsplit('R Developer', split=','))) # unlist() flattens the list into a character vector
##  chr "R Developer"
strsplit(emp$LAST_NAME, split=character(0)) # needs character columns, e.g. emp<-read.csv("emp.csv", header=T, stringsAsFactors=F)
## [[1]]
## [1] "O" "C" "o" "n" "n" "e" "l" "l"
## 
## [[2]]
## [1] "G" "r" "a" "n" "t"
## 
## [[3]]
## [1] "W" "h" "a" "l" "e" "n"
## 
## [[4]]
## [1] "H" "a" "r" "t" "s" "t" "e" "i" "n"
## 
## [[5]]
## [1] "F" "a" "y"
## 
## [[6]]
## [1] "M" "a" "v" "r" "i" "s"
## 
## [[7]]
## [1] "B" "a" "e" "r"
## 
## [[8]]
## [1] "H" "i" "g" "g" "i" "n" "s"
## 
## [[9]]
## [1] "G" "i" "e" "t" "z"
## 
## [[10]]
## [1] "K" "i" "n" "g"
## 
## [[11]]
## [1] "K" "o" "c" "h" "h" "a" "r"
## 
## [[12]]
## [1] "D" "e" " " "H" "a" "a" "n"
## 
## [[13]]
## [1] "H" "u" "n" "o" "l" "d"
## 
## [[14]]
## [1] "E" "r" "n" "s" "t"
## 
## [[15]]
## [1] "A" "u" "s" "t" "i" "n"
## 
## [[16]]
## [1] "P" "a" "t" "a" "b" "a" "l" "l" "a"
## 
## [[17]]
## [1] "L" "o" "r" "e" "n" "t" "z"
## 
## [[18]]
## [1] "G" "r" "e" "e" "n" "b" "e" "r" "g"
## 
## [[19]]
## [1] "F" "a" "v" "i" "e" "t"
## 
## [[20]]
## [1] "C" "h" "e" "n"
## 
## [[21]]
## [1] "S" "c" "i" "a" "r" "r" "a"
## 
## [[22]]
## [1] "U" "r" "m" "a" "n"
## 
## [[23]]
## [1] "P" "o" "p" "p"
## 
## [[24]]
## [1] "R" "a" "p" "h" "a" "e" "l" "y"
## 
## [[25]]
## [1] "K" "h" "o" "o"
## 
## [[26]]
## [1] "B" "a" "i" "d" "a"
## 
## [[27]]
## [1] "T" "o" "b" "i" "a" "s"
## 
## [[28]]
## [1] "H" "i" "m" "u" "r" "o"
## 
## [[29]]
##  [1] "C" "o" "l" "m" "e" "n" "a" "r" "e" "s"
## 
## [[30]]
## [1] "W" "e" "i" "s" "s"
## 
## [[31]]
## [1] "F" "r" "i" "p" "p"
## 
## [[32]]
## [1] "K" "a" "u" "f" "l" "i" "n" "g"
## 
## [[33]]
## [1] "V" "o" "l" "l" "m" "a" "n"
## 
## [[34]]
## [1] "M" "o" "u" "r" "g" "o" "s"
## 
## [[35]]
## [1] "N" "a" "y" "e" "r"
## 
## [[36]]
##  [1] "M" "i" "k" "k" "i" "l" "i" "n" "e" "n" "i"
## 
## [[37]]
## [1] "L" "a" "n" "d" "r" "y"
## 
## [[38]]
## [1] "M" "a" "r" "k" "l" "e"
## 
## [[39]]
## [1] "B" "i" "s" "s" "o" "t"
## 
## [[40]]
## [1] "A" "t" "k" "i" "n" "s" "o" "n"
## 
## [[41]]
## [1] "M" "a" "r" "l" "o" "w"
## 
## [[42]]
## [1] "O" "l" "s" "o" "n"
## 
## [[43]]
## [1] "M" "a" "l" "l" "i" "n"
## 
## [[44]]
## [1] "R" "o" "g" "e" "r" "s"
## 
## [[45]]
## [1] "G" "e" "e"
## 
## [[46]]
##  [1] "P" "h" "i" "l" "t" "a" "n" "k" "e" "r"
## 
## [[47]]
## [1] "L" "a" "d" "w" "i" "g"
## 
## [[48]]
## [1] "S" "t" "i" "l" "e" "s"
## 
## [[49]]
## [1] "S" "e" "o"
## 
## [[50]]
## [1] "P" "a" "t" "e" "l"
## 
## [[51]]
## [1] "R" "a" "j" "s"
## 
## [[52]]
## [1] "D" "a" "v" "i" "e" "s"
## 
## [[53]]
## [1] "M" "a" "t" "o" "s"
## 
## [[54]]
## [1] "V" "a" "r" "g" "a" "s"
## 
## [[55]]
## [1] "R" "u" "s" "s" "e" "l" "l"
## 
## [[56]]
## [1] "P" "a" "r" "t" "n" "e" "r" "s"
## 
## [[57]]
## [1] "E" "r" "r" "a" "z" "u" "r" "i" "z"
## 
## [[58]]
## [1] "C" "a" "m" "b" "r" "a" "u" "l" "t"
## 
## [[59]]
## [1] "Z" "l" "o" "t" "k" "e" "y"
## 
## [[60]]
## [1] "T" "u" "c" "k" "e" "r"
## 
## [[61]]
## [1] "B" "e" "r" "n" "s" "t" "e" "i" "n"
## 
## [[62]]
## [1] "H" "a" "l" "l"
## 
## [[63]]
## [1] "O" "l" "s" "e" "n"
## 
## [[64]]
## [1] "C" "a" "m" "b" "r" "a" "u" "l" "t"
## 
## [[65]]
## [1] "T" "u" "v" "a" "u" "l" "t"
## 
## [[66]]
## [1] "K" "i" "n" "g"
## 
## [[67]]
## [1] "S" "u" "l" "l" "y"
## 
## [[68]]
## [1] "M" "c" "E" "w" "e" "n"
## 
## [[69]]
## [1] "S" "m" "i" "t" "h"
## 
## [[70]]
## [1] "D" "o" "r" "a" "n"
## 
## [[71]]
## [1] "S" "e" "w" "a" "l" "l"
## 
## [[72]]
## [1] "V" "i" "s" "h" "n" "e" "y"
## 
## [[73]]
## [1] "G" "r" "e" "e" "n" "e"
## 
## [[74]]
## [1] "M" "a" "r" "v" "i" "n" "s"
## 
## [[75]]
## [1] "L" "e" "e"
## 
## [[76]]
## [1] "A" "n" "d" "e"
## 
## [[77]]
## [1] "B" "a" "n" "d" "a"
## 
## [[78]]
## [1] "O" "z" "e" "r"
## 
## [[79]]
## [1] "B" "l" "o" "o" "m"
## 
## [[80]]
## [1] "F" "o" "x"
## 
## [[81]]
## [1] "S" "m" "i" "t" "h"
## 
## [[82]]
## [1] "B" "a" "t" "e" "s"
## 
## [[83]]
## [1] "K" "u" "m" "a" "r"
## 
## [[84]]
## [1] "A" "b" "e" "l"
## 
## [[85]]
## [1] "H" "u" "t" "t" "o" "n"
## 
## [[86]]
## [1] "T" "a" "y" "l" "o" "r"
## 
## [[87]]
##  [1] "L" "i" "v" "i" "n" "g" "s" "t" "o" "n"
## 
## [[88]]
## [1] "G" "r" "a" "n" "t"
## 
## [[89]]
## [1] "J" "o" "h" "n" "s" "o" "n"
## 
## [[90]]
## [1] "T" "a" "y" "l" "o" "r"
## 
## [[91]]
## [1] "F" "l" "e" "a" "u" "r"
## 
## [[92]]
## [1] "S" "u" "l" "l" "i" "v" "a" "n"
## 
## [[93]]
## [1] "G" "e" "o" "n" "i"
## 
## [[94]]
## [1] "S" "a" "r" "c" "h" "a" "n" "d"
## 
## [[95]]
## [1] "B" "u" "l" "l"
## 
## [[96]]
## [1] "D" "e" "l" "l" "i" "n" "g" "e" "r"
## 
## [[97]]
## [1] "C" "a" "b" "r" "i" "o"
## 
## [[98]]
## [1] "C" "h" "u" "n" "g"
## 
## [[99]]
## [1] "D" "i" "l" "l" "y"
## 
## [[100]]
## [1] "G" "a" "t" "e" "s"
## 
## [[101]]
## [1] "P" "e" "r" "k" "i" "n" "s"
## 
## [[102]]
## [1] "B" "e" "l" "l"
## 
## [[103]]
## [1] "E" "v" "e" "r" "e" "t" "t"
## 
## [[104]]
## [1] "M" "c" "C" "a" "i" "n"
## 
## [[105]]
## [1] "J" "o" "n" "e" "s"
## 
## [[106]]
## [1] "W" "a" "l" "s" "h"
## 
## [[107]]
## [1] "F" "e" "e" "n" "e" "y"

3. toupper :대문자

# Convert every letter to upper case.
toupper('r developer')
## [1] "R DEVELOPER"

4. tolower :소문자

# Convert every letter to lower case.
tolower('R DEVELOPER')
## [1] "r developer"

5. substr :문자열 추출

substr('R Developer', 1, 1) # substr(x, start, stop): positions are 1-based and both ends are included
## [1] "R"
# Commas count as characters too: positions 1-2 give "1," and position 5 is "3".
substr('1,2,3,4,5,6,7,8,9', 1, 1); substr('1,2,3,4,5,6,7,8,9', 1, 2); substr('1,2,3,4,5,6,7,8,9', 5, 5)
## [1] "1"
## [1] "1,"
## [1] "3"

6. sub :첫 번째로 일치하는 문자만 바꾸는 함수

-sub('찾을 패턴', '바꿀 문자열', '원본 문자열')  
-sub('a', 'b', 'ac') >> 결과 :bc  
# Only the first "R" is replaced; the second one is left as it is.
sub('R', 'Python', 'R programmer R Developer')
## [1] "Python programmer R Developer"

7. gsub :일치하는 문자를 모두 바꾸는 함수

# Every "R" is replaced.
gsub('R', 'Python', 'R programmer R Developer')
## [1] "Python programmer Python Developer"
# The pattern is a character class: each digit 0, 1 or 2 becomes "*", while 3 and 4 are kept.
gsub('[0-2]', '*', '120304')
## [1] "***3*4"

숫자함수

1. round :반올림

# round(x, digits): with no digits the value is rounded to a whole number; a negative digits
# value rounds to the left of the decimal point (-5 gives 0 for this value).
round(49.926); round(49.326, 3); round(49.326, -5)
## [1] 50
## [1] 49.326
## [1] 0

2. trunc :소수점 이하 버림 (0 방향으로 절삭)

# trunc() drops the fractional part. The second argument has no effect on these calls:
# all three return 49 in the recorded output.
trunc(49.926); trunc(49.926, 2); trunc(49.926, -4)
## [1] 49
## [1] 49
## [1] 49

3. signif :유효숫자 자리수로 반올림

signif(49.326, 1); signif(49.326, 2)    # signif(x, n): round x to n significant digits, counted from the leading digit
## [1] 50
## [1] 49

4. floor :주어진 값보다 작거나 같은 가장 큰 정수

# Largest whole number not greater than the argument.
floor(45.926)
## [1] 45

날짜 함수

1. 현재 날짜, 시간

# Sys.Date(): current date; Sys.time(): current date-time with time zone; date(): the same moment as a formatted string.
Sys.Date(); Sys.time(); date()
## [1] "2018-07-25"
## [1] "2018-07-25 17:19:18 KST"
## [1] "Wed Jul 25 17:19:18 2018"

2. as.Date :문자 날짜를 날짜형으로 변환하는 함수

# Strings written as yyyy-mm-dd or yyyy/mm/dd are parsed without a format.
as.Date('2018-07-25'); as.Date('2018/07/25')    # as.Date('20180725') without a format is an error
## [1] "2018-07-25"
## [1] "2018-07-25"
# Any other layout needs an explicit format string.
as.Date('20180725', format='%Y%m%d')
## [1] "2018-07-25"
# %Y : year with century (4 digits)
# %y : year without century (2 digits)
# %m : month as a number
# %B : month name
# %d : day of the month
# %A : weekday name
# %u : weekday as a number (1-7, Monday to Sunday)
# %w : weekday as a number (0-6, Sunday to Saturday)
# %H : hour
# %M : minute
# %S : second
# Literal text in the format (년, 월, 일) is matched as-is; the spaces before the month and day
# digits in the input are tolerated, as the recorded output shows.
as.Date('2018년 1월 2일', format='%Y년%m월%d일')
## [1] "2018-01-02"
# format() turns a date-time into text; the weekday name is printed in the session locale (Korean in the recorded output).
format(Sys.time(),'%y%m%d %A')
## [1] "180725 수요일"