인코딩 설정

read.csv에서 인코딩 관련 옵션(encoding 또는 fileEncoding)을 활용하여 인코딩을 설정할 수 있다. csv는 쉼표(,)를 구분자로 사용하며, tsv는 탭(tab)을 구분자로 사용한다. sep 옵션을 사용하여 구분자를 설정할 수 있다.

# Point the session at the folder that holds the sample data.
setwd("R:/HDD1/data/iris")
# Load the comma-separated file with the defaults of read.csv().
iris <- read.csv("iris.csv")
head(iris)

##   X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 1          5.1         3.5          1.4         0.2  setosa
## 2 2          4.9         3.0          1.4         0.2  setosa
## 3 3          4.7         3.2          1.3         0.2  setosa
## 4 4          4.6         3.1          1.5         0.2  setosa
## 5 5          5.0         3.6          1.4         0.2  setosa
## 6 6          5.4         3.9          1.7         0.4  setosa

# Declare the on-disk encoding with fileEncoding so that read.csv() re-encodes
# the text while reading. The encoding argument only marks strings as Latin-1
# or UTF-8 and does not convert the input.
iris <- read.csv("iris.csv", fileEncoding = "CP949")
head(iris)

##   X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 1          5.1         3.5          1.4         0.2  setosa
## 2 2          4.9         3.0          1.4         0.2  setosa
## 3 3          4.7         3.2          1.3         0.2  setosa
## 4 4          4.6         3.1          1.5         0.2  setosa
## 5 5          5.0         3.6          1.4         0.2  setosa
## 6 6          5.4         3.9          1.7         0.4  setosa

read.table 을 통해서도 자료를 가져올 수 있다.

setwd("R:/HDD1/data/iris")
# No sep is supplied, so read.table() does not split the fields on commas.
iris <- read.table("iris.csv")
head(iris, n = 3)

##   V1                                                                   V2
## 1 NA ,"Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species"
## 2  1                                            ,5.1,3.5,1.4,0.2,"setosa"
## 3  2                                              ,4.9,3,1.4,0.2,"setosa"

setwd("R:/HDD1/data/iris")
# A single space is used as the field separator in this attempt.
iris <- read.table("iris.csv", sep = " ")
tail(iris, n = 4)

##                                V1
## 148   147,6.3,2.5,5,1.9,virginica
## 149     148,6.5,3,5.2,2,virginica
## 150 149,6.2,3.4,5.4,2.3,virginica
## 151   150,5.9,3,5.1,1.8,virginica

setwd("R:/HDD1/data/iris")
# Split the fields on commas. No header argument is given, and the printed
# result shows the column-name line kept as an ordinary data row.
iris <- read.table("iris.csv", sep = ",")
tail(iris, n = 4)

##      V1  V2  V3  V4  V5        V6
## 148 147 6.3 2.5   5 1.9 virginica
## 149 148 6.5   3 5.2   2 virginica
## 150 149 6.2 3.4 5.4 2.3 virginica
## 151 150 5.9   3 5.1 1.8 virginica

패키지를 통한 자료 다운로드

library(xlsx)

## Warning: package 'xlsx' was built under R version 3.5.2

setwd("R:/HDD1/data/iris")
# Export the current data frame once per Excel file extension.
write.xlsx(iris, "iris.xls")
write.xlsx(iris, "iris.xlsx")

# Read the first sheet of the .xls workbook back into a data frame.
iris <- read.xlsx("iris.xls", sheetIndex = 1, encoding = "cp949")
head(iris)

##   NA. V1           V2          V3           V4          V5      V6
## 1   1 NA Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2   2  1          5.1         3.5          1.4         0.2  setosa
## 3   3  2          4.9           3          1.4         0.2  setosa
## 4   4  3          4.7         3.2          1.3         0.2  setosa
## 5   5  4          4.6         3.1          1.5         0.2  setosa
## 6   6  5            5         3.6          1.4         0.2  setosa

# Read the first sheet of the .xlsx workbook back into a data frame.
iris <- read.xlsx("iris.xlsx", sheetIndex = 1, encoding = "utf-8")
head(iris)

##   NA. V1           V2          V3           V4          V5      V6
## 1   1 NA Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2   2  1          5.1         3.5          1.4         0.2  setosa
## 3   3  2          4.9           3          1.4         0.2  setosa
## 4   4  3          4.7         3.2          1.3         0.2  setosa
## 5   5  4          4.6         3.1          1.5         0.2  setosa
## 6   6  5            5         3.6          1.4         0.2  setosa

library("readxl")
setwd("R:/HDD1/data/iris")
# Read the first worksheet with readxl; the result is printed as a tibble.
read_excel("iris.xlsx", sheet = 1)

## # A tibble: 151 x 7
##    X__1     V1 V2           V3          V4           V5          V6     
##    <chr> <dbl> <chr>        <chr>       <chr>        <chr>       <chr>  
##  1 1        NA Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##  2 2         1 5.1          3.5         1.4          0.2         setosa 
##  3 3         2 4.9          3           1.4          0.2         setosa 
##  4 4         3 4.7          3.2         1.3          0.2         setosa 
##  5 5         4 4.6          3.1         1.5          0.2         setosa 
##  6 6         5 5            3.6         1.4          0.2         setosa 
##  7 7         6 5.4          3.9         1.7          0.4         setosa 
##  8 8         7 4.6          3.4         1.4          0.3         setosa 
##  9 9         8 5            3.4         1.5          0.2         setosa 
## 10 10        9 4.4          2.9         1.4          0.2         setosa 
## # ... with 141 more rows

# The same call on the .xls file.
read_excel("iris.xls", sheet = 1)

## # A tibble: 151 x 7
##    X__1     V1 V2           V3          V4           V5          V6     
##    <chr> <dbl> <chr>        <chr>       <chr>        <chr>       <chr>  
##  1 1        NA Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##  2 2         1 5.1          3.5         1.4          0.2         setosa 
##  3 3         2 4.9          3           1.4          0.2         setosa 
##  4 4         3 4.7          3.2         1.3          0.2         setosa 
##  5 5         4 4.6          3.1         1.5          0.2         setosa 
##  6 6         5 5            3.6         1.4          0.2         setosa 
##  7 7         6 5.4          3.9         1.7          0.4         setosa 
##  8 8         7 4.6          3.4         1.4          0.3         setosa 
##  9 9         8 5            3.4         1.5          0.2         setosa 
## 10 10        9 4.4          2.9         1.4          0.2         setosa 
## # ... with 141 more rows

readr 의 경우 오류가 발생하면 오류를 출력해주고 오류를 제외한 자료를 불러온다.

library(readr)
setwd("R:/HDD1/data/iris")
# With readr, the file encoding is passed through the locale object.
iris <- read_csv(
  "iris.csv",
  locale = locale(encoding = "cp949")
)

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Sepal.Length = col_double(),
##   Sepal.Width = col_double(),
##   Petal.Length = col_double(),
##   Petal.Width = col_double(),
##   Species = col_character()
## )

# Show the tibble that was just read.
print(iris)

## # A tibble: 150 x 6
##       X1 Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <dbl>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1     1          5.1         3.5          1.4         0.2 setosa 
##  2     2          4.9         3            1.4         0.2 setosa 
##  3     3          4.7         3.2          1.3         0.2 setosa 
##  4     4          4.6         3.1          1.5         0.2 setosa 
##  5     5          5           3.6          1.4         0.2 setosa 
##  6     6          5.4         3.9          1.7         0.4 setosa 
##  7     7          4.6         3.4          1.4         0.3 setosa 
##  8     8          5           3.4          1.5         0.2 setosa 
##  9     9          4.4         2.9          1.4         0.2 setosa 
## 10    10          4.9         3.1          1.5         0.1 setosa 
## # ... with 140 more rows

# Same read, this time declaring the file as UTF-8 through the locale.
iris <- read_csv(
  "iris.csv",
  locale = locale(encoding = "utf-8")
)

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Sepal.Length = col_double(),
##   Sepal.Width = col_double(),
##   Petal.Length = col_double(),
##   Petal.Width = col_double(),
##   Species = col_character()
## )

# Show the tibble that was just read.
print(iris)

## # A tibble: 150 x 6
##       X1 Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <dbl>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1     1          5.1         3.5          1.4         0.2 setosa 
##  2     2          4.9         3            1.4         0.2 setosa 
##  3     3          4.7         3.2          1.3         0.2 setosa 
##  4     4          4.6         3.1          1.5         0.2 setosa 
##  5     5          5           3.6          1.4         0.2 setosa 
##  6     6          5.4         3.9          1.7         0.4 setosa 
##  7     7          4.6         3.4          1.4         0.3 setosa 
##  8     8          5           3.4          1.5         0.2 setosa 
##  9     9          4.4         2.9          1.4         0.2 setosa 
## 10    10          4.9         3.1          1.5         0.1 setosa 
## # ... with 140 more rows

자료가 일반적인 방법으로 불러와지지 않아 어쩔 수 없이 다른 방법을 써야 할 때는 readLines를 활용하여 데이터를 줄 단위로 불러오는 것도 방법이 될 수 있다.

# Create a file connection object for the CSV. No open mode is given, so the
# connection is only described here and not opened yet.
f <- file("./input/train.csv")
# Print the connection summary (description, class, mode, open state).
f
# Destroy the connection after inspecting it so that it is not leaked.
close(f)

## A connection with                               
## description "./input/train.csv"
## class       "file"             
## mode        "r"                
## text        "text"             
## opened      "closed"           
## can read    "yes"              
## can write   "yes"

# Read the first 150 lines of the file as raw text. Passing the path directly
# lets readLines() manage its own connection, so no anonymous file()
# connection is left behind.
line <- readLines("R:/HDD1/data/iris/iris.csv", n = 150)
# Parse the captured lines as CSV through text=, so no textConnection has to
# be created and closed by hand. header = FALSE keeps the column-name line as
# the first data row.
head(read.csv(text = line, header = FALSE, sep = ","))

##   V1           V2          V3           V4          V5      V6
## 1 NA Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2  1          5.1         3.5          1.4         0.2  setosa
## 3  2          4.9           3          1.4         0.2  setosa
## 4  3          4.7         3.2          1.3         0.2  setosa
## 5  4          4.6         3.1          1.5         0.2  setosa
## 6  5            5         3.6          1.4         0.2  setosa

github에 있는 자료를 바로 다운로드 받아 불러오는 코드

# Raw-file address of the sample CSV hosted on GitHub.
url <- "https://github.com/mrchypark/sejongFinData/raw/master/dataAll.csv"
setwd("R:/HDD1/data/iris")
# Save a local copy in the working directory, then load it from disk.
download.file(url, destfile = "./dataAll.csv")
# Keep text columns as character vectors instead of converting to factors.
dataAll <- read.csv("./dataAll.csv", stringsAsFactors = FALSE)
head(dataAll)

##    country              year  매출액 영업이익 순이익 연결순이익 자산총계
## 1 삼성전자 1997.12(GAAP연결) 226,820   21,387 -9,383     -6,069  320,316
## 2 삼성전자 1998.12(GAAP연결) 257,723   27,063 -4,128     -3,547  240,757
## 3 삼성전자 1999.12(GAAP연결) 320,877   53,760 31,857     31,753  291,786
## 4 삼성전자 2000.12(GAAP연결) 435,278   90,603 61,921     60,029  464,215
## 5 삼성전자 2001.12(GAAP연결) 464,438   39,514 33,709     30,551  521,149
## 6 삼성전자 2002.12(GAAP연결) 595,687   92,456 73,246     70,528  649,550
##   부채총계 자본총계 부채비율 영업이익률 순이익률 연결순이익률 ROE.순이익.
## 1  273,860   46,457   589.49       9.43    -4.14        -2.68       -20.2
## 2  190,162   50,595   375.85       10.5     -1.6        -1.38       -8.16
## 3  160,039  131,747   121.47      16.75     9.93          9.9       24.18
## 4  296,360  167,855   176.56      20.82    14.23        13.79       36.89
## 5  317,164  203,985   155.48       8.51     7.26         6.58       16.53
## 6  394,623  254,927    154.8      15.52     12.3        11.84       28.73
##   ROE.연결순이익. 매출액.성장률 영업이익.성장률 순이익.성장률
## 1          -13.06         14.11           55.95          적지
## 2           -7.01         13.62           26.54          적지
## 3            24.1          24.5           98.64          흑전
## 4           35.76         35.65           68.53         94.37
## 5           14.98           6.7          -56.39        -45.56
## 6           27.67         28.26          133.98        117.29

read excel file

인코딩 설정

패키지를 통한 자료 다운로드