[빅데이터 청년인재] DATA Handling

기본 R
데이터 형식 변환
자료 정제
dplyr 실습
mdis tst
xlsx,spss파일 불러오기
저장 및 로드
문제

기본 R

# Load the practice data set from the local course data folder.
test <- read.csv(
  file = "C:/Users/Gyu-ri Kim/Desktop/청년인재/bigdatacampus/data/test.csv"
)
# test  # uncomment to print the loaded data frame

library("tidyr")
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Wide-format example 1: one row per subject, with the three repeated
# measurements stored side by side in time1..time3.
wide <- data.frame(
  subject = c(1, 2, 3, 4),
  sex     = c("M", "F", "F", "M"),
  time1   = c(7.9, 6.3, 9.5, 11.5),
  time2   = c(12.3, 10.6, 13.1, 13.4),
  time3   = c(10.7, 11.1, 13.8, 12.9)
)
# Print the table, then inspect its column types.
wide

str(wide)

## 'data.frame':    4 obs. of  5 variables:
##  $ subject: num  1 2 3 4
##  $ sex    : Factor w/ 2 levels "F","M": 2 1 1 2
##  $ time1  : num  7.9 6.3 9.5 11.5
##  $ time2  : num  12.3 10.6 13.1 13.4
##  $ time3  : num  10.7 11.1 13.8 12.9

데이터 형식 변환

long format data

# Reshape wide -> long: stack time1..time3 into a `time` key column and a
# `value` column. factor_key = TRUE stores `time` as a factor rather than
# as character.
long <- wide %>%
  gather(key = time, value = value, time1:time3, factor_key = TRUE)
str(long)

## 'data.frame':    12 obs. of  4 variables:
##  $ subject: num  1 2 3 4 1 2 3 4 1 2 ...
##  $ sex    : Factor w/ 2 levels "F","M": 2 1 1 2 2 1 1 2 2 1 ...
##  $ time   : Factor w/ 3 levels "time1","time2",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ value  : num  7.9 6.3 9.5 11.5 12.3 10.6 13.1 13.4 10.7 11.1 ...

# Show the long table with its original time labels.
long

# Relabel the levels of the `time` factor as 1, 2, 3, then show the table
# and its structure again.
levels(long$time) <- seq_len(3)
long

str(long)

## 'data.frame':    12 obs. of  4 variables:
##  $ subject: num  1 2 3 4 1 2 3 4 1 2 ...
##  $ sex    : Factor w/ 2 levels "F","M": 2 1 1 2 2 1 1 2 2 1 ...
##  $ time   : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 2 2 2 3 3 ...
##  $ value  : num  7.9 6.3 9.5 11.5 12.3 10.6 13.1 13.4 10.7 11.1 ...

wide format data2

# Reshape long -> wide again: each level of `time` becomes its own column
# holding the matching `value`.
wide2 <- long %>%
  spread(key = time, value = value)
wide2

# Rename columns 3 to 5 to time1, time2, time3.
names(wide2)[3:5] <- paste0("time", 1:3)

예제 long format으로 바꾸기

# Exercise data: one row per student, with the `middle` and `final` scores
# stored in separate columns (wide format).
yes <- data.frame(
  subject = c(1, 2, 3),
  name    = c("김철수", "이영희", "박길동"),
  middle  = c(70, 80, 90),
  final   = c(75, 70, 78)
)
# Print the table, then inspect its column types.
yes

str(yes)

## 'data.frame':    3 obs. of  4 variables:
##  $ subject: num  1 2 3
##  $ name   : Factor w/ 3 levels "김철수","박길동",..: 1 3 2
##  $ middle : num  70 80 90
##  $ final  : num  75 70 78

# Reshape wide -> long: stack the `middle` and `final` columns into a `time`
# key column and a `score` value column. `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
yes_long <- gather(yes, time, score, middle:final, factor_key = TRUE)
yes_long

예제 wide format으로 바꾸기

# Reshape long -> wide: each level of `time` becomes a column of scores.
yes_wide <- yes_long %>%
  spread(key = time, value = score)
yes_wide

# Rename the second column to "testname".
names(yes_wide)[2] <- "testname"

iris data

# Load the built-in iris data set into the workspace.
data("iris")
# First six rows (the default for head()).
head(iris, n = 6L)

# First ten rows.
head(iris, 10)

# Last six rows (the default for tail()).
tail(iris, n = 6L)

# Whole table, followed by its column types.
iris

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Add a row identifier column. seq_len() is used instead of 1:nrow() so an
# empty table would yield an empty sequence rather than c(1, 0).
# NOTE(review): spread() below presumably relies on `id` to tell rows apart.
iris[, "id"] <- seq_len(nrow(iris))
# Wide -> long: stack the first four columns into `measure` / `value`.
# `TRUE` is spelled out because `T` is a reassignable variable.
iris_long <- gather(
  iris, measure, value, colnames(iris)[1:4],
  factor_key = TRUE
)
# Long -> wide: one column per level of `measure` again.
iris_wide <- spread(iris_long, measure, value)
iris_wide

# Read the comma-separated text export of iris; the first line holds the
# column names. The result is printed, not stored.
# `TRUE` is spelled out because `T` is a reassignable variable.
read.table("C:/Users/Gyu-ri Kim/Desktop/청년인재/bigdatacampus/data/iris.txt", header = TRUE, sep = ",")

# Alternative: pick the file interactively.
# read.table(file.choose(),)
# Read the CSV export of iris, declaring its strings as UTF-8.
read.csv("C:/Users/Gyu-ri Kim/Desktop/청년인재/bigdatacampus/data/iris.csv", header = TRUE, encoding = "UTF-8")

# Print the whole iris data frame.
iris

# Matrix-style indexing: every row of column 1. A single column selected this
# way is returned as a plain vector.
iris[,1]

##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4
##  [18] 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5
##  [35] 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0
##  [52] 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8
##  [69] 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4
##  [86] 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8
## [103] 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7
## [120] 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7
## [137] 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9

# Single-bracket list-style indexing: keeps the data-frame container, so the
# result is a one-column data frame rather than a vector.
iris[1]

# `$` extracts the named column as a vector.
iris$Sepal.Length

##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4
##  [18] 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5
##  [35] 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0
##  [52] 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8
##  [69] 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4
##  [86] 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8
## [103] 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7
## [120] 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7
## [137] 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9

# Matrix-style indexing by column name: every row of Sepal.Length, returned
# as a vector.
iris[,"Sepal.Length"]

##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4
##  [18] 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5
##  [35] 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0
##  [52] 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8
##  [69] 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4
##  [86] 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8
## [103] 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7
## [120] 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7
## [137] 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9

# Row 1 with the first column dropped (a negative index excludes a column).
iris[1, -1]

# Rows 1, 3 and 5, all columns.
iris[c(1, 3, 5), ]

# Rows whose `id` is odd.
iris[iris$id %% 2 == 1, ]

# Rows whose petal width is 0.2. Petal.Width is a double, so it is compared
# with a tolerance via dplyr's near() instead of exact `==`, which can miss
# values that differ only by floating-point rounding.
iris[near(iris$Petal.Width, 0.2), ]

자료 정제

# install.packages("tidyr")  # run once if tidyr is not installed yet
library(tidyr)
# Wide-format scores: one row per student, `middle` and `final` side by side.
widee <- data.frame(
  name = c("김철수", "이영희", "박길동"),
  middle = c(70, 80, 90),
  final = c(75, 70, 78)
)
# Wide -> long: stack `middle` and `final` into `time` / `score`.
# `TRUE` is spelled out because `T` is a reassignable variable.
longg <- gather(widee, time, score, middle:final, factor_key = TRUE)

# Long -> wide: one score column per level of `time` again.
widee2 <- spread(longg, time, score)

# First ten rows of iris.
head(iris, 10)

# (Re)create the row identifier column. seq_len() is used instead of
# 1:nrow() so an empty table would yield an empty sequence, not c(1, 0).
iris[, "id"] <- seq_len(nrow(iris))
# Wide -> long: stack the first four columns into `measure` / `value`.
# `TRUE` is spelled out because `T` is a reassignable variable.
iris.long <- gather(
  iris, measure, value, colnames(iris)[1:4],
  factor_key = TRUE
)
# Long -> wide: one column per level of `measure` again.
iris.wide <- spread(iris.long, measure, value)
head(iris.wide)

# Load the course data files from the local data folder.
test <- read.csv("C:/Users/Gyu-ri Kim/Desktop/청년인재/bigdatacampus/data/test.csv")
# Copies iris into the workspace if it is not there yet; otherwise a no-op.
iris <- iris
orthodont <- read.csv("C:/Users/Gyu-ri Kim/Desktop/청년인재/bigdatacampus/data/orthodont.csv")
# iris.txt is read elsewhere in these notes as comma-separated with a header
# row, so the same options are passed here. With read.table()'s defaults the
# header line would be read as data and fields would not be split on commas.
iris.txt <- read.table("C:/Users/Gyu-ri Kim/Desktop/청년인재/bigdatacampus/data/iris.txt", header = TRUE, sep = ",")

# Every row of the first column of iris, returned as a vector.
iris[,1]

##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4
##  [18] 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5
##  [35] 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0
##  [52] 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8
##  [69] 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4
##  [86] 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8
## [103] 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7
## [120] 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7
## [137] 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9

filter

## 'data.frame':    150 obs. of  6 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ id          : int  1 2 3 4 5 6 7 8 9 10 ...

행을 추출하는 함수이다.

‘행을 추출하는 함수이다.’

arrange

select

mutate

dplyr 실습

1.여성만 선택하기

## 'data.frame':    190 obs. of  10 variables:
##  $ Sex   : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 2 2 2 2 ...
##  $ Age   : int  26 30 22 46 41 28 53 60 32 34 ...
##  $ Height: num  157 176 161 164 164 ...
##  $ Weight: num  43.7 93.3 61.9 68 72.2 79.7 59.4 66.2 63.8 69.6 ...
##  $ SBP   : int  100 120 110 100 100 130 110 130 110 110 ...
##  $ DBP   : int  70 70 70 60 70 90 70 90 70 80 ...
##  $ Sugar : num  83.5 83.6 86.4 86.5 86.7 86.9 87.4 88 88.1 88.1 ...
##  $ Fat   : num  123 73.9 56.3 162.3 132.5 ...
##  $ Chol  : num  188 146 226 146 164 ...
##  $ DM    : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...

2.키가 170보다 큰 행

3.키가 170보다 크고 몸무게가 65보다 큰 행

4.이완기혈압이 100보다 큰 행

5.BMI열을 추가하시오

## 'data.frame':    190 obs. of  11 variables:
##  $ Sex   : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 2 2 2 2 ...
##  $ Age   : int  26 30 22 46 41 28 53 60 32 34 ...
##  $ Height: num  157 176 161 164 164 ...
##  $ Weight: num  43.7 93.3 61.9 68 72.2 79.7 59.4 66.2 63.8 69.6 ...
##  $ SBP   : int  100 120 110 100 100 130 110 130 110 110 ...
##  $ DBP   : int  70 70 70 60 70 90 70 90 70 80 ...
##  $ Sugar : num  83.5 83.6 86.4 86.5 86.7 86.9 87.4 88 88.1 88.1 ...
##  $ Fat   : num  123 73.9 56.3 162.3 132.5 ...
##  $ Chol  : num  188 146 226 146 164 ...
##  $ DM    : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BMI   : num  17.8 30.3 23.8 25.3 27 ...

6.BMI가 25보다 큰행

mdis tst

## New names:
## * `` -> ...10

## Classes 'tbl_df', 'tbl' and 'data.frame':    285534 obs. of  10 variables:
##  $ 사망연월일           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ 사망연령(각세)       : num  20170101 20170101 20170101 20170101 20170101 ...
##  $ 사망장소             : num  0 0 21 21 24 25 25 28 29 31 ...
##  $ 사망원인1            : num  1 2 2 2 5 2 5 5 2 2 ...
##  $ 사망원인2            : chr  "R99" "P249" "C716" "G809" ...
##  $ 사망원인 103항목 분류: chr  NA NA NA NA ...
##  $ 사망원인 56항목 분류 : num  94 92 42 61 96 101 96 96 94 74 ...
##  $ 사망자의 국적구분    : num  NA 47 25 NA 50 55 50 50 NA 38 ...
##  $ 사망자의 (이전)국적  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ...10                : logi  NA NA NA NA NA NA ...

## 'data.frame':    285534 obs. of  10 variables:
##  $ 사망연월일           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ 사망연령(각세)       : int  20170101 20170101 20170101 20170101 20170101 20170101 20170101 20170101 20170101 20170101 ...
##  $ 사망장소             : int  0 0 21 21 24 25 25 28 29 31 ...
##  $ 사망원인1            : int  1 2 2 2 5 2 5 5 2 2 ...
##  $ 사망원인2            : Factor w/ 2428 levels "A020","A021",..: 1985 1801 410 923 2241 2066 2241 2009 1985 1228 ...
##  $ 사망원인 103항목 분류: Factor w/ 764 levels "","V011","V021",..: 1 1 1 1 62 599 63 57 1 1 ...
##  $ 사망원인 56항목 분류 : int  94 92 42 61 96 101 96 96 94 74 ...
##  $ 사망자의 국적구분    : int  NA 47 25 NA 50 55 50 50 NA 38 ...
##  $ 사망자의 (이전)국적  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NA                   : int  NA NA NA NA NA NA NA NA NA NA ...

xlsx,spss파일 불러오기

## 'data.frame':    9 obs. of  3 variables:
##  $ x    : num  2 4 5 6 6 5 6 7 9
##  $ y    : num  3 5 6 2 5 8 9 4 10
##  $ group: Factor w/ 3 levels "C ","T1","T2": 1 1 1 2 2 2 3 3 3
##  - attr(*, "codepage")= int 65001

저장 및 로드

문제

곤충학자 A는 어느 숲에서 소나무에 사는 솔잎흑나방의 개체 수를 세어보고 있다. 각 나무에서 발견된 솔잎흑나방의 수가 아래와 같을 때 각 질문에 답하시오.

1. 몇 그루의 나무를 살펴보았나?

## [1] 12

##  num [1:12] 5 6 7 10 10 11 20 2 3 10 ...

1. 총 몇 마리의 솔잎흑나방을 관찰하였나?

## [1] 107

1. 평균적으로 몇 마리가 발견되었나?

## [1] 8.916667

1. 발견된 솔잎흑나방 수의 표준편차는?

## [1] 4.851585

1. 가장 많이 발견된 나무에서는 몇 마리가 관찰되었나?

## [1] 20

1. 가장 적게 발견된 나무에서는 몇 마리가 관찰되었나?

## [1] 2

1. 관찰된 솔잎흑나방 수의 범위는?

## [1]  2 20

뉴욕 항공 데이터로 실습하기

## -- Attaching packages ---------------------------------------------- tidyverse 1.2.1 --

## √ ggplot2 3.2.0     √ purrr   0.3.2
## √ tibble  2.1.1     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0

## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

## 
##     1 
## 27004

## Classes 'tbl_df', 'tbl' and 'data.frame':    336776 obs. of  19 variables:
##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int  517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int  830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int  819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num  11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr  "UA" "UA" "AA" "B6" ...
##  $ flight        : int  1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr  "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr  "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr  "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num  1400 1416 1089 1576 762 ...
##  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...