데이터 출처: SKT Data Hub
Github Code: Jun4871 Github

Overview

이번 데이터 분석은 최근 작업했던 19년 12월 유동인구 분석의 연장선이다. 이번에는 더 다양한 각도에서 데이터를 살펴보고, 상업적 측면의 새로운 인사이트를 찾아 상업성과 유동인구를 연관 지어 보는 것이 분석 목표이다. 데이터는 19년 3월부터 19년 12월까지, 총 10개월 분량을 다룬다.

라이브러리 활성화

데이터 분석에 앞서 필요한 도구를 준비하는 단계로 아래 라이브러리들을 활용하였다.

library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(fs)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggthemes)
library(DT)
library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(mlbench)
library(formattable)

## 
## Attaching package: 'formattable'

## The following object is masked from 'package:plotly':
## 
##     style

데이터 로드

지난 번 분석에서는 하나의 csv 파일을 불러왔기 때문에 loading 시 큰 어려움이 없었다. 하지만 이번에는 10개의 csv 파일을 불러와야 하기 때문에 다른 방법을 통해 데이터를 불러올 것이다.

# Directory that holds the 2019 floating-population CSV files.
Population_19 <- "year_19"

# List only the CSV files, so that a stray non-CSV file in the directory is
# never handed to read_csv().
year_19_pop <- fs::dir_ls(Population_19, glob = "*.csv")

# Read every listed file and row-bind the results into one table.
year_19_data <- year_19_pop %>% map_dfr(read_csv)

## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )
## Parsed with column specification:
## cols(
##   일자 = col_double(),
##   `시간(1시간단위)` = col_character(),
##   `연령대(10세단위)` = col_double(),
##   성별 = col_character(),
##   시 = col_character(),
##   군구 = col_character(),
##   유동인구수 = col_double()
## )

# Convert the combined data into a plain data.frame.
year_19_pop_data_frame <- year_19_data %>% as.data.frame()

데이터 가공 및 NA 파악

이제 불러온 데이터의 구조를 파악하고, 분석 목적에 맞게 가공할 것이다. NA와 NULL은 없었으나, 속성 변환이 필요한 항목들이 보여 바꿔주었다. 첫 번째, 기본적으로 서울시 내의 유동인구를 조사한 것이므로 각 행에 '시(city)'를 포함시킬 필요가 없어 해당 컬럼을 제거했다. 이 정보는 제목을 통해 보여줄 수 있고, 분석에 포함시켜도 큰 의미가 없기 때문이다. 두 번째, 날짜를 날짜형 데이터로 바꿔주었다. 원래는 숫자형(예: 20190301)으로 되어 있었는데, 숫자가 다닥다닥 붙어 있어 가독성이 떨어지고 월(Month)이나 일(Day)을 추출하기도 어렵기 때문이다. 세 번째, 컬럼명을 영문화했다. R 환경에서는 한글에 대한 지원이 떨어지는 편이고, 기본 함수들도 모두 영문이므로 통일성과 호환성을 고려했다. 네 번째, 범주형 데이터는 팩터(factor)로 변환하고 레벨을 지정해주었다.

# Inspect the structure of the raw data. str() prints its report as a side
# effect and returns NULL, so wrapping it in head() only echoed a stray NULL.
str(year_19_pop_data_frame)

## 'data.frame':    2192700 obs. of  7 variables:
##  $ 일자            : num  20190301 20190301 20190301 20190301 20190301 ...
##  $ 시간(1시간단위) : chr  "00" "00" "00" "00" ...
##  $ 연령대(10세단위): num  20 20 20 20 30 30 30 30 30 30 ...
##  $ 성별            : chr  "남성" "남성" "남성" "여성" ...
##  $ 시              : chr  "서울" "서울" "서울" "서울" ...
##  $ 군구            : chr  "양천구" "종로구" "중랑구" "중랑구" ...
##  $ 유동인구수      : num  28720 15920 26250 25890 34090 ...

# Per-column summary statistics of the raw data.
year_19_pop_data_frame %>% summary()

##       일자          시간(1시간단위)    연령대(10세단위)     성별          
##  Min.   :20190301   Length:2192700     Min.   :20       Length:2192700    
##  1st Qu.:20190516   Class :character   1st Qu.:30       Class :character  
##  Median :20190731   Mode  :character   Median :45       Mode  :character  
##  Mean   :20190765                      Mean   :45                         
##  3rd Qu.:20191015                      3rd Qu.:60                         
##  Max.   :20191231                      Max.   :70                         
##       시                군구             유동인구수    
##  Length:2192700     Length:2192700     Min.   :   960  
##  Class :character   Class :character   1st Qu.: 17980  
##  Mode  :character   Mode  :character   Median : 25050  
##                                        Mean   : 27040  
##                                        3rd Qu.: 33530  
##                                        Max.   :127820

# Count the missing (NA) cells across the whole data frame.
year_19_pop_data_frame %>% is.na() %>% sum()

## [1] 0

# is.null() applied to the data frame tests the object itself and is always
# FALSE for an existing data frame, so check each column instead.
sum(vapply(year_19_pop_data_frame, is.null, logical(1)))

## [1] 0

# Clean the raw table: drop the city column, give the columns English names,
# and convert each column to the type used in the rest of the analysis.
# Columns are addressed by name rather than by position, so re-running this
# chunk fails loudly instead of silently dropping or mislabelling a column.
year_19_pop_data_frame <- year_19_pop_data_frame %>%
  # The city column is dropped as unnecessary for the analysis.
  dplyr::select(-`시`) %>%
  rename(
    Date = `일자`,
    Time = `시간(1시간단위)`,
    Age = `연령대(10세단위)`,
    Sex = `성별`,
    Town = `군구`,
    Population = `유동인구수`
  ) %>%
  mutate(
    # Numeric yyyymmdd values (e.g. 20190301) become Date objects.
    Date = ymd(Date),
    # Categorical columns become factors with explicit level order.
    Age = factor(Age, levels = c("20", "30", "40", "50", "60", "70")),
    Sex = factor(Sex, levels = c("남성", "여성")),
    Town = as.factor(Town),
    # Hour strings such as "00" become numbers.
    Time = as.numeric(Time),
    Population = as.numeric(Population)
  )

# year_19_pop_data_frame$Time <- as.integer(year_19_pop_data_frame$Time)
# 
# year_19_pop_data_frame$Population <- as.integer(year_19_pop_data_frame$Population)

그룹화

정리된 데이터를 분류기준에 맞추어 그룹화 시켜보았다. 월별, 요일별 데이터도 파악해보고 싶어, mutate()함수를 사용하여 월별, 요일별 컬럼을 생성해주었다.

월별

12월의 유동인구 수가 가장 많았다. 추측하건대, 연말에는 대부분의 사람들이 모임을 많이 가지기 때문일 것이다. 12월은 한 해의 마지막 달이자 새해를 맞이할 준비를 하는 시기이므로, 다른 달에 비해 의미와 상징성이 크다고 할 수 있다. 이러한 이유로 많은 사람들이 자신이 속한 그룹의 구성원들과 지난 일 년간의 회포를 풀기도 하고, 그간 미뤄두었던 약속들이 연말이 되어서야 성사되는 경우도 적지 않다고 볼 수 있다.

# Derive the grouping columns used below, all stored as factors:
#   Month - month number extracted from Date
#   Wday  - weekday name of Date
#   Time  - hour of day (converted from numeric)
# weekdays() is called through the generic so the Date method is dispatched,
# instead of invoking the POSIXt method directly on a Date vector.
year_19_pop_data_frame <- year_19_pop_data_frame %>%
  mutate(
    Month = as.factor(month(Date)),
    Wday = as.factor(weekdays(Date)),
    Time = as.factor(Time)
  )

# Debug checks (kept disabled):
# is.factor(year_19_pop_data_frame$Month)
# month(year_19_pop_data_frame$Month, label = TRUE)

# str(year_19_pop_data_frame)
# summary(year_19_pop_data_frame)


# By month (split by sex): the author found December to have the largest
# floating population.
# desc() takes exactly one vector, so only Mon_pop is passed to it; the former
# desc(Mon_pop, Month) call is rejected by current dplyr.
Month_grouping <- year_19_pop_data_frame %>%
  group_by(Month, Sex) %>%
  summarize(Mon_pop = sum(Population)) %>%
  ungroup() %>%
  arrange(desc(Mon_pop)) %>%
  as.data.frame()

# # 월별, 지역별 
# Month_grouping <- year_19_pop_data_frame %>% 
#   group_by(Month, Town) %>% 
#   summarize(Mon_pop = sum(Population)) %>%
#   arrange(desc(Mon_pop, Town)) %>%  as.data.frame()

# By hour of day: the author found 3 o'clock to have the largest floating
# population.
# desc() takes exactly one vector, so only Time_Pop is passed to it; the former
# desc(Time_Pop, Time) call is rejected by current dplyr.
Time_grouping <- year_19_pop_data_frame %>%
  group_by(Time) %>%
  summarize(Time_Pop = sum(Population)) %>%
  arrange(desc(Time_Pop)) %>%
  as.data.frame()


# By weekday (split by sex): the author found Tuesday to be the busiest day.
Wday_grouping <- local({
  by_wday_sex <- group_by(year_19_pop_data_frame, Wday, Sex)
  totals <- summarize(by_wday_sex, Wday_Pop = sum(Population))
  # Largest totals first, returned as a plain data.frame.
  as.data.frame(arrange(totals, desc(Wday_Pop)))
})

# By sex: the author found the female total to be the larger one.
Sex_grouping <- as.data.frame(
  arrange(
    summarize(group_by(year_19_pop_data_frame, Sex), Sex_Pop = sum(Population)),
    desc(Sex_Pop)
  )
)

# By district: the author found Gangnam-gu (강남구) to rank first.
Town_grouping <- local({
  by_town <- group_by(year_19_pop_data_frame, Town)
  totals <- summarize(by_town, Town_Pop = sum(Population))
  # Largest totals first, returned as a plain data.frame.
  as.data.frame(arrange(totals, desc(Town_Pop)))
})

# Female floating population per district, hour and month, largest first.
# Rows are filtered before aggregating so the male rows are never summarised,
# and desc() receives a single vector (desc(W_T_T_M, Time) is rejected by
# current dplyr).
Wom_Pop_grouping <- year_19_pop_data_frame %>%
  filter(Sex == "여성") %>%
  group_by(Sex, Town, Time, Month) %>%
  summarize(W_T_T_M = sum(Population)) %>%
  ungroup() %>%
  arrange(desc(W_T_T_M)) %>%
  as.data.frame()

# Male floating population per district, hour and month, largest first.
# Rows are filtered before aggregating so the female rows are never summarised,
# and desc() receives a single vector (desc(M_T_T_M, Time) is rejected by
# current dplyr).
Man_Pop_grouping <- year_19_pop_data_frame %>%
  filter(Sex == "남성") %>%
  group_by(Sex, Town, Time, Month) %>%
  summarize(M_T_T_M = sum(Population)) %>%
  ungroup() %>%
  arrange(desc(M_T_T_M)) %>%
  as.data.frame()

# Floating population per calendar date, keyed by month, date and weekday,
# with the largest totals first.
Monnth_Wday_grouping <- local({
  by_day <- group_by(year_19_pop_data_frame, Month, Date, Wday)
  totals <- summarize(by_day, Mon_Wday_Pop = sum(Population))
  as.data.frame(arrange(totals, desc(Mon_Wday_Pop)))
})





# # 월별 유동인구수 
# Month_grouping <- year_19_pop_data_frame %>% 
#   group_by(Town) %>% 
#   summarise(Monthly_pop = sum(Population)) 
# 
# 
# year_19_pop_data_frame$Population <- as.numeric(year_19_pop_data_frame$Population)
# str(year_19_pop_data_frame)
  


#   library(lubridate)
# # month 컬럼 추가
# year_18_food_data_frame <- year_18_food_data_frame %>% 
#  mutate(month = month(year_18_food_data_frame$date))
# # factor 로 변경
# year_18_food_data_frame$month <- as.factor(year_18_food_data_frame$month)
  
#   # normalize
# normalize <- function(x) {
#   ((x - min(x))*100/(max(x)-min(x))) 
# }
# # normalized by group
# data_by_month_county <- data_by_month_county %>% 
#   group_by(county, type) %>% 
#   mutate(normalized = normalize(call)) %>% 
#   arrange(month) %>%
#   arrange(county) %>% 
#   arrange(type) %>%
#   as.data.frame()
# > data_by_month_county
# month   county    type  call  normalized
# 1       1   강남구 chicken 36534 100.0000000
# 2       2   강남구 chicken 30806  50.4541130
# 3       3   강남구 chicken 32427  64.4753914
# 4       4   강남구 chicken 32470  64.8473315
# 5       5   강남구 chicken 30224  45.4199464
# 6       6   강남구 chicken 33320  72.1996367
# 7       7   강남구 chicken 32650  66.4042903
# 8       8   강남구 chicken 32674  66.6118848
# 9       9   강남구 chicken 26771  15.5522879
# 10     10   강남구 chicken 27384  20.8545974
# 11     11   강남구 chicken 24973   0.0000000
# 12     12   강남구 chicken 28241  28.2674509

시각화

앞서 각 조건별로 그룹화한 데이터를 가지고 시각화를 해보겠다.

1) 몇시에 유동인구 수가 가장 많을까?

# 시간별 시각화 
# ggplot(Time_grouping, aes(Time, Time_Pop)) +
#   geom_col(aes(fill = Sex)) +
#   scale_y_continuous(labels =  comma)+
#     ggtitle( "19년 시간대별 유동인구 그래프", subtitle = ("기간 : 19.03 ~ 19.12")) +
#   labs(x= "시간", y="유동인구 수") 

#####
  # Point chart of the total floating population per hour of day. The point
  # fill runs from yellow (low) to red (high) and its legend is hidden.
  ggplot(Time_grouping, aes(Time, Time_Pop)) +
    # alpha must lie in [0, 1]; the original `alpha=9` was out of range and
    # 0.9 is presumably what was intended.
    geom_point(
      aes(fill = Time_Pop),
      size = 8, alpha = 0.9, stroke = 2, shape = 21
    ) +
    scale_fill_gradient(low = 'yellow', high = 'red') +
    guides(fill = "none") +
    # The y axis spans exactly the observed range, with comma-grouped labels.
    scale_y_continuous(
      limits = range(Time_grouping$Time_Pop),
      labels = scales::comma
    ) +
    ggtitle("19년 시간대별 유동인구수", subtitle = ("기간 : 19.03 ~ 19.12")) +
    xlab("시간대") +
    ylab("유동인구") +
    # The complete theme goes first so the tweaks below are not overwritten.
    theme_calc() +
    theme(
      plot.title = element_text(
        face = "bold", hjust = 0.5, vjust = 1, size = 30, color = "#008B8B"
      ),
      axis.title.x = element_text(
        family = "NanumGothic", face = "bold", hjust = 0.5,
        size = 25, color = "Black", angle = 0
      ),
      axis.title.y = element_text(
        family = "NanumGothic", face = "bold", vjust = 0.5,
        size = 25, color = "Black", angle = 0
      ),
      axis.text.x = element_text(size = 20, face = "bold"),
      axis.text.y = element_text(size = 20, face = "bold")
    )

# color = "#ff6600",

# Stacked bar chart of the monthly floating population by sex, with the months
# ordered by descending population.
ggplot(Month_grouping, aes(x = reorder(Month, -Mon_pop), y = Mon_pop)) +
  geom_col(aes(fill = Sex)) +
  labs(
    title = "19년 월별 유동인구 그래프",
    subtitle = "기간 : 19.03 ~ 19.12",
    x = "월",
    y = "유동인구수"
  ) +
  # Qualified on purpose: scales is not attached in this document, so a bare
  # `comma` would resolve to another attached package's function
  # (formattable exports one) instead of the axis labeller.
  scale_y_continuous(labels = scales::comma) +
  theme(
    axis.text.x = element_text(size = 20),
    axis.text.y = element_text(size = 20)
  )

# 요일별 시각화

# Wday_grouping <- year_19_pop_data_frame %>% 
#   group_by(Wday) %>% 
#   summarize(Wday_Pop = sum(Population)) %>% 
#   arrange(desc(Wday_Pop)) %>% as.data.frame()
# Stacked bar chart of the floating population per weekday by sex, with the
# weekdays ordered by descending population.
ggplot(Wday_grouping, aes(x = reorder(Wday, -Wday_Pop), y = Wday_Pop)) +
  geom_col(aes(fill = Sex)) +
  # Qualified on purpose: scales is not attached in this document, so a bare
  # `comma` would resolve to another attached package's function
  # (formattable exports one) instead of the axis labeller.
  scale_y_continuous(labels = scales::comma) +
  ggtitle("19년 요일별 유동인구 그래프", subtitle = ("기간 : 19.03 ~ 19.12")) +
  labs(x = "요일", y = "유동인구 수") +
  theme(
    axis.text.x = element_text(size = 20),
    axis.text.y = element_text(size = 20)
  )

   # scale_y_continuous(limits=c(min(Wday_grouping$Wday_Pop),
   #                            max(Wday_grouping$Wday_Pop)),
   #                            labels=scales::comma) +

유동인구 분석

Jun

2020-02-18

Overview

라이브러리 활성화

데이터 로드

데이터 가공 및 NA 파악

그룹화

시각화

1) 몇시에 유동인구 수가 가장 많을까?