library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# NOTE: the former `rm(list = ls())` was removed. Wiping the global
# environment from inside a script silently deletes the caller's objects and
# does not reset loaded packages or options, so it gives no real clean slate.
# Restart R for a fresh session instead.

# Print the working directory; the relative path used to read the data file
# below is resolved against it.
getwd()
## [1] "C:/data"
# Read the tab-delimited, EUC-KR encoded card-usage file from the working
# directory, then preview its columns.
subway_202210 <- read.delim(
  file = "CARD_SUBWAY_MONTH_202210.txt",
  fileEncoding = "euc-kr"
)
glimpse(subway_202210)
## Rows: 18,785
## Columns: 6
## $ 사용일자     <int> 20221001, 20221001, 20221001, 20221001, 20221001, 2022100…
## $ 노선명       <chr> "3호선", "3호선", "3호선", "3호선", "3호선", "3호선", "3…
## $ 역명         <chr> "고속터미널", "교대(법원.검찰청)", "학여울", "대청", "일…
## $ 승차총승객수 <int> 59124, 8040, 3355, 6517, 6231, 15481, 6913, 4490, 4155, 1…
## $ 하차총승객수 <int> 62989, 4875, 3401, 5926, 6025, 15390, 6566, 4231, 3923, 1…
## $ 등록일자     <int> 20221004, 20221004, 20221004, 20221004, 20221004, 2022100…
# Show the compact structure (dimensions and column types) of the raw data.
utils::str(subway_202210)
## 'data.frame':    18785 obs. of  6 variables:
##  $ 사용일자    : int  20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 ...
##  $ 노선명      : chr  "3호선" "3호선" "3호선" "3호선" ...
##  $ 역명        : chr  "고속터미널" "교대(법원.검찰청)" "학여울" "대청" ...
##  $ 승차총승객수: int  59124 8040 3355 6517 6231 15481 6913 4490 4155 10551 ...
##  $ 하차총승객수: int  62989 4875 3401 5926 6025 15390 6566 4231 3923 10189 ...
##  $ 등록일자    : int  20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 ...
# Give the columns English names, drop the "등록일자" column, and print
# per-column summary statistics of the result.
subway_202210 <- subway_202210 %>%
  rename(all_of(c(
    date = "사용일자",
    line = "노선명",
    station = "역명",
    on_pass = "승차총승객수",
    off_pass = "하차총승객수"
  ))) %>%
  select(-all_of("등록일자"))
summary(subway_202210)
##       date              line             station             on_pass     
##  Min.   :20221001   Length:18785       Length:18785       Min.   :    1  
##  1st Qu.:20221008   Class :character   Class :character   1st Qu.: 3802  
##  Median :20221016   Mode  :character   Mode  :character   Median : 7865  
##  Mean   :20221016                                         Mean   :10917  
##  3rd Qu.:20221024                                         3rd Qu.:14432  
##  Max.   :20221031                                         Max.   :95408  
##     off_pass     
##  Min.   :     0  
##  1st Qu.:  3615  
##  Median :  7580  
##  Mean   : 10875  
##  3rd Qu.: 14197  
##  Max.   :102651
# 2-1: mean of on_pass and of off_pass across all rows.
summarise(
  subway_202210,
  on_p = mean(on_pass),
  off_p = mean(off_pass)
)
##       on_p    off_p
## 1 10916.98 10875.09
# 2-2: row(s) whose on_pass equals the overall maximum on_pass.
filter(subway_202210, on_pass == max(on_pass))
##       date  line        station on_pass off_pass
## 1 20221028 2호선 잠실(송파구청)   95408    95061
# 2-3: three stations with the highest mean of (on_pass + off_pass) per row.
# Rows are grouped by station name only, so a station that appears under
# several lines is averaged across all of its rows.
subway_202210 %>%
  group_by(station) %>%
  summarise(m = mean(on_pass + off_pass)) %>%
  arrange(desc(m)) %>%
  head(3)
## # A tibble: 3 × 2
##   station              m
##   <chr>            <dbl>
## 1 강남           139260.
## 2 구로디지털단지 104601.
## 3 삼성(무역센터)  94918.
# 2-4: among rows of line "1호선", the row(s) with the largest
# on_pass + off_pass total.
subway_202210 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
##       date  line station on_pass off_pass total_pass
## 1 20221028 1호선  서울역   61206    60155     121361
# 2-5: number of rows recorded for each date value.
table(subway_202210[["date"]])
## 
## 20221001 20221002 20221003 20221004 20221005 20221006 20221007 20221008 
##      606      606      604      606      607      606      606      606 
## 20221009 20221010 20221011 20221012 20221013 20221014 20221015 20221016 
##      605      605      607      604      606      607      605      606 
## 20221017 20221018 20221019 20221020 20221021 20221022 20221023 20221024 
##      606      606      607      607      606      606      606      606 
## 20221025 20221026 20221027 20221028 20221029 20221030 20221031 
##      605      606      607      608      607      604      606
# Take characters 7-8 of the yyyymmdd date (the day of month) as a
# two-character string, then count rows per day.
subway_202210[["day"]] <- substr(as.character(subway_202210[["date"]]), 7, 8)
table(subway_202210[["day"]])
## 
##  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20 
## 606 606 604 606 607 606 606 606 605 605 607 604 606 607 605 606 606 606 607 607 
##  21  22  23  24  25  26  27  28  29  30  31 
## 606 606 606 606 605 606 607 608 607 604 606
# Convert the two-character day of month to a number.
subway_202210$day <- as.numeric(subway_202210$day)

# Label each row "weekend" or "weekday" from its actual calendar date rather
# than from a hard-coded list of day numbers, which was only valid for
# October 2022. POSIXlt's `wday` is 0 (Sunday) to 6 (Saturday) and does not
# depend on the session locale.
weekday_index <- as.POSIXlt(
  as.Date(as.character(subway_202210$date), format = "%Y%m%d")
)$wday
subway_202210$week <- ifelse(
  weekday_index %in% c(0, 6),
  "weekend",
  "weekday"
)
table(subway_202210$week)
## 
## weekday weekend 
##   12728    6057
# Strongly penalise scientific notation so printed numbers appear in fixed
# notation.
options(scipen = 999)

# Add the per-row total (on_pass + off_pass), then run a two-sample t-test of
# that total between the "weekday" and "weekend" groups. t.test's default is
# the Welch (unequal-variance) test.
subway_202210$total_pass <- with(subway_202210, on_pass + off_pass)
t.test(total_pass ~ week, data = subway_202210)
## 
##  Welch Two Sample t-test
## 
## data:  total_pass by week
## t = 23.347, df = 15046, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
##  6546.031 7745.939
## sample estimates:
## mean in group weekday mean in group weekend 
##              24096.21              16950.23