library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the October 2022 card-based subway ridership file (tab-delimited text,
# read with EUC-KR encoding) and inspect its structure.
# The former `rm(list = ls())` call was removed: it deleted every object in the
# caller's global environment whenever the script was sourced, while leaving
# attached packages and options untouched, so it never gave a clean session.
subway_202210 <- read.delim(
  "CARD_SUBWAY_MONTH_202210.txt",
  fileEncoding = "euc-kr"
)
glimpse(subway_202210)
## Rows: 18,785
## Columns: 6
## $ 사용일자     <int> 20221001, 20221001, 20221001, 20221001, 20221001, 2022100…
## $ 노선명       <chr> "3호선", "3호선", "3호선", "3호선", "3호선", "3호선", "3…
## $ 역명         <chr> "고속터미널", "교대(법원.검찰청)", "학여울", "대청", "일…
## $ 승차총승객수 <int> 59124, 8040, 3355, 6517, 6231, 15481, 6913, 4490, 4155, 1…
## $ 하차총승객수 <int> 62989, 4875, 3401, 5926, 6025, 15390, 6566, 4231, 3923, 1…
## $ 등록일자     <int> 20221004, 20221004, 20221004, 20221004, 20221004, 2022100…
str(subway_202210)
## 'data.frame':    18785 obs. of  6 variables:
##  $ 사용일자    : int  20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 ...
##  $ 노선명      : chr  "3호선" "3호선" "3호선" "3호선" ...
##  $ 역명        : chr  "고속터미널" "교대(법원.검찰청)" "학여울" "대청" ...
##  $ 승차총승객수: int  59124 8040 3355 6517 6231 15481 6913 4490 4155 10551 ...
##  $ 하차총승객수: int  62989 4875 3401 5926 6025 15390 6566 4231 3923 10189 ...
##  $ 등록일자    : int  20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 ...
# Keep the five analysis columns under English names. Selecting with
# `new = "old"` renames in the same step; the sixth column (등록일자) is not
# listed and is therefore dropped.
subway_202210 <- select(
  subway_202210,
  date = "사용일자",
  line = "노선명",
  station = "역명",
  on_pass = "승차총승객수",
  off_pass = "하차총승객수"
)
glimpse(subway_202210)
## Rows: 18,785
## Columns: 5
## $ date     <int> 20221001, 20221001, 20221001, 20221001, 20221001, 20221001, 2…
## $ line     <chr> "3호선", "3호선", "3호선", "3호선", "3호선", "3호선", "3호선"…
## $ station  <chr> "고속터미널", "교대(법원.검찰청)", "학여울", "대청", "일원", …
## $ on_pass  <int> 59124, 8040, 3355, 6517, 6231, 15481, 6913, 4490, 4155, 10551…
## $ off_pass <int> 62989, 4875, 3401, 5926, 6025, 15390, 6566, 4231, 3923, 10189…
# Per-column summary of the renamed frame.
summary(subway_202210)
##       date              line             station             on_pass     
##  Min.   :20221001   Length:18785       Length:18785       Min.   :    1  
##  1st Qu.:20221008   Class :character   Class :character   1st Qu.: 3802  
##  Median :20221016   Mode  :character   Mode  :character   Median : 7865  
##  Mean   :20221016                                         Mean   :10917  
##  3rd Qu.:20221024                                         3rd Qu.:14432  
##  Max.   :20221031                                         Max.   :95408  
##     off_pass     
##  Min.   :     0  
##  1st Qu.:  3615  
##  Median :  7580  
##  Mean   : 10875  
##  3rd Qu.: 14197  
##  Max.   :102651
# Mean boardings and alightings over all rows.
summarise(subway_202210, on_p = mean(on_pass), off_p = mean(off_pass))
##       on_p    off_p
## 1 10916.98 10875.09
# Row(s) with the largest boarding count.
filter(subway_202210, on_pass == max(on_pass))
##       date  line        station on_pass off_pass
## 1 20221028 2호선 잠실(송파구청)   95408    95061
# Three station names with the highest mean of boardings + alightings per row.
subway_202210 %>%
  group_by(station) %>%
  summarise(m = mean(on_pass + off_pass)) %>%
  arrange(desc(m)) %>%
  head(3)
## # A tibble: 3 × 2
##   station              m
##   <chr>            <dbl>
## 1 강남           139260.
## 2 구로디지털단지 104601.
## 3 삼성(무역센터)  94918.
# Within line "1호선", the row(s) with the largest boardings + alightings.
subway_202210 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
##       date  line station on_pass off_pass total_pass
## 1 20221028 1호선  서울역   61206    60155     121361
# Number of rows recorded for each service date.
table(subway_202210$date)
## 
## 20221001 20221002 20221003 20221004 20221005 20221006 20221007 20221008 
##      606      606      604      606      607      606      606      606 
## 20221009 20221010 20221011 20221012 20221013 20221014 20221015 20221016 
##      605      605      607      604      606      607      605      606 
## 20221017 20221018 20221019 20221020 20221021 20221022 20221023 20221024 
##      606      606      607      607      606      606      606      606 
## 20221025 20221026 20221027 20221028 20221029 20221030 20221031 
##      605      606      607      608      607      604      606
# Day of month = characters 7-8 of the YYYYMMDD value. The integer date is
# converted to text explicitly rather than relying on substr()'s coercion.
subway_202210$day <- substr(as.character(subway_202210$date), 7, 8)
table(subway_202210$day)
## 
##  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20 
## 606 606 604 606 607 606 606 606 605 605 607 604 606 607 605 606 606 606 607 607 
##  21  22  23  24  25  26  27  28  29  30  31 
## 606 606 606 606 605 606 607 608 607 604 606
# substr() yields character values, so convert day to numeric.
subway_202210$day <- as.numeric(subway_202210$day)
# Flag weekends from the actual calendar date instead of a hard-coded list of
# day-of-month values (which is only valid for one specific month).
# POSIXlt's wday is 0 for Sunday and 6 for Saturday; a date that fails to parse
# gives NA, which %in% treats as FALSE, so such a row falls back to "weekday".
subway_202210$week <- ifelse(
  as.POSIXlt(
    as.Date(as.character(subway_202210$date), format = "%Y%m%d")
  )$wday %in% c(0, 6),
  "weekend",
  "weekday"
)
table(subway_202210$week)
## 
## weekday weekend 
##   12728    6057
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Total passengers per row = boardings + alightings.
subway_202210$total_pass <- subway_202210$on_pass + subway_202210$off_pass
# Two-sample t-test of total_pass between the weekday and weekend groups.
t.test(total_pass ~ week, data = subway_202210)
## 
##  Welch Two Sample t-test
## 
## data:  total_pass by week
## t = 23.347, df = 15046, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
##  6546.031 7745.939
## sample estimates:
## mean in group weekday mean in group weekend 
##              24096.21              16950.23
library(foreign)
# Read the Koweps SPSS file; with default arguments read.spss() returns a
# list (confirmed by class() below), so it is converted to a data frame next.
koweps <- read.spss(file = "Koweps_h16_2021_beta1.sav")
## Warning in read.spss("Koweps_h16_2021_beta1.sav"): Koweps_h16_2021_beta1.sav:
## Compression bias (0) is not the usual value of 100
## Warning in read.spss("Koweps_h16_2021_beta1.sav"): Koweps_h16_2021_beta1.sav:
## Very long string record(s) found (record type 7, subtype 14), each will be
## imported in consecutive separate variables
class(koweps)
## [1] "list"
koweps_21 <- as.data.frame(koweps)
# Keep only the six variables used in the analysis, in this order.
house <- koweps_21[, c(
  "h1601_4", "h1601_5", "h1601_6",
  "h16_reg5", "h1608_114", "h1608_122"
)]
str(house)
## 'data.frame':    5996 obs. of  6 variables:
##  $ h1601_4  : num  2 1 1 1 2 2 1 1 1 1 ...
##  $ h1601_5  : num  1945 1948 1942 1962 1940 ...
##  $ h1601_6  : num  4 3 7 6 3 5 4 6 7 5 ...
##  $ h16_reg5 : num  1 1 1 1 3 1 1 1 1 1 ...
##  $ h1608_114: num  NA NA NA 4392 NA ...
##  $ h1608_122: num  NA 1980 621 NA 324 NA NA 285 1500 NA ...
library(dplyr)
# Give the survey variables readable names via a new-name = old-name lookup.
# NOTE(review): the meanings implied by the new names (gender, birth year,
# education, region, salaries) are the author's reading — confirm against the
# Koweps codebook.
house1 <- house %>%
  rename(all_of(c(
    gender = "h1601_4",
    birth = "h1601_5",
    edu = "h1601_6",
    region = "h16_reg5",
    r_salary = "h1608_114",
    t_salary = "h1608_122"
  )))

summary(house1)
##      gender          birth           edu            region         r_salary    
##  Min.   :1.000   Min.   :1922   Min.   :2.000   Min.   :1.000   Min.   :    0  
##  1st Qu.:1.000   1st Qu.:1942   1st Qu.:3.000   1st Qu.:2.000   1st Qu.: 3280  
##  Median :1.000   Median :1955   Median :5.000   Median :3.000   Median : 4620  
##  Mean   :1.357   Mean   :1957   Mean   :4.635   Mean   :2.702   Mean   : 5250  
##  3rd Qu.:2.000   3rd Qu.:1970   3rd Qu.:6.000   3rd Qu.:3.000   3rd Qu.: 6620  
##  Max.   :2.000   Max.   :2001   Max.   :9.000   Max.   :5.000   Max.   :85860  
##                                                                 NA's   :4566   
##     t_salary    
##  Min.   :    0  
##  1st Qu.:  297  
##  Median : 1040  
##  Mean   : 1552  
##  3rd Qu.: 2340  
##  Max.   :14580  
##  NA's   :4384
# Treat a recorded salary of 0 as missing. %in% never returns NA, so rows that
# are already NA are left untouched and the logical index is always valid.
house1$r_salary[house1$r_salary %in% 0] <- NA
house1$t_salary[house1$t_salary %in% 0] <- NA
# Age counted as (2021 - birth year) + 1.
house1$age <- 2021 - house1$birth + 1
range(house1$age)
## [1]  21 100
table(house1$edu)
## 
##    2    3    4    5    6    7    8    9 
##  562 1358  810 1635  462  975  163   31
# Collapse the education codes into four grades. Every grade lists its codes
# explicitly; a missing or unexpected code stays NA instead of silently
# falling into the top grade, as it did with a catch-all final branch.
house1$edu_grade <- case_when(
  house1$edu %in% 2:4 ~ "중학이하",
  house1$edu %in% 5 ~ "고교",
  house1$edu %in% 6 ~ "전문대",
  house1$edu %in% 7:9 ~ "대학이상",
  TRUE ~ NA_character_
)
table(house1$edu_grade)
## 
##     고교 대학이상   전문대 중학이하 
##     1635     1169      462     2730
# Row count per region code.
table(house1$region)
## 
##    1    2    3    4    5 
##  795 1612 2337 1089  163
# Lookup table from region code to label.
# NOTE(review): labels for codes 4 and 5 were changed from "구" / "도농복합구"
# to "군" / "도농복합군", which is believed to be the wording of the Koweps
# 5-region classification — confirm against the codebook. Output recorded in
# this file before the change still shows the old labels.
region_name <- data.frame(
  region = c(1, 2, 3, 4, 5),
  region1 = c("서울", "광역시", "시", "군", "도농복합군")
)

# Attach the label to every household; rows with no matching code get NA.
house1 <- left_join(house1, region_name, by = "region")
str(house1)
## 'data.frame':    5996 obs. of  9 variables:
##  $ gender   : num  2 1 1 1 2 2 1 1 1 1 ...
##  $ birth    : num  1945 1948 1942 1962 1940 ...
##  $ edu      : num  4 3 7 6 3 5 4 6 7 5 ...
##  $ region   : num  1 1 1 1 3 1 1 1 1 1 ...
##  $ r_salary : num  NA NA NA 4392 NA ...
##  $ t_salary : num  NA 1980 621 NA 324 NA NA 285 1500 NA ...
##  $ age      : num  77 74 80 60 82 52 82 60 44 81 ...
##  $ edu_grade: chr  "중학이하" "중학이하" "대학이상" "전문대" ...
##  $ region1  : chr  "서울" "서울" "서울" "서울" ...
glimpse(house1)
## Rows: 5,996
## Columns: 9
## $ gender    <dbl> 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, …
## $ birth     <dbl> 1945, 1948, 1942, 1962, 1940, 1970, 1940, 1962, 1978, 1941, …
## $ edu       <dbl> 4, 3, 7, 6, 3, 5, 4, 6, 7, 5, 3, 7, 4, 5, 4, 7, 7, 3, 3, 7, …
## $ region    <dbl> 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, …
## $ r_salary  <dbl> NA, NA, NA, 4392, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ t_salary  <dbl> NA, 1980, 621, NA, 324, NA, NA, 285, 1500, NA, NA, 2400, 279…
## $ age       <dbl> 77, 74, 80, 60, 82, 52, 82, 60, 44, 81, 58, 47, 61, 70, 61, …
## $ edu_grade <chr> "중학이하", "중학이하", "대학이상", "전문대", "중학이하", "…
## $ region1   <chr> "서울", "서울", "서울", "서울", "시", "서울", "서울", "서울"…
# Within each gender code, the row(s) holding the largest non-missing r_salary.
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(gender) %>%
  filter(r_salary == max(r_salary))
## # A tibble: 2 × 9
## # Groups:   gender [2]
##   gender birth   edu region r_salary t_salary   age edu_grade region1
##    <dbl> <dbl> <dbl>  <dbl>    <dbl>    <dbl> <dbl> <chr>     <chr>  
## 1      1  1992     7      2    85860       NA    30 대학이상  광역시 
## 2      2  1997     7      3    37260       NA    25 대학이상  시
# Three ages with the highest mean r_salary (rows with missing salary excluded).
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(age) %>%
  summarise(m = mean(r_salary)) %>%
  arrange(desc(m)) %>%
  head(3)
## # A tibble: 3 × 2
##     age     m
##   <dbl> <dbl>
## 1    76 9126 
## 2    30 8108.
## 3    51 7000.
# Look up the row behind the top age group found above.
filter(house1, age == 76, r_salary == 9126)
##   gender birth edu region r_salary t_salary age edu_grade region1
## 1      1  1946   4      3     9126       NA  76  중학이하      시
# Mean r_salary for every gender x education-grade cell, highest first.
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(gender, edu_grade) %>%
  summarise(m = mean(r_salary)) %>%
  arrange(desc(m))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 3
## # Groups:   gender [2]
##   gender edu_grade     m
##    <dbl> <chr>     <dbl>
## 1      1 대학이상  6367.
## 2      1 전문대    4951.
## 3      1 고교      4841.
## 4      2 대학이상  4275.
## 5      2 고교      3135.
## 6      1 중학이하  2868.
## 7      2 전문대    2697.
## 8      2 중학이하  1954
# Mean t_salary per region label, highest first (missing salaries excluded).
house1 %>%
  filter(!is.na(t_salary)) %>%
  group_by(region1) %>%
  summarise(m = mean(t_salary)) %>%
  arrange(desc(m))
## # A tibble: 5 × 2
##   region1        m
##   <chr>      <dbl>
## 1 서울       1853.
## 2 시         1693.
## 3 광역시     1591.
## 4 도농복합구 1137.
## 5 구          959.