221203

# Attach dplyr, whose verbs (rename, select, filter, mutate, summarise, ...)
# are used throughout this script.
# The global workspace is deliberately NOT cleared here: wiping it would delete
# the caller's own objects whenever this script is sourced. Run the script in a
# fresh R session if a clean environment is needed.
library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Show the working directory; the relative data-file names passed to the read
# calls in this script are resolved against it.
base::getwd()

## [1] "C:/data"

# Read the March 2022 card-usage CSV (EUC-KR encoded) and preview its columns.
subway_202203 <- read.csv(
  file = "CARD_SUBWAY_MONTH_202203.csv",
  fileEncoding = "euc-kr"
)
glimpse(subway_202203)

## Rows: 18,467
## Columns: 6
## $ 사용일자     <int> 20220301, 20220301, 20220301, 20220301, 20220301, 2022030…
## $ 노선명       <chr> "장항선", "장항선", "장항선", "안산선", "안산선", "우이신…
## $ 역명         <chr> "배방", "온양온천", "신창(순천향대)", "오이도", "수리산",…
## $ 승차총승객수 <int> 593, 2388, 1065, 4789, 1892, 2122, 1360, 1836, 2211, 1899…
## $ 하차총승객수 <int> 698, 2517, 1164, 4668, 1693, 2228, 1331, 1663, 2122, 1814…
## $ 등록일자     <int> 20220304, 20220304, 20220304, 20220304, 20220304, 2022030…

# Compact structure listing (column types and first values) of the raw data.
utils::str(subway_202203)

## 'data.frame':    18467 obs. of  6 variables:
##  $ 사용일자    : int  20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 ...
##  $ 노선명      : chr  "장항선" "장항선" "장항선" "안산선" ...
##  $ 역명        : chr  "배방" "온양온천" "신창(순천향대)" "오이도" ...
##  $ 승차총승객수: int  593 2388 1065 4789 1892 2122 1360 1836 2211 1899 ...
##  $ 하차총승객수: int  698 2517 1164 4668 1693 2228 1331 1663 2122 1814 ...
##  $ 등록일자    : int  20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 ...

# Drop the "등록일자" column, give the remaining five columns short English
# names, then print per-column summary statistics.
subway_202203 <- subway_202203 %>%
  select(-"등록일자") %>%
  rename(
    date = "사용일자",
    line = "노선명",
    station = "역명",
    on_pass = "승차총승객수",
    off_pass = "하차총승객수"
  )
summary(subway_202203)

##       date              line             station             on_pass     
##  Min.   :20220301   Length:18467       Length:18467       Min.   :    1  
##  1st Qu.:20220308   Class :character   Class :character   1st Qu.: 3078  
##  Median :20220316   Mode  :character   Mode  :character   Median : 6334  
##  Mean   :20220316                                         Mean   : 8852  
##  3rd Qu.:20220324                                         3rd Qu.:11838  
##  Max.   :20220331                                         Max.   :80279  
##     off_pass    
##  Min.   :    0  
##  1st Qu.: 2989  
##  Median : 6229  
##  Mean   : 8823  
##  3rd Qu.:11742  
##  Max.   :78816

# 1. Mean of on_pass and of off_pass over every row of the March data.
summarise(
  subway_202203,
  on_p = mean(on_pass),
  off_p = mean(off_pass)
)

##       on_p    off_p
## 1 8851.886 8822.759

# 2. Row(s) whose on_pass equals the overall maximum on_pass.
filter(subway_202203, on_pass == max(on_pass))

##       date  line station on_pass off_pass
## 1 20220325 2호선    강남   80279    78816

# 3. Three stations with the highest mean of (on_pass + off_pass), where the
#    mean is taken over all rows sharing the same station name.
subway_202203 %>%
  mutate(total_pass = on_pass + off_pass) %>%
  group_by(station) %>%
  summarise(m = mean(total_pass)) %>%
  arrange(desc(m)) %>%
  head(3)

## # A tibble: 3 × 2
##   station              m
##   <chr>            <dbl>
## 1 강남           125027.
## 2 신림           101545.
## 3 구로디지털단지  88652.

# 4. Within line "1호선", the row(s) with the largest on_pass + off_pass.
subway_202203 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))

##       date  line station on_pass off_pass total_pass
## 1 20220325 1호선  서울역   41104    41346      82450

# 5. Number of rows recorded for each value of date.
table(subway_202203[["date"]])

## 
## 20220301 20220302 20220303 20220304 20220305 20220306 20220307 20220308 
##      593      598      597      598      595      599      597      599 
## 20220309 20220310 20220311 20220312 20220313 20220314 20220315 20220316 
##      595      597      597      595      594      597      594      596 
## 20220317 20220318 20220319 20220320 20220321 20220322 20220323 20220324 
##      597      597      594      595      596      593      594      594 
## 20220325 20220326 20220327 20220328 20220329 20220330 20220331 
##      595      597      595      595      595      593      596

# Characters 7-8 of the yyyymmdd date are the two-digit day of month; the new
# column is still text at this point (e.g. "01").
subway_202203 <- subway_202203 %>%
  mutate(day = substr(date, 7, 8))
table(subway_202203[["day"]])

## 
##  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20 
## 593 598 597 598 595 599 597 599 595 597 597 595 594 597 594 596 597 597 594 595 
##  21  22  23  24  25  26  27  28  29  30  31 
## 596 593 594 594 595 597 595 595 595 593 596

# Convert the day-of-month text to a number and flag each row as "weekend" or
# "weekday".
# The flag is derived from the calendar date itself rather than from a
# hand-maintained list of day numbers, so it stays correct for any month:
# as.POSIXlt()$wday is 0 for Sunday and 6 for Saturday.
subway_202203 <- subway_202203 %>%
  mutate(
    day = as.numeric(day),
    week = ifelse(
      as.POSIXlt(as.Date(as.character(date), format = "%Y%m%d"))$wday %in%
        c(0, 6),
      "weekend",
      "weekday"
    )
  )
table(subway_202203$week)

## 
## weekday weekend 
##   13703    4764

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Combined on_pass + off_pass count per row, then a two-sample t-test of that
# total between the "weekday" and "weekend" groups.
subway_202203$total_pass <- subway_202203$on_pass + subway_202203$off_pass
t.test(total_pass ~ week, data = subway_202203)

## 
##  Welch Two Sample t-test
## 
## data:  total_pass by week
## t = 32.794, df = 12509, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
##  7342.155 8275.667
## sample estimates:
## mean in group weekday mean in group weekend 
##              19689.14              11880.23

# 2. Apply the same analysis to the October 2022 subway data.
# The workspace is intentionally not cleared between sections: clearing it
# would also delete any unrelated objects the user has in the session, and the
# objects created below use their own distinct names.
getwd()

## [1] "C:/data"

# Read the October 2022 card-usage text file (EUC-KR encoded, read.delim
# defaults) and preview its columns.
subway_202210 <- read.delim(
  file = "CARD_SUBWAY_MONTH_202210.txt",
  fileEncoding = "euc-kr"
)
glimpse(subway_202210)

## Rows: 18,785
## Columns: 6
## $ 사용일자     <int> 20221001, 20221001, 20221001, 20221001, 20221001, 2022100…
## $ 노선명       <chr> "3호선", "3호선", "3호선", "3호선", "3호선", "3호선", "3…
## $ 역명         <chr> "고속터미널", "교대(법원.검찰청)", "학여울", "대청", "일…
## $ 승차총승객수 <int> 59124, 8040, 3355, 6517, 6231, 15481, 6913, 4490, 4155, 1…
## $ 하차총승객수 <int> 62989, 4875, 3401, 5926, 6025, 15390, 6566, 4231, 3923, 1…
## $ 등록일자     <int> 20221004, 20221004, 20221004, 20221004, 20221004, 2022100…

# Compact structure listing (column types and first values) of the raw data.
utils::str(subway_202210)

## 'data.frame':    18785 obs. of  6 variables:
##  $ 사용일자    : int  20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 ...
##  $ 노선명      : chr  "3호선" "3호선" "3호선" "3호선" ...
##  $ 역명        : chr  "고속터미널" "교대(법원.검찰청)" "학여울" "대청" ...
##  $ 승차총승객수: int  59124 8040 3355 6517 6231 15481 6913 4490 4155 10551 ...
##  $ 하차총승객수: int  62989 4875 3401 5926 6025 15390 6566 4231 3923 10189 ...
##  $ 등록일자    : int  20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 ...

# Drop the "등록일자" column, give the remaining five columns short English
# names, then print per-column summary statistics.
subway_202210 <- subway_202210 %>%
  select(-"등록일자") %>%
  rename(
    date = "사용일자",
    line = "노선명",
    station = "역명",
    on_pass = "승차총승객수",
    off_pass = "하차총승객수"
  )
summary(subway_202210)

##       date              line             station             on_pass     
##  Min.   :20221001   Length:18785       Length:18785       Min.   :    1  
##  1st Qu.:20221008   Class :character   Class :character   1st Qu.: 3802  
##  Median :20221016   Mode  :character   Mode  :character   Median : 7865  
##  Mean   :20221016                                         Mean   :10917  
##  3rd Qu.:20221024                                         3rd Qu.:14432  
##  Max.   :20221031                                         Max.   :95408  
##     off_pass     
##  Min.   :     0  
##  1st Qu.:  3615  
##  Median :  7580  
##  Mean   : 10875  
##  3rd Qu.: 14197  
##  Max.   :102651

# 2-1. Mean of on_pass and of off_pass over every row of the October data.
summarise(
  subway_202210,
  on_p = mean(on_pass),
  off_p = mean(off_pass)
)

##       on_p    off_p
## 1 10916.98 10875.09

# 2-2. Row(s) whose on_pass equals the overall maximum on_pass.
filter(subway_202210, on_pass == max(on_pass))

##       date  line        station on_pass off_pass
## 1 20221028 2호선 잠실(송파구청)   95408    95061

# 2-3. Three stations with the highest mean of (on_pass + off_pass), where the
#      mean is taken over all rows sharing the same station name.
subway_202210 %>%
  mutate(total_pass = on_pass + off_pass) %>%
  group_by(station) %>%
  summarise(m = mean(total_pass)) %>%
  arrange(desc(m)) %>%
  head(3)

## # A tibble: 3 × 2
##   station              m
##   <chr>            <dbl>
## 1 강남           139260.
## 2 구로디지털단지 104601.
## 3 삼성(무역센터)  94918.

# 2-4. Within line "1호선", the row(s) with the largest on_pass + off_pass.
subway_202210 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))

##       date  line station on_pass off_pass total_pass
## 1 20221028 1호선  서울역   61206    60155     121361

# 2-5. Number of rows recorded for each value of date.
table(subway_202210[["date"]])

## 
## 20221001 20221002 20221003 20221004 20221005 20221006 20221007 20221008 
##      606      606      604      606      607      606      606      606 
## 20221009 20221010 20221011 20221012 20221013 20221014 20221015 20221016 
##      605      605      607      604      606      607      605      606 
## 20221017 20221018 20221019 20221020 20221021 20221022 20221023 20221024 
##      606      606      607      607      606      606      606      606 
## 20221025 20221026 20221027 20221028 20221029 20221030 20221031 
##      605      606      607      608      607      604      606

# Characters 7-8 of the yyyymmdd date are the two-digit day of month; the new
# column is still text at this point (e.g. "01").
subway_202210 <- subway_202210 %>%
  mutate(day = substr(date, 7, 8))
table(subway_202210[["day"]])

## 
##  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20 
## 606 606 604 606 607 606 606 606 605 605 607 604 606 607 605 606 606 606 607 607 
##  21  22  23  24  25  26  27  28  29  30  31 
## 606 606 606 606 605 606 607 608 607 604 606

# Convert the day-of-month text to a number and flag each row as "weekend" or
# "weekday".
# The flag is derived from the calendar date itself rather than from a
# hand-maintained list of day numbers, so it stays correct for any month:
# as.POSIXlt()$wday is 0 for Sunday and 6 for Saturday.
subway_202210 <- subway_202210 %>%
  mutate(
    day = as.numeric(day),
    week = ifelse(
      as.POSIXlt(as.Date(as.character(date), format = "%Y%m%d"))$wday %in%
        c(0, 6),
      "weekend",
      "weekday"
    )
  )
table(subway_202210$week)

## 
## weekday weekend 
##   12728    6057

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Combined on_pass + off_pass count per row, then a two-sample t-test of that
# total between the "weekday" and "weekend" groups.
subway_202210$total_pass <- subway_202210$on_pass + subway_202210$off_pass
t.test(total_pass ~ week, data = subway_202210)

## 
##  Welch Two Sample t-test
## 
## data:  total_pass by week
## t = 23.347, df = 15046, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
##  6546.031 7745.939
## sample estimates:
## mean in group weekday mean in group weekend 
##              24096.21              16950.23

# 3. Read the KOWEPS SPSS file with foreign::read.spss(); with the default
#    arguments used here the result is a list, not a data frame.
library(foreign)
koweps <- read.spss(file = "koweps_h16_2021_beta1.sav")

## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Compression bias (0) is not the usual value of 100

## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Very long string record(s) found (record type 7, subtype 14), each will be
## imported in consecutive separate variables

# Confirm the type of the object returned by read.spss().
class(x = koweps)

## [1] "list"

# Convert the imported list to a data frame and keep only the six variables
# used in the analysis below, then inspect their types.
korwps_21 <- as.data.frame(koweps)
house <- select(
  korwps_21,
  h1601_4,
  h1601_5,
  h1601_6,
  h16_reg5,
  h1608_114,
  h1608_122
)
str(house)

## 'data.frame':    5996 obs. of  6 variables:
##  $ h1601_4  : num  2 1 1 1 2 2 1 1 1 1 ...
##  $ h1601_5  : num  1945 1948 1942 1962 1940 ...
##  $ h1601_6  : num  4 3 7 6 3 5 4 6 7 5 ...
##  $ h16_reg5 : num  1 1 1 1 3 1 1 1 1 1 ...
##  $ h1608_114: num  NA NA NA 4392 NA ...
##  $ h1608_122: num  NA 1980 621 NA 324 NA NA 285 1500 NA ...

library(dplyr)
# Give the six survey variables readable names and print summary statistics
# (including the NA counts of the two salary columns).
house1 <- rename(
  house,
  gender = h1601_4,
  birth = h1601_5,
  edu = h1601_6,
  region = h16_reg5,
  r_salary = h1608_114,
  t_salary = h1608_122
)
summary(house1)

##      gender          birth           edu            region         r_salary    
##  Min.   :1.000   Min.   :1922   Min.   :2.000   Min.   :1.000   Min.   :    0  
##  1st Qu.:1.000   1st Qu.:1942   1st Qu.:3.000   1st Qu.:2.000   1st Qu.: 3280  
##  Median :1.000   Median :1955   Median :5.000   Median :3.000   Median : 4620  
##  Mean   :1.357   Mean   :1957   Mean   :4.635   Mean   :2.702   Mean   : 5250  
##  3rd Qu.:2.000   3rd Qu.:1970   3rd Qu.:6.000   3rd Qu.:3.000   3rd Qu.: 6620  
##  Max.   :2.000   Max.   :2001   Max.   :9.000   Max.   :5.000   Max.   :85860  
##                                                                 NA's   :4566   
##     t_salary    
##  Min.   :    0  
##  1st Qu.:  297  
##  Median : 1040  
##  Mean   : 1552  
##  3rd Qu.: 2340  
##  Max.   :14580  
##  NA's   :4384

# Treat a salary of exactly 0 as missing in both salary columns, and compute
# age as (2021 - birth year + 1); then show the resulting age range.
house1 <- house1 %>%
  mutate(
    r_salary = ifelse(r_salary == 0, NA, r_salary),
    t_salary = ifelse(t_salary == 0, NA, t_salary),
    age = 2021 - birth + 1
  )
range(house1$age)

## [1]  21 100

# Frequency of each education code before it is grouped.
table(house1[["edu"]])

## 
##    2    3    4    5    6    7    8    9 
##  562 1358  810 1635  462  975  163   31

# Collapse the education code into four groups.
# Every group lists its codes explicitly. Previously the final branch was a
# catch-all, so any code outside 2-6 (for example 1) was silently labelled
# "대학이상". Codes not listed here now become NA so they are visible instead
# of being misclassified; missing edu values stay NA as before.
house1 <- house1 %>%
  mutate(
    edu_grade = case_when(
      edu %in% c(2, 3, 4) ~ "중학이하",
      edu == 5 ~ "고교",
      edu == 6 ~ "전문대",
      edu %in% c(7, 8, 9) ~ "대학이상",
      TRUE ~ NA_character_
    )
  )

table(house1$edu_grade)

## 
##     고교 대학이상   전문대 중학이하 
##     1635     1169      462     2730

# Frequency of each region code before labels are attached.
table(house1[["region"]])

## 
##    1    2    3    4    5 
##  795 1612 2337 1089  163

# Lookup table mapping the numeric region code (1-5) to a text label, joined
# onto house1 as the new column region1.
# NOTE(review): the labels for codes 4 and 5 ("구", "도농복합구") look like
# possible typos for "군" / "도농복합군" - confirm against the survey codebook.
region_name <- data.frame(
  region = as.numeric(1:5),
  region1 = c("서울", "광역시", "시", "구", "도농복합구")
)
house1 <- house1 %>%
  left_join(region_name, by = "region")
str(house1)

## 'data.frame':    5996 obs. of  9 variables:
##  $ gender   : num  2 1 1 1 2 2 1 1 1 1 ...
##  $ birth    : num  1945 1948 1942 1962 1940 ...
##  $ edu      : num  4 3 7 6 3 5 4 6 7 5 ...
##  $ region   : num  1 1 1 1 3 1 1 1 1 1 ...
##  $ r_salary : num  NA NA NA 4392 NA ...
##  $ t_salary : num  NA 1980 621 NA 324 NA NA 285 1500 NA ...
##  $ age      : num  77 74 80 60 82 52 82 60 44 81 ...
##  $ edu_grade: chr  "중학이하" "중학이하" "대학이상" "전문대" ...
##  $ region1  : chr  "서울" "서울" "서울" "서울" ...

# Column-wise preview of the prepared analysis table.
dplyr::glimpse(house1)

## Rows: 5,996
## Columns: 9
## $ gender    <dbl> 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, …
## $ birth     <dbl> 1945, 1948, 1942, 1962, 1940, 1970, 1940, 1962, 1978, 1941, …
## $ edu       <dbl> 4, 3, 7, 6, 3, 5, 4, 6, 7, 5, 3, 7, 4, 5, 4, 7, 7, 3, 3, 7, …
## $ region    <dbl> 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, …
## $ r_salary  <dbl> NA, NA, NA, 4392, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ t_salary  <dbl> NA, 1980, 621, NA, 324, NA, NA, 285, 1500, NA, NA, 2400, 279…
## $ age       <dbl> 77, 74, 80, 60, 82, 52, 82, 60, 44, 81, 58, 47, 61, 70, 61, …
## $ edu_grade <chr> "중학이하", "중학이하", "대학이상", "전문대", "중학이하", "…
## $ region1   <chr> "서울", "서울", "서울", "서울", "시", "서울", "서울", "서울"…

# 4-1. For each gender code, the row(s) holding the highest non-missing
#      r_salary. The result is left grouped by gender.
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(gender) %>%
  filter(r_salary == max(r_salary))

## # A tibble: 2 × 9
## # Groups:   gender [2]
##   gender birth   edu region r_salary t_salary   age edu_grade region1
##    <dbl> <dbl> <dbl>  <dbl>    <dbl>    <dbl> <dbl> <chr>     <chr>  
## 1      1  1992     7      2    85860       NA    30 대학이상  광역시 
## 2      2  1997     7      3    37260       NA    25 대학이상  시

# 4-2. Rows where age is 76 and r_salary is exactly 9126.
filter(house1, age == 76, r_salary == 9126)

##   gender birth edu region r_salary t_salary age edu_grade region1
## 1      1  1946   4      3     9126       NA  76  중학이하      시

# 4-3. Mean non-missing r_salary for each gender x edu_grade combination,
#      sorted from highest to lowest mean.
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(gender, edu_grade) %>%
  summarise(m = mean(r_salary)) %>%
  arrange(desc(m))

## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.

## # A tibble: 8 × 3
## # Groups:   gender [2]
##   gender edu_grade     m
##    <dbl> <chr>     <dbl>
## 1      1 대학이상  6367.
## 2      1 전문대    4951.
## 3      1 고교      4841.
## 4      2 대학이상  4275.
## 5      2 고교      3135.
## 6      1 중학이하  2868.
## 7      2 전문대    2697.
## 8      2 중학이하  1954

# 4-4. Mean non-missing t_salary per region label, highest first.
house1 %>%
  filter(!is.na(t_salary)) %>%
  group_by(region1) %>%
  summarise(m = mean(t_salary)) %>%
  arrange(desc(m))

## # A tibble: 5 × 2
##   region1        m
##   <chr>      <dbl>
## 1 서울       1853.
## 2 시         1693.
## 3 광역시     1591.
## 4 도농복합구 1137.
## 5 구          959.

221203_2

이동건

2022-12-03