rm(list=ls())
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
getwd()
## [1] "C:/data"
# Read the March 2022 card-usage file (an EUC-KR encoded CSV located in the
# working directory) and take a first look at its columns.
subway_202203 <- read.csv(
  file = "CARD_SUBWAY_MONTH_202203.csv",
  fileEncoding = "euc-kr"
)
glimpse(subway_202203)
## Rows: 18,467
## Columns: 6
## $ 사용일자 <int> 20220301, 20220301, 20220301, 20220301, 20220301, 2022030…
## $ 노선명 <chr> "장항선", "장항선", "장항선", "안산선", "안산선", "우이신…
## $ 역명 <chr> "배방", "온양온천", "신창(순천향대)", "오이도", "수리산",…
## $ 승차총승객수 <int> 593, 2388, 1065, 4789, 1892, 2122, 1360, 1836, 2211, 1899…
## $ 하차총승객수 <int> 698, 2517, 1164, 4668, 1693, 2228, 1331, 1663, 2122, 1814…
## $ 등록일자 <int> 20220304, 20220304, 20220304, 20220304, 20220304, 2022030…
str(subway_202203)
## 'data.frame': 18467 obs. of 6 variables:
## $ 사용일자 : int 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 ...
## $ 노선명 : chr "장항선" "장항선" "장항선" "안산선" ...
## $ 역명 : chr "배방" "온양온천" "신창(순천향대)" "오이도" ...
## $ 승차총승객수: int 593 2388 1065 4789 1892 2122 1360 1836 2211 1899 ...
## $ 하차총승객수: int 698 2517 1164 4668 1693 2228 1331 1663 2122 1814 ...
## $ 등록일자 : int 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 ...
# Replace the Korean column headers with short English names (new = "old")
# and drop the registration-date column, which is not used afterwards.
subway_202203 <- subway_202203 %>%
  rename(all_of(c(
    date     = "사용일자",
    line     = "노선명",
    station  = "역명",
    on_pass  = "승차총승객수",
    off_pass = "하차총승객수"
  ))) %>%
  select(-all_of("등록일자"))
summary(subway_202203)
## date line station on_pass
## Min. :20220301 Length:18467 Length:18467 Min. : 1
## 1st Qu.:20220308 Class :character Class :character 1st Qu.: 3078
## Median :20220316 Mode :character Mode :character Median : 6334
## Mean :20220316 Mean : 8852
## 3rd Qu.:20220324 3rd Qu.:11838
## Max. :20220331 Max. :80279
## off_pass
## Min. : 0
## 1st Qu.: 2989
## Median : 6229
## Mean : 8823
## 3rd Qu.:11742
## Max. :78816
# 1. Mean of on_pass (on_p) and of off_pass (off_p) over every row.
summarise(
  subway_202203,
  on_p = mean(on_pass),
  off_p = mean(off_pass)
)
## on_p off_p
## 1 8851.886 8822.759
# 2. Row(s) whose on_pass equals the largest on_pass in the data.
filter(subway_202203, on_pass == max(on_pass))
## date line station on_pass off_pass
## 1 20220325 2호선 강남 80279 78816
# 3. Three station names with the highest mean total traffic, where the
#    total of a row is on_pass + off_pass and the mean is taken over all rows
#    that share the station name.
subway_202203 %>%
  mutate(total_pass = on_pass + off_pass) %>%
  group_by(station) %>%
  summarise(m = mean(total_pass)) %>%
  arrange(desc(m)) %>%
  head(3)
## # A tibble: 3 × 2
## station m
## <chr> <dbl>
## 1 강남 125027.
## 2 신림 101545.
## 3 구로디지털단지 88652.
# 4. Among rows on line "1호선", the row(s) with the largest
#    on_pass + off_pass.
subway_202203 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
## date line station on_pass off_pass total_pass
## 1 20220325 1호선 서울역 41104 41346 82450
#5
table(subway_202203$date)
##
## 20220301 20220302 20220303 20220304 20220305 20220306 20220307 20220308
## 593 598 597 598 595 599 597 599
## 20220309 20220310 20220311 20220312 20220313 20220314 20220315 20220316
## 595 597 597 595 594 597 594 596
## 20220317 20220318 20220319 20220320 20220321 20220322 20220323 20220324
## 597 597 594 595 596 593 594 594
## 20220325 20220326 20220327 20220328 20220329 20220330 20220331
## 595 597 595 595 595 593 596
# Characters 7-8 of the yyyymmdd value are the day of month ("01".."31");
# store them as a character column and count rows per day.
subway_202203[["day"]] <- substring(
  subway_202203[["date"]],
  first = 7,
  last = 8
)
table(subway_202203[["day"]])
##
## 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20
## 593 598 597 598 595 599 597 599 595 597 597 595 594 597 594 596 597 597 594 595
## 21 22 23 24 25 26 27 28 29 30 31
## 596 593 594 594 595 597 595 595 595 593 596
# Convert the day-of-month strings ("01".."31") to numbers.
subway_202203$day <- as.numeric(subway_202203$day)
# Flag each row as "weekend" or "weekday" from the actual calendar date
# instead of a hand-typed list of day numbers, so the rule stays correct for
# any month. The yyyymmdd integer is parsed as a Date; POSIXlt's `wday` is
# 0 for Sunday and 6 for Saturday regardless of locale. For March 2022 this
# selects the same days as the former list (5, 6, 12, 13, 19, 20, 26, 27).
subway_202203$week <- ifelse(
  as.POSIXlt(
    as.Date(as.character(subway_202203$date), format = "%Y%m%d")
  )$wday %in% c(0, 6),
  "weekend",
  "weekday"
)
table(subway_202203$week)
##
## weekday weekend
## 13703 4764
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Add total traffic per row, then compare its mean between the two `week`
# groups with t.test()'s formula interface.
subway_202203$total_pass <- subway_202203$on_pass + subway_202203$off_pass
t.test(total_pass ~ week, data = subway_202203)
##
## Welch Two Sample t-test
##
## data: total_pass by week
## t = 32.794, df = 12509, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
## 7342.155 8275.667
## sample estimates:
## mean in group weekday mean in group weekend
## 19689.14 11880.23
# Part 2: the same analysis applied to the October 2022 subway data
rm(list=ls())
getwd()
## [1] "C:/data"
# Read the October 2022 card-usage text file (EUC-KR encoded, parsed with
# read.delim's defaults) and take a first look at its columns.
subway_202210 <- read.delim(
  file = "CARD_SUBWAY_MONTH_202210.txt",
  fileEncoding = "euc-kr"
)
glimpse(subway_202210)
## Rows: 18,785
## Columns: 6
## $ 사용일자 <int> 20221001, 20221001, 20221001, 20221001, 20221001, 2022100…
## $ 노선명 <chr> "3호선", "3호선", "3호선", "3호선", "3호선", "3호선", "3…
## $ 역명 <chr> "고속터미널", "교대(법원.검찰청)", "학여울", "대청", "일…
## $ 승차총승객수 <int> 59124, 8040, 3355, 6517, 6231, 15481, 6913, 4490, 4155, 1…
## $ 하차총승객수 <int> 62989, 4875, 3401, 5926, 6025, 15390, 6566, 4231, 3923, 1…
## $ 등록일자 <int> 20221004, 20221004, 20221004, 20221004, 20221004, 2022100…
str(subway_202210)
## 'data.frame': 18785 obs. of 6 variables:
## $ 사용일자 : int 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 20221001 ...
## $ 노선명 : chr "3호선" "3호선" "3호선" "3호선" ...
## $ 역명 : chr "고속터미널" "교대(법원.검찰청)" "학여울" "대청" ...
## $ 승차총승객수: int 59124 8040 3355 6517 6231 15481 6913 4490 4155 10551 ...
## $ 하차총승객수: int 62989 4875 3401 5926 6025 15390 6566 4231 3923 10189 ...
## $ 등록일자 : int 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 20221004 ...
# Replace the Korean column headers with short English names (new = "old")
# and drop the registration-date column, which is not used afterwards.
subway_202210 <- subway_202210 %>%
  rename(all_of(c(
    date     = "사용일자",
    line     = "노선명",
    station  = "역명",
    on_pass  = "승차총승객수",
    off_pass = "하차총승객수"
  ))) %>%
  select(-all_of("등록일자"))
summary(subway_202210)
## date line station on_pass
## Min. :20221001 Length:18785 Length:18785 Min. : 1
## 1st Qu.:20221008 Class :character Class :character 1st Qu.: 3802
## Median :20221016 Mode :character Mode :character Median : 7865
## Mean :20221016 Mean :10917
## 3rd Qu.:20221024 3rd Qu.:14432
## Max. :20221031 Max. :95408
## off_pass
## Min. : 0
## 1st Qu.: 3615
## Median : 7580
## Mean : 10875
## 3rd Qu.: 14197
## Max. :102651
# 2-1. Mean of on_pass (on_p) and of off_pass (off_p) over every row.
summarise(
  subway_202210,
  on_p = mean(on_pass),
  off_p = mean(off_pass)
)
## on_p off_p
## 1 10916.98 10875.09
# 2-2. Row(s) whose on_pass equals the largest on_pass in the data.
filter(subway_202210, on_pass == max(on_pass))
## date line station on_pass off_pass
## 1 20221028 2호선 잠실(송파구청) 95408 95061
# 2-3. Three station names with the highest mean total traffic, where the
#      total of a row is on_pass + off_pass and the mean is taken over all
#      rows that share the station name.
subway_202210 %>%
  mutate(total_pass = on_pass + off_pass) %>%
  group_by(station) %>%
  summarise(m = mean(total_pass)) %>%
  arrange(desc(m)) %>%
  head(3)
## # A tibble: 3 × 2
## station m
## <chr> <dbl>
## 1 강남 139260.
## 2 구로디지털단지 104601.
## 3 삼성(무역센터) 94918.
# 2-4. Among rows on line "1호선", the row(s) with the largest
#      on_pass + off_pass.
subway_202210 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
## date line station on_pass off_pass total_pass
## 1 20221028 1호선 서울역 61206 60155 121361
#2-5
table(subway_202210$date)
##
## 20221001 20221002 20221003 20221004 20221005 20221006 20221007 20221008
## 606 606 604 606 607 606 606 606
## 20221009 20221010 20221011 20221012 20221013 20221014 20221015 20221016
## 605 605 607 604 606 607 605 606
## 20221017 20221018 20221019 20221020 20221021 20221022 20221023 20221024
## 606 606 607 607 606 606 606 606
## 20221025 20221026 20221027 20221028 20221029 20221030 20221031
## 605 606 607 608 607 604 606
# Characters 7-8 of the yyyymmdd value are the day of month ("01".."31");
# store them as a character column and count rows per day.
subway_202210[["day"]] <- substring(
  subway_202210[["date"]],
  first = 7,
  last = 8
)
table(subway_202210[["day"]])
##
## 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20
## 606 606 604 606 607 606 606 606 605 605 607 604 606 607 605 606 606 606 607 607
## 21 22 23 24 25 26 27 28 29 30 31
## 606 606 606 606 605 606 607 608 607 604 606
# Convert the day-of-month strings ("01".."31") to numbers.
subway_202210$day <- as.numeric(subway_202210$day)
# Flag each row as "weekend" or "weekday" from the actual calendar date
# instead of a hand-typed list of day numbers, so the rule stays correct for
# any month. The yyyymmdd integer is parsed as a Date; POSIXlt's `wday` is
# 0 for Sunday and 6 for Saturday regardless of locale. For October 2022 this
# selects the same days as the former list
# (1, 2, 8, 9, 15, 16, 22, 23, 29, 30).
subway_202210$week <- ifelse(
  as.POSIXlt(
    as.Date(as.character(subway_202210$date), format = "%Y%m%d")
  )$wday %in% c(0, 6),
  "weekend",
  "weekday"
)
table(subway_202210$week)
##
## weekday weekend
## 12728 6057
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Add total traffic per row, then compare its mean between the two `week`
# groups with t.test()'s formula interface.
subway_202210$total_pass <- subway_202210$on_pass + subway_202210$off_pass
t.test(total_pass ~ week, data = subway_202210)
##
## Welch Two Sample t-test
##
## data: total_pass by week
## t = 23.347, df = 15046, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
## 6546.031 7745.939
## sample estimates:
## mean in group weekday mean in group weekend
## 24096.21 16950.23
# 3. KOWEPS data: load the SPSS reader and import the .sav file from the
#    working directory (read.spss() returns a list by default).
library(foreign)
koweps <- read.spss(file = "koweps_h16_2021_beta1.sav")
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Compression bias (0) is not the usual value of 100
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Very long string record(s) found (record type 7, subtype 14), each will be
## imported in consecutive separate variables
class(koweps)
## [1] "list"
# Turn the imported list into a data frame and keep only the six variables
# analysed below, then inspect their types.
korwps_21 <- as.data.frame(koweps)
house <- select(
  korwps_21,
  h1601_4, h1601_5, h1601_6,
  h16_reg5, h1608_114, h1608_122
)
str(house)
## 'data.frame': 5996 obs. of 6 variables:
## $ h1601_4 : num 2 1 1 1 2 2 1 1 1 1 ...
## $ h1601_5 : num 1945 1948 1942 1962 1940 ...
## $ h1601_6 : num 4 3 7 6 3 5 4 6 7 5 ...
## $ h16_reg5 : num 1 1 1 1 3 1 1 1 1 1 ...
## $ h1608_114: num NA NA NA 4392 NA ...
## $ h1608_122: num NA 1980 621 NA 324 NA NA 285 1500 NA ...
library(dplyr)
# Give the survey variable codes readable names (new = old) and summarise
# every column.
house1 <- rename(
  house,
  gender   = h1601_4,
  birth    = h1601_5,
  edu      = h1601_6,
  region   = h16_reg5,
  r_salary = h1608_114,
  t_salary = h1608_122
)
summary(house1)
## gender birth edu region r_salary
## Min. :1.000 Min. :1922 Min. :2.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.:1942 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 3280
## Median :1.000 Median :1955 Median :5.000 Median :3.000 Median : 4620
## Mean :1.357 Mean :1957 Mean :4.635 Mean :2.702 Mean : 5250
## 3rd Qu.:2.000 3rd Qu.:1970 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 6620
## Max. :2.000 Max. :2001 Max. :9.000 Max. :5.000 Max. :85860
## NA's :4566
## t_salary
## Min. : 0
## 1st Qu.: 297
## Median : 1040
## Mean : 1552
## 3rd Qu.: 2340
## Max. :14580
## NA's :4384
# Treat a salary of exactly 0 as missing in both salary columns (existing NAs
# stay NA), and derive age as 2021 - birth year + 1.
house1 <- house1 %>%
  mutate(
    r_salary = replace(r_salary, which(r_salary == 0), NA),
    t_salary = replace(t_salary, which(t_salary == 0), NA),
    age = 2021 - birth + 1
  )
range(house1$age)
## [1] 21 100
table(house1$edu)
##
## 2 3 4 5 6 7 8 9
## 562 1358 810 1635 462 975 163 31
# Collapse the education code into four labelled levels.
# Every level is matched explicitly: the previous nested ifelse() sent *any*
# code other than 2-6 to "대학이상", so an unexpected code would silently be
# counted in that level. Codes outside 2-9 (and NA) now yield NA instead.
# For the codes present in this data (2-9) the result is unchanged.
house1$edu_grade <- case_when(
  house1$edu %in% c(2, 3, 4) ~ "중학이하",
  house1$edu == 5            ~ "고교",
  house1$edu == 6            ~ "전문대",
  house1$edu %in% c(7, 8, 9) ~ "대학이상"
)
table(house1$edu_grade)
##
## 고교 대학이상 전문대 중학이하
## 1635 1169 462 2730
table(house1$region)
##
## 1 2 3 4 5
## 795 1612 2337 1089 163
# Lookup table that attaches a label to each five-level region code.
# Per the KOWEPS codebook, codes 4 and 5 are 군 (county) and 도농복합군
# (urban-rural mixed county); they were previously mislabelled "구" and
# "도농복합구", so any output recorded before this fix shows the old labels.
# NOTE(review): confirm the labels against the codebook for this wave.
region_name <- data.frame(
  region = c(1, 2, 3, 4, 5),
  region1 = c("서울", "광역시", "시", "군", "도농복합군")
)
# Left join keeps every row of house1 and adds the matching label column.
house1 <- left_join(house1, region_name, by = "region")
str(house1)
## 'data.frame': 5996 obs. of 9 variables:
## $ gender : num 2 1 1 1 2 2 1 1 1 1 ...
## $ birth : num 1945 1948 1942 1962 1940 ...
## $ edu : num 4 3 7 6 3 5 4 6 7 5 ...
## $ region : num 1 1 1 1 3 1 1 1 1 1 ...
## $ r_salary : num NA NA NA 4392 NA ...
## $ t_salary : num NA 1980 621 NA 324 NA NA 285 1500 NA ...
## $ age : num 77 74 80 60 82 52 82 60 44 81 ...
## $ edu_grade: chr "중학이하" "중학이하" "대학이상" "전문대" ...
## $ region1 : chr "서울" "서울" "서울" "서울" ...
glimpse(house1)
## Rows: 5,996
## Columns: 9
## $ gender <dbl> 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, …
## $ birth <dbl> 1945, 1948, 1942, 1962, 1940, 1970, 1940, 1962, 1978, 1941, …
## $ edu <dbl> 4, 3, 7, 6, 3, 5, 4, 6, 7, 5, 3, 7, 4, 5, 4, 7, 7, 3, 3, 7, …
## $ region <dbl> 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, …
## $ r_salary <dbl> NA, NA, NA, 4392, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ t_salary <dbl> NA, 1980, 621, NA, 324, NA, NA, 285, 1500, NA, NA, 2400, 279…
## $ age <dbl> 77, 74, 80, 60, 82, 52, 82, 60, 44, 81, 58, 47, 61, 70, 61, …
## $ edu_grade <chr> "중학이하", "중학이하", "대학이상", "전문대", "중학이하", "…
## $ region1 <chr> "서울", "서울", "서울", "서울", "시", "서울", "서울", "서울"…
# 4-1. Within each gender code, the row(s) holding the highest r_salary.
#      Missing salaries are removed first so max() sees no NA.
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(gender) %>%
  filter(r_salary == max(r_salary))
## # A tibble: 2 × 9
## # Groups: gender [2]
## gender birth edu region r_salary t_salary age edu_grade region1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 1992 7 2 85860 NA 30 대학이상 광역시
## 2 2 1997 7 3 37260 NA 25 대학이상 시
# 4-2. Row(s) with age 76 and an r_salary of 9126.
filter(house1, age == 76, r_salary == 9126)
## gender birth edu region r_salary t_salary age edu_grade region1
## 1 1 1946 4 3 9126 NA 76 중학이하 시
# 4-3. Mean r_salary for every gender x edu_grade combination (rows with a
#      missing r_salary excluded), listed from highest to lowest mean.
house1 %>%
  filter(!is.na(r_salary)) %>%
  group_by(gender, edu_grade) %>%
  summarise(m = mean(r_salary)) %>%
  arrange(desc(m))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 3
## # Groups: gender [2]
## gender edu_grade m
## <dbl> <chr> <dbl>
## 1 1 대학이상 6367.
## 2 1 전문대 4951.
## 3 1 고교 4841.
## 4 2 대학이상 4275.
## 5 2 고교 3135.
## 6 1 중학이하 2868.
## 7 2 전문대 2697.
## 8 2 중학이하 1954
# 4-4. Mean t_salary per region label (rows with a missing t_salary
#      excluded), listed from highest to lowest mean.
house1 %>%
  filter(!is.na(t_salary)) %>%
  group_by(region1) %>%
  summarise(m = mean(t_salary)) %>%
  arrange(desc(m))
## # A tibble: 5 × 2
## region1 m
## <chr> <dbl>
## 1 서울 1853.
## 2 시 1693.
## 3 광역시 1591.
## 4 도농복합구 1137.
## 5 구 959.