data("airquality")
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Normalise the column names to lower case, then print them to confirm.
colnames(airquality) <- tolower(colnames(airquality))
colnames(airquality)
## [1] "ozone" "solar.r" "wind" "temp" "month" "day"
is.na(airquality$ozone)
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
table(is.na(airquality))
##
## FALSE TRUE
## 874 44
table(is.na(airquality$ozone))
##
## FALSE TRUE
## 116 37
summary(is.na(airquality))
## ozone solar.r wind temp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:116 FALSE:146 FALSE:153 FALSE:153
## TRUE :37 TRUE :7
## month day
## Mode :logical Mode :logical
## FALSE:153 FALSE:153
##
sum(airquality$ozone)
## [1] NA
sum(airquality$ozone,na.rm = TRUE)
## [1] 4887
# Drop every row that contains at least one NA, then tabulate is.na() over
# the whole data frame to confirm that no missing values remain.
airquality<-na.omit(airquality)
table(is.na(airquality))
##
## FALSE
## 666
#결측치가 아닌 행만 추출
airquality %>% filter(!is.na(ozone)) %>% head(3)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
airquality %>% filter(!is.na(ozone)&!is.na(solar.r)) %>% head(3)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
mean(airquality$ozone,na.rm = TRUE)
## [1] 42.0991
# Replace missing ozone readings with the mean of the observed readings.
# The mean is computed from the column itself instead of being pasted in as a
# rounded literal, so the imputed value always matches the current data.
airquality$ozone <- ifelse(
  is.na(airquality$ozone),
  mean(airquality$ozone, na.rm = TRUE),
  airquality$ozone
)
# Confirm that the column no longer contains missing values.
table(is.na(airquality$ozone))
##
## FALSE
## 111
# Example: treat the gender code "3" as invalid data by turning it into a
# missing value, so it can be filtered out afterwards.
ott7 <- data.frame(
  gender = c("1", "1", "2", "2", "2", "3"),
  income = c(200, 250, 200, 300, 200, 150)
)
ott7
## gender income
## 1 1 200
## 2 1 250
## 3 2 200
## 4 2 300
## 5 2 200
## 6 3 150
table(ott7$gender)
##
## 1 2 3
## 2 3 1
# gender is a character column, so compare against the string "3" explicitly
# rather than relying on implicit number-to-string coercion.
ott7$gender[ott7$gender == "3"] <- NA
table(is.na(ott7$gender))
##
## FALSE TRUE
## 5 1
ott7 %>% filter(!is.na(gender))
## gender income
## 1 1 200
## 2 1 250
## 3 2 200
## 4 2 300
## 5 2 200
boxplot(iris$Sepal.Width)$stats

## [,1]
## [1,] 2.2
## [2,] 2.8
## [3,] 3.0
## [4,] 3.3
## [5,] 4.0
iris %>% filter(Sepal.Width>4.0|Sepal.Width<2.2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.7 4.4 1.5 0.4 setosa
## 2 5.2 4.1 1.5 0.1 setosa
## 3 5.5 4.2 1.4 0.2 setosa
## 4 5.0 2.0 3.5 1.0 versicolor
# Mark sepal widths that fall outside the boxplot whiskers as missing.
# The whisker ends (elements 1 and 5 of boxplot.stats()$stats) are computed
# from the data instead of being typed in by hand.
# Run this once on the unmodified column: re-running it would derive the
# whiskers from the already-filtered values.
sepal_whiskers <- boxplot.stats(iris$Sepal.Width)$stats[c(1, 5)]
iris$Sepal.Width <- ifelse(
  iris$Sepal.Width > sepal_whiskers[2] | iris$Sepal.Width < sepal_whiskers[1],
  NA,
  iris$Sepal.Width
)
# Count how many values were turned into NA.
table(is.na(iris$Sepal.Width))
##
## FALSE TRUE
## 146 4
iris %>% filter(!is.na(Sepal.Width)) %>% dim
## [1] 146 5
library(hflights)
# Flights per destination, keeping only the destinations with the smallest
# and the largest flight count (range() returns c(min, max)).
hflights %>%
  count(Dest) %>%
  filter(n %in% range(n))
## Dest n
## 1 AGS 1
## 2 DAL 9820
glimpse(hflights)
## Rows: 227,496
## Columns: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
table(is.na(hflights))
##
## FALSE TRUE
## 4751661 25755
table(hflights$Cancelled)
##
## 0 1
## 224523 2973
# Reload the dataset, replacing the modified copy used above.
data("airquality")
table(is.na(airquality$Ozone))
##
## FALSE TRUE
## 116 37
# Median of the observed ozone readings, kept in a variable so the imputation
# below uses the computed value rather than a hand-copied literal.
ozone_median <- median(airquality$Ozone, na.rm = TRUE)
ozone_median
## [1] 31.5
# Replace missing ozone readings with that median, then report the new mean.
airquality$Ozone <- ifelse(
  is.na(airquality$Ozone),
  ozone_median,
  airquality$Ozone
)
mean(airquality$Ozone)
## [1] 39.55882
library(ggplot2)
diamonds %>% select(price) %>% arrange(desc(price)) %>% head(3)
## # A tibble: 3 × 1
## price
## <int>
## 1 18823
## 2 18818
## 3 18806
# Mean price for every cut/color combination; show the three combinations
# with the highest mean price.
diamonds %>%
  group_by(cut, color) %>%
  summarize(m = mean(price)) %>%
  arrange(desc(m)) %>%
  head(3)
## `summarise()` has grouped output by 'cut'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups: cut [2]
## cut color m
## <ord> <ord> <dbl>
## 1 Premium J 6295.
## 2 Premium I 5946.
## 3 Very Good I 5256.
# Number of diamonds per cut, the overall total, and each cut's share of the
# total as a percentage.
diamonds %>%
  count(cut) %>%
  mutate(
    total = sum(n),
    pct = n / total * 100
  )
## # A tibble: 5 × 4
## cut n total pct
## <ord> <int> <int> <dbl>
## 1 Fair 1610 53940 2.98
## 2 Good 4906 53940 9.10
## 3 Very Good 12082 53940 22.4
## 4 Premium 13791 53940 25.6
## 5 Ideal 21551 53940 40.0
library(gapminder)
str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# Rows for Africa and Europe only, ordered from largest to smallest population.
gapminder %>%
  filter(continent %in% c("Africa", "Europe")) %>%
  arrange(desc(pop))
## # A tibble: 984 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Nigeria Africa 2007 46.9 135031164 2014.
## 2 Nigeria Africa 2002 46.6 119901274 1615.
## 3 Nigeria Africa 1997 47.5 106207839 1625.
## 4 Nigeria Africa 1992 47.5 93364244 1620.
## 5 Germany Europe 2007 79.4 82400996 32170.
## 6 Germany Europe 2002 78.7 82350671 30036.
## 7 Germany Europe 1997 77.3 82011073 27789.
## 8 Nigeria Africa 1987 46.9 81551520 1385.
## 9 Germany Europe 1992 76.1 80597764 26505.
## 10 Egypt Africa 2007 71.3 80264543 5581.
## # … with 974 more rows
setwd("c:/data")
library(readxl)
airseoul<-read_excel("period1.xlsx")
str(airseoul)
## tibble [1,535 × 8] (S3: tbl_df/tbl/data.frame)
## $ 날짜 : chr [1:1535] "전체" "2022-03-31" "2022-03-31" "2022-03-31" ...
## $ 측정소명 : chr [1:1535] "평균" "평균" "강남구" "강동구" ...
## $ 미세먼지 PM10
## (㎍/m3) : num [1:1535] 41 20 21 26 NA 23 19 21 23 17 ...
## $ 초미세먼지
## PM2.5 (㎍/m3): num [1:1535] 23 11 10 13 17 9 9 10 10 9 ...
## $ 오존
## O3 (ppm) : chr [1:1535] "0.026" "0.032" "0.033" "0.026" ...
## $ 이산화질소
## NO2 (ppm) : chr [1:1535] "0.026" "0.014" "0.015" "0.014" ...
## $ 일산화탄소
## CO (ppm) : chr [1:1535] "0.5" "0.3" "0.3" "0.3" ...
## $ 아황산가스
## SO2(ppm) : num [1:1535] 0.003 0.002 0.003 0.002 0.003 0.003 0.002 0.003 0.003 0.002 ...
glimpse(airseoul)
## Rows: 1,535
## Columns: 8
## $ 날짜 <chr> "전체", "2022-03-31", "2022-03-31", "202…
## $ 측정소명 <chr> "평균", "평균", "강남구", "강동구", "강…
## $ `미세먼지 PM10\r\n(㎍/m3)` <dbl> 41, 20, 21, 26, NA, 23, 19, 21, 23, 17, …
## $ `초미세먼지\r\nPM2.5 (㎍/m3)` <dbl> 23, 11, 10, 13, 17, 9, 9, 10, 10, 9, 11,…
## $ `오존\r\nO3 (ppm)` <chr> "0.026", "0.032", "0.033", "0.026", "0.0…
## $ `이산화질소\r\nNO2 (ppm)` <chr> "0.026", "0.014", "0.015", "0.014", "0.0…
## $ `일산화탄소\r\nCO (ppm)` <chr> "0.5", "0.3", "0.3", "0.3", "0.3", "0.4"…
## $ `아황산가스\r\nSO2(ppm)` <dbl> 0.003, 0.002, 0.003, 0.002, 0.003, 0.003…
#데이터 형태 chr형태인것 바꾸기
names(airseoul)
## [1] "날짜" "측정소명"
## [3] "미세먼지 PM10\r\n(㎍/m3)" "초미세먼지\r\nPM2.5 (㎍/m3)"
## [5] "오존\r\nO3 (ppm)" "이산화질소\r\nNO2 (ppm)"
## [7] "일산화탄소\r\nCO (ppm)" "아황산가스\r\nSO2(ppm)"
# Give the Korean column headers English names (to avoid errors later on) and
# keep only the four columns used below. select() renames while it selects,
# so a separate rename() step is not needed.
airseoul1 <- airseoul %>%
  select(
    date = "날짜",
    region = "측정소명",
    pm10 = "미세먼지 PM10\r\n(㎍/m3)",
    pm2.5 = "초미세먼지\r\nPM2.5 (㎍/m3)"
  )
table(airseoul1$date)
##
## 2022-02-01 2022-02-02 2022-02-03 2022-02-04 2022-02-05 2022-02-06 2022-02-07
## 26 26 26 26 26 26 26
## 2022-02-08 2022-02-09 2022-02-10 2022-02-11 2022-02-12 2022-02-13 2022-02-14
## 26 26 26 26 26 26 26
## 2022-02-15 2022-02-16 2022-02-17 2022-02-18 2022-02-19 2022-02-20 2022-02-21
## 26 26 26 26 26 26 26
## 2022-02-22 2022-02-23 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28
## 26 26 26 26 26 26 26
## 2022-03-01 2022-03-02 2022-03-03 2022-03-04 2022-03-05 2022-03-06 2022-03-07
## 26 26 26 26 26 26 26
## 2022-03-08 2022-03-09 2022-03-10 2022-03-11 2022-03-12 2022-03-13 2022-03-14
## 26 26 26 26 26 26 26
## 2022-03-15 2022-03-16 2022-03-17 2022-03-18 2022-03-19 2022-03-20 2022-03-21
## 26 26 26 26 26 26 26
## 2022-03-22 2022-03-23 2022-03-24 2022-03-25 2022-03-26 2022-03-27 2022-03-28
## 26 26 26 26 26 26 26
## 2022-03-29 2022-03-30 2022-03-31 전체
## 26 26 26 1
table(airseoul1$region)
##
## 강남구 강동구 강북구 강서구 관악구 광진구 구로구 금천구
## 59 59 59 59 59 59 59 59
## 노원구 도봉구 동대문구 동작구 마포구 서대문구 서초구 성동구
## 59 59 59 59 59 59 59 59
## 성북구 송파구 양천구 영등포구 용산구 은평구 종로구 중구
## 59 59 59 59 59 59 59 59
## 중랑구 평균
## 59 60
# Drop the aggregate rows so only per-district, per-day readings remain:
# rows whose date is "전체" (whole period) and rows whose region is "평균"
# (average). Both conditions must hold for a row to be kept.
airseoul1 <- airseoul1 %>%
  filter(date != "전체" & region != "평균")
# Re-tabulate the dates to confirm the aggregate rows are gone.
table(airseoul1$date)
##
## 2022-02-01 2022-02-02 2022-02-03 2022-02-04 2022-02-05 2022-02-06 2022-02-07
## 25 25 25 25 25 25 25
## 2022-02-08 2022-02-09 2022-02-10 2022-02-11 2022-02-12 2022-02-13 2022-02-14
## 25 25 25 25 25 25 25
## 2022-02-15 2022-02-16 2022-02-17 2022-02-18 2022-02-19 2022-02-20 2022-02-21
## 25 25 25 25 25 25 25
## 2022-02-22 2022-02-23 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28
## 25 25 25 25 25 25 25
## 2022-03-01 2022-03-02 2022-03-03 2022-03-04 2022-03-05 2022-03-06 2022-03-07
## 25 25 25 25 25 25 25
## 2022-03-08 2022-03-09 2022-03-10 2022-03-11 2022-03-12 2022-03-13 2022-03-14
## 25 25 25 25 25 25 25
## 2022-03-15 2022-03-16 2022-03-17 2022-03-18 2022-03-19 2022-03-20 2022-03-21
## 25 25 25 25 25 25 25
## 2022-03-22 2022-03-23 2022-03-24 2022-03-25 2022-03-26 2022-03-27 2022-03-28
## 25 25 25 25 25 25 25
## 2022-03-29 2022-03-30 2022-03-31
## 25 25 25
table(airseoul1$region)
##
## 강남구 강동구 강북구 강서구 관악구 광진구 구로구 금천구
## 59 59 59 59 59 59 59 59
## 노원구 도봉구 동대문구 동작구 마포구 서대문구 서초구 성동구
## 59 59 59 59 59 59 59 59
## 성북구 송파구 양천구 영등포구 용산구 은평구 종로구 중구
## 59 59 59 59 59 59 59 59
## 중랑구
## 59
summary(airseoul1$pm10)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.00 27.00 36.00 40.54 50.00 112.00 7
summary(airseoul1$pm2.5)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 13.00 19.00 23.28 32.00 92.00 3
# Keep only the rows where both PM10 and PM2.5 were measured.
airseoul1 <- airseoul1 %>%
  filter(!is.na(pm10), !is.na(pm2.5))
# Date, district and value of the highest PM10 reading.
airseoul1 %>%
  select(date, region, pm10) %>%
  filter(pm10 == max(pm10))
## # A tibble: 1 × 3
## date region pm10
## <chr> <chr> <dbl>
## 1 2022-03-05 구로구 112
# Mean PM10 per district, listing the five districts with the highest mean.
airseoul1 %>%
  group_by(region) %>%
  summarize(m = mean(pm10)) %>%
  arrange(desc(m)) %>%
  head(5)
## # A tibble: 5 × 2
## region m
## <chr> <dbl>
## 1 양천구 44.4
## 2 강북구 44.2
## 3 강서구 43.8
## 4 노원구 43.7
## 5 강동구 43.6
# Grade each daily PM10 reading: 0-30 "good", 31-80 "normal", 81-150 "bad",
# 151 and above "worse"; then report the count and percentage of each grade.
# The "normal" band ends at 80, so a reading of exactly 81 is graded "bad".
# NOTE: the recorded output that follows was produced with "normal" ending at
# 81; its normal/bad counts differ from this code by the number of readings
# equal to 81.
airseoul1 %>%
  mutate(pm_grade = case_when(
    is.na(pm10) ~ NA_character_, # a missing reading gets no grade
    pm10 <= 30 ~ "good",
    pm10 <= 80 ~ "normal",
    pm10 <= 150 ~ "bad",
    TRUE ~ "worse"
  )) %>%
  count(pm_grade) %>%
  mutate(total = sum(n), pct = n / total * 100)
## # A tibble: 3 × 4
## pm_grade n total pct
## <chr> <int> <int> <dbl>
## 1 bad 71 1467 4.84
## 2 good 538 1467 36.7
## 3 normal 858 1467 58.5
# Among the rows with the lowest PM2.5 reading, list them from the highest to
# the lowest PM10 so the district with the worst PM10 comes first.
airseoul1 %>%
  filter(pm2.5 == min(pm2.5)) %>%
  arrange(desc(pm10))
## # A tibble: 6 × 4
## date region pm10 pm2.5
## <chr> <chr> <dbl> <dbl>
## 1 2022-03-18 성동구 7 1
## 2 2022-03-18 구로구 6 1
## 3 2022-03-18 서초구 6 1
## 4 2022-03-19 구로구 5 1
## 5 2022-03-18 서대문구 5 1
## 6 2022-03-19 종로구 4 1
subway_202203<-read.csv("CARD_SUBWAY_MONTH_202203.csv",
fileEncoding = "euc-kr")
str(subway_202203)
## 'data.frame': 18467 obs. of 6 variables:
## $ 사용일자 : int 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 ...
## $ 노선명 : chr "장항선" "장항선" "장항선" "안산선" ...
## $ 역명 : chr "배방" "온양온천" "신창(순천향대)" "오이도" ...
## $ 승차총승객수: int 593 2388 1065 4789 1892 2122 1360 1836 2211 1899 ...
## $ 하차총승객수: int 698 2517 1164 4668 1693 2228 1331 1663 2122 1814 ...
## $ 등록일자 : int 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 ...
# Drop the registration-date column ("등록일자") and give the remaining Korean
# column headers English names.
subway_202203 <- subway_202203 %>%
  select(-"등록일자") %>%
  rename(
    date = "사용일자",
    line = "노선명",
    station = "역명",
    on_pass = "승차총승객수",
    off_pass = "하차총승객수"
  )
summary(subway_202203)
## date line station on_pass
## Min. :20220301 Length:18467 Length:18467 Min. : 1
## 1st Qu.:20220308 Class :character Class :character 1st Qu.: 3078
## Median :20220316 Mode :character Mode :character Median : 6334
## Mean :20220316 Mean : 8852
## 3rd Qu.:20220324 3rd Qu.:11838
## Max. :20220331 Max. :80279
## off_pass
## Min. : 0
## 1st Qu.: 2989
## Median : 6229
## Mean : 8823
## 3rd Qu.:11742
## Max. :78816
#역의 하루평균 승차고객수와 하차고객수 구하기
subway_202203 %>% summarise(on_p=mean(on_pass),off_p=mean(off_pass))
## on_p off_p
## 1 8851.886 8822.759
#승차고객수가 가장많은 노선과 승차고객수는?
subway_202203 %>% filter(on_pass==max(on_pass))
## date line station on_pass off_pass
## 1 20220325 2호선 강남 80279 78816
# Total ridership is defined as boardings plus alightings. Compute the mean
# total per station and list the three stations with the highest mean.
subway_202203 %>%
  mutate(total_pass = on_pass + off_pass) %>%
  group_by(station) %>%
  summarize(m = mean(total_pass)) %>%
  arrange(desc(m)) %>%
  head(3)
## # A tibble: 3 × 2
## station m
## <chr> <dbl>
## 1 강남 125027.
## 2 신림 101545.
## 3 구로디지털단지 88652.
# Line 1 ("1호선"): the record with the highest total ridership
# (boardings + alightings).
# NOTE(review): the original heading asks for the station and day of week with
# the highest average total; this returns the date of the single busiest
# record instead — confirm which one is intended.
subway_202203 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
## date line station on_pass off_pass total_pass
## 1 20220325 1호선 서울역 41104 41346 82450
# Use t.test to analyse whether total ridership differs significantly between weekdays and weekends/holidays, and report the p-value
table(subway_202203$date)
##
## 20220301 20220302 20220303 20220304 20220305 20220306 20220307 20220308
## 593 598 597 598 595 599 597 599
## 20220309 20220310 20220311 20220312 20220313 20220314 20220315 20220316
## 595 597 597 595 594 597 594 596
## 20220317 20220318 20220319 20220320 20220321 20220322 20220323 20220324
## 597 597 594 595 596 593 594 594
## 20220325 20220326 20220327 20220328 20220329 20220330 20220331
## 595 597 595 595 595 593 596
# substr() extracts part of a string: characters 7 to 8 of the yyyymmdd date
# are the day of the month. The result is a character column.
subway_202203$day <- substr(subway_202203$date, start = 7, stop = 8)
# Number of records per day of the month.
table(subway_202203$day)
##
## 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20
## 593 598 597 598 595 599 597 599 595 597 597 595 594 597 594 596 597 597 594 595
## 21 22 23 24 25 26 27 28 29 30 31
## 596 593 594 594 595 597 595 595 595 593 596
glimpse(subway_202203)
## Rows: 18,467
## Columns: 6
## $ date <int> 20220301, 20220301, 20220301, 20220301, 20220301, 20220301, 2…
## $ line <chr> "장항선", "장항선", "장항선", "안산선", "안산선", "우이신설선…
## $ station <chr> "배방", "온양온천", "신창(순천향대)", "오이도", "수리산", "북…
## $ on_pass <int> 593, 2388, 1065, 4789, 1892, 2122, 1360, 1836, 2211, 1899, 12…
## $ off_pass <int> 698, 2517, 1164, 4668, 1693, 2228, 1331, 1663, 2122, 1814, 12…
## $ day <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "01", "…
# Store the day of the month as a number instead of text.
subway_202203$day <- as.numeric(subway_202203$day)
# Flag each record as "weekend" or "weekday" from its calendar date rather
# than from a hand-maintained list of day numbers, so the flag stays correct
# for any month. POSIXlt's wday is 0 for Sunday through 6 for Saturday and
# does not depend on the session locale.
subway_202203$week <- ifelse(
  as.POSIXlt(
    as.Date(as.character(subway_202203$date), format = "%Y%m%d")
  )$wday %in% c(0, 6),
  "weekend",
  "weekday"
)
# Number of records in each group.
table(subway_202203$week)
##
## weekday weekend
## 13703 4764
# Total ridership per record = boardings + alightings.
subway_202203$total_pass <- subway_202203$on_pass + subway_202203$off_pass
# Print numbers in fixed notation instead of scientific notation.
options(scipen = 999)
# Two-sample t-test of total ridership between the weekday and weekend groups.
t.test(total_pass ~ week, data = subway_202203)
##
## Welch Two Sample t-test
##
## data: total_pass by week
## t = 32.794, df = 12509, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
## 7342.155 8275.667
## sample estimates:
## mean in group weekday mean in group weekend
## 19689.14 11880.23
library(foreign)
koweps<-read.spss("koweps_h16_2021_beta1.sav")
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Compression bias (0) is not the usual value of 100
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Very long string record(s) found (record type 7, subtype 14), each will be
## imported in consecutive separate variables
class(koweps)
## [1] "list"
korwps_21<-as.data.frame(koweps)
house<-korwps_21 %>% select(h1601_4,h1601_5,h1601_6,
h16_reg5,h1608_114,h1608_122)
str(house)
## 'data.frame': 5996 obs. of 6 variables:
## $ h1601_4 : num 2 1 1 1 2 2 1 1 1 1 ...
## $ h1601_5 : num 1945 1948 1942 1962 1940 ...
## $ h1601_6 : num 4 3 7 6 3 5 4 6 7 5 ...
## $ h16_reg5 : num 1 1 1 1 3 1 1 1 1 1 ...
## $ h1608_114: num NA NA NA 4392 NA ...
## $ h1608_122: num NA 1980 621 NA 324 NA NA 285 1500 NA ...
# Give the survey variable codes readable names. The named vector maps
# new name -> existing column; all_of() errors if any listed column is missing.
house1 <- house %>%
  rename(all_of(c(
    gender = "h1601_4",
    birth = "h1601_5",
    edu = "h1601_6",
    region = "h16_reg5",
    r_salary = "h1608_114",
    t_salary = "h1608_122"
  )))
summary(house1)
## gender birth edu region r_salary
## Min. :1.000 Min. :1922 Min. :2.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.:1942 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 3280
## Median :1.000 Median :1955 Median :5.000 Median :3.000 Median : 4620
## Mean :1.357 Mean :1957 Mean :4.635 Mean :2.702 Mean : 5250
## 3rd Qu.:2.000 3rd Qu.:1970 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 6620
## Max. :2.000 Max. :2001 Max. :9.000 Max. :5.000 Max. :85860
## NA's :4566
## t_salary
## Min. : 0
## 1st Qu.: 297
## Median : 1040
## Mean : 1552
## 3rd Qu.: 2340
## Max. :14580
## NA's :4384
# Treat a salary of 0 as missing in both salary columns, and derive age from
# the birth year as 2021 - birth + 1.
house1 <- house1 %>%
  mutate(
    across(c(r_salary, t_salary), ~ ifelse(.x == 0, NA, .x)),
    age = 2021 - birth + 1
  )
# Youngest and oldest age in the data.
range(house1$age)
## [1] 21 100
table(house1$edu)
##
## 2 3 4 5 6 7 8 9
## 562 1358 810 1635 462 975 163 31
# Collapse the education codes into four labelled groups: codes 2-4, code 5,
# code 6, and every other non-missing code. A missing code matches no branch
# and therefore stays NA.
house1$edu_grade <- case_when(
  house1$edu %in% c(2, 3, 4) ~ "중학이하",
  house1$edu == 5 ~ "고교",
  house1$edu == 6 ~ "전문대",
  !is.na(house1$edu) ~ "대학이상"
)
table(house1$edu_grade)
##
## 고교 대학이상 전문대 중학이하
## 1635 1169 462 2730
table(house1$region)
##
## 1 2 3 4 5
## 795 1612 2337 1089 163
# Lookup table from region code (1-5) to a region label, joined onto house1
# by the shared "region" column; rows of house1 are all kept.
# NOTE(review): labels 4 and 5 end in "구"; the survey codebook may spell
# these with "군" — confirm against the codebook before relying on them.
region_name <- data.frame(
  region = c(1, 2, 3, 4, 5),
  region1 = c("서울", "광역시", "시", "구", "도농복합구")
)
house1 <- house1 %>%
  left_join(region_name, by = "region")
str(house1)
## 'data.frame': 5996 obs. of 9 variables:
## $ gender : num 2 1 1 1 2 2 1 1 1 1 ...
## $ birth : num 1945 1948 1942 1962 1940 ...
## $ edu : num 4 3 7 6 3 5 4 6 7 5 ...
## $ region : num 1 1 1 1 3 1 1 1 1 1 ...
## $ r_salary : num NA NA NA 4392 NA ...
## $ t_salary : num NA 1980 621 NA 324 NA NA 285 1500 NA ...
## $ age : num 77 74 80 60 82 52 82 60 44 81 ...
## $ edu_grade: chr "중학이하" "중학이하" "대학이상" "전문대" ...
## $ region1 : chr "서울" "서울" "서울" "서울" ...