library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data("airquality")
glimpse(airquality)
## Rows: 153
## Columns: 6
## $ Ozone <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 14, …
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290, 27…
## $ Wind <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9, 9…
## $ Temp <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58, 64…
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Lower-case every column name so later code can refer to `ozone`, `solar.r`, etc.
airquality <- setNames(airquality, tolower(names(airquality)))
names(airquality)
## [1] "ozone" "solar.r" "wind" "temp" "month" "day"
is.na(airquality$ozone)
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
table(is.na(airquality))
##
## FALSE TRUE
## 874 44
table(is.na(airquality$ozone))
##
## FALSE TRUE
## 116 37
summary(is.na(airquality))
## ozone solar.r wind temp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:116 FALSE:146 FALSE:153 FALSE:153
## TRUE :37 TRUE :7
## month day
## Mode :logical Mode :logical
## FALSE:153 FALSE:153
##
sum(airquality$ozone)
## [1] NA
mean(airquality$ozone)
## [1] NA
sum(airquality$ozone,na.rm=TRUE)
## [1] 4887
mean(airquality$ozone,na.rm=TRUE)
## [1] 42.12931
# Drop every row that contains an NA in any column, then tabulate is.na()
# over the whole data frame to confirm that no missing cells remain.
airquality<-na.omit(airquality)
table(is.na(airquality))
##
## FALSE
## 666
library(dplyr)
names(airquality) <- tolower(names(airquality))
# First three rows whose ozone reading is present.
head(filter(airquality, !is.na(ozone)), 3)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
airquality %>% filter(!is.na(ozone)&!is.na(solar.r)) %>% head(3)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
mean(airquality$ozone,na.rm=TRUE)
## [1] 42.0991
# Small example data frame: a character gender code and a numeric income.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)
ott7
## gender income
## 1 1 200
## 2 1 250
## 3 2 200
## 4 2 300
## 5 2 200
## 6 3 150
table(ott7$gender)
##
## 1 2 3
## 2 3 1
boxplot(iris$Sepal.Width)$stats

## [,1]
## [1,] 2.2
## [2,] 2.8
## [3,] 3.0
## [4,] 3.3
## [5,] 4.0
library(dplyr)
# Rows whose Sepal.Width lies outside the closed interval [2.2, 4.0].
filter(iris, !between(Sepal.Width, 2.2, 4.0))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.7 4.4 1.5 0.4 setosa
## 2 5.2 4.1 1.5 0.1 setosa
## 3 5.5 4.2 1.4 0.2 setosa
## 4 5.0 2.0 3.5 1.0 versicolor
# Blank out Sepal.Width values above 4.0 or below 2.2, then count how many
# entries are now missing.
iris$Sepal.Width <- replace(
  iris$Sepal.Width,
  iris$Sepal.Width > 4.0 | iris$Sepal.Width < 2.2,
  NA
)
table(is.na(iris$Sepal.Width))
##
## FALSE TRUE
## 146 4
#1
# install.packages("hflights")  # run once if the package is not installed
library(hflights)
library(dplyr)
# Count flights per destination and keep the destinations whose count equals
# the largest or the smallest count.
hflights %>%
  count(Dest) %>%
  filter(n %in% range(n))
## Dest n
## 1 AGS 1
## 2 DAL 9820
#5
# install.packages("ggplot2")
# install.packages("readxl")
library(dplyr)
library(readxl)
# NOTE(review): the working directory is hard-coded; later relative file
# reads in this script rely on it.
setwd("c:/data")
# Load the workbook and inspect its structure.
airseoul <- read_excel("period1.xlsx")
str(airseoul)
## tibble [1,535 × 8] (S3: tbl_df/tbl/data.frame)
## $ 날짜 : chr [1:1535] "전체" "2022-03-31" "2022-03-31" "2022-03-31" ...
## $ 측정소명 : chr [1:1535] "평균" "평균" "강남구" "강동구" ...
## $ 미세먼지 PM10
## (㎍/m3) : num [1:1535] 41 20 21 26 NA 23 19 21 23 17 ...
## $ 초미세먼지
## PM2.5 (㎍/m3): num [1:1535] 23 11 10 13 17 9 9 10 10 9 ...
## $ 오존
## O3 (ppm) : chr [1:1535] "0.026" "0.032" "0.033" "0.026" ...
## $ 이산화질소
## NO2 (ppm) : chr [1:1535] "0.026" "0.014" "0.015" "0.014" ...
## $ 일산화탄소
## CO (ppm) : chr [1:1535] "0.5" "0.3" "0.3" "0.3" ...
## $ 아황산가스
## SO2(ppm) : num [1:1535] 0.003 0.002 0.003 0.002 0.003 0.003 0.002 0.003 0.003 0.002 ...
select<-dplyr::select
names(airseoul)
## [1] "날짜" "측정소명"
## [3] "미세먼지 PM10\r\n(㎍/m3)" "초미세먼지\r\nPM2.5 (㎍/m3)"
## [5] "오존\r\nO3 (ppm)" "이산화질소\r\nNO2 (ppm)"
## [7] "일산화탄소\r\nCO (ppm)" "아황산가스\r\nSO2(ppm)"
# Keep only the date, station and particulate columns and give them short
# ASCII names. The source headers contain embedded "\r\n" line breaks, so the
# old names must be spelled exactly as names(airseoul) reports them.
airseoul1<-airseoul %>%
  rename(date="날짜",region="측정소명",pm10="미세먼지 PM10\r\n(㎍/m3)",pm2.5="초미세먼지\r\nPM2.5 (㎍/m3)") %>%
  select(date,region,pm10,pm2.5)
# Inspect the renamed columns on airseoul1. `date` and `region` exist only
# after the rename; querying them on the raw `airseoul` tibble yields an
# "Unknown or uninitialised column" warning and an empty table.
table(airseoul1$date)
table(airseoul1$region)
# Drop the aggregate rows (date "전체", station "평균") so that only
# per-date, per-station observations remain. The two conditions are joined
# with `&`; the `®` character that was here was a mangled "&reg" and made the
# line unparseable.
airseoul1<-airseoul1 %>% filter(date!="전체"&region!="평균")
table(airseoul1$date)
##
## 2022-02-01 2022-02-02 2022-02-03 2022-02-04 2022-02-05 2022-02-06 2022-02-07
## 25 25 25 25 25 25 25
## 2022-02-08 2022-02-09 2022-02-10 2022-02-11 2022-02-12 2022-02-13 2022-02-14
## 25 25 25 25 25 25 25
## 2022-02-15 2022-02-16 2022-02-17 2022-02-18 2022-02-19 2022-02-20 2022-02-21
## 25 25 25 25 25 25 25
## 2022-02-22 2022-02-23 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28
## 25 25 25 25 25 25 25
## 2022-03-01 2022-03-02 2022-03-03 2022-03-04 2022-03-05 2022-03-06 2022-03-07
## 25 25 25 25 25 25 25
## 2022-03-08 2022-03-09 2022-03-10 2022-03-11 2022-03-12 2022-03-13 2022-03-14
## 25 25 25 25 25 25 25
## 2022-03-15 2022-03-16 2022-03-17 2022-03-18 2022-03-19 2022-03-20 2022-03-21
## 25 25 25 25 25 25 25
## 2022-03-22 2022-03-23 2022-03-24 2022-03-25 2022-03-26 2022-03-27 2022-03-28
## 25 25 25 25 25 25 25
## 2022-03-29 2022-03-30 2022-03-31
## 25 25 25
table(airseoul1$region)
##
## 강남구 강동구 강북구 강서구 관악구 광진구 구로구 금천구
## 59 59 59 59 59 59 59 59
## 노원구 도봉구 동대문구 동작구 마포구 서대문구 서초구 성동구
## 59 59 59 59 59 59 59 59
## 성북구 송파구 양천구 영등포구 용산구 은평구 종로구 중구
## 59 59 59 59 59 59 59 59
## 중랑구
## 59
summary(airseoul1$pm10)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.00 27.00 36.00 40.54 50.00 112.00 7
summary(airseoul1$pm2.5)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 13.00 19.00 23.28 32.00 92.00 3
# Keep only rows where both particulate readings are present.
airseoul1 <- filter(airseoul1, !(is.na(pm10) | is.na(pm2.5)))
# Disabled draft: grade each row by pm10 and count rows per grade.
# NOTE(review): the draft referred to an undefined `pm`; it reads `pm10` here.
# airseoul1 %>%
#   mutate(pm_grade = ifelse(pm10 <= 30, "good", ifelse(pm10 <= 81, "normal", ifelse(pm10 <= 150, "bad", "worse")))) %>%
#   group_by(pm_grade) %>%
#   summarize(n = n())
# Rows that share the lowest pm2.5 value, ordered from highest to lowest pm10.
arrange(filter(airseoul1, pm2.5 == min(pm2.5)), desc(pm10))
## # A tibble: 6 × 4
## date region pm10 pm2.5
## <chr> <chr> <dbl> <dbl>
## 1 2022-03-18 성동구 7 1
## 2 2022-03-18 구로구 6 1
## 3 2022-03-18 서초구 6 1
## 4 2022-03-19 구로구 5 1
## 5 2022-03-18 서대문구 5 1
## 6 2022-03-19 종로구 4 1
subway_202203<-read.csv("CARD_SUBWAY_MONTH_202203.csv",fileEncoding="euc-kr")
str(subway_202203)
## 'data.frame': 18467 obs. of 6 variables:
## $ 사용일자 : int 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 ...
## $ 노선명 : chr "장항선" "장항선" "장항선" "안산선" ...
## $ 역명 : chr "배방" "온양온천" "신창(순천향대)" "오이도" ...
## $ 승차총승객수: int 593 2388 1065 4789 1892 2122 1360 1836 2211 1899 ...
## $ 하차총승객수: int 698 2517 1164 4668 1693 2228 1331 1663 2122 1814 ...
## $ 등록일자 : int 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 ...
library(dplyr)
# Replace the Korean column headers with short English names, drop the
# registration-date column, and summarise the remaining columns.
subway_202203 <- select(
  rename(
    subway_202203,
    date = "사용일자",
    line = "노선명",
    station = "역명",
    on_pass = "승차총승객수",
    off_pass = "하차총승객수"
  ),
  -"등록일자"
)
summary(subway_202203)
## date line station on_pass
## Min. :20220301 Length:18467 Length:18467 Min. : 1
## 1st Qu.:20220308 Class :character Class :character 1st Qu.: 3078
## Median :20220316 Mode :character Mode :character Median : 6334
## Mean :20220316 Mean : 8852
## 3rd Qu.:20220324 3rd Qu.:11838
## Max. :20220331 Max. :80279
## off_pass
## Min. : 0
## 1st Qu.: 2989
## Median : 6229
## Mean : 8823
## 3rd Qu.:11742
## Max. :78816
subway_202203 %>% summarise(on_p=mean(on_pass),off_p=mean(off_pass))
## on_p off_p
## 1 8851.886 8822.759
subway_202203 %>% filter(on_pass==max(on_pass))
## date line station on_pass off_pass
## 1 20220325 2호선 강남 80279 78816
# For each station, average on_pass + off_pass over that station's rows and
# show the six stations with the highest average.
subway_202203 %>%
  group_by(station) %>%
  summarize(m = mean(on_pass + off_pass)) %>%
  arrange(desc(m)) %>%
  head()
## # A tibble: 6 × 2
## station m
## <chr> <dbl>
## 1 강남 125027.
## 2 신림 101545.
## 3 구로디지털단지 88652.
## 4 서울대입구(관악구청) 77404.
## 5 역삼 75427.
## 6 삼성(무역센터) 74311.
# No. 4: the "1호선" record(s) with the largest on_pass + off_pass.
subway_202203 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
## date line station on_pass off_pass total_pass
## 1 20220325 1호선 서울역 41104 41346 82450
#5번
table(subway_202203$date)
##
## 20220301 20220302 20220303 20220304 20220305 20220306 20220307 20220308
## 593 598 597 598 595 599 597 599
## 20220309 20220310 20220311 20220312 20220313 20220314 20220315 20220316
## 595 597 597 595 594 597 594 596
## 20220317 20220318 20220319 20220320 20220321 20220322 20220323 20220324
## 597 597 594 595 596 593 594 594
## 20220325 20220326 20220327 20220328 20220329 20220330 20220331
## 595 597 595 595 595 593 596
# Characters 7-8 of the yyyymmdd `date` value are the day of the month.
subway_202203$day <- substring(subway_202203$date, first = 7, last = 8)
table(subway_202203$day)
##
## 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20
## 593 598 597 598 595 599 597 599 595 597 597 595 594 597 594 596 597 597 594 595
## 21 22 23 24 25 26 27 28 29 30 31
## 596 593 594 594 595 597 595 595 595 593 596
# Convert the two-character day of month to a number.
subway_202203$day<-as.numeric(subway_202203$day)
# Label each record "weekend" or "weekday" from its actual calendar date
# instead of a hand-written list of day numbers, so the classification stays
# correct for any month. POSIXlt's `wday` is 0 for Sunday through 6 for
# Saturday and does not depend on the session locale.
subway_202203$week<-ifelse(
  as.POSIXlt(as.Date(as.character(subway_202203$date),format="%Y%m%d"))$wday%in%c(0,6),
  "weekend",
  "weekday"
)
table(subway_202203$week)
##
## weekday weekend
## 13703 4764
# Prefer fixed notation over scientific notation when printing numbers.
options(scipen = 999)
# Combined boardings and alightings for each record.
subway_202203$total_pass <- subway_202203$on_pass + subway_202203$off_pass
# Two-sample t-test comparing total_pass between the two `week` groups.
t.test(total_pass ~ week, data = subway_202203)
##
## Welch Two Sample t-test
##
## data: total_pass by week
## t = 32.794, df = 12509, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
## 7342.155 8275.667
## sample estimates:
## mean in group weekday mean in group weekend
## 19689.14 11880.23
#1분석데이터 구조 확인하기
library(foreign)
koweps<-read.spss("koweps_h16_2021_beta1.sav")
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Compression bias (0) is not the usual value of 100
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Very long string record(s) found (record type 7, subtype 14), each will be
## imported in consecutive separate variables
class(koweps)
## [1] "list"
korwps_21<-as.data.frame(koweps)
house<-korwps_21 %>% select(h1601_4,h1601_5,h1601_6,h16_reg5,h1608_114,h1608_122)
str(house)
## 'data.frame': 5996 obs. of 6 variables:
## $ h1601_4 : num 2 1 1 1 2 2 1 1 1 1 ...
## $ h1601_5 : num 1945 1948 1942 1962 1940 ...
## $ h1601_6 : num 4 3 7 6 3 5 4 6 7 5 ...
## $ h16_reg5 : num 1 1 1 1 3 1 1 1 1 1 ...
## $ h1608_114: num NA NA NA 4392 NA ...
## $ h1608_122: num NA 1980 621 NA 324 NA NA 285 1500 NA ...
library(dplyr)
# Give the selected survey variables readable names, then summarise them.
house1 <- rename(
  house,
  gender = h1601_4,
  birth = h1601_5,
  edu = h1601_6,
  region = h16_reg5,
  r_salary = h1608_114,
  t_salary = h1608_122
)
summary(house1)
## gender birth edu region r_salary
## Min. :1.000 Min. :1922 Min. :2.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.:1942 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 3280
## Median :1.000 Median :1955 Median :5.000 Median :3.000 Median : 4620
## Mean :1.357 Mean :1957 Mean :4.635 Mean :2.702 Mean : 5250
## 3rd Qu.:2.000 3rd Qu.:1970 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 6620
## Max. :2.000 Max. :2001 Max. :9.000 Max. :5.000 Max. :85860
## NA's :4566
## t_salary
## Min. : 0
## 1st Qu.: 297
## Median : 1040
## Mean : 1552
## 3rd Qu.: 2340
## Max. :14580
## NA's :4384
#2 Check missing values and outliers
# Treat a salary of 0 as missing in each salary column.
house1$r_salary<-ifelse(house1$r_salary==0,NA,house1$r_salary)
# Recode t_salary from its own values; testing and copying r_salary here
# would overwrite t_salary with the r_salary column.
house1$t_salary<-ifelse(house1$t_salary==0,NA,house1$t_salary)
# Age computed as 2021 - birth year + 1.
house1$age<-2021-house1$birth+1
range(house1$age)
## [1] 21 100
#3 범주형 분류 재부호하기