library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the built-in airquality data set and preview its columns.
data(list = "airquality")
glimpse(airquality)
## Rows: 153
## Columns: 6
## $ Ozone   <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 14, …
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290, 27…
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9, 9…
## $ Temp    <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58, 64…
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
# Per-column summary statistics; NA counts are part of the printed result.
summary(object = airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# Lower-case every column name so later code can refer to `ozone`, `solar.r`, ...
colnames(airquality) <- tolower(colnames(airquality))
colnames(airquality)
## [1] "ozone"   "solar.r" "wind"    "temp"    "month"   "day"
# Element-wise missing-value flags for the ozone readings (TRUE = missing).
is.na(airquality[["ozone"]])
##   [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
##  [49] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
# Missing vs. non-missing cells over the whole data frame.
table( is.na(airquality) )
## 
## FALSE  TRUE 
##   874    44
# The same count restricted to the ozone column.
table( is.na(airquality[["ozone"]]) )
## 
## FALSE  TRUE 
##   116    37
# Column-by-column breakdown of the missing-value flags.
summary(object = is.na(airquality))
##    ozone          solar.r           wind            temp        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:116       FALSE:146       FALSE:153       FALSE:153      
##  TRUE :37        TRUE :7                                        
##    month            day         
##  Mode :logical   Mode :logical  
##  FALSE:153       FALSE:153      
## 
# Without na.rm a single NA makes the whole aggregate NA.
sum(airquality[["ozone"]])
## [1] NA
mean(airquality[["ozone"]])
## [1] NA
# Skipping the missing readings gives usable results.
sum(airquality[["ozone"]], na.rm = TRUE)
## [1] 4887
mean(airquality[["ozone"]], na.rm = TRUE)
## [1] 42.12931
# Keep only the rows that have no missing value in any column.
airquality <- stats::na.omit(airquality)
table( is.na(airquality) )
## 
## FALSE 
##   666
library(dplyr)
colnames(airquality) <- tolower(colnames(airquality))
# NOTE: na.omit() above already removed every row containing an NA (the table
# shows no TRUE cells), so the two filters below do not drop any further rows.
airquality %>%
  filter(!is.na(ozone)) %>%
  head(n = 3)
##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
airquality %>%
  filter(!is.na(ozone), !is.na(solar.r)) %>%
  head(n = 3)
##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
# Mean ozone over the complete rows only.
mean(airquality[["ozone"]], na.rm = TRUE)
## [1] 42.0991
# Small demo data frame: `gender` holds character codes, `income` holds numbers.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)
ott7
##   gender income
## 1      1    200
## 2      1    250
## 3      2    200
## 4      2    300
## 5      2    200
## 6      3    150
# How often each gender code occurs.
table(ott7[["gender"]])
## 
## 1 2 3 
## 2 3 1
# Draw the boxplot and print its five summary values (whisker ends, hinges, median).
boxplot(iris[["Sepal.Width"]])[["stats"]]

##      [,1]
## [1,]  2.2
## [2,]  2.8
## [3,]  3.0
## [4,]  3.3
## [5,]  4.0
library(dplyr)
# List the rows whose sepal width lies outside the 2.2-4.0 whisker range.
iris %>%
  filter(!(Sepal.Width <= 4.0 & Sepal.Width >= 2.2))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.7         4.4          1.5         0.4     setosa
## 2          5.2         4.1          1.5         0.1     setosa
## 3          5.5         4.2          1.4         0.2     setosa
## 4          5.0         2.0          3.5         1.0 versicolor
# Replace sepal-width outliers with NA.
# The limits are the boxplot whisker ends (elements 1 and 5 of the
# five-number `stats` vector). They are computed from the data instead of
# being hard-coded as 2.2 and 4.0, so the rule stays valid if the data change.
sepal_whiskers <- boxplot.stats(iris$Sepal.Width)$stats[c(1, 5)]
is_outlier <- iris$Sepal.Width < sepal_whiskers[1] |
  iris$Sepal.Width > sepal_whiskers[2]
# which() drops NA comparisons, so re-running this on already-cleaned data is safe.
iris$Sepal.Width[which(is_outlier)] <- NA
# Count how many values are now missing.
table(is.na(iris$Sepal.Width))
## 
## FALSE  TRUE 
##   146     4
# Exercise 1: destinations with the largest and the smallest number of flights.
# (Run install.packages("hflights") once beforehand if the package is missing.)
library(hflights)
library(dplyr)
hflights %>%
  count(Dest) %>%
  filter(n %in% range(n))
##   Dest    n
## 1  AGS    1
## 2  DAL 9820
#5
#install.packages("ggplot2")

                            




library(dplyr)
library(readxl)
#install.packages("readxl")
library(readxl)
# Switch to the data folder; the relative file name below is resolved against it.
setwd(dir = "c:/data")
# Read the Excel workbook and inspect the imported column types.
airseoul <- read_excel(path = "period1.xlsx")
str(object = airseoul)
## tibble [1,535 × 8] (S3: tbl_df/tbl/data.frame)
##  $ 날짜                       : chr [1:1535] "전체" "2022-03-31" "2022-03-31" "2022-03-31" ...
##  $ 측정소명                   : chr [1:1535] "평균" "평균" "강남구" "강동구" ...
##  $ 미세먼지 PM10
## (㎍/m3)   : num [1:1535] 41 20 21 26 NA 23 19 21 23 17 ...
##  $ 초미세먼지
## PM2.5 (㎍/m3): num [1:1535] 23 11 10 13 17 9 9 10 10 9 ...
##  $ 오존
## O3 (ppm)           : chr [1:1535] "0.026" "0.032" "0.033" "0.026" ...
##  $ 이산화질소
## NO2 (ppm)    : chr [1:1535] "0.026" "0.014" "0.015" "0.014" ...
##  $ 일산화탄소
## CO (ppm)     : chr [1:1535] "0.5" "0.3" "0.3" "0.3" ...
##  $ 아황산가스
## SO2(ppm)     : num [1:1535] 0.003 0.002 0.003 0.002 0.003 0.003 0.002 0.003 0.003 0.002 ...
# Bind `select` to dplyr's implementation (presumably to guard against another
# attached package masking it), then list the raw column names.
select <- dplyr::select
colnames(airseoul)
## [1] "날짜"                        "측정소명"                   
## [3] "미세먼지 PM10\r\n(㎍/m3)"    "초미세먼지\r\nPM2.5 (㎍/m3)"
## [5] "오존\r\nO3 (ppm)"            "이산화질소\r\nNO2 (ppm)"    
## [7] "일산화탄소\r\nCO (ppm)"      "아황산가스\r\nSO2(ppm)"
# Give the columns used in this analysis short names and keep only those four.
# The original headers contain embedded "\r\n" line breaks, so they must be
# quoted exactly as names(airseoul) reports them.
airseoul1 <- airseoul %>%
  rename(
    date = "날짜",
    region = "측정소명",
    pm10 = "미세먼지 PM10\r\n(㎍/m3)",
    pm2.5 = "초미세먼지\r\nPM2.5 (㎍/m3)"
  ) %>%
  select(date, region, pm10, pm2.5)
# Tabulate the renamed columns on airseoul1. These calls previously used
# airseoul, which has no `date`/`region` columns, so they only produced
# "Unknown or uninitialised column" warnings and empty tables.
table(airseoul1$date)
table(airseoul1$region)
# Drop the rows labelled "전체" (total) in date or "평균" (average) in region,
# then count the remaining rows per date.
airseoul1 <- airseoul1 %>%
  filter(date != "전체", region != "평균")
table(airseoul1[["date"]])
## 
## 2022-02-01 2022-02-02 2022-02-03 2022-02-04 2022-02-05 2022-02-06 2022-02-07 
##         25         25         25         25         25         25         25 
## 2022-02-08 2022-02-09 2022-02-10 2022-02-11 2022-02-12 2022-02-13 2022-02-14 
##         25         25         25         25         25         25         25 
## 2022-02-15 2022-02-16 2022-02-17 2022-02-18 2022-02-19 2022-02-20 2022-02-21 
##         25         25         25         25         25         25         25 
## 2022-02-22 2022-02-23 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28 
##         25         25         25         25         25         25         25 
## 2022-03-01 2022-03-02 2022-03-03 2022-03-04 2022-03-05 2022-03-06 2022-03-07 
##         25         25         25         25         25         25         25 
## 2022-03-08 2022-03-09 2022-03-10 2022-03-11 2022-03-12 2022-03-13 2022-03-14 
##         25         25         25         25         25         25         25 
## 2022-03-15 2022-03-16 2022-03-17 2022-03-18 2022-03-19 2022-03-20 2022-03-21 
##         25         25         25         25         25         25         25 
## 2022-03-22 2022-03-23 2022-03-24 2022-03-25 2022-03-26 2022-03-27 2022-03-28 
##         25         25         25         25         25         25         25 
## 2022-03-29 2022-03-30 2022-03-31 
##         25         25         25
# Number of rows per region value.
table(airseoul1[["region"]])
## 
##   강남구   강동구   강북구   강서구   관악구   광진구   구로구   금천구 
##       59       59       59       59       59       59       59       59 
##   노원구   도봉구 동대문구   동작구   마포구 서대문구   서초구   성동구 
##       59       59       59       59       59       59       59       59 
##   성북구   송파구   양천구 영등포구   용산구   은평구   종로구     중구 
##       59       59       59       59       59       59       59       59 
##   중랑구 
##       59
# Distribution of pm10, including its NA count.
summary(airseoul1[["pm10"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    3.00   27.00   36.00   40.54   50.00  112.00       7
# Distribution of pm2.5, including its NA count.
summary(airseoul1[["pm2.5"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   13.00   19.00   23.28   32.00   92.00       3
# Keep only the rows where both particulate readings are present.
airseoul1 <- airseoul1 %>%
  filter(!(is.na(pm10) | is.na(pm2.5)))

# Disabled draft: grade each row by its pm10 value and count rows per grade.
# (The innermost test originally referenced `pm`, which is not a column of
# airseoul1; it is written as `pm10` here.)
# airseoul1 %>%
#   mutate(pm_grade = ifelse(pm10 <= 30, "good",
#     ifelse(pm10 <= 81, "normal", ifelse(pm10 <= 150, "bad", "worse")))) %>%
#   group_by(pm_grade) %>%
#   summarize(n = n())


# Rows with the lowest pm2.5 reading, highest pm10 first.
arrange(filter(airseoul1, pm2.5 == min(pm2.5)), desc(pm10))
## # A tibble: 6 × 4
##   date       region    pm10 pm2.5
##   <chr>      <chr>    <dbl> <dbl>
## 1 2022-03-18 성동구       7     1
## 2 2022-03-18 구로구       6     1
## 3 2022-03-18 서초구       6     1
## 4 2022-03-19 구로구       5     1
## 5 2022-03-18 서대문구     5     1
## 6 2022-03-19 종로구       4     1
# Read the subway ridership CSV; the file is decoded as EUC-KR.
subway_202203 <- read.csv(
  file = "CARD_SUBWAY_MONTH_202203.csv",
  fileEncoding = "euc-kr"
)

str(object = subway_202203)
## 'data.frame':    18467 obs. of  6 variables:
##  $ 사용일자    : int  20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 20220301 ...
##  $ 노선명      : chr  "장항선" "장항선" "장항선" "안산선" ...
##  $ 역명        : chr  "배방" "온양온천" "신창(순천향대)" "오이도" ...
##  $ 승차총승객수: int  593 2388 1065 4789 1892 2122 1360 1836 2211 1899 ...
##  $ 하차총승객수: int  698 2517 1164 4668 1693 2228 1331 1663 2122 1814 ...
##  $ 등록일자    : int  20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 20220304 ...
library(dplyr)
# Replace the Korean headers with short English names and drop the
# "등록일자" column, which is not used afterwards.
subway_202203 <- subway_202203 %>%
  rename(
    date = "사용일자",
    line = "노선명",
    station = "역명",
    on_pass = "승차총승객수",
    off_pass = "하차총승객수"
  ) %>%
  select(-"등록일자")

summary(object = subway_202203)
##       date              line             station             on_pass     
##  Min.   :20220301   Length:18467       Length:18467       Min.   :    1  
##  1st Qu.:20220308   Class :character   Class :character   1st Qu.: 3078  
##  Median :20220316   Mode  :character   Mode  :character   Median : 6334  
##  Mean   :20220316                                         Mean   : 8852  
##  3rd Qu.:20220324                                         3rd Qu.:11838  
##  Max.   :20220331                                         Max.   :80279  
##     off_pass    
##  Min.   :    0  
##  1st Qu.: 2989  
##  Median : 6229  
##  Mean   : 8823  
##  3rd Qu.:11742  
##  Max.   :78816
# Mean of on_pass and of off_pass over all records.
summarise(subway_202203, on_p = mean(on_pass), off_p = mean(off_pass))
##       on_p    off_p
## 1 8851.886 8822.759
# Record(s) with the highest on_pass value.
filter(subway_202203, on_pass == max(on_pass))
##       date  line station on_pass off_pass
## 1 20220325 2호선    강남   80279    78816
# Six stations with the highest mean of (on_pass + off_pass) per record.
subway_202203 %>%
  mutate(total_pass = on_pass + off_pass) %>%
  group_by(station) %>%
  summarize(m = mean(total_pass)) %>%
  arrange(desc(m)) %>%
  head()
## # A tibble: 6 × 2
##   station                    m
##   <chr>                  <dbl>
## 1 강남                 125027.
## 2 신림                 101545.
## 3 구로디지털단지        88652.
## 4 서울대입구(관악구청)  77404.
## 5 역삼                  75427.
## 6 삼성(무역센터)        74311.
# Exercise 4: record with the largest on_pass + off_pass on line "1호선".
# The line filter runs before the max() filter so the maximum is taken over
# line-1 rows only.
subway_202203 %>%
  filter(line == "1호선") %>%
  mutate(total_pass = on_pass + off_pass) %>%
  filter(total_pass == max(total_pass))
##       date  line station on_pass off_pass total_pass
## 1 20220325 1호선  서울역   41104    41346      82450
# Exercise 5: number of records per service date.
table(subway_202203[["date"]])
## 
## 20220301 20220302 20220303 20220304 20220305 20220306 20220307 20220308 
##      593      598      597      598      595      599      597      599 
## 20220309 20220310 20220311 20220312 20220313 20220314 20220315 20220316 
##      595      597      597      595      594      597      594      596 
## 20220317 20220318 20220319 20220320 20220321 20220322 20220323 20220324 
##      597      597      594      595      596      593      594      594 
## 20220325 20220326 20220327 20220328 20220329 20220330 20220331 
##      595      597      595      595      595      593      596
# Characters 7-8 of the yyyymmdd value are the day of month (kept as text here).
subway_202203$day <- substring(subway_202203$date, 7, 8)
table(subway_202203[["day"]])
## 
##  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20 
## 593 598 597 598 595 599 597 599 595 597 597 595 594 597 594 596 597 597 594 595 
##  21  22  23  24  25  26  27  28  29  30  31 
## 596 593 594 594 595 597 595 595 595 593 596
# Store the day of month as a number.
subway_202203$day <- as.numeric(subway_202203$day)
# Classify each record from its actual calendar date instead of a hard-coded
# list of March-2022 weekend days, so the rule also holds for other months.
# "%u" is the weekday number (1 = Monday ... 7 = Sunday) and does not depend
# on the session locale.
iso_weekday <- format(
  as.Date(as.character(subway_202203$date), format = "%Y%m%d"),
  "%u"
)
subway_202203$week <- ifelse(iso_weekday %in% c("6", "7"), "weekend", "weekday")
# Number of records in each group.
table(subway_202203$week)
## 
## weekday weekend 
##   13703    4764
# Prefer fixed notation over scientific notation when printing numbers.
options(scipen = 999)
subway_202203 <- mutate(subway_202203, total_pass = on_pass + off_pass)
# Compare mean total_pass between the weekday and weekend groups.
t.test(total_pass ~ week, data = subway_202203)
## 
##  Welch Two Sample t-test
## 
## data:  total_pass by week
## t = 32.794, df = 12509, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group weekday and group weekend is not equal to 0
## 95 percent confidence interval:
##  7342.155 8275.667
## sample estimates:
## mean in group weekday mean in group weekend 
##              19689.14              11880.23
# 1. Inspect the structure of the analysis data
library(foreign)
# Import the SPSS file; as called here read.spss() returns a list (see the
# class() check below), which is converted to a data frame afterwards.
koweps<-read.spss("koweps_h16_2021_beta1.sav")
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Compression bias (0) is not the usual value of 100
## Warning in read.spss("koweps_h16_2021_beta1.sav"): koweps_h16_2021_beta1.sav:
## Very long string record(s) found (record type 7, subtype 14), each will be
## imported in consecutive separate variables
# Confirm what read.spss() returned.
class(koweps)
## [1] "list"
# Convert the imported list to a data frame and keep the six variables used below.
korwps_21 <- as.data.frame(koweps)
house <- select(
  korwps_21,
  h1601_4, h1601_5, h1601_6, h16_reg5, h1608_114, h1608_122
)
str(object = house)
## 'data.frame':    5996 obs. of  6 variables:
##  $ h1601_4  : num  2 1 1 1 2 2 1 1 1 1 ...
##  $ h1601_5  : num  1945 1948 1942 1962 1940 ...
##  $ h1601_6  : num  4 3 7 6 3 5 4 6 7 5 ...
##  $ h16_reg5 : num  1 1 1 1 3 1 1 1 1 1 ...
##  $ h1608_114: num  NA NA NA 4392 NA ...
##  $ h1608_122: num  NA 1980 621 NA 324 NA NA 285 1500 NA ...
library(dplyr)
# Replace the survey variable codes with readable column names.
house1 <- rename(
  house,
  gender = h1601_4,
  birth = h1601_5,
  edu = h1601_6,
  region = h16_reg5,
  r_salary = h1608_114,
  t_salary = h1608_122
)
summary(object = house1)
##      gender          birth           edu            region         r_salary    
##  Min.   :1.000   Min.   :1922   Min.   :2.000   Min.   :1.000   Min.   :    0  
##  1st Qu.:1.000   1st Qu.:1942   1st Qu.:3.000   1st Qu.:2.000   1st Qu.: 3280  
##  Median :1.000   Median :1955   Median :5.000   Median :3.000   Median : 4620  
##  Mean   :1.357   Mean   :1957   Mean   :4.635   Mean   :2.702   Mean   : 5250  
##  3rd Qu.:2.000   3rd Qu.:1970   3rd Qu.:6.000   3rd Qu.:3.000   3rd Qu.: 6620  
##  Max.   :2.000   Max.   :2001   Max.   :9.000   Max.   :5.000   Max.   :85860  
##                                                                 NA's   :4566   
##     t_salary    
##  Min.   :    0  
##  1st Qu.:  297  
##  Median : 1040  
##  Mean   : 1552  
##  3rd Qu.: 2340  
##  Max.   :14580  
##  NA's   :4384
# 2. Check missing values and outliers

# Treat a recorded salary of 0 as missing.
house1$r_salary <- ifelse(house1$r_salary == 0, NA, house1$r_salary)
# Fixed: this line previously tested and copied r_salary, which overwrote
# t_salary with r_salary's values instead of cleaning t_salary itself.
house1$t_salary <- ifelse(house1$t_salary == 0, NA, house1$t_salary)
# Age computed as 2021 - birth year + 1 (the birth year counts as year 1).
house1$age <- 2021 - house1$birth + 1
range(house1$age)
## [1]  21 100
#3 범주형 분류 재부호하기