# Q1
# install.packages('RSelenium')
# install.packages('stringr')
library(RSelenium)
## Warning: package 'RSelenium' was built under R version 4.0.3
library(rvest)
## Loading required package: xml2
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.3
# 우선 강의내용과 같이 명령프롬트콜을 통해 서버를 입력했습니다.
remDr<- remoteDriver(
remoteServerAddr = 'localhost',
port = 4445L, #서버
browserName = 'chrome'
)
# 크롬 열기!
remDr$open()
## [1] "Connecting to remote server"
## $acceptInsecureCerts
## [1] FALSE
##
## $browserName
## [1] "chrome"
##
## $browserVersion
## [1] "86.0.4240.198"
##
## $chrome
## $chrome$chromedriverVersion
## [1] "86.0.4240.22 (398b0743353ff36fb1b82468f63a3a93b4e2e89e-refs/branch-heads/4240@{#378})"
##
## $chrome$userDataDir
## [1] "C:\\Users\\huyn0\\AppData\\Local\\Temp\\scoped_dir22228_1255136749"
##
##
## $`goog:chromeOptions`
## $`goog:chromeOptions`$debuggerAddress
## [1] "localhost:60065"
##
##
## $networkConnectionEnabled
## [1] FALSE
##
## $pageLoadStrategy
## [1] "normal"
##
## $platformName
## [1] "windows"
##
## $proxy
## named list()
##
## $setWindowRect
## [1] TRUE
##
## $strictFileInteractability
## [1] FALSE
##
## $timeouts
## $timeouts$implicit
## [1] 0
##
## $timeouts$pageLoad
## [1] 300000
##
## $timeouts$script
## [1] 30000
##
##
## $unhandledPromptBehavior
## [1] "dismiss and notify"
##
## $`webauthn:virtualAuthenticators`
## [1] TRUE
##
## $webdriver.remote.sessionid
## [1] "46501e384430ed87961c569c1f077073"
##
## $id
## [1] "46501e384430ed87961c569c1f077073"
# 크롬으로 파파고 열기!
remDr$navigate('https://papago.naver.com/')
# 크롤링(번역할 내용에 대해)
webElem1 <- remDr$findElement(using = 'id',
value = 'txtSource')
# 확인하기
webElem1$highlightElement()
# 기존 타이핑된 입력값 없앰.
webElem1$clearElement
## Class method definition for method clearElement()
## function ()
## {
## "Clear a TEXTAREA or text INPUT element's value."
## qpath <- sprintf("%s/session/%s/element/%s/clear", serverURL,
## sessionInfo[["id"]], elementId)
## queryRD(qpath, "POST")
## }
## <environment: 0x0000000018c1c5e8>
##
## Methods used:
## "queryRD"
# sentece에 할말 저장 # 원문
sentence <- list('부장님 싫다. 직장생활 어렵다.')
# sentence에 저장된 할말 적용하기
webElem1$sendKeysToElement(sentence)
# 3초 정도 텀두기
Sys.sleep(3)
# 크롤링(또 다른 변수에 번역된 결과 값)
webElem2 <- remDr$findElement(using = 'id',
value = 'txtTarget')
# 확인하기
webElem2$highlightElement()
#번역된 결과값을 result에 저장하기
result <- webElem2$getElementText()
result_clean <- stringr::str_split(result, '\\.')
# 번역된 결과값이 r에 도출됨.
result_clean
## [[1]]
## [1] "I don't like the manager" " Work is hard"
## [3] ""
#_-----------------------------------------------------#
# Q2
rm(list =ls())
# 필요한 라이브러리
library(tidyverse)
## -- Attaching packages ---------------------------------------------- tidyverse 1.3.0 --
## ¡î ggplot2 3.3.2 ¡î purrr 0.3.4
## ¡î tibble 3.0.3 ¡î dplyr 1.0.2
## ¡î tidyr 1.1.2 ¡î forcats 0.5.0
## ¡î readr 1.3.1
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(rvest)
#install.packages('ggthemes')
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(zoo) # ma계산을 위해 zoo 필요
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# 영어할 때는 필요없지만, 한글할 때에는 필요한 장치가 존재함.
Sys.setlocale('LC_ALL', 'English')
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
url <- 'http://www.busan.go.kr/covid19/Status02.do'
# html을 전부 가져온다.
contents <- read_html(url) %>%
html_nodes('table') %>%
html_table(fill = TRUE) # 컬럼수가 일정하지 않을 때 fill=TRUE사용
# 필요한 컨텐츠가 4에 있는 것을 확인!
df <- contents[[4]]
names(df) # 컬럼이름 확인
## [1] "<U+B0A0><U+C9DC>" "<U+B0A0><U+C9DC>" "<U+D655><U+C9C4><U+C790><U+C218>"
## [4] "<U+D655><U+C9C4><U+C790><U+C218>" "<U+C644><U+CE58><U+C790><U+C218>(<U+B204><U+ACC4>)" "<U+C0AC><U+B9DD><U+C790><U+C218>(<U+B204><U+ACC4>)"
## [7] "<U+AC80><U+C0AC><U+ACB0><U+ACFC><U+C74C><U+C131>(<U+B204><U+ACC4>)" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>"
names(df) <- c( # 컬럼이름 바꾸기
'date',
'day',
'cumulated',
'new',
'cured_cum',
'death_cum',
'test_n_cum',
'quar',
'relased_cum')
# 수정된 변수이름 확인
names(df)
## [1] "date" "day" "cumulated" "new" "cured_cum"
## [6] "death_cum" "test_n_cum" "quar" "relased_cum"
# 불필요한 첫 행 빼기
df <- df[-1,]
df$date <- as.Date(df$date) # 날짜형으로 바꾸기
df$new <- as.numeric(df$new) # 숫자형으로바꾸기
head(df) # 확인하기
## date day cumulated new cured_cum death_cum test_n_cum quar
## 2 2020-12-01 <U+D654> 864 50 601 16 175,254 7,408
## 3 2020-11-30 <U+C6D4> 814 11 597 16 174,154 7,018
## 4 2020-11-29 <U+C77C> 803 51 595 16 173,116 6,699
## 5 2020-11-28 <U+D1A0> 752 25 591 15 171,510 5,454
## 6 2020-11-27 <U+AE08> 727 26 590 15 170,388 4,954
## 7 2020-11-26 <U+BAA9> 701 22 590 15 169,224 4,557
## relased_cum
## 2 58,496
## 3 58,237
## 4 57,946
## 5 57,508
## 6 57,362
## 7 56,982
# 1. 코로나 신규확진자 현황확인
df %>%
ggplot(aes(x= date))+
geom_col(aes(y = new),
fill = 'white', # 막대기 색
color = 'red', # 테두리 색
size = 0.3)

# 2. 코로나 신규확진자 현황 (ma(7일단위)), 날짜, 제목 등 추가)
# ma= mobing average
df %>% # ma= mobing average
mutate(ma = rollmean(new, 7, aligh = "right", fill=0)) %>% # 'new'신규확진자를 7일로 나누어 평균.
ggplot(aes(x = date))+
geom_col(aes(y = new),
fill = 'red', #
color = 'red',
size = 0.3) +
geom_line(aes(y = ma),
color = 'black', size =2)+
geom_area(aes(y = ma),
color = 'red', alpha = 0.4) +
scale_x_date(labels = date_format(format = '%B'), #%B= 매 월 단위
breaks = 'month')+ # 월 단위
scale_y_continuous(labels = comma_format()) + # y값이 크면 소수값나오게
labs(
x= 'Date',
y= 'Newly confirmed cases',
title = 'COVID-19 WAVE IN BUSAN',
sbutitle = 'Red line is 7day moving average'
)+
theme_minimal()

# log 표현 코로나 신규확진자 현황
df %>%
mutate(ma = rollmean(new, 7, aligh = 'right', fill=0)) %>%
ggplot(aes(x= date))+
geom_col(aes(y = new),
fill = 'red',
color = 'red',
size = 0.3) +
geom_line(aes(y = ma),
color = 'black', size =2)+
geom_area(aes(y = ma),
color = 'red', alpha = 0.25) +
scale_x_date(labels = date_format(format = '%B'),
breaks = 'month')+
scale_y_log10() # y축을 로그값으로 그린다.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 112 rows containing missing values (geom_col).

#---------------------------------------------#
# Q3
rm(list =ls())
# 필요한 라이브러리
# install.packages('AER')
library(AER)
## Warning: package 'AER' was built under R version 4.0.3
## Loading required package: car
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Loading required package: lmtest
## Loading required package: sandwich
## Loading required package: survival
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyverse)
library(RColorBrewer)
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
data('CPS1988')
# head(CPS1988)
# 간단한 CPS로 변수 변환
CPS <- CPS1988
# head(CPS)
str(CPS)
## 'data.frame': 28155 obs. of 7 variables:
## $ wage : num 355 123 370 755 594 ...
## $ education : int 7 12 9 11 12 16 8 12 12 14 ...
## $ experience: int 45 1 9 46 36 22 51 34 0 18 ...
## $ ethnicity : Factor w/ 2 levels "cauc","afam": 1 1 1 1 1 1 1 1 1 1 ...
## $ smsa : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ region : Factor w/ 4 levels "northeast","midwest",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ parttime : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
view(CPS)
# 형태를 정수형으로로 바꾼다. 그릴 때 편함.
CPS$ethnicity <- as.integer(CPS$ethnicity)
CPS$smsa <- as.integer(CPS$smsa)
CPS$region <- as.integer(CPS$region)
CPS$parttime <- as.integer(CPS$parttime)
# 교육과 수익률을 추정하기
fig <- CPS %>%
ggplot(mapping = aes(x=education, y=wage))+
geom_point(color = 'red',
size = 2,
alpha = 0.25)+
geom_smooth(method = 'lm',
linetype = 'dashed',
color = 'black',)+
labs(
x= 'Education',
y ='Wage',
title = 'Wage and Education Relationship'
)
fig
## `geom_smooth()` using formula 'y ~ x'

# 파트 타임 결정에 미치는 모형 구성 및 추정
lm.parttime <- lm(parttime~ ., data=CPS) # 종속변수 y
lm.parttime
##
## Call:
## lm(formula = parttime ~ ., data = CPS)
##
## Coefficients:
## (Intercept) wage education experience ethnicity smsa
## 1.0742881 -0.0001732 0.0064784 -0.0004421 -0.0048351 0.0201261
## region
## 0.0053441
summary(lm.parttime)
##
## Call:
## lm(formula = parttime ~ ., data = CPS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22628 -0.12478 -0.08624 -0.03524 3.04425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.074e+00 1.362e-02 78.894 < 2e-16 ***
## wage -1.732e-04 4.023e-06 -43.055 < 2e-16 ***
## education 6.478e-03 6.397e-04 10.128 < 2e-16 ***
## experience -4.421e-04 1.378e-04 -3.209 0.00133 **
## ethnicity -4.835e-03 6.126e-03 -0.789 0.42992
## smsa 2.013e-02 3.811e-03 5.281 1.3e-07 ***
## region 5.344e-03 1.544e-03 3.461 0.00054 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2752 on 28148 degrees of freedom
## Multiple R-squared: 0.07227, Adjusted R-squared: 0.07207
## F-statistic: 365.5 on 6 and 28148 DF, p-value: < 2.2e-16
stargazer::stargazer(lm.parttime,
type = 'text',
keep.stat = c('n', 'rsq'))
##
## ========================================
## Dependent variable:
## ---------------------------
## parttime
## ----------------------------------------
## wage -0.0002***
## (0.00000)
##
## education 0.006***
## (0.001)
##
## experience -0.0004***
## (0.0001)
##
## ethnicity -0.005
## (0.006)
##
## smsa 0.020***
## (0.004)
##
## region 0.005***
## (0.002)
##
## Constant 1.074***
## (0.014)
##
## ----------------------------------------
## Observations 28,155
## R2 0.072
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# 조정된 r의 값이 0.072의 설명력을 보인다.
# 값을 나누었을 때, wage,experience,ethnicity 컬럼은 t-value가 (-) 값이 나온다.
# 다른 나머지 컬럼들은 1.96보다 t값이 크게 나오는 것을 확인할 수 있다.
# t-value가 1.96보다 크면 통계적유의하기때매
# 귀무가설을 기각, 이때 기각할떄 실수하는 수준 = 유의수준이다.(오류)
# 유의수준을 0으로 만드는 것은 굉장히 힘듬. 그래서 유의수준 범위를 정해놓음
# 베이스 = 베타1이 0이라는 귀무가설