#1. Question 1 (40점)
다음과 같은 두 문장을 네이버 파파고를 통해 번역하는 구문을 짜시오. RSelenium 패키지를 사용하시오.
string <- “부장님 싫다. 직장생활 어렵다.”
library(RSelenium)
## Warning: package 'RSelenium' was built under R version 4.0.3
library(rvest)
## Loading required package: xml2
library(stringr)
remDr <- remoteDriver(
remoteServerAddr = "localhost",
port = 4445L,
browserName = 'chrome'
)
remDr$open() # 크롬이 하나 열림 R에서 크롬을 컨트롤하고 있는 것
## [1] "Connecting to remote server"
## $acceptInsecureCerts
## [1] FALSE
##
## $browserName
## [1] "chrome"
##
## $browserVersion
## [1] "87.0.4280.66"
##
## $chrome
## $chrome$chromedriverVersion
## [1] "86.0.4240.22 (398b0743353ff36fb1b82468f63a3a93b4e2e89e-refs/branch-heads/4240@{#378})"
##
## $chrome$userDataDir
## [1] "C:\\Users\\kim-n\\AppData\\Local\\Temp\\scoped_dir1240_1790011788"
##
##
## $`goog:chromeOptions`
## $`goog:chromeOptions`$debuggerAddress
## [1] "localhost:51459"
##
##
## $networkConnectionEnabled
## [1] FALSE
##
## $pageLoadStrategy
## [1] "normal"
##
## $platformName
## [1] "windows"
##
## $proxy
## named list()
##
## $setWindowRect
## [1] TRUE
##
## $strictFileInteractability
## [1] FALSE
##
## $timeouts
## $timeouts$implicit
## [1] 0
##
## $timeouts$pageLoad
## [1] 300000
##
## $timeouts$script
## [1] 30000
##
##
## $unhandledPromptBehavior
## [1] "dismiss and notify"
##
## $`webauthn:virtualAuthenticators`
## [1] TRUE
##
## $webdriver.remote.sessionid
## [1] "65fb461ef5df42ec5eb0a5e828cc8f80"
##
## $id
## [1] "65fb461ef5df42ec5eb0a5e828cc8f80"
##1. 네이버 파파고 열기
remDr$navigate("https://papago.naver.com/")
##2. string <- “부장님 싫다. 직장생활 어렵다.” 검색하기
webElement1 <- remDr$findElement(using = 'class',
value = 'edit_box___1KtZ3')
#확인하기
webElement1$highlightElement()
webElement1$clearElement()
#입력하기
webElement1$sendKeysToElement(list('부장님 싫다. 직장생활 어렵다.',
key = 'enter'))
##3. 결과값 가져오기
Sys.sleep(2) # 2초 쉬고
webElement2 <- remDr$findElement(using = 'id',
value = 'txtTarget')
# 확인하기
webElement2$highlightElement()
webElement2$getElementText() -> result #결과 result에 담기
result_clean <- stringr::str_split(result, "\\.")
result_clean
## [[1]]
## [1] "I don't like the manager" " Work is hard"
## [3] ""
#Question 2 (30점)
부산시 홈페이지에 가서 코로나19 신규 확진자 현황을 스크레이핑하고 추이를 그리시오.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## ¡î ggplot2 3.3.2 ¡î purrr 0.3.4
## ¡î tibble 3.0.3 ¡î dplyr 1.0.2
## ¡î tidyr 1.1.2 ¡î forcats 0.5.0
## ¡î readr 1.3.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(rvest)
library(zoo)
## Warning: package 'zoo' was built under R version 4.0.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# 그래프용
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(lubridate) # 날짜를 다뤄야하기 때문
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# 한글 다루기 위해
Sys.setlocale('LC_ALL', 'English')
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
타겟 url을 설정함
url <- "http://www.busan.go.kr/covid19/Status02.do"
html 테이블을 가져오기
contents <- read_html(url) %>%
html_nodes('#contents > div > table') %>%
html_table(fill = TRUE)
df <- contents[[1]]
names(df)
## [1] "<U+B0A0><U+C9DC>" "<U+B0A0><U+C9DC>" "<U+D655><U+C9C4><U+C790><U+C218>"
## [4] "<U+D655><U+C9C4><U+C790><U+C218>" "<U+C644><U+CE58><U+C790><U+C218>(<U+B204><U+ACC4>)" "<U+C0AC><U+B9DD><U+C790><U+C218>(<U+B204><U+ACC4>)"
## [7] "<U+AC80><U+C0AC><U+ACB0><U+ACFC><U+C74C><U+C131>(<U+B204><U+ACC4>)" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>"
# 변수 이름 영어로 변환
names(df) <- c(
'date',
'day',
'cumulated',
'new',
'cured_cum',
'death_cum',
'test_n_cum',
'quar',
'realased_cum'
)
names(df)
## [1] "date" "day" "cumulated" "new" "cured_cum"
## [6] "death_cum" "test_n_cum" "quar" "realased_cum"
# 첫번째 행 빼기
df <- df[-1,]
head(df)
## date day cumulated new cured_cum death_cum test_n_cum quar
## 2 2020-12-01 <U+D654> 864 50 601 16 175,254 7,408
## 3 2020-11-30 <U+C6D4> 814 11 597 16 174,154 7,018
## 4 2020-11-29 <U+C77C> 803 51 595 16 173,116 6,699
## 5 2020-11-28 <U+D1A0> 752 25 591 15 171,510 5,454
## 6 2020-11-27 <U+AE08> 727 26 590 15 170,388 4,954
## 7 2020-11-26 <U+BAA9> 701 22 590 15 169,224 4,557
## realased_cum
## 2 58,496
## 3 58,237
## 4 57,946
## 5 57,508
## 6 57,362
## 7 56,982
날짜별 신규 확진자 수 구하기
df$date <- as.Date(df$date)
df$new <- as.numeric(df$new)
df %>%
ggplot(aes(x = date)) +
geom_col(aes(y = new),
fill = '#FAC9C7',
color = 'white',
size = 0.2
)
x축 완성하기
df %>%
mutate(ma = rollmean(new, 7,aligh = 'right', fill = 0)) %>%
ggplot(aes(x = date)) +
geom_col(aes(y = new),
fill = '#FAC9C7',
color = 'white',
size = 0.2) +
geom_line(aes(y = ma),
color = '#CF1010', size = 1) +
geom_area(aes(y = ma),
color = '#CF1010',
alpha = 0.25) +
scale_x_date(labels = date_format(format = '%B'),
breaks = 'month')
# rollmean : 돌아가면서 평균을 내라
# geom_area() : # 그래프 밑에 색깔 칠해짐
# scale_x_date :날짜 추가 '%B' : 월이 보임, breaks = 'month': 매달
최종 그래프
# y축과 제목 완성하기
df %>%
mutate(ma = rollmean(new, 7,aligh = 'right', fill = 0)) %>%
ggplot(aes(x = date)) +
geom_col(aes(y = new),
fill = '#FAC9C7',
color = 'white',
size = 0.2) +
geom_line(aes(y = ma),
color = '#CF1010', size = 1) +
geom_area(aes(y = ma),
color = '#CF1010',
alpha = 0.25) +
scale_x_date(labels = date_format(format = '%B'),
breaks = 'month') +
scale_y_continuous(labels = comma_format()) +
labs(
x = 'Date',
y = 'Newly confirmed cases',
title = '3rd COVID-19 wave in Busan',
subtitle = "Red line : 7-day moving average"
)
“AER” 패키지를 설치하고 로딩하시오. 그리고 “CPS1988” 데이터를 불러들이시오.
# 라이브러리
library("AER")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Loading required package: lmtest
## Loading required package: sandwich
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.3
library(tidyverse)
library(ggplot2)
library(grid)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# 데이터 로딩
data('CPS1988')
attach(CPS1988)
str(CPS1988)
## 'data.frame': 28155 obs. of 7 variables:
## $ wage : num 355 123 370 755 594 ...
## $ education : int 7 12 9 11 12 16 8 12 12 14 ...
## $ experience: int 45 1 9 46 36 22 51 34 0 18 ...
## $ ethnicity : Factor w/ 2 levels "cauc","afam": 1 1 1 1 1 1 1 1 1 1 ...
## $ smsa : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ region : Factor w/ 4 levels "northeast","midwest",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ parttime : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##1. 교육수익률을 추정하시오.
교육수익률이란 1년 더 교육을 받았을 때 수익률(임금의 상승률)이 올랐는가를 의미한다.
###1. 히스토그램
# wage
CPS1988 %>%
ggplot(aes(x=wage)) +
geom_histogram(color= 'white') +
xlab('wage') -> fig1
fig1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#-> 로그-정규분포 로그를 취하여 정규분포를 취한다.
#log-wage
CPS1988 %>%
ggplot(aes(x=log(wage))) +
geom_histogram(color= 'white') +
xlab('log-wage') -> fig2
fig2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
두 가지 그래프 비교하기
gridExtra::grid.arrange(fig1, fig2, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# 비교결과 : log(wage)를 이용한다.
# 이유1 : 정규분포와 유사한 형태를 갖는다.
# 이유2 : 추정계수를 탄력성으로 해석할 수 있다는 장점이 있다.
###2. 산포도
# 교육과 임금간의 관계
CPS1988 %>%
ggplot(aes(x= education, y = log(wage))) +
geom_jitter(size = 3,
alpha = 0.1) +
geom_smooth(method = 'lm') +
scale_color_brewer(palette = 'Set1')+
labs(
x = 'education',
y = 'log(wage)'
)
## `geom_smooth()` using formula 'y ~ x'
# 양의 상관관계를 보인다.
###3. 회귀분석 모형 모형: ln(wage) = b_0 + b_1*edu + Z’gamma + e
lm.edu <- lm(log(wage)~education, data = CPS1988)
summary(lm.edu)
##
## Call:
## lm(formula = log(wage) ~ education, data = CPS1988)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6253 -0.3775 0.0981 0.4725 3.6864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.17829 0.01874 276.31 <2e-16 ***
## education 0.07594 0.00140 54.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6812 on 28153 degrees of freedom
## Multiple R-squared: 0.09461, Adjusted R-squared: 0.09457
## F-statistic: 2942 on 1 and 28153 DF, p-value: < 2.2e-16
stargazer(lm.edu,type="text",
keep.stat = c("n","rsq","adj.rsq"))
##
## ========================================
## Dependent variable:
## ---------------------------
## log(wage)
## ----------------------------------------
## education 0.076***
## (0.001)
##
## Constant 5.178***
## (0.019)
##
## ----------------------------------------
## Observations 28,155
## R2 0.095
## Adjusted R2 0.095
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
\[ ln(wage) = 5.17829 + 0.07594*edu + e \]
결론 : 교육을 한단위 더 받았을 때 임금 7.5% 올라간다.
##2.파트타임 결정에 미치는 모형을 구성하고, 추정하시오. 종속변수: parttime
str(CPS1988)
## 'data.frame': 28155 obs. of 7 variables:
## $ wage : num 355 123 370 755 594 ...
## $ education : int 7 12 9 11 12 16 8 12 12 14 ...
## $ experience: int 45 1 9 46 36 22 51 34 0 18 ...
## $ ethnicity : Factor w/ 2 levels "cauc","afam": 1 1 1 1 1 1 1 1 1 1 ...
## $ smsa : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ region : Factor w/ 4 levels "northeast","midwest",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ parttime : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
# parttime(종속변수)이 범주형이고 독립변수가 여러개이기 때문에 로지스틱 회귀분석을 시행한다.
# parttime(종속변수)를 제외한 나머지 변수를 독립변수에 넣고 로지스틱 회귀분석을 시행하여 변수의 유의미성을 파악한다.
glm(parttime ~ ., data= CPS1988, family = "binomial") -> model
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = parttime ~ ., family = "binomial", data = CPS1988)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7480 -0.4002 -0.1814 -0.0555 8.4904
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8647745 0.1483839 -12.567 < 2e-16 ***
## wage -0.0080067 0.0001631 -49.103 < 2e-16 ***
## education 0.1597539 0.0094945 16.826 < 2e-16 ***
## experience 0.0132287 0.0017438 7.586 3.29e-14 ***
## ethnicityafam -0.2057932 0.0800015 -2.572 0.0101 *
## smsayes 0.3591649 0.0540425 6.646 3.01e-11 ***
## regionmidwest 0.0592666 0.0716927 0.827 0.4084
## regionsouth -0.1305802 0.0685064 -1.906 0.0566 .
## regionwest 0.1841078 0.0719250 2.560 0.0105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16990 on 28154 degrees of freedom
## Residual deviance: 12020 on 28146 degrees of freedom
## AIC: 12038
##
## Number of Fisher Scoring iterations: 8
summary 결과 각 항목별 평가치(=Confficents의 각 항목의 오른쪽 무늬)를 보면 wage, education, experience, smsayes이 가장 유의미한 변수로 판단된다.(***:0~0.001) 그러므로, 유의미하지 못한 항목을 제거하고 새로운 로지스틱 회귀모형을 만든다.
glm(parttime ~ wage + education + experience + smsa, data= CPS1988, family = "binomial") -> new_model
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(new_model)
##
## Call:
## glm(formula = parttime ~ wage + education + experience + smsa,
## family = "binomial", data = CPS1988)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6793 -0.4027 -0.1815 -0.0559 8.4904
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8943938 0.1355943 -13.971 < 2e-16 ***
## wage -0.0079664 0.0001624 -49.041 < 2e-16 ***
## education 0.1620489 0.0094971 17.063 < 2e-16 ***
## experience 0.0127206 0.0017390 7.315 2.57e-13 ***
## smsayes 0.3433085 0.0533881 6.430 1.27e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16990 on 28154 degrees of freedom
## Residual deviance: 12060 on 28150 degrees of freedom
## AIC: 12070
##
## Number of Fisher Scoring iterations: 8
vif(new_model)
## wage education experience smsa
## 1.082253 1.172872 1.177297 1.010792
#공산성 확인했을때 5이상인 것이 없다. 따라서 wage, education, experience, smsa를 독립변수로 사용한다. (5~10 주의, 10이상 사용금지)
최종 로지스틱 회귀모형 \[ parttime = -1.894 + (-0.008*wage) + 0.162*education +0.013*experience + 0.343*smsayes \]