#1. Question 1 (40점)

다음과 같은 두 문장을 네이버 파파고를 통해 번역하는 구문을 짜시오. RSelenium 패키지를 사용하시오.

string <- “부장님 싫다. 직장생활 어렵다.”

library(RSelenium)

## Warning: package 'RSelenium' was built under R version 4.0.3

library(rvest)

## Loading required package: xml2

library(stringr)

remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = 'chrome'
)

remDr$open() # 크롬이 하나 열림 R에서 크롬을 컨트롤하고 있는 것

## [1] "Connecting to remote server"
## $acceptInsecureCerts
## [1] FALSE
## 
## $browserName
## [1] "chrome"
## 
## $browserVersion
## [1] "87.0.4280.66"
## 
## $chrome
## $chrome$chromedriverVersion
## [1] "86.0.4240.22 (398b0743353ff36fb1b82468f63a3a93b4e2e89e-refs/branch-heads/4240@{#378})"
## 
## $chrome$userDataDir
## [1] "C:\\Users\\kim-n\\AppData\\Local\\Temp\\scoped_dir1240_1790011788"
## 
## 
## $`goog:chromeOptions`
## $`goog:chromeOptions`$debuggerAddress
## [1] "localhost:51459"
## 
## 
## $networkConnectionEnabled
## [1] FALSE
## 
## $pageLoadStrategy
## [1] "normal"
## 
## $platformName
## [1] "windows"
## 
## $proxy
## named list()
## 
## $setWindowRect
## [1] TRUE
## 
## $strictFileInteractability
## [1] FALSE
## 
## $timeouts
## $timeouts$implicit
## [1] 0
## 
## $timeouts$pageLoad
## [1] 300000
## 
## $timeouts$script
## [1] 30000
## 
## 
## $unhandledPromptBehavior
## [1] "dismiss and notify"
## 
## $`webauthn:virtualAuthenticators`
## [1] TRUE
## 
## $webdriver.remote.sessionid
## [1] "65fb461ef5df42ec5eb0a5e828cc8f80"
## 
## $id
## [1] "65fb461ef5df42ec5eb0a5e828cc8f80"

##1. 네이버 파파고 열기

remDr$navigate("https://papago.naver.com/")

##2. string <- “부장님 싫다. 직장생활 어렵다.” 검색하기

webElement1 <- remDr$findElement(using = 'class',
                                 value = 'edit_box___1KtZ3')

#확인하기
webElement1$highlightElement()
webElement1$clearElement()

#입력하기
webElement1$sendKeysToElement(list('부장님 싫다. 직장생활 어렵다.',
                                   key = 'enter'))

##3. 결과값 가져오기

Sys.sleep(2) # 2초 쉬고

webElement2 <- remDr$findElement(using = 'id',
                                 value = 'txtTarget')

# 확인하기
webElement2$highlightElement()

webElement2$getElementText() -> result #결과 result에 담기

result_clean <- stringr::str_split(result, "\\.")

result_clean

## [[1]]
## [1] "I don't like the manager" " Work is hard"           
## [3] ""

#Question 2 (30점)

부산시 홈페이지에 가서 코로나19 신규 확진자 현황을 스크레이핑하고 추이를 그리시오.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## ¡î ggplot2 3.3.2     ¡î purrr   0.3.4
## ¡î tibble  3.0.3     ¡î dplyr   1.0.2
## ¡î tidyr   1.1.2     ¡î forcats 0.5.0
## ¡î readr   1.3.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

library(rvest)
library(zoo)

## Warning: package 'zoo' was built under R version 4.0.3

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# 그래프용
library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.0.3

library(lubridate) # 날짜를 다뤄야하기 때문

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# 한글 다루기 위해
Sys.setlocale('LC_ALL', 'English')

## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

타겟 url을 설정함

url <- "http://www.busan.go.kr/covid19/Status02.do"

html 테이블을 가져오기

contents <- read_html(url) %>% 
  html_nodes('#contents > div > table') %>% 
  html_table(fill = TRUE)

df <- contents[[1]]

names(df)

## [1] "<U+B0A0><U+C9DC>"   "<U+B0A0><U+C9DC>"   "<U+D655><U+C9C4><U+C790><U+C218>"
## [4] "<U+D655><U+C9C4><U+C790><U+C218>" "<U+C644><U+CE58><U+C790><U+C218>(<U+B204><U+ACC4>)" "<U+C0AC><U+B9DD><U+C790><U+C218>(<U+B204><U+ACC4>)"
## [7] "<U+AC80><U+C0AC><U+ACB0><U+ACFC><U+C74C><U+C131>(<U+B204><U+ACC4>)" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>"

# 변수 이름 영어로 변환
names(df) <- c(
  'date',
  'day',
  'cumulated',
  'new',
  'cured_cum',
  'death_cum',
  'test_n_cum',
  'quar',
  'realased_cum'
)

names(df)

## [1] "date"         "day"          "cumulated"    "new"          "cured_cum"   
## [6] "death_cum"    "test_n_cum"   "quar"         "realased_cum"

# 첫번째 행 빼기
df <- df[-1,]

head(df)

##         date      day cumulated new cured_cum death_cum test_n_cum  quar
## 2 2020-12-01 <U+D654>       864  50       601        16    175,254 7,408
## 3 2020-11-30 <U+C6D4>       814  11       597        16    174,154 7,018
## 4 2020-11-29 <U+C77C>       803  51       595        16    173,116 6,699
## 5 2020-11-28 <U+D1A0>       752  25       591        15    171,510 5,454
## 6 2020-11-27 <U+AE08>       727  26       590        15    170,388 4,954
## 7 2020-11-26 <U+BAA9>       701  22       590        15    169,224 4,557
##   realased_cum
## 2       58,496
## 3       58,237
## 4       57,946
## 5       57,508
## 6       57,362
## 7       56,982

날짜별 신규 확진자 수 구하기

df$date <- as.Date(df$date)
df$new <- as.numeric(df$new)

df %>% 
  ggplot(aes(x = date)) + 
  geom_col(aes(y = new),
           fill = '#FAC9C7',
           color = 'white',
           size = 0.2
           )

x축 완성하기

df %>% 
  mutate(ma = rollmean(new, 7,aligh = 'right', fill = 0)) %>% 
  ggplot(aes(x = date)) + 
  geom_col(aes(y = new),
           fill = '#FAC9C7',
           color = 'white',
           size = 0.2) +
  geom_line(aes(y = ma),
            color = '#CF1010', size = 1) + 
  geom_area(aes(y = ma),
            color = '#CF1010',
            alpha = 0.25) +
  scale_x_date(labels = date_format(format = '%B'),
               breaks = 'month')

# rollmean : 돌아가면서 평균을 내라  
# geom_area() : # 그래프 밑에 색깔 칠해짐 
# scale_x_date :날짜 추가  '%B' : 월이 보임,  breaks = 'month': 매달

최종 그래프

# y축과 제목 완성하기

df %>% 
  mutate(ma = rollmean(new, 7,aligh = 'right', fill = 0)) %>% 
  ggplot(aes(x = date)) + 
  geom_col(aes(y = new),
           fill = '#FAC9C7',
           color = 'white',
           size = 0.2) +
  geom_line(aes(y = ma),
            color = '#CF1010', size = 1) + 
  geom_area(aes(y = ma),
            color = '#CF1010',
            alpha = 0.25) +
  scale_x_date(labels = date_format(format = '%B'),
               breaks = 'month') +
  scale_y_continuous(labels = comma_format()) +
  labs(
    x = 'Date',
    y = 'Newly confirmed cases',
    title = '3rd COVID-19 wave in Busan',
    subtitle = "Red line : 7-day moving average"
  )

Question 3 (30점)

“AER” 패키지를 설치하고 로딩하시오. 그리고 “CPS1988” 데이터를 불러들이시오.

# 라이브러리 
library("AER")

## Loading required package: car

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

## Loading required package: lmtest

## Loading required package: sandwich

## Loading required package: survival

## Warning: package 'survival' was built under R version 4.0.3

library(tidyverse)
library(ggplot2)
library(grid)
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.0.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(stargazer)

## Warning: package 'stargazer' was built under R version 4.0.3

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

# 데이터 로딩
data('CPS1988')
attach(CPS1988)

str(CPS1988)

## 'data.frame':    28155 obs. of  7 variables:
##  $ wage      : num  355 123 370 755 594 ...
##  $ education : int  7 12 9 11 12 16 8 12 12 14 ...
##  $ experience: int  45 1 9 46 36 22 51 34 0 18 ...
##  $ ethnicity : Factor w/ 2 levels "cauc","afam": 1 1 1 1 1 1 1 1 1 1 ...
##  $ smsa      : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ region    : Factor w/ 4 levels "northeast","midwest",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ parttime  : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...

##1. 교육수익률을 추정하시오.

교육수익률이란 1년 더 교육을 받았을 때 수익률(임금의 상승률)이 올랐는가를 의미한다.

###1. 히스토그램

# wage
CPS1988 %>% 
  ggplot(aes(x=wage)) +
  geom_histogram(color= 'white') +
  xlab('wage') -> fig1

fig1

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#-> 로그-정규분포 로그를 취하여 정규분포를 취한다.

#log-wage
CPS1988 %>% 
  ggplot(aes(x=log(wage))) +
  geom_histogram(color= 'white') +
  xlab('log-wage') -> fig2

fig2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

두 가지 그래프 비교하기

gridExtra::grid.arrange(fig1, fig2, ncol = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 비교결과 : log(wage)를 이용한다.
# 이유1 : 정규분포와 유사한 형태를 갖는다.
# 이유2 : 추정계수를 탄력성으로 해석할 수 있다는 장점이 있다.

###2. 산포도

# 교육과 임금간의 관계
CPS1988 %>% 
  ggplot(aes(x= education, y = log(wage))) +
  geom_jitter(size = 3,
              alpha = 0.1) +
  geom_smooth(method = 'lm') +
  scale_color_brewer(palette = 'Set1')+
  labs(
    x = 'education',
    y = 'log(wage)'
  )

## `geom_smooth()` using formula 'y ~ x'

# 양의 상관관계를 보인다.

###3. 회귀분석 모형 모형: ln(wage) = b_0 + b_1*edu + Z’gamma + e

lm.edu <- lm(log(wage)~education, data = CPS1988)
summary(lm.edu)

## 
## Call:
## lm(formula = log(wage) ~ education, data = CPS1988)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6253 -0.3775  0.0981  0.4725  3.6864 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.17829    0.01874  276.31   <2e-16 ***
## education    0.07594    0.00140   54.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6812 on 28153 degrees of freedom
## Multiple R-squared:  0.09461,    Adjusted R-squared:  0.09457 
## F-statistic:  2942 on 1 and 28153 DF,  p-value: < 2.2e-16

stargazer(lm.edu,type="text",
          keep.stat = c("n","rsq","adj.rsq"))

## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                       log(wage)         
## ----------------------------------------
## education             0.076***          
##                        (0.001)          
##                                         
## Constant              5.178***          
##                        (0.019)          
##                                         
## ----------------------------------------
## Observations           28,155           
## R2                      0.095           
## Adjusted R2             0.095           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01

\[ ln(wage) = 5.17829 + 0.07594*edu + e \]

결론 : 교육을 한단위 더 받았을 때 임금 7.5% 올라간다.

##2.파트타임 결정에 미치는 모형을 구성하고, 추정하시오. 종속변수: parttime

str(CPS1988)

## 'data.frame':    28155 obs. of  7 variables:
##  $ wage      : num  355 123 370 755 594 ...
##  $ education : int  7 12 9 11 12 16 8 12 12 14 ...
##  $ experience: int  45 1 9 46 36 22 51 34 0 18 ...
##  $ ethnicity : Factor w/ 2 levels "cauc","afam": 1 1 1 1 1 1 1 1 1 1 ...
##  $ smsa      : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ region    : Factor w/ 4 levels "northeast","midwest",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ parttime  : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...

# parttime(종속변수)이 범주형이고 독립변수가 여러개이기 때문에 로지스틱 회귀분석을 시행한다.

# parttime(종속변수)를 제외한 나머지 변수를 독립변수에 넣고 로지스틱 회귀분석을 시행하여 변수의 유의미성을 파악한다.
glm(parttime ~ ., data= CPS1988, family = "binomial") -> model

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model)

## 
## Call:
## glm(formula = parttime ~ ., family = "binomial", data = CPS1988)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7480  -0.4002  -0.1814  -0.0555   8.4904  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.8647745  0.1483839 -12.567  < 2e-16 ***
## wage          -0.0080067  0.0001631 -49.103  < 2e-16 ***
## education      0.1597539  0.0094945  16.826  < 2e-16 ***
## experience     0.0132287  0.0017438   7.586 3.29e-14 ***
## ethnicityafam -0.2057932  0.0800015  -2.572   0.0101 *  
## smsayes        0.3591649  0.0540425   6.646 3.01e-11 ***
## regionmidwest  0.0592666  0.0716927   0.827   0.4084    
## regionsouth   -0.1305802  0.0685064  -1.906   0.0566 .  
## regionwest     0.1841078  0.0719250   2.560   0.0105 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16990  on 28154  degrees of freedom
## Residual deviance: 12020  on 28146  degrees of freedom
## AIC: 12038
## 
## Number of Fisher Scoring iterations: 8

summary 결과 각 항목별 평가치(=Confficents의 각 항목의 오른쪽 무늬)를 보면 wage, education, experience, smsayes이 가장 유의미한 변수로 판단된다.(***:0~0.001) 그러므로, 유의미하지 못한 항목을 제거하고 새로운 로지스틱 회귀모형을 만든다.

glm(parttime ~ wage + education + experience + smsa, data= CPS1988, family = "binomial") -> new_model

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(new_model)

## 
## Call:
## glm(formula = parttime ~ wage + education + experience + smsa, 
##     family = "binomial", data = CPS1988)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6793  -0.4027  -0.1815  -0.0559   8.4904  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.8943938  0.1355943 -13.971  < 2e-16 ***
## wage        -0.0079664  0.0001624 -49.041  < 2e-16 ***
## education    0.1620489  0.0094971  17.063  < 2e-16 ***
## experience   0.0127206  0.0017390   7.315 2.57e-13 ***
## smsayes      0.3433085  0.0533881   6.430 1.27e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16990  on 28154  degrees of freedom
## Residual deviance: 12060  on 28150  degrees of freedom
## AIC: 12070
## 
## Number of Fisher Scoring iterations: 8

vif(new_model)

##       wage  education experience       smsa 
##   1.082253   1.172872   1.177297   1.010792

#공산성 확인했을때 5이상인 것이 없다. 따라서 wage, education, experience, smsa를 독립변수로 사용한다. (5~10 주의, 10이상 사용금지)

최종 로지스틱 회귀모형 \[ parttime = -1.894 + (-0.008*wage) + 0.162*education +0.013*experience + 0.343*smsayes \]

bigdata final 2020

kim-nari

2020년 12월 1일

Question 3 (30점)