# Q1

# install.packages('RSelenium')
# install.packages('stringr')
library(RSelenium)
## Warning: package 'RSelenium' was built under R version 4.0.3
library(rvest)
## Loading required package: xml2
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.3
# 우선 강의내용과 같이 명령프롬트콜을 통해 서버를 입력했습니다.
remDr<- remoteDriver(
  remoteServerAddr = 'localhost',
  port = 4445L,  #서버 
  browserName = 'chrome'   
)

# 크롬 열기! 
remDr$open()  
## [1] "Connecting to remote server"
## $acceptInsecureCerts
## [1] FALSE
## 
## $browserName
## [1] "chrome"
## 
## $browserVersion
## [1] "86.0.4240.198"
## 
## $chrome
## $chrome$chromedriverVersion
## [1] "86.0.4240.22 (398b0743353ff36fb1b82468f63a3a93b4e2e89e-refs/branch-heads/4240@{#378})"
## 
## $chrome$userDataDir
## [1] "C:\\Users\\huyn0\\AppData\\Local\\Temp\\scoped_dir22228_1255136749"
## 
## 
## $`goog:chromeOptions`
## $`goog:chromeOptions`$debuggerAddress
## [1] "localhost:60065"
## 
## 
## $networkConnectionEnabled
## [1] FALSE
## 
## $pageLoadStrategy
## [1] "normal"
## 
## $platformName
## [1] "windows"
## 
## $proxy
## named list()
## 
## $setWindowRect
## [1] TRUE
## 
## $strictFileInteractability
## [1] FALSE
## 
## $timeouts
## $timeouts$implicit
## [1] 0
## 
## $timeouts$pageLoad
## [1] 300000
## 
## $timeouts$script
## [1] 30000
## 
## 
## $unhandledPromptBehavior
## [1] "dismiss and notify"
## 
## $`webauthn:virtualAuthenticators`
## [1] TRUE
## 
## $webdriver.remote.sessionid
## [1] "46501e384430ed87961c569c1f077073"
## 
## $id
## [1] "46501e384430ed87961c569c1f077073"
# 크롬으로 파파고 열기!
remDr$navigate('https://papago.naver.com/') 

# 크롤링(번역할 내용에 대해)
webElem1 <- remDr$findElement(using = 'id',
                              value = 'txtSource') 

# 확인하기
webElem1$highlightElement() 

# 기존 타이핑된 입력값 없앰.
webElem1$clearElement
## Class method definition for method clearElement()
## function () 
## {
##     "Clear a TEXTAREA or text INPUT element's value."
##     qpath <- sprintf("%s/session/%s/element/%s/clear", serverURL, 
##         sessionInfo[["id"]], elementId)
##     queryRD(qpath, "POST")
## }
## <environment: 0x0000000018c1c5e8>
## 
## Methods used: 
##      "queryRD"
# sentece에 할말 저장 # 원문
sentence <- list('부장님 싫다. 직장생활 어렵다.') 

# sentence에 저장된 할말 적용하기
webElem1$sendKeysToElement(sentence)

# 3초 정도 텀두기
Sys.sleep(3) 

# 크롤링(또 다른 변수에 번역된 결과 값)
webElem2 <- remDr$findElement(using = 'id',
                              value = 'txtTarget')

# 확인하기
webElem2$highlightElement() 


#번역된 결과값을 result에 저장하기
result <- webElem2$getElementText()

result_clean <- stringr::str_split(result, '\\.') 

# 번역된 결과값이 r에 도출됨.
result_clean 
## [[1]]
## [1] "I don't like the manager" " Work is hard"           
## [3] ""
#_-----------------------------------------------------#

# Q2

rm(list =ls())

# 필요한 라이브러리 
library(tidyverse)
## -- Attaching packages ---------------------------------------------- tidyverse 1.3.0 --
## ¡î ggplot2 3.3.2     ¡î purrr   0.3.4
## ¡î tibble  3.0.3     ¡î dplyr   1.0.2
## ¡î tidyr   1.1.2     ¡î forcats 0.5.0
## ¡î readr   1.3.1
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
library(rvest)
#install.packages('ggthemes')
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(zoo)  # ma계산을 위해 zoo 필요 
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# 영어할 때는 필요없지만, 한글할 때에는 필요한 장치가 존재함.
Sys.setlocale('LC_ALL', 'English')
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
url <- 'http://www.busan.go.kr/covid19/Status02.do'

# html을 전부 가져온다.
contents <- read_html(url) %>%
  html_nodes('table') %>%
  html_table(fill = TRUE)    # 컬럼수가 일정하지 않을 때 fill=TRUE사용

# 필요한 컨텐츠가 4에 있는 것을 확인!
df <- contents[[4]]

names(df)                 # 컬럼이름 확인 
## [1] "<U+B0A0><U+C9DC>"   "<U+B0A0><U+C9DC>"   "<U+D655><U+C9C4><U+C790><U+C218>"
## [4] "<U+D655><U+C9C4><U+C790><U+C218>" "<U+C644><U+CE58><U+C790><U+C218>(<U+B204><U+ACC4>)" "<U+C0AC><U+B9DD><U+C790><U+C218>(<U+B204><U+ACC4>)"
## [7] "<U+AC80><U+C0AC><U+ACB0><U+ACFC><U+C74C><U+C131>(<U+B204><U+ACC4>)" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>" "<U+C790><U+AC00><U+ACA9><U+B9AC><U+D604><U+D669>"
names(df) <- c(           # 컬럼이름 바꾸기
  'date',
  'day',
  'cumulated',
  'new',
  'cured_cum',
  'death_cum',
  'test_n_cum',
  'quar',
  'relased_cum')

# 수정된 변수이름 확인
names(df)
## [1] "date"        "day"         "cumulated"   "new"         "cured_cum"  
## [6] "death_cum"   "test_n_cum"  "quar"        "relased_cum"
# 불필요한 첫 행 빼기
df <- df[-1,]  

df$date <- as.Date(df$date)  # 날짜형으로 바꾸기
df$new <- as.numeric(df$new) # 숫자형으로바꾸기

head(df) # 확인하기
##         date      day cumulated new cured_cum death_cum test_n_cum  quar
## 2 2020-12-01 <U+D654>       864  50       601        16    175,254 7,408
## 3 2020-11-30 <U+C6D4>       814  11       597        16    174,154 7,018
## 4 2020-11-29 <U+C77C>       803  51       595        16    173,116 6,699
## 5 2020-11-28 <U+D1A0>       752  25       591        15    171,510 5,454
## 6 2020-11-27 <U+AE08>       727  26       590        15    170,388 4,954
## 7 2020-11-26 <U+BAA9>       701  22       590        15    169,224 4,557
##   relased_cum
## 2      58,496
## 3      58,237
## 4      57,946
## 5      57,508
## 6      57,362
## 7      56,982
# 1. 코로나 신규확진자 현황확인
df %>%
  ggplot(aes(x= date))+ 
  geom_col(aes(y = new),
           fill = 'white', # 막대기 색
           color = 'red', # 테두리 색
           size = 0.3)

# 2. 코로나 신규확진자 현황 (ma(7일단위)), 날짜, 제목 등 추가)
# ma= mobing average
df %>%  # ma= mobing average
  mutate(ma = rollmean(new, 7, aligh = "right", fill=0)) %>%  # 'new'신규확진자를 7일로 나누어 평균. 
  ggplot(aes(x = date))+
  geom_col(aes(y = new),
           fill = 'red',   #
           color = 'red',
           size = 0.3) +
  geom_line(aes(y = ma),
            color = 'black', size =2)+
  geom_area(aes(y = ma),
            color = 'red', alpha = 0.4) +
  scale_x_date(labels = date_format(format = '%B'), #%B= 매 월 단위
               breaks = 'month')+       # 월 단위
  scale_y_continuous(labels = comma_format()) + # y값이 크면 소수값나오게
  labs(
    x= 'Date',
    y= 'Newly confirmed cases',
    title = 'COVID-19 WAVE IN BUSAN',
    sbutitle = 'Red line is 7day moving average'
  )+
  theme_minimal()

# log 표현 코로나 신규확진자 현황
df %>%
  mutate(ma = rollmean(new, 7, aligh = 'right', fill=0)) %>%
  ggplot(aes(x= date))+
  geom_col(aes(y = new),
           fill  = 'red',
           color = 'red',
           size = 0.3) +
  geom_line(aes(y = ma),
            color = 'black', size =2)+
  geom_area(aes(y = ma),
            color = 'red', alpha = 0.25) +
  scale_x_date(labels = date_format(format = '%B'),
               breaks = 'month')+
  scale_y_log10()  # y축을 로그값으로 그린다.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 112 rows containing missing values (geom_col).

#---------------------------------------------#

# Q3
rm(list =ls())

# 필요한 라이브러리
# install.packages('AER')
library(AER)
## Warning: package 'AER' was built under R version 4.0.3
## Loading required package: car
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## Loading required package: lmtest
## Loading required package: sandwich
## Loading required package: survival
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(tidyverse)
library(RColorBrewer)
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
data('CPS1988')
# head(CPS1988)

# 간단한 CPS로 변수 변환
CPS <- CPS1988
# head(CPS)

str(CPS)
## 'data.frame':    28155 obs. of  7 variables:
##  $ wage      : num  355 123 370 755 594 ...
##  $ education : int  7 12 9 11 12 16 8 12 12 14 ...
##  $ experience: int  45 1 9 46 36 22 51 34 0 18 ...
##  $ ethnicity : Factor w/ 2 levels "cauc","afam": 1 1 1 1 1 1 1 1 1 1 ...
##  $ smsa      : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ region    : Factor w/ 4 levels "northeast","midwest",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ parttime  : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
view(CPS)

# 형태를 정수형으로로 바꾼다. 그릴 때 편함.

CPS$ethnicity <- as.integer(CPS$ethnicity)
CPS$smsa <- as.integer(CPS$smsa)
CPS$region <- as.integer(CPS$region)
CPS$parttime <- as.integer(CPS$parttime)


# 교육과 수익률을 추정하기 
fig <- CPS %>%
  ggplot(mapping = aes(x=education, y=wage))+
  geom_point(color = 'red',
             size = 2,
             alpha = 0.25)+
  geom_smooth(method = 'lm',
              linetype = 'dashed',
              color  = 'black',)+
  labs(
    x= 'Education',
    y ='Wage',
    title = 'Wage and Education Relationship'
  )

fig
## `geom_smooth()` using formula 'y ~ x'

# 파트 타임 결정에 미치는 모형 구성 및 추정

lm.parttime <- lm(parttime~ ., data=CPS) # 종속변수 y
lm.parttime
## 
## Call:
## lm(formula = parttime ~ ., data = CPS)
## 
## Coefficients:
## (Intercept)         wage    education   experience    ethnicity         smsa  
##   1.0742881   -0.0001732    0.0064784   -0.0004421   -0.0048351    0.0201261  
##      region  
##   0.0053441
summary(lm.parttime)
## 
## Call:
## lm(formula = parttime ~ ., data = CPS)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22628 -0.12478 -0.08624 -0.03524  3.04425 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.074e+00  1.362e-02  78.894  < 2e-16 ***
## wage        -1.732e-04  4.023e-06 -43.055  < 2e-16 ***
## education    6.478e-03  6.397e-04  10.128  < 2e-16 ***
## experience  -4.421e-04  1.378e-04  -3.209  0.00133 ** 
## ethnicity   -4.835e-03  6.126e-03  -0.789  0.42992    
## smsa         2.013e-02  3.811e-03   5.281  1.3e-07 ***
## region       5.344e-03  1.544e-03   3.461  0.00054 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2752 on 28148 degrees of freedom
## Multiple R-squared:  0.07227,    Adjusted R-squared:  0.07207 
## F-statistic: 365.5 on 6 and 28148 DF,  p-value: < 2.2e-16
stargazer::stargazer(lm.parttime,
                     type = 'text',
                     keep.stat = c('n', 'rsq'))
## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                       parttime          
## ----------------------------------------
## wage                 -0.0002***         
##                       (0.00000)         
##                                         
## education             0.006***          
##                        (0.001)          
##                                         
## experience           -0.0004***         
##                       (0.0001)          
##                                         
## ethnicity              -0.005           
##                        (0.006)          
##                                         
## smsa                  0.020***          
##                        (0.004)          
##                                         
## region                0.005***          
##                        (0.002)          
##                                         
## Constant              1.074***          
##                        (0.014)          
##                                         
## ----------------------------------------
## Observations           28,155           
## R2                      0.072           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01
# 조정된 r의 값이 0.072의 설명력을 보인다.
# 값을 나누었을 때, wage,experience,ethnicity 컬럼은 t-value가 (-) 값이 나온다.
# 다른 나머지 컬럼들은 1.96보다 t값이 크게 나오는 것을 확인할 수 있다.
# t-value가 1.96보다 크면 통계적유의하기때매
# 귀무가설을 기각, 이때 기각할떄 실수하는 수준 = 유의수준이다.(오류)
# 유의수준을 0으로 만드는 것은 굉장히 힘듬. 그래서 유의수준 범위를 정해놓음
# 베이스 = 베타1이 0이라는 귀무가설