日本語だと文字化けするから、ところどころ英語で書いちゃってます。

コードは、右上の“Code”をクリックしてみてね。PCで閲覧推奨。

準備

生データが汚いので、きれいにする必要がある。このデータは、“Download”クリックでcsv等の形でダウンロードできるよ。

# setup library -----------------------------------------------------------

pacman::p_load("tidyverse", "skimr", "DT", "ggExtra")

# import data -------------------------------------------------------------

df_raw <- read_csv("~/R/Musashi_University/2021_second_semester/tem/SampleData20211026.csv")


# tidy data ---------------------------------------------------------------

# column's names

country_name <- df_raw[,1] %>% na.omit() 

# create a vector of countries' name
country <- c()
for(i in seq_along(country_name$X1)) {
  tem <- replicate(4, country_name$X1[i])
  country <- c(country, tem)
}


# create tidy data
df <- df_raw[,2:ncol(df_raw)] %>% 
  filter(!is.na(`2019`)) %>% 
  rename(vars = X2) %>% 
  mutate(country = country) %>% 
  pivot_longer(c(-country, -vars), names_to = "year", values_to = "value") %>% 
  select(country, year, vars, value) %>% 
  pivot_wider(names_from = "vars", values_from = "value")


# show data
datatable(df, 
          rownames = FALSE, 
          extensions = 'Buttons',
          options = list(autoWidth = FALSE,
                         pageLength = 5,
                         dom = 'Bfrtip',
                         buttons = list('copy', 'print', 
                                        list(extend = 'collection',
                                             buttons = c('csv', 'excel', 'pdf'),
                                             text = 'Download')),
                         scrollX = TRUE,
                         scrollCollapse = TRUE),
          class = 'cell-border stripe',
          caption = "")

作業1:散布図を作成する

base

with(df, 
     plot(rgdpepc, hc,
          type = "p",
          main = "title",
          sub = "subtitle",
          ylab = "y axis's label",
          xlab = "x axis's label"))

ggplot

これは、ggplot2というパッケージとかを使って書いたもの。マウスを近づければ、値が見れるヨ!

g1 <- df %>% 
  ggplot(aes(x = rgdpepc / 1e6, y = hc, color = country)) +
  geom_point(size = .9) +
  geom_smooth(method = "lm", 
              size = 0.5,
              se = FALSE) +
  facet_wrap(~country, scales = "free", ncol = 1) +
  labs(
    title = "title",
    x = "rdgpec (million $)",
    y = "hc"
  ) +
  scale_color_discrete(guide = FALSE) +
  theme_minimal()

plotly::ggplotly(g1)

作業2:最小二乗法

全ての国混合

全ての国混合の場合は、かなりシンプルなものになる。

# run a linear regression for all coutnries
mod1 <- lm(hc ~ rgdpepc, data = df)

# get summary of mod1
summary(mod1)
## 
## Call:
## lm(formula = hc ~ rgdpepc, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2342 -0.4164  0.1366  0.4069  0.6129 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.992e+00  3.876e-02   51.40   <2e-16 ***
## rgdpepc     2.842e-05  1.205e-06   23.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4331 on 334 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.6246, Adjusted R-squared:  0.6235 
## F-statistic: 555.7 on 1 and 334 DF,  p-value: < 2.2e-16

国別

国別で分析したい場合は、ちょっと工夫が必要。

# run a linear regression for each country
mod2 <- df %>% 
  group_by(country) %>% 
  nest() %>% 
  mutate(model_lm = purrr::map(data, ~lm(hc ~ rgdpepc, data = .))) 

# get summary of mod2
result <- list(
  country = country %>% unique(),
  lm = list()
)

for(i in 1:length(mod2$country)) {
  result$lm[[i]] <- summary(mod2$model_lm[[i]])
}

result <- setNames(result$lm, country %>% unique())

# show the result
result
## $JPN
## 
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18673 -0.04977  0.02114  0.04238  0.13415 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.396e+00  1.725e-02  138.92   <2e-16 ***
## rgdpepc     2.793e-05  6.477e-07   43.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07149 on 68 degrees of freedom
## Multiple R-squared:  0.9647, Adjusted R-squared:  0.9642 
## F-statistic:  1860 on 1 and 68 DF,  p-value: < 2.2e-16
## 
## 
## $TWN
## 
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.083965 -0.034942 -0.006075  0.035967  0.112635 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.471e+00  9.208e-03   159.8   <2e-16 ***
## rgdpepc     3.777e-05  3.507e-07   107.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0478 on 67 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9943, Adjusted R-squared:  0.9942 
## F-statistic: 1.16e+04 on 1 and 67 DF,  p-value: < 2.2e-16
## 
## 
## $SGP
## 
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48081 -0.08753  0.00393  0.09351  0.64989 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.482e+00  4.073e-02   36.39   <2e-16 ***
## rgdpepc     2.505e-05  9.326e-07   26.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2084 on 58 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.9256, Adjusted R-squared:  0.9243 
## F-statistic: 721.2 on 1 and 58 DF,  p-value: < 2.2e-16
## 
## 
## $KOR
## 
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39897 -0.14128 -0.01828  0.12768  0.36246 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.930e+00  3.579e-02   53.94   <2e-16 ***
## rgdpepc     4.734e-05  1.752e-06   27.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1979 on 65 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9182, Adjusted R-squared:  0.917 
## F-statistic:   730 on 1 and 65 DF,  p-value: < 2.2e-16
## 
## 
## $USA
## 
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19375 -0.09576 -0.01466  0.10151  0.23152 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.382e+00  3.953e-02   60.27   <2e-16 ***
## rgdpepc     2.459e-05  1.007e-06   24.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1214 on 68 degrees of freedom
## Multiple R-squared:  0.8977, Adjusted R-squared:  0.8962 
## F-statistic: 596.7 on 1 and 68 DF,  p-value: < 2.2e-16

作業3: pdf化

調べれば、RmarkdownというのでR言語を使ってレポートをPDF化できるぞい。

これは、長くなるから、“rmarkdown使い方”とかで検索しよう。

最後に、、、

これで以上です!

質問とかあったら気軽に聞いてね。

全コード

# setup library -----------------------------------------------------------

pacman::p_load("tidyverse", "skimr", "DT", "ggExtra")

# import data -------------------------------------------------------------

df_raw <- read_csv("~/R/Musashi_University/2021_second_semester/tem/SampleData20211026.csv")

# view df_raw
skim(df_raw)


# tidy data ---------------------------------------------------------------

# column's names

country_name <- df_raw[,1] %>% na.omit() 

# create a vector of countries' name
country <- c()
for(i in seq_along(country_name$X1)) {
  tem <- replicate(4, country_name$X1[i])
  country <- c(country, tem)
}
country


# create tidy data
df <- df_raw[,2:ncol(df_raw)] %>% 
  filter(!is.na(`2019`)) %>% 
  rename(vars = X2) %>% 
  mutate(country = country) %>% 
  pivot_longer(c(-country, -vars), names_to = "year", values_to = "value") %>% 
  select(country, year, vars, value) %>% 
  pivot_wider(names_from = "vars", values_from = "value")


# show data
datatable(df, 
          rownames = FALSE, 
          extensions = 'Buttons',
          options = list(autoWidth = FALSE,
                         pageLength = 5,
                         dom = 'Bfrtip',
                         buttons = list('copy', 'print', 
                                        list(extend = 'collection',
                                             buttons = c('csv', 'excel', 'pdf'),
                                             text = 'Download')),
                         scrollX = TRUE,
                         scrollCollapse = TRUE),
          class = 'cell-border stripe',
          caption = "")

# 作業1:散布図を作成する ------------------------------------------------------------

g1 <- df %>% 
  ggplot(aes(x = rgdpepc / 1e6, y = hc, color = country)) +
  geom_point(size = .9) +
  geom_smooth(method = "lm", 
              size = 0.5,
              se = FALSE) +
  facet_wrap(~country, scales = "free", ncol = 1) +
  labs(
    title = "ここにタイトルを記入",
    x = "ここにX軸名を記入 (million $)",
    y = "ここにY軸名を記入"
  ) +
  scale_color_discrete(guide = FALSE) +
  theme_minimal()


ggMarginal(
  g1, 
  type = "density", 
  margins = "both"
)

# 作業2:最小二乗法 ---------------------------------------------------------------

# 全ての国混合
mod1 <- lm(hc ~ rgdpepc, data = df)

# 国別
mod2 <- df %>% 
  group_by(country) %>% 
  nest() %>% 
  mutate(model_lm = purrr::map(data, ~lm(hc ~ rgdpepc, data = .))) 

summary(mod1)

result <- list(
  country = country %>% unique(),
  lm = list()
)

for(i in 1:length(mod2$country)) {
  result$lm[[i]] <- summary(mod2$model_lm[[i]])
}

result <- setNames(result$lm, country %>% unique())

result

# 作業3: pdf化 ---------------------------------------------------------------