日本語だと文字化けするから、ところどころ英語で書いちゃってます。
コードは、右上の“Code”をクリックしてみてね。PCで閲覧推奨。
生データが汚いので、きれいにする必要がある。このデータは、“Download”クリックでcsv等の形でダウンロードできるよ。
# setup library -----------------------------------------------------------
pacman::p_load("tidyverse", "skimr", "DT", "ggExtra")
# import data -------------------------------------------------------------
df_raw <- read_csv("~/R/Musashi_University/2021_second_semester/tem/SampleData20211026.csv")
# tidy data ---------------------------------------------------------------
# column's names
country_name <- df_raw[,1] %>% na.omit()
# create a vector of countries' name
country <- c()
for(i in seq_along(country_name$X1)) {
tem <- replicate(4, country_name$X1[i])
country <- c(country, tem)
}
# create tidy data
df <- df_raw[,2:ncol(df_raw)] %>%
filter(!is.na(`2019`)) %>%
rename(vars = X2) %>%
mutate(country = country) %>%
pivot_longer(c(-country, -vars), names_to = "year", values_to = "value") %>%
select(country, year, vars, value) %>%
pivot_wider(names_from = "vars", values_from = "value")
# show data
datatable(df,
rownames = FALSE,
extensions = 'Buttons',
options = list(autoWidth = FALSE,
pageLength = 5,
dom = 'Bfrtip',
buttons = list('copy', 'print',
list(extend = 'collection',
buttons = c('csv', 'excel', 'pdf'),
text = 'Download')),
scrollX = TRUE,
scrollCollapse = TRUE),
class = 'cell-border stripe',
caption = "")with(df,
plot(rgdpepc, hc,
type = "p",
main = "title",
sub = "subtitle",
ylab = "y axis's label",
xlab = "x axis's label"))これは、ggplot2というパッケージとかを使って書いたもの。マウスを近づければ、値が見れるヨ!
g1 <- df %>%
ggplot(aes(x = rgdpepc / 1e6, y = hc, color = country)) +
geom_point(size = .9) +
geom_smooth(method = "lm",
size = 0.5,
se = FALSE) +
facet_wrap(~country, scales = "free", ncol = 1) +
labs(
title = "title",
x = "rdgpec (million $)",
y = "hc"
) +
scale_color_discrete(guide = FALSE) +
theme_minimal()
plotly::ggplotly(g1)全ての国混合の場合は、かなりシンプルなものになる。
# run a linear regression for all coutnries
mod1 <- lm(hc ~ rgdpepc, data = df)
# get summary of mod1
summary(mod1)##
## Call:
## lm(formula = hc ~ rgdpepc, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2342 -0.4164 0.1366 0.4069 0.6129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.992e+00 3.876e-02 51.40 <2e-16 ***
## rgdpepc 2.842e-05 1.205e-06 23.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4331 on 334 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.6246, Adjusted R-squared: 0.6235
## F-statistic: 555.7 on 1 and 334 DF, p-value: < 2.2e-16
国別で分析したい場合は、ちょっと工夫が必要。
# run a linear regression for each country
mod2 <- df %>%
group_by(country) %>%
nest() %>%
mutate(model_lm = purrr::map(data, ~lm(hc ~ rgdpepc, data = .)))
# get summary of mod2
result <- list(
country = country %>% unique(),
lm = list()
)
for(i in 1:length(mod2$country)) {
result$lm[[i]] <- summary(mod2$model_lm[[i]])
}
result <- setNames(result$lm, country %>% unique())
# show the result
result## $JPN
##
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18673 -0.04977 0.02114 0.04238 0.13415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.396e+00 1.725e-02 138.92 <2e-16 ***
## rgdpepc 2.793e-05 6.477e-07 43.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07149 on 68 degrees of freedom
## Multiple R-squared: 0.9647, Adjusted R-squared: 0.9642
## F-statistic: 1860 on 1 and 68 DF, p-value: < 2.2e-16
##
##
## $TWN
##
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.083965 -0.034942 -0.006075 0.035967 0.112635
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.471e+00 9.208e-03 159.8 <2e-16 ***
## rgdpepc 3.777e-05 3.507e-07 107.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0478 on 67 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9943, Adjusted R-squared: 0.9942
## F-statistic: 1.16e+04 on 1 and 67 DF, p-value: < 2.2e-16
##
##
## $SGP
##
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48081 -0.08753 0.00393 0.09351 0.64989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.482e+00 4.073e-02 36.39 <2e-16 ***
## rgdpepc 2.505e-05 9.326e-07 26.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2084 on 58 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.9256, Adjusted R-squared: 0.9243
## F-statistic: 721.2 on 1 and 58 DF, p-value: < 2.2e-16
##
##
## $KOR
##
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39897 -0.14128 -0.01828 0.12768 0.36246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.930e+00 3.579e-02 53.94 <2e-16 ***
## rgdpepc 4.734e-05 1.752e-06 27.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1979 on 65 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9182, Adjusted R-squared: 0.917
## F-statistic: 730 on 1 and 65 DF, p-value: < 2.2e-16
##
##
## $USA
##
## Call:
## lm(formula = hc ~ rgdpepc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19375 -0.09576 -0.01466 0.10151 0.23152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.382e+00 3.953e-02 60.27 <2e-16 ***
## rgdpepc 2.459e-05 1.007e-06 24.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1214 on 68 degrees of freedom
## Multiple R-squared: 0.8977, Adjusted R-squared: 0.8962
## F-statistic: 596.7 on 1 and 68 DF, p-value: < 2.2e-16
調べれば、RmarkdownというのでR言語を使ってレポートをPDF化できるぞい。
これは、長くなるから、“rmarkdown使い方”とかで検索しよう。
これで以上です!
質問とかあったら気軽に聞いてね。
# setup library -----------------------------------------------------------
pacman::p_load("tidyverse", "skimr", "DT", "ggExtra")
# import data -------------------------------------------------------------
df_raw <- read_csv("~/R/Musashi_University/2021_second_semester/tem/SampleData20211026.csv")
# view df_raw
skim(df_raw)
# tidy data ---------------------------------------------------------------
# column's names
country_name <- df_raw[,1] %>% na.omit()
# create a vector of countries' name
country <- c()
for(i in seq_along(country_name$X1)) {
tem <- replicate(4, country_name$X1[i])
country <- c(country, tem)
}
country
# create tidy data
df <- df_raw[,2:ncol(df_raw)] %>%
filter(!is.na(`2019`)) %>%
rename(vars = X2) %>%
mutate(country = country) %>%
pivot_longer(c(-country, -vars), names_to = "year", values_to = "value") %>%
select(country, year, vars, value) %>%
pivot_wider(names_from = "vars", values_from = "value")
# show data
datatable(df,
rownames = FALSE,
extensions = 'Buttons',
options = list(autoWidth = FALSE,
pageLength = 5,
dom = 'Bfrtip',
buttons = list('copy', 'print',
list(extend = 'collection',
buttons = c('csv', 'excel', 'pdf'),
text = 'Download')),
scrollX = TRUE,
scrollCollapse = TRUE),
class = 'cell-border stripe',
caption = "")
# 作業1:散布図を作成する ------------------------------------------------------------
g1 <- df %>%
ggplot(aes(x = rgdpepc / 1e6, y = hc, color = country)) +
geom_point(size = .9) +
geom_smooth(method = "lm",
size = 0.5,
se = FALSE) +
facet_wrap(~country, scales = "free", ncol = 1) +
labs(
title = "ここにタイトルを記入",
x = "ここにX軸名を記入 (million $)",
y = "ここにY軸名を記入"
) +
scale_color_discrete(guide = FALSE) +
theme_minimal()
ggMarginal(
g1,
type = "density",
margins = "both"
)
# 作業2:最小二乗法 ---------------------------------------------------------------
# 全ての国混合
mod1 <- lm(hc ~ rgdpepc, data = df)
# 国別
mod2 <- df %>%
group_by(country) %>%
nest() %>%
mutate(model_lm = purrr::map(data, ~lm(hc ~ rgdpepc, data = .)))
summary(mod1)
result <- list(
country = country %>% unique(),
lm = list()
)
for(i in 1:length(mod2$country)) {
result$lm[[i]] <- summary(mod2$model_lm[[i]])
}
result <- setNames(result$lm, country %>% unique())
result
# 作業3: pdf化 ---------------------------------------------------------------