This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Pin the CRAN mirror that install.packages() downloads from.
options(repos = c(CRAN = "https://cran.r-project.org"))
# Install readr only when it is missing, so the package is not downloaded
# and reinstalled on every run of the document.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)
## 패키지 'readr'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
##
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
## C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages
library(readr)
# Raw-CSV URL of each dataset, keyed by the name it is stored under.
urls <- c(
  data_corruption = "https://raw.githubusercontent.com/rich-hyun/kufa_data_1/main/corruption.csv",
  data_living = "https://raw.githubusercontent.com/rich-hyun/kufa_data_1/main/cost_of_living.csv",
  data_richest = "https://raw.githubusercontent.com/rich-hyun/kufa_data_1/main/richest_countries.csv",
  data_tourism = "https://raw.githubusercontent.com/rich-hyun/kufa_data_1/main/tourism.csv",
  # NOTE(review): this is the same URL as data_tourism; the duplication
  # needs to be checked.
  data_unemployment = "https://raw.githubusercontent.com/rich-hyun/kufa_data_1/main/tourism.csv"
)
# Read every URL with read_csv(); the resulting list keeps the names of `urls`.
data_list <- lapply(urls, function(csv_url) read_csv(csv_url))
## Rows: 110 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (2): annual_income, corruption_index
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 107 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (3): cost_index, monthly_income, purchasing_power_index
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 50 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (1): gdp_per_capita
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 41 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (4): tourists_in_millions, receipts_in_billions, receipts_per_tourist, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 41 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (4): tourists_in_millions, receipts_in_billions, receipts_per_tourist, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pull each loaded table out of the list into a variable of its own.
data_corruption <- data_list[["data_corruption"]]
data_living <- data_list[["data_living"]]
data_richest <- data_list[["data_richest"]]
data_tourism <- data_list[["data_tourism"]]
data_unemployment <- data_list[["data_unemployment"]]
# Preview the first rows of the corruption table.
head(data_corruption)
## # A tibble: 6 × 3
## country annual_income corruption_index
## <chr> <dbl> <dbl>
## 1 Denmark 68110 12
## 2 Finland 53660 12
## 3 New Zealand 45340 12
## 4 Norway 84090 15
## 5 Singapore 64010 15
## 6 Sweden 58890 15
# Preview the first rows of the cost-of-living table.
head(data_living)
## # A tibble: 6 × 4
## country cost_index monthly_income purchasing_power_index
## <chr> <dbl> <dbl> <dbl>
## 1 Bermuda 158. 9712 105
## 2 Switzerland 142. 7530 90.1
## 3 Cayman Islands 138. 5281 65.2
## 4 Israel 130. 4130 54.1
## 5 Iceland 128 5368 71.5
## 6 New Caledonia 126. 1101 14.9
# Preview the first rows of the richest-countries table.
head(data_richest)
## # A tibble: 6 × 2
## country gdp_per_capita
## <chr> <dbl>
## 1 Luxembourg 134754
## 2 Singapore 116486
## 3 Ireland 106456
## 4 Qatar 93521
## 5 Bermuda 85192
## 6 Norway 79201
# Preview the first rows of the tourism table.
head(data_tourism)
## # A tibble: 6 × 5
## country tourists_in_millions receipts_in_billions receipts_per_tourist
## <chr> <dbl> <dbl> <dbl>
## 1 France 117. 36.0 307
## 2 Mexico 51.1 11.4 224
## 3 United States 45 84.2 1870
## 4 Italy 38.4 20.5 533
## 5 Hungary 31.6 4.22 133
## 6 Croatia 21.6 5.63 261
## # ℹ 1 more variable: percentage_of_gdp <dbl>
# Preview the first rows of the table stored as data_unemployment.
# NOTE(review): `urls` points data_unemployment at tourism.csv, so this shows
# the tourism columns again.
head(data_unemployment)
## # A tibble: 6 × 5
## country tourists_in_millions receipts_in_billions receipts_per_tourist
## <chr> <dbl> <dbl> <dbl>
## 1 France 117. 36.0 307
## 2 Mexico 51.1 11.4 224
## 3 United States 45 84.2 1870
## 4 Italy 38.4 20.5 533
## 5 Hungary 31.6 4.22 133
## 6 Croatia 21.6 5.63 261
## # ℹ 1 more variable: percentage_of_gdp <dbl>
# Install dplyr only when it is missing. An unconditional reinstall has to
# overwrite the already-installed copy, which can fail (the log shows a
# "Permission denied" while copying dplyr.dll).
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)
## 패키지 'dplyr'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## Warning: 패키지 'dplyr'의 이전설치를 삭제할 수 없습니다
## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## C:\Users\passi\AppData\Local\R\win-library\4.3\00LOCK\dplyr\libs\x64\dplyr.dll를
## C:\Users\passi\AppData\Local\R\win-library\4.3\dplyr\libs\x64\dplyr.dll로
## 복사하는데 문제가 발생했습니다: Permission denied
## Warning: 'dplyr'를 복구하였습니다
##
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
## C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Starting from the corruption table, left-join the other tables onto it one
# after another, matching rows on `country`.
merged_data <- Reduce(
  function(joined, next_table) left_join(joined, next_table, by = "country"),
  list(data_living, data_richest, data_tourism, data_unemployment),
  data_corruption
)
# Preview the combined table.
head(merged_data)
## # A tibble: 6 × 15
## country annual_income corruption_index cost_index monthly_income
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Denmark 68110 12 120. 5676
## 2 Finland 53660 12 108 4472
## 3 New Zealand 45340 12 117. 3778
## 4 Norway 84090 15 125. 7008
## 5 Singapore 64010 15 75 5334
## 6 Sweden 58890 15 109. 4908
## # ℹ 10 more variables: purchasing_power_index <dbl>, gdp_per_capita <dbl>,
## # tourists_in_millions.x <dbl>, receipts_in_billions.x <dbl>,
## # receipts_per_tourist.x <dbl>, percentage_of_gdp.x <dbl>,
## # tourists_in_millions.y <dbl>, receipts_in_billions.y <dbl>,
## # receipts_per_tourist.y <dbl>, percentage_of_gdp.y <dbl>
# Install ggplot2 only when it is missing, so the package is not downloaded
# and reinstalled on every run of the document.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)
## 패키지 'ggplot2'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
##
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
## C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages
library(ggplot2)
# Add a 1-based row index so each observation can be plotted by position.
# seq_len() yields an empty index for an empty table, where seq.int(0) would
# produce c(1, 0).
merged_data$order <- seq_len(nrow(merged_data))
# Scatter plot of annual income against row position, coloured by income.
ggplot(merged_data, aes(x = order, y = annual_income)) +
  geom_point(aes(color = annual_income), size = 3) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    title = "Scatter Plot of Annual Income",
    x = "Order (Index)",
    y = "Annual Income"
  ) +
  theme_minimal()
# List the column names using colnames().
column_names <- colnames(merged_data)
print(column_names)
## [1] "country" "annual_income" "corruption_index"
## [4] "cost_index" "monthly_income" "purchasing_power_index"
## [7] "gdp_per_capita" "tourists_in_millions.x" "receipts_in_billions.x"
## [10] "receipts_per_tourist.x" "percentage_of_gdp.x" "tourists_in_millions.y"
## [13] "receipts_in_billions.y" "receipts_per_tourist.y" "percentage_of_gdp.y"
## [16] "order"
# Alternatively, list the column names using names().
column_names <- names(merged_data)
print(column_names)
## [1] "country" "annual_income" "corruption_index"
## [4] "cost_index" "monthly_income" "purchasing_power_index"
## [7] "gdp_per_capita" "tourists_in_millions.x" "receipts_in_billions.x"
## [10] "receipts_per_tourist.x" "percentage_of_gdp.x" "tourists_in_millions.y"
## [13] "receipts_in_billions.y" "receipts_per_tourist.y" "percentage_of_gdp.y"
## [16] "order"
# Keep only the numeric columns. vapply() always returns a logical vector,
# whereas the return type of sapply() depends on its input.
numeric_columns <- merged_data[vapply(merged_data, is.numeric, logical(1))]
# Pearson correlation matrix of the numeric columns. use = "complete.obs"
# restricts the computation to rows with no missing value in any column.
correlation_matrix <- cor(numeric_columns, use = "complete.obs", method = "pearson")
# Print the correlation matrix.
print(correlation_matrix)
## annual_income corruption_index cost_index
## annual_income 1.000000000 -0.85884867 0.84426743
## corruption_index -0.858848666 1.00000000 -0.88788053
## cost_index 0.844267428 -0.88788053 1.00000000
## monthly_income 0.999999988 -0.85886823 0.84427187
## purchasing_power_index 0.892211427 -0.69240108 0.55239824
## gdp_per_capita 0.835850221 -0.61200687 0.54339624
## tourists_in_millions.x -0.009273268 -0.00946317 0.07564747
## receipts_in_billions.x 0.294220125 -0.23396168 0.20541009
## receipts_per_tourist.x 0.354146424 -0.21734725 0.13504949
## percentage_of_gdp.x 0.206680317 -0.17723182 -0.06726950
## tourists_in_millions.y -0.009273268 -0.00946317 0.07564747
## receipts_in_billions.y 0.294220125 -0.23396168 0.20541009
## receipts_per_tourist.y 0.354146424 -0.21734725 0.13504949
## percentage_of_gdp.y 0.206680317 -0.17723182 -0.06726950
## order -0.789951189 0.95450743 -0.81919731
## monthly_income purchasing_power_index gdp_per_capita
## annual_income 0.999999988 0.89221143 0.8358502
## corruption_index -0.858868226 -0.69240108 -0.6120069
## cost_index 0.844271866 0.55239824 0.5433962
## monthly_income 1.000000000 0.89220980 0.8358287
## purchasing_power_index 0.892209801 1.00000000 0.8555495
## gdp_per_capita 0.835828748 0.85554948 1.0000000
## tourists_in_millions.x -0.009262163 -0.04725314 -0.1247469
## receipts_in_billions.x 0.294174747 0.34803674 0.0631486
## receipts_per_tourist.x 0.354121303 0.56749593 0.5222063
## percentage_of_gdp.x 0.206689395 0.48192982 0.4535973
## tourists_in_millions.y -0.009262163 -0.04725314 -0.1247469
## receipts_in_billions.y 0.294174747 0.34803674 0.0631486
## receipts_per_tourist.y 0.354121303 0.56749593 0.5222063
## percentage_of_gdp.y 0.206689395 0.48192982 0.4535973
## order -0.789958713 -0.63569296 -0.5702086
## tourists_in_millions.x receipts_in_billions.x
## annual_income -0.009273268 0.29422013
## corruption_index -0.009463170 -0.23396168
## cost_index 0.075647470 0.20541009
## monthly_income -0.009262163 0.29417475
## purchasing_power_index -0.047253144 0.34803674
## gdp_per_capita -0.124746872 0.06314860
## tourists_in_millions.x 1.000000000 0.45500723
## receipts_in_billions.x 0.455007229 1.00000000
## receipts_per_tourist.x -0.300284786 0.02921910
## percentage_of_gdp.x -0.229266353 -0.04970387
## tourists_in_millions.y 1.000000000 0.45500723
## receipts_in_billions.y 0.455007229 1.00000000
## receipts_per_tourist.y -0.300284786 0.02921910
## percentage_of_gdp.y -0.229266353 -0.04970387
## order 0.023030878 -0.19417591
## receipts_per_tourist.x percentage_of_gdp.x
## annual_income 0.3541464 0.20668032
## corruption_index -0.2173472 -0.17723182
## cost_index 0.1350495 -0.06726950
## monthly_income 0.3541213 0.20668940
## purchasing_power_index 0.5674959 0.48192982
## gdp_per_capita 0.5222063 0.45359734
## tourists_in_millions.x -0.3002848 -0.22926635
## receipts_in_billions.x 0.0292191 -0.04970387
## receipts_per_tourist.x 1.0000000 0.66588433
## percentage_of_gdp.x 0.6658843 1.00000000
## tourists_in_millions.y -0.3002848 -0.22926635
## receipts_in_billions.y 0.0292191 -0.04970387
## receipts_per_tourist.y 1.0000000 0.66588433
## percentage_of_gdp.y 0.6658843 1.00000000
## order -0.2033242 -0.23907689
## tourists_in_millions.y receipts_in_billions.y
## annual_income -0.009273268 0.29422013
## corruption_index -0.009463170 -0.23396168
## cost_index 0.075647470 0.20541009
## monthly_income -0.009262163 0.29417475
## purchasing_power_index -0.047253144 0.34803674
## gdp_per_capita -0.124746872 0.06314860
## tourists_in_millions.x 1.000000000 0.45500723
## receipts_in_billions.x 0.455007229 1.00000000
## receipts_per_tourist.x -0.300284786 0.02921910
## percentage_of_gdp.x -0.229266353 -0.04970387
## tourists_in_millions.y 1.000000000 0.45500723
## receipts_in_billions.y 0.455007229 1.00000000
## receipts_per_tourist.y -0.300284786 0.02921910
## percentage_of_gdp.y -0.229266353 -0.04970387
## order 0.023030878 -0.19417591
## receipts_per_tourist.y percentage_of_gdp.y order
## annual_income 0.3541464 0.20668032 -0.78995119
## corruption_index -0.2173472 -0.17723182 0.95450743
## cost_index 0.1350495 -0.06726950 -0.81919731
## monthly_income 0.3541213 0.20668940 -0.78995871
## purchasing_power_index 0.5674959 0.48192982 -0.63569296
## gdp_per_capita 0.5222063 0.45359734 -0.57020862
## tourists_in_millions.x -0.3002848 -0.22926635 0.02303088
## receipts_in_billions.x 0.0292191 -0.04970387 -0.19417591
## receipts_per_tourist.x 1.0000000 0.66588433 -0.20332421
## percentage_of_gdp.x 0.6658843 1.00000000 -0.23907689
## tourists_in_millions.y -0.3002848 -0.22926635 0.02303088
## receipts_in_billions.y 0.0292191 -0.04970387 -0.19417591
## receipts_per_tourist.y 1.0000000 0.66588433 -0.20332421
## percentage_of_gdp.y 0.6658843 1.00000000 -0.23907689
## order -0.2033242 -0.23907689 1.00000000
# Find the variable pairs whose absolute correlation exceeds 0.7.
# upper.tri() keeps each unordered pair exactly once and leaves out the
# diagonal. Dropping the diagonal with `!= 1` is unreliable: it is an exact
# floating-point comparison that also discards off-diagonal pairs whose
# coefficient is exactly 1 while keeping pairs that merely round to 1.
high_correlations <- which(
  abs(correlation_matrix) > 0.7 & upper.tri(correlation_matrix),
  arr.ind = TRUE
)
# Report every strongly correlated pair.
for (i in seq_len(nrow(high_correlations))) {
  row <- high_correlations[i, 1]
  col <- high_correlations[i, 2]
  # Look up the variable names for this matrix position.
  var1 <- colnames(numeric_columns)[row]
  var2 <- colnames(numeric_columns)[col]
  # Skip any pair that involves monthly_income.
  if ("monthly_income" %in% c(var1, var2)) next
  cat(
    "Significant correlation between:", var1, "and", var2,
    "with a correlation coefficient of", correlation_matrix[row, col], "\n"
  )
}
## Significant correlation between: annual_income and corruption_index with a correlation coefficient of -0.8588487
## Significant correlation between: annual_income and cost_index with a correlation coefficient of 0.8442674
## Significant correlation between: corruption_index and cost_index with a correlation coefficient of -0.8878805
## Significant correlation between: annual_income and purchasing_power_index with a correlation coefficient of 0.8922114
## Significant correlation between: annual_income and gdp_per_capita with a correlation coefficient of 0.8358502
## Significant correlation between: purchasing_power_index and gdp_per_capita with a correlation coefficient of 0.8555495
## Significant correlation between: receipts_in_billions.x and receipts_in_billions.y with a correlation coefficient of 1
## Significant correlation between: annual_income and order with a correlation coefficient of -0.7899512
## Significant correlation between: corruption_index and order with a correlation coefficient of 0.9545074
## Significant correlation between: cost_index and order with a correlation coefficient of -0.8191973
# Keep only the numeric columns. vapply() always returns a logical vector,
# whereas the return type of sapply() depends on its input.
numeric_columns <- merged_data[vapply(merged_data, is.numeric, logical(1))]
# Pearson correlation matrix of the numeric columns. use = "complete.obs"
# restricts the computation to rows with no missing value in any column.
correlation_matrix <- cor(numeric_columns, use = "complete.obs", method = "pearson")
# Print the correlation matrix.
print(correlation_matrix)
## annual_income corruption_index cost_index
## annual_income 1.000000000 -0.85884867 0.84426743
## corruption_index -0.858848666 1.00000000 -0.88788053
## cost_index 0.844267428 -0.88788053 1.00000000
## monthly_income 0.999999988 -0.85886823 0.84427187
## purchasing_power_index 0.892211427 -0.69240108 0.55239824
## gdp_per_capita 0.835850221 -0.61200687 0.54339624
## tourists_in_millions.x -0.009273268 -0.00946317 0.07564747
## receipts_in_billions.x 0.294220125 -0.23396168 0.20541009
## receipts_per_tourist.x 0.354146424 -0.21734725 0.13504949
## percentage_of_gdp.x 0.206680317 -0.17723182 -0.06726950
## tourists_in_millions.y -0.009273268 -0.00946317 0.07564747
## receipts_in_billions.y 0.294220125 -0.23396168 0.20541009
## receipts_per_tourist.y 0.354146424 -0.21734725 0.13504949
## percentage_of_gdp.y 0.206680317 -0.17723182 -0.06726950
## order -0.789951189 0.95450743 -0.81919731
## monthly_income purchasing_power_index gdp_per_capita
## annual_income 0.999999988 0.89221143 0.8358502
## corruption_index -0.858868226 -0.69240108 -0.6120069
## cost_index 0.844271866 0.55239824 0.5433962
## monthly_income 1.000000000 0.89220980 0.8358287
## purchasing_power_index 0.892209801 1.00000000 0.8555495
## gdp_per_capita 0.835828748 0.85554948 1.0000000
## tourists_in_millions.x -0.009262163 -0.04725314 -0.1247469
## receipts_in_billions.x 0.294174747 0.34803674 0.0631486
## receipts_per_tourist.x 0.354121303 0.56749593 0.5222063
## percentage_of_gdp.x 0.206689395 0.48192982 0.4535973
## tourists_in_millions.y -0.009262163 -0.04725314 -0.1247469
## receipts_in_billions.y 0.294174747 0.34803674 0.0631486
## receipts_per_tourist.y 0.354121303 0.56749593 0.5222063
## percentage_of_gdp.y 0.206689395 0.48192982 0.4535973
## order -0.789958713 -0.63569296 -0.5702086
## tourists_in_millions.x receipts_in_billions.x
## annual_income -0.009273268 0.29422013
## corruption_index -0.009463170 -0.23396168
## cost_index 0.075647470 0.20541009
## monthly_income -0.009262163 0.29417475
## purchasing_power_index -0.047253144 0.34803674
## gdp_per_capita -0.124746872 0.06314860
## tourists_in_millions.x 1.000000000 0.45500723
## receipts_in_billions.x 0.455007229 1.00000000
## receipts_per_tourist.x -0.300284786 0.02921910
## percentage_of_gdp.x -0.229266353 -0.04970387
## tourists_in_millions.y 1.000000000 0.45500723
## receipts_in_billions.y 0.455007229 1.00000000
## receipts_per_tourist.y -0.300284786 0.02921910
## percentage_of_gdp.y -0.229266353 -0.04970387
## order 0.023030878 -0.19417591
## receipts_per_tourist.x percentage_of_gdp.x
## annual_income 0.3541464 0.20668032
## corruption_index -0.2173472 -0.17723182
## cost_index 0.1350495 -0.06726950
## monthly_income 0.3541213 0.20668940
## purchasing_power_index 0.5674959 0.48192982
## gdp_per_capita 0.5222063 0.45359734
## tourists_in_millions.x -0.3002848 -0.22926635
## receipts_in_billions.x 0.0292191 -0.04970387
## receipts_per_tourist.x 1.0000000 0.66588433
## percentage_of_gdp.x 0.6658843 1.00000000
## tourists_in_millions.y -0.3002848 -0.22926635
## receipts_in_billions.y 0.0292191 -0.04970387
## receipts_per_tourist.y 1.0000000 0.66588433
## percentage_of_gdp.y 0.6658843 1.00000000
## order -0.2033242 -0.23907689
## tourists_in_millions.y receipts_in_billions.y
## annual_income -0.009273268 0.29422013
## corruption_index -0.009463170 -0.23396168
## cost_index 0.075647470 0.20541009
## monthly_income -0.009262163 0.29417475
## purchasing_power_index -0.047253144 0.34803674
## gdp_per_capita -0.124746872 0.06314860
## tourists_in_millions.x 1.000000000 0.45500723
## receipts_in_billions.x 0.455007229 1.00000000
## receipts_per_tourist.x -0.300284786 0.02921910
## percentage_of_gdp.x -0.229266353 -0.04970387
## tourists_in_millions.y 1.000000000 0.45500723
## receipts_in_billions.y 0.455007229 1.00000000
## receipts_per_tourist.y -0.300284786 0.02921910
## percentage_of_gdp.y -0.229266353 -0.04970387
## order 0.023030878 -0.19417591
## receipts_per_tourist.y percentage_of_gdp.y order
## annual_income 0.3541464 0.20668032 -0.78995119
## corruption_index -0.2173472 -0.17723182 0.95450743
## cost_index 0.1350495 -0.06726950 -0.81919731
## monthly_income 0.3541213 0.20668940 -0.78995871
## purchasing_power_index 0.5674959 0.48192982 -0.63569296
## gdp_per_capita 0.5222063 0.45359734 -0.57020862
## tourists_in_millions.x -0.3002848 -0.22926635 0.02303088
## receipts_in_billions.x 0.0292191 -0.04970387 -0.19417591
## receipts_per_tourist.x 1.0000000 0.66588433 -0.20332421
## percentage_of_gdp.x 0.6658843 1.00000000 -0.23907689
## tourists_in_millions.y -0.3002848 -0.22926635 0.02303088
## receipts_in_billions.y 0.0292191 -0.04970387 -0.19417591
## receipts_per_tourist.y 1.0000000 0.66588433 -0.20332421
## percentage_of_gdp.y 0.6658843 1.00000000 -0.23907689
## order -0.2033242 -0.23907689 1.00000000
# Find the variable pairs whose absolute correlation exceeds 0.7.
# upper.tri() keeps each unordered pair exactly once and leaves out the
# diagonal. Dropping the diagonal with `!= 1` is unreliable: it is an exact
# floating-point comparison that also discards off-diagonal pairs whose
# coefficient is exactly 1 while keeping pairs that merely round to 1.
high_correlations <- which(
  abs(correlation_matrix) > 0.7 & upper.tri(correlation_matrix),
  arr.ind = TRUE
)
# Report every strongly correlated pair.
for (i in seq_len(nrow(high_correlations))) {
  row <- high_correlations[i, 1]
  col <- high_correlations[i, 2]
  # Look up the variable names for this matrix position.
  var1 <- colnames(numeric_columns)[row]
  var2 <- colnames(numeric_columns)[col]
  # Skip any pair that involves monthly_income.
  if ("monthly_income" %in% c(var1, var2)) next
  cat(
    "Significant correlation between:", var1, "and", var2,
    "with a correlation coefficient of", correlation_matrix[row, col], "\n"
  )
}
## Significant correlation between: annual_income and corruption_index with a correlation coefficient of -0.8588487
## Significant correlation between: annual_income and cost_index with a correlation coefficient of 0.8442674
## Significant correlation between: corruption_index and cost_index with a correlation coefficient of -0.8878805
## Significant correlation between: annual_income and purchasing_power_index with a correlation coefficient of 0.8922114
## Significant correlation between: annual_income and gdp_per_capita with a correlation coefficient of 0.8358502
## Significant correlation between: purchasing_power_index and gdp_per_capita with a correlation coefficient of 0.8555495
## Significant correlation between: receipts_in_billions.x and receipts_in_billions.y with a correlation coefficient of 1
## Significant correlation between: annual_income and order with a correlation coefficient of -0.7899512
## Significant correlation between: corruption_index and order with a correlation coefficient of 0.9545074
## Significant correlation between: cost_index and order with a correlation coefficient of -0.8191973
# Install gridExtra only when it is missing, so the package is not downloaded
# and reinstalled on every run of the document.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)
## 패키지 'gridExtra'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
##
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
## C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages
library(gridExtra)
##
## 다음의 패키지를 부착합니다: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
# Highly correlated variable pairs: x variable first, y variable second.
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)
# Build one scatter plot per pair.
# aes_string() is deprecated, so the columns are looked up by name through
# the `.data` pronoun. Each plot is built inside its own function call because
# aes() captures its arguments unevaluated: plots stored from a plain for
# loop would all resolve the loop variable to the last pair when drawn.
plot_list <- lapply(high_corr_pairs, function(pair) {
  x_var <- pair[1]
  y_var <- pair[2]
  ggplot(merged_data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point() +
    labs(
      x = x_var,
      y = y_var,
      title = paste("Scatter plot of", x_var, "and", y_var)
    ) +
    theme_minimal()
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Lay the stored plots out on a two-column grid.
grid.arrange(grobs = plot_list, ncol = 2)
## Warning: Removed 39 rows containing missing values (`geom_point()`).
## Warning: Removed 39 rows containing missing values (`geom_point()`).
## Removed 39 rows containing missing values (`geom_point()`).
## Warning: Removed 76 rows containing missing values (`geom_point()`).
## Removed 76 rows containing missing values (`geom_point()`).
## Warning: Removed 82 rows containing missing values (`geom_point()`).
## Warning: Removed 39 rows containing missing values (`geom_point()`).
# Highly correlated variable pairs: x variable first, y variable second.
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)
# Print a scatter plot for each pair.
for (pair in high_corr_pairs) {
  var_x <- pair[1]
  var_y <- pair[2]
  # aes_string() is deprecated; the `.data` pronoun looks the columns up by
  # name instead. The plot is printed inside the loop, so var_x and var_y
  # still hold this iteration's names when the aesthetics are evaluated.
  print(ggplot(merged_data, aes(x = .data[[var_x]], y = .data[[var_y]])) +
    geom_point() +
    labs(
      title = paste("Relationship between", var_x, "and", var_y),
      x = var_x,
      y = var_y
    ))
}
## Warning: Removed 39 rows containing missing values (`geom_point()`).
## Warning: Removed 39 rows containing missing values (`geom_point()`).
## Warning: Removed 39 rows containing missing values (`geom_point()`).
## Warning: Removed 76 rows containing missing values (`geom_point()`).
## Warning: Removed 76 rows containing missing values (`geom_point()`).
## Warning: Removed 82 rows containing missing values (`geom_point()`).
## Warning: Removed 39 rows containing missing values (`geom_point()`).
# Highly correlated variable pairs: x variable first, y variable second.
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)
# Fit a simple linear regression for each pair and print the results.
for (pair in high_corr_pairs) {
  var_x <- pair[1]
  var_y <- pair[2]
  # Fit var_y ~ var_x; the formula is assembled from the column names.
  model <- lm(formula(paste(var_y, "~", var_x)), data = merged_data)
  # Print the model summary.
  print(summary(model))
  # Scatter plot with the fitted regression line.
  # aes_string() is deprecated; the `.data` pronoun looks the columns up by
  # name instead. The plot is printed inside the loop, so var_x and var_y
  # still hold this iteration's names when the aesthetics are evaluated.
  print(ggplot(merged_data, aes(x = .data[[var_x]], y = .data[[var_y]])) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    labs(
      title = paste("Regression between", var_x, "and", var_y),
      x = var_x,
      y = var_y
    ))
}
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.6891 -5.5876 -0.7898 6.1839 23.2769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.392e+01 1.050e+00 70.43 <2e-16 ***
## annual_income -8.560e-04 3.591e-05 -23.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.901 on 108 degrees of freedom
## Multiple R-squared: 0.8403, Adjusted R-squared: 0.8388
## F-statistic: 568.2 on 1 and 108 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.070 -9.086 -2.445 9.032 36.377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.467e+01 2.214e+00 15.66 <2e-16 ***
## annual_income 1.194e-03 6.106e-05 19.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.42 on 69 degrees of freedom
## (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 0.8471, Adjusted R-squared: 0.8448
## F-statistic: 382.1 on 1 and 69 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Removed 39 rows containing missing values (`geom_point()`).
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.335 -10.793 -1.223 7.775 53.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 131.30004 4.87201 26.95 <2e-16 ***
## corruption_index -1.33101 0.08903 -14.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.66 on 69 degrees of freedom
## (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 0.7641, Adjusted R-squared: 0.7607
## F-statistic: 223.5 on 1 and 69 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Removed 39 rows containing missing values (`geom_point()`).
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.543 -7.205 -2.089 3.840 34.579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.265e+01 1.878e+00 6.736 3.99e-09 ***
## annual_income 1.162e-03 5.181e-05 22.427 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.38 on 69 degrees of freedom
## (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 0.8794, Adjusted R-squared: 0.8776
## F-statistic: 503 on 1 and 69 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Removed 39 rows containing missing values (`geom_point()`).
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19017 -9503 -3041 4254 46538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.697e+04 6.540e+03 2.595 0.0142 *
## annual_income 8.784e-01 1.264e-01 6.947 7.23e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15440 on 32 degrees of freedom
## (결측으로 인하여 76개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 0.6013, Adjusted R-squared: 0.5888
## F-statistic: 48.26 on 1 and 32 DF, p-value: 7.226e-08
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 76 rows containing missing values (`geom_point()`).
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20862 -4775 -948 2259 45372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10358.13 6871.37 -1.507 0.142
## purchasing_power_index 979.77 93.56 10.473 7.24e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11620 on 32 degrees of freedom
## (결측으로 인하여 76개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 0.7741, Adjusted R-squared: 0.7671
## F-statistic: 109.7 on 1 and 32 DF, p-value: 7.238e-12
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite values (`stat_smooth()`).
## Removed 76 rows containing missing values (`geom_point()`).
## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.983e-15 -1.004e-15 -4.832e-16 -3.186e-16 1.929e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.686e-15 9.741e-16 2.757e+00 0.0105 *
## receipts_in_billions.x 1.000e+00 4.071e-17 2.456e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.976e-15 on 26 degrees of freedom
## (결측으로 인하여 82개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 6.034e+32 on 1 and 26 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 82 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 82 rows containing missing values (`geom_point()`).
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.247 -15.766 -3.267 16.101 47.187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.5213049 2.3645522 31.09 <2e-16 ***
## annual_income -0.0010480 0.0000809 -12.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.05 on 108 degrees of freedom
## Multiple R-squared: 0.6084, Adjusted R-squared: 0.6048
## F-statistic: 167.8 on 1 and 108 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.269 -10.527 2.672 8.967 17.157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.18457 3.08999 -7.827 3.64e-12 ***
## corruption_index 1.34602 0.04891 27.522 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.32 on 108 degrees of freedom
## Multiple R-squared: 0.8752, Adjusted R-squared: 0.8741
## F-statistic: 757.4 on 1 and 108 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'
##
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.774 -12.949 -2.954 8.847 56.750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.83644 4.74882 16.81 < 2e-16 ***
## cost_index -0.60084 0.06502 -9.24 1.09e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.53 on 69 degrees of freedom
## (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared: 0.5531, Adjusted R-squared: 0.5466
## F-statistic: 85.39 on 1 and 69 DF, p-value: 1.092e-13
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 39 rows containing missing values (`geom_point()`).
# Highly correlated variable pairs: x variable first, y variable second.
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)
# Fit a simple linear regression for each pair and visualise its residuals.
for (pair in high_corr_pairs) {
  var_x <- pair[1]
  var_y <- pair[2]
  # Fit var_y ~ var_x; the formula is assembled from the column names.
  model <- lm(formula(paste(var_y, "~", var_x)), data = merged_data)
  # Put the residuals and their positions into real columns so that both
  # plots map columns of the data frame they are given.
  model_residuals <- unname(residuals(model))
  residual_data <- data.frame(
    index = seq_along(model_residuals),
    residual = model_residuals
  )
  # Histogram of the residuals. A fixed number of bins is used because the
  # residual scale differs by many orders of magnitude between pairs (the
  # regression summaries range from about 1e-15 to tens of thousands), so a
  # fixed binwidth of 0.5 gives either a single bar or a huge number of bins.
  print(ggplot(residual_data, aes(x = residual)) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
    labs(
      title = paste("Residuals Histogram of", var_x, "and", var_y),
      x = "Residuals",
      y = "Frequency"
    ))
  # Residuals by position, with a reference line at zero.
  print(ggplot(residual_data, aes(x = index, y = residual)) +
    geom_point() +
    geom_hline(yintercept = 0, color = "red") +
    labs(
      title = paste("Residuals Scatter Plot of", var_x, "and", var_y),
      x = "Index",
      y = "Residuals"
    ))
}
# Install caret only when it is missing, so the package is not downloaded
# and reinstalled on every run of the document.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)
## 패키지 'caret'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
##
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
## C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages
# 필요한 패키지를 로드합니다.
library(caret)
## 필요한 패키지를 로딩중입니다: lattice
# Split the data into training and test rows with createDataPartition()
# (p = .8, partitioned on annual_income). The seed fixes the random split.
set.seed(123)
splitIndex <- createDataPartition(merged_data$annual_income, p = .8,
                                  list = FALSE,
                                  times = 1)
train_data <- merged_data[ splitIndex,]
test_data <- merged_data[-splitIndex,]
# Train a linear regression of annual_income on purchasing_power_index.
lm_model <- lm(annual_income ~ purchasing_power_index, data = train_data)
# Predict annual_income for the test rows.
predictions <- predict(lm_model, test_data)
# Absolute prediction error per test row.
errors <- abs(test_data$annual_income - predictions)
# Report the three countries with the largest prediction error.
test_data_with_errors <- cbind(test_data, errors)
# na.last = NA leaves out rows whose error is NA (purchasing_power_index is
# missing for some countries), and head() returns fewer rows instead of
# NA-filled ones when fewer than three ranked rows remain.
error_rank <- order(-test_data_with_errors$errors, na.last = NA)
top_3_errors <- test_data_with_errors[head(error_rank, 3), ]
print(top_3_errors[, c("country", "annual_income", "errors")])
## country annual_income errors
## 7 Saudi Arabia 22270 26790.48
## 3 Qatar 57120 25247.90
## 5 Israel 49560 13822.49
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.