kufa_data

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Use the main CRAN mirror for any package installation in this session.
options(repos = c(CRAN = "https://cran.r-project.org"))

# Install readr only when it is missing, so that knitting the document does
# not download and reinstall the package on every run.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}

## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)

## 패키지 'readr'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
##  C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages

library(readr)

# Locations of the CSV files, keyed by the name of the table each one feeds.
# NOTE(review): data_unemployment points at tourism.csv, the same file as
# data_tourism -- the URL is duplicated and needs to be checked.
base_url <- "https://raw.githubusercontent.com/rich-hyun/kufa_data_1/main/"
csv_files <- c(
  data_corruption = "corruption.csv",
  data_living = "cost_of_living.csv",
  data_richest = "richest_countries.csv",
  data_tourism = "tourism.csv",
  data_unemployment = "tourism.csv"
)
# paste0() drops names, so re-attach them to keep a named character vector.
urls <- setNames(paste0(base_url, csv_files), names(csv_files))

# Download every CSV and keep the resulting tables in a list that carries the
# same names as `urls`.
data_list <- lapply(urls, function(csv_url) read_csv(csv_url))

## Rows: 110 Columns: 3

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (2): annual_income, corruption_index
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 107 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (3): cost_index, monthly_income, purchasing_power_index
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 50 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (1): gdp_per_capita
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 41 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (4): tourists_in_millions, receipts_in_billions, receipts_per_tourist, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 41 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (4): tourists_in_millions, receipts_in_billions, receipts_per_tourist, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Pull each table out of the list into its own variable (exact-name lookup).
data_corruption <- data_list[["data_corruption"]]
data_living <- data_list[["data_living"]]
data_richest <- data_list[["data_richest"]]
data_tourism <- data_list[["data_tourism"]]
data_unemployment <- data_list[["data_unemployment"]]

# Preview the first rows of the loaded data, starting with the corruption table.
head(data_corruption)

## # A tibble: 6 × 3
##   country     annual_income corruption_index
##   <chr>               <dbl>            <dbl>
## 1 Denmark             68110               12
## 2 Finland             53660               12
## 3 New Zealand         45340               12
## 4 Norway              84090               15
## 5 Singapore           64010               15
## 6 Sweden              58890               15

# Preview the first rows of the cost-of-living table.
head(data_living)

## # A tibble: 6 × 4
##   country        cost_index monthly_income purchasing_power_index
##   <chr>               <dbl>          <dbl>                  <dbl>
## 1 Bermuda              158.           9712                  105  
## 2 Switzerland          142.           7530                   90.1
## 3 Cayman Islands       138.           5281                   65.2
## 4 Israel               130.           4130                   54.1
## 5 Iceland              128            5368                   71.5
## 6 New Caledonia        126.           1101                   14.9

# Preview the first rows of the richest-countries table.
head(data_richest)

## # A tibble: 6 × 2
##   country    gdp_per_capita
##   <chr>               <dbl>
## 1 Luxembourg         134754
## 2 Singapore          116486
## 3 Ireland            106456
## 4 Qatar               93521
## 5 Bermuda             85192
## 6 Norway              79201

# Preview the first rows of the tourism table.
head(data_tourism)

## # A tibble: 6 × 5
##   country       tourists_in_millions receipts_in_billions receipts_per_tourist
##   <chr>                        <dbl>                <dbl>                <dbl>
## 1 France                       117.                 36.0                   307
## 2 Mexico                        51.1                11.4                   224
## 3 United States                 45                  84.2                  1870
## 4 Italy                         38.4                20.5                   533
## 5 Hungary                       31.6                 4.22                  133
## 6 Croatia                       21.6                 5.63                  261
## # ℹ 1 more variable: percentage_of_gdp <dbl>

# Preview the first rows of the table loaded under the "unemployment" name.
# NOTE(review): its URL is the tourism CSV, so this shows tourism columns, not
# unemployment figures -- confirm the intended source file.
head(data_unemployment)

## # A tibble: 6 × 5
##   country       tourists_in_millions receipts_in_billions receipts_per_tourist
##   <chr>                        <dbl>                <dbl>                <dbl>
## 1 France                       117.                 36.0                   307
## 2 Mexico                        51.1                11.4                   224
## 3 United States                 45                  84.2                  1870
## 4 Italy                         38.4                20.5                   533
## 5 Hungary                       31.6                 4.22                  133
## 6 Croatia                       21.6                 5.63                  261
## # ℹ 1 more variable: percentage_of_gdp <dbl>

# Install dplyr only when it is missing. Reinstalling a package that is
# already present is unnecessary on every knit, and the recorded run shows the
# overwrite failing with "Permission denied" before the old copy was restored.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)

## 패키지 'dplyr'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다

## Warning: 패키지 'dplyr'의 이전설치를 삭제할 수 없습니다

## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## C:\Users\passi\AppData\Local\R\win-library\4.3\00LOCK\dplyr\libs\x64\dplyr.dll를
## C:\Users\passi\AppData\Local\R\win-library\4.3\dplyr\libs\x64\dplyr.dll로
## 복사하는데 문제가 발생했습니다: Permission denied

## Warning: 'dplyr'를 복구하였습니다

## 
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
##  C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Left-join every other table onto the corruption table by country, one table
# at a time and in this order, so all corruption rows are kept.
# NOTE(review): data_unemployment is a second copy of the tourism table, which
# is why the tourism columns come out duplicated with .x / .y suffixes.
tables_to_join <- list(data_living, data_richest, data_tourism, data_unemployment)
merged_data <- Reduce(
  function(joined_so_far, next_table) {
    left_join(joined_so_far, next_table, by = "country")
  },
  tables_to_join,
  data_corruption
)

# Inspect the first rows of the merged result.
head(merged_data)

## # A tibble: 6 × 15
##   country     annual_income corruption_index cost_index monthly_income
##   <chr>               <dbl>            <dbl>      <dbl>          <dbl>
## 1 Denmark             68110               12       120.           5676
## 2 Finland             53660               12       108            4472
## 3 New Zealand         45340               12       117.           3778
## 4 Norway              84090               15       125.           7008
## 5 Singapore           64010               15        75            5334
## 6 Sweden              58890               15       109.           4908
## # ℹ 10 more variables: purchasing_power_index <dbl>, gdp_per_capita <dbl>,
## #   tourists_in_millions.x <dbl>, receipts_in_billions.x <dbl>,
## #   receipts_per_tourist.x <dbl>, percentage_of_gdp.x <dbl>,
## #   tourists_in_millions.y <dbl>, receipts_in_billions.y <dbl>,
## #   receipts_per_tourist.y <dbl>, percentage_of_gdp.y <dbl>

# Install ggplot2 only when it is missing, instead of reinstalling on every knit.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)

## 패키지 'ggplot2'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
##  C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages

library(ggplot2)

# Add a row-position index (1..n) to plot against.
# seq_len() yields an empty vector for a zero-row table, whereas
# seq.int(0) would produce c(1, 0) and make the assignment fail.
merged_data$order <- seq_len(nrow(merged_data))

# Scatter plot of annual income against row position, coloured by income.
ggplot(merged_data, aes(x = order, y = annual_income)) +
  geom_point(aes(color = annual_income), size = 3) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Scatter Plot of Annual Income",
       x = "Order (Index)",
       y = "Annual Income") +
  theme_minimal()

# Option 1: list the column names of the merged table with colnames().
column_names <- colnames(merged_data)
print(column_names)

##  [1] "country"                "annual_income"          "corruption_index"      
##  [4] "cost_index"             "monthly_income"         "purchasing_power_index"
##  [7] "gdp_per_capita"         "tourists_in_millions.x" "receipts_in_billions.x"
## [10] "receipts_per_tourist.x" "percentage_of_gdp.x"    "tourists_in_millions.y"
## [13] "receipts_in_billions.y" "receipts_per_tourist.y" "percentage_of_gdp.y"   
## [16] "order"

# Option 2: names() returns the same column names for the merged table.
column_names <- names(merged_data)
print(column_names)

##  [1] "country"                "annual_income"          "corruption_index"      
##  [4] "cost_index"             "monthly_income"         "purchasing_power_index"
##  [7] "gdp_per_capita"         "tourists_in_millions.x" "receipts_in_billions.x"
## [10] "receipts_per_tourist.x" "percentage_of_gdp.x"    "tourists_in_millions.y"
## [13] "receipts_in_billions.y" "receipts_per_tourist.y" "percentage_of_gdp.y"   
## [16] "order"

# Keep only the numeric columns. vapply() guarantees a logical vector with one
# entry per column, unlike sapply(), whose return type depends on its input.
# NOTE(review): this also picks up the `order` row index added for plotting,
# so the matrix contains correlations with row position -- confirm intended.
numeric_columns <- merged_data[vapply(merged_data, is.numeric, logical(1))]

# Pearson correlation matrix of the numeric columns. With
# use = "complete.obs", rows that have a missing value in any of these
# columns are dropped before the correlations are computed.
correlation_matrix <- cor(numeric_columns, use = "complete.obs", method = "pearson")

# Show the full correlation matrix.
print(correlation_matrix)

##                        annual_income corruption_index  cost_index
## annual_income            1.000000000      -0.85884867  0.84426743
## corruption_index        -0.858848666       1.00000000 -0.88788053
## cost_index               0.844267428      -0.88788053  1.00000000
## monthly_income           0.999999988      -0.85886823  0.84427187
## purchasing_power_index   0.892211427      -0.69240108  0.55239824
## gdp_per_capita           0.835850221      -0.61200687  0.54339624
## tourists_in_millions.x  -0.009273268      -0.00946317  0.07564747
## receipts_in_billions.x   0.294220125      -0.23396168  0.20541009
## receipts_per_tourist.x   0.354146424      -0.21734725  0.13504949
## percentage_of_gdp.x      0.206680317      -0.17723182 -0.06726950
## tourists_in_millions.y  -0.009273268      -0.00946317  0.07564747
## receipts_in_billions.y   0.294220125      -0.23396168  0.20541009
## receipts_per_tourist.y   0.354146424      -0.21734725  0.13504949
## percentage_of_gdp.y      0.206680317      -0.17723182 -0.06726950
## order                   -0.789951189       0.95450743 -0.81919731
##                        monthly_income purchasing_power_index gdp_per_capita
## annual_income             0.999999988             0.89221143      0.8358502
## corruption_index         -0.858868226            -0.69240108     -0.6120069
## cost_index                0.844271866             0.55239824      0.5433962
## monthly_income            1.000000000             0.89220980      0.8358287
## purchasing_power_index    0.892209801             1.00000000      0.8555495
## gdp_per_capita            0.835828748             0.85554948      1.0000000
## tourists_in_millions.x   -0.009262163            -0.04725314     -0.1247469
## receipts_in_billions.x    0.294174747             0.34803674      0.0631486
## receipts_per_tourist.x    0.354121303             0.56749593      0.5222063
## percentage_of_gdp.x       0.206689395             0.48192982      0.4535973
## tourists_in_millions.y   -0.009262163            -0.04725314     -0.1247469
## receipts_in_billions.y    0.294174747             0.34803674      0.0631486
## receipts_per_tourist.y    0.354121303             0.56749593      0.5222063
## percentage_of_gdp.y       0.206689395             0.48192982      0.4535973
## order                    -0.789958713            -0.63569296     -0.5702086
##                        tourists_in_millions.x receipts_in_billions.x
## annual_income                    -0.009273268             0.29422013
## corruption_index                 -0.009463170            -0.23396168
## cost_index                        0.075647470             0.20541009
## monthly_income                   -0.009262163             0.29417475
## purchasing_power_index           -0.047253144             0.34803674
## gdp_per_capita                   -0.124746872             0.06314860
## tourists_in_millions.x            1.000000000             0.45500723
## receipts_in_billions.x            0.455007229             1.00000000
## receipts_per_tourist.x           -0.300284786             0.02921910
## percentage_of_gdp.x              -0.229266353            -0.04970387
## tourists_in_millions.y            1.000000000             0.45500723
## receipts_in_billions.y            0.455007229             1.00000000
## receipts_per_tourist.y           -0.300284786             0.02921910
## percentage_of_gdp.y              -0.229266353            -0.04970387
## order                             0.023030878            -0.19417591
##                        receipts_per_tourist.x percentage_of_gdp.x
## annual_income                       0.3541464          0.20668032
## corruption_index                   -0.2173472         -0.17723182
## cost_index                          0.1350495         -0.06726950
## monthly_income                      0.3541213          0.20668940
## purchasing_power_index              0.5674959          0.48192982
## gdp_per_capita                      0.5222063          0.45359734
## tourists_in_millions.x             -0.3002848         -0.22926635
## receipts_in_billions.x              0.0292191         -0.04970387
## receipts_per_tourist.x              1.0000000          0.66588433
## percentage_of_gdp.x                 0.6658843          1.00000000
## tourists_in_millions.y             -0.3002848         -0.22926635
## receipts_in_billions.y              0.0292191         -0.04970387
## receipts_per_tourist.y              1.0000000          0.66588433
## percentage_of_gdp.y                 0.6658843          1.00000000
## order                              -0.2033242         -0.23907689
##                        tourists_in_millions.y receipts_in_billions.y
## annual_income                    -0.009273268             0.29422013
## corruption_index                 -0.009463170            -0.23396168
## cost_index                        0.075647470             0.20541009
## monthly_income                   -0.009262163             0.29417475
## purchasing_power_index           -0.047253144             0.34803674
## gdp_per_capita                   -0.124746872             0.06314860
## tourists_in_millions.x            1.000000000             0.45500723
## receipts_in_billions.x            0.455007229             1.00000000
## receipts_per_tourist.x           -0.300284786             0.02921910
## percentage_of_gdp.x              -0.229266353            -0.04970387
## tourists_in_millions.y            1.000000000             0.45500723
## receipts_in_billions.y            0.455007229             1.00000000
## receipts_per_tourist.y           -0.300284786             0.02921910
## percentage_of_gdp.y              -0.229266353            -0.04970387
## order                             0.023030878            -0.19417591
##                        receipts_per_tourist.y percentage_of_gdp.y       order
## annual_income                       0.3541464          0.20668032 -0.78995119
## corruption_index                   -0.2173472         -0.17723182  0.95450743
## cost_index                          0.1350495         -0.06726950 -0.81919731
## monthly_income                      0.3541213          0.20668940 -0.78995871
## purchasing_power_index              0.5674959          0.48192982 -0.63569296
## gdp_per_capita                      0.5222063          0.45359734 -0.57020862
## tourists_in_millions.x             -0.3002848         -0.22926635  0.02303088
## receipts_in_billions.x              0.0292191         -0.04970387 -0.19417591
## receipts_per_tourist.x              1.0000000          0.66588433 -0.20332421
## percentage_of_gdp.x                 0.6658843          1.00000000 -0.23907689
## tourists_in_millions.y             -0.3002848         -0.22926635  0.02303088
## receipts_in_billions.y              0.0292191         -0.04970387 -0.19417591
## receipts_per_tourist.y              1.0000000          0.66588433 -0.20332421
## percentage_of_gdp.y                 0.6658843          1.00000000 -0.23907689
## order                              -0.2033242         -0.23907689  1.00000000

# Locate variable pairs whose absolute correlation exceeds 0.7.
# upper.tri() selects by position: it skips the diagonal and keeps each
# unordered pair exactly once. Testing `correlation_matrix != 1` instead is a
# floating-point equality check that can also hide (or inconsistently keep)
# off-diagonal pairs that are perfectly correlated.
strong_pair_mask <- abs(correlation_matrix) > 0.7 & upper.tri(correlation_matrix)
high_correlations <- which(strong_pair_mask, arr.ind = TRUE)

# Report each strongly correlated pair.
for (i in seq_len(nrow(high_correlations))) {
  row_idx <- high_correlations[i, 1]
  col_idx <- high_correlations[i, 2]

  # Translate matrix positions back into variable names.
  var1 <- colnames(correlation_matrix)[row_idx]
  var2 <- colnames(correlation_matrix)[col_idx]

  # Skip every pair that involves monthly_income.
  if ("monthly_income" %in% c(var1, var2)) next

  cat(
    "Significant correlation between:", var1, "and", var2,
    "with a correlation coefficient of", correlation_matrix[row_idx, col_idx],
    "\n"
  )
}

## Significant correlation between: annual_income and corruption_index with a correlation coefficient of -0.8588487 
## Significant correlation between: annual_income and cost_index with a correlation coefficient of 0.8442674 
## Significant correlation between: corruption_index and cost_index with a correlation coefficient of -0.8878805 
## Significant correlation between: annual_income and purchasing_power_index with a correlation coefficient of 0.8922114 
## Significant correlation between: annual_income and gdp_per_capita with a correlation coefficient of 0.8358502 
## Significant correlation between: purchasing_power_index and gdp_per_capita with a correlation coefficient of 0.8555495 
## Significant correlation between: receipts_in_billions.x and receipts_in_billions.y with a correlation coefficient of 1 
## Significant correlation between: annual_income and order with a correlation coefficient of -0.7899512 
## Significant correlation between: corruption_index and order with a correlation coefficient of 0.9545074 
## Significant correlation between: cost_index and order with a correlation coefficient of -0.8191973

# NOTE(review): this chunk repeats the correlation computation performed
# earlier in the document; confirm whether the repetition is intentional.

# Keep only the numeric columns. vapply() guarantees a logical vector with one
# entry per column, unlike sapply(), whose return type depends on its input.
numeric_columns <- merged_data[vapply(merged_data, is.numeric, logical(1))]

# Pearson correlation matrix of the numeric columns. With
# use = "complete.obs", rows that have a missing value in any of these
# columns are dropped before the correlations are computed.
correlation_matrix <- cor(numeric_columns, use = "complete.obs", method = "pearson")

# Show the full correlation matrix.
print(correlation_matrix)

##                        annual_income corruption_index  cost_index
## annual_income            1.000000000      -0.85884867  0.84426743
## corruption_index        -0.858848666       1.00000000 -0.88788053
## cost_index               0.844267428      -0.88788053  1.00000000
## monthly_income           0.999999988      -0.85886823  0.84427187
## purchasing_power_index   0.892211427      -0.69240108  0.55239824
## gdp_per_capita           0.835850221      -0.61200687  0.54339624
## tourists_in_millions.x  -0.009273268      -0.00946317  0.07564747
## receipts_in_billions.x   0.294220125      -0.23396168  0.20541009
## receipts_per_tourist.x   0.354146424      -0.21734725  0.13504949
## percentage_of_gdp.x      0.206680317      -0.17723182 -0.06726950
## tourists_in_millions.y  -0.009273268      -0.00946317  0.07564747
## receipts_in_billions.y   0.294220125      -0.23396168  0.20541009
## receipts_per_tourist.y   0.354146424      -0.21734725  0.13504949
## percentage_of_gdp.y      0.206680317      -0.17723182 -0.06726950
## order                   -0.789951189       0.95450743 -0.81919731
##                        monthly_income purchasing_power_index gdp_per_capita
## annual_income             0.999999988             0.89221143      0.8358502
## corruption_index         -0.858868226            -0.69240108     -0.6120069
## cost_index                0.844271866             0.55239824      0.5433962
## monthly_income            1.000000000             0.89220980      0.8358287
## purchasing_power_index    0.892209801             1.00000000      0.8555495
## gdp_per_capita            0.835828748             0.85554948      1.0000000
## tourists_in_millions.x   -0.009262163            -0.04725314     -0.1247469
## receipts_in_billions.x    0.294174747             0.34803674      0.0631486
## receipts_per_tourist.x    0.354121303             0.56749593      0.5222063
## percentage_of_gdp.x       0.206689395             0.48192982      0.4535973
## tourists_in_millions.y   -0.009262163            -0.04725314     -0.1247469
## receipts_in_billions.y    0.294174747             0.34803674      0.0631486
## receipts_per_tourist.y    0.354121303             0.56749593      0.5222063
## percentage_of_gdp.y       0.206689395             0.48192982      0.4535973
## order                    -0.789958713            -0.63569296     -0.5702086
##                        tourists_in_millions.x receipts_in_billions.x
## annual_income                    -0.009273268             0.29422013
## corruption_index                 -0.009463170            -0.23396168
## cost_index                        0.075647470             0.20541009
## monthly_income                   -0.009262163             0.29417475
## purchasing_power_index           -0.047253144             0.34803674
## gdp_per_capita                   -0.124746872             0.06314860
## tourists_in_millions.x            1.000000000             0.45500723
## receipts_in_billions.x            0.455007229             1.00000000
## receipts_per_tourist.x           -0.300284786             0.02921910
## percentage_of_gdp.x              -0.229266353            -0.04970387
## tourists_in_millions.y            1.000000000             0.45500723
## receipts_in_billions.y            0.455007229             1.00000000
## receipts_per_tourist.y           -0.300284786             0.02921910
## percentage_of_gdp.y              -0.229266353            -0.04970387
## order                             0.023030878            -0.19417591
##                        receipts_per_tourist.x percentage_of_gdp.x
## annual_income                       0.3541464          0.20668032
## corruption_index                   -0.2173472         -0.17723182
## cost_index                          0.1350495         -0.06726950
## monthly_income                      0.3541213          0.20668940
## purchasing_power_index              0.5674959          0.48192982
## gdp_per_capita                      0.5222063          0.45359734
## tourists_in_millions.x             -0.3002848         -0.22926635
## receipts_in_billions.x              0.0292191         -0.04970387
## receipts_per_tourist.x              1.0000000          0.66588433
## percentage_of_gdp.x                 0.6658843          1.00000000
## tourists_in_millions.y             -0.3002848         -0.22926635
## receipts_in_billions.y              0.0292191         -0.04970387
## receipts_per_tourist.y              1.0000000          0.66588433
## percentage_of_gdp.y                 0.6658843          1.00000000
## order                              -0.2033242         -0.23907689
##                        tourists_in_millions.y receipts_in_billions.y
## annual_income                    -0.009273268             0.29422013
## corruption_index                 -0.009463170            -0.23396168
## cost_index                        0.075647470             0.20541009
## monthly_income                   -0.009262163             0.29417475
## purchasing_power_index           -0.047253144             0.34803674
## gdp_per_capita                   -0.124746872             0.06314860
## tourists_in_millions.x            1.000000000             0.45500723
## receipts_in_billions.x            0.455007229             1.00000000
## receipts_per_tourist.x           -0.300284786             0.02921910
## percentage_of_gdp.x              -0.229266353            -0.04970387
## tourists_in_millions.y            1.000000000             0.45500723
## receipts_in_billions.y            0.455007229             1.00000000
## receipts_per_tourist.y           -0.300284786             0.02921910
## percentage_of_gdp.y              -0.229266353            -0.04970387
## order                             0.023030878            -0.19417591
##                        receipts_per_tourist.y percentage_of_gdp.y       order
## annual_income                       0.3541464          0.20668032 -0.78995119
## corruption_index                   -0.2173472         -0.17723182  0.95450743
## cost_index                          0.1350495         -0.06726950 -0.81919731
## monthly_income                      0.3541213          0.20668940 -0.78995871
## purchasing_power_index              0.5674959          0.48192982 -0.63569296
## gdp_per_capita                      0.5222063          0.45359734 -0.57020862
## tourists_in_millions.x             -0.3002848         -0.22926635  0.02303088
## receipts_in_billions.x              0.0292191         -0.04970387 -0.19417591
## receipts_per_tourist.x              1.0000000          0.66588433 -0.20332421
## percentage_of_gdp.x                 0.6658843          1.00000000 -0.23907689
## tourists_in_millions.y             -0.3002848         -0.22926635  0.02303088
## receipts_in_billions.y              0.0292191         -0.04970387 -0.19417591
## receipts_per_tourist.y              1.0000000          0.66588433 -0.20332421
## percentage_of_gdp.y                 0.6658843          1.00000000 -0.23907689
## order                              -0.2033242         -0.23907689  1.00000000

# Locate variable pairs whose absolute correlation exceeds 0.7.
# upper.tri() selects by position: it skips the diagonal and keeps each
# unordered pair exactly once. Testing `correlation_matrix != 1` instead is a
# floating-point equality check that can also hide (or inconsistently keep)
# off-diagonal pairs that are perfectly correlated.
strong_pair_mask <- abs(correlation_matrix) > 0.7 & upper.tri(correlation_matrix)
high_correlations <- which(strong_pair_mask, arr.ind = TRUE)

# Report each strongly correlated pair.
for (i in seq_len(nrow(high_correlations))) {
  row_idx <- high_correlations[i, 1]
  col_idx <- high_correlations[i, 2]

  # Translate matrix positions back into variable names.
  var1 <- colnames(correlation_matrix)[row_idx]
  var2 <- colnames(correlation_matrix)[col_idx]

  # Skip every pair that involves monthly_income.
  if ("monthly_income" %in% c(var1, var2)) next

  cat(
    "Significant correlation between:", var1, "and", var2,
    "with a correlation coefficient of", correlation_matrix[row_idx, col_idx],
    "\n"
  )
}

## Significant correlation between: annual_income and corruption_index with a correlation coefficient of -0.8588487 
## Significant correlation between: annual_income and cost_index with a correlation coefficient of 0.8442674 
## Significant correlation between: corruption_index and cost_index with a correlation coefficient of -0.8878805 
## Significant correlation between: annual_income and purchasing_power_index with a correlation coefficient of 0.8922114 
## Significant correlation between: annual_income and gdp_per_capita with a correlation coefficient of 0.8358502 
## Significant correlation between: purchasing_power_index and gdp_per_capita with a correlation coefficient of 0.8555495 
## Significant correlation between: receipts_in_billions.x and receipts_in_billions.y with a correlation coefficient of 1 
## Significant correlation between: annual_income and order with a correlation coefficient of -0.7899512 
## Significant correlation between: corruption_index and order with a correlation coefficient of 0.9545074 
## Significant correlation between: cost_index and order with a correlation coefficient of -0.8191973

# Install gridExtra only when it is missing, instead of reinstalling on every knit.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}

## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)

## 패키지 'gridExtra'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
##  C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages

library(gridExtra)

## 
## 다음의 패키지를 부착합니다: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

# Strongly correlated variable pairs to plot, each given as c(x, y).
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)

# Build one scatter plot per pair.
# aes_string() is deprecated, so columns are looked up by name through the
# .data pronoun instead. aes() evaluates lazily (when the plot is drawn), so
# each plot is built inside its own function call to give it its own x_var /
# y_var; a plain for loop would leave every stored plot pointing at the last
# pair.
plot_list <- lapply(high_corr_pairs, function(pair) {
  x_var <- pair[1]
  y_var <- pair[2]

  ggplot(merged_data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point() +
    labs(x = x_var, y = y_var, title = paste("Scatter plot of", x_var, "and", y_var)) +
    theme_minimal()
})

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Lay the collected scatter plots out on a two-column grid.
grid.arrange(grobs = plot_list, ncol = 2)

## Warning: Removed 39 rows containing missing values (`geom_point()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).
## Removed 39 rows containing missing values (`geom_point()`).

## Warning: Removed 76 rows containing missing values (`geom_point()`).
## Removed 76 rows containing missing values (`geom_point()`).

## Warning: Removed 82 rows containing missing values (`geom_point()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).

# Strongly correlated variable pairs to plot, each given as c(x, y).
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)

# Draw a separate scatter plot for each pair.
# Columns are looked up by name through the .data pronoun because
# aes_string() is deprecated. Each plot is printed inside the loop, so it is
# rendered while var_x / var_y still hold the current pair.
for (pair in high_corr_pairs) {
  var_x <- pair[1]
  var_y <- pair[2]

  print(
    ggplot(merged_data, aes(x = .data[[var_x]], y = .data[[var_y]])) +
      geom_point() +
      labs(title = paste("Relationship between", var_x, "and", var_y),
           x = var_x,
           y = var_y)
  )
}

## Warning: Removed 39 rows containing missing values (`geom_point()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).

## Warning: Removed 76 rows containing missing values (`geom_point()`).

## Warning: Removed 76 rows containing missing values (`geom_point()`).

## Warning: Removed 82 rows containing missing values (`geom_point()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).

# Strongly correlated variable pairs to model, each given as c(x, y).
# NOTE(review): the receipts_in_billions .x/.y pair comes from joining the
# same tourism file twice, and `order` is only the row index -- confirm these
# regressions are wanted.
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)

# For every pair: fit a simple linear regression of y on x, print its
# summary, and draw the scatter plot with the fitted line.
for (pair in high_corr_pairs) {
  var_x <- pair[1]
  var_y <- pair[2]

  # The "Call:" line of summary() only echoes the unevaluated formula
  # expression, so name the response and predictor explicitly.
  cat("\nRegression of", var_y, "on", var_x, "\n")

  # reformulate() builds the formula `var_y ~ var_x` from the column names
  # directly instead of pasting and re-parsing text.
  model <- lm(reformulate(var_x, response = var_y), data = merged_data)

  # Print the fitted model's summary.
  print(summary(model))

  # Scatter plot with the least-squares line. Columns are looked up through
  # the .data pronoun because aes_string() is deprecated; the smoothing
  # formula is spelled out (it is the default geom_smooth() reports using).
  print(
    ggplot(merged_data, aes(x = .data[[var_x]], y = .data[[var_y]])) +
      geom_point() +
      geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
      labs(title = paste("Regression between", var_x, "and", var_y),
           x = var_x,
           y = var_y)
  )
}

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.6891  -5.5876  -0.7898   6.1839  23.2769 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.392e+01  1.050e+00   70.43   <2e-16 ***
## annual_income -8.560e-04  3.591e-05  -23.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.901 on 108 degrees of freedom
## Multiple R-squared:  0.8403, Adjusted R-squared:  0.8388 
## F-statistic: 568.2 on 1 and 108 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.070  -9.086  -2.445   9.032  36.377 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.467e+01  2.214e+00   15.66   <2e-16 ***
## annual_income 1.194e-03  6.106e-05   19.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.42 on 69 degrees of freedom
##   (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.8471, Adjusted R-squared:  0.8448 
## F-statistic: 382.1 on 1 and 69 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Removed 39 rows containing missing values (`geom_point()`).

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.335 -10.793  -1.223   7.775  53.471 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      131.30004    4.87201   26.95   <2e-16 ***
## corruption_index  -1.33101    0.08903  -14.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.66 on 69 degrees of freedom
##   (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.7641, Adjusted R-squared:  0.7607 
## F-statistic: 223.5 on 1 and 69 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Removed 39 rows containing missing values (`geom_point()`).

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.543  -7.205  -2.089   3.840  34.579 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.265e+01  1.878e+00   6.736 3.99e-09 ***
## annual_income 1.162e-03  5.181e-05  22.427  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.38 on 69 degrees of freedom
##   (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.8794, Adjusted R-squared:  0.8776 
## F-statistic:   503 on 1 and 69 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).
## Removed 39 rows containing missing values (`geom_point()`).

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19017  -9503  -3041   4254  46538 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.697e+04  6.540e+03   2.595   0.0142 *  
## annual_income 8.784e-01  1.264e-01   6.947 7.23e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15440 on 32 degrees of freedom
##   (결측으로 인하여 76개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.6013, Adjusted R-squared:  0.5888 
## F-statistic: 48.26 on 1 and 32 DF,  p-value: 7.226e-08

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 76 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 76 rows containing missing values (`geom_point()`).

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20862  -4775   -948   2259  45372 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -10358.13    6871.37  -1.507    0.142    
## purchasing_power_index    979.77      93.56  10.473 7.24e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11620 on 32 degrees of freedom
##   (결측으로 인하여 76개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.7741, Adjusted R-squared:  0.7671 
## F-statistic: 109.7 on 1 and 32 DF,  p-value: 7.238e-12

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 76 rows containing non-finite values (`stat_smooth()`).
## Removed 76 rows containing missing values (`geom_point()`).

## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.983e-15 -1.004e-15 -4.832e-16 -3.186e-16  1.929e-14 
## 
## Coefficients:
##                         Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)            2.686e-15  9.741e-16 2.757e+00   0.0105 *  
## receipts_in_billions.x 1.000e+00  4.071e-17 2.456e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.976e-15 on 26 degrees of freedom
##   (결측으로 인하여 82개의 관측치가 삭제되었습니다.)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 6.034e+32 on 1 and 26 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 82 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 82 rows containing missing values (`geom_point()`).

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.247 -15.766  -3.267  16.101  47.187 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   73.5213049  2.3645522   31.09   <2e-16 ***
## annual_income -0.0010480  0.0000809  -12.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.05 on 108 degrees of freedom
## Multiple R-squared:  0.6084, Adjusted R-squared:  0.6048 
## F-statistic: 167.8 on 1 and 108 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.269 -10.527   2.672   8.967  17.157 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -24.18457    3.08999  -7.827 3.64e-12 ***
## corruption_index   1.34602    0.04891  27.522  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.32 on 108 degrees of freedom
## Multiple R-squared:  0.8752, Adjusted R-squared:  0.8741 
## F-statistic: 757.4 on 1 and 108 DF,  p-value: < 2.2e-16

## `geom_smooth()` using formula = 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(var_y, "~", var_x)), data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.774 -12.949  -2.954   8.847  56.750 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 79.83644    4.74882   16.81  < 2e-16 ***
## cost_index  -0.60084    0.06502   -9.24 1.09e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.53 on 69 degrees of freedom
##   (결측으로 인하여 39개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.5531, Adjusted R-squared:  0.5466 
## F-statistic: 85.39 on 1 and 69 DF,  p-value: 1.092e-13

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 39 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).

# Variable pairs with high correlation. Each entry is c(x_variable, y_variable).
high_corr_pairs <- list(
  c("annual_income", "corruption_index"),
  c("annual_income", "cost_index"),
  c("corruption_index", "cost_index"),
  c("annual_income", "purchasing_power_index"),
  c("annual_income", "gdp_per_capita"),
  c("purchasing_power_index", "gdp_per_capita"),
  c("receipts_in_billions.x", "receipts_in_billions.y"),
  c("annual_income", "order"),
  c("corruption_index", "order"),
  c("cost_index", "order")
)

# For each pair: fit a simple linear regression (y ~ x) and visualise its
# residuals as a histogram and as a residual-vs-observation-index scatter plot.
for (pair in high_corr_pairs) {
  var_x <- pair[1]
  var_y <- pair[2]

  # Fit the linear model; lm() drops rows where either variable is missing,
  # so the residual vector can be shorter than merged_data.
  model <- lm(formula(paste(var_y, "~", var_x)), data = merged_data)

  # Put the residuals in named columns so the plots map real data columns
  # instead of re-evaluating residuals(model) inside aes().
  model_residuals <- unname(residuals(model))
  residual_data <- data.frame(
    index = seq_along(model_residuals),
    residual = model_residuals
  )

  # Histogram of residuals. The residual scale differs by many orders of
  # magnitude between pairs, so a fixed bin count is used rather than a fixed
  # bin width (a hard-coded width of 0.5 only suits one particular scale).
  residual_histogram <- ggplot(residual_data, aes(x = residual)) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
    labs(
      title = paste("Residuals Histogram of", var_x, "and", var_y),
      x = "Residuals",
      y = "Frequency"
    )
  print(residual_histogram)

  # Residuals against observation index, with a reference line at zero.
  residual_scatter <- ggplot(residual_data, aes(x = index, y = residual)) +
    geom_point() +
    geom_hline(yintercept = 0, color = "red") +
    labs(
      title = paste("Residuals Scatter Plot of", var_x, "and", var_y),
      x = "Index",
      y = "Residuals"
    )
  print(residual_scatter)
}

# Install caret only when it is missing, so knitting the document does not
# re-download and reinstall the package on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}

## 'C:/Users/passi/AppData/Local/R/win-library/4.3'의 위치에 패키지(들)을 설치합니다.
## (왜냐하면 'lib'가 지정되지 않았기 때문입니다)

## 패키지 'caret'를 성공적으로 압축해제하였고 MD5 sums 이 확인되었습니다
## 
## 다운로드된 바이너리 패키지들은 다음의 위치에 있습니다
##  C:\Users\passi\AppData\Local\Temp\RtmpCQrCwt\downloaded_packages

# 필요한 패키지를 로드합니다.
library(caret)

## 필요한 패키지를 로딩중입니다: lattice

# Split the data into 80% training and 20% test rows, partitioned on
# annual_income. The seed makes the split reproducible.
set.seed(123)
splitIndex <- createDataPartition(merged_data$annual_income, p = .8,
                                  list = FALSE,
                                  times = 1)
train_data <- merged_data[splitIndex, ]
test_data <- merged_data[-splitIndex, ]

# Fit a linear regression of annual_income on purchasing_power_index.
# lm() drops training rows where either variable is missing.
lm_model <- lm(annual_income ~ purchasing_power_index, data = train_data)

# Predict annual_income for the test rows. Rows with a missing
# purchasing_power_index get an NA prediction.
predictions <- predict(lm_model, newdata = test_data)

# Absolute prediction error per test row (NA where no prediction exists).
errors <- abs(test_data$annual_income - predictions)

# Attach the errors to the test rows.
test_data_with_errors <- cbind(test_data, errors)

# Keep only rows that actually have an error value, then take the three
# largest. head() returns fewer rows when fewer than three are available,
# whereas indexing with [1:3, ] would pad the result with all-NA rows.
scored_rows <- test_data_with_errors[!is.na(test_data_with_errors$errors), ]
top_3_errors <- head(scored_rows[order(-scored_rows$errors), ], n = 3)

# Print the three countries with the largest prediction error.
print(top_3_errors[, c("country", "annual_income", "errors")])

##        country annual_income   errors
## 7 Saudi Arabia         22270 26790.48
## 3        Qatar         57120 25247.90
## 5       Israel         49560 13822.49

kufa_data_1

jaehyunkim

2023-10-26

R Markdown

Including Plots