# Point the session at the project folder so the relative read.csv() paths
# used later resolve, then echo the directory to confirm it.
# NOTE(review): the path is hard-coded to one Windows machine; adjust it
# (or use a project-relative path) before running elsewhere.
setwd("C:/BigData2024")
getwd()
library(tidyverse)
library(MASS)
library(caret)
library(stargazer)
library(GGally)
library(car)
library(mediation)
library(glmnet)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(randomForest)
융합캡스톤디자인 프로젝트
I. 필요한 패키지 준비 & 워크 디렉토리 설정
II. 데이터 불러오고 회귀분석 시행
seoul <- read.csv("seoul_data.csv")
summary(seoul)
region price income library
Length:25 Min. : 797.8 Min. :3202 Min. : 6.00
Class :character 1st Qu.:1106.2 1st Qu.:3495 1st Qu.:12.00
Mode :character Median :1523.0 Median :3803 Median :19.00
Mean :1554.9 Mean :3985 Mean :19.24
3rd Qu.:1718.3 3rd Qu.:4121 3rd Qu.:21.00
Max. :3174.6 Max. :6190 Max. :44.00
park subway school medical
Min. :-1.0770 Min. : 4.00 Min. : 64.0 Min. :-1.59600
1st Qu.:-0.6610 1st Qu.:11.00 1st Qu.:116.0 1st Qu.:-0.53200
Median :-0.2480 Median :16.00 Median :136.0 Median :-0.19300
Mean : 0.0002 Mean :15.64 Mean :157.4 Mean :-0.00672
3rd Qu.: 0.4280 3rd Qu.:19.00 3rd Qu.:205.0 3rd Qu.: 0.31400
Max. : 3.7260 Max. :33.00 Max. :310.0 Max. : 3.45900
crime academy mart department
Min. :1921 Min. :-2.16200 Min. : 2.00 Min. : 2.00
1st Qu.:2642 1st Qu.:-0.49200 1st Qu.: 5.00 1st Qu.: 5.00
Median :3348 Median :-0.01000 Median : 7.00 Median : 7.00
Mean :3449 Mean :-0.00388 Mean : 6.56 Mean : 7.08
3rd Qu.:3834 3rd Qu.: 0.77100 3rd Qu.: 7.00 3rd Qu.: 9.00
Max. :6763 Max. : 2.57400 Max. :13.00 Max. :12.00
lm(price ~ income + subway + crime, data = seoul)
Call:
lm(formula = price ~ income + subway + crime, data = seoul)
Coefficients:
(Intercept) income subway crime
-1277.6670 0.5945 11.2800 0.0833
- 특정 독립변수에 대한 계수가 비정상적으로 높은 문제 발생.
III. 표준화되지 않은 변수들을 표준화
# 이미 정제를 거친 상태이므로 코드 문단 전체 주석 처리
# cols_to_scale <- setdiff(names(seoul), c("region", "price"))
# seoul_scaled <- seoul %>%
# mutate(across(all_of(cols_to_scale), ~ scale(.)))
# summary(seoul_scaled)
# write.csv(seoul_scaled, "seoul_scaled_data.csv", row.names = FALSE)
seoul_scaled <- read.csv("seoul_scaled_data.csv")
IV. 적당한 변수를 골라서 다중회귀분석 Test
lm(price ~ income + subway + crime, data = seoul_scaled)
Call:
lm(formula = price ~ income + subway + crime, data = seoul_scaled)
Coefficients:
(Intercept) income subway crime
1554.87 475.63 81.59 92.17
- 범죄율의 계수가 +인 것이 흥미롭다.
V. 임의로 3/3/4 나누어서 다중회귀분석 후 F-검정
# Three arbitrary predictor groupings (3 / 3 / 4 variables), each regressed
# on price using the scaled data, then printed side by side as a text table.
lm.1 <- lm(formula = price ~ income + library + park,
           data = seoul_scaled)
lm.2 <- lm(formula = price ~ subway + school + medical,
           data = seoul_scaled)
lm.3 <- lm(formula = price ~ crime + academy + mart + department,
           data = seoul_scaled)
stargazer(lm.1, lm.2, lm.3, type = "text")
======================================================================================
Dependent variable:
------------------------------------------------------------------
price
(1) (2) (3)
--------------------------------------------------------------------------------------
income 503.786***
(75.210)
library 117.562
(90.914)
park -87.595
(76.702)
subway 443.892***
(108.101)
school -125.941
(107.431)
medical 6.717
(102.624)
crime 223.533**
(94.993)
academy 66.823
(98.992)
mart -16.580
(126.837)
department 375.475***
(107.686)
Constant 1,554.869*** 1,554.869*** 1,554.869***
(57.432) (99.853) (82.134)
--------------------------------------------------------------------------------------
Observations 25 25 25
R2 0.818 0.450 0.645
Adjusted R2 0.792 0.371 0.574
Residual Std. Error 287.159 (df = 21) 499.267 (df = 21) 410.669 (df = 20)
F Statistic 31.449*** (df = 3; 21) 5.719*** (df = 3; 21) 9.100*** (df = 4; 20)
======================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
- 임의로 묶어서 최적 조합을 찾아내기는 어렵다고 판단하여 다음 단계로.
VI. step() 명령어를 이용해 최적 변수 조합 찾아내기
full_model <- lm(price ~ . - region, data = seoul_scaled)
best_model_object <- step(full_model, direction = "backward", trace = TRUE)
Start: AIC=278.37
price ~ (region + income + library + park + subway + school +
medical + crime + academy + mart + department) - region
Df Sum of Sq RSS AIC
- park 1 122 710543 276.37
- medical 1 853 711273 276.40
- subway 1 6831 717252 276.61
<none> 710420 278.37
- library 1 83570 793990 279.15
- mart 1 122626 833046 280.35
- department 1 165287 875707 281.60
- school 1 172024 882445 281.79
- crime 1 258691 969112 284.13
- academy 1 287793 998213 284.87
- income 1 1410521 2120942 303.71
Step: AIC=276.37
price ~ income + library + subway + school + medical + crime +
academy + mart + department
Df Sum of Sq RSS AIC
- medical 1 742 711285 274.40
- subway 1 8160 718703 274.66
<none> 710543 276.37
- library 1 120878 831421 278.30
- mart 1 151560 862103 279.21
- department 1 167046 877589 279.65
- school 1 192339 902882 280.36
- crime 1 265554 976096 282.31
- academy 1 295955 1006498 283.08
- income 1 1412544 2123087 301.74
Step: AIC=274.4
price ~ income + library + subway + school + crime + academy +
mart + department
Df Sum of Sq RSS AIC
- subway 1 7980 719265 272.68
<none> 711285 274.40
- mart 1 154760 866045 277.32
- department 1 181309 892594 278.07
- library 1 184404 895689 278.16
- school 1 217669 928954 279.07
- crime 1 283990 995275 280.80
- academy 1 297091 1008376 281.12
- income 1 1481714 2192999 300.55
Step: AIC=272.68
price ~ income + library + school + crime + academy + mart +
department
Df Sum of Sq RSS AIC
<none> 719265 272.68
- mart 1 156503 875768 275.60
- school 1 211543 930807 277.12
- department 1 211898 931162 277.13
- library 1 214043 933307 277.19
- academy 1 320782 1040046 279.90
- crime 1 346495 1065759 280.51
- income 1 1533648 2252912 299.22
summary(best_model_object)
Call:
lm(formula = price ~ income + library + school + crime + academy +
mart + department, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-331.15 -121.70 19.70 91.28 382.06
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.87 41.14 37.796 < 2e-16 ***
income 374.43 62.19 6.021 1.38e-05 ***
library 140.63 62.52 2.249 0.0380 *
school -129.66 57.99 -2.236 0.0390 *
crime 186.02 65.00 2.862 0.0108 *
academy 161.75 58.74 2.754 0.0136 *
mart -133.70 69.51 -1.923 0.0714 .
department 138.37 61.83 2.238 0.0389 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 205.7 on 17 degrees of freedom
Multiple R-squared: 0.9244, Adjusted R-squared: 0.8932
F-statistic: 29.69 on 7 and 17 DF, p-value: 2.485e-08
- 결과에 따라 park, medical, subway를 제거.
VII. 최적 조합으로 다중회귀분석
lm.perfect <- lm(price ~ income + library + school + crime + academy + mart + department, data = seoul_scaled)
summary(lm.perfect)
Call:
lm(formula = price ~ income + library + school + crime + academy +
mart + department, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-331.15 -121.70 19.70 91.28 382.06
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.87 41.14 37.796 < 2e-16 ***
income 374.43 62.19 6.021 1.38e-05 ***
library 140.63 62.52 2.249 0.0380 *
school -129.66 57.99 -2.236 0.0390 *
crime 186.02 65.00 2.862 0.0108 *
academy 161.75 58.74 2.754 0.0136 *
mart -133.70 69.51 -1.923 0.0714 .
department 138.37 61.83 2.238 0.0389 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 205.7 on 17 degrees of freedom
Multiple R-squared: 0.9244, Adjusted R-squared: 0.8932
F-statistic: 29.69 on 7 and 17 DF, p-value: 2.485e-08
stargazer(lm.perfect, type = 'text')
===============================================
Dependent variable:
---------------------------
price
-----------------------------------------------
income 374.427***
(62.191)
library 140.626**
(62.522)
school -129.663**
(57.988)
crime 186.019**
(65.002)
academy 161.745**
(58.742)
mart -133.695*
(69.514)
department 138.370**
(61.830)
Constant 1,554.869***
(41.139)
-----------------------------------------------
Observations 25
R2 0.924
Adjusted R2 0.893
Residual Std. Error 205.693 (df = 17)
F Statistic 29.687*** (df = 7; 17)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
- Adjusted R2 계수가 0.893 = 상기 변수들은 집값의 변동성을 89.3% 정도 설명한다
- mart(10% 수준)를 제외한 모든 변수가 5% 유의수준에서 통계적으로 유의하기까지 하니, 설명력은 충분
- park는 그렇다 쳐도, medical과 subway까지 step()에서 제거된(-) 건 다소 의외인데? (step() 출력의 '-'는 계수의 부호가 아니라 해당 변수를 모형에서 제거하는 경우를 뜻함)
VIII. 공원, 의료, 지하철에 대해 단일회귀분석
lm_park <- lm(price ~ park, data = seoul_scaled)
lm_medical <- lm(price ~ medical, data = seoul_scaled)
lm_subway <- lm(price ~ subway, data = seoul_scaled)
summary(lm_park)
Call:
lm(formula = price ~ park, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-756.78 -446.90 -29.59 161.79 1619.16
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.869 128.614 12.089 1.91e-11 ***
park -2.961 131.266 -0.023 0.982
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 643.1 on 23 degrees of freedom
Multiple R-squared: 2.212e-05, Adjusted R-squared: -0.04346
F-statistic: 0.0005088 on 1 and 23 DF, p-value: 0.9822
summary(lm_medical)
Call:
lm(formula = price ~ medical, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-747.4 -400.3 -147.5 190.8 1554.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.87 128.14 12.134 1.77e-11 ***
medical 54.05 130.78 0.413 0.683
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 640.7 on 23 degrees of freedom
Multiple R-squared: 0.00737, Adjusted R-squared: -0.03579
F-statistic: 0.1708 on 1 and 23 DF, p-value: 0.6832
summary(lm_subway)
Call:
lm(formula = price ~ subway, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-723.49 -389.63 -20.58 267.99 1278.02
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.87 98.49 15.786 7.81e-14 ***
subway 404.84 100.53 4.027 0.000526 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 492.5 on 23 degrees of freedom
Multiple R-squared: 0.4135, Adjusted R-squared: 0.388
F-statistic: 16.22 on 1 and 23 DF, p-value: 0.0005258
- 공원은 지극히 작은 음의 값, 의료는 50 정도로 양의 값
- 지하철은 무려 400 정도의 양의 값.
- 적어도 지하철만큼은 추가적인 설명이 필요할 것.
IX. 각 변수 간 상관관계를 산점도로 표시해 한 눈에 알아보기
# Reshape to long form: every column except price and region is stacked into
# a (variable, value) pair so each predictor can be plotted against price.
seoul_long <- seoul_scaled %>%
  pivot_longer(
    cols = -c(price, region),
    names_to = "variable",
    values_to = "value"
  )
ggplot(seoul_long, aes(x = value, y = price)) +
geom_point(alpha = 1.0, color = "orange") +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1.0) +
facet_wrap(~ variable, scales = "free_x") +
labs(title = "평균평당매매가와 독립변수들의 상관관계",
y = "Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
`geom_smooth()` using formula = 'y ~ x'
X. 정말 subway는 불필요한 변수일까?
lm.perfect_plus_subway <- lm(price ~ income + library + school + crime + academy + mart + department + subway, data = seoul_scaled)
summary(lm.perfect_plus_subway)
Call:
lm(formula = price ~ income + library + school + crime + academy +
mart + department + subway, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-322.81 -102.03 19.47 102.09 409.70
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.87 42.17 36.872 < 2e-16 ***
income 370.99 64.26 5.773 2.85e-05 ***
library 134.17 65.88 2.037 0.0586 .
school -132.18 59.74 -2.213 0.0418 *
crime 176.93 70.00 2.527 0.0224 *
academy 157.65 60.98 2.585 0.0199 *
mart -132.99 71.27 -1.866 0.0805 .
department 131.79 65.26 2.020 0.0605 .
subway 27.58 65.09 0.424 0.6774
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 210.8 on 16 degrees of freedom
Multiple R-squared: 0.9252, Adjusted R-squared: 0.8878
F-statistic: 24.74 on 8 and 16 DF, p-value: 1.313e-07
stargazer(lm.perfect_plus_subway, type = 'text')
===============================================
Dependent variable:
---------------------------
price
-----------------------------------------------
income 370.994***
(64.261)
library 134.168*
(65.876)
school -132.185**
(59.737)
crime 176.927**
(70.001)
academy 157.651**
(60.984)
mart -132.985*
(71.275)
department 131.785*
(65.256)
subway 27.578
(65.093)
Constant 1,554.869***
(42.169)
-----------------------------------------------
Observations 25
R2 0.925
Adjusted R2 0.888
Residual Std. Error 210.844 (df = 16)
F Statistic 24.745*** (df = 8; 16)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
AIC(lm.perfect_plus_subway)
[1] 347.3457
AIC(lm.perfect)
[1] 345.6246
BIC(lm.perfect_plus_subway)
[1] 359.5345
BIC(lm.perfect)
[1] 356.5945
- 결론적으로 subway는 그 단일로 보면 price와 높은 상관관계를 보이지만…
- 다른 변수들까지 고려해보면 그 연관이 통계적으로 유의하지는 않다는 것.
- 혹시 다중공선성 문제일 가능성은 없나?
XI. (subway)다중공선성 검증
lm.all <- lm(price ~ income + library + park + subway + school + medical + crime + academy + mart + department, data = seoul_scaled)
vif(lm.perfect)
    income    library     school      crime    academy       mart department
  2.193919   2.217398   1.907412   2.396790   1.957337   2.741062   2.168542
vif(lm.all)
    income    library       park     subway     school    medical      crime
  2.366448   4.979984   3.045025   2.526361   2.314752   2.004135   2.823422
   academy       mart department
  2.115987   3.680751   2.662069
vif(lm.perfect_plus_subway)
    income    library     school      crime    academy       mart department
  2.229366   2.342833   1.926541   2.645431   2.007768   2.742578   2.298937
    subway
  2.287496
- 어느 쪽의 독립변수도 값이 5를 초과하지는 않는다(lm.all의 library가 조금 아슬아슬하지만…)
- 따라서, step() 명령어가 subway를 제거한 이유는 단순 설명력 부족 때문이 맞다.
XII. (subway)매개성 검증
# Mediator model: income regressed on subway (fit on `seoul`, not `seoul_scaled`).
model.m <- lm(formula = income ~ subway, data = seoul)
# Outcome model: price regressed on subway with income held in the model.
model.y <- lm(formula = price ~ subway + income, data = seoul)
med.out <- mediate(model.m, model.y, treat="subway", mediator="income", boot=TRUE, sims=1000)
Running nonparametric bootstrap
summary(med.out)
Causal Mediation Analysis
Nonparametric Bootstrap Confidence Intervals with the Percentile Method
Estimate 95% CI Lower 95% CI Upper p-value
ACME 37.88879 16.23451 64.73576 <2e-16 ***
ADE 18.07800 -0.50495 35.53845 0.062 .
Total Effect 55.96679 25.72191 81.24959 0.002 **
Prop. Mediated 0.67699 0.45224 1.01393 0.002 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Sample Size Used: 25
Simulations: 1000
- subway가 price에 미치는 총효과(Total Effect)는 통계적으로 유의하며, 그중 약 67.7%(Prop. Mediated = 0.677)가 income을 거치는 간접효과(ACME)이다.
- subway 변수의 영향력 대부분은 income 변수가 이미 설명하고 있으며,
- 그 외 다른 변수들(학군, 마트 등)이 나머지를 설명하고 나니,
- subway 변수 자체는 price를 설명하는 데 추가적으로 기여하는 바가 없다 -> step()에서 제거된 이유!
XIII. (다중선형회귀)예측 모델로의 발전
# Repeated hold-out validation of the selected linear model: draw a random
# 80/20 split, fit on the training rows, and record the test-set R-squared.
# The objects from the last iteration (trainData, testData, model_train)
# remain in the workspace after the loop.
set.seed(2024) # fix the RNG so the R-squared summary can be reproduced
n_simulations <- 1000 # switch to 10000 for the final version
# Pre-fill with NA so an iteration without any usable prediction stays missing.
r_squared_results <- rep(NA_real_, n_simulations)
for (i in seq_len(n_simulations)) {
  trainIndex <- createDataPartition(seoul_scaled$price, p = 0.8,
                                    list = FALSE,
                                    times = 1)
  trainData <- seoul_scaled[trainIndex, ]
  testData <- seoul_scaled[-trainIndex, ]
  model_train <- lm(price ~ income + library + school + crime + academy + mart + department,
                    data = trainData)
  predictions <- predict(model_train, newdata = testData)
  # Score only the rows where both the prediction and the observed price exist.
  valid_indices <- complete.cases(predictions, testData$price)
  if (any(valid_indices)) {
    # NOTE(review): per the caret docs, Rsquared here is the squared correlation
    # between predictions and observations; with only a handful of test rows it
    # is a noisy measure, so read the distribution rather than a single value.
    metrics <- postResample(pred = predictions[valid_indices],
                            obs = testData$price[valid_indices])
    r_squared_results[i] <- metrics[["Rsquared"]]
  }
}
print(summary(r_squared_results, na.rm = TRUE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.1657  0.8403  0.9237  0.8678  0.9691  1.0000
# Side-by-side table: model_train as left by the final loop iteration (fit on
# the training split) next to lm.perfect (fit on every row of seoul_scaled).
stargazer(
  model_train,
  lm.perfect,
  type = "text",
  title = "Model Comparison: Train (80%) vs Full (100%)",
  align = TRUE,
  column.labels = c("model_train (80%)", "lm.perfect (100%)")
)
Model Comparison: Train (80%) vs Full (100%)
=================================================================
Dependent variable:
---------------------------------------------
price
model(80%) lm.perfect (100%)
(1) (2)
-----------------------------------------------------------------
income 353.600*** 374.427***
(70.092) (62.191)
library 150.493** 140.626**
(67.142) (62.522)
school -155.565** -129.663**
(70.545) (57.988)
crime 203.774** 186.019**
(75.189) (65.002)
academy 191.876** 161.745**
(79.108) (58.742)
mart -169.331** -133.695*
(77.001) (69.514)
department 156.897* 138.370**
(84.090) (61.830)
Constant 1,578.672*** 1,554.869***
(48.553) (41.139)
-----------------------------------------------------------------
Observations 21 25
R2 0.933 0.924
Adjusted R2 0.897 0.893
Residual Std. Error 214.664 (df = 13) 205.693 (df = 17)
F Statistic 25.872*** (df = 7; 13) 29.687*** (df = 7; 17)
=================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
- 기존 데이터를 8:2로 분류하여 traindata와 testdata 생성 후, traindata로 학습/testdata로 평가
- 해당 과정을 1000번 반복(n_simulations = 1000; 최종본에서는 10000번으로 늘릴 예정) 후, 통계 요약으로 평균과 중앙값, 사분위수 등을 도출
- 처음 보는 데이터의 변동성을 80% 이상의 수준으로 예측해낼 수 있다!
- 설명 모형에서의 경우보다 설명력이 일정 부분 감소했지만, 충분히 만족스러운 결과.
- stargazer를 통한 검증에서도 변수의 방향은 전부 비슷하고, 일부 신뢰성이 감소했지만 그래도 나쁘지 않다.