Seoul Housing Price Prediction: Machine Learning Models
Preparing packages & setting the working directory
setwd("C:/BigData2024")
library(tidyverse)
library(MASS)
library(caret)
library(stargazer)
library(GGally)
library(car)
library(mediation)
library(glmnet)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(randomForest)
library(xgboost)
The original explanatory linear regression model
seoul_scaled <- read.csv("seoul_scaled_data.csv")
lm.perfect <- lm(price ~ income + library + school + crime + academy + mart + department, data = seoul_scaled)
summary(lm.perfect)
Call:
lm(formula = price ~ income + library + school + crime + academy +
mart + department, data = seoul_scaled)
Residuals:
Min 1Q Median 3Q Max
-331.15 -121.70 19.70 91.28 382.06
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1554.87 41.14 37.796 < 2e-16 ***
income 374.43 62.19 6.021 1.38e-05 ***
library 140.63 62.52 2.249 0.0380 *
school -129.66 57.99 -2.236 0.0390 *
crime 186.02 65.00 2.862 0.0108 *
academy 161.75 58.74 2.754 0.0136 *
mart -133.70 69.51 -1.923 0.0714 .
department 138.37 61.83 2.238 0.0389 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 205.7 on 17 degrees of freedom
Multiple R-squared: 0.9244, Adjusted R-squared: 0.8932
F-statistic: 29.69 on 7 and 17 DF, p-value: 2.485e-08
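The package car is loaded above but not otherwise used in this section; a quick multicollinearity check would support interpreting the coefficients of lm.perfect. A minimal sketch, not part of the original analysis:
# Variance inflation factors for the baseline model; values well below 5
# would suggest multicollinearity is not distorting the estimates above
vif(lm.perfect)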
I-1. Developing the multiple linear regression into a predictive model
n_simulations <- 100 # increase to 10000 in the final version
r_squared_results <- numeric(n_simulations)
for (i in 1:n_simulations) {
trainIndex <- createDataPartition(seoul_scaled$price, p = .8,
list = FALSE,
times = 1)
trainData <- seoul_scaled[trainIndex, ]
testData <- seoul_scaled[-trainIndex, ]
model_train <- lm(price ~ income + library + school + crime + academy + mart + department,
data = trainData)
predictions <- predict(model_train, testData)
valid_indices <- complete.cases(predictions, testData$price)
if (sum(valid_indices) > 0) {
metrics <- postResample(pred = predictions[valid_indices],
obs = testData$price[valid_indices])
r_squared_results[i] <- metrics["Rsquared"]
} else {
r_squared_results[i] <- NA
}
}
print(summary(r_squared_results, na.rm = TRUE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.2596  0.8105  0.8925  0.8369  0.9494  0.9996
stargazer(model_train, lm.perfect,
type = "text",
title = "Model Comparison: Train (80%) vs Full (100%)",
align = TRUE,
column.labels = c("model_train (80%)", "lm.perfect (100%)")
)
Model Comparison: Train (80%) vs Full (100%)
=================================================================
Dependent variable:
---------------------------------------------
price
model_train (80%) lm.perfect (100%)
(1) (2)
-----------------------------------------------------------------
income 364.072*** 374.427***
(68.041) (62.191)
library 120.892* 140.626**
(68.077) (62.522)
school -153.463** -129.663**
(66.771) (57.988)
crime 227.821*** 186.019**
(75.094) (65.002)
academy 158.337* 161.745**
(81.897) (58.742)
mart -163.769** -133.695*
(75.171) (69.514)
department 155.847* 138.370**
(74.514) (61.830)
Constant 1,558.878*** 1,554.869***
(46.311) (41.139)
-----------------------------------------------------------------
Observations 21 25
R2 0.937 0.924
Adjusted R2 0.903 0.893
Residual Std. Error 209.009 (df = 13) 205.693 (df = 17)
F Statistic 27.514*** (df = 7; 13) 29.687*** (df = 7; 17)
=================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
- The data were split 80:20 and the simulation was repeated many times, to guard against conclusions driven by one lucky (or unlucky) split
- stargazer() was used to compare the coefficients and statistical significance of the final model against the original explanatory model
- The result is a predictive model whose R-squared is nearly identical to, or only slightly below, that of the explanatory model; some variables lost a little statistical significance, but nothing shifted dramatically (the sketch below visualizes the full R-squared distribution)
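As referenced above, the distribution of the simulated out-of-sample R-squared values can be visualized in the same style as the lambda histograms used later. A minimal sketch, not part of the original analysis:
# Histogram of out-of-sample R-squared across the 100 train/test splits
hist(r_squared_results, breaks = 30, col = "skyblue",
     main = "Distribution of out-of-sample R-squared (100 splits)",
     xlab = "R-squared")
abline(v = mean(r_squared_results, na.rm = TRUE), col = "red", lwd = 2, lty = 2)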
I-2. Additional validation with Lasso regression
x_var <- seoul_scaled %>%
dplyr::select(-price, -region) %>%
as.matrix()
y_var <- seoul_scaled$price
n_repeats <- 100 # increase to 10000 in the final version
best_lambdas_min <- numeric(n_repeats)
best_lambdas_1se <- numeric(n_repeats)
for(i in 1:n_repeats) {
cv_fit <- cv.glmnet(x_var, y_var, alpha = 1, nfolds = 5)
best_lambdas_min[i] <- cv_fit$lambda.min
best_lambdas_1se[i] <- cv_fit$lambda.1se
}
avg_lambda_min <- mean(best_lambdas_min)
avg_lambda_1se <- mean(best_lambdas_1se)
print(paste("Mean Lambda (min):", round(avg_lambda_min, 4)))[1] "Mean Lambda (min): 36.7813"
print(paste("Mean Lambda (1se):", round(avg_lambda_1se, 4)))[1] "Mean Lambda (1se): 156.6784"
hist(best_lambdas_min, breaks = 30, col = "orange",
     main = "Distribution of optimal lambda (Lasso, min rule)",
     xlab = "Lambda")
abline(v = avg_lambda_min, col = "red", lwd = 2, lty = 2)
# Model using the min-rule lambda
final_model_min <- glmnet(x_var, y_var, alpha = 1, lambda = avg_lambda_min)
print(coef(final_model_min))
11 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 1554.86920
income 393.40063
library 27.37676
park .
subway 25.63078
school -18.31688
medical .
crime 77.74637
academy 63.28709
mart .
department 102.09923
# Model using the 1SE-rule lambda
final_model_1se <- glmnet(x_var, y_var, alpha = 1, lambda = avg_lambda_1se)
print(coef(final_model_1se))
11 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 1554.869200
income 358.235869
library .
park .
subway 6.108687
school .
medical .
crime 13.918678
academy .
mart .
department 56.013848
- Additional check #1, addressing the possibility that the variable-selection criterion of stepAIC() is unreliable
- The 80:20 split and mass-repetition principles were kept, and the distribution and mean of the optimal lambda values were visualized with a histogram
- The min-rule model yields an equation similar to the original linear regression, except that mart drops out and subway enters; the Lasso apparently judges that subway, though small in practical magnitude, makes a genuine contribution to housing prices
- The 1SE-rule model zeroes out every variable except income, subway, crime, and department, leaving it too sparse to be informative (the sketch below tabulates which predictors each rule keeps)
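A convenience sketch (not part of the original analysis) that tabulates which predictors the min rule and the 1SE rule keep, read directly off the two fitted glmnet models:
# Compare the variables each lambda rule retains (nonzero coefficients)
coef_min <- as.matrix(coef(final_model_min))
coef_1se <- as.matrix(coef(final_model_1se))
selection <- data.frame(
  variable = rownames(coef_min),
  min_rule = coef_min[, 1],
  se_rule  = coef_1se[, 1]
)
selection <- selection[selection$variable != "(Intercept)", ]
selection$kept_min <- selection$min_rule != 0   # kept under the min rule
selection$kept_1se <- selection$se_rule  != 0   # kept under the 1SE rule
print(selection)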
I-3. Additional validation with Ridge regression
x_ridge <- seoul_scaled %>%
dplyr::select(-price, -region) %>%
as.matrix()
y_ridge <- seoul_scaled$price
n_repeats_ridge <- 100 # increase to 10000 in the final version
ridge_lambdas_min <- numeric(n_repeats_ridge)
ridge_lambdas_1se <- numeric(n_repeats_ridge)
for(i in 1:n_repeats_ridge) {
cv_ridge <- cv.glmnet(x_ridge, y_ridge, alpha = 0, nfolds = 5)
ridge_lambdas_min[i] <- cv_ridge$lambda.min
ridge_lambdas_1se[i] <- cv_ridge$lambda.1se
}
avg_lambda_ridge_min <- mean(ridge_lambdas_min)
avg_lambda_ridge_1se <- mean(ridge_lambdas_1se)
print(paste("Ridge Mean Lambda (min):", round(avg_lambda_ridge_min, 4)))[1] "Ridge Mean Lambda (min): 135.1911"
print(paste("Ridge Mean Lambda (1se):", round(avg_lambda_ridge_1se, 4)))[1] "Ridge Mean Lambda (1se): 454.2104"
hist(ridge_lambdas_min, breaks = 30, col = "lightgreen",
     main = "Distribution of optimal lambda (Ridge, min rule)",
     xlab = "Lambda")
abline(v = avg_lambda_ridge_min, col = "darkgreen", lwd = 2, lty = 2)
# Model using the min-rule lambda
final_model_ridge_min <- glmnet(x_ridge, y_ridge, alpha = 0, lambda = avg_lambda_ridge_min)
print(coef(final_model_ridge_min))
11 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 1554.869200
income 292.567678
library 86.936370
park 13.698374
subway 65.246534
school -85.980058
medical -5.472103
crime 123.761566
academy 92.597798
mart -23.611486
department 121.180591
# Model using the 1SE-rule lambda
final_model_ridge_1se <- glmnet(x_ridge, y_ridge, alpha = 0, lambda = avg_lambda_ridge_1se)
print(coef(final_model_ridge_1se))
11 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 1554.869200
income 205.261768
library 74.109275
park 6.356733
subway 74.917953
school -46.474231
medical -4.800626
crime 89.771332
academy 56.260002
mart 29.485236
department 109.814219
- Additional check #2, addressing the possibility that the variable-selection criterion of stepAIC() is unreliable
- The 80:20 split and mass-repetition principles were kept, and the distribution and mean of the optimal lambda values were visualized with a histogram
- In the min-rule model, the variables that the earlier models had removed all receive very small coefficients; the Ridge check thus, somewhat ironically, ends up confirming the robustness of those models (see the coefficient-path sketch below)
- The 1SE-rule model shows the same pattern as the min-rule model
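The shrinkage pattern is easiest to see on the full ridge coefficient path, where all ten predictors shrink smoothly toward zero as lambda grows and none is dropped outright. A visualization sketch, not part of the original analysis:
# Ridge coefficient path with the two averaged lambdas marked (log scale)
ridge_path <- glmnet(x_ridge, y_ridge, alpha = 0)
plot(ridge_path, xvar = "lambda", label = TRUE)
abline(v = log(avg_lambda_ridge_min), col = "darkgreen", lty = 2)
abline(v = log(avg_lambda_ridge_1se), col = "darkgreen", lty = 3)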
I-4. Multiple linear regression prediction model on the expanded data
seoul_expanded <- read.csv("seoul_expanded.csv")
n_simulations <- 100 # increase to 10000 in the final version
r_squared_results_expanded <- numeric(n_simulations)
for (i in 1:n_simulations) {
trainIndex_expanded <- createDataPartition(seoul_expanded$price, p = .8,
list = FALSE,
times = 1)
trainData_expanded <- seoul_expanded[trainIndex_expanded, ]
testData_expanded <- seoul_expanded[-trainIndex_expanded, ]
model_train_expanded <- lm(price ~ income + library + school + crime + academy + mart + department,
data = trainData_expanded)
predictions_expanded <- predict(model_train_expanded, testData_expanded)
valid_indices_expanded <- complete.cases(predictions_expanded, testData_expanded$price)
if (sum(valid_indices_expanded) > 0) {
metrics <- postResample(pred = predictions_expanded[valid_indices_expanded],
obs = testData_expanded$price[valid_indices_expanded])
r_squared_results_expanded[i] <- metrics["Rsquared"]
} else {
r_squared_results_expanded[i] <- NA
}
}
print(summary(r_squared_results_expanded, na.rm = TRUE))
stargazer(model_train_expanded, lm.perfect,
type = "text",
title = "Expanded Model Comparison: Train (80%) vs Full (100%)",
align = TRUE,
column.labels = c("model_train_expanded (80%)", "lm.perfect (100%)")
)
Expanded Model Comparison: Train (80%) vs Full (100%)
==================================================================
Dependent variable:
----------------------------------------------
price
model_train_expanded (80%) lm.perfect (100%)
(1) (2)
------------------------------------------------------------------
income 386.897*** 374.427***
(30.768) (62.191)
library 161.907*** 140.626**
(36.441) (62.522)
school -150.103*** -129.663**
(38.654) (57.988)
crime 182.759*** 186.019**
(39.438) (65.002)
academy 208.682*** 161.745**
(34.095) (58.742)
mart -177.285*** -133.695*
(40.918) (69.514)
department 150.747*** 138.370**
(31.052) (61.830)
Constant 1,557.914*** 1,554.869***
(18.583) (41.139)
------------------------------------------------------------------
Observations 80 25
R2 0.928 0.924
Adjusted R2 0.921 0.893
Residual Std. Error 160.482 (df = 72) 205.693 (df = 17)
F Statistic 131.953*** (df = 7; 72) 29.687*** (df = 7; 17)
==================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
- To test the robustness of the multiple linear regression model, the data were expanded fourfold with SMOTE and the predictive model was rebuilt (a SMOTE-style expansion is sketched below for illustration)
- The 80:20 split and mass-repetition principles were kept, and stargazer() was used to compare against the original explanatory model lm.perfect
- In conclusion, the original multiple linear regression predictive model withstands the data expansion well and can be called stable
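seoul_expanded.csv is read in ready-made, and the exact SMOTE variant used to build it is not shown. For illustration only, here is a minimal interpolation-based augmentation in the spirit of SMOTE (synthetic rows sampled on segments between nearest neighbours); the actual file was not necessarily produced this way:
# SMOTE-style augmentation sketch: each synthetic row lies on the segment
# between an original row and one of its k nearest neighbours
smote_like_expand <- function(df, times = 3, k = 5) {
  num <- df[sapply(df, is.numeric)]       # interpolate numeric columns only
  d <- as.matrix(dist(scale(num)))        # pairwise distances, standardized
  synthetic <- vector("list", times * nrow(num))
  idx <- 1
  for (rep_i in seq_len(times)) {
    for (i in seq_len(nrow(num))) {
      nn <- order(d[i, ])[2:(k + 1)]      # k nearest neighbours (skip self)
      j <- sample(nn, 1)
      gap <- runif(1)                     # random point on the segment i -> j
      synthetic[[idx]] <- num[i, ] + gap * (num[j, ] - num[i, ])
      idx <- idx + 1
    }
  }
  rbind(num, do.call(rbind, synthetic))
}
# 25 original rows * (1 + 3) = 100 rows, matching a fourfold expansion
# seoul_like <- smote_like_expand(seoul_scaled, times = 3, k = 5)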
II-1. Random Forest prediction model on the expanded data
seoul_expanded <- read.csv("seoul_expanded.csv")
n_simulations <- 100
r_squared_results_rf <- numeric(n_simulations)
for (i in 1:n_simulations) {
trainIndex <- createDataPartition(seoul_expanded$price, p = .8,
list = FALSE,
times = 1)
trainData <- seoul_expanded[trainIndex, ]
testData <- seoul_expanded[-trainIndex, ]
model_rf <- randomForest(
price ~ income + library + school + crime + academy + mart + department,
data = trainData,
ntree = 500,
mtry = 3,
importance = TRUE
)
predictions <- predict(model_rf, testData)
valid_indices <- complete.cases(predictions, testData$price)
if (sum(valid_indices) > 0) {
metrics <- postResample(pred = predictions[valid_indices],
obs = testData$price[valid_indices])
r_squared_results_rf[i] <- metrics["Rsquared"]
} else {
r_squared_results_rf[i] <- NA
}
}
print(summary(r_squared_results_rf, na.rm = TRUE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.8544  0.9348  0.9530  0.9473  0.9650  0.9887
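The loop above fits every forest with importance = TRUE but never inspects the result. A quick look at the variable importance of a single fit (an illustrative sketch, not part of the original analysis):
# Fit once on the full expanded data and display importance measures
imp_rf <- randomForest(
  price ~ income + library + school + crime + academy + mart + department,
  data = seoul_expanded, ntree = 500, mtry = 3, importance = TRUE
)
importance(imp_rf)   # %IncMSE and IncNodePurity per predictor
varImpPlot(imp_rf, main = "Random Forest variable importance")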
ntree_candidates <- c(100, 300, 500)
mtry_candidates <- c(2, 3, 4, 5)
n_simulations <- 100
results <- data.frame(
ntree = integer(),
mtry = integer(),
mean_r2 = numeric(),
sd_r2 = numeric()
)
for (nt in ntree_candidates) {
for (mt in mtry_candidates) {
r2_vec <- numeric(n_simulations)
for (i in 1:n_simulations) {
trainIndex <- createDataPartition(seoul_expanded$price, p = .8,
list = FALSE, times = 1)
trainData <- seoul_expanded[trainIndex, ]
testData <- seoul_expanded[-trainIndex, ]
model_rf <- randomForest(
price ~ income + library + school + crime + academy + mart + department,
data = trainData,
ntree = nt,
mtry = mt,
importance = FALSE
)
pred <- predict(model_rf, testData)
r2_vec[i] <- caret::R2(pred, testData$price)
}
results <- rbind(
results,
data.frame(
ntree = nt,
mtry = mt,
mean_r2 = mean(r2_vec),
sd_r2 = sd(r2_vec)
)
)
cat("Done: ntree =", nt, "mtry =", mt, "\n")
}
}
Done: ntree = 100 mtry = 2
Done: ntree = 100 mtry = 3
Done: ntree = 100 mtry = 4
Done: ntree = 100 mtry = 5
Done: ntree = 300 mtry = 2
Done: ntree = 300 mtry = 3
Done: ntree = 300 mtry = 4
Done: ntree = 300 mtry = 5
Done: ntree = 500 mtry = 2
Done: ntree = 500 mtry = 3
Done: ntree = 500 mtry = 4
Done: ntree = 500 mtry = 5
print(results)
   ntree mtry   mean_r2      sd_r2
1 100 2 0.9311317 0.04207385
2 100 3 0.9544609 0.02233696
3 100 4 0.9486837 0.02941395
4 100 5 0.9487468 0.02741801
5 300 2 0.9419190 0.03140889
6 300 3 0.9512875 0.02705556
7 300 4 0.9568116 0.02169445
8 300 5 0.9551677 0.02529897
9 500 2 0.9364969 0.03774420
10 500 3 0.9512071 0.02615142
11 500 4 0.9537619 0.02684548
12 500 5 0.9572942 0.02614672
best_result <- results[which.max(results$mean_r2), ]
cat(" ntree =", best_result$ntree, "\n") ntree = 500
cat(" mtry =", best_result$mtry, "\n") mtry = 5
cat(" 평균 R² =", round(best_result$mean_r2, 4), "\n") 평균 R² = 0.9573
II-2. Ensemble model combining XGBoost and Random Forest
set.seed(123456)
idx <- createDataPartition(seoul_expanded$price, p = 0.8, list = FALSE)
trainData <- seoul_expanded[idx, ]
testData <- seoul_expanded[-idx, ]
fitControl <- trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
model_xgb <- train(
price ~ income + library + school + crime + academy + mart + department,
data = trainData,
method = "xgbTree",
trControl = fitControl,
tuneLength = 3,
verbose = FALSE
)
model_rf <- randomForest(
price ~ income + library + school + crime + academy + mart + department,
data = trainData,
ntree = 100
)
pred_rf <- predict(model_rf, testData)
pred_xgb <- predict(model_xgb, testData)
pred_ensemble <- (pred_rf + pred_xgb) / 2
calc_rmse <- function(actual, pred) { sqrt(mean((actual - pred)^2)) }
rmse_rf <- calc_rmse(testData$price, pred_rf)
rmse_xgb <- calc_rmse(testData$price, pred_xgb)
rmse_ens <- calc_rmse(testData$price, pred_ensemble)
cat(sprintf("1. Random Forest : %.2f \n", rmse_rf),
sprintf("2. XGBoost : %.2f \n", rmse_xgb),
sprintf("3. Ensemble (Mix) : %.2f \n", rmse_ens),
sep = "")1. Random Forest : 145.29
2. XGBoost : 99.06
3. Ensemble (Mix) : 108.19
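The simple 50:50 average lands between the two base models here (108.19) rather than beating XGBoost alone (99.06). A standard refinement is a weighted average; the sketch below scans the weight over the existing test predictions. Note that choosing the weight on the test set itself is optimistic, so a separate validation split would be the cleaner design. Not part of the original analysis:
# Scan ensemble weights w * XGBoost + (1 - w) * Random Forest for the
# RMSE-minimizing mix on the held-out predictions computed above
weights <- seq(0, 1, by = 0.05)
rmse_by_w <- sapply(weights, function(w) {
  calc_rmse(testData$price, w * pred_xgb + (1 - w) * pred_rf)
})
best_w <- weights[which.min(rmse_by_w)]
cat(sprintf("Best weight on XGBoost: %.2f (RMSE %.2f)\n",
            best_w, min(rmse_by_w)))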