문제 1
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Point the session at the folder that holds day.csv, then load the file.
# NOTE(review): setwd() to an absolute path ties the script to one machine;
# kept as-is here because later relative paths may depend on it.
setwd("C:/Users/kyeeu/OneDrive/Desktop")
raw_data <- read.csv("day.csv")

# Work with the first 80 rows only, keeping four predictors plus the
# target column cnt.
keep_cols <- c("temp", "atemp", "hum", "windspeed", "cnt")
df <- raw_data[1:80, keep_cols]

# Split on cnt with p = 0.7 (about 70% of rows go to training); the seed is
# fixed so the partition is reproducible across runs.
set.seed(1)
idx <- createDataPartition(df[["cnt"]], p = 0.7, list = FALSE)
train_set <- df[idx, ]
test_set <- df[-idx, ]
히스토그램 생성
# Draw the cnt distributions of the training and test sets side by side as a
# visual check that the split produced similar distributions.
# par() returns the settings it replaced; they are restored afterwards so the
# 1x2 layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
hist(train_set$cnt, main = "Train Set CNT", col = "gray")
hist(test_set$cnt, main = "Test Set CNT", col = "white")
par(old_par)

문제 2
# Numbers of cross-validation repetitions to compare.
repeat_list <- c(1, 5, 10, 20, 30)

# One slot per setting, keyed "rep<n>", filled in by the loop below.
knn_results <- setNames(
  vector("list", length(repeat_list)),
  paste0("rep", repeat_list)
)

for (i in repeat_list) {
  # 5-fold cross-validation, repeated i times.
  my_ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = i)

  # Reseed before every fit so each run starts from the same RNG state.
  set.seed(1)
  knn_fit <- train(
    cnt ~ .,
    data = train_set,
    method = "knn",
    preProcess = c("center", "scale"),
    trControl = my_ctrl
  )

  knn_results[[paste0("rep", i)]] <- knn_fit
}
결과 확인
# Inspect the kNN fit obtained with 30 CV repetitions: plot the fitted
# object, then show the tuning value stored in its bestTune element.
rep30_fit <- knn_results[["rep30"]]
plot(rep30_fit)

print(rep30_fit$bestTune)
## k
## 3 9
문제 3
# Linear-regression baseline, resampled with the same scheme as the rep30
# kNN fit (5-fold CV, 30 repeats). The seed matches the one used before the
# kNN fits, which is intended to put both models on the same resamples.
final_ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 30)
set.seed(1)
lm_fit <- train(
  cnt ~ .,
  data = train_set,
  method = "lm",
  trControl = final_ctrl
)

# Gather the resampling results of both models and summarise them.
models_to_compare <- list(kNN = knn_results[["rep30"]], Linear = lm_fit)
comparison <- resamples(models_to_compare)
summary(comparison)
##
## Call:
## summary.resamples(object = comparison)
##
## Models: kNN, Linear
## Number of resamples: 150
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 201.4747 297.8030 344.9278 345.6582 389.7652 552.4630 0
## Linear 141.5368 231.1167 284.5558 283.9545 334.3250 415.8336 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 223.9069 365.3751 439.4457 436.3938 502.9441 688.2007 0
## Linear 174.3903 288.6731 348.8385 358.8649 422.9685 547.0752 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 0.012900932 0.2482154 0.4029453 0.3870312 0.5375121 0.9239330 0
## Linear 0.002146765 0.4823014 0.6034514 0.5860778 0.7265960 0.9035185 0
RMSE 차이
# Box-and-whisker plot of the resampling results held in `comparison`
# (kNN vs. Linear), to compare the spread of each model visually.
bwplot(comparison)

문제 4