
Problem 1
library(MASS)
library(ggplot2)
true_beta <- c(0.5, 1.2, -1) # true parameter values
N_values <- c(200, 500, 800, 1000) # sample sizes to compare
R <- 200 # number of simulation replications
tolerance <- 1e-5 # convergence tolerance
set.seed(42) # set the random seed for reproducibility
# Define the sigmoid (inverse logit) function
sigmoid <- function(x) {
  1 / (1 + exp(-x))
}
# Newton-Raphson algorithm for logistic regression
newton_raphson_logistic <- function(X, Y, beta_init, tolerance = 1e-5) {
  beta <- beta_init
  for (i in 1:100) { # cap at 100 iterations
    p <- sigmoid(X %*% beta) # fitted probabilities
    W <- diag(as.vector(p * (1 - p))) # weight matrix
    gradient <- t(X) %*% (Y - p) # score vector (gradient of the log-likelihood)
    hessian <- t(X) %*% W %*% X # negative Hessian of the log-likelihood (Fisher information)
    step <- tryCatch(solve(hessian, gradient), # Newton-Raphson step; NULL if the Hessian is singular
                     error = function(e) NULL)
    if (is.null(step)) break # stop early if the update cannot be computed
    beta_new <- beta + step
    if (max(abs(beta_new - beta)) < tolerance) { # convergence check
      return(beta_new)
    }
    beta <- beta_new
  }
  return(beta)
}
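As a quick sanity check, the hand-coded Newton-Raphson estimates can be compared against R's built-in glm() on a single simulated dataset. The snippet below is an illustrative addition, not part of the assignment code; the objects X_check and Y_check and the local set.seed call exist only for this check.
# Sanity check (illustrative): Newton-Raphson should match glm() up to numerical tolerance
set.seed(123)
X_check <- cbind(1, mvrnorm(500, mu = c(0, 0), Sigma = diag(2)))
Y_check <- rbinom(500, 1, sigmoid(X_check %*% true_beta))
beta_nr  <- newton_raphson_logistic(X_check, Y_check, rep(0, 3))
beta_glm <- coef(glm(Y_check ~ X_check[, -1], family = binomial))
cbind(newton_raphson = as.vector(beta_nr), glm = as.vector(beta_glm))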
# Run the simulation and store the results
results <- list()
for (N in N_values) {
  errors <- matrix(NA, nrow = R, ncol = length(true_beta)) # estimation errors for this sample size
  for (r in 1:R) {
    # Generate X and Y
    X <- cbind(1, mvrnorm(N, mu = c(0, 0), Sigma = diag(2))) # design matrix with an intercept column
    logits <- X %*% true_beta # linear predictor
    p <- sigmoid(logits) # convert to probabilities
    Y <- rbinom(N, 1, p) # draw the binary response Y from these probabilities
    # Estimate beta with the Newton-Raphson algorithm
    beta_init <- rep(0, 3) # initial guess for beta
    beta_est <- newton_raphson_logistic(X, Y, beta_init, tolerance)
    # Store the estimation error for each parameter
    errors[r, ] <- beta_est - true_beta
  }
  results[[as.character(N)]] <- errors
}
# Convert the results to a data frame for plotting with ggplot
errors_df <- do.call(rbind, lapply(names(results), function(N) {
  data.frame(N = as.integer(N),
             Parameter = rep(paste0("beta[", 0:2, "]"), each = R), # plotmath labels for label_parsed
             Error = as.vector(results[[N]]))
}))
# Boxplots of the estimation errors with ggplot2
ggplot(errors_df, aes(x = factor(N), y = Error)) +
  geom_boxplot() +
  facet_wrap(~ Parameter, scales = "free", labeller = label_parsed) +
  labs(title = "Estimation error of each parameter across sample sizes",
       x = "Sample size (N)",
       y = "Estimation error") +
  theme_minimal()
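Beyond the boxplots, a compact numeric summary of bias and root-mean-squared error per parameter and sample size makes the shrinkage of the errors with N explicit. This is an illustrative addition built on the errors_df object above.
# Bias and RMSE of the estimates by parameter and sample size (illustrative summary)
summary_stats <- aggregate(Error ~ N + Parameter, data = errors_df,
                           FUN = function(e) c(bias = mean(e), rmse = sqrt(mean(e^2))))
summary_stats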

Problem 2
# Read in the data
train_data <- read.csv("C:/Users/Lenovo/Desktop/新建文件夹/大三上/机器学习/sampledata.csv")
test_data <- read.csv("C:/Users/Lenovo/Desktop/新建文件夹/大三上/机器学习/preddata.csv")
# Check for right skew and log-transform right-skewed columns before plotting
# (note: the imputation and transformation happen inside the function and are not
# written back to train_data; they only affect the boxplots)
boxplot_columns <- function(data, columns) {
  par(mfrow = c(2, 4)) # arrange the plots in a 2 x 4 grid
  for (col in columns) {
    if (sum(is.na(data[[col]])) > 0) {
      data[[col]][is.na(data[[col]])] <- median(data[[col]], na.rm = TRUE) # median imputation
    }
    if (e1071::skewness(data[[col]]) > 0.5) {
      data[[col]] <- log1p(data[[col]]) # log-transform right-skewed columns
      # note: log1p() yields -Inf/NaN for values <= -1, which triggers the boxplot warning below
    }
    boxplot(data[[col]], main = col)
  }
}
# Draw the boxplots
library(e1071) # used to compute skewness
##
## Attaching package: 'e1071'
## The following object is masked _by_ '.GlobalEnv':
##
## sigmoid
boxplot_columns(train_data, c("tenure", "expense", "degree", "tightness", "entropy", "chgexpense", "chgdegree"))
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : Outlier (-Inf) in boxplot 1 is not drawn
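The warning above arises because log1p() is applied to right-skewed columns that contain values less than or equal to -1 (the change variables can be negative), producing -Inf. One possible guard, sketched below under the assumption that such columns should simply be left untransformed, checks the column minimum before transforming; the function name safe_log1p_skewed is illustrative.
# Illustrative guard: only log-transform right-skewed columns whose values all exceed -1,
# so log1p() cannot produce -Inf/NaN
safe_log1p_skewed <- function(data, columns, skew_cutoff = 0.5) {
  for (col in columns) {
    x <- data[[col]]
    x[is.na(x)] <- median(x, na.rm = TRUE) # median imputation
    if (e1071::skewness(x) > skew_cutoff && min(x) > -1) {
      x <- log1p(x) # safe: the argument of log() is always positive
    }
    data[[col]] <- x
  }
  data # return the transformed data frame so the changes persist
}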

# Drop the ID variable and standardize the predictors
standardized_train <- as.data.frame(scale(train_data[, !colnames(train_data) %in% c("churn", "ID")])) # drop ID and the response, convert back to a data frame
train_data_standardized <- cbind(standardized_train, churn = train_data$churn)
# Fit the logistic regression model
logistic_model <- glm(churn ~ ., data = train_data_standardized, family = binomial)
summary(logistic_model)
##
## Call:
## glm(formula = churn ~ ., family = binomial, data = train_data_standardized)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.05338 0.07212 -70.067 < 2e-16 ***
## tenure -0.24767 0.06070 -4.080 4.50e-05 ***
## expense -0.29229 0.05904 -4.951 7.39e-07 ***
## degree -0.73751 0.13172 -5.599 2.15e-08 ***
## tightness -0.22660 0.04254 -5.327 9.99e-08 ***
## entropy -0.35176 0.07208 -4.880 1.06e-06 ***
## chgexpense -0.16071 0.04893 -3.284 0.00102 **
## chgdegree -0.38279 0.05208 -7.349 1.99e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6475.5 on 48284 degrees of freedom
## Residual deviance: 5757.6 on 48277 degrees of freedom
## AIC: 5773.6
##
## Number of Fisher Scoring iterations: 8
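Because the predictors were standardized, each coefficient is the change in the log-odds of churn per one-standard-deviation increase in that predictor. Exponentiating gives odds ratios; the snippet below is an illustrative addition using Wald intervals from confint.default().
# Odds ratios per one-SD increase in each predictor, with Wald confidence intervals (illustrative)
exp(cbind(OR = coef(logistic_model), confint.default(logistic_model)))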
# Predict on the training set
train_predictions <- predict(logistic_model, newdata = train_data_standardized, type = "response")
# Drop the ID variable and standardize the test set
standardized_test <- as.data.frame(scale(test_data[, !colnames(test_data) %in% c("churn", "ID")]))
test_data_standardized <- cbind(standardized_test, churn = test_data$churn)
# Predict on the test set
test_predictions <- predict(logistic_model, newdata = test_data_standardized, type = "response")
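Note that the test predictors above are scaled with the test set's own means and standard deviations. If the test data should instead be put on exactly the scale the model was trained on, the training-set means and SDs can be reused; a minimal sketch follows, where the names predictor_cols, train_center and train_scale are illustrative.
# Illustrative alternative: standardize the test predictors with the training-set means and SDs
predictor_cols <- setdiff(colnames(train_data), c("churn", "ID"))
train_center <- colMeans(train_data[, predictor_cols])
train_scale  <- apply(train_data[, predictor_cols], 2, sd)
standardized_test_alt <- as.data.frame(scale(test_data[, predictor_cols],
                                             center = train_center, scale = train_scale))
test_predictions_alt <- predict(logistic_model, newdata = standardized_test_alt, type = "response")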
# Compute ROC curves and AUC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Training set
roc_train <- roc(train_data$churn, train_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_train <- auc(roc_train)
plot.roc(roc_train, main = paste("Training Set ROC Curve - AUC:", round(auc_train, 2)))

# Test set
roc_test <- roc(test_data$churn, test_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_test <- auc(roc_test)
plot.roc(roc_test, main = paste("Test Set ROC Curve - AUC:", round(auc_test, 2)))
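A natural follow-up is to turn the predicted probabilities into class labels. The sketch below is an illustrative addition: it picks the Youden-optimal threshold from the training ROC curve with pROC::coords() and cross-tabulates test-set predictions against observed churn.
# Pick a classification threshold from the training ROC curve and evaluate it on the test set (illustrative)
best_threshold <- coords(roc_train, "best", ret = "threshold")[[1]]
best_threshold
table(predicted = as.integer(test_predictions > best_threshold),
      actual = test_data$churn)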
