题1

library(MASS)  
library(ggplot2)
true_beta <- c(0.5, 1.2, -1)  # 真正的参数值
N_values <- c(200, 500, 800, 1000)  # 不同的样本量
R <- 200  # 模拟的轮数
tolerance <- 1e-5  # 收敛的容差
set.seed(42)  # 设置随机种子以确保结果可重复
# 定义sigmoid函数
sigmoid <- function(x) {
  1 / (1 + exp(-x))
}

# 实现逻辑回归的Newton-Raphson算法
newton_raphson_logistic <- function(X, Y, beta_init, tolerance = 1e-5) {
  beta <- beta_init
  for (i in 1:100) {  # 最大迭代次数为100次
    p <- sigmoid(X %*% beta)  # 计算概率p
    W <- diag(as.vector(p * (1 - p)))  # 权重矩阵
    gradient <- t(X) %*% (Y - p)  # 梯度
    hessian <- t(X) %*% W %*% X  # Hessian矩阵
    try({
      beta_new <- beta + solve(hessian) %*% gradient  # Newton-Raphson更新
    }, silent = TRUE)
    if (max(abs(beta_new - beta)) < tolerance) {  # 检查收敛条件
      return(beta_new)
    }
    beta <- beta_new
  }
  return(beta)
}
# 模拟并存储结果
results <- list()
for (N in N_values) {
  errors <- matrix(NA, nrow = R, ncol = length(true_beta))  # 存储每个样本量下的误差
  for (r in 1:R) {
    # 生成X和Y
    X <- cbind(1, mvrnorm(N, mu = c(0, 0), Sigma = diag(2)))  # 添加截距项
    logits <- X %*% true_beta  # 计算线性组合
    p <- sigmoid(logits)  # 转换为概率
    Y <- rbinom(N, 1, p)  # 根据概率生成二元响应变量Y
    
    # 使用Newton-Raphson算法估计beta
    beta_init <- rep(0, 3)  # beta的初始猜测
    beta_est <- newton_raphson_logistic(X, Y, beta_init, tolerance)
    
    # 存储每个参数的估计误差
    errors[r, ] <- beta_est - true_beta
  }
  results[[as.character(N)]] <- errors
}
# 转换结果为数据框,便于使用ggplot绘图
errors_df <- do.call(rbind, lapply(names(results), function(N) {
  data.frame(N = as.integer(N),
             Parameter = rep(paste0("beta_", 0:2), each = R),
             Error = as.vector(results[[N]]))
}))

# 使用ggplot2绘制箱线图
ggplot(errors_df, aes(x = factor(N), y = Error)) +
  geom_boxplot() +
  facet_wrap(~ Parameter, scales = "free", labeller = label_parsed) +
  labs(title = "不同样本量下每个参数的估计误差",
       x = "样本量 (N)",
       y = "估计误差") +
  theme_minimal()

题2

# 读入数据
train_data <- read.csv("C:/Users/Lenovo/Desktop/新建文件夹/大三上/机器学习/sampledata.csv")
test_data <- read.csv("C:/Users/Lenovo/Desktop/新建文件夹/大三上/机器学习/preddata.csv")
# 判断是否右偏,并对右偏数据取对数
boxplot_columns <- function(data, columns) {
  par(mfrow = c(2, 4))  # 设置多图显示
  
  for (col in columns) {
    if (sum(is.na(data[[col]])) > 0) {
      data[[col]][is.na(data[[col]])] <- median(data[[col]], na.rm = TRUE)
    }
    if (e1071::skewness(data[[col]]) > 0.5) {
      data[[col]] <- log1p(data[[col]])  # 对右偏数据取对数
    }
    boxplot(data[[col]], main = col)
  }
}


# 绘制箱线图
library(e1071)  # 用于计算偏度
## 
## 载入程辑包:'e1071'
## The following object is masked _by_ '.GlobalEnv':
## 
##     sigmoid
boxplot_columns(train_data, c("tenure", "expense", "degree", "tightness", "entropy", "chgexpense", "chgdegree"))
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : boxplot1里没有画离群值(-Inf)

# 去除ID变量并标准化自变量
standardized_train <- as.data.frame(scale(train_data[, !colnames(train_data) %in% c("churn", "ID")]))  # 去除ID和因变量并转为数据框
train_data_standardized <- cbind(standardized_train, churn = train_data$churn)

# 建立逻辑回归模型
logistic_model <- glm(churn ~ ., data = train_data_standardized, family = binomial)
summary(logistic_model)
## 
## Call:
## glm(formula = churn ~ ., family = binomial, data = train_data_standardized)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.05338    0.07212 -70.067  < 2e-16 ***
## tenure      -0.24767    0.06070  -4.080 4.50e-05 ***
## expense     -0.29229    0.05904  -4.951 7.39e-07 ***
## degree      -0.73751    0.13172  -5.599 2.15e-08 ***
## tightness   -0.22660    0.04254  -5.327 9.99e-08 ***
## entropy     -0.35176    0.07208  -4.880 1.06e-06 ***
## chgexpense  -0.16071    0.04893  -3.284  0.00102 ** 
## chgdegree   -0.38279    0.05208  -7.349 1.99e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6475.5  on 48284  degrees of freedom
## Residual deviance: 5757.6  on 48277  degrees of freedom
## AIC: 5773.6
## 
## Number of Fisher Scoring iterations: 8
# 预测训练集
train_predictions <- predict(logistic_model, newdata = train_data_standardized, type = "response")

# 去除ID变量并标准化测试集
standardized_test <- as.data.frame(scale(test_data[, !colnames(test_data) %in% c("churn", "ID")]))
test_data_standardized <- cbind(standardized_test, churn = test_data$churn)

# 预测测试集
test_predictions <- predict(logistic_model, newdata = test_data_standardized, type = "response")

# 计算ROC和AUC
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## 载入程辑包:'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# 训练集
roc_train <- roc(train_data$churn, train_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_train <- auc(roc_train)
plot.roc(roc_train, main = paste("Training Set ROC Curve - AUC:", round(auc_train, 2)))

# 测试集
roc_test <- roc(test_data$churn, test_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_test <- auc(roc_test)
plot.roc(roc_test, main = paste("Test Set ROC Curve - AUC:", round(auc_test, 2)))