library(pROC);library(readr);library(glmnet)
y <- c(0,1,0,1,1)
yhat_prob <- c(0.1,0.2,0.4,0.7,0.9)
plot.roc(y , yhat_prob)


Call:
plot.roc.default(x = y, predictor = yhat_prob)

Data: yhat_prob in 2 controls (y 0) < 3 cases (y 1).
Area under the curve: 0.8333
auc <- pROC::auc(y , yhat_prob)
auc
Area under the curve: 0.8333

Problem 5

首先將Data讀入
Training data 及 testing data 皆讀入R中
分別命名為df_train 及 df_test
var_names <- c(paste0("X", 1:280), "Y")
df_train <- read_csv("data/BlogFeedback/blogData_train.csv",col_names = var_names)
file_all <- list.files("data/BlogFeedback")
file_all <- paste0("data/BlogFeedback/", file_all)
file_all <- file_all[-61]
list_df_test <- lapply(file_all, read_csv, col_names = var_names)
df_test <- do.call(rbind, list_df_test)
Step 1 :logistic lasso regression
首先,新建立一個變項名為Y_Class,先將Data的Y分為兩類,0 (Y=0) 及 1 (Y>0)
再以此類別變項Y_Class建立模型
由於變項數目相當多,故選用regularization中的lasso regression作為模型建立的方法
以此logistic lasso regression建立之模型,再應用回training data上,
可得其Y_Class=1之機率,稱之yhat_train_prob
並以此模型續作分類,可得預估之類別,稱之yhat_train_class
df_train$Y_class <- as.factor(ifelse(df_train$Y == 0 , 0 , 1))
x_train_class <- model.matrix(Y_class  ~. , df_train[,-281])[, -1]
y_train_class <- df_train$Y_class
rst_class <- cv.glmnet(x_train_class, y_train_class, alpha = 1, 
                  family="binomial", nfolds = 15,
                  type.measure = "auc")
yhat_train_prob <- predict(rst_class, s = rst_class$lambda.min, 
                           newx = x_train_class,type = "response")
yhat_train_class <- predict(rst_class, s = rst_class$lambda.min,
                            newx = x_train_class,type = "class")
Step 2:lasso regression
將上述之yhat_train_class與原df_train結合,於df_train得一新變項,稱之yhat_class,
並以yhat_class,subset出yhat_class=1之Data,稱之df_train_reg
續以df_train_reg作為建立模型之資料,由於變項數目仍舊眾多之故,仍選用lasso regression
而本次資料立論上應為Count data,故選用family=“poisson”之方式建立Model
模型建立完畢後,應用至training data(df_train_reg),可估出其yhat,稱之yhat_train_reg
df_train$yhat_class <- as.numeric(yhat_train_class)
df_train_reg <- subset(df_train,df_train$yhat_class == 1)
x_train_reg <- model.matrix(Y ~. , df_train_reg[,-c(282,283)])[, -1]
y_train_reg <- df_train_reg$Y
rst_reg <- cv.glmnet(x_train_reg, y_train_reg, alpha = 1, 
                  family="poisson", nfolds = 15,
                  type.measure = "deviance")
yhat_train_reg<- predict(rst_reg , s = rst_reg$lambda.min, newx = x_train_reg,type = "response")
df_train$yhat_reg <- ifelse(row.names(df_train) %in% row.names(yhat_train_reg),yhat_train_reg,0)
rsquared_regression_Train <- 1 - mean((df_train_reg$Y - yhat_train_reg)^2)/var(df_train_reg$Y)
auc_train <- pROC::auc(df_train$Y_class ,yhat_train_prob)
Results : Training Data
training data的結果如下,分別以
Step 1 之 AUC 及
Step 2 之 regression R squared 呈現 (只有在step 1 被選取出來的data有計入R-Squared)
print(paste("AUC in Training Sets :",auc_train))
[1] "AUC in Training Sets : 0.843887543028147"
print(paste("R Squared in Training Sets :",rsquared_regression_Train))
[1] "R Squared in Training Sets : 0.550569997348758"
Testing Data
將上述於Training data建立之模型應用至Testing data
步驟皆與上述雷同
df_test$Y_class  <- as.factor(ifelse(df_test$Y == 0 , 0 , 1))
x_test_class <- model.matrix(Y_class  ~. , df_test[,-281])[, -1]
y_test_class <- df_test$Y_class 
yhat_test_prob <- predict(rst_class, s = rst_class$lambda.min, 
                           newx = x_test_class,type = "response")
yhat_test_class <- predict(rst_class, s = rst_class$lambda.min,
                            newx = x_test_class,type = "class")
df_test$yhat_class <- as.numeric(yhat_test_class)
df_test_reg <- subset(df_test,df_test$yhat_class == 1)
x_test_reg <- model.matrix(Y ~. , df_test_reg[,-c(282,283)])[, -1]
y_test_reg <- df_test_reg$Y
yhat_test_reg<- predict(rst_reg , s = rst_reg$lambda.min, newx = x_test_reg,type = "response")
df_test$yhat_reg <- ifelse(row.names(df_test) %in% row.names(yhat_test_reg),yhat_test_reg,0)
rsquared_regression_Test <- 1 - mean((df_test_reg$Y - yhat_test_reg)^2)/var(df_test_reg$Y)
auc_test <- pROC::auc(df_test$Y_class ,yhat_test_prob )
Results : Testing Data
testing data的結果如下,分別以
Step 1 之 AUC 及
Step 2 之 regression R squared 呈現 (只有在step 1 被選取出來的data有計入R-Squared)
print(paste("AUC in Testing Sets :",auc_test))
[1] "AUC in Testing Sets : 0.834797275270716"
print(paste("R Squared in Testing Sets :",rsquared_regression_Test))
[1] "R Squared in Testing Sets : 0.360870137954522"