setwd("~/stats/kaggle/cancer/")
train <- read.csv('~/Downloads/Training.csv')
test <- read.csv('~/Downloads/Test.csv')
id <- test$Patient_ID
names(test) <- names(train)
comb <- rbind(train,test)
# Transform some numerical varaibles to categorial
comb$Local_tumor_recurrence <- as.factor(comb$Local_tumor_recurrence)
comb$T_category <- as.factor(comb$T_category)
comb$KM_Overall_survival_censor <- as.factor(comb$KM_Overall_survival_censor)
comb$Local_tumor_recurrence <- as.factor(comb$Local_tumor_recurrence)
train <- comb[1:150,]
test <- comb[-c(1:150),]
attach(train)
plot_pred_type_distribution <- function(df, y, threshold){
require(ggplot2)
#Fill NAs with baseline prediction
df$pred[is.na(df$pred)] <- floor(sum(df$label)/length(df$label))
v <- rep(NA, nrow(df))
v <- ifelse(df$pred >= threshold & df[,y] == 1, "TP", v)
v <- ifelse(df$pred >= threshold & df[,y] == 0, "FP", v)
v <- ifelse(df$pred < threshold & df[,y] == 1, "FN", v)
v <- ifelse(df$pred < threshold & df[,y] == 0, "TN", v)
df$pred_type <- v
ggplot(data=df, aes(x=df[,y], y=pred)) +
geom_violin(fill=rgb(1,1,1,alpha=.6), color=NA) +
geom_jitter(aes(color=pred_type), alpha=.6) +
geom_hline(yintercept=threshold, color="red", alpha=.6) +
scale_color_discrete(name="type") +
labs(title=sprintf("Threshold at %.2f", threshold))
}
calculate_roc <- function(pred, label, cost_of_fp, cost_of_fn, n=100) {
'
Function that calculates the ROC for given predictions and labels
@param: predictions (pred), labels (label), cost for FPs (cost_of_fp), cost of FNs (cost_of_fn), Number of steps regarding threshlevel p-value (n)
@return ROC for given preds/labels and sequence of threshlevel steps
'
tpr <- function(df, threshold) {
sum(pred >= threshold & label == 1) / sum(label == 1)
}
fpr <- function(df, threshold) {
sum(pred >= threshold & label == 0) / sum(label == 0)
}
cost <- function(df, threshold, cost_of_fp, cost_of_fn) {
sum(pred >= threshold & label == 0) * cost_of_fp +
sum(pred < threshold & label == 1) * cost_of_fn
}
roc <- data.frame(threshold = seq(0,1,length.out=n), tpr=NA, fpr=NA)
roc$tpr <- sapply(roc$threshold, function(th) tpr(df, th))
roc$fpr <- sapply(roc$threshold, function(th) fpr(df, th))
roc$cost <- sapply(roc$threshold, function(th) cost(df, th, cost_of_fp, cost_of_fn))
return(roc)
}
# I didn't change anything regarding this plot function
plot_roc <- function(roc, threshold, cost_of_fp, cost_of_fn) {
library(gridExtra)
library(grid)
norm_vec <- function(v) (v - min(v))/diff(range(v))
idx_threshold = which.min(abs(roc$threshold-threshold))
col_ramp <- colorRampPalette(c("green","orange","red","black"))(100)
col_by_cost <- col_ramp[ceiling(norm_vec(roc$cost)*99)+1]
p_roc <- ggplot(roc, aes(fpr,tpr)) +
geom_line(color=rgb(0,0,1,alpha=0.3)) +
geom_point(color=col_by_cost, size=4, alpha=0.5) +
coord_fixed() +
geom_line(aes(threshold,threshold), color=rgb(0,0,1,alpha=0.5)) +
labs(title = sprintf("ROC")) + xlab("FPR") + ylab("TPR") +
geom_hline(yintercept=roc[idx_threshold,"tpr"], alpha=0.5, linetype="dashed") +
geom_vline(xintercept=roc[idx_threshold,"fpr"], alpha=0.5, linetype="dashed") +
scale_y_continuous(breaks=seq(0,1,by=.05)) +
scale_x_continuous(breaks=seq(0,1,by=.075))
p_cost <- ggplot(roc, aes(threshold, cost)) +
geom_line(color=rgb(0,0,1,alpha=0.3)) +
geom_point(color=col_by_cost, size=4, alpha=0.5) +
labs(title = sprintf("cost function")) +
geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
sub_title <- sprintf("threshold at %.2f - cost of FP = %d, cost of FN = %d", threshold, cost_of_fp, cost_of_fn)
grid.arrange(p_roc, p_cost, ncol=2, sub=textGrob(sub_title, gp=gpar(cex=1), just="bottom"))
}
brier_score <- function(preds, outcomes){
sum((preds - outcomes)^2)/length(preds)
}
fit <- glm(Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor+Smoking_Pack.Years , train, family="binomial")
summary(fit)
##
## Call:
## glm(formula = Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor +
## Smoking_Pack.Years, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8621 -0.3352 -0.2165 -0.1538 2.7441
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.85118 0.82072 -3.474 0.000513 ***
## Tumor_subsiteGPS -15.61000 1999.95208 -0.008 0.993772
## Tumor_subsiteOther 1.54783 1.29252 1.198 0.231099
## Tumor_subsitePharyngeal_wall -14.13526 6522.63866 -0.002 0.998271
## Tumor_subsiteSoft_palate 3.25427 1.41590 2.298 0.021540 *
## Tumor_subsiteTonsil 0.68932 0.80761 0.854 0.393360
## KM_Overall_survival_censor1 -1.57963 0.74453 -2.122 0.033868 *
## Smoking_Pack.Years 0.02841 0.01420 2.000 0.045501 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 77.238 on 140 degrees of freedom
## Residual deviance: 57.275 on 133 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: 73.275
##
## Number of Fisher Scoring iterations: 17
par(mfrow=c(2,2))
plot(fit)
## Warning: not plotting observations with leverage one:
## 16
## Warning: not plotting observations with leverage one:
## 16
library(car)
## Warning: package 'car' was built under R version 3.2.5
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(fit)
## GVIF Df GVIF^(1/(2*Df))
## Tumor_subsite 1.107306 5 1.010245
## KM_Overall_survival_censor 1.082878 1 1.040614
## Smoking_Pack.Years 1.039950 1 1.019780
train_pred <- predict(fit, train, type="response")
# Create Prediction/Label DataFrame
df <- data.frame(cbind(label = as.numeric(as.character(train$Local_tumor_recurrence)), pred = ifelse(is.na(train_pred), 0, train_pred)))
# Brier score
brier_score(df$pred, df$label)
## [1] 0.06114459
# Plot pred type distribution
par(mfrow=c(2,2))
plot_pred_type_distribution(df, "label", 0.1)
plot_pred_type_distribution(df, "label", 0.3)
plot_pred_type_distribution(df, "label", 0.7)
#AUROC
require(ROCR)
## Loading required package: ROCR
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
t <- ifelse(df$pred > .09, 1, 0)
auroc <- prediction(t, df$label)
perform <- performance(auroc, "tpr", "fpr")
plot(perform)
require(pROC)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
auc(t, df$label)
## Area under the curve: 0.6235
#Plot AUROC
plot_roc(calculate_roc(df$pred, df$label, 1,2), .7, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .5, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .3, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .1, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .08, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2, n=100), .09, 1,2)
#index <- which(!(test$Tumor_subsite %in% labels(train$Tumor_subsite)))
pred <- predict(fit, newdata=test, type="response")
df <- data.frame(pred = pred)
label <- as.factor(ifelse(pred>=.09, 1, 0))
summary(label)
## 0 1 NA's
## 136 22 7
label[is.na(label)] <- 0
#LogSolution <- data.frame(Patient_ID = test$Patient_ID, Local_tumor_recurrence = label)
#write.csv(LogSolution, "LogSolution075.csv", row.names = F)
require(Matrix)
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.2.5
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
comb <- read.csv("imputedComb.csv")
str(comb)
## 'data.frame': 315 obs. of 18 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 1 2 1 2 ...
## $ Age_at_diagnosis : int 58 78 57 56 60 66 72 77 71 50 ...
## $ Race : Factor w/ 5 levels "American_Indian/Alaska_Native",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Tumor_side : Factor w/ 3 levels "Bilateral","L",..: 2 3 3 3 2 3 2 3 2 3 ...
## $ Tumor_subsite : Factor w/ 6 levels "BOT","GPS","Other",..: 6 1 6 1 6 1 1 6 6 1 ...
## $ T_category : int 2 3 1 2 2 1 4 1 1 3 ...
## $ N_category : Factor w/ 7 levels "0","1","2a","2b",..: 1 1 4 4 4 2 1 3 3 6 ...
## $ AJCC_Stage : Factor w/ 5 levels "I","II","III",..: 2 3 4 4 4 3 4 4 4 4 ...
## $ Pathological_grade : Factor w/ 6 levels "I","I-II","II",..: 5 3 5 5 3 5 3 5 1 3 ...
## $ Smoking_status_at_diagnosis : Factor w/ 3 levels "Current","Former",..: 2 2 1 3 3 3 2 3 2 3 ...
## $ Smoking_Pack.Years : int 5 70 30 0 0 0 26 0 50 0 ...
## $ Radiation_treatment_course_duration : int 41 48 42 45 54 39 47 42 43 43 ...
## $ Total_prescribed_Radiation_treatment_dose: int 66 70 66 66 70 66 70 70 66 72 ...
## $ X._Radiation_treatment_fractions : int 30 35 30 30 33 30 33 30 30 40 ...
## $ Induction_Chemotherapy : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 2 ...
## $ Concurrent_chemotherapy : Factor w/ 2 levels "N","Y": 1 2 2 2 2 2 2 1 1 1 ...
## $ KM_Overall_survival_censor : int 1 0 1 1 1 1 1 1 1 1 ...
# Fix facotrs
comb$KM_Overall_survival_censor <- as.factor(comb$KM_Overall_survival_censor)
comb$T_category <- as.factor(comb$T_category)
# scale
#comb$Age_at_diagnosis <- scale(comb$Age_at_diagnosis)
#comb[,12:15] <- scale(comb[,12:15])
#str(comb)
y <- read.csv("y.csv")
t <- 1:150
train <- comb[t, ]
train$X <- as.factor(y$x)
test <- comb[-t, ]
id <- read.csv("PatientID.csv")$x
# Function for scaling numeric vars of a DF
scale_df <- function(df, y){
numeric_cols <- sapply(df, is.numeric)
cbind(scale(df[,numeric_cols]), df[, !numeric_cols])
}
comb <- scale_df(comb[,-1])
dummies <- model.matrix( ~ . , data=comb)[,-1]
# redundant steps, ignore. was just checkin something
x <- data.frame(dummies)
x <- as.matrix(x)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.2.4
## Loading required package: foreach
## Loaded glmnet 2.0-5
##
## Attaching package: 'glmnet'
## The following object is masked from 'package:pROC':
##
## auc
glmmod <- glmnet(x[1:150,],y=as.factor(y$x), alpha=1, family='binomial')
#plot variable coefficients vs. shrinkage parameter lambda.
par(mfrow=c(1,1))
plot(glmmod,xvar="lambda")
#grid()
coef(glmmod)[,10]
## (Intercept)
## -2.1065221
## Age_at_diagnosis
## 0.0000000
## Smoking_Pack.Years
## 0.1377487
## Radiation_treatment_course_duration
## 0.0000000
## Total_prescribed_Radiation_treatment_dose
## 0.0000000
## X._Radiation_treatment_fractions
## 0.0000000
## GenderMale
## 0.0000000
## RaceAsian
## 0.0000000
## RaceBlack
## 0.0000000
## RaceHispanic
## 0.0000000
## RaceWhite
## 0.0000000
## Tumor_sideL
## 0.0000000
## Tumor_sideR
## 0.0000000
## Tumor_subsiteGPS
## 0.0000000
## Tumor_subsiteOther
## 0.0000000
## Tumor_subsitePharyngeal_wall
## 0.0000000
## Tumor_subsiteSoft_palate
## 1.7906967
## Tumor_subsiteTonsil
## 0.0000000
## T_category2
## 0.0000000
## T_category3
## 0.0000000
## T_category4
## 0.0000000
## N_category1
## 0.0000000
## N_category2a
## 0.0000000
## N_category2b
## 0.0000000
## N_category2c
## 0.4150608
## N_category3
## 0.0000000
## N_categoryX
## 0.0000000
## AJCC_StageII
## 0.0000000
## AJCC_StageIII
## 0.0000000
## AJCC_StageIV
## 0.0000000
## AJCC_StageX
## 0.0000000
## Pathological_gradeI.II
## 0.0000000
## Pathological_gradeII
## 0.0000000
## Pathological_gradeII.III
## 0.0000000
## Pathological_gradeIII
## 0.0000000
## Pathological_gradeIV
## 0.0000000
## Smoking_status_at_diagnosisFormer
## 0.0000000
## Smoking_status_at_diagnosisNever
## 0.0000000
## Induction_ChemotherapyY
## 0.0000000
## Concurrent_chemotherapyY
## 0.0000000
## KM_Overall_survival_censor1
## -0.8232407
cv.glmmod <- cv.glmnet(x[1:150,],y=y$x,alpha=1)
best_lambda <- cv.glmmod$lambda.min
#?glmnet
best_lambda
## [1] 0.02500581
plot(cv.glmmod)
best <- glmnet(x[t,], y=as.factor(y$x), alpha=1, lambda = best_lambda, family = "binomial")
coef(best)
## 41 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.2198348
## Age_at_diagnosis .
## Smoking_Pack.Years 0.2692014
## Radiation_treatment_course_duration .
## Total_prescribed_Radiation_treatment_dose .
## X._Radiation_treatment_fractions .
## GenderMale .
## RaceAsian .
## RaceBlack .
## RaceHispanic .
## RaceWhite .
## Tumor_sideL .
## Tumor_sideR .
## Tumor_subsiteGPS .
## Tumor_subsiteOther .
## Tumor_subsitePharyngeal_wall .
## Tumor_subsiteSoft_palate 2.1307174
## Tumor_subsiteTonsil .
## T_category2 0.1689655
## T_category3 .
## T_category4 .
## N_category1 .
## N_category2a .
## N_category2b .
## N_category2c 0.6937694
## N_category3 .
## N_categoryX .
## AJCC_StageII .
## AJCC_StageIII .
## AJCC_StageIV .
## AJCC_StageX .
## Pathological_gradeI.II .
## Pathological_gradeII .
## Pathological_gradeII.III .
## Pathological_gradeIII .
## Pathological_gradeIV .
## Smoking_status_at_diagnosisFormer .
## Smoking_status_at_diagnosisNever .
## Induction_ChemotherapyY .
## Concurrent_chemotherapyY .
## KM_Overall_survival_censor1 -1.0132357
# PREDICT TRAIN LABELS
pred <- predict(cv.glmmod, newx=x[t, ], s="lambda.min")
# Plot ROC
plot_roc(calculate_roc(pred, y$x, 1,2), .1, 1,2)
require(ROCR)
p <- ifelse(pred > .1, 1, 0)
auroc <- prediction(p, y$x)
perform <- performance(auroc, "tpr", "fpr")
plot(perform)
require(pROC)
auc(p, y$x)
## [1] 0.6072944
pred <- predict(cv.glmmod, newx=x[-t, ], s="lambda.min")
labels <- ifelse(pred > .1, 1, 0)
table(labels)
## labels
## 0 1
## 129 36
solution <- data.frame(Patient_ID = id, Local_tumor_recurrence = labels)
colnames(solution)[2] <- "Local_tumor_recurrence"
#write.csv(solution, "AucScale01LassoCONFUSION.csv", row.names = F)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
x_train <- as.data.frame(x[t, ])
x_train$y <- as.factor(y$x)
x_test <- as.data.frame(x[-t, ])
rf.fit <- randomForest(y ~ ., data=x_train, family="class")
varImpPlot(rf.fit)
rf.pred <- predict(rf.fit, x_train)
# Accuracy
prop.table(table(y$x, rf.pred))
## rf.pred
## 0 1
## 0 0.92 0.00
## 1 0.00 0.08
rf.cv <- rfcv(x[t, ], x_train$y, cv.fold=10)
result <- replicate(5, rfcv(x[t, ], x_train$y), simplify=FALSE)
error.cv <- sapply(result, "[[", "error.cv")
matplot(result[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l",
lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x",
xlab="Number of variables", ylab="CV Error")
with(rf.cv, plot(n.var, error.cv))
# take subspace given by varImpPlot
rf.fit <- randomForest(y~ Smoking_Pack.Years + N_category2c + RaceHispanic + KM_Overall_survival_censor1 + Tumor_subsiteSoft_palate, data = x_train, family = "class")
pred <- predict(rf.fit, newdata = x_test, type="class")
rf.solution <- data.frame(Patient_ID = id, Local_tumor_recurrence = pred)
table(pred)
## pred
## 0 1
## 165 0
fit <- glm(y ~ Smoking_Pack.Years + N_category2c + KM_Overall_survival_censor1 + Tumor_subsiteSoft_palate, data=x_train, family="binomial")
summary(fit)
##
## Call:
## glm(formula = y ~ Smoking_Pack.Years + N_category2c + KM_Overall_survival_censor1 +
## Tumor_subsiteSoft_palate, family = "binomial", data = x_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6561 -0.3518 -0.2028 -0.1645 2.9357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3706 0.6181 -3.835 0.000125 ***
## Smoking_Pack.Years 0.5451 0.3197 1.705 0.088234 .
## N_category2c 1.3396 0.6916 1.937 0.052746 .
## KM_Overall_survival_censor1 -1.5441 0.6942 -2.224 0.026137 *
## Tumor_subsiteSoft_palate 2.9417 1.3968 2.106 0.035203 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.631 on 149 degrees of freedom
## Residual deviance: 62.422 on 145 degrees of freedom
## AIC: 72.422
##
## Number of Fisher Scoring iterations: 6
vif(fit)
## Smoking_Pack.Years N_category2c
## 1.015513 1.022182
## KM_Overall_survival_censor1 Tumor_subsiteSoft_palate
## 1.010448 1.014329
pred <- predict(fit, newdata = x_test, type="response")
labels <- ifelse(pred > .1, 1, 0)
glm.sol <- data.frame(Patient_ID = id, Local_tumor_recurrence = labels)
#write.csv(glm.sol, "GLMlassoPreds.csv", row.names = F)
labels <- ifelse(pred > .08, 1, 0)
table(labels)
## labels
## 0 1
## 137 28
nas <- which(is.na(label))
test[nas,c("Tumor_subsite", "KM_Overall_survival_censor", "Smoking_Pack.Years")]
## [1] Tumor_subsite KM_Overall_survival_censor
## [3] Smoking_Pack.Years
## <0 rows> (or 0-length row.names)
fit2 <- glm(Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor + Smoking_Pack.Years, train, family="binomial")
summary(fit2)
##
## Call:
## glm(formula = Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor +
## Smoking_Pack.Years, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8596 -0.3962 -0.2210 -0.1860 2.7675
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.36408 0.72078 -3.280 0.00104 **
## Tumor_subsiteGPS -15.70595 1922.08867 -0.008 0.99348
## Tumor_subsiteOther 1.24873 1.24544 1.003 0.31603
## Tumor_subsitePharyngeal_wall -14.51722 6522.63865 -0.002 0.99822
## Tumor_subsiteSoft_palate 2.96854 1.39466 2.129 0.03330 *
## Tumor_subsiteTonsil 0.24137 0.74755 0.323 0.74679
## KM_Overall_survival_censor1 -1.68477 0.71554 -2.355 0.01855 *
## Smoking_Pack.Years 0.02323 0.01328 1.750 0.08020 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.631 on 149 degrees of freedom
## Residual deviance: 63.774 on 142 degrees of freedom
## AIC: 79.774
##
## Number of Fisher Scoring iterations: 17
pred2 <- predict(fit2, newdata=test, type="response")
label2 <- as.factor(ifelse(pred2>.1, 1, 0))
# Check results
summary(label2)
## 0 1
## 144 21
# Check overlap
prop.table(table(label, label2))
## label2
## label 0 1
## 0 0.860606061 0.006060606
## 1 0.012121212 0.121212121