Load Data

setwd("~/stats/kaggle/cancer/")

train <- read.csv('~/Downloads/Training.csv')
test <- read.csv('~/Downloads/Test.csv')

id <- test$Patient_ID
names(test) <- names(train)
comb <- rbind(train,test)

# Transform some numerical varaibles to categorial
comb$Local_tumor_recurrence <- as.factor(comb$Local_tumor_recurrence)
comb$T_category <- as.factor(comb$T_category)
comb$KM_Overall_survival_censor <- as.factor(comb$KM_Overall_survival_censor)
comb$Local_tumor_recurrence <- as.factor(comb$Local_tumor_recurrence)

train <- comb[1:150,]
test <- comb[-c(1:150),]

attach(train)

Plot prediction type distribution given threshold p value

I got this from https://www.r-bloggers.com/illustrated-guide-to-roc-and-auc/ and extended it

plot_pred_type_distribution <- function(df, y, threshold){
  require(ggplot2)
  
  #Fill NAs with baseline prediction
  df$pred[is.na(df$pred)] <- floor(sum(df$label)/length(df$label))
  
  v <- rep(NA, nrow(df))
  v <- ifelse(df$pred >= threshold & df[,y] == 1, "TP", v)
  v <- ifelse(df$pred >= threshold & df[,y] == 0, "FP", v)
  v <- ifelse(df$pred < threshold & df[,y] == 1, "FN", v)
  v <- ifelse(df$pred < threshold & df[,y] == 0, "TN", v)
  
  df$pred_type <- v
  
  ggplot(data=df, aes(x=df[,y], y=pred)) + 
    geom_violin(fill=rgb(1,1,1,alpha=.6), color=NA) + 
    geom_jitter(aes(color=pred_type), alpha=.6) + 
    geom_hline(yintercept=threshold, color="red", alpha=.6) +
    scale_color_discrete(name="type") + 
    labs(title=sprintf("Threshold at %.2f", threshold))
}

calculate_roc <- function(pred, label, cost_of_fp, cost_of_fn, n=100) {
  '
  Function that calculates the ROC for given predictions and labels
  @param: predictions (pred), labels (label), cost for FPs (cost_of_fp), cost of FNs (cost_of_fn), Number of steps regarding threshlevel p-value (n)
  @return ROC for given preds/labels and sequence of threshlevel steps
  '
  tpr <- function(df, threshold) {
    sum(pred >= threshold & label == 1) / sum(label == 1)
  }
  
  fpr <- function(df, threshold) {
    sum(pred >= threshold & label == 0) / sum(label == 0)
  }
  
  cost <- function(df, threshold, cost_of_fp, cost_of_fn) {
    sum(pred >= threshold & label == 0) * cost_of_fp + 
      sum(pred < threshold & label == 1) * cost_of_fn
  }
  
  roc <- data.frame(threshold = seq(0,1,length.out=n), tpr=NA, fpr=NA)
  roc$tpr <- sapply(roc$threshold, function(th) tpr(df, th))
  roc$fpr <- sapply(roc$threshold, function(th) fpr(df, th))
  roc$cost <- sapply(roc$threshold, function(th) cost(df, th, cost_of_fp, cost_of_fn))
  
  return(roc)
}

# I didn't change anything regarding this plot function
plot_roc <- function(roc, threshold, cost_of_fp, cost_of_fn) {
  library(gridExtra)
  library(grid)
  norm_vec <- function(v) (v - min(v))/diff(range(v))
  
  idx_threshold = which.min(abs(roc$threshold-threshold))
  
  col_ramp <- colorRampPalette(c("green","orange","red","black"))(100)
  col_by_cost <- col_ramp[ceiling(norm_vec(roc$cost)*99)+1]
  p_roc <- ggplot(roc, aes(fpr,tpr)) + 
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=4, alpha=0.5) +
    coord_fixed() +
    geom_line(aes(threshold,threshold), color=rgb(0,0,1,alpha=0.5)) +
    labs(title = sprintf("ROC")) + xlab("FPR") + ylab("TPR") +
    geom_hline(yintercept=roc[idx_threshold,"tpr"], alpha=0.5, linetype="dashed") +
    geom_vline(xintercept=roc[idx_threshold,"fpr"], alpha=0.5, linetype="dashed") + 
    scale_y_continuous(breaks=seq(0,1,by=.05)) + 
    scale_x_continuous(breaks=seq(0,1,by=.075))
  
  p_cost <- ggplot(roc, aes(threshold, cost)) +
    geom_line(color=rgb(0,0,1,alpha=0.3)) +
    geom_point(color=col_by_cost, size=4, alpha=0.5) +
    labs(title = sprintf("cost function")) +
    geom_vline(xintercept=threshold, alpha=0.5, linetype="dashed")
  
  sub_title <- sprintf("threshold at %.2f - cost of FP = %d, cost of FN = %d", threshold, cost_of_fp, cost_of_fn)

  grid.arrange(p_roc, p_cost, ncol=2, sub=textGrob(sub_title, gp=gpar(cex=1), just="bottom"))
}

Logistic Regression

Brier score to measure accuracy of prediction https://en.wikipedia.org/wiki/Brier_score

brier_score <- function(preds, outcomes){
  sum((preds - outcomes)^2)/length(preds)
}

Model.

I’ve run some Models before and checked coefficients to decided which predictors to use for this subset.

fit <- glm(Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor+Smoking_Pack.Years , train, family="binomial")
summary(fit)

## 
## Call:
## glm(formula = Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor + 
##     Smoking_Pack.Years, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8621  -0.3352  -0.2165  -0.1538   2.7441  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -2.85118    0.82072  -3.474 0.000513 ***
## Tumor_subsiteGPS              -15.61000 1999.95208  -0.008 0.993772    
## Tumor_subsiteOther              1.54783    1.29252   1.198 0.231099    
## Tumor_subsitePharyngeal_wall  -14.13526 6522.63866  -0.002 0.998271    
## Tumor_subsiteSoft_palate        3.25427    1.41590   2.298 0.021540 *  
## Tumor_subsiteTonsil             0.68932    0.80761   0.854 0.393360    
## KM_Overall_survival_censor1    -1.57963    0.74453  -2.122 0.033868 *  
## Smoking_Pack.Years              0.02841    0.01420   2.000 0.045501 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 77.238  on 140  degrees of freedom
## Residual deviance: 57.275  on 133  degrees of freedom
##   (9 observations deleted due to missingness)
## AIC: 73.275
## 
## Number of Fisher Scoring iterations: 17

par(mfrow=c(2,2))
plot(fit)

## Warning: not plotting observations with leverage one:
##   16

## Warning: not plotting observations with leverage one:
##   16

library(car)

## Warning: package 'car' was built under R version 3.2.5

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

vif(fit)

##                                GVIF Df GVIF^(1/(2*Df))
## Tumor_subsite              1.107306  5        1.010245
## KM_Overall_survival_censor 1.082878  1        1.040614
## Smoking_Pack.Years         1.039950  1        1.019780

train_pred <- predict(fit, train, type="response")

# Create Prediction/Label DataFrame
df <- data.frame(cbind(label = as.numeric(as.character(train$Local_tumor_recurrence)), pred = ifelse(is.na(train_pred), 0, train_pred)))

# Brier score
brier_score(df$pred, df$label)

## [1] 0.06114459

# Plot pred type distribution
par(mfrow=c(2,2))
plot_pred_type_distribution(df, "label", 0.1)

plot_pred_type_distribution(df, "label", 0.3)

plot_pred_type_distribution(df, "label", 0.7)

#AUROC
require(ROCR)

## Loading required package: ROCR

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.2.4

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

t <- ifelse(df$pred > .09, 1, 0)
auroc <- prediction(t, df$label)
perform <- performance(auroc, "tpr", "fpr")
plot(perform)

require(pROC)

## Loading required package: pROC

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

auc(t, df$label)

## Area under the curve: 0.6235

#Plot AUROC
plot_roc(calculate_roc(df$pred, df$label, 1,2), .7, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .5, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .3, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .1, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2), .08, 1,2)
plot_roc(calculate_roc(df$pred, df$label, 1,2, n=100), .09, 1,2)

Predict

#index <- which(!(test$Tumor_subsite %in% labels(train$Tumor_subsite)))

pred <- predict(fit, newdata=test, type="response")
df <- data.frame(pred = pred)


label <- as.factor(ifelse(pred>=.09, 1, 0))
summary(label)

##    0    1 NA's 
##  136   22    7

Save csv

label[is.na(label)] <- 0

#LogSolution <- data.frame(Patient_ID = test$Patient_ID, Local_tumor_recurrence = label)

#write.csv(LogSolution, "LogSolution075.csv", row.names = F)

They get all predicted with 0 so no improvement here

05.09.2016

Idea: Transform Categorial Vars to Dummy Vars and Use Lasso Regression

require(Matrix)

## Loading required package: Matrix

## Warning: package 'Matrix' was built under R version 3.2.5

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

comb <- read.csv("imputedComb.csv")
str(comb)

## 'data.frame':    315 obs. of  18 variables:
##  $ X                                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                                   : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 1 2 1 2 ...
##  $ Age_at_diagnosis                         : int  58 78 57 56 60 66 72 77 71 50 ...
##  $ Race                                     : Factor w/ 5 levels "American_Indian/Alaska_Native",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Tumor_side                               : Factor w/ 3 levels "Bilateral","L",..: 2 3 3 3 2 3 2 3 2 3 ...
##  $ Tumor_subsite                            : Factor w/ 6 levels "BOT","GPS","Other",..: 6 1 6 1 6 1 1 6 6 1 ...
##  $ T_category                               : int  2 3 1 2 2 1 4 1 1 3 ...
##  $ N_category                               : Factor w/ 7 levels "0","1","2a","2b",..: 1 1 4 4 4 2 1 3 3 6 ...
##  $ AJCC_Stage                               : Factor w/ 5 levels "I","II","III",..: 2 3 4 4 4 3 4 4 4 4 ...
##  $ Pathological_grade                       : Factor w/ 6 levels "I","I-II","II",..: 5 3 5 5 3 5 3 5 1 3 ...
##  $ Smoking_status_at_diagnosis              : Factor w/ 3 levels "Current","Former",..: 2 2 1 3 3 3 2 3 2 3 ...
##  $ Smoking_Pack.Years                       : int  5 70 30 0 0 0 26 0 50 0 ...
##  $ Radiation_treatment_course_duration      : int  41 48 42 45 54 39 47 42 43 43 ...
##  $ Total_prescribed_Radiation_treatment_dose: int  66 70 66 66 70 66 70 70 66 72 ...
##  $ X._Radiation_treatment_fractions         : int  30 35 30 30 33 30 33 30 30 40 ...
##  $ Induction_Chemotherapy                   : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ Concurrent_chemotherapy                  : Factor w/ 2 levels "N","Y": 1 2 2 2 2 2 2 1 1 1 ...
##  $ KM_Overall_survival_censor               : int  1 0 1 1 1 1 1 1 1 1 ...

# Fix facotrs
comb$KM_Overall_survival_censor <- as.factor(comb$KM_Overall_survival_censor)
comb$T_category <- as.factor(comb$T_category)

# scale 
#comb$Age_at_diagnosis <- scale(comb$Age_at_diagnosis)
#comb[,12:15] <- scale(comb[,12:15])

#str(comb)
y <- read.csv("y.csv")
t <- 1:150
train <- comb[t, ]
train$X <- as.factor(y$x)
test <- comb[-t, ]
id <- read.csv("PatientID.csv")$x


# Function for scaling numeric vars of a DF
scale_df <- function(df, y){
  numeric_cols <- sapply(df, is.numeric)
  cbind(scale(df[,numeric_cols]), df[, !numeric_cols])
}

comb <- scale_df(comb[,-1])

dummies <- model.matrix( ~ . , data=comb)[,-1]
# redundant steps, ignore. was just checkin something
 x <- data.frame(dummies)
 x <- as.matrix(x)

library(glmnet)

## Warning: package 'glmnet' was built under R version 3.2.4

## Loading required package: foreach

## Loaded glmnet 2.0-5

## 
## Attaching package: 'glmnet'

## The following object is masked from 'package:pROC':
## 
##     auc

glmmod <- glmnet(x[1:150,],y=as.factor(y$x), alpha=1, family='binomial')
#plot variable coefficients vs. shrinkage parameter lambda.
par(mfrow=c(1,1))
plot(glmmod,xvar="lambda")

#grid()

coef(glmmod)[,10]

##                               (Intercept) 
##                                -2.1065221 
##                          Age_at_diagnosis 
##                                 0.0000000 
##                        Smoking_Pack.Years 
##                                 0.1377487 
##       Radiation_treatment_course_duration 
##                                 0.0000000 
## Total_prescribed_Radiation_treatment_dose 
##                                 0.0000000 
##          X._Radiation_treatment_fractions 
##                                 0.0000000 
##                                GenderMale 
##                                 0.0000000 
##                                 RaceAsian 
##                                 0.0000000 
##                                 RaceBlack 
##                                 0.0000000 
##                              RaceHispanic 
##                                 0.0000000 
##                                 RaceWhite 
##                                 0.0000000 
##                               Tumor_sideL 
##                                 0.0000000 
##                               Tumor_sideR 
##                                 0.0000000 
##                          Tumor_subsiteGPS 
##                                 0.0000000 
##                        Tumor_subsiteOther 
##                                 0.0000000 
##              Tumor_subsitePharyngeal_wall 
##                                 0.0000000 
##                  Tumor_subsiteSoft_palate 
##                                 1.7906967 
##                       Tumor_subsiteTonsil 
##                                 0.0000000 
##                               T_category2 
##                                 0.0000000 
##                               T_category3 
##                                 0.0000000 
##                               T_category4 
##                                 0.0000000 
##                               N_category1 
##                                 0.0000000 
##                              N_category2a 
##                                 0.0000000 
##                              N_category2b 
##                                 0.0000000 
##                              N_category2c 
##                                 0.4150608 
##                               N_category3 
##                                 0.0000000 
##                               N_categoryX 
##                                 0.0000000 
##                              AJCC_StageII 
##                                 0.0000000 
##                             AJCC_StageIII 
##                                 0.0000000 
##                              AJCC_StageIV 
##                                 0.0000000 
##                               AJCC_StageX 
##                                 0.0000000 
##                    Pathological_gradeI.II 
##                                 0.0000000 
##                      Pathological_gradeII 
##                                 0.0000000 
##                  Pathological_gradeII.III 
##                                 0.0000000 
##                     Pathological_gradeIII 
##                                 0.0000000 
##                      Pathological_gradeIV 
##                                 0.0000000 
##         Smoking_status_at_diagnosisFormer 
##                                 0.0000000 
##          Smoking_status_at_diagnosisNever 
##                                 0.0000000 
##                   Induction_ChemotherapyY 
##                                 0.0000000 
##                  Concurrent_chemotherapyY 
##                                 0.0000000 
##               KM_Overall_survival_censor1 
##                                -0.8232407

cv.glmmod <- cv.glmnet(x[1:150,],y=y$x,alpha=1)
best_lambda <- cv.glmmod$lambda.min
#?glmnet
best_lambda

## [1] 0.02500581

plot(cv.glmmod)

best <- glmnet(x[t,], y=as.factor(y$x), alpha=1, lambda = best_lambda, family = "binomial")
coef(best)

## 41 x 1 sparse Matrix of class "dgCMatrix"
##                                                   s0
## (Intercept)                               -2.2198348
## Age_at_diagnosis                           .        
## Smoking_Pack.Years                         0.2692014
## Radiation_treatment_course_duration        .        
## Total_prescribed_Radiation_treatment_dose  .        
## X._Radiation_treatment_fractions           .        
## GenderMale                                 .        
## RaceAsian                                  .        
## RaceBlack                                  .        
## RaceHispanic                               .        
## RaceWhite                                  .        
## Tumor_sideL                                .        
## Tumor_sideR                                .        
## Tumor_subsiteGPS                           .        
## Tumor_subsiteOther                         .        
## Tumor_subsitePharyngeal_wall               .        
## Tumor_subsiteSoft_palate                   2.1307174
## Tumor_subsiteTonsil                        .        
## T_category2                                0.1689655
## T_category3                                .        
## T_category4                                .        
## N_category1                                .        
## N_category2a                               .        
## N_category2b                               .        
## N_category2c                               0.6937694
## N_category3                                .        
## N_categoryX                                .        
## AJCC_StageII                               .        
## AJCC_StageIII                              .        
## AJCC_StageIV                               .        
## AJCC_StageX                                .        
## Pathological_gradeI.II                     .        
## Pathological_gradeII                       .        
## Pathological_gradeII.III                   .        
## Pathological_gradeIII                      .        
## Pathological_gradeIV                       .        
## Smoking_status_at_diagnosisFormer          .        
## Smoking_status_at_diagnosisNever           .        
## Induction_ChemotherapyY                    .        
## Concurrent_chemotherapyY                   .        
## KM_Overall_survival_censor1               -1.0132357

# PREDICT TRAIN LABELS
pred <- predict(cv.glmmod, newx=x[t, ], s="lambda.min")

# Plot ROC
plot_roc(calculate_roc(pred, y$x, 1,2), .1, 1,2)

require(ROCR)
p <- ifelse(pred > .1, 1, 0)
auroc <- prediction(p, y$x)
perform <- performance(auroc, "tpr", "fpr")
plot(perform)

require(pROC)
auc(p, y$x)

## [1] 0.6072944

pred <- predict(cv.glmmod, newx=x[-t, ], s="lambda.min")
labels <- ifelse(pred > .1, 1, 0)
table(labels)

## labels
##   0   1 
## 129  36

solution <- data.frame(Patient_ID = id, Local_tumor_recurrence = labels)
colnames(solution)[2] <- "Local_tumor_recurrence"

#write.csv(solution, "AucScale01LassoCONFUSION.csv", row.names = F)

Random forest

library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

x_train <- as.data.frame(x[t, ])
x_train$y <- as.factor(y$x)
x_test <- as.data.frame(x[-t, ])
rf.fit <- randomForest(y ~ ., data=x_train, family="class")
varImpPlot(rf.fit)

rf.pred <- predict(rf.fit, x_train)
# Accuracy
prop.table(table(y$x, rf.pred))

##    rf.pred
##        0    1
##   0 0.92 0.00
##   1 0.00 0.08

rf.cv <- rfcv(x[t, ], x_train$y, cv.fold=10)

result <- replicate(5, rfcv(x[t, ], x_train$y), simplify=FALSE)
error.cv <- sapply(result, "[[", "error.cv")
matplot(result[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l",
lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x",
xlab="Number of variables", ylab="CV Error")

with(rf.cv, plot(n.var, error.cv))

# take subspace given by varImpPlot
rf.fit <- randomForest(y~ Smoking_Pack.Years + N_category2c + RaceHispanic + KM_Overall_survival_censor1 + Tumor_subsiteSoft_palate, data = x_train, family = "class")

pred <- predict(rf.fit, newdata = x_test, type="class")

rf.solution <- data.frame(Patient_ID = id, Local_tumor_recurrence = pred)

table(pred)

## pred
##   0   1 
## 165   0

Classifies all with 0 so it’s not better than Baseline prediction. Too much overfitting here.

Logistic Regression with Vars of Lasso just without Hispanic

fit <- glm(y ~ Smoking_Pack.Years + N_category2c + KM_Overall_survival_censor1 + Tumor_subsiteSoft_palate, data=x_train, family="binomial")
summary(fit)

## 
## Call:
## glm(formula = y ~ Smoking_Pack.Years + N_category2c + KM_Overall_survival_censor1 + 
##     Tumor_subsiteSoft_palate, family = "binomial", data = x_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6561  -0.3518  -0.2028  -0.1645   2.9357  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -2.3706     0.6181  -3.835 0.000125 ***
## Smoking_Pack.Years            0.5451     0.3197   1.705 0.088234 .  
## N_category2c                  1.3396     0.6916   1.937 0.052746 .  
## KM_Overall_survival_censor1  -1.5441     0.6942  -2.224 0.026137 *  
## Tumor_subsiteSoft_palate      2.9417     1.3968   2.106 0.035203 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.631  on 149  degrees of freedom
## Residual deviance: 62.422  on 145  degrees of freedom
## AIC: 72.422
## 
## Number of Fisher Scoring iterations: 6

vif(fit)

##          Smoking_Pack.Years                N_category2c 
##                    1.015513                    1.022182 
## KM_Overall_survival_censor1    Tumor_subsiteSoft_palate 
##                    1.010448                    1.014329

pred <- predict(fit, newdata = x_test, type="response")
labels <- ifelse(pred > .1, 1, 0)

glm.sol <- data.frame(Patient_ID = id, Local_tumor_recurrence = labels)

#write.csv(glm.sol, "GLMlassoPreds.csv", row.names = F)

AUC ~ 64

Lets try another threshlvl

labels <- ifelse(pred > .08, 1, 0)
table(labels)

## labels
##   0   1 
## 137  28

Find the NA rows

nas <- which(is.na(label))
test[nas,c("Tumor_subsite", "KM_Overall_survival_censor", "Smoking_Pack.Years")]

## [1] Tumor_subsite              KM_Overall_survival_censor
## [3] Smoking_Pack.Years        
## <0 rows> (or 0-length row.names)

Their Smocking.pack.Years values is NA so that’s why we can’t make a prediction for them. Let’s make a prediction with a new Model excluding Smoking.Pack.Years

fit2 <- glm(Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor + Smoking_Pack.Years, train, family="binomial")
summary(fit2)

## 
## Call:
## glm(formula = Local_tumor_recurrence ~ Tumor_subsite + KM_Overall_survival_censor + 
##     Smoking_Pack.Years, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8596  -0.3962  -0.2210  -0.1860   2.7675  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                    -2.36408    0.72078  -3.280  0.00104 **
## Tumor_subsiteGPS              -15.70595 1922.08867  -0.008  0.99348   
## Tumor_subsiteOther              1.24873    1.24544   1.003  0.31603   
## Tumor_subsitePharyngeal_wall  -14.51722 6522.63865  -0.002  0.99822   
## Tumor_subsiteSoft_palate        2.96854    1.39466   2.129  0.03330 * 
## Tumor_subsiteTonsil             0.24137    0.74755   0.323  0.74679   
## KM_Overall_survival_censor1    -1.68477    0.71554  -2.355  0.01855 * 
## Smoking_Pack.Years              0.02323    0.01328   1.750  0.08020 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.631  on 149  degrees of freedom
## Residual deviance: 63.774  on 142  degrees of freedom
## AIC: 79.774
## 
## Number of Fisher Scoring iterations: 17

pred2 <- predict(fit2, newdata=test, type="response")
label2 <- as.factor(ifelse(pred2>.1, 1, 0))

# Check results
summary(label2)

##   0   1 
## 144  21

# Check overlap
prop.table(table(label, label2))

##      label2
## label           0           1
##     0 0.860606061 0.006060606
##     1 0.012121212 0.121212121

Oropharynx Cancer (OPC) Radiomics Challenge :: Local Recurrence Prediction - Logistic Regression, Lasso Regression and Random Forest

Load Data

Plot prediction type distribution given threshold p value

I got this from https://www.r-bloggers.com/illustrated-guide-to-roc-and-auc/ and extended it

Logistic Regression

Brier score to measure accuracy of prediction https://en.wikipedia.org/wiki/Brier_score

Model.

I’ve run some Models before and checked coefficients to decided which predictors to use for this subset.

Predict

Save csv

They get all predicted with 0 so no improvement here

05.09.2016

Idea: Transform Categorial Vars to Dummy Vars and Use Lasso Regression

Random forest

Classifies all with 0 so it’s not better than Baseline prediction. Too much overfitting here.

Logistic Regression with Vars of Lasso just without Hispanic

AUC ~ 64

Lets try another threshlvl

Find the NA rows

Their Smocking.pack.Years values is NA so that’s why we can’t make a prediction for them. Let’s make a prediction with a new Model excluding Smoking.Pack.Years