Develop a model for detecting breast cancer

An algorithm developed

by

Abotaleb Mostafa “Аботалеб Мостафа”

South Ural State University, Russian Federation

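# Logistic regression: assumptions to check
# - The dependent variable must be nominal (categorical).
# - There must be no collinearity among the independent variables.
# - Linearity between the independent variables and the log odds (logit).
# - No strongly influential outlier values.
# - Adequate sample size.
# Binary / multinomial logistic regression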
# Load the first example dataset and fit a binary logistic regression of
# Cancer on noCig and Gender.
dataset_path <- "F:/Phd/Private Project/binlogdata.csv"
original_data <- read.csv(dataset_path)
my_data <- data.frame(original_data)
# Name the columns directly in the formula and let `data = my_data` supply
# them. Writing `my_data$noCig` inside the formula bypasses the `data`
# argument, so predict(bin, newdata = ...) cannot look the predictors up in
# the new data. The estimates are unchanged; only the coefficient labels
# become `noCig` and `Gender` in summary() and coef().
bin <- glm(Cancer ~ noCig + Gender, data = my_data, family = binomial)
summary(bin)

## 
## Call:
## glm(formula = my_data$Cancer ~ my_data$noCig + my_data$Gender, 
##     family = binomial, data = my_data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.81722  -0.55619   0.04376   0.48011   1.81029  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -7.80396    4.31209  -1.810   0.0703 .
## my_data$noCig   0.18064    0.08391   2.153   0.0313 *
## my_data$Gender  2.55845    1.86681   1.370   0.1705  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22.181  on 15  degrees of freedom
## Residual deviance: 12.772  on 13  degrees of freedom
## AIC: 18.772
## 
## Number of Fisher Scoring iterations: 5

# Odds ratios: exponentiate the model coefficients together with the
# limits returned by confint() (2.5 % and 97.5 %).
exp(
  cbind(
    "OR" = coef(bin),
    confint(bin)
  )
)

## Waiting for profiling to be done...

##                          OR        2.5 %      97.5 %
## (Intercept)    4.081169e-04 7.603775e-10    0.205689
## my_data$noCig  1.197983e+00 1.051239e+00    1.526905
## my_data$Gender 1.291574e+01 6.755416e-01 3116.431011

# Prediction: in-sample fitted probabilities, classified at a 0.5 cut-off.
# Stored as `predicted_prob` so the predict() generic is not shadowed by a
# data vector of the same name.
predicted_prob <- predict(bin, type = "response")
# Fix both levels so the table always has a FALSE and a TRUE column, even
# when every fitted probability falls on one side of the cut-off.
table_1 <- table(
  my_data$Cancer,
  factor(predicted_prob > 0.5, levels = c(FALSE, TRUE))
)
# Accuracy = correctly classified cases (the diagonal) / all cases. It is
# computed from the table rather than from hand-copied counts, so it stays
# correct when the data or the model change.
# Assumes the first row of the table is the non-event class (e.g. Cancer
# coded 0/1) -- TODO confirm against the data.
sum(diag(table_1)) / sum(table_1)

## [1] 0.8125

# Second dataset: read the breast cancer CSV and keep both the raw import
# (original_data) and a working data frame (my_data).
dataset_path <- "F:/Phd/Private Project/breast_cancer.csv"
original_data <- read.csv(file = dataset_path)
my_data <- data.frame(original_data)

# Identify variables: copy the nine predictor columns and the response out
# of my_data into short-named vectors; the model formula below uses these
# names.
x1 <- my_data$Clump.Thickness
x2 <- my_data$Uniformity.of.Cell.Size
x3 <- my_data$Uniformity.of.Cell.Shape
x4 <- my_data$Marginal.Adhesion
x5 <- my_data$Single.Epithelial.Cell.Size
x6 <- my_data$Bare.Nuclei
x7 <- my_data$Bland.Chromatin
x8 <- my_data$Mitoses
x9 <- my_data$Normal.Nucleoli
y <- my_data$Class
# NOTE(review): this conversion runs after x9 was copied above, so it only
# changes the column stored in my_data. The model below still receives
# Normal.Nucleoli as the unconverted x9. Confirm whether a categorical
# predictor was intended.
my_data$Normal.Nucleoli <- as.factor(my_data$Normal.Nucleoli)
# Binary logistic regression of y on all nine predictors.
bin <- glm(
  y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9,
  data = my_data,
  family = binomial
)
summary(bin)

## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9, 
##     family = binomial, data = my_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4841  -0.1153  -0.0619   0.0222   2.4698  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.10394    1.17488  -8.600  < 2e-16 ***
## x1            0.53501    0.14202   3.767 0.000165 ***
## x2           -0.00628    0.20908  -0.030 0.976039    
## x3            0.32271    0.23060   1.399 0.161688    
## x4            0.33064    0.12345   2.678 0.007400 ** 
## x5            0.09663    0.15659   0.617 0.537159    
## x6            0.38303    0.09384   4.082 4.47e-05 ***
## x7            0.44719    0.17138   2.609 0.009073 ** 
## x8            0.53484    0.32877   1.627 0.103788    
## x9            0.21303    0.11287   1.887 0.059115 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 884.35  on 682  degrees of freedom
## Residual deviance: 102.89  on 673  degrees of freedom
## AIC: 122.89
## 
## Number of Fisher Scoring iterations: 8

# Odds ratios for the second model: bind the coefficients to the limits
# returned by confint() and move everything from the log-odds scale to the
# odds scale.
exp(
  cbind("OR" = coef(bin), confint(bin))
)

## Waiting for profiling to be done...

##                       OR        2.5 %       97.5 %
## (Intercept) 4.091793e-05 2.878802e-06 0.0003081064
## x1          1.707472e+00 1.315481e+00 2.3111005101
## x2          9.937400e-01 6.737927e-01 1.5494825697
## x3          1.380860e+00 8.622282e-01 2.1562448344
## x4          1.391854e+00 1.097470e+00 1.7984446648
## x5          1.101459e+00 8.050104e-01 1.4990926079
## x6          1.466714e+00 1.229862e+00 1.7836925585
## x7          1.563908e+00 1.131130e+00 2.2251645818
## x8          1.707168e+00 9.933376e-01 3.0237053763
## x9          1.237423e+00 9.981573e-01 1.5613578633

# Prediction: fitted probabilities for the rows used in the fit, then a
# cross-tabulation of the observed Class (rows) against "fitted
# probability above 0.5" (columns).
# NOTE(review): the result vector shares its name with the predict()
# generic; the name is kept because code outside this section may read it.
predict <- stats::predict(bin, type = "response")
table_1 <- table(
  my_data$Class,
  predict > 0.5
)
table_1

##    
##     FALSE TRUE
##   0   434   10
##   1    11  228