# South Ural State University, Russian Federation
# Logistic regression
# Assumptions:
# - the dependent variable must be a nominal (categorical) variable
# - there must be no multicollinearity among the independent variables
# - linearity between the independent variables and the log odds
# - no extreme outlier values
# - adequate sample size
# Binary / multinomial logistic regression
# Example 1: binary logistic regression of Cancer on noCig and Gender.
# read.csv() already returns a data.frame, so no data.frame() wrapper is
# needed.
dataset_path <- "F:/Phd/Private Project/binlogdata.csv"
original_data <- read.csv(dataset_path)
my_data <- original_data

# The formula uses bare column names that are resolved through
# `data = my_data`. Writing `my_data$Cancer ~ my_data$noCig + ...` bypasses
# `data`, produces clumsy term labels, and prevents
# predict(bin, newdata = ...) from using the new data.
bin <- glm(Cancer ~ noCig + Gender, data = my_data, family = binomial)
summary(bin)
##
## Call:
## glm(formula = Cancer ~ noCig + Gender, family = binomial, data = my_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.81722 -0.55619 0.04376 0.48011 1.81029
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.80396 4.31209 -1.810 0.0703 .
## noCig 0.18064 0.08391 2.153 0.0313 *
## Gender 2.55845 1.86681 1.370 0.1705
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22.181 on 15 degrees of freedom
## Residual deviance: 12.772 on 13 degrees of freedom
## AIC: 18.772
##
## Number of Fisher Scoring iterations: 5

# Odds ratios: exponentiated coefficients alongside their 95% confidence
# limits from confint().
exp(cbind(OR = coef(bin), confint(bin)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 4.081169e-04 7.603775e-10 0.205689
## noCig 1.197983e+00 1.051239e+00 1.526905
## Gender 1.291574e+01 6.755416e-01 3116.431011
# Prediction: fitted probabilities, classified with a 0.5 cut-off.
# The probabilities get their own name so the stats::predict() generic is not
# masked by a variable called `predict`.
predicted_prob <- predict(bin, type = "response")
# Fixing the levels keeps the table 2 x 2 even when only one class is
# predicted; rows are the observed Cancer values, columns are predicted > 0.5.
table_1 <- table(
  my_data$Cancer,
  factor(predicted_prob > 0.5, levels = c(FALSE, TRUE))
)
# Accuracy = correctly classified (diagonal) / all observations, computed from
# the table rather than from hand-copied cell counts.
sum(diag(table_1)) / sum(table_1)
## [1] 0.8125
# Example 2: binary logistic regression on the breast cancer data set.
# read.csv() already returns a data.frame, so no data.frame() wrapper is
# needed.
dataset_path <- "F:/Phd/Private Project/breast_cancer.csv"
original_data <- read.csv(dataset_path)
my_data <- original_data

# Short aliases for the nine predictors and the response; these names are the
# term labels shown in the model summary.
x1 <- my_data$Clump.Thickness
x2 <- my_data$Uniformity.of.Cell.Size
x3 <- my_data$Uniformity.of.Cell.Shape
x4 <- my_data$Marginal.Adhesion
x5 <- my_data$Single.Epithelial.Cell.Size
x6 <- my_data$Bare.Nuclei
x7 <- my_data$Bland.Chromatin
x8 <- my_data$Mitoses
x9 <- my_data$Normal.Nucleoli
y <- my_data$Class

# NOTE(review): the former `my_data$Normal.Nucleoli = as.factor(...)` statement
# was removed. It ran after x9 had already been copied out of my_data, so it
# never reached the model (the recorded summary shows one numeric x9 term).
# If a categorical treatment is wanted, convert the column before defining x9.

# The formula terms are the vectors defined above; they are not columns of
# my_data, so they are looked up in the calling environment.
bin <- glm(
  y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9,
  data = my_data,
  family = binomial
)
summary(bin)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9,
## family = binomial, data = my_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4841 -0.1153 -0.0619 0.0222 2.4698
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.10394 1.17488 -8.600 < 2e-16 ***
## x1 0.53501 0.14202 3.767 0.000165 ***
## x2 -0.00628 0.20908 -0.030 0.976039
## x3 0.32271 0.23060 1.399 0.161688
## x4 0.33064 0.12345 2.678 0.007400 **
## x5 0.09663 0.15659 0.617 0.537159
## x6 0.38303 0.09384 4.082 4.47e-05 ***
## x7 0.44719 0.17138 2.609 0.009073 **
## x8 0.53484 0.32877 1.627 0.103788
## x9 0.21303 0.11287 1.887 0.059115 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 884.35 on 682 degrees of freedom
## Residual deviance: 102.89 on 673 degrees of freedom
## AIC: 122.89
##
## Number of Fisher Scoring iterations: 8
# Odds ratios: bind the coefficients to their 95% confidence limits from
# confint(), then exponentiate the whole matrix in one step.
coef_with_ci <- cbind(OR = coef(bin), confint(bin))
exp(coef_with_ci)
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 4.091793e-05 2.878802e-06 0.0003081064
## x1 1.707472e+00 1.315481e+00 2.3111005101
## x2 9.937400e-01 6.737927e-01 1.5494825697
## x3 1.380860e+00 8.622282e-01 2.1562448344
## x4 1.391854e+00 1.097470e+00 1.7984446648
## x5 1.101459e+00 8.050104e-01 1.4990926079
## x6 1.466714e+00 1.229862e+00 1.7836925585
## x7 1.563908e+00 1.131130e+00 2.2251645818
## x8 1.707168e+00 9.933376e-01 3.0237053763
## x9 1.237423e+00 9.981573e-01 1.5613578633
# Prediction: fitted probabilities, classified with a 0.5 cut-off and
# cross-tabulated against the observed class (rows = observed Class,
# columns = predicted probability > 0.5).
# The probabilities get their own name so the stats::predict() generic is not
# masked by a variable called `predict`.
predicted_prob <- predict(bin, type = "response")
# Fixing the levels keeps both FALSE and TRUE columns even when only one class
# is predicted.
table_1 <- table(
  my_data$Class,
  factor(predicted_prob > 0.5, levels = c(FALSE, TRUE))
)
table_1
##
## FALSE TRUE
## 0 434 10
## 1 11 228