Logistic Regression

This walkthrough sets out to apply logistic regression to the machine.data (Computer Hardware) dataset from https://archive.ics.uci.edu/ml/datasets/Computer+Hardware. The target used here, PRP (published relative performance), is continuous, so the glm fitted below actually uses a Gaussian family; a binomial (logistic) fit on a binarised target is sketched after the model summary.

CODE

IMPORT DATASET

hurl <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data")
hdataset = read.table(hurl, header = FALSE, sep = ",",
                      col.names = c("vendor name", "Model Name", "MYCT", "MMIN", "MMAX",
                                    "CACH", "CHMIN", "CHMAX", "PRP", "ERP"))
head(hdataset)
##   vendor.name Model.Name MYCT MMIN  MMAX CACH CHMIN CHMAX PRP ERP
## 1     adviser      32/60  125  256  6000  256    16   128 198 199
## 2      amdahl     470v/7   29 8000 32000   32     8    32 269 253
## 3      amdahl    470v/7a   29 8000 32000   32     8    32 220 253
## 4      amdahl    470v/7b   29 8000 32000   32     8    32 172 253
## 5      amdahl    470v/7c   29 8000 16000   32     8    16 132 132
## 6      amdahl     470v/b   26 8000 32000   64     8    32 318 290
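A quick sanity check on the download (a minimal sketch; note that read.table converts the column names containing spaces to vendor.name and Model.Name):

dim(hdataset)          # expect 209 rows and 10 columns per the UCI description
sum(is.na(hdataset))   # check for missing values introduced by the import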

Feature Selection

hdataset = hdataset[3:10]  # drop the vendor and model name columns, keep the numeric features
head(hdataset)
##   MYCT MMIN  MMAX CACH CHMIN CHMAX PRP ERP
## 1  125  256  6000  256    16   128 198 199
## 2   29 8000 32000   32     8    32 269 253
## 3   29 8000 32000   32     8    32 220 253
## 4   29 8000 32000   32     8    32 172 253
## 5   29 8000 16000   32     8    16 132 132
## 6   26 8000 32000   64     8    32 318 290
str(hdataset)
## 'data.frame':    209 obs. of  8 variables:
##  $ MYCT : int  125 29 29 29 29 26 23 23 23 23 ...
##  $ MMIN : int  256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
##  $ MMAX : int  6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
##  $ CACH : int  256 32 32 32 32 64 64 64 64 128 ...
##  $ CHMIN: int  16 8 8 8 8 8 16 16 16 32 ...
##  $ CHMAX: int  128 32 32 32 16 32 32 32 32 64 ...
##  $ PRP  : int  198 269 220 172 132 318 367 489 636 1144 ...
##  $ ERP  : int  199 253 253 253 132 290 381 381 749 1238 ...
summary(hdataset)
##       MYCT             MMIN            MMAX            CACH       
##  Min.   :  17.0   Min.   :   64   Min.   :   64   Min.   :  0.00  
##  1st Qu.:  50.0   1st Qu.:  768   1st Qu.: 4000   1st Qu.:  0.00  
##  Median : 110.0   Median : 2000   Median : 8000   Median :  8.00  
##  Mean   : 203.8   Mean   : 2868   Mean   :11796   Mean   : 25.21  
##  3rd Qu.: 225.0   3rd Qu.: 4000   3rd Qu.:16000   3rd Qu.: 32.00  
##  Max.   :1500.0   Max.   :32000   Max.   :64000   Max.   :256.00  
##      CHMIN            CHMAX             PRP              ERP         
##  Min.   : 0.000   Min.   :  0.00   Min.   :   6.0   Min.   :  15.00  
##  1st Qu.: 1.000   1st Qu.:  5.00   1st Qu.:  27.0   1st Qu.:  28.00  
##  Median : 2.000   Median :  8.00   Median :  50.0   Median :  45.00  
##  Mean   : 4.699   Mean   : 18.27   Mean   : 105.6   Mean   :  99.33  
##  3rd Qu.: 6.000   3rd Qu.: 24.00   3rd Qu.: 113.0   3rd Qu.: 101.00  
##  Max.   :52.000   Max.   :176.00   Max.   :1150.0   Max.   :1238.00
plot(hdataset)
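To complement the scatterplot matrix produced by plot(hdataset), the pairwise correlations can also be inspected numerically (a minimal sketch using base R only):

round(cor(hdataset), 2)  # Pearson correlations between the numeric columns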

Splitting into training and test sets

library(caTools)
split = sample.split(hdataset$MYCT, SplitRatio = 0.75)  # 75% train / 25% test, split generated from MYCT
training_set = subset(hdataset, split == TRUE)
test_set = subset(hdataset, split == FALSE)
dim(training_set)
## [1] 163   8
dim(test_set)
## [1] 46  8
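Because sample.split is random and no seed is set, the selected rows (and the dimensions above) can differ between runs. A minimal sketch of a reproducible split; the seed value is arbitrary and not part of the original run:

set.seed(123)  # any fixed seed makes the split repeatable
split = sample.split(hdataset$MYCT, SplitRatio = 0.75)
training_set = subset(hdataset, split == TRUE)
test_set = subset(hdataset, split == FALSE)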

Feature Scaling

# Standardise every column except ERP (column 8); note that this also scales
# the target PRP, and that the test set is scaled with its own mean and sd.
training_set[-8] = scale(training_set[-8])
test_set[-8] = scale(test_set[-8])
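An alternative worth noting, shown only as a sketch and not used in the run above: compute the centres and scales on the training set and apply the same transformation to the test set, so both sets are standardised consistently. This would replace the two scale() calls above rather than follow them.

train_scaled = scale(training_set[-8])        # centres/scales computed from training data only
training_set[-8] = train_scaled
test_set[-8] = scale(test_set[-8],
                     center = attr(train_scaled, "scaled:center"),
                     scale  = attr(train_scaled, "scaled:scale"))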

Model

# Note: family = gaussian() makes this an ordinary least-squares linear model;
# a logistic regression would use family = binomial() with a binary target.
classifier = glm(formula = PRP ~ ., data = training_set, family = gaussian())
classifier
## 
## Call:  glm(formula = PRP ~ ., family = gaussian(), data = training_set)
## 
## Coefficients:
## (Intercept)         MYCT         MMIN         MMAX         CACH        CHMIN  
##   -0.589340    -0.029046     0.127542    -0.059697     0.067984     0.037615  
##       CHMAX          ERP  
##    0.041677     0.006257  
## 
## Degrees of Freedom: 162 Total (i.e. Null);  155 Residual
## Null Deviance:       162 
## Residual Deviance: 13.72     AIC: 77.21
plot(classifier, main = "CLASSIFIER")  # standard GLM diagnostic plots for the fitted model

summary(classifier)
## 
## Call:
## glm(formula = PRP ~ ., family = gaussian(), data = training_set)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.22305  -0.11790  -0.01718   0.05665   1.69013  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.5893402  0.0647541  -9.101 4.17e-16 ***
## MYCT        -0.0290465  0.0271164  -1.071  0.28575    
## MMIN         0.1275424  0.0442669   2.881  0.00452 ** 
## MMAX        -0.0596975  0.0636481  -0.938  0.34974    
## CACH         0.0679840  0.0356138   1.909  0.05812 .  
## CHMIN        0.0376149  0.0327483   1.149  0.25249    
## CHMAX        0.0416765  0.0342107   1.218  0.22499    
## ERP          0.0062569  0.0006414   9.755  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.08853978)
## 
##     Null deviance: 162.000  on 162  degrees of freedom
## Residual deviance:  13.724  on 155  degrees of freedom
## AIC: 77.21
## 
## Number of Fisher Scoring iterations: 2
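For reference, the logistic regression named in the title would need a binary outcome. A minimal sketch, assuming the (scaled) PRP is dichotomised at its training-set median; the cutoff and the HighPRP name are introduced here for illustration and are not part of the original analysis:

cutoff = median(training_set$PRP)           # arbitrary illustrative cutoff
train_bin = training_set
train_bin$HighPRP = as.integer(train_bin$PRP > cutoff)
train_bin$PRP = NULL                        # drop the continuous target
logit_fit = glm(HighPRP ~ ., data = train_bin, family = binomial())
summary(logit_fit)

With a binomial family, predict(logit_fit, newdata = ..., type = "response") returns probabilities in [0, 1], so a 0.5 cutoff like the one applied in the Prediction section below is actually meaningful.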

Prediction

# For a Gaussian GLM, type = "response" returns fitted values on the (scaled)
# PRP scale, not probabilities.
y_pred = predict(object = classifier, newdata = test_set, type = "response")
y_pred
##           3          10          12          20          23          28 
##  1.12429122  8.16702591 -0.57247340  0.60023604 -0.29457982 -0.58244737 
##          29          33          41          54          58          62 
## -0.54388585 -0.03590057 -0.40337148 -0.48888301 -0.60231899 -0.60231899 
##          69          72          74          78          84          90 
## -0.46466189 -0.21367688 -0.50516141 -0.40570801 -0.58207969 -0.18823199 
##          91          98         108         113         119         127 
##  0.48677977  0.84479913 -0.67061913 -0.44647102 -0.29289533 -0.35730751 
##         130         133         137         139         143         144 
## -0.13538910 -0.35327718 -0.05047970 -0.35592780 -0.40313021 -0.07765276 
##         147         156         162         164         166         173 
##  0.24132097  1.26170150 -0.40305098 -0.23667143  0.09127259 -0.14031462 
##         175         177         184         192         195         200 
## -0.53980960 -0.53982581 -0.42678653  1.32898425  0.48441230  5.94064812 
##         203         204         207         208 
## -0.56205734 -0.55558335 -0.39770018 -0.41931962
plot(y_pred, main = "predictions")

# Threshold the predictions at 0.5; these are unbounded regression outputs,
# not probabilities, because the model was fit with a Gaussian family.
y_pred = ifelse(y_pred > 0.5, 1, 0)
summary(y_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1522  0.0000  1.0000

Analysis

cm = table(test_set$PRP, y_pred)  # rows are the distinct scaled, continuous PRP values, not classes
cm
##                       y_pred
##                        0 1
##   -0.452056845590158   1 0
##   -0.447757772839042   1 0
##   -0.430561481834579   1 0
##   -0.421963336332347   1 0
##   -0.409066118079      1 0
##   -0.404767045327884   2 0
##   -0.396168899825652   1 0
##   -0.387570754323421   1 0
##   -0.383271681572305   1 0
##   -0.370374463318957   1 0
##   -0.361776317816726   2 0
##   -0.344580026812262   3 0
##   -0.327383735807799   2 0
##   -0.323084663056683   1 0
##   -0.318785590305567   1 0
##   -0.30588837205222    1 0
##   -0.288692081047757   1 0
##   -0.284393008296641   3 0
##   -0.280093935545525   1 0
##   -0.275794862794409   1 0
##   -0.271495790043293   1 0
##   -0.237103208034367   1 0
##   -0.224205989781019   1 0
##   -0.219906917029903   1 0
##   -0.215607844278787   2 0
##   -0.211308771527672   1 0
##   -0.168318044016513   1 0
##   -0.155420825763166   1 0
##   -0.09953287999866    1 0
##   -0.00925235222522757 1 0
##   0.0165420842814674   0 1
##   0.1068226120549      1 0
##   0.446449359393051    0 1
##   0.566823396424294    0 1
##   0.691496506206653    0 1
##   0.902151071011329    0 1
##   4.41879258142408     0 1
##   4.44458701793077     0 1
acc = sum(diag(cm)) / sum(cm)
acc
## [1] 0.02173913
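The accuracy of about 2% is an artifact of the table above: its rows are the 38 distinct scaled PRP values of the test set rather than classes, so diag(cm) only touches the first two cells. A minimal sketch of a consistent comparison, assuming the same 0.5 cutoff used on the predictions is also applied to the scaled observed PRP (this cutoff choice is illustrative, not from the original):

y_true = ifelse(test_set$PRP > 0.5, 1, 0)   # binarise the observed (scaled) PRP with the same rule
cm2 = table(Actual = y_true, Predicted = y_pred)
cm2
sum(diag(cm2)) / sum(cm2)                   # accuracy on a genuine 2 x 2 table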