logist.Regression.knit

Title: “logistic_regression”

Find the current working directory

# Show the directory that relative file paths are currently resolved against.
getwd()

## [1] "C:/Users/Admin/Documents/R studio STTP"

Set the working directory to the folder that contains the data

# Make the data folder the working directory so the CSV files can be read by
# bare file name.
# NOTE(review): this is a hard-coded absolute Windows path, so the analysis only
# runs unchanged on a machine with this exact folder - consider a
# project-relative path instead.
setwd("c:\\Users\\Admin\\Documents\\R studio STTP")

Load Dataset

# Read the training set and the separate test set from the working directory.
crash_train <- read.csv(file = "crashTest_1.csv")
crash_test <- read.csv(file = "crashTest_1_TEST.csv")

First few rows of the dataset

# Preview the first six rows of the training data.
head(x = crash_train, n = 6L)

##   CarID ManHI ManBI  IntI HVACi Safety   CarType
## 1     1 -5.27 -1.30  2.86 -4.85   4.04       SUV
## 2     2 -4.82 -5.38  9.72 -0.97  -4.57 Hatchback
## 3     3  9.57 -7.50 -7.61  1.33  -5.10 Hatchback
## 4     4  2.84 -2.85  0.92  5.51  -6.64 Hatchback
## 5     5  0.00  2.68 -4.15  0.85   5.58       SUV
## 6     6  0.40  6.34  0.83  5.03  -8.10       SUV

View the dataset

# View() opens an interactive spreadsheet-style viewer. Guard it so that a
# non-interactive run (knitting the report, Rscript) does not try to open a
# window.
if (interactive()) {
  View(crash_train)
}

View Structure of the data

# Compact overview of the columns: name, type and leading values.
str(object = crash_train)

## 'data.frame':    80 obs. of  7 variables:
##  $ CarID  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ManHI  : num  -5.27 -4.82 9.57 2.84 0 0.4 5.94 5.78 0.86 7.36 ...
##  $ ManBI  : num  -1.3 -5.38 -7.5 -2.85 2.68 6.34 3.14 -1.75 -4.32 7.42 ...
##  $ IntI   : num  2.86 9.72 -7.61 0.92 -4.15 0.83 -6.65 -6.85 8.1 0.27 ...
##  $ HVACi  : num  -4.85 -0.97 1.33 5.51 0.85 5.03 6.62 0.73 -8.96 -8.62 ...
##  $ Safety : num  4.04 -4.57 -5.1 -6.64 5.58 -8.1 -1.32 5.5 3.1 3.08 ...
##  $ CarType: chr  "SUV" "Hatchback" "Hatchback" "Hatchback" ...

Summary statistics of the dataset (minimum, quartiles, median, mean and maximum for each numeric column)

# Per-column summary of the training data.
summary(object = crash_train)

##      CarID           ManHI             ManBI              IntI        
##  Min.   : 1.00   Min.   :-9.9300   Min.   :-9.9400   Min.   :-9.9900  
##  1st Qu.:20.75   1st Qu.:-5.1950   1st Qu.:-5.7050   1st Qu.:-5.5725  
##  Median :40.50   Median : 0.6350   Median :-1.8150   Median :-0.4150  
##  Mean   :40.50   Mean   :-0.0935   Mean   :-0.9277   Mean   :-0.1349  
##  3rd Qu.:60.25   3rd Qu.: 5.0500   3rd Qu.: 3.4175   3rd Qu.: 4.9775  
##  Max.   :80.00   Max.   : 9.5700   Max.   : 9.6100   Max.   : 9.7200  
##      HVACi             Safety          CarType         
##  Min.   :-9.8200   Min.   :-9.8000   Length:80         
##  1st Qu.:-5.6750   1st Qu.:-4.6775   Class :character  
##  Median : 0.8700   Median : 0.8300   Mode  :character  
##  Mean   : 0.1197   Mean   : 0.5437                     
##  3rd Qu.: 5.0625   3rd Qu.: 4.6225                     
##  Max.   : 9.8900   Max.   : 9.9900

Encode the class attribute “CarType” as a binary (0/1) variable: SUV = 1, anything else = 0

# Binary response: 1 when the car is an SUV, 0 otherwise (TRUE/FALSE -> 1/0).
crash_train$CarType_binary <- as.numeric(crash_train$CarType == "SUV")

Fit logistic regression model

# Fit the logistic regression of the SUV indicator on the five numeric
# predictors only.
# The earlier `CarType_binary ~ .` formula also used CarType (the very label
# the response is derived from) and CarID (1..80, apparently just a row
# identifier) as predictors. That leaked the answer into the model and produced
# perfect separation: near-zero residual deviance, standard errors around 1e5,
# and 25 Fisher scoring iterations.
# NOTE: the printed output further down this document was generated before
# this change and must be refreshed by re-running the analysis.
lfit <- glm(
  CarType_binary ~ ManHI + ManBI + IntI + HVACi + Safety,
  family = binomial,
  data = crash_train
)

Understand the fitted model

Summary of lfit

# Coefficient table, deviances and iteration count for the fitted model.
summary(object = lfit)

## 
## Call:
## glm(formula = CarType_binary ~ ., family = binomial, data = crash_train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.657e+01  9.787e+04       0        1
## CarID        3.641e-13  1.765e+03       0        1
## ManHI       -3.035e-12  7.249e+03       0        1
## ManBI        3.271e-13  7.978e+03       0        1
## IntI        -3.977e-12  8.685e+03       0        1
## HVACi       -4.546e-12  1.009e+04       0        1
## Safety      -1.522e-12  7.770e+03       0        1
## CarTypeSUV   5.313e+01  1.347e+05       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.0585e+02  on 79  degrees of freedom
## Residual deviance: 4.6413e-10  on 72  degrees of freedom
## AIC: 16
## 
## Number of Fisher Scoring iterations: 25

# Maximum likelihood estimation, fitted iteratively by Fisher scoring (a variant of the Newton–Raphson method)

Find the predicted probabilities for the training data

# Fitted values for the training rows on the response (probability) scale.
logtrain <- predict(object = lfit, type = "response")

plotting the probability

# Plot each training row's fitted probability against its position in the vector.
plot(logtrain)

# Average fitted probability within each actual car type.
tapply(X = logtrain, INDEX = crash_train$CarType, FUN = mean)

##    Hatchback          SUV 
## 2.900701e-12 1.000000e+00

# Predicted probabilities for the test set.
# The argument must be spelled `type` (lower case). The earlier
# `Type='response'` did not match any parameter, was silently absorbed by
# `...`, and predict() fell back to its default link scale (log-odds).
# NOTE: the values printed below in this document were generated before this
# fix and are therefore log-odds; re-run the analysis to refresh them.
logispred <- predict(lfit, newdata = crash_test, type = "response")

# Plot the test-set predictions against their position in the vector.
plot(logispred)

# Print the predictions.
logispred

##         1         2         3         4         5         6         7         8 
## -26.56607  26.56607 -26.56607 -26.56607 -26.56607 -26.56607  26.56607  26.56607 
##         9        10        11        12        13        14        15        16 
##  26.56607  26.56607  26.56607 -26.56607  26.56607  26.56607  26.56607  26.56607 
##        17        18        19        20 
## -26.56607 -26.56607 -26.56607 -26.56607

Let's classify each test point as Hatchback or SUV by setting a threshold

# Label each test row from its prediction: above 0.5 -> "SUV", otherwise
# "Hatchback". The labels are stored in a new `logispred` column.
above_cutoff <- logispred > 0.5
crash_test[!above_cutoff, "logispred"] <- "Hatchback"
crash_test[above_cutoff, "logispred"] <- "SUV"

# Cross-tabulate predicted class (rows) against actual class (columns) and
# report accuracy metrics, treating "Hatchback" as the positive class.
# - Columns are addressed by name instead of position (8 and 7), so the result
#   does not depend on column order.
#   NOTE(review): assumes column 7 of crash_test is CarType, as in crash_train -
#   confirm.
# - Both sides are factors with the same two levels, so the table stays 2 x 2
#   even when one class is never predicted.
# - deparse.level = 0 keeps the table's dimension names blank, matching the
#   previous printed layout.
# - The function is called as caret::confusionMatrix so it does not depend on
#   caret having been attached elsewhere.
class_levels <- c("Hatchback", "SUV")
caret::confusionMatrix(
  table(
    factor(crash_test$logispred, levels = class_levels),
    factor(crash_test$CarType, levels = class_levels),
    deparse.level = 0
  ),
  positive = "Hatchback"
)

## Confusion Matrix and Statistics
## 
##            
##             Hatchback SUV
##   Hatchback        10   0
##   SUV               0  10
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8316, 1)
##     No Information Rate : 0.5        
##     P-Value [Acc > NIR] : 9.537e-07  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0        
##             Specificity : 1.0        
##          Pos Pred Value : 1.0        
##          Neg Pred Value : 1.0        
##              Prevalence : 0.5        
##          Detection Rate : 0.5        
##    Detection Prevalence : 0.5        
##       Balanced Accuracy : 1.0        
##                                      
##        'Positive' Class : Hatchback  
##