Load the files
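
Assumed setup (the original setup chunk is not shown): the packages below cover the functions used throughout this analysis, so they are assumed to be loaded up front. A minimal sketch:

library(readr)         # read_delim()
library(dplyr)         # bind_cols(), group_by(), summarize(), the pipe
library(ggplot2)       # plots
library(caret)         # confusionMatrix()
library(ROCR)          # prediction(), performance()
library(e1071)         # svm(), tune.svm()
library(rpart)         # rpart() decision trees
library(rattle)        # fancyRpartPlot()
library(randomForest)  # randomForest(), varImpPlot()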

audit_risk <- read_delim("audit_risk.csv", delim = ",")
## Rows: 776 Columns: 27
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): LOCATION_ID
## dbl (26): Sector_score, PARA_A, Score_A, Risk_A, PARA_B, Score_B, Risk_B, TO...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trial <- read_delim("trial.csv", delim = ",")
## Rows: 776 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): LOCATION_ID
## dbl (17): Sector_score, PARA_A, SCORE_A, PARA_B, SCORE_B, TOTAL, numbers, Ma...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Overview of the datasets

str(audit_risk)
## spec_tbl_df [776 x 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Sector_score  : num [1:776] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
##  $ LOCATION_ID   : chr [1:776] "23" "6" "6" "6" ...
##  $ PARA_A        : num [1:776] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
##  $ Score_A       : num [1:776] 0.6 0.2 0.2 0.2 0.2 0.2 0.4 0.6 0.6 0.6 ...
##  $ Risk_A        : num [1:776] 2.508 0 0.102 0 0 ...
##  $ PARA_B        : num [1:776] 2.5 4.83 0.23 10.8 0.08 ...
##  $ Score_B       : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.4 0.6 0.6 0.2 ...
##  $ Risk_B        : num [1:776] 0.5 0.966 0.046 6.48 0.016 ...
##  $ TOTAL         : num [1:776] 6.68 4.83 0.74 10.8 0.08 ...
##  $ numbers       : num [1:776] 5 5 5 6 5 5 5 5.5 5.5 5 ...
##  $ Score_C       : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.2 0.4 0.4 0.2 ...
##  $ Risk_C        : num [1:776] 1 1 1 3.6 1 1 1 2.2 2.2 1 ...
##  $ Money_Value   : num [1:776] 3.38 0.94 0 11.75 0 ...
##  $ Score_D       : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.6 0.4 0.4 0.2 ...
##  $ Risk_D        : num [1:776] 0.676 0.188 0 7.05 0 ...
##  $ District_Loss : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  $ PROB          : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ Risk_E        : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ History       : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Prob          : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ Risk_F        : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Score         : num [1:776] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
##  $ Inherent_Risk : num [1:776] 8.57 2.55 1.55 17.53 1.42 ...
##  $ CONTROL_RISK  : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ Detection_Risk: num [1:776] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ Audit_Risk    : num [1:776] 1.715 0.511 0.31 3.506 0.283 ...
##  $ Risk          : num [1:776] 1 0 0 1 0 0 1 1 1 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Sector_score = col_double(),
##   ..   LOCATION_ID = col_character(),
##   ..   PARA_A = col_double(),
##   ..   Score_A = col_double(),
##   ..   Risk_A = col_double(),
##   ..   PARA_B = col_double(),
##   ..   Score_B = col_double(),
##   ..   Risk_B = col_double(),
##   ..   TOTAL = col_double(),
##   ..   numbers = col_double(),
##   ..   Score_C = col_double(),
##   ..   Risk_C = col_double(),
##   ..   Money_Value = col_double(),
##   ..   Score_D = col_double(),
##   ..   Risk_D = col_double(),
##   ..   District_Loss = col_double(),
##   ..   PROB = col_double(),
##   ..   Risk_E = col_double(),
##   ..   History = col_double(),
##   ..   Prob = col_double(),
##   ..   Risk_F = col_double(),
##   ..   Score = col_double(),
##   ..   Inherent_Risk = col_double(),
##   ..   CONTROL_RISK = col_double(),
##   ..   Detection_Risk = col_double(),
##   ..   Audit_Risk = col_double(),
##   ..   Risk = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(trial)
## spec_tbl_df [776 x 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Sector_score : num [1:776] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
##  $ LOCATION_ID  : chr [1:776] "23" "6" "6" "6" ...
##  $ PARA_A       : num [1:776] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
##  $ SCORE_A      : num [1:776] 6 2 2 2 2 2 4 6 6 6 ...
##  $ PARA_B       : num [1:776] 2.5 4.83 0.23 10.8 0.08 ...
##  $ SCORE_B      : num [1:776] 2 2 2 6 2 2 4 6 6 2 ...
##  $ TOTAL        : num [1:776] 6.68 4.83 0.74 10.8 0.08 ...
##  $ numbers      : num [1:776] 5 5 5 6 5 5 5 5.5 5.5 5 ...
##  $ Marks        : num [1:776] 2 2 2 6 2 2 2 4 4 2 ...
##  $ Money_Value  : num [1:776] 3.38 0.94 0 11.75 0 ...
##  $ MONEY_Marks  : num [1:776] 2 2 2 6 2 2 6 4 4 2 ...
##  $ District     : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  $ Loss         : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ LOSS_SCORE   : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  $ History      : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ History_score: num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  $ Score        : num [1:776] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
##  $ Risk         : num [1:776] 1 0 0 1 0 0 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Sector_score = col_double(),
##   ..   LOCATION_ID = col_character(),
##   ..   PARA_A = col_double(),
##   ..   SCORE_A = col_double(),
##   ..   PARA_B = col_double(),
##   ..   SCORE_B = col_double(),
##   ..   TOTAL = col_double(),
##   ..   numbers = col_double(),
##   ..   Marks = col_double(),
##   ..   Money_Value = col_double(),
##   ..   MONEY_Marks = col_double(),
##   ..   District = col_double(),
##   ..   Loss = col_double(),
##   ..   LOSS_SCORE = col_double(),
##   ..   History = col_double(),
##   ..   History_score = col_double(),
##   ..   Score = col_double(),
##   ..   Risk = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Rescaling SCORE_A and SCORE_B in the trial dataset to match the audit dataset

trial$SCORE_A <- trial$SCORE_A/10
trial$SCORE_B <- trial$SCORE_B/10

Checking whether the common columns in the two datasets match

all.equal(trial$SCORE_A, audit_risk$Score_A)
## [1] TRUE
all.equal(trial$SCORE_B, audit_risk$Score_B)
## [1] TRUE
all.equal(trial$Sector_score, audit_risk$Sector_score)
## [1] TRUE
all.equal(trial$LOCATION_ID, audit_risk$LOCATION_ID)
## [1] TRUE
all.equal(trial$PARA_A, audit_risk$PARA_A)
## [1] TRUE
all.equal(trial$PARA_B, audit_risk$PARA_B)
## [1] TRUE
all.equal(trial$History, audit_risk$History)
## [1] TRUE
all.equal(trial$TOTAL, audit_risk$TOTAL)
## [1] TRUE
all.equal(trial$numbers, audit_risk$numbers)
## [1] TRUE
all.equal(trial$Score, audit_risk$Score)
## [1] TRUE
all.equal(trial$Money_Value, audit_risk$Money_Value)
## [1] TRUE
all.equal(trial$District, audit_risk$District_Loss)
## [1] TRUE

Comment: We can see that the shared columns of the audit and trial datasets match, so we can remove these duplicate columns from the trial dataset.

Removing duplicate columns in Trial dataset

trial[, c('Sector_score', 'LOCATION_ID', 'PARA_A', 'SCORE_A', 'PARA_B', 'SCORE_B',
       'TOTAL', 'numbers', 'Score', 'History', 'Money_Value', 'District', 'Risk')] <- list(NULL)

Combining the variables of the audit and trial datasets into a new dataset

audit_new <- bind_cols(audit_risk, trial)

Overview of the audit_new dataset

str(audit_new)
## spec_tbl_df [776 x 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Sector_score  : num [1:776] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
##  $ LOCATION_ID   : chr [1:776] "23" "6" "6" "6" ...
##  $ PARA_A        : num [1:776] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
##  $ Score_A       : num [1:776] 0.6 0.2 0.2 0.2 0.2 0.2 0.4 0.6 0.6 0.6 ...
##  $ Risk_A        : num [1:776] 2.508 0 0.102 0 0 ...
##  $ PARA_B        : num [1:776] 2.5 4.83 0.23 10.8 0.08 ...
##  $ Score_B       : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.4 0.6 0.6 0.2 ...
##  $ Risk_B        : num [1:776] 0.5 0.966 0.046 6.48 0.016 ...
##  $ TOTAL         : num [1:776] 6.68 4.83 0.74 10.8 0.08 ...
##  $ numbers       : num [1:776] 5 5 5 6 5 5 5 5.5 5.5 5 ...
##  $ Score_C       : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.2 0.4 0.4 0.2 ...
##  $ Risk_C        : num [1:776] 1 1 1 3.6 1 1 1 2.2 2.2 1 ...
##  $ Money_Value   : num [1:776] 3.38 0.94 0 11.75 0 ...
##  $ Score_D       : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.6 0.4 0.4 0.2 ...
##  $ Risk_D        : num [1:776] 0.676 0.188 0 7.05 0 ...
##  $ District_Loss : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  $ PROB          : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ Risk_E        : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ History       : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Prob          : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ Risk_F        : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Score         : num [1:776] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
##  $ Inherent_Risk : num [1:776] 8.57 2.55 1.55 17.53 1.42 ...
##  $ CONTROL_RISK  : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ Detection_Risk: num [1:776] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ Audit_Risk    : num [1:776] 1.715 0.511 0.31 3.506 0.283 ...
##  $ Risk          : num [1:776] 1 0 0 1 0 0 1 1 1 0 ...
##  $ Marks         : num [1:776] 2 2 2 6 2 2 2 4 4 2 ...
##  $ MONEY_Marks   : num [1:776] 2 2 2 6 2 2 6 4 4 2 ...
##  $ Loss          : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
##  $ LOSS_SCORE    : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  $ History_score : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Sector_score = col_double(),
##   ..   LOCATION_ID = col_character(),
##   ..   PARA_A = col_double(),
##   ..   Score_A = col_double(),
##   ..   Risk_A = col_double(),
##   ..   PARA_B = col_double(),
##   ..   Score_B = col_double(),
##   ..   Risk_B = col_double(),
##   ..   TOTAL = col_double(),
##   ..   numbers = col_double(),
##   ..   Score_C = col_double(),
##   ..   Risk_C = col_double(),
##   ..   Money_Value = col_double(),
##   ..   Score_D = col_double(),
##   ..   Risk_D = col_double(),
##   ..   District_Loss = col_double(),
##   ..   PROB = col_double(),
##   ..   Risk_E = col_double(),
##   ..   History = col_double(),
##   ..   Prob = col_double(),
##   ..   Risk_F = col_double(),
##   ..   Score = col_double(),
##   ..   Inherent_Risk = col_double(),
##   ..   CONTROL_RISK = col_double(),
##   ..   Detection_Risk = col_double(),
##   ..   Audit_Risk = col_double(),
##   ..   Risk = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Looking for odd patterns

summary(audit_new)
##   Sector_score   LOCATION_ID            PARA_A          Score_A      
##  Min.   : 1.85   Length:776         Min.   : 0.000   Min.   :0.2000  
##  1st Qu.: 2.37   Class :character   1st Qu.: 0.210   1st Qu.:0.2000  
##  Median : 3.89   Mode  :character   Median : 0.875   Median :0.2000  
##  Mean   :20.18                      Mean   : 2.450   Mean   :0.3513  
##  3rd Qu.:55.57                      3rd Qu.: 2.480   3rd Qu.:0.6000  
##  Max.   :59.85                      Max.   :85.000   Max.   :0.6000  
##                                                                      
##      Risk_A           PARA_B            Score_B           Risk_B       
##  Min.   : 0.000   Min.   :   0.000   Min.   :0.2000   Min.   :  0.000  
##  1st Qu.: 0.042   1st Qu.:   0.000   1st Qu.:0.2000   1st Qu.:  0.000  
##  Median : 0.175   Median :   0.405   Median :0.2000   Median :  0.081  
##  Mean   : 1.351   Mean   :  10.800   Mean   :0.3131   Mean   :  6.334  
##  3rd Qu.: 1.488   3rd Qu.:   4.160   3rd Qu.:0.4000   3rd Qu.:  1.841  
##  Max.   :51.000   Max.   :1264.630   Max.   :0.6000   Max.   :758.778  
##                                                                        
##      TOTAL              numbers         Score_C           Risk_C     
##  Min.   :   0.0000   Min.   :5.000   Min.   :0.2000   Min.   :1.000  
##  1st Qu.:   0.5375   1st Qu.:5.000   1st Qu.:0.2000   1st Qu.:1.000  
##  Median :   1.3700   Median :5.000   Median :0.2000   Median :1.000  
##  Mean   :  13.2185   Mean   :5.068   Mean   :0.2237   Mean   :1.153  
##  3rd Qu.:   7.7075   3rd Qu.:5.000   3rd Qu.:0.2000   3rd Qu.:1.000  
##  Max.   :1268.9100   Max.   :9.000   Max.   :0.6000   Max.   :5.400  
##                                                                      
##   Money_Value         Score_D          Risk_D        District_Loss  
##  Min.   :  0.000   Min.   :0.200   Min.   :  0.000   Min.   :2.000  
##  1st Qu.:  0.000   1st Qu.:0.200   1st Qu.:  0.000   1st Qu.:2.000  
##  Median :  0.090   Median :0.200   Median :  0.018   Median :2.000  
##  Mean   : 14.138   Mean   :0.291   Mean   :  8.265   Mean   :2.505  
##  3rd Qu.:  5.595   3rd Qu.:0.400   3rd Qu.:  2.235   3rd Qu.:2.000  
##  Max.   :935.030   Max.   :0.600   Max.   :561.018   Max.   :6.000  
##  NA's   :1                                                          
##       PROB            Risk_E          History            Prob       
##  Min.   :0.2000   Min.   :0.4000   Min.   :0.0000   Min.   :0.2000  
##  1st Qu.:0.2000   1st Qu.:0.4000   1st Qu.:0.0000   1st Qu.:0.2000  
##  Median :0.2000   Median :0.4000   Median :0.0000   Median :0.2000  
##  Mean   :0.2062   Mean   :0.5191   Mean   :0.1044   Mean   :0.2168  
##  3rd Qu.:0.2000   3rd Qu.:0.4000   3rd Qu.:0.0000   3rd Qu.:0.2000  
##  Max.   :0.6000   Max.   :2.4000   Max.   :9.0000   Max.   :0.6000  
##                                                                     
##      Risk_F            Score       Inherent_Risk      CONTROL_RISK   
##  Min.   :0.00000   Min.   :2.000   Min.   :  1.400   Min.   :0.4000  
##  1st Qu.:0.00000   1st Qu.:2.000   1st Qu.:  1.583   1st Qu.:0.4000  
##  Median :0.00000   Median :2.400   Median :  2.214   Median :0.4000  
##  Mean   :0.05361   Mean   :2.703   Mean   : 17.681   Mean   :0.5727  
##  3rd Qu.:0.00000   3rd Qu.:3.250   3rd Qu.: 10.664   3rd Qu.:0.4000  
##  Max.   :5.40000   Max.   :5.200   Max.   :801.262   Max.   :5.8000  
##                                                                      
##  Detection_Risk   Audit_Risk            Risk           Marks      
##  Min.   :0.5    Min.   :  0.2800   Min.   :0.000   Min.   :2.000  
##  1st Qu.:0.5    1st Qu.:  0.3167   1st Qu.:0.000   1st Qu.:2.000  
##  Median :0.5    Median :  0.5556   Median :0.000   Median :2.000  
##  Mean   :0.5    Mean   :  7.1682   Mean   :0.393   Mean   :2.237  
##  3rd Qu.:0.5    3rd Qu.:  3.2499   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :0.5    Max.   :961.5144   Max.   :1.000   Max.   :6.000  
##                                                                   
##   MONEY_Marks        Loss           LOSS_SCORE    History_score  
##  Min.   :2.00   Min.   :0.00000   Min.   :2.000   Min.   :2.000  
##  1st Qu.:2.00   1st Qu.:0.00000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.00   Median :0.00000   Median :2.000   Median :2.000  
##  Mean   :2.91   Mean   :0.02964   Mean   :2.062   Mean   :2.168  
##  3rd Qu.:4.00   3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :6.00   Max.   :2.00000   Max.   :6.000   Max.   :6.000  
## 

Comment: We can see that Money_Value has 1 missing value and that Detection_Risk is constant at 0.5 throughout, so Detection_Risk carries no information and is not needed for our analysis.

Replacing the missing value in Money_Value with the median

audit_new$Money_Value[is.na(audit_new$Money_Value)] <- median(audit_new$Money_Value, na.rm = TRUE)

Checking for missing values in Money_Value again

sum(is.na(audit_new$Money_Value))
## [1] 0

Comment: We can see that Money_Value does not have any missing value now.

Removing Detection_Risk column

audit_new[, c('Detection_Risk')] <- list(NULL)

Checking the unique values in LOCATION_ID column

unique(audit_new$LOCATION_ID)
##  [1] "23"      "6"       "7"       "8"       "13"      "37"      "24"     
##  [8] "3"       "4"       "14"      "5"       "20"      "19"      "21"     
## [15] "22"      "9"       "11"      "12"      "29"      "30"      "38"     
## [22] "31"      "2"       "32"      "16"      "33"      "15"      "36"     
## [29] "34"      "18"      "25"      "39"      "27"      "35"      "40"     
## [36] "41"      "42"      "1"       "28"      "LOHARU"  "NUH"     "SAFIDON"
## [43] "43"      "44"      "17"

Comment: We can see that LOCATION_ID contains a few non-numeric values (LOHARU, NUH, SAFIDON). We will remove these 3 rows from the dataset.

Removing the rows with non-numeric LOCATION_ID values

audit_new <- audit_new[!(audit_new$LOCATION_ID == 'LOHARU' | 
                           audit_new$LOCATION_ID == 'NUH' | 
                           audit_new$LOCATION_ID == 'SAFIDON'),]
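
An equivalent, more concise filter using %in% gives the same result:

audit_new <- audit_new[!(audit_new$LOCATION_ID %in% c("LOHARU", "NUH", "SAFIDON")), ]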

Converting LOCATION_ID from character to factor

audit_new$LOCATION_ID = as.factor(audit_new$LOCATION_ID)

Overview of the dataset

str(audit_new)
## tibble [773 x 31] (S3: tbl_df/tbl/data.frame)
##  $ Sector_score : num [1:773] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
##  $ LOCATION_ID  : Factor w/ 42 levels "1","11","12",..: 15 39 39 39 39 39 40 41 41 41 ...
##  $ PARA_A       : num [1:773] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
##  $ Score_A      : num [1:773] 0.6 0.2 0.2 0.2 0.2 0.2 0.4 0.6 0.6 0.6 ...
##  $ Risk_A       : num [1:773] 2.508 0 0.102 0 0 ...
##  $ PARA_B       : num [1:773] 2.5 4.83 0.23 10.8 0.08 ...
##  $ Score_B      : num [1:773] 0.2 0.2 0.2 0.6 0.2 0.2 0.4 0.6 0.6 0.2 ...
##  $ Risk_B       : num [1:773] 0.5 0.966 0.046 6.48 0.016 ...
##  $ TOTAL        : num [1:773] 6.68 4.83 0.74 10.8 0.08 ...
##  $ numbers      : num [1:773] 5 5 5 6 5 5 5 5.5 5.5 5 ...
##  $ Score_C      : num [1:773] 0.2 0.2 0.2 0.6 0.2 0.2 0.2 0.4 0.4 0.2 ...
##  $ Risk_C       : num [1:773] 1 1 1 3.6 1 1 1 2.2 2.2 1 ...
##  $ Money_Value  : num [1:773] 3.38 0.94 0 11.75 0 ...
##  $ Score_D      : num [1:773] 0.2 0.2 0.2 0.6 0.2 0.2 0.6 0.4 0.4 0.2 ...
##  $ Risk_D       : num [1:773] 0.676 0.188 0 7.05 0 ...
##  $ District_Loss: num [1:773] 2 2 2 2 2 2 2 2 2 2 ...
##  $ PROB         : num [1:773] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ Risk_E       : num [1:773] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ History      : num [1:773] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Prob         : num [1:773] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
##  $ Risk_F       : num [1:773] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Score        : num [1:773] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
##  $ Inherent_Risk: num [1:773] 8.57 2.55 1.55 17.53 1.42 ...
##  $ CONTROL_RISK : num [1:773] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
##  $ Audit_Risk   : num [1:773] 1.715 0.511 0.31 3.506 0.283 ...
##  $ Risk         : num [1:773] 1 0 0 1 0 0 1 1 1 0 ...
##  $ Marks        : num [1:773] 2 2 2 6 2 2 2 4 4 2 ...
##  $ MONEY_Marks  : num [1:773] 2 2 2 6 2 2 6 4 4 2 ...
##  $ Loss         : num [1:773] 0 0 0 0 0 0 0 0 0 0 ...
##  $ LOSS_SCORE   : num [1:773] 2 2 2 2 2 2 2 2 2 2 ...
##  $ History_score: num [1:773] 2 2 2 2 2 2 2 2 2 2 ...

Finding duplicates in the dataset

sum(duplicated(audit_new))
## [1] 13

Comment: There are 13 duplicate rows in the dataset; we remove them because they could bias the analysis.

Removing the duplicates

audit_new <- unique(audit_new)

Checking for class imbalance in the dataset

ggplot(audit_new, aes(x = as.factor(Risk))) +
  geom_bar(aes(fill = as.factor(Risk))) +
  geom_text(stat = "count", aes(label = ..count..), size = 5)

Comment: From the plot above we can see that the two Risk classes are not severely imbalanced.

Checking for correlation between the variables

pairs(audit_new[,c(1, 3, 5, 6, 8, 9, 10, 12, 13, 15, 16, 18, 21, 30, 31 )])

Comments: We can see correlation between PARA_A and Risk_A, and between Money_Value and Risk_D.
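
To quantify what the scatterplot matrix suggests, the pairwise correlations can be computed directly (a quick check, not part of the original output):

round(cor(audit_new[, c("PARA_A", "Risk_A", "Money_Value", "Risk_D")]), 2)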

Do fraudulent transactions occur more for certain money values?

boxplot(Money_Value ~ Risk, data = audit_new)

How does the response variable (Risk) relate to Money_Value?

audit_new %>% 
  group_by(Risk) %>% 
  summarize(min = min(Money_Value),
            median = median(Money_Value),
            mean = round(mean(Money_Value),2),
            max = max(Money_Value))
## # A tibble: 2 x 5
##    Risk   min median  mean    max
##   <dbl> <dbl>  <dbl> <dbl>  <dbl>
## 1     0     0   0     0.37   8.21
## 2     1     0   9.75 35.5  935.

Comments: As expected, the more activity a company has (reflected in its Money_Value), the more likely it is that fraudulent activity is taking place. We can also see that for the no-risk class (Risk = 0) the maximum Money_Value observed is 8.21.

Are there any location ids that have a higher audit risk?

audit_new %>% 
  ggplot(aes(x = LOCATION_ID, y = Audit_Risk)) + 
  geom_point()

Comment: We found that location id 2 contains the observation with the highest Audit_Risk, which corresponds to Risk = 1.

Models

a) Logistic Regression

audit_glm = glm(formula = Risk ~ .,  data = audit_new)
summary(audit_glm)
## 
## Call:
## glm(formula = Risk ~ ., data = audit_new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.66874  -0.12670  -0.01766   0.07071   0.76598  
## 
## Coefficients: (7 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.3830035  1.6954890  -0.226 0.821349    
## Sector_score  -0.0011833  0.0004206  -2.814 0.005034 ** 
## LOCATION_ID11  0.2574022  0.0967371   2.661 0.007974 ** 
## LOCATION_ID12  0.2515845  0.0918968   2.738 0.006345 ** 
## LOCATION_ID13  0.2303455  0.0947018   2.432 0.015253 *  
## LOCATION_ID14  0.2079508  0.1007209   2.065 0.039328 *  
## LOCATION_ID15  0.1766392  0.0942246   1.875 0.061258 .  
## LOCATION_ID16  0.1459474  0.0884235   1.651 0.099282 .  
## LOCATION_ID17  0.2363941  0.2511565   0.941 0.346918    
## LOCATION_ID18  0.2440441  0.1024057   2.383 0.017434 *  
## LOCATION_ID19  0.2550547  0.0872659   2.923 0.003582 ** 
## LOCATION_ID2   0.2346071  0.0902049   2.601 0.009497 ** 
## LOCATION_ID20  0.2893866  0.1351300   2.142 0.032577 *  
## LOCATION_ID21  0.1346268  0.1194071   1.127 0.259936    
## LOCATION_ID22  0.1868199  0.0982779   1.901 0.057723 .  
## LOCATION_ID23  0.7524498  0.2539217   2.963 0.003147 ** 
## LOCATION_ID24  0.1930840  0.2544526   0.759 0.448216    
## LOCATION_ID25  0.2203093  0.1285624   1.714 0.087041 .  
## LOCATION_ID27  0.0641308  0.1261615   0.508 0.611388    
## LOCATION_ID28  0.2810318  0.1171610   2.399 0.016716 *  
## LOCATION_ID29  0.2568824  0.0998976   2.571 0.010334 *  
## LOCATION_ID3   0.1143117  0.1609560   0.710 0.477815    
## LOCATION_ID30  0.2772368  0.1452556   1.909 0.056723 .  
## LOCATION_ID31  0.0744278  0.1092585   0.681 0.495966    
## LOCATION_ID32  0.1395604  0.0946509   1.474 0.140806    
## LOCATION_ID33  0.4240991  0.2604555   1.628 0.103914    
## LOCATION_ID34  0.2705853  0.2545939   1.063 0.288236    
## LOCATION_ID35  0.0326710  0.1894288   0.172 0.863117    
## LOCATION_ID36  0.2167367  0.1460804   1.484 0.138346    
## LOCATION_ID37  0.2301872  0.1132255   2.033 0.042432 *  
## LOCATION_ID38  0.1543471  0.1472464   1.048 0.294900    
## LOCATION_ID39  0.3597260  0.1175724   3.060 0.002301 ** 
## LOCATION_ID4   0.2492708  0.0933776   2.669 0.007774 ** 
## LOCATION_ID40  0.2023971  0.1841862   1.099 0.272204    
## LOCATION_ID41  0.1651569  0.2543457   0.649 0.516333    
## LOCATION_ID42  0.1357814  0.2529688   0.537 0.591611    
## LOCATION_ID43  0.1476109  0.1237343   1.193 0.233289    
## LOCATION_ID44  0.4426633  0.2510519   1.763 0.078300 .  
## LOCATION_ID5   0.2003184  0.0924720   2.166 0.030630 *  
## LOCATION_ID6   0.1920891  0.0944728   2.033 0.042404 *  
## LOCATION_ID7   0.1769363  0.1454134   1.217 0.224100    
## LOCATION_ID8   0.2128274  0.0878347   2.423 0.015645 *  
## LOCATION_ID9   0.2478953  0.0921901   2.689 0.007339 ** 
## PARA_A        -0.0720880  0.0441813  -1.632 0.103208    
## Score_A        0.4903076  0.0745215   6.579 9.29e-11 ***
## Risk_A         0.1298831  0.0690804   1.880 0.060502 .  
## PARA_B         0.0663981  0.0226645   2.930 0.003505 ** 
## Score_B        0.7865866  0.0814324   9.659  < 2e-16 ***
## Risk_B        -0.0987223  0.0275721  -3.581 0.000367 ***
## TOTAL         -0.0056284  0.0138251  -0.407 0.684046    
## numbers       -0.1547765  0.3384589  -0.457 0.647600    
## Score_C       -1.0864821  2.3311008  -0.466 0.641303    
## Risk_C         0.2015124  0.4940976   0.408 0.683517    
## Money_Value    0.0563557  0.0125810   4.479 8.75e-06 ***
## Score_D        1.1969019  0.0849080  14.096  < 2e-16 ***
## Risk_D        -0.0931575  0.0209579  -4.445 1.02e-05 ***
## District_Loss  0.1545024  0.0365866   4.223 2.73e-05 ***
## PROB           1.3855819  1.6713128   0.829 0.407367    
## Risk_E        -0.0015930  0.1682499  -0.009 0.992448    
## History        0.0188982  0.1990701   0.095 0.924396    
## Prob          -0.3868537  0.3503493  -1.104 0.269890    
## Risk_F         0.1321981  0.3095144   0.427 0.669428    
## Score                 NA         NA      NA       NA    
## Inherent_Risk         NA         NA      NA       NA    
## CONTROL_RISK          NA         NA      NA       NA    
## Audit_Risk    -0.0029946  0.0008836  -3.389 0.000741 ***
## Marks                 NA         NA      NA       NA    
## MONEY_Marks           NA         NA      NA       NA    
## Loss          -0.0792544  0.2942355  -0.269 0.787735    
## LOSS_SCORE            NA         NA      NA       NA    
## History_score         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.05557932)
## 
##     Null deviance: 182.400  on 759  degrees of freedom
## Residual deviance:  38.683  on 696  degrees of freedom
## AIC: 23.573
## 
## Number of Fisher Scoring iterations: 2
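
Note: glm() fits a gaussian (identity-link) model when no family is specified, which the output above confirms ("Dispersion parameter for gaussian family"), so the model here is effectively a linear probability model rather than a logistic regression in the strict sense. A binomial fit would pass the family explicitly; a minimal sketch (audit_logit is just an illustrative name, not the model whose output is shown above):

audit_logit = glm(Risk ~ ., family = binomial, data = audit_new)
summary(audit_logit)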

Comments: I ran a full model to check which variables have an effect on the response variable. However, while trying different combinations of variables, the significance levels kept changing, so I decided to apply stepwise selection.

Stepwise AIC selection

glm.null.audit = glm(Risk ~ 1, data = audit_new)
glm.full.audit = glm(Risk ~ ., 
                      data = audit_new)
step.AIC = step(glm.null.audit, scope = list(upper = glm.full.audit),
                direction = "both", test = "Chisq", trace = FALSE)


step.AIC
## 
## Call:  glm(formula = Risk ~ Score + District_Loss + Score_D + Sector_score + 
##     Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
##     Money_Value + Risk_D, data = audit_new)
## 
## Coefficients:
##   (Intercept)          Score  District_Loss        Score_D   Sector_score  
##     -0.630298      -0.160586       0.166156       1.430392      -0.001281  
##    Audit_Risk        Score_A           Loss          TOTAL         Risk_F  
##     -0.002975       0.719352       0.200060       0.001495       0.144392  
##       Score_B    Money_Value         Risk_D  
##      1.134973       0.070668      -0.117111  
## 
## Degrees of Freedom: 759 Total (i.e. Null);  747 Residual
## Null Deviance:       182.4 
## Residual Deviance: 42.61     AIC: -5.001
summary(step.AIC)
## 
## Call:
## glm(formula = Risk ~ Score + District_Loss + Score_D + Sector_score + 
##     Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
##     Money_Value + Risk_D, data = audit_new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.66946  -0.14249  -0.03200   0.03947   0.75969  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.6302977  0.0524977 -12.006  < 2e-16 ***
## Score         -0.1605864  0.0621197  -2.585  0.00992 ** 
## District_Loss  0.1661558  0.0101102  16.434  < 2e-16 ***
## Score_D        1.4303920  0.1180163  12.120  < 2e-16 ***
## Sector_score  -0.0012815  0.0004096  -3.129  0.00182 ** 
## Audit_Risk    -0.0029755  0.0006844  -4.347 1.57e-05 ***
## Score_A        0.7193523  0.0928920   7.744 3.14e-14 ***
## Loss           0.2000604  0.0480261   4.166 3.47e-05 ***
## TOTAL          0.0014947  0.0004942   3.024  0.00258 ** 
## Risk_F         0.1443921  0.0345638   4.178 3.29e-05 ***
## Score_B        1.1349726  0.2090378   5.430 7.65e-08 ***
## Money_Value    0.0706676  0.0122357   5.776 1.13e-08 ***
## Risk_D        -0.1171110  0.0203692  -5.749 1.31e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.05703748)
## 
##     Null deviance: 182.400  on 759  degrees of freedom
## Residual deviance:  42.607  on 747  degrees of freedom
## AIC: -5.0014
## 
## Number of Fisher Scoring iterations: 2

Comment: As the result of the AIC selection shows, all the variables in the final model are significant, and we will use these predictors going forward for all the models.

Train Test split

set.seed(1)
row.number = sample(1:nrow(audit_new), 0.8*nrow(audit_new))
train_audit = audit_new[row.number,]
test_audit = audit_new[-row.number,]
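
It is worth confirming that the random split roughly preserves the class balance in the two subsets (a quick check, not part of the original output):

prop.table(table(train_audit$Risk))
prop.table(table(test_audit$Risk))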

Logistic Regression on Training dataset

audit_glm1 = glm(formula = Risk ~ Score + District_Loss + Score_D + Sector_score + 
    Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
    Money_Value + Risk_D, data = train_audit)

Prediction of the Probabilities

test_audit$PredProb = predict.glm(audit_glm1, newdata = test_audit, type = "response")

Establishing probability cutoff

test_audit$PredSub = ifelse(test_audit$PredProb >= 0.5, "1", "0")


table(test_audit$PredSub)
## 
##  0  1 
## 89 63

Confusion Matrix

caret::confusionMatrix(as.factor(test_audit$Risk), as.factor(test_audit$PredSub))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 81  4
##          1  8 59
##                                           
##                Accuracy : 0.9211          
##                  95% CI : (0.8662, 0.9585)
##     No Information Rate : 0.5855          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8388          
##                                           
##  Mcnemar's Test P-Value : 0.3865          
##                                           
##             Sensitivity : 0.9101          
##             Specificity : 0.9365          
##          Pos Pred Value : 0.9529          
##          Neg Pred Value : 0.8806          
##              Prevalence : 0.5855          
##          Detection Rate : 0.5329          
##    Detection Prevalence : 0.5592          
##       Balanced Accuracy : 0.9233          
##                                           
##        'Positive' Class : 0               
## 
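
Note: caret::confusionMatrix() treats its first argument as the predictions and its second as the reference (true) classes, so the call above passes them in the opposite order. Accuracy is unaffected, but the printed sensitivity, specificity and predictive values then describe the swapped roles. The conventional call would look like this (a sketch):

caret::confusionMatrix(as.factor(test_audit$PredSub), as.factor(test_audit$Risk), positive = "1")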

Comment: Here we can see that the accuracy is 92.11%. Now we will check the optimal cutoff.

Sensitivity-Specificity plot

PredProb1 = prediction(predict.glm(step.AIC, newdata = test_audit, type = "response"), test_audit$Risk)

# Computing threshold for cutoff to best trade off sensitivity and specificity
plot(unlist(performance(PredProb1,'sens')@x.values),unlist(performance(PredProb1,'sens')@y.values), type='l', lwd=2, ylab = "", xlab = 'Cutoff')
mtext('Sensitivity',side=2)
mtext('Sensitivity vs. Specificity Plot for AIC Model', side=3)

# Second specificity in same plot
par(new=TRUE)
plot(unlist(performance(PredProb1,'spec')@x.values),unlist(performance(PredProb1,'spec')@y.values), type='l', lwd=2,col='red', ylab = "", xlab = 'Cutoff')
axis(4,at=seq(0,1,0.2)) 
mtext('Specificity',side=4, col='red')

par(new=TRUE)

min.diff <- which.min(abs(unlist(performance(PredProb1, "sens")@y.values) - unlist(performance(PredProb1, "spec")@y.values)))
min.x <- unlist(performance(PredProb1, "sens")@x.values)[min.diff]
min.y <- unlist(performance(PredProb1, "spec")@y.values)[min.diff]
optimal <- min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
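
A complementary single-number summary is the area under the ROC curve, which ROCR can compute from the same prediction object (a quick check, not part of the original output):

auc = performance(PredProb1, "auc")@y.values[[1]]
auc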

Comment: We can see from the Sensitivity-Specificity plot that the optimal cutoff is 0.40. Now we will use this cutoff for our prediction.

Using the Sensitivity-Specificity cutoff

test_audit$PredSubOptimal = ifelse(test_audit$PredProb >= 0.40, "1", "0")


table(test_audit$PredSubOptimal)
## 
##  0  1 
## 85 67

Confusion Matrix using the cutoff

glm.cm = caret::confusionMatrix(as.factor(test_audit$Risk),as.factor(test_audit$PredSubOptimal))
glm.cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 81  4
##          1  4 63
##                                          
##                Accuracy : 0.9474         
##                  95% CI : (0.8989, 0.977)
##     No Information Rate : 0.5592         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8932         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9529         
##             Specificity : 0.9403         
##          Pos Pred Value : 0.9529         
##          Neg Pred Value : 0.9403         
##              Prevalence : 0.5592         
##          Detection Rate : 0.5329         
##    Detection Prevalence : 0.5592         
##       Balanced Accuracy : 0.9466         
##                                          
##        'Positive' Class : 0              
## 

Logistic Regression Model Performance

glm.accuracy = glm.cm$overall[1]
glm.accuracy
##  Accuracy 
## 0.9473684

Comment: We can see that the accuracy of the logistic regression model increased after applying the optimal cutoff.

b) SVM

Tuning the RBF Kernel

tuned = tune.svm(Risk ~ Score + District_Loss + Score_D + Sector_score + 
                   Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
                   Money_Value + Risk_D , 
                 data = train_audit, gamma = seq(0.1, 1, by = 0.5), 
                 cost = seq(0.1, 1, by = 0.5), scale = TRUE)
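
Note that seq(0.1, 1, by = 0.5) yields only 0.1 and 0.6, so the grid above contains just four gamma/cost combinations. A wider search over a logarithmic grid could be run the same way (tuned_wide is an illustrative name):

tuned_wide = tune.svm(Risk ~ Score + District_Loss + Score_D + Sector_score + 
                        Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
                        Money_Value + Risk_D , 
                      data = train_audit, gamma = 10^(-3:0), 
                      cost = 10^(-1:2), scale = TRUE)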

Fitting the model using the best parameter values

svm.fit.glm = svm(Risk ~ Score + District_Loss + Score_D + Sector_score + 
                   Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
                   Money_Value + Risk_D , 
                  data = train_audit, gamma = tuned$best.parameters$gamma, 
                  cost = tuned$best.parameters$cost, scale = TRUE)
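
Because Risk is numeric in audit_new, svm() defaults to eps-regression here, so the predictions in the next step are continuous values that get thresholded at 0.5. An explicit classification SVM would pass the response as a factor instead; a sketch with the same predictors and tuned parameters (svm.fit.cls is an illustrative name):

svm.fit.cls = svm(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score + 
                    Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B + 
                    Money_Value + Risk_D , 
                  data = train_audit, gamma = tuned$best.parameters$gamma, 
                  cost = tuned$best.parameters$cost, scale = TRUE)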

Predictions on test dataset for SVM

svm.preds.glm = predict(svm.fit.glm, test_audit, type = "response")

test_audit$PredSubSVM = ifelse(svm.preds.glm >= 0.5, "1", "0")


svm.cm = caret::confusionMatrix(as.factor(test_audit$Risk), as.factor(test_audit$PredSubSVM))
svm.cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 82  3
##          1  2 65
##                                           
##                Accuracy : 0.9671          
##                  95% CI : (0.9249, 0.9892)
##     No Information Rate : 0.5526          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9334          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9762          
##             Specificity : 0.9559          
##          Pos Pred Value : 0.9647          
##          Neg Pred Value : 0.9701          
##              Prevalence : 0.5526          
##          Detection Rate : 0.5395          
##    Detection Prevalence : 0.5592          
##       Balanced Accuracy : 0.9660          
##                                           
##        'Positive' Class : 0               
## 

SVM Model Performance

svm.accuracy = svm.cm$overall[1]
svm.accuracy
##  Accuracy 
## 0.9671053

Comment: As expected, the tuned SVM performed better than the logistic regression model.

c) Decision Tree

Build a decision tree model to predict Risk

dt.audit = rpart(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score + 
                   Score_A + Loss + TOTAL + Risk_F + Score_B + 
                   Money_Value + Risk_D , 
                     data = train_audit) 

rattle::fancyRpartPlot(dt.audit, sub = "")

Predictions on test dataset for Decision Tree

dt.audit.pred = predict(dt.audit, test_audit, type = "class")

Confusion matrix and accuracy of model

dt.cm <- confusionMatrix(as.factor(dt.audit.pred), as.factor(test_audit$Risk), positive = '1')
dt.cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 82  0
##          1  3 67
##                                           
##                Accuracy : 0.9803          
##                  95% CI : (0.9434, 0.9959)
##     No Information Rate : 0.5592          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9602          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9647          
##          Pos Pred Value : 0.9571          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.4408          
##          Detection Rate : 0.4408          
##    Detection Prevalence : 0.4605          
##       Balanced Accuracy : 0.9824          
##                                           
##        'Positive' Class : 1               
## 

Decision Tree Model Performance

dt.accuracy <- dt.cm$overall[1]
dt.accuracy
##  Accuracy 
## 0.9802632

Comment: We can see that the decision tree's accuracy is higher than that of both the logistic regression and the SVM models.

d) Random Forest

Tuning forest hyper-parameters for predictive accuracy

mtry.values = seq(2,6,1)
nodesize.values = seq(2,8,2)
ntree.values = seq(1e3,6e3,1e3)
 
hyper_grid = expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values) #df with all combinations
oob_err = c()

Train a Random Forest model for each hyper-parameter combination

for (i in 1:nrow(hyper_grid)) {
set.seed(123)
   model = randomForest(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score + 
                   Score_A + Loss + TOTAL + Risk_F + Score_B + 
                   Money_Value + Risk_D,
                  data = train_audit,
                  mtry = hyper_grid$mtry[i],
                  nodesize = hyper_grid$nodesize[i],
                  ntree = hyper_grid$ntree[i])  
  
    # Store OOB error for the model                      
    oob_err[i] = model$err.rate[length(model$err.rate)]
}

# Identify optimal set of hyperparameters based on OOB error
opt_i = which.min(oob_err)
print(hyper_grid[opt_i,])
##   mtry nodesize ntree
## 3    4        2  1000
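
Note: for a two-class classification forest, model$err.rate is an ntree x 3 matrix with columns OOB, 0 and 1, so indexing it with length(model$err.rate) returns the class-1 error of the final tree rather than the overall OOB error. If the overall OOB error at the last tree is what is wanted, the loop could store it as (a sketch):

oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]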

Build the random forest model using the best hyperparameters

set.seed(123)
rf_model01 <- randomForest(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score +
                             Score_A + Loss + TOTAL + Risk_F + Score_B +
                             Money_Value + Risk_D,
                             data = train_audit, 
                             mtry=4, 
                             ntree=1000, 
                             nodesize=2, 
                             importance=TRUE)
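
Since importance = TRUE is set, the variable importance scores can also be inspected (not shown in the original output):

importance(rf_model01)
varImpPlot(rf_model01)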

Predicting Risk on the test set using the Random Forest model

test_audit$pred_audit <- predict(rf_model01, newdata = test_audit) # predict class

rf.cm <- caret::confusionMatrix(as.factor(test_audit$pred_audit), as.factor(test_audit$Risk))
rf.cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 83  0
##          1  2 67
##                                           
##                Accuracy : 0.9868          
##                  95% CI : (0.9533, 0.9984)
##     No Information Rate : 0.5592          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9734          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9765          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9710          
##              Prevalence : 0.5592          
##          Detection Rate : 0.5461          
##    Detection Prevalence : 0.5461          
##       Balanced Accuracy : 0.9882          
##                                           
##        'Positive' Class : 0               
## 

Random Forest Model Performance

rf.accuracy <- rf.cm$overall[1]
rf.accuracy
##  Accuracy 
## 0.9868421

Comment: As we can see, Random Forest outperformed all the other models, as it tends to be fairly robust to missing values, outliers, and unbalanced data.

Model comparison

accuracy.df = data.frame(glm.accuracy, svm.accuracy, dt.accuracy, rf.accuracy)
colnames(accuracy.df) <- c("Logistic Regression", "SVM", "Decision Tree", "Random Forest")

accuracy.df
##          Logistic Regression       SVM Decision Tree Random Forest
## Accuracy           0.9473684 0.9671053     0.9802632     0.9868421
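
The same comparison can be shown as a simple bar chart (a sketch built from the accuracy values computed above):

acc_long = data.frame(Model = names(accuracy.df), Accuracy = unlist(accuracy.df))
ggplot(acc_long, aes(x = reorder(Model, Accuracy), y = Accuracy)) +
  geom_col() +
  coord_flip() +
  labs(x = "Model", y = "Test accuracy")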