audit_risk <- read_delim("audit_risk.csv", delim = ",")
## Rows: 776 Columns: 27
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): LOCATION_ID
## dbl (26): Sector_score, PARA_A, Score_A, Risk_A, PARA_B, Score_B, Risk_B, TO...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trial <- read_delim("trial.csv", delim = ",")
## Rows: 776 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): LOCATION_ID
## dbl (17): Sector_score, PARA_A, SCORE_A, PARA_B, SCORE_B, TOTAL, numbers, Ma...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(audit_risk)
## spec_tbl_df [776 x 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Sector_score : num [1:776] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
## $ LOCATION_ID : chr [1:776] "23" "6" "6" "6" ...
## $ PARA_A : num [1:776] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
## $ Score_A : num [1:776] 0.6 0.2 0.2 0.2 0.2 0.2 0.4 0.6 0.6 0.6 ...
## $ Risk_A : num [1:776] 2.508 0 0.102 0 0 ...
## $ PARA_B : num [1:776] 2.5 4.83 0.23 10.8 0.08 ...
## $ Score_B : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.4 0.6 0.6 0.2 ...
## $ Risk_B : num [1:776] 0.5 0.966 0.046 6.48 0.016 ...
## $ TOTAL : num [1:776] 6.68 4.83 0.74 10.8 0.08 ...
## $ numbers : num [1:776] 5 5 5 6 5 5 5 5.5 5.5 5 ...
## $ Score_C : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.2 0.4 0.4 0.2 ...
## $ Risk_C : num [1:776] 1 1 1 3.6 1 1 1 2.2 2.2 1 ...
## $ Money_Value : num [1:776] 3.38 0.94 0 11.75 0 ...
## $ Score_D : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.6 0.4 0.4 0.2 ...
## $ Risk_D : num [1:776] 0.676 0.188 0 7.05 0 ...
## $ District_Loss : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## $ PROB : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ Risk_E : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ History : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ Prob : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ Risk_F : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ Score : num [1:776] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
## $ Inherent_Risk : num [1:776] 8.57 2.55 1.55 17.53 1.42 ...
## $ CONTROL_RISK : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ Detection_Risk: num [1:776] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ Audit_Risk : num [1:776] 1.715 0.511 0.31 3.506 0.283 ...
## $ Risk : num [1:776] 1 0 0 1 0 0 1 1 1 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Sector_score = col_double(),
## .. LOCATION_ID = col_character(),
## .. PARA_A = col_double(),
## .. Score_A = col_double(),
## .. Risk_A = col_double(),
## .. PARA_B = col_double(),
## .. Score_B = col_double(),
## .. Risk_B = col_double(),
## .. TOTAL = col_double(),
## .. numbers = col_double(),
## .. Score_C = col_double(),
## .. Risk_C = col_double(),
## .. Money_Value = col_double(),
## .. Score_D = col_double(),
## .. Risk_D = col_double(),
## .. District_Loss = col_double(),
## .. PROB = col_double(),
## .. Risk_E = col_double(),
## .. History = col_double(),
## .. Prob = col_double(),
## .. Risk_F = col_double(),
## .. Score = col_double(),
## .. Inherent_Risk = col_double(),
## .. CONTROL_RISK = col_double(),
## .. Detection_Risk = col_double(),
## .. Audit_Risk = col_double(),
## .. Risk = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(trial)
## spec_tbl_df [776 x 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Sector_score : num [1:776] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
## $ LOCATION_ID : chr [1:776] "23" "6" "6" "6" ...
## $ PARA_A : num [1:776] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
## $ SCORE_A : num [1:776] 6 2 2 2 2 2 4 6 6 6 ...
## $ PARA_B : num [1:776] 2.5 4.83 0.23 10.8 0.08 ...
## $ SCORE_B : num [1:776] 2 2 2 6 2 2 4 6 6 2 ...
## $ TOTAL : num [1:776] 6.68 4.83 0.74 10.8 0.08 ...
## $ numbers : num [1:776] 5 5 5 6 5 5 5 5.5 5.5 5 ...
## $ Marks : num [1:776] 2 2 2 6 2 2 2 4 4 2 ...
## $ Money_Value : num [1:776] 3.38 0.94 0 11.75 0 ...
## $ MONEY_Marks : num [1:776] 2 2 2 6 2 2 6 4 4 2 ...
## $ District : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## $ Loss : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ LOSS_SCORE : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## $ History : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ History_score: num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## $ Score : num [1:776] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
## $ Risk : num [1:776] 1 0 0 1 0 0 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Sector_score = col_double(),
## .. LOCATION_ID = col_character(),
## .. PARA_A = col_double(),
## .. SCORE_A = col_double(),
## .. PARA_B = col_double(),
## .. SCORE_B = col_double(),
## .. TOTAL = col_double(),
## .. numbers = col_double(),
## .. Marks = col_double(),
## .. Money_Value = col_double(),
## .. MONEY_Marks = col_double(),
## .. District = col_double(),
## .. Loss = col_double(),
## .. LOSS_SCORE = col_double(),
## .. History = col_double(),
## .. History_score = col_double(),
## .. Score = col_double(),
## .. Risk = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
trial$SCORE_A <- trial$SCORE_A/10
trial$SCORE_B <- trial$SCORE_B/10
all.equal(trial$SCORE_A, audit_risk$Score_A)
## [1] TRUE
all.equal(trial$SCORE_B, audit_risk$Score_B)
## [1] TRUE
all.equal(trial$Sector_score, audit_risk$Sector_score)
## [1] TRUE
all.equal(trial$LOCATION_ID, audit_risk$LOCATION_ID)
## [1] TRUE
all.equal(trial$PARA_A, audit_risk$PARA_A)
## [1] TRUE
all.equal(trial$PARA_B, audit_risk$PARA_B)
## [1] TRUE
all.equal(trial$History, audit_risk$History)
## [1] TRUE
all.equal(trial$TOTAL, audit_risk$TOTAL)
## [1] TRUE
all.equal(trial$numbers, audit_risk$numbers)
## [1] TRUE
all.equal(trial$Score, audit_risk$Score)
## [1] TRUE
all.equal(trial$Money_Value, audit_risk$Money_Value)
## [1] TRUE
all.equal(trial$District, audit_risk$District_Loss)
## [1] TRUE
Comment: The shared columns of the Audit and Trial datasets match, so we can drop those duplicate columns from Trial before combining the two.
trial[, c('Sector_score', 'LOCATION_ID', 'PARA_A', 'SCORE_A', 'PARA_B', 'SCORE_B',
'TOTAL', 'numbers', 'Score', 'History', 'Money_Value', 'District', 'Risk')] <- list(NULL)
audit_new <- bind_cols(audit_risk, trial)
str(audit_new)
## spec_tbl_df [776 x 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Sector_score : num [1:776] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
## $ LOCATION_ID : chr [1:776] "23" "6" "6" "6" ...
## $ PARA_A : num [1:776] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
## $ Score_A : num [1:776] 0.6 0.2 0.2 0.2 0.2 0.2 0.4 0.6 0.6 0.6 ...
## $ Risk_A : num [1:776] 2.508 0 0.102 0 0 ...
## $ PARA_B : num [1:776] 2.5 4.83 0.23 10.8 0.08 ...
## $ Score_B : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.4 0.6 0.6 0.2 ...
## $ Risk_B : num [1:776] 0.5 0.966 0.046 6.48 0.016 ...
## $ TOTAL : num [1:776] 6.68 4.83 0.74 10.8 0.08 ...
## $ numbers : num [1:776] 5 5 5 6 5 5 5 5.5 5.5 5 ...
## $ Score_C : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.2 0.4 0.4 0.2 ...
## $ Risk_C : num [1:776] 1 1 1 3.6 1 1 1 2.2 2.2 1 ...
## $ Money_Value : num [1:776] 3.38 0.94 0 11.75 0 ...
## $ Score_D : num [1:776] 0.2 0.2 0.2 0.6 0.2 0.2 0.6 0.4 0.4 0.2 ...
## $ Risk_D : num [1:776] 0.676 0.188 0 7.05 0 ...
## $ District_Loss : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## $ PROB : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ Risk_E : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ History : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ Prob : num [1:776] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ Risk_F : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ Score : num [1:776] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
## $ Inherent_Risk : num [1:776] 8.57 2.55 1.55 17.53 1.42 ...
## $ CONTROL_RISK : num [1:776] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ Detection_Risk: num [1:776] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ Audit_Risk : num [1:776] 1.715 0.511 0.31 3.506 0.283 ...
## $ Risk : num [1:776] 1 0 0 1 0 0 1 1 1 0 ...
## $ Marks : num [1:776] 2 2 2 6 2 2 2 4 4 2 ...
## $ MONEY_Marks : num [1:776] 2 2 2 6 2 2 6 4 4 2 ...
## $ Loss : num [1:776] 0 0 0 0 0 0 0 0 0 0 ...
## $ LOSS_SCORE : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## $ History_score : num [1:776] 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "spec")=
## .. cols(
## .. Sector_score = col_double(),
## .. LOCATION_ID = col_character(),
## .. PARA_A = col_double(),
## .. Score_A = col_double(),
## .. Risk_A = col_double(),
## .. PARA_B = col_double(),
## .. Score_B = col_double(),
## .. Risk_B = col_double(),
## .. TOTAL = col_double(),
## .. numbers = col_double(),
## .. Score_C = col_double(),
## .. Risk_C = col_double(),
## .. Money_Value = col_double(),
## .. Score_D = col_double(),
## .. Risk_D = col_double(),
## .. District_Loss = col_double(),
## .. PROB = col_double(),
## .. Risk_E = col_double(),
## .. History = col_double(),
## .. Prob = col_double(),
## .. Risk_F = col_double(),
## .. Score = col_double(),
## .. Inherent_Risk = col_double(),
## .. CONTROL_RISK = col_double(),
## .. Detection_Risk = col_double(),
## .. Audit_Risk = col_double(),
## .. Risk = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(audit_new)
## Sector_score LOCATION_ID PARA_A Score_A
## Min. : 1.85 Length:776 Min. : 0.000 Min. :0.2000
## 1st Qu.: 2.37 Class :character 1st Qu.: 0.210 1st Qu.:0.2000
## Median : 3.89 Mode :character Median : 0.875 Median :0.2000
## Mean :20.18 Mean : 2.450 Mean :0.3513
## 3rd Qu.:55.57 3rd Qu.: 2.480 3rd Qu.:0.6000
## Max. :59.85 Max. :85.000 Max. :0.6000
##
## Risk_A PARA_B Score_B Risk_B
## Min. : 0.000 Min. : 0.000 Min. :0.2000 Min. : 0.000
## 1st Qu.: 0.042 1st Qu.: 0.000 1st Qu.:0.2000 1st Qu.: 0.000
## Median : 0.175 Median : 0.405 Median :0.2000 Median : 0.081
## Mean : 1.351 Mean : 10.800 Mean :0.3131 Mean : 6.334
## 3rd Qu.: 1.488 3rd Qu.: 4.160 3rd Qu.:0.4000 3rd Qu.: 1.841
## Max. :51.000 Max. :1264.630 Max. :0.6000 Max. :758.778
##
## TOTAL numbers Score_C Risk_C
## Min. : 0.0000 Min. :5.000 Min. :0.2000 Min. :1.000
## 1st Qu.: 0.5375 1st Qu.:5.000 1st Qu.:0.2000 1st Qu.:1.000
## Median : 1.3700 Median :5.000 Median :0.2000 Median :1.000
## Mean : 13.2185 Mean :5.068 Mean :0.2237 Mean :1.153
## 3rd Qu.: 7.7075 3rd Qu.:5.000 3rd Qu.:0.2000 3rd Qu.:1.000
## Max. :1268.9100 Max. :9.000 Max. :0.6000 Max. :5.400
##
## Money_Value Score_D Risk_D District_Loss
## Min. : 0.000 Min. :0.200 Min. : 0.000 Min. :2.000
## 1st Qu.: 0.000 1st Qu.:0.200 1st Qu.: 0.000 1st Qu.:2.000
## Median : 0.090 Median :0.200 Median : 0.018 Median :2.000
## Mean : 14.138 Mean :0.291 Mean : 8.265 Mean :2.505
## 3rd Qu.: 5.595 3rd Qu.:0.400 3rd Qu.: 2.235 3rd Qu.:2.000
## Max. :935.030 Max. :0.600 Max. :561.018 Max. :6.000
## NA's :1
## PROB Risk_E History Prob
## Min. :0.2000 Min. :0.4000 Min. :0.0000 Min. :0.2000
## 1st Qu.:0.2000 1st Qu.:0.4000 1st Qu.:0.0000 1st Qu.:0.2000
## Median :0.2000 Median :0.4000 Median :0.0000 Median :0.2000
## Mean :0.2062 Mean :0.5191 Mean :0.1044 Mean :0.2168
## 3rd Qu.:0.2000 3rd Qu.:0.4000 3rd Qu.:0.0000 3rd Qu.:0.2000
## Max. :0.6000 Max. :2.4000 Max. :9.0000 Max. :0.6000
##
## Risk_F Score Inherent_Risk CONTROL_RISK
## Min. :0.00000 Min. :2.000 Min. : 1.400 Min. :0.4000
## 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.: 1.583 1st Qu.:0.4000
## Median :0.00000 Median :2.400 Median : 2.214 Median :0.4000
## Mean :0.05361 Mean :2.703 Mean : 17.681 Mean :0.5727
## 3rd Qu.:0.00000 3rd Qu.:3.250 3rd Qu.: 10.664 3rd Qu.:0.4000
## Max. :5.40000 Max. :5.200 Max. :801.262 Max. :5.8000
##
## Detection_Risk Audit_Risk Risk Marks
## Min. :0.5 Min. : 0.2800 Min. :0.000 Min. :2.000
## 1st Qu.:0.5 1st Qu.: 0.3167 1st Qu.:0.000 1st Qu.:2.000
## Median :0.5 Median : 0.5556 Median :0.000 Median :2.000
## Mean :0.5 Mean : 7.1682 Mean :0.393 Mean :2.237
## 3rd Qu.:0.5 3rd Qu.: 3.2499 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :0.5 Max. :961.5144 Max. :1.000 Max. :6.000
##
## MONEY_Marks Loss LOSS_SCORE History_score
## Min. :2.00 Min. :0.00000 Min. :2.000 Min. :2.000
## 1st Qu.:2.00 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.00 Median :0.00000 Median :2.000 Median :2.000
## Mean :2.91 Mean :0.02964 Mean :2.062 Mean :2.168
## 3rd Qu.:4.00 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :6.00 Max. :2.00000 Max. :6.000 Max. :6.000
##
Comment: Money_Value has 1 missing value, and Detection_Risk is 0.5 for every observation, so Detection_Risk adds no information and can be dropped from the analysis.
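A quick sanity check (a minimal sketch, not part of the original output) confirms both points before imputing and dropping anything:
# count missing values in each column; Money_Value is the only one with an NA
colSums(is.na(audit_new))
# Detection_Risk has a single distinct value (0.5), so it carries no information
length(unique(audit_new$Detection_Risk))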
audit_new$Money_Value[is.na(audit_new$Money_Value)] <- median(audit_new$Money_Value, na.rm = T)
sum(is.na(audit_new$Money_Value))
## [1] 0
Comment: Money_Value no longer has any missing values.
audit_new[, c('Detection_Risk')] <- list(NULL)
unique(audit_new$LOCATION_ID)
## [1] "23" "6" "7" "8" "13" "37" "24"
## [8] "3" "4" "14" "5" "20" "19" "21"
## [15] "22" "9" "11" "12" "29" "30" "38"
## [22] "31" "2" "32" "16" "33" "15" "36"
## [29] "34" "18" "25" "39" "27" "35" "40"
## [36] "41" "42" "1" "28" "LOHARU" "NUH" "SAFIDON"
## [43] "43" "44" "17"
Comment: LOCATION_ID contains three non-numeric values (LOHARU, NUH, and SAFIDON). We will remove the corresponding 3 rows from the dataset.
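Rather than hard-coding the labels, the non-numeric IDs can also be located programmatically; a small sketch, shown before the explicit removal below:
# flag LOCATION_ID values that are not purely numeric
unique(audit_new$LOCATION_ID[!grepl("^[0-9]+$", audit_new$LOCATION_ID)])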
audit_new <- audit_new[!(audit_new$LOCATION_ID %in% c('LOHARU', 'NUH', 'SAFIDON')), ]
audit_new$LOCATION_ID = as.factor(audit_new$LOCATION_ID)
str(audit_new)
## tibble [773 x 31] (S3: tbl_df/tbl/data.frame)
## $ Sector_score : num [1:773] 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 3.89 ...
## $ LOCATION_ID : Factor w/ 42 levels "1","11","12",..: 15 39 39 39 39 39 40 41 41 41 ...
## $ PARA_A : num [1:773] 4.18 0 0.51 0 0 0 1.1 8.5 8.4 3.98 ...
## $ Score_A : num [1:773] 0.6 0.2 0.2 0.2 0.2 0.2 0.4 0.6 0.6 0.6 ...
## $ Risk_A : num [1:773] 2.508 0 0.102 0 0 ...
## $ PARA_B : num [1:773] 2.5 4.83 0.23 10.8 0.08 ...
## $ Score_B : num [1:773] 0.2 0.2 0.2 0.6 0.2 0.2 0.4 0.6 0.6 0.2 ...
## $ Risk_B : num [1:773] 0.5 0.966 0.046 6.48 0.016 ...
## $ TOTAL : num [1:773] 6.68 4.83 0.74 10.8 0.08 ...
## $ numbers : num [1:773] 5 5 5 6 5 5 5 5.5 5.5 5 ...
## $ Score_C : num [1:773] 0.2 0.2 0.2 0.6 0.2 0.2 0.2 0.4 0.4 0.2 ...
## $ Risk_C : num [1:773] 1 1 1 3.6 1 1 1 2.2 2.2 1 ...
## $ Money_Value : num [1:773] 3.38 0.94 0 11.75 0 ...
## $ Score_D : num [1:773] 0.2 0.2 0.2 0.6 0.2 0.2 0.6 0.4 0.4 0.2 ...
## $ Risk_D : num [1:773] 0.676 0.188 0 7.05 0 ...
## $ District_Loss: num [1:773] 2 2 2 2 2 2 2 2 2 2 ...
## $ PROB : num [1:773] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ Risk_E : num [1:773] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ History : num [1:773] 0 0 0 0 0 0 0 0 0 0 ...
## $ Prob : num [1:773] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
## $ Risk_F : num [1:773] 0 0 0 0 0 0 0 0 0 0 ...
## $ Score : num [1:773] 2.4 2 2 4.4 2 2 3.2 4.2 4.2 2.4 ...
## $ Inherent_Risk: num [1:773] 8.57 2.55 1.55 17.53 1.42 ...
## $ CONTROL_RISK : num [1:773] 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
## $ Audit_Risk : num [1:773] 1.715 0.511 0.31 3.506 0.283 ...
## $ Risk : num [1:773] 1 0 0 1 0 0 1 1 1 0 ...
## $ Marks : num [1:773] 2 2 2 6 2 2 2 4 4 2 ...
## $ MONEY_Marks : num [1:773] 2 2 2 6 2 2 6 4 4 2 ...
## $ Loss : num [1:773] 0 0 0 0 0 0 0 0 0 0 ...
## $ LOSS_SCORE : num [1:773] 2 2 2 2 2 2 2 2 2 2 ...
## $ History_score: num [1:773] 2 2 2 2 2 2 2 2 2 2 ...
sum(duplicated(audit_new))
## [1] 13
Comment: There are 13 duplicate rows in the dataset; we will remove them because they could bias the analysis.
audit_new <- unique(audit_new)
ggplot(audit_new, aes(x = as.factor(Risk))) +
  geom_bar(aes(fill = as.factor(Risk))) +
  geom_text(stat = "count", aes(label = ..count..), size = 5)
Comment: From the plot above we can see that the two Risk classes are reasonably balanced.
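A quick tabulation (a small addition, not in the original code) quantifies the balance:
# class counts and proportions for the response variable
table(audit_new$Risk)
round(prop.table(table(audit_new$Risk)), 2)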
pairs(audit_new[,c(1, 3, 5, 6, 8, 9, 10, 12, 13, 15, 16, 18, 21, 30, 31 )])
Comment: The scatterplot matrix shows a clear relationship between PARA_A and Risk_A and between Money_Value and Risk_D.
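These two relationships can be quantified with correlation coefficients; a minimal sketch:
# correlations for the pairs highlighted above
cor(audit_new$PARA_A, audit_new$Risk_A)
cor(audit_new$Money_Value, audit_new$Risk_D)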
Do fraudulent transactions occur more often at certain money values?
boxplot(Money_Value ~ Risk, data = audit_new)
How does the response variable (Risk) relate to Money_Value?
audit_new %>%
group_by(Risk) %>%
summarize(min = min(Money_Value),
median = median(Money_Value),
mean = round(mean(Money_Value),2),
max = max(Money_Value))
## # A tibble: 2 x 5
## Risk min median mean max
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 0.37 8.21
## 2 1 0 9.75 35.5 935.
Comment: As expected, the more activity a company has (reflected in its Money_Value), the more likely it is that some fraudulent activity is taking place. We can also see that for the no-risk group (Risk = 0) the maximum Money_Value is 8.21.
Are there any location IDs with a noticeably higher audit risk?
audit_new %>%
ggplot(aes(x = LOCATION_ID, y = Audit_Risk)) +
geom_point()
Comment: Location ID 2 contains the observation with the highest Audit_Risk, which corresponds to Risk = 1.
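This can be verified by pulling the observations with the largest Audit_Risk; a small dplyr sketch (assuming dplyr is loaded, as elsewhere in this report):
# top audit-risk observations and their locations
audit_new %>%
  arrange(desc(Audit_Risk)) %>%
  dplyr::select(LOCATION_ID, Audit_Risk, Risk) %>%
  head(5)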
audit_glm = glm(formula = Risk ~ ., data = audit_new)
summary(audit_glm)
##
## Call:
## glm(formula = Risk ~ ., data = audit_new)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.66874 -0.12670 -0.01766 0.07071 0.76598
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3830035 1.6954890 -0.226 0.821349
## Sector_score -0.0011833 0.0004206 -2.814 0.005034 **
## LOCATION_ID11 0.2574022 0.0967371 2.661 0.007974 **
## LOCATION_ID12 0.2515845 0.0918968 2.738 0.006345 **
## LOCATION_ID13 0.2303455 0.0947018 2.432 0.015253 *
## LOCATION_ID14 0.2079508 0.1007209 2.065 0.039328 *
## LOCATION_ID15 0.1766392 0.0942246 1.875 0.061258 .
## LOCATION_ID16 0.1459474 0.0884235 1.651 0.099282 .
## LOCATION_ID17 0.2363941 0.2511565 0.941 0.346918
## LOCATION_ID18 0.2440441 0.1024057 2.383 0.017434 *
## LOCATION_ID19 0.2550547 0.0872659 2.923 0.003582 **
## LOCATION_ID2 0.2346071 0.0902049 2.601 0.009497 **
## LOCATION_ID20 0.2893866 0.1351300 2.142 0.032577 *
## LOCATION_ID21 0.1346268 0.1194071 1.127 0.259936
## LOCATION_ID22 0.1868199 0.0982779 1.901 0.057723 .
## LOCATION_ID23 0.7524498 0.2539217 2.963 0.003147 **
## LOCATION_ID24 0.1930840 0.2544526 0.759 0.448216
## LOCATION_ID25 0.2203093 0.1285624 1.714 0.087041 .
## LOCATION_ID27 0.0641308 0.1261615 0.508 0.611388
## LOCATION_ID28 0.2810318 0.1171610 2.399 0.016716 *
## LOCATION_ID29 0.2568824 0.0998976 2.571 0.010334 *
## LOCATION_ID3 0.1143117 0.1609560 0.710 0.477815
## LOCATION_ID30 0.2772368 0.1452556 1.909 0.056723 .
## LOCATION_ID31 0.0744278 0.1092585 0.681 0.495966
## LOCATION_ID32 0.1395604 0.0946509 1.474 0.140806
## LOCATION_ID33 0.4240991 0.2604555 1.628 0.103914
## LOCATION_ID34 0.2705853 0.2545939 1.063 0.288236
## LOCATION_ID35 0.0326710 0.1894288 0.172 0.863117
## LOCATION_ID36 0.2167367 0.1460804 1.484 0.138346
## LOCATION_ID37 0.2301872 0.1132255 2.033 0.042432 *
## LOCATION_ID38 0.1543471 0.1472464 1.048 0.294900
## LOCATION_ID39 0.3597260 0.1175724 3.060 0.002301 **
## LOCATION_ID4 0.2492708 0.0933776 2.669 0.007774 **
## LOCATION_ID40 0.2023971 0.1841862 1.099 0.272204
## LOCATION_ID41 0.1651569 0.2543457 0.649 0.516333
## LOCATION_ID42 0.1357814 0.2529688 0.537 0.591611
## LOCATION_ID43 0.1476109 0.1237343 1.193 0.233289
## LOCATION_ID44 0.4426633 0.2510519 1.763 0.078300 .
## LOCATION_ID5 0.2003184 0.0924720 2.166 0.030630 *
## LOCATION_ID6 0.1920891 0.0944728 2.033 0.042404 *
## LOCATION_ID7 0.1769363 0.1454134 1.217 0.224100
## LOCATION_ID8 0.2128274 0.0878347 2.423 0.015645 *
## LOCATION_ID9 0.2478953 0.0921901 2.689 0.007339 **
## PARA_A -0.0720880 0.0441813 -1.632 0.103208
## Score_A 0.4903076 0.0745215 6.579 9.29e-11 ***
## Risk_A 0.1298831 0.0690804 1.880 0.060502 .
## PARA_B 0.0663981 0.0226645 2.930 0.003505 **
## Score_B 0.7865866 0.0814324 9.659 < 2e-16 ***
## Risk_B -0.0987223 0.0275721 -3.581 0.000367 ***
## TOTAL -0.0056284 0.0138251 -0.407 0.684046
## numbers -0.1547765 0.3384589 -0.457 0.647600
## Score_C -1.0864821 2.3311008 -0.466 0.641303
## Risk_C 0.2015124 0.4940976 0.408 0.683517
## Money_Value 0.0563557 0.0125810 4.479 8.75e-06 ***
## Score_D 1.1969019 0.0849080 14.096 < 2e-16 ***
## Risk_D -0.0931575 0.0209579 -4.445 1.02e-05 ***
## District_Loss 0.1545024 0.0365866 4.223 2.73e-05 ***
## PROB 1.3855819 1.6713128 0.829 0.407367
## Risk_E -0.0015930 0.1682499 -0.009 0.992448
## History 0.0188982 0.1990701 0.095 0.924396
## Prob -0.3868537 0.3503493 -1.104 0.269890
## Risk_F 0.1321981 0.3095144 0.427 0.669428
## Score NA NA NA NA
## Inherent_Risk NA NA NA NA
## CONTROL_RISK NA NA NA NA
## Audit_Risk -0.0029946 0.0008836 -3.389 0.000741 ***
## Marks NA NA NA NA
## MONEY_Marks NA NA NA NA
## Loss -0.0792544 0.2942355 -0.269 0.787735
## LOSS_SCORE NA NA NA NA
## History_score NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.05557932)
##
## Null deviance: 182.400 on 759 degrees of freedom
## Residual deviance: 38.683 on 696 degrees of freedom
## AIC: 23.573
##
## Number of Fisher Scoring iterations: 2
Comment: I ran a full model to check which variables have an effect on the response variable. However, the significance levels kept changing as I tried different combinations of variables, so I decided to apply stepwise selection instead.
glm.null.audit = glm(Risk ~ 1, data = audit_new)
glm.full.audit = glm(Risk ~ ., data = audit_new)
step.AIC = step(glm.null.audit, scope = list(upper = glm.full.audit),
                direction = "both", test = "Chisq", trace = FALSE)
step.AIC
##
## Call: glm(formula = Risk ~ Score + District_Loss + Score_D + Sector_score +
## Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B +
## Money_Value + Risk_D, data = audit_new)
##
## Coefficients:
## (Intercept) Score District_Loss Score_D Sector_score
## -0.630298 -0.160586 0.166156 1.430392 -0.001281
## Audit_Risk Score_A Loss TOTAL Risk_F
## -0.002975 0.719352 0.200060 0.001495 0.144392
## Score_B Money_Value Risk_D
## 1.134973 0.070668 -0.117111
##
## Degrees of Freedom: 759 Total (i.e. Null); 747 Residual
## Null Deviance: 182.4
## Residual Deviance: 42.61 AIC: -5.001
summary(step.AIC)
##
## Call:
## glm(formula = Risk ~ Score + District_Loss + Score_D + Sector_score +
## Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B +
## Money_Value + Risk_D, data = audit_new)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.66946 -0.14249 -0.03200 0.03947 0.75969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6302977 0.0524977 -12.006 < 2e-16 ***
## Score -0.1605864 0.0621197 -2.585 0.00992 **
## District_Loss 0.1661558 0.0101102 16.434 < 2e-16 ***
## Score_D 1.4303920 0.1180163 12.120 < 2e-16 ***
## Sector_score -0.0012815 0.0004096 -3.129 0.00182 **
## Audit_Risk -0.0029755 0.0006844 -4.347 1.57e-05 ***
## Score_A 0.7193523 0.0928920 7.744 3.14e-14 ***
## Loss 0.2000604 0.0480261 4.166 3.47e-05 ***
## TOTAL 0.0014947 0.0004942 3.024 0.00258 **
## Risk_F 0.1443921 0.0345638 4.178 3.29e-05 ***
## Score_B 1.1349726 0.2090378 5.430 7.65e-08 ***
## Money_Value 0.0706676 0.0122357 5.776 1.13e-08 ***
## Risk_D -0.1171110 0.0203692 -5.749 1.31e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.05703748)
##
## Null deviance: 182.400 on 759 degrees of freedom
## Residual deviance: 42.607 on 747 degrees of freedom
## AIC: -5.0014
##
## Number of Fisher Scoring iterations: 2
Comment: All the variables retained in the final AIC-selected model are significant, and we will use this set of predictors for all the models going forward.
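As a convenience, the selected formula can be pulled directly from the step object instead of being retyped; the later models spell it out explicitly, which is equivalent:
# formula selected by the stepwise AIC search
formula(step.AIC)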
set.seed(1)
row.number = sample(1:nrow(audit_new), 0.8*nrow(audit_new))
train_audit = audit_new[row.number,]
test_audit = audit_new[-row.number,]
audit_glm1 = glm(formula = Risk ~ Score + District_Loss + Score_D + Sector_score +
Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B +
Money_Value + Risk_D, data = train_audit)
test_audit$PredProb = predict.glm(audit_glm1, newdata = test_audit, type = "response")
test_audit$PredSub = ifelse(test_audit$PredProb >= 0.5, "1", "0")
table(test_audit$PredSub)
##
## 0 1
## 89 63
caret::confusionMatrix(as.factor(test_audit$Risk), as.factor(test_audit$PredSub))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 81 4
## 1 8 59
##
## Accuracy : 0.9211
## 95% CI : (0.8662, 0.9585)
## No Information Rate : 0.5855
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8388
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.9101
## Specificity : 0.9365
## Pos Pred Value : 0.9529
## Neg Pred Value : 0.8806
## Prevalence : 0.5855
## Detection Rate : 0.5329
## Detection Prevalence : 0.5592
## Balanced Accuracy : 0.9233
##
## 'Positive' Class : 0
##
Comment: With the default 0.5 cutoff the accuracy is 92.11%. Next we look for the optimal cutoff.
PredProb1 = prediction(predict.glm(step.AIC, newdata = test_audit, type = "response"), test_audit$Risk)
# Computing threshold for cutoff to best trade off sensitivity and specificity
plot(unlist(performance(PredProb1,'sens')@x.values),unlist(performance(PredProb1,'sens')@y.values), type='l', lwd=2, ylab = "", xlab = 'Cutoff')
mtext('Sensitivity',side=2)
mtext('Sensitivity vs. Specificity Plot for AIC Model', side=3)
# Second specificity in same plot
par(new=TRUE)
plot(unlist(performance(PredProb1,'spec')@x.values),unlist(performance(PredProb1,'spec')@y.values), type='l', lwd=2,col='red', ylab = "", xlab = 'Cutoff')
axis(4,at=seq(0,1,0.2))
mtext('Specificity',side=4, col='red')
par(new=TRUE)
min.diff <-which.min(abs(unlist(performance(PredProb1, "sens")@y.values) - unlist(performance(PredProb1, "spec")@y.values)))
min.x<-unlist(performance(PredProb1, "sens")@x.values)[min.diff]
min.y<-unlist(performance(PredProb1, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
Comment: The sensitivity-specificity plot puts the optimal cutoff at roughly 0.40, and we use this cutoff for the predictions below.
test_audit$PredSubOptimal = ifelse(test_audit$PredProb >= 0.40, "1", "0")
table(test_audit$PredSubOptimal)
##
## 0 1
## 85 67
glm.cm = caret::confusionMatrix(as.factor(test_audit$Risk),as.factor(test_audit$PredSubOptimal))
glm.cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 81 4
## 1 4 63
##
## Accuracy : 0.9474
## 95% CI : (0.8989, 0.977)
## No Information Rate : 0.5592
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8932
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9529
## Specificity : 0.9403
## Pos Pred Value : 0.9529
## Neg Pred Value : 0.9403
## Prevalence : 0.5592
## Detection Rate : 0.5329
## Detection Prevalence : 0.5592
## Balanced Accuracy : 0.9466
##
## 'Positive' Class : 0
##
glm.accuracy = glm.cm$overall[1]
glm.accuracy
## Accuracy
## 0.9473684
Comment: After applying the optimal cutoff, the accuracy of the logistic regression model increased from 92.11% to 94.74%.
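One caveat: glm() was called without a family argument, so it defaults to family = gaussian (the output above reports a Gaussian dispersion parameter), which is a linear probability model rather than a true logistic fit. For reference only, a logistic version of the same model would look like the sketch below; it was not used for the results above, and its coefficients and optimal cutoff would differ.
# hedged sketch: an explicit logistic (binomial) fit with the same predictors
audit_logit = glm(Risk ~ Score + District_Loss + Score_D + Sector_score +
                    Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B +
                    Money_Value + Risk_D,
                  family = binomial, data = train_audit)
# predict(..., type = "response") then returns fitted probabilities between 0 and 1
logit_prob = predict(audit_logit, newdata = test_audit, type = "response")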
tuned = tune.svm(Risk ~ Score + District_Loss + Score_D + Sector_score +
Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B +
Money_Value + Risk_D ,
data = train_audit, gamma = seq(0.1, 1, by = 0.5),
cost = seq(0.1, 1, by = 0.5), scale = TRUE)
svm.fit.glm = svm(Risk ~ Score + District_Loss + Score_D + Sector_score +
Audit_Risk + Score_A + Loss + TOTAL + Risk_F + Score_B +
Money_Value + Risk_D ,
data = train_audit, gamma = tuned$best.parameters$gamma,
cost = tuned$best.parameters$cost, scale = TRUE)
svm.preds.glm = predict(svm.fit.glm, test_audit, type = "response")
test_audit$PredSubSVM = ifelse(svm.preds.glm >= 0.5, "1", "0")
svm.cm = caret::confusionMatrix(as.factor(test_audit$Risk), as.factor(test_audit$PredSubSVM))
svm.cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 82 3
## 1 2 65
##
## Accuracy : 0.9671
## 95% CI : (0.9249, 0.9892)
## No Information Rate : 0.5526
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9334
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9762
## Specificity : 0.9559
## Pos Pred Value : 0.9647
## Neg Pred Value : 0.9701
## Prevalence : 0.5526
## Detection Rate : 0.5395
## Detection Prevalence : 0.5592
## Balanced Accuracy : 0.9660
##
## 'Positive' Class : 0
##
svm.accuracy = svm.cm$overall[1]
svm.accuracy
## Accuracy
## 0.9671053
Comment: The tuned SVM performed better than the logistic regression model (96.71% vs. 94.74%), which we attribute to tuning gamma and cost by cross-validation.
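The chosen hyperparameters and their cross-validated error can be inspected on the object returned by tune.svm (these fields are part of e1071's tune class):
# selected hyperparameters and their cross-validated performance
tuned$best.parameters
tuned$best.performance
summary(tuned)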
dt.audit = rpart(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score +
Score_A + Loss + TOTAL + Risk_F + Score_B +
Money_Value + Risk_D ,
data = train_audit)
rattle::fancyRpartPlot(dt.audit, sub = "")
dt.audit.pred = predict(dt.audit, test_audit, type = "class")
dt.cm <- confusionMatrix(as.factor(dt.audit.pred), as.factor(test_audit$Risk), positive = '1')
dt.cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 82 0
## 1 3 67
##
## Accuracy : 0.9803
## 95% CI : (0.9434, 0.9959)
## No Information Rate : 0.5592
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9602
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 1.0000
## Specificity : 0.9647
## Pos Pred Value : 0.9571
## Neg Pred Value : 1.0000
## Prevalence : 0.4408
## Detection Rate : 0.4408
## Detection Prevalence : 0.4605
## Balanced Accuracy : 0.9824
##
## 'Positive' Class : 1
##
dt.accuracy <- dt.cm$overall[1]
dt.accuracy
## Accuracy
## 0.9802632
Comment: The decision tree's accuracy (98.03%) is higher than that of both the logistic regression model and the SVM.
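rpart also records how much each predictor contributes to the splits, which helps explain the tree's performance; a quick look (not part of the original output):
# variable importance as computed by rpart for the fitted tree
dt.audit$variable.importance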
mtry.values = seq(2,6,1)
nodesize.values = seq(2,8,2)
ntree.values = seq(1e3,6e3,1e3)
hyper_grid = expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values) #df with all combinations
oob_err = c()
for (i in 1:nrow(hyper_grid)) {
set.seed(123)
model = randomForest(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score +
Score_A + Loss + TOTAL + Risk_F + Score_B +
Money_Value + Risk_D,
data = train_audit,
mtry = hyper_grid$mtry[i],
nodesize = hyper_grid$nodesize[i],
ntree = hyper_grid$ntree[i])
# Store the OOB error for this parameter combination (last row, "OOB" column of err.rate)
oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]
}
# Identify the optimal set of hyperparameters based on OOB error
opt_i = which.min(oob_err)
print(hyper_grid[opt_i,])
## mtry nodesize ntree
## 3 4 2 1000
set.seed(123)
rf_model01 <- randomForest(as.factor(Risk) ~ Score + District_Loss + Score_D + Sector_score +
Score_A + Loss + TOTAL + Risk_F + Score_B +
Money_Value + Risk_D,
data = train_audit,
mtry=4,
ntree=1000,
nodesize=2,
importance=TRUE)
test_audit$pred_audit <- predict(rf_model01, newdata = test_audit) # predict class
rf.cm <- caret::confusionMatrix(as.factor(test_audit$pred_audit), as.factor(test_audit$Risk))
rf.cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 83 0
## 1 2 67
##
## Accuracy : 0.9868
## 95% CI : (0.9533, 0.9984)
## No Information Rate : 0.5592
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9734
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9765
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9710
## Prevalence : 0.5592
## Detection Rate : 0.5461
## Detection Prevalence : 0.5461
## Balanced Accuracy : 0.9882
##
## 'Positive' Class : 0
##
rf.accuracy <- rf.cm$overall[1]
rf.accuracy
## Accuracy
## 0.9868421
Comment: Random Forest outperformed all the other models; tree ensembles tend to handle missing values, outliers, and unbalanced data fairly well.
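Since the forest was grown with importance = TRUE, we can also see which predictors it relies on most; a short follow-up sketch:
# permutation and Gini importance for each predictor in the tuned forest
randomForest::importance(rf_model01)
randomForest::varImpPlot(rf_model01)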
accuracy.df = data.frame(glm.accuracy, svm.accuracy, dt.accuracy, rf.accuracy)
colnames(accuracy.df) <- c("Logistic Regression", "SVM", "Decision Tree", "Random Forest")
accuracy.df
## Logistic Regression SVM Decision Tree Random Forest
## Accuracy 0.9473684 0.9671053 0.9802632 0.9868421