## Import Dataset
library(readxl)
data <- read_excel("data.xlsx")
head(data)
## # A tibble: 6 × 16
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AD 17.5 38675. 173. 0.68 1.22 1.79 -2.08 55 -26.5 2.86 8
## 2 AE 18.2 40105. 104. 1.77 0.870 2.66 -0.725 103. -13.6 353. 8.15
## 3 AE-AZ 18.7 76038. 31.0 2.63 1.49 1.85 -1.90 103. -56.2 200. 8.15
## 4 AE-RK NA 27883. 24.8 1.29 1.75 2.23 -1.14 103. 24.8 10.1 NA
## 5 AM 14 4251. 89.6 1.44 0.256 4.75 2.33 167. 47.3 12.6 6.6
## 6 AO NA 2034. 57.1 22.4 3.34 -0.878 -5.20 34.8 15.4 62.5 10.3
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, Risk_level <dbl>
## Data Preprocessing: detecting missing values
missing <- sum(is.na(data))
total_cells <- nrow(data)*ncol(data)
percentage_missing <- sprintf("%.0f%%", missing/total_cells*100)
print(paste("Number of missing values:", missing))
## [1] "Number of missing values: 47"
print(paste("Percentage of missing values:", percentage_missing))
## [1] "Percentage of missing values: 3%"
## Data Preprocessing: identification of missing values in predictor (X) and target (Y: Risk Level) variables
variable <- c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14", "Risk_level")
for(i in variable) {
  print(paste("Number of missing values in", i, "is:", sum(is.na(data[[i]]))))
}
## [1] "Number of missing values in X1 is: 12"
## [1] "Number of missing values in X2 is: 0"
## [1] "Number of missing values in X3 is: 0"
## [1] "Number of missing values in X4 is: 0"
## [1] "Number of missing values in X5 is: 0"
## [1] "Number of missing values in X6 is: 0"
## [1] "Number of missing values in X7 is: 0"
## [1] "Number of missing values in X8 is: 7"
## [1] "Number of missing values in X9 is: 0"
## [1] "Number of missing values in X10 is: 0"
## [1] "Number of missing values in X11 is: 17"
## [1] "Number of missing values in X12 is: 0"
## [1] "Number of missing values in X13 is: 0"
## [1] "Number of missing values in X14 is: 11"
## [1] "Number of missing values in Risk_level is: 0"
## Data cleaning: missing value imputation with Multiple Imputation by Chained Equations (MICE) method
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
imputed_data <- mice(data, m = 5, method = 'pmm', seed = 123 )
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## Warning: Number of logged events: 1
complete_data <- complete(imputed_data, 1)
head(complete_data)
## Country X1 X2 X3 X4 X5 X6 X7
## 1 AD 17.5000 38674.616 172.75400 0.68000 1.2206 1.78560 -2.0843
## 2 AE 18.2000 40105.120 103.52280 1.76600 0.8698 2.65884 -0.7254
## 3 AE-AZ 18.7000 76037.997 31.03626 2.63056 1.4893 1.85034 -1.9008
## 4 AE-RK 16.9822 27882.829 24.78532 1.29416 1.7530 2.23192 -1.1355
## 5 AM 14.0000 4251.398 89.61882 1.44000 0.2562 4.74800 2.3318
## 6 AO 16.6966 2033.900 57.05566 22.35646 3.3422 -0.87800 -5.2032
## X8 X9 X10 X11 X12 X13 X14 Risk_level
## 1 55.00000 -26.52000 2.857862 8.000 23.08410 26.94344 3.00 0
## 2 102.52738 -13.59890 352.910575 8.155 24.85976 32.47740 2.45 0
## 3 102.52738 -56.24160 199.928422 8.155 20.39940 31.03926 0.12 0
## 4 102.52738 24.78532 10.108892 1.660 21.69104 17.30888 7.80 0
## 5 166.80851 47.27262 12.645460 6.600 19.40300 15.11172 18.50 1
## 6 34.81845 15.44938 62.485865 10.300 31.12380 20.57210 10.50 1
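# Optional imputation diagnostics (a sketch, not part of the original pipeline):
# list the logged events behind the warning above and compare observed vs.
# imputed distributions for one of the imputed variables, e.g. X1.
imputed_data$loggedEvents
densityplot(imputed_data, ~ X1)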
## Data cleaning: rechecking missing values in predictor (X) and target (Y: Risk Level) variables
variable_check <- c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14", "Risk_level")
for(i in variable_check) {
  print(paste("Number of missing values in", i, "is:", sum(is.na(complete_data[[i]]))))
}
## [1] "Number of missing values in X1 is: 0"
## [1] "Number of missing values in X2 is: 0"
## [1] "Number of missing values in X3 is: 0"
## [1] "Number of missing values in X4 is: 0"
## [1] "Number of missing values in X5 is: 0"
## [1] "Number of missing values in X6 is: 0"
## [1] "Number of missing values in X7 is: 0"
## [1] "Number of missing values in X8 is: 0"
## [1] "Number of missing values in X9 is: 0"
## [1] "Number of missing values in X10 is: 0"
## [1] "Number of missing values in X11 is: 0"
## [1] "Number of missing values in X12 is: 0"
## [1] "Number of missing values in X13 is: 0"
## [1] "Number of missing values in X14 is: 0"
## [1] "Number of missing values in Risk_level is: 0"
## Display a table in RStudio
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data_tbl <- tibble::as_tibble(complete_data)
View(data_tbl)
## Exploratory Data Analysis (EDA): Descriptive statistics for each predictor variable (X)
predictors <- complete_data[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")]
summary(predictors)
## X1 X2 X3 X4
## Min. : 4.20 Min. : 434.5 Min. : 13.63 Min. :-0.151
## 1st Qu.:15.93 1st Qu.: 4265.9 1st Qu.: 42.96 1st Qu.: 0.869
## Median :18.58 Median : 11659.1 Median : 70.42 Median : 1.700
## Mean :18.75 Mean : 22641.6 Mean : 191.94 Mean : 3.263
## 3rd Qu.:21.64 3rd Qu.: 34815.2 3rd Qu.: 130.63 3rd Qu.: 3.939
## Max. :47.50 Max. :124340.4 Max. :6908.35 Max. :36.703
## X5 X6 X7 X8
## Min. :-0.8862 Min. :-5.135 Min. :-9.84530 Min. : 34.82
## 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720 1st Qu.: 77.25
## Median : 1.1402 Median : 2.984 Median : 0.07155 Median : 90.31
## Mean : 1.2019 Mean : 3.076 Mean : 0.10804 Mean :100.39
## 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108 3rd Qu.:116.44
## Max. : 4.4021 Max. :10.076 Max. : 6.07120 Max. :359.14
## X9 X10 X11 X12
## Min. :-1955.72 Min. : 1.171 Min. : 0.3357 Min. :12.67
## 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 1.9625 1st Qu.:20.79
## Median : 12.67 Median : 106.872 Median : 3.9790 Median :23.40
## Mean : -13.58 Mean : 582.318 Mean : 5.4023 Mean :24.96
## 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 7.9250 3rd Qu.:28.38
## Max. : 456.49 Max. :14866.703 Max. :26.9780 Max. :46.83
## X13 X14
## Min. :10.95 Min. : 0.120
## 1st Qu.:19.06 1st Qu.: 4.813
## Median :24.28 Median : 7.000
## Mean :24.48 Mean : 8.494
## 3rd Qu.:29.36 3rd Qu.:10.596
## Max. :55.09 Max. :24.650
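# Complementary spread and association checks (a base-R sketch): per-variable
# standard deviations and the correlation matrix of the predictors.
sapply(predictors, sd)
round(cor(predictors), 2)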
## Exploratory Data Analysis (EDA): Looking at proportions of Low Risk (0) and High Risk (1)
risk_counts <- table(complete_data$Risk_level)
risk_labels <- c("Low Risk", "High Risk")
percentage <- round(risk_counts/sum(risk_counts)*100)
risk_label_percentage <- paste(risk_labels, percentage, "%", sep = " ")
color_labels <- c("blue","red")
pie(risk_counts, labels = risk_label_percentage, col = color_labels, main = "Risk Level")

## Exploratory Data Analysis (EDA): View the distribution of each predictor variable (X)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
list_hist_plot <- list()
predictor_labels <- names(complete_data[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")])
for(i in predictor_labels) {
  p <- ggplot(data = complete_data, aes(x = .data[[i]])) +
    geom_histogram(binwidth = 5, fill = "blue", color = "black") +
    labs(title = paste("Histogram of", i), x = i, y = "Frequency")
  list_hist_plot[[i]] <- p
}
grid.arrange(grobs = list_hist_plot, ncol = 4)
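# An alternative sketch of the same histograms using faceting instead of a
# list of plots (assumes the tidyr package is available; not used elsewhere):
library(tidyr)
long_predictors <- pivot_longer(complete_data[, predictor_labels],
                                cols = everything(),
                                names_to = "predictor", values_to = "value")
ggplot(long_predictors, aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  facet_wrap(~ predictor, ncol = 4, scales = "free")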

## Exploratory Data Analysis + Data Cleaning Part 2: Detection of outliers in predictor variables (X) with box plots
library(ggplot2)
library(gridExtra)
list_box_plot <- list()
p_detect <- names(complete_data[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")])
for(i in p_detect) {
  p <- ggplot(data = complete_data, mapping = aes(x = .data[[i]])) +
    geom_boxplot(outlier.colour = "red") +
    labs(title = paste("Box plot of", i))
  list_box_plot[[i]] <- p
}
grid.arrange(grobs = list_box_plot, ncol = 4)

## Exploratory Data Analysis + Data Cleaning Part 2: Outlier Percentage
calculate_outliers <- function(column_predictors) {
  q1 <- quantile(column_predictors, 0.25)
  q3 <- quantile(column_predictors, 0.75)
  iqr <- q3 - q1
  lower_bound <- q1 - 1.5*iqr
  upper_bound <- q3 + 1.5*iqr
  num_outlier <- sum(column_predictors < lower_bound | column_predictors > upper_bound)
  total_data <- length(column_predictors)
  outlier_percentage <- (num_outlier/total_data)*100
  return(c(num_outlier, outlier_percentage))
}
predictor_columns <- complete_data[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")]
outlier_summaries <- sapply(predictor_columns, calculate_outliers)
total_outlier <- sum(outlier_summaries[1,])
total_data_points <- sum(sapply(predictor_columns, length))
total_percentage_outlier <- (total_outlier/total_data_points)*100
total_cells_dataset <- nrow(complete_data) * ncol(complete_data)
print(paste("Total outliers: ", total_outlier))
## [1] "Total outliers: 77"
print(paste("Percentage of outliers: ", total_percentage_outlier, "%"))
## [1] "Percentage of outliers: 5.5 %"
print(paste("Total cells in the dataset: ", total_cells_dataset))
## [1] "Total cells in the dataset: 1600"
## Data scaling using centering and scaling (standardization)
library(caret)
## Loading required package: lattice
predictors <- complete_data[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")]
process_parameters <- preProcess(predictors, method=c("center","scale"))
predictors_scaled <- predict(process_parameters, predictors)
complete_data[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")] <- predictors_scaled
head(complete_data)
## Country X1 X2 X3 X4 X5 X6
## 1 AD -0.230727264 0.6452923 -0.02750584 -0.5309986 0.01764994 -0.5700293
## 2 AE -0.101433318 0.7028668 -0.12678694 -0.3077268 -0.31258627 -0.1843743
## 3 AE-AZ -0.009080499 2.1490800 -0.23073637 -0.1299812 0.27059882 -0.5414378
## 4 AE-RK -0.326367844 0.2109484 -0.23970053 -0.4047328 0.51884081 -0.3729179
## 5 AM -0.877196998 -0.7401607 -0.14672596 -0.3747495 -0.89021722 0.7382759
## 6 AO -0.379119774 -0.8294098 -0.19342321 3.9254845 2.01488237 -1.7463734
## X7 X8 X9 X10 X11 X12
## 1 -0.8494699 -1.09036143 -0.0595941002 -0.3492613 0.5331255 -0.27916463
## 2 -0.3229352 0.05139249 -0.0000680741 -0.1382721 0.5649355 -0.01530186
## 3 -0.7783689 0.05139249 -0.1965181034 -0.2304799 0.5649355 -0.67811060
## 4 -0.4818372 0.05139249 0.1767636262 -0.3448909 -0.7680085 -0.48617311
## 5 0.8616419 1.59562277 0.2803600388 -0.3433620 0.2458088 -0.82617547
## 6 -2.0579539 -1.57518432 0.1337540211 -0.3133214 1.0051457 0.91553342
## X13 X14 Risk_level
## 1 0.3057805 -1.0304266 0
## 2 0.9914798 -1.1335801 0
## 3 0.8132834 -1.5705755 0
## 4 -0.8880141 -0.1301785 0
## 5 -1.1602588 1.8766246 1
## 6 -0.4836766 0.3762111 1
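# The preProcess step above performs z-score standardization (centre by the
# mean, divide by the standard deviation). If robust scaling (median/IQR) were
# intended instead, a minimal base-R sketch would be:
robust_scale <- function(x) (x - median(x)) / IQR(x)
# Example (not applied here): predictors_robust <- as.data.frame(lapply(predictors, robust_scale))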
## Display a table of the scaled data in RStudio
table_scaled <- tibble::as_tibble(complete_data)
View(table_scaled)
## Modeling: using XGBoost (retaining outlier data)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(caret)
# Separate the predictor variables (features) from the target variable (Risk_level)
complete_data_numeric <- complete_data
for(i in colnames(complete_data_numeric)) {
if(is.character(complete_data_numeric[[i]])) {
complete_data_numeric[[i]] <- as.numeric(as.factor(complete_data_numeric[[i]]))
}
}
predictors <- as.matrix(complete_data_numeric[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")])
target <- complete_data_numeric$Risk_level
train_data_matrix <- as.matrix(complete_data_numeric[, -which(names(complete_data_numeric) == "Risk_level")])
train_label <- as.numeric(target)
# Build the XGBoost model
dtrain <- xgb.DMatrix(data = train_data_matrix, label = train_label)
# XGBoost parameters (for binary classification) using binary logistic
parameters_xgb <- list(
objective = "binary:logistic",
eval_metric = "error",
max_depth = 6,
eta = 0.3
)
unique(train_label)
## [1] 0 1
# Train the model for 100 boosting rounds
model <- xgb.train(params = parameters_xgb, data = dtrain, nrounds = 100, watchlist = list(train = dtrain), verbose = 1)
## [1] train-error:0.060000
## [2] train-error:0.030000
## [3] train-error:0.040000
## [4] train-error:0.010000
## [5] train-error:0.010000
## [6] train-error:0.010000
## [7] train-error:0.000000
## [8] train-error:0.000000
## [9] train-error:0.000000
## [10] train-error:0.000000
## [11] train-error:0.000000
## [12] train-error:0.000000
## [13] train-error:0.000000
## [14] train-error:0.000000
## [15] train-error:0.000000
## [16] train-error:0.000000
## [17] train-error:0.000000
## [18] train-error:0.000000
## [19] train-error:0.000000
## [20] train-error:0.000000
## [21] train-error:0.000000
## [22] train-error:0.000000
## [23] train-error:0.000000
## [24] train-error:0.000000
## [25] train-error:0.000000
## [26] train-error:0.000000
## [27] train-error:0.000000
## [28] train-error:0.000000
## [29] train-error:0.000000
## [30] train-error:0.000000
## [31] train-error:0.000000
## [32] train-error:0.000000
## [33] train-error:0.000000
## [34] train-error:0.000000
## [35] train-error:0.000000
## [36] train-error:0.000000
## [37] train-error:0.000000
## [38] train-error:0.000000
## [39] train-error:0.000000
## [40] train-error:0.000000
## [41] train-error:0.000000
## [42] train-error:0.000000
## [43] train-error:0.000000
## [44] train-error:0.000000
## [45] train-error:0.000000
## [46] train-error:0.000000
## [47] train-error:0.000000
## [48] train-error:0.000000
## [49] train-error:0.000000
## [50] train-error:0.000000
## [51] train-error:0.000000
## [52] train-error:0.000000
## [53] train-error:0.000000
## [54] train-error:0.000000
## [55] train-error:0.000000
## [56] train-error:0.000000
## [57] train-error:0.000000
## [58] train-error:0.000000
## [59] train-error:0.000000
## [60] train-error:0.000000
## [61] train-error:0.000000
## [62] train-error:0.000000
## [63] train-error:0.000000
## [64] train-error:0.000000
## [65] train-error:0.000000
## [66] train-error:0.000000
## [67] train-error:0.000000
## [68] train-error:0.000000
## [69] train-error:0.000000
## [70] train-error:0.000000
## [71] train-error:0.000000
## [72] train-error:0.000000
## [73] train-error:0.000000
## [74] train-error:0.000000
## [75] train-error:0.000000
## [76] train-error:0.000000
## [77] train-error:0.000000
## [78] train-error:0.000000
## [79] train-error:0.000000
## [80] train-error:0.000000
## [81] train-error:0.000000
## [82] train-error:0.000000
## [83] train-error:0.000000
## [84] train-error:0.000000
## [85] train-error:0.000000
## [86] train-error:0.000000
## [87] train-error:0.000000
## [88] train-error:0.000000
## [89] train-error:0.000000
## [90] train-error:0.000000
## [91] train-error:0.000000
## [92] train-error:0.000000
## [93] train-error:0.000000
## [94] train-error:0.000000
## [95] train-error:0.000000
## [96] train-error:0.000000
## [97] train-error:0.000000
## [98] train-error:0.000000
## [99] train-error:0.000000
## [100] train-error:0.000000
## Prediction on training data and training accuracy
train_predictions <- predict(model, dtrain)
train_pred_labels <- ifelse(train_predictions > 0.5, 1, 0)
accuracy <- mean(train_pred_labels == train_label)
print(paste("Prediction model accuracy: ", round(accuracy*100, 2), "%"))
## [1] "Prediction model accuracy: 100 %"
## Testing the model on testing data
# Prepare the Excel testing data
library(readxl)
data_testing <- read_excel("data_testing.xlsx")
testing_predictors <- data_testing[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")]
total_missing <- sum(is.na(testing_predictors))
total_column <- nrow(testing_predictors) * ncol(testing_predictors)
percentage <- (total_missing/total_column)
print(paste("Percentage of missing values in testing data: ", round(percentage*100, 2), "%"))
## [1] "Percentage of missing values in testing data: 0.42 %"
## Data cleaning: missing value imputation with Multiple Imputation by Chained Equations (MICE) method
library(mice)
imputed_data <- mice(testing_predictors, m = 5, method = 'pmm', seed = 123 )
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## Warning: Number of logged events: 31
complete_data_testing <- complete(imputed_data, 1)
head(complete_data_testing)
## X1 X2 X3 X4 X5 X6 X7 X8
## 1 23.2000 60338.020 175.42230 1.62000 0.6755 2.47168 0.3526 185.64097
## 2 16.8056 62432.995 409.69700 0.10510 0.9068 2.77600 0.2912 94.00211
## 3 18.2857 28684.168 103.06040 0.84352 0.0746 3.55290 1.9299 72.30708
## 4 19.6715 21042.722 102.73060 1.17400 0.0734 3.21976 1.2325 111.78982
## 5 11.9000 49356.262 60.15464 0.89594 0.5865 1.75420 -1.1342 88.60514
## 6 12.9000 3989.191 65.55750 0.39400 0.5042 2.44734 -0.1248 88.88685
## X9 X10 X11 X12 X13 X14
## 1 64.14972 537.609866 0.5000 25.11320 27.95256 8.6
## 2 -200.98100 339.988210 1.3095 26.76784 47.25374 3.0
## 3 16.23838 52.761781 3.0176 19.90742 25.76882 5.0
## 4 33.35258 102.567122 2.5300 22.83084 20.95780 7.0
## 5 -145.43800 1.490827 63.5000 17.79208 23.21144 7.3
## 6 27.33332 24.638720 1.5706 16.78238 14.52982 9.0
## Rechecking missing values
total_missing <- sum(is.na(complete_data_testing))
total_column <- nrow(complete_data_testing) * ncol(complete_data_testing)
percentage <- (total_missing/total_column)
print(paste("Persentase missing value pada data testing: ", round(percentage*100, 2), "%"))
## [1] "Persentase missing value pada data testing: 0 %"
## Data scaling using centering and scaling (standardization)
library(caret)
testing_predictors <- complete_data_testing[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")]
process_testing_parameters <- preProcess(testing_predictors, method=c("center","scale"))
testing_predictors_scaled <- predict(process_testing_parameters, testing_predictors)
complete_data_testing[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13", "X14")] <- testing_predictors_scaled
print(complete_data_testing)
## X1 X2 X3 X4 X5 X6
## 1 1.72741539 1.59598965 0.92316558 -0.53272665 -0.16322642 -0.2833164
## 2 -0.09938924 1.68396038 3.53411407 -0.81344271 0.08950939 -0.1182225
## 3 0.32345786 0.26680298 0.11670550 -0.67661100 -0.81981504 0.3032466
## 4 0.71936457 -0.05407134 0.11302994 -0.61537195 -0.82112625 0.1225178
## 5 -1.50086128 1.13485124 -0.36147131 -0.66689740 -0.26047452 -0.6725500
## 6 -1.21517308 -0.77017142 -0.30125743 -0.75990857 -0.35040171 -0.2965208
## 7 0.75607550 -0.62482434 -0.66162439 -0.76898843 -0.55680808 0.2423129
## 8 -1.21517308 -0.78580589 -0.08160224 0.19662735 0.31951756 -0.7517551
## 9 0.24183674 -0.57433210 -0.45615611 1.32668875 0.72063869 0.6309823
## 10 -0.86091971 0.39992018 -0.49124690 -0.69883258 -0.79042208 -0.2469579
## 11 1.38458955 -0.77160420 0.12614182 2.71990314 -1.32812762 -1.4397549
## 12 1.26142936 -0.90464076 -0.56081641 -0.03709577 3.09250910 1.4890670
## 13 -0.24383320 1.97335350 0.12908405 -0.54511233 -0.21786019 -0.2920723
## 14 -0.03242393 -0.26715588 -0.21819957 0.65015361 -0.50883963 -1.1788662
## 15 0.35611202 -0.85904692 -0.69698244 1.44601669 0.85405435 1.5440006
## 16 -1.44438073 -0.77448288 -0.64710983 -0.31480991 0.02810104 2.1438440
## 17 -0.15812674 -0.66874220 -0.46577431 0.09040776 0.71277142 -1.1959550
## X7 X8 X9 X10 X11 X12
## 1 -0.2845212 2.225910918 1.00111233 -0.1842929 -0.56874403 0.71991807
## 2 -0.3174812 -0.163801781 -2.20043008 -0.2236332 -0.52895865 1.09146731
## 3 0.5621869 -0.729554106 0.42256682 -0.2808109 -0.44500879 -0.44903930
## 4 0.1878167 0.300057149 0.62922653 -0.2708963 -0.46897340 0.20741436
## 5 -1.0826481 -0.304541497 -1.52972972 -0.2910174 2.52758573 -0.92403868
## 6 -0.5407935 -0.297195080 0.55654194 -0.2864093 -0.51612608 -1.15076671
## 7 0.2126172 -0.002359588 -0.28748440 -0.1914526 -0.43604418 0.25884976
## 8 -1.2815356 0.891757366 1.00489384 -0.2835070 0.07509597 -0.69624179
## 9 0.4962132 0.423623993 0.57147503 -0.1479361 -0.42937478 1.49352390
## 10 1.0124083 -0.761472471 -2.05744603 -0.1583120 -0.43604418 0.02744211
## 11 0.5410904 -0.730876094 0.16048132 -0.2603427 1.81493841 -0.92363449
## 12 -0.2479645 -0.803906658 0.49069828 -0.2846377 -0.42937478 0.66305305
## 13 -0.2125351 2.225910918 0.80250180 3.8761854 -0.54416998 -1.01062063
## 14 -0.8586379 -1.335892651 0.03048212 -0.2806383 -0.42937478 -1.22615260
## 15 1.1760812 -0.761472471 -0.31884043 -0.2798265 -0.49010708 2.17689211
## 16 2.3585093 -0.357821123 0.31579477 -0.2213051 -0.51025780 0.36837173
## 17 -1.7208061 0.181633174 0.40815587 -0.2311673 1.81493841 -0.62643821
## X13 X14
## 1 0.46874142 -0.0528763294
## 2 2.59277476 -0.8025600870
## 3 0.22842779 -0.5348158878
## 4 -0.30100962 -0.2670716887
## 5 -0.05300372 -0.2269100588
## 6 -1.00838834 0.0006725104
## 7 0.96691674 -0.9364321865
## 8 -1.62993494 1.0716493070
## 9 0.28993956 0.5629353286
## 10 1.11236341 -0.7088496173
## 11 -0.84113479 0.0676085602
## 12 -0.46382828 0.0676085602
## 13 -0.71366096 -0.4525380954
## 14 -0.67295024 0.1747062399
## 15 0.61073482 -0.4009437883
## 16 0.23275263 -0.8694961368
## 17 -0.81874026 3.3073133697
## Testing the XGBoost model on the testing data
library(xgboost)
complete_data_testing <- data_testing
# Preprocessing for testing data
for(i in colnames(complete_data_testing)) {
if(is.character(complete_data_testing[[i]])) {
complete_data_testing[[i]] <- as.numeric(as.factor(complete_data_testing[[i]]))
}
}
# Convert testing data into a matrix
test_data_matrix <- as.matrix(complete_data_testing)
# Create DMatrix for testing data
dtest <- xgb.DMatrix(data = test_data_matrix)
# Make predictions with the trained model
predictions <- predict(model, dtest)
# Convert predicted probabilities to class labels (binary classification)
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Save the prediction results into a dataframe for further analysis
results <- data.frame(Predicted_Risk_Level = predicted_classes)
# Prediction result
print(results)
## Predicted_Risk_Level
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 1
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## 17 0
## Prediction on test data and test model accuracy
test_predictions <- predict(model, dtest)
test_pred_labels <- ifelse(test_predictions > 0.5, 1, 0)
test_labels <- as.numeric(predicted_classes)
accuracy <- mean(test_pred_labels == test_labels)
print(paste("Accuracy on test data: ", round(accuracy*100, 2), "%"))
## [1] "Accuracy on test data: 100 %"
## Combining prediction results with test data
final_data <- testing_predictors_scaled
final_data$Predicted_scale_level <- predicted_classes
print(final_data)
## X1 X2 X3 X4 X5 X6
## 1 1.72741539 1.59598965 0.92316558 -0.53272665 -0.16322642 -0.2833164
## 2 -0.09938924 1.68396038 3.53411407 -0.81344271 0.08950939 -0.1182225
## 3 0.32345786 0.26680298 0.11670550 -0.67661100 -0.81981504 0.3032466
## 4 0.71936457 -0.05407134 0.11302994 -0.61537195 -0.82112625 0.1225178
## 5 -1.50086128 1.13485124 -0.36147131 -0.66689740 -0.26047452 -0.6725500
## 6 -1.21517308 -0.77017142 -0.30125743 -0.75990857 -0.35040171 -0.2965208
## 7 0.75607550 -0.62482434 -0.66162439 -0.76898843 -0.55680808 0.2423129
## 8 -1.21517308 -0.78580589 -0.08160224 0.19662735 0.31951756 -0.7517551
## 9 0.24183674 -0.57433210 -0.45615611 1.32668875 0.72063869 0.6309823
## 10 -0.86091971 0.39992018 -0.49124690 -0.69883258 -0.79042208 -0.2469579
## 11 1.38458955 -0.77160420 0.12614182 2.71990314 -1.32812762 -1.4397549
## 12 1.26142936 -0.90464076 -0.56081641 -0.03709577 3.09250910 1.4890670
## 13 -0.24383320 1.97335350 0.12908405 -0.54511233 -0.21786019 -0.2920723
## 14 -0.03242393 -0.26715588 -0.21819957 0.65015361 -0.50883963 -1.1788662
## 15 0.35611202 -0.85904692 -0.69698244 1.44601669 0.85405435 1.5440006
## 16 -1.44438073 -0.77448288 -0.64710983 -0.31480991 0.02810104 2.1438440
## 17 -0.15812674 -0.66874220 -0.46577431 0.09040776 0.71277142 -1.1959550
## X7 X8 X9 X10 X11 X12
## 1 -0.2845212 2.225910918 1.00111233 -0.1842929 -0.56874403 0.71991807
## 2 -0.3174812 -0.163801781 -2.20043008 -0.2236332 -0.52895865 1.09146731
## 3 0.5621869 -0.729554106 0.42256682 -0.2808109 -0.44500879 -0.44903930
## 4 0.1878167 0.300057149 0.62922653 -0.2708963 -0.46897340 0.20741436
## 5 -1.0826481 -0.304541497 -1.52972972 -0.2910174 2.52758573 -0.92403868
## 6 -0.5407935 -0.297195080 0.55654194 -0.2864093 -0.51612608 -1.15076671
## 7 0.2126172 -0.002359588 -0.28748440 -0.1914526 -0.43604418 0.25884976
## 8 -1.2815356 0.891757366 1.00489384 -0.2835070 0.07509597 -0.69624179
## 9 0.4962132 0.423623993 0.57147503 -0.1479361 -0.42937478 1.49352390
## 10 1.0124083 -0.761472471 -2.05744603 -0.1583120 -0.43604418 0.02744211
## 11 0.5410904 -0.730876094 0.16048132 -0.2603427 1.81493841 -0.92363449
## 12 -0.2479645 -0.803906658 0.49069828 -0.2846377 -0.42937478 0.66305305
## 13 -0.2125351 2.225910918 0.80250180 3.8761854 -0.54416998 -1.01062063
## 14 -0.8586379 -1.335892651 0.03048212 -0.2806383 -0.42937478 -1.22615260
## 15 1.1760812 -0.761472471 -0.31884043 -0.2798265 -0.49010708 2.17689211
## 16 2.3585093 -0.357821123 0.31579477 -0.2213051 -0.51025780 0.36837173
## 17 -1.7208061 0.181633174 0.40815587 -0.2311673 1.81493841 -0.62643821
## X13 X14 Predicted_scale_level
## 1 0.46874142 -0.0528763294 0
## 2 2.59277476 -0.8025600870 0
## 3 0.22842779 -0.5348158878 0
## 4 -0.30100962 -0.2670716887 0
## 5 -0.05300372 -0.2269100588 0
## 6 -1.00838834 0.0006725104 0
## 7 0.96691674 -0.9364321865 0
## 8 -1.62993494 1.0716493070 1
## 9 0.28993956 0.5629353286 0
## 10 1.11236341 -0.7088496173 0
## 11 -0.84113479 0.0676085602 0
## 12 -0.46382828 0.0676085602 0
## 13 -0.71366096 -0.4525380954 0
## 14 -0.67295024 0.1747062399 0
## 15 0.61073482 -0.4009437883 0
## 16 0.23275263 -0.8694961368 0
## 17 -0.81874026 3.3073133697 0
# Confusion Matrix, Sensitivity, Specificity
true_labels <- final_data$Predicted_scale_level
conf_matrix <- confusionMatrix(as.factor(predicted_classes), as.factor(true_labels))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16 0
## 1 0 1
##
## Accuracy : 1
## 95% CI : (0.8049, 1)
## No Information Rate : 0.9412
## P-Value [Acc > NIR] : 0.3568
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.9412
## Detection Rate : 0.9412
## Detection Prevalence : 0.9412
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
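# Individual metrics can also be pulled directly from the caret confusionMatrix
# object (these mirror the printed summary above):
conf_matrix$overall["Accuracy"]
conf_matrix$byClass[c("Sensitivity", "Specificity")]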