Email : albert.prayogo99@gmail.com
RPubs : https://rpubs.com/albert23899
Jurusan : Statistika
Address : ARA Center, Matana University Tower
Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.
library(reticulate)
library(Rcpp)
# Point reticulate at the "py39" conda environment (flagged as required)
# ahead of the Python code that follows.
use_condaenv("py39", required = TRUE)
# Importing modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Read the CSV file into a DataFrame.
raw_dataset = pd.read_csv('HR_comma_sep.csv')

# Preview the first 3 rows; the recorded output is kept below.
raw_dataset.head(n=3)
## satisfaction_level last_evaluation ... sales salary
## 0 0.38 0.53 ... sales low
## 1 0.80 0.86 ... sales medium
## 2 0.11 0.88 ... sales medium
##
## [3 rows x 10 columns]
# Names of the columns used as the event and time variables.
event_column = 'left'
time_column = 'time_spend_company'

# One-hot encode the categorical columns. drop_first=True leaves out the
# first level of each category, so k levels become k-1 indicator columns.
category_columns = ['sales', 'salary']
dataset = pd.get_dummies(
    data=raw_dataset,
    columns=category_columns,
    drop_first=True,
)

# Preview the first 5 rows (head()'s default); the recorded output is kept below.
dataset.head(n=5)
## satisfaction_level last_evaluation ... salary_low salary_medium
## 0 0.38 0.53 ... 1 0
## 1 0.80 0.86 ... 0 1
## 2 0.11 0.88 ... 0 1
## 3 0.72 0.87 ... 1 0
## 4 0.37 0.52 ... 1 0
##
## [5 rows x 19 columns]
# Feature columns: every column except the time and event columns.
# np.setdiff1d returns the remaining names sorted and de-duplicated.
features = np.setdiff1d(dataset.columns, [time_column, event_column]).tolist()

# Checking for null values in the feature columns.
# The per-column counts are reduced with pandas' vectorized .sum() instead of
# the builtin sum(), which would iterate over the Series element by element.
# int() keeps the printed value a plain integer.
N_null = int(dataset[features].isnull().sum().sum())
print("The dataset contains {} null values".format(N_null))
## The dataset contains 0 null values

# Removing duplicates if any exist.
# Count the repeated rows before dropping them (every occurrence after the
# first is flagged), then keep the first occurrence of each row and rebuild
# a contiguous index.
N_dupli = int(dataset.duplicated(keep='first').sum())
dataset = dataset.drop_duplicates(keep='first').reset_index(drop=True)
print("The dataset contains {} duplicates".format(N_dupli))
## The dataset contains 3008 duplicates

# Number of samples in the dataset (rows remaining after de-duplication).
N = dataset.shape[0]