Email             :
RPubs            : https://rpubs.com/albert23899
Jurusan          : Statistika
Address         : ARA Center, Matana University Tower
                         Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.


1 Mengkoneksikan Environment Python ke RStudio

# Load reticulate (R <-> Python bridge) and Rcpp
library(reticulate)
library(Rcpp)

# Bind this session to the "py39" conda environment; required = TRUE makes
# the call fail instead of silently falling back to another Python.
reticulate::use_condaenv(condaenv = "py39", required = TRUE)

2 Memuat Dataset Employee Attrition

# Importing modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Read the HR records from the CSV file in the working directory
raw_dataset = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')
# Preview the first 3 rows
raw_dataset.head(n=3)
##    satisfaction_level  last_evaluation  ...  sales  salary
## 0                0.38             0.53  ...  sales     low
## 1                0.80             0.86  ...  sales  medium
## 2                0.11             0.88  ...  sales  medium
## 
## [3 rows x 10 columns]

3 Mengubah Variabel Kategorikal menjadi Variabel Numerik

# Names of the columns holding the time and the event indicator
time_column = 'time_spend_company'
event_column = 'left'

# One-hot encode the categorical columns. drop_first=True drops the first
# level of each variable, so k levels yield k-1 dummy columns.
# dtype is pinned to uint8 so the dummy columns are numeric 0/1 on every
# pandas version (pandas >= 2.0 would otherwise produce boolean columns).
category_columns = ['sales', 'salary']
dataset = pd.get_dummies(
    raw_dataset,
    columns=category_columns,
    drop_first=True,
    dtype=np.uint8,
)
# Preview the first rows of the encoded table
dataset.head()

# Creating the features
##    satisfaction_level  last_evaluation  ...  salary_low  salary_medium
## 0                0.38             0.53  ...           1              0
## 1                0.80             0.86  ...           0              1
## 2                0.11             0.88  ...           0              1
## 3                0.72             0.87  ...           1              0
## 4                0.37             0.52  ...           1              0
## 
## [5 rows x 19 columns]
# Feature names: every column except the time and event columns, sorted
features = sorted(set(dataset.columns) - {time_column, event_column})

4 Memeriksa Null Values dan Duplicate Value

# Count missing entries over all feature columns
# (per-column counts first, then their total)
N_null = int(dataset[features].isnull().sum().sum())
null_message = "The dataset contains {} null values".format(N_null)
print(null_message)  # 0 null values

# Removing duplicates if they exist
## The dataset contains 0 null values
# Flag every row that repeats an earlier row (the first occurrence is kept)
duplicate_mask = dataset.duplicated(keep='first')
N_dupli = int(duplicate_mask.sum())

# Keep only the unflagged rows and renumber the index from 0
dataset = dataset.loc[~duplicate_mask].reset_index(drop=True)
duplicate_message = "The dataset contains {} duplicates".format(N_dupli)
print(duplicate_message)

# Number of samples in the dataset
## The dataset contains 3008 duplicates
# Row count of the current dataset
N = len(dataset.index)