library(DriveML)    ## automl package
library(data.table) ## data wrangling package
library(caret)      ## ML wrapper package
library(mlr)        ## ML wrapper package
library(h2o)        ## automl package
library(OneR)       ## automl package
library(Metrics)    ## auc function
library(autoxgboost)## automl package

HR Analytics: Job Change of Data Scientists

data source :


A company which is active in Medium data and Data Science wants to hire data scientists among people who successfully pass some courses conducted by the company. Many people sign up for its training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment, because this helps to reduce cost and time and to improve the quality of training, the planning of courses, and the categorization of candidates. Information related to demographics, education, and experience is available from candidates' signup and enrollment.

This dataset is also designed to help HR research understand the factors that lead a person to leave their current job. Using model(s) built on the current credentials, demographics, and experience data, you will predict the probability that a candidate will look for a new job or will work for the company, as well as interpret the factors that affect the employee's decision.

The whole data is divided into train and test sets. The target isn't included in the test set, but the test target values data file is available for related tasks. A sample submission corresponding to the enrollee_id of the test set is provided too, with columns: enrollee_id, target


The dataset is imbalanced.

Most features are categorical (Nominal, Ordinal, Binary), some with high cardinality.

Missing imputation can be a part of your pipeline as well.

Problem Statement

Predict who will move to a new job

Please refer to the following task for more details:

DriveML Experiment

## Folder holding the HR analytics CSV files (machine-specific path)
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
## Read the training file. fread() returns a data.table, so convert it in
## place to a plain data.frame: later steps index it data.frame-style
## (logical column masks, matrix row indices), and the OneR / autoxgboost
## sections apply the same setDF() call after their own fread().
hr_ana_data <- fread(paste0(data_path, "/","aug_train.csv"), sep = ",",header = TRUE)
setDF(hr_ana_data)

##target variable distributions
##     0     1 
## 14381  4777

Split the sample to test the model accuracy against other open source R packages

## Row indices for the 80% training partition (returned as a one-column
## matrix because list = FALSE). The seed is fixed, using the same value as
## the OneR section, so the split is reproducible between runs.
set.seed(12345)
train.index <- createDataPartition(hr_ana_data$target_var, p = .8, list = FALSE)

DriveML step 1 - Missing variable treatment

## Run DriveML's autoMAR on the raw data: glm-based method, AUC cut-off 0.9,
## no stratified sampling (strata arguments left NULL)
marobj <- autoMAR(
  hr_ana_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)

DriveML step 2 - Auto Dataprep

## Imputation rule per column class: mode for factor/character columns,
## mean for integer columns, median for numeric columns
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)

## Automated data preparation on the full data set
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  auto_mar = TRUE,
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = TRUE,
  char_var_limit = 150,
  interaction_var = TRUE,
  frequent_var = TRUE,
  uid = "enrollee_id",
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... > 
## autoDataprep < missing imputation.... > 
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >

## Prepared data, restricted to the columns listed in var_list$mydata_var
master_hr_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
hr_train_data <- master_hr_data[sele_var]

### Validation rows are everything outside train.index; training rows are
### the partition itself
valid_hr <- hr_train_data[-train.index, ]
train_hr <- hr_train_data[train.index, ]

DriveML step 3 - Model development

## Fit every model type offered by autoMLmodel ("all") with 10 random-search
## tuning iterations, scoring on the validation split
mymodel_hr <- autoMLmodel(
  train = train_hr,
  test = valid_hr,
  target = "target_var",
  tuneIters = 10,
  tuneType = "random",
  models = "all",
  varImp = 10,
  liftGroup = 50,
  maxObs = 5000,
  uid = "enrollee_id",
  htmlreport = FALSE,
  pdp = TRUE,
  verbose = TRUE,
  seed = 42
)

## Persist the fitted model object in the working directory
save(mymodel_hr, file = "medium_data_mymodel.rdata")

DriveML Results

Model results

## Pull the `results` element out of the autoMLmodel output; the table
## printed below (fit/score time, AUC, accuracy, ...) is this object
results <- mymodel_hr$results
Model Fitting time Scoring time Train AUC Test AUC Accuracy Precision Recall F1_score
5 xgboost 10.702 secs 0.06 secs 0.866 0.789 0.784 0.563 0.535 0.549
2 logreg 6.214 secs 0.188 secs 0.805 0.786 0.776 0.564 0.380 0.454
4 ranger 24.495 secs 0.721 secs 0.975 0.783 0.783 0.569 0.471 0.515
1 glmnet 5.939 secs 0.061 secs 0.795 0.781 0.767 0.544 0.296 0.383
3 randomForest 1.686 mins 0.521 secs 0.875 0.780 0.774 0.563 0.354 0.434
6 rpart 3.88 secs 0.038 secs 0.783 0.761 0.778 0.553 0.496 0.523
## Variable Lift

### Partial dependency plot

### Random Forest Model validation ROC

### XGBoost Model validation ROC

### Random Forest Model Variable Importance
## [[1]]

Best ML model comparison with other R packages

1. DriveML

available on CRAN and git

selected best model from driveml outcome

## Start the DriveML timer: it covers MAR detection, data preparation and
## the model fit that follows
time <- Sys.time()

marobj <- autoMAR(
  hr_ana_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
##               Variable       AUC
## 1:        company_size 1.0000000
## 2:        company_type 1.0000000
## 3:     education_level 1.0000000
## 4: enrolled_university 1.0000000
## 5:          experience 0.9204786
## 6:        last_new_job 1.0000000
## 7:    major_discipline 1.0000000

## Imputation rule per column class: mode for factor/character columns,
## mean for integer columns, median for numeric columns
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)

## Automated data preparation on the full data set
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  auto_mar = TRUE,
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = TRUE,
  char_var_limit = 150,
  interaction_var = TRUE,
  frequent_var = TRUE,
  uid = "enrollee_id",
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... > 
## autoDataprep < missing imputation.... > 
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >

## Prepared data, restricted to the columns listed in var_list$mydata_var
master_hr_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
hr_train_data <- master_hr_data[sele_var]

### Validation rows are everything outside train.index; training rows are
### the partition itself
valid_hr <- hr_train_data[-train.index, ]
train_hr <- hr_train_data[train.index, ]

## Refit with xgboost only and default tuning settings
driveml_mediumdata <- autoMLmodel(
  train = train_hr,
  test = valid_hr,
  target = "target_var",
  models = "xgboost",
  uid = "enrollee_id",
  verbose = TRUE,
  seed = 42
)
## xgboost Model tuning started.... 
## autoMLmodel < All features xgboost tuned and trained >

## Elapsed time since `time` was set, and the validation AUC.
## NOTE(review): the elapsed time is measured in seconds, but it is stored
## in a column named `time_taken_min` -- confirm the intended unit.
dtime <- round(difftime(Sys.time(), time, units = 'secs'), 3)
bestroc <- round(driveml_mediumdata$results$`Test AUC`, 3)

## Comparison table: one row per package, filled in section by section
comparision_metric <- matrix(
  data = NA,
  nrow = 4,
  ncol = 8,
  dimnames = list(
    NULL,
    c("r_package_name", "dataset_name", "attributes", "missing",
      "Train_instances", "Test_instances", "time_taken_min", "test_auc")
  )
)

## Row 1: DriveML
comparision_metric[1, "r_package_name"] <- "DriveML"
comparision_metric[1, "dataset_name"] <- "Medium data"
comparision_metric[1, "attributes"] <- 14
comparision_metric[1, "missing"] <- "Yes"
comparision_metric[1, "Train_instances"] <- 15327
comparision_metric[1, "Test_instances"] <- 3831
comparision_metric[1, "time_taken_min"] <- dtime
comparision_metric[1, "test_auc"] <- bestroc

2. H2o AutoML

available on CRAN and git

## Convert character columns to factor for the H2o automl function.
## The column mask is computed once with vapply(), which always returns a
## logical vector (sapply() can change its return type on unusual input).
char_cols <- vapply(hr_ana_data, is.character, logical(1))
hr_ana_data[char_cols] <- lapply(hr_ana_data[char_cols], as.factor)

## Convert target class variable as factor
## For binary classification, response should be a factor
hr_ana_data$target_var <- as.factor(hr_ana_data$target_var)
## The id column is dropped so it is not passed to H2O as a predictor
hr_ana_data$enrollee_id <- NULL

### Train and valid data (same row partition as the DriveML run)
train_data <- hr_ana_data[train.index,]
valid_data <- hr_ana_data[-train.index,]

Training using h2o.automl() function

## Connect to (or start) a local H2O cluster; as.h2o() needs an active
## connection. Done before the timer so start-up is not counted.
h2o.init()

## Load the train and valid data sets into h2o
time = Sys.time() ## start time
train_d = as.h2o(train_data)
## Validation frame passed to h2o.automl() as `validation_frame`
test_d = as.h2o(valid_data)
Train Model

## Run H2O AutoML on the training frame; GLM, deep learning, DRF and stacked
## ensembles are excluded from the search
h2o_medium_data_model <- h2o.automl(y = "target_var", 
                        training_frame = train_d, 
                        validation_frame = test_d,
                        exclude_algos = c("GLM", "DeepLearning", "DRF","StackedEnsemble"))
## Leaderboard as a data.frame; the first row is the top-ranked model.
## NOTE(review): confirm which frame H2O scores the leaderboard on -- this
## AUC may not be computed on test_d.
lb <- as.data.frame(h2o_medium_data_model@leaderboard)
best_auc <- round(lb$auc[1],3)
## NOTE(review): elapsed time is in seconds but stored under `time_taken_min`
time_h2o <- round(difftime(Sys.time(), time, units='secs'),3) 
## NOTE(review): save() writes the R-side object only; h2o.saveModel() is
## H2O's documented way to persist a model -- confirm this file is reloadable
save(h2o_medium_data_model, file = "h2o_model_medium_data_hr_ana.rdata")

## Row 2 of the comparison table: H2o automl
comparision_metric[2,"dataset_name"] <- "Medium data"
comparision_metric[2,"r_package_name"] <- "H2o automl"
comparision_metric[2,"time_taken_min"] <- time_h2o
comparision_metric[2,"test_auc"] <- best_auc
comparision_metric[2,"attributes"] <- 14
comparision_metric[2,"missing"] <- "Yes"
comparision_metric[2,"Train_instances"] <- 15327
comparision_metric[2,"Test_instances"] <- 3831

3. OneR

available on CRAN and git

build model with the OneR package

Note: There are no built-in feature engineering functions available in OneR. We used a DriveML function to prepare the input dataset

## Reload the raw training file
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(paste0(data_path, "/", "aug_train.csv"), sep = ",", header = TRUE)

## Start the OneR timer, switch to a plain data.frame and drop the id column
time <- Sys.time()
setDF(hr_ana_data)
hr_ana_data$enrollee_id <- NULL

## Impute missing values per column class; the imputed data is in cc$data
cc <- impute(
  hr_ana_data,
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)

## Random 80% / 20% split into training and test rows, seeded for
## reproducibility
set.seed(12345)
train.index <- createDataPartition(hr_ana_data$target_var, p = .8, list = FALSE)

valid_oth <- cc$data[-train.index, ]
train_oth <- cc$data[train.index, ]

## Fit OneR with target_var as response and all other columns as candidates
model_oner_medium <- OneR(target_var ~ ., data = train_oth, verbose = TRUE)
##     Attribute              Accuracy
## 1 * city                   78.52%  
## 2   city_development_index 78.31%  
## 3   gender                 74.96%  
## 3   relevent_experience    74.96%  
## 3   enrolled_university    74.96%  
## 3   education_level        74.96%  
## 3   major_discipline       74.96%  
## 3   experience             74.96%  
## 3   company_size           74.96%  
## 3   company_type           74.96%  
## 3   last_new_job           74.96%  
## 3   training_hours         74.96%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

## Class probabilities for the hold-out rows
prediction_oner <- predict(model_oner_medium, valid_oth, type = c("prob"))
## NOTE(review): elapsed time is in seconds but stored under `time_taken_min`
medium_data_onertime <- round(difftime(Sys.time(), time, units = 'secs'), 3)
save(model_oner_medium, file = "oner_model_medium_data_har_ana.rdata")
## AUC from the probability column labelled "1", rounded to 2 decimals
best_auc_oner <- round(auc(valid_oth$target_var, prediction_oner[, "1"]), 2)
## Row 3 of the comparison table: OneR
comparision_metric[3, "r_package_name"] <- "OneR"
comparision_metric[3, "dataset_name"] <- "Medium data"
comparision_metric[3, "attributes"] <- 14
comparision_metric[3, "missing"] <- "Yes"
comparision_metric[3, "Train_instances"] <- 15327
comparision_metric[3, "Test_instances"] <- 3831
comparision_metric[3, "time_taken_min"] <- medium_data_onertime
comparision_metric[3, "test_auc"] <- best_auc_oner

4. autoxgboost

available on git

Note: There are no built-in feature engineering functions available in autoxgboost. We used a DriveML function to prepare the input dataset


## Reload the raw data and build a cleaned data set with DriveML autoDataprep
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(paste0(data_path, "/", "aug_train.csv"), sep = ",", header = TRUE)
setDF(hr_ana_data)

## NOTE(review): `time` is assigned again just before the autoxgboost fit,
## so this start time is overwritten and data preparation is not counted in
## the reported autoxgboost time -- confirm that is intended.
time <- Sys.time()

## Imputation, one-hot encoding and variable selection only; the outlier,
## interaction and frequency transformers are switched off. Uses the
## `myimpute` specification defined earlier in the script.
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = FALSE,
  char_var_limit = 150,
  interaction_var = FALSE,
  frequent_var = FALSE,
  uid = "enrollee_id",
  verbose = TRUE
)
## autoDataprep < missing imputation.... > 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >

## Keep the selected columns and drop the id column before modelling
master_cc_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
cc_train_data <- master_cc_data[sele_var]
cc_train_data$enrollee_id <- NULL

## 80% / 20% split; seeded with the same value as the OneR section so the
## partition is reproducible between runs
set.seed(12345)
train.index <- createDataPartition(hr_ana_data$target_var, p = .8, list = FALSE)

train_oth <- cc_train_data[train.index,]
valid_oth <- cc_train_data[-train.index,]
# create a classification task (the target must be a factor)
train_oth$target_var <- as.factor(train_oth$target_var)

# `positive` names a class level of the target, so it is passed as the
# string "1" rather than the number 1
trainTask = makeClassifTask(data = train_oth, target = "target_var", positive = "1")
# create a control object for optimizer (a single MBO iteration)
time = Sys.time()
ctrl = makeMBOControl()
ctrl = setMBOControlTermination(ctrl, iters = 1L) 
# fit the model
auto_xgb_model = autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
# NOTE(review): elapsed time is in seconds but stored under `time_taken_min`
tot_time = round(difftime(Sys.time(), time, units='secs'),3)
# score the hold-out rows; the target is dropped by name instead of by
# position so the call does not depend on column order
feature_cols <- setdiff(names(valid_oth), "target_var")
prediction = predict(auto_xgb_model, valid_oth[, feature_cols, drop = FALSE])
prediction = getPredictionProbabilities(prediction)

# AUC of the predicted probabilities against the hold-out target
myauc = round(auc(valid_oth$target_var, prediction),3)
save(auto_xgb_model, file = "axgb_model_medium_data_hr_ana.rdata")

## Row 4 of the comparison table: autoxgboost
comparision_metric[4, "r_package_name"] <- "autoxgboost"
comparision_metric[4, "dataset_name"] <- "Medium data"
comparision_metric[4, "attributes"] <- 14
comparision_metric[4, "missing"] <- "Yes"
comparision_metric[4, "Train_instances"] <- 15327
comparision_metric[4, "Test_instances"] <- 3831
comparision_metric[4, "time_taken_min"] <- tot_time
comparision_metric[4, "test_auc"] <- myauc

Comparison results

r_package_name dataset_name attributes missing Train_instances Test_instances time_taken_min test_auc
DriveML Medium data 14 Yes 15327 3831 35.126 0.798
H2o automl Medium data 14 Yes 15327 3831 60.869 0.8
OneR Medium data 14 Yes 15327 3831 0.105 0.71
autoxgboost Medium data 14 Yes 15327 3831 13.548 0.643