library(DriveML)    ## automl package
library(data.table) ## data wrangling package
library(caret)      ## ML wrapper package
library(mlr)        ## ML wrapper package
library(h2o)        ## automl package
library(OneR)       ## automl package
library(Metrics)    ## auc function
library(autoxgboost)## automl package

HR Analytics: Job Change of Data Scientists

data source : https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists

Content

A company which is active in Medium data and Data Science wants to hire data scientists among people who successfully pass some courses conducted by the company. Many people sign up for its training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment, because this helps to reduce cost and time, improve the quality of training, and plan the courses and the categorization of candidates. Information related to demographics, education and experience is available from the candidates' signup and enrollment.

This dataset is also designed to help HR research understand the factors that lead a person to leave their current job. Using model(s) built on the current credentials, demographics and experience data, you will predict the probability that a candidate will look for a new job or will work for the company, as well as interpret the factors affecting the employee's decision.

The whole data set is divided into train and test. The target isn't included in the test set, but the test target values data file is available for related tasks. A sample submission corresponding to the enrollee_id of the test set is provided too, with columns: enrollee_id, target

Note:

The dataset is imbalanced.

Most features are categorical (Nominal, Ordinal, Binary), some with high cardinality.

Missing imputation can be a part of your pipeline as well.

Problem Statement

Predict who will move to a new job

Please refer to the following task for more details: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists/tasks?taskId=3015

DriveML Experiment

## Folder holding the Kaggle HR analytics files; read the training CSV from it
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(
  paste0(data_path, "/", "aug_train.csv"),
  sep = ",",
  header = TRUE
)


## Count of rows per class of the target variable
table(hr_ana_data[["target_var"]])
## 
##     0     1 
## 14381  4777

Split the sample to test the model accuracy against other open source R packages

## Fixed seed so the 80% training partition can be reproduced
set.seed(12345)
train.index <- createDataPartition(
  hr_ana_data$target_var,
  p = 0.8,
  list = FALSE
)

DriveML step 1 - Missing variable treatment

## Build the DriveML missing-variable object with a glm, then list the
## features it reports
marobj <- autoMAR(
  hr_ana_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
marobj$auc_features

DriveML step 2 - Auto Dataprep

## Imputation method applied to each column class
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)

## DriveML automated data preparation on the full data set
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  uid = "enrollee_id",
  missimpute = myimpute,
  auto_mar = TRUE,
  mar_object = marobj,
  dummyvar = TRUE,
  char_var_limit = 150,
  outlier_flag = TRUE,
  frequent_var = TRUE,
  interaction_var = TRUE,
  aucv = 0.002,
  corr = 0.999,
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... > 
## autoDataprep < missing imputation.... > 
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
### Keep only the variables listed by autoDataprep
sele_var <- traindata$var_list$mydata_var
master_hr_data <- traindata$master_data
hr_train_data <- master_hr_data[sele_var]

### Train and valid data (rows outside train.index form the validation set)
valid_hr <- hr_train_data[-train.index, ]
train_hr <- hr_train_data[train.index, ]

DriveML step 3 - Model development

## Tune and train every DriveML model on the training rows, scoring on the
## validation rows, then keep the fitted object on disk
mymodel_hr <- autoMLmodel(
  train = train_hr,
  test = valid_hr,
  target = "target_var",
  uid = "enrollee_id",
  models = "all",
  tuneType = "random",
  tuneIters = 10,
  maxObs = 5000,
  varImp = 10,
  liftGroup = 50,
  pdp = TRUE,
  htmlreport = FALSE,
  verbose = TRUE,
  seed = 42
)

save(mymodel_hr, file = "medium_data_mymodel.rdata")

DriveML Results

Model results

## Per-model summary (timings, AUC and classification metrics) as a table
results <- mymodel_hr[["results"]]
kableExtra::kable(results)
Model Fitting time Scoring time Train AUC Test AUC Accuracy Precision Recall F1_score
5 xgboost 10.702 secs 0.06 secs 0.866 0.789 0.784 0.563 0.535 0.549
2 logreg 6.214 secs 0.188 secs 0.805 0.786 0.776 0.564 0.380 0.454
4 ranger 24.495 secs 0.721 secs 0.975 0.783 0.783 0.569 0.471 0.515
1 glmnet 5.939 secs 0.061 secs 0.795 0.781 0.767 0.544 0.296 0.383
3 randomForest 1.686 mins 0.521 secs 0.875 0.780 0.774 0.563 0.354 0.434
6 rpart 3.88 secs 0.038 secs 0.783 0.761 0.778 0.553 0.496 0.523
## Variable Lift
## Print the lift plot stored in the model explanation results
mymodel_hr$modelexp$Lift_plot

### Partial dependency plot
### (kept commented out, so nothing is printed for it)
#mymodel_hr$modelexp$pdp

### Random Forest Model validation ROC
mymodel_hr$trainedModels$randomForest$modelPlots$TestROC

### XGBoost Model validation ROC
mymodel_hr$trainedModels$xgboost$modelPlots$TestROC

### Random Forest Model Variable Importance
mymodel_hr$trainedModels$randomForest$modelPlots$VarImp
## [[1]]

Best ML model comparison with other R packages

1. DriveML

available on CRAN and git

selected best model from driveml outcome

## Start the wall-clock timer for the DriveML run, then rebuild the
## missing-variable object
time <- Sys.time()
marobj <- autoMAR(
  hr_ana_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |======================================================================| 100%
## Print the variables reported by autoMAR together with their AUC
marobj$auc_features
##               Variable       AUC
## 1:        company_size 1.0000000
## 2:        company_type 1.0000000
## 3:     education_level 1.0000000
## 4: enrolled_university 1.0000000
## 5:          experience 0.9204786
## 6:        last_new_job 1.0000000
## 7:    major_discipline 1.0000000
## Imputation method applied to each column class
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)

## DriveML automated data preparation on the full data set
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  uid = "enrollee_id",
  missimpute = myimpute,
  auto_mar = TRUE,
  mar_object = marobj,
  dummyvar = TRUE,
  char_var_limit = 150,
  outlier_flag = TRUE,
  frequent_var = TRUE,
  interaction_var = TRUE,
  aucv = 0.002,
  corr = 0.999,
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... > 
## autoDataprep < missing imputation.... > 
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
### Keep only the variables listed by autoDataprep
sele_var <- traindata$var_list$mydata_var
master_hr_data <- traindata$master_data
hr_train_data <- master_hr_data[sele_var]

### Train and valid data (rows outside train.index form the validation set)
valid_hr <- hr_train_data[-train.index, ]
train_hr <- hr_train_data[train.index, ]

## Refit only the xgboost model, the top Test AUC in the earlier results table
driveml_mediumdata <- autoMLmodel(
  train = train_hr,
  test = valid_hr,
  target = "target_var",
  uid = "enrollee_id",
  models = "xgboost",
  verbose = TRUE,
  seed = 42
)
## xgboost Model tuning started.... 
## autoMLmodel < All features xgboost tuned and trained >
## Wall-clock seconds elapsed since the DriveML timer was started
dtime <- round(difftime(Sys.time(), time, units = "secs"), 3)
bestroc <- round(driveml_mediumdata$results$`Test AUC`, 3)

## Comparison table: one row per R package, filled in section by section
comparision_metric <- matrix(data = NA, nrow = 4, ncol = 8)
colnames(comparision_metric) <- c(
  "r_package_name", "dataset_name", "attributes", "missing",
  "Train_instances", "Test_instances", "time_taken_min", "test_auc"
)

## Row 1: DriveML (the stored time is in seconds despite the column name)
comparision_metric[1, "r_package_name"] <- "DriveML"
comparision_metric[1, "dataset_name"] <- "Medium data"
comparision_metric[1, "attributes"] <- 14
comparision_metric[1, "missing"] <- "Yes"
comparision_metric[1, "Train_instances"] <- 15327
comparision_metric[1, "Test_instances"] <- 3831
comparision_metric[1, "time_taken_min"] <- dtime
comparision_metric[1, "test_auc"] <- bestroc

2. H2o AutoML

available on CRAN and git

setDF(hr_ana_data)
## Convert character data to factor for H2o automl function
## The column mask is computed once with vapply(), which always returns a
## logical vector (sapply() can change its return type with its input)
is_chr_col <- vapply(hr_ana_data, is.character, logical(1))
hr_ana_data[is_chr_col] <- lapply(hr_ana_data[is_chr_col], as.factor)

## Convert target class variable as factor
## For binary classification, response should be a factor
hr_ana_data$target_var <- as.factor(hr_ana_data$target_var)
## Drop the id column so it is not offered to AutoML as a predictor
hr_ana_data$enrollee_id <- NULL

### Train and valid data (same row partition as the DriveML run)
train_data <- hr_ana_data[train.index, ]
valid_data <- hr_ana_data[-train.index, ]

Training using h2o.automl() function

## Connect R to an H2O cluster (the log below shows a connection to an
## already running local cluster)
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 14 hours 
##     H2O cluster timezone:       Asia/Kolkata 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.1.3 
##     H2O cluster version age:    28 days, 19 hours and 37 minutes  
##     H2O cluster name:           H2O_started_from_R_dubrangala_zcg934 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   5.99 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.3 (2020-10-10)
## Load the train and valid data sets into H2O
time <- Sys.time() ## start time
train_d <- as.h2o(train_data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Validation rows as an H2O frame
test_d <- as.h2o(valid_data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Train Model

## Run H2O AutoML on the training frame, leaving out the listed algorithms
h2o_medium_data_model <- h2o.automl(
  y = "target_var",
  training_frame = train_d,
  validation_frame = test_d,
  exclude_algos = c("GLM", "DeepLearning", "DRF", "StackedEnsemble")
)
## 
  |                                                                            
  |                                                                      |   0%
## 13:37:31.49: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 13:37:31.49: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |======================================================================| 100%
lb <- as.data.frame(h2o_medium_data_model@leaderboard)
## No leaderboard frame was passed to h2o.automl(), and the AutoML log above
## reports that models are validated by cross-validation only, so lb$auc is
## not a hold-out score. Score the leader on the held-out frame instead so
## the value stored as test_auc really is a test AUC.
leader_perf <- h2o.performance(h2o_medium_data_model@leader, newdata = test_d)
best_auc <- round(h2o.auc(leader_perf), 3)
time_h2o <- round(difftime(Sys.time(), time, units = "secs"), 3)
## NOTE(review): save() writes the R object only; confirm whether
## h2o.saveModel() is needed to reload this model in a fresh H2O cluster
save(h2o_medium_data_model, file = "h2o_model_medium_data_hr_ana.rdata")

## Row 2: H2O AutoML (the stored time is in seconds despite the column name)
comparision_metric[2, "dataset_name"] <- "Medium data"
comparision_metric[2, "r_package_name"] <- "H2o automl"
comparision_metric[2, "time_taken_min"] <- time_h2o
comparision_metric[2, "test_auc"] <- best_auc
comparision_metric[2, "attributes"] <- 14
comparision_metric[2, "missing"] <- "Yes"
comparision_metric[2, "Train_instances"] <- 15327
comparision_metric[2, "Test_instances"] <- 3831

3. OneR

available on CRAN and git

build model with the OneR package

Note: There are no built-in feature engineering functions available in OneR. We used DriveML functions to prepare the input dataset

## Read Raw data
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(
  paste0(data_path, "/", "aug_train.csv"),
  sep = ",",
  header = TRUE
)

## Start the OneR timer, then work on a plain data frame without the id column
time <- Sys.time() ## Start time
setDF(hr_ana_data) # set as data frame
hr_ana_data$enrollee_id <- NULL

## Impute missing values, one method per column class
## NOTE(review): imputation is fitted on the full data before the split below,
## so validation rows contribute to the imputed values - confirm this is intended
cc <- impute(
  hr_ana_data,
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## split randomly into a training (80%) and a test (20%) set
set.seed(12345) # for reproducibility
train.index <- createDataPartition(hr_ana_data$target_var, p = 0.8, list = FALSE)

valid_oth <- cc$data[-train.index, ]
train_oth <- cc$data[train.index, ]

## Fit OneR on all predictors; verbose = TRUE prints the per-attribute accuracy
model_oner_medium <- OneR(
  target_var ~ .,
  data = train_oth,
  verbose = TRUE
)
## 
##     Attribute              Accuracy
## 1 * city                   78.52%  
## 2   city_development_index 78.31%  
## 3   gender                 74.96%  
## 3   relevent_experience    74.96%  
## 3   enrolled_university    74.96%  
## 3   education_level        74.96%  
## 3   major_discipline       74.96%  
## 3   experience             74.96%  
## 3   company_size           74.96%  
## 3   company_type           74.96%  
## 3   last_new_job           74.96%  
## 3   training_hours         74.96%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
## Class probabilities for the validation rows
prediction_oner <- predict(model_oner_medium, valid_oth, type = "prob")
## Seconds elapsed since the OneR timer was started
medium_data_onertime <- round(difftime(Sys.time(), time, units = "secs"), 3)
## Keep the fitted model on disk, then print its rules and diagnostics
save(model_oner_medium, file = "oner_model_medium_data_har_ana.rdata")
summary(model_oner_medium)
## 
## Call:
## OneR.formula(formula = target_var ~ ., data = train_oth, verbose = TRUE)
## 
## Rules:
## If city = city_1   then target_var = 0
## If city = city_10  then target_var = 0
## If city = city_100 then target_var = 0
## If city = city_101 then target_var = 1
## If city = city_102 then target_var = 0
## If city = city_103 then target_var = 0
## If city = city_104 then target_var = 0
## If city = city_105 then target_var = 0
## If city = city_106 then target_var = 0
## If city = city_107 then target_var = 1
## If city = city_109 then target_var = 0
## If city = city_11  then target_var = 1
## If city = city_111 then target_var = 0
## If city = city_114 then target_var = 0
## If city = city_115 then target_var = 0
## If city = city_116 then target_var = 0
## If city = city_117 then target_var = 0
## If city = city_118 then target_var = 0
## If city = city_12  then target_var = 0
## If city = city_120 then target_var = 0
## If city = city_121 then target_var = 0
## If city = city_123 then target_var = 0
## If city = city_126 then target_var = 1
## If city = city_127 then target_var = 0
## If city = city_128 then target_var = 1
## If city = city_129 then target_var = 0
## If city = city_13  then target_var = 0
## If city = city_131 then target_var = 0
## If city = city_133 then target_var = 0
## If city = city_134 then target_var = 0
## If city = city_136 then target_var = 0
## If city = city_138 then target_var = 0
## If city = city_139 then target_var = 1
## If city = city_14  then target_var = 0
## If city = city_141 then target_var = 0
## If city = city_142 then target_var = 0
## If city = city_143 then target_var = 0
## If city = city_144 then target_var = 0
## If city = city_145 then target_var = 1
## If city = city_146 then target_var = 0
## If city = city_149 then target_var = 0
## If city = city_150 then target_var = 0
## If city = city_152 then target_var = 0
## If city = city_155 then target_var = 1
## If city = city_157 then target_var = 0
## If city = city_158 then target_var = 0
## If city = city_159 then target_var = 0
## If city = city_16  then target_var = 0
## If city = city_160 then target_var = 0
## If city = city_162 then target_var = 0
## If city = city_165 then target_var = 0
## If city = city_166 then target_var = 0
## If city = city_167 then target_var = 0
## If city = city_171 then target_var = 1
## If city = city_173 then target_var = 0
## If city = city_175 then target_var = 0
## If city = city_176 then target_var = 0
## If city = city_179 then target_var = 0
## If city = city_18  then target_var = 0
## If city = city_180 then target_var = 0
## If city = city_19  then target_var = 0
## If city = city_2   then target_var = 0
## If city = city_20  then target_var = 0
## If city = city_21  then target_var = 1
## If city = city_23  then target_var = 0
## If city = city_24  then target_var = 0
## If city = city_25  then target_var = 1
## If city = city_26  then target_var = 0
## If city = city_27  then target_var = 0
## If city = city_28  then target_var = 0
## If city = city_30  then target_var = 0
## If city = city_31  then target_var = 0
## If city = city_33  then target_var = 1
## If city = city_36  then target_var = 0
## If city = city_37  then target_var = 0
## If city = city_39  then target_var = 0
## If city = city_40  then target_var = 0
## If city = city_41  then target_var = 0
## If city = city_42  then target_var = 1
## If city = city_43  then target_var = 1
## If city = city_44  then target_var = 0
## If city = city_45  then target_var = 0
## If city = city_46  then target_var = 0
## If city = city_48  then target_var = 0
## If city = city_50  then target_var = 0
## If city = city_53  then target_var = 0
## If city = city_54  then target_var = 0
## If city = city_55  then target_var = 0
## If city = city_57  then target_var = 0
## If city = city_59  then target_var = 0
## If city = city_61  then target_var = 0
## If city = city_62  then target_var = 0
## If city = city_64  then target_var = 0
## If city = city_65  then target_var = 0
## If city = city_67  then target_var = 0
## If city = city_69  then target_var = 0
## If city = city_7   then target_var = 0
## If city = city_70  then target_var = 0
## If city = city_71  then target_var = 0
## If city = city_72  then target_var = 0
## If city = city_73  then target_var = 0
## If city = city_74  then target_var = 1
## If city = city_75  then target_var = 0
## If city = city_76  then target_var = 0
## If city = city_77  then target_var = 0
## If city = city_78  then target_var = 1
## If city = city_79  then target_var = 0
## If city = city_8   then target_var = 0
## If city = city_80  then target_var = 0
## If city = city_81  then target_var = 0
## If city = city_82  then target_var = 0
## If city = city_83  then target_var = 0
## If city = city_84  then target_var = 0
## If city = city_89  then target_var = 0
## If city = city_9   then target_var = 0
## If city = city_90  then target_var = 0
## If city = city_91  then target_var = 0
## If city = city_93  then target_var = 0
## If city = city_94  then target_var = 0
## If city = city_97  then target_var = 0
## If city = city_98  then target_var = 0
## If city = city_99  then target_var = 0
## 
## Accuracy:
## 12034 of 15327 instances classified correctly (78.52%)
## 
## Contingency table:
##           city
## target_var city_1 city_10 city_100 city_101 city_102 city_103 city_104 city_105
##        0     * 21    * 63    * 160       27    * 216   * 2730    * 221     * 54
##        1        3       9       56     * 38       36      731       23        9
##        Sum     24      72      216       65      252     3461      244       63
##           city
## target_var city_106 city_107 city_109 city_11 city_111 city_114 city_115
##        0        * 5        2      * 5      81      * 2    * 969     * 27
##        1          2      * 3        2   * 117        0      109       15
##        Sum        7        5        7     198        2     1078       42
##           city
## target_var city_116 city_117 city_118 city_12 city_120 city_121 city_123
##        0       * 90      * 6     * 16    * 10      * 3      * 2     * 45
##        1         16        3        7       2        0        1       18
##        Sum      106        9       23      12        3        3       63
##           city
## target_var city_126 city_127 city_128 city_129 city_13 city_131 city_133
##        0         10      * 7       28      * 1    * 35      * 5      * 6
##        1       * 13        1     * 41        0       4        3        1
##        Sum       23        8       69        1      39        8        7
##           city
## target_var city_134 city_136 city_138 city_139 city_14 city_141 city_142
##        0       * 27    * 435     * 81        0    * 17     * 16     * 32
##        1         10       49        9      * 3       6        3       12
##        Sum       37      484       90        3      23       19       44
##           city
## target_var city_143 city_144 city_145 city_146 city_149 city_150 city_152
##        0       * 16     * 12       20      * 4     * 63     * 40     * 30
##        1         12        5     * 31        2       20       13        9
##        Sum       28       17       51        6       83       53       39
##           city
## target_var city_155 city_157 city_158 city_159 city_16 city_160 city_162
##        0          2     * 15     * 30     * 66  * 1086    * 519     * 71
##        1        * 9        3       11       11     143      157       34
##        Sum       11       18       41       77    1229      676      105
##           city
## target_var city_165 city_166 city_167 city_171 city_173 city_175 city_176
##        0       * 47      * 2      * 2        0    * 105     * 10     * 15
##        1         10        1        2      * 1       16        3        6
##        Sum       57        3        4        1      121       13       21
##           city
## target_var city_179 city_18 city_180 city_19 city_2 city_20 city_21 city_23
##        0        * 1     * 3      * 4    * 53    * 5    * 22     874   * 133
##        1          1       1        2      37      0       3  * 1311      13
##        Sum        2       4        6      90      5      25    2185     146
##           city
## target_var city_24 city_25 city_26 city_27 city_28 city_30 city_31 city_33
##        0      * 42       1    * 16    * 29   * 144    * 14     * 3       6
##        1         9     * 2       4       6      13       3       1    * 10
##        Sum      51       3      20      35     157      17       4      16
##           city
## target_var city_36 city_37 city_39 city_40 city_41 city_42 city_43 city_44
##        0     * 117    * 10     * 9    * 46    * 55       2       4     * 9
##        1        11       2       0       8      15     * 6     * 7       6
##        Sum     128      12       9      54      70       8      11      15
##           city
## target_var city_45 city_46 city_48 city_50 city_53 city_54 city_55 city_57
##        0      * 83    * 70     * 5   * 101    * 16    * 10     * 8    * 74
##        1        13      26       5      12       5       2       4      12
##        Sum      96      96      10     113      21      12      12      86
##           city
## target_var city_59 city_61 city_62 city_64 city_65 city_67 city_69 city_7
##        0       * 5   * 134     * 5    * 83   * 118   * 292    * 15   * 20
##        1         1      17       0      12      21      44       2      4
##        Sum       6     151       5      95     139     336      17     24
##           city
## target_var city_70 city_71 city_72 city_73 city_74 city_75 city_76 city_77
##        0      * 21   * 183    * 17   * 162      34   * 218    * 29    * 24
##        1        16      30       2      57    * 41      24      12       1
##        Sum      37     213      19     219      75     242      41      25
##           city
## target_var city_78 city_79 city_8 city_80 city_81 city_82 city_83 city_84
##        0        12     * 3    * 4    * 11     * 4     * 4    * 93    * 13
##        1      * 15       2      0       1       1       0      20       3
##        Sum      27       5      4      12       5       4     113      16
##           city
## target_var city_89 city_9 city_90 city_91 city_93 city_94 city_97 city_98
##        0      * 38   * 12   * 105    * 23    * 17    * 12    * 78    * 54
##        1        10      3      47      14       5       8       8       5
##        Sum      48     15     152      37      22      20      86      59
##           city
## target_var city_99   Sum
##        0      * 67 11489
##        1         8  3838
##        Sum      75 15327
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 2524.1, df = 121, p-value < 2.2e-16
## Validation AUC from the predicted probability of class "1".
## Metrics::auc is called by namespace because mlr also exports an object
## named `auc`, so the bare name depends on package attach order.
## Rounded to 3 digits to match the other rows of the comparison table.
best_auc_oner <- round(Metrics::auc(valid_oth$target_var, prediction_oner[, "1"]), 3)

## Row 3: OneR (the stored time is in seconds despite the column name)
comparision_metric[3, "dataset_name"] <- "Medium data"
comparision_metric[3, "r_package_name"] <- "OneR"
comparision_metric[3, "time_taken_min"] <- medium_data_onertime
comparision_metric[3, "test_auc"] <- best_auc_oner
comparision_metric[3, "attributes"] <- 14
comparision_metric[3, "missing"] <- "Yes"
comparision_metric[3, "Train_instances"] <- 15327
comparision_metric[3, "Test_instances"] <- 3831

4. autoxgboost

available on git

Note: There are no built-in feature engineering functions available in autoxgboost. We used DriveML functions to prepare the input dataset

devtools::install_github("ja-thomas/autoxgboost")

## Re-read the raw training file; DriveML autoDataprep cleans it afterwards
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(
  paste0(data_path, "/", "aug_train.csv"),
  sep = ",",
  header = TRUE
)
setDF(hr_ana_data) # set as data frame

## NOTE(review): `time` is assigned again right before model fitting further
## down, so this start time is never read - confirm whether data preparation
## should count towards the reported time
time <- Sys.time() ## Start time
## Data preparation with the outlier, interaction and frequency steps off
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  uid = "enrollee_id",
  missimpute = myimpute,
  dummyvar = TRUE,
  char_var_limit = 150,
  outlier_flag = FALSE,
  frequent_var = FALSE,
  interaction_var = FALSE,
  aucv = 0.002,
  corr = 0.999,
  verbose = TRUE
)
## autoDataprep < missing imputation.... > 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
## Keep the variables listed by autoDataprep and drop the id column
sele_var <- traindata$var_list$mydata_var
master_cc_data <- traindata$master_data
cc_train_data <- master_cc_data[sele_var]
cc_train_data$enrollee_id <- NULL

## Same seed and proportion as the earlier partitions
set.seed(12345)
train.index <- createDataPartition(hr_ana_data$target_var, p = 0.8, list = FALSE)

valid_oth <- cc_train_data[-train.index, ]
train_oth <- cc_train_data[train.index, ]
# create a classification task; the target has to be a factor
train_oth$target_var <- as.factor(train_oth$target_var)

# `positive` names the positive class and must be one of the factor levels,
# so it is passed as the string "1" rather than the number 1
trainTask <- makeClassifTask(data = train_oth, target = "target_var", positive = "1")
# start the timer, then build an optimizer control limited to one iteration
time <- Sys.time()
ctrl <- setMBOControlTermination(makeMBOControl(), iters = 1L)
# fit the model
auto_xgb_model <- autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
# seconds elapsed since the timer above was started
tot_time <- round(difftime(Sys.time(), time, units = "secs"), 3)
# predict on the validation rows; the target is dropped by name so the call
# does not depend on target_var being the first column
feature_cols <- setdiff(names(valid_oth), "target_var")
prediction <- predict(auto_xgb_model, valid_oth[feature_cols])
prediction <- getPredictionProbabilities(prediction)

# Metrics::auc is called by namespace because mlr also exports an object
# named `auc`, so the bare name depends on package attach order
myauc <- round(Metrics::auc(valid_oth$target_var, prediction), 3)
save(auto_xgb_model, file = "axgb_model_medium_data_hr_ana.rdata")

## Row 4: autoxgboost (the stored time is in seconds despite the column name)
comparision_metric[4, "r_package_name"] <- "autoxgboost"
comparision_metric[4, "dataset_name"] <- "Medium data"
comparision_metric[4, "attributes"] <- 14
comparision_metric[4, "missing"] <- "Yes"
comparision_metric[4, "Train_instances"] <- 15327
comparision_metric[4, "Test_instances"] <- 3831
comparision_metric[4, "time_taken_min"] <- tot_time
comparision_metric[4, "test_auc"] <- myauc

Comparison results

## Every timing stored in the table is measured with units = "secs", so the
## time column is relabelled on a display copy before rendering; the original
## matrix keeps its column names for any code that indexes it
display_metric <- comparision_metric
time_col <- colnames(display_metric) == "time_taken_min"
colnames(display_metric)[time_col] <- "time_taken_sec"
kableExtra::kable(display_metric)
r_package_name dataset_name attributes missing Train_instances Test_instances time_taken_min test_auc
DriveML Medium data 14 Yes 15327 3831 35.126 0.798
H2o automl Medium data 14 Yes 15327 3831 60.869 0.8
OneR Medium data 14 Yes 15327 3831 0.105 0.71
autoxgboost Medium data 14 Yes 15327 3831 13.548 0.643