DriveML Experiment
## Location of the HR analytics CSV (absolute, machine-specific path)
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(
  file.path(data_path, "aug_train.csv"),
  sep = ",",
  header = TRUE
)
## Class counts of the target variable
table(hr_ana_data$target_var)
##
## 0 1
## 14381 4777
Split the sample to test the model accuracy against other open-source R packages
## Fix the RNG so the 80/20 partition below is reproducible
set.seed(12345)
train.index <- createDataPartition(
  y = hr_ana_data$target_var,
  p = 0.8,
  list = FALSE
)
DriveML step 1 - Missing variable treatment
## Run DriveML's autoMAR() on the raw data with a glm-based method and
## aucv = 0.9, without any stratification
marobj <- autoMAR(
  hr_ana_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
## Show the features reported by autoMAR()
marobj$auc_features
DriveML step 2 - Auto Dataprep
## Imputation rule per column class, passed to autoDataprep() as `missimpute`
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## Run the DriveML data-preparation pipeline, reusing the autoMAR() result
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  auto_mar = TRUE,
  mar_object = marobj,
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = TRUE,
  char_var_limit = 150,
  interaction_var = TRUE,
  frequent_var = TRUE,
  uid = "enrollee_id",
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... >
## autoDataprep < missing imputation.... >
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
## Keep only the variables selected by autoDataprep()
master_hr_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
hr_train_data <- master_hr_data[sele_var]
## Rows in train.index form the training set; the remainder is the validation set
train_hr <- hr_train_data[train.index, ]
valid_hr <- hr_train_data[-train.index, ]
DriveML step 3 - Model development
## Fit every model type offered by autoMLmodel() with a random search
## (10 tuning iterations) and evaluate each on the validation split
mymodel_hr <- autoMLmodel(
  train = train_hr,
  test = valid_hr,
  target = "target_var",
  tuneIters = 10,
  tuneType = "random",
  models = "all",
  varImp = 10,
  liftGroup = 50,
  maxObs = 5000,
  uid = "enrollee_id",
  htmlreport = FALSE,
  pdp = TRUE,
  verbose = TRUE,
  seed = 42
)
## Persist the fitted object in the current working directory
save(mymodel_hr, file = "medium_data_mymodel.rdata")
DriveML Results
Model results
## Per-model summary returned by autoMLmodel(), rendered as a table
results <- mymodel_hr$results
kableExtra::kable(x = results)
| | Model | Fitting time | Scoring time | Train AUC | Test AUC | Accuracy | Precision | Recall | F1_score |
|---|---|---|---|---|---|---|---|---|---|
| 5 | xgboost | 10.702 secs | 0.06 secs | 0.866 | 0.789 | 0.784 | 0.563 | 0.535 | 0.549 |
| 2 | logreg | 6.214 secs | 0.188 secs | 0.805 | 0.786 | 0.776 | 0.564 | 0.380 | 0.454 |
| 4 | ranger | 24.495 secs | 0.721 secs | 0.975 | 0.783 | 0.783 | 0.569 | 0.471 | 0.515 |
| 1 | glmnet | 5.939 secs | 0.061 secs | 0.795 | 0.781 | 0.767 | 0.544 | 0.296 | 0.383 |
| 3 | randomForest | 1.686 mins | 0.521 secs | 0.875 | 0.780 | 0.774 | 0.563 | 0.354 | 0.434 |
| 6 | rpart | 3.88 secs | 0.038 secs | 0.783 | 0.761 | 0.778 | 0.553 | 0.496 | 0.523 |
## Lift plot stored under `modelexp` of the autoMLmodel() result
mymodel_hr$modelexp$Lift_plot
### Partial dependency plot (left disabled in the original report)
#mymodel_hr$modelexp$pdp
### Random forest: ROC curve stored as `TestROC` (validation split)
mymodel_hr$trainedModels$randomForest$modelPlots$TestROC
### XGBoost: ROC curve stored as `TestROC` (validation split)
mymodel_hr$trainedModels$xgboost$modelPlots$TestROC
### Random forest: variable importance plot
mymodel_hr$trainedModels$randomForest$modelPlots$VarImp
## [[1]]
Best ML model comparison with other R packages
1. DriveML
available on CRAN and git
Select the best model (xgboost) from the DriveML results above
## Start the DriveML stopwatch; it is read again after the model is trained
time <- Sys.time()
marobj <- autoMAR(
  hr_ana_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
##
|
| | 0%
|
|========= | 12%
|
|================== | 25%
|
|========================== | 38%
|
|=================================== | 50%
|
|============================================ | 62%
|
|==================================================== | 75%
|
|============================================================= | 88%
|
|======================================================================| 100%
## Print the per-variable AUC table held in the autoMAR() result
marobj$auc_features
## Variable AUC
## 1: company_size 1.0000000
## 2: company_type 1.0000000
## 3: education_level 1.0000000
## 4: enrolled_university 1.0000000
## 5: experience 0.9204786
## 6: last_new_job 1.0000000
## 7: major_discipline 1.0000000
## Imputation rule per column class, passed to autoDataprep() as `missimpute`
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## Run the DriveML data-preparation pipeline, reusing the autoMAR() result
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  auto_mar = TRUE,
  mar_object = marobj,
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = TRUE,
  char_var_limit = 150,
  interaction_var = TRUE,
  frequent_var = TRUE,
  uid = "enrollee_id",
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... >
## autoDataprep < missing imputation.... >
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
## Keep only the variables selected by autoDataprep()
master_hr_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
hr_train_data <- master_hr_data[sele_var]
## Rows in train.index form the training set; the remainder is the validation set
train_hr <- hr_train_data[train.index, ]
valid_hr <- hr_train_data[-train.index, ]
## Train only the xgboost model for the package comparison
driveml_mediumdata <- autoMLmodel(
  train = train_hr,
  test = valid_hr,
  target = "target_var",
  models = "xgboost",
  uid = "enrollee_id",
  verbose = TRUE,
  seed = 42
)
## xgboost Model tuning started....
## autoMLmodel < All features xgboost tuned and trained >
## Elapsed wall-clock time since `time` was set, and the AUC on the validation split.
## NOTE: the value is measured in seconds although the column is named
## "time_taken_min"; the name is kept because later rows index by it.
dtime <- round(difftime(Sys.time(), time, units = "secs"), 3)
bestroc <- round(driveml_mediumdata$results$`Test AUC`, 3)
## One row per benchmarked package; entries are coerced to character on assignment
comparision_metric <- matrix(data = NA, nrow = 4, ncol = 8)
colnames(comparision_metric) <- c(
  "r_package_name", "dataset_name", "attributes", "missing",
  "Train_instances", "Test_instances", "time_taken_min", "test_auc"
)
comparision_metric[1, "dataset_name"] <- "Medium data"
comparision_metric[1, "r_package_name"] <- "DriveML"
comparision_metric[1, "time_taken_min"] <- dtime
comparision_metric[1, "test_auc"] <- bestroc
comparision_metric[1, "attributes"] <- 14
comparision_metric[1, "missing"] <- "Yes"
## Derive the split sizes from the data instead of hard-coding them
comparision_metric[1, "Train_instances"] <- nrow(train_hr)
comparision_metric[1, "Test_instances"] <- nrow(valid_hr)
2. H2o AutoML
available on CRAN and git
## h2o works on data frames here, so convert the data.table in place
setDF(hr_ana_data)
## Convert character columns to factor for the H2O AutoML function.
## vapply() guarantees a logical mask regardless of the input shape.
char_cols <- vapply(hr_ana_data, is.character, logical(1))
hr_ana_data[char_cols] <- lapply(hr_ana_data[char_cols], as.factor)
## For binary classification the response must be a factor
hr_ana_data$target_var <- as.factor(hr_ana_data$target_var)
## Drop the row identifier so it is not used as a predictor
hr_ana_data$enrollee_id <- NULL
### Train and valid data
train_data <- hr_ana_data[train.index, ]
valid_data <- hr_ana_data[-train.index, ]
Training using h2o.automl() function
## Connect to the H2O cluster with default settings (a local cluster in this run)
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 days 14 hours
## H2O cluster timezone: Asia/Kolkata
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.1.3
## H2O cluster version age: 28 days, 19 hours and 37 minutes
## H2O cluster name: H2O_started_from_R_dubrangala_zcg934
## H2O cluster total nodes: 1
## H2O cluster total memory: 5.99 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.3 (2020-10-10)
## Upload the training split into H2O; the H2O stopwatch starts here
time <- Sys.time()
train_d <- as.h2o(train_data)
##
|
| | 0%
|
|======================================================================| 100%
## Upload the validation split into H2O
test_d <- as.h2o(valid_data)
##
|
| | 0%
|
|======================================================================| 100%
Train Model
## Run H2O AutoML on the training frame, skipping the algorithm families
## listed in `exclude_algos`
h2o_medium_data_model <- h2o.automl(
  y = "target_var",
  training_frame = train_d,
  validation_frame = test_d,
  exclude_algos = c("GLM", "DeepLearning", "DRF", "StackedEnsemble")
)
##
|
| | 0%
## 13:37:31.49: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 13:37:31.49: AutoML: XGBoost is not available; skipping it.
|
|====== | 9%
|
|================== | 25%
|
|==================== | 29%
|
|========================= | 36%
|
|================================ | 45%
|
|======================================================================| 100%
lb <- as.data.frame(h2o_medium_data_model@leaderboard)
## No leaderboard_frame was given, so the leaderboard AUC is a
## cross-validation metric. Score the leader on the held-out frame so that
## "test_auc" is an actual test-set AUC, comparable with the other packages.
leader_perf <- h2o.performance(h2o_medium_data_model@leader, newdata = test_d)
best_auc <- round(h2o.auc(leader_perf), 3)
## Elapsed seconds since the H2O stopwatch was started
time_h2o <- round(difftime(Sys.time(), time, units = "secs"), 3)
save(h2o_medium_data_model, file = "h2o_model_medium_data_hr_ana.rdata")
comparision_metric[2, "dataset_name"] <- "Medium data"
comparision_metric[2, "r_package_name"] <- "H2o automl"
comparision_metric[2, "time_taken_min"] <- time_h2o
comparision_metric[2, "test_auc"] <- best_auc
comparision_metric[2, "attributes"] <- 14
comparision_metric[2, "missing"] <- "Yes"
## Derive the split sizes from the data instead of hard-coding them
comparision_metric[2, "Train_instances"] <- nrow(train_data)
comparision_metric[2, "Test_instances"] <- nrow(valid_data)
3. OneR
available on CRAN and git
build model with the OneR package
Note: There are no built-in feature engineering functions available in OneR. We used DriveML functions to prepare the input dataset.
## Re-read the raw data so this section starts from an untouched copy
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(
  file.path(data_path, "aug_train.csv"),
  sep = ",",
  header = TRUE
)
## Start the OneR stopwatch (data loading is not timed)
time <- Sys.time()
## Work on a plain data frame and drop the row identifier
setDF(hr_ana_data)
hr_ana_data$enrollee_id <- NULL
## Impute missing values, one rule per column class
cc <- impute(
  hr_ana_data,
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## Split randomly into a training (80%) and a test (20%) set
set.seed(12345) # for reproducibility
train.index <- createDataPartition(
  y = hr_ana_data$target_var,
  p = 0.8,
  list = FALSE
)
train_oth <- cc$data[train.index, ]
valid_oth <- cc$data[-train.index, ]
## Fit the OneR model on the training split
model_oner_medium <- OneR(target_var ~ ., data = train_oth, verbose = TRUE)
##
## Attribute Accuracy
## 1 * city 78.52%
## 2 city_development_index 78.31%
## 3 gender 74.96%
## 3 relevent_experience 74.96%
## 3 enrolled_university 74.96%
## 3 education_level 74.96%
## 3 major_discipline 74.96%
## 3 experience 74.96%
## 3 company_size 74.96%
## 3 company_type 74.96%
## 3 last_new_job 74.96%
## 3 training_hours 74.96%
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
## Class probabilities for the validation split
prediction_oner <- predict(model_oner_medium, valid_oth, type = c("prob"))
## Elapsed seconds since the OneR stopwatch was started
medium_data_onertime <- round(
  difftime(Sys.time(), time, units = "secs"),
  3
)
save(model_oner_medium, file = "oner_model_medium_data_har_ana.rdata")
## Print the learned rules and diagnostics
summary(model_oner_medium)
##
## Call:
## OneR.formula(formula = target_var ~ ., data = train_oth, verbose = TRUE)
##
## Rules:
## If city = city_1 then target_var = 0
## If city = city_10 then target_var = 0
## If city = city_100 then target_var = 0
## If city = city_101 then target_var = 1
## If city = city_102 then target_var = 0
## If city = city_103 then target_var = 0
## If city = city_104 then target_var = 0
## If city = city_105 then target_var = 0
## If city = city_106 then target_var = 0
## If city = city_107 then target_var = 1
## If city = city_109 then target_var = 0
## If city = city_11 then target_var = 1
## If city = city_111 then target_var = 0
## If city = city_114 then target_var = 0
## If city = city_115 then target_var = 0
## If city = city_116 then target_var = 0
## If city = city_117 then target_var = 0
## If city = city_118 then target_var = 0
## If city = city_12 then target_var = 0
## If city = city_120 then target_var = 0
## If city = city_121 then target_var = 0
## If city = city_123 then target_var = 0
## If city = city_126 then target_var = 1
## If city = city_127 then target_var = 0
## If city = city_128 then target_var = 1
## If city = city_129 then target_var = 0
## If city = city_13 then target_var = 0
## If city = city_131 then target_var = 0
## If city = city_133 then target_var = 0
## If city = city_134 then target_var = 0
## If city = city_136 then target_var = 0
## If city = city_138 then target_var = 0
## If city = city_139 then target_var = 1
## If city = city_14 then target_var = 0
## If city = city_141 then target_var = 0
## If city = city_142 then target_var = 0
## If city = city_143 then target_var = 0
## If city = city_144 then target_var = 0
## If city = city_145 then target_var = 1
## If city = city_146 then target_var = 0
## If city = city_149 then target_var = 0
## If city = city_150 then target_var = 0
## If city = city_152 then target_var = 0
## If city = city_155 then target_var = 1
## If city = city_157 then target_var = 0
## If city = city_158 then target_var = 0
## If city = city_159 then target_var = 0
## If city = city_16 then target_var = 0
## If city = city_160 then target_var = 0
## If city = city_162 then target_var = 0
## If city = city_165 then target_var = 0
## If city = city_166 then target_var = 0
## If city = city_167 then target_var = 0
## If city = city_171 then target_var = 1
## If city = city_173 then target_var = 0
## If city = city_175 then target_var = 0
## If city = city_176 then target_var = 0
## If city = city_179 then target_var = 0
## If city = city_18 then target_var = 0
## If city = city_180 then target_var = 0
## If city = city_19 then target_var = 0
## If city = city_2 then target_var = 0
## If city = city_20 then target_var = 0
## If city = city_21 then target_var = 1
## If city = city_23 then target_var = 0
## If city = city_24 then target_var = 0
## If city = city_25 then target_var = 1
## If city = city_26 then target_var = 0
## If city = city_27 then target_var = 0
## If city = city_28 then target_var = 0
## If city = city_30 then target_var = 0
## If city = city_31 then target_var = 0
## If city = city_33 then target_var = 1
## If city = city_36 then target_var = 0
## If city = city_37 then target_var = 0
## If city = city_39 then target_var = 0
## If city = city_40 then target_var = 0
## If city = city_41 then target_var = 0
## If city = city_42 then target_var = 1
## If city = city_43 then target_var = 1
## If city = city_44 then target_var = 0
## If city = city_45 then target_var = 0
## If city = city_46 then target_var = 0
## If city = city_48 then target_var = 0
## If city = city_50 then target_var = 0
## If city = city_53 then target_var = 0
## If city = city_54 then target_var = 0
## If city = city_55 then target_var = 0
## If city = city_57 then target_var = 0
## If city = city_59 then target_var = 0
## If city = city_61 then target_var = 0
## If city = city_62 then target_var = 0
## If city = city_64 then target_var = 0
## If city = city_65 then target_var = 0
## If city = city_67 then target_var = 0
## If city = city_69 then target_var = 0
## If city = city_7 then target_var = 0
## If city = city_70 then target_var = 0
## If city = city_71 then target_var = 0
## If city = city_72 then target_var = 0
## If city = city_73 then target_var = 0
## If city = city_74 then target_var = 1
## If city = city_75 then target_var = 0
## If city = city_76 then target_var = 0
## If city = city_77 then target_var = 0
## If city = city_78 then target_var = 1
## If city = city_79 then target_var = 0
## If city = city_8 then target_var = 0
## If city = city_80 then target_var = 0
## If city = city_81 then target_var = 0
## If city = city_82 then target_var = 0
## If city = city_83 then target_var = 0
## If city = city_84 then target_var = 0
## If city = city_89 then target_var = 0
## If city = city_9 then target_var = 0
## If city = city_90 then target_var = 0
## If city = city_91 then target_var = 0
## If city = city_93 then target_var = 0
## If city = city_94 then target_var = 0
## If city = city_97 then target_var = 0
## If city = city_98 then target_var = 0
## If city = city_99 then target_var = 0
##
## Accuracy:
## 12034 of 15327 instances classified correctly (78.52%)
##
## Contingency table:
## city
## target_var city_1 city_10 city_100 city_101 city_102 city_103 city_104 city_105
## 0 * 21 * 63 * 160 27 * 216 * 2730 * 221 * 54
## 1 3 9 56 * 38 36 731 23 9
## Sum 24 72 216 65 252 3461 244 63
## city
## target_var city_106 city_107 city_109 city_11 city_111 city_114 city_115
## 0 * 5 2 * 5 81 * 2 * 969 * 27
## 1 2 * 3 2 * 117 0 109 15
## Sum 7 5 7 198 2 1078 42
## city
## target_var city_116 city_117 city_118 city_12 city_120 city_121 city_123
## 0 * 90 * 6 * 16 * 10 * 3 * 2 * 45
## 1 16 3 7 2 0 1 18
## Sum 106 9 23 12 3 3 63
## city
## target_var city_126 city_127 city_128 city_129 city_13 city_131 city_133
## 0 10 * 7 28 * 1 * 35 * 5 * 6
## 1 * 13 1 * 41 0 4 3 1
## Sum 23 8 69 1 39 8 7
## city
## target_var city_134 city_136 city_138 city_139 city_14 city_141 city_142
## 0 * 27 * 435 * 81 0 * 17 * 16 * 32
## 1 10 49 9 * 3 6 3 12
## Sum 37 484 90 3 23 19 44
## city
## target_var city_143 city_144 city_145 city_146 city_149 city_150 city_152
## 0 * 16 * 12 20 * 4 * 63 * 40 * 30
## 1 12 5 * 31 2 20 13 9
## Sum 28 17 51 6 83 53 39
## city
## target_var city_155 city_157 city_158 city_159 city_16 city_160 city_162
## 0 2 * 15 * 30 * 66 * 1086 * 519 * 71
## 1 * 9 3 11 11 143 157 34
## Sum 11 18 41 77 1229 676 105
## city
## target_var city_165 city_166 city_167 city_171 city_173 city_175 city_176
## 0 * 47 * 2 * 2 0 * 105 * 10 * 15
## 1 10 1 2 * 1 16 3 6
## Sum 57 3 4 1 121 13 21
## city
## target_var city_179 city_18 city_180 city_19 city_2 city_20 city_21 city_23
## 0 * 1 * 3 * 4 * 53 * 5 * 22 874 * 133
## 1 1 1 2 37 0 3 * 1311 13
## Sum 2 4 6 90 5 25 2185 146
## city
## target_var city_24 city_25 city_26 city_27 city_28 city_30 city_31 city_33
## 0 * 42 1 * 16 * 29 * 144 * 14 * 3 6
## 1 9 * 2 4 6 13 3 1 * 10
## Sum 51 3 20 35 157 17 4 16
## city
## target_var city_36 city_37 city_39 city_40 city_41 city_42 city_43 city_44
## 0 * 117 * 10 * 9 * 46 * 55 2 4 * 9
## 1 11 2 0 8 15 * 6 * 7 6
## Sum 128 12 9 54 70 8 11 15
## city
## target_var city_45 city_46 city_48 city_50 city_53 city_54 city_55 city_57
## 0 * 83 * 70 * 5 * 101 * 16 * 10 * 8 * 74
## 1 13 26 5 12 5 2 4 12
## Sum 96 96 10 113 21 12 12 86
## city
## target_var city_59 city_61 city_62 city_64 city_65 city_67 city_69 city_7
## 0 * 5 * 134 * 5 * 83 * 118 * 292 * 15 * 20
## 1 1 17 0 12 21 44 2 4
## Sum 6 151 5 95 139 336 17 24
## city
## target_var city_70 city_71 city_72 city_73 city_74 city_75 city_76 city_77
## 0 * 21 * 183 * 17 * 162 34 * 218 * 29 * 24
## 1 16 30 2 57 * 41 24 12 1
## Sum 37 213 19 219 75 242 41 25
## city
## target_var city_78 city_79 city_8 city_80 city_81 city_82 city_83 city_84
## 0 12 * 3 * 4 * 11 * 4 * 4 * 93 * 13
## 1 * 15 2 0 1 1 0 20 3
## Sum 27 5 4 12 5 4 113 16
## city
## target_var city_89 city_9 city_90 city_91 city_93 city_94 city_97 city_98
## 0 * 38 * 12 * 105 * 23 * 17 * 12 * 78 * 54
## 1 10 3 47 14 5 8 8 5
## Sum 48 15 152 37 22 20 86 59
## city
## target_var city_99 Sum
## 0 * 67 11489
## 1 8 3838
## Sum 75 15327
## ---
## Maximum in each column: '*'
##
## Pearson's Chi-squared test:
## X-squared = 2524.1, df = 121, p-value < 2.2e-16
## AUC on the validation split from the probability of class "1".
## Rounded to 3 digits to match the other packages in the comparison table.
best_auc_oner <- round(auc(valid_oth$target_var, prediction_oner[, "1"]), 3)
comparision_metric[3, "dataset_name"] <- "Medium data"
comparision_metric[3, "r_package_name"] <- "OneR"
comparision_metric[3, "time_taken_min"] <- medium_data_onertime
comparision_metric[3, "test_auc"] <- best_auc_oner
comparision_metric[3, "attributes"] <- 14
comparision_metric[3, "missing"] <- "Yes"
## Derive the split sizes from the data instead of hard-coding them
comparision_metric[3, "Train_instances"] <- nrow(train_oth)
comparision_metric[3, "Test_instances"] <- nrow(valid_oth)
4. autoxgboost
available on git
Note: There are no built-in feature engineering functions available in autoxgboost. We used DriveML functions to prepare the input dataset.
devtools::install_github("ja-thomas/autoxgboost")
## Build a cleaned data set with DriveML's autoDataprep()
data_path <- "C:/backup/R packages/DriveML_Experiments/medium_19k_HR_analytics_data_kaagle"
hr_ana_data <- fread(
  file.path(data_path, "aug_train.csv"),
  sep = ",",
  header = TRUE
)
setDF(hr_ana_data) # set as data frame
## NOTE(review): this start time appears to be overwritten before it is read
## further down in the report - confirm whether preparation should be timed
time <- Sys.time()
## Imputation, one-hot encoding and variable selection only; the outlier,
## interaction and frequency transformers are switched off
traindata <- autoDataprep(
  hr_ana_data,
  target = "target_var",
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = FALSE,
  char_var_limit = 150,
  interaction_var = FALSE,
  frequent_var = FALSE,
  uid = "enrollee_id",
  verbose = TRUE
)
## autoDataprep < missing imputation.... >
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
## Keep only the variables selected by autoDataprep() and drop the row identifier
master_cc_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
cc_train_data <- master_cc_data[sele_var]
cc_train_data$enrollee_id <- NULL
## Same seed and proportion as the other sections
set.seed(12345)
train.index <- createDataPartition(hr_ana_data$target_var, p = 0.8, list = FALSE)
train_oth <- cc_train_data[train.index, ]
valid_oth <- cc_train_data[-train.index, ]
# create a classification task; mlr documents `positive` as a character value
train_oth$target_var <- as.factor(train_oth$target_var)
trainTask <- makeClassifTask(data = train_oth, target = "target_var", positive = "1")
# create a control object for the optimizer
# NOTE: the stopwatch is (re)started here, so data preparation is not timed
time <- Sys.time()
ctrl <- makeMBOControl()
ctrl <- setMBOControlTermination(ctrl, iters = 1L)
# fit the model
auto_xgb_model <- autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
tot_time <- round(difftime(Sys.time(), time, units = "secs"), 3)
# predict on the validation split; the target is dropped by name so the
# result does not depend on where target_var sits among the columns
feature_cols <- setdiff(names(valid_oth), "target_var")
prediction <- predict(auto_xgb_model, valid_oth[, feature_cols, drop = FALSE])
prediction <- getPredictionProbabilities(prediction)
myauc <- round(auc(valid_oth$target_var, prediction), 3)
save(auto_xgb_model, file = "axgb_model_medium_data_hr_ana.rdata")
comparision_metric[4, "dataset_name"] <- "Medium data"
comparision_metric[4, "r_package_name"] <- "autoxgboost"
comparision_metric[4, "time_taken_min"] <- tot_time
comparision_metric[4, "test_auc"] <- myauc
comparision_metric[4, "attributes"] <- 14
comparision_metric[4, "missing"] <- "Yes"
# derive the split sizes from the data instead of hard-coding them
comparision_metric[4, "Train_instances"] <- nrow(train_oth)
comparision_metric[4, "Test_instances"] <- nrow(valid_oth)
Comparison results
## Render the comparison matrix (one row per package) as a table
kableExtra::kable(comparision_metric)
| r_package_name | dataset_name | attributes | missing | Train_instances | Test_instances | time_taken_min | test_auc |
|---|---|---|---|---|---|---|---|
| DriveML | Medium data | 14 | Yes | 15327 | 3831 | 35.126 | 0.798 |
| H2o automl | Medium data | 14 | Yes | 15327 | 3831 | 60.869 | 0.8 |
| OneR | Medium data | 14 | Yes | 15327 | 3831 | 0.105 | 0.71 |
| autoxgboost | Medium data | 14 | Yes | 15327 | 3831 | 13.548 | 0.643 |