DriveML Experiment
# Location of the experiment's raw data and fast CSV read via data.table::fread.
data_path <- "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
adult_data <- fread(file.path(data_path, "adult_data.csv"), sep = ",", header = TRUE)
##target variable distributions
table(adult_data$target_var)
##
## 0 1
## 37155 11687
Split the sample to test the model accuracy against other open-source R packages
# Fix the RNG seed so the partition below is reproducible.
set.seed(12345)
# 80/20 stratified partition on the target (caret::createDataPartition).
# NOTE(review): train.index appears unused in the rest of this script --
# the train/valid split is done via the pre-existing 'flag' column instead.
# Left in place because removing it would shift the RNG state; confirm intent.
train.index <- createDataPartition(adult_data$target_var, p = .8, list = FALSE)
DriveML step 1 - Missing variable treatment
# DriveML missing-value screening: autoMAR flags variables whose missingness
# is itself predictive. aucv = 0.9 is the AUC cutoff for keeping a missing
# indicator; mar_method = "glm" -- presumably a glm fit per missingness
# indicator (confirm against DriveML documentation).
marobj <- autoMAR (adult_data, aucv = 0.9, strataname = NULL, stratasize = NULL, mar_method="glm")
# Variables whose missing indicators passed the AUC cutoff.
marobj$auc_features
DriveML step 2 - Auto Dataprep
## Type of missing imputation
# Per-class imputation rules consumed by autoDataprep(): mode for categorical
# columns (factor/character), mean for integer, median for numeric.
myimpute <- list(classes=list(factor = imputeMode(),
integer = imputeMean(),
numeric = imputeMedian(),
character = imputeMode()))
## AutoDataprep
# Single-call feature engineering: MAR flags (from marobj), imputation
# (myimpute), one-hot encoding, outlier flags, frequent/interaction
# transformers, then variable reduction (zero variance, |corr| >= 0.999,
# per-variable AUC >= 0.002). uid columns are carried through untouched so
# 'flag' can drive the train/valid split later.
traindata <- autoDataprep(adult_data, target = "target_var",
auto_mar = TRUE,
mar_object=marobj,
missimpute = myimpute,
dummyvar = TRUE,
aucv = 0.002, corr = 0.999,
outlier_flag = TRUE,
char_var_limit = 150,
interaction_var = TRUE,
frequent_var = TRUE,
uid = c('adult_id','flag'),
verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
# Keep only the columns autoDataprep selected, then split on the pre-built
# 'flag' column (0 = training rows, 1 = validation rows) and drop the flag.
master_adult_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
adult_train_data <- master_adult_data[sele_var]
### Train and valid data
train_adult <- adult_train_data[adult_train_data$flag == 0, ]
valid_adult <- adult_train_data[adult_train_data$flag == 1, ]
train_adult$flag <- NULL
valid_adult$flag <- NULL
DriveML step 3 - Model development
# DriveML step 3: train all supported classifiers with 10 random tuning
# iterations each, score on the validation frame, cap lift/importance plots
# at 10 variables / 50 groups, and subsample to 5000 observations where the
# package supports it. seed = 42 makes the run reproducible.
mymodel_adult <- autoMLmodel( train = train_adult,
test = valid_adult,
target = 'target_var',
tuneIters = 10,
tuneType = "random",
models = "all",
varImp = 10,
liftGroup = 50,
maxObs = 5000,
uid = 'adult_id',
htmlreport = FALSE,
verbose = TRUE,
seed = 42)
# Persist the fitted model bundle for later inspection.
save(mymodel_adult, file="large_data_mymodel.rdata")
DriveML Results
Model results
# Per-model fit/score timings and train/test metrics, rendered as a table.
results <- mymodel_adult$results
kableExtra::kable(results)
| Model        | Fitting time | Scoring time | Train AUC | Test AUC | Accuracy | Precision | Recall | F1_score |
|--------------|--------------|--------------|-----------|----------|----------|-----------|--------|----------|
| glmnet       | 10.335 secs  | 0.121 secs   | 0.907     | 0.906    | 0.854    | 0.751     | 0.570  | 0.648    |
| logreg       | 9.729 secs   | 0.103 secs   | 0.910     | 0.907    | 0.853    | 0.730     | 0.603  | 0.660    |
| randomForest | 5.754 mins   | 3.064 secs   | 0.965     | 0.910    | 0.861    | 0.757     | 0.606  | 0.673    |
| ranger       | 51.118 secs  | 7.057 secs   | 0.996     | 0.911    | 0.860    | 0.751     | 0.611  | 0.674    |
| xgboost      | 11.074 secs  | 0.156 secs   | 0.966     | 0.924    | 0.869    | 0.764     | 0.642  | 0.698    |
| rpart        | 4.67 secs    | 0.092 secs   | 0.865     | 0.866    | 0.855    | 0.749     | 0.581  | 0.654    |
## Variable Lift
# Cumulative lift chart across the 50 lift groups configured above.
mymodel_adult$modelexp$Lift_plot
### Random Forest Model validation ROC
mymodel_adult$trainedModels$randomForest$modelPlots$TestROC
### XGBoost Model validation ROC
mymodel_adult$trainedModels$xgboost$modelPlots$TestROC
### Random Forest Model Variable Importance
mymodel_adult$trainedModels$randomForest$modelPlots$VarImp
## [[1]]
Best ML model comparison with other R packages
1. DriveML
available on CRAN and git
selected best model from driveml outcome
# Timed re-run of the DriveML pipeline for the package comparison; 'time'
# anchors the wall-clock measurement taken at L253.
time <- Sys.time()
marobj <- autoMAR (adult_data, aucv = 0.9, strataname = NULL, stratasize = NULL, mar_method="glm")
## less than or equal to one missing value coloumn found in the dataframe
## Type of missing imputation
# Same per-class imputation rules as the first run.
myimpute <- list(classes=list(factor = imputeMode(),
integer = imputeMean(),
numeric = imputeMedian(),
character = imputeMode()))
## AutoDataprep
# Identical feature-engineering configuration to the first run (see above);
# repeated here so the timing captures the full DriveML pipeline end to end.
traindata <- autoDataprep(adult_data, target = "target_var",
auto_mar = TRUE,
mar_object=marobj,
missimpute = myimpute,
dummyvar = TRUE,
aucv = 0.002, corr = 0.999,
outlier_flag = TRUE,
char_var_limit = 150,
interaction_var = TRUE,
frequent_var = TRUE,
uid = c('adult_id','flag'),
verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
# Restrict to the selected columns, then split on 'flag' (0 = train,
# 1 = valid) and remove the split marker from both frames.
master_adult_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
adult_train_data <- master_adult_data[sele_var]
### Train and valid data
train_adult <- adult_train_data[adult_train_data$flag == 0, ]
valid_adult <- adult_train_data[adult_train_data$flag == 1, ]
train_adult$flag <- NULL
valid_adult$flag <- NULL
# For the comparison, train only xgboost (DriveML's best model on this data,
# per the table above) so the timing is comparable to single-model rivals.
driveml_largedata <- autoMLmodel( train = train_adult,
test = valid_adult,
target = 'target_var',
models = "xgboost",
uid = 'adult_id',
maxObs = 5000,
verbose = TRUE,
seed = 42)
## xgboost Model tuning started....
## autoMLmodel < All features xgboost tuned and trained >
# Elapsed seconds for the full DriveML pipeline (autoMAR -> prep -> model).
dtime <- round(difftime(Sys.time(), time, units='secs'),3)
bestroc <- round(driveml_largedata$results$`Test AUC`,3)
# 4 packages x 8 metrics; the first character assignment coerces the whole
# matrix to character, which is fine for display via kable.
# NOTE(review): "comparision" is a misspelling of "comparison", kept because
# the name is reused throughout the rest of the script.
comparision_metric <- matrix(data=NA, nrow=4, ncol=8)
colnames(comparision_metric) <- c("r_package_name","dataset_name","attributes","missing","Train_instances","Test_instances", "time_taken_min", "test_auc")
comparision_metric[1,"dataset_name"] <- "Large data"
comparision_metric[1,"r_package_name"] <- "DriveML"
comparision_metric[1,"time_taken_min"] <- dtime
comparision_metric[1,"test_auc"] <- bestroc
comparision_metric[1,"attributes"] <- 14
comparision_metric[1,"missing"] <- "Yes"
# Adult census split sizes (fixed for every package in this comparison).
comparision_metric[1,"Train_instances"] <- 32561
comparision_metric[1,"Test_instances"] <- 16281
2. H2o AutoML
available on CRAN and git
# h2o.automl() wants plain data frames and factor inputs: drop the
# data.table class, convert every character column and the binary response
# to factor, and remove the id column.
setDF(adult_data)
char_cols <- sapply(adult_data, is.character)
adult_data[char_cols] <- lapply(adult_data[char_cols], as.factor)
## For binary classification, the response should be a factor
adult_data$target_var <- as.factor(adult_data$target_var)
adult_data$adult_id <- NULL
### Train and valid data: flag 0 = train, flag 1 = valid
train_adult <- adult_data[adult_data$flag == 0, ]
valid_adult <- adult_data[adult_data$flag == 1, ]
train_adult$flag <- NULL
valid_adult$flag <- NULL
Training using h2o.automl() function
# Start (or attach to) the local H2O cluster; banner below is the recorded
# cluster state at run time.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 days 17 hours
## H2O cluster timezone: Asia/Kolkata
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.1.3
## H2O cluster version age: 28 days, 22 hours and 19 minutes
## H2O cluster name: H2O_started_from_R_dubrangala_zcg934
## H2O cluster total nodes: 1
## H2O cluster total memory: 5.09 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.3 (2020-10-10)
## Push the train and valid data sets into H2O; the timer starts here so the
## H2O figure includes frame upload plus AutoML training.
time = Sys.time() ## start time
train_d = as.h2o(train_adult)
##
|
| | 0%
|
|======================================================================| 100%
test_d = as.h2o(valid_adult)
##
|
| | 0%
|
|======================================================================| 100%
Train Model
# Up to 5 models; GLM/DeepLearning/DRF/StackedEnsemble excluded to keep the
# comparison to boosted-tree-style learners. Note (per the log below) the
# validation frame is informational only when cross-validation is on.
h2o_large_data_model <- h2o.automl(y = "target_var",
training_frame = train_d,
validation_frame = test_d,
max_models = 5L,
exclude_algos = c("GLM", "DeepLearning", "DRF","StackedEnsemble"))
##
|
| | 0%
## 16:19:42.197: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 16:19:42.198: AutoML: XGBoost is not available; skipping it.
|
|==== | 6%
|
|===== | 7%
|
|======== | 12%
|
|=========== | 16%
|
|============= | 18%
|
|============== | 20%
|
|================== | 25%
|
|=================== | 27%
|
|======================== | 35%
|
|========================= | 36%
|
|============================= | 41%
|
|=============================== | 44%
|
|======================================================================| 100%
# Leaderboard is sorted best-first, so row 1 holds the winning model's AUC.
lb = as.data.frame(h2o_large_data_model@leaderboard)
best_auc <- round(lb$auc[1],3)
time_h2o <- round(difftime(Sys.time(), time, units='secs'),3)
save(h2o_large_data_model, file = "h2o_model_large_data_hr_ana.rdata")
# Record the H2O AutoML row of the comparison table.
comparision_metric[2,"dataset_name"] <- "Large data"
comparision_metric[2,"r_package_name"] <- "H2o automl"
comparision_metric[2,"time_taken_min"] <- time_h2o
comparision_metric[2,"test_auc"] <- best_auc
comparision_metric[2,"attributes"] <- 14
comparision_metric[2,"missing"] <- "Yes"
comparision_metric[2,"Train_instances"] <- 32561
comparision_metric[2,"Test_instances"] <- 16281
3. OneR
available on CRAN and git
build model with the OneR package
Note: There are no inbuilt feature engineering functions available in OneR. We used DriveML functions to prepare the input dataset.
## Read Raw data
# Reload the raw data so OneR starts from the same untouched CSV.
data_path <- "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
adult_data <- fread(file.path(data_path, "adult_data.csv"), sep = ",", header = TRUE)
## Start the OneR benchmark timer, convert to a plain data frame, drop the
## id column, and split on 'flag' (0 = train, 1 = valid).
time <- Sys.time()
setDF(adult_data)
adult_data$adult_id <- NULL
### Train and valid data
train_adult <- adult_data[adult_data$flag == 0, ]
valid_adult <- adult_data[adult_data$flag == 1, ]
train_adult$flag <- NULL
valid_adult$flag <- NULL
## Fit the one-rule classifier on all predictors.
model_oner_large <- OneR(target_var ~., data = train_adult, verbose = TRUE)
## Warning in OneR.data.frame(x = data, ties.method = ties.method, verbose =
## verbose, : data contains unused factor levels
##
## Attribute Accuracy
## 1 * V4 77.96%
## 1 V5 77.96%
## 3 V12 77.16%
## 4 V11 76.65%
## 5 V2 76.31%
## 6 V1 75.92%
## 6 V3 75.92%
## 6 V6 75.92%
## 6 V7 75.92%
## 6 V8 75.92%
## 6 V9 75.92%
## 6 V10 75.92%
## 6 V13 75.92%
## 6 V14 75.92%
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
# Class-probability predictions on the validation set; timing covers data
# prep + fit + predict for the OneR row of the comparison.
prediction_oner <- predict(model_oner_large, valid_adult,type = c("prob"))
large_data_onertime <- round(difftime(Sys.time(), time, units='secs'),3)
save(model_oner_large, file = "oner_model_large_data_adult_ana.rdata")
summary(model_oner_large)
##
## Call:
## OneR.formula(formula = target_var ~ ., data = train_adult, verbose = TRUE)
##
## Rules:
## If V4 = 10th then target_var = 0
## If V4 = 11th then target_var = 0
## If V4 = 12th then target_var = 0
## If V4 = 1st-4th then target_var = 0
## If V4 = 5th-6th then target_var = 0
## If V4 = 7th-8th then target_var = 0
## If V4 = 9th then target_var = 0
## If V4 = Assoc-acdm then target_var = 0
## If V4 = Assoc-voc then target_var = 0
## If V4 = Bachelors then target_var = 0
## If V4 = Doctorate then target_var = 1
## If V4 = HS-grad then target_var = 0
## If V4 = Masters then target_var = 1
## If V4 = Preschool then target_var = 0
## If V4 = Prof-school then target_var = 1
## If V4 = Some-college then target_var = 0
##
## Accuracy:
## 25384 of 32561 instances classified correctly (77.96%)
##
## Contingency table:
## V4
## target_var 10th 11th 12th 1st-4th 5th-6th 7th-8th 9th Assoc-acdm
## 0 * 871 * 1115 * 400 * 162 * 317 * 606 * 487 * 802
## 1 62 60 33 6 16 40 27 265
## Sum 933 1175 433 168 333 646 514 1067
## V4
## target_var Assoc-voc Bachelors Doctorate HS-grad Masters Preschool
## 0 * 1021 * 3134 107 * 8826 764 * 51
## 1 361 2221 * 306 1675 * 959 0
## Sum 1382 5355 413 10501 1723 51
## V4
## target_var Prof-school Some-college Sum
## 0 153 * 5904 24720
## 1 * 423 1387 7841
## Sum 576 7291 32561
## ---
## Maximum in each column: '*'
##
## Pearson's Chi-squared test:
## X-squared = 4429.7, df = 15, p-value < 2.2e-16
# AUC from the probability column for class "1" (positive class).
best_auc_oner <- round(auc(valid_adult$target_var, prediction_oner[,"1"]),2)
# Record the OneR row of the comparison table.
comparision_metric[3,"dataset_name"] <- "Large data"
comparision_metric[3,"r_package_name"] <- "OneR"
comparision_metric[3,"time_taken_min"] <- large_data_onertime
comparision_metric[3,"test_auc"] <- best_auc_oner
comparision_metric[3,"attributes"] <- 14
comparision_metric[3,"missing"] <- "Yes"
comparision_metric[3,"Train_instances"] <- 32561
comparision_metric[3,"Test_instances"] <- 16281
4. autoxgboost
available on git
Note: There are no inbuilt feature engineering functions available in autoxgboost. We used DriveML functions to prepare the input dataset.
# One-time install; curly quotes from the original document were a syntax
# error in R and have been replaced with straight quotes.
devtools::install_github("ja-thomas/autoxgboost")
## Use DriveML autoDataprep to build a cleaned data set for autoxgboost:
## start again from the raw CSV.
data_path <- "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
adult_data <- fread(file.path(data_path, "adult_data.csv"), sep = ",", header = TRUE)
## Start the autoxgboost benchmark timer. The original assigned
## time = Sys.time() twice in a row (before and after setDF); the redundant
## duplicate has been removed. Note 'time' is reassigned again just before
## model fitting below, so this value only covers data preparation.
time <- Sys.time()
setDF(adult_data) # set as data frame
## Per-class missing-value imputation rules for autoDataprep(): mode for
## factor/character, mean for integer, median for numeric.
myimpute <- list(classes=list(factor = imputeMode(),
integer = imputeMean(),
numeric = imputeMedian(),
character = imputeMode()))
## AutoDataprep
# Same feature engineering as the DriveML runs, except auto_mar is omitted
# here (no MAR flags for the autoxgboost input).
traindata <- autoDataprep(adult_data, target = "target_var",
missimpute = myimpute,
dummyvar = TRUE,
aucv = 0.002, corr = 0.999,
outlier_flag = TRUE,
char_var_limit = 150,
interaction_var = TRUE,
frequent_var = TRUE,
uid = c('adult_id','flag'),
verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
# Keep the selected columns, drop the id, then split on 'flag' (0 = train,
# 1 = valid) and remove the split marker from both frames.
master_adult_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
adult_train_data <- master_adult_data[sele_var]
adult_train_data$adult_id <- NULL
### Train and valid data
train_adult <- adult_train_data[adult_train_data$flag == 0, ]
valid_adult <- adult_train_data[adult_train_data$flag == 1, ]
train_adult$flag <- NULL
valid_adult$flag <- NULL
# create a classification task (mlr); response must be a factor first
train_adult$target_var <- as.factor(train_adult$target_var)
trainTask = makeClassifTask(data = train_adult, target = "target_var", positive = 1)
# create a control object for the MBO optimizer; timer restarts here, so
# tot_time measures tuning + fitting only
time = Sys.time()
ctrl = makeMBOControl()
# single MBO iteration keeps the benchmark cheap
ctrl = setMBOControlTermination(ctrl, iters = 1L)
# fit the model with decision-threshold tuning enabled
auto_xgb_model = autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
tot_time = round(difftime(Sys.time(), time, units='secs'),3)
# do prediction and print confusion matrix
# NOTE(review): valid_adult[,-1] drops the first column -- presumably
# target_var; verify the column order matches the training frame.
prediction = predict(auto_xgb_model, valid_adult[,-1])
prediction = getPredictionProbabilities(prediction)
myauc = round(auc(valid_adult$target_var, prediction),3)
save(auto_xgb_model, file = "axgb_model_large_data_adult_ana.rdata")
# Record the autoxgboost row of the comparison table.
comparision_metric[4,"dataset_name"] <- "Large data"
comparision_metric[4,"r_package_name"] <- "autoxgboost"
comparision_metric[4,"time_taken_min"] <- tot_time
comparision_metric[4,"test_auc"] <- myauc
comparision_metric[4,"attributes"] <- 14
comparision_metric[4,"missing"] <- "Yes"
comparision_metric[4,"Train_instances"] <- 32561
comparision_metric[4,"Test_instances"] <- 16281
Comparison results
# Render the cross-package comparison table.
kableExtra::kable(comparision_metric)
| r_package_name | dataset_name | attributes | missing | Train_instances | Test_instances | time_taken_min | test_auc |
|----------------|--------------|------------|---------|-----------------|-----------------|----------------|----------|
| DriveML        | Large data   | 14         | Yes     | 32561           | 16281           | 29.898         | 0.924    |
| H2o automl     | Large data   | 14         | Yes     | 32561           | 16281           | 47.839         | 0.928    |
| OneR           | Large data   | 14         | Yes     | 32561           | 16281           | 0.132          | 0.71     |
| autoxgboost    | Large data   | 14         | Yes     | 32561           | 16281           | 40.7           | 0.922    |