library(DriveML)    ## automl package
library(data.table) ## data wrangling package
library(caret)      ## ML wrapper package
library(mlr)        ## ML wrapper package
library(h2o)        ## automl package
library(OneR)       ## automl package
library(Metrics)    ## auc function
library(autoxgboost)## automl package

Adult Data Set

data source : https://archive.ics.uci.edu/ml/datasets/adult

Content

Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))

Prediction task is to determine whether a person makes over 50K a year.

Source:

Donor:

Ronny Kohavi and Barry Becker, Data Mining and Visualization, Silicon Graphics. E-mail: ronnyk '@' live.com for questions.

Problem Statement

Predict whether income exceeds $50K/yr based on census data. Also known as “Census Income” dataset.

Data preparations

Initial data preparation

## Initial data preparation: read the UCI Adult train/test files, derive the
## binary target from the income column (V15), tag each split with a flag,
## and write the combined data set to CSV for the experiments below.
## file.path() builds the paths portably (the original mixed "/" and "\\").
data_path <- "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
## NOTE(review): the raw adult.test file from UCI starts with a
## "|1x3 Cross validator" line; if present it needs skip = 1 - confirm
## against the downloaded file.
adult_train <- read.table(file.path(data_path, "adult.data"), sep = ",")
adult_test <- read.table(file.path(data_path, "adult.test"), sep = ",")
setDT(adult_train); setDT(adult_test)

## read.table keeps the leading blank in " >50K", hence the space in the match
adult_train[, target_var := ifelse(V15 == ' >50K', 1, 0)]
adult_train[, V15 := NULL]
table(adult_train$target_var)

## Test-set labels carry a trailing period (" >50K.") in the raw file
adult_test[, target_var := ifelse(V15 == ' >50K.', 1, 0)]
adult_test[, V15 := NULL]
table(adult_test$target_var)

## flag marks the origin of each row: 0 = train split, 1 = test split
adult_train[, flag := 0]
adult_test[, flag := 1]
adult <- rbind.data.frame(adult_train, adult_test)
## seq_len(.N) is the safe counterpart of 1:.N (empty-table proof)
adult[, adult_id := paste0("adult", seq_len(.N))]
write.csv(adult, file = "adult_data.csv", row.names = FALSE)

DriveML Experiment

## Load the combined adult data set written out in the preparation step
data_path <- "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
adult_data <- fread(file.path(data_path, "adult_data.csv"), sep = ",", header = TRUE)

## Target variable distribution (0 = <=50K, 1 = >50K)
table(adult_data$target_var)
## 
##     0     1 
## 37155 11687

Split the sample to test the model accuracy against other open-source R packages

## Fix the RNG seed so the partition is reproducible
set.seed(12345)
## NOTE(review): train.index does not appear to be used anywhere later in
## this document; the train/valid splits below rely on the flag column
## instead - confirm whether this partition is needed.
train.index <- createDataPartition(adult_data$target_var, p = .8, list = FALSE)

DriveML step 1 - Missing variable treatment

## DriveML step 1: model each variable's missingness with a GLM (MAR check)
## and keep variables whose missing-indicator model exceeds AUC 0.9
marobj <- autoMAR (adult_data, aucv = 0.9, strataname = NULL, stratasize = NULL, mar_method="glm")
## Variables flagged by the MAR analysis
marobj$auc_features

DriveML step 2 - Auto Dataprep

## Type of missing imputation, by column class:
## factor/character -> mode, integer -> mean, numeric -> median
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep: single call covering MAR-aware imputation, outlier flags,
## frequent/interaction features, one-hot encoding and filter-based variable
## selection (zero variance, correlation, AUC). uid lists identifier
## columns that are excluded from feature engineering.
traindata <- autoDataprep(adult_data, target = "target_var",
                          auto_mar = TRUE,
                          mar_object=marobj,
                          missimpute = myimpute,
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          uid = c('adult_id','flag'),
                          verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
## Keep only the variables selected by autoDataprep
master_adult_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
adult_train_data <- master_adult_data[sele_var]

### Train and valid data
## flag was set during data preparation: 0 = train split, 1 = test split
train_adult <- adult_train_data[adult_train_data$flag==0,]
train_adult$flag <- NULL
valid_adult <- adult_train_data[adult_train_data$flag==1,]
valid_adult$flag <- NULL

DriveML step 3 - Model development

## DriveML step 3: tune and train all candidate classifiers with a random
## hyper-parameter search (10 iterations each); maxObs caps the rows used
## for tuning, and liftGroup sets the number of lift buckets for reporting.
mymodel_adult <- autoMLmodel( train = train_adult,
                           test = valid_adult,
                           target = 'target_var',
                           tuneIters = 10,
                           tuneType = "random",
                           models = "all",
                           varImp = 10,
                           liftGroup = 50,
                           maxObs = 5000,
                           uid = 'adult_id',
                           htmlreport = FALSE,
                           verbose = TRUE,
                           seed = 42)
  save(mymodel_adult, file="large_data_mymodel.rdata")

DriveML Results

Model results

## Per-model comparison table: fit/score times, train/test AUC, accuracy,
## precision, recall and F1 (rendered below)
results <- mymodel_adult$results
kableExtra::kable(results)
Model Fitting time Scoring time Train AUC Test AUC Accuracy Precision Recall F1_score
glmnet 10.335 secs 0.121 secs 0.907 0.906 0.854 0.751 0.570 0.648
logreg 9.729 secs 0.103 secs 0.910 0.907 0.853 0.730 0.603 0.660
randomForest 5.754 mins 3.064 secs 0.965 0.910 0.861 0.757 0.606 0.673
ranger 51.118 secs 7.057 secs 0.996 0.911 0.860 0.751 0.611 0.674
xgboost 11.074 secs 0.156 secs 0.966 0.924 0.869 0.764 0.642 0.698
rpart 4.67 secs 0.092 secs 0.865 0.866 0.855 0.749 0.581 0.654
## Variable lift chart for the trained models
mymodel_adult$modelexp$Lift_plot

### Random Forest Model validation ROC
mymodel_adult$trainedModels$randomForest$modelPlots$TestROC

### XGBoost Model validation ROC
mymodel_adult$trainedModels$xgboost$modelPlots$TestROC

### Random Forest Model Variable Importance
mymodel_adult$trainedModels$randomForest$modelPlots$VarImp
## [[1]]

Best ML model comparison with other R packages

1. DriveML

available on CRAN and git

selected best model from driveml outcome

## Timed DriveML run for the cross-package comparison: the timer starts
## before the MAR analysis, so data preparation is included in the
## reported time.
time <- Sys.time()
marobj <- autoMAR (adult_data, aucv = 0.9, strataname = NULL, stratasize = NULL, mar_method="glm")
## less than or equal to one missing value coloumn found in the dataframe
## Type of missing imputation:
## factor/character -> mode, integer -> mean, numeric -> median
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep (same settings as the main experiment above)
traindata <- autoDataprep(adult_data, target = "target_var",
                          auto_mar = TRUE,
                          mar_object=marobj,
                          missimpute = myimpute,
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          uid = c('adult_id','flag'),
                          verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
## Keep the selected variables and rebuild the train/valid splits
master_adult_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
adult_train_data <- master_adult_data[sele_var]

### Train and valid data (flag: 0 = train, 1 = valid)
train_adult <- adult_train_data[adult_train_data$flag==0,]
train_adult$flag <- NULL
valid_adult <- adult_train_data[adult_train_data$flag==1,]
valid_adult$flag <- NULL

## Train only the best-performing model family from the earlier run
## (xgboost) for the timed comparison
driveml_largedata <- autoMLmodel( train = train_adult,
                           test = valid_adult,
                           target = 'target_var',
                           models = "xgboost",
                           uid = 'adult_id',
                           maxObs = 5000,
                           verbose = TRUE,
                           seed = 42)
## xgboost Model tuning started.... 
## autoMLmodel < All features xgboost tuned and trained >
## Elapsed time for the full DriveML pipeline.
## NOTE(review): every package below is timed with units='secs', yet the
## comparison column is named "time_taken_min" - the stored values are
## seconds; confirm and consider renaming the column consistently.
dtime <- round(difftime(Sys.time(), time, units='secs'),3)
bestroc <- round(driveml_largedata$results$`Test AUC`,3)

## Comparison table, one row per package. The matrix starts as logical NA
## and is coerced to character on the first string assignment, so numeric
## entries are stored as strings (display only).
comparision_metric <- matrix(data=NA, nrow=4, ncol=8)
colnames(comparision_metric) <- c("r_package_name","dataset_name","attributes","missing","Train_instances","Test_instances", "time_taken_min", "test_auc")

## Row 1: DriveML
comparision_metric[1,"dataset_name"] <- "Large data"
comparision_metric[1,"r_package_name"] <- "DriveML"
comparision_metric[1,"time_taken_min"] <- dtime
comparision_metric[1,"test_auc"] <- bestroc
comparision_metric[1,"attributes"] <- 14
comparision_metric[1,"missing"] <- "Yes"
comparision_metric[1,"Train_instances"] <- 32561
comparision_metric[1,"Test_instances"] <- 16281

2. H2o AutoML

available on CRAN and git

## Back to a plain data.frame for the H2O workflow
setDF(adult_data)

## H2O AutoML treats factors as categoricals, so convert every character
## column; compute the column mask once instead of twice.
char_cols <- vapply(adult_data, is.character, logical(1))
adult_data[char_cols] <- lapply(adult_data[char_cols], as.factor)

## For binary classification the response must be a factor
adult_data$target_var <- as.factor(adult_data$target_var)
## Drop the row identifier - it carries no signal
adult_data$adult_id <- NULL

### Train and valid data
## Split on the flag column set during data preparation (0 = train, 1 = valid)
train_adult <- adult_data[adult_data$flag == 0, ]
train_adult$flag <- NULL
valid_adult <- adult_data[adult_data$flag == 1, ]
valid_adult$flag <- NULL

Training using h2o.automl() function

## Start (or connect to) a local H2O cluster; connection details print below
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 17 hours 
##     H2O cluster timezone:       Asia/Kolkata 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.1.3 
##     H2O cluster version age:    28 days, 22 hours and 19 minutes  
##     H2O cluster name:           H2O_started_from_R_dubrangala_zcg934 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   5.09 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.3 (2020-10-10)
## Load the train and valid data sets into H2O
## Start the H2O timer, then push the training split into the H2O cluster
time = Sys.time() ## start time
train_d = as.h2o(train_adult)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Push the validation split into the H2O cluster
test_d = as.h2o(valid_adult)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Train Model

## H2O AutoML: at most 5 models; GLM, DeepLearning, DRF and stacked
## ensembles are excluded from the search
h2o_large_data_model <- h2o.automl(y = "target_var", 
                        training_frame = train_d, 
                        validation_frame = test_d,
                        max_models = 5L,
                        exclude_algos = c("GLM", "DeepLearning", "DRF","StackedEnsemble"))
## 
  |                                                                            
  |                                                                      |   0%
## 16:19:42.197: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 16:19:42.198: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |======================================================================| 100%
## Take the top model's AUC from the leaderboard.
## NOTE(review): per the H2O message captured above, leaderboard metrics
## are based on cross-validation, not the supplied validation frame, so
## best_auc may not be a validation-frame AUC - confirm comparability
## with the other packages' test AUC.
lb = as.data.frame(h2o_large_data_model@leaderboard)
best_auc <- round(lb$auc[1],3)
time_h2o <- round(difftime(Sys.time(), time, units='secs'),3) 
save(h2o_large_data_model, file = "h2o_model_large_data_hr_ana.rdata")

## Row 2: H2O AutoML (time_h2o is in seconds - units='secs' above -
## despite the "time_taken_min" column name)
comparision_metric[2,"dataset_name"] <- "Large data"
comparision_metric[2,"r_package_name"] <- "H2o automl"
comparision_metric[2,"time_taken_min"] <- time_h2o
comparision_metric[2,"test_auc"] <- best_auc
comparision_metric[2,"attributes"] <- 14
comparision_metric[2,"missing"] <- "Yes"
comparision_metric[2,"Train_instances"] <- 32561
comparision_metric[2,"Test_instances"] <- 16281

3. OneR

available on CRAN and git

build model with the OneR package

Note: There are no inbuilt feature engineering functions available in OneR. We used DriveML functions to prepare the input dataset.

## OneR benchmark: reload the raw combined data (no feature engineering -
## OneR discretizes and picks the single best predictor itself)
data_path <- "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
adult_data <- fread(file.path(data_path, "adult_data.csv"), sep = ",", header = TRUE)

## Start the timer before any handling so the comparison includes it
time <- Sys.time()
setDF(adult_data)  # OneR expects a plain data frame
adult_data$adult_id <- NULL  # drop the row identifier

### Train and valid data (flag: 0 = train, 1 = valid)
train_adult <- adult_data[adult_data$flag == 0, ]
train_adult$flag <- NULL
valid_adult <- adult_data[adult_data$flag == 1, ]
valid_adult$flag <- NULL

## OneR model: ranks every attribute by training accuracy and keeps the best
model_oner_large <- OneR(target_var ~ ., data = train_adult, verbose = TRUE)
## Warning in OneR.data.frame(x = data, ties.method = ties.method, verbose =
## verbose, : data contains unused factor levels
## 
##     Attribute Accuracy
## 1 * V4        77.96%  
## 1   V5        77.96%  
## 3   V12       77.16%  
## 4   V11       76.65%  
## 5   V2        76.31%  
## 6   V1        75.92%  
## 6   V3        75.92%  
## 6   V6        75.92%  
## 6   V7        75.92%  
## 6   V8        75.92%  
## 6   V9        75.92%  
## 6   V10       75.92%  
## 6   V13       75.92%  
## 6   V14       75.92%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
## Class-probability predictions on the validation split; the timer covers
## data handling plus the OneR fit (elapsed seconds)
prediction_oner <- predict(model_oner_large, valid_adult,type = c("prob"))
large_data_onertime <- round(difftime(Sys.time(), time, units='secs'),3)
save(model_oner_large, file = "oner_model_large_data_adult_ana.rdata")
summary(model_oner_large)
## 
## Call:
## OneR.formula(formula = target_var ~ ., data = train_adult, verbose = TRUE)
## 
## Rules:
## If V4 =  10th         then target_var = 0
## If V4 =  11th         then target_var = 0
## If V4 =  12th         then target_var = 0
## If V4 =  1st-4th      then target_var = 0
## If V4 =  5th-6th      then target_var = 0
## If V4 =  7th-8th      then target_var = 0
## If V4 =  9th          then target_var = 0
## If V4 =  Assoc-acdm   then target_var = 0
## If V4 =  Assoc-voc    then target_var = 0
## If V4 =  Bachelors    then target_var = 0
## If V4 =  Doctorate    then target_var = 1
## If V4 =  HS-grad      then target_var = 0
## If V4 =  Masters      then target_var = 1
## If V4 =  Preschool    then target_var = 0
## If V4 =  Prof-school  then target_var = 1
## If V4 =  Some-college then target_var = 0
## 
## Accuracy:
## 25384 of 32561 instances classified correctly (77.96%)
## 
## Contingency table:
##           V4
## target_var  10th   11th  12th  1st-4th  5th-6th  7th-8th   9th  Assoc-acdm
##        0   * 871 * 1115 * 400    * 162    * 317    * 606 * 487       * 802
##        1      62     60    33        6       16       40    27         265
##        Sum   933   1175   433      168      333      646   514        1067
##           V4
## target_var  Assoc-voc  Bachelors  Doctorate  HS-grad  Masters  Preschool
##        0       * 1021     * 3134        107   * 8826      764       * 51
##        1          361       2221      * 306     1675    * 959          0
##        Sum       1382       5355        413    10501     1723         51
##           V4
## target_var  Prof-school  Some-college   Sum
##        0            153        * 5904 24720
##        1          * 423          1387  7841
##        Sum          576          7291 32561
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 4429.7, df = 15, p-value < 2.2e-16
## AUC from the positive-class ("1") probability column
best_auc_oner <- round(auc(valid_adult$target_var, prediction_oner[,"1"]),2)
## Row 3: OneR (time in seconds despite the "time_taken_min" column name)
comparision_metric[3,"dataset_name"] <- "Large data"
comparision_metric[3,"r_package_name"] <- "OneR"
comparision_metric[3,"time_taken_min"] <- large_data_onertime
comparision_metric[3,"test_auc"] <- best_auc_oner
comparision_metric[3,"attributes"] <- 14
comparision_metric[3,"missing"] <- "Yes"
comparision_metric[3,"Train_instances"] <- 32561
comparision_metric[3,"Test_instances"] <- 16281

4. autoxgboost

available on git

Note: There are no inbuilt feature engineering functions available in autoxgboost. We used DriveML functions to prepare the input dataset.

devtools::install_github("ja-thomas/autoxgboost")

## Using DriveML autoDataprep to create a cleaned data set for autoxgboost
data_path = "C:/backup/R packages/DriveML_Experiments/large_data - adult data"
adult_data <- fread(paste0(data_path, "/","adult_data.csv"), sep = ",",header = TRUE)

## Recode target variable
time = Sys.time() ## Start time
setDF(adult_data) # set as data frame

## NOTE(review): the timer is started here and again immediately below,
## and reset a third time right before the model fit, so the reported
## time excludes this data preparation - confirm that is intended for a
## fair cross-package comparison (the DriveML timing included prep).
time = Sys.time() ## Start time
## Type of missing imputation:
## factor/character -> mode, integer -> mean, numeric -> median
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep (no MAR step here, unlike the DriveML run above)
traindata <- autoDataprep(adult_data, target = "target_var",
                          missimpute = myimpute,
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          uid = c('adult_id','flag'),
                          verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Frequent transformer....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
## Keep the selected variables and drop the row identifier
master_adult_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
adult_train_data <- master_adult_data[sele_var]
adult_train_data$adult_id <- NULL
### Train and valid data (flag: 0 = train, 1 = valid)
train_adult <- adult_train_data[adult_train_data$flag==0,]
train_adult$flag <- NULL
valid_adult <- adult_train_data[adult_train_data$flag==1,]
valid_adult$flag <- NULL
# create a classification task (mlr requires a factor response)
train_adult$target_var <- as.factor(train_adult$target_var)

trainTask = makeClassifTask(data = train_adult, target = "target_var", positive = 1)
# create a control object for the MBO optimizer (a single tuning iteration)
# Timer reset: tot_time measures only the tuning/fit below, not data prep
time = Sys.time()
ctrl = makeMBOControl()
ctrl = setMBOControlTermination(ctrl, iters = 1L) 
# fit the model
auto_xgb_model = autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
tot_time = round(difftime(Sys.time(), time, units='secs'),3)
# do prediction and print confusion matrix
# NOTE(review): valid_adult[,-1] drops the first column, assumed to be
# target_var - confirm the column order produced by autoDataprep.
prediction = predict(auto_xgb_model, valid_adult[,-1])
prediction = getPredictionProbabilities(prediction)

## Test AUC against the held-out labels
myauc = round(auc(valid_adult$target_var, prediction),3)
save(auto_xgb_model, file = "axgb_model_large_data_adult_ana.rdata")

## Row 4: autoxgboost (time in seconds despite the column name)
comparision_metric[4,"dataset_name"] <- "Large data"
comparision_metric[4,"r_package_name"] <- "autoxgboost"
comparision_metric[4,"time_taken_min"] <- tot_time
comparision_metric[4,"test_auc"] <- myauc
comparision_metric[4,"attributes"] <- 14
comparision_metric[4,"missing"] <- "Yes"
comparision_metric[4,"Train_instances"] <- 32561
comparision_metric[4,"Test_instances"] <- 16281

Comparison results

## Final side-by-side comparison of the four AutoML approaches
kableExtra::kable(comparision_metric)
r_package_name dataset_name attributes missing Train_instances Test_instances time_taken_min test_auc
DriveML Large data 14 Yes 32561 16281 29.898 0.924
H2o automl Large data 14 Yes 32561 16281 47.839 0.928
OneR Large data 14 Yes 32561 16281 0.132 0.71
autoxgboost Large data 14 Yes 32561 16281 40.7 0.922