library(DriveML)    ## automl package
library(data.table) ## data wrangling package
library(caret)      ## ML wrapper package
library(mlr)        ## ML wrapper package
library(h2o)        ## automl package
library(OneR)       ## automl package
library(Metrics)    ## auc function
library(autoxgboost)## automl package

Heart Disease UCI

data source : https://archive.ics.uci.edu/ml/datasets/Heart+Disease

Content

This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The “goal” field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.

Acknowledgements:

Creators:

Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D. Donor: David W. Aha (aha ‘@’ ics.uci.edu) (714) 856-8779

DriveML Experiment

data_path = "C:/backup/R packages/DriveML_Experiments/small_data_heart_desease"
heart_data <- fread(paste0(data_path, "/","heart.csv"), sep = ",",header = TRUE)

##target variable distributions
table(heart_data$target_var)
## 
##   0   1 
## 138 165

Split sample to test the model accuaracy with other open source R package

set.seed(1991)
train.index <- createDataPartition(heart_data$target_var, p = .8, list = FALSE)

DriveML step 1 - Missing variable treatment

marobj <- autoMAR (heart_data, aucv = 0.9, strataname = NULL, stratasize = NULL, mar_method="glm")
marobj$auc_features

DriveML step 2 - Auto Dataprep

## Type of missing imputation
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep
traindata <- autoDataprep(heart_data, target = "target_var",
                          auto_mar = TRUE,
                          mar_object=marobj,
                          missimpute = myimpute,
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
master_heart_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
heart_train_data <- master_heart_data[sele_var]

### Train and valid data
train_heart <- heart_train_data[train.index,]
valid_heart <- heart_train_data[-train.index,]

DriveML step 3 - Model development

mymodel_heart <- autoMLmodel( train = train_heart,
                           test = valid_heart,
                           target = 'target_var',
                           tuneIters = 10,
                           tuneType = "random",
                           models = "all",
                           varImp = 10,
                           liftGroup = 50,
                           maxObs = 5000,
                           htmlreport = FALSE,
                           verbose = TRUE,
                           seed = 1991)
  save(mymodel_heart, file="small_data_mymodel.rdata")

DriveML Results

Model results

results <- mymodel_heart$results
kableExtra::kable(results)
Model Fitting time Scoring time Train AUC Test AUC Accuracy Precision Recall F1_score
glmnet 3.051 secs 0.008 secs 0.925 0.917 0.767 0.737 0.875 0.800
logreg 2.347 secs 0.009 secs 0.952 0.708 0.750 0.707 0.906 0.795
randomForest 2.553 secs 0.008 secs 0.982 0.939 0.817 0.784 0.906 0.841
ranger 2.567 secs 0.03 secs 0.996 0.927 0.800 0.778 0.875 0.824
xgboost 3.013 secs 0.006 secs 0.999 0.868 0.750 0.743 0.812 0.776
rpart 2.211 secs 0.008 secs 0.941 0.875 0.783 0.788 0.812 0.800
## Variable Lift
mymodel_heart$modelexp$Lift_plot

### Random Forest Model validation ROC
mymodel_heart$trainedModels$randomForest$modelPlots$TestROC

### XGBoost Model validation ROC
mymodel_heart$trainedModels$xgboost$modelPlots$TestROC

### Random Forest Model Variable Importance
mymodel_heart$trainedModels$randomForest$modelPlots$VarImp
## [[1]]

Best ML model comparison with other R packages

1. DriveML

available on CRAN and git

selected best model from driveml outcome

time <- Sys.time()
## Type of missing imputation
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep
traindata <- autoDataprep(heart_data, target = "target_var",
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          verbose =TRUE)
## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >
master_heart_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
heart_train_data <- master_heart_data[sele_var]

### Train and valid data
train_heart <- heart_train_data[train.index,]
valid_heart <- heart_train_data[-train.index,]

driveml_smalldata <- autoMLmodel( train = train_heart,
                           test = valid_heart,
                           target = 'target_var',
                           tuneIters = 10,
                           tuneType = "random",
                           models = "randomforest",
                           varImp = 10,
                           liftGroup = 50,
                           maxObs = 5000,
                           htmlreport = FALSE,
                           verbose = TRUE,
                           seed = 1991)
## randomForest Model tuning started.... 
## autoMLmodel < All features randomForest tuned and trained >
dtime <- round(difftime(Sys.time(), time, units='secs'),3)
bestroc <- round(driveml_smalldata$results$`Test AUC`,3)

comparision_metric <- matrix(data=NA, nrow=4, ncol=8)
colnames(comparision_metric) <- c("r_package_name","dataset_name","attributes","missing","Train_instances","Test_instances", "time_taken_min", "test_auc")

comparision_metric[1,"dataset_name"] <- "Small data"
comparision_metric[1,"r_package_name"] <- "DriveML"
comparision_metric[1,"time_taken_min"] <- dtime
comparision_metric[1,"test_auc"] <- bestroc
comparision_metric[1,"attributes"] <- 14
comparision_metric[1,"missing"] <- "No"
comparision_metric[1,"Train_instances"] <- 243
comparision_metric[1,"Test_instances"] <- 60

2. H2o AutoML

available on CRAN and git

setDF(heart_data)
## Convert character data to factor for H2o automl function
heart_data[sapply(heart_data, is.character)] <- lapply(heart_data[sapply(heart_data, is.character)], as.factor)

## Convert target class variable as factor
## For binary classification, response should be a factor
heart_data$target_var <- as.factor(heart_data$target_var)

### Train and valid data
train_heart <- heart_data[train.index,]
valid_heart <- heart_data[-train.index,]

Training using h2o.automl() function

h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 18 hours 
##     H2O cluster timezone:       Asia/Kolkata 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.1.3 
##     H2O cluster version age:    28 days, 23 hours and 14 minutes  
##     H2O cluster name:           H2O_started_from_R_dubrangala_zcg934 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.72 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.3 (2020-10-10)
## train and valid data set inot h2o 
time = Sys.time() ## start time
train_d = as.h2o(train_heart)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_d = as.h2o(valid_heart)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Train Model

h2o_small_data_model <- h2o.automl(y = "target_var", 
                        training_frame = train_d, 
                        validation_frame = test_d,
                        max_models = 5L,
                        exclude_algos = c("GLM", "DeepLearning", "DRF","StackedEnsemble"))
## 
  |                                                                            
  |                                                                      |   0%
## 17:15:07.421: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 17:15:07.421: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |================================                                      |  45%
## 17:15:12.474: GBM_5_AutoML_20210617_171507 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20210617_171507.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 194.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 194.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 194.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 195.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 195.0.
## 
  |                                                                            
  |======================================================================| 100%
lb = as.data.frame(h2o_small_data_model@leaderboard)
best_auc <- round(lb$auc[1],3)
time_h2o <- round(difftime(Sys.time(), time, units='secs'),3) 
save(h2o_small_data_model, file = "h2o_model_small_data_heart_ana.rdata")

comparision_metric[2,"dataset_name"] <- "Small data"
comparision_metric[2,"r_package_name"] <- "H2o automl"
comparision_metric[2,"time_taken_min"] <- time_h2o
comparision_metric[2,"test_auc"] <- best_auc
comparision_metric[2,"attributes"] <- 14
comparision_metric[2,"missing"] <- "No"
comparision_metric[2,"Train_instances"] <- 243
comparision_metric[2,"Test_instances"] <- 60

3. OneR

available on CRAN and git

build model with the OneR package

Note: There is no inbuilt feature engineering functions available on OneR. We used DriveML function to prepare the input dataset

## Read Raw data
data_path = "C:/backup/R packages/DriveML_Experiments/small_data_heart_desease"
heart_data <- fread(paste0(data_path, "/","heart.csv"), sep = ",",header = TRUE)

## Recode target variable
time = Sys.time() ## Start time
setDF(heart_data) # set as data frame


### Train and valid data
train_heart <- heart_data[train.index,]
valid_heart <- heart_data[-train.index,]

## OneR model
model_oner_large <- OneR(target_var ~., data = train_heart, verbose = TRUE)
## 
##     Attribute Accuracy
## 1 * thal      78.19%  
## 2   cp        74.07%  
## 3   ca        73.66%  
## 4   exang     71.19%  
## 5   thalach   70.78%  
## 5   slope     70.78%  
## 7   oldpeak   67.9%   
## 8   age       63.79%  
## 9   sex       61.73%  
## 10  restecg   58.02%  
## 11  trestbps  57.2%   
## 12  chol      53.91%  
## 13  fbs       53.5%   
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
prediction_oner <- predict(model_oner_large, valid_heart,type = c("prob"))
small_data_onertime <- round(difftime(Sys.time(), time, units='secs'),3)
save(model_oner_large, file = "oner_model_small_data_heart_ana.rdata")
summary(model_oner_large)
## 
## Call:
## OneR.formula(formula = target_var ~ ., data = train_heart, verbose = TRUE)
## 
## Rules:
## If thal = 0 then target_var = 0
## If thal = 1 then target_var = 0
## If thal = 2 then target_var = 1
## If thal = 3 then target_var = 0
## 
## Accuracy:
## 190 of 243 instances classified correctly (78.19%)
## 
## Contingency table:
##           thal
## target_var   0    1     2    3 Sum
##        0   * 1 * 11    25 * 76 113
##        1     1    5 * 102   22 130
##        Sum   2   16   127   98 243
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 77.882, df = 3, p-value < 2.2e-16
best_auc_oner <- round(auc(valid_heart$target_var, prediction_oner[,"1"]),2)
comparision_metric[3,"dataset_name"] <- "Small data"
comparision_metric[3,"r_package_name"] <- "OneR"
comparision_metric[3,"time_taken_min"] <- small_data_onertime
comparision_metric[3,"test_auc"] <- best_auc_oner
comparision_metric[3,"attributes"] <- 14
comparision_metric[3,"missing"] <- "No"
comparision_metric[3,"Train_instances"] <- 243
comparision_metric[3,"Test_instances"] <- 60

4. autoxgboost

available on git

Note: There is no inbuilt feature engineering functions available on autoxgboost. We used DriveML function to prepare the input dataset

devtools::install_github(“ja-thomas/autoxgboost”)

## Using DriveML autodataprep create a cleaned data set
data_path = "C:/backup/R packages/DriveML_Experiments/small_data_heart_desease"
heart_data <- fread(paste0(data_path, "/","heart.csv"), sep = ",",header = TRUE)

## Recode target variable
time = Sys.time() ## Start time
setDF(heart_data) # set as data frame


### Train and valid data
train_heart <- heart_data[train.index,]
valid_heart <- heart_data[-train.index,]
# create a classification task
train_heart$target_var <- as.factor(train_heart$target_var)

trainTask = makeClassifTask(data = train_heart, target = "target_var", positive = 1)
# create a control object for optimizer
time = Sys.time()
ctrl = makeMBOControl()
ctrl = setMBOControlTermination(ctrl, iters = 1L) 
# fit the model
auto_xgb_model = autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
tot_time = round(difftime(Sys.time(), time, units='secs'),3)
# do prediction and print confusion matrix
valid_heart_x <- valid_heart
valid_heart_x$target_var <- NULL
prediction = predict(auto_xgb_model, valid_heart_x)
prediction = getPredictionProbabilities(prediction)

myauc = round(auc(valid_heart$target_var, prediction),3)
save(auto_xgb_model, file = "axgb_model_small_data_heart_ana.rdata")

comparision_metric[4,"dataset_name"] <- "Small data"
comparision_metric[4,"r_package_name"] <- "autoxgboost"
comparision_metric[4,"time_taken_min"] <- tot_time
comparision_metric[4,"test_auc"] <- myauc
comparision_metric[4,"attributes"] <- 14
comparision_metric[4,"missing"] <- "No"
comparision_metric[4,"Train_instances"] <- 243
comparision_metric[4,"Test_instances"] <- 60

Comparision results

kableExtra::kable(comparision_metric)
r_package_name dataset_name attributes missing Train_instances Test_instances time_taken_min test_auc
DriveML Small data 14 No 243 60 6.073 0.914
H2o automl Small data 14 No 243 60 32.066 0.906
OneR Small data 14 No 243 60 0.045 0.68
autoxgboost Small data 14 No 243 60 10.288 0.902