library(DriveML)    ## automl package
library(data.table) ## data wrangling package
library(caret)      ## ML wrapper package
library(mlr)        ## ML wrapper package
library(h2o)        ## automl package
library(OneR)       ## automl package
library(Metrics)    ## auc function
library(autoxgboost)## automl package

Heart Disease UCI

data source : https://archive.ics.uci.edu/ml/datasets/Heart+Disease

Content

This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The “goal” field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.

Acknowledgements:

Creators:

Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D. Donor: David W. Aha (aha ‘@’ ics.uci.edu) (714) 856-8779

DriveML Experiment

data_path = "C:/backup/R packages/DriveML_Experiments/small_data_heart_desease"
heart_data <- fread(paste0(data_path, "/","heart.csv"), sep = ",",header = TRUE)

##target variable distributions
table(heart_data$target_var)

## 
##   0   1 
## 138 165

Split sample to test the model accuaracy with other open source R package

set.seed(1991)
train.index <- createDataPartition(heart_data$target_var, p = .8, list = FALSE)

DriveML step 1 - Missing variable treatment

marobj <- autoMAR (heart_data, aucv = 0.9, strataname = NULL, stratasize = NULL, mar_method="glm")
marobj$auc_features

DriveML step 2 - Auto Dataprep

## Type of missing imputation
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep
traindata <- autoDataprep(heart_data, target = "target_var",
                          auto_mar = TRUE,
                          mar_object=marobj,
                          missimpute = myimpute,
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          verbose =TRUE)

## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >

master_heart_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
heart_train_data <- master_heart_data[sele_var]

### Train and valid data
train_heart <- heart_train_data[train.index,]
valid_heart <- heart_train_data[-train.index,]

DriveML step 3 - Model development

mymodel_heart <- autoMLmodel( train = train_heart,
                           test = valid_heart,
                           target = 'target_var',
                           tuneIters = 10,
                           tuneType = "random",
                           models = "all",
                           varImp = 10,
                           liftGroup = 50,
                           maxObs = 5000,
                           htmlreport = FALSE,
                           verbose = TRUE,
                           seed = 1991)
  save(mymodel_heart, file="small_data_mymodel.rdata")

DriveML Results

Model results

results <- mymodel_heart$results
kableExtra::kable(results)

Model	Fitting time	Scoring time	Train AUC	Test AUC	Accuracy	Precision	Recall	F1_score
glmnet	3.051 secs	0.008 secs	0.925	0.917	0.767	0.737	0.875	0.800
logreg	2.347 secs	0.009 secs	0.952	0.708	0.750	0.707	0.906	0.795
randomForest	2.553 secs	0.008 secs	0.982	0.939	0.817	0.784	0.906	0.841
ranger	2.567 secs	0.03 secs	0.996	0.927	0.800	0.778	0.875	0.824
xgboost	3.013 secs	0.006 secs	0.999	0.868	0.750	0.743	0.812	0.776
rpart	2.211 secs	0.008 secs	0.941	0.875	0.783	0.788	0.812	0.800

## Variable Lift
mymodel_heart$modelexp$Lift_plot

### Random Forest Model validation ROC
mymodel_heart$trainedModels$randomForest$modelPlots$TestROC

### XGBoost Model validation ROC
mymodel_heart$trainedModels$xgboost$modelPlots$TestROC

### Random Forest Model Variable Importance
mymodel_heart$trainedModels$randomForest$modelPlots$VarImp

## [[1]]

Best ML model comparison with other R packages

1. DriveML

available on CRAN and git

selected best model from driveml outcome

time <- Sys.time()
## Type of missing imputation
myimpute <- list(classes=list(factor = imputeMode(),
                              integer = imputeMean(),
                              numeric = imputeMedian(),
                              character = imputeMode()))

## AutoDataprep
traindata <- autoDataprep(heart_data, target = "target_var",
                          dummyvar = TRUE,
                          aucv = 0.002, corr = 0.999,
                          outlier_flag = TRUE,
                          char_var_limit = 150,
                          interaction_var = TRUE,
                          frequent_var = TRUE,
                          verbose =TRUE)

## autoDataprep < Outlier treatment based on Tukey method....> 
## autoDataprep < Interactions transformer....> 
## autoDataprep < Categorical variable - one hot encoding....> 
## autoDataprep < variable reduction - zero variance method.... > 
## autoDataprep < variable selection - pearson correlation method.... > 
## autoDataprep < variable selection - AUC method.... >

master_heart_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
heart_train_data <- master_heart_data[sele_var]

### Train and valid data
train_heart <- heart_train_data[train.index,]
valid_heart <- heart_train_data[-train.index,]

driveml_smalldata <- autoMLmodel( train = train_heart,
                           test = valid_heart,
                           target = 'target_var',
                           tuneIters = 10,
                           tuneType = "random",
                           models = "randomforest",
                           varImp = 10,
                           liftGroup = 50,
                           maxObs = 5000,
                           htmlreport = FALSE,
                           verbose = TRUE,
                           seed = 1991)

## randomForest Model tuning started.... 
## autoMLmodel < All features randomForest tuned and trained >

dtime <- round(difftime(Sys.time(), time, units='secs'),3)

bestroc <- round(driveml_smalldata$results$`Test AUC`,3)

comparision_metric <- matrix(data=NA, nrow=4, ncol=8)
colnames(comparision_metric) <- c("r_package_name","dataset_name","attributes","missing","Train_instances","Test_instances", "time_taken_min", "test_auc")

comparision_metric[1,"dataset_name"] <- "Small data"
comparision_metric[1,"r_package_name"] <- "DriveML"
comparision_metric[1,"time_taken_min"] <- dtime
comparision_metric[1,"test_auc"] <- bestroc
comparision_metric[1,"attributes"] <- 14
comparision_metric[1,"missing"] <- "No"
comparision_metric[1,"Train_instances"] <- 243
comparision_metric[1,"Test_instances"] <- 60

2. H2o AutoML

available on CRAN and git

setDF(heart_data)
## Convert character data to factor for H2o automl function
heart_data[sapply(heart_data, is.character)] <- lapply(heart_data[sapply(heart_data, is.character)], as.factor)

## Convert target class variable as factor
## For binary classification, response should be a factor
heart_data$target_var <- as.factor(heart_data$target_var)

### Train and valid data
train_heart <- heart_data[train.index,]
valid_heart <- heart_data[-train.index,]

Training using h2o.automl() function

h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 18 hours 
##     H2O cluster timezone:       Asia/Kolkata 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.1.3 
##     H2O cluster version age:    28 days, 23 hours and 14 minutes  
##     H2O cluster name:           H2O_started_from_R_dubrangala_zcg934 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.72 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.3 (2020-10-10)

## train and valid data set inot h2o 
time = Sys.time() ## start time
train_d = as.h2o(train_heart)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

test_d = as.h2o(valid_heart)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Train Model

h2o_small_data_model <- h2o.automl(y = "target_var", 
                        training_frame = train_d, 
                        validation_frame = test_d,
                        max_models = 5L,
                        exclude_algos = c("GLM", "DeepLearning", "DRF","StackedEnsemble"))

## 
  |                                                                            
  |                                                                      |   0%
## 17:15:07.421: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 17:15:07.421: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |================================                                      |  45%
## 17:15:12.474: GBM_5_AutoML_20210617_171507 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20210617_171507.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 194.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 194.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 194.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 195.0.
## ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 195.0.
## 
  |                                                                            
  |======================================================================| 100%

lb = as.data.frame(h2o_small_data_model@leaderboard)
best_auc <- round(lb$auc[1],3)
time_h2o <- round(difftime(Sys.time(), time, units='secs'),3) 
save(h2o_small_data_model, file = "h2o_model_small_data_heart_ana.rdata")

comparision_metric[2,"dataset_name"] <- "Small data"
comparision_metric[2,"r_package_name"] <- "H2o automl"
comparision_metric[2,"time_taken_min"] <- time_h2o
comparision_metric[2,"test_auc"] <- best_auc
comparision_metric[2,"attributes"] <- 14
comparision_metric[2,"missing"] <- "No"
comparision_metric[2,"Train_instances"] <- 243
comparision_metric[2,"Test_instances"] <- 60

3. OneR

available on CRAN and git

build model with the OneR package

Note: There is no inbuilt feature engineering functions available on OneR. We used DriveML function to prepare the input dataset

## Read Raw data
data_path = "C:/backup/R packages/DriveML_Experiments/small_data_heart_desease"
heart_data <- fread(paste0(data_path, "/","heart.csv"), sep = ",",header = TRUE)

## Recode target variable
time = Sys.time() ## Start time
setDF(heart_data) # set as data frame


### Train and valid data
train_heart <- heart_data[train.index,]
valid_heart <- heart_data[-train.index,]

## OneR model
model_oner_large <- OneR(target_var ~., data = train_heart, verbose = TRUE)

## 
##     Attribute Accuracy
## 1 * thal      78.19%  
## 2   cp        74.07%  
## 3   ca        73.66%  
## 4   exang     71.19%  
## 5   thalach   70.78%  
## 5   slope     70.78%  
## 7   oldpeak   67.9%   
## 8   age       63.79%  
## 9   sex       61.73%  
## 10  restecg   58.02%  
## 11  trestbps  57.2%   
## 12  chol      53.91%  
## 13  fbs       53.5%   
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

prediction_oner <- predict(model_oner_large, valid_heart,type = c("prob"))
small_data_onertime <- round(difftime(Sys.time(), time, units='secs'),3)
save(model_oner_large, file = "oner_model_small_data_heart_ana.rdata")
summary(model_oner_large)

## 
## Call:
## OneR.formula(formula = target_var ~ ., data = train_heart, verbose = TRUE)
## 
## Rules:
## If thal = 0 then target_var = 0
## If thal = 1 then target_var = 0
## If thal = 2 then target_var = 1
## If thal = 3 then target_var = 0
## 
## Accuracy:
## 190 of 243 instances classified correctly (78.19%)
## 
## Contingency table:
##           thal
## target_var   0    1     2    3 Sum
##        0   * 1 * 11    25 * 76 113
##        1     1    5 * 102   22 130
##        Sum   2   16   127   98 243
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 77.882, df = 3, p-value < 2.2e-16

best_auc_oner <- round(auc(valid_heart$target_var, prediction_oner[,"1"]),2)

comparision_metric[3,"dataset_name"] <- "Small data"
comparision_metric[3,"r_package_name"] <- "OneR"
comparision_metric[3,"time_taken_min"] <- small_data_onertime
comparision_metric[3,"test_auc"] <- best_auc_oner
comparision_metric[3,"attributes"] <- 14
comparision_metric[3,"missing"] <- "No"
comparision_metric[3,"Train_instances"] <- 243
comparision_metric[3,"Test_instances"] <- 60

4. autoxgboost

available on git

Note: There is no inbuilt feature engineering functions available on autoxgboost. We used DriveML function to prepare the input dataset

devtools::install_github(“ja-thomas/autoxgboost”)

## Using DriveML autodataprep create a cleaned data set
data_path = "C:/backup/R packages/DriveML_Experiments/small_data_heart_desease"
heart_data <- fread(paste0(data_path, "/","heart.csv"), sep = ",",header = TRUE)

## Recode target variable
time = Sys.time() ## Start time
setDF(heart_data) # set as data frame


### Train and valid data
train_heart <- heart_data[train.index,]
valid_heart <- heart_data[-train.index,]

# create a classification task
train_heart$target_var <- as.factor(train_heart$target_var)

trainTask = makeClassifTask(data = train_heart, target = "target_var", positive = 1)
# create a control object for optimizer
time = Sys.time()
ctrl = makeMBOControl()
ctrl = setMBOControlTermination(ctrl, iters = 1L) 
# fit the model
auto_xgb_model = autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
tot_time = round(difftime(Sys.time(), time, units='secs'),3)

# do prediction and print confusion matrix
valid_heart_x <- valid_heart
valid_heart_x$target_var <- NULL
prediction = predict(auto_xgb_model, valid_heart_x)
prediction = getPredictionProbabilities(prediction)

myauc = round(auc(valid_heart$target_var, prediction),3)
save(auto_xgb_model, file = "axgb_model_small_data_heart_ana.rdata")

comparision_metric[4,"dataset_name"] <- "Small data"
comparision_metric[4,"r_package_name"] <- "autoxgboost"
comparision_metric[4,"time_taken_min"] <- tot_time
comparision_metric[4,"test_auc"] <- myauc
comparision_metric[4,"attributes"] <- 14
comparision_metric[4,"missing"] <- "No"
comparision_metric[4,"Train_instances"] <- 243
comparision_metric[4,"Test_instances"] <- 60

Comparision results

kableExtra::kable(comparision_metric)

r_package_name	dataset_name	attributes	missing	Train_instances	Test_instances	time_taken_min	test_auc
DriveML	Small data	14	No	243	60	6.073	0.914
H2o automl	Small data	14	No	243	60	32.066	0.906
OneR	Small data	14	No	243	60	0.045	0.68
autoxgboost	Small data	14	No	243	60	10.288	0.902

Experiment of DriveML R package - Small Data set

Dayanand, Sayan, Ravi, Kiran R

16/06/2021

Heart Disease UCI

Content

DriveML Experiment

Split sample to test the model accuaracy with other open source R package

DriveML step 1 - Missing variable treatment

DriveML step 2 - Auto Dataprep

DriveML step 3 - Model development

DriveML Results

Best ML model comparison with other R packages

1. DriveML

2. H2o AutoML

3. OneR

4. autoxgboost

Comparision results