DriveML Experiment
## Folder holding the weatherAUS data set (machine-specific absolute path)
data_path <- "C:/backup/R packages/DriveML_Experiments/big_data - weatherAUS"
## file.path() joins with "/", which R accepts on every platform, instead of
## the hard-coded "\\" separator
weather_data <- fread(file.path(data_path, "weatherAUS.csv"), sep = ",", header = TRUE)
## Recode target variable: "Yes" becomes 1, every other non-missing value 0
weather_data[, RainTomorrow := ifelse(RainTomorrow == "Yes", 1, 0)]
## Rows whose target is still NA after the recode are counted as 0
weather_data[, RainTomorrow := ifelse(is.na(RainTomorrow), 0, RainTomorrow)]
## Class counts of the recoded target
table(weather_data$RainTomorrow)
##
## 0 1
## 113583 31877
Split the sample to test model accuracy against other open-source R packages
## Fix the RNG state so the partition below is reproducible
set.seed(12345)
## Row indices for the 80% training sample (list = FALSE: not wrapped in a list)
train.index <- createDataPartition(
  weather_data$RainTomorrow,
  p = 0.8,
  list = FALSE
)
DriveML step 1 - Missing variable treatment
## Run autoMAR on the full data set with the "glm" method; no strata supplied
marobj <- autoMAR(
  weather_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
## Show the auc_features element of the returned object
marobj$auc_features
DriveML step 2 - Auto Dataprep
## Imputation rule per column class: mode for factor/character columns,
## mean for integer columns, median for numeric columns
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## AutoDataprep on the full data set, reusing the MAR object and the
## imputation rules defined above
traindata <- autoDataprep(
  weather_data,
  target = "RainTomorrow",
  auto_mar = TRUE,
  mar_object = marobj,
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = TRUE,
  char_var_limit = 150,
  interaction_var = TRUE,
  frequent_var = TRUE,
  # uid = 'ID',
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... >
## autoDataprep < missing imputation.... >
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
## Prepared data and the list of selected column names from autoDataprep
master_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
## Restrict the prepared data to the selected columns
train_data <- master_data[sele_var]
## Train and validation rows, using the partition index created earlier
train_d <- train_data[train.index, ]
valid_d <- train_data[-train.index, ]
DriveML step 3 - Model development
## Fit all models ("all") on the training split with random tuning
## (10 iterations), using the validation split as the test set
mymodel_rain <- autoMLmodel(
  train = train_d,
  test = valid_d,
  target = "RainTomorrow",
  tuneIters = 10,
  tuneType = "random",
  models = "all",
  varImp = 10,
  liftGroup = 50,
  maxObs = 5000,
  # uid = 'ID',
  htmlreport = FALSE,
  pdp = TRUE,
  verbose = TRUE,
  seed = 42
)
## Keep the fitted object on disk in the working directory
save(mymodel_rain, file = "big_data_mymodel.rdata")
DriveML Results
Model results
## Per-model results table from the fitted object, rendered with kable
results <- mymodel_rain$results
kableExtra::kable(x = results)
|
Model
|
Fitting time
|
Scoring time
|
Train AUC
|
Test AUC
|
Accuracy
|
Precision
|
Recall
|
F1_score
|
5
|
xgboost
|
2.763 mins
|
2.311 secs
|
0.956
|
0.892
|
0.861
|
0.748
|
0.540
|
0.627
|
4
|
ranger
|
11.19 mins
|
1.535 mins
|
1.000
|
0.880
|
0.856
|
0.756
|
0.496
|
0.599
|
3
|
randomForest
|
27.422 mins
|
17.544 secs
|
1.000
|
0.879
|
0.855
|
0.761
|
0.487
|
0.594
|
2
|
logreg
|
27.214 secs
|
0.441 secs
|
0.869
|
0.871
|
0.849
|
0.717
|
0.506
|
0.593
|
1
|
glmnet
|
33.127 mins
|
0.607 secs
|
0.857
|
0.858
|
0.845
|
0.734
|
0.445
|
0.554
|
6
|
rpart
|
30.951 secs
|
1.052 secs
|
0.779
|
0.778
|
0.840
|
0.728
|
0.421
|
0.534
|
## Lift plot stored under modelexp
mymodel_rain$modelexp$Lift_plot

## Validation (test) ROC plot of the random forest model
mymodel_rain$trainedModels$randomForest$modelPlots$TestROC

## Validation (test) ROC plot of the XGBoost model
mymodel_rain$trainedModels$xgboost$modelPlots$TestROC

## Variable importance plot of the random forest model
mymodel_rain$trainedModels$randomForest$modelPlots$VarImp
## [[1]]
Best ML model comparison with other R packages
1. DriveML
available on CRAN and git
The best model from the DriveML results above is selected
## Start of the DriveML benchmark timer
time <- Sys.time()
## Same autoMAR call as in step 1, repeated here so it is part of the timed run
marobj <- autoMAR(
  weather_data,
  aucv = 0.9,
  strataname = NULL,
  stratasize = NULL,
  mar_method = "glm"
)
##
|
| | 0%
|
|==== | 6%
|
|======== | 11%
|
|============ | 17%
|
|================ | 22%
|
|=================== | 28%
|
|======================= | 33%
|
|=========================== | 39%
|
|=============================== | 44%
|
|=================================== | 50%
|
|======================================= | 56%
|
|=========================================== | 61%
|
|=============================================== | 67%
|
|=================================================== | 72%
|
|====================================================== | 78%
|
|========================================================== | 83%
|
|============================================================== | 89%
|
|================================================================== | 94%
|
|======================================================================| 100%
## Imputation rule per column class: mode for factor/character columns,
## mean for integer columns, median for numeric columns
myimpute <- list(
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## AutoDataprep with the MAR object computed just above
traindata <- autoDataprep(
  weather_data,
  target = "RainTomorrow",
  auto_mar = TRUE,
  mar_object = marobj,
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = TRUE,
  char_var_limit = 150,
  interaction_var = TRUE,
  frequent_var = TRUE,
  verbose = TRUE
)
## autoDataprep < MAR variable computation.... >
## autoDataprep < missing imputation.... >
## autoDataprep < Outlier treatment based on Tukey method....>
## autoDataprep < Frequent transformer....>
## autoDataprep < Interactions transformer....>
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
## Prepared data restricted to the columns selected by autoDataprep
master_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
train_data <- master_data[sele_var]
## Train and validation rows, using the existing partition index
train_d <- train_data[train.index, ]
valid_d <- train_data[-train.index, ]
## Fit only the xgboost model for the package comparison
driveml_bigdata <- autoMLmodel(
  train = train_d,
  test = valid_d,
  target = "RainTomorrow",
  models = "xgboost",
  maxObs = 5000,
  verbose = TRUE,
  seed = 42
)
## xgboost Model tuning started....
## autoMLmodel < All features xgboost tuned and trained >
## Elapsed time since `time` and the test AUC reported by autoMLmodel.
## NOTE(review): the elapsed time is measured in seconds but is stored in the
## "time_taken_min" column below -- confirm the intended unit.
dtime <- round(difftime(Sys.time(), time, units = "secs"), 3)
bestroc <- round(driveml_bigdata$results$`Test AUC`, 3)

## One row per benchmarked package; every cell starts as NA and is filled by
## column name. Mixing text and numbers makes this a character matrix.
comparision_metric <- matrix(
  data = NA,
  nrow = 4,
  ncol = 8,
  dimnames = list(
    NULL,
    c(
      "r_package_name", "dataset_name", "attributes", "missing",
      "Train_instances", "Test_instances", "time_taken_min", "test_auc"
    )
  )
)

## Row 1: DriveML
comparision_metric[1, "r_package_name"] <- "DriveML"
comparision_metric[1, "dataset_name"] <- "Big data"
comparision_metric[1, "attributes"] <- 23
comparision_metric[1, "missing"] <- "Yes"
comparision_metric[1, "Train_instances"] <- 116368
comparision_metric[1, "Test_instances"] <- 29092
comparision_metric[1, "time_taken_min"] <- dtime
comparision_metric[1, "test_auc"] <- bestroc
2. H2o AutoML
available on CRAN and git
## Work with a plain data.frame from here on
setDF(weather_data)
## Convert character data to factor for H2o automl function
weather_data[vapply(weather_data, is.character, logical(1))] <- lapply(
  weather_data[vapply(weather_data, is.character, logical(1))],
  as.factor
)
## Convert target class variable as factor
## For binary classification, response should be a factor
weather_data[["RainTomorrow"]] <- as.factor(weather_data[["RainTomorrow"]])
## Drop the Date column before modelling
weather_data[["Date"]] <- NULL
### Train and valid data
train_data <- weather_data[train.index, ]
valid_data <- weather_data[-train.index, ]
Training using h2o.automl() function
## Start of the H2O benchmark timer
time <- Sys.time()
## Connect to (or start) the H2O cluster
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 days 16 hours
## H2O cluster timezone: Asia/Kolkata
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.1.3
## H2O cluster version age: 28 days, 21 hours and 6 minutes
## H2O cluster name: H2O_started_from_R_dubrangala_zcg934
## H2O cluster total nodes: 1
## H2O cluster total memory: 5.10 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.3 (2020-10-10)
## Load the train and valid data sets into H2O: training frame
train_d <- as.h2o(train_data)
##
|
| | 0%
|
|======================================================================| 100%
## Validation rows as an H2O frame
test_d <- as.h2o(valid_data)
##
|
| | 0%
|
|======================================================================| 100%
Train Model
## AutoML run with GLM, DeepLearning, DRF and StackedEnsemble excluded.
## Per the H2O log message, with cross-validation enabled the validation frame
## only provides informative metrics; models are validated by cross-validation.
excluded_algos <- c("GLM", "DeepLearning", "DRF", "StackedEnsemble")
h2o_big_data_model <- h2o.automl(
  y = "RainTomorrow",
  training_frame = train_d,
  validation_frame = test_d,
  exclude_algos = excluded_algos
)
rm(excluded_algos)
##
|
| | 0%
## 15:07:20.310: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 15:07:20.311: AutoML: XGBoost is not available; skipping it.
|
|== | 3%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 9%
|
|======== | 11%
|
|======== | 12%
|
|=========== | 15%
|
|=========== | 16%
|
|============= | 18%
|
|=============== | 21%
|
|================= | 25%
|
|================== | 25%
|
|=================== | 27%
|
|===================== | 30%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|=========================== | 39%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|======================================================================| 100%
## Leaderboard of the AutoML run as a data.frame (kept for inspection)
lb <- as.data.frame(h2o_big_data_model@leaderboard)
## Score the leader model on the held-out frame. The leaderboard `auc` column
## is a cross-validation metric when cross-validation is enabled (see the H2O
## log message), so it is not a test-set AUC and must not be reported as one.
best_auc <- round(
  h2o.auc(h2o.performance(h2o_big_data_model@leader, newdata = test_d)),
  3
)
time_h2o <- round(difftime(Sys.time(), time, units = "secs"), 3) # Time difference of 3.209718 mins
## NOTE(review): save() serialises the R-side object only; confirm whether
## h2o.saveModel() is needed to persist the model held by the H2O cluster.
save(h2o_big_data_model, file = "h2o_model_big_data_weahteraus.rdata")

## Row 2: H2O AutoML
comparision_metric[2, "dataset_name"] <- "Big data"
comparision_metric[2, "r_package_name"] <- "H2o automl"
comparision_metric[2, "time_taken_min"] <- time_h2o
comparision_metric[2, "test_auc"] <- best_auc
comparision_metric[2, "attributes"] <- 23
comparision_metric[2, "missing"] <- "Yes"
comparision_metric[2, "Train_instances"] <- 116368
comparision_metric[2, "Test_instances"] <- 29092
3. OneR
available on CRAN and git
build model with the OneR package
## Read raw data. file.path() joins with "/", which R accepts on every
## platform, instead of the hard-coded "\\" separator.
data_path <- "C:/backup/R packages/DriveML_Experiments/big_data - weatherAUS"
weather_data <- fread(file.path(data_path, "weatherAUS.csv"), sep = ",", header = TRUE)
## Start of the OneR benchmark timer (the file read above is not timed)
time <- Sys.time()
## Recode target variable: "Yes" becomes 1, other values 0, missing counted as 0
weather_data[, RainTomorrow := ifelse(RainTomorrow == "Yes", 1, 0)]
weather_data[, RainTomorrow := ifelse(is.na(RainTomorrow), 0, RainTomorrow)]
setDF(weather_data) # set as data frame
## Impute missing values per column class; the imputed data is in cc$data
cc <- impute(
  weather_data,
  classes = list(
    factor = imputeMode(),
    integer = imputeMean(),
    numeric = imputeMedian(),
    character = imputeMode()
  )
)
## Split randomly into a training (80%) and a test (20%) set
set.seed(12345) # for reproducibility
train.index <- createDataPartition(weather_data$RainTomorrow, p = 0.8, list = FALSE)
train_oth <- cc$data[train.index, ]
valid_oth <- cc$data[-train.index, ]
## OneR model on all remaining columns as predictors
model_oner_bigdata <- OneR(RainTomorrow ~ ., data = train_oth, verbose = TRUE)
##
## Attribute Accuracy
## 1 * Humidity3pm 82.49%
## 2 Date 78.89%
## 3 Cloud3pm 78.78%
## 4 Sunshine 78.76%
## 5 Pressure3pm 78.44%
## 6 Pressure9am 78.36%
## 7 Rainfall 78.22%
## 8 WindGustSpeed 78.2%
## 9 WindSpeed3pm 78.1%
## 10 Location 78.1%
## 10 MinTemp 78.1%
## 10 MaxTemp 78.1%
## 10 Evaporation 78.1%
## 10 WindGustDir 78.1%
## 10 WindDir9am 78.1%
## 10 WindDir3pm 78.1%
## 10 WindSpeed9am 78.1%
## 10 Humidity9am 78.1%
## 10 Cloud9am 78.1%
## 10 Temp9am 78.1%
## 10 Temp3pm 78.1%
## 10 RainToday 78.1%
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
## Class probabilities for the validation rows
prediction_oner <- predict(model_oner_bigdata, valid_oth, type = "prob")
## Elapsed seconds since the OneR timer was started
big_data_onertime <- round(difftime(Sys.time(), time, units = "secs"), 3)
## Keep the fitted model on disk, then print its rules and diagnostics
save(model_oner_bigdata, file = "oner_model_big_data_weahteraus.rdata")
summary(model_oner_bigdata)
##
## Call:
## OneR.formula(formula = RainTomorrow ~ ., data = train_oth, verbose = TRUE)
##
## Rules:
## If Humidity3pm = (-0.1,20] then RainTomorrow = 0
## If Humidity3pm = (20,40] then RainTomorrow = 0
## If Humidity3pm = (40,60] then RainTomorrow = 0
## If Humidity3pm = (60,80] then RainTomorrow = 0
## If Humidity3pm = (80,100] then RainTomorrow = 1
##
## Accuracy:
## 95997 of 116368 instances classified correctly (82.49%)
##
## Contingency table:
## Humidity3pm
## RainTomorrow (-0.1,20] (20,40] (40,60] (60,80] (80,100] Sum
## 0 * 8380 * 23739 * 37542 * 18711 2508 90880
## 1 309 1819 6337 9398 * 7625 25488
## Sum 8689 25558 43879 28109 10133 116368
## ---
## Maximum in each column: '*'
##
## Pearson's Chi-squared test:
## X-squared = 25447, df = 4, p-value < 2.2e-16
## AUC on the validation rows from the predicted probability of class "1",
## rounded to 3 decimals to match the precision used for the other packages
best_auc_oner <- round(auc(valid_oth$RainTomorrow, prediction_oner[, "1"]), 3)

## Row 3: OneR
comparision_metric[3, "dataset_name"] <- "Big data"
comparision_metric[3, "r_package_name"] <- "OneR"
comparision_metric[3, "time_taken_min"] <- big_data_onertime
comparision_metric[3, "test_auc"] <- best_auc_oner
comparision_metric[3, "attributes"] <- 23
comparision_metric[3, "missing"] <- "Yes"
comparision_metric[3, "Train_instances"] <- 116368
comparision_metric[3, "Test_instances"] <- 29092
4. autoxgboost
available on git
devtools::install_github("ja-thomas/autoxgboost")
## Using DriveML autodataprep create a cleaned data set
## file.path() joins with "/", which R accepts on every platform, instead of
## the hard-coded "\\" separator
data_path <- "C:/backup/R packages/DriveML_Experiments/big_data - weatherAUS"
weather_data <- fread(file.path(data_path, "weatherAUS.csv"), sep = ",", header = TRUE)
## Start the timer for this benchmark. Without this reset the elapsed time
## computed further down would be measured from the previous benchmark's
## start. As in the OneR section, the file read above is not timed.
time <- Sys.time()
## Recode target variable: "Yes" becomes 1, other values 0, missing counted as 0
weather_data[, RainTomorrow := ifelse(RainTomorrow == "Yes", 1, 0)]
weather_data[, RainTomorrow := ifelse(is.na(RainTomorrow), 0, RainTomorrow)]
setDF(weather_data) # set as data frame
## Data prep without the MAR, outlier, interaction and frequency steps
traindata <- autoDataprep(
  weather_data,
  target = "RainTomorrow",
  missimpute = myimpute,
  dummyvar = TRUE,
  aucv = 0.002,
  corr = 0.999,
  outlier_flag = FALSE,
  char_var_limit = 150,
  interaction_var = FALSE,
  frequent_var = FALSE,
  # uid = 'ID',
  verbose = TRUE
)
## autoDataprep < missing imputation.... >
## autoDataprep < Categorical variable - one hot encoding....>
## autoDataprep < variable reduction - zero variance method.... >
## autoDataprep < variable selection - pearson correlation method.... >
## autoDataprep < variable selection - AUC method.... >
## Prepared data restricted to the columns selected by autoDataprep
master_cc_data <- traindata$master_data
sele_var <- traindata$var_list$mydata_var
cc_train_data <- master_cc_data[sele_var]
## Same seed and 80/20 partition as the other benchmarks
set.seed(12345)
train.index <- createDataPartition(weather_data$RainTomorrow, p = 0.8, list = FALSE)
train_oth <- cc_train_data[train.index, ]
valid_oth <- cc_train_data[-train.index, ]
# create a classification task; the target must be a factor and the positive
# class is named by its level label
train_oth$RainTomorrow <- as.factor(train_oth$RainTomorrow)
trainTask <- makeClassifTask(data = train_oth, target = "RainTomorrow", positive = "1")
# create a control object for optimizer, limited to a single iteration
ctrl <- makeMBOControl()
ctrl <- setMBOControlTermination(ctrl, iters = 1L)
# fit the model
auto_xgb_model <- autoxgboost(trainTask, control = ctrl, tune.threshold = TRUE)
# elapsed seconds since `time`, which must hold this benchmark's start time
tot_time <- round(difftime(Sys.time(), time, units = "secs"), 3)
# predict on the validation rows; the target is dropped by name so the call
# does not depend on it being the first column
prediction <- predict(
  auto_xgb_model,
  valid_oth[, setdiff(names(valid_oth), "RainTomorrow"), drop = FALSE]
)
prediction <- getPredictionProbabilities(prediction)
myauc <- round(auc(valid_oth$RainTomorrow, prediction), 3)
save(auto_xgb_model, file = "axgb_model_big_data_weahteraus.rdata")
## Row 4: autoxgboost
comparision_metric[4, "r_package_name"] <- "autoxgboost"
comparision_metric[4, "dataset_name"] <- "Big data"
comparision_metric[4, "attributes"] <- 23
comparision_metric[4, "missing"] <- "Yes"
comparision_metric[4, "Train_instances"] <- 116368
comparision_metric[4, "Test_instances"] <- 29092
comparision_metric[4, "time_taken_min"] <- tot_time
comparision_metric[4, "test_auc"] <- myauc
Comparison results
## Render the package comparison matrix as a table
kableExtra::kable(x = comparision_metric)
r_package_name
|
dataset_name
|
attributes
|
missing
|
Train_instances
|
Test_instances
|
time_taken_min
|
test_auc
|
DriveML
|
Big data
|
23
|
Yes
|
116368
|
29092
|
263.294
|
0.892
|
H2o automl
|
Big data
|
23
|
Yes
|
116368
|
29092
|
1176.912
|
0.894
|
OneR
|
Big data
|
23
|
Yes
|
116368
|
29092
|
1.188
|
0.77
|
autoxgboost
|
Big data
|
23
|
Yes
|
116368
|
29092
|
139.161
|
0.889
|