Skip to content

Commit

Permalink
Merge pull request #126 from ModelOriented/Hubert_issues
Browse files Browse the repository at this point in the history
forester 1.6.1 - minor fixes and default options changes
  • Loading branch information
HubertR21 authored May 9, 2024
2 parents fb7833e + 1acbbd2 commit 4b8bc9b
Show file tree
Hide file tree
Showing 13 changed files with 35 additions and 34 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: forester
Type: Package
Title: Quick and Simple Tools for Training and Testing of Tree-Based Models
Version: 1.6.0
Version: 1.6.1
Authors@R:
c(person("Hubert", "Ruczyński", role = c("aut", "cre", "cph"), email = "[email protected]"),
person("Anna", "Kozak", role = c("aut", "ths"), email = "[email protected]"),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# forester 1.6.1
- Changed default feature selection method in `custom_preprocessing()` and `preprocessing_feature_selection()` to `BORUTA`, as it is the most effective one,
- Changed default imputation method for `preprocessing()` inside `train()` to `knn`, as it is the most effective one,
- Fixed an issue for VS plots, where the color was not assigned properly for the models.

# forester 1.6.0

- Updated `.Rbuildignore`, `DESCRIPTION`, and `NAMESPACE`.
Expand Down
6 changes: 3 additions & 3 deletions R/custom_preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
#' the same column names. The parameters are described below:
#' \itemize{
#' \item \code{`feature_selection_method`} A string value indicating the feature selection method.
#' The feature selection method must be one of 'VI', 'MCFS', 'MI', 'BORUTA', or 'none' if we don't
#' The feature selection method must be one of 'VI', 'MCFS', 'MI', 'BORUTA' (default), or 'none' if we don't
#' want it.
#' \item \code{`max_features`} A positive integer value describing the desired number of
#' selected features. Initial value set as 'default' which is min(10, ncol(data) - 1)
Expand Down Expand Up @@ -117,7 +117,7 @@
#' m = 5
#' ),
#' feature_selection_parameters = list(
#' feature_selection_method = 'VI',
#' feature_selection_method = 'BORUTA',
#' max_features = 'default',
#' nperm = 1,
#' cutoffPermutations = 20,
Expand Down Expand Up @@ -152,7 +152,7 @@ custom_preprocessing <- function(data,
m = 5
),
feature_selection_parameters = list(
feature_selection_method = 'none',
feature_selection_method = 'BORUTA',
max_features = 'default',
nperm = 1,
cutoffPermutations = 20,
Expand Down
2 changes: 1 addition & 1 deletion R/plot_classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ plot.binary_clf <- function(x,

score <- cbind(train_score, test_score)

p <- ggplot(score, aes(x = .data[[paste0(metric, '_train')]], y = .data[[paste0(metric, '_test')]], color = 'engine')) +
p <- ggplot(score, aes(x = .data[[paste0(metric, '_train')]], y = .data[[paste0(metric, '_test')]], color = .data[['engine']])) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
theme_forester() +
Expand Down
2 changes: 1 addition & 1 deletion R/plot_multiclass.R
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ plot.multiclass <- function(x,

score <- cbind(train_score, test_score)

p <- ggplot(score, aes(x = .data[[paste0(metric, '_train')]], y = .data[[paste0(metric, '_test')]], color = 'engine')) +
p <- ggplot(score, aes(x = .data[[paste0(metric, '_train')]], y = .data[[paste0(metric, '_test')]], color = .data[['engine']])) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
theme_forester() +
Expand Down
2 changes: 1 addition & 1 deletion R/plot_regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ plot.regression <- function(x,

score <- cbind(train_score, test_score)

p <- ggplot(score, aes(x = .data[[paste0(metric, '_train')]], y = .data[[paste0(metric, '_test')]], color = 'engine')) +
p <- ggplot(score, aes(x = .data[[paste0(metric, '_train')]], y = .data[[paste0(metric, '_test')]], color = .data[['engine']])) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
theme_forester() +
Expand Down
5 changes: 2 additions & 3 deletions R/preprocessing.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' Conduct preprocessing processes
#' Conduct basic preprocessing processes
#'
#' @param data A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
Expand Down Expand Up @@ -133,8 +133,7 @@ manage_missing <- function(df, y) {
df <- df[, -col_to_rm]
}
# Input missing values via mice algorithm.
df <- mice::mice(df, seed = 123, print = FALSE, remove_collinear = FALSE)
df <- mice::complete(df)
df <- preprocessing_imputation(df, imputation_method = 'knn', verbose = FALSE)
return(df)
}

Expand Down
16 changes: 8 additions & 8 deletions R/preprocessing_feature_selection.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#' Conducts a feature selection process with one out of four proposed methods
#' Conducts a feature selection process with one out of five proposed methods
#'
#' \itemize{
#' \item \code{`VI`} The variable importance method based on random forest - short time,
#' \item \code{`MCFS`} The Monte Carlo Feature Selection - long time,
#' \item \code{`MI`} The Varrank method based on mutual information scores - moderate time,
#' if we set too big `max_features` it can work really long,
#' \item \code{`BORUTA`} The BORUTA algorithm - short time.
#' \item \code{`VI`} The variable importance method based on random forest - long time, worst results,
#' \item \code{`MCFS`} The Monte Carlo Feature Selection - short time, reasonable results,
#' \item \code{`MI`} The Varrank method based on mutual information scores - short time,
#' if we set too big `max_features` it can work really long, bad results,
#' \item \code{`BORUTA`} The BORUTA algorithm - long time, best results.
#' }
#'
#' @param data A data source, that is one of the major R formats: data.table, data.frame,
#' matrix and so on.
#' @param y A string that indicates a target column name.
#' @param feature_selection_method A string value indicating the feature selection method.
#' The feature selection method must be one of 'VI', 'MCFS', 'MI', or 'BORUTA'.
#' The feature selection method must be one of 'VI', 'MCFS', 'MI', or 'BORUTA' (default).
#' @param max_features A positive integer value describing the desired number of
#' selected features. Initial value set as 'default' which is min(10, ncol(data) - 1)
#' for `VI` and `MI`, and NULL (number of relevant features chosen by the method)
Expand Down Expand Up @@ -43,7 +43,7 @@
#' @export
preprocessing_feature_selection <- function(data,
y,
feature_selection_method = 'VI',
feature_selection_method = 'BORUTA',
max_features = 'default',
nperm = 1,
cutoffPermutations = 20,
Expand Down
6 changes: 3 additions & 3 deletions man/custom_preprocessing.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/preprocessing.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions man/preprocessing_feature_selection.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions tests/testthat/test-13-check-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,4 @@ test_that('test-check-data', {
expect_output(check_y_balance(df, y, time, status, type, verbose))
expect_output(detect_id_columns(df, verbose))
}


})
1 change: 0 additions & 1 deletion tests/testthat/test-17-predict-new.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,5 @@ test_that('test-predict-new', {
for (j in 1:(length(predictions) - 1)) {
expect_equal(length(as.vector(predictions[[j]])), length(as.vector(predictions[[j + 1]])))
}

}
})

0 comments on commit 4b8bc9b

Please sign in to comment.