Skip to content

Commit

Permalink
Update Default max_samples_leaf Value
Browse files Browse the repository at this point in the history
- Change default value from None to 1
- Update unit tests
- Add note to documentation
  • Loading branch information
reidjohnson committed Nov 7, 2022
1 parent 5763153 commit 18f74af
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 13 deletions.
2 changes: 2 additions & 0 deletions docs/user_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Let's fit a quantile forest on a simple regression dataset::
>>> reg.fit(X_train, y_train)
RandomForestQuantileRegressor(...)

During model initialization, the parameter `max_samples_leaf` can be specified, which determines the maximum number of samples per leaf node to retain. If `max_samples_leaf` is smaller than the number of samples in a given leaf node, then a subset of values is randomly selected. By default, the model retains one randomly selected sample per leaf node (`max_samples_leaf = 1`); all samples can be retained by specifying `max_samples_leaf = None`. Note that the number of retained samples can materially impact the size of the model object.

As quantile forests can provide predictions at quantiles, they accept an optional parameter during the call to the `predict` method, which can be a float or list of floats that specify the empirical quantiles to return::

>>> y_pred = reg.predict(X_test, quantiles=[0.25, 0.5, 0.75]) # returns three columns per row
Expand Down
10 changes: 6 additions & 4 deletions quantile_forest/_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,13 +800,14 @@ class RandomForestQuantileRegressor(BaseForestQuantileRegressor):
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
max_samples_leaf : int or float, default=None
max_samples_leaf : int, float or None, default=1
The maximum number of samples permitted to be at a leaf node.
- If int, then consider `max_samples_leaf` as the maximum number.
- If float, then `max_samples_leaf` is a fraction and
`ceil(max_samples_leaf * n_samples)` are the maximum
number of samples for each node.
- If None, then an unlimited number of leaf samples is retained.
min_weight_fraction_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights (of all
Expand Down Expand Up @@ -965,7 +966,7 @@ def __init__(
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_samples_leaf=None,
max_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=1.0,
max_leaf_nodes=None,
Expand Down Expand Up @@ -1068,13 +1069,14 @@ class ExtraTreesQuantileRegressor(BaseForestQuantileRegressor):
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
max_samples_leaf : int or float, default=None
max_samples_leaf : int, float or None, default=1
The maximum number of samples permitted to be at a leaf node.
- If int, then consider `max_samples_leaf` as the maximum number.
- If float, then `max_samples_leaf` is a fraction and
`ceil(max_samples_leaf * n_samples)` are the maximum
number of samples for each node.
- If None, then an unlimited number of leaf samples is retained.
min_weight_fraction_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights (of all
Expand Down Expand Up @@ -1237,7 +1239,7 @@ def __init__(
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_samples_leaf=None,
max_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=1.0,
max_leaf_nodes=None,
Expand Down
41 changes: 32 additions & 9 deletions quantile_forest/tests/test_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ def check_regression_toy(name, weighted_quantile):

ForestRegressor = FOREST_REGRESSORS[name]

regr = ForestRegressor(n_estimators=10, bootstrap=False, random_state=1)
regr = ForestRegressor(
n_estimators=10, max_samples_leaf=None, bootstrap=False, random_state=1
)
regr.fit(X, y)

# Check model and apply outputs shape.
Expand Down Expand Up @@ -154,7 +156,9 @@ def check_predict_quantiles_toy(name):
X = [[-2, -2], [-2, -2], [-1, -1], [-1, -1], [1, 1], [1, 2]]
y = [-1, -1, 0, 1, 1, 2]

est = ForestRegressor(n_estimators=1, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=1, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

expected = [
Expand Down Expand Up @@ -425,7 +429,9 @@ def check_quantile_ranks_toy(name):
X = [[-2, -2], [-2, -2], [-1, -1], [-1, -1], [1, 1], [1, 2]]
y = [-1, -1, 0, 1, 1, 2]

est = ForestRegressor(n_estimators=1, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=1, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

expected = [0.75, 0.75, 0.5, 1., 1., 1.]
Expand All @@ -445,7 +451,9 @@ def check_quantile_ranks_toy(name):
assert_array_equal(y_ranks, expected)

# Check aggregated and unaggregated predicted ranks.
est = ForestRegressor(n_estimators=2, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=2, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

kwargs = {"aggregate_leaves_first": True}
Expand Down Expand Up @@ -534,7 +542,9 @@ def check_proximity_counts(name):
y = [-1, -1, 0, 1, 1, 2]

# Check that proximity counts match expected counts without bootstrap.
est = ForestRegressor(n_estimators=5, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=5, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

expected = [
Expand Down Expand Up @@ -581,6 +591,7 @@ def check_proximity_counts(name):
# Check that proximity counts match expected counts without splits.
est = ForestRegressor(
n_estimators=1,
max_samples_leaf=None,
min_samples_leaf=len(X),
bootstrap=False,
random_state=0,
Expand All @@ -591,7 +602,9 @@ def check_proximity_counts(name):
assert np.sum(proximity_counts) == (1 * len(X) * len(X))

# Check proximity counts on the California Housing Prices dataset.
est = ForestRegressor(n_estimators=10, bootstrap=True, random_state=0)
est = ForestRegressor(
n_estimators=10, max_samples_leaf=None, bootstrap=True, random_state=0
)
est.fit(X_california, y_california)

# Check that proximity counts match bootstrap counts.
Expand Down Expand Up @@ -1007,7 +1020,11 @@ def check_proximity_counts_oob(name):
ForestRegressor = FOREST_REGRESSORS[name]

est = ForestRegressor(
n_estimators=20, bootstrap=True, oob_score=True, random_state=0
n_estimators=20,
max_samples_leaf=None,
bootstrap=True,
oob_score=True,
random_state=0,
)
est.fit(X, y)

Expand Down Expand Up @@ -1054,7 +1071,11 @@ def check_proximity_counts_oob(name):
# Check warning if not enough estimators.
with np.errstate(divide="ignore", invalid="ignore"):
est = ForestRegressor(
n_estimators=4, bootstrap=True, oob_score=True, random_state=0
n_estimators=4,
max_samples_leaf=None,
bootstrap=True,
oob_score=True,
random_state=0,
)
with pytest.warns(UserWarning):
est.fit(X, y)
Expand All @@ -1063,7 +1084,9 @@ def check_proximity_counts_oob(name):
assert any(len(x) == 0 for x in proximities)

# Check error if no bootstrapping.
est = ForestRegressor(n_estimators=1, bootstrap=False)
est = ForestRegressor(
n_estimators=1, max_samples_leaf=None, bootstrap=False
)
est.fit(X, y)
assert_raises(ValueError, est.proximity_counts, X, oob_score=True)

Expand Down

0 comments on commit 18f74af

Please sign in to comment.