Skip to content

Commit

Permalink
Update Default max_samples_leaf Value
Browse files Browse the repository at this point in the history
- Change default value from None to 1
- Update unit tests
- Add note to documentation
  • Loading branch information
reidjohnson committed Nov 7, 2022
1 parent 5763153 commit 18f74af
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 13 deletions.
2 changes: 2 additions & 0 deletions docs/user_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Let's fit a quantile forest on a simple regression dataset::
>>> reg.fit(X_train, y_train)
RandomForestQuantileRegressor(...)

During model initialization, the parameter `max_samples_leaf` can be specified, which determines the maximum number of samples per leaf node to retain. If `max_samples_leaf` is smaller than the number of samples in a given leaf node, then a subset of values is randomly selected. By default, the model retains one randomly selected sample per leaf node (`max_samples_leaf = 1`); all samples can be retained by specifying `max_samples_leaf = None`. Note that the number of retained samples can materially impact the size of the model object.

As quantile forests can provide predictions at quantiles, they accept an optional parameter during the call to the `predict` method, which can be a float or list of floats that specify the empirical quantiles to return::

>>> y_pred = reg.predict(X_test, quantiles=[0.25, 0.5, 0.75]) # returns three columns per row
Expand Down
10 changes: 6 additions & 4 deletions quantile_forest/_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,13 +800,14 @@ class RandomForestQuantileRegressor(BaseForestQuantileRegressor):
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
max_samples_leaf : int or float, default=None
max_samples_leaf : int, float or None, default=1
The maximum number of samples permitted to be at a leaf node.
- If int, then consider `max_samples_leaf` as the maximum number.
- If float, then `max_samples_leaf` is a fraction and
`ceil(max_samples_leaf * n_samples)` are the maximum
number of samples for each node.
- If None, then an unlimited number of leaf samples is retained.
min_weight_fraction_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights (of all
Expand Down Expand Up @@ -965,7 +966,7 @@ def __init__(
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_samples_leaf=None,
max_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=1.0,
max_leaf_nodes=None,
Expand Down Expand Up @@ -1068,13 +1069,14 @@ class ExtraTreesQuantileRegressor(BaseForestQuantileRegressor):
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
max_samples_leaf : int or float, default=None
max_samples_leaf : int, float or None, default=1
The maximum number of samples permitted to be at a leaf node.
- If int, then consider `max_samples_leaf` as the maximum number.
- If float, then `max_samples_leaf` is a fraction and
`ceil(max_samples_leaf * n_samples)` are the maximum
number of samples for each node.
- If None, then an unlimited number of leaf samples is retained.
min_weight_fraction_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights (of all
Expand Down Expand Up @@ -1237,7 +1239,7 @@ def __init__(
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_samples_leaf=None,
max_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=1.0,
max_leaf_nodes=None,
Expand Down
41 changes: 32 additions & 9 deletions quantile_forest/tests/test_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ def check_regression_toy(name, weighted_quantile):

ForestRegressor = FOREST_REGRESSORS[name]

regr = ForestRegressor(n_estimators=10, bootstrap=False, random_state=1)
regr = ForestRegressor(
n_estimators=10, max_samples_leaf=None, bootstrap=False, random_state=1
)
regr.fit(X, y)

# Check model and apply outputs shape.
Expand Down Expand Up @@ -154,7 +156,9 @@ def check_predict_quantiles_toy(name):
X = [[-2, -2], [-2, -2], [-1, -1], [-1, -1], [1, 1], [1, 2]]
y = [-1, -1, 0, 1, 1, 2]

est = ForestRegressor(n_estimators=1, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=1, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

expected = [
Expand Down Expand Up @@ -425,7 +429,9 @@ def check_quantile_ranks_toy(name):
X = [[-2, -2], [-2, -2], [-1, -1], [-1, -1], [1, 1], [1, 2]]
y = [-1, -1, 0, 1, 1, 2]

est = ForestRegressor(n_estimators=1, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=1, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

expected = [0.75, 0.75, 0.5, 1., 1., 1.]
Expand All @@ -445,7 +451,9 @@ def check_quantile_ranks_toy(name):
assert_array_equal(y_ranks, expected)

# Check aggregated and unaggregated predicted ranks.
est = ForestRegressor(n_estimators=2, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=2, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

kwargs = {"aggregate_leaves_first": True}
Expand Down Expand Up @@ -534,7 +542,9 @@ def check_proximity_counts(name):
y = [-1, -1, 0, 1, 1, 2]

# Check that proximity counts match expected counts without bootstrap.
est = ForestRegressor(n_estimators=5, bootstrap=False, random_state=0)
est = ForestRegressor(
n_estimators=5, max_samples_leaf=None, bootstrap=False, random_state=0
)
est.fit(X, y)

expected = [
Expand Down Expand Up @@ -581,6 +591,7 @@ def check_proximity_counts(name):
# Check that proximity counts match expected counts without splits.
est = ForestRegressor(
n_estimators=1,
max_samples_leaf=None,
min_samples_leaf=len(X),
bootstrap=False,
random_state=0,
Expand All @@ -591,7 +602,9 @@ def check_proximity_counts(name):
assert np.sum(proximity_counts) == (1 * len(X) * len(X))

# Check proximity counts on the California Housing Prices dataset.
est = ForestRegressor(n_estimators=10, bootstrap=True, random_state=0)
est = ForestRegressor(
n_estimators=10, max_samples_leaf=None, bootstrap=True, random_state=0
)
est.fit(X_california, y_california)

# Check that proximity counts match bootstrap counts.
Expand Down Expand Up @@ -1007,7 +1020,11 @@ def check_proximity_counts_oob(name):
ForestRegressor = FOREST_REGRESSORS[name]

est = ForestRegressor(
n_estimators=20, bootstrap=True, oob_score=True, random_state=0
n_estimators=20,
max_samples_leaf=None,
bootstrap=True,
oob_score=True,
random_state=0,
)
est.fit(X, y)

Expand Down Expand Up @@ -1054,7 +1071,11 @@ def check_proximity_counts_oob(name):
# Check warning if not enough estimators.
with np.errstate(divide="ignore", invalid="ignore"):
est = ForestRegressor(
n_estimators=4, bootstrap=True, oob_score=True, random_state=0
n_estimators=4,
max_samples_leaf=None,
bootstrap=True,
oob_score=True,
random_state=0,
)
with pytest.warns(UserWarning):
est.fit(X, y)
Expand All @@ -1063,7 +1084,9 @@ def check_proximity_counts_oob(name):
assert any(len(x) == 0 for x in proximities)

# Check error if no bootstrapping.
est = ForestRegressor(n_estimators=1, bootstrap=False)
est = ForestRegressor(
n_estimators=1, max_samples_leaf=None, bootstrap=False
)
est.fit(X, y)
assert_raises(ValueError, est.proximity_counts, X, oob_score=True)

Expand Down

0 comments on commit 18f74af

Please sign in to comment.