From 6f81ee7ec1831350328b80c273843c674c94efeb Mon Sep 17 00:00:00 2001 From: Reid Johnson Date: Tue, 6 Aug 2024 18:19:47 -0700 Subject: [PATCH] Update example plots --- examples/plot_huggingface_model.py | 18 ++++---- examples/plot_predict_custom.py | 12 +++-- examples/plot_proximity_counts.py | 16 ++++--- examples/plot_quantile_conformalized.py | 36 +++++++-------- examples/plot_quantile_example.py | 11 ++--- examples/plot_quantile_extrapolation.py | 58 +++++++++++++++---------- examples/plot_quantile_interpolation.py | 13 +++--- examples/plot_quantile_intervals.py | 20 +++++---- examples/plot_quantile_multioutput.py | 20 ++++++--- examples/plot_quantile_ranks.py | 9 ++-- examples/plot_quantile_vs_standard.py | 33 ++++++++------ examples/plot_treeshap_example.py | 9 ++-- 12 files changed, 149 insertions(+), 106 deletions(-) diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py index c263159..f2a546c 100755 --- a/examples/plot_huggingface_model.py +++ b/examples/plot_huggingface_model.py @@ -27,15 +27,17 @@ alt.data_transformers.disable_max_rows() +random_seed = 0 + token = "" repo_id = "quantile-forest/california-housing-example" load_existing = True -quantiles = np.arange(0, 1.25, 0.25).round(2).tolist() +quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist() sample_frac = 1 -def fit_and_upload_model(token, repo_id, local_dir="./local_repo"): +def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None): """Function used to fit the model and upload it to Hugging Face Hub.""" from pathlib import Path @@ -49,10 +51,10 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"): from skops import card X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) # Fit the model. - qrf = RandomForestQuantileRegressor(random_state=0).fit(X_train, y_train) + qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train) # Save the model to a file. model_filename = "model.pkl" @@ -145,7 +147,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"): if not load_existing: - fit_and_upload_model(token, repo_id) + fit_and_upload_model(token, repo_id, random_state=random_seed) # Download the repository locally. 
local_dir = "./local_repo" @@ -166,7 +168,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"): df = ( pd.DataFrame(y_pred, columns=quantiles) .reset_index() - .sample(frac=sample_frac, random_state=0) + .sample(frac=sample_frac, random_state=random_seed) .melt(id_vars=["index"], var_name="quantile", value_name="value") .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right") ) @@ -178,7 +180,7 @@ def plot_quantiles_by_latlon(df, quantiles): min=0, max=1, step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1), - name="Quantile: ", + name="Predicted Quantile: ", ) q_val = alt.selection_point( @@ -217,7 +219,7 @@ def plot_quantiles_by_latlon(df, quantiles): .properties( height=650, width=650, - title="Quantile Estimates on the California Housing Dataset", + title="Quantile Predictions on the California Housing Dataset", ) ) return chart diff --git a/examples/plot_predict_custom.py b/examples/plot_predict_custom.py index e1c69a7..526e5a6 100755 --- a/examples/plot_predict_custom.py +++ b/examples/plot_predict_custom.py @@ -19,10 +19,12 @@ import scipy as sp from sklearn import datasets from sklearn.model_selection import train_test_split +from sklearn.utils.validation import check_random_state from quantile_forest import RandomForestQuantileRegressor -np.random.seed(0) +random_seed = 0 +rng = check_random_state(random_seed) n_test_samples = 100 @@ -68,12 +70,14 @@ def predict(reg, X, quantiles=0.5, what=None): X, y = datasets.load_diabetes(return_X_y=True) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test_samples, random_state=random_seed +) -reg = RandomForestQuantileRegressor().fit(X_train, y_train) +reg = RandomForestQuantileRegressor(random_state=random_seed).fit(X_train, y_train) # Define a user-specified function; here we randomly sample 1000 values with replacement. -func = lambda x: np.random.choice(x, size=1000) +func = lambda x: rng.choice(x, size=1000) # Output array with the user-specified function applied to each sample's empirical distribution. y_out = predict(reg, X_test, what=func) diff --git a/examples/plot_proximity_counts.py b/examples/plot_proximity_counts.py index ba2fd06..b28c4dd 100644 --- a/examples/plot_proximity_counts.py +++ b/examples/plot_proximity_counts.py @@ -25,7 +25,8 @@ from quantile_forest import RandomForestQuantileRegressor -rng = check_random_state(0) +random_seed = 0 +rng = check_random_state(random_seed) n_test_samples = 25 noise_std = 0.1 @@ -33,13 +34,17 @@ # Load the Digits dataset. X, y = datasets.load_digits(return_X_y=True, as_frame=True) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test_samples, random_state=random_seed +) def add_gaussian_noise(X, mean=0, std=0.1, random_state=None): """Add Gaussian noise to input data.""" if random_state is None: rng = check_random_state(0) + elif isinstance(random_state, int): + rng = check_random_state(random_state) else: rng = random_state @@ -73,13 +78,13 @@ def extract_floats(combined_df, scale=100): # Randomly add noise to the training and test data. 
-X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=rng) -X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=rng) +X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed) +X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed) # We set `max_samples_leaf=None` to ensure that every sample in the training # data is stored in the leaf nodes. By doing this, we allow the model to # consider all samples as potential candidates for proximity calculations. -qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=0) +qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_seed) qrf.fit(X_train_noisy, X_train) # Get the proximity counts. @@ -177,6 +182,7 @@ def plot_digits_proximities( color=alt.Color("value_clean:Q", legend=None, scale=alt.Scale(scheme="greys")), opacity=alt.condition(alt.datum["value_clean"] == 0, alt.value(0), alt.value(0.67)), tooltip=[ + alt.Tooltip("prox_idx", title="Proximity Index"), alt.Tooltip("prox_cnt", title="Proximity Count"), alt.Tooltip("target:Q", title="Digit"), ], diff --git a/examples/plot_quantile_conformalized.py b/examples/plot_quantile_conformalized.py index 60d03e7..5bc2f1f 100755 --- a/examples/plot_quantile_conformalized.py +++ b/examples/plot_quantile_conformalized.py @@ -25,31 +25,31 @@ alt.data_transformers.disable_max_rows() +random_seed = 0 +rng = check_random_state(random_seed) + +n_samples = 1000 +coverages = np.linspace(0, 1, num=11, endpoint=True).round(1).tolist() # the "coverage level" + strategies = { "qrf": "Quantile Regression Forest (QRF)", "cqr": "Conformalized Quantile Regression (CQR)", } -random_state = 0 -rng = check_random_state(random_state) - -coverages = np.arange(0, 1.1, 0.1).round(1).tolist() # the "coverage level" - # Load the California Housing Prices dataset. -california = datasets.fetch_california_housing() -n_samples = min(california.target.size, 1000) -perm = rng.permutation(n_samples) -X = california.data[perm] -y = california.target[perm] +X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) +perm = rng.permutation(min(len(X), n_samples)) +X = X.iloc[perm] +y = y.iloc[perm] -X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed) def sort_y_values(y_test, y_pred, y_pis): """Sort the target values and predictions.""" indices = np.argsort(y_test) return { - "y_test": y_test[indices], + "y_test": np.asarray(y_test)[indices], "y_pred": y_pred[indices], "y_pred_low": y_pis[:, 0][indices], "y_pred_upp": y_pis[:, 1][indices], @@ -68,10 +68,10 @@ def mean_width_score(y_pred_low, y_pred_upp): return float(mean_width) -def qrf_strategy(alpha, X_train, X_test, y_train, y_test): +def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None): quantiles = [alpha / 2, 1 - alpha / 2] - qrf = RandomForestQuantileRegressor(random_state=0) + qrf = RandomForestQuantileRegressor(random_state=random_state) qrf.fit(X_train, y_train) # Calculate the lower and upper quantile values on the test data. @@ -88,15 +88,15 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test): return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="qrf") -def cqr_strategy(alpha, X_train, X_test, y_train, y_test): +def cqr_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None): quantiles = [alpha / 2, 1 - alpha / 2] # Create calibration set. 
X_train, X_calib, y_train, y_calib = train_test_split( - X_train, y_train, test_size=0.5, random_state=0 + X_train, y_train, test_size=0.5, random_state=random_state ) - qrf = RandomForestQuantileRegressor(random_state=0) + qrf = RandomForestQuantileRegressor(random_state=random_state) qrf.fit(X_train, y_train) # Calculate the lower and upper quantile values on the test data. @@ -134,7 +134,7 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test): dfs = [] for cov_frac in coverages: alpha = float(round(1 - cov_frac, 2)) - args = (alpha, X_train, X_test, y_train, y_test) + args = (alpha, X_train, X_test, y_train, y_test, random_seed) dfs.append(pd.concat([qrf_strategy(*args), cqr_strategy(*args)]).assign(alpha=alpha)) df = pd.concat(dfs) diff --git a/examples/plot_quantile_example.py b/examples/plot_quantile_example.py index 6a5a96b..a3a1008 100755 --- a/examples/plot_quantile_example.py +++ b/examples/plot_quantile_example.py @@ -14,6 +14,7 @@ from quantile_forest import RandomForestQuantileRegressor +random_seed = 0 n_samples = 1000 bounds = [0, 10] quantiles = [0.025, 0.5, 0.975] @@ -33,12 +34,12 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0): # Create noisy data for modeling and non-noisy function data for illustration. -X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0) -X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=0) +X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=random_seed) +X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=random_seed) -X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed) -qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=0) +qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_seed) qrf.fit(X_train, y_train) y_pred_func = qrf.predict(X_func, quantiles=quantiles) @@ -133,7 +134,7 @@ def plot_fit_and_intervals(df): chart = ( (area_pred + points + line_true + line_pred + blank) .resolve_scale(color="independent") - .properties(height=400, width=650) + .properties(height=400, width=650, title="QRF Predictions vs. 
Ground Truth on Toy Dataset") ) return chart diff --git a/examples/plot_quantile_extrapolation.py b/examples/plot_quantile_extrapolation.py index 6f86e0f..ff3c430 100755 --- a/examples/plot_quantile_extrapolation.py +++ b/examples/plot_quantile_extrapolation.py @@ -19,10 +19,12 @@ import altair as alt import numpy as np import pandas as pd +from sklearn.utils.validation import check_random_state from quantile_forest import RandomForestQuantileRegressor -np.random.seed(0) +random_seed = 0 +rng = check_random_state(random_seed) n_samples = 500 bounds = [0, 15] @@ -31,14 +33,15 @@ func_str = "f(x) = x sin(x)" quantiles = [0.025, 0.975, 0.5] -qrf_params = {"max_samples_leaf": None, "min_samples_leaf": 4, "random_state": 0} +qrf_params = {"max_samples_leaf": None, "min_samples_leaf": 4, "random_state": random_seed} -def make_func_Xy(func, bounds, n_samples): +def make_func_Xy(func, bounds, n_samples, random_seed=0): + rng = np.random.RandomState(random_seed) x = np.linspace(bounds[0], bounds[1], n_samples) f = func(x) std = 0.01 + np.abs(x - 5.0) / 5.0 - noise = np.random.normal(scale=std) + noise = rng.normal(scale=std) y = f + noise return np.atleast_2d(x).T, y @@ -103,7 +106,7 @@ def _penalized_locpol(fval, v, X, weights, degree, pen=0, penalize_intercept=Fal return deriv_mat @staticmethod - def _get_tree_weight_matrix(X, Y, X_eval=None, n_trees=100, **kwargs): + def _get_tree_weight_matrix(X, Y, X_eval=None, n_trees=100, rng=None, **kwargs): """Fit forest and extract weights. This implementation extracts the weight matrix from a list of quantile @@ -118,6 +121,8 @@ def _get_tree_weight_matrix(X, Y, X_eval=None, n_trees=100, **kwargs): del kwargs["random_state"] kwargs["bootstrap"] = False + rng = np.random.RandomState(0) if rng is None else rng + trees = [RandomForestQuantileRegressor(random_state=i, **kwargs) for i in range(n_trees)] n = X.shape[0] @@ -132,7 +137,7 @@ def _get_tree_weight_matrix(X, Y, X_eval=None, n_trees=100, **kwargs): for tree in trees: # Draw bootstrap sample. - boot_sample = np.random.choice(np.arange(n), bn, replace=False) + boot_sample = rng.choice(np.arange(n), bn, replace=False) split1 = boot_sample[: int(bn / 2)] split2 = np.concatenate([boot_sample[int(bn / 2) :], np.arange(nn) + n]) @@ -155,7 +160,7 @@ def _get_tree_weight_matrix(X, Y, X_eval=None, n_trees=100, **kwargs): return weight_mat - def fit_weights(self, X, fval, x0=None, train=False, **kwargs): + def fit_weights(self, X, fval, x0=None, train=False, rng=None, **kwargs): """Compute random forest weights for derivative estimation.""" n, d = X.shape fval = fval.flatten() @@ -167,19 +172,21 @@ def fit_weights(self, X, fval, x0=None, train=False, **kwargs): for jj, var in enumerate(xtra_features): var_order = list(range(d)) var_order = np.array([var] + var_order[:var] + var_order[var + 1 :]) - weights[jj] = self._get_tree_weight_matrix(X[:, var_order], fval, x0, **kwargs) + weights[jj] = self._get_tree_weight_matrix( + X[:, var_order], fval, x0, rng=rng, **kwargs + ) else: - weights = self._get_tree_weight_matrix(X, fval, x0, **kwargs)[n:, :n] + weights = self._get_tree_weight_matrix(X, fval, x0, rng=rng, **kwargs)[n:, :n] return weights - def fit_derivatives(self, X, fval, pen=0.1, **kwargs): + def fit_derivatives(self, X, fval, pen=0.1, rng=None, **kwargs): """Estimate derivatives.""" n, d = X.shape fval = fval.flatten() # Fit weights for local polynomial. 
- weights = self.fit_weights(X, fval, train=True, **kwargs) + weights = self.fit_weights(X, fval, train=True, rng=rng, **kwargs) # Estimate derivatives with local polynomial. derivatives = np.zeros((self.max_order_ + 1, n, d)) @@ -203,7 +210,7 @@ def fit_derivatives(self, X, fval, pen=0.1, **kwargs): return derivatives - def prediction_bounds(self, X, fval, x0, nn=50, **kwargs): + def prediction_bounds(self, X, fval, x0, nn=50, rng=None, **kwargs): """Compute extrapolation bounds.""" n, d = X.shape fval = fval.flatten() @@ -213,7 +220,7 @@ def prediction_bounds(self, X, fval, x0, nn=50, **kwargs): xtra_features = list(range(d)) # Fit derivatives. - derivatives = self.fit_derivatives(X, fval, **kwargs) + derivatives = self.fit_derivatives(X, fval, rng=rng, **kwargs) # Determine weighting for extrapolation points (using rotation). mu = derivatives[1].mean(axis=0) @@ -293,7 +300,7 @@ def prediction_bounds(self, X, fval, x0, nn=50, **kwargs): return bounds -def train_test_split(train_indices, **kwargs): +def train_test_split(train_indices, rng=None, **kwargs): """Fit model on training samples and extrapolate on test samples.""" X_train = X[train_indices, :] y_train = y[train_indices] @@ -308,7 +315,9 @@ def train_test_split(train_indices, **kwargs): for i in range(len(quantiles)): # Run Xtrapolation on quantile. xtra = Xtrapolation() - bounds_list[i] = xtra.prediction_bounds(X_train, qmat[train_indices, i], X, **kwargs) + bounds_list[i] = xtra.prediction_bounds( + X_train, qmat[train_indices, i], X, rng=rng, **kwargs + ) return { "train_indices": train_indices, @@ -331,33 +340,34 @@ def prob_randomized_pi(qmat, y, coverage): return prob_si -def randomized_pi(qmat, prob_si, y): +def randomized_pi(qmat, prob_si, y, random_state=None): """Calculate coverage.""" - si_index = np.random.choice([False, True], len(y), replace=True, p=[prob_si, 1 - prob_si]) + rng = np.random.RandomState(0) if random_state is None else random_state + si_index = rng.choice([False, True], len(y), replace=True, p=[prob_si, 1 - prob_si]) included = (qmat[:, 0] < y) & (y < qmat[:, 1]) boundary = (qmat[:, 0] == y) | (qmat[:, 1] == y) return included | (boundary & si_index) -def get_coverage_qrf(qmat, train_indices, test_indices, y_train, level): +def get_coverage_qrf(qmat, train_indices, test_indices, y_train, level, *args): """Calculate extrapolation coverage for regular quantile forest.""" prob_si = prob_randomized_pi(qmat[train_indices, :], y_train, level) - qrf = randomized_pi(qmat, prob_si, y) + qrf = randomized_pi(qmat, prob_si, y, *args) return np.mean(qrf[test_indices]) -def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level): +def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *args): """Calculate extrapolation coverage for Xtrapolation.""" bb_low = np.max(bounds_list[0][:, :, 0], axis=1) bb_upp = np.min(bounds_list[1][:, :, 1], axis=1) bb_low_train, bb_upp_train = bb_low[train_indices], bb_upp[train_indices] prob_si = prob_randomized_pi(np.c_[bb_low_train, bb_upp_train], y_train, level) - xtra = randomized_pi(np.c_[bb_low, bb_upp], prob_si, y) + xtra = randomized_pi(np.c_[bb_low, bb_upp], prob_si, y, *args) return np.mean(xtra[test_indices]) # Create the full dataset. -X, y = make_func_Xy(func, bounds, n_samples) +X, y = make_func_Xy(func, bounds, n_samples, random_seed=random_seed) # Fit and extrapolate based on train-test split (depending on X). 
extrap_min_idx = int(n_samples * (extrap_frac / 2)) @@ -365,10 +375,10 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level): sort_X = np.argsort(X.squeeze()) train_indices = np.repeat(False, len(y)) train_indices[sort_X[extrap_min_idx] : sort_X[extrap_max_idx]] = True -res = train_test_split(train_indices, **qrf_params) +res = train_test_split(train_indices, rng=rng, **qrf_params) # Get coverages on extrapolated samples. -args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0]) +args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], rng) cov_qrf = get_coverage_qrf(res["qmat"], *args) cov_xtr = get_coverage_xtr(res["bounds_list"], *args) diff --git a/examples/plot_quantile_interpolation.py b/examples/plot_quantile_interpolation.py index 5a940cf..37cf5f5 100755 --- a/examples/plot_quantile_interpolation.py +++ b/examples/plot_quantile_interpolation.py @@ -16,7 +16,8 @@ from quantile_forest import RandomForestQuantileRegressor -intervals = np.arange(0, 1.01, 0.01).round(2).tolist() +random_seed = 0 +intervals = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist() # Create toy dataset. X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) @@ -26,7 +27,7 @@ n_estimators=1, max_samples_leaf=None, bootstrap=False, - random_state=0, + random_state=random_seed, ) qrf.fit(X, y) @@ -134,9 +135,9 @@ def plot_interpolations(df, legend): (area + point) .add_params(interval_selection, click) .transform_filter( - "(datum.quantile_low == round((0.5 - interval / 2) * 1000) / 1000)" + "(datum.method == 'Actual')" + "| (datum.quantile_low == round((0.5 - interval / 2) * 1000) / 1000)" "| (datum.quantile_upp == round((0.5 + interval / 2) * 1000) / 1000)" - "| (datum.method == 'Actual')" ) .properties(height=400) .facet( @@ -144,11 +145,13 @@ def plot_interpolations(df, legend): "X:N", header=alt.Header(labelOrient="bottom", titleOrient="bottom"), title="Samples (Feature Values)", - ) + ), + title="QRF Prediction Intervals by Quantile Interpolation on Toy Dataset", ) .configure_facet(spacing=15) .configure_range(category=alt.RangeScheme(list(legend.values()))) .configure_scale(bandPaddingInner=0.9) + .configure_title(anchor="middle") .configure_view(stroke=None) ) diff --git a/examples/plot_quantile_intervals.py b/examples/plot_quantile_intervals.py index e85ad59..72412a4 100755 --- a/examples/plot_quantile_intervals.py +++ b/examples/plot_quantile_intervals.py @@ -17,16 +17,18 @@ from quantile_forest import RandomForestQuantileRegressor -rng = check_random_state(0) +random_seed = 0 +rng = check_random_state(random_seed) + +n_samples = 1000 # Load the California Housing Prices dataset. -california = datasets.fetch_california_housing() -n_samples = min(california.target.size, 1000) -perm = rng.permutation(n_samples) -X = california.data[perm] -y = california.target[perm] +X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) +perm = rng.permutation(min(len(X), n_samples)) +X = X.iloc[perm] +y = y.iloc[perm] -qrf = RandomForestQuantileRegressor(n_estimators=100, random_state=0) +qrf = RandomForestQuantileRegressor(random_state=random_seed) kf = KFold(n_splits=5) kf.get_n_splits(X) @@ -34,8 +36,8 @@ # Using k-fold cross-validation, get predictions for all samples. 
data = {"y_true": [], "y_pred": [], "y_pred_low": [], "y_pred_upp": []} for train_index, test_index in kf.split(X): - X_train, y_train = X[train_index], y[train_index] - X_test, y_test = X[test_index], y[test_index] + X_train, y_train = X.iloc[train_index], y.iloc[train_index] + X_test, y_test = X.iloc[test_index], y.iloc[test_index] qrf.set_params(max_features=X_train.shape[1] // 3) qrf.fit(X_train, y_train) diff --git a/examples/plot_quantile_multioutput.py b/examples/plot_quantile_multioutput.py index a632ded..f763e74 100755 --- a/examples/plot_quantile_multioutput.py +++ b/examples/plot_quantile_multioutput.py @@ -14,24 +14,26 @@ import numpy as np import pandas as pd from sklearn.model_selection import train_test_split +from sklearn.utils.validation import check_random_state from quantile_forest import RandomForestQuantileRegressor -np.random.seed(0) +random_seed = 0 +rng = check_random_state(random_seed) n_samples = 2500 bounds = [0, 100] -quantiles = np.arange(0, 1.025, 0.025).round(3).tolist() +quantiles = np.linspace(0, 1, num=41, endpoint=True).round(3).tolist() # Define functions that generate targets; each function maps to one target. funcs = [ { "signal": lambda x: np.log1p(x + 1), - "noise": lambda x: np.log1p(x) * np.random.uniform(size=len(x)), + "noise": lambda x: np.log1p(x) * rng.uniform(size=len(x)), }, { "signal": lambda x: np.log1p(np.sqrt(x)), - "noise": lambda x: np.log1p(x / 2) * np.random.uniform(size=len(x)), + "noise": lambda x: np.log1p(x / 2) * rng.uniform(size=len(x)), }, ] @@ -57,9 +59,9 @@ def format_frac(fraction): # Create the dataset with multiple target variables. X, y = make_func_Xy(funcs, bounds, n_samples) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed) -qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=0) +qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=random_seed) qrf.fit(X_train, y_train) # fit on all of the targets simultaneously # Get multiple-output predictions at many quantiles. 
@@ -153,7 +155,11 @@ def plot_multioutputs(df, legend): (points + area + line) .add_params(interval_selection, click) .configure_range(category=alt.RangeScheme(list(legend.values()))) - .properties(height=400, width=650, title="Multi-target Prediction Intervals") + .properties( + height=400, + width=650, + title="Multi-target Predictions and Prediction Intervals on Toy Dataset", + ) ) return chart diff --git a/examples/plot_quantile_ranks.py b/examples/plot_quantile_ranks.py index b950c7d..4eb5452 100644 --- a/examples/plot_quantile_ranks.py +++ b/examples/plot_quantile_ranks.py @@ -16,6 +16,7 @@ from quantile_forest import RandomForestQuantileRegressor +random_seed = 0 n_samples = 5000 bounds = [0, 10] @@ -28,9 +29,9 @@ def make_toy_dataset(n_samples, bounds, random_seed=0): return X, y -X, y = make_toy_dataset(n_samples, bounds, random_seed=0) +X, y = make_toy_dataset(n_samples, bounds, random_seed=random_seed) -params = {"max_samples_leaf": None, "min_samples_leaf": 50, "random_state": 0} +params = {"max_samples_leaf": None, "min_samples_leaf": 50, "random_state": random_seed} qrf = RandomForestQuantileRegressor(**params).fit(X, y) y_pred = qrf.predict(X, quantiles=0.5) @@ -94,7 +95,9 @@ def plot_fit_and_ranks(df): ) chart = (dummy_legend + points + line_pred).properties( - height=400, width=650, title="QRF Predictions with Quantile Rank Thresholding" + height=400, + width=650, + title="QRF Predictions with Quantile Rank Thresholding on Toy Dataset", ) return chart diff --git a/examples/plot_quantile_vs_standard.py b/examples/plot_quantile_vs_standard.py index 224b9b2..b63d69d 100755 --- a/examples/plot_quantile_vs_standard.py +++ b/examples/plot_quantile_vs_standard.py @@ -2,13 +2,13 @@ Quantile Regression Forests vs. Random Forests ============================================== -This example compares the estimates generated by a quantile regression forest -(QRF) and a standard random forest regressor on a synthetic, right-skewed -dataset. In a right-skewed distribution, the mean is to the right of the -median. As illustrated by a greater overlap in the frequencies of the actual -and predicted values, the median (quantile = 0.5) estimated by a quantile -regressor can be a more reliable estimator of a skewed distribution than the -mean. +This example compares the predictions generated by a quantile regression +forest (QRF) and a standard random forest regressor on a synthetic +right-skewed dataset. In a right-skewed distribution, the mean is to the right +of the median. As illustrated by a greater overlap in the frequencies of the +actual and predicted values, the median (quantile = 0.5) predicted by a +quantile regressor can be a more reliable estimator of a skewed distribution +than the mean. """ import altair as alt @@ -23,9 +23,10 @@ alt.data_transformers.disable_max_rows() -rng = check_random_state(0) +random_seed = 0 +rng = check_random_state(random_seed) -quantiles = np.arange(0, 1.05, 0.05).round(2).tolist() +quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist() # Create right-skewed dataset. 
n_samples = 5000 @@ -35,10 +36,10 @@ y = skewnorm_rv.rvs(n_samples) X = rng.randn(n_samples, 2) * y.reshape(-1, 1) -regr_rf = RandomForestRegressor(n_estimators=10, random_state=0) -regr_qrf = RandomForestQuantileRegressor(n_estimators=10, random_state=0) +regr_rf = RandomForestRegressor(n_estimators=10, random_state=random_seed) +regr_qrf = RandomForestQuantileRegressor(n_estimators=10, random_state=random_seed) -X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed) regr_rf.fit(X_train, y_train) regr_qrf.fit(X_train, y_train) @@ -69,7 +70,7 @@ def plot_prediction_histograms(df, legend): min=0, max=1, step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1), - name="Quantile: ", + name="Predicted Quantile: ", ) q_val = alt.selection_point( @@ -114,7 +115,11 @@ def plot_prediction_histograms(df, legend): ], ) .configure_range(category=alt.RangeScheme(list(legend.values()))) - .properties(height=400, width=650) + .properties( + height=400, + width=650, + title="Distribution of RF vs. QRF Predictions on Right-Skewed Distribution", + ) ) return chart diff --git a/examples/plot_treeshap_example.py b/examples/plot_treeshap_example.py index 271ad72..a7dc33c 100644 --- a/examples/plot_treeshap_example.py +++ b/examples/plot_treeshap_example.py @@ -22,9 +22,10 @@ from quantile_forest import RandomForestQuantileRegressor +random_seed = 0 n_samples = 500 test_idx = 0 -quantiles = np.arange(0, 1.1, 0.1).round(1).tolist() +quantiles = np.linspace(0, 1, num=11, endpoint=True).round(1).tolist() def get_shap_values(qrf, X, quantile=0.5, **kwargs): @@ -69,9 +70,9 @@ def get_shap_value_by_index(shap_values, index): X = X.iloc[:n_samples] y = y[:n_samples] y *= 100_000 # convert to dollars -X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed) -qrf = RandomForestQuantileRegressor(random_state=0) +qrf = RandomForestQuantileRegressor(random_state=random_seed) qrf.fit(X_train, y_train) df = pd.concat( @@ -103,7 +104,7 @@ def plot_shap_waterfall_with_quantiles(df, height=300): min=0, max=1, step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1), - name="Quantile: ", + name="Predicted Quantile: ", ) q_val = alt.selection_point(