diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9d35e3f97f..b50414b08e 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -7,6 +7,11 @@ FROM ${BASE} as pip-base ENV DEFAULT_VIRTUAL_ENV=rapids +RUN apt update -y \ + && DEBIAN_FRONTEND=noninteractive apt install -y \ + libblas-dev liblapack-dev \ + && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*; + FROM ${BASE} as conda-base ENV DEFAULT_CONDA_ENV=rapids diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index c2084dfec3..822b27f3fe 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 2f26104aae..d2bf9e6dc9 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,25 +5,26 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": { - "version": "1.14.1" - }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.4": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index dc52d6cf6a..9a0fa0e594 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 0e01da40d4..4cd630f1c2 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -5,25 +5,26 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ucx1.15.0-openmpi-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": { - "version": "1.14.1" - }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.4": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { "version": "12.2", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 83323153f1..d42fda9063 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -60,7 +60,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cuml: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,12 +79,12 @@ jobs: # the CMake variables in get_cumlprims_mg.cmake since CMake will just use # the clone as is. 
extra-repo: rapidsai/cumlprims_mg - extra-repo-sha: branch-24.04 + extra-repo-sha: branch-24.06 extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY wheel-publish-cuml: needs: wheel-build-cuml secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 00cfa4c7a8..018387ec92 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -16,6 +16,7 @@ jobs: - clang-tidy - conda-cpp-build - conda-cpp-tests + - conda-cpp-checks - conda-python-build - conda-python-tests-singlegpu - conda-python-tests-dask @@ -25,16 +26,18 @@ jobs: - wheel-tests-cuml - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 with: enable_check_generated_files: false + ignored_pr_jobs: >- + optional-job-conda-python-tests-cudf-pandas-integration clang-tidy: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "cpu8" @@ -44,39 +47,55 @@ jobs: conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + with: + build_type: pull-request + enable_check_symbols: true + symbol_exclusions: raft_cutlass conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: pull-request conda-python-tests-singlegpu: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request script: "ci/test_python_singlegpu.sh" + optional-job-conda-python-tests-cudf-pandas-integration: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + with: + matrix_filter: map(select(.ARCH == "amd64")) + build_type: pull-request + script: "ci/test_python_integration.sh" conda-python-tests-dask: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 
with: build_type: pull-request script: "ci/test_python_dask.sh" conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -86,7 +105,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -96,23 +115,23 @@ jobs: wheel-build-cuml: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request script: ci/build_wheel.sh extra-repo: rapidsai/cumlprims_mg - extra-repo-sha: branch-24.04 + extra-repo-sha: branch-24.06 extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY wheel-tests-cuml: needs: wheel-build-cuml secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: pull-request script: ci/test_wheel.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 with: arch: '["amd64"]' cuda: '["12.2"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a830eeb23d..8ba5cb3036 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,9 +14,19 @@ on: type: string jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true + symbol_exclusions: raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests-singlegpu: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +43,7 @@ jobs: script: "ci/test_python_singlegpu.sh" conda-python-tests-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +52,7 @@ jobs: script: "ci/test_python_dask.sh" wheel-tests-cuml: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/CHANGELOG.md b/CHANGELOG.md index c07dbe9ccb..9c0fe00a99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,57 @@ +# cuml 
24.06.00 (5 Jun 2024) + +## 🐛 Bug Fixes + +- [HOTFIX] Fix import of sklearn by using cpu_only_import ([#5914](https://github.com/rapidsai/cuml/pull/5914)) [@dantegd](https://github.com/dantegd) +- Fix label binarize for binary class ([#5900](https://github.com/rapidsai/cuml/pull/5900)) [@jinsolp](https://github.com/jinsolp) +- Fix RandomForestClassifier return type ([#5896](https://github.com/rapidsai/cuml/pull/5896)) [@jinsolp](https://github.com/jinsolp) +- Fix nightly CI: remove deprecated creation of columns by using explicit dtype ([#5880](https://github.com/rapidsai/cuml/pull/5880)) [@dantegd](https://github.com/dantegd) +- Fix DBSCAN allocates rbc index even if deactivated ([#5859](https://github.com/rapidsai/cuml/pull/5859)) [@mfoerste4](https://github.com/mfoerste4) +- Remove gtest from dependencies.yaml ([#5854](https://github.com/rapidsai/cuml/pull/5854)) [@robertmaynard](https://github.com/robertmaynard) +- Support expression-based Dask Dataframe API ([#5835](https://github.com/rapidsai/cuml/pull/5835)) [@rjzamora](https://github.com/rjzamora) +- Mark all kernels with internal linkage ([#5764](https://github.com/rapidsai/cuml/pull/5764)) [@robertmaynard](https://github.com/robertmaynard) +- Fix build.sh clean command ([#5730](https://github.com/rapidsai/cuml/pull/5730)) [@csadorf](https://github.com/csadorf) + +## 📖 Documentation + +- Update the developer's guide with new copyright hook ([#5848](https://github.com/rapidsai/cuml/pull/5848)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) + +## 🚀 New Features + +- Always use a static gtest and gbench ([#5847](https://github.com/rapidsai/cuml/pull/5847)) [@robertmaynard](https://github.com/robertmaynard) + +## 🛠️ Improvements + +- Support double precision in MNMG Logistic Regression ([#5898](https://github.com/rapidsai/cuml/pull/5898)) [@lijinf2](https://github.com/lijinf2) +- Reduce and rename cudf.pandas integrations jobs ([#5890](https://github.com/rapidsai/cuml/pull/5890)) [@dantegd](https://github.com/dantegd) +- Fix building cuml with CCCL main ([#5886](https://github.com/rapidsai/cuml/pull/5886)) [@trxcllnt](https://github.com/trxcllnt) +- Add optional CI job for integration tests with cudf.pandas ([#5881](https://github.com/rapidsai/cuml/pull/5881)) [@dantegd](https://github.com/dantegd) +- Enable pytest failures on FutureWarnings/DeprecationWarnings ([#5877](https://github.com/rapidsai/cuml/pull/5877)) [@mroeschke](https://github.com/mroeschke) +- Remove return in test_lbfgs ([#5875](https://github.com/rapidsai/cuml/pull/5875)) [@mroeschke](https://github.com/mroeschke) +- Avoid dask_cudf.core imports ([#5874](https://github.com/rapidsai/cuml/pull/5874)) [@bdice](https://github.com/bdice) +- Support CPU object for `train_test_split` ([#5873](https://github.com/rapidsai/cuml/pull/5873)) [@isVoid](https://github.com/isVoid) +- Only use functions in the limited API ([#5871](https://github.com/rapidsai/cuml/pull/5871)) [@vyasr](https://github.com/vyasr) +- Replace deprecated disutils.version with packaging.version ([#5868](https://github.com/rapidsai/cuml/pull/5868)) [@mroeschke](https://github.com/mroeschke) +- Adjust deprecated cupy.sparse usage ([#5867](https://github.com/rapidsai/cuml/pull/5867)) [@mroeschke](https://github.com/mroeschke) +- Fix numpy 2.0 deprecations ([#5866](https://github.com/rapidsai/cuml/pull/5866)) [@mroeschke](https://github.com/mroeschke) +- Fix deprecated positional arg usage ([#5865](https://github.com/rapidsai/cuml/pull/5865)) [@mroeschke](https://github.com/mroeschke) +- Use int instead 
of float in random.randint ([#5864](https://github.com/rapidsai/cuml/pull/5864)) [@mroeschke](https://github.com/mroeschke) +- Migrate to `{{ stdlib("c") }}` ([#5863](https://github.com/rapidsai/cuml/pull/5863)) [@hcho3](https://github.com/hcho3) +- Avoid deprecated API in notebook ([#5862](https://github.com/rapidsai/cuml/pull/5862)) [@rjzamora](https://github.com/rjzamora) +- Add dedicated handling for cudf.pandas wrapped Numpy arrays ([#5861](https://github.com/rapidsai/cuml/pull/5861)) [@betatim](https://github.com/betatim) +- Prepend devcontainer name with the username ([#5860](https://github.com/rapidsai/cuml/pull/5860)) [@trxcllnt](https://github.com/trxcllnt) +- add --rm and --name to devcontainer run args ([#5857](https://github.com/rapidsai/cuml/pull/5857)) [@trxcllnt](https://github.com/trxcllnt) +- Update pip devcontainers to UCX v1.15.0 ([#5856](https://github.com/rapidsai/cuml/pull/5856)) [@trxcllnt](https://github.com/trxcllnt) +- Replace rmm::mr::device_memory_resource* with rmm::device_async_resource_ref ([#5853](https://github.com/rapidsai/cuml/pull/5853)) [@harrism](https://github.com/harrism) +- Update scikit-learn to 1.4 ([#5851](https://github.com/rapidsai/cuml/pull/5851)) [@betatim](https://github.com/betatim) +- Prevent undefined behavior when passing handle from Treelite to cuML FIL ([#5849](https://github.com/rapidsai/cuml/pull/5849)) [@hcho3](https://github.com/hcho3) +- Adds missing files to `update-version.sh` ([#5830](https://github.com/rapidsai/cuml/pull/5830)) [@AyodeAwe](https://github.com/AyodeAwe) +- Enable all tests for `arm` arch ([#5824](https://github.com/rapidsai/cuml/pull/5824)) [@galipremsagar](https://github.com/galipremsagar) +- Address PytestReturnNotNoneWarning in cuml tests ([#5819](https://github.com/rapidsai/cuml/pull/5819)) [@mroeschke](https://github.com/mroeschke) +- Handle binary classifier with all-0 labels ([#5810](https://github.com/rapidsai/cuml/pull/5810)) [@hcho3](https://github.com/hcho3) +- Use pytest_cases.fixture to fix warnings. ([#5798](https://github.com/rapidsai/cuml/pull/5798)) [@bdice](https://github.com/bdice) +- Enable Dask tests with UCX-Py/UCXX in CI ([#5697](https://github.com/rapidsai/cuml/pull/5697)) [@pentschev](https://github.com/pentschev) + # cuML 24.04.00 (10 Apr 2024) ## 🐛 Bug Fixes diff --git a/README.md b/README.md index d2c5b98d95..e3a3a5e6ba 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ programming. In most cases, cuML's Python API matches the API from For large datasets, these GPU-based implementations can complete 10-50x faster than their CPU equivalents. For details on performance, see the [cuML Benchmarks -Notebook](https://github.com/rapidsai/cuml/tree/branch-24.04/notebooks/tools). +Notebook](https://github.com/rapidsai/cuml/tree/branch-24.06/notebooks/tools). As an example, the following Python snippet loads input and computes DBSCAN clusters, all on GPU, using cuDF: ```python @@ -74,7 +74,7 @@ neighbors = nn.kneighbors(df) For additional examples, browse our complete [API documentation](https://docs.rapids.ai/api/cuml/stable/), or check out our example [walkthrough -notebooks](https://github.com/rapidsai/cuml/tree/branch-24.04/notebooks). Finally, you +notebooks](https://github.com/rapidsai/cuml/tree/branch-24.06/notebooks). Finally, you can find complete end-to-end examples in the [notebooks-contrib repo](https://github.com/rapidsai/notebooks-contrib). 
diff --git a/VERSION b/VERSION index 4a2fe8aa57..0bff6981a3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.04.00 +24.06.00 diff --git a/build.sh b/build.sh index 1132dcaddf..a76203a6db 100755 --- a/build.sh +++ b/build.sh @@ -58,7 +58,7 @@ HELP="$0 [ ...] [ ...] CUML_EXTRA_CMAKE_ARGS - Extra arguments to pass directly to cmake. Values listed in environment variable will override existing arguments. Example: CUML_EXTRA_CMAKE_ARGS=\"-DBUILD_CUML_C_LIBRARY=OFF\" ./build.sh - CUML_EXTRA_PYTHON_ARGS - Extra argument to pass directly to python setup.py + CUML_EXTRA_PYTHON_ARGS - Extra arguments to pass directly to pip install " LIBCUML_BUILD_DIR=${LIBCUML_BUILD_DIR:=${REPODIR}/cpp/build} CUML_BUILD_DIR=${REPODIR}/python/build @@ -223,9 +223,14 @@ if (( ${CLEAN} == 1 )); then fi done - cd ${REPODIR}/python - python setup.py clean --all - cd ${REPODIR} + # Clean up python artifacts + find ${REPODIR}/python/ | grep -E "(__pycache__|\.pyc|\.pyo|\.so|\_skbuild)$" | xargs rm -rf + + # Remove Doxyfile + rm -rf ${REPODIR}/cpp/Doxyfile + + # Remove .benchmark dirs and .pytest_cache + find ${REPODIR}/ | grep -E "(\.pytest_cache|\.benchmarks)$" | xargs rm -rf fi diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 6af69b92c6..90ea99af16 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -27,7 +27,7 @@ rapids-mamba-retry install \ export RAPIDS_VERSION="$(rapids-version)" export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="24.04" +export RAPIDS_VERSION_NUMBER="24.06" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index e096b97a15..273a375ea3 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -22,7 +22,6 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" # Need to distutils-normalize the original version NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") @@ -82,5 +81,7 @@ sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TA find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapids-\${localWorkspaceFolderBasename}-${CURRENT_SHORT_TAG}@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done diff --git a/ci/run_cuml_dask_pytests.sh b/ci/run_cuml_dask_pytests.sh index 70a86c5029..0472c147fa 100755 --- a/ci/run_cuml_dask_pytests.sh +++ b/ci/run_cuml_dask_pytests.sh @@ -4,4 +4,11 @@ # Support invoking run_cuml_dask_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests/dask 
-python -m pytest --cache-clear "$@" . +rapids-logger "pytest cuml-dask (No UCX-Py/UCXX)" +timeout 2h python -m pytest --cache-clear "$@" . + +rapids-logger "pytest cuml-dask (UCX-Py only)" +timeout 5m python -m pytest --cache-clear --run_ucx "$@" . + +rapids-logger "pytest cuml-dask (UCXX only)" +timeout 5m python -m pytest --cache-clear --run_ucxx "$@" . diff --git a/ci/run_cuml_integration_pytests.sh b/ci/run_cuml_integration_pytests.sh new file mode 100755 index 0000000000..7e0990ca1f --- /dev/null +++ b/ci/run_cuml_integration_pytests.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Support invoking run_cuml_singlegpu_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests + +python -m pytest -p cudf.pandas --cache-clear --ignore=dask -m "not memleak" "$@" --quick_run . diff --git a/ci/test_python_dask.sh b/ci/test_python_dask.sh index 34a319f25a..db49a8b2a4 100755 --- a/ci/test_python_dask.sh +++ b/ci/test_python_dask.sh @@ -11,7 +11,7 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest cuml-dask" +# Run tests ./ci/run_cuml_dask_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-dask.xml" \ --cov-config=../../../.coveragerc \ diff --git a/ci/test_python_integration.sh b/ci/test_python_integration.sh new file mode 100755 index 0000000000..1178130aa0 --- /dev/null +++ b/ci/test_python_integration.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Support invoking test_python_singlegpu.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ + +# Common setup steps shared by Python test jobs +source ./ci/test_python_common.sh + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +rapids-logger "pytest cuml integration" +./ci/run_cuml_integration_pytests.sh \ + --numprocesses=8 \ + --dist=worksteal \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml" \ + --cov-config=../../.coveragerc \ + --cov=cuml \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cuml-coverage.xml" \ + --cov-report=term + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 0ecbd6a430..86eef035cd 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
set -euo pipefail @@ -15,30 +15,29 @@ fi # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cuml*.whl)[test] +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" + EXITCODE=0 trap "EXITCODE=1" ERR set +e -# Run smoke tests for aarch64 pull requests -if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then - python ci/wheel_smoke_test_cuml.py -else - rapids-logger "pytest cuml single GPU" - ./ci/run_cuml_singlegpu_pytests.sh \ - --numprocesses=8 \ - --dist=worksteal \ - -k 'not test_sparse_pca_inputs' \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml" - - # Run test_sparse_pca_inputs separately - ./ci/run_cuml_singlegpu_pytests.sh \ - -k 'test_sparse_pca_inputs' \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-sparse-pca.xml" - - rapids-logger "pytest cuml-dask" - ./ci/run_cuml_dask_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-dask.xml" -fi + +rapids-logger "pytest cuml single GPU" +./ci/run_cuml_singlegpu_pytests.sh \ + --numprocesses=8 \ + --dist=worksteal \ + -k 'not test_sparse_pca_inputs' \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml" + +# Run test_sparse_pca_inputs separately +./ci/run_cuml_singlegpu_pytests.sh \ + -k 'test_sparse_pca_inputs' \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-sparse-pca.xml" + +rapids-logger "pytest cuml-dask" +./ci/run_cuml_dask_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-dask.xml" rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/wheel_smoke_test_cuml.py b/ci/wheel_smoke_test_cuml.py deleted file mode 100644 index aa935639ae..0000000000 --- a/ci/wheel_smoke_test_cuml.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -A simple test for cuML based on scikit-learn. 
- -Adapted from xgboost: -https://raw.githubusercontent.com/rapidsai/xgboost-conda/branch-23.02/recipes/xgboost/test-py-xgboost.py -""" -from cuml.ensemble import RandomForestClassifier -import sklearn.datasets -import sklearn.model_selection -import sklearn.metrics - -X, y = sklearn.datasets.load_iris(return_X_y=True) -Xtrn, Xtst, ytrn, ytst = sklearn.model_selection.train_test_split( - X, y, train_size=0.8, random_state=4) - -clf = RandomForestClassifier( - max_depth=2, - n_estimators=10, - n_bins=32, - accuracy_metric="multi:softmax") -clf.fit(Xtrn, ytrn) -ypred = clf.predict(Xtst) -acc = sklearn.metrics.accuracy_score(ytst, ypred) - -print('cuml RandomForestClassifier accuracy on iris:', acc) -assert acc > 0.9 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 15f8aee324..c292f5598b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,18 +12,16 @@ dependencies: - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit -- cudf==24.4.* +- cudf==24.6.* - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* -- dask-cudf==24.4.* +- dask-cuda==24.6.* +- dask-cudf==24.6.* - dask-ml - doxygen=1.9.1 - gcc_linux-64=11.* -- gmock>=1.13.0 - graphviz -- gtest>=1.13.0 - hdbscan<=0.8.30 - hypothesis>=6.0,<7 - ipykernel @@ -33,25 +31,26 @@ dependencies: - libcublas=11.11.3.6 - libcufft-dev=10.9.0.58 - libcufft=10.9.0.58 -- libcumlprims==24.4.* +- libcumlprims==24.6.* - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libcusolver-dev=11.4.1.48 - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libraft-headers==24.4.* -- libraft==24.4.* -- librmm==24.4.* +- libraft-headers==24.6.* +- libraft==24.6.* +- librmm==24.6.* - nbsphinx - ninja - nltk - numba>=0.57 - numpydoc - nvcc_linux-64=11.8 +- packaging - pip - pydata-sphinx-theme!=0.14.2 -- pylibraft==24.4.* +- pylibraft==24.6.* - pynndescent==0.5.8 - pytest-benchmark - pytest-cases @@ -59,12 +58,12 @@ dependencies: - pytest-xdist - pytest==7.* - python>=3.9,<3.12 -- raft-dask==24.4.* -- rapids-dask-dependency==24.4.* +- raft-dask==24.6.* +- rapids-dask-dependency==24.6.* - recommonmark -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 -- scikit-learn==1.2 +- scikit-learn==1.5 - scipy>=1.8.0 - seaborn - sphinx-copybutton diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index f923628764..43bf3069b3 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -14,18 +14,16 @@ dependencies: - cuda-profiler-api - cuda-python>=12.0,<13.0a0 - cuda-version=12.2 -- cudf==24.4.* +- cudf==24.6.* - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* -- dask-cudf==24.4.* +- dask-cuda==24.6.* +- dask-cudf==24.6.* - dask-ml - doxygen=1.9.1 - gcc_linux-64=11.* -- gmock>=1.13.0 - graphviz -- gtest>=1.13.0 - hdbscan<=0.8.30 - hypothesis>=6.0,<7 - ipykernel @@ -33,21 +31,22 @@ dependencies: - joblib>=0.11 - libcublas-dev - libcufft-dev -- libcumlprims==24.4.* +- libcumlprims==24.6.* - libcurand-dev - libcusolver-dev - libcusparse-dev -- libraft-headers==24.4.* -- libraft==24.4.* -- librmm==24.4.* +- libraft-headers==24.6.* +- libraft==24.6.* +- librmm==24.6.* - nbsphinx - ninja - nltk - numba>=0.57 - numpydoc +- packaging - pip - pydata-sphinx-theme!=0.14.2 -- pylibraft==24.4.* +- pylibraft==24.6.* - pynndescent==0.5.8 - pytest-benchmark - pytest-cases 
@@ -55,12 +54,12 @@ dependencies: - pytest-xdist - pytest==7.* - python>=3.9,<3.12 -- raft-dask==24.4.* -- rapids-dask-dependency==24.4.* +- raft-dask==24.6.* +- rapids-dask-dependency==24.6.* - recommonmark -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 -- scikit-learn==1.2 +- scikit-learn==1.5 - scipy>=1.8.0 - seaborn - sphinx-copybutton diff --git a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml index 5eb69ceb2a..6c93208450 100644 --- a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml +++ b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml @@ -15,22 +15,20 @@ dependencies: - cudatoolkit - cxx-compiler - gcc_linux-64=11.* -- gmock>=1.13.0 -- gtest>=1.13.0 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcufft-dev=10.9.0.58 - libcufft=10.9.0.58 -- libcumlprims==24.4.* +- libcumlprims==24.6.* - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libcusolver-dev=11.4.1.48 - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libraft-headers==24.4.* -- libraft==24.4.* -- librmm==24.4.* +- libraft-headers==24.6.* +- libraft==24.6.* +- librmm==24.6.* - ninja - nvcc_linux-64=11.8 - sysroot_linux-64==2.17 diff --git a/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml b/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml index 0a7a7f901b..857ec209b2 100644 --- a/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml @@ -13,22 +13,20 @@ dependencies: - cudatoolkit - cxx-compiler - gcc_linux-64=11.* -- gmock>=1.13.0 -- gtest>=1.13.0 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcufft-dev=10.9.0.58 - libcufft=10.9.0.58 -- libcumlprims==24.4.* +- libcumlprims==24.6.* - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libcusolver-dev=11.4.1.48 - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libraft-headers==24.4.* -- libraft==24.4.* -- librmm==24.4.* +- libraft-headers==24.6.* +- libraft==24.6.* +- librmm==24.6.* - ninja - nvcc_linux-64=11.8 - sysroot_linux-64==2.17 diff --git a/conda/environments/cpp_all_cuda-122_arch-x86_64.yaml b/conda/environments/cpp_all_cuda-122_arch-x86_64.yaml index a6cb550bf1..ba43637f9d 100644 --- a/conda/environments/cpp_all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/cpp_all_cuda-122_arch-x86_64.yaml @@ -15,17 +15,15 @@ dependencies: - cuda-version=12.2 - cxx-compiler - gcc_linux-64=11.* -- gmock>=1.13.0 -- gtest>=1.13.0 - libcublas-dev - libcufft-dev -- libcumlprims==24.4.* +- libcumlprims==24.6.* - libcurand-dev - libcusolver-dev - libcusparse-dev -- libraft-headers==24.4.* -- libraft==24.4.* -- librmm==24.4.* +- libraft-headers==24.6.* +- libraft==24.6.* +- librmm==24.6.* - ninja - sysroot_linux-64==2.17 name: cpp_all_cuda-122_arch-x86_64 diff --git a/conda/recipes/cuml-cpu/conda_build_config.yaml b/conda/recipes/cuml-cpu/conda_build_config.yaml index 05a623f0db..2e58c8f113 100644 --- a/conda/recipes/cuml-cpu/conda_build_config.yaml +++ b/conda/recipes/cuml-cpu/conda_build_config.yaml @@ -7,5 +7,8 @@ cxx_compiler_version: cmake_version: - ">=3.26.4" -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "=2.17" diff --git a/conda/recipes/cuml-cpu/meta.yaml b/conda/recipes/cuml-cpu/meta.yaml index dae692c3a9..61bf97f4b0 100644 --- a/conda/recipes/cuml-cpu/meta.yaml +++ b/conda/recipes/cuml-cpu/meta.yaml @@ -22,7 +22,7 @@ requirements: - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - - sysroot_{{ target_platform }} {{ 
sysroot_version }} + - {{ stdlib("c") }} - ninja host: - python x.x diff --git a/conda/recipes/cuml/conda_build_config.yaml b/conda/recipes/cuml/conda_build_config.yaml index b42c9d56f3..780ff3c412 100644 --- a/conda/recipes/cuml/conda_build_config.yaml +++ b/conda/recipes/cuml/conda_build_config.yaml @@ -13,7 +13,10 @@ cuda11_compiler: cmake_version: - ">=3.26.4" -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "=2.17" treelite_version: diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml index 5d2bcfbebf..3e3edf0e4b 100644 --- a/conda/recipes/cuml/meta.yaml +++ b/conda/recipes/cuml/meta.yaml @@ -52,7 +52,7 @@ requirements: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/libcuml/conda_build_config.yaml b/conda/recipes/libcuml/conda_build_config.yaml index 706f8fc747..db0a8a16c8 100644 --- a/conda/recipes/libcuml/conda_build_config.yaml +++ b/conda/recipes/libcuml/conda_build_config.yaml @@ -10,7 +10,10 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "=2.17" cmake_version: @@ -19,9 +22,6 @@ cmake_version: treelite_version: - "=4.1.2" -gtest_version: - - ">=1.13.0" - # The CTK libraries below are missing from the conda-forge::cudatoolkit package # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages # and the "*_run_*" version specifiers correspond to `11.x` packages. diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml index fe16aa9f0f..718375b198 100644 --- a/conda/recipes/libcuml/meta.yaml +++ b/conda/recipes/libcuml/meta.yaml @@ -45,7 +45,7 @@ requirements: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} @@ -68,8 +68,6 @@ requirements: - libcusolver-dev - libcusparse-dev {% endif %} - - gmock {{ gtest_version }} - - gtest {{ gtest_version }} - libcumlprims ={{ minor_version }} - libraft ={{ minor_version }} - libraft-headers ={{ minor_version }} @@ -152,8 +150,6 @@ outputs: - cuda-cudart {% endif %} - {{ pin_subpackage('libcuml', exact=True) }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cacd96a922..4b9692beb2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -245,12 +245,13 @@ if(ENABLE_CUMLPRIMS_MG) endif() if(BUILD_CUML_TESTS OR BUILD_PRIMS_TESTS) - include(cmake/thirdparty/get_gtest.cmake) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest(BUILD_STATIC) endif() if(BUILD_CUML_BENCH) include(${rapids-cmake-dir}/cpm/gbench.cmake) - rapids_cpm_gbench() + rapids_cpm_gbench(BUILD_STATIC) endif() ############################################################################## @@ -497,8 +498,7 @@ if(BUILD_CUML_CPP_LIBRARY) PRIVATE src/svm/svc.cu src/svm/svr.cu - src/svm/linear.cu - src/svm/ws_util.cu) + src/svm/linear.cu) endif() if(all_algo OR autoarima_algo) diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 1eccd65ba4..4f8c312717 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2019-2023, 
NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -61,6 +61,10 @@ if(BUILD_CUML_BENCH) set_target_properties( ${CUML_CPP_BENCH_TARGET} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) install( diff --git a/cpp/bench/sg/umap.cu b/cpp/bench/sg/umap.cu index cafbd76aa7..4334a82983 100644 --- a/cpp/bench/sg/umap.cu +++ b/cpp/bench/sg/umap.cu @@ -34,7 +34,7 @@ struct Params { }; template -__global__ void castKernel(OutT* out, const InT* in, IdxT len) +CUML_KERNEL void castKernel(OutT* out, const InT* in, IdxT len) { auto tid = IdxT(blockIdx.x) * blockDim.x + IdxT(threadIdx.x); if (tid < len) { out[tid] = OutT(in[tid]); } diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index b6c49bb2c2..60cc5dae15 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,8 +31,8 @@ endif() list(APPEND CUML_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations,-Wno-error=sign-compare) if(DISABLE_DEPRECATION_WARNINGS) - list(APPEND CUML_CXX_FLAGS -Wno-deprecated-declarations) - list(APPEND CUML_CUDA_FLAGS -Wno-deprecated-declarations -Xcompiler=-Wno-deprecated-declarations) + list(APPEND CUML_CXX_FLAGS -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS) + list(APPEND CUML_CUDA_FLAGS -Wno-deprecated-declarations -Xcompiler=-Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS) endif() # make sure we produce smallest binary size diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake deleted file mode 100644 index cdc2c5d889..0000000000 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ /dev/null @@ -1,24 +0,0 @@ -#============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -function(find_and_configure_gtest) - - include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest() - -endfunction() - -find_and_configure_gtest() diff --git a/cpp/include/cuml/common/utils.hpp b/cpp/include/cuml/common/utils.hpp index db5f8fc00b..8ac9d93a1f 100644 --- a/cpp/include/cuml/common/utils.hpp +++ b/cpp/include/cuml/common/utils.hpp @@ -29,3 +29,9 @@ #include #include #include + +#ifdef __CUDACC__ +#define CUML_KERNEL __global__ static +#else +#define CUML_KERNEL static +#endif diff --git a/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh b/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh index a3cd2909cc..d253074744 100644 --- a/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh +++ b/cpp/include/cuml/experimental/fil/detail/infer_kernel/gpu.cuh @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include #include #include #include @@ -80,7 +81,7 @@ template -__global__ void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) infer_kernel( +CUML_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) infer_kernel( forest_t forest, postprocessor postproc, typename forest_t::io_type* output, diff --git a/cpp/include/cuml/linear_model/qn_mg.hpp b/cpp/include/cuml/linear_model/qn_mg.hpp index 048d65c322..aa7c3226c5 100644 --- a/cpp/include/cuml/linear_model/qn_mg.hpp +++ b/cpp/include/cuml/linear_model/qn_mg.hpp @@ -37,9 +37,10 @@ namespace opg { * @param[in] labels: labels data * @returns host vector that stores the distinct labels */ -std::vector getUniquelabelsMG(const raft::handle_t& handle, - Matrix::PartDescriptor& input_desc, - std::vector*>& labels); +template +std::vector getUniquelabelsMG(const raft::handle_t& handle, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels); /** * @brief performs MNMG fit operation for the logistic regression using quasi newton methods @@ -55,16 +56,17 @@ std::vector getUniquelabelsMG(const raft::handle_t& handle, * @param[out] f: host pointer holding the final objective value * @param[out] num_iters: host pointer holding the actual number of iterations taken */ +template void qnFit(raft::handle_t& handle, - std::vector*>& input_data, + std::vector*>& input_data, Matrix::PartDescriptor& input_desc, - std::vector*>& labels, - float* coef, + std::vector*>& labels, + T* coef, const qn_params& pams, bool X_col_major, bool standardization, int n_classes, - float* f, + T* f, int* num_iters); /** @@ -86,18 +88,19 @@ void qnFit(raft::handle_t& handle, * @param[out] f: host pointer holding the final objective value * @param[out] num_iters: host pointer holding the actual number of iterations taken */ +template void qnFitSparse(raft::handle_t& handle, - std::vector*>& input_values, + std::vector*>& input_values, int* input_cols, int* input_row_ids, int X_nnz, Matrix::PartDescriptor& input_desc, - std::vector*>& labels, - float* coef, + std::vector*>& labels, + T* coef, const qn_params& pams, bool standardization, int n_classes, - float* f, + T* f, int* num_iters); }; // namespace opg diff --git a/cpp/include/cuml/tsa/arima_common.h b/cpp/include/cuml/tsa/arima_common.h index cd9f2d22f5..1586ca963c 100644 --- a/cpp/include/cuml/tsa/arima_common.h +++ b/cpp/include/cuml/tsa/arima_common.h @@ -18,7 +18,9 @@ #include +#include #include +#include #include #include @@ -79,15 +81,27 @@ struct ARIMAParams { */ void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, 
bool tr = false) { - rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource(); - if (order.k && !tr) mu = (DataT*)rmm_alloc->allocate(batch_size * sizeof(DataT), stream); + rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource(); + if (order.k && !tr) + mu = (DataT*)rmm_alloc.allocate_async( + batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); if (order.n_exog && !tr) - beta = (DataT*)rmm_alloc->allocate(order.n_exog * batch_size * sizeof(DataT), stream); - if (order.p) ar = (DataT*)rmm_alloc->allocate(order.p * batch_size * sizeof(DataT), stream); - if (order.q) ma = (DataT*)rmm_alloc->allocate(order.q * batch_size * sizeof(DataT), stream); - if (order.P) sar = (DataT*)rmm_alloc->allocate(order.P * batch_size * sizeof(DataT), stream); - if (order.Q) sma = (DataT*)rmm_alloc->allocate(order.Q * batch_size * sizeof(DataT), stream); - sigma2 = (DataT*)rmm_alloc->allocate(batch_size * sizeof(DataT), stream); + beta = (DataT*)rmm_alloc.allocate_async( + order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.p) + ar = (DataT*)rmm_alloc.allocate_async( + order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.q) + ma = (DataT*)rmm_alloc.allocate_async( + order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.P) + sar = (DataT*)rmm_alloc.allocate_async( + order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.Q) + sma = (DataT*)rmm_alloc.allocate_async( + order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + sigma2 = (DataT*)rmm_alloc.allocate_async( + batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); } /** @@ -101,15 +115,27 @@ struct ARIMAParams { */ void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false) { - rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource(); - if (order.k && !tr) rmm_alloc->deallocate(mu, batch_size * sizeof(DataT), stream); + rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource(); + if (order.k && !tr) + rmm_alloc.deallocate_async( + mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); if (order.n_exog && !tr) - rmm_alloc->deallocate(beta, order.n_exog * batch_size * sizeof(DataT), stream); - if (order.p) rmm_alloc->deallocate(ar, order.p * batch_size * sizeof(DataT), stream); - if (order.q) rmm_alloc->deallocate(ma, order.q * batch_size * sizeof(DataT), stream); - if (order.P) rmm_alloc->deallocate(sar, order.P * batch_size * sizeof(DataT), stream); - if (order.Q) rmm_alloc->deallocate(sma, order.Q * batch_size * sizeof(DataT), stream); - rmm_alloc->deallocate(sigma2, batch_size * sizeof(DataT), stream); + rmm_alloc.deallocate_async( + beta, order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.p) + rmm_alloc.deallocate_async( + ar, order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.q) + rmm_alloc.deallocate_async( + ma, order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.P) + rmm_alloc.deallocate_async( + sar, order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (order.Q) + rmm_alloc.deallocate_async( + sma, order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + rmm_alloc.deallocate_async( + sigma2, batch_size * 
sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); } /** diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu index 50ce88be25..2b262df412 100644 --- a/cpp/src/arima/batched_arima.cu +++ b/cpp/src/arima/batched_arima.cu @@ -84,7 +84,7 @@ struct is_missing { typedef T argument_type; typedef T result_type; - __thrust_exec_check_disable__ __device__ const T operator()(const T& x) const { return isnan(x); } + __device__ const T operator()(const T& x) const { return isnan(x); } }; // end is_missing bool detect_missing(raft::handle_t& handle, const double* d_y, int n_elem) @@ -277,25 +277,25 @@ void predict(raft::handle_t& handle, * @param[in] start_v First used v index (residual) */ template -__global__ void sum_of_squares_kernel(const DataT* d_y, - const DataT* d_mu, - const DataT* d_ar, - const DataT* d_ma, - const DataT* d_sar, - const DataT* d_sma, - DataT* d_loglike, - int n_obs, - int n_phi, - int n_theta, - int p, - int q, - int P, - int Q, - int s, - int k, - int start_sum, - int start_y, - int start_v) +CUML_KERNEL void sum_of_squares_kernel(const DataT* d_y, + const DataT* d_mu, + const DataT* d_ar, + const DataT* d_ma, + const DataT* d_sar, + const DataT* d_sma, + DataT* d_loglike, + int n_obs, + int n_phi, + int n_theta, + int p, + int q, + int P, + int Q, + int s, + int k, + int start_sum, + int start_y, + int start_v) { // Load phi, theta and mu to registers DataT phi, theta; diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu index 034089c2b9..84974ea9f2 100644 --- a/cpp/src/arima/batched_kalman.cu +++ b/cpp/src/arima/batched_kalman.cu @@ -128,25 +128,25 @@ DI void numerical_stability(double* A) * @param[out] d_F_fc Batched variance of forecast errors (fc_steps) */ template -__global__ void batched_kalman_loop_kernel(const double* ys, - int nobs, - const double* T, - const double* Z, - const double* RQR, - const double* P, - const double* alpha, - bool intercept, - const double* d_mu, - int batch_size, - const double* d_obs_inter, - const double* d_obs_inter_fut, - double* d_pred, - double* d_loglike, - int n_diff, - int fc_steps = 0, - double* d_fc = nullptr, - bool conf_int = false, - double* d_F_fc = nullptr) +CUML_KERNEL void batched_kalman_loop_kernel(const double* ys, + int nobs, + const double* T, + const double* Z, + const double* RQR, + const double* P, + const double* alpha, + bool intercept, + const double* d_mu, + int batch_size, + const double* d_obs_inter, + const double* d_obs_inter_fut, + double* d_pred, + double* d_loglike, + int n_diff, + int fc_steps = 0, + double* d_fc = nullptr, + bool conf_int = false, + double* d_F_fc = nullptr) { constexpr int rd2 = rd * rd; double l_RQR[rd2]; @@ -384,28 +384,28 @@ union KalmanLoopSharedMemory { * @param[out] d_F_fc Batched variance of forecast errors (fc_steps) */ template -__global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys, - int batch_size, - int n_obs, - const double* d_T, - const double* d_Z, - const double* d_RQR, - double* d_P, - double* d_alpha, - double* d_m_tmp, - double* d_TP, - bool intercept, - const double* d_mu, - int rd, - const double* d_obs_inter, - const double* d_obs_inter_fut, - double* d_pred, - double* d_loglike, - int n_diff, - int fc_steps, - double* d_fc, - bool conf_int, - double* d_F_fc) +CUML_KERNEL void _batched_kalman_device_loop_large_kernel(const double* d_ys, + int batch_size, + int n_obs, + const double* d_T, + const double* d_Z, + const double* d_RQR, + double* d_P, + double* d_alpha, + double* d_m_tmp, + 
double* d_TP, + bool intercept, + const double* d_mu, + int rd, + const double* d_obs_inter, + const double* d_obs_inter_fut, + double* d_pred, + double* d_loglike, + int n_diff, + int fc_steps, + double* d_fc, + bool conf_int, + double* d_F_fc) { int rd2 = rd * rd; @@ -1128,7 +1128,7 @@ void batched_kalman_loop(raft::handle_t& handle, * @param[in] n_elem Total number of elements (fc_steps * batch_size) * @param[in] multiplier Coefficient associated with the confidence level */ -__global__ void confidence_intervals( +CUML_KERNEL void confidence_intervals( const double* d_fc, double* d_lower, double* d_upper, int n_elem, double multiplier) { for (int idx = threadIdx.x; idx < n_elem; idx += blockDim.x * gridDim.x) { diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh index 6e0b8d4f7b..804be72fc7 100644 --- a/cpp/src/dbscan/runner.cuh +++ b/cpp/src/dbscan/runner.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -51,7 +52,7 @@ static const int TPB = 256; * 2. Subtract 1 from all other labels. */ template -__global__ void relabelForSkl(Index_* labels, Index_ N, Index_ MAX_LABEL) +CUML_KERNEL void relabelForSkl(Index_* labels, Index_ N, Index_ MAX_LABEL) { Index_ tid = threadIdx.x + blockDim.x * blockIdx.x; if (tid < N) { @@ -177,6 +178,15 @@ std::size_t run(const raft::handle_t& handle, return size; } + if (sparse_rbc_mode && (std::size_t)D > static_cast(MAX_LABEL / N)) { + CUML_LOG_WARN( + "You are using an index type of size (%d bytes) which is not sufficient " + "to represent the input dimensions in the RBC index. " + "Consider using a larger index type. Falling back to BRUTE_FORCE strategy.", + (int)sizeof(Index_)); + sparse_rbc_mode = false; + } + // partition the temporary workspace needed for different stages of dbscan. std::vector batchadjlen(n_batches); @@ -210,7 +220,7 @@ std::size_t run(const raft::handle_t& handle, raft::neighbors::ball_cover::BallCoverIndex* rbc_index_ptr = nullptr; raft::neighbors::ball_cover::BallCoverIndex rbc_index( - handle, x, N, D, metric); + handle, x, sparse_rbc_mode ? N : 0, sparse_rbc_mode ? D : 0, metric); if (sparse_rbc_mode) { raft::neighbors::ball_cover::build_index(handle, rbc_index); diff --git a/cpp/src/dbscan/vertexdeg/algo.cuh b/cpp/src/dbscan/vertexdeg/algo.cuh index eff4452c84..b19345b033 100644 --- a/cpp/src/dbscan/vertexdeg/algo.cuh +++ b/cpp/src/dbscan/vertexdeg/algo.cuh @@ -18,6 +18,8 @@ #include "pack.h" +#include + #include #include #include @@ -57,11 +59,11 @@ struct column_counter : public thrust::unary_function { }; template -static __global__ void accumulateWeights(const index_t* ia, - const index_t num_rows, - const index_t* ja, - const math_t* col_weights, - math_t* weight_sums) +CUML_KERNEL void accumulateWeights(const index_t* ia, + const index_t num_rows, + const index_t* ja, + const math_t* col_weights, + math_t* weight_sums) { constexpr int warps_per_block = tpb / warpsize; diff --git a/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh b/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh index 29aaea2c00..40ba02676d 100644 --- a/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,8 @@ #include "../objectives.cuh" #include "../quantiles.h" +#include + #include #include @@ -72,21 +74,23 @@ DI OutT* alignPointer(InT dataset) } template -__global__ void nodeSplitKernel(const IdxT max_depth, - const IdxT min_samples_leaf, - const IdxT min_samples_split, - const IdxT max_leaves, - const DataT min_impurity_decrease, - const Dataset dataset, - const NodeWorkItem* work_items, - const Split* splits); +__attribute__((visibility("hidden"))) __global__ void nodeSplitKernel( + const IdxT max_depth, + const IdxT min_samples_leaf, + const IdxT min_samples_split, + const IdxT max_leaves, + const DataT min_impurity_decrease, + const Dataset dataset, + const NodeWorkItem* work_items, + const Split* splits); template -__global__ void leafKernel(ObjectiveT objective, - DatasetT dataset, - const NodeT* tree, - const InstanceRange* instance_ranges, - DataT* leaves); +__attribute__((visibility("hidden"))) __global__ void leafKernel( + ObjectiveT objective, + DatasetT dataset, + const NodeT* tree, + const InstanceRange* instance_ranges, + DataT* leaves); // 32-bit FNV1a hash // Reference: http://www.isthe.com/chongo/tech/comp/fnv/index.html const uint32_t fnv1a32_prime = uint32_t(16777619); @@ -142,7 +146,7 @@ struct CustomDifference { * at least 'k' uniques. */ template -__global__ void excess_sample_with_replacement_kernel( +CUML_KERNEL void excess_sample_with_replacement_kernel( IdxT* colids, const NodeWorkItem* work_items, size_t work_items_size, @@ -258,13 +262,13 @@ __global__ void excess_sample_with_replacement_kernel( * https://en.wikipedia.org/wiki/Reservoir_sampling#An_optimal_algorithm */ template -__global__ void algo_L_sample_kernel(int* colids, - const NodeWorkItem* work_items, - size_t work_items_size, - IdxT treeid, - uint64_t seed, - size_t n /* total cols to sample from*/, - size_t k /* cols to sample */) +CUML_KERNEL void algo_L_sample_kernel(int* colids, + const NodeWorkItem* work_items, + size_t work_items_size, + IdxT treeid, + uint64_t seed, + size_t n /* total cols to sample from*/, + size_t k /* cols to sample */) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= work_items_size) return; @@ -308,13 +312,13 @@ __global__ void algo_L_sample_kernel(int* colids, } template -__global__ void adaptive_sample_kernel(int* colids, - const NodeWorkItem* work_items, - size_t work_items_size, - IdxT treeid, - uint64_t seed, - int N, - int M) +CUML_KERNEL void adaptive_sample_kernel(int* colids, + const NodeWorkItem* work_items, + size_t work_items_size, + IdxT treeid, + uint64_t seed, + int N, + int M) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= work_items_size) return; @@ -344,23 +348,24 @@ template -__global__ void computeSplitKernel(BinT* histograms, - IdxT n_bins, - IdxT max_depth, - IdxT min_samples_split, - IdxT max_leaves, - const Dataset dataset, - const Quantiles quantiles, - const NodeWorkItem* work_items, - IdxT colStart, - const IdxT* colids, - int* done_count, - int* mutex, - volatile Split* splits, - ObjectiveT objective, - IdxT treeid, - const WorkloadInfo* workload_info, - uint64_t seed); +__attribute__((visibility("hidden"))) __global__ void computeSplitKernel( + BinT* histograms, + IdxT n_bins, + IdxT max_depth, + IdxT min_samples_split, + IdxT max_leaves, + const Dataset dataset, + const Quantiles quantiles, + const NodeWorkItem* work_items, + IdxT colStart, + const IdxT* colids, + int* done_count, + int* mutex, + volatile Split* splits, + ObjectiveT objective, + IdxT treeid, + const WorkloadInfo* workload_info, + 
uint64_t seed); } // namespace DT } // namespace ML diff --git a/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels_impl.cuh b/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels_impl.cuh index c73af68f4d..d2ad7a436a 100644 --- a/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels_impl.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels_impl.cuh @@ -88,14 +88,15 @@ DI void partitionSamples(const Dataset& dataset, } } template -__global__ void nodeSplitKernel(const IdxT max_depth, - const IdxT min_samples_leaf, - const IdxT min_samples_split, - const IdxT max_leaves, - const DataT min_impurity_decrease, - const Dataset dataset, - const NodeWorkItem* work_items, - const Split* splits) +__attribute__((visibility("hidden"))) __global__ void nodeSplitKernel( + const IdxT max_depth, + const IdxT min_samples_leaf, + const IdxT min_samples_split, + const IdxT max_leaves, + const DataT min_impurity_decrease, + const Dataset dataset, + const NodeWorkItem* work_items, + const Split* splits) { extern __shared__ char smem[]; const auto work_item = work_items[blockIdx.x]; @@ -108,11 +109,12 @@ __global__ void nodeSplitKernel(const IdxT max_depth, } template -__global__ void leafKernel(ObjectiveT objective, - DatasetT dataset, - const NodeT* tree, - const InstanceRange* instance_ranges, - DataT* leaves) +__attribute__((visibility("hidden"))) __global__ void leafKernel( + ObjectiveT objective, + DatasetT dataset, + const NodeT* tree, + const InstanceRange* instance_ranges, + DataT* leaves) { using BinT = typename ObjectiveT::BinT; extern __shared__ char shared_memory[]; @@ -173,23 +175,24 @@ template -__global__ void computeSplitKernel(BinT* histograms, - IdxT max_n_bins, - IdxT max_depth, - IdxT min_samples_split, - IdxT max_leaves, - const Dataset dataset, - const Quantiles quantiles, - const NodeWorkItem* work_items, - IdxT colStart, - const IdxT* colids, - int* done_count, - int* mutex, - volatile Split* splits, - ObjectiveT objective, - IdxT treeid, - const WorkloadInfo* workload_info, - uint64_t seed) +__attribute__((visibility("hidden"))) __global__ void computeSplitKernel( + BinT* histograms, + IdxT max_n_bins, + IdxT max_depth, + IdxT min_samples_split, + IdxT max_leaves, + const Dataset dataset, + const Quantiles quantiles, + const NodeWorkItem* work_items, + IdxT colStart, + const IdxT* colids, + int* done_count, + int* mutex, + volatile Split* splits, + ObjectiveT objective, + IdxT treeid, + const WorkloadInfo* workload_info, + uint64_t seed) { // dynamic shared memory extern __shared__ char smem[]; diff --git a/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh b/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh index 93ce1d4d64..6bbfb05c73 100644 --- a/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh @@ -35,11 +35,16 @@ namespace ML { namespace DT { template -__global__ void computeQuantilesKernel( +__attribute__((visibility("hidden"))) __global__ void computeQuantilesKernel( T* quantiles, int* n_bins, const T* sorted_data, const int max_n_bins, const int n_rows); template -auto computeQuantiles( +using QuantileReturnValue = std::tuple, + std::shared_ptr>, + std::shared_ptr>>; + +template +QuantileReturnValue computeQuantiles( const raft::handle_t& handle, const T* data, int max_n_bins, int n_rows, int n_cols) { raft::common::nvtx::push_range("computeQuantiles"); diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh index 
296239e10a..50f8d8d3ac 100644 --- a/cpp/src/decisiontree/decisiontree.cuh +++ b/cpp/src/decisiontree/decisiontree.cuh @@ -203,11 +203,15 @@ tl::Tree build_treelite_tree(const DT::TreeMetaDataNode& rf_tree, tl_node_id, q_node.ColumnId(), q_node.QueryValue(), true, tl::Operator::kLE); } else { - auto leaf_begin = rf_tree.vector_leaf.begin() + cuml_node_id * num_class; + auto leaf_begin = rf_tree.vector_leaf.begin() + cuml_node_id * rf_tree.num_outputs; if (num_class == 1) { tl_tree.SetLeaf(tl_node_id, *leaf_begin); } else { - std::vector leaf_vector(leaf_begin, leaf_begin + num_class); + // if rf_tree.num_outputs < num_class, fill the remainder with zero + // Most likely this happens when a binary classifier is fit with all-0 labels + ASSERT(rf_tree.num_outputs <= num_class, "num_class too small"); + std::vector leaf_vector(num_class, T(0)); + std::copy(leaf_begin, leaf_begin + rf_tree.num_outputs, leaf_vector.begin()); tl_tree.SetLeafVector(tl_node_id, leaf_vector); } } diff --git a/cpp/src/explainer/kernel_shap.cu b/cpp/src/explainer/kernel_shap.cu index 141258a0d5..ba3be13700 100644 --- a/cpp/src/explainer/kernel_shap.cu +++ b/cpp/src/explainer/kernel_shap.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -46,13 +47,13 @@ namespace Explainer { */ template -__global__ void exact_rows_kernel(float* X, - IdxT nrows_X, - IdxT ncols, - DataT* background, - IdxT nrows_background, - DataT* dataset, - DataT* observation) +CUML_KERNEL void exact_rows_kernel(float* X, + IdxT nrows_X, + IdxT ncols, + DataT* background, + IdxT nrows_background, + DataT* dataset, + DataT* observation) { // Each block processes one row of X. Columns are iterated over by blockDim.x at a time to ensure // data coelescing @@ -104,15 +105,15 @@ __global__ void exact_rows_kernel(float* X, * */ template -__global__ void sampled_rows_kernel(IdxT* nsamples, - float* X, - IdxT nrows_X, - IdxT ncols, - DataT* background, - IdxT nrows_background, - DataT* dataset, - DataT* observation, - uint64_t seed) +CUML_KERNEL void sampled_rows_kernel(IdxT* nsamples, + float* X, + IdxT nrows_X, + IdxT ncols, + DataT* background, + IdxT nrows_background, + DataT* dataset, + DataT* observation, + uint64_t seed) { int tid = threadIdx.x + blockIdx.x * blockDim.x; // see what k this block will generate diff --git a/cpp/src/explainer/permutation_shap.cu b/cpp/src/explainer/permutation_shap.cu index 1cc19e9139..e6df0e6e8c 100644 --- a/cpp/src/explainer/permutation_shap.cu +++ b/cpp/src/explainer/permutation_shap.cu @@ -14,6 +14,7 @@ * limitations under the License. 
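The decisiontree.cuh hunk above makes leaf handling robust when a tree was fit with fewer outputs than num_class (most likely a binary classifier trained on all-zero labels): the leaf vector is allocated at num_class, zero-filled, and only the first rf_tree.num_outputs entries are copied. A minimal standalone sketch of that padding, with hypothetical names:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    template <typename T>
    std::vector<T> pad_leaf_vector(const T* leaf_begin, int num_outputs, int num_class)
    {
      assert(num_outputs <= num_class);             // mirrors the ASSERT in the hunk
      std::vector<T> leaf_vector(num_class, T(0));  // zero-fill the full class range
      std::copy(leaf_begin, leaf_begin + num_outputs, leaf_vector.begin());
      return leaf_vector;  // e.g. {0.9, 0.1} with num_class = 3 -> {0.9, 0.1, 0.0}
    }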
*/ +#include #include #include @@ -23,15 +24,15 @@ namespace ML { namespace Explainer { template -__global__ void _fused_tile_scatter_pe(DataT* dataset, - const DataT* background, - IdxT nrows_dataset, - IdxT ncols, - const DataT* obs, - IdxT* idx, - IdxT nrows_background, - IdxT sc_size, - bool row_major) +CUML_KERNEL void _fused_tile_scatter_pe(DataT* dataset, + const DataT* background, + IdxT nrows_dataset, + IdxT ncols, + const DataT* obs, + IdxT* idx, + IdxT nrows_background, + IdxT sc_size, + bool row_major) { // kernel that actually does the scattering as described in the // descriptions of `permutation_dataset` and `shap_main_effect_dataset` @@ -191,10 +192,10 @@ void shap_main_effect_dataset(const raft::handle_t& handle, } template -__global__ void update_perm_shap_values_kernel(DataT* output, - const DataT* input, - const IdxT ncols, - const IdxT* idx) +CUML_KERNEL void update_perm_shap_values_kernel(DataT* output, + const DataT* input, + const IdxT ncols, + const IdxT* idx) { int tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index bce33bf4b8..e62df3e21f 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -314,7 +314,8 @@ template -__global__ void infer_k(storage_type forest, predict_params params); +__attribute__((visibility("hidden"))) __global__ void infer_k(storage_type forest, + predict_params params); // infer() calls the inference kernel with the parameters on the stream template diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 9a42b16de4..e0d2f8baaf 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -20,6 +20,7 @@ creation and prediction (the main inference kernel is defined in infer.cu). */ #include "common.cuh" // for predict_params, storage, storage #include "internal.cuh" // for cat_sets_device_owner, categorical_sets, output_t, +#include #include // for algo_t, #include // for ASSERT @@ -49,13 +50,13 @@ __host__ __device__ real_t sigmoid(real_t x) sigmoid and applying threshold. 
in case of complement_proba, fills in the complement probability */ template -__global__ void transform_k(real_t* preds, - size_t n, - output_t output, - real_t inv_num_trees, - real_t threshold, - real_t global_bias, - bool complement_proba) +CUML_KERNEL void transform_k(real_t* preds, + size_t n, + output_t output, + real_t inv_num_trees, + real_t threshold, + real_t global_bias, + bool complement_proba) { size_t i = threadIdx.x + size_t(blockIdx.x) * blockDim.x; if (i >= n) return; diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 540293c24e..574a0a37e3 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -17,6 +17,7 @@ #include "common.cuh" #include "internal.cuh" +#include #include #include @@ -793,7 +794,7 @@ template -__global__ void infer_k(storage_type forest, predict_params params) +CUML_KERNEL void infer_k(storage_type forest, predict_params params) { using real_t = typename storage_type::real_type; extern __shared__ char smem[]; diff --git a/cpp/src/genetic/genetic.cu b/cpp/src/genetic/genetic.cu index f4c3ae5e01..947220ceb3 100644 --- a/cpp/src/genetic/genetic.cu +++ b/cpp/src/genetic/genetic.cu @@ -18,6 +18,7 @@ #include "node.cuh" #include +#include #include #include #include @@ -55,14 +56,14 @@ namespace genetic { * @param criterion Selection criterion for choices(min/max) * @param parsimony Parsimony coefficient to account for bloat */ -__global__ void batched_tournament_kernel(const program_t progs, - int* win_indices, - const int* seeds, - const int n_progs, - const int n_tours, - const int tour_size, - const int criterion, - const float parsimony) +CUML_KERNEL void batched_tournament_kernel(const program_t progs, + int* win_indices, + const int* seeds, + const int n_progs, + const int n_tours, + const int tour_size, + const int criterion, + const float parsimony) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_tours) return; diff --git a/cpp/src/genetic/program.cu b/cpp/src/genetic/program.cu index 6b172f7874..aa66d1433f 100644 --- a/cpp/src/genetic/program.cu +++ b/cpp/src/genetic/program.cu @@ -20,6 +20,7 @@ #include "reg_stack.cuh" #include +#include #include #include @@ -41,10 +42,10 @@ namespace genetic { * is stored in column major format. */ template -__global__ void execute_kernel(const program_t d_progs, - const float* data, - float* y_pred, - const uint64_t n_rows) +CUML_KERNEL void execute_kernel(const program_t d_progs, + const float* data, + float* y_pred, + const uint64_t n_rows) { uint64_t pid = blockIdx.y; // current program uint64_t row_id = blockIdx.x * blockDim.x + threadIdx.x; // current dataset row @@ -562,4 +563,4 @@ void hoist_mutation(const program& prog, program& p_out, const param& params, st } } // namespace genetic -} // namespace cuml \ No newline at end of file +} // namespace cuml diff --git a/cpp/src/glm/qn/glm_softmax.cuh b/cpp/src/glm/qn/glm_softmax.cuh index cd4a67f9c7..aa8143c22d 100644 --- a/cpp/src/glm/qn/glm_softmax.cuh +++ b/cpp/src/glm/qn/glm_softmax.cuh @@ -19,6 +19,8 @@ #include "glm_base.cuh" #include "simple_mat.cuh" +#include + #include #include @@ -39,7 +41,7 @@ namespace detail { // coalesced reduce, i.e. blocks should take care of columns // TODO split into two kernels for small and large case? 
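The batched_tournament_kernel hunk above documents tournament selection with a parsimony penalty: each tournament draws tour_size candidate programs and keeps the one whose length-penalised fitness is best under the min/max criterion. A hedged host-side sketch of one such tournament (fitness, lengths and the rng are illustrative stand-ins, not cuML API):

    #include <random>
    #include <vector>

    int run_tournament(const std::vector<float>& fitness,
                       const std::vector<int>& lengths,
                       int tour_size, float parsimony, bool minimise,
                       std::mt19937& rng)
    {
      std::uniform_int_distribution<int> pick(0, static_cast<int>(fitness.size()) - 1);
      // penalise long programs to counteract bloat
      auto score = [&](int i) {
        return fitness[i] + (minimise ? 1.0f : -1.0f) * parsimony * static_cast<float>(lengths[i]);
      };
      int best = pick(rng);
      for (int t = 1; t < tour_size; ++t) {
        int cand = pick(rng);
        if (minimise ? score(cand) < score(best) : score(cand) > score(best)) { best = cand; }
      }
      return best;  // index of the winning program
    }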
template -__global__ void logSoftmaxKernel( +CUML_KERNEL void logSoftmaxKernel( T* out, T* dZ, const T* in, const T* labels, int C, int N, bool getDerivative = true) { typedef cub::WarpReduce WarpRed; diff --git a/cpp/src/glm/qn_mg.cu b/cpp/src/glm/qn_mg.cu index 786df4c1ea..0c679c55f4 100644 --- a/cpp/src/glm/qn_mg.cu +++ b/cpp/src/glm/qn_mg.cu @@ -183,42 +183,76 @@ void qnFit_impl(raft::handle_t& handle, input_desc.uniqueRanks().size()); } -std::vector getUniquelabelsMG(const raft::handle_t& handle, - Matrix::PartDescriptor& input_desc, - std::vector*>& labels) +template +std::vector getUniquelabelsMG(const raft::handle_t& handle, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels) { RAFT_EXPECTS(labels.size() == 1, "getUniqueLabelsMG currently does not accept more than one data chunk"); - Matrix::Data* data_y = labels[0]; - int n_rows = input_desc.totalElementsOwnedBy(input_desc.rank); - return distinct_mg(handle, data_y->ptr, n_rows); + Matrix::Data* data_y = labels[0]; + size_t n_rows = input_desc.totalElementsOwnedBy(input_desc.rank); + return distinct_mg(handle, data_y->ptr, n_rows); } +template std::vector getUniquelabelsMG(const raft::handle_t& handle, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels); + +template std::vector getUniquelabelsMG(const raft::handle_t& handle, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels); + +template void qnFit(raft::handle_t& handle, - std::vector*>& input_data, + std::vector*>& input_data, Matrix::PartDescriptor& input_desc, - std::vector*>& labels, - float* coef, + std::vector*>& labels, + T* coef, const qn_params& pams, bool X_col_major, bool standardization, int n_classes, - float* f, + T* f, int* num_iters) { - qnFit_impl(handle, - input_data, - input_desc, - labels, - coef, - pams, - X_col_major, - standardization, - n_classes, - f, - num_iters); + qnFit_impl(handle, + input_data, + input_desc, + labels, + coef, + pams, + X_col_major, + standardization, + n_classes, + f, + num_iters); } +template void qnFit(raft::handle_t& handle, + std::vector*>& input_data, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels, + float* coef, + const qn_params& pams, + bool X_col_major, + bool standardization, + int n_classes, + float* f, + int* num_iters); + +template void qnFit(raft::handle_t& handle, + std::vector*>& input_data, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels, + double* coef, + const qn_params& pams, + bool X_col_major, + bool standardization, + int n_classes, + double* f, + int* num_iters); + template void qnFitSparse_impl(const raft::handle_t& handle, const qn_params& pams, @@ -269,18 +303,19 @@ void qnFitSparse_impl(const raft::handle_t& handle, return; } +template void qnFitSparse(raft::handle_t& handle, - std::vector*>& input_values, + std::vector*>& input_values, int* input_cols, int* input_row_ids, int X_nnz, Matrix::PartDescriptor& input_desc, - std::vector*>& labels, - float* coef, + std::vector*>& labels, + T* coef, const qn_params& pams, bool standardization, int n_classes, - float* f, + T* f, int* num_iters) { RAFT_EXPECTS(input_values.size() == 1, @@ -289,25 +324,53 @@ void qnFitSparse(raft::handle_t& handle, auto data_input_values = input_values[0]; auto data_y = labels[0]; - qnFitSparse_impl(handle, - pams, - data_input_values->ptr, - input_cols, - input_row_ids, - X_nnz, - standardization, - data_y->ptr, - input_desc.totalElementsOwnedBy(input_desc.rank), - input_desc.N, - n_classes, - coef, - f, - num_iters, - input_desc.M, - input_desc.rank, - 
input_desc.uniqueRanks().size()); + qnFitSparse_impl(handle, + pams, + data_input_values->ptr, + input_cols, + input_row_ids, + X_nnz, + standardization, + data_y->ptr, + input_desc.totalElementsOwnedBy(input_desc.rank), + input_desc.N, + n_classes, + coef, + f, + num_iters, + input_desc.M, + input_desc.rank, + input_desc.uniqueRanks().size()); } +template void qnFitSparse(raft::handle_t& handle, + std::vector*>& input_values, + int* input_cols, + int* input_row_ids, + int X_nnz, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels, + float* coef, + const qn_params& pams, + bool standardization, + int n_classes, + float* f, + int* num_iters); + +template void qnFitSparse(raft::handle_t& handle, + std::vector*>& input_values, + int* input_cols, + int* input_row_ids, + int X_nnz, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels, + double* coef, + const qn_params& pams, + bool standardization, + int n_classes, + double* f, + int* num_iters); + }; // namespace opg }; // namespace GLM }; // namespace ML diff --git a/cpp/src/hdbscan/condensed_hierarchy.cu b/cpp/src/hdbscan/condensed_hierarchy.cu index 20f155b012..76f1a19cf8 100644 --- a/cpp/src/hdbscan/condensed_hierarchy.cu +++ b/cpp/src/hdbscan/condensed_hierarchy.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -156,7 +157,7 @@ void CondensedHierarchy::condense(value_idx* full_parents, thrust::cuda::par.on(stream), full_sizes, full_sizes + size, - [=] __device__(value_idx a) { return a != -1; }, + cuda::proclaim_return_type([=] __device__(value_idx a) -> bool { return a != -1; }), 0, thrust::plus()); diff --git a/cpp/src/hdbscan/detail/kernels/condense.cuh b/cpp/src/hdbscan/detail/kernels/condense.cuh index bd8bcabfda..ce57cb75a4 100644 --- a/cpp/src/hdbscan/detail/kernels/condense.cuh +++ b/cpp/src/hdbscan/detail/kernels/condense.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once +#include + namespace ML { namespace HDBSCAN { namespace detail { @@ -82,19 +84,19 @@ __device__ inline value_t get_lambda(value_idx node, value_idx num_points, const * @param out_count children cluster sizes of output dendrogram. */ template -__global__ void condense_hierarchy_kernel(bool* frontier, - bool* next_frontier, - value_t* ignore, - value_idx* relabel, - const value_idx* children, - const value_t* deltas, - const value_idx* sizes, - int n_leaves, - int min_cluster_size, - value_idx* out_parent, - value_idx* out_child, - value_t* out_lambda, - value_idx* out_count) +CUML_KERNEL void condense_hierarchy_kernel(bool* frontier, + bool* next_frontier, + value_t* ignore, + value_idx* relabel, + const value_idx* children, + const value_t* deltas, + const value_idx* sizes, + int n_leaves, + int min_cluster_size, + value_idx* out_parent, + value_idx* out_child, + value_t* out_lambda, + value_idx* out_count) { int node = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/src/hdbscan/detail/kernels/predict.cuh b/cpp/src/hdbscan/detail/kernels/predict.cuh index 181e32ff6b..b089e5e8ad 100644 --- a/cpp/src/hdbscan/detail/kernels/predict.cuh +++ b/cpp/src/hdbscan/detail/kernels/predict.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,14 +22,14 @@ namespace detail { namespace Predict { template -__global__ void min_mutual_reachability_kernel(value_t* input_core_dists, - value_t* prediction_core_dists, - value_t* pairwise_dists, - value_idx* neighbor_indices, - size_t n_prediction_points, - value_idx neighborhood, - value_t* min_mr_dists, - value_idx* min_mr_indices) +CUML_KERNEL void min_mutual_reachability_kernel(value_t* input_core_dists, + value_t* prediction_core_dists, + value_t* pairwise_dists, + value_idx* neighbor_indices, + size_t n_prediction_points, + value_idx neighborhood, + value_t* min_mr_dists, + value_idx* min_mr_indices) { value_idx idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx < value_idx(n_prediction_points)) { @@ -55,18 +55,18 @@ __global__ void min_mutual_reachability_kernel(value_t* input_core_dists, } template -__global__ void cluster_probability_kernel(value_idx* min_mr_indices, - value_t* prediction_lambdas, - value_idx* index_into_children, - value_idx* labels, - value_t* deaths, - value_idx* selected_clusters, - value_idx* parents, - value_t* lambdas, - value_idx n_leaves, - size_t n_prediction_points, - value_idx* predicted_labels, - value_t* cluster_probabilities) +CUML_KERNEL void cluster_probability_kernel(value_idx* min_mr_indices, + value_t* prediction_lambdas, + value_idx* index_into_children, + value_idx* labels, + value_t* deaths, + value_idx* selected_clusters, + value_idx* parents, + value_t* lambdas, + value_idx n_leaves, + size_t n_prediction_points, + value_idx* predicted_labels, + value_t* cluster_probabilities) { value_idx idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx < value_idx(n_prediction_points)) { @@ -99,4 +99,4 @@ __global__ void cluster_probability_kernel(value_idx* min_mr_indices, }; // namespace Predict }; // namespace detail }; // namespace HDBSCAN -}; // namespace ML \ No newline at end of file +}; // namespace ML diff --git a/cpp/src/hdbscan/detail/kernels/select.cuh b/cpp/src/hdbscan/detail/kernels/select.cuh index 058c234f13..c281e8ac57 100644 --- a/cpp/src/hdbscan/detail/kernels/select.cuh +++ b/cpp/src/hdbscan/detail/kernels/select.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
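The min_mutual_reachability_kernel above searches a prediction point's neighborhood for the smallest mutual reachability distance. For reference, the quantity being minimised is, as commonly defined for HDBSCAN (sketch with hypothetical parameter names):

    #include <algorithm>

    // mr(a, b) = max(core_dist(a), core_dist(b), dist(a, b))
    template <typename T>
    T mutual_reachability(T core_a, T core_b, T pairwise)
    {
      return std::max({core_a, core_b, pairwise});
    }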
@@ -33,12 +33,12 @@ namespace Select { * @param[in] n_clusters number of clusters */ template -__global__ void propagate_cluster_negation_kernel(const value_idx* indptr, - const value_idx* children, - int* frontier, - int* next_frontier, - int* is_cluster, - int n_clusters) +CUML_KERNEL void propagate_cluster_negation_kernel(const value_idx* indptr, + const value_idx* children, + int* frontier, + int* next_frontier, + int* is_cluster, + int n_clusters) { int cluster = blockDim.x * blockIdx.x + threadIdx.x; @@ -56,17 +56,17 @@ __global__ void propagate_cluster_negation_kernel(const value_idx* indptr, } template -__global__ void cluster_epsilon_search_kernel(const int* selected_clusters, - const int n_selected_clusters, - const value_idx* parents, - const value_idx* children, - const value_t* lambdas, - const value_idx cluster_tree_edges, - int* is_cluster, - int* frontier, - const int n_clusters, - const value_t cluster_selection_epsilon, - const bool allow_single_cluster) +CUML_KERNEL void cluster_epsilon_search_kernel(const int* selected_clusters, + const int n_selected_clusters, + const value_idx* parents, + const value_idx* children, + const value_t* lambdas, + const value_idx cluster_tree_edges, + int* is_cluster, + int* frontier, + const int n_clusters, + const value_t cluster_selection_epsilon, + const bool allow_single_cluster) { auto selected_cluster_idx = threadIdx.x + blockDim.x * blockIdx.x; diff --git a/cpp/src/hdbscan/detail/kernels/soft_clustering.cuh b/cpp/src/hdbscan/detail/kernels/soft_clustering.cuh index 4d77ce4e50..dd20f9b0f2 100644 --- a/cpp/src/hdbscan/detail/kernels/soft_clustering.cuh +++ b/cpp/src/hdbscan/detail/kernels/soft_clustering.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
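propagate_cluster_negation_kernel above advances one BFS frontier per launch over the cluster tree, clearing is_cluster for the children of every cluster on the current frontier. A hedged serial sketch of the same propagation over a CSR view of the tree (names hypothetical; the device version processes one level per kernel launch rather than the inner while loop):

    #include <vector>

    void propagate_negation(const std::vector<int>& indptr,
                            const std::vector<int>& children,
                            std::vector<int> frontier,     // clusters whose subtrees get deselected
                            std::vector<int>& is_cluster)  // 0/1 flag per cluster
    {
      while (!frontier.empty()) {
        std::vector<int> next;
        for (int c : frontier) {
          for (int e = indptr[c]; e < indptr[c + 1]; ++e) {
            int child = children[e];
            if (is_cluster[child]) { is_cluster[child] = 0; }
            next.push_back(child);
          }
        }
        frontier.swap(next);
      }
    }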
@@ -23,14 +23,14 @@ namespace detail { namespace Predict { template -__global__ void merge_height_kernel(value_t* heights, - value_t* lambdas, - value_idx* index_into_children, - value_idx* parents, - size_t m, - value_idx n_selected_clusters, - raft::util::FastIntDiv n, - value_idx* selected_clusters) +CUML_KERNEL void merge_height_kernel(value_t* heights, + value_t* lambdas, + value_idx* index_into_children, + value_idx* parents, + size_t m, + value_idx n_selected_clusters, + raft::util::FastIntDiv n, + value_idx* selected_clusters) { value_idx idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx < value_idx(m * n_selected_clusters)) { @@ -65,16 +65,16 @@ __global__ void merge_height_kernel(value_t* heights, } template -__global__ void merge_height_kernel(value_t* heights, - value_t* lambdas, - value_t* prediction_lambdas, - value_idx* min_mr_indices, - value_idx* index_into_children, - value_idx* parents, - size_t n_prediction_points, - value_idx n_selected_clusters, - raft::util::FastIntDiv n, - value_idx* selected_clusters) +CUML_KERNEL void merge_height_kernel(value_t* heights, + value_t* lambdas, + value_t* prediction_lambdas, + value_idx* min_mr_indices, + value_idx* index_into_children, + value_idx* parents, + size_t n_prediction_points, + value_idx n_selected_clusters, + raft::util::FastIntDiv n, + value_idx* selected_clusters) { value_idx idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx < value_idx(n_prediction_points * n_selected_clusters)) { diff --git a/cpp/src/hdbscan/detail/select.cuh b/cpp/src/hdbscan/detail/select.cuh index 3bf17c437f..36e674e40b 100644 --- a/cpp/src/hdbscan/detail/select.cuh +++ b/cpp/src/hdbscan/detail/select.cuh @@ -216,13 +216,14 @@ void excess_of_mass(const raft::handle_t& handle, value_t subtree_stability = 0.0; if (indptr_h[node + 1] - indptr_h[node] > 0) { - subtree_stability = thrust::transform_reduce( - exec_policy, - children + indptr_h[node], - children + indptr_h[node + 1], - [=] __device__(value_idx a) { return stability[a]; }, - 0.0, - thrust::plus()); + subtree_stability = + thrust::transform_reduce(exec_policy, + children + indptr_h[node], + children + indptr_h[node + 1], + cuda::proclaim_return_type( + [=] __device__(value_idx a) -> value_t { return stability[a]; }), + 0.0, + thrust::plus()); } if (subtree_stability > node_stability || cluster_sizes_h[node] > max_cluster_size) { diff --git a/cpp/src/hdbscan/detail/utils.h b/cpp/src/hdbscan/detail/utils.h index 092dc2e673..b151628429 100644 --- a/cpp/src/hdbscan/detail/utils.h +++ b/cpp/src/hdbscan/detail/utils.h @@ -114,7 +114,7 @@ Common::CondensedHierarchy make_cluster_tree( thrust_policy, sizes, sizes + condensed_tree.get_n_edges(), - [=] __device__(value_idx a) { return a > 1; }, + cuda::proclaim_return_type([=] __device__(value_idx a) -> bool { return a > 1; }), 0, thrust::plus()); diff --git a/cpp/src/holtwinters/internal/hw_decompose.cuh b/cpp/src/holtwinters/internal/hw_decompose.cuh index 1c78a7af62..f0166c4834 100644 --- a/cpp/src/holtwinters/internal/hw_decompose.cuh +++ b/cpp/src/holtwinters/internal/hw_decompose.cuh @@ -16,6 +16,8 @@ #pragma once +#include + #include #include // #TODO: Replace with public header when ready @@ -31,12 +33,12 @@ // optimize, maybe im2col ? 
// https://github.com/rapidsai/cuml/issues/891 template -__global__ void conv1d_kernel(const Dtype* input, - int batch_size, - const Dtype* filter, - int filter_size, - Dtype* output, - int output_size) +CUML_KERNEL void conv1d_kernel(const Dtype* input, + int batch_size, + const Dtype* filter, + int filter_size, + Dtype* output, + int output_size) { const int tid = GET_TID; if (tid < batch_size) { @@ -68,13 +70,13 @@ void conv1d(const raft::handle_t& handle, // https://github.com/rapidsai/cuml/issues/891 template -__global__ void season_mean_kernel(const Dtype* season, - int len, - int batch_size, - Dtype* start_season, - int frequency, - int half_filter_size, - bool ADDITIVE_KERNEL) +CUML_KERNEL void season_mean_kernel(const Dtype* season, + int len, + int batch_size, + Dtype* start_season, + int frequency, + int half_filter_size, + bool ADDITIVE_KERNEL) { int tid = GET_TID; if (tid < batch_size) { @@ -120,7 +122,7 @@ void season_mean(const raft::handle_t& handle, } template -__global__ void RinvKernel(const Dtype* A, Dtype* Rinv, int trend_len) +CUML_KERNEL void RinvKernel(const Dtype* A, Dtype* Rinv, int trend_len) { // Inverse of R (2x2 upper triangular matrix) int tid = GET_TID; @@ -135,7 +137,7 @@ __global__ void RinvKernel(const Dtype* A, Dtype* Rinv, int trend_len) } template -__global__ void batched_ls_solver_kernel( +CUML_KERNEL void batched_ls_solver_kernel( const Dtype* B, const Dtype* rq, int batch_size, int len, Dtype* level, Dtype* trend) { int tid = GET_TID; diff --git a/cpp/src/holtwinters/internal/hw_eval.cuh b/cpp/src/holtwinters/internal/hw_eval.cuh index 0115be99fc..75ad65f536 100644 --- a/cpp/src/holtwinters/internal/hw_eval.cuh +++ b/cpp/src/holtwinters/internal/hw_eval.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
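RinvKernel above computes, per batch member, the inverse of the 2x2 upper-triangular R factor consumed by the batched least-squares solver kernel that follows it. The closed form it relies on is simple enough to state as a scalar sketch:

    // For R = [[a, b], [0, c]],  R^-1 = [[1/a, -b/(a*c)], [0, 1/c]].
    template <typename T>
    void inv_upper_triangular_2x2(T a, T b, T c, T inv[4])
    {
      inv[0] = T(1) / a;
      inv[1] = -b / (a * c);
      inv[2] = T(0);
      inv[3] = T(1) / c;
    }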
@@ -105,22 +105,22 @@ __device__ Dtype holtwinters_eval_device(int tid, } template -__global__ void holtwinters_eval_gpu_shared_kernel(const Dtype* ts, - int n, - int batch_size, - int frequency, - const Dtype* start_level, - const Dtype* start_trend, - const Dtype* start_season, - const Dtype* alpha, - const Dtype* beta, - const Dtype* gamma, - Dtype* level, - Dtype* trend, - Dtype* season, - Dtype* xhat, - Dtype* error, - bool additive_seasonal) +CUML_KERNEL void holtwinters_eval_gpu_shared_kernel(const Dtype* ts, + int n, + int batch_size, + int frequency, + const Dtype* start_level, + const Dtype* start_trend, + const Dtype* start_season, + const Dtype* alpha, + const Dtype* beta, + const Dtype* gamma, + Dtype* level, + Dtype* trend, + Dtype* season, + Dtype* xhat, + Dtype* error, + bool additive_seasonal) { int tid = GET_TID; extern __shared__ __align__(sizeof(Dtype)) unsigned char pseason_[]; @@ -167,23 +167,23 @@ __global__ void holtwinters_eval_gpu_shared_kernel(const Dtype* ts, } template -__global__ void holtwinters_eval_gpu_global_kernel(const Dtype* ts, - int n, - int batch_size, - int frequency, - const Dtype* start_level, - const Dtype* start_trend, - const Dtype* start_season, - Dtype* pseason, - const Dtype* alpha, - const Dtype* beta, - const Dtype* gamma, - Dtype* level, - Dtype* trend, - Dtype* season, - Dtype* xhat, - Dtype* error, - bool additive_seasonal) +CUML_KERNEL void holtwinters_eval_gpu_global_kernel(const Dtype* ts, + int n, + int batch_size, + int frequency, + const Dtype* start_level, + const Dtype* start_trend, + const Dtype* start_season, + Dtype* pseason, + const Dtype* alpha, + const Dtype* beta, + const Dtype* gamma, + Dtype* level, + Dtype* trend, + Dtype* season, + Dtype* xhat, + Dtype* error, + bool additive_seasonal) { int tid = GET_TID; diff --git a/cpp/src/holtwinters/internal/hw_forecast.cuh b/cpp/src/holtwinters/internal/hw_forecast.cuh index eb69f650af..072029271c 100644 --- a/cpp/src/holtwinters/internal/hw_forecast.cuh +++ b/cpp/src/holtwinters/internal/hw_forecast.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
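The Holt-Winters eval kernels above iterate the level/trend/season recurrences per series. As a hedged reference only, the textbook additive update they correspond to is shown below for a single step with a circular seasonal buffer; the exact in-kernel bookkeeping may differ:

    // level_t  = alpha * (y_t - season_{t-m}) + (1 - alpha) * (level_{t-1} + trend_{t-1})
    // trend_t  = beta  * (level_t - level_{t-1}) + (1 - beta) * trend_{t-1}
    // season_t = gamma * (y_t - level_t) + (1 - gamma) * season_{t-m}
    template <typename T>
    void holt_winters_additive_step(T y, T& level, T& trend, T* season, int t, int m,
                                    T alpha, T beta, T gamma)
    {
      T prev_level  = level;
      T s_old       = season[t % m];  // still holds season_{t-m} before being overwritten
      level         = alpha * (y - s_old) + (T(1) - alpha) * (prev_level + trend);
      trend         = beta * (level - prev_level) + (T(1) - beta) * trend;
      season[t % m] = gamma * (y - level) + (T(1) - gamma) * s_old;
    }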
@@ -18,14 +18,14 @@ #include "hw_utils.cuh" template -__global__ void holtwinters_seasonal_forecast_kernel(Dtype* forecast, - int h, - int batch_size, - int frequency, - const Dtype* level_coef, - const Dtype* trend_coef, - const Dtype* season_coef, - bool additive) +CUML_KERNEL void holtwinters_seasonal_forecast_kernel(Dtype* forecast, + int h, + int batch_size, + int frequency, + const Dtype* level_coef, + const Dtype* trend_coef, + const Dtype* season_coef, + bool additive) { int tid = GET_TID; if (tid < batch_size) { @@ -42,7 +42,7 @@ __global__ void holtwinters_seasonal_forecast_kernel(Dtype* forecast, } template -__global__ void holtwinters_nonseasonal_forecast_kernel( +CUML_KERNEL void holtwinters_nonseasonal_forecast_kernel( Dtype* forecast, int h, int batch_size, const Dtype* level_coef, const Dtype* trend_coef) { int tid = GET_TID; @@ -55,10 +55,10 @@ __global__ void holtwinters_nonseasonal_forecast_kernel( } template -__global__ void holtwinters_level_forecast_kernel(Dtype* forecast, - int h, - int batch_size, - const Dtype* level_coef) +CUML_KERNEL void holtwinters_level_forecast_kernel(Dtype* forecast, + int h, + int batch_size, + const Dtype* level_coef) { int tid = GET_TID; if (tid < batch_size) { diff --git a/cpp/src/holtwinters/internal/hw_optim.cuh b/cpp/src/holtwinters/internal/hw_optim.cuh index 164fdd7c50..1fa3ad34b3 100644 --- a/cpp/src/holtwinters/internal/hw_optim.cuh +++ b/cpp/src/holtwinters/internal/hw_optim.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -595,28 +595,28 @@ __device__ ML::OptimCriterion holtwinters_bfgs_optim_device( } template -__global__ void holtwinters_optim_gpu_shared_kernel(const Dtype* ts, - int n, - int batch_size, - int frequency, - const Dtype* start_level, - const Dtype* start_trend, - const Dtype* start_season, - Dtype* alpha, - bool optim_alpha, - Dtype* beta, - bool optim_beta, - Dtype* gamma, - bool optim_gamma, - Dtype* level, - Dtype* trend, - Dtype* season, - Dtype* xhat, - Dtype* error, - ML::OptimCriterion* optim_result, - const ML::OptimParams optim_params, - bool ADDITIVE_KERNEL, - bool single_param) +CUML_KERNEL void holtwinters_optim_gpu_shared_kernel(const Dtype* ts, + int n, + int batch_size, + int frequency, + const Dtype* start_level, + const Dtype* start_trend, + const Dtype* start_season, + Dtype* alpha, + bool optim_alpha, + Dtype* beta, + bool optim_beta, + Dtype* gamma, + bool optim_gamma, + Dtype* level, + Dtype* trend, + Dtype* season, + Dtype* xhat, + Dtype* error, + ML::OptimCriterion* optim_result, + const ML::OptimParams optim_params, + bool ADDITIVE_KERNEL, + bool single_param) { int tid = GET_TID; extern __shared__ __align__(sizeof(Dtype)) unsigned char pseason_[]; @@ -718,29 +718,29 @@ __global__ void holtwinters_optim_gpu_shared_kernel(const Dtype* ts, } template -__global__ void holtwinters_optim_gpu_global_kernel(const Dtype* ts, - int n, - int batch_size, - int frequency, - const Dtype* start_level, - const Dtype* start_trend, - const Dtype* start_season, - Dtype* pseason, - Dtype* alpha, - bool optim_alpha, - Dtype* beta, - bool optim_beta, - Dtype* gamma, - bool optim_gamma, - Dtype* level, - Dtype* trend, - Dtype* season, - Dtype* xhat, - Dtype* error, - ML::OptimCriterion* optim_result, - const ML::OptimParams optim_params, - bool ADDITIVE_KERNEL, - bool single_param) +CUML_KERNEL void 
holtwinters_optim_gpu_global_kernel(const Dtype* ts, + int n, + int batch_size, + int frequency, + const Dtype* start_level, + const Dtype* start_trend, + const Dtype* start_season, + Dtype* pseason, + Dtype* alpha, + bool optim_alpha, + Dtype* beta, + bool optim_beta, + Dtype* gamma, + bool optim_gamma, + Dtype* level, + Dtype* trend, + Dtype* season, + Dtype* xhat, + Dtype* error, + ML::OptimCriterion* optim_result, + const ML::OptimParams optim_params, + bool ADDITIVE_KERNEL, + bool single_param) { int tid = GET_TID; if (tid < batch_size) { diff --git a/cpp/src/knn/knn_opg_common.cuh b/cpp/src/knn/knn_opg_common.cuh index 495b3d5a2e..188244d643 100644 --- a/cpp/src/knn/knn_opg_common.cuh +++ b/cpp/src/knn/knn_opg_common.cuh @@ -474,13 +474,13 @@ void perform_local_knn(opg_knn_param& params, * @param[in] n_labels number of labels to write (batch_size * n_outputs) */ template -__global__ void copy_label_outputs_from_index_parts_kernel(out_t* out, - ind_t* knn_indices, - out_t** parts, - uint64_t* offsets, - size_t cur_batch_size, - int n_parts, - int n_labels) +CUML_KERNEL void copy_label_outputs_from_index_parts_kernel(out_t* out, + ind_t* knn_indices, + out_t** parts, + uint64_t* offsets, + size_t cur_batch_size, + int n_parts, + int n_labels) { uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x; if (i >= n_labels) return; @@ -790,17 +790,17 @@ void reduce(opg_knn_param& params, * @param[in] n_ranks number of index ranks */ template -__global__ void merge_labels_kernel(out_t* outputs, - dist_t* knn_indices, - out_t* unmerged_outputs, - dist_t* unmerged_knn_indices, - size_t* offsets, - int* parts_to_ranks, - int nearest_neighbors, - int n_outputs, - int n_labels, - int n_parts, - int n_ranks) +CUML_KERNEL void merge_labels_kernel(out_t* outputs, + dist_t* knn_indices, + out_t* unmerged_outputs, + dist_t* unmerged_knn_indices, + size_t* offsets, + int* parts_to_ranks, + int nearest_neighbors, + int n_outputs, + int n_labels, + int n_parts, + int n_ranks) { uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x; if (i >= n_labels) return; diff --git a/cpp/src/random_projection/rproj_utils.cuh b/cpp/src/random_projection/rproj_utils.cuh index 38abdca84f..386f5bef8c 100644 --- a/cpp/src/random_projection/rproj_utils.cuh +++ b/cpp/src/random_projection/rproj_utils.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -55,7 +56,7 @@ inline void sample_without_replacement(size_t n_population, } } -__global__ void sum_bools(bool* in_bools, int n, int* out_val) +CUML_KERNEL void sum_bools(bool* in_bools, int n, int* out_val) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n) { diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 784420baf5..14e76c8f38 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -264,7 +264,8 @@ void build_treelite_forest(TreeliteModelHandle* model_handle, } if constexpr (std::is_integral_v) { - ASSERT(num_outputs > 1, "More than one variable expected for classification problem."); + num_outputs = std::max(num_outputs, 2); + // Ensure that num_outputs is at least 2 model->task_type = tl::TaskType::kMultiClf; model->postprocessor = "identity_multiclass"; } else { diff --git a/cpp/src/solver/cd.cuh b/cpp/src/solver/cd.cuh index 5926a7e41d..14aeecd226 100644 --- a/cpp/src/solver/cd.cuh +++ b/cpp/src/solver/cd.cuh @@ -18,6 +18,7 @@ #include "shuffle.h" +#include #include #include @@ -68,10 +69,10 @@ struct ConvState { * @param[in] l1_alpha L1 regularization coef */ template 
-__global__ void __launch_bounds__(1, 1) cdUpdateCoefKernel(math_t* coefLoc, - const math_t* squaredLoc, - ConvState* convStateLoc, - const math_t l1_alpha) +CUML_KERNEL void __launch_bounds__(1, 1) cdUpdateCoefKernel(math_t* coefLoc, + const math_t* squaredLoc, + ConvState* convStateLoc, + const math_t l1_alpha) { auto coef = *coefLoc; auto r = coef > l1_alpha ? coef - l1_alpha : (coef < -l1_alpha ? coef + l1_alpha : 0); diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh index a7c8ca7c3b..cbaec2b386 100644 --- a/cpp/src/svm/kernelcache.cuh +++ b/cpp/src/svm/kernelcache.cuh @@ -60,7 +60,7 @@ namespace { // unnamed namespace to avoid multiple definition error * @param [in] n_ws_perm array with indices of vectors in the working set, size [n_ws] * @param [out] out array with workspace idx to column idx mapping, size [n_ws] */ -__global__ void mapColumnIndicesToSVRSpace( +CUML_KERNEL void mapColumnIndicesToSVRSpace( const int* ws, int n_ws, int n_rows, const int* n_ws_perm, int* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu index 55870e766e..ac1d561ed0 100644 --- a/cpp/src/svm/linear.cu +++ b/cpp/src/svm/linear.cu @@ -16,6 +16,7 @@ #include +#include #include #include #include @@ -64,7 +65,7 @@ inline int narrowDown(std::size_t n) /** The cuda kernel for classification. Call it via PredictClass::run(..). */ template -__global__ void predictClass( +CUML_KERNEL void predictClass( T* out, const T* z, const T* classes, const int nRows, const int coefCols) { const int i = threadIdx.y + blockIdx.y * BY; @@ -131,7 +132,7 @@ struct PredictClass { /** The cuda kernel for classification. Call it via PredictProba::run(..). */ template -__global__ void predictProba(T* out, const T* z, const int nRows, const int nClasses) +CUML_KERNEL void predictProba(T* out, const T* z, const int nRows, const int nClasses) { typedef cub::WarpReduce WarpRed; __shared__ typename WarpRed::TempStorage shm[BY]; diff --git a/cpp/src/svm/results.cuh b/cpp/src/svm/results.cuh index a48baa23ad..f33e8c4552 100644 --- a/cpp/src/svm/results.cuh +++ b/cpp/src/svm/results.cuh @@ -29,8 +29,10 @@ #include #include +#include #include #include +#include #include @@ -44,7 +46,7 @@ namespace ML { namespace SVM { template -__global__ void set_flag(bool* flag, const math_t* alpha, int n, Lambda op) +CUML_KERNEL void set_flag(bool* flag, const math_t* alpha, int n, Lambda op) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) flag[tid] = op(alpha[tid]); @@ -150,8 +152,8 @@ class Results { // allow ~1GB dense support matrix if (isDenseType() || ((size_t)n_support * n_cols * sizeof(math_t) < (1 << 30))) { - support_matrix.data = - (math_t*)rmm_alloc->allocate(n_support * n_cols * sizeof(math_t), stream); + support_matrix.data = (math_t*)rmm_alloc.allocate_async( + n_support * n_cols * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); ML::SVM::extractRows(matrix, support_matrix.data, idx, n_support, handle); } else { ML::SVM::extractRows(matrix, @@ -208,7 +210,8 @@ class Results { // Return only the non-zero coefficients auto select_op = [] __device__(math_t a) { return 0 != a; }; *n_support = SelectByCoef(val_tmp, n_rows, val_tmp, select_op, val_selected.data()); - *dual_coefs = (math_t*)rmm_alloc->allocate(*n_support * sizeof(math_t), stream); + *dual_coefs = (math_t*)rmm_alloc.allocate_async( + *n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); raft::copy(*dual_coefs, val_selected.data(), *n_support, stream); 
handle.sync_stream(stream); } @@ -225,7 +228,8 @@ class Results { { auto select_op = [] __device__(math_t a) -> bool { return 0 != a; }; SelectByCoef(coef, n_rows, f_idx.data(), select_op, idx_selected.data()); - int* idx = (int*)rmm_alloc->allocate(n_support * sizeof(int), stream); + int* idx = (int*)rmm_alloc.allocate_async( + n_support * sizeof(int), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); raft::copy(idx, idx_selected.data(), n_support, stream); return idx; } @@ -297,7 +301,7 @@ class Results { return n_selected; } - rmm::mr::device_memory_resource* rmm_alloc; + rmm::device_async_resource_ref rmm_alloc; private: const raft::handle_t& handle; diff --git a/cpp/src/svm/smoblocksolve.cuh b/cpp/src/svm/smoblocksolve.cuh index f3c2bbd003..20de4cd94b 100644 --- a/cpp/src/svm/smoblocksolve.cuh +++ b/cpp/src/svm/smoblocksolve.cuh @@ -145,19 +145,19 @@ namespace SVM { * @param [in] svmType type of the SVM problem to solve */ template -__global__ __launch_bounds__(WSIZE) void SmoBlockSolve(math_t* y_array, - int n_train, - math_t* alpha, - int n_ws, - math_t* delta_alpha, - math_t* f_array, - const math_t* kernel, - const int* ws_idx, - const math_t* C_vec, - math_t eps, - math_t* return_buff, - int max_iter = 10000, - SvmType svmType = C_SVC) +CUML_KERNEL __launch_bounds__(WSIZE) void SmoBlockSolve(math_t* y_array, + int n_train, + math_t* alpha, + int n_ws, + math_t* delta_alpha, + math_t* f_array, + const math_t* kernel, + const int* ws_idx, + const math_t* C_vec, + math_t eps, + math_t* return_buff, + int max_iter = 10000, + SvmType svmType = C_SVC) { typedef MLCommon::Selection::KVPair Pair; typedef cub::BlockReduce BlockReduce; diff --git a/cpp/src/svm/smosolver.cuh b/cpp/src/svm/smosolver.cuh index a819d6b779..384c6c236b 100644 --- a/cpp/src/svm/smosolver.cuh +++ b/cpp/src/svm/smosolver.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include "smosolver.h" // #TODO: Replace with public header when ready #include "kernelcache.cuh" @@ -55,235 +55,204 @@ namespace ML { namespace SVM { +template +void SmoSolver::GetNonzeroDeltaAlpha(const math_t* vec, + int n_ws, + const int* idx, + math_t* nz_vec, + int* n_nz, + int* nz_idx, + cudaStream_t stream) +{ + thrust::device_ptr vec_ptr(const_cast(vec)); + thrust::device_ptr nz_vec_ptr(nz_vec); + thrust::device_ptr idx_ptr(const_cast(idx)); + thrust::device_ptr nz_idx_ptr(nz_idx); + auto nonzero = [] __device__(math_t a) { return a != 0; }; + thrust::device_ptr nz_end = thrust::copy_if( + thrust::cuda::par.on(stream), idx_ptr, idx_ptr + n_ws, vec_ptr, nz_idx_ptr, nonzero); + *n_nz = nz_end - nz_idx_ptr; + thrust::copy_if(thrust::cuda::par.on(stream), vec_ptr, vec_ptr + n_ws, nz_vec_ptr, nonzero); +} + /** - * @brief Solve the quadratic optimization problem using two level decomposition - * and Sequential Minimal Optimization (SMO). - * - * The general decomposition idea by Osuna is to choose q examples from all the - * training examples, and solve the QP problem for this subset (discussed in - * section 11.2 by Joachims [1]). SMO is the extreme case where we choose q=2. + * @brief Solve the quadratic optimization problem. * - * Here we follow [2] and [3] and use two level decomposition. First we set - * q_1=1024, and solve the QP sub-problem for that (let's call it QP1). This is - * the outer iteration, implemented in SmoSolver::Solve. + * The output arrays (dual_coefs, support_matrix, idx) will be allocated on the + * device, they should be unallocated on entry. 
* - * To solve QP1, we use another decomposition, specifically the SMO (q_2 = 2), - * which is implemented in SmoBlockSolve. - * - * References: - * - [1] Joachims, T. Making large-scale support vector machine learning - * practical. In B. Scholkopf, C. Burges, & A. Smola (Eds.), Advances in - * kernel methods: Support vector machines. Cambridge, MA: MIT Press (1998) - * - [2] J. Vanek et al. A GPU-Architecture Optimized Hierarchical Decomposition - * Algorithm for Support VectorMachine Training, IEEE Transactions on - * Parallel and Distributed Systems, vol 28, no 12, 3330, (2017) - * - [3] Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal - * of Machine Learning Research, 19, 1-5 (2018) + * @param [in] matrix training vectors in matrix format(MLCommon::Matrix::Matrix), + * size [n_rows x * n_cols] + * @param [in] n_rows number of rows (training vectors) + * @param [in] n_cols number of columns (features) + * @param [in] y labels (values +/-1), size [n_rows] + * @param [in] sample_weight device array of sample weights (or nullptr if not + * applicable) + * @param [out] dual_coefs size [n_support] on exit + * @param [out] n_support number of support vectors + * @param [out] support_matrix support vectors in matrix format, size [n_support, n_cols] + * @param [out] idx the original training set indices of the support vectors, size [n_support] + * @param [out] b scalar constant for the decision function + * @param [in] max_outer_iter maximum number of outer iteration (default 100 * n_rows) + * @param [in] max_inner_iter maximum number of inner iterations (default 10000) */ template -class SmoSolver { - public: - SmoSolver(const raft::handle_t& handle, - SvmParameter param, - raft::distance::kernels::KernelType kernel_type, - raft::distance::kernels::GramMatrixBase* kernel) - : handle(handle), - C(param.C), - tol(param.tol), - kernel(kernel), - kernel_type(kernel_type), - cache_size(param.cache_size), - nochange_steps(param.nochange_steps), - epsilon(param.epsilon), - svmType(param.svmType), - stream(handle.get_stream()), - return_buff(2, stream), - alpha(0, stream), - C_vec(0, stream), - delta_alpha(0, stream), - f(0, stream), - y_label(0, stream) - { - ML::Logger::get().setLevel(param.verbosity); - } - - void GetNonzeroDeltaAlpha(const math_t* vec, - int n_ws, - const int* idx, - math_t* nz_vec, - int* n_nz, - int* nz_idx, - cudaStream_t stream) - { - thrust::device_ptr vec_ptr(const_cast(vec)); - thrust::device_ptr nz_vec_ptr(nz_vec); - thrust::device_ptr idx_ptr(const_cast(idx)); - thrust::device_ptr nz_idx_ptr(nz_idx); - auto nonzero = [] __device__(math_t a) { return a != 0; }; - thrust::device_ptr nz_end = thrust::copy_if( - thrust::cuda::par.on(stream), idx_ptr, idx_ptr + n_ws, vec_ptr, nz_idx_ptr, nonzero); - *n_nz = nz_end - nz_idx_ptr; - thrust::copy_if(thrust::cuda::par.on(stream), vec_ptr, vec_ptr + n_ws, nz_vec_ptr, nonzero); - } - -#define SMO_WS_SIZE 1024 - /** - * @brief Solve the quadratic optimization problem. - * - * The output arrays (dual_coefs, support_matrix, idx) will be allocated on the - * device, they should be unallocated on entry. 
- * - * @param [in] matrix training vectors in matrix format(MLCommon::Matrix::Matrix), - * size [n_rows x * n_cols] - * @param [in] n_rows number of rows (training vectors) - * @param [in] n_cols number of columns (features) - * @param [in] y labels (values +/-1), size [n_rows] - * @param [in] sample_weight device array of sample weights (or nullptr if not - * applicable) - * @param [out] dual_coefs size [n_support] on exit - * @param [out] n_support number of support vectors - * @param [out] support_matrix support vectors in matrix format, size [n_support, n_cols] - * @param [out] idx the original training set indices of the support vectors, size [n_support] - * @param [out] b scalar constant for the decision function - * @param [in] max_outer_iter maximum number of outer iteration (default 100 * n_rows) - * @param [in] max_inner_iter maximum number of inner iterations (default 10000) - */ - template - void Solve(MatrixViewType matrix, - int n_rows, - int n_cols, - math_t* y, - const math_t* sample_weight, - math_t** dual_coefs, - int* n_support, - SupportStorage* support_matrix, - int** idx, - math_t* b, - int max_outer_iter = -1, - int max_inner_iter = 10000) - { - // Prepare data structures for SMO - WorkingSet ws(handle, stream, n_rows, SMO_WS_SIZE, svmType); - n_ws = ws.GetSize(); - Initialize(&y, sample_weight, n_rows, n_cols); - KernelCache cache( - handle, matrix, n_rows, n_cols, n_ws, kernel, kernel_type, cache_size, svmType); - - // Init counters - max_outer_iter = GetDefaultMaxIter(n_train, max_outer_iter); - n_iter = 0; - int n_inner_iter = 0; - diff_prev = 0; - n_small_diff = 0; - n_increased_diff = 0; - report_increased_diff = true; - bool keep_going = true; - - rmm::device_uvector nz_da(n_ws, stream); - rmm::device_uvector nz_da_idx(n_ws, stream); - - while (n_iter < max_outer_iter && keep_going) { - RAFT_CUDA_TRY(cudaMemsetAsync(delta_alpha.data(), 0, n_ws * sizeof(math_t), stream)); - raft::common::nvtx::push_range("SmoSolver::ws_select"); - ws.Select(f.data(), alpha.data(), y, C_vec.data()); - raft::common::nvtx::pop_range(); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - raft::common::nvtx::push_range("SmoSolver::Kernel"); - - cache.InitWorkingSet(ws.GetIndices()); - - math_t* cacheTile = cache.getSquareTileWithoutCaching(); - - raft::common::nvtx::pop_range(); - raft::common::nvtx::push_range("SmoSolver::SmoBlockSolve"); - SmoBlockSolve<<<1, n_ws, 0, stream>>>(y, - n_train, - alpha.data(), - n_ws, - delta_alpha.data(), - f.data(), - cacheTile, - cache.getKernelIndices(true), - C_vec.data(), - tol, - return_buff.data(), - max_inner_iter, - svmType); - - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - raft::update_host(host_return_buff, return_buff.data(), 2, stream); - raft::common::nvtx::pop_range(); - raft::common::nvtx::push_range("SmoSolver::UpdateF"); - raft::common::nvtx::push_range("SmoSolver::UpdateF::getNnzDaRows"); - int nnz_da; - GetNonzeroDeltaAlpha(delta_alpha.data(), - n_ws, - cache.getKernelIndices(false), - nz_da.data(), - &nnz_da, - nz_da_idx.data(), - stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // The following should be performed only for elements with nonzero delta_alpha - if (nnz_da > 0) { - auto batch_descriptor = cache.InitFullTileBatching(nz_da_idx.data(), nnz_da); - - while (cache.getNextBatchKernel(batch_descriptor)) { - raft::common::nvtx::pop_range(); - raft::common::nvtx::push_range("SmoSolver::UpdateF::updateBatch"); - // do (partial) update - UpdateF(f.data() + batch_descriptor.offset, - batch_descriptor.batch_size, - nz_da.data(), - nnz_da, 
- batch_descriptor.kernel_data); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } +template +void SmoSolver::Solve(MatrixViewType matrix, + int n_rows, + int n_cols, + math_t* y, + const math_t* sample_weight, + math_t** dual_coefs, + int* n_support, + SupportStorage* support_matrix, + int** idx, + math_t* b, + int max_outer_iter, + int max_inner_iter) +{ + constexpr const int SMO_WS_SIZE = 1024; + // Prepare data structures for SMO + WorkingSet ws(handle, stream, n_rows, SMO_WS_SIZE, svmType); + n_ws = ws.GetSize(); + Initialize(&y, sample_weight, n_rows, n_cols); + KernelCache cache( + handle, matrix, n_rows, n_cols, n_ws, kernel, kernel_type, cache_size, svmType); + + // Init counters + max_outer_iter = GetDefaultMaxIter(n_train, max_outer_iter); + n_iter = 0; + int n_inner_iter = 0; + diff_prev = 0; + n_small_diff = 0; + n_increased_diff = 0; + report_increased_diff = true; + bool keep_going = true; + + rmm::device_uvector nz_da(n_ws, stream); + rmm::device_uvector nz_da_idx(n_ws, stream); + + while (n_iter < max_outer_iter && keep_going) { + RAFT_CUDA_TRY(cudaMemsetAsync(delta_alpha.data(), 0, n_ws * sizeof(math_t), stream)); + raft::common::nvtx::push_range("SmoSolver::ws_select"); + ws.Select(f.data(), alpha.data(), y, C_vec.data()); + raft::common::nvtx::pop_range(); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::common::nvtx::push_range("SmoSolver::Kernel"); + + cache.InitWorkingSet(ws.GetIndices()); + + math_t* cacheTile = cache.getSquareTileWithoutCaching(); + + raft::common::nvtx::pop_range(); + raft::common::nvtx::push_range("SmoSolver::SmoBlockSolve"); + SmoBlockSolve<<<1, n_ws, 0, stream>>>(y, + n_train, + alpha.data(), + n_ws, + delta_alpha.data(), + f.data(), + cacheTile, + cache.getKernelIndices(true), + C_vec.data(), + tol, + return_buff.data(), + max_inner_iter, + svmType); + + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + raft::update_host(host_return_buff, return_buff.data(), 2, stream); + raft::common::nvtx::pop_range(); + raft::common::nvtx::push_range("SmoSolver::UpdateF"); + raft::common::nvtx::push_range("SmoSolver::UpdateF::getNnzDaRows"); + int nnz_da; + GetNonzeroDeltaAlpha(delta_alpha.data(), + n_ws, + cache.getKernelIndices(false), + nz_da.data(), + &nnz_da, + nz_da_idx.data(), + stream); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // The following should be performed only for elements with nonzero delta_alpha + if (nnz_da > 0) { + auto batch_descriptor = cache.InitFullTileBatching(nz_da_idx.data(), nnz_da); + + while (cache.getNextBatchKernel(batch_descriptor)) { + raft::common::nvtx::pop_range(); + raft::common::nvtx::push_range("SmoSolver::UpdateF::updateBatch"); + // do (partial) update + UpdateF(f.data() + batch_descriptor.offset, + batch_descriptor.batch_size, + nz_da.data(), + nnz_da, + batch_descriptor.kernel_data); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } - handle.sync_stream(stream); - raft::common::nvtx::pop_range(); - raft::common::nvtx::pop_range(); // ("SmoSolver::UpdateF"); - - math_t diff = host_return_buff[0]; - keep_going = CheckStoppingCondition(diff); - n_inner_iter += host_return_buff[1]; - n_iter++; - if (n_iter % 500 == 0) { CUML_LOG_DEBUG("SMO iteration %d, diff %lf", n_iter, (double)diff); } } + handle.sync_stream(stream); + raft::common::nvtx::pop_range(); + raft::common::nvtx::pop_range(); // ("SmoSolver::UpdateF"); + + math_t diff = host_return_buff[0]; + keep_going = CheckStoppingCondition(diff); + n_inner_iter += host_return_buff[1]; + n_iter++; + if (n_iter % 500 == 0) { CUML_LOG_DEBUG("SMO iteration %d, diff %lf", n_iter, 
(double)diff); } + } - CUML_LOG_DEBUG( - "SMO solver finished after %d outer iterations, total inner %d" - " iterations, and diff %lf", - n_iter, - n_inner_iter, - diff_prev); + CUML_LOG_DEBUG( + "SMO solver finished after %d outer iterations, total inner %d" + " iterations, and diff %lf", + n_iter, + n_inner_iter, + diff_prev); - Results res(handle, matrix, n_rows, n_cols, y, C_vec.data(), svmType); - res.Get(alpha.data(), f.data(), dual_coefs, n_support, idx, support_matrix, b); + Results res(handle, matrix, n_rows, n_cols, y, C_vec.data(), svmType); + res.Get(alpha.data(), f.data(), dual_coefs, n_support, idx, support_matrix, b); - ReleaseBuffers(); - } + ReleaseBuffers(); +} - /** - * @brief Update the f vector after a block solve step. - * - * \f[ f_i = f_i + \sum_{k\in WS} K_{i,k} * \Delta \alpha_k, \f] - * where i = [0..n_train-1], WS is the set of workspace indices, - * and \f$K_{i,k}\f$ is the kernel function evaluated for training vector x_i and workspace vector - * x_k. - * - * @param f size [n_train] - * @param n_rows - * @param delta_alpha size [n_ws] - * @param n_ws - * @param cacheTile kernel function evaluated for the following set K[X,x_ws], - * size [n_rows, n_ws] - */ - void UpdateF(math_t* f, int n_rows, const math_t* delta_alpha, int n_ws, const math_t* cacheTile) - { - // multipliers used in the equation : f = 1*cachtile * delta_alpha + 1*f - math_t one = 1; +/** + * @brief Update the f vector after a block solve step. + * + * \f[ f_i = f_i + \sum_{k\in WS} K_{i,k} * \Delta \alpha_k, \f] + * where i = [0..n_train-1], WS is the set of workspace indices, + * and \f$K_{i,k}\f$ is the kernel function evaluated for training vector x_i and workspace vector + * x_k. + * + * @param f size [n_train] + * @param n_rows + * @param delta_alpha size [n_ws] + * @param n_ws + * @param cacheTile kernel function evaluated for the following set K[X,x_ws], + * size [n_rows, n_ws] + */ +template +void SmoSolver::UpdateF( + math_t* f, int n_rows, const math_t* delta_alpha, int n_ws, const math_t* cacheTile) +{ + // multipliers used in the equation : f = 1*cachtile * delta_alpha + 1*f + math_t one = 1; + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(), + CUBLAS_OP_N, + n_rows, + n_ws, + &one, + cacheTile, + n_rows, + delta_alpha, + 1, + &one, + f, + 1, + stream)); + if (svmType == EPSILON_SVR) { + // SVR has doubled the number of training vectors and we need to update + // alpha for both batches individually // #TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(), CUBLAS_OP_N, @@ -295,285 +264,145 @@ class SmoSolver { delta_alpha, 1, &one, - f, + f + n_rows, 1, stream)); - if (svmType == EPSILON_SVR) { - // SVR has doubled the number of training vectors and we need to update - // alpha for both batches individually - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(), - CUBLAS_OP_N, - n_rows, - n_ws, - &one, - cacheTile, - n_rows, - delta_alpha, - 1, - &one, - f + n_rows, - 1, - stream)); - } } +} - /** @brief Initialize the problem to solve. - * - * Both SVC and SVR are solved as a classification problem. - * The optimization target (W) does not appear directly in the SMO - * formulation, only its derivative through f (optimality indicator vector): - * \f[ f_i = y_i \frac{\partial W }{\partial \alpha_i}. 
\f] - * - * The f_i values are initialized here, and updated at every solver iteration - * when alpha changes. The update step is the same for SVC and SVR, only the - * init step differs. - * - * Additionally, we zero init the dual coefficients (alpha), and initialize - * class labels for SVR. - * - * @param[inout] y on entry class labels or target values, - * on exit device pointer to class labels - * @param[in] sample_weight sample weights (can be nullptr, otherwise device - * array of size [n_rows]) - * @param[in] n_rows - * @param[in] n_cols - */ - void Initialize(math_t** y, const math_t* sample_weight, int n_rows, int n_cols) - { - this->n_rows = n_rows; - this->n_cols = n_cols; - n_train = (svmType == EPSILON_SVR) ? n_rows * 2 : n_rows; - ResizeBuffers(n_train, n_cols); - // Zero init alpha - RAFT_CUDA_TRY(cudaMemsetAsync(alpha.data(), 0, n_train * sizeof(math_t), stream)); - InitPenalty(C_vec.data(), sample_weight, n_rows); - // Init f (and also class labels for SVR) - switch (svmType) { - case C_SVC: SvcInit(*y); break; - case EPSILON_SVR: - SvrInit(*y, n_rows, y_label.data(), f.data()); - // We return the pointer to the class labels (the target values are - // not needed anymore, they are incorporated in f). - *y = y_label.data(); - break; - default: THROW("SMO initialization not implemented SvmType=%d", svmType); - } - } - - void InitPenalty(math_t* C_vec, const math_t* sample_weight, int n_rows) - { - if (sample_weight == nullptr) { - thrust::device_ptr c_ptr(C_vec); - thrust::fill(thrust::cuda::par.on(stream), c_ptr, c_ptr + n_train, C); - } else { - math_t C = this->C; - raft::linalg::unaryOp( - C_vec, sample_weight, n_rows, [C] __device__(math_t w) { return C * w; }, stream); - if (n_train > n_rows) { - // Set the same penalty parameter for the duplicate set of vectors - raft::linalg::unaryOp( - C_vec + n_rows, - sample_weight, - n_rows, - [C] __device__(math_t w) { return C * w; }, - stream); - } - } - } - /** @brief Initialize Support Vector Classification - * - * We would like to maximize the following quantity - * \f[ W(\mathbf{\alpha}) = -\mathbf{\alpha}^T \mathbf{1} - * + \frac{1}{2} \mathbf{\alpha}^T Q \mathbf{\alpha}, \f] - * - * We initialize f as: - * \f[ f_i = y_i \frac{\partial W(\mathbf{\alpha})}{\partial \alpha_i} = - * -y_i + y_j \alpha_j K(\mathbf{x}_i, \mathbf{x}_j) \f] - * - * @param [in] y device pointer of class labels size [n_rows] - */ - void SvcInit(const math_t* y) - { - raft::linalg::unaryOp( - f.data(), y, n_rows, [] __device__(math_t y) { return -y; }, stream); +/** @brief Initialize the problem to solve. + * + * Both SVC and SVR are solved as a classification problem. + * The optimization target (W) does not appear directly in the SMO + * formulation, only its derivative through f (optimality indicator vector): + * \f[ f_i = y_i \frac{\partial W }{\partial \alpha_i}. \f] + * + * The f_i values are initialized here, and updated at every solver iteration + * when alpha changes. The update step is the same for SVC and SVR, only the + * init step differs. + * + * Additionally, we zero init the dual coefficients (alpha), and initialize + * class labels for SVR. 
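+ * For C-SVC this amounts to \f$ f_i = -y_i \f$ (see SvcInit); for epsilon-SVR the
+ * 2*n_rows entries are initialized to \f$ \pm\epsilon - y_i \f$ (see SvrInit).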
+ * + * @param[inout] y on entry class labels or target values, + * on exit device pointer to class labels + * @param[in] sample_weight sample weights (can be nullptr, otherwise device + * array of size [n_rows]) + * @param[in] n_rows + * @param[in] n_cols + */ +template +void SmoSolver::Initialize(math_t** y, const math_t* sample_weight, int n_rows, int n_cols) +{ + this->n_rows = n_rows; + this->n_cols = n_cols; + n_train = (svmType == EPSILON_SVR) ? n_rows * 2 : n_rows; + ResizeBuffers(n_train, n_cols); + // Zero init alpha + RAFT_CUDA_TRY(cudaMemsetAsync(alpha.data(), 0, n_train * sizeof(math_t), stream)); + InitPenalty(C_vec.data(), sample_weight, n_rows); + // Init f (and also class labels for SVR) + switch (svmType) { + case C_SVC: SvcInit(*y); break; + case EPSILON_SVR: + SvrInit(*y, n_rows, y_label.data(), f.data()); + // We return the pointer to the class labels (the target values are + // not needed anymore, they are incorporated in f). + *y = y_label.data(); + break; + default: THROW("SMO initialization not implemented SvmType=%d", svmType); } +} - /** - * @brief Initializes the solver for epsilon-SVR. - * - * For regression we are optimizing the following quantity - * \f[ - * W(\alpha^+, \alpha^-) = - * \epsilon \sum_{i=1}^l (\alpha_i^+ + \alpha_i^-) - * - \sum_{i=1}^l yc_i (\alpha_i^+ - \alpha_i^-) - * + \frac{1}{2} \sum_{i,j=1}^l - * (\alpha_i^+ - \alpha_i^-)(\alpha_j^+ - \alpha_j^-) K(\bm{x}_i, \bm{x}_j) - * \f] - * - * Then \f$ f_i = y_i \frac{\partial W(\alpha}{\partial \alpha_i} \f$ - * \f$ = yc_i*epsilon - yr_i \f$ - * - * Additionally we set class labels for the training vectors. - * - * References: - * [1] B. Schölkopf et. al (1998): New support vector algorithms, - * NeuroCOLT2 Technical Report Series, NC2-TR-1998-031, Section 6 - * [2] A.J. Smola, B. Schölkopf (2004): A tutorial on support vector - * regression, Statistics and Computing 14, 199–222 - * [3] Orchel M. (2011) Support Vector Regression as a Classification Problem - * with a Priori Knowledge in the Form of Detractors, - * Man-Machine Interactions 2. Advances in Intelligent and Soft Computing, - * vol 103 - * - * @param [in] yr device pointer with values for regression, size [n_rows] - * @param [in] n_rows - * @param [out] yc device pointer to classes associated to the dual - * coefficients, size [n_rows*2] - * @param [out] f device pointer f size [n_rows*2] - */ - void SvrInit(const math_t* yr, int n_rows, math_t* yc, math_t* f) - { - // Init class labels to [1, 1, 1, ..., -1, -1, -1, ...] 
- thrust::device_ptr yc_ptr(yc); - thrust::constant_iterator one(1); - thrust::copy(thrust::cuda::par.on(stream), one, one + n_rows, yc_ptr); - thrust::constant_iterator minus_one(-1); - thrust::copy(thrust::cuda::par.on(stream), minus_one, minus_one + n_rows, yc_ptr + n_rows); - - // f_i = epsilon - y_i, for i \in [0..n_rows-1] - math_t epsilon = this->epsilon; - raft::linalg::unaryOp( - f, yr, n_rows, [epsilon] __device__(math_t y) { return epsilon - y; }, stream); - - // f_i = -epsilon - y_i, for i \in [n_rows..2*n_rows-1] +template +void SmoSolver::InitPenalty(math_t* C_vec, const math_t* sample_weight, int n_rows) +{ + if (sample_weight == nullptr) { + thrust::device_ptr c_ptr(C_vec); + thrust::fill(thrust::cuda::par.on(stream), c_ptr, c_ptr + n_train, C); + } else { + math_t C = this->C; raft::linalg::unaryOp( - f + n_rows, yr, n_rows, [epsilon] __device__(math_t y) { return -epsilon - y; }, stream); - } - - private: - const raft::handle_t& handle; - cudaStream_t stream; - - int n_rows = 0; //!< training data number of rows - int n_cols = 0; //!< training data number of columns - int n_ws = 0; //!< size of the working set - int n_train = 0; //!< number of training vectors (including duplicates for SVR) - - // Buffers for the domain [n_train] - rmm::device_uvector alpha; //!< dual coordinates - rmm::device_uvector f; //!< optimality indicator vector - rmm::device_uvector y_label; //!< extra label for regression - - rmm::device_uvector C_vec; //!< penalty parameter vector - - // Buffers for the working set [n_ws] - //! change in alpha parameter during a blocksolve step - rmm::device_uvector delta_alpha; - - // Buffers to return some parameters from the kernel (iteration number, and - // convergence information) - rmm::device_uvector return_buff; - math_t host_return_buff[2]; - - math_t C; - math_t tol; //!< tolerance for stopping condition - math_t epsilon; //!< epsilon parameter for epsiolon-SVR - - raft::distance::kernels::GramMatrixBase* kernel; - raft::distance::kernels::KernelType kernel_type; - float cache_size; //!< size of kernel cache in MiB - - SvmType svmType; ///!< Type of the SVM problem to solve - - // Variables to track convergence of training - math_t diff_prev; - int n_small_diff; - int nochange_steps; - int n_increased_diff; - int n_iter; - bool report_increased_diff; - - bool CheckStoppingCondition(math_t diff) - { - if (diff > diff_prev * 1.5 && n_iter > 0) { - // Ideally, diff should decrease monotonically. In practice we can have - // small fluctuations (10% increase is not uncommon). Here we consider a - // 50% increase in the diff value large enough to indicate a problem. - // The 50% value is an educated guess that triggers the convergence debug - // message for problematic use cases while avoids false alarms in many - // other cases. - n_increased_diff++; - } - if (report_increased_diff && n_iter > 100 && n_increased_diff > n_iter * 0.1) { - CUML_LOG_DEBUG( - "Solver is not converging monotonically. This might be caused by " - "insufficient normalization of the feature columns. In that case " - "MinMaxScaler((0,1)) could help. Alternatively, for nonlinear kernels, " - "you can try to increase the gamma parameter. 
To limit execution time, " - "you can also adjust the number of iterations using the max_iter " - "parameter."); - report_increased_diff = false; - } - bool keep_going = true; - if (abs(diff - diff_prev) < 0.001 * tol) { - n_small_diff++; - } else { - diff_prev = diff; - n_small_diff = 0; - } - if (n_small_diff > nochange_steps) { - CUML_LOG_ERROR( - "SMO error: Stopping due to unchanged diff over %d" - " consecutive steps", - nochange_steps); - keep_going = false; - } - if (diff < tol) keep_going = false; - if (isnan(diff)) { - std::string txt; - if (std::is_same::value) { - txt += - " This might be caused by floating point overflow. In such case using" - " fp64 could help. Alternatively, try gamma='scale' kernel" - " parameter."; - } - THROW("SMO error: NaN found during fitting.%s", txt.c_str()); - } - return keep_going; - } - - /// Return the number of maximum iterations. - int GetDefaultMaxIter(int n_train, int max_outer_iter) - { - if (max_outer_iter == -1) { - max_outer_iter = n_train < std::numeric_limits::max() / 100 - ? n_train * 100 - : std::numeric_limits::max(); - max_outer_iter = max(100000, max_outer_iter); + C_vec, sample_weight, n_rows, [C] __device__(math_t w) { return C * w; }, stream); + if (n_train > n_rows) { + // Set the same penalty parameter for the duplicate set of vectors + raft::linalg::unaryOp( + C_vec + n_rows, sample_weight, n_rows, [C] __device__(math_t w) { return C * w; }, stream); } - // else we have user defined iteration count which we do not change - return max_outer_iter; - } - - void ResizeBuffers(int n_train, int n_cols) - { - // This needs to know n_train, therefore it can be only called during solve - alpha.resize(n_train, stream); - C_vec.resize(n_train, stream); - f.resize(n_train, stream); - delta_alpha.resize(n_ws, stream); - if (svmType == EPSILON_SVR) y_label.resize(n_train, stream); } +} - void ReleaseBuffers() - { - alpha.release(); - delta_alpha.release(); - f.release(); - y_label.release(); - } -}; +/** @brief Initialize Support Vector Classification + * + * We would like to maximize the following quantity + * \f[ W(\mathbf{\alpha}) = -\mathbf{\alpha}^T \mathbf{1} + * + \frac{1}{2} \mathbf{\alpha}^T Q \mathbf{\alpha}, \f] + * + * We initialize f as: + * \f[ f_i = y_i \frac{\partial W(\mathbf{\alpha})}{\partial \alpha_i} = + * -y_i + y_j \alpha_j K(\mathbf{x}_i, \mathbf{x}_j) \f] + * + * @param [in] y device pointer of class labels size [n_rows] + */ +template +void SmoSolver::SvcInit(const math_t* y) +{ + raft::linalg::unaryOp( + f.data(), y, n_rows, [] __device__(math_t y) { return -y; }, stream); +} -}; // end namespace SVM -}; // end namespace ML +/** + * @brief Initializes the solver for epsilon-SVR. + * + * For regression we are optimizing the following quantity + * \f[ + * W(\alpha^+, \alpha^-) = + * \epsilon \sum_{i=1}^l (\alpha_i^+ + \alpha_i^-) + * - \sum_{i=1}^l yc_i (\alpha_i^+ - \alpha_i^-) + * + \frac{1}{2} \sum_{i,j=1}^l + * (\alpha_i^+ - \alpha_i^-)(\alpha_j^+ - \alpha_j^-) K(\bm{x}_i, \bm{x}_j) + * \f] + * + * Then \f$ f_i = y_i \frac{\partial W(\alpha}{\partial \alpha_i} \f$ + * \f$ = yc_i*epsilon - yr_i \f$ + * + * Additionally we set class labels for the training vectors. + * + * References: + * [1] B. Schölkopf et. al (1998): New support vector algorithms, + * NeuroCOLT2 Technical Report Series, NC2-TR-1998-031, Section 6 + * [2] A.J. Smola, B. Schölkopf (2004): A tutorial on support vector + * regression, Statistics and Computing 14, 199–222 + * [3] Orchel M. 
(2011) Support Vector Regression as a Classification Problem + * with a Priori Knowledge in the Form of Detractors, + * Man-Machine Interactions 2. Advances in Intelligent and Soft Computing, + * vol 103 + * + * @param [in] yr device pointer with values for regression, size [n_rows] + * @param [in] n_rows + * @param [out] yc device pointer to classes associated to the dual + * coefficients, size [n_rows*2] + * @param [out] f device pointer f size [n_rows*2] + */ +template +void SmoSolver::SvrInit(const math_t* yr, int n_rows, math_t* yc, math_t* f) +{ + // Init class labels to [1, 1, 1, ..., -1, -1, -1, ...] + thrust::device_ptr yc_ptr(yc); + thrust::constant_iterator one(1); + thrust::copy(thrust::cuda::par.on(stream), one, one + n_rows, yc_ptr); + thrust::constant_iterator minus_one(-1); + thrust::copy(thrust::cuda::par.on(stream), minus_one, minus_one + n_rows, yc_ptr + n_rows); + + // f_i = epsilon - y_i, for i \in [0..n_rows-1] + math_t epsilon = this->epsilon; + raft::linalg::unaryOp( + f, yr, n_rows, [epsilon] __device__(math_t y) { return epsilon - y; }, stream); + + // f_i = -epsilon - y_i, for i \in [n_rows..2*n_rows-1] + raft::linalg::unaryOp( + f + n_rows, yr, n_rows, [epsilon] __device__(math_t y) { return -epsilon - y; }, stream); +} + +} // namespace SVM +} // namespace ML diff --git a/cpp/src/svm/smosolver.h b/cpp/src/svm/smosolver.h new file mode 100644 index 0000000000..d2355d68a5 --- /dev/null +++ b/cpp/src/svm/smosolver.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ML { +namespace SVM { + +/** + * @brief Solve the quadratic optimization problem using two level decomposition + * and Sequential Minimal Optimization (SMO). + * + * The general decomposition idea by Osuna is to choose q examples from all the + * training examples, and solve the QP problem for this subset (discussed in + * section 11.2 by Joachims [1]). SMO is the extreme case where we choose q=2. + * + * Here we follow [2] and [3] and use two level decomposition. First we set + * q_1=1024, and solve the QP sub-problem for that (let's call it QP1). This is + * the outer iteration, implemented in SmoSolver::Solve. + * + * To solve QP1, we use another decomposition, specifically the SMO (q_2 = 2), + * which is implemented in SmoBlockSolve. + * + * References: + * - [1] Joachims, T. Making large-scale support vector machine learning + * practical. In B. Scholkopf, C. Burges, & A. Smola (Eds.), Advances in + * kernel methods: Support vector machines. Cambridge, MA: MIT Press (1998) + * - [2] J. Vanek et al. A GPU-Architecture Optimized Hierarchical Decomposition + * Algorithm for Support VectorMachine Training, IEEE Transactions on + * Parallel and Distributed Systems, vol 28, no 12, 3330, (2017) + * - [3] Z. Wen et al. 
ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal + * of Machine Learning Research, 19, 1-5 (2018) + */ +template +class SmoSolver { + public: + SmoSolver(const raft::handle_t& handle, + SvmParameter param, + raft::distance::kernels::KernelType kernel_type, + raft::distance::kernels::GramMatrixBase* kernel) + : handle(handle), + C(param.C), + tol(param.tol), + kernel(kernel), + kernel_type(kernel_type), + cache_size(param.cache_size), + nochange_steps(param.nochange_steps), + epsilon(param.epsilon), + svmType(param.svmType), + stream(handle.get_stream()), + return_buff(2, stream), + alpha(0, stream), + C_vec(0, stream), + delta_alpha(0, stream), + f(0, stream), + y_label(0, stream) + { + ML::Logger::get().setLevel(param.verbosity); + } + + void GetNonzeroDeltaAlpha(const math_t* vec, + int n_ws, + const int* idx, + math_t* nz_vec, + int* n_nz, + int* nz_idx, + cudaStream_t stream); + /** + * @brief Solve the quadratic optimization problem. + * + * The output arrays (dual_coefs, support_matrix, idx) will be allocated on the + * device, they should be unallocated on entry. + * + * @param [in] matrix training vectors in matrix format(MLCommon::Matrix::Matrix), + * size [n_rows x * n_cols] + * @param [in] n_rows number of rows (training vectors) + * @param [in] n_cols number of columns (features) + * @param [in] y labels (values +/-1), size [n_rows] + * @param [in] sample_weight device array of sample weights (or nullptr if not + * applicable) + * @param [out] dual_coefs size [n_support] on exit + * @param [out] n_support number of support vectors + * @param [out] support_matrix support vectors in matrix format, size [n_support, n_cols] + * @param [out] idx the original training set indices of the support vectors, size [n_support] + * @param [out] b scalar constant for the decision function + * @param [in] max_outer_iter maximum number of outer iteration (default 100 * n_rows) + * @param [in] max_inner_iter maximum number of inner iterations (default 10000) + */ + template + void Solve(MatrixViewType matrix, + int n_rows, + int n_cols, + math_t* y, + const math_t* sample_weight, + math_t** dual_coefs, + int* n_support, + SupportStorage* support_matrix, + int** idx, + math_t* b, + int max_outer_iter = -1, + int max_inner_iter = 10000); + + /** + * @brief Update the f vector after a block solve step. + * + * \f[ f_i = f_i + \sum_{k\in WS} K_{i,k} * \Delta \alpha_k, \f] + * where i = [0..n_train-1], WS is the set of workspace indices, + * and \f$K_{i,k}\f$ is the kernel function evaluated for training vector x_i and workspace vector + * x_k. + * + * @param f size [n_train] + * @param n_rows + * @param delta_alpha size [n_ws] + * @param n_ws + * @param cacheTile kernel function evaluated for the following set K[X,x_ws], + * size [n_rows, n_ws] + */ + void UpdateF(math_t* f, int n_rows, const math_t* delta_alpha, int n_ws, const math_t* cacheTile); + + /** @brief Initialize the problem to solve. + * + * Both SVC and SVR are solved as a classification problem. + * The optimization target (W) does not appear directly in the SMO + * formulation, only its derivative through f (optimality indicator vector): + * \f[ f_i = y_i \frac{\partial W }{\partial \alpha_i}. \f] + * + * The f_i values are initialized here, and updated at every solver iteration + * when alpha changes. The update step is the same for SVC and SVR, only the + * init step differs. + * + * Additionally, we zero init the dual coefficients (alpha), and initialize + * class labels for SVR. 
+ * + * @param[inout] y on entry class labels or target values, + * on exit device pointer to class labels + * @param[in] sample_weight sample weights (can be nullptr, otherwise device + * array of size [n_rows]) + * @param[in] n_rows + * @param[in] n_cols + */ + void Initialize(math_t** y, const math_t* sample_weight, int n_rows, int n_cols); + + void InitPenalty(math_t* C_vec, const math_t* sample_weight, int n_rows); + + /** @brief Initialize Support Vector Classification + * + * We would like to maximize the following quantity + * \f[ W(\mathbf{\alpha}) = -\mathbf{\alpha}^T \mathbf{1} + * + \frac{1}{2} \mathbf{\alpha}^T Q \mathbf{\alpha}, \f] + * + * We initialize f as: + * \f[ f_i = y_i \frac{\partial W(\mathbf{\alpha})}{\partial \alpha_i} = + * -y_i + y_j \alpha_j K(\mathbf{x}_i, \mathbf{x}_j) \f] + * + * @param [in] y device pointer of class labels size [n_rows] + */ + void SvcInit(const math_t* y); + + /** + * @brief Initializes the solver for epsilon-SVR. + * + * For regression we are optimizing the following quantity + * \f[ + * W(\alpha^+, \alpha^-) = + * \epsilon \sum_{i=1}^l (\alpha_i^+ + \alpha_i^-) + * - \sum_{i=1}^l yc_i (\alpha_i^+ - \alpha_i^-) + * + \frac{1}{2} \sum_{i,j=1}^l + * (\alpha_i^+ - \alpha_i^-)(\alpha_j^+ - \alpha_j^-) K(\bm{x}_i, \bm{x}_j) + * \f] + * + * Then \f$ f_i = y_i \frac{\partial W(\alpha}{\partial \alpha_i} \f$ + * \f$ = yc_i*epsilon - yr_i \f$ + * + * Additionally we set class labels for the training vectors. + * + * References: + * [1] B. Schölkopf et. al (1998): New support vector algorithms, + * NeuroCOLT2 Technical Report Series, NC2-TR-1998-031, Section 6 + * [2] A.J. Smola, B. Schölkopf (2004): A tutorial on support vector + * regression, Statistics and Computing 14, 199–222 + * [3] Orchel M. (2011) Support Vector Regression as a Classification Problem + * with a Priori Knowledge in the Form of Detractors, + * Man-Machine Interactions 2. Advances in Intelligent and Soft Computing, + * vol 103 + * + * @param [in] yr device pointer with values for regression, size [n_rows] + * @param [in] n_rows + * @param [out] yc device pointer to classes associated to the dual + * coefficients, size [n_rows*2] + * @param [out] f device pointer f size [n_rows*2] + */ + void SvrInit(const math_t* yr, int n_rows, math_t* yc, math_t* f); + + private: + const raft::handle_t& handle; + cudaStream_t stream; + + int n_rows = 0; //!< training data number of rows + int n_cols = 0; //!< training data number of columns + int n_ws = 0; //!< size of the working set + int n_train = 0; //!< number of training vectors (including duplicates for SVR) + + // Buffers for the domain [n_train] + rmm::device_uvector alpha; //!< dual coordinates + rmm::device_uvector f; //!< optimality indicator vector + rmm::device_uvector y_label; //!< extra label for regression + + rmm::device_uvector C_vec; //!< penalty parameter vector + + // Buffers for the working set [n_ws] + //! 
change in alpha parameter during a blocksolve step + rmm::device_uvector delta_alpha; + + // Buffers to return some parameters from the kernel (iteration number, and + // convergence information) + rmm::device_uvector return_buff; + math_t host_return_buff[2]; + + math_t C; + math_t tol; //!< tolerance for stopping condition + math_t epsilon; //!< epsilon parameter for epsiolon-SVR + + raft::distance::kernels::GramMatrixBase* kernel; + raft::distance::kernels::KernelType kernel_type; + float cache_size; //!< size of kernel cache in MiB + + SvmType svmType; ///!< Type of the SVM problem to solve + + // Variables to track convergence of training + math_t diff_prev; + int n_small_diff; + int nochange_steps; + int n_increased_diff; + int n_iter; + bool report_increased_diff; + + bool CheckStoppingCondition(math_t diff) + { + if (diff > diff_prev * 1.5 && n_iter > 0) { + // Ideally, diff should decrease monotonically. In practice we can have + // small fluctuations (10% increase is not uncommon). Here we consider a + // 50% increase in the diff value large enough to indicate a problem. + // The 50% value is an educated guess that triggers the convergence debug + // message for problematic use cases while avoids false alarms in many + // other cases. + n_increased_diff++; + } + if (report_increased_diff && n_iter > 100 && n_increased_diff > n_iter * 0.1) { + CUML_LOG_DEBUG( + "Solver is not converging monotonically. This might be caused by " + "insufficient normalization of the feature columns. In that case " + "MinMaxScaler((0,1)) could help. Alternatively, for nonlinear kernels, " + "you can try to increase the gamma parameter. To limit execution time, " + "you can also adjust the number of iterations using the max_iter " + "parameter."); + report_increased_diff = false; + } + bool keep_going = true; + if (abs(diff - diff_prev) < 0.001 * tol) { + n_small_diff++; + } else { + diff_prev = diff; + n_small_diff = 0; + } + if (n_small_diff > nochange_steps) { + CUML_LOG_ERROR( + "SMO error: Stopping due to unchanged diff over %d" + " consecutive steps", + nochange_steps); + keep_going = false; + } + if (diff < tol) keep_going = false; + if (isnan(diff)) { + std::string txt; + if (std::is_same::value) { + txt += + " This might be caused by floating point overflow. In such case using" + " fp64 could help. Alternatively, try gamma='scale' kernel" + " parameter."; + } + THROW("SMO error: NaN found during fitting.%s", txt.c_str()); + } + return keep_going; + } + + /// Return the number of maximum iterations. + int GetDefaultMaxIter(int n_train, int max_outer_iter) + { + if (max_outer_iter == -1) { + max_outer_iter = n_train < std::numeric_limits::max() / 100 + ? 
n_train * 100 + : std::numeric_limits::max(); + max_outer_iter = max(100000, max_outer_iter); + } + // else we have user defined iteration count which we do not change + return max_outer_iter; + } + + void ResizeBuffers(int n_train, int n_cols) + { + // This needs to know n_train, therefore it can be only called during solve + alpha.resize(n_train, stream); + C_vec.resize(n_train, stream); + f.resize(n_train, stream); + delta_alpha.resize(n_ws, stream); + if (svmType == EPSILON_SVR) y_label.resize(n_train, stream); + } + + void ReleaseBuffers() + { + alpha.release(); + delta_alpha.release(); + f.release(); + y_label.release(); + } +}; + +}; // end namespace SVM +}; // end namespace ML diff --git a/cpp/src/svm/sparse_util.cuh b/cpp/src/svm/sparse_util.cuh index c0fc22628a..c4d0b277e9 100644 --- a/cpp/src/svm/sparse_util.cuh +++ b/cpp/src/svm/sparse_util.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include #include @@ -333,12 +335,12 @@ raft::device_csr_matrix_view getMatrixBatch( } template -static __global__ void extractDenseRowsFromCSR(math_t* out, - const int* indptr, - const int* indices, - const math_t* data, - const int* row_indices, - const int num_indices) +CUML_KERNEL void extractDenseRowsFromCSR(math_t* out, + const int* indptr, + const int* indices, + const math_t* data, + const int* row_indices, + const int num_indices) { assert(gridDim.y == 1 && gridDim.z == 1); // all threads in x-direction are responsible for one line of csr @@ -356,14 +358,14 @@ static __global__ void extractDenseRowsFromCSR(math_t* out, } template -static __global__ void extractCSRRowsFromCSR(int* indptr_out, // already holds end positions - int* indices_out, - math_t* data_out, - const int* indptr_in, - const int* indices_in, - const math_t* data_in, - const int* row_indices, - const int num_indices) +CUML_KERNEL void extractCSRRowsFromCSR(int* indptr_out, // already holds end positions + int* indices_out, + math_t* data_out, + const int* indptr_in, + const int* indices_in, + const math_t* data_in, + const int* row_indices, + const int num_indices) { assert(gridDim.y == 1 && gridDim.z == 1); // all threads in x-direction are responsible for one line of csr diff --git a/cpp/src/svm/svc_impl.cuh b/cpp/src/svm/svc_impl.cuh index 7697d86644..3bd27dc6e4 100644 --- a/cpp/src/svm/svc_impl.cuh +++ b/cpp/src/svm/svc_impl.cuh @@ -32,8 +32,10 @@ #include #include +#include #include #include +#include #include #include @@ -70,8 +72,9 @@ void svcFitX(const raft::handle_t& handle, { rmm::device_uvector unique_labels(0, stream); model.n_classes = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream); - rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource(); - model.unique_labels = (math_t*)rmm_alloc->allocate(model.n_classes * sizeof(math_t), stream); + rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource(); + model.unique_labels = (math_t*)rmm_alloc.allocate_async( + model.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); raft::copy(model.unique_labels, unique_labels.data(), model.n_classes, stream); handle_impl.sync_stream(stream); } @@ -352,27 +355,45 @@ void svcPredictSparse(const raft::handle_t& handle, template void svmFreeBuffers(const raft::handle_t& handle, SvmModel& m) { - cudaStream_t stream = handle.get_stream(); - rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource(); - if (m.dual_coefs) rmm_alloc->deallocate(m.dual_coefs, m.n_support * sizeof(math_t), stream); - if 
(m.support_idx) rmm_alloc->deallocate(m.support_idx, m.n_support * sizeof(int), stream); + cudaStream_t stream = handle.get_stream(); + rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource(); + if (m.dual_coefs) + rmm_alloc.deallocate_async( + m.dual_coefs, m.n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); + if (m.support_idx) + rmm_alloc.deallocate_async( + m.support_idx, m.n_support * sizeof(int), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); if (m.support_matrix.indptr) { - rmm_alloc->deallocate(m.support_matrix.indptr, (m.n_support + 1) * sizeof(int), stream); + rmm_alloc.deallocate_async(m.support_matrix.indptr, + (m.n_support + 1) * sizeof(int), + rmm::CUDA_ALLOCATION_ALIGNMENT, + stream); m.support_matrix.indptr = nullptr; } if (m.support_matrix.indices) { - rmm_alloc->deallocate(m.support_matrix.indices, m.support_matrix.nnz * sizeof(int), stream); + rmm_alloc.deallocate_async(m.support_matrix.indices, + m.support_matrix.nnz * sizeof(int), + rmm::CUDA_ALLOCATION_ALIGNMENT, + stream); m.support_matrix.indices = nullptr; } if (m.support_matrix.data) { if (m.support_matrix.nnz == -1) { - rmm_alloc->deallocate(m.support_matrix.data, m.n_support * m.n_cols * sizeof(math_t), stream); + rmm_alloc.deallocate_async(m.support_matrix.data, + m.n_support * m.n_cols * sizeof(math_t), + rmm::CUDA_ALLOCATION_ALIGNMENT, + stream); } else { - rmm_alloc->deallocate(m.support_matrix.data, m.support_matrix.nnz * sizeof(math_t), stream); + rmm_alloc.deallocate_async(m.support_matrix.data, + m.support_matrix.nnz * sizeof(math_t), + rmm::CUDA_ALLOCATION_ALIGNMENT, + stream); } } m.support_matrix.nnz = -1; - if (m.unique_labels) rmm_alloc->deallocate(m.unique_labels, m.n_classes * sizeof(math_t), stream); + if (m.unique_labels) + rmm_alloc.deallocate_async( + m.unique_labels, m.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream); m.dual_coefs = nullptr; m.support_idx = nullptr; m.unique_labels = nullptr; diff --git a/cpp/src/svm/workingset.cuh b/cpp/src/svm/workingset.cuh index d52b268720..318ee5e14d 100644 --- a/cpp/src/svm/workingset.cuh +++ b/cpp/src/svm/workingset.cuh @@ -17,6 +17,7 @@ #pragma once #include "smo_sets.cuh" +#include "workingset.h" #include "ws_util.cuh" #include @@ -60,440 +61,265 @@ __device__ bool always_true(int) { return true; } * target label and the decision function value. */ template -class WorkingSet { - public: - //!> Workspace selection strategy, note that only FIFO is tested so far - bool FIFO_strategy = true; - - /** - * @brief Manage a working set. - * - * @param handle cuml handle implementation - * @param stream cuda stream for working set operations - * @param n_rows number of training vectors - * @param n_ws number of elements in the working set (default 1024) - * @param svmType classification or regression - */ - WorkingSet(const raft::handle_t& handle, - cudaStream_t stream, - int n_rows = 0, - int n_ws = 0, - SvmType svmType = C_SVC) - : handle(handle), - stream(stream), - svmType(svmType), - n_rows(n_rows), - available(0, stream), - available_sorted(0, stream), - cub_storage(0, stream), - f_idx(0, stream), - f_idx_sorted(0, stream), - f_sorted(0, stream), - idx_tmp(0, stream), - idx(0, stream), - ws_idx_sorted(0, stream), - ws_idx_selected(0, stream), - ws_idx_save(0, stream), - ws_priority(0, stream), - ws_priority_sorted(0, stream), - d_num_selected(stream) - { - n_train = (svmType == EPSILON_SVR) ? 
n_rows * 2 : n_rows; - SetSize(n_train, n_ws); +inline void WorkingSet::SimpleSelect( + math_t* f, math_t* alpha, math_t* y, const math_t* C, int n_already_selected) +{ + // We are not using the topK kernel, because of the additional lower/upper + // constraint + int n_needed = n_ws - n_already_selected; + + // Zero the priority of the elements that will be newly selected + RAFT_CUDA_TRY( + cudaMemsetAsync(ws_priority.data() + n_already_selected, 0, n_needed * sizeof(int), stream)); + + cub::DeviceRadixSort::SortPairs((void*)cub_storage.data(), + cub_bytes, + f, + f_sorted.data(), + f_idx.data(), + f_idx_sorted.data(), + n_train, + 0, + (int)8 * sizeof(math_t), + stream); + + if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { + std::stringstream ss; + raft::print_device_vector("idx_sorted", f_idx_sorted.data(), n_train, ss); + CUML_LOG_DEBUG(ss.str().c_str()); } - - ~WorkingSet() {} - - /** - * @brief Set the size of the working set and allocate buffers accordingly. - * - * @param n_train number of training vectors - * @param n_ws working set size (default min(1024, n_train)) - */ - void SetSize(int n_train, int n_ws = 0) - { - if (n_ws == 0 || n_ws > n_train) { n_ws = n_train; } - n_ws = std::min(1024, n_ws); - this->n_ws = n_ws; - CUML_LOG_DEBUG("Creating working set with %d elements", n_ws); - AllocateBuffers(); + // Select n_ws/2 elements from the upper set with the smallest f value + bool* available = this->available.data(); + set_upper<<>>(available, n_train, alpha, y, C); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + n_already_selected += GatherAvailable(n_already_selected, n_needed / 2, true); + + // Select n_ws/2 elements from the lower set with the highest f values + set_lower<<>>(available, n_train, alpha, y, C); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + n_already_selected += GatherAvailable(n_already_selected, n_ws - n_already_selected, false); + + // In case we could not find enough elements, then we just fill using the + // still available elements. + if (n_already_selected < n_ws) { + CUML_LOG_WARN( + "Warning: could not fill working set, found only %d" + " elements", + n_already_selected); + CUML_LOG_DEBUG("Filling up with unused elements"); + RAFT_CUDA_TRY(cudaMemset(available, 1, sizeof(bool) * n_train)); + n_already_selected += GatherAvailable(n_already_selected, n_ws - n_already_selected, true); } +} - /** Return the size of the working set. */ - int GetSize() { return n_ws; } - - /** - * @brief Return a device pointer to the the working set indices. - * - * The returned array is owned by WorkingSet. - */ - int* GetIndices() { return idx.data(); } - - /** - * @brief Select new elements for a working set. - * - * Here we follow the working set selection strategy by Joachims [1], we - * select n training instances as: - * - select n/2 element of upper set, where f is largest - * - select n/2 from lower set, where f is smallest - * - * The difference compared to Joachims' strategy is that we can already have - * some elements selected by a different strategy, therefore we select only - * n = n_ws - n_already_selected. - * - * References: - * [1] Joachims, T. (1998). Making large-scale support vector machine learning - * practical. In B. Scholkopf, C. Burges, & A. Smola (Eds.), Advances in - * kernel methods: Support vector machines. 
Cambridge, MA: MIT Press - * - * @param f optimality indicator vector, size [n_train] - * @param alpha dual coefficients, size [n_train] - * @param y target labels (+/- 1) - * @param C penalty parameter vector size [n_train] - * @param n_already_selected - */ - - void SimpleSelect( - math_t* f, math_t* alpha, math_t* y, const math_t* C, int n_already_selected = 0) - { - // We are not using the topK kernel, because of the additional lower/upper - // constraint - int n_needed = n_ws - n_already_selected; - - // Zero the priority of the elements that will be newly selected - RAFT_CUDA_TRY( - cudaMemsetAsync(ws_priority.data() + n_already_selected, 0, n_needed * sizeof(int), stream)); +/** + * @brief Select elements from the previous working set based on their priority. + * + * We sort the old working set based on their priority in ascending order, + * and then select nc elements from free, and then lower/upper bound vectors. + * For details see [2]. + * + * See Issue #946. + * + * References: + * [2] T Serafini, L Zanni: On the Working Set selection in grad. projection + * based decomposition techniques for Support Vector Machines + * DOI: 10.1080/10556780500140714 + * + * @param [in] alpha device vector of dual coefficients, size [n_train] + * @param [in] C_vec penalty parameter + * @param [in] nc number of elements to select + */ +template +int WorkingSet::PrioritySelect(math_t* alpha, const math_t* C, int nc) +{ + int n_selected = 0; + + cub::DeviceRadixSort::SortPairs((void*)cub_storage.data(), + cub_bytes, + ws_priority.data(), + ws_priority_sorted.data(), + idx.data(), + ws_idx_sorted.data(), + n_ws, + 0, + sizeof(int) * 8, + stream); + + // Select first from free vectors (0= C[idx]; }); + // we have now idx[0:n_selected] indices from the old working set + // we need to update their priority. 
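+  // update_priority (defined in ws_util.cuh) increments the stored priority of each
+  // element kept from the previous working set, so that selection keeps preferring
+  // the more recently added vectors.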
+ update_priority<<>>(ws_priority.data(), + n_selected, + idx.data(), + n_ws, + ws_idx_sorted.data(), + ws_priority_sorted.data()); + return n_selected; +} - cub::DeviceRadixSort::SortPairs((void*)cub_storage.data(), +template +inline void WorkingSet::AllocateBuffers() +{ + if (n_ws > 0) { + f_idx.resize(n_train, stream); + f_idx_sorted.resize(n_train, stream); + idx_tmp.resize(n_train, stream); + f_sorted.resize(n_train, stream); + available.resize(n_train, stream); + available_sorted.resize(n_train, stream); + + idx.resize(n_ws, stream); // allocate(idx, n_ws, stream); + ws_idx_sorted.resize(n_ws, stream); + ws_idx_save.resize(n_ws, stream); + ws_idx_selected.resize(n_ws, stream); + ws_priority.resize(n_ws, stream); + ws_priority_sorted.resize(n_ws, stream); + + // Determine temporary device storage requirements for cub + std::size_t cub_bytes2 = 0; + cub::DeviceRadixSort::SortPairs(NULL, cub_bytes, - f, + f_sorted.data(), f_sorted.data(), f_idx.data(), f_idx_sorted.data(), n_train, 0, - (int)8 * sizeof(math_t), + 8 * sizeof(math_t), stream); - - if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { - std::stringstream ss; - raft::print_device_vector("idx_sorted", f_idx_sorted.data(), n_train, ss); - CUML_LOG_DEBUG(ss.str().c_str()); - } - // Select n_ws/2 elements from the upper set with the smallest f value - bool* available = this->available.data(); - set_upper<<>>(available, n_train, alpha, y, C); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - n_already_selected += GatherAvailable(n_already_selected, n_needed / 2, true); - - // Select n_ws/2 elements from the lower set with the highest f values - set_lower<<>>(available, n_train, alpha, y, C); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - n_already_selected += GatherAvailable(n_already_selected, n_ws - n_already_selected, false); - - // In case we could not find enough elements, then we just fill using the - // still available elements. - if (n_already_selected < n_ws) { - CUML_LOG_WARN( - "Warning: could not fill working set, found only %d" - " elements", - n_already_selected); - CUML_LOG_DEBUG("Filling up with unused elements"); - RAFT_CUDA_TRY(cudaMemset(available, 1, sizeof(bool) * n_train)); - n_already_selected += GatherAvailable(n_already_selected, n_ws - n_already_selected, true); - } - } - - /** - * @brief Select working set indices. - * - * To avoid training vectors oscillating in and out of the working set, we - * keep half of the previous working set, and fill new elements only to the - * other half. - * - * We can have a FIFO retention policy, or we can - * consider the time (=ws_priority) a vector already spent in the ws. - * References: - * [1] Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal - * of Machine Learning Research, 19, 1-5 (2018) - * - * @param f optimality indicator vector, size [n_train] - * @param alpha dual coefficients, size [n_train] - * @param y class labels, size [n_train] - * @param C penalty parameter vector, size [n_train] - */ - void Select(math_t* f, math_t* alpha, math_t* y, const math_t* C) - { - if (n_ws >= n_train) { - // All elements are selected, we have initialized idx to cover this case - return; - } - int nc = n_ws / 4; - int n_selected = 0; - if (firstcall) { - if (nc >= 1) { - firstcall = false; - } else { - // This can only happen for n_ws < 4. 
- // We keep the calculation always in firstcall mode (only SimpleSelect - // is used, no advanced strategies because we do not have enough elements) - // - // Nothing to do, firstcall is already true - } - } else { - // keep 1/2 of the old working set - if (FIFO_strategy) { - // FIFO selection following ThunderSVM - raft::copy(idx.data(), ws_idx_save.data() + 2 * nc, 2 * nc, stream); - n_selected = nc * 2; - } else { - // priority based selection preferring to keep newer elements in ws - n_selected = PrioritySelect(alpha, C, nc); - } - } - SimpleSelect(f, alpha, y, C, n_selected); - raft::copy(ws_idx_save.data(), idx.data(), n_ws, stream); + cub::DeviceSelect::If(NULL, + cub_bytes2, + f_idx.data(), + f_idx.data(), + d_num_selected.data(), + n_train, + always_true, + stream); + cub_bytes = std::max(cub_bytes, cub_bytes2); + cub_storage.resize(cub_bytes, stream); + Initialize(); } +} - /** - * @brief Select elements from the previous working set based on their priority. - * - * We sort the old working set based on their priority in ascending order, - * and then select nc elements from free, and then lower/upper bound vectors. - * For details see [2]. - * - * See Issue #946. - * - * References: - * [2] T Serafini, L Zanni: On the Working Set selection in grad. projection - * based decomposition techniques for Support Vector Machines - * DOI: 10.1080/10556780500140714 - * - * @param [in] alpha device vector of dual coefficients, size [n_train] - * @param [in] C_vec penalty parameter - * @param [in] nc number of elements to select - */ - int PrioritySelect(math_t* alpha, const math_t* C, int nc) - { - int n_selected = 0; - - cub::DeviceRadixSort::SortPairs((void*)cub_storage.data(), - cub_bytes, - ws_priority.data(), - ws_priority_sorted.data(), - idx.data(), - ws_idx_sorted.data(), - n_ws, - 0, - sizeof(int) * 8, - stream); - - // Select first from free vectors (0= C[idx]; }); - // we have now idx[0:n_selected] indices from the old working set - // we need to update their priority. - update_priority<<>>(ws_priority.data(), - n_selected, - idx.data(), - n_ws, - ws_idx_sorted.data(), - ws_priority_sorted.data()); - return n_selected; +/** + * @brief Gather available elements from the working set. + * + * We select the first (last) n_needed element from the front (end) of + * f_idx_sorted. We ignore the elements that are already selected, and those + * where this->available is false. 
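+ * The mask is first gathered into the sorted order with thrust::make_permutation_iterator,
+ * and the surviving indices are then compacted with cub::DeviceSelect::Flagged.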
+ * + * @param n_already_selected number of element already selected (their indices + * are stored in idx[0:n_already_selected]) + * @param n_needed number of elements to be selected + * @param copy_front if true, then copy the elements from the front of the + * selected list, otherwise copy from the end of the list + * @return the number of elements copied (which might be less than n_needed) + */ +template +inline int WorkingSet::GatherAvailable(int n_already_selected, + int n_needed, + bool copy_front) +{ + // First we update the mask to ignores already selected elements + bool* available = this->available.data(); + if (n_already_selected > 0) { + set_unavailable<<>>( + available, n_train, idx.data(), n_already_selected); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } - - private: - const raft::handle_t& handle; - cudaStream_t stream; - - bool firstcall = true; - int n_train = 0; ///< number of training vectors (including duplicates for SVR) - int n_rows = 0; ///< number of original training vectors (no duplicates) - int n_ws = 0; - - SvmType svmType; - - int TPB = 256; //!< Threads per block for workspace selection kernels - - // Buffers for the domain size [n_train] - rmm::device_uvector f_idx; //!< Arrays used for sorting for sorting - rmm::device_uvector f_idx_sorted; - //! Temporary buffer for index manipulation - rmm::device_uvector idx_tmp; - rmm::device_uvector f_sorted; - //! Flag vectors available for selection - rmm::device_uvector available; - rmm::device_uvector available_sorted; - - // working set buffers size [n_ws] - rmm::device_uvector idx; //!< Indices of the worknig set - rmm::device_uvector ws_idx_sorted; - rmm::device_uvector ws_idx_selected; - rmm::device_uvector ws_idx_save; - - rmm::device_uvector ws_priority; - rmm::device_uvector ws_priority_sorted; - - rmm::device_scalar d_num_selected; - std::size_t cub_bytes = 0; - rmm::device_uvector cub_storage; - - void AllocateBuffers() - { - if (n_ws > 0) { - f_idx.resize(n_train, stream); - f_idx_sorted.resize(n_train, stream); - idx_tmp.resize(n_train, stream); - f_sorted.resize(n_train, stream); - available.resize(n_train, stream); - available_sorted.resize(n_train, stream); - - idx.resize(n_ws, stream); // allocate(idx, n_ws, stream); - ws_idx_sorted.resize(n_ws, stream); - ws_idx_save.resize(n_ws, stream); - ws_idx_selected.resize(n_ws, stream); - ws_priority.resize(n_ws, stream); - ws_priority_sorted.resize(n_ws, stream); - - // Determine temporary device storage requirements for cub - std::size_t cub_bytes2 = 0; - cub::DeviceRadixSort::SortPairs(NULL, - cub_bytes, - f_sorted.data(), - f_sorted.data(), - f_idx.data(), - f_idx_sorted.data(), - n_train, - 0, - 8 * sizeof(math_t), - stream); - cub::DeviceSelect::If(NULL, - cub_bytes2, - f_idx.data(), - f_idx.data(), - d_num_selected.data(), - n_train, - always_true, - stream); - cub_bytes = std::max(cub_bytes, cub_bytes2); - cub_storage.resize(cub_bytes, stream); - Initialize(); - } + if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { + std::stringstream ss; + raft::print_device_vector("avail", available, n_train, ss); + CUML_LOG_DEBUG(ss.str().c_str()); } - /** - * @brief Gather available elements from the working set. - * - * We select the first (last) n_needed element from the front (end) of - * f_idx_sorted. We ignore the elements that are already selected, and those - * where this->available is false. 
- * - * @param n_already_selected number of element already selected (their indices - * are stored in idx[0:n_already_selected]) - * @param n_needed number of elements to be selected - * @param copy_front if true, then copy the elements from the front of the - * selected list, otherwise copy from the end of the list - * @return the number of elements copied (which might be less than n_needed) - */ - int GatherAvailable(int n_already_selected, int n_needed, bool copy_front) - { - // First we update the mask to ignores already selected elements - bool* available = this->available.data(); - if (n_already_selected > 0) { - set_unavailable<<>>( - available, n_train, idx.data(), n_already_selected); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { - std::stringstream ss; - raft::print_device_vector("avail", available, n_train, ss); - CUML_LOG_DEBUG(ss.str().c_str()); - } - - // Map the mask to the sorted indices - thrust::device_ptr av_ptr(available); - thrust::device_ptr av_sorted_ptr(available_sorted.data()); - thrust::device_ptr idx_ptr(f_idx_sorted.data()); - thrust::copy(thrust::cuda::par.on(stream), - thrust::make_permutation_iterator(av_ptr, idx_ptr), - thrust::make_permutation_iterator(av_ptr, idx_ptr + n_train), - av_sorted_ptr); - if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { - std::stringstream ss; - raft::print_device_vector("avail_sorted", available_sorted.data(), n_train, ss); - CUML_LOG_DEBUG(ss.str().c_str()); - } - - // Select the available elements - cub::DeviceSelect::Flagged((void*)cub_storage.data(), - cub_bytes, - f_idx_sorted.data(), - available_sorted.data(), - idx_tmp.data(), - d_num_selected.data(), - n_train, - stream); - int n_selected = d_num_selected.value(stream); - handle.sync_stream(stream); - - // Copy to output - int n_copy = n_selected > n_needed ? n_needed : n_selected; - if (copy_front) { - raft::copy(idx.data() + n_already_selected, idx_tmp.data(), n_copy, stream); - } else { - raft::copy( - idx.data() + n_already_selected, idx_tmp.data() + n_selected - n_copy, n_copy, stream); - } - if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { - std::stringstream ss; - raft::print_device_vector("selected", idx.data(), n_already_selected + n_copy, ss); - CUML_LOG_DEBUG(ss.str().c_str()); - } - return n_copy; + // Map the mask to the sorted indices + thrust::device_ptr av_ptr(available); + thrust::device_ptr av_sorted_ptr(available_sorted.data()); + thrust::device_ptr idx_ptr(f_idx_sorted.data()); + thrust::copy(thrust::cuda::par.on(stream), + thrust::make_permutation_iterator(av_ptr, idx_ptr), + thrust::make_permutation_iterator(av_ptr, idx_ptr + n_train), + av_sorted_ptr); + if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { + std::stringstream ss; + raft::print_device_vector("avail_sorted", available_sorted.data(), n_train, ss); + CUML_LOG_DEBUG(ss.str().c_str()); } - void Initialize() - { - raft::linalg::range(f_idx.data(), n_train, stream); - raft::linalg::range(idx.data(), n_ws, stream); + // Select the available elements + cub::DeviceSelect::Flagged((void*)cub_storage.data(), + cub_bytes, + f_idx_sorted.data(), + available_sorted.data(), + idx_tmp.data(), + d_num_selected.data(), + n_train, + stream); + int n_selected = d_num_selected.value(stream); + handle.sync_stream(stream); + + // Copy to output + int n_copy = n_selected > n_needed ? 
n_needed : n_selected; + if (copy_front) { + raft::copy(idx.data() + n_already_selected, idx_tmp.data(), n_copy, stream); + } else { + raft::copy( + idx.data() + n_already_selected, idx_tmp.data() + n_selected - n_copy, n_copy, stream); } - - /** - * @brief Select the first n_needed elements from ws_idx_sorted where op is true. - * - * The selected elements are appended to this->idx. - * - * @param n_needed number of elements that should be selected - * @param n_already_selected number of already selected elements - * @param op selection condition - * @return the number of elements selected - */ - template - int SelectPrevWs(int n_needed, int n_already_selected, select_op op) - { - n_needed -= n_already_selected; - if (n_needed <= 0) { return 0; } - cub::DeviceSelect::If(cub_storage.data(), - cub_bytes, - ws_idx_sorted.data(), - ws_idx_selected.data(), - d_num_selected.data(), - n_ws, - op, - stream); - int n_selected = d_num_selected.value(stream); - handle.sync_stream(stream); - int n_copy = n_selected < n_needed ? n_selected : n_needed; - raft::copy(idx.data() + n_already_selected, ws_idx_selected.data(), n_copy, stream); - return n_copy; + if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG) && n_train < 20) { + std::stringstream ss; + raft::print_device_vector("selected", idx.data(), n_already_selected + n_copy, ss); + CUML_LOG_DEBUG(ss.str().c_str()); } -}; + return n_copy; +} +template +inline void WorkingSet::Initialize() +{ + raft::linalg::range(f_idx.data(), n_train, stream); + raft::linalg::range(idx.data(), n_ws, stream); +} -}; // end namespace SVM -}; // end namespace ML +/** + * @brief Select the first n_needed elements from ws_idx_sorted where op is true. + * + * The selected elements are appended to this->idx. + * + * @param n_needed number of elements that should be selected + * @param n_already_selected number of already selected elements + * @param op selection condition + * @return the number of elements selected + */ +template +template +inline int WorkingSet::SelectPrevWs(int n_needed, int n_already_selected, select_op op) +{ + n_needed -= n_already_selected; + if (n_needed <= 0) { return 0; } + cub::DeviceSelect::If(cub_storage.data(), + cub_bytes, + ws_idx_sorted.data(), + ws_idx_selected.data(), + d_num_selected.data(), + n_ws, + op, + stream); + int n_selected = d_num_selected.value(stream); + handle.sync_stream(stream); + int n_copy = n_selected < n_needed ? n_selected : n_needed; + raft::copy(idx.data() + n_already_selected, ws_idx_selected.data(), n_copy, stream); + return n_copy; +} + +} // end namespace SVM +} // end namespace ML diff --git a/cpp/src/svm/workingset.h b/cpp/src/svm/workingset.h new file mode 100644 index 0000000000..6335f7b5c6 --- /dev/null +++ b/cpp/src/svm/workingset.h @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include +#include +#include + +namespace ML { +namespace SVM { + +/** + * Working set selection for the SMO algorithm. + * + * The working set is a subset of the training vectors, by default it has 1024 elements. + * At every outer iteration in SmoSolver::Solve, we select a different working set, and + * optimize the dual coefficients for the working set. + * + * The vectors are selected based on the f values, which is the difference between the + * target label and the decision function value. + */ +template +class WorkingSet { + public: + //!> Workspace selection strategy, note that only FIFO is tested so far + bool FIFO_strategy = true; + + /** + * @brief Manage a working set. + * + * @param handle cuml handle implementation + * @param stream cuda stream for working set operations + * @param n_rows number of training vectors + * @param n_ws number of elements in the working set (default 1024) + * @param svmType classification or regression + */ + WorkingSet(const raft::handle_t& handle, + cudaStream_t stream, + int n_rows = 0, + int n_ws = 0, + SvmType svmType = C_SVC) + : handle(handle), + stream(stream), + svmType(svmType), + n_rows(n_rows), + available(0, stream), + available_sorted(0, stream), + cub_storage(0, stream), + f_idx(0, stream), + f_idx_sorted(0, stream), + f_sorted(0, stream), + idx_tmp(0, stream), + idx(0, stream), + ws_idx_sorted(0, stream), + ws_idx_selected(0, stream), + ws_idx_save(0, stream), + ws_priority(0, stream), + ws_priority_sorted(0, stream), + d_num_selected(stream) + { + n_train = (svmType == EPSILON_SVR) ? n_rows * 2 : n_rows; + SetSize(n_train, n_ws); + } + + ~WorkingSet() {} + + /** + * @brief Set the size of the working set and allocate buffers accordingly. + * + * @param n_train number of training vectors + * @param n_ws working set size (default min(1024, n_train)) + */ + void SetSize(int n_train, int n_ws = 0) + { + if (n_ws == 0 || n_ws > n_train) { n_ws = n_train; } + n_ws = std::min(1024, n_ws); + this->n_ws = n_ws; + CUML_LOG_DEBUG("Creating working set with %d elements", n_ws); + AllocateBuffers(); + } + + /** Return the size of the working set. */ + int GetSize() { return n_ws; } + + /** + * @brief Return a device pointer to the the working set indices. + * + * The returned array is owned by WorkingSet. + */ + int* GetIndices() { return idx.data(); } + + /** + * @brief Select new elements for a working set. + * + * Here we follow the working set selection strategy by Joachims [1], we + * select n training instances as: + * - select n/2 element of upper set, where f is largest + * - select n/2 from lower set, where f is smallest + * + * The difference compared to Joachims' strategy is that we can already have + * some elements selected by a different strategy, therefore we select only + * n = n_ws - n_already_selected. + * + * References: + * [1] Joachims, T. (1998). Making large-scale support vector machine learning + * practical. In B. Scholkopf, C. Burges, & A. Smola (Eds.), Advances in + * kernel methods: Support vector machines. Cambridge, MA: MIT Press + * + * @param f optimality indicator vector, size [n_train] + * @param alpha dual coefficients, size [n_train] + * @param y target labels (+/- 1) + * @param C penalty parameter vector size [n_train] + * @param n_already_selected + */ + + void SimpleSelect( + math_t* f, math_t* alpha, math_t* y, const math_t* C, int n_already_selected = 0); + + /** + * @brief Select working set indices. 
+ * + * To avoid training vectors oscillating in and out of the working set, we + * keep half of the previous working set, and fill new elements only to the + * other half. + * + * We can have a FIFO retention policy, or we can + * consider the time (=ws_priority) a vector already spent in the ws. + * References: + * [1] Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal + * of Machine Learning Research, 19, 1-5 (2018) + * + * @param f optimality indicator vector, size [n_train] + * @param alpha dual coefficients, size [n_train] + * @param y class labels, size [n_train] + * @param C penalty parameter vector, size [n_train] + */ + void Select(math_t* f, math_t* alpha, math_t* y, const math_t* C) + { + if (n_ws >= n_train) { + // All elements are selected, we have initialized idx to cover this case + return; + } + int nc = n_ws / 4; + int n_selected = 0; + if (firstcall) { + if (nc >= 1) { + firstcall = false; + } else { + // This can only happen for n_ws < 4. + // We keep the calculation always in firstcall mode (only SimpleSelect + // is used, no advanced strategies because we do not have enough elements) + // + // Nothing to do, firstcall is already true + } + } else { + // keep 1/2 of the old working set + if (FIFO_strategy) { + // FIFO selection following ThunderSVM + raft::copy(idx.data(), ws_idx_save.data() + 2 * nc, 2 * nc, stream); + n_selected = nc * 2; + } else { + // priority based selection preferring to keep newer elements in ws + n_selected = PrioritySelect(alpha, C, nc); + } + } + SimpleSelect(f, alpha, y, C, n_selected); + raft::copy(ws_idx_save.data(), idx.data(), n_ws, stream); + } + + /** + * @brief Select elements from the previous working set based on their priority. + * + * We sort the old working set based on their priority in ascending order, + * and then select nc elements from free, and then lower/upper bound vectors. + * For details see [2]. + * + * See Issue #946. + * + * References: + * [2] T Serafini, L Zanni: On the Working Set selection in grad. projection + * based decomposition techniques for Support Vector Machines + * DOI: 10.1080/10556780500140714 + * + * @param [in] alpha device vector of dual coefficients, size [n_train] + * @param [in] C penalty parameter + * @param [in] nc number of elements to select + */ + int PrioritySelect(math_t* alpha, const math_t* C, int nc); + + private: + const raft::handle_t& handle; + cudaStream_t stream; + + bool firstcall = true; + int n_train = 0; ///< number of training vectors (including duplicates for SVR) + int n_rows = 0; ///< number of original training vectors (no duplicates) + int n_ws = 0; + + SvmType svmType; + + int TPB = 256; //!< Threads per block for workspace selection kernels + + // Buffers for the domain size [n_train] + rmm::device_uvector f_idx; //!< Arrays used for sorting for sorting + rmm::device_uvector f_idx_sorted; + //! Temporary buffer for index manipulation + rmm::device_uvector idx_tmp; + rmm::device_uvector f_sorted; + //! 
Flag vectors available for selection + rmm::device_uvector available; + rmm::device_uvector available_sorted; + + // working set buffers size [n_ws] + rmm::device_uvector idx; //!< Indices of the working set + rmm::device_uvector ws_idx_sorted; + rmm::device_uvector ws_idx_selected; + rmm::device_uvector ws_idx_save; + + rmm::device_uvector ws_priority; + rmm::device_uvector ws_priority_sorted; + + rmm::device_scalar d_num_selected; + std::size_t cub_bytes = 0; + rmm::device_uvector cub_storage; + + void AllocateBuffers(); + + /** + * @brief Gather available elements from the working set. + * + * We select the first (last) n_needed elements from the front (end) of + * f_idx_sorted. We ignore the elements that are already selected, and those + * where this->available is false. + * + * @param n_already_selected number of elements already selected (their indices + * are stored in idx[0:n_already_selected]) + * @param n_needed number of elements to be selected + * @param copy_front if true, then copy the elements from the front of the + * selected list, otherwise copy from the end of the list + * @return the number of elements copied (which might be less than n_needed) + */ + int GatherAvailable(int n_already_selected, int n_needed, bool copy_front); + + void Initialize(); + + /** + * @brief Select the first n_needed elements from ws_idx_sorted where op is true. + * + * The selected elements are appended to this->idx. + * + * @param n_needed number of elements that should be selected + * @param n_already_selected number of already selected elements + * @param op selection condition + * @return the number of elements selected + */ + template + int SelectPrevWs(int n_needed, int n_already_selected, select_op op); +}; + +}; // end namespace SVM +}; // end namespace ML diff --git a/cpp/src/svm/ws_util.cu b/cpp/src/svm/ws_util.cu deleted file mode 100644 index 26de7683e5..0000000000 --- a/cpp/src/svm/ws_util.cu +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include - -#include - -#include - -namespace ML { -namespace SVM { - -__global__ void set_unavailable(bool* available, int n_rows, const int* idx, int n_selected) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < n_selected) { available[idx[tid]] = false; } -} - -__global__ void update_priority(int* new_priority, - int n_selected, - const int* new_idx, - int n_ws, - const int* idx, - const int* priority) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < n_selected) { - int my_new_idx = new_idx[tid]; - // The working set size is limited (~1024 elements) so we just loop through it - for (int i = 0; i < n_ws; i++) { - if (idx[i] == my_new_idx) new_priority[tid] = priority[i] + 1; - } - } -} -} // namespace SVM -} // namespace ML diff --git a/cpp/src/svm/ws_util.cuh b/cpp/src/svm/ws_util.cuh index 6cef4b38e3..dc4daf24c8 100644 --- a/cpp/src/svm/ws_util.cuh +++ b/cpp/src/svm/ws_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,11 @@ */ #pragma once + #include "smo_sets.cuh" +#include + namespace ML { namespace SVM { @@ -27,7 +30,11 @@ namespace SVM { * \param [in] idx list of indices already selected, size [n_selected] * \param [in] n_selected number of elements in the idx list */ -__global__ void set_unavailable(bool* available, int n_rows, const int* idx, int n_selected); +CUML_KERNEL void set_unavailable(bool* available, int n_rows, const int* idx, int n_selected) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n_selected) { available[idx[tid]] = false; } +} /** Set availability to true for elements in the upper set, otherwise false. 
* @param [out] available size [n] @@ -37,7 +44,7 @@ __global__ void set_unavailable(bool* available, int n_rows, const int* idx, int * @param [in] C penalty factor */ template -__global__ void set_upper( +CUML_KERNEL void set_upper( bool* available, int n, const math_t* alpha, const math_t* y, const math_t* C) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -52,7 +59,7 @@ __global__ void set_upper( * @param [in] C penalty factor */ template -__global__ void set_lower( +CUML_KERNEL void set_lower( bool* available, int n, const math_t* alpha, const math_t* y, const math_t* C) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -71,11 +78,21 @@ __global__ void set_lower( * @param [in] idx indices in the old working set, size [n_ws] * @param [in] priority of elements in the old working set, size [n_ws] */ -__global__ void update_priority(int* new_priority, - int n_selected, - const int* new_idx, - int n_ws, - const int* idx, - const int* priority); +CUML_KERNEL void update_priority(int* new_priority, + int n_selected, + const int* new_idx, + int n_ws, + const int* idx, + const int* priority) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n_selected) { + int my_new_idx = new_idx[tid]; + // The working set size is limited (~1024 elements) so we just loop through it + for (int i = 0; i < n_ws; i++) { + if (idx[i] == my_new_idx) new_priority[tid] = priority[i] + 1; + } + } +} } // namespace SVM } // namespace ML diff --git a/cpp/src/tsa/auto_arima.cuh b/cpp/src/tsa/auto_arima.cuh index d9c5342106..2bf049c04b 100644 --- a/cpp/src/tsa/auto_arima.cuh +++ b/cpp/src/tsa/auto_arima.cuh @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -123,12 +125,12 @@ inline int divide_by_mask_build_index(const bool* d_mask, * @param[in] n_obs Number of data points per series */ template -__global__ void divide_by_mask_kernel(const DataT* d_in, - const bool* d_mask, - const int* d_index, - DataT* d_out0, - DataT* d_out1, - int n_obs) +CUML_KERNEL void divide_by_mask_kernel(const DataT* d_in, + const bool* d_mask, + const int* d_index, + DataT* d_out0, + DataT* d_out1, + int n_obs) { const DataT* b_in = d_in + n_obs * blockIdx.x; DataT* b_out = (d_mask[blockIdx.x] ? 
d_out1 : d_out0) + n_obs * d_index[blockIdx.x]; @@ -262,7 +264,7 @@ inline void divide_by_min_build_index(const DataT* d_matrix, * @param[in] n_obs Number of data points per series */ template -__global__ void divide_by_min_kernel( +CUML_KERNEL void divide_by_min_kernel( const DataT* d_in, const int* d_batch, const int* d_index, DataT** d_out, int n_obs) { const DataT* b_in = d_in + n_obs * blockIdx.x; @@ -325,10 +327,10 @@ inline void divide_by_min_execute(const DataT* d_in, * @param[out] d_id_to_model Array associating each member with its * sub-batch */ -__global__ void build_division_map_kernel(const int* const* d_id, - const int* d_size, - int* d_id_to_pos, - int* d_id_to_model) +CUML_KERNEL void build_division_map_kernel(const int* const* d_id, + const int* d_size, + int* d_id_to_pos, + int* d_id_to_model) { const int* b_id = d_id[blockIdx.x]; int b_size = d_size[blockIdx.x]; @@ -391,7 +393,7 @@ inline void build_division_map(const int* const* hd_id, * @param[in] n_obs Number of observations (or forecasts) per series */ template -__global__ void merge_series_kernel( +CUML_KERNEL void merge_series_kernel( const DataT* const* d_in, const int* d_id_to_pos, const int* d_id_to_sub, DataT* d_out, int n_obs) { const DataT* b_in = d_in[d_id_to_sub[blockIdx.x]] + n_obs * d_id_to_pos[blockIdx.x]; diff --git a/cpp/src/tsne/barnes_hut_kernels.cuh b/cpp/src/tsne/barnes_hut_kernels.cuh index b99e17c1c2..61408765cd 100644 --- a/cpp/src/tsne/barnes_hut_kernels.cuh +++ b/cpp/src/tsne/barnes_hut_kernels.cuh @@ -49,10 +49,10 @@ namespace BH { * Initializes the states of objects. This speeds the overall kernel up. */ template -__global__ void InitializationKernel(/*int *restrict errd, */ - unsigned* restrict limiter, - value_idx* restrict maxdepthd, - value_t* restrict radiusd) +CUML_KERNEL void InitializationKernel(/*int *restrict errd, */ + unsigned* restrict limiter, + value_idx* restrict maxdepthd, + value_t* restrict radiusd) { // errd[0] = 0; maxdepthd[0] = 1; @@ -64,11 +64,11 @@ __global__ void InitializationKernel(/*int *restrict errd, */ * Reset normalization back to 0. */ template -__global__ void Reset_Normalization(value_t* restrict Z_norm, - value_t* restrict radiusd_squared, - value_idx* restrict bottomd, - const value_idx NNODES, - const value_t* restrict radiusd) +CUML_KERNEL void Reset_Normalization(value_t* restrict Z_norm, + value_t* restrict radiusd_squared, + value_idx* restrict bottomd, + const value_idx NNODES, + const value_t* restrict radiusd) { Z_norm[0] = 0.0f; radiusd_squared[0] = radiusd[0] * radiusd[0]; @@ -80,7 +80,7 @@ __global__ void Reset_Normalization(value_t* restrict Z_norm, * Find 1/Z */ template -__global__ void Find_Normalization(value_t* restrict Z_norm, const value_idx N) +CUML_KERNEL void Find_Normalization(value_t* restrict Z_norm, const value_idx N) { Z_norm[0] = 1.0f / (Z_norm[0] - N); } @@ -89,20 +89,20 @@ __global__ void Find_Normalization(value_t* restrict Z_norm, const value_idx N) * Figures the bounding boxes for every point in the embedding. 
*/ template -__global__ __launch_bounds__(THREADS1) void BoundingBoxKernel(value_idx* restrict startd, - value_idx* restrict childd, - value_t* restrict massd, - value_t* restrict posxd, - value_t* restrict posyd, - value_t* restrict maxxd, - value_t* restrict maxyd, - value_t* restrict minxd, - value_t* restrict minyd, - const value_idx FOUR_NNODES, - const value_idx NNODES, - const value_idx N, - unsigned* restrict limiter, - value_t* restrict radiusd) +CUML_KERNEL __launch_bounds__(THREADS1) void BoundingBoxKernel(value_idx* restrict startd, + value_idx* restrict childd, + value_t* restrict massd, + value_t* restrict posxd, + value_t* restrict posyd, + value_t* restrict maxxd, + value_t* restrict maxyd, + value_t* restrict minxd, + value_t* restrict minyd, + const value_idx FOUR_NNODES, + const value_idx NNODES, + const value_idx N, + unsigned* restrict limiter, + value_t* restrict radiusd) { value_t val, minx, maxx, miny, maxy; __shared__ value_t sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1]; @@ -181,9 +181,9 @@ __global__ __launch_bounds__(THREADS1) void BoundingBoxKernel(value_idx* restric * Clear some of the state vectors up. */ template -__global__ __launch_bounds__(1024, 1) void ClearKernel1(value_idx* restrict childd, - const value_idx FOUR_NNODES, - const value_idx FOUR_N) +CUML_KERNEL __launch_bounds__(1024, 1) void ClearKernel1(value_idx* restrict childd, + const value_idx FOUR_NNODES, + const value_idx FOUR_N) { const auto inc = blockDim.x * gridDim.x; value_idx k = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x; @@ -200,15 +200,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(value_idx* restrict chil * See: https://iss.oden.utexas.edu/Publications/Papers/burtscher11.pdf */ template -__global__ __launch_bounds__(THREADS2) void TreeBuildingKernel(/* int *restrict errd, */ - value_idx* restrict childd, - const value_t* restrict posxd, - const value_t* restrict posyd, - const value_idx NNODES, - const value_idx N, - value_idx* restrict maxdepthd, - value_idx* restrict bottomd, - const value_t* restrict radiusd) +CUML_KERNEL __launch_bounds__(THREADS2) void TreeBuildingKernel(/* int *restrict errd, */ + value_idx* restrict childd, + const value_t* restrict posxd, + const value_t* restrict posyd, + const value_idx NNODES, + const value_idx N, + value_idx* restrict maxdepthd, + value_idx* restrict bottomd, + const value_t* restrict radiusd) { value_idx j, depth; value_t x, y, r; @@ -337,10 +337,10 @@ __global__ __launch_bounds__(THREADS2) void TreeBuildingKernel(/* int *restrict * Clean more state vectors. 
*/ template -__global__ __launch_bounds__(1024, 1) void ClearKernel2(value_idx* restrict startd, - value_t* restrict massd, - const value_idx NNODES, - const value_idx* restrict bottomd) +CUML_KERNEL __launch_bounds__(1024, 1) void ClearKernel2(value_idx* restrict startd, + value_t* restrict massd, + const value_idx NNODES, + const value_idx* restrict bottomd) { const auto bottom = bottomd[0]; const auto inc = blockDim.x * gridDim.x; @@ -359,15 +359,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(value_idx* restrict star * Summarize the KD Tree via cell gathering */ template -__global__ __launch_bounds__(THREADS3, - FACTOR3) void SummarizationKernel(value_idx* restrict countd, - const value_idx* restrict childd, - volatile value_t* restrict massd, - value_t* restrict posxd, - value_t* restrict posyd, - const value_idx NNODES, - const value_idx N, - const value_idx* restrict bottomd) +CUML_KERNEL __launch_bounds__(THREADS3, + FACTOR3) void SummarizationKernel(value_idx* restrict countd, + const value_idx* restrict childd, + volatile value_t* restrict massd, + value_t* restrict posxd, + value_t* restrict posyd, + const value_idx NNODES, + const value_idx N, + const value_idx* restrict bottomd) { bool flag = 0; value_t cm, px, py; @@ -495,13 +495,14 @@ __global__ __launch_bounds__(THREADS3, * Sort the cells */ template -__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(value_idx* restrict sortd, - const value_idx* restrict countd, - volatile value_idx* restrict startd, - value_idx* restrict childd, - const value_idx NNODES, - const value_idx N, - const value_idx* restrict bottomd) +CUML_KERNEL __launch_bounds__(THREADS4, + FACTOR4) void SortKernel(value_idx* restrict sortd, + const value_idx* restrict countd, + volatile value_idx* restrict startd, + value_idx* restrict childd, + const value_idx NNODES, + const value_idx N, + const value_idx* restrict bottomd) { const value_idx bottom = bottomd[0]; const value_idx dec = blockDim.x * gridDim.x; @@ -545,7 +546,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(value_idx* restr * Calculate the repulsive forces using the KD Tree */ template -__global__ __launch_bounds__( +CUML_KERNEL __launch_bounds__( THREADS5, 1) void RepulsionKernel(/* int *restrict errd, */ const float theta, const float epssqd, // correction for zero distance @@ -678,16 +679,16 @@ __global__ __launch_bounds__( * Fast attractive kernel. Uses COO matrix. */ template -__global__ void attractive_kernel_bh(const value_t* restrict VAL, - const value_idx* restrict COL, - const value_idx* restrict ROW, - const value_t* restrict Y1, - const value_t* restrict Y2, - value_t* restrict attract1, - value_t* restrict attract2, - value_t* restrict Qs, - const value_idx NNZ, - const value_t dof) +CUML_KERNEL void attractive_kernel_bh(const value_t* restrict VAL, + const value_idx* restrict COL, + const value_idx* restrict ROW, + const value_t* restrict Y1, + const value_t* restrict Y2, + value_t* restrict attract1, + value_t* restrict attract2, + value_t* restrict Qs, + const value_idx NNZ, + const value_t dof) { const auto index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= NNZ) return; @@ -720,21 +721,21 @@ __global__ void attractive_kernel_bh(const value_t* restrict VAL, * Apply gradient updates. 
*/ template -__global__ __launch_bounds__(THREADS6, 1) void IntegrationKernel(const float eta, - const float momentum, - const float exaggeration, - value_t* restrict Y1, - value_t* restrict Y2, - const value_t* restrict attract1, - const value_t* restrict attract2, - const value_t* restrict repel1, - const value_t* restrict repel2, - value_t* restrict gains1, - value_t* restrict gains2, - value_t* restrict old_forces1, - value_t* restrict old_forces2, - const value_t* restrict Z, - const value_idx N) +CUML_KERNEL __launch_bounds__(THREADS6, 1) void IntegrationKernel(const float eta, + const float momentum, + const float exaggeration, + value_t* restrict Y1, + value_t* restrict Y2, + const value_t* restrict attract1, + const value_t* restrict attract2, + const value_t* restrict repel1, + const value_t* restrict repel2, + value_t* restrict gains1, + value_t* restrict gains2, + value_t* restrict old_forces1, + value_t* restrict old_forces2, + const value_t* restrict Z, + const value_idx N) { value_t ux, uy, gx, gy; diff --git a/cpp/src/tsne/cannylab/bh.cu b/cpp/src/tsne/cannylab/bh.cu index fe3949822a..686af3f5e0 100644 --- a/cpp/src/tsne/cannylab/bh.cu +++ b/cpp/src/tsne/cannylab/bh.cu @@ -72,7 +72,7 @@ __device__ volatile float radiusd; /*** initialize memory ********************************************************/ /******************************************************************************/ -__global__ void InitializationKernel() +CUML_KERNEL void InitializationKernel() { stepd = -1; blkcntd = 0; @@ -82,14 +82,14 @@ __global__ void InitializationKernel() /*** compute center and radius ************************************************/ /******************************************************************************/ -__global__ __launch_bounds__(THREADS1, - FACTOR1) void BoundingBoxKernel(const int nnodesd, - const int nbodiesd, - int* const __restrict__ startd, - int* const __restrict__ childd, - float4* const __restrict__ posMassd, - float3* const __restrict__ maxd, - float3* const __restrict__ mind) +CUML_KERNEL __launch_bounds__(THREADS1, + FACTOR1) void BoundingBoxKernel(const int nnodesd, + const int nbodiesd, + int* const __restrict__ startd, + int* const __restrict__ childd, + float4* const __restrict__ posMassd, + float3* const __restrict__ maxd, + float3* const __restrict__ mind) { int i, j, k, inc; float val; @@ -189,9 +189,9 @@ __global__ __launch_bounds__(THREADS1, /*** build tree ***************************************************************/ /******************************************************************************/ -__global__ __launch_bounds__(1024, 1) void ClearKernel1(const int nnodesd, - const int nbodiesd, - int* const __restrict__ childd) +CUML_KERNEL __launch_bounds__(1024, 1) void ClearKernel1(const int nnodesd, + const int nbodiesd, + int* const __restrict__ childd) { int k, inc, top, bottom; @@ -208,7 +208,7 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(const int nnodesd, } } -__global__ __launch_bounds__(THREADS2, FACTOR2) void TreeBuildingKernel( +CUML_KERNEL __launch_bounds__(THREADS2, FACTOR2) void TreeBuildingKernel( const int nnodesd, const int nbodiesd, volatile int* const __restrict__ childd, @@ -355,9 +355,9 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void TreeBuildingKernel( } } -__global__ __launch_bounds__(1024, 1) void ClearKernel2(const int nnodesd, - int* const __restrict__ startd, - float4* const __restrict__ posMassd) +CUML_KERNEL __launch_bounds__(1024, 1) void ClearKernel2(const int nnodesd, + int* const __restrict__ 
startd, + float4* const __restrict__ posMassd) { int k, inc, bottom; @@ -378,7 +378,7 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(const int nnodesd, /*** compute center of mass ***************************************************/ /******************************************************************************/ -__global__ __launch_bounds__(THREADS3, FACTOR3) void SummarizationKernel( +CUML_KERNEL __launch_bounds__(THREADS3, FACTOR3) void SummarizationKernel( const int nnodesd, const int nbodiesd, volatile int* const __restrict__ countd, @@ -524,13 +524,13 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void SummarizationKernel( /*** sort bodies **************************************************************/ /******************************************************************************/ -__global__ __launch_bounds__(THREADS4, - FACTOR4) void SortKernel(const int nnodesd, - const int nbodiesd, - int* const __restrict__ sortd, - const int* const __restrict__ countd, - volatile int* const __restrict__ startd, - int* const __restrict__ childd) +CUML_KERNEL __launch_bounds__(THREADS4, + FACTOR4) void SortKernel(const int nnodesd, + const int nbodiesd, + int* const __restrict__ sortd, + const int* const __restrict__ countd, + volatile int* const __restrict__ startd, + int* const __restrict__ childd) { int i, j, k, ch, dec, start, bottom; @@ -573,7 +573,7 @@ __global__ __launch_bounds__(THREADS4, /*** compute force ************************************************************/ /******************************************************************************/ -__global__ __launch_bounds__(THREADS5, FACTOR5) void ForceCalculationKernel( +CUML_KERNEL __launch_bounds__(THREADS5, FACTOR5) void ForceCalculationKernel( const int nnodesd, const int nbodiesd, const float dthfd, @@ -692,13 +692,13 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void ForceCalculationKernel( /*** advance bodies ***********************************************************/ /******************************************************************************/ -__global__ __launch_bounds__(THREADS6, - FACTOR6) void IntegrationKernel(const int nbodiesd, - const float dtimed, - const float dthfd, - float4* const __restrict__ posMass, - float2* const __restrict__ veld, - float4* const __restrict__ accVeld) +CUML_KERNEL __launch_bounds__(THREADS6, + FACTOR6) void IntegrationKernel(const int nbodiesd, + const float dtimed, + const float dthfd, + float4* const __restrict__ posMass, + float2* const __restrict__ veld, + float4* const __restrict__ accVeld) { int i, inc; float dvelx, dvely, dvelz; @@ -1104,4 +1104,4 @@ int main(int argc, char* argv[]) cudaFree(minl); return 0; -} \ No newline at end of file +} diff --git a/cpp/src/tsne/distances.cuh b/cpp/src/tsne/distances.cuh index d9e831cc34..a221d70820 100644 --- a/cpp/src/tsne/distances.cuh +++ b/cpp/src/tsne/distances.cuh @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -162,7 +163,8 @@ void get_distances(const raft::handle_t& handle, template void normalize_distances(value_t* distances, const size_t total_nn, cudaStream_t stream) { - auto abs_f = [] __device__(const value_t& x) { return abs(x); }; + auto abs_f = cuda::proclaim_return_type( + [] __device__(const value_t& x) -> value_t { return abs(x); }); value_t maxNorm = thrust::transform_reduce(rmm::exec_policy(stream), distances, distances + total_nn, diff --git a/cpp/src/tsne/exact_kernels.cuh b/cpp/src/tsne/exact_kernels.cuh index 694063700c..fff11d8564 100644 --- a/cpp/src/tsne/exact_kernels.cuh 
+++ b/cpp/src/tsne/exact_kernels.cuh @@ -35,14 +35,14 @@ namespace TSNE { /* Finds the best Gaussian bandwidth for each row in the dataset */ template -__global__ void sigmas_kernel(const value_t* restrict distances, - value_t* restrict P, - const float perplexity, - const float desired_entropy, - const int epochs, - const float tol, - const value_idx n, - const int k) +CUML_KERNEL void sigmas_kernel(const value_t* restrict distances, + value_t* restrict P, + const float perplexity, + const float desired_entropy, + const int epochs, + const float tol, + const value_idx n, + const int k) { // For every item in row const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -94,13 +94,13 @@ __global__ void sigmas_kernel(const value_t* restrict distances, /* Finds the best Gaussian bandwidth for each row in the dataset */ template -__global__ void sigmas_kernel_2d(const value_t* restrict distances, - value_t* restrict P, - const float perplexity, - const float desired_entropy, - const int epochs, - const float tol, - const value_idx n) +CUML_KERNEL void sigmas_kernel_2d(const value_t* restrict distances, + value_t* restrict P, + const float perplexity, + const float desired_entropy, + const int epochs, + const float tol, + const value_idx n) { // For every item in row const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -171,17 +171,17 @@ void perplexity_search(const value_t* restrict distances, /* Compute attractive forces in O(uN) time. Uses only nearest neighbors */ template -__global__ void attractive_kernel(const value_t* restrict VAL, - const value_idx* restrict COL, - const value_idx* restrict ROW, - const value_t* restrict Y, - const value_t* restrict norm, - value_t* restrict attract, - value_t* restrict Qs, - const value_idx NNZ, - const value_idx n, - const value_idx dim, - const value_t dof) +CUML_KERNEL void attractive_kernel(const value_t* restrict VAL, + const value_idx* restrict COL, + const value_idx* restrict ROW, + const value_t* restrict Y, + const value_t* restrict norm, + value_t* restrict attract, + value_t* restrict Qs, + const value_idx NNZ, + const value_idx n, + const value_idx dim, + const value_t dof) { const auto index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= NNZ) return; @@ -213,17 +213,17 @@ __global__ void attractive_kernel(const value_t* restrict VAL, /* Special case when dim == 2. Can speed up many calculations up */ template -__global__ void attractive_kernel_2d(const value_t* restrict VAL, - const value_idx* restrict COL, - const value_idx* restrict ROW, - const value_t* restrict Y1, - const value_t* restrict Y2, - const value_t* restrict norm, - value_t* restrict attract1, - value_t* restrict attract2, - value_t* restrict Qs, - const value_idx NNZ, - const value_t dof) +CUML_KERNEL void attractive_kernel_2d(const value_t* restrict VAL, + const value_idx* restrict COL, + const value_idx* restrict ROW, + const value_t* restrict Y1, + const value_t* restrict Y2, + const value_t* restrict norm, + value_t* restrict attract1, + value_t* restrict attract2, + value_t* restrict Qs, + const value_idx NNZ, + const value_t dof) { const auto index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= NNZ) return; @@ -284,15 +284,15 @@ void attractive_forces(const value_t* restrict VAL, time where many of the math ops are made considerably faster. 
*/ template -__global__ void repulsive_kernel(const value_t* restrict Y, - value_t* restrict repel, - const value_t* restrict norm, - value_t* restrict Z_sum1, - value_t* restrict Z_sum2, - const value_idx n, - const value_idx dim, - const value_t df_power, // -(df + 1)/2) - const value_t recp_df) // 1 / df +CUML_KERNEL void repulsive_kernel(const value_t* restrict Y, + value_t* restrict repel, + const value_t* restrict norm, + value_t* restrict Z_sum1, + value_t* restrict Z_sum2, + const value_idx n, + const value_idx dim, + const value_t df_power, // -(df + 1)/2) + const value_t recp_df) // 1 / df { const auto j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row const auto i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row @@ -327,14 +327,14 @@ __global__ void repulsive_kernel(const value_t* restrict Y, /* Special case when dim == 2. Much faster since calculations are streamlined. */ template -__global__ void repulsive_kernel_2d(const value_t* restrict Y1, - const value_t* restrict Y2, - value_t* restrict repel1, - value_t* restrict repel2, - const value_t* restrict norm, - value_t* restrict Z_sum1, - value_t* restrict Z_sum2, - const value_idx n) +CUML_KERNEL void repulsive_kernel_2d(const value_t* restrict Y1, + const value_t* restrict Y2, + value_t* restrict repel1, + value_t* restrict repel2, + const value_t* restrict norm, + value_t* restrict Z_sum1, + value_t* restrict Z_sum2, + const value_idx n) { const auto j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row const auto i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row @@ -405,22 +405,22 @@ value_t repulsive_forces(const value_t* restrict Y, more gains and constrains the output for output stability */ template -__global__ void apply_kernel(value_t* restrict Y, - value_t* restrict velocity, - const value_t* restrict attract, - const value_t* restrict repel, - value_t* restrict means, - value_t* restrict gains, - const float Z, // sum(Q) - const float learning_rate, - const float C, // constant from T-Dist Degrees of Freedom - const float exaggeration, - const float momentum, - const value_idx SIZE, // SIZE = n*dim - const value_idx n, - const float min_gain, - value_t* restrict gradient, - const bool check_convergence) +CUML_KERNEL void apply_kernel(value_t* restrict Y, + value_t* restrict velocity, + const value_t* restrict attract, + const value_t* restrict repel, + value_t* restrict means, + value_t* restrict gains, + const float Z, // sum(Q) + const float learning_rate, + const float C, // constant from T-Dist Degrees of Freedom + const float exaggeration, + const float momentum, + const value_idx SIZE, // SIZE = n*dim + const value_idx n, + const float min_gain, + value_t* restrict gradient, + const bool check_convergence) { const auto index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= SIZE) return; diff --git a/cpp/src/tsne/fft_kernels.cuh b/cpp/src/tsne/fft_kernels.cuh index 79508783b4..20ad7ac55c 100644 --- a/cpp/src/tsne/fft_kernels.cuh +++ b/cpp/src/tsne/fft_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,11 +30,11 @@ namespace TSNE { namespace FFT { template -__global__ void compute_chargesQij(volatile value_t* __restrict__ chargesQij, - const value_t* __restrict__ xs, - const value_t* __restrict__ ys, - const value_idx num_points, - const value_idx n_terms) +CUML_KERNEL void compute_chargesQij(volatile value_t* __restrict__ chargesQij, + const value_t* __restrict__ xs, + const value_t* __restrict__ ys, + const value_idx num_points, + const value_idx n_terms) { int TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= num_points) return; @@ -49,12 +49,12 @@ __global__ void compute_chargesQij(volatile value_t* __restrict__ chargesQij, } template -__global__ void compute_bounds(volatile value_t* __restrict__ box_lower_bounds, - const value_t box_width, - const value_t x_min, - const value_t y_min, - const value_idx n_boxes, - const value_idx n_total_boxes) +CUML_KERNEL void compute_bounds(volatile value_t* __restrict__ box_lower_bounds, + const value_t box_width, + const value_t x_min, + const value_t y_min, + const value_idx n_boxes, + const value_idx n_total_boxes) { const int TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_boxes * n_boxes) return; @@ -76,12 +76,12 @@ HDI value_t squared_cauchy_2d(value_t x1, value_t x2, value_t y1, value_t y2) } template -__global__ void compute_kernel_tilde(volatile value_t* __restrict__ kernel_tilde, - const value_t x_min, - const value_t y_min, - const value_t h, - const value_idx n_interpolation_points_1d, - const value_idx n_fft_coeffs) +CUML_KERNEL void compute_kernel_tilde(volatile value_t* __restrict__ kernel_tilde, + const value_t x_min, + const value_t y_min, + const value_t h, + const value_idx n_interpolation_points_1d, + const value_idx n_fft_coeffs) { const int TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_interpolation_points_1d * n_interpolation_points_1d) return; @@ -104,17 +104,17 @@ __global__ void compute_kernel_tilde(volatile value_t* __restrict__ kernel_tilde } template -__global__ void compute_point_box_idx(volatile value_idx* __restrict__ point_box_idx, - volatile value_t* __restrict__ x_in_box, - volatile value_t* __restrict__ y_in_box, - const value_t* const xs, - const value_t* const ys, - const value_t* const box_lower_bounds, - const value_t min_coord, - const value_t box_width, - const value_idx n_boxes, - const value_idx n_total_boxes, - const value_idx N) +CUML_KERNEL void compute_point_box_idx(volatile value_idx* __restrict__ point_box_idx, + volatile value_t* __restrict__ x_in_box, + volatile value_t* __restrict__ y_in_box, + const value_t* const xs, + const value_t* const ys, + const value_t* const box_lower_bounds, + const value_t min_coord, + const value_t box_width, + const value_idx n_boxes, + const value_idx n_total_boxes, + const value_idx N) { const value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= N) return; @@ -136,12 +136,12 @@ __global__ void compute_point_box_idx(volatile value_idx* __restrict__ point_box } template -__global__ void interpolate_device(volatile value_t* __restrict__ interpolated_values, - const value_t* const y_in_box, - const value_t* const y_tilde_spacings, - const value_t* const denominator, - const value_idx n_interpolation_points, - const value_idx N) +CUML_KERNEL void interpolate_device(volatile value_t* __restrict__ interpolated_values, + const value_t* const y_in_box, + const value_t* const y_tilde_spacings, + const value_t* const denominator, + const value_idx n_interpolation_points, + const value_idx N) { const value_idx TID = 
threadIdx.x + blockIdx.x * blockDim.x; if (TID >= N * n_interpolation_points) return; @@ -160,15 +160,15 @@ __global__ void interpolate_device(volatile value_t* __restrict__ interpolated_v } template -__global__ void compute_interpolated_indices(value_t* __restrict__ w_coefficients_device, - const value_idx* const point_box_indices, - const value_t* const chargesQij, - const value_t* const x_interpolated_values, - const value_t* const y_interpolated_values, - const value_idx N, - const value_idx n_interpolation_points, - const value_idx n_boxes, - const value_idx n_terms) +CUML_KERNEL void compute_interpolated_indices(value_t* __restrict__ w_coefficients_device, + const value_idx* const point_box_indices, + const value_t* const chargesQij, + const value_t* const x_interpolated_values, + const value_t* const y_interpolated_values, + const value_idx N, + const value_idx n_interpolation_points, + const value_idx n_boxes, + const value_idx n_terms) { value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_terms * n_interpolation_points * n_interpolation_points * N) return; @@ -190,11 +190,11 @@ __global__ void compute_interpolated_indices(value_t* __restrict__ w_coefficient } template -__global__ void copy_to_fft_input(volatile value_t* __restrict__ fft_input, - const value_t* w_coefficients_device, - const value_idx n_fft_coeffs, - const value_idx n_fft_coeffs_half, - const value_idx n_terms) +CUML_KERNEL void copy_to_fft_input(volatile value_t* __restrict__ fft_input, + const value_t* w_coefficients_device, + const value_idx n_fft_coeffs, + const value_idx n_fft_coeffs_half, + const value_idx n_terms) { const value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_terms * n_fft_coeffs_half * n_fft_coeffs_half) return; @@ -210,11 +210,11 @@ __global__ void copy_to_fft_input(volatile value_t* __restrict__ fft_input, } template -__global__ void copy_from_fft_output(volatile value_t* __restrict__ y_tilde_values, - const value_t* fft_output, - const value_idx n_fft_coeffs, - const value_idx n_fft_coeffs_half, - const value_idx n_terms) +CUML_KERNEL void copy_from_fft_output(volatile value_t* __restrict__ y_tilde_values, + const value_t* fft_output, + const value_idx n_fft_coeffs, + const value_idx n_fft_coeffs_half, + const value_idx n_terms) { const value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_terms * n_fft_coeffs_half * n_fft_coeffs_half) return; @@ -232,13 +232,13 @@ __global__ void copy_from_fft_output(volatile value_t* __restrict__ y_tilde_valu // Template so that division is by compile-time divisors. 
template -__global__ void compute_potential_indices(value_t* __restrict__ potentialsQij, - const value_idx* const point_box_indices, - const value_t* const y_tilde_values, - const value_t* const x_interpolated_values, - const value_t* const y_interpolated_values, - const value_idx N, - const value_idx n_boxes) +CUML_KERNEL void compute_potential_indices(value_t* __restrict__ potentialsQij, + const value_idx* const point_box_indices, + const value_t* const y_tilde_values, + const value_t* const x_interpolated_values, + const value_t* const y_interpolated_values, + const value_idx N, + const value_idx n_boxes) { const value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_terms * n_interpolation_points * n_interpolation_points * N) return; @@ -263,10 +263,10 @@ __global__ void compute_potential_indices(value_t* __restrict__ potentialsQij, } template -__global__ void broadcast_column_vector(cuComplex* __restrict__ mat, - cuComplex* __restrict__ vec, - value_idx n, - value_idx m) +CUML_KERNEL void broadcast_column_vector(cuComplex* __restrict__ mat, + cuComplex* __restrict__ vec, + value_idx n, + value_idx m) { const value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; const value_idx i = TID % n; @@ -278,7 +278,7 @@ __global__ void broadcast_column_vector(cuComplex* __restrict__ mat, } template -__global__ void compute_repulsive_forces_kernel( +CUML_KERNEL void compute_repulsive_forces_kernel( volatile value_t* __restrict__ repulsive_forces_device, volatile value_t* __restrict__ normalization_vec_device, const value_t* const xs, @@ -306,15 +306,15 @@ __global__ void compute_repulsive_forces_kernel( } template -__global__ void compute_Pij_x_Qij_kernel(value_t* __restrict__ attr_forces, - value_t* __restrict__ Qs, - const value_t* __restrict__ pij, - const value_idx* __restrict__ coo_rows, - const value_idx* __restrict__ coo_cols, - const value_t* __restrict__ points, - const value_idx num_points, - const value_idx num_nonzero, - const value_t dof) +CUML_KERNEL void compute_Pij_x_Qij_kernel(value_t* __restrict__ attr_forces, + value_t* __restrict__ Qs, + const value_t* __restrict__ pij, + const value_idx* __restrict__ coo_rows, + const value_idx* __restrict__ coo_cols, + const value_t* __restrict__ points, + const value_idx num_points, + const value_idx num_nonzero, + const value_t dof) { const value_idx TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= num_nonzero) return; @@ -344,16 +344,16 @@ __global__ void compute_Pij_x_Qij_kernel(value_t* __restrict__ attr_forces, } template -__global__ void IntegrationKernel(volatile value_t* __restrict__ points, - volatile value_t* __restrict__ attr_forces, - volatile value_t* __restrict__ rep_forces, - volatile value_t* __restrict__ gains, - volatile value_t* __restrict__ old_forces, - const value_t eta, - const value_t normalization, - const value_t momentum, - const value_t exaggeration, - const value_idx num_points) +CUML_KERNEL void IntegrationKernel(volatile value_t* __restrict__ points, + volatile value_t* __restrict__ attr_forces, + volatile value_t* __restrict__ rep_forces, + volatile value_t* __restrict__ gains, + volatile value_t* __restrict__ old_forces, + const value_t eta, + const value_t normalization, + const value_t momentum, + const value_t exaggeration, + const value_idx num_points) { // iterate over all bodies assigned to thread const value_idx inc = blockDim.x * gridDim.x; diff --git a/cpp/src/tsne/utils.cuh b/cpp/src/tsne/utils.cuh index e1940152eb..895fe412d2 100644 --- a/cpp/src/tsne/utils.cuh +++ 
b/cpp/src/tsne/utils.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -38,6 +39,7 @@ #include #include +#include #include #include @@ -147,7 +149,7 @@ double SymmetrizeTime = 0, DistancesTime = 0, NormalizeTime = 0, PerplexityTime } template -__global__ void min_max_kernel( +CUML_KERNEL void min_max_kernel( const value_t* Y, const value_idx n, value_t* min, value_t* max, bool find_min = true) { auto tid = threadIdx.x + blockDim.x * blockIdx.x; @@ -182,10 +184,10 @@ __global__ void min_max_kernel( * CUDA kernel to compute KL divergence */ template -__global__ void compute_kl_div_k(const value_t* Ps, - const value_t* Qs, - value_t* __restrict__ KL_divs, - const value_idx NNZ) +CUML_KERNEL void compute_kl_div_k(const value_t* Ps, + const value_t* Qs, + value_t* __restrict__ KL_divs, + const value_idx NNZ) { const auto index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= NNZ) return; diff --git a/cpp/src/umap/fuzzy_simpl_set/naive.cuh b/cpp/src/umap/fuzzy_simpl_set/naive.cuh index 63798c9c28..f872b80c4b 100644 --- a/cpp/src/umap/fuzzy_simpl_set/naive.cuh +++ b/cpp/src/umap/fuzzy_simpl_set/naive.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -79,15 +80,15 @@ static const float MIN_K_DIST_SCALE = 1e-3; * */ template -__global__ void smooth_knn_dist_kernel(const value_t* knn_dists, - int n, - float mean_dist, - value_t* sigmas, - value_t* rhos, // Size of n, iniitalized to zeros - int n_neighbors, - float local_connectivity = 1.0, - int n_iter = 64, - float bandwidth = 1.0) +CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists, + int n, + float mean_dist, + value_t* sigmas, + value_t* rhos, // Size of n, iniitalized to zeros + int n_neighbors, + float local_connectivity = 1.0, + int n_iter = 64, + float bandwidth = 1.0) { // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -190,7 +191,7 @@ __global__ void smooth_knn_dist_kernel(const value_t* knn_dists, * Descriptions adapted from: https://github.com/lmcinnes/umap/blob/master/umap/umap_.py */ template -__global__ void compute_membership_strength_kernel( +CUML_KERNEL void compute_membership_strength_kernel( const value_idx* knn_indices, const float* knn_dists, // nn outputs const value_t* sigmas, diff --git a/cpp/src/umap/optimize.cuh b/cpp/src/umap/optimize.cuh index ba046a0f41..4862de112f 100644 --- a/cpp/src/umap/optimize.cuh +++ b/cpp/src/umap/optimize.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -38,7 +39,7 @@ namespace Optimize { using namespace ML; template -__global__ void map_kernel(T* output, T* X, int n_rows, T* coef, Lambda grad) +CUML_KERNEL void map_kernel(T* output, T* X, int n_rows, T* coef, Lambda grad) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n_rows) { diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh index f28eff0854..41bac31678 100644 --- a/cpp/src/umap/runner.cuh +++ b/cpp/src/umap/runner.cuh @@ -58,14 +58,14 @@ namespace SimplSetEmbedImpl = SimplSetEmbed::Algo; using namespace ML; template -__global__ void init_transform(int* indices, - T* weights, - int n, - const T* embeddings, - int embeddings_n, - int n_components, - T* result, - int n_neighbors) +CUML_KERNEL void init_transform(int* indices, + T* weights, + int n, + const T* embeddings, + int embeddings_n, + int n_components, + T* result, + int n_neighbors) { // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; diff --git 
a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh index d19cc4c54c..5fd34d2f3b 100644 --- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh +++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,26 +97,26 @@ DI T truncate_gradient(T const rounding_factor, T const x) } template -__global__ void optimize_batch_kernel_reg(T const* head_embedding, - T* head_buffer, - int head_n, - T const* tail_embedding, - T* tail_buffer, - const MLCommon::FastIntDiv tail_n, - const int* head, - const int* tail, - int nnz, - T const* epochs_per_sample, - T* epoch_of_next_negative_sample, - T* epoch_of_next_sample, - T alpha, - int epoch, - T gamma, - uint64_t seed, - bool move_other, - UMAPParams params, - T nsr_inv, - T rounding) +CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding, + T* head_buffer, + int head_n, + T const* tail_embedding, + T* tail_buffer, + const MLCommon::FastIntDiv tail_n, + const int* head, + const int* tail, + int nnz, + T const* epochs_per_sample, + T* epoch_of_next_negative_sample, + T* epoch_of_next_sample, + T alpha, + int epoch, + T gamma, + uint64_t seed, + bool move_other, + UMAPParams params, + T nsr_inv, + T rounding) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row >= nnz) return; @@ -211,26 +211,26 @@ __global__ void optimize_batch_kernel_reg(T const* head_embedding, } template -__global__ void optimize_batch_kernel(T const* head_embedding, - T* head_buffer, - int head_n, - T const* tail_embedding, - T* tail_buffer, - const MLCommon::FastIntDiv tail_n, - const int* head, - const int* tail, - int nnz, - T const* epochs_per_sample, - T* epoch_of_next_negative_sample, - T* epoch_of_next_sample, - T alpha, - int epoch, - T gamma, - uint64_t seed, - bool move_other, - UMAPParams params, - T nsr_inv, - T rounding) +CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, + T* head_buffer, + int head_n, + T const* tail_embedding, + T* tail_buffer, + const MLCommon::FastIntDiv tail_n, + const int* head, + const int* tail, + int nnz, + T const* epochs_per_sample, + T* epoch_of_next_negative_sample, + T* epoch_of_next_sample, + T alpha, + int epoch, + T gamma, + uint64_t seed, + bool move_other, + UMAPParams params, + T nsr_inv, + T rounding) { extern __shared__ T embedding_shared_mem_updates[]; int row = (blockIdx.x * TPB_X) + threadIdx.x; diff --git a/cpp/src/umap/supervised.cuh b/cpp/src/umap/supervised.cuh index 9ca86e512c..1156005ad2 100644 --- a/cpp/src/umap/supervised.cuh +++ b/cpp/src/umap/supervised.cuh @@ -52,7 +52,7 @@ namespace Supervised { using namespace ML; template -__global__ void fast_intersection_kernel( +CUML_KERNEL void fast_intersection_kernel( int* rows, int* cols, T* vals, int nnz, T* target, float unknown_dist = 1.0, float far_dist = 5.0) { int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -119,22 +119,22 @@ void categorical_simplicial_set_intersection(raft::sparse::COO* graph_c } template -__global__ void sset_intersection_kernel(int* row_ind1, - int* cols1, - value_t* vals1, - int nnz1, - int* row_ind2, - int* cols2, - value_t* vals2, - int nnz2, - int* result_ind, - int* result_cols, - value_t* result_vals, - int nnz, - value_t left_min, - value_t right_min, - int m, - float mix_weight = 0.5) 
+CUML_KERNEL void sset_intersection_kernel(int* row_ind1, + int* cols1, + value_t* vals1, + int nnz1, + int* row_ind2, + int* cols2, + value_t* vals2, + int nnz2, + int* result_ind, + int* result_cols, + value_t* result_vals, + int nnz, + value_t left_min, + value_t right_min, + int m, + float mix_weight = 0.5) { int row = (blockIdx.x * TPB_X) + threadIdx.x; diff --git a/cpp/src_prims/common/grid_sync.cuh b/cpp/src_prims/common/grid_sync.cuh index 323c89e61d..30c6c92894 100644 --- a/cpp/src_prims/common/grid_sync.cuh +++ b/cpp/src_prims/common/grid_sync.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,7 @@ enum SyncType { * know the list of supported synchronization 'modes'. * * @code{.cu} - * __global__ void kernel(void* workspace, SyncType type, ...) { + * CUML_KERNEL void kernel(void* workspace, SyncType type, ...) { * GridSync gs(workspace, type); * // do pre-sync work here * // ... @@ -70,7 +70,7 @@ enum SyncType { * usage is discouraged. Example follows: * * @code{.cu} - * __global__ void kernelMultiple(void* workspace, SyncType type, ...) { + * CUML_KERNEL void kernelMultiple(void* workspace, SyncType type, ...) { * GridSync gs(workspace, type, true); * ////// Part1 ////// * // do pre-sync work here diff --git a/cpp/src_prims/common/iota.cuh b/cpp/src_prims/common/iota.cuh index 99608da878..c09822c7ac 100644 --- a/cpp/src_prims/common/iota.cuh +++ b/cpp/src_prims/common/iota.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ namespace MLCommon { template -__global__ void iotaKernel(DataT* out, DataT start, DataT step, IdxT len) +CUML_KERNEL void iotaKernel(DataT* out, DataT start, DataT step, IdxT len) { auto tid = (IdxT)blockDim.x * blockIdx.x + threadIdx.x; if (tid < len) { out[tid] = start + DataT(tid) * step; } diff --git a/cpp/src_prims/linalg/batched/gemv.cuh b/cpp/src_prims/linalg/batched/gemv.cuh index 69c74daa1e..20ca1d0cc5 100644 --- a/cpp/src_prims/linalg/batched/gemv.cuh +++ b/cpp/src_prims/linalg/batched/gemv.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once +#include + #include #include @@ -58,15 +60,15 @@ dotProduct(const DataT (&x)[VecLen], const DataT (&y)[VecLen], char* smem, bool } template -__global__ void gemvKernel(DataT* y, - const DataT* A, - const DataT* x, - const DataT* z, - DataT alpha, - DataT beta, - IdxT m, - IdxT n, - EpilogueOp op) +CUML_KERNEL void gemvKernel(DataT* y, + const DataT* A, + const DataT* x, + const DataT* z, + DataT alpha, + DataT beta, + IdxT m, + IdxT n, + EpilogueOp op) { typedef raft::TxN_t VecTypeAx; typedef raft::TxN_t VecTypeY; diff --git a/cpp/src_prims/linalg/batched/make_symm.cuh b/cpp/src_prims/linalg/batched/make_symm.cuh index 9b6405ed32..ce50ff90da 100644 --- a/cpp/src_prims/linalg/batched/make_symm.cuh +++ b/cpp/src_prims/linalg/batched/make_symm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once +#include + #include namespace MLCommon { @@ -28,7 +30,7 @@ static constexpr int BlockRows = 8; // Ref: https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ ///@todo: special-case for blockIdx.x == blockIdx.y to reduce gmem traffic template -__global__ void symmKernel(DataT* out, const DataT* in, IdxT batchSize, IdxT n, EpilogueOp op) +CUML_KERNEL void symmKernel(DataT* out, const DataT* in, IdxT batchSize, IdxT n, EpilogueOp op) { __shared__ DataT smem[TileDim][TileDim + 1]; // +1 to avoid bank conflicts IdxT batchOffset = blockIdx.z * n * n; diff --git a/cpp/src_prims/linalg/batched/matrix.cuh b/cpp/src_prims/linalg/batched/matrix.cuh index 00ea158012..b6dd6a9dcb 100644 --- a/cpp/src_prims/linalg/batched/matrix.cuh +++ b/cpp/src_prims/linalg/batched/matrix.cuh @@ -56,7 +56,7 @@ namespace Batched { * @param[in] m Number of rows/columns of matrix */ template -__global__ void identity_matrix_kernel(T* I, int m) +CUML_KERNEL void identity_matrix_kernel(T* I, int m) { T* I_b = I + blockIdx.x * m * m; int stride = (m + 1); @@ -79,7 +79,7 @@ __global__ void identity_matrix_kernel(T* I, int m) * @param[in] period Period of the difference */ template -__global__ void batched_diff_kernel(const T* in, T* out, int n_elem, int period = 1) +CUML_KERNEL void batched_diff_kernel(const T* in, T* out, int n_elem, int period = 1) { const T* batch_in = in + n_elem * blockIdx.x; T* batch_out = out + (n_elem - period) * blockIdx.x; @@ -103,7 +103,7 @@ __global__ void batched_diff_kernel(const T* in, T* out, int n_elem, int period * @param[in] period2 Period for the 2nd difference */ template -__global__ void batched_second_diff_kernel( +CUML_KERNEL void batched_second_diff_kernel( const T* in, T* out, int n_elem, int period1 = 1, int period2 = 1) { const T* batch_in = in + n_elem * blockIdx.x; @@ -126,7 +126,7 @@ __global__ void batched_second_diff_kernel( * @param[in] n Number of columns of each matrix */ template -__global__ void fill_strided_pointers_kernel(T* A_dense, T** A_array, int batch_size, int m, int n) +CUML_KERNEL void fill_strided_pointers_kernel(T* A_dense, T** A_array, int batch_size, int m, int n) { int bid = blockIdx.x * blockDim.x + threadIdx.x; if (bid < batch_size) { A_array[bid] = A_dense + bid * m * n; } @@ -513,7 +513,7 @@ class Matrix { * @param[in] alpha Multiplying coefficient */ template -__global__ void kronecker_product_kernel( +CUML_KERNEL void kronecker_product_kernel( const T* A, int m, int n, const T* B, int p, int q, T* AkB, int k_m, int k_n, T alpha) { const T* A_b = A + blockIdx.x * m * n; @@ -886,15 +886,15 @@ Matrix b_kron(const Matrix& A, const Matrix& B) * @param[in] s Seasonality of the lags */ template -__global__ void lagged_mat_kernel(const T* vec, - T* mat, - int lags, - int lagged_height, - int vec_offset, - int ld, - int mat_offset, - int ls_batch_stride, - int s = 1) +CUML_KERNEL void lagged_mat_kernel(const T* vec, + T* mat, + int lags, + int lagged_height, + int vec_offset, + int ld, + int mat_offset, + int ls_batch_stride, + int s = 1) { const T* batch_in = vec + blockIdx.x * ld + vec_offset; T* batch_out = mat + blockIdx.x * ls_batch_stride + mat_offset; @@ -1004,18 +1004,18 @@ Matrix b_lagged_mat(const Matrix& vec, int lags) * @param[in] out_cols Number of columns in the output matrix */ template -static __global__ void 
batched_2dcopy_kernel(const T* in, - T* out, - int in_starting_row, - int in_starting_col, - int in_rows, - int in_cols, - MLCommon::FastIntDiv copy_rows, - int n_copy, - int out_starting_row, - int out_starting_col, - int out_rows, - int out_cols) +CUML_KERNEL void batched_2dcopy_kernel(const T* in, + T* out, + int in_starting_row, + int in_starting_col, + int in_rows, + int in_cols, + MLCommon::FastIntDiv copy_rows, + int n_copy, + int out_starting_row, + int out_starting_col, + int out_rows, + int out_cols) { const T* in_ = in + blockIdx.x * in_rows * in_cols + in_starting_col * in_rows + in_starting_row; T* out_ = out + blockIdx.x * out_rows * out_cols + out_starting_col * out_rows + out_starting_row; @@ -1183,7 +1183,7 @@ DI void generate_householder_vector(T* d_uk, const T* d_xk, T* shared_mem, int m * @param[in] n Matrix dimensions */ template -__global__ void hessenberg_reduction_kernel(T* d_U, T* d_H, T* d_hh, int n) +CUML_KERNEL void hessenberg_reduction_kernel(T* d_U, T* d_H, T* d_hh, int n) { int ib = blockIdx.x; @@ -1388,7 +1388,7 @@ DI bool ahues_tisseur(const T* d_M, int i, int n) * @param[in] n Matrix dimension */ template -__global__ void francis_qr_algorithm_kernel(T* d_U, T* d_H, int n) +CUML_KERNEL void francis_qr_algorithm_kernel(T* d_U, T* d_H, int n) { int ib = blockIdx.x; @@ -1688,7 +1688,7 @@ DI void quasi_triangular_solver(T* d_scratch, T* d_x, int n, T* shared_mem) * @param[in] n Matrix dimension */ template -__global__ void trsyl_kernel( +CUML_KERNEL void trsyl_kernel( const T* d_R, const T* d_R2, const T* d_S, const T* d_F, T* d_Y, T* d_scratch, int n) { int ib = blockIdx.x; diff --git a/cpp/src_prims/linalg/eltwise2d.cuh b/cpp/src_prims/linalg/eltwise2d.cuh index 73fb56ac8c..508e117167 100644 --- a/cpp/src_prims/linalg/eltwise2d.cuh +++ b/cpp/src_prims/linalg/eltwise2d.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,19 +16,21 @@ #pragma once +#include + namespace MLCommon { namespace LinAlg { template -__global__ void eltwise2DKernel(int rows, // m - int cols, // n - const Type* dotA, - const Type* dotB, - const Type* pC, - Type* pD, - Type alpha, - Type beta, - Lambda op) +CUML_KERNEL void eltwise2DKernel(int rows, // m + int cols, // n + const Type* dotA, + const Type* dotB, + const Type* pC, + Type* pD, + Type alpha, + Type beta, + Lambda op) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < cols * rows) { diff --git a/cpp/src_prims/matrix/reverse.cuh b/cpp/src_prims/matrix/reverse.cuh index 2064821a2c..2f794ada3b 100644 --- a/cpp/src_prims/matrix/reverse.cuh +++ b/cpp/src_prims/matrix/reverse.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,14 +23,14 @@ namespace MLCommon { namespace Matrix { template -__global__ void reverseKernel(math_t* out, - const math_t* in, - int nrows, - int ncols, - bool rowMajor, - bool alongRows, - int len, - Lambda op) +CUML_KERNEL void reverseKernel(math_t* out, + const math_t* in, + int nrows, + int ncols, + bool rowMajor, + bool alongRows, + int len, + Lambda op) { typedef raft::TxN_t VecType; int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; diff --git a/cpp/src_prims/random/make_arima.cuh b/cpp/src_prims/random/make_arima.cuh index 54f5527c4b..83133fab85 100644 --- a/cpp/src_prims/random/make_arima.cuh +++ b/cpp/src_prims/random/make_arima.cuh @@ -51,20 +51,20 @@ namespace Random { * @param[in] k Parameter k */ template -__global__ void make_arima_kernel(DataT* d_diff, - const DataT* d_res, - const DataT* d_mu, - const DataT* d_ar, - const DataT* d_ma, - const DataT* d_sar, - const DataT* d_sma, - int n_obs_diff, - int p, - int q, - int P, - int Q, - int s, - int k) +CUML_KERNEL void make_arima_kernel(DataT* d_diff, + const DataT* d_res, + const DataT* d_mu, + const DataT* d_ar, + const DataT* d_ma, + const DataT* d_sar, + const DataT* d_sma, + int n_obs_diff, + int p, + int q, + int P, + int Q, + int s, + int k) { int n_phi = p + s * P; int n_theta = q + s * Q; diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh index 93a2d88f50..b24f0d03e1 100644 --- a/cpp/src_prims/selection/knn.cuh +++ b/cpp/src_prims/selection/knn.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -47,12 +48,12 @@ inline __device__ T get_lbls(const T* labels, const int64_t* knn_indices, int64_ } template -__global__ void class_probs_kernel(OutType* out, - const int64_t* knn_indices, - const int* labels, - int n_uniq_labels, - std::size_t n_samples, - int n_neighbors) +CUML_KERNEL void class_probs_kernel(OutType* out, + const int64_t* knn_indices, + const int* labels, + int n_uniq_labels, + std::size_t n_samples, + int n_neighbors) { int row = (blockIdx.x * blockDim.x) + threadIdx.x; int i = row * n_neighbors; @@ -69,14 +70,14 @@ __global__ void class_probs_kernel(OutType* out, } template -__global__ void class_vote_kernel(OutType* out, - const float* class_proba, - int* unique_labels, - int n_uniq_labels, - std::size_t n_samples, - int n_outputs, - int output_offset, - bool use_shared_mem) +CUML_KERNEL void class_vote_kernel(OutType* out, + const float* class_proba, + int* unique_labels, + int n_uniq_labels, + std::size_t n_samples, + int n_outputs, + int output_offset, + bool use_shared_mem) { int row = (blockIdx.x * blockDim.x) + threadIdx.x; int i = row * n_uniq_labels; @@ -107,13 +108,13 @@ __global__ void class_vote_kernel(OutType* out, } template -__global__ void regress_avg_kernel(LabelType* out, - const int64_t* knn_indices, - const LabelType* labels, - std::size_t n_samples, - int n_neighbors, - int n_outputs, - int output_offset) +CUML_KERNEL void regress_avg_kernel(LabelType* out, + const int64_t* knn_indices, + const LabelType* labels, + std::size_t n_samples, + int n_neighbors, + int n_outputs, + int output_offset) { int row = (blockIdx.x * blockDim.x) + threadIdx.x; int i = row * n_neighbors; @@ -329,4 +330,4 @@ void knn_regress(const raft::handle_t& handle, } }; // namespace Selection -}; // namespace MLCommon \ No newline at end of file +}; // namespace MLCommon diff --git a/cpp/src_prims/selection/kselection.cuh b/cpp/src_prims/selection/kselection.cuh index e76fc9f0ba..8e154974b8 100644 --- a/cpp/src_prims/selection/kselection.cuh +++ 
b/cpp/src_prims/selection/kselection.cuh @@ -16,6 +16,8 @@ #pragma once +#include + #include #include @@ -302,7 +304,7 @@ struct KVArray { ///@todo: specialize this for k=1 template -__global__ void warpTopKkernel( +CUML_KERNEL void warpTopKkernel( TypeV* outV, TypeK* outK, const TypeV* arr, int k, int rows, int cols, TypeV iV, TypeK iK) { // static_assert(Sort==false, "warpTopK: Sort=true is not yet supported!"); diff --git a/cpp/src_prims/sparse/batched/csr.cuh b/cpp/src_prims/sparse/batched/csr.cuh index 8f75da9fe2..6ffc421077 100644 --- a/cpp/src_prims/sparse/batched/csr.cuh +++ b/cpp/src_prims/sparse/batched/csr.cuh @@ -66,14 +66,14 @@ namespace Batched { * @param[in] nnz Number of non-zero elements in each matrix */ template -static __global__ void dense_to_csr_kernel(const T* dense, - const int* col_index, - const int* row_index, - T* values, - int batch_size, - int m, - int n, - int nnz) +CUML_KERNEL void dense_to_csr_kernel(const T* dense, + const int* col_index, + const int* row_index, + T* values, + int batch_size, + int m, + int n, + int nnz) { int bid = blockIdx.x * blockDim.x + threadIdx.x; @@ -104,14 +104,14 @@ static __global__ void dense_to_csr_kernel(const T* dense, * @param[in] nnz Number of non-zero elements in each matrix */ template -static __global__ void csr_to_dense_kernel(T* dense, - const int* col_index, - const int* row_index, - const T* values, - int batch_size, - int m, - int n, - int nnz) +CUML_KERNEL void csr_to_dense_kernel(T* dense, + const int* col_index, + const int* row_index, + const T* values, + int batch_size, + int m, + int n, + int nnz) { int bid = blockIdx.x * blockDim.x + threadIdx.x; @@ -439,16 +439,16 @@ class CSR { * @param[in] batch_size Number of individual matrices in the batch */ template -__global__ void batched_spmv_kernel(T alpha, - const int* A_col_index, - const int* A_row_index, - const T* A_values, - const T* x, - T beta, - T* y, - int m, - int n, - int batch_size) +CUML_KERNEL void batched_spmv_kernel(T alpha, + const int* A_col_index, + const int* A_row_index, + const T* A_values, + const T* x, + T beta, + T* y, + int m, + int n, + int batch_size) { int bid = blockIdx.x * blockDim.x + threadIdx.x; @@ -534,18 +534,18 @@ void b_spmv(T alpha, * @param[in] threads_per_bid Number of threads per batch index */ template -__global__ void batched_spmm_kernel(T alpha, - const int* A_col_index, - const int* A_row_index, - const T* A_values, - const T* B, - T beta, - T* C, - int m, - int k, - int n, - int batch_size, - int threads_per_bid) +CUML_KERNEL void batched_spmm_kernel(T alpha, + const int* A_col_index, + const int* A_row_index, + const T* A_values, + const T* B, + T beta, + T* C, + int m, + int k, + int n, + int batch_size, + int threads_per_bid) { int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; int bid = thread_idx / threads_per_bid; @@ -588,17 +588,17 @@ __global__ void batched_spmm_kernel(T alpha, * @param[in] nnz Number of non-zero elements per matrix */ template -__global__ void batched_spmm_kernel_shared_mem(T alpha, - const int* A_col_index, - const int* A_row_index, - const T* A_values, - const T* B, - T beta, - T* C, - int m, - int k, - int n, - int nnz) +CUML_KERNEL void batched_spmm_kernel_shared_mem(T alpha, + const int* A_col_index, + const int* A_row_index, + const T* A_values, + const T* B, + T beta, + T* C, + int m, + int k, + int n, + int nnz) { int bid = blockIdx.x; int j = threadIdx.x; diff --git a/cpp/src_prims/timeSeries/arima_helpers.cuh b/cpp/src_prims/timeSeries/arima_helpers.cuh index 
c78c9b732e..18a83b04e9 100644 --- a/cpp/src_prims/timeSeries/arima_helpers.cuh +++ b/cpp/src_prims/timeSeries/arima_helpers.cuh @@ -84,7 +84,7 @@ DI DataT _select_read(const DataT* src0, int size0, const DataT* src1, int idx) * @param[in] stream CUDA stream */ template -__global__ void _future_diff_kernel( +CUML_KERNEL void _future_diff_kernel( const T* in_past, const T* in_fut, T* out, int n_past, int n_fut, int period = 1) { const T* b_in_past = in_past + n_past * blockIdx.x; @@ -109,13 +109,13 @@ __global__ void _future_diff_kernel( * @param[in] stream CUDA stream */ template -__global__ void _future_second_diff_kernel(const T* in_past, - const T* in_fut, - T* out, - int n_past, - int n_fut, - int period1 = 1, - int period2 = 1) +CUML_KERNEL void _future_second_diff_kernel(const T* in_past, + const T* in_fut, + T* out, + int n_past, + int n_fut, + int period1 = 1, + int period2 = 1) { const T* b_in_past = in_past + n_past * blockIdx.x; const T* b_in_fut = in_fut + n_fut * blockIdx.x; @@ -145,14 +145,14 @@ __global__ void _future_second_diff_kernel(const T* in_past, * @param[in] s1 2nd differencing period if relevant */ template -__global__ void _undiff_kernel(DataT* d_fc, - const DataT* d_in, - int num_steps, - int batch_size, - int in_ld, - int n_in, - int s0, - int s1 = 0) +CUML_KERNEL void _undiff_kernel(DataT* d_fc, + const DataT* d_in, + int num_steps, + int batch_size, + int in_ld, + int n_in, + int s0, + int s1 = 0) { int bid = blockIdx.x * blockDim.x + threadIdx.x; if (bid < batch_size) { diff --git a/cpp/src_prims/timeSeries/fillna.cuh b/cpp/src_prims/timeSeries/fillna.cuh index 76d20814b5..53b64d06cb 100644 --- a/cpp/src_prims/timeSeries/fillna.cuh +++ b/cpp/src_prims/timeSeries/fillna.cuh @@ -77,10 +77,10 @@ struct FillnaOp { }; template -__global__ void fillna_interpolate_kernel(T* data, - int n_elem, - FillnaTemp* d_indices_fwd, - FillnaTemp* d_indices_bwd) +CUML_KERNEL void fillna_interpolate_kernel(T* data, + int n_elem, + FillnaTemp* d_indices_fwd, + FillnaTemp* d_indices_bwd) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n_elem; index += gridDim.x * blockDim.x) { @@ -167,4 +167,4 @@ void fillna(T* data, int batch_size, int n_obs, cudaStream_t stream) } } // namespace TimeSeries -} // namespace MLCommon \ No newline at end of file +} // namespace MLCommon diff --git a/cpp/src_prims/timeSeries/jones_transform.cuh b/cpp/src_prims/timeSeries/jones_transform.cuh index 63c931137d..ff4f749cab 100644 --- a/cpp/src_prims/timeSeries/jones_transform.cuh +++ b/cpp/src_prims/timeSeries/jones_transform.cuh @@ -21,6 +21,8 @@ #pragma once +#include + #include #include #include @@ -158,7 +160,7 @@ inline __device__ void invtransform(DataT* tmp, DataT* myNewParams, bool isAr) * @param clamp: whether to clamp transformed params between -1 and 1 */ template -__global__ void jones_transform_kernel( +CUML_KERNEL void jones_transform_kernel( DataT* newParams, const DataT* params, IdxT batchSize, bool isAr, bool isInv, bool clamp) { // calculating the index of the model that the coefficients belong to diff --git a/cpp/src_prims/timeSeries/stationarity.cuh b/cpp/src_prims/timeSeries/stationarity.cuh index 5c0f284386..b85b874d94 100644 --- a/cpp/src_prims/timeSeries/stationarity.cuh +++ b/cpp/src_prims/timeSeries/stationarity.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,13 +89,13 @@ static inline dim3 choose_block_dims(IdxT batch_size) * @param[in] coeff_b Part of the calculation for w(k)=a*k+b */ template -static __global__ void s2B_accumulation_kernel(DataT* accumulator, - const DataT* data, - IdxT lags, - IdxT batch_size, - IdxT n_obs, - DataT coeff_a, - DataT coeff_b) +CUML_KERNEL void s2B_accumulation_kernel(DataT* accumulator, + const DataT* data, + IdxT lags, + IdxT batch_size, + IdxT n_obs, + DataT coeff_a, + DataT coeff_b) { IdxT sample_idx = blockIdx.x * blockDim.x + threadIdx.x; IdxT batch_idx = blockIdx.y * blockDim.y + threadIdx.y; @@ -129,13 +129,13 @@ static __global__ void s2B_accumulation_kernel(DataT* accumulator, * considered stationary */ template -static __global__ void kpss_stationarity_check_kernel(bool* results, - const DataT* s2A, - const DataT* s2B, - const DataT* eta, - IdxT batch_size, - DataT n_obs_f, - DataT pval_threshold) +CUML_KERNEL void kpss_stationarity_check_kernel(bool* results, + const DataT* s2A, + const DataT* s2B, + const DataT* eta, + IdxT batch_size, + DataT n_obs_f, + DataT pval_threshold) { // Table 1, Kwiatkowski 1992 const DataT crit_vals[4] = {0.347, 0.463, 0.574, 0.739}; diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0033c844ae..2a04100cdf 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -85,6 +85,10 @@ function(ConfigureTest) set_target_properties( ${_CUML_TEST_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) set(_CUML_TEST_COMPONENT_NAME testing) diff --git a/cpp/test/prims/batched/gemv.cu b/cpp/test/prims/batched/gemv.cu index 68761b8800..b2eb118049 100644 --- a/cpp/test/prims/batched/gemv.cu +++ b/cpp/test/prims/batched/gemv.cu @@ -41,7 +41,7 @@ template } template -__global__ void naiveBatchGemvKernel(Type* y, const Type* A, const Type* x, int m, int n) +CUML_KERNEL void naiveBatchGemvKernel(Type* y, const Type* A, const Type* x, int m, int n) { int batch = blockIdx.y; int row = blockIdx.x; diff --git a/cpp/test/prims/batched/make_symm.cu b/cpp/test/prims/batched/make_symm.cu index 5de82b025d..624dcf7fc6 100644 --- a/cpp/test/prims/batched/make_symm.cu +++ b/cpp/test/prims/batched/make_symm.cu @@ -43,7 +43,7 @@ template } template -__global__ void naiveBatchMakeSymmKernel(Type* y, const Type* x, int n) +CUML_KERNEL void naiveBatchMakeSymmKernel(Type* y, const Type* x, int n) { int batch = blockIdx.z; int row = threadIdx.y + blockDim.y * blockIdx.y; diff --git a/cpp/test/prims/decoupled_lookback.cu b/cpp/test/prims/decoupled_lookback.cu index 6954d26c5f..08d7aa8b1f 100644 --- a/cpp/test/prims/decoupled_lookback.cu +++ b/cpp/test/prims/decoupled_lookback.cu @@ -16,6 +16,8 @@ #include "test_utils.h" +#include + #include #include @@ -27,7 +29,7 @@ namespace MLCommon { template -__global__ void dlbTestKernel(void* workspace, int len, int* out) +CUML_KERNEL void dlbTestKernel(void* workspace, int len, int* out) { DecoupledLookBack dlb(workspace); int count = threadIdx.x == blockDim.x - 1 ? 
1 : 0; diff --git a/cpp/test/prims/device_utils.cu b/cpp/test/prims/device_utils.cu index 6e60c9aa00..c4ea25de37 100644 --- a/cpp/test/prims/device_utils.cu +++ b/cpp/test/prims/device_utils.cu @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -43,7 +45,7 @@ namespace MLCommon { */ template -__global__ void batchedBlockReduceTestKernel(int* out) +CUML_KERNEL void batchedBlockReduceTestKernel(int* out) { extern __shared__ char smem[]; int val = threadIdx.x; diff --git a/cpp/test/prims/dist_adj.cu b/cpp/test/prims/dist_adj.cu index 8dccedeb84..5730b276ea 100644 --- a/cpp/test/prims/dist_adj.cu +++ b/cpp/test/prims/dist_adj.cu @@ -27,14 +27,14 @@ namespace MLCommon { namespace Distance { template -__global__ void naiveDistanceAdjKernel(bool* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - DataType eps, - bool isRowMajor) +CUML_KERNEL void naiveDistanceAdjKernel(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; diff --git a/cpp/test/prims/distance_base.cuh b/cpp/test/prims/distance_base.cuh index 0924804b31..4a472779f7 100644 --- a/cpp/test/prims/distance_base.cuh +++ b/cpp/test/prims/distance_base.cuh @@ -29,14 +29,14 @@ namespace MLCommon { namespace Distance { template -__global__ void naiveDistanceKernel(DataType* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - raft::distance::DistanceType type, - bool isRowMajor) +CUML_KERNEL void naiveDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; @@ -56,7 +56,7 @@ __global__ void naiveDistanceKernel(DataType* dist, } template -__global__ void naiveL1DistanceKernel( +CUML_KERNEL void naiveL1DistanceKernel( DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; @@ -78,7 +78,7 @@ __global__ void naiveL1DistanceKernel( } template -__global__ void naiveCosineDistanceKernel( +CUML_KERNEL void naiveCosineDistanceKernel( DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/test/prims/eltwise2d.cu b/cpp/test/prims/eltwise2d.cu index 549e72c729..471939c830 100644 --- a/cpp/test/prims/eltwise2d.cu +++ b/cpp/test/prims/eltwise2d.cu @@ -26,14 +26,14 @@ namespace MLCommon { namespace LinAlg { template -__global__ void naiveEltwise2DAddKernel(int rows, - int cols, - const Type* aPtr, - const Type* bPtr, - const Type* cPtr, - Type* dPtr, - Type alpha, - Type beta) +CUML_KERNEL void naiveEltwise2DAddKernel(int rows, + int cols, + const Type* aPtr, + const Type* bPtr, + const Type* cPtr, + Type* dPtr, + Type alpha, + Type beta) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < cols * rows) { diff --git a/cpp/test/prims/fast_int_div.cu b/cpp/test/prims/fast_int_div.cu index 0e2b9c16cd..3d46d86a17 100644 --- a/cpp/test/prims/fast_int_div.cu +++ b/cpp/test/prims/fast_int_div.cu @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -57,7 +59,7 @@ TEST(FastIntDiv, CpuTest) } } -__global__ void fastIntDivTestKernel( +CUML_KERNEL void fastIntDivTestKernel( int* computed, int* correct, const int* in, FastIntDiv fid, int divisor, 
int len) { auto tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/test/prims/grid_sync.cu b/cpp/test/prims/grid_sync.cu index 0f5e6c51c7..8bc165312e 100644 --- a/cpp/test/prims/grid_sync.cu +++ b/cpp/test/prims/grid_sync.cu @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -26,7 +28,7 @@ namespace MLCommon { -__global__ void gridSyncTestKernel(void* workspace, int* out, SyncType type) +CUML_KERNEL void gridSyncTestKernel(void* workspace, int* out, SyncType type) { GridSync gs(workspace, type, true); bool master; diff --git a/cpp/test/prims/kselection.cu b/cpp/test/prims/kselection.cu index afa69cea5f..a3bd79df36 100644 --- a/cpp/test/prims/kselection.cu +++ b/cpp/test/prims/kselection.cu @@ -28,7 +28,7 @@ namespace MLCommon { namespace Selection { template -__global__ void sortTestKernel(TypeK* key) +CUML_KERNEL void sortTestKernel(TypeK* key) { KVArray arr; #pragma unroll diff --git a/cpp/test/prims/linalg_block.cu b/cpp/test/prims/linalg_block.cu index 369c16e676..c5debc711f 100644 --- a/cpp/test/prims/linalg_block.cu +++ b/cpp/test/prims/linalg_block.cu @@ -17,6 +17,7 @@ #include "test_utils.h" #include +#include #include #include @@ -53,7 +54,7 @@ template } template -__global__ void block_gemm_test_kernel( +CUML_KERNEL void block_gemm_test_kernel( bool transa, bool transb, int m, int n, int k, T alpha, const T* a, const T* b, T* c) { __shared__ MLCommon::LinAlg::GemmStorage gemm_storage; @@ -268,7 +269,7 @@ template } template -__global__ void block_gemv_test_kernel( +CUML_KERNEL void block_gemv_test_kernel( int m, int n, T alpha, const T* a, const T* x, T* y, bool preload) { __shared__ MLCommon::LinAlg::GemvStorage gemv_storage; @@ -420,7 +421,7 @@ template } template -__global__ void block_dot_test_kernel(int n, const T* x, const T* y, T* d_dot) +CUML_KERNEL void block_dot_test_kernel(int n, const T* x, const T* y, T* d_dot) { __shared__ ReductionStorage reduction_storage; @@ -535,7 +536,7 @@ template } template -__global__ void block_xAxt_test_kernel(int n, const T* x, const T* A, T* d_res, bool preload) +CUML_KERNEL void block_xAxt_test_kernel(int n, const T* x, const T* A, T* d_res, bool preload) { extern __shared__ char dyna_shared_mem[]; T* shared_vec = (T*)dyna_shared_mem; @@ -669,7 +670,7 @@ template } template -__global__ void block_ax_test_kernel(int n, T alpha, const T* x, T* y) +CUML_KERNEL void block_ax_test_kernel(int n, T alpha, const T* x, T* y) { _block_ax(n, alpha, x + n * blockIdx.x, y + n * blockIdx.x); } @@ -765,7 +766,7 @@ template } template -__global__ void block_cov_stability_test_kernel(int n, const T* in, T* out) +CUML_KERNEL void block_cov_stability_test_kernel(int n, const T* in, T* out) { __shared__ CovStabilityStorage cov_stability_storage; _block_covariance_stability( diff --git a/cpp/test/sg/experimental/fil/raft_proto/buffer.cu b/cpp/test/sg/experimental/fil/raft_proto/buffer.cu index 90f8f3cc98..d99b37aaa9 100644 --- a/cpp/test/sg/experimental/fil/raft_proto/buffer.cu +++ b/cpp/test/sg/experimental/fil/raft_proto/buffer.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -28,7 +29,7 @@ namespace raft_proto { -__global__ void check_buffer_access(int* buf) +CUML_KERNEL void check_buffer_access(int* buf) { if (buf[0] == 1) { buf[0] = 4; } if (buf[1] == 2) { buf[1] = 5; } diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 8f926728db..78bfc1dff6 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -16,6 +16,7 @@ #include "../../src/fil/internal.cuh" +#include #include #include @@ -135,7 +136,7 @@ std::ostream& operator<<(std::ostream& os, const FilTestParams& ps) } template -__global__ void nan_kernel(real_t* data, const bool* mask, int len, real_t nan) +CUML_KERNEL void nan_kernel(real_t* data, const bool* mask, int len, real_t nan) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= len) return; @@ -188,7 +189,7 @@ struct replace_some_floating_with_categorical { }; template -__global__ void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t size) +CUML_KERNEL void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t size) { std::size_t idx = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x; if (idx >= size) return; diff --git a/cpp/test/sg/knn_test.cu b/cpp/test/sg/knn_test.cu index 20d4947f0b..c11ffed740 100644 --- a/cpp/test/sg/knn_test.cu +++ b/cpp/test/sg/knn_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -97,14 +98,14 @@ void create_index_parts(raft::handle_t& handle, } } -__global__ void to_float(float* out, int* in, int size) +CUML_KERNEL void to_float(float* out, int* in, int size) { int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= size) return; out[element] = float(in[element]); } -__global__ void build_actual_output( +CUML_KERNEL void build_actual_output( int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices) { int element = threadIdx.x + blockDim.x * blockIdx.x; @@ -114,7 +115,7 @@ __global__ void build_actual_output( output[element] = idx_labels[ind]; } -__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels) +CUML_KERNEL void build_expected_output(int* output, int n_rows, int k, const int* labels) { int row = threadIdx.x + blockDim.x * blockIdx.x; if (row >= n_rows) return; diff --git a/cpp/test/sg/multi_sum_test.cu b/cpp/test/sg/multi_sum_test.cu index f33197c3c5..4f1f6b1762 100644 --- a/cpp/test/sg/multi_sum_test.cu +++ b/cpp/test/sg/multi_sum_test.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -74,7 +75,7 @@ __device__ void test_single_radix(multi_sum_test_shmem& s, } template -__global__ void test_multi_sum_k(T* data, MultiSumTestParams* params, int* error_flags) +CUML_KERNEL void test_multi_sum_k(T* data, MultiSumTestParams* params, int* error_flags) { __shared__ multi_sum_test_shmem s; MultiSumTestParams p = params[blockIdx.x]; diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index e60d59d52d..117ec37960 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -51,6 +51,26 @@ namespace ML { +namespace DT { + +template +using ReturnValue = std::tuple, + std::shared_ptr>, + std::shared_ptr>>; + +template +ReturnValue computeQuantiles( + const raft::handle_t& handle, const T* data, int max_n_bins, int n_rows, int n_cols); + +template <> +ReturnValue computeQuantiles( + const raft::handle_t& handle, const float* data, int max_n_bins, int n_rows, int n_cols); + +template <> +ReturnValue computeQuantiles( + const raft::handle_t& handle, const double* data, int max_n_bins, int n_rows, int n_cols); +} // namespace DT + // Utils for changing tuple into struct namespace detail { template diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu index ad6266ca57..0caad107d5 100644 --- a/cpp/test/sg/svc_test.cu +++ b/cpp/test/sg/svc_test.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include @@ -500,9 +502,12 @@ class GetResultsTest : public ::testing::Test { protected: void FreeDenseSupport() { - rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource(); - auto stream = this->handle.get_stream(); - rmm_alloc->deallocate(support_matrix.data, n_coefs * n_cols * sizeof(math_t), stream); + rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource(); + auto stream = this->handle.get_stream(); + rmm_alloc.deallocate_async(support_matrix.data, + n_coefs * n_cols * sizeof(math_t), + rmm::CUDA_ALLOCATION_ALIGNMENT, + stream); support_matrix.data = nullptr; } @@ -1284,7 +1289,7 @@ std::ostream& operator<<(std::ostream& os, const blobInput& b) // until there is progress with Issue #935 template -__global__ void cast(outType* out, int n, inType* in) +CUML_KERNEL void cast(outType* out, int n, inType* in) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) out[tid] = in[tid]; diff --git a/cpp/test/sg/umap_parametrizable_test.cu b/cpp/test/sg/umap_parametrizable_test.cu index 3cec73a1f6..821437fb0e 100644 --- a/cpp/test/sg/umap_parametrizable_test.cu +++ b/cpp/test/sg/umap_parametrizable_test.cu @@ -44,7 +44,7 @@ using namespace MLCommon; using namespace MLCommon::Datasets::Digits; template -__global__ void has_nan_kernel(T* data, size_t len, bool* answer) +CUML_KERNEL void has_nan_kernel(T* data, size_t len, bool* answer) { static_assert(std::is_floating_point()); std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -65,7 +65,7 @@ bool has_nan(T* data, size_t len, cudaStream_t stream) } template -__global__ void are_equal_kernel(T* embedding1, T* embedding2, size_t len, double* diff) +CUML_KERNEL void are_equal_kernel(T* embedding1, T* embedding2, size_t len, double* diff) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= len) return; diff --git a/dependencies.yaml b/dependencies.yaml index 6560560bb2..95514dc299 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -116,12 +116,10 @@ dependencies: packages: - c-compiler - cxx-compiler - - gmock>=1.13.0 - - gtest>=1.13.0 - 
- libcumlprims==24.4.* - - libraft==24.4.* - - libraft-headers==24.4.* - - librmm==24.4.* + - libcumlprims==24.6.* + - libraft==24.6.* + - libraft-headers==24.6.* + - librmm==24.6.* specific: - output_types: conda matrices: @@ -159,8 +157,8 @@ dependencies: - &treelite treelite==4.1.2 - output_types: conda packages: - - &pylibraft_conda pylibraft==24.4.* - - &rmm_conda rmm==24.4.* + - &pylibraft_conda pylibraft==24.6.* + - &rmm_conda rmm==24.6.* - scikit-build-core>=0.7.0 - output_types: requirements packages: @@ -185,33 +183,34 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibraft-cu12==24.4.* - - rmm-cu12==24.4.* + - pylibraft-cu12==24.6.* + - rmm-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - &pylibraft_cu11 pylibraft-cu11==24.4.* - - &rmm_cu11 rmm-cu11==24.4.* + - &pylibraft_cu11 pylibraft-cu11==24.6.* + - &rmm_cu11 rmm-cu11==24.6.* - {matrix: null, packages: [*pylibraft_conda, *rmm_conda] } py_run: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.4.* + - dask-cuda==24.6.* - joblib>=0.11 - numba>=0.57 # TODO: Is scipy really a hard dependency, or should # we make it optional (i.e. an extra for pip # installation/run_constrained for conda)? - scipy>=1.8.0 - - rapids-dask-dependency==24.4.* + - packaging + - rapids-dask-dependency==24.6.* - *treelite - output_types: conda packages: - - &cudf_conda cudf==24.4.* + - &cudf_conda cudf==24.6.* - &cupy_conda cupy>=12.0.0 - - &dask_cudf_conda dask-cudf==24.4.* - - &raft_dask_conda raft-dask==24.4.* + - &dask_cudf_conda dask-cudf==24.6.* + - &raft_dask_conda raft-dask==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -223,19 +222,19 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.4.* + - cudf-cu12==24.6.* - cupy-cuda12x>=12.0.0 - - dask-cudf-cu12==24.4.* - - pylibraft-cu12==24.4.* - - raft-dask-cu12==24.4.* - - rmm-cu12==24.4.* + - dask-cudf-cu12==24.6.* + - pylibraft-cu12==24.6.* + - raft-dask-cu12==24.6.* + - rmm-cu12==24.6.* - matrix: {cuda: "11.*"} packages: &py_run_packages_cu11 - - cudf-cu11==24.4.* + - cudf-cu11==24.6.* - &cupy_pyproject_cu11 cupy-cuda11x>=12.0.0 - - dask-cudf-cu11==24.4.* + - dask-cudf-cu11==24.6.* - *pylibraft_cu11 - - raft-dask-cu11==24.4.* + - raft-dask-cu11==24.6.* - *rmm_cu11 - matrix: null packages: @@ -357,7 +356,7 @@ dependencies: # https://github.com/pydata/pydata-sphinx-theme/issues/1539 - pydata-sphinx-theme!=0.14.2 - recommonmark - - &scikit_learn scikit-learn==1.2 + - &scikit_learn scikit-learn==1.5 - sphinx<6 - sphinx-copybutton - sphinx-markdown-tables @@ -387,14 +386,14 @@ dependencies: common: - output_types: conda packages: - - libcuml==24.4.* - - libcuml-tests==24.4.* + - libcuml==24.6.* + - libcuml-tests==24.6.* test_cuml: common: - output_types: conda packages: - - libcuml==24.4.* - - cuml==24.4.* + - libcuml==24.6.* + - cuml==24.6.* test_cpp: common: - output_types: conda diff --git a/notebooks/forest_inference_demo.ipynb b/notebooks/forest_inference_demo.ipynb index 8b82f9769c..3e57c2165e 100644 --- a/notebooks/forest_inference_demo.ipynb +++ b/notebooks/forest_inference_demo.ipynb @@ -456,9 +456,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = dask_cudf.from_dask_dataframe(\n", - " dask.dataframe.from_array(x)\n", - ")" + "df = dask.dataframe.from_array(x).to_backend(\"cudf\")" ] }, { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 52872fd0d5..036d91eef1 100644 --- a/python/CMakeLists.txt +++ 
b/python/CMakeLists.txt @@ -41,7 +41,6 @@ option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF) set(CUML_RAFT_CLONE_ON_PIN OFF) - # todo: use CMAKE_MESSAGE_CONTEXT for prefix for logging. # https://github.com/rapidsai/cuml/issues/4843 message(VERBOSE "CUML_PY: Build only cuML CPU Python components.: ${CUML_CPU}") diff --git a/python/README.md b/python/README.md index fd28666819..4fe169f41e 100644 --- a/python/README.md +++ b/python/README.md @@ -38,7 +38,7 @@ example `setup.py --singlegpu`) are: RAFT's Python and Cython is located in the [RAFT repository](https://github.com/rapidsai/raft/python). It was designed to be included in projects as opposed to be distributed by itself, so at build time, **setup.py creates a symlink from cuML, located in `/python/cuml/raft/` to the Python folder of RAFT**. -For developers that need to modify RAFT code, please refer to the [RAFT Developer Guide](https://github.com/rapidsai/raft/blob/branch-24.04/docs/source/build.md) for recommendations. +For developers that need to modify RAFT code, please refer to the [RAFT Developer Guide](https://github.com/rapidsai/raft/blob/branch-24.06/docs/source/build.md) for recommendations. To configure RAFT at build time: @@ -50,7 +50,7 @@ The RAFT Python code gets included in the cuML build and distributable artifacts ### Build Requirements -cuML's convenience [development yaml files](https://github.com/rapidsai/cuml/tree/branch-24.04/environments) includes all dependencies required to build cuML. +cuML's convenience [development yaml files](https://github.com/rapidsai/cuml/tree/branch-24.06/environments) includes all dependencies required to build cuML. To build cuML's Python package, the following dependencies are required: diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py index f1e9eac615..afe8f8742e 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py @@ -14,6 +14,19 @@ # This code is under BSD 3 clause license. # Authors mentioned above do not endorse or promote this production. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from ....internals.memory_utils import using_output_type from ....internals import _deprecate_pos_args @@ -48,6 +61,7 @@ from cuml.internals.safe_imports import cpu_only_import cpu_np = cpu_only_import('numpy') np = gpu_only_import('cupy') +resample = cpu_only_import_from('sklearn.utils._indexing', 'resample') sparse = gpu_only_import_from('cupyx.scipy', 'sparse') stats = cpu_only_import_from('scipy', 'stats') @@ -2284,17 +2298,14 @@ def _dense_fit(self, X, random_state): n_samples, n_features = X.shape references = np.asnumpy(self.references_ * 100) - self.quantiles_ = [] - for col in X.T: - if self.subsample < n_samples: - subsample_idx = random_state.choice(n_samples, - size=self.subsample, - replace=False) - col = col.take(subsample_idx) - self.quantiles_.append( - cpu_np.nanpercentile(np.asnumpy(col), references) + X = np.asnumpy(X) + if self.subsample is not None and self.subsample < n_samples: + # Take a subsample of `X` + X = resample( + X, replace=False, n_samples=self.subsample, random_state=random_state ) - self.quantiles_ = cpu_np.transpose(self.quantiles_) + + self.quantiles_ = cpu_np.nanpercentile(X, references, axis=0) # Due to floating-point precision error in `np.nanpercentile`, # make sure that quantiles are monotonically increasing. # Upstream issue in numpy: diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py index ed85c5262f..02762f6585 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py @@ -10,6 +10,20 @@ # This code is under BSD 3 clause license. # Authors mentioned above do not endorse or promote this production. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from ....internals import _deprecate_pos_args from ....internals.memory_utils import using_output_type @@ -240,7 +254,7 @@ def fit(self, X, y=None) -> "KBinsDiscretizer": if 'onehot' in self.encode: self._encoder = OneHotEncoder( categories=np.array([np.arange(i) for i in self.n_bins_]), - sparse=self.encode == 'onehot', output_type='cupy') + sparse_output=self.encode == 'onehot', output_type='cupy') # Fit the OneHotEncoder with toy datasets # so that it's ready for use after the KBinsDiscretizer is fitted self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int)) diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py b/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py index cc0550b2d9..c0bf8917b8 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_function_transformer.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. + # This code originates from the Scikit-Learn library, # it was since modified to allow GPU acceleration. # This code is under BSD 3 clause license. 
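The `_dense_fit` hunk above replaces the per-column Python loop with an optional row subsample followed by a single vectorized `nanpercentile` call. Below is a minimal sketch of that idea outside cuML; it uses the public `sklearn.utils.resample` helper rather than the private `sklearn.utils._indexing` path imported in the patch, and the function name and sizes are only illustrative.

    import numpy as np
    from sklearn.utils import check_random_state, resample

    def dense_quantiles(X, n_quantiles=10, subsample=1000, random_state=None):
        """Column-wise reference quantiles, optionally computed on a row subsample."""
        rng = check_random_state(random_state)
        references = np.linspace(0, 100, n_quantiles)
        if subsample is not None and subsample < X.shape[0]:
            # Subsample whole rows once instead of redrawing indices per column.
            X = resample(X, replace=False, n_samples=subsample, random_state=rng)
        # One vectorized call replaces the per-column nanpercentile loop.
        return np.nanpercentile(X, references, axis=0)

    quantiles = dense_quantiles(np.random.default_rng(0).normal(size=(5000, 3)))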
@@ -63,7 +65,7 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): -------- >>> import cupy as cp >>> from cuml.preprocessing import FunctionTransformer - >>> transformer = FunctionTransformer(cp.log1p) + >>> transformer = FunctionTransformer(func=cp.log1p) >>> X = cp.array([[0, 1], [2, 3]]) >>> transformer.transform(X) array([[0. , 0.6931...], diff --git a/python/cuml/cluster/agglomerative.pyx b/python/cuml/cluster/agglomerative.pyx index 84cc579201..34150d3f6b 100644 --- a/python/cuml/cluster/agglomerative.pyx +++ b/python/cuml/cluster/agglomerative.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ # distutils: language = c++ +import warnings + from libc.stdint cimport uintptr_t from cuml.internals.safe_imports import cpu_only_import @@ -103,6 +105,17 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin): Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", or "cosine". If connectivity is "knn" only "euclidean" is accepted. + + .. deprecated:: 24.06 + `affinity` was deprecated in version 24.06 and will be renamed to + `metric` in 25.08. + + metric : str, default=None + Metric used to compute the linkage. Can be "euclidean", "l1", + "l2", "manhattan", or "cosine". If set to `None` then "euclidean" + is used. If connectivity is "knn" only "euclidean" is accepted. + .. versionadded:: 24.06 + linkage : {"single"}, default="single" Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observations. The algorithm @@ -136,9 +149,9 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin): labels_ = CumlArrayDescriptor() children_ = CumlArrayDescriptor() - def __init__(self, *, n_clusters=2, affinity="euclidean", linkage="single", - handle=None, verbose=False, connectivity='knn', - n_neighbors=10, output_type=None): + def __init__(self, *, n_clusters=2, affinity="deprecated", metric=None, + linkage="single", handle=None, verbose=False, + connectivity='knn', n_neighbors=10, output_type=None): super().__init__(handle=handle, verbose=verbose, @@ -159,11 +172,12 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin): raise ValueError("'n_neighbors' must be a positive number " "between 2 and 1023") - if affinity not in _metrics_mapping: - raise ValueError("'affinity' %s is not supported." % affinity) + if metric is not None and metric not in _metrics_mapping: + raise ValueError("Metric '%s' is not supported." % affinity) self.n_clusters = n_clusters self.affinity = affinity + self.metric = metric self.linkage = linkage self.n_neighbors = n_neighbors self.connectivity = connectivity @@ -178,6 +192,26 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin): """ Fit the hierarchical clustering from features. """ + if self.affinity != "deprecated": + if self.metric is not None: + raise ValueError( + "Both `affinity` and `metric` attributes were set. Attribute" + " `affinity` was deprecated in version 24.06 and will be removed in" + " 25.08. To avoid this error, only set the `metric` attribute." + ) + warnings.warn( + ( + "Attribute `affinity` was deprecated in version 24.06 and will be" + " removed in 25.08. Use `metric` instead." 
+ ), + FutureWarning, + ) + metric_name = self.affinity + else: + if self.metric is None: + metric_name = "euclidean" + else: + metric_name = self.metric X_m, n_rows, n_cols, self.dtype = \ input_to_cuml_array(X, order='C', @@ -209,10 +243,10 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin): linkage_output.labels = labels_ptr cdef DistanceType metric - if self.affinity in _metrics_mapping: - metric = _metrics_mapping[self.affinity] + if metric_name in _metrics_mapping: + metric = _metrics_mapping[metric_name] else: - raise ValueError("'affinity' %s not supported." % self.affinity) + raise ValueError("Metric '%s' not supported." % metric_name) if self.connectivity == 'knn': single_linkage_neighbors( @@ -249,6 +283,7 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin): return super().get_param_names() + [ "n_clusters", "affinity", + "metric", "linkage", "connectivity", "n_neighbors" diff --git a/python/cuml/dask/__init__.py b/python/cuml/dask/__init__.py index f2dc448552..6aaf17a3b3 100644 --- a/python/cuml/dask/__init__.py +++ b/python/cuml/dask/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from dask import config from cuml.dask import cluster from cuml.dask import common @@ -27,6 +28,9 @@ from cuml.dask import preprocessing from cuml.dask import solvers +# Avoid "p2p" shuffling in dask for now +config.set({"dataframe.shuffle.method": "tasks"}) + __all__ = [ "cluster", "common", diff --git a/python/cuml/dask/common/base.py b/python/cuml/dask/common/base.py index a9949310be..1f2f71542c 100644 --- a/python/cuml/dask/common/base.py +++ b/python/cuml/dask/common/base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ from distributed.client import Future from functools import wraps -from dask_cudf.core import Series as dcSeries +from dask_cudf import Series as dcSeries from cuml.internals.safe_imports import gpu_only_import_from from cuml.internals.base import Base from cuml.internals import BaseMetaClass @@ -37,7 +37,7 @@ dask_cudf = gpu_only_import("dask_cudf") -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") +dcDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") class BaseEstimator(object, metaclass=BaseMetaClass): diff --git a/python/cuml/dask/common/input_utils.py b/python/cuml/dask/common/input_utils.py index 688de03219..5c4f7d0913 100644 --- a/python/cuml/dask/common/input_utils.py +++ b/python/cuml/dask/common/input_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
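The agglomerative.pyx changes above introduce a `metric` keyword and keep `affinity` only as a deprecated alias until 25.08. A short usage sketch of the new spelling, assuming a cuML build that includes this patch; the data and parameter values are arbitrary.

    import cupy as cp
    from cuml.cluster import AgglomerativeClustering

    X = cp.random.random((100, 8), dtype=cp.float32)

    # 24.06+: pass the linkage metric via `metric`. Passing `affinity` still
    # works but emits a FutureWarning until its removal in 25.08.
    model = AgglomerativeClustering(n_clusters=3, metric="euclidean", linkage="single")
    labels = model.fit_predict(X)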
@@ -24,7 +24,7 @@ from cuml.dask.common.dask_arr_utils import validate_dask_array from cuml.dask.common.dask_df_utils import to_dask_cudf from cuml.dask.common.utils import get_client -from dask_cudf.core import Series as dcSeries +from dask_cudf import Series as dcSeries from dask.dataframe import Series as daskSeries from dask.dataframe import DataFrame as daskDataFrame from cudf import Series @@ -43,7 +43,7 @@ DataFrame = gpu_only_import_from("cudf", "DataFrame") -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") +dcDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") class DistributedDataHandler: diff --git a/python/cuml/dask/common/part_utils.py b/python/cuml/dask/common/part_utils.py index a6aa892a76..c92f2c351d 100644 --- a/python/cuml/dask/common/part_utils.py +++ b/python/cuml/dask/common/part_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ # from cuml.dask.common.utils import parse_host_port -from dask_cudf.core import Series as dcSeries +from dask_cudf import Series as dcSeries from cuml.internals.safe_imports import gpu_only_import_from from dask.dataframe import Series as daskSeries from dask.dataframe import DataFrame as daskDataFrame @@ -30,7 +30,7 @@ np = cpu_only_import("numpy") -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") +dcDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") def hosts_to_parts(futures): diff --git a/python/cuml/dask/neighbors/kneighbors_classifier.py b/python/cuml/dask/neighbors/kneighbors_classifier.py index 2844823e06..18babd75c0 100644 --- a/python/cuml/dask/neighbors/kneighbors_classifier.py +++ b/python/cuml/dask/neighbors/kneighbors_classifier.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -111,9 +111,14 @@ def fit(self, X, y): if isinstance(y, DaskSeries): uniq_labels.append(y.unique()) else: - n_targets = len(y.columns) + # Dask-expr does not support numerical column names + # See: https://github.com/dask/dask-expr/issues/1015 + _y = y + if hasattr(y, "to_legacy_dataframe"): + _y = y.to_legacy_dataframe() + n_targets = len(_y.columns) for i in range(n_targets): - uniq_labels.append(y.iloc[:, i].unique()) + uniq_labels.append(_y.iloc[:, i].unique()) uniq_labels = da.compute(uniq_labels)[0] if hasattr(uniq_labels[0], "values_host"): # for cuDF Series diff --git a/python/cuml/dask/preprocessing/LabelEncoder.py b/python/cuml/dask/preprocessing/LabelEncoder.py index 07a6ac2479..fcc35c07a9 100644 --- a/python/cuml/dask/preprocessing/LabelEncoder.py +++ b/python/cuml/dask/preprocessing/LabelEncoder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
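The dask modules above (base.py, input_utils.py, part_utils.py) all switch their imports from the private `dask_cudf.core` module to the public `dask_cudf` namespace, which is the supported place to find the collection classes. A sketch of the pattern using the same `gpu_only_import_from` helper these files rely on; the `is_dask_cudf` function is just an illustrative wrapper.

    from cuml.internals.safe_imports import gpu_only_import_from

    # Resolve Series/DataFrame from the public dask_cudf namespace rather than
    # dask_cudf.core, which is an implementation detail and can move.
    dcDataFrame = gpu_only_import_from("dask_cudf", "DataFrame")
    dcSeries = gpu_only_import_from("dask_cudf", "Series")

    def is_dask_cudf(obj):
        # True when `obj` is a dask_cudf collection (DataFrame or Series).
        return isinstance(obj, (dcDataFrame, dcSeries))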
@@ -14,7 +14,6 @@ # from cuml.preprocessing import LabelEncoder as LE from cuml.common.exceptions import NotFittedError -from dask_cudf.core import Series as daskSeries from cuml.dask.common.base import BaseEstimator from cuml.dask.common.base import DelayedTransformMixin from cuml.dask.common.base import DelayedInverseTransformMixin @@ -24,7 +23,8 @@ from collections.abc import Sequence from cuml.internals.safe_imports import gpu_only_import_from -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") +dcDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") +dcSeries = gpu_only_import_from("dask_cudf", "Series") class LabelEncoder( @@ -148,7 +148,7 @@ def fit(self, y): _classes = y.unique().compute().sort_values(ignore_index=True) el = first(y) if isinstance(y, Sequence) else y self.datatype = ( - "cudf" if isinstance(el, (dcDataFrame, daskSeries)) else "cupy" + "cudf" if isinstance(el, (dcDataFrame, dcSeries)) else "cupy" ) self._set_internal_model(LE(**self.kwargs).fit(y, _classes=_classes)) return self diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 8bf2503578..e574a53c0c 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,11 +21,11 @@ DelayedTransformMixin, ) from cuml.internals.safe_imports import gpu_only_import_from, gpu_only_import -from dask_cudf.core import Series as daskSeries from toolz import first dask_cudf = gpu_only_import("dask_cudf") -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") +dcDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") +dcSeries = gpu_only_import_from("dask_cudf", "Series") class DelayedFitTransformMixin: @@ -123,7 +123,7 @@ def fit(self, X): el = first(X) if isinstance(X, Sequence) else X self.datatype = ( - "cudf" if isinstance(el, (dcDataFrame, daskSeries)) else "cupy" + "cudf" if isinstance(el, (dcDataFrame, dcSeries)) else "cupy" ) self._set_internal_model(OneHotEncoderMG(**self.kwargs).fit(X)) @@ -233,7 +233,7 @@ def fit(self, X): el = first(X) if isinstance(X, Sequence) else X self.datatype = ( - "cudf" if isinstance(el, (dcDataFrame, daskSeries)) else "cupy" + "cudf" if isinstance(el, (dcDataFrame, dcSeries)) else "cupy" ) self._set_internal_model(OrdinalEncoderMG(**self.kwargs).fit(X)) diff --git a/python/cuml/datasets/arima.pyx b/python/cuml/datasets/arima.pyx index d40c2072bd..cbfea2227b 100644 --- a/python/cuml/datasets/arima.pyx +++ b/python/cuml/datasets/arima.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -135,7 +135,7 @@ def make_arima(batch_size=1000, n_obs=100, order=(1, 1, 1), cdef uintptr_t out_ptr = out.ptr if random_state is None: - random_state = randint(0, 1e18) + random_state = randint(0, 10**18) if dtype == np.float32: cpp_make_arima(handle_[0], out_ptr, batch_size, diff --git a/python/cuml/datasets/blobs.py b/python/cuml/datasets/blobs.py index 803aaf40cc..1f03004eac 100644 --- a/python/cuml/datasets/blobs.py +++ b/python/cuml/datasets/blobs.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -209,12 +209,8 @@ def make_blobs( proba_samples_per_center = np.array(n_samples_per_center) / np.sum( n_samples_per_center ) - np_seed = int(generator.randint(n_samples, size=1)) - np.random.seed(np_seed) - shuffled_sample_indices = cp.array( - np.random.choice( - n_centers, n_samples, replace=True, p=proba_samples_per_center - ) + shuffled_sample_indices = generator.choice( + n_centers, n_samples, replace=True, p=proba_samples_per_center ) for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): center_indices = cp.where(shuffled_sample_indices == i) diff --git a/python/cuml/datasets/classification.py b/python/cuml/datasets/classification.py index 22da11210b..8122f7e1b6 100644 --- a/python/cuml/datasets/classification.py +++ b/python/cuml/datasets/classification.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -120,7 +120,7 @@ def make_classification( [-0.8817671 -0.84549576 0.1845096 0.02556021]] >>> print(y) - [0 1 0 1 1 0 0 1 0 0] + [1 0 1 1 1 1 1 1 1 0] Parameters ---------- @@ -229,8 +229,6 @@ def make_classification( cuml.internals.set_api_output_type("cupy") generator = _create_rs_generator(random_state) - np_seed = int(generator.randint(n_samples, size=1)) - np.random.seed(np_seed) # Count features, clusters and samples if n_informative + n_redundant + n_repeated > n_features: @@ -307,13 +305,8 @@ def make_classification( proba_samples_per_cluster = np.array(n_samples_per_cluster) / np.sum( n_samples_per_cluster ) - shuffled_sample_indices = cp.array( - np.random.choice( - n_clusters, - n_samples, - replace=True, - p=proba_samples_per_cluster, - ) + shuffled_sample_indices = generator.choice( + n_clusters, n_samples, replace=True, p=proba_samples_per_cluster ) for k, centroid in enumerate(centroids): centroid_indices = cp.where(shuffled_sample_indices == k) diff --git a/python/cuml/datasets/regression.pyx b/python/cuml/datasets/regression.pyx index dbba4fa288..6de9a04853 100644 --- a/python/cuml/datasets/regression.pyx +++ b/python/cuml/datasets/regression.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -206,7 +206,7 @@ def make_regression( coef_ptr = coefs.ptr if random_state is None: - random_state = randint(0, 1e18) + random_state = randint(0, 10**18) if dtype == np.float32: cpp_make_regression(handle_[0], out_ptr, diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index eb71f0c78d..2442757c75 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
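Two related cleanups run through the datasets changes above: seeds are now drawn with `randint(0, 10**18)` instead of `randint(0, 1e18)`, and `make_blobs`/`make_classification` sample cluster assignments directly from the `random_state`-derived generator instead of reseeding NumPy's global RNG. `10**18` is an exact integer while `1e18` is a float, and float bounds to `random.randint` are deprecated in Python 3.10 and rejected in 3.12; keeping every draw on the one generator also keeps results reproducible from a single seed. A small NumPy-only sketch of both points, with arbitrary sizes and probabilities:

    import random

    import numpy as np

    # An exact integer upper bound; 1e18 would be a float, which newer Python
    # versions reject in random.randint.
    seed = random.randint(0, 10**18)

    # Sample the per-row cluster assignment from one generator so the whole
    # dataset is reproducible from `seed`, rather than reseeding a second RNG.
    rng = np.random.RandomState(seed % (2**32))
    proba = np.array([0.2, 0.3, 0.5])
    assignments = rng.choice(3, size=1000, replace=True, p=proba)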
@@ -68,7 +68,7 @@ class BaseRandomForestModel(Base): classes_ = CumlArrayDescriptor() def __init__(self, *, split_criterion, n_streams=4, n_estimators=100, - max_depth=16, handle=None, max_features='auto', n_bins=128, + max_depth=16, handle=None, max_features='sqrt', n_bins=128, bootstrap=True, verbose=False, min_samples_leaf=1, min_samples_split=2, max_samples=1.0, max_leaves=-1, accuracy_metric=None, @@ -166,8 +166,22 @@ class BaseRandomForestModel(Base): return math.log2(self.n_cols)/self.n_cols elif self.max_features == 'auto': if self.RF_type == CLASSIFICATION: + warnings.warn( + "`max_features='auto'` has been deprecated in 24.06 " + "and will be removed in 25.08. To keep the past behaviour " + "and silence this warning, explicitly set " + "`max_features='sqrt'`.", + FutureWarning + ) return 1/np.sqrt(self.n_cols) else: + warnings.warn( + "`max_features='auto'` has been deprecated in 24.06 " + "and will be removed in 25.08. To keep the past behaviour " + "and silence this warning, explicitly set " + "`max_features=1.0`.", + FutureWarning + ) return 1.0 else: raise ValueError( diff --git a/python/cuml/ensemble/randomforest_shared.pyx b/python/cuml/ensemble/randomforest_shared.pyx index 4e8c86341b..a164bbcd02 100644 --- a/python/cuml/ensemble/randomforest_shared.pyx +++ b/python/cuml/ensemble/randomforest_shared.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,9 @@ # distutils: language = c++ +from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_FULL_RO + from libcpp.vector cimport vector -from cpython.object cimport PyObject from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from typing import Dict, List, Union @@ -37,9 +38,6 @@ cdef extern from "treelite/tree.h" namespace "treelite": @staticmethod unique_ptr[Model] DeserializeFromPyBuffer(const vector[TreelitePyBufferFrame] &) except + -cdef extern from "Python.h": - Py_buffer* PyMemoryView_GET_BUFFER(PyObject* mview) - cdef class PyBufferFrameWrapper: cdef TreelitePyBufferFrame _handle cdef Py_ssize_t shape[1] @@ -92,18 +90,25 @@ def get_frames(model: uintptr_t) -> List[memoryview]: def init_from_frames(frames: List[np.ndarray], format_str: List[str], itemsize: List[int]) -> uintptr_t: cdef vector[TreelitePyBufferFrame] cpp_frames + # Need to keep track of the buffers to release them later. 
+ cdef vector[Py_buffer] buffers cdef Py_buffer* buf cdef TreelitePyBufferFrame cpp_frame format_bytes = [s.encode('utf-8') for s in format_str] for i, frame in enumerate(frames): - x = memoryview(frame) - buf = PyMemoryView_GET_BUFFER(x) + buffers.emplace_back() + buf = &buffers.back() + PyObject_GetBuffer(frame, buf, PyBUF_FULL_RO) cpp_frame.buf = buf.buf cpp_frame.format = format_bytes[i] cpp_frame.itemsize = itemsize[i] cpp_frame.nitem = buf.len // itemsize[i] cpp_frames.push_back(cpp_frame) - return _init_from_frames(cpp_frames) + output = _init_from_frames(cpp_frames) + cdef int j + for j in range(buffers.size()): + PyBuffer_Release(&buffers[j]) + return output def treelite_serialize( diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index ba16335dad..45bc4ce2e8 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -1,6 +1,6 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -172,15 +172,18 @@ class RandomForestClassifier(BaseRandomForestModel, max_leaves : int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, If ``-1``. - max_features : int, float, or string (default = 'auto') + max_features : int, float, or string (default = 'sqrt') Ratio of number of features (columns) to consider per node split.\n * If type ``int`` then ``max_features`` is the absolute count of features to be used * If type ``float`` then ``max_features`` is used as a fraction. - * If ``'auto'`` then ``max_features=1/sqrt(n_features)``. * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``. * If ``'log2'`` then ``max_features=log2(n_features)/n_features``. + + .. versionchanged:: 24.06 + The default of `max_features` changed from `"auto"` to `"sqrt"`. + n_bins : int (default = 128) Maximum number of bins used by the split algorithm per feature. For large problems, particularly those with highly-skewed input data, @@ -550,6 +553,7 @@ class RandomForestClassifier(BaseRandomForestModel, domain="cuml_python") @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')], return_values=[('dense', '(n_samples, 1)')]) + @cuml.internals.api_base_return_array(get_output_dtype=True) def predict(self, X, predict_model="GPU", threshold=0.5, algo='auto', convert_dtype=True, fil_sparse_format='auto') -> CumlArray: diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index bfa35cdccc..96a197e5c5 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -165,18 +165,22 @@ class RandomForestRegressor(BaseRandomForestModel, is not supported.\n .. note:: This default differs from scikit-learn's random forest, which defaults to unlimited depth. + max_leaves : int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, If ``-1``. 
- max_features : int, float, or string (default = 'auto') + max_features : int, float, or string (default = 1.0) Ratio of number of features (columns) to consider per node split.\n * If type ``int`` then ``max_features`` is the absolute count of features to be used. * If type ``float`` then ``max_features`` is used as a fraction. - * If ``'auto'`` then ``max_features=1.0``. * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``. * If ``'log2'`` then ``max_features=log2(n_features)/n_features``. + + .. versionchanged:: 24.06 + The default of `max_features` changed from `"auto"` to 1.0. + n_bins : int (default = 128) Maximum number of bins used by the split algorithm per feature. For large problems, particularly those with highly-skewed input data, diff --git a/python/cuml/experimental/fil/fil.pyx b/python/cuml/experimental/fil/fil.pyx index 5057b22529..7fe59e43a1 100644 --- a/python/cuml/experimental/fil/fil.pyx +++ b/python/cuml/experimental/fil/fil.pyx @@ -55,6 +55,10 @@ nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator) cdef extern from "treelite/c_api.h": ctypedef void* TreeliteModelHandle + cdef int TreeliteDeserializeModelFromBytes(const char* bytes_seq, size_t len, + TreeliteModelHandle* out) except + + cdef int TreeliteFreeModel(TreeliteModelHandle handle) except + + cdef const char* TreeliteGetLastError() cdef raft_proto_device_t get_device_type(arr): @@ -137,16 +141,19 @@ cdef class ForestInference_impl(): use_double_precision_bool = use_double_precision use_double_precision_c = use_double_precision_bool - try: - model_handle = tl_model.handle.value - except AttributeError: - try: - model_handle = tl_model.handle - except AttributeError: - try: - model_handle = tl_model.value - except AttributeError: - model_handle = tl_model + if not isinstance(tl_model, treelite.Model): + raise ValueError("tl_model must be a treelite.Model object") + # Serialize Treelite model object and de-serialize again, + # to get around C++ ABI incompatibilities (due to different compilers + # being used to build cuML pip wheel vs. Treelite pip wheel) + bytes_seq = tl_model.serialize_bytes() + cdef TreeliteModelHandle model_handle = NULL + cdef int res = TreeliteDeserializeModelFromBytes(bytes_seq, len(bytes_seq), + &model_handle) + cdef str err_msg + if res < 0: + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to load Treelite model from bytes ({err_msg})") cdef raft_proto_device_t dev_type if mem_type.is_device_accessible: @@ -169,6 +176,8 @@ cdef class ForestInference_impl(): self.raft_proto_handle.get_next_usable_stream() ) + TreeliteFreeModel(model_handle) + def get_dtype(self): return [np.float32, np.float64][self.model.is_double_precision()] diff --git a/python/cuml/experimental/linear_model/lars.pyx b/python/cuml/experimental/linear_model/lars.pyx index 25a2ead0ac..9f2da7ea3b 100644 --- a/python/cuml/experimental/linear_model/lars.pyx +++ b/python/cuml/experimental/linear_model/lars.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,11 +85,15 @@ class Lars(Base, RegressorMixin): fit_intercept : boolean (default = True) If True, Lars tries to correct for the global mean of y. If False, the model expects that you have centered the data. 
- normalize : boolean (default = True) + normalize : boolean (default = False) This parameter is ignored when `fit_intercept` is set to False. If True, the predictors in X will be normalized by removing its mean and dividing by it's variance. If False, then the solver expects that the data is already normalized. + + .. versionchanged:: 24.06 + The default of `normalize` changed from `True` to `False`. + copy_X : boolean (default = True) The solver permutes the columns of X. Set `copy_X` to True to prevent changing the input data. diff --git a/python/cuml/explainer/kernel_shap.pyx b/python/cuml/explainer/kernel_shap.pyx index f1fc4cec71..fbd99d5eb9 100644 --- a/python/cuml/explainer/kernel_shap.pyx +++ b/python/cuml/explainer/kernel_shap.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -340,7 +340,7 @@ class KernelExplainer(SHAPBase): x_ptr = get_cai_ptr(self._mask) if self.random_state is None: - self.random_state = randint(0, 1e18) + self.random_state = randint(0, 10**18) # we default to float32 unless self.dtype is specifically np.float64 if self.dtype == np.float64: diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 16413b34ac..170f18992e 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -65,6 +65,8 @@ cdef extern from "treelite/c_api.h": TreeliteModelHandle* out) except + cdef int TreeliteSerializeModelToFile(TreeliteModelHandle handle, const char* filename) except + + cdef int TreeliteDeserializeModelFromBytes(const char* bytes_seq, size_t len, + TreeliteModelHandle* out) except + cdef int TreeliteGetHeaderField( TreeliteModelHandle model, const char * name, TreelitePyBufferFrame* out_frame) except + cdef const char* TreeliteGetLastError() @@ -164,6 +166,27 @@ cdef class TreeliteModel(): cdef uintptr_t model_ptr = model_handle TreeliteFreeModel( model_ptr) + @classmethod + def from_treelite_bytes(cls, bytes bytes_seq): + """ + Returns a TreeliteModel object loaded from bytes representing a + serialized Treelite model object. 
+ + Parameters + ---------- + bytes_seq: bytes + bytes representing a serialized Treelite model + """ + cdef TreeliteModelHandle handle + cdef int res = TreeliteDeserializeModelFromBytes(bytes_seq, len(bytes_seq), &handle) + cdef str err_msg + if res < 0: + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to load Treelite model from bytes ({err_msg})") + cdef TreeliteModel model = TreeliteModel() + model.set_handle(handle) + return model + @classmethod def from_filename(cls, filename, model_type="xgboost"): """ @@ -177,30 +200,32 @@ cdef class TreeliteModel(): model_type : string Type of model: 'xgboost', 'xgboost_json', or 'lightgbm' """ - filename_bytes = filename.encode("UTF-8") - config_bytes = "{}".encode("UTF-8") + cdef bytes filename_bytes = filename.encode("UTF-8") + cdef bytes config_bytes = b"{}" cdef TreeliteModelHandle handle + cdef int res + cdef str err_msg if model_type == "xgboost": res = TreeliteLoadXGBoostModelLegacyBinary(filename_bytes, config_bytes, &handle) if res < 0: - err = TreeliteGetLastError() - raise RuntimeError("Failed to load %s (%s)" % (filename, err)) + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to load {filename} ({err_msg})") elif model_type == "xgboost_json": res = TreeliteLoadXGBoostModel(filename_bytes, config_bytes, &handle) if res < 0: - err = TreeliteGetLastError() - raise RuntimeError("Failed to load %s (%s)" % (filename, err)) + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to load {filename} ({err_msg})") elif model_type == "lightgbm": logger.warn("Treelite currently does not support float64 model" " parameters. Accuracy may degrade slightly relative" " to native LightGBM invocation.") res = TreeliteLoadLightGBMModel(filename_bytes, config_bytes, &handle) if res < 0: - err = TreeliteGetLastError() - raise RuntimeError("Failed to load %s (%s)" % (filename, err)) + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to load {filename} ({err_msg})") else: - raise ValueError("Unknown model type %s" % model_type) - model = TreeliteModel() + raise ValueError(f"Unknown model type {model_type}") + cdef TreeliteModel model = TreeliteModel() model.set_handle(handle) return model @@ -215,7 +240,11 @@ cdef class TreeliteModel(): """ assert self.handle != NULL filename_bytes = filename.encode("UTF-8") - TreeliteSerializeModelToFile(self.handle, filename_bytes) + cdef int res = TreeliteSerializeModelToFile(self.handle, filename_bytes) + cdef str err_msg + if res < 0: + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to serialize Treelite model ({err_msg})") @classmethod def from_treelite_model_handle(cls, @@ -514,10 +543,11 @@ cdef class ForestInference_impl(): &treelite_params) # Get num_class cdef TreelitePyBufferFrame frame - res = TreeliteGetHeaderField( model_ptr, "num_class", &frame) + cdef int res = TreeliteGetHeaderField( model_ptr, "num_class", &frame) + cdef str err_msg if res < 0: - err = TreeliteGetLastError() - raise RuntimeError(f"Failed to fetch num_class: {err}") + err_msg = TreeliteGetLastError().decode("UTF-8") + raise RuntimeError(f"Failed to fetch num_class: {err_msg}") view = memoryview(MakePyBufferFrameWrapper(frame)) self.num_class = np.asarray(view).copy() if len(self.num_class) > 1: @@ -882,8 +912,13 @@ class ForestInference(Base, " parameters. 
Accuracy may degrade slightly relative to" " native sklearn invocation.") tl_model = tl_skl.import_model(skl_model) + # Serialize Treelite model object and de-serialize again, + # to get around C++ ABI incompatibilities (due to different compilers + # being used to build cuML pip wheel vs. Treelite pip wheel) + cdef bytes bytes_seq = tl_model.serialize_bytes() + cdef TreeliteModel tl_model2 = TreeliteModel.from_treelite_bytes(bytes_seq) cuml_fm.load_from_treelite_model( - model=tl_model, + model=tl_model2, output_class=output_class, threshold=threshold, algo=algo, diff --git a/python/cuml/internals/array.py b/python/cuml/internals/array.py index 53e4a3f858..f8c4284d90 100644 --- a/python/cuml/internals/array.py +++ b/python/cuml/internals/array.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -48,8 +48,8 @@ CudfDataFrame = gpu_only_import_from("cudf", "DataFrame") CudfIndex = gpu_only_import_from("cudf", "Index") CudfSeries = gpu_only_import_from("cudf", "Series") -DaskCudfDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") -DaskCudfSeries = gpu_only_import_from("dask_cudf.core", "Series") +DaskCudfDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") +DaskCudfSeries = gpu_only_import_from("dask_cudf", "Series") DaskDataFrame = gpu_only_import_from("dask.dataframe", "DataFrame") DaskSeries = gpu_only_import_from("dask.dataframe", "Series") DeviceBuffer = gpu_only_import_from("rmm", "DeviceBuffer") @@ -447,7 +447,7 @@ def is_host_accessible(self): @cached_property def size(self): return ( - host_xpy.product(self._array_interface["shape"]) + host_xpy.prod(self._array_interface["shape"]) * host_xpy.dtype(self._array_interface["typestr"]).itemsize ) @@ -793,7 +793,7 @@ def deserialize(cls, header: dict, frames: list): if header["desc"]["shape"] != ary._array_interface["shape"]: raise ValueError( - f"Received a `Buffer` with the wrong size." + "Received a `Buffer` with the wrong size." f" Expected {header['desc']['shape']}, " f"but got {ary._array_interface['shape']}" ) diff --git a/python/cuml/internals/import_utils.py b/python/cuml/internals/import_utils.py index 4a7c6324c8..50ede87205 100644 --- a/python/cuml/internals/import_utils.py +++ b/python/cuml/internals/import_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
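The serialize-then-deserialize round trip that fil.pyx and experimental/fil now perform exists purely to avoid passing a raw Treelite handle between pip wheels that may have been built with different compilers. A minimal sketch of the same pattern in terms of Treelite's public Python API (the helper name abi_safe_copy and the use of Model.deserialize_bytes are illustrative assumptions; the cuML code path re-hydrates the bytes through the C function TreeliteDeserializeModelFromBytes instead):

import treelite

def abi_safe_copy(model: treelite.Model) -> treelite.Model:
    # Only an ABI-neutral byte string crosses the wheel boundary, never a
    # raw C++ handle owned by a different copy of the Treelite runtime.
    blob = model.serialize_bytes()
    # The consumer rebuilds the model with its own Treelite runtime.
    return treelite.Model.deserialize_bytes(blob)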
@@ -16,8 +16,9 @@ import platform +from packaging.version import Version + from cuml.internals.safe_imports import gpu_only_import, UnavailableError -from distutils.version import LooseVersion numba = gpu_only_import("numba") @@ -123,14 +124,14 @@ def check_min_dask_version(version): try: import dask - return LooseVersion(dask.__version__) >= LooseVersion(version) + return Version(dask.__version__) >= Version(version) except ImportError: return False def check_min_numba_version(version): try: - return LooseVersion(str(numba.__version__)) >= LooseVersion(version) + return Version(str(numba.__version__)) >= Version(version) except UnavailableError: return False @@ -139,7 +140,7 @@ def check_min_cupy_version(version): if has_cupy(): import cupy - return LooseVersion(str(cupy.__version__)) >= LooseVersion(version) + return Version(str(cupy.__version__)) >= Version(version) else: return False @@ -186,9 +187,7 @@ def has_shap(min_version="0.37"): if min_version is None: return True else: - return LooseVersion(str(shap.__version__)) >= LooseVersion( - min_version - ) + return Version(str(shap.__version__)) >= Version(min_version) except ImportError: return False @@ -200,9 +199,7 @@ def has_daskglm(min_version=None): if min_version is None: return True else: - return LooseVersion(str(dask_glm.__version__)) >= LooseVersion( - min_version - ) + return Version(str(dask_glm.__version__)) >= Version(min_version) except ImportError: return False diff --git a/python/cuml/internals/input_utils.py b/python/cuml/internals/input_utils.py index edcbffabaa..cda8ff0ed8 100644 --- a/python/cuml/internals/input_utils.py +++ b/python/cuml/internals/input_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
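Replacing distutils' LooseVersion with packaging.version.Version gives these minimum-version checks PEP 440 semantics instead of ad-hoc string comparison. A small illustration of the ordering the updated helpers rely on:

from packaging.version import Version

# PEP 440 ordering: dev and release-candidate tags sort before the final
# release, and components are never compared as bare strings.
assert Version("2024.5.1") > Version("2024.5.0")
assert Version("0.37.0.dev0") < Version("0.37.0")
assert Version("1.0rc1") < Version("1.0")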
@@ -15,6 +15,7 @@ # from collections import namedtuple +from typing import Literal from cuml.internals.array import CumlArray from cuml.internals.array_sparse import SparseCumlArray @@ -46,8 +47,9 @@ cp_ndarray = gpu_only_import_from("cupy", "ndarray") CudfSeries = gpu_only_import_from("cudf", "Series") CudfDataFrame = gpu_only_import_from("cudf", "DataFrame") -DaskCudfSeries = gpu_only_import_from("dask_cudf.core", "Series") -DaskCudfDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") +CudfIndex = gpu_only_import_from("cudf", "Index") +DaskCudfSeries = gpu_only_import_from("dask_cudf", "Series") +DaskCudfDataFrame = gpu_only_import_from("dask_cudf", "DataFrame") np_ndarray = cpu_only_import_from("numpy", "ndarray") numba_devicearray = gpu_only_import_from("numba.cuda", "devicearray") try: @@ -64,6 +66,7 @@ nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator) PandasSeries = cpu_only_import_from("pandas", "Series") PandasDataFrame = cpu_only_import_from("pandas", "DataFrame") +PandasIndex = cpu_only_import_from("pandas", "Index") cuml_array = namedtuple("cuml_array", "array n_rows n_cols dtype") @@ -73,6 +76,7 @@ np_ndarray: "numpy", PandasSeries: "pandas", PandasDataFrame: "pandas", + PandasIndex: "pandas", } @@ -80,6 +84,7 @@ _input_type_to_str[cp_ndarray] = "cupy" _input_type_to_str[CudfSeries] = "cudf" _input_type_to_str[CudfDataFrame] = "cudf" + _input_type_to_str[CudfIndex] = "cudf" _input_type_to_str[NumbaDeviceNDArrayBase] = "numba" except UnavailableError: pass @@ -160,9 +165,21 @@ def get_supported_input_type(X): if isinstance(X, PandasSeries): return PandasSeries + if isinstance(X, PandasIndex): + return PandasIndex + if isinstance(X, CudfDataFrame): return CudfDataFrame + if isinstance(X, CudfIndex): + return CudfIndex + + # A cudf.pandas wrapped Numpy array defines `__cuda_array_interface__` + # which means without this we'd always return a cupy array. We don't want + # to match wrapped cupy arrays, they get dealt with later + if getattr(X, "_fsproxy_slow_type", None) is np.ndarray: + return np.ndarray + try: if numba_cuda.devicearray.is_cuda_ndarray(X): return numba_cuda.devicearray.DeviceNDArrayBase @@ -205,6 +222,21 @@ def determine_array_type(X): return _input_type_to_str.get(gen_type, None) +def determine_df_obj_type(X): + if X is None: + return None + + # Get the generic type + gen_type = get_supported_input_type(X) + + if gen_type in (CudfDataFrame, PandasDataFrame): + return "dataframe" + elif gen_type in (CudfSeries, PandasSeries): + return "series" + + return None + + def determine_array_dtype(X): if X is None: @@ -575,3 +607,27 @@ def sparse_scipy_to_cp(sp, dtype): v = cp.asarray(values, dtype=dtype) return cupyx.scipy.sparse.coo_matrix((v, (r, c)), sp.shape) + + +def output_to_df_obj_like( + X_out: CumlArray, X_in, output_type: Literal["series", "dataframe"] +): + """Cast CumlArray `X_out` to the dataframe / series type as `X_in` + `CumlArray` abstracts away the dataframe / series metadata, when API + methods needs to return a dataframe / series matching original input + metadata, this function can copy input metadata to output. 
+ """ + + if output_type not in ["series", "dataframe"]: + raise ValueError( + f'output_type must be either "series" or "dataframe" : {output_type}' + ) + + out = None + if output_type == "series": + out = X_out.to_output("series") + out.name = X_in.name + elif output_type == "dataframe": + out = X_out.to_output("dataframe") + out.columns = X_in.columns + return out diff --git a/python/cuml/linear_model/logistic_regression.pyx b/python/cuml/linear_model/logistic_regression.pyx index 92c42c849d..164821a5bd 100644 --- a/python/cuml/linear_model/logistic_regression.pyx +++ b/python/cuml/linear_model/logistic_regression.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ # distutils: language = c++ +import warnings + from cuml.internals.safe_imports import cpu_only_import from cuml.internals.safe_imports import gpu_only_import import pprint @@ -36,7 +38,7 @@ cp = gpu_only_import('cupy') np = cpu_only_import('numpy') -supported_penalties = ["l1", "l2", "none", "elasticnet"] +supported_penalties = ["l1", "l2", None, "none", "elasticnet"] supported_solvers = ["qn"] @@ -210,7 +212,7 @@ class LogisticRegression(UniversalBase, output_type=output_type) if penalty not in supported_penalties: - raise ValueError("`penalty` " + str(penalty) + "not supported.") + raise ValueError("`penalty` " + str(penalty) + " not supported.") if solver not in supported_solvers: raise ValueError("Only quasi-newton `qn` solver is " @@ -218,7 +220,16 @@ class LogisticRegression(UniversalBase, self.solver = solver self.C = C + + if penalty == "none": + warnings.warn( + "The 'none' option was deprecated in version 24.06, and will " + "be removed in 25.08. 
Use None instead.", + FutureWarning + ) + penalty = None self.penalty = penalty + self.tol = tol self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -452,7 +463,7 @@ class LogisticRegression(UniversalBase, return proba def _get_qn_params(self): - if self.penalty == "none": + if self.penalty is None: l1_strength = 0.0 l2_strength = 0.0 diff --git a/python/cuml/linear_model/logistic_regression_mg.pyx b/python/cuml/linear_model/logistic_regression_mg.pyx index 0a6988a804..834ee1f41d 100644 --- a/python/cuml/linear_model/logistic_regression_mg.pyx +++ b/python/cuml/linear_model/logistic_regression_mg.pyx @@ -80,11 +80,29 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil: float *f, int *num_iters) except + + cdef void qnFit( + handle_t& handle, + vector[doubleData_t *] input_data, + PartDescriptor &input_desc, + vector[doubleData_t *] labels, + double *coef, + const qn_params& pams, + bool X_col_major, + bool standardization, + int n_classes, + double *f, + int *num_iters) except + + cdef vector[float] getUniquelabelsMG( const handle_t& handle, PartDescriptor &input_desc, vector[floatData_t*] labels) except+ + cdef vector[double] getUniquelabelsMG( + const handle_t& handle, + PartDescriptor &input_desc, + vector[doubleData_t*] labels) except+ + cdef void qnFitSparse( handle_t& handle, vector[floatData_t *] input_values, @@ -100,6 +118,21 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil: float *f, int *num_iters) except + + cdef void qnFitSparse( + handle_t& handle, + vector[doubleData_t *] input_values, + int *input_cols, + int *input_row_ids, + int X_nnz, + PartDescriptor &input_desc, + vector[doubleData_t *] labels, + double *coef, + const qn_params& pams, + bool standardization, + int n_classes, + double *f, + int *num_iters) except + + class LogisticRegressionMG(MGFitMixin, LogisticRegression): @@ -199,14 +232,25 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): cdef handle_t* handle_ = self.handle.getHandle() cdef float objective32 + cdef float objective64 cdef int num_iters cdef vector[float] c_classes_ - c_classes_ = getUniquelabelsMG( - handle_[0], - deref(input_desc), - deref(y)) - self.classes_ = np.sort(list(c_classes_)).astype('float32') + cdef vector[double] c_classes_64 + if self.dtype == np.float32: + c_classes_ = getUniquelabelsMG( + handle_[0], + deref(input_desc), + deref(y)) + self.classes_ = np.sort(list(c_classes_)).astype(np.float32) + elif self.dtype == np.float64: + c_classes_64 = getUniquelabelsMG( + handle_[0], + deref(input_desc), + deref(y)) + self.classes_ = np.sort(list(c_classes_64)) + else: + assert False, "dtypes other than float32 and float64 are currently not supported yet." 
self._num_classes = len(self.classes_) self.loss = "sigmoid" if self._num_classes <= 2 else "softmax" @@ -220,6 +264,7 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): if self.dtype == np.float32: if sparse_input is False: + qnFit( handle_[0], deref(X), @@ -227,9 +272,9 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): deref(y), mat_coef_ptr, qnpams, - self.is_col_major, - self.standardization, - self._num_classes, + self.is_col_major, + self.standardization, + self._num_classes, &objective32, &num_iters) @@ -245,20 +290,60 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): deref(X_values), X_cols, X_row_ids, - X_nnz, + X_nnz, deref(input_desc), deref(y), mat_coef_ptr, qnpams, - self.standardization, - self._num_classes, + self.standardization, + self._num_classes, &objective32, &num_iters) self.solver_model.objective = objective32 + elif self.dtype == np.float64: + if sparse_input is False: + + qnFit( + handle_[0], + deref(X), + deref(input_desc), + deref(y), + mat_coef_ptr, + qnpams, + self.is_col_major, + self.standardization, + self._num_classes, + &objective64, + &num_iters) + + else: + assert len(X) == 4 + X_values = X[0] + X_cols = X[1] + X_row_ids = X[2] + X_nnz = X[3] + + qnFitSparse( + handle_[0], + deref(X_values), + X_cols, + X_row_ids, + X_nnz, + deref(input_desc), + deref(y), + mat_coef_ptr, + qnpams, + self.standardization, + self._num_classes, + &objective32, + &num_iters) + + self.solver_model.objective = objective64 + else: - assert False, "dtypes other than float32 are currently not supported yet. See issue: https://github.com/rapidsai/cuml/issues/5589" + assert False, "dtypes other than float32 and float64 are currently not supported yet." self.solver_model.num_iters = num_iters diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index 01ddd12fdd..e5c7649267 100644 --- a/python/cuml/linear_model/ridge.pyx +++ b/python/cuml/linear_model/ridge.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -101,7 +101,7 @@ class Ridge(UniversalBase, >>> from cuml import Ridge >>> from cuml.linear_model import Ridge - >>> alpha = cp.array([1e-5]) + >>> alpha = 1e-5 >>> ridge = Ridge(alpha=alpha, fit_intercept=True, normalize=False, ... solver="eig") diff --git a/python/cuml/manifold/simpl_set.pyx b/python/cuml/manifold/simpl_set.pyx index d0f30e3e88..a5bbd32900 100644 --- a/python/cuml/manifold/simpl_set.pyx +++ b/python/cuml/manifold/simpl_set.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
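Taken together, the logistic-regression changes above have two user-visible effects: penalty=None becomes the canonical way to request an unregularized fit (the string "none" keeps working until 25.08 but warns), and the multi-GPU solver gains float64 code paths instead of asserting on anything other than float32. A minimal single-GPU sketch of the penalty change:

from cuml.linear_model import LogisticRegression

clf = LogisticRegression(penalty=None)       # unregularized fit, no warning
legacy = LogisticRegression(penalty="none")  # still accepted in 24.06, emits a FutureWarning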
@@ -20,6 +20,7 @@ from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import('numpy') from cuml.internals.safe_imports import gpu_only_import cp = gpu_only_import('cupy') +cupyx = gpu_only_import('cupyx') from cuml.manifold.umap_utils cimport * from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \ @@ -350,8 +351,8 @@ def simplicial_set_embedding( graph = graph.tocoo() graph.sum_duplicates() - if not isinstance(graph, cp.sparse.coo_matrix): - graph = cp.sparse.coo_matrix(graph) + if not isinstance(graph, cupyx.scipy.sparse.coo_matrix): + graph = cupyx.scipy.sparse.coo_matrix(graph) handle = Handle() cdef handle_t* handle_ = handle.getHandle() diff --git a/python/cuml/manifold/umap_utils.pyx b/python/cuml/manifold/umap_utils.pyx index a68af62195..200b8cc4b3 100644 --- a/python/cuml/manifold/umap_utils.pyx +++ b/python/cuml/manifold/umap_utils.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import('numpy') from cuml.internals.safe_imports import gpu_only_import cp = gpu_only_import('cupy') +cupyx = gpu_only_import('cupyx') cdef class GraphHolder: @@ -102,7 +103,7 @@ cdef class GraphHolder: rows = create_nonowning_cp_array(self.rows(), np.int32) cols = create_nonowning_cp_array(self.cols(), np.int32) - return cp.sparse.coo_matrix(((vals, (rows, cols)))) + return cupyx.scipy.sparse.coo_matrix(((vals, (rows, cols)))) def __dealloc__(self): self.c_graph.reset(NULL) diff --git a/python/cuml/model_selection/_split.py b/python/cuml/model_selection/_split.py index cb58db4f5f..0727f82c82 100644 --- a/python/cuml/model_selection/_split.py +++ b/python/cuml/model_selection/_split.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,16 @@ # limitations under the License. # -from typing import Optional, Union +from typing import Optional, Union, List, Tuple from cuml.common import input_to_cuml_array -from cuml.internals.array import array_to_memory_order +from cuml.internals.input_utils import ( + determine_array_type, + determine_df_obj_type, + output_to_df_obj_like, +) +from cuml.internals.mem_type import MemoryType +from cuml.internals.array import array_to_memory_order, CumlArray from cuml.internals.safe_imports import ( cpu_only_import, gpu_only_import, @@ -31,68 +37,48 @@ cuda = gpu_only_import_from("numba", "cuda") -def _stratify_split( - X, stratify, labels, n_train, n_test, x_numba, y_numba, random_state -): +def _compute_stratify_split_indices( + indices: cp.ndarray, + stratify: CumlArray, + n_train: int, + n_test: int, + random_state: cp.random.RandomState, +) -> Tuple[cp.ndarray]: """ - Function to perform a stratified split based on stratify column. + Compute the indices for stratified split based on stratify keys. Based on scikit-learn stratified split implementation. Parameters ---------- - X, y: Shuffled input data and labels - stratify: column to be stratified on. 
+ indices: cupy array + Indices used to shuffle input data + stratify: CumlArray + Keys used for stratification n_train: Number of samples in train set n_test: number of samples in test set - x_numba: Determines whether the data should be converted to numba - y_numba: Determines whether the labales should be converted to numba + random_state: cupy RandomState + Random state used for shuffling stratify keys Returns ------- - X_train, X_test: Data X divided into train and test sets - y_train, y_test: Labels divided into train and test sets + train_indices, test_indices: + Indices of inputs from which train and test sets are gathered """ - x_cudf = False - labels_cudf = False - - if isinstance(X, cudf.DataFrame): - x_cudf = True - elif hasattr(X, "__cuda_array_interface__"): - X = cp.asarray(X) - x_order = array_to_memory_order(X) - # labels and stratify will be only cp arrays - if isinstance(labels, cudf.Series): - labels_cudf = True - labels = labels.values - elif hasattr(labels, "__cuda_array_interface__"): - labels = cp.asarray(labels) - elif isinstance(stratify, cudf.DataFrame): - # ensuring it has just one column - if labels.shape[1] != 1: - raise ValueError( - "Expected one column for labels, but found df" - "with shape = %d" % (labels.shape) - ) - labels_cudf = True - labels = labels[0].values + if indices.ndim != 1: + raise ValueError( + "Expected one dimension for indices, but found array" + " with shape = %s" % (indices.shape,) + ) - labels_order = array_to_memory_order(labels) + if stratify.ndim != 1: + raise ValueError( + "Expected one dimension for stratify, but found array" + " with shape = %s" % (stratify.shape,) + ) # Converting to cupy array removes the need to add an if-else block # for startify column - if isinstance(stratify, cudf.Series): - stratify = stratify.values - elif hasattr(stratify, "__cuda_array_interface__"): - stratify = cp.asarray(stratify) - elif isinstance(stratify, cudf.DataFrame): - # ensuring it has just one column - if stratify.shape[1] != 1: - raise ValueError( - "Expected one column, but found column" - "with shape = %d" % (stratify.shape) - ) - stratify = stratify[0].values classes, stratify_indices = cp.unique(stratify, return_inverse=True) @@ -112,84 +98,31 @@ def _stratify_split( "equal to the number of classes = %d" % (n_train, n_classes) ) - class_indices = cp.split( + # List of length n_classes. Each element contains indices of that class.
+ class_indices: List[cp.ndarray] = cp.split( cp.argsort(stratify_indices), cp.cumsum(class_counts)[:-1].tolist() ) - X_train = None - - # random_state won't be None or int, that's handled earlier - if isinstance(random_state, np.random.RandomState): - random_state = cp.random.RandomState(seed=random_state.get_state()[1]) - # Break ties n_i = _approximate_mode(class_counts, n_train, random_state) class_counts_remaining = class_counts - n_i t_i = _approximate_mode(class_counts_remaining, n_test, random_state) + train_indices_partials = [] + test_indices_partials = [] for i in range(n_classes): permutation = random_state.permutation(class_counts[i].item()) perm_indices_class_i = class_indices[i].take(permutation) - y_train_i = cp.array( - labels[perm_indices_class_i[: n_i[i]]], order=labels_order - ) - y_test_i = cp.array( - labels[perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]], - order=labels_order, + train_indices_partials.append(perm_indices_class_i[: n_i[i]]) + test_indices_partials.append( + perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]] ) - if hasattr(X, "__cuda_array_interface__") or isinstance( - X, cupyx.scipy.sparse.csr_matrix - ): - X_train_i = cp.array( - X[perm_indices_class_i[: n_i[i]]], order=x_order - ) - X_test_i = cp.array( - X[perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]], - order=x_order, - ) - if X_train is None: - X_train = cp.array(X_train_i, order=x_order) - y_train = cp.array(y_train_i, order=labels_order) - X_test = cp.array(X_test_i, order=x_order) - y_test = cp.array(y_test_i, order=labels_order) - else: - X_train = cp.concatenate([X_train, X_train_i], axis=0) - X_test = cp.concatenate([X_test, X_test_i], axis=0) - y_train = cp.concatenate([y_train, y_train_i], axis=0) - y_test = cp.concatenate([y_test, y_test_i], axis=0) - - elif x_cudf: - X_train_i = X.iloc[perm_indices_class_i[: n_i[i]]] - X_test_i = X.iloc[perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]] - - if X_train is None: - X_train = X_train_i - y_train = y_train_i - X_test = X_test_i - y_test = y_test_i - else: - X_train = cudf.concat([X_train, X_train_i], ignore_index=False) - X_test = cudf.concat([X_test, X_test_i], ignore_index=False) - y_train = cp.concatenate([y_train, y_train_i], axis=0) - y_test = cp.concatenate([y_test, y_test_i], axis=0) - - if x_numba: - X_train = cuda.as_cuda_array(X_train) - X_test = cuda.as_cuda_array(X_test) - elif x_cudf: - X_train = cudf.DataFrame(X_train) - X_test = cudf.DataFrame(X_test) - - if y_numba: - y_train = cuda.as_cuda_array(y_train) - y_test = cuda.as_cuda_array(y_test) - elif labels_cudf: - y_train = cudf.Series(y_train) - y_test = cudf.Series(y_test) - - return X_train, X_test, y_train, y_test + train_indices = cp.concatenate(train_indices_partials, axis=0) + test_indices = cp.concatenate(test_indices_partials, axis=0) + + return indices[train_indices], indices[test_indices] def _approximate_mode(class_counts, n_draws, rng): @@ -332,103 +265,78 @@ def train_test_split( string" ) - # todo: this check will be replaced with upcoming improvements - # to input_utils - # + x_order = array_to_memory_order(X) + X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order) if y is not None: - if not hasattr(X, "__cuda_array_interface__") and not isinstance( - X, cudf.DataFrame - ): - raise TypeError( - "X needs to be either a cuDF DataFrame, Series or \ - a cuda_array_interface compliant array." 
- ) - - if not hasattr(y, "__cuda_array_interface__") and not isinstance( - y, cudf.DataFrame - ): - raise TypeError( - "y needs to be either a cuDF DataFrame, Series or \ - a cuda_array_interface compliant array." - ) - - if X.shape[0] != y.shape[0]: + y_order = array_to_memory_order(y) + y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order) + if X_row != y_row: raise ValueError( "X and y must have the same first dimension" - "(found {} and {})".format(X.shape[0], y.shape[0]) - ) - else: - if not hasattr(X, "__cuda_array_interface__") and not isinstance( - X, cudf.DataFrame - ): - raise TypeError( - "X needs to be either a cuDF DataFrame, Series or \ - a cuda_array_interface compliant object." + f"(found {X_row} and {y_row})" ) if isinstance(train_size, float): if not 0 <= train_size <= 1: raise ValueError( "proportion train_size should be between" - "0 and 1 (found {})".format(train_size) + f"0 and 1 (found {train_size})" ) if isinstance(train_size, int): - if not 0 <= train_size <= X.shape[0]: + if not 0 <= train_size <= X_row: raise ValueError( "Number of instances train_size should be between 0 and the" - "first dimension of X (found {})".format(train_size) + f"first dimension of X (found {train_size})" ) if isinstance(test_size, float): if not 0 <= test_size <= 1: raise ValueError( "proportion test_size should be between" - "0 and 1 (found {})".format(train_size) + f"0 and 1 (found {train_size})" ) if isinstance(test_size, int): - if not 0 <= test_size <= X.shape[0]: + if not 0 <= test_size <= X_row: raise ValueError( "Number of instances test_size should be between 0 and the" - "first dimension of X (found {})".format(test_size) + f"first dimension of X (found {test_size})" ) - x_numba = cuda.devicearray.is_cuda_ndarray(X) - y_numba = cuda.devicearray.is_cuda_ndarray(y) - # Determining sizes of splits if isinstance(train_size, float): - train_size = int(X.shape[0] * train_size) + train_size = int(X_row * train_size) if test_size is None: if train_size is None: - train_size = int(X.shape[0] * 0.75) + train_size = int(X_row * 0.75) - test_size = X.shape[0] - train_size + test_size = X_row - train_size if isinstance(test_size, float): - test_size = int(X.shape[0] * test_size) + test_size = int(X_row * test_size) if train_size is None: - train_size = X.shape[0] - test_size + train_size = X_row - test_size elif isinstance(test_size, int): if train_size is None: - train_size = X.shape[0] - test_size + train_size = X_row - test_size + # Compute training set and test set indices if shuffle: - # Shuffle the data + idxs = cp.arange(X_row) + + # Compute shuffle indices if random_state is None or isinstance(random_state, int): - idxs = cp.arange(X.shape[0]) random_state = cp.random.RandomState(seed=random_state) - elif isinstance(random_state, cp.random.RandomState): - idxs = cp.arange(X.shape[0]) - elif isinstance(random_state, np.random.RandomState): - idxs = np.arange(X.shape[0]) + random_state = cp.random.RandomState( + seed=random_state.get_state()[1] + ) - else: + elif not isinstance(random_state, cp.random.RandomState): raise TypeError( "`random_state` must be an int, NumPy RandomState \ or CuPy RandomState." 
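The train_test_split hunks above and below replace the old per-container branching with a single CumlArray-based path: split indices are computed first (stratified or not), rows are gathered from those indices, and the results are coerced back to the caller's container type at the end. A hedged usage sketch with cudf inputs:

import cudf
from cuml.model_selection import train_test_split

X = cudf.DataFrame({"a": list(range(10)), "b": list(range(10))})
y = cudf.Series(list(range(10)))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=0
)
# The splits come back in the same container type as the inputs.
assert isinstance(X_train, cudf.DataFrame)
assert isinstance(y_test, cudf.Series)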
@@ -436,77 +344,74 @@ def train_test_split( random_state.shuffle(idxs) - if isinstance(X, cudf.DataFrame) or isinstance(X, cudf.Series): - X = X.iloc[idxs] - - elif hasattr(X, "__cuda_array_interface__"): - # numba (and therefore rmm device_array) does not support - # fancy indexing - X = cp.asarray(X)[idxs] - - if isinstance(y, cudf.DataFrame) or isinstance(y, cudf.Series): - y = y.iloc[idxs] - - elif hasattr(y, "__cuda_array_interface__"): - y = cp.asarray(y)[idxs] - if stratify is not None: - if isinstance(stratify, cudf.DataFrame) or isinstance( - stratify, cudf.Series - ): - stratify = stratify.iloc[idxs] + stratify, *_ = input_to_cuml_array(stratify) + stratify = stratify[idxs] - elif hasattr(stratify, "__cuda_array_interface__"): - stratify = cp.asarray(stratify)[idxs] - - split_return = _stratify_split( - X, + (train_indices, test_indices,) = _compute_stratify_split_indices( + idxs, stratify, - y, train_size, test_size, - x_numba, - y_numba, random_state, ) - return split_return - # If not stratified, perform train_test_split splicing - x_order = array_to_memory_order(X) + else: + train_indices = idxs[:train_size] + test_indices = idxs[-1 * test_size :] + else: + train_indices = range(0, train_size) + test_indices = range(-1 * test_size, 0) + + # Gather from indices + X_train = X_arr[train_indices] + X_test = X_arr[test_indices] + if y is not None: + y_train = y_arr[train_indices] + y_test = y_arr[test_indices] - if y is None: - y_order = None + # Coerce output to original input type + if ty := determine_df_obj_type(X): + x_type = ty else: - y_order = array_to_memory_order(y) + x_type = determine_array_type(X) + + if ty := determine_df_obj_type(y): + y_type = ty + else: + y_type = determine_array_type(y) - if hasattr(X, "__cuda_array_interface__") or isinstance( - X, cupyx.scipy.sparse.csr_matrix - ): - X_train = cp.array(X[0:train_size], order=x_order) - X_test = cp.array(X[-1 * test_size :], order=x_order) - if y is not None: - y_train = cp.array(y[0:train_size], order=y_order) - y_test = cp.array(y[-1 * test_size :], order=y_order) - elif isinstance(X, cudf.DataFrame): - X_train = X.iloc[0:train_size] - X_test = X.iloc[-1 * test_size :] - if y is not None: - if isinstance(y, cudf.Series): - y_train = y.iloc[0:train_size] - y_test = y.iloc[-1 * test_size :] - elif hasattr(y, "__cuda_array_interface__") or isinstance( - y, cupyx.scipy.sparse.csr_matrix - ): - y_train = cp.array(y[0:train_size], order=y_order) - y_test = cp.array(y[-1 * test_size :], order=y_order) - - if x_numba: - X_train = cuda.as_cuda_array(X_train) - X_test = cuda.as_cuda_array(X_test) - - if y_numba: - y_train = cuda.as_cuda_array(y_train) - y_test = cuda.as_cuda_array(y_test) + if x_type in ("series", "dataframe"): + X_train = output_to_df_obj_like(X_train, X, x_type) + X_test = output_to_df_obj_like(X_test, X, x_type) + + if determine_array_type(X.index) == "pandas": + if isinstance(train_indices, cp.ndarray): + train_indices = train_indices.get() + if isinstance(test_indices, cp.ndarray): + test_indices = test_indices.get() + + X_train.index = X.index[train_indices] + X_test.index = X.index[test_indices] + else: + X_train = X_train.to_output(x_type) + X_test = X_test.to_output(x_type) + + if y_type in ("series", "dataframe"): + y_train = output_to_df_obj_like(y_train, y, y_type) + y_test = output_to_df_obj_like(y_test, y, y_type) + + if determine_array_type(y.index) == "pandas": + if isinstance(train_indices, cp.ndarray): + train_indices = train_indices.get() + if isinstance(test_indices, cp.ndarray): + 
test_indices = test_indices.get() + + y_train.index = y.index[train_indices] + y_test.index = y.index[test_indices] + elif y_type is not None: + y_train = y_train.to_output(y_type) + y_test = y_test.to_output(y_type) if y is not None: return X_train, X_test, y_train, y_test diff --git a/python/cuml/multiclass/multiclass.py b/python/cuml/multiclass/multiclass.py index 65b378a17b..58c4151094 100644 --- a/python/cuml/multiclass/multiclass.py +++ b/python/cuml/multiclass/multiclass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -63,7 +63,7 @@ class MulticlassClassifier(Base, ClassifierMixin): >>> cls.fit(X,y) MulticlassClassifier() >>> cls.predict(X) - array([2, 0, 2, 2, 2, 1, 1, 0, 1, 1]) + array([1, 1, 1, 1, 1, 1, 2, 1, 1, 2]) Parameters ---------- @@ -228,7 +228,7 @@ class OneVsRestClassifier(MulticlassClassifier): >>> cls.fit(X,y) OneVsRestClassifier() >>> cls.predict(X) - array([2, 0, 2, 2, 2, 1, 1, 0, 1, 1]) + array([1, 1, 1, 1, 1, 1, 2, 1, 1, 2]) Parameters @@ -303,7 +303,7 @@ class OneVsOneClassifier(MulticlassClassifier): >>> cls.fit(X,y) OneVsOneClassifier() >>> cls.predict(X) - array([2, 0, 2, 2, 2, 1, 1, 0, 1, 1]) + array([1, 1, 1, 1, 1, 1, 2, 1, 1, 2]) Parameters ---------- diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index b08eed8163..9d44d73a34 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -253,11 +253,11 @@ class NearestNeighbors(UniversalBase, >>> # print results >>> print(indices) 0 1 2 - 0 0 1 3 - 1 1 0 2 + 0 0 3 1 + 1 1 3 0 2 2 4 0 - 3 3 0 2 - 4 4 2 3 + 3 3 0 1 + 4 4 2 0 >>> print(distances) # doctest: +SKIP 0 1 2 0 0.007812 24.786566 26.399996 diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 46500b766a..01264572e7 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -203,10 +203,21 @@ class OneHotEncoder(BaseEncoder): - dict/list : ``drop[col]`` is the category in feature col that should be dropped. - sparse : bool, default=True + sparse_output : bool, default=True This feature is not fully supported by cupy yet, causing incorrect values when computing one hot encodings. See https://github.com/cupy/cupy/issues/3223 + + .. versionadded:: 24.06 + `sparse` was renamed to `sparse_output` + + sparse : bool, default=True + Will return sparse matrix if set True else will return an array. + + .. deprecated:: 24.06 + `sparse` is deprecated in 24.06 and will be removed in 25.08. Use + `sparse_output` instead. + dtype : number type, default=np.float Desired datatype of transform's output. 
handle_unknown : {'error', 'ignore'}, default='error' @@ -246,7 +257,8 @@ def __init__( *, categories="auto", drop=None, - sparse=True, + sparse="deprecated", + sparse_output=True, dtype=np.float32, handle_unknown="error", handle=None, @@ -257,7 +269,9 @@ def __init__( handle=handle, verbose=verbose, output_type=output_type ) self.categories = categories + # TODO(24.08): Remove self.sparse self.sparse = sparse + self.sparse_output = sparse_output self.dtype = dtype self.handle_unknown = handle_unknown self.drop = drop @@ -266,10 +280,14 @@ def __init__( self._features = None self._encoders = None self.input_type = None - if sparse and np.dtype(dtype) not in ["f", "d", "F", "D"]: + # This parameter validation should be performed in `fit` instead + # of in the constructor. Hence the awkwark `if` clause + if ((sparse != "deprecated" and sparse) or sparse_output) and np.dtype( + dtype + ) not in ["f", "d", "F", "D"]: raise ValueError( "Only float32, float64, complex64 and complex128 " - "are supported when using sparse" + "are supported when using sparse_output" ) def _validate_keywords(self): @@ -289,6 +307,17 @@ def _validate_keywords(self): "zero." ) + if self.sparse != "deprecated": + warnings.warn( + ( + "`sparse` was renamed to `sparse_output` in version 24.06" + " and will be removed in 25.08. `sparse_output` is ignored" + " unless you leave `sparse` set to its default value." + ), + FutureWarning, + ) + self.sparse_output = self.sparse + def _check_is_fitted(self): if not self._fitted: msg = ( @@ -440,7 +469,7 @@ def transform(self, X): (val, (rows, cols)), shape=(len(X), j), dtype=self.dtype ) - if not self.sparse: + if not self.sparse_output: ohe = ohe.toarray() return ohe @@ -578,6 +607,7 @@ def get_param_names(self): "categories", "drop", "sparse", + "sparse_output", "dtype", "handle_unknown", ] diff --git a/python/cuml/preprocessing/label.py b/python/cuml/preprocessing/label.py index 78daee603e..c34a9d9249 100644 --- a/python/cuml/preprocessing/label.py +++ b/python/cuml/preprocessing/label.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -64,14 +64,19 @@ def label_binarize( cp.cuda.Stream.null.synchronize() + is_binary = classes.shape[0] == 2 + if sparse_output: sp = sp.tocsr() + if is_binary: + sp = sp.getcol(1) # getcol does not support -1 indexing return sp else: arr = sp.toarray().astype(y.dtype) arr[arr == 0] = neg_label - + if is_binary: + arr = arr[:, -1].reshape((-1, 1)) return arr diff --git a/python/cuml/tests/conftest.py b/python/cuml/tests/conftest.py index 76e23339de..1522b27983 100644 --- a/python/cuml/tests/conftest.py +++ b/python/cuml/tests/conftest.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
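For OneHotEncoder callers the rename mirrors scikit-learn: pass sparse_output going forward, and treat sparse as a deprecated alias that is still honored (a FutureWarning is raised when the encoder is fitted) until 25.08. A minimal sketch:

import cudf
from cuml.preprocessing import OneHotEncoder

X = cudf.DataFrame({"color": ["red", "green", "red"]})

enc = OneHotEncoder(sparse_output=False)   # preferred spelling from 24.06 on
dense = enc.fit_transform(X)               # 3 x 2 dense one-hot array

legacy = OneHotEncoder(sparse=False)       # deprecated alias, same behavior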
@@ -30,6 +30,7 @@ import os import subprocess import pandas as pd +import cudf.pandas from cuml.internals.safe_imports import cpu_only_import @@ -169,6 +170,10 @@ def pytest_collection_modifyitems(config, items): def pytest_configure(config): + config.addinivalue_line( + "markers", + "cudf_pandas: mark test as requiring the cudf.pandas wrapper", + ) cp.cuda.set_allocator(None) # max_gpu_memory: Capacity of the GPU memory in GB pytest.max_gpu_memory = get_gpu_memory() @@ -186,6 +191,16 @@ def pytest_configure(config): hypothesis.settings.load_profile("unit") +def pytest_pyfunc_call(pyfuncitem): + """Skip tests that require the cudf.pandas accelerator + + Tests marked with `@pytest.mark.cudf_pandas` will only be run if the + cudf.pandas accelerator is enabled via the `cudf.pandas` plugin. + """ + if "cudf_pandas" in pyfuncitem.keywords and not cudf.pandas.LOADED: + pytest.skip("Test requires cudf.pandas accelerator") + + @pytest.fixture(scope="module") def nlp_20news(): try: diff --git a/python/cuml/tests/dask/conftest.py b/python/cuml/tests/dask/conftest.py index 3c6311dc03..27fb746e1c 100644 --- a/python/cuml/tests/dask/conftest.py +++ b/python/cuml/tests/dask/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import pytest @@ -34,18 +34,8 @@ def client(cluster): @pytest.fixture(scope="module") def ucx_cluster(): - initialize.initialize( - create_cuda_context=True, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband, - ) cluster = LocalCUDACluster( protocol="ucx", - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband, - worker_class=IncreasedCloseTimeoutNanny, ) yield cluster cluster.close() @@ -57,3 +47,62 @@ def ucx_client(ucx_cluster): client = Client(ucx_cluster) yield client client.close() + + +@pytest.fixture(scope="module") +def ucxx_cluster(): + cluster = LocalCUDACluster( + protocol="ucxx", + worker_class=IncreasedCloseTimeoutNanny, + ) + yield cluster + cluster.close() + + +@pytest.fixture(scope="function") +def ucxx_client(ucxx_cluster): + pytest.importorskip("distributed_ucxx") + + client = Client(ucxx_cluster) + yield client + client.close() + + +def pytest_addoption(parser): + group = parser.getgroup("Dask cuML Custom Options") + + group.addoption( + "--run_ucx", action="store_true", help="run _only_ UCX-Py tests" + ) + + group.addoption( + "--run_ucxx", action="store_true", help="run _only_ UCXX tests" + ) + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--run_ucx"): + skip_others = pytest.mark.skip( + reason="only runs when --run_ucx is not specified" + ) + for item in items: + if "ucx" not in item.keywords: + item.add_marker(skip_others) + else: + skip_ucx = pytest.mark.skip(reason="requires --run_ucx to run") + for item in items: + if "ucx" in item.keywords: + item.add_marker(skip_ucx) + + if config.getoption("--run_ucxx"): + skip_others = pytest.mark.skip( + reason="only runs when --run_ucxx is not specified" + ) + for item in items: + if "ucxx" not in item.keywords: + item.add_marker(skip_others) + else: + skip_ucxx = pytest.mark.skip(reason="requires --run_ucxx to run") + for item in items: + if "ucxx" in item.keywords: + item.add_marker(skip_ucxx) diff --git a/python/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/tests/dask/test_dask_logistic_regression.py index 4fe7504ce4..89814365e1 100644 --- 
a/python/cuml/tests/dask/test_dask_logistic_regression.py +++ b/python/cuml/tests/dask/test_dask_logistic_regression.py @@ -187,7 +187,7 @@ def imp(): @pytest.mark.mg @pytest.mark.parametrize("n_parts", [2]) -@pytest.mark.parametrize("datatype", [np.float32]) +@pytest.mark.parametrize("datatype", [np.float32, np.float64]) def test_lbfgs_toy(n_parts, datatype, client): def imp(): import cuml.comm.serialize # NOQA @@ -217,16 +217,7 @@ def imp(): from numpy.testing import assert_array_equal assert_array_equal(preds, y, strict=True) - - # assert error on float64 - X = X.astype(np.float64) - y = y.astype(np.float64) - X_df, y_df = _prep_training_data(client, X, y, n_parts) - with pytest.raises( - RuntimeError, - match="dtypes other than float32 are currently not supported yet. See issue: https://github.com/rapidsai/cuml/issues/5589", - ): - lr.fit(X_df, y_df) + assert lr.dtype == datatype def test_lbfgs_init(client): @@ -303,14 +294,7 @@ def assert_params( ) -@pytest.mark.mg -@pytest.mark.parametrize("nrows", [1e5]) -@pytest.mark.parametrize("ncols", [20]) -@pytest.mark.parametrize("n_parts", [2, 23]) -@pytest.mark.parametrize("fit_intercept", [False, True]) -@pytest.mark.parametrize("datatype", [np.float32]) -@pytest.mark.parametrize("delayed", [True, False]) -def test_lbfgs( +def _test_lbfgs( nrows, ncols, n_parts, @@ -428,14 +412,35 @@ def array_to_numpy(ary): return lr +@pytest.mark.mg +@pytest.mark.parametrize("n_parts", [2, 23]) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("delayed", [True, False]) +def test_lbfgs(n_parts, fit_intercept, delayed, client): + datatype = np.float32 if fit_intercept else np.float64 + + lr = _test_lbfgs( + nrows=1e5, + ncols=20, + n_parts=n_parts, + fit_intercept=fit_intercept, + datatype=datatype, + delayed=delayed, + client=client, + ) + + assert lr.dtype == datatype + + @pytest.mark.parametrize("fit_intercept", [False, True]) def test_noreg(fit_intercept, client): - lr = test_lbfgs( + datatype = np.float64 if fit_intercept else np.float32 + lr = _test_lbfgs( nrows=1e5, ncols=20, n_parts=23, fit_intercept=fit_intercept, - datatype=np.float32, + datatype=datatype, delayed=True, client=client, penalty="none", @@ -449,6 +454,8 @@ def test_noreg(fit_intercept, client): assert l1_strength == 0.0 assert l2_strength == 0.0 + assert lr.dtype == datatype + def test_n_classes_small(client): def assert_small(X, y, n_classes): @@ -493,13 +500,14 @@ def assert_small(X, y, n_classes): @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("n_classes", [8]) def test_n_classes(n_parts, fit_intercept, n_classes, client): + datatype = np.float32 if fit_intercept else np.float64 nrows = int(1e5) if n_classes < 5 else int(2e5) - lr = test_lbfgs( + lr = _test_lbfgs( nrows=nrows, ncols=20, n_parts=n_parts, fit_intercept=fit_intercept, - datatype=np.float32, + datatype=datatype, delayed=True, client=client, penalty="l2", @@ -507,17 +515,18 @@ def test_n_classes(n_parts, fit_intercept, n_classes, client): ) assert lr._num_classes == n_classes + assert lr.dtype == datatype @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [False, True]) -@pytest.mark.parametrize("datatype", [np.float32]) @pytest.mark.parametrize("delayed", [True]) @pytest.mark.parametrize("n_classes", [2, 8]) @pytest.mark.parametrize("C", [1.0, 10.0]) -def test_l1(fit_intercept, datatype, delayed, n_classes, C, client): +def test_l1(fit_intercept, delayed, n_classes, C, client): + datatype = np.float64 if fit_intercept else np.float32 nrows = 
int(1e5) if n_classes < 5 else int(2e5) - lr = test_lbfgs( + lr = _test_lbfgs( nrows=nrows, ncols=20, n_parts=2, @@ -534,18 +543,22 @@ def test_l1(fit_intercept, datatype, delayed, n_classes, C, client): assert l1_strength == 1.0 / lr.C assert l2_strength == 0.0 + assert lr.dtype == datatype + @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [False, True]) -@pytest.mark.parametrize("datatype", [np.float32]) +@pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("delayed", [True]) @pytest.mark.parametrize("n_classes", [2, 8]) @pytest.mark.parametrize("l1_ratio", [0.2, 0.8]) def test_elasticnet( fit_intercept, datatype, delayed, n_classes, l1_ratio, client ): + datatype = np.float32 if fit_intercept else np.float64 + nrows = int(1e5) if n_classes < 5 else int(2e5) - lr = test_lbfgs( + lr = _test_lbfgs( nrows=nrows, ncols=20, n_parts=2, @@ -564,28 +577,28 @@ def test_elasticnet( assert l1_strength == lr.l1_ratio * strength assert l2_strength == (1.0 - lr.l1_ratio) * strength + assert lr.dtype == datatype + @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize( - "regularization", + "reg_dtype", [ - ("none", 1.0, None), - ("l2", 2.0, None), - ("l1", 2.0, None), - ("elasticnet", 2.0, 0.2), + (("none", 1.0, None), np.float32), + (("l2", 2.0, None), np.float64), + (("l1", 2.0, None), np.float32), + (("elasticnet", 2.0, 0.2), np.float64), ], ) -@pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("n_classes", [2, 8]) -def test_sparse_from_dense( - fit_intercept, regularization, datatype, n_classes, client -): - penalty, C, l1_ratio = regularization +def test_sparse_from_dense(fit_intercept, reg_dtype, n_classes, client): + penalty, C, l1_ratio = reg_dtype[0] + datatype = reg_dtype[1] nrows = int(1e5) if n_classes < 5 else int(2e5) run_test = partial( - test_lbfgs, + _test_lbfgs, nrows=nrows, ncols=20, n_parts=2, @@ -600,17 +613,11 @@ def test_sparse_from_dense( convert_to_sparse=True, ) - if datatype == np.float32: - run_test() - else: - with pytest.raises( - RuntimeError, - match="dtypes other than float32 are currently not supported", - ): - run_test() + lr = run_test() + assert lr.dtype == datatype -@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_sparse_nlp20news(dtype, nlp_20news, client): X, y = nlp_20news @@ -677,21 +684,22 @@ def test_exception_one_label(fit_intercept, client): @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize( - "regularization", + "reg_dtype", [ - ("none", 1.0, None), - ("l2", 2.0, None), - ("l1", 2.0, None), - ("elasticnet", 2.0, 0.2), + (("none", 1.0, None), np.float64), + (("l2", 2.0, None), np.float32), + (("l1", 2.0, None), np.float64), + (("elasticnet", 2.0, 0.2), np.float32), ], ) -@pytest.mark.parametrize("datatype", [np.float32]) @pytest.mark.parametrize("delayed", [False]) @pytest.mark.parametrize("n_classes", [2, 8]) def test_standardization_on_normal_dataset( - fit_intercept, regularization, datatype, delayed, n_classes, client + fit_intercept, reg_dtype, delayed, n_classes, client ): + regularization = reg_dtype[0] + datatype = reg_dtype[1] penalty = regularization[0] C = regularization[1] l1_ratio = regularization[2] @@ -699,7 +707,7 @@ def test_standardization_on_normal_dataset( nrows = int(1e5) if n_classes < 5 else int(2e5) # test correctness compared with scikit-learn - test_lbfgs( + lr = _test_lbfgs( nrows=nrows, 
ncols=20, n_parts=2, @@ -713,26 +721,29 @@ def test_standardization_on_normal_dataset( l1_ratio=l1_ratio, standardization=True, ) + assert lr.dtype == datatype @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize( - "regularization", + "reg_dtype", [ - ("none", 1.0, None), - ("l2", 2.0, None), - ("l1", 2.0, None), - ("elasticnet", 2.0, 0.2), + (("none", 1.0, None), np.float32), + (("l2", 2.0, None), np.float32), + (("l1", 2.0, None), np.float64), + (("elasticnet", 2.0, 0.2), np.float64), ], ) -@pytest.mark.parametrize("datatype", [np.float32]) @pytest.mark.parametrize("delayed", [False]) @pytest.mark.parametrize("ncol_and_nclasses", [(2, 2), (6, 4), (100, 10)]) def test_standardization_on_scaled_dataset( - fit_intercept, regularization, datatype, delayed, ncol_and_nclasses, client + fit_intercept, reg_dtype, delayed, ncol_and_nclasses, client ): + regularization = reg_dtype[0] + datatype = reg_dtype[1] + penalty = regularization[0] C = regularization[1] l1_ratio = regularization[2] @@ -887,25 +898,30 @@ def to_dask_data(X_train, X_test, y_train, y_test): total_tol=tolerance, ) + assert mgon.dtype == datatype + assert mgoff.dtype == datatype + @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( - "regularization", + "reg_dtype", [ - ("none", 1.0, None), - ("l2", 2.0, None), - ("l1", 2.0, None), - ("elasticnet", 2.0, 0.2), + ((None, 1.0, None), np.float64), + (("l2", 2.0, None), np.float64), + (("l1", 2.0, None), np.float32), + (("elasticnet", 2.0, 0.2), np.float32), ], ) -def test_standardization_example(fit_intercept, regularization, client): +def test_standardization_example(fit_intercept, reg_dtype, client): + regularization = reg_dtype[0] + datatype = reg_dtype[1] + n_rows = int(1e5) n_cols = 20 n_info = 10 n_classes = 4 - datatype = np.float32 n_parts = 2 max_iter = 5 # cannot set this too large. Observed GPU-specific coefficients when objective converges at 0. @@ -980,19 +996,25 @@ def test_standardization_example(fit_intercept, regularization, client): total_tol=tolerance, ) + assert lr_on.dtype == datatype + assert lr_off.dtype == datatype + @pytest.mark.mg @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( - "regularization", + "reg_dtype", [ - ("none", 1.0, None), - ("l2", 2.0, None), - ("l1", 2.0, None), - ("elasticnet", 2.0, 0.2), + ((None, 1.0, None), np.float64), + (("l2", 2.0, None), np.float32), + (("l1", 2.0, None), np.float64), + (("elasticnet", 2.0, 0.2), np.float32), ], ) -def test_standardization_sparse(fit_intercept, regularization, client): +def test_standardization_sparse(fit_intercept, reg_dtype, client): + regularization = reg_dtype[0] + datatype = reg_dtype[1] + n_rows = 10000 n_cols = 25 n_info = 15 @@ -1000,7 +1022,6 @@ def test_standardization_sparse(fit_intercept, regularization, client): nnz = int(n_rows * n_cols * 0.3) # number of non-zero values tolerance = 0.005 - datatype = np.float32 n_parts = 10 max_iter = 5 # cannot set this too large. Observed GPU-specific coefficients when objective converges at 0. 
@@ -1080,3 +1101,5 @@ def make_classification_with_nnz( assert array_equal( lron_intercept_origin, sg.intercept_, unit_tol=tolerance ) + + assert lr_on.dtype == datatype diff --git a/python/cuml/tests/dask/test_dask_nearest_neighbors.py b/python/cuml/tests/dask/test_dask_nearest_neighbors.py index 9dbd4dc010..2915538bda 100644 --- a/python/cuml/tests/dask/test_dask_nearest_neighbors.py +++ b/python/cuml/tests/dask/test_dask_nearest_neighbors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -81,24 +81,7 @@ def _scale_rows(client, nrows): return n_workers * nrows -@pytest.mark.parametrize( - "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)] -) -@pytest.mark.parametrize("ncols", [10, 30]) -@pytest.mark.parametrize( - "nclusters", [unit_param(5), quality_param(10), stress_param(15)] -) -@pytest.mark.parametrize( - "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)] -) -@pytest.mark.parametrize( - "n_parts", - [unit_param(1), unit_param(5), quality_param(7), stress_param(50)], -) -@pytest.mark.parametrize( - "streams_per_handle,reverse_worker_order", [(5, True), (10, False)] -) -def test_compare_skl( +def _test_compare_skl( nrows, ncols, nclusters, @@ -106,8 +89,10 @@ def test_compare_skl( n_neighbors, streams_per_handle, reverse_worker_order, - client, + dask_client, + request, ): + client = request.getfixturevalue(dask_client) from cuml.dask.neighbors import NearestNeighbors as daskNN @@ -162,11 +147,130 @@ def test_compare_skl( assert array_equal(y_hat, skl_y_hat) -@pytest.mark.parametrize("nrows", [unit_param(1000), stress_param(1e5)]) -@pytest.mark.parametrize("ncols", [unit_param(10), stress_param(500)]) -@pytest.mark.parametrize("n_parts", [unit_param(10), stress_param(100)]) -@pytest.mark.parametrize("batch_size", [unit_param(100), stress_param(1e3)]) -def test_batch_size(nrows, ncols, n_parts, batch_size, client): +@pytest.mark.parametrize( + "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)] +) +@pytest.mark.parametrize("ncols", [10, 30]) +@pytest.mark.parametrize( + "nclusters", [unit_param(5), quality_param(10), stress_param(15)] +) +@pytest.mark.parametrize( + "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)] +) +@pytest.mark.parametrize( + "n_parts", + [unit_param(1), unit_param(5), quality_param(7), stress_param(50)], +) +@pytest.mark.parametrize( + "streams_per_handle,reverse_worker_order", [(5, True), (10, False)] +) +def test_compare_skl( + nrows, + ncols, + nclusters, + n_parts, + n_neighbors, + streams_per_handle, + reverse_worker_order, + request, +): + _test_compare_skl( + nrows, + ncols, + nclusters, + n_parts, + n_neighbors, + streams_per_handle, + reverse_worker_order, + "client", + request, + ) + + +@pytest.mark.parametrize( + "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)] +) +@pytest.mark.parametrize("ncols", [10, 30]) +@pytest.mark.parametrize( + "nclusters", [unit_param(5), quality_param(10), stress_param(15)] +) +@pytest.mark.parametrize( + "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)] +) +@pytest.mark.parametrize( + "n_parts", + [unit_param(1), unit_param(5), quality_param(7), stress_param(50)], +) +@pytest.mark.parametrize( + "streams_per_handle,reverse_worker_order", [(5, True), (10, False)] +) +@pytest.mark.ucx +def test_compare_skl_ucx( + nrows, + 
ncols, + nclusters, + n_parts, + n_neighbors, + streams_per_handle, + reverse_worker_order, + request, +): + _test_compare_skl( + nrows, + ncols, + nclusters, + n_parts, + n_neighbors, + streams_per_handle, + reverse_worker_order, + "ucx_client", + request, + ) + + +@pytest.mark.parametrize( + "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)] +) +@pytest.mark.parametrize("ncols", [10, 30]) +@pytest.mark.parametrize( + "nclusters", [unit_param(5), quality_param(10), stress_param(15)] +) +@pytest.mark.parametrize( + "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)] +) +@pytest.mark.parametrize( + "n_parts", + [unit_param(1), unit_param(5), quality_param(7), stress_param(50)], +) +@pytest.mark.parametrize( + "streams_per_handle,reverse_worker_order", [(5, True), (10, False)] +) +@pytest.mark.ucxx +def test_compare_skl_ucxx( + nrows, + ncols, + nclusters, + n_parts, + n_neighbors, + streams_per_handle, + reverse_worker_order, + request, +): + _test_compare_skl( + nrows, + ncols, + nclusters, + n_parts, + n_neighbors, + streams_per_handle, + reverse_worker_order, + "ucxx_client", + request, + ) + + +def _test_batch_size(nrows, ncols, n_parts, batch_size, dask_client, request): + client = request.getfixturevalue(dask_client) n_neighbors = 10 n_clusters = 5 @@ -202,7 +306,34 @@ def test_batch_size(nrows, ncols, n_parts, batch_size, client): assert array_equal(y_hat, y) -def test_return_distance(client): +@pytest.mark.parametrize("nrows", [unit_param(1000), stress_param(1e5)]) +@pytest.mark.parametrize("ncols", [unit_param(10), stress_param(500)]) +@pytest.mark.parametrize("n_parts", [unit_param(10), stress_param(100)]) +@pytest.mark.parametrize("batch_size", [unit_param(100), stress_param(1e3)]) +def test_batch_size(nrows, ncols, n_parts, batch_size, request): + _test_batch_size(nrows, ncols, n_parts, batch_size, "client", request) + + +@pytest.mark.parametrize("nrows", [unit_param(1000), stress_param(1e5)]) +@pytest.mark.parametrize("ncols", [unit_param(10), stress_param(500)]) +@pytest.mark.parametrize("n_parts", [unit_param(10), stress_param(100)]) +@pytest.mark.parametrize("batch_size", [unit_param(100), stress_param(1e3)]) +@pytest.mark.ucx +def test_batch_size_ucx(nrows, ncols, n_parts, batch_size, request): + _test_batch_size(nrows, ncols, n_parts, batch_size, "ucx_client", request) + + +@pytest.mark.parametrize("nrows", [unit_param(1000), stress_param(1e5)]) +@pytest.mark.parametrize("ncols", [unit_param(10), stress_param(500)]) +@pytest.mark.parametrize("n_parts", [unit_param(10), stress_param(100)]) +@pytest.mark.parametrize("batch_size", [unit_param(100), stress_param(1e3)]) +@pytest.mark.ucxx +def test_batch_size_ucxx(nrows, ncols, n_parts, batch_size, request): + _test_batch_size(nrows, ncols, n_parts, batch_size, "ucxx_client", request) + + +def _test_return_distance(dask_client, request): + client = request.getfixturevalue(dask_client) n_samples = 50 n_feats = 50 @@ -233,7 +364,22 @@ def test_return_distance(client): assert len(ret) == 2 -def test_default_n_neighbors(client): +def test_return_distance(request): + _test_return_distance("client", request) + + +@pytest.mark.ucx +def test_return_distance_ucx(request): + _test_return_distance("ucx_client", request) + + +@pytest.mark.ucxx +def test_return_distance_ucxx(request): + _test_return_distance("ucxx_client", request) + + +def _test_default_n_neighbors(dask_client, request): + client = request.getfixturevalue(dask_client) n_samples = 50 n_feats = 50 @@ -269,7 +415,23 @@ def 
test_default_n_neighbors(client): assert ret.shape[1] == k -def test_one_query_partition(client): +def test_default_n_neighbors(request): + _test_default_n_neighbors("client", request) + + +@pytest.mark.ucx +def test_default_n_neighbors_ucx(request): + _test_default_n_neighbors("ucx_client", request) + + +@pytest.mark.ucxx +def test_default_n_neighbors_ucxx(request): + _test_default_n_neighbors("ucxx_client", request) + + +def _test_one_query_partition(dask_client, request): + client = request.getfixturevalue(dask_client) # noqa + from cuml.dask.neighbors import NearestNeighbors as daskNN from cuml.dask.datasets import make_blobs @@ -280,3 +442,17 @@ def test_one_query_partition(client): cumlModel = daskNN(n_neighbors=4) cumlModel.fit(X_train) cumlModel.kneighbors(X_test) + + +def test_one_query_partition(request): + _test_one_query_partition("client", request) + + +@pytest.mark.ucx +def test_one_query_partition_ucx(request): + _test_one_query_partition("ucx_client", request) + + +@pytest.mark.ucxx +def test_one_query_partition_ucxx(request): + _test_one_query_partition("ucxx_client", request) diff --git a/python/cuml/tests/dask/test_dask_one_hot_encoder.py b/python/cuml/tests/dask/test_dask_one_hot_encoder.py index e4d76bd470..64ba9715bc 100644 --- a/python/cuml/tests/dask/test_dask_one_hot_encoder.py +++ b/python/cuml/tests/dask/test_dask_one_hot_encoder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -41,8 +41,8 @@ def test_onehot_vs_skonehot(client): skX = from_df_to_numpy(X) X = dask_cudf.from_cudf(X, npartitions=2) - enc = OneHotEncoder(sparse=False) - skohe = SkOneHotEncoder(sparse=False) + enc = OneHotEncoder(sparse_output=False) + skohe = SkOneHotEncoder(sparse_output=False) ohe = enc.fit_transform(X) ref = skohe.fit_transform(skX) @@ -71,7 +71,7 @@ def test_onehot_categories(client): X = DataFrame({"chars": ["a", "b"], "int": [0, 2]}) X = dask_cudf.from_cudf(X, npartitions=2) cats = DataFrame({"chars": ["a", "b", "c"], "int": [0, 1, 2]}) - enc = OneHotEncoder(categories=cats, sparse=False) + enc = OneHotEncoder(categories=cats, sparse_output=False) ref = cp.array( [[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]] ) @@ -100,12 +100,12 @@ def test_onehot_transform_handle_unknown(client): X = dask_cudf.from_cudf(X, npartitions=2) Y = dask_cudf.from_cudf(Y, npartitions=2) - enc = OneHotEncoder(handle_unknown="error", sparse=False) + enc = OneHotEncoder(handle_unknown="error", sparse_output=False) enc = enc.fit(X) with pytest.raises(KeyError): enc.transform(Y).compute() - enc = OneHotEncoder(handle_unknown="ignore", sparse=False) + enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False) enc = enc.fit(X) ohe = enc.transform(Y) ref = cp.array([[0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]]) @@ -140,8 +140,10 @@ def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples): else: dX = dask_cudf.from_cudf(X, npartitions=1) - enc = OneHotEncoder(sparse=sparse, drop=drop, categories="auto") - sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories="auto") + enc = OneHotEncoder(sparse_output=sparse, drop=drop, categories="auto") + sk_enc = SkOneHotEncoder( + sparse_output=sparse, drop=drop, categories="auto" + ) ohe = enc.fit_transform(dX) ref = sk_enc.fit_transform(ary) if sparse: @@ -159,8 +161,8 @@ def test_onehot_drop_idx_first(client): X = 
DataFrame({"chars": ["c", "b"], "int": [2, 2], "letters": ["a", "b"]}) ddf = dask_cudf.from_cudf(X, npartitions=2) - enc = OneHotEncoder(sparse=False, drop="first") - sk_enc = SkOneHotEncoder(sparse=False, drop="first") + enc = OneHotEncoder(sparse_output=False, drop="first") + sk_enc = SkOneHotEncoder(sparse_output=False, drop="first") ohe = enc.fit_transform(ddf) ref = sk_enc.fit_transform(X_ary) cp.testing.assert_array_equal(ohe.compute(), ref) @@ -177,8 +179,8 @@ def test_onehot_drop_one_of_each(client): ddf = dask_cudf.from_cudf(X, npartitions=2) drop = dict({"chars": "b", "int": 2, "letters": "b"}) - enc = OneHotEncoder(sparse=False, drop=drop) - sk_enc = SkOneHotEncoder(sparse=False, drop=["b", 2, "b"]) + enc = OneHotEncoder(sparse_output=False, drop=drop) + sk_enc = SkOneHotEncoder(sparse_output=False, drop=["b", 2, "b"]) ohe = enc.fit_transform(ddf) ref = sk_enc.fit_transform(X_ary) cp.testing.assert_array_equal(ohe.compute(), ref) @@ -212,7 +214,7 @@ def test_onehot_drop_exceptions(client, drop, pattern): X = dask_cudf.from_cudf(X, npartitions=2) with pytest.raises(ValueError, match=pattern): - OneHotEncoder(sparse=False, drop=drop).fit(X) + OneHotEncoder(sparse_output=False, drop=drop).fit(X) @pytest.mark.mg diff --git a/python/cuml/tests/dask/test_dask_random_forest.py b/python/cuml/tests/dask/test_dask_random_forest.py index c35f5ab21e..38596b2e69 100644 --- a/python/cuml/tests/dask/test_dask_random_forest.py +++ b/python/cuml/tests/dask/test_dask_random_forest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ # -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -170,7 +170,7 @@ def test_rf_regression_dask_fil(partitions_per_worker, dtype, client): cuml_mod_predict = cuml_mod.predict(X_test_df) cuml_mod_predict = cp.asnumpy(cp.array(cuml_mod_predict.compute())) - acc_score = r2_score(cuml_mod_predict, y_test) + acc_score = r2_score(y_test, cuml_mod_predict) assert acc_score >= 0.59 @@ -256,7 +256,7 @@ def test_rf_regression_dask_cpu(partitions_per_worker, client): cuml_mod_predict = cuml_mod.predict(X_test, predict_model="CPU") - acc_score = r2_score(cuml_mod_predict, y_test) + acc_score = r2_score(y_test, cuml_mod_predict) assert acc_score >= 0.67 @@ -711,7 +711,7 @@ def test_rf_broadcast(model_type, fit_broadcast, transform_broadcast, client): cuml_mod_predict = cuml_mod_predict.compute() cuml_mod_predict = cp.asnumpy(cuml_mod_predict) - acc_score = r2_score(cuml_mod_predict, y_test) + acc_score = r2_score(y_test, cuml_mod_predict) assert acc_score >= 0.72 if transform_broadcast: diff --git a/python/cuml/tests/dask/test_dask_serialization.py b/python/cuml/tests/dask/test_dask_serialization.py index 2eb9a35060..3dc819b24f 100644 --- a/python/cuml/tests/dask/test_dask_serialization.py +++ b/python/cuml/tests/dask/test_dask_serialization.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -73,7 +73,7 @@ def test_serialize_mnmg_model(client): X, y = make_regression(n_samples=1000, n_features=20, random_state=0) X, y = da.from_array(X), da.from_array(y) - model = LinearRegression(client) + model = LinearRegression(client=client) model.fit(X, y) pickled_model = pickle.dumps(model) diff --git a/python/cuml/tests/explainer/test_explainer_kernel_shap.py b/python/cuml/tests/explainer/test_explainer_kernel_shap.py index 74c985989f..3f20b7d8a5 100644 --- a/python/cuml/tests/explainer/test_explainer_kernel_shap.py +++ b/python/cuml/tests/explainer/test_explainer_kernel_shap.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -522,24 +522,24 @@ def test_typeerror_input(): housing_regression_result = np.array( [ [ - -0.73860609, - 0.00557072, - -0.05829297, - -0.01582018, - -0.01010366, - -0.23167623, - -0.470639, - -0.07584473, + -0.00182223, + -0.01232004, + -0.4782278, + 0.04781425, + -0.01337761, + -0.34830606, + -0.4682865, + -0.20812261, ], [ - -0.6410764, - 0.01369913, - -0.09492759, - 0.02654463, - -0.00911134, - -0.05953105, - -0.51266433, - -0.0853608, + -0.0013606, + 0.0110372, + -0.445176, + -0.08268094, + 0.00406259, + -0.02185595, + -0.47673094, + -0.13557231, ], ], dtype=np.float32, diff --git a/python/cuml/tests/test_agglomerative.py b/python/cuml/tests/test_agglomerative.py index c415a6b3c2..7c71be02ec 100644 --- a/python/cuml/tests/test_agglomerative.py +++ b/python/cuml/tests/test_agglomerative.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,14 +33,14 @@ def test_duplicate_distances(connectivity): cuml_agg = AgglomerativeClustering( n_clusters=2, - affinity="euclidean", + metric="euclidean", linkage="single", n_neighbors=3, connectivity=connectivity, ) sk_agg = cluster.AgglomerativeClustering( - n_clusters=2, affinity="euclidean", linkage="single" + n_clusters=2, metric="euclidean", linkage="single" ) cuml_agg.fit(X) @@ -64,7 +64,7 @@ def test_single_linkage_sklearn_compare( cuml_agg = AgglomerativeClustering( n_clusters=nclusters, - affinity="euclidean", + metric="euclidean", linkage="single", n_neighbors=k, connectivity=connectivity, @@ -73,7 +73,7 @@ def test_single_linkage_sklearn_compare( cuml_agg.fit(X) sk_agg = cluster.AgglomerativeClustering( - n_clusters=nclusters, affinity="euclidean", linkage="single" + n_clusters=nclusters, metric="euclidean", linkage="single" ) sk_agg.fit(cp.asnumpy(X)) @@ -87,9 +87,9 @@ def test_single_linkage_sklearn_compare( def test_invalid_inputs(): - # Test bad affinity + # Test bad metric with pytest.raises(ValueError): - AgglomerativeClustering(affinity="doesntexist") + AgglomerativeClustering(metric="doesntexist") with pytest.raises(ValueError): AgglomerativeClustering(linkage="doesntexist") @@ -108,3 +108,23 @@ def test_invalid_inputs(): with pytest.raises(ValueError): AgglomerativeClustering(n_clusters=500).fit(cp.ones((2, 5))) + + +def test_affinity_deprecation(): + X = cp.array([[1.0, 2], [3, 4]]) + y = cp.array([1, 0]) + + agg = AgglomerativeClustering(affinity="euclidean") + with pytest.warns( + FutureWarning, + match="Attribute `affinity` was deprecated in version 24.06", + ): + agg.fit(X, y) + + # don't provide both + agg = AgglomerativeClustering(affinity="euclidean", metric="euclidean") + with pytest.raises( + ValueError, + match="Both `affinity` and `metric` attributes were set", + ): + agg.fit(X, y) diff --git a/python/cuml/tests/test_api.py b/python/cuml/tests/test_api.py index 8d13499171..74adbd177d 100644 --- a/python/cuml/tests/test_api.py +++ b/python/cuml/tests/test_api.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -158,8 +158,6 @@ def test_get_tags(model): else: assert isinstance(model_tags[tag], tag_type) - return True - def test_dynamic_tags_and_composition(): static_tags = dummy_class_with_tags._get_tags() diff --git a/python/cuml/tests/test_array.py b/python/cuml/tests/test_array.py index c4c479506c..f64683717d 100644 --- a/python/cuml/tests/test_array.py +++ b/python/cuml/tests/test_array.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -251,8 +251,8 @@ def test_get_set_item(inp, indices, mem_type): _assert_equal(inp_view, ary[indices]) # Check equality after assigning to array slice. 
- ary[indices] = 1.0 - inp[indices] = 1.0 + ary[indices] = inp.dtype.type(1.0) + inp[indices] = inp.dtype.type(1.0) # We need to assume that inp is not a cudf.Series here, otherwise # ary.to_output("cupy") called by equal() will trigger a diff --git a/python/cuml/tests/test_coordinate_descent.py b/python/cuml/tests/test_coordinate_descent.py index 1711e409fe..8f6968abc7 100644 --- a/python/cuml/tests/test_coordinate_descent.py +++ b/python/cuml/tests/test_coordinate_descent.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ @pytest.mark.parametrize("datatype", [np.float32, np.float64]) -@pytest.mark.parametrize("X_type", ["ndarray"]) @pytest.mark.parametrize("alpha", [0.1, 0.001]) @pytest.mark.parametrize("algorithm", ["cyclic", "random"]) @pytest.mark.parametrize( @@ -42,7 +41,7 @@ ], ) @pytest.mark.filterwarnings("ignore:Objective did not converge::sklearn[.*]") -def test_lasso(datatype, X_type, alpha, algorithm, nrows, column_info): +def test_lasso(datatype, alpha, algorithm, nrows, column_info): ncols, n_info = column_info X, y = make_regression( n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0 @@ -53,7 +52,7 @@ def test_lasso(datatype, X_type, alpha, algorithm, nrows, column_info): X, y, train_size=0.8, random_state=0 ) cu_lasso = cuLasso( - alpha=np.array([alpha]), + alpha=alpha, fit_intercept=True, max_iter=1000, selection=algorithm, @@ -165,7 +164,6 @@ def test_weighted_cd(datatype, model, fit_intercept, distribution): @pytest.mark.parametrize("datatype", [np.float32, np.float64]) -@pytest.mark.parametrize("X_type", ["ndarray"]) @pytest.mark.parametrize("alpha", [0.2, 0.7]) @pytest.mark.parametrize("algorithm", ["cyclic", "random"]) @pytest.mark.parametrize( @@ -180,7 +178,7 @@ def test_weighted_cd(datatype, model, fit_intercept, distribution): ], ) @pytest.mark.filterwarnings("ignore:Objective did not converge::sklearn[.*]") -def test_elastic_net(datatype, X_type, alpha, algorithm, nrows, column_info): +def test_elastic_net(datatype, alpha, algorithm, nrows, column_info): ncols, n_info = column_info X, y = make_regression( n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0 @@ -192,7 +190,7 @@ def test_elastic_net(datatype, X_type, alpha, algorithm, nrows, column_info): ) elastic_cu = cuElasticNet( - alpha=np.array([alpha]), + alpha=alpha, fit_intercept=True, max_iter=1000, selection=algorithm, diff --git a/python/cuml/tests/test_device_selection.py b/python/cuml/tests/test_device_selection.py index ecca397a57..e5c2d9ce1a 100644 --- a/python/cuml/tests/test_device_selection.py +++ b/python/cuml/tests/test_device_selection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -43,7 +43,7 @@ from sklearn.decomposition import PCA as skPCA from sklearn.decomposition import TruncatedSVD as skTruncatedSVD from sklearn.datasets import make_regression, make_blobs -from pytest_cases import fixture_union, fixture_plus +from pytest_cases import fixture_union, fixture from importlib import import_module import inspect import pickle @@ -195,7 +195,7 @@ def fixture_generation_helper(params): return {"scope": "session", "params": param_combis, "ids": ids} -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], @@ -232,11 +232,11 @@ def linreg_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], - "penalty": ["none", "l2"], + "penalty": [None, "l2"], "fit_intercept": [False, True], } ) @@ -274,7 +274,7 @@ def logreg_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], @@ -314,7 +314,7 @@ def lasso_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], @@ -354,7 +354,7 @@ def elasticnet_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], @@ -389,7 +389,7 @@ def ridge_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["cupy"], @@ -437,7 +437,7 @@ def umap_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], @@ -477,7 +477,7 @@ def pca_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], @@ -516,7 +516,7 @@ def tsvd_test_data(request): } -@fixture_plus( +@fixture( **fixture_generation_helper( { "input_type": ["numpy", "dataframe", "cupy", "cudf", "numba"], diff --git a/python/cuml/tests/test_incremental_pca.py b/python/cuml/tests/test_incremental_pca.py index 60df00ab73..8adc19b01f 100644 --- a/python/cuml/tests/test_incremental_pca.py +++ b/python/cuml/tests/test_incremental_pca.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -131,7 +131,7 @@ def test_partial_fit( sk_t = sk_ipca.transform(X) sk_inv = sk_ipca.inverse_transform(sk_t) - assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True) + assert array_equal(cu_inv, sk_inv, 6e-5, with_sign=True) def test_exceptions(): diff --git a/python/cuml/tests/test_input_utils.py b/python/cuml/tests/test_input_utils.py index fbbcde2105..6e4eefdf7b 100644 --- a/python/cuml/tests/test_input_utils.py +++ b/python/cuml/tests/test_input_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ # limitations under the License. 
# +import numpy as np from pandas import Series as pdSeries +from cuml.manifold import umap from cuml.internals.safe_imports import cpu_only_import_from from cuml.internals.safe_imports import gpu_only_import_from from cuml.internals.input_utils import convert_dtype @@ -24,6 +26,7 @@ from cuml.common import input_to_cuml_array, CumlArray from cuml.internals.safe_imports import cpu_only_import import pytest +import pandas as pd from cuml.internals.safe_imports import gpu_only_import @@ -442,3 +445,18 @@ def test_tocupy_missing_values_handling(): array, n_rows, n_cols, dtype = input_to_cupy_array( df, fail_on_null=True ) + + +@pytest.mark.cudf_pandas +def test_numpy_output(): + # Check that a Numpy array is used as output when a cudf.pandas wrapped + # Numpy array is passed in. + # Non regression test for issue #5784 + df = pd.DataFrame({"a": range(5), "b": range(5)}) + X = df.values + + reducer = umap.UMAP() + + # Check that this is a cudf.pandas wrapped array + assert hasattr(X, "_fsproxy_fast_type") + assert isinstance(reducer.fit_transform(X), np.ndarray) diff --git a/python/cuml/tests/test_kmeans.py b/python/cuml/tests/test_kmeans.py index 1b6060388e..b05a762177 100644 --- a/python/cuml/tests/test_kmeans.py +++ b/python/cuml/tests/test_kmeans.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -61,7 +61,7 @@ def get_data_consistency_test(): @pytest.fixture def random_state(): - random_state = random.randint(0, 1e6) + random_state = random.randint(0, 10**6) with logger.set_level(logger.level_debug): logger.debug("Random seed: {}".format(random_state)) return random_state @@ -236,7 +236,9 @@ def test_kmeans_sklearn_comparison(name, nrows, random_state): cu_y_pred = cuml_kmeans.fit_predict(X) cu_score = adjusted_rand_score(cu_y_pred, y) kmeans = cluster.KMeans( - random_state=random_state, n_clusters=params["n_clusters"] + random_state=random_state, + n_clusters=params["n_clusters"], + n_init=10, ) sk_y_pred = kmeans.fit_predict(X) sk_score = adjusted_rand_score(sk_y_pred, y) @@ -278,7 +280,9 @@ def test_kmeans_sklearn_comparison_default(name, nrows, random_state): cu_y_pred = cuml_kmeans.fit_predict(X) cu_score = adjusted_rand_score(cu_y_pred, y) kmeans = cluster.KMeans( - random_state=random_state, n_clusters=params["n_clusters"] + random_state=random_state, + n_clusters=params["n_clusters"], + n_init=10, ) sk_y_pred = kmeans.fit_predict(X) sk_score = adjusted_rand_score(sk_y_pred, y) diff --git a/python/cuml/tests/test_lars.py b/python/cuml/tests/test_lars.py index 274d8f4199..5064ae674b 100644 --- a/python/cuml/tests/test_lars.py +++ b/python/cuml/tests/test_lars.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -61,15 +61,14 @@ def normalize_data(X, y): stress_param([1000, 500]), ], ) -@pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("precompute", [True, False, "precompute"]) -def test_lars_model(datatype, nrows, column_info, precompute, normalize): +def test_lars_model(datatype, nrows, column_info, precompute): ncols, n_info = column_info X_train, X_test, y_train, y_test = make_regression_dataset( datatype, nrows, ncols, n_info ) - if precompute == "precompute" or not normalize: + if precompute == "precompute": # Apply normalization manually, because the solver expects normalized # input data X_train, y_train, x_mean, x_scale, y_mean = normalize_data( @@ -81,7 +80,7 @@ def test_lars_model(datatype, nrows, column_info, precompute, normalize): if precompute == "precompute": precompute = np.dot(X_train.T, X_train) - params = {"precompute": precompute, "normalize": normalize} + params = {"precompute": precompute} # Initialization of cuML's LARS culars = cuLars(**params) diff --git a/python/cuml/tests/test_linear_model.py b/python/cuml/tests/test_linear_model.py index e434f64212..74395c15c9 100644 --- a/python/cuml/tests/test_linear_model.py +++ b/python/cuml/tests/test_linear_model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,9 @@ # limitations under the License. # from contextlib import nullcontext -from distutils.version import LooseVersion from functools import lru_cache +from packaging.version import Version import pytest import sklearn from cuml.internals.array import elements_in_representable_range @@ -144,6 +144,15 @@ def cuml_compatible_dataset(X_train, X_test, y_train, _=None): algorithms = st.sampled_from(_ALGORITHMS) +# TODO(24.08): remove this test +def test_logreg_penalty_deprecation(): + with pytest.warns( + FutureWarning, + match="The 'none' option was deprecated in version 24.06", + ): + cuLog(penalty="none") + + @pytest.mark.parametrize("ntargets", [1, 2]) @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("algorithm", ["eig", "svd"]) @@ -457,11 +466,11 @@ def test_weighted_ridge(datatype, algorithm, fit_intercept, distribution): "num_classes, dtype, penalty, l1_ratio, fit_intercept, C, tol", [ # L-BFGS Solver - (2, np.float32, "none", 1.0, True, 1.0, 1e-3), + (2, np.float32, None, 1.0, True, 1.0, 1e-3), (2, np.float64, "l2", 1.0, True, 1.0, 1e-8), (10, np.float32, "elasticnet", 0.0, True, 1.0, 1e-3), - (10, np.float32, "none", 1.0, False, 1.0, 1e-8), - (10, np.float32, "none", 1.0, False, 2.0, 1e-3), + (10, np.float32, None, 1.0, False, 1.0, 1e-8), + (10, np.float32, None, 1.0, False, 2.0, 1e-3), # OWL-QN Solver (2, np.float32, "l1", 1.0, True, 1.0, 1e-3), (2, np.float64, "elasticnet", 1.0, True, 1.0, 1e-8), @@ -488,7 +497,7 @@ def test_logistic_regression( ): ncols, n_info = column_info # Checking sklearn >= 0.21 for testing elasticnet - sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0") + sk_check = Version(str(sklearn.__version__)) >= Version("0.21.0") if not sk_check and penalty == "elasticnet": pytest.skip( "Need sklearn > 0.21 for testing logistic with" "elastic net." 
@@ -567,7 +576,7 @@ def test_logistic_regression( @given( dtype=floating_dtypes(sizes=(32, 64)), - penalty=st.sampled_from(("none", "l1", "l2", "elasticnet")), + penalty=st.sampled_from((None, "l1", "l2", "elasticnet")), l1_ratio=st.one_of(st.none(), st.floats(min_value=0.0, max_value=1.0)), ) def test_logistic_regression_unscaled(dtype, penalty, l1_ratio): @@ -624,7 +633,7 @@ def test_logistic_regression_model_default(dtype): order=st.sampled_from(("C", "F")), sparse_input=st.booleans(), fit_intercept=st.booleans(), - penalty=st.sampled_from(("none", "l1", "l2")), + penalty=st.sampled_from((None, "l1", "l2")), ) def test_logistic_regression_model_digits( dtype, order, sparse_input, fit_intercept, penalty @@ -927,8 +936,8 @@ def test_linear_models_set_params(algo): coef_before = model.coef_ if algo == cuLog: - params = {"penalty": "none", "C": 1, "max_iter": 30} - model = algo(penalty="none", C=1, max_iter=30) + params = {"penalty": None, "C": 1, "max_iter": 30} + model = algo(penalty=None, C=1, max_iter=30) else: model = algo(solver="svd", alpha=0.1) params = {"solver": "svd", "alpha": 0.1} diff --git a/python/cuml/tests/test_metrics.py b/python/cuml/tests/test_metrics.py index d38bf774f5..6e92535cf7 100644 --- a/python/cuml/tests/test_metrics.py +++ b/python/cuml/tests/test_metrics.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -108,7 +108,7 @@ @pytest.fixture(scope="module") def random_state(): - random_state = random.randint(0, 1e6) + random_state = random.randint(0, 10**6) with logger.set_level(logger.level_debug): logger.debug("Random seed: {}".format(random_state)) return random_state @@ -199,7 +199,7 @@ def test_sklearn_search(): gdf_train = cudf.DataFrame(dict(train=y_train)) sk_cu_grid.fit(gdf_data, gdf_train.train) - assert sk_cu_grid.best_params_ == {"alpha": 0.1} + assert_almost_equal(sk_cu_grid.best_params_["alpha"], 0.1) @pytest.mark.parametrize( @@ -960,9 +960,12 @@ def test_log_loss_random(n_samples, dtype): lambda rng: rng.randint(0, 10, n_samples).astype(dtype) ) - y_pred, _, _, _ = generate_random_labels( + _, _, y_pred, _ = generate_random_labels( lambda rng: rng.rand(n_samples, 10) ) + # Make sure the probabilities sum to 1 per sample + y_pred /= y_pred.sum(axis=1)[:, None] + y_pred = cuda.to_device(y_pred) assert_almost_equal( log_loss(y_true, y_pred), sklearn_log_loss(y_true, y_pred) @@ -1497,8 +1500,8 @@ def test_sparse_pairwise_distances_sklearn_comparison( matrix_size[0], matrix_size[1], cp.float64, density, metric ) - # For fp64, compare at 9 decimals, (6 places less than the ~15 max) - compare_precision = 9 + # For fp64, compare at 7 decimals, (8 places less than the ~15 max) + compare_precision = 7 # Compare to sklearn, fp64 S = sparse_pairwise_distances(X, Y, metric=metric) diff --git a/python/cuml/tests/test_nearest_neighbors.py b/python/cuml/tests/test_nearest_neighbors.py index 2ad3898c7d..9f5764a7e9 100644 --- a/python/cuml/tests/test_nearest_neighbors.py +++ b/python/cuml/tests/test_nearest_neighbors.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -299,7 +299,7 @@ def test_ivfpq_pred( if metric in cuml.neighbors.VALID_METRICS[algo] ], ) -def test_ann_distances_metrics(algo, metric): +def test_ann_distances_metrics(algo, metric, request): X, y = make_blobs(n_samples=500, centers=2, n_features=128, random_state=0) cu_knn = cuKNN(algorithm=algo, metric=metric) @@ -316,8 +316,13 @@ def test_ann_distances_metrics(algo, metric): sk_dist, sk_ind = sk_knn.kneighbors( X, n_neighbors=10, return_distance=True ) - - return array_equal(sk_dist, cu_dist) + request.applymarker( + pytest.mark.xfail( + not (algo == "brute" and metric in ("cosine", "correlation")), + reason=f"arrays not equal with {algo=} and {metric=}", + ) + ) + assert bool(array_equal(sk_dist, cu_dist)) def test_return_dists(): diff --git a/python/cuml/tests/test_one_hot_encoder.py b/python/cuml/tests/test_one_hot_encoder.py index 991d42ddb1..9f3b1d2c34 100644 --- a/python/cuml/tests/test_one_hot_encoder.py +++ b/python/cuml/tests/test_one_hot_encoder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -57,8 +57,8 @@ def test_onehot_vs_skonehot(as_array): X = _from_df_to_cupy(X) skX = cp.asnumpy(X) - enc = OneHotEncoder(sparse=True) - skohe = SkOneHotEncoder(sparse=True) + enc = OneHotEncoder(sparse_output=True) + skohe = SkOneHotEncoder(sparse_output=True) ohe = enc.fit_transform(X) ref = skohe.fit_transform(skX) @@ -89,7 +89,7 @@ def test_onehot_categories(as_array): X = _from_df_to_cupy(X) categories = _from_df_to_cupy(categories).transpose() - enc = OneHotEncoder(categories=categories, sparse=False) + enc = OneHotEncoder(categories=categories, sparse_output=False) ref = cp.array( [[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]] ) @@ -124,12 +124,12 @@ def test_onehot_transform_handle_unknown(as_array): X = _from_df_to_cupy(X) Y = _from_df_to_cupy(Y) - enc = OneHotEncoder(handle_unknown="error", sparse=False) + enc = OneHotEncoder(handle_unknown="error", sparse_output=False) enc = enc.fit(X) with pytest.raises(KeyError): enc.transform(Y) - enc = OneHotEncoder(handle_unknown="ignore", sparse=False) + enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False) enc = enc.fit(X) ohe = enc.transform(Y) ref = cp.array([[0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]]) @@ -163,8 +163,10 @@ def test_onehot_random_inputs(drop, sparse, n_samples, as_array): n_samples=n_samples, as_array=as_array ) - enc = OneHotEncoder(sparse=sparse, drop=drop, categories="auto") - sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories="auto") + enc = OneHotEncoder(sparse_output=sparse, drop=drop, categories="auto") + sk_enc = SkOneHotEncoder( + sparse_output=sparse, drop=drop, categories="auto" + ) ohe = enc.fit_transform(X) ref = sk_enc.fit_transform(ary) if sparse: @@ -183,8 +185,10 @@ def test_onehot_drop_idx_first(as_array): X = _from_df_to_cupy(X) X_ary = cp.asnumpy(X) - enc = OneHotEncoder(sparse=False, drop="first", categories="auto") - sk_enc = SkOneHotEncoder(sparse=False, drop="first", categories="auto") + enc = OneHotEncoder(sparse_output=False, drop="first", categories="auto") + sk_enc = SkOneHotEncoder( + sparse_output=False, drop="first", categories="auto" + ) ohe = enc.fit_transform(X) ref = sk_enc.fit_transform(X_ary) cp.testing.assert_array_equal(ohe, ref) @@ -203,11 +207,11 @@ def test_onehot_drop_one_of_each(as_array): X_ary = cp.asnumpy(X) drop = drop_ary = 
_convert_drop(drop) - enc = OneHotEncoder(sparse=False, drop=drop, categories="auto") + enc = OneHotEncoder(sparse_output=False, drop=drop, categories="auto") ohe = enc.fit_transform(X) print(ohe.dtype) ref = SkOneHotEncoder( - sparse=False, drop=drop_ary, categories="auto" + sparse_output=False, drop=drop_ary, categories="auto" ).fit_transform(X_ary) cp.testing.assert_array_equal(ohe, ref) inv = enc.inverse_transform(ohe) @@ -240,7 +244,7 @@ def test_onehot_drop_exceptions(drop, pattern, as_array): drop = _convert_drop(drop) if not isinstance(drop, DataFrame) else drop with pytest.raises(ValueError, match=pattern): - OneHotEncoder(sparse=False, drop=drop).fit(X) + OneHotEncoder(sparse_output=False, drop=drop).fit(X) @pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) @@ -270,8 +274,10 @@ def test_onehot_sparse_drop(as_array): ary = cp.asnumpy(X) drop = drop_ary = _convert_drop(drop) - enc = OneHotEncoder(sparse=True, drop=drop, categories="auto") - sk_enc = SkOneHotEncoder(sparse=True, drop=drop_ary, categories="auto") + enc = OneHotEncoder(sparse_output=True, drop=drop, categories="auto") + sk_enc = SkOneHotEncoder( + sparse_output=True, drop=drop_ary, categories="auto" + ) ohe = enc.fit_transform(X) ref = sk_enc.fit_transform(ary) cp.testing.assert_array_equal(ohe.toarray(), ref.toarray()) @@ -286,21 +292,21 @@ def test_onehot_categories_shape_mismatch(as_array): categories = _from_df_to_cupy(categories).transpose() with pytest.raises(ValueError): - OneHotEncoder(categories=categories, sparse=False).fit(X) + OneHotEncoder(categories=categories, sparse_output=False).fit(X) def test_onehot_category_specific_cases(): # See this for reasoning: https://github.com/rapidsai/cuml/issues/2690 - # All of these cases use sparse=False, where - # test_onehot_category_class_count uses sparse=True + # All of these cases use sparse_output=False, where + # test_onehot_category_class_count uses sparse_output=True # ==== 2 Rows (Low before High) ==== example_df = DataFrame() example_df["low_cardinality_column"] = ["A"] * 200 + ["B"] * 56 example_df["high_cardinality_column"] = cp.linspace(0, 255, 256) - encoder = OneHotEncoder(handle_unknown="ignore", sparse=False) + encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False) encoder.fit_transform(example_df) # ==== 2 Rows (High before Low, used to fail) ==== @@ -308,7 +314,7 @@ def test_onehot_category_specific_cases(): example_df["high_cardinality_column"] = cp.linspace(0, 255, 256) example_df["low_cardinality_column"] = ["A"] * 200 + ["B"] * 56 - encoder = OneHotEncoder(handle_unknown="ignore", sparse=False) + encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False) encoder.fit_transform(example_df) @@ -319,9 +325,9 @@ def test_onehot_category_specific_cases(): ) def test_onehot_category_class_count(total_classes: int): # See this for reasoning: https://github.com/rapidsai/cuml/issues/2690 - # All tests use sparse=True to avoid memory errors + # All tests use sparse_output=True to avoid memory errors - encoder = OneHotEncoder(handle_unknown="ignore", sparse=True) + encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True) # ==== 2 Rows ==== example_df = DataFrame() @@ -388,3 +394,14 @@ def test_onehot_get_feature_names(as_array): ] feature_names = enc.get_feature_names(["fruit", "size"]) assert np.array_equal(feature_names, feature_names_ref) + + +# TODO(24.08): remove this test +def test_sparse_deprecation(): + X = cp.array([[33, 1], [34, 3], [34, 2]]) + oh = OneHotEncoder(sparse=True) + + with 
pytest.warns( + FutureWarning, match="`sparse` was renamed to `sparse_output`" + ): + oh.fit(X) diff --git a/python/cuml/tests/test_preprocessing.py b/python/cuml/tests/test_preprocessing.py index 332ef4be93..c341fa2a63 100644 --- a/python/cuml/tests/test_preprocessing.py +++ b/python/cuml/tests/test_preprocessing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,6 +43,7 @@ quantile_transform as cu_quantile_transform, robust_scale as cu_robust_scale, scale as cu_scale, + label_binarize as cu_label_binarize, ) from sklearn.preprocessing import ( Binarizer as skBinarizer, @@ -68,6 +69,7 @@ quantile_transform as sk_quantile_transform, robust_scale as sk_robust_scale, scale as sk_scale, + label_binarize as sk_label_binarize, ) from sklearn.impute import ( MissingIndicator as skMissingIndicator, @@ -1135,6 +1137,36 @@ def test_kernel_centerer(): assert_allclose(sk_t_X, t_X) +def test_label_binarize(): + cu_bin = cu_label_binarize( + cp.array([1, 0, 1, 1]), classes=cp.array([0, 1]) + ) + sk_bin = sk_label_binarize([1, 0, 1, 1], classes=[0, 1]) + assert_allclose(cu_bin, sk_bin) + + cu_bin_sparse = cu_label_binarize( + cp.array([1, 0, 1, 1]), classes=cp.array([0, 1]), sparse_output=True + ) + sk_bin_sparse = sk_label_binarize( + [1, 0, 1, 1], classes=[0, 1], sparse_output=True + ) + assert_allclose(cu_bin_sparse, sk_bin_sparse) + + cu_multi = cu_label_binarize( + cp.array([1, 6, 3]), classes=cp.array([1, 3, 4, 6]) + ) + sk_multi = sk_label_binarize([1, 6, 3], classes=[1, 3, 4, 6]) + assert_allclose(cu_multi, sk_multi) + + cu_multi_sparse = cu_label_binarize( + cp.array([1, 6, 3]), classes=cp.array([1, 3, 4, 6]), sparse_output=True + ) + sk_multi_sparse = sk_label_binarize( + [1, 6, 3], classes=[1, 3, 4, 6], sparse_output=True + ) + assert_allclose(cu_multi_sparse, sk_multi_sparse) + + def test__repr__(): assert cuBinarizer().__repr__() == "Binarizer()" assert cuFunctionTransformer().__repr__() == "FunctionTransformer()" diff --git a/python/cuml/tests/test_random_forest.py b/python/cuml/tests/test_random_forest.py index 0fdde7acab..640c22fd67 100644 --- a/python/cuml/tests/test_random_forest.py +++ b/python/cuml/tests/test_random_forest.py @@ -275,7 +275,7 @@ def test_tweedie_convergence(max_depth, split_criterion): "max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)] ) @pytest.mark.parametrize("datatype", [np.float32, np.float64]) -@pytest.mark.parametrize("max_features", [1.0, "auto", "log2", "sqrt"]) +@pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"]) def test_rf_classification(small_clf, datatype, max_samples, max_features): use_handle = True @@ -399,7 +399,6 @@ def test_rf_classification_unorder( [ (1.0, 16), (1.0, 11), - ("auto", 128), ("log2", 100), ("sqrt", 100), (1.0, 17), @@ -473,7 +472,7 @@ def test_rf_classification_seed(small_clf, datatype): ) for i in range(8): - seed = random.randint(100, 1e5) + seed = random.randint(100, 10**5) # Initialize, fit and predict using cuML's # random forest classification model cu_class = curfc(random_state=seed, n_streams=1) @@ -682,7 +681,7 @@ def test_rf_classification_multi_class(mclass_clf, datatype, array_type): @pytest.mark.parametrize("datatype", [(np.float32, np.float64)]) @pytest.mark.parametrize("max_samples", [unit_param(1.0), stress_param(0.95)]) -@pytest.mark.parametrize("max_features", [1.0, "auto", 
"log2", "sqrt"]) +@pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"]) def test_rf_classification_proba( small_clf, datatype, max_samples, max_features ): @@ -862,7 +861,7 @@ def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo): sk_model.fit(X_train, y_train) sk_preds = sk_model.predict(X_test) sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype) - assert fil_r2 >= (sk_r2 - 0.07) + assert fil_r2 >= (sk_r2 - 0.08) @pytest.mark.xfail(reason="Need rapidsai/rmm#415 to detect memleak robustly") @@ -915,7 +914,7 @@ def test_for_memory_leak(): test_for_memory_leak() -@pytest.mark.parametrize("max_features", [1.0, "auto", "log2", "sqrt"]) +@pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"]) @pytest.mark.parametrize("max_depth", [10, 13, 16]) @pytest.mark.parametrize("n_estimators", [10, 20, 100]) @pytest.mark.parametrize("n_bins", [8, 9, 10]) @@ -1382,3 +1381,30 @@ def test_rf_min_samples_split_with_small_float(estimator, make_data): # Does not error clf.fit(X, y) + + +# TODO: Remove in v24.08 +@pytest.mark.parametrize( + "Estimator", + [ + curfr, + curfc, + ], +) +def test_random_forest_max_features_deprecation(Estimator): + X = np.array([[1.0, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator(max_features="auto") + + error_msg = "`max_features='auto'` has been deprecated in 24.06 " + + with pytest.warns(FutureWarning, match=error_msg): + est.fit(X, y) + + +def test_rf_predict_returns_int(): + + X, y = make_classification() + clf = cuml.ensemble.RandomForestClassifier().fit(X, y) + pred = clf.predict(X) + assert pred.dtype == np.int64 diff --git a/python/cuml/tests/test_simpl_set.py b/python/cuml/tests/test_simpl_set.py index 4cd34f1971..cbc5ebc635 100644 --- a/python/cuml/tests/test_simpl_set.py +++ b/python/cuml/tests/test_simpl_set.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ np = cpu_only_import("numpy") cp = gpu_only_import("cupy") +cupyx = gpu_only_import("cupyx") IS_ARM = platform.processor() == "aarch64" @@ -111,7 +112,7 @@ def test_fuzzy_simplicial_set( )[0].tocoo() cu_fss_graph = cu_fss_graph.todense() - ref_fss_graph = cp.sparse.coo_matrix(ref_fss_graph).todense() + ref_fss_graph = cupyx.scipy.sparse.coo_matrix(ref_fss_graph).todense() assert correctness_sparse( ref_fss_graph, cu_fss_graph, atol=0.1, rtol=0.2, threshold=0.95 ) diff --git a/python/cuml/tests/test_thirdparty.py b/python/cuml/tests/test_thirdparty.py index 70b4a21ad6..ed23db76fb 100644 --- a/python/cuml/tests/test_thirdparty.py +++ b/python/cuml/tests/test_thirdparty.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -88,8 +88,8 @@ def test_check_X_y(): def test_row_norms(failure_logger, sparse_random_dataset, square): X_np, X, X_sparse_np, X_sparse = sparse_random_dataset - cu_norms = cu_row_norms(X_np, squared=square) - sk_norms = sk_row_norms(X, squared=square) + cu_norms = cu_row_norms(X, squared=square) + sk_norms = sk_row_norms(X_np, squared=square) assert_allclose(cu_norms, sk_norms) cu_norms = cu_row_norms(X_sparse, squared=square) diff --git a/python/cuml/tests/test_train_test_split.py b/python/cuml/tests/test_train_test_split.py index e0f450176b..c6b1ec0a87 100644 --- a/python/cuml/tests/test_train_test_split.py +++ b/python/cuml/tests/test_train_test_split.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,14 +23,34 @@ cudf = gpu_only_import("cudf") cp = gpu_only_import("cupy") np = cpu_only_import("numpy") +pd = cpu_only_import("pandas") cuda = gpu_only_import_from("numba", "cuda") -test_array_input_types = ["numba", "cupy"] - test_seeds = ["int", "cupy", "numpy"] +@pytest.fixture( + params=[cuda.to_device, cp.asarray, cudf, pd], + ids=["to_numba", "to_cupy", "to_cudf", "to_pandas"], +) +def convert_to_type(request): + if request.param in (cudf, pd): + + def ctor(X): + if isinstance(X, cp.ndarray) and request.param == pd: + X = X.get() + + if X.ndim > 1: + return request.param.DataFrame(X) + else: + return request.param.Series(X) + + return ctor + + return request.param + + @pytest.mark.parametrize("train_size", [0.2, 0.6, 0.8]) @pytest.mark.parametrize("shuffle", [True, False]) def test_split_dataframe(train_size, shuffle): @@ -153,21 +173,23 @@ def test_random_state(seed_type): assert y_test.equals(y_test2) -@pytest.mark.parametrize("type", test_array_input_types) +@pytest.mark.parametrize( + "X, y", + [ + (np.arange(-100, 0), np.arange(100)), + ( + np.zeros((100, 10)) + np.arange(100).reshape(100, 1), + np.arange(100).reshape(100, 1), + ), + ], +) @pytest.mark.parametrize("test_size", [0.2, 0.4, None]) @pytest.mark.parametrize("train_size", [0.6, 0.8, None]) @pytest.mark.parametrize("shuffle", [True, False]) -def test_array_split(type, test_size, train_size, shuffle): - X = np.zeros((100, 10)) + np.arange(100).reshape(100, 1) - y = np.arange(100).reshape(100, 1) - - if type == "cupy": - X = cp.asarray(X) - y = cp.asarray(y) +def test_array_split(X, y, convert_to_type, test_size, train_size, shuffle): - if type == "numba": - X = cuda.to_device(X) - y = cuda.to_device(y) + X = convert_to_type(X) + y = convert_to_type(y) X_train, X_test, y_train, y_test = train_test_split( X, @@ -251,17 +273,19 @@ def test_split_df_single_argument(test_size, train_size, shuffle): assert X_test.shape[0] == (int)(X.shape[0] * test_size) -@pytest.mark.parametrize("type", test_array_input_types) +@pytest.mark.parametrize( + "X", + [np.arange(-100, 0), np.zeros((100, 10)) + np.arange(100).reshape(100, 1)], +) @pytest.mark.parametrize("test_size", [0.2, 0.4, None]) @pytest.mark.parametrize("train_size", [0.6, 0.8, None]) @pytest.mark.parametrize("shuffle", [True, False]) -def test_split_array_single_argument(type, test_size, train_size, shuffle): - X = np.zeros((100, 10)) + np.arange(100).reshape(100, 1) - if type == "cupy": - X = cp.asarray(X) +def test_split_array_single_argument( + X, convert_to_type, test_size, train_size, shuffle +): + + X = convert_to_type(X) - if type == "numba": - X = 
cuda.to_device(X) X_train, X_test = train_test_split( X, train_size=train_size, @@ -293,20 +317,14 @@ def test_split_array_single_argument(type, test_size, train_size, shuffle): assert X_rec == X -@pytest.mark.parametrize("type", test_array_input_types) @pytest.mark.parametrize("test_size", [0.2, 0.4, None]) @pytest.mark.parametrize("train_size", [0.6, 0.8, None]) -def test_stratified_split(type, test_size, train_size): +def test_stratified_split(convert_to_type, test_size, train_size): # For more tolerance and reliable estimates X, y = make_classification(n_samples=10000) - if type == "cupy": - X = cp.asarray(X) - y = cp.asarray(y) - - if type == "numba": - X = cuda.to_device(X) - y = cuda.to_device(y) + X = convert_to_type(X) + y = convert_to_type(y) def counts(y): _, y_indices = cp.unique(y, return_inverse=True) diff --git a/python/cuml/tests/test_umap.py b/python/cuml/tests/test_umap.py index 34d899e7bf..6faa4ad8d3 100644 --- a/python/cuml/tests/test_umap.py +++ b/python/cuml/tests/test_umap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -667,7 +667,7 @@ def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): ref_fss_graph = model.graph_ cu_fss_graph = cu_fss_graph.todense() - ref_fss_graph = cp.sparse.coo_matrix(ref_fss_graph).todense() + ref_fss_graph = cupyx.scipy.sparse.coo_matrix(ref_fss_graph).todense() assert correctness_sparse( ref_fss_graph, cu_fss_graph, atol=0.1, rtol=0.2, threshold=0.95 ) diff --git a/python/pyproject.toml b/python/pyproject.toml index b25187cf17..a7dd8d8e6a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,8 +18,8 @@ requires = [ "cuda-python>=11.7.1,<12.0a0", "cython>=3.0.0", "ninja", - "pylibraft==24.4.*", - "rmm==24.4.*", + "pylibraft==24.6.*", + "rmm==24.6.*", "scikit-build-core[pyproject]>=0.7.0", "treelite==4.1.2", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -33,15 +33,46 @@ markers = [ "mg: Multi-GPU tests", "memleak: Test that checks for memory leaks", "no_bad_cuml_array_check: Test that should not check for bad CumlArray uses", + "ucx: Run _only_ Dask UCX-Py tests", + "ucxx: Run _only_ Dask UCXX tests", ] -testpaths = "cuml/tests" +testpaths = [ + "cuml/tests", + "cuml/tests/dask", + "cuml/tests/experimental", + "cuml/tests/explainer", + "cuml/tests/stemmer_tests", +] filterwarnings = [ - "error::FutureWarning:cuml[.*]", # Catch uses of deprecated positional args in testing + "error::FutureWarning", + "error::DeprecationWarning", "error:::cudf", "ignore:[^.]*ABCs[^.]*:DeprecationWarning:patsy[.*]", "ignore:(.*)alias(.*):DeprecationWarning:hdbscan[.*]", + # TODO: https://github.com/rapidsai/cuml/issues/5878 + "ignore:.*ndarray.scatter_[(max|add)].* is deprecated:DeprecationWarning:cupyx", + # TODO: https://github.com/rapidsai/cuml/issues/5879 + "ignore::FutureWarning:sklearn", + "ignore::DeprecationWarning:sklearn", + # https://github.com/pytest-dev/pytest-cov/issues/557 + "ignore:The --rsyncdir command line argument:DeprecationWarning", + # https://github.com/scikit-learn/scikit-learn/pull/25157 + "ignore:.* is deprecated. 
Use files:DeprecationWarning", + # https://github.com/scikit-learn/scikit-learn/pull/25741 + "ignore:`product` is deprecated as of NumPy 1.25.0:DeprecationWarning", + # https://github.com/scikit-learn-contrib/hdbscan/pull/612 + "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning", + # https://github.com/scikit-learn/scikit-learn/pull/26287 + "ignore:is_sparse is deprecated and will be removed:DeprecationWarning", + # From dask-glm + "ignore:pkg_resources is deprecated as an API:DeprecationWarning", + "ignore:Deprecated call to `pkg_resources.declare_namespace:DeprecationWarning", + "ignore:`rcond` parameter will change to the default:FutureWarning", + "ignore:Dask configuration key 'fuse_ave_width':FutureWarning", + # From hdbscan + "ignore:Conversion of an array with ndim > 0:DeprecationWarning", ] [project] @@ -55,16 +86,17 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.4.*", + "cudf==24.6.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.4.*", - "dask-cudf==24.4.*", + "dask-cuda==24.6.*", + "dask-cudf==24.6.*", "joblib>=0.11", "numba>=0.57", - "pylibraft==24.4.*", - "raft-dask==24.4.*", - "rapids-dask-dependency==24.4.*", - "rmm==24.4.*", + "packaging", + "pylibraft==24.6.*", + "raft-dask==24.6.*", + "rapids-dask-dependency==24.6.*", + "rmm==24.6.*", "scipy>=1.8.0", "treelite==4.1.2", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -90,7 +122,7 @@ test = [ "pytest-cov", "pytest-xdist", "pytest==7.*", - "scikit-learn==1.2", + "scikit-learn==1.5", "seaborn", "statsmodels", "umap-learn==0.5.3", diff --git a/python/pytest.ini b/python/pytest.ini deleted file mode 100644 index 1946ee58f2..0000000000 --- a/python/pytest.ini +++ /dev/null @@ -1,20 +0,0 @@ -[pytest] -markers = - unit: Quickest tests focused on accuracy and correctness - quality: More intense tests than unit with increased runtimes - stress: Longest running tests focused on stressing hardware compute resources - mg: Multi-GPU tests - memleak: Test that checks for memory leaks - no_bad_cuml_array_check: Test that should not check for bad CumlArray uses - -testpaths = - cuml/tests - cuml/tests/dask - cuml/tests/experimental - cuml/tests/explainer - cuml/tests/stemmer_tests - -filterwarnings = - error::FutureWarning:cuml[.*] # Catch uses of deprecated positional args in testing - ignore:[^.]*ABCs[^.]*:DeprecationWarning:patsy[.*] - ignore:(.*)alias(.*):DeprecationWarning:hdbscan[.*] diff --git a/wiki/cpp/DEVELOPER_GUIDE.md b/wiki/cpp/DEVELOPER_GUIDE.md index 13fe035b80..d177d55b6b 100644 --- a/wiki/cpp/DEVELOPER_GUIDE.md +++ b/wiki/cpp/DEVELOPER_GUIDE.md @@ -199,13 +199,14 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/src cpp/src_p ``` #### Copyright header -[copyright.py](../../ci/checks/copyright.py) checks the Copyright header for all git-modified files +RAPIDS [pre-commit-hooks](https://github.com/rapidsai/pre-commit-hooks) checks the Copyright +header for all git-modified files. -Manually, you can run the following to bulk-fix the header if only the years need to be updated: +Manually, you can run the following to bulk-fix the header on all files in the repository: ```bash -python ./ci/checks/copyright.py --update-current-year +pre-commit run -a verify-copyright ``` -Keep in mind that this only applies to files tracked by git and having been modified. 
+Keep in mind that this only applies to files tracked by git that have been modified.

## Error handling
Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` and `RAFT_CUSOLVER_TRY`. These macros take care of checking the return values of the used API calls and generate an exception when the command is not successful. If you need to avoid an exception, e.g. inside a destructor, use `RAFT_CUDA_TRY_NO_THROW`, `RAFT_CUBLAS_TRY_NO_THROW` and `RAFT_CUSOLVER_TRY_NO_THROW` (currently not available, see https://github.com/rapidsai/cuml/issues/229). These macros log the error but do not throw an exception.
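The developer-guide paragraph above describes the RAFT error-handling macros in terms of behavior only. The snippet below is a minimal sketch of how such calls are typically wrapped; it is not part of this PR, and the include paths, function names, and buffer sizes are assumptions made for illustration.

```cpp
// Illustrative sketch only: wrapping CUDA runtime calls in the RAFT
// error-handling macros. Header locations are assumed, not taken from this PR.
#include <raft/util/cudart_utils.hpp>  // assumed to provide RAFT_CUDA_TRY / RAFT_CUDA_TRY_NO_THROW

#include <cuda_runtime_api.h>
#include <cstddef>

// Hypothetical helper: the macro throws (e.g. a raft::cuda_error) if the
// wrapped runtime call does not return cudaSuccess.
void copy_on_stream(float* dst, const float* src, std::size_t n, cudaStream_t stream)
{
  RAFT_CUDA_TRY(
    cudaMemcpyAsync(dst, src, n * sizeof(float), cudaMemcpyDefault, stream));
}

// Hypothetical RAII wrapper: destructors must not throw, so the *_NO_THROW
// variant is used there; it logs the error instead of raising an exception.
struct scratch_buffer {
  void* ptr{nullptr};
  ~scratch_buffer() { RAFT_CUDA_TRY_NO_THROW(cudaFree(ptr)); }
};
```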