From cff257406990e070a8153a244c0ef82169df6456 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:52:34 -0700 Subject: [PATCH 01/22] Initial GPU guide --- docs/guides/usegpus.md | 244 +++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 245 insertions(+) create mode 100644 docs/guides/usegpus.md diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md new file mode 100644 index 000000000..a8c095741 --- /dev/null +++ b/docs/guides/usegpus.md @@ -0,0 +1,244 @@ +# Using BQSkit on a GPU Cluster + +This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides the GPU implemantation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed insnformation and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). + +We will guide you through the installation, setup, and execution process for BQSkit on a GPU cluster. + + +## bqskit-qfactor-jax Package Installation + +First you will need to install `bqskit-qfactor-jax`, follow the instructions available at the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). + +## Setting Up the Environment + +To run BQSkit with GPUs, you need to setup the BQSkit runtime properly, each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS]https://docs.nvidia.com/deploy/mps/. You can setup the runtime on an interactive node, or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. + +You may configure the number of GPUs to use on each node, and also the number of workers on each GPU. If you will use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration, and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the nvidia-smi command to check the GPU usage during the execution, it specifices the utilization of the memroy and of the exection units. + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<tr> <th>Unitary size</th> <th>Workers per GPU</th> </tr>
+<tr> <td>3,4</td> <td>10</td> </tr>
+<tr> <td>5</td> <td>8</td> </tr>
+<tr> <td>6</td> <td>5</td> </tr>
+<tr> <td>7</td> <td>2</td> </tr>
+<tr> <td>8 and more</td> <td>1</td> </tr>
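If you want to see how busy each GPU is while you tune these numbers, `nvidia-smi` can report both compute and memory utilization. A minimal example, using standard `nvidia-smi` query flags (adjust the refresh interval to taste):

```bash
# Print GPU compute and memory utilization every 5 seconds
# while a BQSKit compilation workload is running.
nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv -l 5
```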
+ + +Make sure that in your python script you are creating the compiler object with the appropriate ip address. When running on the same node as the server, you can use 'localhost' as the ip address. + +### Interactive Node Setup Script +Use the following script to set up the environment on an interactive node. After the enviorment is up, you may open a seconed terminal and run your python script. + +```bash +hostname=$(uname -n) +unique_id=bqskit_${RANDOM} +amount_of_gpus= +amount_of_workers_per_gpu= +total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) +scratch_dir=$SCRATCH + +wait_for_outgoing_thread_in_manager_log() { + while [[ ! -f "$manager_log_file" ]] + do + sleep 0.5 + done + + while ! grep -q "Started outgoing thread." $manager_log_file; do + sleep 1 + done +} + +wait_for_server_to_connect(){ + while [[ ! -f "$server_log_file" ]] + do + sleep 0.5 + done + + while ! grep -q "Connected to manager" $server_log_file; do + sleep 1 + done +} + +mkdir -p $scratch_dir/bqskit_logs + +manager_log_file=$scratch_dir/bqskit_logs/manager_${unique_id}.log +server_log_file=$scratch_dir/bqskit_logs/server_${unique_id}.log + +echo "Will start bqskit runtime with id $unique_id gpus = $amount_of_gpus and workers per gpu = $amount_of_workers_per_gpu" + +# Clean old server and manager logs, if exists +rm -f $manager_log_file +rm -f $server_log_file + +echo "Starting MPS server" +nvidia-cuda-mps-control -d + +echo "starting BQSKit managers" + +bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & +manager_pid=$! +wait_for_outgoing_thread_in_manager_log + +echo "starting BQSKit server on main node" +echo "Will run the command bqskit-server ${hostname} -vvv" > $server_log_file +bqskit-server $hostname -vvv &>> $server_log_file & +server_pid=$! + +wait_for_server_to_connect + +echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" +for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )) +do + echo "XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu" + XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu > $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${hostname}_${gpu_id}.log & +done + +wait + +echo "Stop MPS on $hostname" +echo quit | nvidia-cuda-mps-control + +``` + +### Scripts to be Used in an SBATCH Across Several Nodes + +Use the following SBATCH script to set up the job on a cluster: + +```bash +#!/bin/bash +#SBATCH --job-name= +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t +#SBATCH -n +#SBATCH --gpus= +#SBATCH --output= + +date +uname -a + +### load any modules needed and activate the conda enviorment +module load +module load +conda activate + + +echo "starting BQSKit managers on all nodes" +srun run_workers_and_managers.sh & +managers_pid=$! + +managers_started_file=$SCRATCH/managers_${SLURM_JOB_ID}_started +n= + + +# Wait until all the the managers have started +while [[ ! -f "$managers_started_file" ]] +do + sleep 0.5 +done + +while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do + sleep 1 +done + +echo "starting BQSKit server on main node" +bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $SCRATCH/bqskit_logs/server_${SLURM_JOB_ID}.log & +server_pid=$! 
+ +uname -a >> $SCRATCH/server_${SLURM_JOB_ID}_started + +echo "will run python your command" + +python + +date + +echo "Killing the server" +kill -2 $server_pid + +sleep 2 +``` + + +Save the following script as 'run_workers_and_managers.sh' in the same directory as your SBATCH script: +```bash +#!/bin/bash + +node_id=$(uname -n) +amount_of_gpus=$1 +amount_of_workers_per_gpu=$2 +total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) +manager_log_file="$SCRATCH/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log" +server_started_file="$SCRATCH/server_${SLURM_JOB_ID}_started" +managers_started_file="$SCRATCH/managers_${SLURM_JOB_ID}_started" + +touch $managers_started_file + +wait_for_outgoing_thread_in_manager_log() { + while ! grep -q "Started outgoing thread." $manager_log_file; do + sleep 1 + done + uname -a >> $managers_started_file +} + +start_mps_servers() { + echo "Starting MPS servers on node $node_id with CUDA $CUDA_VISIBLE_DEVICES" + nvidia-cuda-mps-control -d +} + +wait_for_bqskit_server() { + i=0 + while [[ ! -f $server_started_file && $i -lt 10 ]]; do + sleep 1 + i=$((i+1)) + done +} + +start_workers() { + echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" + for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do + XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $SCRATCH/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log & + done + wait +} + +stop_mps_servers() { + echo "Stop MPS servers on node $node_id" + echo quit | nvidia-cuda-mps-control +} + +if [ $amount_of_gpus -eq 0 ]; then + echo "Will run manager on node $node_id with n args of $amount_of_workers_per_gpu" + bqskit-manager -n $amount_of_workers_per_gpu -v &> $manager_log_file + echo "Manager finished on node $node_id" +else + echo "Will run manager on node $node_id" + bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & + wait_for_outgoing_thread_in_manager_log + start_mps_servers + wait_for_bqskit_server + start_workers + echo "Manager and workers finished on node $node_id" >> $manager_log_file + stop_mps_servers +fi + +``` diff --git a/docs/index.rst b/docs/index.rst index d1c52a3c0..e3de9ed83 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,6 +26,7 @@ our `tutorial series. `_ :maxdepth: 1 guides/distributing.md + guides/usegpus.md guides/custompass.md .. toctree:: From 83392c155e76fe6aa58372cb14b5903da21c5bb6 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:54:17 -0700 Subject: [PATCH 02/22] Proofing --- docs/guides/usegpus.md | 52 +++++++++++++----------------------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index a8c095741..39872dffd 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -1,49 +1,29 @@ # Using BQSkit on a GPU Cluster -This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides the GPU implemantation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed insnformation and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). +This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. 
This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). We will guide you through the installation, setup, and execution process for BQSkit on a GPU cluster. - ## bqskit-qfactor-jax Package Installation -First you will need to install `bqskit-qfactor-jax`, follow the instructions available at the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). +First, you will need to install `bqskit-qfactor-jax`. Follow the instructions available on the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). ## Setting Up the Environment -To run BQSkit with GPUs, you need to setup the BQSkit runtime properly, each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS]https://docs.nvidia.com/deploy/mps/. You can setup the runtime on an interactive node, or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. - -You may configure the number of GPUs to use on each node, and also the number of workers on each GPU. If you will use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration, and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the nvidia-smi command to check the GPU usage during the execution, it specifices the utilization of the memroy and of the exection units. - - - - - - - - - - - - - - - - - - - - - - - - - - -
-<tr> <th>Unitary size</th> <th>Workers per GPU</th> </tr>
-<tr> <td>3,4</td> <td>10</td> </tr>
-<tr> <td>5</td> <td>8</td> </tr>
-<tr> <td>6</td> <td>5</td> </tr>
-<tr> <td>7</td> <td>2</td> </tr>
-<tr> <td>8 and more</td> <td>1</td> </tr>
- - -Make sure that in your python script you are creating the compiler object with the appropriate ip address. When running on the same node as the server, you can use 'localhost' as the ip address. +To run BQSkit with GPUs, you need to set up the BQSkit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. + +You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. + +| Unitary Size | Workers per GPU | +|----------------|------------------| +| 3,4 | 10 | +| 5 | 8 | +| 6 | 4 | +| 7 | 2 | +| 8 and more | 1 | + +Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use `localhost` as the IP address. + ### Interactive Node Setup Script Use the following script to set up the environment on an interactive node. After the enviorment is up, you may open a seconed terminal and run your python script. From 7e03f5959298b150cff6f6f871a2c859ebf8bbde Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:55:11 -0700 Subject: [PATCH 03/22] Removing a sleep --- docs/guides/usegpus.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 39872dffd..28372a8d8 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -155,7 +155,6 @@ date echo "Killing the server" kill -2 $server_pid -sleep 2 ``` From cadea1bb673e0356e3c0000eae607e8e07753007 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:56:05 -0700 Subject: [PATCH 04/22] . --- docs/guides/usegpus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 28372a8d8..af2d54b58 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -22,7 +22,7 @@ You may configure the number of GPUs to use on each node and also the number of | 7 | 2 | | 8 and more | 1 | -Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use `localhost` as the IP address. +Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use \`localhost\` as the IP address. 
### Interactive Node Setup Script From 1b9f556b321c163482bdafb8039244520ee62282 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 14:06:59 -0700 Subject: [PATCH 05/22] Chaneging BQSkit to BQSKit --- docs/guides/usegpus.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index af2d54b58..5999e5cb5 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -1,8 +1,8 @@ -# Using BQSkit on a GPU Cluster +# Using BQSKit on a GPU Cluster -This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). +This guide explains how to use BQSKit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSKit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). -We will guide you through the installation, setup, and execution process for BQSkit on a GPU cluster. +We will guide you through the installation, setup, and execution process for BQSKit on a GPU cluster. ## bqskit-qfactor-jax Package Installation @@ -10,7 +10,7 @@ First, you will need to install `bqskit-qfactor-jax`. Follow the instructions av ## Setting Up the Environment -To run BQSkit with GPUs, you need to set up the BQSkit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. +To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. 
From df19df3e031493ee2211170cf82dd38b48aebf5b Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 14:14:12 -0700 Subject: [PATCH 06/22] Removing redundent echos --- docs/guides/usegpus.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 5999e5cb5..54a548511 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -78,8 +78,7 @@ bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & manager_pid=$! wait_for_outgoing_thread_in_manager_log -echo "starting BQSKit server on main node" -echo "Will run the command bqskit-server ${hostname} -vvv" > $server_log_file +echo "starting BQSKit server" bqskit-server $hostname -vvv &>> $server_log_file & server_pid=$! @@ -88,7 +87,6 @@ wait_for_server_to_connect echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )) do - echo "XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu" XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu > $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${hostname}_${gpu_id}.log & done From bf6d2497bbe993ac1fcd7a666d3ed97bcadc6890 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 22:42:31 -0700 Subject: [PATCH 07/22] Adding a note regarding the number of GPUs to se in QFactor-Sample --- docs/guides/usegpus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 54a548511..6a43e9646 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -12,7 +12,7 @@ First, you will need to install `bqskit-qfactor-jax`. Follow the instructions av To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. -You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. +You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. If you are using QFactor, you may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. If you are using QFactor-Sample, start with a single worker and increase if the memory premits it. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. 
| Unitary Size | Workers per GPU | |----------------|------------------| From 1ed2c0167592a5b708209fb0fa1a51232db8cb2f Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Wed, 17 Jul 2024 10:35:49 -0700 Subject: [PATCH 08/22] Adding a reference to the examples and removing the SCRATCH var --- docs/guides/usegpus.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 6a43e9646..4945e55dc 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -8,6 +8,12 @@ We will guide you through the installation, setup, and execution process for BQS First, you will need to install `bqskit-qfactor-jax`. Follow the instructions available on the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). +## QFactor-JAX and QFactor-Sample-JAX Use Examples + + +For detailed usage examples, please refer to the [examples directory](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples) in the `bqskit-qfactor-jax` package. There, you will find two Toffoli instantiation examples using QFactor and QFactor-Sample, as well as two different synthesis flows that also utilize these algorithms. + + ## Setting Up the Environment To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. @@ -34,7 +40,7 @@ unique_id=bqskit_${RANDOM} amount_of_gpus= amount_of_workers_per_gpu= total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) -scratch_dir=$SCRATCH +scratch_dir= wait_for_outgoing_thread_in_manager_log() { while [[ ! -f "$manager_log_file" ]] @@ -111,6 +117,8 @@ Use the following SBATCH script to set up the job on a cluster: #SBATCH --gpus= #SBATCH --output= +scratch_dir= + date uname -a @@ -124,7 +132,7 @@ echo "starting BQSKit managers on all nodes" srun run_workers_and_managers.sh & managers_pid=$! -managers_started_file=$SCRATCH/managers_${SLURM_JOB_ID}_started +managers_started_file=$scratch_dir/managers_${SLURM_JOB_ID}_started n= @@ -139,10 +147,10 @@ while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do done echo "starting BQSKit server on main node" -bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $SCRATCH/bqskit_logs/server_${SLURM_JOB_ID}.log & +bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log & server_pid=$! 
-uname -a >> $SCRATCH/server_${SLURM_JOB_ID}_started +uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started echo "will run python your command" @@ -164,9 +172,11 @@ node_id=$(uname -n) amount_of_gpus=$1 amount_of_workers_per_gpu=$2 total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) -manager_log_file="$SCRATCH/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log" -server_started_file="$SCRATCH/server_${SLURM_JOB_ID}_started" -managers_started_file="$SCRATCH/managers_${SLURM_JOB_ID}_started" + +scratch_dir= +manager_log_file="$scratch_dir/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log" +server_started_file="$scratch_dir/server_${SLURM_JOB_ID}_started" +managers_started_file="$scratch_dir/managers_${SLURM_JOB_ID}_started" touch $managers_started_file @@ -193,7 +203,7 @@ wait_for_bqskit_server() { start_workers() { echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do - XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $SCRATCH/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log & + XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log & done wait } From ea3137375677c40dc0d767f377ca26ef88b5cec4 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:52:34 -0700 Subject: [PATCH 09/22] Initial GPU guide --- docs/guides/usegpus.md | 244 +++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 245 insertions(+) create mode 100644 docs/guides/usegpus.md diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md new file mode 100644 index 000000000..a8c095741 --- /dev/null +++ b/docs/guides/usegpus.md @@ -0,0 +1,244 @@ +# Using BQSkit on a GPU Cluster + +This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides the GPU implemantation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed insnformation and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). + +We will guide you through the installation, setup, and execution process for BQSkit on a GPU cluster. + + +## bqskit-qfactor-jax Package Installation + +First you will need to install `bqskit-qfactor-jax`, follow the instructions available at the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). + +## Setting Up the Environment + +To run BQSkit with GPUs, you need to setup the BQSkit runtime properly, each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS]https://docs.nvidia.com/deploy/mps/. You can setup the runtime on an interactive node, or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. + +You may configure the number of GPUs to use on each node, and also the number of workers on each GPU. If you will use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration, and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. 
You can use the nvidia-smi command to check the GPU usage during the execution, it specifices the utilization of the memroy and of the exection units. + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<tr> <th>Unitary size</th> <th>Workers per GPU</th> </tr>
+<tr> <td>3,4</td> <td>10</td> </tr>
+<tr> <td>5</td> <td>8</td> </tr>
+<tr> <td>6</td> <td>5</td> </tr>
+<tr> <td>7</td> <td>2</td> </tr>
+<tr> <td>8 and more</td> <td>1</td> </tr>
+ + +Make sure that in your python script you are creating the compiler object with the appropriate ip address. When running on the same node as the server, you can use 'localhost' as the ip address. + +### Interactive Node Setup Script +Use the following script to set up the environment on an interactive node. After the enviorment is up, you may open a seconed terminal and run your python script. + +```bash +hostname=$(uname -n) +unique_id=bqskit_${RANDOM} +amount_of_gpus= +amount_of_workers_per_gpu= +total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) +scratch_dir=$SCRATCH + +wait_for_outgoing_thread_in_manager_log() { + while [[ ! -f "$manager_log_file" ]] + do + sleep 0.5 + done + + while ! grep -q "Started outgoing thread." $manager_log_file; do + sleep 1 + done +} + +wait_for_server_to_connect(){ + while [[ ! -f "$server_log_file" ]] + do + sleep 0.5 + done + + while ! grep -q "Connected to manager" $server_log_file; do + sleep 1 + done +} + +mkdir -p $scratch_dir/bqskit_logs + +manager_log_file=$scratch_dir/bqskit_logs/manager_${unique_id}.log +server_log_file=$scratch_dir/bqskit_logs/server_${unique_id}.log + +echo "Will start bqskit runtime with id $unique_id gpus = $amount_of_gpus and workers per gpu = $amount_of_workers_per_gpu" + +# Clean old server and manager logs, if exists +rm -f $manager_log_file +rm -f $server_log_file + +echo "Starting MPS server" +nvidia-cuda-mps-control -d + +echo "starting BQSKit managers" + +bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & +manager_pid=$! +wait_for_outgoing_thread_in_manager_log + +echo "starting BQSKit server on main node" +echo "Will run the command bqskit-server ${hostname} -vvv" > $server_log_file +bqskit-server $hostname -vvv &>> $server_log_file & +server_pid=$! + +wait_for_server_to_connect + +echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" +for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )) +do + echo "XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu" + XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu > $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${hostname}_${gpu_id}.log & +done + +wait + +echo "Stop MPS on $hostname" +echo quit | nvidia-cuda-mps-control + +``` + +### Scripts to be Used in an SBATCH Across Several Nodes + +Use the following SBATCH script to set up the job on a cluster: + +```bash +#!/bin/bash +#SBATCH --job-name= +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t +#SBATCH -n +#SBATCH --gpus= +#SBATCH --output= + +date +uname -a + +### load any modules needed and activate the conda enviorment +module load +module load +conda activate + + +echo "starting BQSKit managers on all nodes" +srun run_workers_and_managers.sh & +managers_pid=$! + +managers_started_file=$SCRATCH/managers_${SLURM_JOB_ID}_started +n= + + +# Wait until all the the managers have started +while [[ ! -f "$managers_started_file" ]] +do + sleep 0.5 +done + +while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do + sleep 1 +done + +echo "starting BQSKit server on main node" +bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $SCRATCH/bqskit_logs/server_${SLURM_JOB_ID}.log & +server_pid=$! 
+ +uname -a >> $SCRATCH/server_${SLURM_JOB_ID}_started + +echo "will run python your command" + +python + +date + +echo "Killing the server" +kill -2 $server_pid + +sleep 2 +``` + + +Save the following script as 'run_workers_and_managers.sh' in the same directory as your SBATCH script: +```bash +#!/bin/bash + +node_id=$(uname -n) +amount_of_gpus=$1 +amount_of_workers_per_gpu=$2 +total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) +manager_log_file="$SCRATCH/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log" +server_started_file="$SCRATCH/server_${SLURM_JOB_ID}_started" +managers_started_file="$SCRATCH/managers_${SLURM_JOB_ID}_started" + +touch $managers_started_file + +wait_for_outgoing_thread_in_manager_log() { + while ! grep -q "Started outgoing thread." $manager_log_file; do + sleep 1 + done + uname -a >> $managers_started_file +} + +start_mps_servers() { + echo "Starting MPS servers on node $node_id with CUDA $CUDA_VISIBLE_DEVICES" + nvidia-cuda-mps-control -d +} + +wait_for_bqskit_server() { + i=0 + while [[ ! -f $server_started_file && $i -lt 10 ]]; do + sleep 1 + i=$((i+1)) + done +} + +start_workers() { + echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" + for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do + XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $SCRATCH/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log & + done + wait +} + +stop_mps_servers() { + echo "Stop MPS servers on node $node_id" + echo quit | nvidia-cuda-mps-control +} + +if [ $amount_of_gpus -eq 0 ]; then + echo "Will run manager on node $node_id with n args of $amount_of_workers_per_gpu" + bqskit-manager -n $amount_of_workers_per_gpu -v &> $manager_log_file + echo "Manager finished on node $node_id" +else + echo "Will run manager on node $node_id" + bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & + wait_for_outgoing_thread_in_manager_log + start_mps_servers + wait_for_bqskit_server + start_workers + echo "Manager and workers finished on node $node_id" >> $manager_log_file + stop_mps_servers +fi + +``` diff --git a/docs/index.rst b/docs/index.rst index d1c52a3c0..e3de9ed83 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,6 +26,7 @@ our `tutorial series. `_ :maxdepth: 1 guides/distributing.md + guides/usegpus.md guides/custompass.md .. toctree:: From d94a36dc227fa9d2a16412cded9800f6e9dc054c Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:54:17 -0700 Subject: [PATCH 10/22] Proofing --- docs/guides/usegpus.md | 52 +++++++++++++----------------------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index a8c095741..39872dffd 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -1,49 +1,29 @@ # Using BQSkit on a GPU Cluster -This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides the GPU implemantation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed insnformation and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). +This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. 
This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). We will guide you through the installation, setup, and execution process for BQSkit on a GPU cluster. - ## bqskit-qfactor-jax Package Installation -First you will need to install `bqskit-qfactor-jax`, follow the instructions available at the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). +First, you will need to install `bqskit-qfactor-jax`. Follow the instructions available on the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). ## Setting Up the Environment -To run BQSkit with GPUs, you need to setup the BQSkit runtime properly, each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS]https://docs.nvidia.com/deploy/mps/. You can setup the runtime on an interactive node, or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. - -You may configure the number of GPUs to use on each node, and also the number of workers on each GPU. If you will use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration, and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the nvidia-smi command to check the GPU usage during the execution, it specifices the utilization of the memroy and of the exection units. - - - - - - - - - - - - - - - - - - - - - - - - - - -
-<tr> <th>Unitary size</th> <th>Workers per GPU</th> </tr>
-<tr> <td>3,4</td> <td>10</td> </tr>
-<tr> <td>5</td> <td>8</td> </tr>
-<tr> <td>6</td> <td>5</td> </tr>
-<tr> <td>7</td> <td>2</td> </tr>
-<tr> <td>8 and more</td> <td>1</td> </tr>
- - -Make sure that in your python script you are creating the compiler object with the appropriate ip address. When running on the same node as the server, you can use 'localhost' as the ip address. +To run BQSkit with GPUs, you need to set up the BQSkit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. + +You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. + +| Unitary Size | Workers per GPU | +|----------------|------------------| +| 3,4 | 10 | +| 5 | 8 | +| 6 | 4 | +| 7 | 2 | +| 8 and more | 1 | + +Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use `localhost` as the IP address. + ### Interactive Node Setup Script Use the following script to set up the environment on an interactive node. After the enviorment is up, you may open a seconed terminal and run your python script. From 37f6cbd5d367e5af478b97dbcc6dd134a5d05cf7 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:55:11 -0700 Subject: [PATCH 11/22] Removing a sleep --- docs/guides/usegpus.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 39872dffd..28372a8d8 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -155,7 +155,6 @@ date echo "Killing the server" kill -2 $server_pid -sleep 2 ``` From dc8935a3e965f6836ddb44b01a208dafa91803a9 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 12:56:05 -0700 Subject: [PATCH 12/22] . --- docs/guides/usegpus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 28372a8d8..af2d54b58 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -22,7 +22,7 @@ You may configure the number of GPUs to use on each node and also the number of | 7 | 2 | | 8 and more | 1 | -Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use `localhost` as the IP address. +Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use \`localhost\` as the IP address. 
### Interactive Node Setup Script From 1f57bc70df29100b91c1d9eebe0848c81f65ef3d Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 14:06:59 -0700 Subject: [PATCH 13/22] Chaneging BQSkit to BQSKit --- docs/guides/usegpus.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index af2d54b58..5999e5cb5 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -1,8 +1,8 @@ -# Using BQSkit on a GPU Cluster +# Using BQSKit on a GPU Cluster -This guide explains how to use BQSkit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSkit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). +This guide explains how to use BQSKit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSKit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). -We will guide you through the installation, setup, and execution process for BQSkit on a GPU cluster. +We will guide you through the installation, setup, and execution process for BQSKit on a GPU cluster. ## bqskit-qfactor-jax Package Installation @@ -10,7 +10,7 @@ First, you will need to install `bqskit-qfactor-jax`. Follow the instructions av ## Setting Up the Environment -To run BQSkit with GPUs, you need to set up the BQSkit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. +To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. 
From d88501f53424e73c76aa95d341a77eef2d606a15 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 14:14:12 -0700 Subject: [PATCH 14/22] Removing redundent echos --- docs/guides/usegpus.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 5999e5cb5..54a548511 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -78,8 +78,7 @@ bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & manager_pid=$! wait_for_outgoing_thread_in_manager_log -echo "starting BQSKit server on main node" -echo "Will run the command bqskit-server ${hostname} -vvv" > $server_log_file +echo "starting BQSKit server" bqskit-server $hostname -vvv &>> $server_log_file & server_pid=$! @@ -88,7 +87,6 @@ wait_for_server_to_connect echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )) do - echo "XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu" XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu > $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${hostname}_${gpu_id}.log & done From 6410edc3c5177175c1b5653b64b90d2a8e73a62a Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Tue, 16 Jul 2024 22:42:31 -0700 Subject: [PATCH 15/22] Adding a note regarding the number of GPUs to se in QFactor-Sample --- docs/guides/usegpus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 54a548511..6a43e9646 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -12,7 +12,7 @@ First, you will need to install `bqskit-qfactor-jax`. Follow the instructions av To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. -You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. You may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. +You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. If you are using QFactor, you may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. If you are using QFactor-Sample, start with a single worker and increase if the memory premits it. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. 
| Unitary Size | Workers per GPU | |----------------|------------------| From 9c10dc2cf70f6d3495ae1dae1441f552ab6fec1a Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Wed, 17 Jul 2024 10:35:49 -0700 Subject: [PATCH 16/22] Adding a reference to the examples and removing the SCRATCH var --- docs/guides/usegpus.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 6a43e9646..4945e55dc 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -8,6 +8,12 @@ We will guide you through the installation, setup, and execution process for BQS First, you will need to install `bqskit-qfactor-jax`. Follow the instructions available on the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). +## QFactor-JAX and QFactor-Sample-JAX Use Examples + + +For detailed usage examples, please refer to the [examples directory](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples) in the `bqskit-qfactor-jax` package. There, you will find two Toffoli instantiation examples using QFactor and QFactor-Sample, as well as two different synthesis flows that also utilize these algorithms. + + ## Setting Up the Environment To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. @@ -34,7 +40,7 @@ unique_id=bqskit_${RANDOM} amount_of_gpus= amount_of_workers_per_gpu= total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) -scratch_dir=$SCRATCH +scratch_dir= wait_for_outgoing_thread_in_manager_log() { while [[ ! -f "$manager_log_file" ]] @@ -111,6 +117,8 @@ Use the following SBATCH script to set up the job on a cluster: #SBATCH --gpus= #SBATCH --output= +scratch_dir= + date uname -a @@ -124,7 +132,7 @@ echo "starting BQSKit managers on all nodes" srun run_workers_and_managers.sh & managers_pid=$! -managers_started_file=$SCRATCH/managers_${SLURM_JOB_ID}_started +managers_started_file=$scratch_dir/managers_${SLURM_JOB_ID}_started n= @@ -139,10 +147,10 @@ while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do done echo "starting BQSKit server on main node" -bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $SCRATCH/bqskit_logs/server_${SLURM_JOB_ID}.log & +bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log & server_pid=$! 
-uname -a >> $SCRATCH/server_${SLURM_JOB_ID}_started +uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started echo "will run python your command" @@ -164,9 +172,11 @@ node_id=$(uname -n) amount_of_gpus=$1 amount_of_workers_per_gpu=$2 total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) -manager_log_file="$SCRATCH/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log" -server_started_file="$SCRATCH/server_${SLURM_JOB_ID}_started" -managers_started_file="$SCRATCH/managers_${SLURM_JOB_ID}_started" + +scratch_dir= +manager_log_file="$scratch_dir/bqskit_logs/manager_${SLURM_JOB_ID}_${node_id}.log" +server_started_file="$scratch_dir/server_${SLURM_JOB_ID}_started" +managers_started_file="$scratch_dir/managers_${SLURM_JOB_ID}_started" touch $managers_started_file @@ -193,7 +203,7 @@ wait_for_bqskit_server() { start_workers() { echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )); do - XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $SCRATCH/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log & + XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu &> $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${node_id}_${gpu_id}.log & done wait } From c551604b0238ac03ea3204bcb79d140bfd9cc632 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Mon, 16 Sep 2024 09:18:46 -0700 Subject: [PATCH 17/22] Improving the guide by adding a synthesis example and adding a "row by row" explanation of the setup scripts. --- docs/guides/usegpus.md | 184 ++++++++++++++++++++++++++++++++++------- 1 file changed, 153 insertions(+), 31 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 4945e55dc..ab833205d 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -1,24 +1,97 @@ # Using BQSKit on a GPU Cluster -This guide explains how to use BQSKit with GPUs by leveraging the `bqskit-qfactor-jax` package. This package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSKit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). +This guide explains how to use BQSKit with GPUs by leveraging the `bqskit-qfactor-jax` package. The `bqskit-qfactor-jax` package provides GPU implementation support for the [QFactor](https://ieeexplore.ieee.org/abstract/document/10313638) and [QFactor-Sample](https://arxiv.org/abs/2405.12866) instantiation algorithms. For more detailed information and advanced configurations of the BQSKit runtime, refer to the [BQSKit distribution guide](https://bqskit.readthedocs.io/en/latest/guides/distributing.html). We will guide you through the installation, setup, and execution process for BQSKit on a GPU cluster. ## bqskit-qfactor-jax Package Installation -First, you will need to install `bqskit-qfactor-jax`. Follow the instructions available on the [PyPI page](https://pypi.org/project/bqskit-qfactor-jax/). +First, you will need to install `bqskit-qfactor-jax`. This can easily done by using pip +```sh +pip install bqskit-qfactor-jax +``` + +This command will install also all the dependencies including BQSKit and JAX with GPU support. 
+ +## Optimizing a Circuit Using QFactor-Sample and the Gate Deletion Flow +This section explains how to optimize a quantum circuit using QFactor-Sample and the gate deletion flow. + +First we load the circuit to be optimized using the Circuit class. +```python +from bqskit import Circuit + +# Load a circuit from QASM +in_circuit = Circuit.from_file("circuit_to_opt.qasm") +``` + +Then we create the instniator instance, and set the number of multistarts to 32. +```python +from qfactorjax.qfactor_sample_jax import QFactorSampleJax + +num_multistarts = 32 + +qfactor_sample_gpu_instantiator = QFactorSampleJax() + +instantiate_options = { + 'method': qfactor_sample_gpu_instantiator, + 'multistarts': num_multistarts, + } + +``` + +Next, generate the optimization flow. +```python +from bqskit.passes import * + +# Prepare the compilation passes +passes = [ + # Convert U3s to VU + ToVariablePass(), + + # Split the circuit into partitions + QuickPartitioner(partition_size), + + # For each partition perform scanning gate removal using QFactor jax + ForEachBlockPass([ + ScanningGateRemovalPass( + instantiate_options=instantiate_options, + ), + ]), + + # Combine the partitions back into a circuit + UnfoldPass(), + + # Convert back the VariablueUnitaires into U3s + ToU3Pass(), +] +``` + + +Finally, use a compiler instance to execute the passes, and then print the statistics. If your system has more than a single GPU, then you should initiate a detached server and connect to it. A destailed explanation on how to setup BQSKit runtime is given in the next sections of the this guide. +```python +from bqskit.compiler import Compiler + +with Compiler(num_workers=1) as compiler: + + out_circuit = compiler.compile(in_circuit, passes) + + print( + f'Circuit finished with gates: {out_circuit.gate_counts}, ' + f'while started with {in_circuit.gate_counts}', + ) +``` ## QFactor-JAX and QFactor-Sample-JAX Use Examples -For detailed usage examples, please refer to the [examples directory](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples) in the `bqskit-qfactor-jax` package. There, you will find two Toffoli instantiation examples using QFactor and QFactor-Sample, as well as two different synthesis flows that also utilize these algorithms. +For other usage examples, please refer to the [examples directory](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples) in the `bqskit-qfactor-jax` package. There, you will find two Toffoli instantiation examples using QFactor and QFactor-Sample, as well as two different synthesis flows that also utilize these algorithms. -## Setting Up the Environment +## Setting Up a Multi-GPU Environment -To run BQSKit with GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU, and several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on an interactive node or using SBATCH on several nodes. Below are the scripts to help you set up the runtime. +To run BQSKit with multiple GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU by leveragig NVIDIA's CUDA_VISIBLE_DEVICES enviorment variable. Several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on a single server ( or interactive node on a cluster) or using SBATCH on several nodes. 
You can find scripts to help you set up the runtime in this [link](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples/bqskit_env_scripts). -You may configure the number of GPUs to use on each node and also the number of workers on each GPU. If you use too many workers on the same GPU, you will get an out-of-memory exception. If you are using QFactor, you may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. If you are using QFactor-Sample, start with a single worker and increase if the memory premits it. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. +You may configure the number of GPUs to use on each server and also the number of workers on each GPU. If you use too many workers on the same GPU, you will run out of memory and experince an out-of-memory exception. If you are using QFactor, you may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. If you are using QFactor-Sample, start with a single worker and increase if the memory premits it. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. | Unitary Size | Workers per GPU | |----------------|------------------| @@ -30,18 +103,30 @@ You may configure the number of GPUs to use on each node and also the number of Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use \`localhost\` as the IP address. +```python +with Compiler('localhost') as compiler: + out_circuit = compiler.compile(in_circuit, passes) +``` + + +### Single Server Multiple GPUs Setup +This section of the guide explains the main concepts in the [single_server_env.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/single_server_env.sh) script template and how to use it. The script creates a GPU enabled BQSKit runtime and is easily configured for any system. -### Interactive Node Setup Script -Use the following script to set up the environment on an interactive node. After the enviorment is up, you may open a seconed terminal and run your python script. +After you configure the template (replacing every <> with an appropriate value) run it, and then in a seperate shell execute your python scirpt that uses this runtime enviorment. +The enviorment script has the following parts: +1. Variable configuration - choosing the number of GPUs to use, and the number of workrs per GPU. Moreover, the scratch dir path is configured, later to be used for logging. ```bash +#!/bin/bash hostname=$(uname -n) unique_id=bqskit_${RANDOM} amount_of_gpus= amount_of_workers_per_gpu= total_amount_of_workers=$(($amount_of_gpus * $amount_of_workers_per_gpu)) scratch_dir= - +``` +2. Log file monitoring functions to monitor the startup of BQSKit managers and server. +```bash wait_for_outgoing_thread_in_manager_log() { while [[ ! -f "$manager_log_file" ]] do @@ -63,7 +148,9 @@ wait_for_server_to_connect(){ sleep 1 done } - +``` +3. Creating the log directory, and deleting any old log files that conflicts with the current run logs. 
+```bash
 mkdir -p $scratch_dir/bqskit_logs
 manager_log_file=$scratch_dir/bqskit_logs/manager_${unique_id}.log
 server_log_file=$scratch_dir/bqskit_logs/server_${unique_id}.log
 echo "Will start bqskit runtime with id $unique_id gpus = $amount_of_gpus and workers per gpu = $amount_of_workers_per_gpu"
 # Clean old server and manager logs, if exists
 rm -f $manager_log_file
 rm -f $server_log_file
-
+```
+4. Starting NVIDIA MPS to allow efficient execution of multiple workers on a single GPU.
+```bash
 echo "Starting MPS server"
 nvidia-cuda-mps-control -d
-
+```
+5. Starting the BQSKit manager and instructing it to wait for workers to connect to it. We then wait for the manager to start listening for a connection from the server; this is important because the server might time out if the manager isn't ready for the connection.
+```bash
 echo "starting BQSKit managers"
 bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file &
 manager_pid=$!
 wait_for_outgoing_thread_in_manager_log
-
+```
+6. Starting the BQSKit server, indicating that there is a single manager on the current server, and waiting until the server connects to the manager before continuing to start the workers.
+```bash
 echo "starting BQSKit server"
 bqskit-server $hostname -vvv &>> $server_log_file &
 server_pid=$!
 wait_for_server_to_connect
-
+```
+7. Starting the workers, each seeing only a specific GPU.
+```bash
 echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus"
 for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ ))
 do
 XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=$gpu_id bqskit-worker $amount_of_workers_per_gpu > $scratch_dir/bqskit_logs/workers_${SLURM_JOB_ID}_${hostname}_${gpu_id}.log &
 done
-
+```
+8. After all the processes have finished, stop the MPS server.
+```bash
 wait
 echo "Stop MPS on $hostname"
 echo quit | nvidia-cuda-mps-control
+```
+
+### Multi-Server Multi-GPU Environment Setup
+
+This section of the guide explains the main concepts in the [init_multi_node_multi_gpu_slurm_run.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh) and [run_workers_and_managers.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/run_workers_and_managers.sh) scripts and how to use them. After configuring the scripts (updating every <>), place both of them in the same directory and initiate an SBATCH command. These scripts assume a SLURM environment, but can easily be ported to other distribution systems.
+
+```bash
+sbatch init_multi_node_multi_gpu_slurm_run.sh
 ```
-### Scripts to be Used in an SBATCH Across Several Nodes
+The rest of this section explains both of the scripts in detail.
-Use the following SBATCH script to set up the job on a cluster:
+#### init_multi_node_multi_gpu_slurm_run
+This is a SLURM batch script for running a multi-node BQSKit task across multiple GPUs. It manages job submission, environment setup, launching the BQSKit server and workers on different nodes, and the execution of the main application.
+1. Job configuration and logging - this is a standard SLURM SBATCH header.
 ```bash
 #!/bin/bash
 #SBATCH --job-name=
 #SBATCH -C gpu
 #SBATCH -q regular
 #SBATCH -t
 #SBATCH -n
 #SBATCH --gpus=
 #SBATCH --output=
 scratch_dir=
+```
-date
-uname -a
-
+2. Shell environment setup - Please consult with your HPC system admin to choose the appropriate modules to load that will enable you to run JAX on NVIDIA's GPUs. You may use NERSC's Perlmutter [documentation](https://docs.nersc.gov/development/languages/python/using-python-perlmutter/#jax) as a reference.
+```bash
 ### load any modules needed and activate the conda enviorment
 module load
 module load
 conda activate
+```
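+
+After the modules are loaded and the conda environment is activated, it can be worth confirming that JAX actually sees the GPUs before starting any BQSKit processes. The small check below is not part of the original scripts; it is just an optional sanity test you can run once per node (for example with `srun -n 1 python check_gpus.py`, where `check_gpus.py` is a hypothetical file containing this snippet).
+```python
+import jax
+
+# On a correctly configured GPU node this prints one or more CUDA/GPU devices.
+# If only a CPU device appears, revisit the module loads and conda environment above.
+print(jax.devices())
+```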
+```bash ### load any modules needed and activate the conda enviorment module load module load conda activate +``` - +3. Starting the managers on all of the nodes using SLURM’s srun command, initiating the run_workers_and_managers.sh script across all nodes. The former handles starting managers and workers on each node. +```bash echo "starting BQSKit managers on all nodes" srun run_workers_and_managers.sh & managers_pid=$! managers_started_file=$scratch_dir/managers_${SLURM_JOB_ID}_started n= +``` - -# Wait until all the the managers have started +4. Waiting for all managers to start, by tracking the number of lines in the log file, one created by each manager. +```bash while [[ ! -f "$managers_started_file" ]] do sleep 0.5 @@ -145,26 +255,32 @@ done while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do sleep 1 done +``` +5. Starting the BQSKit server on the main node, and using SLURM's `SLURM_JOB_NODELIST` enviorment variable to indicate the BQSKit server the hostnames of the managers. +```bash echo "starting BQSKit server on main node" bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log & server_pid=$! uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started +``` -echo "will run python your command" - +6. Executing the main application, that will connect to the BQSKit runtime +```bash python +``` -date - +7. After the run is over, closing the BQSKit server. +```bash echo "Killing the server" kill -2 $server_pid - ``` +#### run_workers_and_managers.sh +This script is executed by each node to start the workers and managers on that specific node. It interacts with `init_multi_node_multi_gpu_slurm_run.sh`, the SBATCH script. If GPUs are required, the workers are spawnd seperatly from the manager, allowing for better configuratio of each worker. -Save the following script as 'run_workers_and_managers.sh' in the same directory as your SBATCH script: +The script starts with argument parsing and some variable configuration ```bash #!/bin/bash @@ -179,7 +295,11 @@ server_started_file="$scratch_dir/server_${SLURM_JOB_ID}_started" managers_started_file="$scratch_dir/managers_${SLURM_JOB_ID}_started" touch $managers_started_file +``` +Then the script declares a few utility methods. + +```bash wait_for_outgoing_thread_in_manager_log() { while ! grep -q "Started outgoing thread." $manager_log_file; do sleep 1 @@ -212,7 +332,10 @@ stop_mps_servers() { echo "Stop MPS servers on node $node_id" echo quit | nvidia-cuda-mps-control } +``` +Finaly, the script chekcs if GPUs are not needed, it spwans the manager with its default behaviour, else suing the "-x" argument, it indicates to the manager to wait for connecting workers. 
+```bash if [ $amount_of_gpus -eq 0 ]; then echo "Will run manager on node $node_id with n args of $amount_of_workers_per_gpu" bqskit-manager -n $amount_of_workers_per_gpu -v &> $manager_log_file @@ -227,5 +350,4 @@ else echo "Manager and workers finished on node $node_id" >> $manager_log_file stop_mps_servers fi - -``` +``` \ No newline at end of file From 2efff2a0c01cd33072a23dfcb09d7b3447e70817 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Mon, 7 Oct 2024 21:59:27 -0700 Subject: [PATCH 18/22] autoupdate for pre-commit to V5.0.0 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5fd25af39..9c64255f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ ci: skip: [mypy] repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer From 8dc417b0ac0b488719b5b607eff8f6c2e0a54212 Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Thu, 10 Oct 2024 21:28:31 -0700 Subject: [PATCH 19/22] a commit to rerun the tests --- docs/guides/usegpus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index ab833205d..e727cb327 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -266,7 +266,7 @@ server_pid=$! uname -a >> $scratch_dir/server_${SLURM_JOB_ID}_started ``` -6. Executing the main application, that will connect to the BQSKit runtime +6. Executing the main application that will connect to the BQSKit runtime ```bash python ``` From 1e3bc57ed3d9085cbb72a2ebefb9539e0f72126b Mon Sep 17 00:00:00 2001 From: Alon Kukliansky Date: Fri, 11 Oct 2024 08:01:08 -0700 Subject: [PATCH 20/22] pre-commit fixes --- docs/guides/usegpus.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index e727cb327..22ef36807 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -72,7 +72,7 @@ Finally, use a compiler instance to execute the passes, and then print the stati from bqskit.compiler import Compiler with Compiler(num_workers=1) as compiler: - + out_circuit = compiler.compile(in_circuit, passes) print( @@ -110,7 +110,7 @@ with Compiler('localhost') as compiler: ### Single Server Multiple GPUs Setup -This section of the guide explains the main concepts in the [single_server_env.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/single_server_env.sh) script template and how to use it. The script creates a GPU enabled BQSKit runtime and is easily configured for any system. +This section of the guide explains the main concepts in the [single_server_env.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/single_server_env.sh) script template and how to use it. The script creates a GPU enabled BQSKit runtime and is easily configured for any system. After you configure the template (replacing every <> with an appropriate value) run it, and then in a seperate shell execute your python scirpt that uses this runtime enviorment. @@ -210,7 +210,7 @@ sbatch init_multi_node_multi_gpu_slurm_run.sh The rest of this section exaplains in detail both of the scripts. -#### init_multi_node_multi_gpu_slurm_run +#### init_multi_node_multi_gpu_slurm_run This is a SLURM batch script for running a multi-node BQSKit task across multiple GPUs. 
It manages job submission, environment setup, launching the BQSKit server and workers on different nodes, and the execution of the main application. 1. Job configuration and logging - this is a standard SLURM SBATCH header. @@ -350,4 +350,4 @@ else echo "Manager and workers finished on node $node_id" >> $manager_log_file stop_mps_servers fi -``` \ No newline at end of file +``` From bd13b4f3e0aaeb41379a735ec1a7fb020f551012 Mon Sep 17 00:00:00 2001 From: alonkukl Date: Tue, 22 Oct 2024 11:19:54 -0700 Subject: [PATCH 21/22] Fixing some typos --- docs/guides/usegpus.md | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/guides/usegpus.md b/docs/guides/usegpus.md index 22ef36807..e2babb8bb 100644 --- a/docs/guides/usegpus.md +++ b/docs/guides/usegpus.md @@ -11,12 +11,12 @@ First, you will need to install `bqskit-qfactor-jax`. This can easily done by us pip install bqskit-qfactor-jax ``` -This command will install also all the dependencies including BQSKit and JAX with GPU support. +This command will also install all the dependencies including BQSKit and JAX with GPU support. ## Optimizing a Circuit Using QFactor-Sample and the Gate Deletion Flow This section explains how to optimize a quantum circuit using QFactor-Sample and the gate deletion flow. -First we load the circuit to be optimized using the Circuit class. +First, we load the circuit to be optimized using the Circuit class. ```python from bqskit import Circuit @@ -24,7 +24,7 @@ from bqskit import Circuit in_circuit = Circuit.from_file("circuit_to_opt.qasm") ``` -Then we create the instniator instance, and set the number of multistarts to 32. +Then we create the instantiator instance and set the number of multistarts to 32. ```python from qfactorjax.qfactor_sample_jax import QFactorSampleJax @@ -89,9 +89,9 @@ For other usage examples, please refer to the [examples directory](https://githu ## Setting Up a Multi-GPU Environment -To run BQSKit with multiple GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU by leveragig NVIDIA's CUDA_VISIBLE_DEVICES enviorment variable. Several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on a single server ( or interactive node on a cluster) or using SBATCH on several nodes. You can find scripts to help you set up the runtime in this [link](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples/bqskit_env_scripts). +To run BQSKit with multiple GPUs, you need to set up the BQSKit runtime properly. Each worker should be assigned to a specific GPU by leveraging NVIDIA's CUDA_VISIBLE_DEVICES environment variable. Several workers can use the same GPU by utilizing [NVIDIA's MPS](https://docs.nvidia.com/deploy/mps/). You can set up the runtime on a single server ( or interactive node on a cluster) or using SBATCH on several nodes. You can find scripts to help you set up the runtime in this [link](https://github.com/BQSKit/bqskit-qfactor-jax/tree/main/examples/bqskit_env_scripts). -You may configure the number of GPUs to use on each server and also the number of workers on each GPU. If you use too many workers on the same GPU, you will run out of memory and experince an out-of-memory exception. If you are using QFactor, you may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. 
If you are using QFactor-Sample, start with a single worker and increase if the memory premits it. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. +You may configure the number of GPUs to use on each server and also the number of workers on each GPU. If you use too many workers on the same GPU, you will run out of memory and experience an out-of-memory exception. If you are using QFactor, you may use the following table as a starting configuration and adjust the number of workers according to your specific circuit, unitary size, and GPU performance. If you are using QFactor-Sample, start with a single worker and increase if the memory permits it. You can use the `nvidia-smi` command to check the GPU usage during execution; it specifies the utilization of the memory and the execution units. | Unitary Size | Workers per GPU | |----------------|------------------| @@ -101,7 +101,7 @@ You may configure the number of GPUs to use on each server and also the number o | 7 | 2 | | 8 and more | 1 | -Make sure that in your Python script you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use \`localhost\` as the IP address. +Make sure that in your Python script, you are creating the compiler object with the appropriate IP address. When running on the same node as the server, you can use \`localhost\` as the IP address. ```python with Compiler('localhost') as compiler: @@ -110,12 +110,12 @@ with Compiler('localhost') as compiler: ### Single Server Multiple GPUs Setup -This section of the guide explains the main concepts in the [single_server_env.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/single_server_env.sh) script template and how to use it. The script creates a GPU enabled BQSKit runtime and is easily configured for any system. +This section of the guide explains the main concepts in the [single_server_env.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/single_server_env.sh) script template and how to use it. The script creates a GPU-enabled BQSKit runtime and is easily configured for any system. -After you configure the template (replacing every <> with an appropriate value) run it, and then in a seperate shell execute your python scirpt that uses this runtime enviorment. +After you configure the template (replacing every <> with an appropriate value) run it, and then in a separate shell execute your python script that uses this runtime environment. -The enviorment script has the following parts: -1. Variable configuration - choosing the number of GPUs to use, and the number of workrs per GPU. Moreover, the scratch dir path is configured, later to be used for logging. +The environment script has the following parts: +1. Variable configuration - choosing the number of GPUs to use, and the number of workers per GPU. Moreover, the scratch dir path is configured and later used for logging. ```bash #!/bin/bash hostname=$(uname -n) @@ -149,7 +149,7 @@ wait_for_server_to_connect(){ done } ``` -3. Creating the log directory, and deleting any old log files that conflicts with the current run logs. +3. Creating the log directory, and deleting any old log files that conflict with the current run logs. 
```bash mkdir -p $scratch_dir/bqskit_logs @@ -162,7 +162,7 @@ echo "Will start bqskit runtime with id $unique_id gpus = $amount_of_gpus and wo rm -f $manager_log_file rm -f $server_log_file ``` -4. Starting NVIDA MPS to allow an efficient execution of multiple works on a single GPU. +4. Starting NVIDA MPS to allow efficient execution of multiple works on a single GPU. ```bash echo "Starting MPS server" nvidia-cuda-mps-control -d @@ -175,7 +175,7 @@ bqskit-manager -x -n$total_amount_of_workers -vvv &> $manager_log_file & manager_pid=$! wait_for_outgoing_thread_in_manager_log ``` -6. Starting the BQSKit server indicating that there is a single manager in the current server. Waiting untill the server connects to the manager before continuing to start the workers. +6. Starting the BQSKit server indicating that there is a single manager in the current server. Waiting until the server connects to the manager before continuing to start the workers. ```bash echo "starting BQSKit server" bqskit-server $hostname -vvv &>> $server_log_file & @@ -183,7 +183,7 @@ server_pid=$! wait_for_server_to_connect ``` -7. Starting the workrs, each seeing only a specific GPU. +7. Starting the workers, each seeing only a specific GPU. ```bash echo "Starting $total_amount_of_workers workers on $amount_of_gpus gpus" for (( gpu_id=0; gpu_id<$amount_of_gpus; gpu_id++ )) @@ -200,15 +200,15 @@ echo quit | nvidia-cuda-mps-control ``` -### Multis-Server Multi-GPU Enviorment Setup +### Multis-Server Multi-GPU Environment Setup -This section of the guide explains the main concepts in the [init_multi_node_multi_gpu_slurm_run.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh) [run_workers_and_managers.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/run_workers_and_managers.sh) scripts and how to use them. After configuring the scripts (updating every <>), place both of them in the same directory and initate a an SBATCH command. These scripts assume a SLURM enviorment, but can be easily ported to other disterbutation systems. +This section of the guide explains the main concepts in the [init_multi_node_multi_gpu_slurm_run.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/init_multi_node_multi_gpu_slurm_run.sh) [run_workers_and_managers.sh](https://github.com/BQSKit/bqskit-qfactor-jax/blob/main/examples/bqskit_env_scripts/run_workers_and_managers.sh) scripts and how to use them. After configuring the scripts (updating every <>), place both of them in the same directory and initiate an SBATCH command. These scripts assume a SLURM environment but can be easily ported to other distribution systems. ```bash sbatch init_multi_node_multi_gpu_slurm_run.sh ``` -The rest of this section exaplains in detail both of the scripts. +The rest of this section explains both of the scripts in detail. #### init_multi_node_multi_gpu_slurm_run This is a SLURM batch script for running a multi-node BQSKit task across multiple GPUs. It manages job submission, environment setup, launching the BQSKit server and workers on different nodes, and the execution of the main application. @@ -227,9 +227,9 @@ This is a SLURM batch script for running a multi-node BQSKit task across multipl scratch_dir= ``` -2. Shell environment setup - Please consulte with your HPC system admin to choose the apropriate modules to load that will enable you to JAX on NVDIA's GPUs. 
You may use NERSC's Perlmutter [documentation](https://docs.nersc.gov/development/languages/python/using-python-perlmutter/#jax) as a reference. +2. Shell environment setup - Please consult with your HPC system admin to choose the appropriate modules to load that will enable you to JAX on NVDIA's GPUs. You may use NERSC's Perlmutter [documentation](https://docs.nersc.gov/development/languages/python/using-python-perlmutter/#jax) as a reference. ```bash -### load any modules needed and activate the conda enviorment +### load any modules needed and activate the conda environment module load module load conda activate @@ -257,9 +257,9 @@ while [ "$(cat "$managers_started_file" | wc -l)" -lt "$n" ]; do done ``` -5. Starting the BQSKit server on the main node, and using SLURM's `SLURM_JOB_NODELIST` enviorment variable to indicate the BQSKit server the hostnames of the managers. +5. Starting the BQSKit server on the main node, and using SLURM's `SLURM_JOB_NODELIST` environment variable to indicate the BQSKit server the hostnames of the managers. ```bash -echo "starting BQSKit server on main node" +echo "starting BQSKit server on the main node" bqskit-server $(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ') &> $scratch_dir/bqskit_logs/server_${SLURM_JOB_ID}.log & server_pid=$! @@ -334,7 +334,7 @@ stop_mps_servers() { } ``` -Finaly, the script chekcs if GPUs are not needed, it spwans the manager with its default behaviour, else suing the "-x" argument, it indicates to the manager to wait for connecting workers. +Finally, the script checks if GPUs are not needed, it spawns the manager with its default behavior, else using the "-x" argument, it indicates to the manager to wait for connecting workers. ```bash if [ $amount_of_gpus -eq 0 ]; then echo "Will run manager on node $node_id with n args of $amount_of_workers_per_gpu" From 33091aa3bfa9f42539ae3e42665100df20566c31 Mon Sep 17 00:00:00 2001 From: alonkukl Date: Thu, 7 Nov 2024 23:38:02 -0800 Subject: [PATCH 22/22] Removing duplicated requirment --- docs/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index a88d4eaa7..a593ab467 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ Sphinx>=4.5.0 -sphinx-autodoc-typehints>=1.12.0 sphinx-rtd-theme>=1.0.0 sphinx-togglebutton>=0.2.3 sphinx-autodoc-typehints>=2.3.0