diff --git a/.gitignore b/.gitignore index 68bc17f..6e9b121 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +examples/data/ +.vscode/ \ No newline at end of file diff --git a/README.md b/README.md index dc8d731..39fb3d5 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,9 @@ pytest We provide examples demonstrating how to use copairs for: - [grouping profiles based on their metadata](./examples/finding_pairs.ipynb) -- [calculating mAP to assess phenotypic activity and consistnecy of perturbation using real data](./examples/mAP_demo.ipynb) - +- [calculating mAP to assess phenotypic activity of perturbations](./examples/phenotypic_activity.ipynb) +- [calculating mAP to assess phenotypic consistency of perturbations](./examples/phenotypic_consistency.ipynb) +- [estimating null size for mAP p-value calculation](./examples/null_size.ipynb) ## Citation If you find this work useful for your research, please cite our [pre-print](https://doi.org/10.1101/2024.04.01.587631): diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..09a6919 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,32 @@ + # Examples + +Example notebooks demonstrating the use of `copairs`. 
+ +## Installation + +To install dependencies for running examples, run: +```bash +pip install copairs[demo] +``` + +## Running examples + +```bash +cd examples +jupyter notebook +``` + +## List of examples + +We show how to use copairs for: + +- [grouping profiles based on their metadata](./finding_pairs.ipynb) +- [calculating mAP to assess phenotypic activity of perturbations](./phenotypic_activity.ipynb) +- [calculating mAP to assess phenotypic consistency of perturbations](./phenotypic_consistency.ipynb) +- [estimating null size for mAP p-value calculation](./null_size.ipynb) + +## Data used + +In these examples, we used a single plate of profiles from the dataset "cpg0004" (aka LINCS), which contains Cell Painting images of 1,327 small-molecule perturbations of A549 human cells. The wells on each plate were perturbed with 56 different compounds in six different doses. + +> Way, G. P. et al. Morphology and gene expression profiling provide complementary information for mapping cell state. Cell Syst 13, 911–923.e9 (2022). 
diff --git a/examples/finding_pairs.ipynb b/examples/finding_pairs.ipynb index d8fa818..b04886f 100644 --- a/examples/finding_pairs.ipynb +++ b/examples/finding_pairs.ipynb @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ ], "metadata": { "kernelspec": { - "display_name": "map_benchmark", + "display_name": "copairs", "language": "python", "name": "python3" }, @@ -292,7 +292,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.9.19" } }, "nbformat": 4, diff --git a/examples/null_size.ipynb b/examples/null_size.ipynb new file mode 100644 index 0000000..b8e2061 --- /dev/null +++ b/examples/null_size.ipynb @@ -0,0 +1,599 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Determining null size for mAP p-value calculation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.special import comb\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from copairs import map" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_scatter_grid(\n", + " data,\n", + " null_size_col=\"null_size\",\n", + " x_col=\"mAP\",\n", + " y_col=\"-log10(p-value)\",\n", + " color_col=\"below_corrected_p\",\n", + " cmap=\"tab10\",\n", + " figsize=(12, 6),\n", + "):\n", + " \"\"\"Plot a grid of scatter plots for different values of a given column.\n", + "\n", + " Args:\n", + " data (pd.DataFrame): Input DataFrame containing the data.\n", + " null_size_col (str): Column to split data into subplots. Defaults to \"null_size\".\n", + " x_col (str): Column to use for the x-axis. Defaults to \"mAP\".\n", + " y_col (str): Column to use for the y-axis. 
Defaults to \"-log10(p-value)\".\n", + " color_col (str): Column for coloring points. Defaults to \"below_corrected_p\".\n", + " cmap (str): Colormap for the scatter plot. Defaults to \"tab10\".\n", + " figsize (tuple): Figure size. Defaults to (12, 6).\n", + " \"\"\"\n", + " unique_null_sizes = sorted(data[null_size_col].unique()) # Get unique values\n", + " n_rows, n_cols = 3, 4 # Define grid shape\n", + "\n", + " fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, sharex=True, sharey=True)\n", + " axes = axes.flatten() # Flatten for easy iteration\n", + "\n", + " for i, null_size in enumerate(unique_null_sizes):\n", + " ax = axes[i]\n", + " subset = data[data[null_size_col] == null_size] # Filter data for current panel\n", + "\n", + " # Compute active ratio for the subset\n", + " active_ratio = subset[color_col].mean()\n", + "\n", + " # Scatter plot\n", + " _ = ax.scatter(\n", + " subset[x_col], subset[y_col], c=subset[color_col], cmap=cmap, s=10\n", + " )\n", + "\n", + " ax.axhline(\n", + " -np.log10(0.05), color=\"black\", linestyle=\"--\"\n", + " ) # Significance threshold\n", + " ax.set_title(f\"{null_size_col} = {null_size}\")\n", + "\n", + " # Display active ratio per panel\n", + " ax.text(\n", + " 0.4,\n", + " 5,\n", + " f\"Active = {100 * active_ratio:.2f}%\",\n", + " va=\"center\",\n", + " ha=\"left\",\n", + " fontsize=9,\n", + " )\n", + "\n", + " if i % n_cols == 0: # Leftmost column\n", + " ax.set_ylabel(y_col)\n", + " if i >= (n_rows - 1) * n_cols: # Bottom row\n", + " ax.set_xlabel(x_col)\n", + "\n", + " fig.suptitle(f\"Scatter plots across different {null_size_col} values\", fontsize=14)\n", + " plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust layout\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data\n", + "\n", + "This example relies on data and results from the [Phenotypic activity](./phenotypic_activity.ipynb) example, so run that one first if you haven't.\n", + "\n", + "Let's 
define some helper functions." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df_activity = pd.read_csv(\"data/2016_04_01_a549_48hr_batch1_plateSQ00014812.csv\")\n", + "activity_ap = pd.read_csv(\"data/activity_ap.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete null size\n", + "\n", + "We estimate the statistical significance of a mAP score with respect to a random baseline using a permutation testing approach, a non-parametric, assumption-free method for testing the null hypothesis of sample exchangeability. The complete AP null distribution consists of all possible rank list re-shuffles.\n", + "\n", + "Let $m$ be the number of perturbation replicates and $n$ be the number of control profiles. Given that one perturbation profile serves as a query, the complete null size $d_{null}$ can be calculated as a binomial coefficient:\n", + "\n", + "\\begin{equation}\n", + " d_{null} = \\binom{m-1+n}{m-1}\n", + "\\end{equation}\n", + "\n", + "Let's calculate the complete null size for the example dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "m=6, n=24, d=118755\n" + ] + } + ], + "source": [ + "# almost all perturbations have 6 replicates\n", + "m = (\n", + " df_activity.query(\"Metadata_broad_sample != 'DMSO'\")\n", + " .groupby(\"Metadata_broad_sample\")\n", + " .size()\n", + " .mode()[0]\n", + ")\n", + "# the number of control profiles is 24\n", + "n = df_activity.query(\"Metadata_broad_sample == 'DMSO'\").shape[0]\n", + "\n", + "# using SciPy's comb function for numerical stability\n", + "d = comb(m - 1 + n, m - 1, exact=True)\n", + "\n", + "print(f\"{m=}, {n=}, {d=}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For large datasets, computing the full combinatorial null is infeasible. 
Instead, we approximate the null distribution using Monte Carlo sampling with $d_{\\text{perm}}$ permutations:\n", + "\n", + "\\begin{equation}\n", + " null\\_size \\approx d_{null}\n", + "\\end{equation}\n", + "\n", + "where $null\\_size$ is the number of random rank list shufflings applied to estimate the null distribution.\n", + "\n", + "## Effect of null size on mAP p-value calculation\n", + "\n", + "Let's calculate mAP significance on the given dataset using `null_size` values from $10$ to $5*10^6$ and plot results below." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f757e36886c54b19af4b8c7c1fe77e55", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "activity_maps = []\n", + "for ns_pow in range(1, 7):\n", + " null_size = 10**ns_pow\n", + "\n", + " replicate_map = map.mean_average_precision(\n", + " activity_ap,\n", + " [\"Metadata_broad_sample\"],\n", + " null_size=null_size,\n", + " threshold=0.05,\n", + " seed=0,\n", + " )\n", + " replicate_map[\"null_size\"] = null_size\n", + " activity_maps.append(replicate_map)\n", + "\n", + " replicate_map = map.mean_average_precision(\n", + " activity_ap,\n", + " [\"Metadata_broad_sample\"],\n", + " null_size=5 * null_size,\n", + " threshold=0.05,\n", + " seed=0,\n", + " )\n", + " replicate_map[\"null_size\"] = 5 * null_size\n", + " activity_maps.append(replicate_map)\n", + "\n", + "activity_maps = pd.concat(activity_maps)\n", + "activity_maps.rename(columns={\"mean_average_precision\": \"mAP\"}, inplace=True)\n", + "activity_maps[\"-log10(p-value)\"] = -activity_maps[\"corrected_p_value\"].apply(np.log10)\n", + "\n", + "plot_scatter_grid(activity_maps)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because the full null size $d_{null}=118755$, 
smaller sample sizes ($<5,000$) lead to poor estimation of significance for these data, while very large values ($>100,000$) cover the whole null and do not affect perturbation ranking results.\n", + "\n", + "## Practical consideration for choosing null size\n", + "\n", + "In practice, drawing a large number of samples is not always feasible, because compute time for each AP calculation grows with the number of perturbations in the dataset, the number of metadata constraints for profile grouping, sizes of perturbation groups (the number of perturbation replicates) and control groups (the number of control replicates), and profile dimensionality (the number of features in a profile).\n", + "\n", + "Finding a `null_size` that works for a particular dataset is a balance between test resolution (for example, being able to tell apart very small p-values) and compute. We provided `null_size` values for each real-world dataset in Supplemental Materials to our paper—please refer to:\n", + "\n", + "> Kalinin, A. A. et al. A versatile information retrieval framework for evaluating profile strength and similarity. bioRxiv, 2024-04, (2024)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "copairs", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/mAP_demo.ipynb b/examples/phenotypic_activity.ipynb similarity index 80% rename from examples/mAP_demo.ipynb rename to examples/phenotypic_activity.ipynb index 5f25e41..32f8be8 100644 --- a/examples/mAP_demo.ipynb +++ b/examples/phenotypic_activity.ipynb @@ -1,8 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# mAP for phenotypic activity assessment" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -10,7 +17,8 @@ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", - "from copairs import map" + "from copairs import map\n", + "from copairs.matching import assign_reference_index" ] }, { @@ -19,9 +27,13 @@ "source": [ "## Introduction\n", "\n", - "This example demostrates how to use `copairs` to:\n", - "- assess phenotypic activity of perturbations' replicates against DMSO control replicates and\n", - "- assess phenotypic consistncy of perturbations htat target the same gene against other perturbations.\n", + "This example demonstrates how to use `copairs` to assess phenotypic activity of perturbations in a profiling dataset.\n", + "\n", + "Phenotypic activity is assessed by calculating mean average precision (mAP) for the retrieval of replicates of a perturbation against replicates of negative controls.\n", + "\n", + "It aims to answer the question: “How distinguishable is this perturbation from negative controls?”\n", + "\n", + "The resulting perturbation mAP score reflects the average extent to 
which its replicate profiles are more similar to each other compared to control profiles (Figure 1E).\n", "\n", "Citation:\n", "> Kalinin, A. A. et al. A versatile information retrieval framework for evaluating profile strength and similarity. bioRxiv, 2024-04, (2024)." @@ -29,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -60,7 +72,7 @@ } ], "source": [ - "fig1_path = \"F1.large.jpg\"\n", + "fig1_path = \"data/F1.large.jpg\"\n", "fig1_url = \"https://www.biorxiv.org/content/biorxiv/early/2024/04/02/2024.04.01.587631/F1.large.jpg\"\n", "\n", "if not Path(fig1_path).is_file():\n", @@ -84,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -548,17 +560,23 @@ "[384 rows x 507 columns]" ] }, - "execution_count": 4, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "local_path = \"data/2016_04_01_a549_48hr_batch1_plateSQ00014812.csv\"\n", "commit = \"da8ae6a3bc103346095d61b4ee02f08fc85a5d98\"\n", "plate = \"SQ00014812\"\n", "url = f\"https://media.githubusercontent.com/media/broadinstitute/lincs-cell-painting/{commit}/profiles/2016_04_01_a549_48hr_batch1/{plate}/{plate}_normalized_feature_select.csv.gz\"\n", "\n", - "df = pd.read_csv(url)\n", + "if not Path(local_path).is_file():\n", + " df = pd.read_csv(url)\n", + " df.to_csv(local_path, index=False)\n", + "else:\n", + " df = pd.read_csv(local_path)\n", + "\n", "df = df.loc[:, df.nunique() > 1] # remove constant columns\n", "df" ] @@ -572,7 +590,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -599,7 +617,7 @@ " 'BCL2|BCL2L1|BCL2L2'], dtype=object)" ] }, - "execution_count": 5, + "execution_count": 20, "metadata": {}, "output_type": 
"execute_result" } @@ -612,9 +630,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Assessing phenotypic activity of compounds with mAP\n", - "\n", - "Phenotypic activity of a perturbation reflects the average extent to which its replicate profiles are more similar to each other compared to control profiles (Figure 1E)." + "## Assessing phenotypic activity of compounds with mAP" ] }, { @@ -623,14 +639,110 @@ "source": [ "Here, we treat different doses of each compound as replicates and assess how well we can retrieve them by similarity against the group of negative controls (DMSO).\n", "\n", - "To ensure correct grouping of profiles, we can add a dummy column that is equal to row index for all DMSO replicates and to -1 for all compound replicates. " + "For phenotypic activity, it's helpful to add an extra column that is equal to row index for all DMSO replicates and to -1 for all compound replicates using `assign_reference_index` function. This helps to not count groups of negative controls as query groups and not consider other perturbations as a reference." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "reference_col = \"Metadata_reference_index\"\n", + "\n", + "df_activity = assign_reference_index(\n", + " df,\n", + " \"Metadata_broad_sample == 'DMSO'\", # condition to get reference profiles (neg controls)\n", + " reference_col=reference_col,\n", + " default_value=-1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we define the rules by which profiles are grouped based on metadata:\n", + "\n", + "* Two profiles are a positive pair if they belong to the same group that is not a control group. In this case, any two replicate profiles of the same compound are a positive pair. To define that using metadata columns, positive pairs should share the same value in the metadata column that identifies compounds (`Metadata_broad_sample`). 
We add this column to a list named `pos_sameby`.\n", + "\n", + "* In this case, profiles that form a positive pair do not need to be different in any of the metadata columns, so we keep `pos_diffby` empty. Although one could define them as being from different batches, for instance, to account for batch effects.\n", + "\n", + "* Two profiles are a negative pair when one of them belongs to a group of compound replicates and another to a group of DMSO controls. That means they should be different both in the metadata column that identifies the specific compound and the reference index column that we created. The latter is needed to ensure that replicates of compounds are retrieved against only DMSO controls at this stage (and not against replicates of other compounds). We list these columns in `neg_diffby`.\n", + "\n", + "* Profiles that form a negative pair do not need to be the same in any of the metadata columns, so we keep `neg_sameby` empty.\n", + "\n", + "\n", + "Finally, we include the `Metadata_reference_index` column in:\n", + "* `pos_sameby`—this ensures positive pairs connect profiles that share the same value in this column, i.e. a positive pair cannot be formed between any two negative controls (control profiles contain index values).\n", + "* `neg_diffby`—this ensures negative pairs connect profiles that differ in this column, i.e. a negative pair cannot be formed between profiles of two different perturbations (all perturbation profiles contain -1)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# positive pairs are replicates of the same treatment\n", + "pos_sameby = [\"Metadata_broad_sample\", reference_col]\n", + "pos_diffby = []\n", + "\n", + "neg_sameby = []\n", + "# negative pairs are replicates of different treatments\n", + "neg_diffby = [\"Metadata_broad_sample\", reference_col]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use `average_precision` function to calculate the average precision score for each replicate of each compound.\n", + "\n", + "It returns metadata with 3 new columns: number of positive and negative pairs for each replicate profile and the average precision score." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3cfeabe4061942f499ad5045f1262f51", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", " \n", " \n", - " Metadata_reference_index\n", " Metadata_broad_sample\n", " Metadata_mg_per_ml\n", " Metadata_mmoles_per_liter\n", @@ -662,1296 +773,80 @@ " Metadata_broad_sample_type\n", " Metadata_pert_type\n", " Metadata_broad_id\n", - " ...\n", - " Nuclei_Texture_InverseDifferenceMoment_AGP_5_0\n", - " Nuclei_Texture_InverseDifferenceMoment_DNA_20_0\n", - " Nuclei_Texture_InverseDifferenceMoment_ER_5_0\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_10_0\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_5_0\n", - " Nuclei_Texture_SumAverage_RNA_5_0\n", - " Nuclei_Texture_SumEntropy_DNA_10_0\n", - " Nuclei_Texture_SumEntropy_DNA_20_0\n", - " Nuclei_Texture_SumEntropy_DNA_5_0\n", - " Nuclei_Texture_Variance_RNA_10_0\n", + " Metadata_InChIKey14\n", + " Metadata_moa\n", + " Metadata_target\n", + " Metadata_broad_date\n", + " Metadata_Well\n", + " Metadata_reference_index\n", + " n_pos_pairs\n", + " 
n_total_pairs\n", + " average_precision\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", - " DMSO\n", - " 0.000000\n", - " 0.000000\n", - " NaN\n", - " NaN\n", - " A01\n", - " control\n", - " control\n", - " NaN\n", - " ...\n", - " -1.3544\n", - " -1.07770\n", - " 2.26020\n", - " -0.377010\n", - " -0.065840\n", - " 2.12360\n", - " 2.8740\n", - " 2.87500\n", - " 2.3047\n", - " -0.92358\n", - " \n", - " \n", - " 1\n", - " 1\n", - " DMSO\n", - " 0.000000\n", - " 0.000000\n", - " NaN\n", - " NaN\n", - " A02\n", - " control\n", - " control\n", - " NaN\n", - " ...\n", - " -2.3840\n", - " -0.73440\n", - " 1.12090\n", - " -0.182500\n", - " -0.061450\n", - " 0.66985\n", - " 2.3919\n", - " 2.35230\n", - " 1.8672\n", - " -0.11820\n", + " 6\n", + " BRD-K74363950-004-01-0\n", + " 5.655600\n", + " 10.000000\n", + " BRD-K74363950\n", + " BRD-K74363950-004-01-0\n", + " A07\n", + " trt\n", + " trt\n", + " BRD-K74363950\n", + " ASMXXROZKSBQIH\n", + " acetylcholine receptor antagonist\n", + " CHRM1|CHRM2|CHRM3|CHRM4|CHRM5\n", + " broad_id_20170327\n", + " A07\n", + " -1\n", + " 5\n", + " 29\n", + " 0.325013\n", " \n", " \n", - " 2\n", - " 2\n", - " DMSO\n", - " 0.000000\n", - " 0.000000\n", - " NaN\n", - " NaN\n", - " A03\n", - " control\n", - " control\n", - " NaN\n", - " ...\n", - " -1.9493\n", - " -0.36148\n", - " 0.44050\n", - " 0.326660\n", - " 0.547200\n", - " 0.25015\n", - " 1.2271\n", - " 0.77847\n", - " 1.0651\n", - " -0.44810\n", + " 7\n", + " BRD-K74363950-004-01-0\n", + " 1.885200\n", + " 3.333300\n", + " BRD-K74363950\n", + " BRD-K74363950-004-01-0\n", + " A08\n", + " trt\n", + " trt\n", + " BRD-K74363950\n", + " ASMXXROZKSBQIH\n", + " acetylcholine receptor antagonist\n", + " CHRM1|CHRM2|CHRM3|CHRM4|CHRM5\n", + " broad_id_20170327\n", + " A08\n", + " -1\n", + " 5\n", + " 29\n", + " 0.513889\n", " \n", " \n", - " 3\n", - " 3\n", - " DMSO\n", - " 0.000000\n", - " 0.000000\n", - " NaN\n", - " NaN\n", - " A04\n", - " control\n", - " control\n", - " NaN\n", - " ...\n", - 
" -2.2909\n", - " -0.46380\n", - " 0.96434\n", - " 1.132200\n", - " 0.753500\n", - " 0.31403\n", - " 1.4384\n", - " 1.48110\n", - " 1.2943\n", - " -0.83810\n", - " \n", - " \n", - " 4\n", - " 4\n", - " DMSO\n", - " 0.000000\n", - " 0.000000\n", - " NaN\n", - " NaN\n", - " A05\n", - " control\n", - " control\n", - " NaN\n", - " ...\n", - " -1.8955\n", - " -1.05350\n", - " 1.64840\n", - " 0.057781\n", - " 0.070229\n", - " 1.60990\n", - " 1.1296\n", - " 0.90213\n", - " 1.1016\n", - " 0.53225\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 379\n", - " -1\n", - " BRD-K82746043-001-15-1\n", - " 3.248700\n", - " 3.333300\n", - " BRD-K82746043\n", - " BRD-K82746043-001-15-1\n", - " P20\n", - " trt\n", - " trt\n", - " BRD-K82746043\n", - " ...\n", - " -6.1522\n", - " 1.81410\n", - " 1.54220\n", - " -1.874700\n", - " -1.133900\n", - " 1.57540\n", - " -3.0962\n", - " -3.25160\n", - " -2.7683\n", - " 1.40170\n", - " \n", - " \n", - " 380\n", - " -1\n", - " BRD-K82746043-001-15-1\n", - " 1.082900\n", - " 1.111100\n", - " BRD-K82746043\n", - " BRD-K82746043-001-15-1\n", - " P21\n", - " trt\n", - " trt\n", - " BRD-K82746043\n", - " ...\n", - " -5.1586\n", - " 1.50580\n", - " 1.68420\n", - " -1.126400\n", - " -1.066600\n", - " 1.24740\n", - " -1.5305\n", - " -1.79020\n", - " -1.2474\n", - " 1.17600\n", - " \n", - " \n", - " 381\n", - " -1\n", - " BRD-K82746043-001-15-1\n", - " 0.360970\n", - " 0.370370\n", - " BRD-K82746043\n", - " BRD-K82746043-001-15-1\n", - " P22\n", - " trt\n", - " trt\n", - " BRD-K82746043\n", - " ...\n", - " -5.9475\n", - " 1.42100\n", - " 1.51020\n", - " -1.103600\n", - " -1.666500\n", - " 1.19840\n", - " -2.6086\n", - " -2.97620\n", - " -2.0026\n", - " 0.91557\n", - " \n", - " \n", - " 382\n", - " 
-1\n", - " BRD-K82746043-001-15-1\n", - " 0.120320\n", - " 0.123460\n", - " BRD-K82746043\n", - " BRD-K82746043-001-15-1\n", - " P23\n", - " trt\n", - " trt\n", - " BRD-K82746043\n", - " ...\n", - " -8.4408\n", - " 2.99620\n", - " 2.55230\n", - " -2.275200\n", - " -1.783500\n", - " 2.49200\n", - " -4.3964\n", - " -4.19030\n", - " -3.8360\n", - " 1.02240\n", - " \n", - " \n", - " 383\n", - " -1\n", - " BRD-K82746043-001-15-1\n", - " 0.040108\n", - " 0.041152\n", - " BRD-K82746043\n", - " BRD-K82746043-001-15-1\n", - " P24\n", - " trt\n", - " trt\n", - " BRD-K82746043\n", - " ...\n", - " -7.9510\n", - " 2.55730\n", - " 3.05790\n", - " -1.466300\n", - " -1.673800\n", - " 1.99540\n", - " -4.2176\n", - " -4.49940\n", - " -3.4922\n", - " 1.01170\n", - " \n", - " \n", - "\n", - "

384 rows × 508 columns

\n", - "" - ], - "text/plain": [ - " Metadata_reference_index Metadata_broad_sample Metadata_mg_per_ml \\\n", - "0 0 DMSO 0.000000 \n", - "1 1 DMSO 0.000000 \n", - "2 2 DMSO 0.000000 \n", - "3 3 DMSO 0.000000 \n", - "4 4 DMSO 0.000000 \n", - ".. ... ... ... \n", - "379 -1 BRD-K82746043-001-15-1 3.248700 \n", - "380 -1 BRD-K82746043-001-15-1 1.082900 \n", - "381 -1 BRD-K82746043-001-15-1 0.360970 \n", - "382 -1 BRD-K82746043-001-15-1 0.120320 \n", - "383 -1 BRD-K82746043-001-15-1 0.040108 \n", - "\n", - " Metadata_mmoles_per_liter Metadata_pert_id Metadata_pert_mfc_id \\\n", - "0 0.000000 NaN NaN \n", - "1 0.000000 NaN NaN \n", - "2 0.000000 NaN NaN \n", - "3 0.000000 NaN NaN \n", - "4 0.000000 NaN NaN \n", - ".. ... ... ... \n", - "379 3.333300 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "380 1.111100 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "381 0.370370 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "382 0.123460 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "383 0.041152 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "\n", - " Metadata_pert_well Metadata_broad_sample_type Metadata_pert_type \\\n", - "0 A01 control control \n", - "1 A02 control control \n", - "2 A03 control control \n", - "3 A04 control control \n", - "4 A05 control control \n", - ".. ... ... ... \n", - "379 P20 trt trt \n", - "380 P21 trt trt \n", - "381 P22 trt trt \n", - "382 P23 trt trt \n", - "383 P24 trt trt \n", - "\n", - " Metadata_broad_id ... Nuclei_Texture_InverseDifferenceMoment_AGP_5_0 \\\n", - "0 NaN ... -1.3544 \n", - "1 NaN ... -2.3840 \n", - "2 NaN ... -1.9493 \n", - "3 NaN ... -2.2909 \n", - "4 NaN ... -1.8955 \n", - ".. ... ... ... \n", - "379 BRD-K82746043 ... -6.1522 \n", - "380 BRD-K82746043 ... -5.1586 \n", - "381 BRD-K82746043 ... -5.9475 \n", - "382 BRD-K82746043 ... -8.4408 \n", - "383 BRD-K82746043 ... 
-7.9510 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_DNA_20_0 \\\n", - "0 -1.07770 \n", - "1 -0.73440 \n", - "2 -0.36148 \n", - "3 -0.46380 \n", - "4 -1.05350 \n", - ".. ... \n", - "379 1.81410 \n", - "380 1.50580 \n", - "381 1.42100 \n", - "382 2.99620 \n", - "383 2.55730 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_ER_5_0 \\\n", - "0 2.26020 \n", - "1 1.12090 \n", - "2 0.44050 \n", - "3 0.96434 \n", - "4 1.64840 \n", - ".. ... \n", - "379 1.54220 \n", - "380 1.68420 \n", - "381 1.51020 \n", - "382 2.55230 \n", - "383 3.05790 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_10_0 \\\n", - "0 -0.377010 \n", - "1 -0.182500 \n", - "2 0.326660 \n", - "3 1.132200 \n", - "4 0.057781 \n", - ".. ... \n", - "379 -1.874700 \n", - "380 -1.126400 \n", - "381 -1.103600 \n", - "382 -2.275200 \n", - "383 -1.466300 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_5_0 \\\n", - "0 -0.065840 \n", - "1 -0.061450 \n", - "2 0.547200 \n", - "3 0.753500 \n", - "4 0.070229 \n", - ".. ... \n", - "379 -1.133900 \n", - "380 -1.066600 \n", - "381 -1.666500 \n", - "382 -1.783500 \n", - "383 -1.673800 \n", - "\n", - " Nuclei_Texture_SumAverage_RNA_5_0 Nuclei_Texture_SumEntropy_DNA_10_0 \\\n", - "0 2.12360 2.8740 \n", - "1 0.66985 2.3919 \n", - "2 0.25015 1.2271 \n", - "3 0.31403 1.4384 \n", - "4 1.60990 1.1296 \n", - ".. ... ... \n", - "379 1.57540 -3.0962 \n", - "380 1.24740 -1.5305 \n", - "381 1.19840 -2.6086 \n", - "382 2.49200 -4.3964 \n", - "383 1.99540 -4.2176 \n", - "\n", - " Nuclei_Texture_SumEntropy_DNA_20_0 Nuclei_Texture_SumEntropy_DNA_5_0 \\\n", - "0 2.87500 2.3047 \n", - "1 2.35230 1.8672 \n", - "2 0.77847 1.0651 \n", - "3 1.48110 1.2943 \n", - "4 0.90213 1.1016 \n", - ".. ... ... 
\n", - "379 -3.25160 -2.7683 \n", - "380 -1.79020 -1.2474 \n", - "381 -2.97620 -2.0026 \n", - "382 -4.19030 -3.8360 \n", - "383 -4.49940 -3.4922 \n", - "\n", - " Nuclei_Texture_Variance_RNA_10_0 \n", - "0 -0.92358 \n", - "1 -0.11820 \n", - "2 -0.44810 \n", - "3 -0.83810 \n", - "4 0.53225 \n", - ".. ... \n", - "379 1.40170 \n", - "380 1.17600 \n", - "381 0.91557 \n", - "382 1.02240 \n", - "383 1.01170 \n", - "\n", - "[384 rows x 508 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_activity = df.copy()\n", - "# make deafult value equal to row index\n", - "df_activity[\"Metadata_reference_index\"] = df_activity.index\n", - "# make index equal to -1 for all treatment replicates (non-DMSO)\n", - "df_activity.loc[df[\"Metadata_broad_sample\"] != \"DMSO\", \"Metadata_reference_index\"] = -1\n", - "# now all treatment replicates equal -1 in the index column, except for DMSO replicates\n", - "df_activity.insert(\n", - " 0, \"Metadata_reference_index\", df_activity.pop(\"Metadata_reference_index\")\n", - ")\n", - "df_activity" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we define the rules by which profiles are grouped based on metadata:\n", - "\n", - "* Two profiles are a positive pair if they belong to the same group that is not a control group. In this case, any two replicate profiles of the same compound are a positive pair. To define that using metadata columns, positive pairs should share the same value in the metadata column that identifies compounds (`Metadata_broad_sample`). We add this column to a list names `pos_sameby`.\n", - "\n", - "* In this case, profiles that form a positive pair do not need to be different in any of the metatada columns, so we keep `pos_diffby` empty. 
Although one could define them as being from different batches, for instance, to account for batch effects.\n", - "\n", - "* Two profiles are a negative pair when one of them belongs to a group of compound replicates and another to a group of DMSO controls. That means they should be different both in the metadata column that identifies the specific compound and the reference index columns that we created. The latter is needed to ensure that replicates of compounds are retrieved against only DMSO controls at this stage (and not against replicates of other compounds). We list these columns in `neg_diffby`.\n", - "\n", - "* Profiles that form a negative pair do not need to be same in any of the metatada columns, so we keep `neg_sameby` empty." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# positive pairs are replicates of the same treatment\n", - "pos_sameby = [\"Metadata_broad_sample\"]\n", - "pos_diffby = []\n", - "\n", - "neg_sameby = []\n", - "# negative pairs are replicates of different treatments\n", - "neg_diffby = [\"Metadata_broad_sample\", \"Metadata_reference_index\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can use `average_precision` function to calculate the average precision score for each replicate of each compound.\n", - "\n", - "It returns metadata with 3 new columns: number of positive and negative pairs for each replicate profile and the average precision score." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "51509158c2e84267b94e8d0cf5952604", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_reference_indexMetadata_broad_sampleMetadata_mg_per_mlMetadata_mmoles_per_literMetadata_pert_idMetadata_pert_mfc_idMetadata_pert_wellMetadata_broad_sample_typeMetadata_pert_typeMetadata_broad_idMetadata_InChIKey14Metadata_moaMetadata_targetMetadata_broad_dateMetadata_Welln_pos_pairsn_total_pairsaverage_precision
6-1BRD-K74363950-004-01-05.65560010.000000BRD-K74363950BRD-K74363950-004-01-0A07trttrtBRD-K74363950ASMXXROZKSBQIHacetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A075290.325013
7-1BRD-K74363950-004-01-01.8852003.333300BRD-K74363950BRD-K74363950-004-01-0A08trttrtBRD-K74363950ASMXXROZKSBQIHacetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A085290.513889
8-1BRD-K74363950-004-01-00.6284001.111100BRD-K74363950BRD-K74363950-004-01-0A09trttrtBRD-K74363950ASMXXROZKSBQIHacetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A095290.727778
9-1BRD-K74363950-004-01-00.2094700.370370BRD-K74363950BRD-K74363950-004-01-0A10trttrtBRD-K74363950ASMXXROZKSBQIHacetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A105290.783333
10-1BRD-K74363950-004-01-00.0698230.123460BRD-K74363950BRD-K74363950-004-01-0A11trttrtBRD-K74363950ASMXXROZKSBQIHacetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A115290.900000
.........................................................
379-1BRD-K82746043-001-15-13.2487003.333300BRD-K82746043BRD-K82746043-001-15-1P20trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P205291.000000
380-1BRD-K82746043-001-15-11.0829001.111100BRD-K82746043BRD-K82746043-001-15-1P21trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P215290.966667
381-1BRD-K82746043-001-15-10.3609700.370370BRD-K82746043BRD-K82746043-001-15-1P22trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P225290.942857
382-1BRD-K82746043-001-15-10.1203200.123460BRD-K82746043BRD-K82746043-001-15-1P23trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P235291.000000
383-1BRD-K82746043-001-15-10.0401080.041152BRD-K82746043BRD-K82746043-001-15-1P24trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P245291.000000
\n", - "

360 rows × 18 columns

\n", - "" - ], - "text/plain": [ - " Metadata_reference_index Metadata_broad_sample Metadata_mg_per_ml \\\n", - "6 -1 BRD-K74363950-004-01-0 5.655600 \n", - "7 -1 BRD-K74363950-004-01-0 1.885200 \n", - "8 -1 BRD-K74363950-004-01-0 0.628400 \n", - "9 -1 BRD-K74363950-004-01-0 0.209470 \n", - "10 -1 BRD-K74363950-004-01-0 0.069823 \n", - ".. ... ... ... \n", - "379 -1 BRD-K82746043-001-15-1 3.248700 \n", - "380 -1 BRD-K82746043-001-15-1 1.082900 \n", - "381 -1 BRD-K82746043-001-15-1 0.360970 \n", - "382 -1 BRD-K82746043-001-15-1 0.120320 \n", - "383 -1 BRD-K82746043-001-15-1 0.040108 \n", - "\n", - " Metadata_mmoles_per_liter Metadata_pert_id Metadata_pert_mfc_id \\\n", - "6 10.000000 BRD-K74363950 BRD-K74363950-004-01-0 \n", - "7 3.333300 BRD-K74363950 BRD-K74363950-004-01-0 \n", - "8 1.111100 BRD-K74363950 BRD-K74363950-004-01-0 \n", - "9 0.370370 BRD-K74363950 BRD-K74363950-004-01-0 \n", - "10 0.123460 BRD-K74363950 BRD-K74363950-004-01-0 \n", - ".. ... ... ... \n", - "379 3.333300 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "380 1.111100 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "381 0.370370 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "382 0.123460 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "383 0.041152 BRD-K82746043 BRD-K82746043-001-15-1 \n", - "\n", - " Metadata_pert_well Metadata_broad_sample_type Metadata_pert_type \\\n", - "6 A07 trt trt \n", - "7 A08 trt trt \n", - "8 A09 trt trt \n", - "9 A10 trt trt \n", - "10 A11 trt trt \n", - ".. ... ... ... 
\n", - "379 P20 trt trt \n", - "380 P21 trt trt \n", - "381 P22 trt trt \n", - "382 P23 trt trt \n", - "383 P24 trt trt \n", - "\n", - " Metadata_broad_id Metadata_InChIKey14 Metadata_moa \\\n", - "6 BRD-K74363950 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", - "7 BRD-K74363950 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", - "8 BRD-K74363950 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", - "9 BRD-K74363950 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", - "10 BRD-K74363950 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", - ".. ... ... ... \n", - "379 BRD-K82746043 JLYAXFNOILIKPP BCL inhibitor \n", - "380 BRD-K82746043 JLYAXFNOILIKPP BCL inhibitor \n", - "381 BRD-K82746043 JLYAXFNOILIKPP BCL inhibitor \n", - "382 BRD-K82746043 JLYAXFNOILIKPP BCL inhibitor \n", - "383 BRD-K82746043 JLYAXFNOILIKPP BCL inhibitor \n", - "\n", - " Metadata_target Metadata_broad_date Metadata_Well \\\n", - "6 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A07 \n", - "7 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A08 \n", - "8 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A09 \n", - "9 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A10 \n", - "10 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A11 \n", - ".. ... ... ... \n", - "379 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P20 \n", - "380 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P21 \n", - "381 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P22 \n", - "382 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P23 \n", - "383 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P24 \n", - "\n", - " n_pos_pairs n_total_pairs average_precision \n", - "6 5 29 0.325013 \n", - "7 5 29 0.513889 \n", - "8 5 29 0.727778 \n", - "9 5 29 0.783333 \n", - "10 5 29 0.900000 \n", - ".. ... ... ... 
\n", - "379 5 29 1.000000 \n", - "380 5 29 0.966667 \n", - "381 5 29 0.942857 \n", - "382 5 29 1.000000 \n", - "383 5 29 1.000000 \n", - "\n", - "[360 rows x 18 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "metadata = df_activity.filter(regex=\"^Metadata\")\n", - "profiles = df_activity.filter(regex=\"^(?!Metadata)\").values\n", - "\n", - "replicate_aps = map.average_precision(\n", - " metadata, profiles, pos_sameby, pos_diffby, neg_sameby, neg_diffby\n", - ")\n", - "replicate_aps = replicate_aps.query(\"Metadata_broad_sample != 'DMSO'\") # remove DMSO\n", - "replicate_aps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At the next step, we average replicate AP scores at the per-compound level to obtain mAP values using `mean_average_precision`.\n", - "\n", - "It also calculates p-values using permutation testing, and performs FDR correction to compare across compounds." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b55cf11c765b4af98dca44f808372955", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_broad_samplemean_average_precisionp_valuecorrected_p_valuebelow_pbelow_corrected_p-log10(p-value)
0BRD-A69275535-001-01-50.5756290.0176980.023857TrueTrue1.622390
1BRD-A69636825-003-04-70.6938060.0037000.006922TrueTrue2.159775
2BRD-A69815203-001-07-61.0000000.0001000.000341TrueTrue3.467064
3BRD-A70858459-001-01-70.7771730.0006000.001289TrueTrue2.889828
4BRD-A72309220-001-04-10.7169270.0022000.004253TrueTrue2.371314
5BRD-A72390365-001-15-20.9344440.0001000.000341TrueTrue3.467064
6BRD-A73368467-003-17-60.9260320.0001000.000341TrueTrue3.467064
7BRD-A74980173-001-11-90.7659310.0006000.001289TrueTrue2.889828
8BRD-A81233518-004-16-10.6211830.0093990.013978TrueTrue1.854552
9BRD-A82035391-001-02-70.3180660.2603740.264942FalseFalse0.576849
\n", - "" - ], - "text/plain": [ - " Metadata_broad_sample mean_average_precision p_value \\\n", - "0 BRD-A69275535-001-01-5 0.575629 0.017698 \n", - "1 BRD-A69636825-003-04-7 0.693806 0.003700 \n", - "2 BRD-A69815203-001-07-6 1.000000 0.000100 \n", - "3 BRD-A70858459-001-01-7 0.777173 0.000600 \n", - "4 BRD-A72309220-001-04-1 0.716927 0.002200 \n", - "5 BRD-A72390365-001-15-2 0.934444 0.000100 \n", - "6 BRD-A73368467-003-17-6 0.926032 0.000100 \n", - "7 BRD-A74980173-001-11-9 0.765931 0.000600 \n", - "8 BRD-A81233518-004-16-1 0.621183 0.009399 \n", - "9 BRD-A82035391-001-02-7 0.318066 0.260374 \n", - "\n", - " corrected_p_value below_p below_corrected_p -log10(p-value) \n", - "0 0.023857 True True 1.622390 \n", - "1 0.006922 True True 2.159775 \n", - "2 0.000341 True True 3.467064 \n", - "3 0.001289 True True 2.889828 \n", - "4 0.004253 True True 2.371314 \n", - "5 0.000341 True True 3.467064 \n", - "6 0.000341 True True 3.467064 \n", - "7 0.001289 True True 2.889828 \n", - "8 0.013978 True True 1.854552 \n", - "9 0.264942 False False 0.576849 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "replicate_maps = map.mean_average_precision(\n", - " replicate_aps, pos_sameby, null_size=10000, threshold=0.05, seed=0\n", - ")\n", - "replicate_maps[\"-log10(p-value)\"] = -replicate_maps[\"corrected_p_value\"].apply(np.log10)\n", - "replicate_maps.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can plot the results and filter out phenotypicall inactive compounds with corrected p-value >0.05." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "active_ratio = replicate_maps.below_corrected_p.mean()\n", - "\n", - "plt.scatter(\n", - " data=replicate_maps,\n", - " x=\"mean_average_precision\",\n", - " y=\"-log10(p-value)\",\n", - " c=\"below_corrected_p\",\n", - " cmap=\"tab10\",\n", - " s=10,\n", - ")\n", - "# 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r',\n", - "plt.xlabel(\"mAP\")\n", - "plt.ylabel(\"-log10(p-value)\")\n", - "plt.axhline(-np.log10(0.05), color=\"black\", linestyle=\"--\")\n", - "plt.text(\n", - " 0.5, 1.5, f\"Phenotypically active = {100*active_ratio:.2f}%\", va=\"center\", ha=\"left\"\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Assessing phenotypic consistency of compounds grouped by targets\n", - "\n", - "Phenotypic consitency of a group of perturbations reflects the average extent to which members of this group are more similar to each other compared to other groups (see Figure 1F).\n", - "\n", - "First, we are going to filter out compounds that were not phenotypically active using mAP p-values from the previous section.\n", - "\n", - "Next, we will aggregate each compound’s replicate profiles into a \"consensus\" profile by taking the median of each feature to reduce profile noise and improve computational efficiency." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1965,17 +860,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1989,592 +881,30 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_broad_sampleMetadata_mg_per_mlMetadata_mmoles_per_literMetadata_pert_idMetadata_pert_mfc_idMetadata_pert_wellMetadata_broad_sample_typeMetadata_pert_typeMetadata_broad_idMetadata_InChIKey14...Nuclei_Texture_InverseDifferenceMoment_AGP_5_0Nuclei_Texture_InverseDifferenceMoment_DNA_20_0Nuclei_Texture_InverseDifferenceMoment_ER_5_0Nuclei_Texture_InverseDifferenceMoment_Mito_10_0Nuclei_Texture_InverseDifferenceMoment_Mito_5_0Nuclei_Texture_SumAverage_RNA_5_0Nuclei_Texture_SumEntropy_DNA_10_0Nuclei_Texture_SumEntropy_DNA_20_0Nuclei_Texture_SumEntropy_DNA_5_0Nuclei_Texture_Variance_RNA_10_0
6BRD-K74363950-004-01-05.65560010.000000BRD-K74363950BRD-K74363950-004-01-0A07trttrtBRD-K74363950ASMXXROZKSBQIH...-0.51038-0.764021.616400-0.49600-0.4813602.4211001.107901.138201.143200.329230
7BRD-K74363950-004-01-01.8852003.333300BRD-K74363950BRD-K74363950-004-01-0A08trttrtBRD-K74363950ASMXXROZKSBQIH...-0.23602-0.411290.3049600.478840.005852-0.7103300.41986-0.238880.54949-0.092826
8BRD-K74363950-004-01-00.6284001.111100BRD-K74363950BRD-K74363950-004-01-0A09trttrtBRD-K74363950ASMXXROZKSBQIH...-0.52939-0.547270.7225700.733990.2238500.0358420.333180.390640.42969-0.8113908BRD-K74363950-004-01-00.6284001.111100BRD-K74363950BRD-K74363950-004-01-0A09trttrtBRD-K74363950ASMXXROZKSBQIHacetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A09-15290.727778
9trtBRD-K74363950ASMXXROZKSBQIH...-0.58515-0.415330.0448740.763740.062913-0.6568500.18149-0.109600.48699-0.345260acetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A10-15290.783333
10trtBRD-K74363950ASMXXROZKSBQIH...-0.52686-0.578230.5916100.851840.5603700.0391840.598640.441230.75783-0.018031
11BRD-K74363950-004-01-00.0232740.041152BRD-K74363950BRD-K74363950-004-01-0A12trttrtBRD-K74363950ASMXXROZKSBQIH...-0.48060-1.472200.8141500.794630.0892490.0722400.918280.396261.09120-0.243750acetylcholine receptor antagonistCHRM1|CHRM2|CHRM3|CHRM4|CHRM5broad_id_20170327A11-15290.900000
12BRD-K75958547-238-01-04.61540010.000000BRD-K75958547BRD-K75958547-238-01-0A13trttrtBRD-K75958547VGYFMXBACGZSIL...-5.89680-0.97404-5.025000-10.41400-6.0675007.6257003.318303.27410-2.122402.299300
\n", - "

7 rows × 507 columns

\n", - "
" - ], - "text/plain": [ - " Metadata_broad_sample Metadata_mg_per_ml Metadata_mmoles_per_liter \\\n", - "6 BRD-K74363950-004-01-0 5.655600 10.000000 \n", - "7 BRD-K74363950-004-01-0 1.885200 3.333300 \n", - "8 BRD-K74363950-004-01-0 0.628400 1.111100 \n", - "9 BRD-K74363950-004-01-0 0.209470 0.370370 \n", - "10 BRD-K74363950-004-01-0 0.069823 0.123460 \n", - "11 BRD-K74363950-004-01-0 0.023274 0.041152 \n", - "12 BRD-K75958547-238-01-0 4.615400 10.000000 \n", - "\n", - " Metadata_pert_id Metadata_pert_mfc_id Metadata_pert_well \\\n", - "6 BRD-K74363950 BRD-K74363950-004-01-0 A07 \n", - "7 BRD-K74363950 BRD-K74363950-004-01-0 A08 \n", - "8 BRD-K74363950 BRD-K74363950-004-01-0 A09 \n", - "9 BRD-K74363950 BRD-K74363950-004-01-0 A10 \n", - "10 BRD-K74363950 BRD-K74363950-004-01-0 A11 \n", - "11 BRD-K74363950 BRD-K74363950-004-01-0 A12 \n", - "12 BRD-K75958547 BRD-K75958547-238-01-0 A13 \n", - "\n", - " Metadata_broad_sample_type Metadata_pert_type Metadata_broad_id \\\n", - "6 trt trt BRD-K74363950 \n", - "7 trt trt BRD-K74363950 \n", - "8 trt trt BRD-K74363950 \n", - "9 trt trt BRD-K74363950 \n", - "10 trt trt BRD-K74363950 \n", - "11 trt trt BRD-K74363950 \n", - "12 trt trt BRD-K75958547 \n", - "\n", - " Metadata_InChIKey14 ... Nuclei_Texture_InverseDifferenceMoment_AGP_5_0 \\\n", - "6 ASMXXROZKSBQIH ... -0.51038 \n", - "7 ASMXXROZKSBQIH ... -0.23602 \n", - "8 ASMXXROZKSBQIH ... -0.52939 \n", - "9 ASMXXROZKSBQIH ... -0.58515 \n", - "10 ASMXXROZKSBQIH ... -0.52686 \n", - "11 ASMXXROZKSBQIH ... -0.48060 \n", - "12 VGYFMXBACGZSIL ... 
-5.89680 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_DNA_20_0 \\\n", - "6 -0.76402 \n", - "7 -0.41129 \n", - "8 -0.54727 \n", - "9 -0.41533 \n", - "10 -0.57823 \n", - "11 -1.47220 \n", - "12 -0.97404 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_ER_5_0 \\\n", - "6 1.616400 \n", - "7 0.304960 \n", - "8 0.722570 \n", - "9 0.044874 \n", - "10 0.591610 \n", - "11 0.814150 \n", - "12 -5.025000 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_10_0 \\\n", - "6 -0.49600 \n", - "7 0.47884 \n", - "8 0.73399 \n", - "9 0.76374 \n", - "10 0.85184 \n", - "11 0.79463 \n", - "12 -10.41400 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_5_0 \\\n", - "6 -0.481360 \n", - "7 0.005852 \n", - "8 0.223850 \n", - "9 0.062913 \n", - "10 0.560370 \n", - "11 0.089249 \n", - "12 -6.067500 \n", - "\n", - " Nuclei_Texture_SumAverage_RNA_5_0 Nuclei_Texture_SumEntropy_DNA_10_0 \\\n", - "6 2.421100 1.10790 \n", - "7 -0.710330 0.41986 \n", - "8 0.035842 0.33318 \n", - "9 -0.656850 0.18149 \n", - "10 0.039184 0.59864 \n", - "11 0.072240 0.91828 \n", - "12 7.625700 3.31830 \n", - "\n", - " Nuclei_Texture_SumEntropy_DNA_20_0 Nuclei_Texture_SumEntropy_DNA_5_0 \\\n", - "6 1.13820 1.14320 \n", - "7 -0.23888 0.54949 \n", - "8 0.39064 0.42969 \n", - "9 -0.10960 0.48699 \n", - "10 0.44123 0.75783 \n", - "11 0.39626 1.09120 \n", - "12 3.27410 -2.12240 \n", - "\n", - " Nuclei_Texture_Variance_RNA_10_0 \n", - "6 0.329230 \n", - "7 -0.092826 \n", - "8 -0.811390 \n", - "9 -0.345260 \n", - "10 -0.018031 \n", - "11 -0.243750 \n", - "12 2.299300 \n", - "\n", - "[7 rows x 507 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# only keep active compounds, i.e. 
those with corrected p-value < 0.05\n", - "active_compounds = replicate_maps.query(\"below_corrected_p\")[\"Metadata_broad_sample\"]\n", - "df_consistent = df.query(\"Metadata_broad_sample in @active_compounds\")\n", - "df_consistent.head(7)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_broad_sampleMetadata_targetCells_AreaShape_EccentricityCells_AreaShape_ExtentCells_AreaShape_FormFactorCells_AreaShape_OrientationCells_AreaShape_SolidityCells_AreaShape_Zernike_0_0Cells_AreaShape_Zernike_1_1Cells_AreaShape_Zernike_2_0...Nuclei_Texture_InverseDifferenceMoment_AGP_5_0Nuclei_Texture_InverseDifferenceMoment_DNA_20_0Nuclei_Texture_InverseDifferenceMoment_ER_5_0Nuclei_Texture_InverseDifferenceMoment_Mito_10_0Nuclei_Texture_InverseDifferenceMoment_Mito_5_0Nuclei_Texture_SumAverage_RNA_5_0Nuclei_Texture_SumEntropy_DNA_10_0Nuclei_Texture_SumEntropy_DNA_20_0Nuclei_Texture_SumEntropy_DNA_5_0Nuclei_Texture_Variance_RNA_10_0
0BRD-A69636825-003-04-7[CACNA1C, CACNA1S, CACNA2D1, CACNG1, HTR3A, KC...-0.3263650.6516100.2112800.0924120.4569150.4865150.4355450.863160...0.1752000.557360-0.8594650.4090450.201909-1.003185-1.405850-1.495100-0.867225-0.066115
1BRD-A69815203-001-07-6[ABCB11, CAMLG, FPR1, PPIA, PPIF, PPP3CA, PPP3...2.487450-2.8727500.616635-0.451942-2.260100-3.3009000.316320-1.825400...-2.681800-0.197230-4.7173500.6441701.3241000.1030700.9860251.3462000.773450-2.749350
2BRD-A70858459-001-01-7[ESR1, ESR2, MAP1A, MAP2]-0.9202101.4615500.445630-0.3942351.5284501.116100-0.0549901.061270...0.2388750.3264750.0645630.1876460.200447-0.6956600.1002250.4018850.114583-0.245753
3BRD-A72309220-001-04-1[HTR1A, HTR1B, HTR1D, HTR1E, HTR1F, HTR2A, HTR...0.0454350.0997550.1036280.592620-0.3522000.202930-0.059855-0.353755...1.069575-0.475915-0.1740020.2179650.090715-0.1546950.165235-0.1601910.242195-0.126886
4BRD-A73368467-003-17-6[HRH1]-0.062074-0.3148200.526190-0.502485-0.444675-0.1912250.1450190.018870...0.527805-1.2042500.615420-0.1876450.3218801.0132350.7936750.6829251.0755000.844115
\n", - "

5 rows × 495 columns

\n", - "
" - ], - "text/plain": [ - " Metadata_broad_sample Metadata_target \\\n", - "0 BRD-A69636825-003-04-7 [CACNA1C, CACNA1S, CACNA2D1, CACNG1, HTR3A, KC... \n", - "1 BRD-A69815203-001-07-6 [ABCB11, CAMLG, FPR1, PPIA, PPIF, PPP3CA, PPP3... \n", - "2 BRD-A70858459-001-01-7 [ESR1, ESR2, MAP1A, MAP2] \n", - "3 BRD-A72309220-001-04-1 [HTR1A, HTR1B, HTR1D, HTR1E, HTR1F, HTR2A, HTR... \n", - "4 BRD-A73368467-003-17-6 [HRH1] \n", - "\n", - " Cells_AreaShape_Eccentricity Cells_AreaShape_Extent \\\n", - "0 -0.326365 0.651610 \n", - "1 2.487450 -2.872750 \n", - "2 -0.920210 1.461550 \n", - "3 0.045435 0.099755 \n", - "4 -0.062074 -0.314820 \n", - "\n", - " Cells_AreaShape_FormFactor Cells_AreaShape_Orientation \\\n", - "0 0.211280 0.092412 \n", - "1 0.616635 -0.451942 \n", - "2 0.445630 -0.394235 \n", - "3 0.103628 0.592620 \n", - "4 0.526190 -0.502485 \n", - "\n", - " Cells_AreaShape_Solidity Cells_AreaShape_Zernike_0_0 \\\n", - "0 0.456915 0.486515 \n", - "1 -2.260100 -3.300900 \n", - "2 1.528450 1.116100 \n", - "3 -0.352200 0.202930 \n", - "4 -0.444675 -0.191225 \n", - "\n", - " Cells_AreaShape_Zernike_1_1 Cells_AreaShape_Zernike_2_0 ... \\\n", - "0 0.435545 0.863160 ... \n", - "1 0.316320 -1.825400 ... \n", - "2 -0.054990 1.061270 ... \n", - "3 -0.059855 -0.353755 ... \n", - "4 0.145019 0.018870 ... 
\n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_AGP_5_0 \\\n", - "0 0.175200 \n", - "1 -2.681800 \n", - "2 0.238875 \n", - "3 1.069575 \n", - "4 0.527805 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_DNA_20_0 \\\n", - "0 0.557360 \n", - "1 -0.197230 \n", - "2 0.326475 \n", - "3 -0.475915 \n", - "4 -1.204250 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_ER_5_0 \\\n", - "0 -0.859465 \n", - "1 -4.717350 \n", - "2 0.064563 \n", - "3 -0.174002 \n", - "4 0.615420 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_10_0 \\\n", - "0 0.409045 \n", - "1 0.644170 \n", - "2 0.187646 \n", - "3 0.217965 \n", - "4 -0.187645 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_Mito_5_0 \\\n", - "0 0.201909 \n", - "1 1.324100 \n", - "2 0.200447 \n", - "3 0.090715 \n", - "4 0.321880 \n", - "\n", - " Nuclei_Texture_SumAverage_RNA_5_0 Nuclei_Texture_SumEntropy_DNA_10_0 \\\n", - "0 -1.003185 -1.405850 \n", - "1 0.103070 0.986025 \n", - "2 -0.695660 0.100225 \n", - "3 -0.154695 0.165235 \n", - "4 1.013235 0.793675 \n", - "\n", - " Nuclei_Texture_SumEntropy_DNA_20_0 Nuclei_Texture_SumEntropy_DNA_5_0 \\\n", - "0 -1.495100 -0.867225 \n", - "1 1.346200 0.773450 \n", - "2 0.401885 0.114583 \n", - "3 -0.160191 0.242195 \n", - "4 0.682925 1.075500 \n", - "\n", - " Nuclei_Texture_Variance_RNA_10_0 \n", - "0 -0.066115 \n", - "1 -2.749350 \n", - "2 -0.245753 \n", - "3 -0.126886 \n", - "4 0.844115 \n", - "\n", - "[5 rows x 495 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# aggregate replicates by taking the median of each feature\n", - "feature_cols = [c for c in df_consistent.columns if not c.startswith(\"Metadata\")]\n", - "df_consistent = df_consistent.groupby(\n", - " [\"Metadata_broad_sample\", \"Metadata_target\"], as_index=False\n", - ")[feature_cols].median()\n", - "df_consistent[\"Metadata_target\"] = df_consistent[\"Metadata_target\"].str.split(\"|\")\n", - 
"df_consistent.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we again use metadata columns to define grouping of profiles. Here, we'd like to group those compounds that share a target and assess their similarity against compounds that do not have the same target:\n", - "\n", - "* Two compound profiles are a positive pair if they share the same target. To define that using metadata columns, positive pairs should share the same value in the metadata column that identifies targets (`Metadata_target`). We add this column to a list names `pos_sameby`.\n", - "\n", - "* In this case, profiles that form a positive pair do not need to be different in any of the metatada columns, so we keep `pos_diffby` empty. Although one could define them as being structurally different, for example.\n", - "\n", - "* Two profiles are a negative pair when do not share a common target. That means they should be different in the metadata column that identifies targets (`Metadata_target`).\n", - "\n", - "* Profiles that form a negative pair do not need to be same in any of the metatada columns, so we keep `neg_sameby` empty.\n", - "\n", - "We use `map.multilabel.average_precision` because each compound can have more than one target. If that's not the case, the standard `map.average_precision` should be used instead." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d6f90c7f26924332b4a4e23ba90dd98e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2582,130 +912,239 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Metadata_broad_sampleaverage_precisionn_pos_pairsn_total_pairsMetadata_target
52BRD-A69636825-003-04-70.500000142HTR3A
32BRD-A72309220-001-04-10.406071442HTR1A
37BRD-A72309220-001-04-10.142857139HTR1B
39BRD-A72309220-001-04-10.142857139HTR1D
41BRD-A72309220-001-04-10.142857139HTR1E
...................................................
16BRD-K74363950-004-01-00.105128242CHRM3379BRD-K82746043-001-15-13.2487003.333300BRD-K82746043BRD-K82746043-001-15-1P20trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P20-15291.000000
19BRD-K74363950-004-01-00.105128242CHRM4380BRD-K82746043-001-15-11.0829001.111100BRD-K82746043BRD-K82746043-001-15-1P21trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P21-15290.966667
22BRD-K74363950-004-01-00.105128242CHRM5381BRD-K82746043-001-15-10.3609700.370370BRD-K82746043BRD-K82746043-001-15-1P22trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P22-15290.942857
28BRD-K76908866-001-07-60.500000142ERBB2382BRD-K82746043-001-15-10.1203200.123460BRD-K82746043BRD-K82746043-001-15-1P23trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P23-15291.000000
61BRD-K81258678-001-01-00.100000142RELA383BRD-K82746043-001-15-10.0401080.041152BRD-K82746043BRD-K82746043-001-15-1P24trttrtBRD-K82746043JLYAXFNOILIKPPBCL inhibitorBCL2|BCL2L1|BCL2L2broad_id_20170327P24-15291.000000
\n", - "

64 rows × 5 columns

\n", + "

360 rows × 18 columns

\n", "" ], "text/plain": [ - " Metadata_broad_sample average_precision n_pos_pairs n_total_pairs \\\n", - "52 BRD-A69636825-003-04-7 0.500000 1 42 \n", - "32 BRD-A72309220-001-04-1 0.406071 4 42 \n", - "37 BRD-A72309220-001-04-1 0.142857 1 39 \n", - "39 BRD-A72309220-001-04-1 0.142857 1 39 \n", - "41 BRD-A72309220-001-04-1 0.142857 1 39 \n", - ".. ... ... ... ... \n", - "16 BRD-K74363950-004-01-0 0.105128 2 42 \n", - "19 BRD-K74363950-004-01-0 0.105128 2 42 \n", - "22 BRD-K74363950-004-01-0 0.105128 2 42 \n", - "28 BRD-K76908866-001-07-6 0.500000 1 42 \n", - "61 BRD-K81258678-001-01-0 0.100000 1 42 \n", + " Metadata_broad_sample Metadata_mg_per_ml Metadata_mmoles_per_liter \\\n", + "6 BRD-K74363950-004-01-0 5.655600 10.000000 \n", + "7 BRD-K74363950-004-01-0 1.885200 3.333300 \n", + "8 BRD-K74363950-004-01-0 0.628400 1.111100 \n", + "9 BRD-K74363950-004-01-0 0.209470 0.370370 \n", + "10 BRD-K74363950-004-01-0 0.069823 0.123460 \n", + ".. ... ... ... \n", + "379 BRD-K82746043-001-15-1 3.248700 3.333300 \n", + "380 BRD-K82746043-001-15-1 1.082900 1.111100 \n", + "381 BRD-K82746043-001-15-1 0.360970 0.370370 \n", + "382 BRD-K82746043-001-15-1 0.120320 0.123460 \n", + "383 BRD-K82746043-001-15-1 0.040108 0.041152 \n", + "\n", + " Metadata_pert_id Metadata_pert_mfc_id Metadata_pert_well \\\n", + "6 BRD-K74363950 BRD-K74363950-004-01-0 A07 \n", + "7 BRD-K74363950 BRD-K74363950-004-01-0 A08 \n", + "8 BRD-K74363950 BRD-K74363950-004-01-0 A09 \n", + "9 BRD-K74363950 BRD-K74363950-004-01-0 A10 \n", + "10 BRD-K74363950 BRD-K74363950-004-01-0 A11 \n", + ".. ... ... ... 
\n", + "379 BRD-K82746043 BRD-K82746043-001-15-1 P20 \n", + "380 BRD-K82746043 BRD-K82746043-001-15-1 P21 \n", + "381 BRD-K82746043 BRD-K82746043-001-15-1 P22 \n", + "382 BRD-K82746043 BRD-K82746043-001-15-1 P23 \n", + "383 BRD-K82746043 BRD-K82746043-001-15-1 P24 \n", + "\n", + " Metadata_broad_sample_type Metadata_pert_type Metadata_broad_id \\\n", + "6 trt trt BRD-K74363950 \n", + "7 trt trt BRD-K74363950 \n", + "8 trt trt BRD-K74363950 \n", + "9 trt trt BRD-K74363950 \n", + "10 trt trt BRD-K74363950 \n", + ".. ... ... ... \n", + "379 trt trt BRD-K82746043 \n", + "380 trt trt BRD-K82746043 \n", + "381 trt trt BRD-K82746043 \n", + "382 trt trt BRD-K82746043 \n", + "383 trt trt BRD-K82746043 \n", + "\n", + " Metadata_InChIKey14 Metadata_moa \\\n", + "6 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", + "7 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", + "8 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", + "9 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", + "10 ASMXXROZKSBQIH acetylcholine receptor antagonist \n", + ".. ... ... \n", + "379 JLYAXFNOILIKPP BCL inhibitor \n", + "380 JLYAXFNOILIKPP BCL inhibitor \n", + "381 JLYAXFNOILIKPP BCL inhibitor \n", + "382 JLYAXFNOILIKPP BCL inhibitor \n", + "383 JLYAXFNOILIKPP BCL inhibitor \n", + "\n", + " Metadata_target Metadata_broad_date Metadata_Well \\\n", + "6 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A07 \n", + "7 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A08 \n", + "8 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A09 \n", + "9 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A10 \n", + "10 CHRM1|CHRM2|CHRM3|CHRM4|CHRM5 broad_id_20170327 A11 \n", + ".. ... ... ... 
\n", + "379 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P20 \n", + "380 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P21 \n", + "381 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P22 \n", + "382 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P23 \n", + "383 BCL2|BCL2L1|BCL2L2 broad_id_20170327 P24 \n", "\n", - " Metadata_target \n", - "52 HTR3A \n", - "32 HTR1A \n", - "37 HTR1B \n", - "39 HTR1D \n", - "41 HTR1E \n", - ".. ... \n", - "16 CHRM3 \n", - "19 CHRM4 \n", - "22 CHRM5 \n", - "28 ERBB2 \n", - "61 RELA \n", + " Metadata_reference_index n_pos_pairs n_total_pairs average_precision \n", + "6 -1 5 29 0.325013 \n", + "7 -1 5 29 0.513889 \n", + "8 -1 5 29 0.727778 \n", + "9 -1 5 29 0.783333 \n", + "10 -1 5 29 0.900000 \n", + ".. ... ... ... ... \n", + "379 -1 5 29 1.000000 \n", + "380 -1 5 29 0.966667 \n", + "381 -1 5 29 0.942857 \n", + "382 -1 5 29 1.000000 \n", + "383 -1 5 29 1.000000 \n", "\n", - "[64 rows x 5 columns]" + "[360 rows x 18 columns]" ] }, - "execution_count": 13, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# positive pairs are compounds that share a target\n", - "pos_sameby = [\"Metadata_target\"]\n", - "pos_diffby = []\n", - "\n", - "neg_sameby = []\n", - "# negative pairs are compounds that do not share a target\n", - "neg_diffby = [\"Metadata_target\"]\n", - "\n", - "metadata = df_consistent.filter(regex=\"^Metadata\")\n", - "profiles = df_consistent.filter(regex=\"^(?!Metadata)\").values\n", + "metadata = df_activity.filter(regex=\"^Metadata\")\n", + "profiles = df_activity.filter(regex=\"^(?!Metadata)\").values\n", "\n", - "target_aps = map.multilabel.average_precision(\n", - " metadata,\n", - " profiles,\n", - " pos_sameby=pos_sameby,\n", - " pos_diffby=pos_diffby,\n", - " neg_sameby=neg_sameby,\n", - " neg_diffby=neg_diffby,\n", - " multilabel_col=\"Metadata_target\",\n", + "activity_ap = map.average_precision(\n", + " metadata, profiles, pos_sameby, pos_diffby, neg_sameby, neg_diffby\n", ")\n", - "target_aps" + "activity_ap = 
activity_ap.query(\"Metadata_broad_sample != 'DMSO'\") # remove DMSO\n", + "activity_ap.to_csv(\"data/activity_ap.csv\", index=False)\n", + "activity_ap" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we can compute mAP scores and p-values for each target group." + "At the next step, we average replicate AP scores at the per-compound level to obtain mAP values using `mean_average_precision`.\n", + "\n", + "It also calculates p-values using permutation testing, and performs FDR correction to compare across compounds.\n", + "\n", + "For more information on choosing `null size` parameter see the [Null size](./null_size.ipynb) example." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "587b411cad734aa9ab356ee6ba537fd5", + "model_id": "3c684ee9c2094831a91b21b14dcbb2b0", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/15 [00:00\n", " \n", " \n", - " Metadata_target\n", + " Metadata_broad_sample\n", + " Metadata_reference_index\n", " mean_average_precision\n", " p_value\n", " corrected_p_value\n", @@ -2758,162 +1198,184 @@ " \n", " \n", " 0\n", - " ADRA1A\n", - " 0.250000\n", - " 0.113389\n", - " 0.192056\n", - " False\n", - " False\n", - " 0.716573\n", + " BRD-A69275535-001-01-5\n", + " -1\n", + " 0.575629\n", + " 1.725598e-02\n", + " 0.023276\n", + " True\n", + " True\n", + " 1.633101\n", " \n", " \n", " 1\n", - " ADRA2A\n", - " 0.250000\n", - " 0.113389\n", - " 0.192056\n", - " False\n", - " False\n", - " 0.716573\n", + " BRD-A69636825-003-04-7\n", + " -1\n", + " 0.693806\n", + " 3.477997e-03\n", + " 0.006507\n", + " True\n", + " True\n", + " 2.186605\n", " \n", " \n", " 2\n", - " AURKA\n", - " 0.625000\n", - " 0.023398\n", - " 0.101390\n", + " BRD-A69815203-001-07-6\n", + " -1\n", + " 1.000000\n", + " 9.999990e-07\n", + " 0.000008\n", " True\n", - " False\n", - " 0.994005\n", + " True\n", + " 
5.081670\n", " \n", " \n", " 3\n", - " BIRC2\n", - " 0.060662\n", - " 0.379062\n", - " 0.469315\n", - " False\n", - " False\n", - " 0.328536\n", + " BRD-A70858459-001-01-7\n", + " -1\n", + " 0.777173\n", + " 8.279992e-04\n", + " 0.001921\n", + " True\n", + " True\n", + " 2.716482\n", " \n", " \n", " 4\n", - " CHRM1\n", - " 0.098420\n", - " 0.484752\n", - " 0.484752\n", - " False\n", - " False\n", - " 0.314481\n", + " BRD-A72309220-001-04-1\n", + " -1\n", + " 0.716927\n", + " 2.323998e-03\n", + " 0.004493\n", + " True\n", + " True\n", + " 2.347458\n", " \n", " \n", " 5\n", - " CHRM2\n", - " 0.098420\n", - " 0.484752\n", - " 0.484752\n", - " False\n", - " False\n", - " 0.314481\n", + " BRD-A72390365-001-15-2\n", + " -1\n", + " 0.934444\n", + " 2.799997e-05\n", + " 0.000108\n", + " True\n", + " True\n", + " 3.965506\n", " \n", " \n", " 6\n", - " CHRM3\n", - " 0.098420\n", - " 0.484752\n", - " 0.484752\n", - " False\n", - " False\n", - " 0.314481\n", + " BRD-A73368467-003-17-6\n", + " -1\n", + " 0.926032\n", + " 3.699996e-05\n", + " 0.000134\n", + " True\n", + " True\n", + " 3.872491\n", " \n", " \n", " 7\n", - " CHRM4\n", - " 0.098420\n", - " 0.484752\n", - " 0.484752\n", - " False\n", - " False\n", - " 0.314481\n", + " BRD-A74980173-001-11-9\n", + " -1\n", + " 0.765931\n", + " 1.017999e-03\n", + " 0.002187\n", + " True\n", + " True\n", + " 2.660188\n", " \n", " \n", " 8\n", - " CHRM5\n", - " 0.098420\n", - " 0.484752\n", - " 0.484752\n", - " False\n", - " False\n", - " 0.314481\n", + " BRD-A81233518-004-16-1\n", + " -1\n", + " 0.621183\n", + " 9.594990e-03\n", + " 0.014269\n", + " True\n", + " True\n", + " 1.845592\n", " \n", " \n", " 9\n", - " DRD2\n", - " 0.750000\n", - " 0.000900\n", - " 0.005849\n", - " True\n", - " True\n", - " 2.232888\n", + " BRD-A82035391-001-02-7\n", + " -1\n", + " 0.318066\n", + " 2.536767e-01\n", + " 0.258127\n", + " False\n", + " False\n", + " 0.588166\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Metadata_target 
mean_average_precision p_value corrected_p_value \\\n", - "0 ADRA1A 0.250000 0.113389 0.192056 \n", - "1 ADRA2A 0.250000 0.113389 0.192056 \n", - "2 AURKA 0.625000 0.023398 0.101390 \n", - "3 BIRC2 0.060662 0.379062 0.469315 \n", - "4 CHRM1 0.098420 0.484752 0.484752 \n", - "5 CHRM2 0.098420 0.484752 0.484752 \n", - "6 CHRM3 0.098420 0.484752 0.484752 \n", - "7 CHRM4 0.098420 0.484752 0.484752 \n", - "8 CHRM5 0.098420 0.484752 0.484752 \n", - "9 DRD2 0.750000 0.000900 0.005849 \n", - "\n", - " below_p below_corrected_p -log10(p-value) \n", - "0 False False 0.716573 \n", - "1 False False 0.716573 \n", - "2 True False 0.994005 \n", - "3 False False 0.328536 \n", - "4 False False 0.314481 \n", - "5 False False 0.314481 \n", - "6 False False 0.314481 \n", - "7 False False 0.314481 \n", - "8 False False 0.314481 \n", - "9 True True 2.232888 " + " Metadata_broad_sample Metadata_reference_index mean_average_precision \\\n", + "0 BRD-A69275535-001-01-5 -1 0.575629 \n", + "1 BRD-A69636825-003-04-7 -1 0.693806 \n", + "2 BRD-A69815203-001-07-6 -1 1.000000 \n", + "3 BRD-A70858459-001-01-7 -1 0.777173 \n", + "4 BRD-A72309220-001-04-1 -1 0.716927 \n", + "5 BRD-A72390365-001-15-2 -1 0.934444 \n", + "6 BRD-A73368467-003-17-6 -1 0.926032 \n", + "7 BRD-A74980173-001-11-9 -1 0.765931 \n", + "8 BRD-A81233518-004-16-1 -1 0.621183 \n", + "9 BRD-A82035391-001-02-7 -1 0.318066 \n", + "\n", + " p_value corrected_p_value below_p below_corrected_p \\\n", + "0 1.725598e-02 0.023276 True True \n", + "1 3.477997e-03 0.006507 True True \n", + "2 9.999990e-07 0.000008 True True \n", + "3 8.279992e-04 0.001921 True True \n", + "4 2.323998e-03 0.004493 True True \n", + "5 2.799997e-05 0.000108 True True \n", + "6 3.699996e-05 0.000134 True True \n", + "7 1.017999e-03 0.002187 True True \n", + "8 9.594990e-03 0.014269 True True \n", + "9 2.536767e-01 0.258127 False False \n", + "\n", + " -log10(p-value) \n", + "0 1.633101 \n", + "1 2.186605 \n", + "2 5.081670 \n", + "3 2.716482 \n", + "4 2.347458 
\n", + "5 3.965506 \n", + "6 3.872491 \n", + "7 2.660188 \n", + "8 1.845592 \n", + "9 0.588166 " ] }, - "execution_count": 14, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "target_maps = map.mean_average_precision(\n", - " target_aps, pos_sameby, null_size=10000, threshold=0.05, seed=0\n", + "activity_map = map.mean_average_precision(\n", + " activity_ap, pos_sameby, null_size=1000000, threshold=0.05, seed=0\n", ")\n", - "target_maps[\"-log10(p-value)\"] = -target_maps[\"corrected_p_value\"].apply(np.log10)\n", - "target_maps.head(10)" + "activity_map[\"-log10(p-value)\"] = -activity_map[\"corrected_p_value\"].apply(np.log10)\n", + "activity_map.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Similarly, we can plot the results, where groups of compounds targeting the same gene are called consistent if their corrected p-value < 0.05." + "Finally, we can plot the results and filter out phenotypically inactive compounds with corrected p-value >0.05." 
] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA98klEQVR4nO3deXhU5f338c9kmywkIRgIAUJAgkBQILJIiAVBEJUq2KqIiuhPaGWryKKiRSioUPeKoBXLUmVxQUQRUUQQEWSPZZOyL5IEUEhCAiHL/fzBkyljFjKTSSY5vF/XNdfFnPM9Z75zEmY+uc9mM8YYAQAAWISPtxsAAADwJMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFD9vN1DZCgoKdOzYMYWGhspms3m7HQAAUAbGGGVmZqpevXry8Sl9bOayCzfHjh1TTEyMt9sAAABuOHLkiBo0aFBqzWUXbkJDQyVd2DhhYWFe7gYAAJRFRkaGYmJiHN/jpbnswk3hrqiwsDDCDQAA1UxZDinhgGIAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAFBuJ87n6v7/7FeT1f9R27U79EnaKa/1ctndOBMAAHiWMUYDth3QjxnZypeUlV+gwTsPqU6AvzpF1Kj0fhi5AQAA5ZKSk6st/z/YFPK1SZ+fOO2Vfgg3AACgXPx9bEWmGUl+xUyvDIQbAABQLrUD/NUrMtwRKnwk+cqme+rW8ko/HHMDAADKbVp8rF48mKo1pzJ1hb+fRjeuqxY1grzSC+EGAACUW6Cvj8Y1qeftNiSxWwoAAFgM4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFiKV8PN5MmT1b59e4WGhqpOnTrq06ePdu/eXeoys2fPls1mc3oEBgZWUscAAKCq82q4+fbbbzV06FD98MMPWr58uXJzc3XTTTcpKyur1OXCwsKUkpLieBw6dKiSOgYAAFWdV69zs2zZMqfns2fPVp06dbR582Z17ty5xOVsNpvq1q1b0e0BAIBqqEodc5Oeni5JqlWr9Ms1nzlzRrGxsYqJiVHv3r21Y8eOEmtzcnKUkZHh9AAAANZVZcJNQUGBRowYoaSkJF199dUl1jVr1kwzZ87U4sWL9d5776mgoECdOnXS0aNHi62fPHmywsPDHY+YmJiKegsAAKAKsBljjLebkKTBgwfriy++0Jo1a9SgQYMyL5ebm6sWLVqoX79+mjRpUpH5OTk5ysnJcTzPyMhQTEyM0tPTFRYW5pHeAQBAxcrIyFB4eHiZvr+rxL2lhg0bpiVLlmj16tUuBRtJ8vf3V0JCgvbu3VvsfLvdLrvd7ok2AQBANeDV3VLGGA0bNkyLFi3SN998o8aNG7u8jvz8fG3btk3R0dEV0CEAAKhuvDpyM3ToUM2bN0+LFy9WaGioUlNTJUnh4eEKCrpwm/QHHnhA9evX1+TJkyVJEydOVMeOHRUXF6fTp0/rxRdf1KFDhzRw4ECvvQ8AAFB1eDXcvPnmm5KkG264wWn6rFmz9OCDD0qSDh8+LB+f/w0wnTp1SoMGDVJqaqoiIiLUtm1brV27VvHx8ZXVNgAAqMKqzAHFlcWVA5IAAEDV4Mr3d5U5FRwAAMATCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSC
DcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBSCDcAAMBS/LzdAADg8nDobI5u2rhb6fkF8pX0fw0iNalpA2+3BQti5AYAUOHy8/N1/fpdSs8vuPBc0oyjJzX1UKp3G4MlEW4AABVu9eks5Zqi0/955GTlNwPLI9wAACpcbkFBsdONikk8QDkRbgAAFe7GWqHFfuHcU7dWpfcC6yPcAAAqnK+vr5a2bapAm80x7fbaNTUurr4Xu4JVcbYUAKBStAkL0cEbWnu7DVwGGLkBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACW4tVwM3nyZLVv316hoaGqU6eO+vTpo927d19yuQ8//FDNmzdXYGCgrrnmGi1durQSugUAANWBV8PNt99+q6FDh+qHH37Q8uXLlZubq5tuuklZWVklLrN27Vr169dPDz/8sLZu3ao+ffqoT58+2r59eyV2DgAAqiqbMcZ4u4lCJ06cUJ06dfTtt9+qc+fOxdb07dtXWVlZWrJkiWNax44d1aZNG7311luXfI2MjAyFh4crPT1dYWFhHusdAABUHFe+v6vUMTfp6emSpFq1apVYs27dOnXv3t1pWs+ePbVu3bpi63NycpSRkeH0AAAA1lVlwk1BQYFGjBihpKQkXX311SXWpaamKioqymlaVFSUUlNTi62fPHmywsPDHY+YmBiP9g0AAKqWKhNuhg4dqu3bt2vBggUeXe/YsWOVnp7ueBw5csSj6wcAAFWLn7cbkKRhw4ZpyZIlWr16tRo0aFBqbd26dZWWluY0LS0tTXXr1i223m63y263e6xXAABQtXl15MYYo2HDhmnRokX65ptv1Lhx40suk5iYqBUrVjhNW758uRITEyuqTQAAUI14deRm6NChmjdvnhYvXqzQ0FDHcTPh4eEKCgqSJD3wwAOqX7++Jk+eLEl69NFH1aVLF7388svq1auXFixYoE2bNuntt9/22vsAAABVh1dHbt58802lp6frhhtuUHR0tOPx/vvvO2oOHz6slJQUx/NOnTpp3rx5evvtt9W6dWt99NFH+uSTT0o9CBkAAFw+qtR1bioD17kBAKD6qbbXuQEAACgvwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUP1cXOH36tBYtWqTvvvtOhw4dUnZ2tmrXrq2EhAT17NlTnTp1qog+AQAAyqTMIzfHjh3TwIEDFR0drWeffVZnz55VmzZtdOONN6pBgwZauXKlevToofj4eL3//vsV2TMAAECJyjxyk5CQoAEDBmjz5s2Kj48vtubs2bP65JNP9Nprr+nIkSMaPXq0xxoFAAAoC5sxxpSl8JdfftEVV1xR5hW7Wl9ZMjIyFB4ervT0dIWFhXm7HQAAUAaufH+XebeUq0GlKgYbAABgfW6fLfXuu+8qKSlJ9erV06FDhyRJr732mhYvXuyx5gAAAFzlVrh58803NXLkSN166606ffq08vPzJUk1a9bUa6+95sn+AAAAXOJWuJk6dapmzJihp59+Wr6+vo7p7dq107Zt2zzWHAAAgKvcCjcHDhxQQkJCkel2u11ZWVnlbgoAAMBdboWbxo0bKzk5ucj0ZcuWqUWLFuXtCQAAwG0uX6FYkkaOHKmhQ4fq3LlzMsZow4YNmj9/viZPnqx33nnH0z0CAACUm
VvhZuDAgQoKCtJf//pXZWdn695771W9evX0j3/8Q/fcc4+newQAACizMl/EryTZ2dk6c+aM6tSp46meKhQX8QMAoPpx5fvbrZGbiwUHBys4OLi8qwEAAPAIt8JN48aNZbPZSpy/f/9+txsCAAAoD7fCzYgRI5ye5+bmauvWrVq2bJnGjBnjib4AAADc4la4efTRR4udPm3aNG3atKlcDQEAAJSH2/eWKs4tt9yihQsXenKVAAAALvFouPnoo49Uq1YtT64SAADAJW7tlkpISHA6oNgYo9TUVJ04cULTp0/3WHMAAACucivc9OnTx+m5j4+PateurRtuuEHNmzf3RF8AAABuKfdF/KobLuIHAED1UyEX8cvIyChzA4QGAADgLWUONzVr1iz1wn3ShWNvbDab8vPzy90YAACAO8ocblauXFmRfQAAAHhEmcNNly5dKrIPAAAAjyjXjTOzs7N1+PBhnT9/3ml6q1atytUUAACAu9wKNydOnNBDDz2kL774otj5HHMDAAC8xa0rFI8YMUKnT5/W+vXrFRQUpGXLlmnOnDlq2rSpPv30U0/3CAAAUGZujdx88803Wrx4sdq1aycfHx/FxsaqR48eCgsL0+TJk9WrVy9P9wkAAFAmbo3cZGVlqU6dOpKkiIgInThxQpJ0zTXXaMuWLZ7rDgAAwEVuhZtmzZpp9+7dkqTWrVvrn//8p37++We99dZbio6O9miDAAAArnBrt9Sjjz6qlJQUSdL48eN18803a+7cuQoICNDs2bM92R8AAIBLPHJvqezsbP30009q2LChIiMjPdFXheHeUgAAVD+ufH+7tVtqzZo1Ts+Dg4N17bXXVvlgAwAArM+tcNOtWzc1btxYTz31lHbu3OnpngCgTGbPnq2aNWt6u40KtWrVKtlsNp0+fdoj6zt48KBsNpuSk5MrZP3ecMMNN2jEiBHebgNViFvh5tixYxo1apS+/fZbXX311WrTpo1efPFFHT161NP9AbiMPfjgg7LZbLLZbAoICFBcXJwmTpyovLw8b7dWRKNGjfTaa695fL2dOnVSSkqKwsPDPb5uq/j44481adKkMtVWZBCy2Wz65JNPKmTdF3vuuefUqVMnBQcHlxjuC//fXPxYsGBBqeu9/fbb1bBhQwUGBio6Olr9+/fXsWPHnGq+/PJLdezYUaGhoapdu7b++Mc/6uDBg475W7duVUJCgmrUqKHbbrtNv/76q2NeXl6e2rZtqw0bNrj93svKrXATGRmpYcOG6fvvv9e+fft01113ac6cOWrUqJG6devm6R4BXMZuvvlmpaSkaM+ePRo1apQmTJigF1980dttVZqAgADVrVtXNpvN261UWbVq1VJoaKi326g058+f11133aXBgweXWjdr1iylpKQ4Hn369Cm1vmvXrvrggw+0e/duLVy4UPv27dOdd97pmH/gwAH17t1b3bp1U3Jysr788kudPHlSf/jDHxw1AwcOVLdu3bRlyxalp6fr+eefd8x7+eWXlZSUpA4dOrj3xl1hPCAvL8989tlnpk2bNsbHx6fMy3377bfm97//vYmOjjaSzKJFi0qtX7lypZFU5JGSklLm10xPTzeSTHp6epmXAeAdAwYMML1793aa1qNHD9OxY0djjDGzZs0y4eHhZtmyZaZ58+YmJCTE9OzZ0xw7dsxpmRkzZpjmzZsbu91umjVrZqZNm+aYd+DAASPJLFy40Nxwww0mKCjItGrVyqxdu9ZpHR999JGJj483AQEBJjY21rz00kuOeV26dCnyuXTmzBkTGhpqPvzwQ6f1LFq0yAQHB5uMjAzHa8+fP98kJiYau91uWrZsaVatWuWoL/zcO3XqlGPamjVrTJcuXUxQUJCpWbOmuemmm8yvv/5qjDHmiy++MElJSSY8PNzUqlXL9OrVy+zdu7fI+926dWuR9Zel5+Lk5+ebv//976ZJkyYmICDAxMTEmGeffdYx/z//+Y/p2rWrCQwMNLVq1TKDBg0ym
ZmZjvmFP+cXX3zR1K1b19SqVcsMGTLEnD9/3lEzbdo0ExcXZ+x2u6lTp4754x//6LT9H3300UvWDhgwoMjP6cCBA8YYY7Zt22ZuvvlmExISYurUqWPuv/9+c+LECafXGD58uBkzZoyJiIgwUVFRZvz48Y75sbGxTuuNjY0tdlt5UuHvf3HK8p16KYsXLzY2m83xc/jwww+Nn5+fyc/Pd9R8+umnTjVBQUFm165dxhhjpk+fbm699VZjjDH79u0zTZs2LfF3qCxc+f52a+Sm0Pfff68hQ4YoOjpa9957r66++mp9/vnnZV4+KytLrVu31rRp01x63d27dzul0cILCgKwvqCgIKeb9WZnZ+ull17Su+++q9WrV+vw4cMaPXq0Y/7cuXP1zDPP6LnnntOuXbv0/PPPa9y4cZozZ47Tep9++mmNHj1aycnJuuqqq9SvXz/H7q/Nmzfr7rvv1j333KNt27ZpwoQJGjdunOPSFx9//LEaNGigiRMnOj6XQkJCdM8992jWrFlOrzNr1izdeeedTiMNY8aM0ahRo7R161YlJibqtttu0y+//FLs+09OTtaNN96o+Ph4rVu3TmvWrNFtt93muKdfVlaWRo4cqU2bNmnFihXy8fHRHXfcoYKCgktuW1d6vtjYsWM1ZcoUjRs3Tjt37tS8efMUFRXl6Kdnz56KiIjQxo0b9eGHH+rrr7/WsGHDnNaxcuVK7du3TytXrtScOXM0e/Zsx/bdtGmT/vKXv2jixInavXu3li1bps6dOxfbS2m1//jHP5SYmKhBgwY5fk4xMTE6ffq0unXrpoSEBG3atEnLli1TWlqa7r77bqd1z5kzRyEhIVq/fr1eeOEFTZw4UcuXL5ckbdy40bGtUlJSHM+L07JlS9WoUaPExy233FLisq4YOnSoIiMj1aFDB82cOVPGhZOjf/31V82dO1edOnWSv7+/JKlt27by8fHRrFmzlJ+fr/T0dL377rvq3r27o6Z169Zavny58vLytGLFCseNtB955BG98MILlTfC5k56evLJJ02jRo1MQECA6dWrl5k3b57JyspyZ1UOcmHk5uK/YC7l3LlzJj093fE4cuQIIzdANXHxyE1BQYFZvny5sdvtZvTo0caYC3+5SnIamZg2bZqJiopyPG/SpImZN2+e03onTZpkEhMTjTH/G8l45513HPN37NhhJDn+Ar333ntNjx49nNYxZswYEx8f73geGxtrXn31Vaea9evXG19fX8dIUlpamvHz83OMzBS+9pQpUxzL5ObmmgYNGpi///3vxpiin3v9+vUzSUlJZdh6F5w4ccJIMtu2bXN6zeJGbsrS829lZGQYu91uZsyYUez8t99+20RERJgzZ844pn3++efGx8fHpKamGmMu/JxjY2NNXl6eo+auu+4yffv2NcYYs3DhQhMWFlbiX/0Xj9y4Ulto0qRJ5qabbnKaVvhdsXv3bsdy119/vVNN+/btzRNPPOF4XpbvMWOMOXjwoNmzZ0+Jj6NHj15yHcaUPnIzceJEs2bNGrNlyxYzZcoUY7fbzT/+8Y9LrvPxxx83wcHBRpLp2LGjOXnypNP8VatWmTp16hhfX18jySQmJjp9J2/fvt107tzZNGzY0PTr18+kp6ebf//736Z3797m6NGj5qabbjJNmjQxTz/9dJne48UqfORm9erVGjNmjH7++WctWbJE/fr1U3BwcHlzVpm1adNG0dHR6tGjh77//vtSaydPnqzw8HDHIyYmppK6BOAJS5YsUY0aNRQYGKhbbrlFffv21YQJExzzg4OD1aRJE8fz6OhoHT9+XNKFUYN9+/bp4YcfdvrL+Nlnn9W+ffucXqfwL8zCdUhyrGfXrl1KSkpyqk9KStKePXscIybF6dChg1q2bOkYJXrvvfcUGxtbZNQhMTHR8W8/Pz+1a9dOu3btKnadhSM3JdmzZ4/69eunK6+8UmFhYWrUqJEk6fDhwyUu407PhXbt2qWcnJwSe9q1a5dat
26tkJAQx7SkpCQVFBQ4rnQvXRjN8PX1dTy/+OfYo0cPxcbG6sorr1T//v01d+5cZWdnF/t6rtQW+vHHH7Vy5Uqn35HmzZtLktPvycW/I7/t0RWxsbGKi4sr8VG/fn2X1/lb48aNU1JSkhISEvTEE0/o8ccfL9OxamPGjNHWrVv11VdfydfXVw888IBjxCc1NVWDBg3SgAEDtHHjRn377bcKCAjQnXfe6ahp2bKlvv32Wx06dEjz5s1Tbm6uxo8frzfeeEPDhw9Xp06d9OOPP+rjjz/WZ599Vu73WRK3wk3h7qjIyEjNnz9fWVlZnu6rWNHR0Xrrrbe0cOFCLVy4UDExMbrhhhtKvZ/V2LFjlZ6e7ngcOXKkUnoF4Bldu3ZVcnKy9uzZo7Nnzzp2DRQqHA4vZLPZHB+0Z86ckSTNmDFDycnJjsf27dv1ww8/OC138XoKD94ty66cSxk4cKBj98qsWbP00EMPlevg4KCgoFLnF56hMmPGDK1fv17r16+XJKddeZ7s+VL9lFVxP8fC7R8aGqotW7Zo/vz5io6O1jPPPKPWrVsXe/q6K7WFzpw5o9tuu83pd6Twd+7iUFdaj66orN1SF7vuuut09OhR5eTklFoXGRmpq666Sj169NCCBQu0dOlSx/+VadOmKTw8XC+88IISEhLUuXNnvffee1qxYoXj9+y3Ro4cqREjRqhBgwZatWqV7rrrLoWEhKhXr15atWqVp9+mg1u3X7jYn//8Z1133XW68sorPdFPqZo1a6ZmzZo5nnfq1En79u3Tq6++qnfffbfYZex2u+x2e4X3BqBihISEKC4uzq1lo6KiVK9ePe3fv1/33Xef2z20aNGiyCjx999/r6uuusox2hAQEFDsKM7999+vxx9/XK+//rp27typAQMGFKn54YcfHF+ieXl52rx5c5FjUgq1atVKK1as0N/+9rci83755Rft3r1bM2bM0O9+9ztJRS+6WhZl6blQ06ZNFRQUpBUrVmjgwIFF5rdo0UKzZ89WVlaWI5R+//338vHxcfo8vxQ/Pz91795d3bt31/jx41WzZk198803TmfqlKW2uJ/Ttddeq4ULF6pRo0by83P/a9Hf37/UkbxCS5cuVW5ubonzPRUYL5acnKyIiAiXvg8Lg1thIMrOzpaPj/OYSOHvf3Ehb8WKFdq1a5fjGK78/HzH+y7t/XtCucONKf/dG8qlQ4cObv3nBXB5+Nvf/qa//OUvCg8P180336ycnBxt2rRJp06d0siRI8u0jlGjRql9+/aaNGmS+vbtq3Xr1umNN97Q9OnTHTWNGjXS6tWrdc8998hutzuu2B4REaE//OEPGjNmjG666SY1aNCgyPqnTZumpk2bqkWLFnr11Vd16tQp/d///V+xvYwdO1bXXHONhgwZokceeUQBAQFauXKl7rrrLtWqVUtXXHGF3n77bUVHR+vw4cN68sknXd5mZem5UGBgoGO3R0BAgJKSknTixAnt2LFDDz/8sO677z6NHz9eAwYM0IQJE3TixAkNHz5c/fv3dxx0fClLlizR/v371blzZ0VERGjp0qUqKCgoNhxdqrZRo0Zav369Dh48qBo1aqhWrVoaOnSoZsyYoX79+unxxx9XrVq1tHfvXi1YsEDvvPOO0+6y0jRq1EgrVqxQUlKS7Ha7IiIiiq2LjY0t0/pKcvjwYf366686fPiw8vPzHRdkjIuLU40aNfTZZ58pLS1NHTt2VGBgoJYvX67nn3/e6UD7DRs26IEHHtCKFStUv359rV+/Xhs3btT111+viIgI7du3T+PGjVOTJk0cu0179eqlV199VRMnTlS/fv2UmZmpp556SrGxsUpISHDq8dy5cxo2bJjmz5/vCERJSUmaNm2ahg4dqoULF+qVV14p13YolctH9PxGjRo1zL59+8q7GrdPW+vevbu54447ylzPqeBA9VHcqeAXK+6AykWLFpnffrTNnTvXtGnTxgQEBJiIiAjTuXNn8/HHHxtji
h5ga4wxp06dMpLMypUrHdMKTwX39/c3DRs2NC+++KLTa6xbt860atXK2O32Iq+/YsUKI8l88MEHTtMLX3vevHmmQ4cOJiAgwMTHx5tvvvnGUVPciRSrVq0ynTp1Mna73dSsWdP07NnTMX/58uWmRYsWxm63m1atWplVq1Y5fb5e6oDiS/VcnPz8fPPss8+a2NhYx/Z5/vnnHfPLeir4xR599FHTpUsXY4wx3333nenSpYuJiIhwnKr//vvvO2ovPkj4UrW7d+82HTt2NEFBQU6ngv/3v/81d9xxh6lZs6YJCgoyzZs3NyNGjDAFBQVFXqNQ7969zYABAxzPP/30UxMXF2f8/Pwq9FTw4k5pv/j39YsvvjBt2rQxNWrUMCEhIaZ169bmrbfecjqFu/DnXvj+C39GtWrVMna73TRq1Mg88sgjRQ5unj9/vklISDAhISGmdu3a5vbbb3cceH+xJ5980owaNcpp2p49e0z79u1NWFiYGTx4sFM/ZeHK93e5b5y5Zs0atW/f3q1dP2fOnNHevXslSQkJCXrllVfUtWtX1apVSw0bNtTYsWP1888/69///rck6bXXXlPjxo3VsmVLnTt3Tu+8846mTp2qr776qtQD7C7GjTMBVLZ3331Xjz32mI4dO6aAgADH9IMHD6px48baunWr2rRp470Gi1FSz4C3uPL9Xa7dUsePH5cxRhs2bFCzZs1cvt7Mpk2b1LVrV8fzwiHiAQMGaPbs2UpJSXE6wv/8+fMaNWqUfv75ZwUHB6tVq1b6+uuvndYBAFVFdna2UlJSNGXKFP35z3+uFiGhOvYM/JZbIzeZmZkaMmSIFixY4Dh4ytfXV3379nUcTV1VMXIDoLJMmDBBzz33nDp37qzFixerRo0aTvOr4sjNpXoGvMWV72+3wk3fvn21detWTZ061XGg0bp16/Too4+qTZs2l7w5lzcRbgAAqH4qPNyEhIToyy+/1PXXX+80/bvvvtPNN99cade9cQfhBgCA6seV72+3LuJ3xRVXFLvrKTw8vMRT3wAAACqDW+Hmr3/9q0aOHKnU1FTHtNTUVI0ZM0bjxo3zWHMAAACucmu3VEJCgvbu3aucnBw1bNhQ0oWLCtntdjVt2tSptrRbI3gDu6UAAKh+KvxU8D59+rizGAAAQIUr90X8qhtGbgAAqH4q5IDiyywDAQCAaqrM4aZly5ZasGCBzp8/X2rdnj17NHjwYE2ZMqXczQEAALiqzMfcTJ06VU888YSGDBmiHj16qF27dqpXr54CAwN16tQp7dy5U2vWrNGOHTs0bNgwDR48uCL7BgAAKJbLx9ysWbNG77//vr777jsdOnRIZ8+eVWRkpBISEtSzZ0/dd999VfpaNxxzAwBA9VPhVyiuzgg3AABUPxV+hWIAAICqyuXr3Jw8eVIzZ87UunXrHFcorlu3rhITE/XQQw+pdu3aHm8SAACgrFwaudm4caOuuuoqvf766woPD1fnzp3VuXNnhYeHa+rUqWrevLk2bdpUUb0CAABckkvH3HTs2FGtW7fWW2+9JZvN5jTPGKNHHnlE//nPf7Ru3TqPN+opHHMDAED1U2G3X/jxxx81e/bsIsFGkmw2mx577DElJCS41i0AAIAHubRbqm7dutqwYUOJ8zds2KCoqKhyNwUAAOAul0ZuRo8erT/96U/avHmzbrzxRkeQSUtL04oVKzRjxgy99NJLFdIoAABAWbgUboYOHarIyEi9+uqrmj59uvLz8yVJvr6+atu2rWbPnq277767QhoFAAAoC7cv4pebm6uTJ09KkiIjI+Xv7+/RxipK4QFJx44dK/aAJF9fXwUGBjqeZ2VllbguHx8fBQUFuVWbnZ1d4s1IbTabgoOD3ao9e/asCgoKSuwjJCTErdpz5845wmx5a4ODgx3HbeXk5CgvL88jtUFBQfLxubCn9fz588rNzfVIbWBgoHx9fV2uzc3NLfVebHa7XX5+fi7X5uXlKScnp8TagIAAx/9HV
2rz8/N17ty5Emv9/f0VEBDgcm1BQYHOnj3rkVo/Pz/Z7XZJF05iyM7O9kitK//v+YwovpbPCD4jKvozwqUTgsxlJj093Ugq8XHrrbc61QcHB5dY26VLF6fayMjIEmvbtWvnVBsbG1tibXx8vFNtfHx8ibWxsbFOte3atSuxNjIy0qm2S5cuJdYGBwc71d56662lbreL3XnnnaXWnjlzxlE7YMCAUmuPHz/uqB0yZEiptQcOHHDUjh49utTa7du3O2rHjx9fau2GDRsctS+88EKptStXrnTUvvHGG6XWLlmyxFE7a9asUms/+OADR+0HH3xQau2sWbMctUuWLCm19o033nDUrly5stTaF154wVG7YcOGUmvHjx/vqN2+fXuptaNHj3bUHjhwoNTaIUOGOGqPHz9eau2AAQMctWfOnCm19s4773T6HS6tls+ICw8+I/734DPiwqOiPyMKv7/T09PNpXj0CsX79u1Tt27dPLlKAAAAl3j03lI//vijrr322lKHG72N3VIMObtay5DzBeyWcr2Wzwj3avmMuIDPCOfaCrtx5uuvv17q/J9//lkvvfRStQg3XMQPAIDqo8Iu4jdixAhFR0c70tZvlZYqAQAAKoNL4SY2NlZ///vfSzzdOzk5WW3btvVIYwAAAO5w6YDitm3bavPmzSXOt9lsJe73BQAAqAwujdxMnDix1IPy4uPjdeDAgXI3BQAA4C6Xwk18fHyp8/39/RUbG1uuhgAAAMrDo9e5AQAA8DaXRm4KJSQkOK4rcDGbzabAwEDFxcXpwQcfVNeuXcvdIAAAgCvcGrm5+eabtX//foWEhKhr167q2rWratSooX379ql9+/ZKSUlR9+7dtXjxYk/3CwAAUCq3Rm5OnjypUaNGady4cU7Tn332WR06dEhfffWVxo8fr0mTJql3794eaRQAAKAs3Lr9Qnh4uDZv3qy4uDin6Xv37lXbtm2Vnp6un376Se3bt1dmZqbHmvUErlAMAED148r3t1u7pQIDA7V27doi09euXeu450pBQYHT/VcAAAAqg1u7pYYPH65HHnlEmzdvVvv27SVJGzdu1DvvvKOnnnpKkvTll1+qTZs2HmsUAACgLNy+K/jcuXP1xhtvaPfu3ZKkZs2aafjw4br33nslXbibbOHZU1UJu6UAAKh+Kuyu4FZAuAEAoPqpsLuC/9bmzZu1a9cuSVLLli2VkJBQntUBAACUm1vh5vjx47rnnnu0atUq1axZU5J0+vRpde3aVQsWLFDt2rU92SNw2Vj503Et35WmYH9f3XtdQ11Zu4a3WwKAasets6WGDx+uzMxM7dixQ7/++qt+/fVXbd++XRkZGfrLX/7i6R6By8J7PxzSQ7M36v2NRzRr7UH1en2NdqdWrUspAEB14Fa4WbZsmaZPn64WLVo4psXHx2vatGn64osvPNYccLkwxujvy36SJOUXGOUXGJ3PK9Cbq/Z6uTMAqH7cCjcFBQXy9/cvMt3f318FBQXlbgq43BQY6UxOntO0fGP0a9Z5L3UEANWXW+GmW7duevTRR3Xs2DHHtJ9//lmPPfaYbrzxRo81B1wufH1satswQr4+zjekTWwS6aWOAKD6civcvPHGG8rIyFCjRo3UpEkTNWnSRI0bN1ZGRoamTp3q6R6By8LUexN0VdT/DiC+u10DDfpdYy92BADVk9vXuTHG6Ouvv9ZPP104TqBFixbq3r27R5urCFznBlWZMUbHM3MU6Oer8OCiu34B4HLFRfxKQbgBAKD6qZCL+L3++utlboDTwQEAgLeUeeSmceOy7fu32Wzav39/uZqqSIzcAABQ/VTIyM2BAweKnb5mzRq1a9euyt0gEwAAXJ7cOlvqYrfeeqvTKeEAAADeVO5wc5kdjwwAAKq4cocbAACAqqTc4eaf//ynoqKiPNELAABAuZX5gOKS3HvvvZ7oAwAAwCPYLQUAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFc
AMAACzFq+Fm9erVuu2221SvXj3ZbDZ98sknl1xm1apVuvbaa2W32xUXF6fZs2dXeJ8AAKD68Gq4ycrKUuvWrTVt2rQy1R84cEC9evVS165dlZycrBEjRmjgwIH68ssvK7hTAKgYa/ee1D++3qN31x3UmZw8b7cDWILNVJE7X9psNi1atEh9+vQpseaJJ57Q559/ru3btzum3XPPPTp9+rSWLVtW7DI5OTnKyclxPM/IyFBMTIzS09MVFhbmsf4BwFXvfLdfz36+S74+NhUYo8aRIfpkaJLCAv293RpQ5WRkZCg8PLxM39/V6pibdevWqXv37k7TevbsqXXr1pW4zOTJkxUeHu54xMTEVHSbAHBJmedyNXnpT5Kk/AIjY6SDJ7P077UHvdsYYAHVKtykpqYWuUlnVFSUMjIydPbs2WKXGTt2rNLT0x2PI0eOVEarAFCqX86cV/5vBs59bDalZeSUsASAsir3jTOrOrvdLrvd7u02AMBJdM1AhQf5K/Ncrgr+f8bJKzC6pn64dxsDLKBajdzUrVtXaWlpTtPS0tIUFhamoKAgL3UFAK6z+/nq7f5tFRzwv78x727XQHe2beDFrgBrqFYjN4mJiVq6dKnTtOXLlysxMdFLHQGA+6678gqtHdtNe9LOqFZIgBpHhni7JcASvDpyc+bMGSUnJys5OVnShVO9k5OTdfjwYUkXjpd54IEHHPWPPPKI9u/fr8cff1w//fSTpk+frg8++ECPPfaYN9oHgHILC/RX29gIgg3gQV4NN5s2bVJCQoISEhIkSSNHjlRCQoKeeeYZSVJKSooj6EhS48aN9fnnn2v58uVq3bq1Xn75Zb3zzjvq2bOnV/oHAABVT5W5zk1lceU8eQAAUDVY9jo3AAAAl0K4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAlkK4AQAAluLn7QYA4GInz+Top5RMRYT4Kz46TDabzdstAahmCDcAqozV/z2hP7+7WWdz8yVJt7eO1qt9E+TrQ8ABUHbslgJQJZzLzdeQuVt07v8HG0n69McUvb/xiBe7AlAdMXIDoEo4dvqszuTkOU3z87FpV0qGlzoCUF0xcgOgSqgdai+y+6nAGNWrGeSljgBUV4QbAFVCaKC/JtzeUpJUmHGa1Q3VgE6xXuwKQHXEbikAVUb/jrGKjw7TlkOnFBESoN+3ilagv6+32wJQzRBuUGXNWL1Pb3yzV5L0aPemuuWaaH2967i2HDqlX87kqGGtYI3p2UzhwQFurb+gwOirnak6euqsmtcN0/VNIz3ZPtzUNjZCbWMjvN0GgGrMZowx3m6iMmVkZCg8PFzp6ekKCwvzdjsowfD5m/XZj6lO03x9pPwC57ogf199/0Q31arhWsApKDD683ubtXxnmnxsUoGRHunSRE/e0ry8rQMAKoAr398cc4Mq6bfBRioabCTpbG6+Rn+U7PL6v9qZpuU70yRdCDaS9Na3+7QnLdPldQEAqhbCDaq9lNPnXF7m6KlsFXdduKOnz3qgIwCANxFuUO01j3Z992KL6DDHiE0hH5sUV7uGh7oCAHgL4QZV0tO3Fj32pWZQ0ePf64UH6u9/bOXy+pPiIjX4hiaO5z426fk7rlFMrWCX1wUAqFo4oBhV1u6UdD358Tb52Gx6/o6rFRtZQ1sOn9LhX7OUcvqcGl1RQ73bRMvHx/2MvictUz+fPqsmtWsQbACgCnPl+5twAwAAqjzOlgIAAJctwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALAUwg0AALCUotezh+X9fPqstv+crlohAWrbMEI+xd1BEgCAaopwc5n5ckeqhs3botz8Cxem7hEfpTfvu
1Z+vgziAQCsgW+0y0j2+Tw9umCrI9hI0tc707Rg4xEvdgUAgGcRbi4jx06f07ncAqdpvr427UnL9FJHAAB4HuHmMhIVZpffb46vKSgwanhFiJc6AgDA8wg3l5HQQH89f8c1sl2UbxIaRui+6xp6rykAADyMA4ovM3e3j9HV9cO19cgpXRESoBtbRMmfg4kBABZCuLkMxdcLU3y9MG+3AQBAheBPdgAAYCmEGwAAYCmEGwAAYCmEGwAAYCmEGwAAYCmEGwAAYCmEGwAAYCmEGwAAYClVItxMmzZNjRo1UmBgoK677jpt2LChxNrZs2fLZrM5PQIDAyuxWwAAUJV5Pdy8//77GjlypMaPH68tW7aodevW6tmzp44fP17iMmFhYUpJSXE8Dh06VIkdAwCAqszr4eaVV17RoEGD9NBDDyk+Pl5vvfWWgoODNXPmzBKXsdlsqlu3ruMRFRVVYm1OTo4yMjKcHgAAwLq8Gm7Onz+vzZs3q3v37o5pPj4+6t69u9atW1ficmfOnFFsbKxiYmLUu3dv7dixo8TayZMnKzw83PGIiYnx6HsAAABVi1fDzcmTJ5Wfn19k5CUqKkqpqanFLtOsWTPNnDlTixcv1nvvvaeCggJ16tRJR48eLbZ+7NixSk9PdzyOHDni8fcBAACqjmp3V/DExEQlJiY6nnfq1EktWrTQP//5T02aNKlIvd1ul91ur8wWAQCAF3l15CYyMlK+vr5KS0tzmp6Wlqa6deuWaR3+/v5KSEjQ3r17K6JFAABQzXg13AQEBKht27ZasWKFY1pBQYFWrFjhNDpTmvz8fG3btk3R0dEV1SYAAKhGvL5bauTIkRowYIDatWunDh066LXXXlNWVpYeeughSdIDDzyg+vXra/LkyZKkiRMnqmPHjoqLi9Pp06f14osv6tChQxo4cKA33wYAAKgivB5u+vbtqxMnTuiZZ55Ramqq2rRpo2XLljkOMj58+LB8fP43wHTq1CkNGjRIqampioiIUNu2bbV27VrFx8d76y0AAIAqxGaMMd5uojJlZGQoPDxc6enpCgsL83Y7AACgDFz5/vb6RfwAAAA8iXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXDjQdnn85SXV+DWsufzCmSM8Ugfxhidd7MPAACqOz9vN2AFe9My9ce31ir9bJ4kKanJFXr34Q7y8bl0djz8S7aGzN2s7ccyFGL31bhe8bqnQ0O3e1m4+agmfLZDmefy1CI6VNPva6vGkSFurw8AgOqGkRsP+MOb/ws2kvT9vl/0xMfbLrlcXn6BBsxcr12pmZKkrJx8PfnxNq3Zc9KtPjYc+FWjP/xRmecu9PLf1DPq/6/1jOIAAC4rjNyUU3r2eWWcyysyffV/T1xy2SOnzurAL9lFpt//r/WSJLufj7Y+1V3Bwf5l6mXl7uPy9bEpr+DC7q18Y3T01Fkd/CVLV0WFlmkdAABUd4zclFNQQPH50O536U0bHOBb6vycvAK1m7KizL0E+/uqoJjjdoL8S38dAACshHBTTgF+PmoXG1Fk+mPdm11y2aiwQPVJqC9bKTXZ5/PL3Mtd7WIUGugvX58La7RJuuXqumoQEVTmdQAAUN2xW8oDPvhzR414/0et3XdSgf6+eqx7U91xbf0yLfvSna10VVQNbTp4St/8dLxcfdQND9Rnw67XtJV7deJMjq5tWFN/7tJENltp8QkAAGuxGU+df1xNZGRkKDw8XOnp6QoLC/N2O07in1lWZKSmbphdPzzV3UsdAQBQNbjy/c1uqSrkx3E3Kjzwf4Np9cIDCTYAALiI3VJViL+/v36c0NPbbQAAUK0xcgMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACzls
rtxpjFG0oVbpwMAgOqh8Hu78Hu8NJdduMnMzJQkxcTEeLkTAADgqszMTIWHh5daYzNliUAWUlBQoGPHjik0NFQ2m83b7VhWRkaGYmJidOTIEYWFhXm7ncsO29+72P7exfb3rora/sYYZWZmql69evLxKf2omstu5MbHx0cNGjTwdhuXjbCwMD5cvIjt711sf+9i+3tXRWz/S43YFOKAYgAAYCmEGwAAYCmEG1QIu92u8ePHy263e7uVyxLb37vY/t7F9veuqrD9L7sDigEAgLUxcgMAACyFcAMAACyFcAMAACyFcAMAACyFcAO3TZs2TY0aNVJgYKCuu+46bdiwocTaGTNm6He/+50iIiIUERGh7t27l1qPS3Nl+19swYIFstls6tOnT8U2aHGubv/Tp09r6NChio6Olt1u11VXXaWlS5dWUrfW4+r2f+2119SsWTMFBQUpJiZGjz32mM6dO1dJ3VrH6tWrddttt6levXqy2Wz65JNPLrnMqlWrdO2118putysuLk6zZ8+u8D5lADcsWLDABAQEmJkzZ5odO3aYQYMGmZo1a5q0tLRi6++9914zbdo0s3XrVrNr1y7z4IMPmvDwcHP06NFK7twaXN3+hQ4cOGDq169vfve735nevXtXTrMW5Or2z8nJMe3atTO33nqrWbNmjTlw4IBZtWqVSU5OruTOrcHV7T937lxjt9vN3LlzzYEDB8yXX35poqOjzWOPPVbJnVd/S5cuNU8//bT5+OOPjSSzaNGiUuv3799vgoODzciRI83OnTvN1KlTja+vr1m2bFmF9km4gVs6dOhghg4d6nien59v6tWrZyZPnlym5fPy8kxoaKiZM2dORbVoae5s/7y8PNOpUyfzzjvvmAEDBhBuysHV7f/mm2+aK6+80pw/f76yWrQ0V7f/0KFDTbdu3ZymjRw50iQlJVVon1ZXlnDz+OOPm5YtWzpN69u3r+nZs2cFdmYMu6XgsvPnz2vz5s3q3r27Y5qPj4+6d++udevWlWkd2dnZys3NVa1atSqqTctyd/tPnDhRderU0cMPP1wZbVqWO9v/008/VWJiooYOHaqoqChdffXVev7555Wfn19ZbVuGO9u/U6dO2rx5s2PX1f79+7V06VLdeuutldLz5WzdunVOPytJ6tmzZ5m/K9x12d04E+V38uRJ5efnKyoqyml6VFSUfvrppzKt44knnlC9evWK/NLj0tzZ/mvWrNG//vUvJScnV0KH1ubO9t+/f7+++eYb3XfffVq6dKn27t2rIUOGKDc3V+PHj6+Mti3Dne1/77336uTJk7r++utljFFeXp4eeeQRPfXUU5XR8mUtNTW12J9VRkaGzp49q6CgoAp5XUZuUOmmTJmiBQsWaNGiRQoMDPR2O5aXmZmp/v37a8aMGYqMjPR2O5elgoIC1alTR2+//bbatm2rvn376umnn9Zbb73l7dYuC6tWrdLzzz+v6dOna8uWLfr444/1+eefa9KkSd5uDRWEkRu4LDIyUr6+vkpLS3OanpaWprp165a67EsvvaQpU6bo66+/VqtWrSqyTctydfvv27dPBw8e1G233eaYVlBQIEny8/PT7t271aRJk4pt2kLc+f2Pjo6Wv7+/fH19HdNatGih1NRUnT9/XgEBARXas5W4s/3HjRun/v37a+DAgZKka665RllZWfrTn/6kp59+Wj4+/J1fUerWrVvszyosLKzCRm0kRm7ghoCAALVt21YrVqxwTCsoKNCKFSuUmJhY4nIvvPCCJk2apGXLlqldu3aV0aolubr9mzdvrm3btik5OdnxuP3229W1a1clJycrJiamMtuv9tz5/U9KStLevXsdoVKS/vvf/yo6Oppg4yJ3tn92dnaRAFMYNA23V6xQiYmJTj8rSVq+fHmp3xUeUaGHK8OyFixYYOx2u5k9e7bZuXOn+dOf/mRq1qxpUlNTjTHG9O/f3zz55JOO+ilTppiAgADz0UcfmZSUFMcjMzPTW2+hW
nN1+/8WZ0uVj6vb//DhwyY0NNQMGzbM7N692yxZssTUqVPHPPvss956C9Waq9t//PjxJjQ01MyfP9/s37/ffPXVV6ZJkybm7rvv9tZbqLYyMzPN1q1bzdatW40k88orr5itW7eaQ4cOGWOMefLJJ03//v0d9YWngo8ZM8bs2rXLTJs2jVPBUbVNnTrVNGzY0AQEBJgOHTqYH374wTGvS5cuZsCAAY7nsbGxRlKRx/jx4yu/cYtwZfv/FuGm/Fzd/mvXrjXXXXedsdvt5sorrzTPPfecycvLq+SurcOV7Z+bm2smTJhgmjRpYgIDA01MTIwZMmSIOXXqVOU3Xs2tXLmy2M/ywu09YMAA06VLlyLLtGnTxgQEBJgrr7zSzJo1q8L7tBnDmBwAALAOjrkBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBAACWQrgBYDk9e/aUr6+vNm7cWGTegw8+KJvNJpvNpoCAAMXFxWnixInKy8vzQqcAKgLhBoClHD58WGvXrtWwYcM0c+bMYmtuvvlmpaSkaM+ePRo1apQmTJigF198sZI7BVBRCDcAqqQbbrhBw4cP14gRIxQREaGoqCjNmDFDWVlZeuihhxQaGqq4uDh98cUXTsvNmjVLv//97zV48GDNnz9fZ8+eLbJuu92uunXrKjY2VoMHD1b37t316aefVtZbA1DBCDcAqqw5c+YoMjJSGzZs0PDhwzV48GDddddd6tSpk7Zs2aKbbrpJ/fv3V3Z2tiTJGKNZs2bp/vvvV/PmzRUXF6ePPvrokq8TFBSk8+fPV/TbAVBJCDcAqqzWrVvrr3/9q5o2baqxY8cqMDBQkZGRGjRokJo2bapnnnlGv/zyi/7zn/9Ikr7++mtlZ2erZ8+ekqT7779f//rXv0pcvzFGX3/9tb788kt169atUt4TgIpHuAFQZbVq1crxb19fX11xxRW65pprHNOioqIkScePH5ckzZw5U3379pWfn58kqV+/fvr++++1b98+p/UuWbJENWrUUGBgoG655Rb17dtXEyZMqOB3A6CyEG4AVFn+/v5Oz202m9M0m80mSSooKNCvv/6qRYsWafr06fLz85Ofn5/q16+vvLy8IgcWd+3aVcnJydqzZ4/Onj2rOXPmKCQkpOLfEIBK4eftBgDAE+bOnasGDRrok08+cZr+1Vdf6eWXX9bEiRPl6+srSQoJCVFcXJwXugRQGQg3ACzhX//6l+68805dffXVTtNjYmI0duxYLVu2TL169fJSdwAqE7ulAFR7+/bt048//qg//vGPReaFh4frxhtvLPXAYgDWYjPGGG83AQAA4CmM3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEsh3AAAAEv5f8ppjOq1poPJAAAAAElFTkSuQmCC", + "image/png": "", "text/plain": [ "
" ] @@ -2923,74 +1385,43 @@ } ], "source": [ - "consistent_ratio = target_maps.below_corrected_p.mean()\n", + "active_ratio = activity_map.below_corrected_p.mean()\n", "\n", "plt.scatter(\n", - " data=target_maps,\n", + " data=activity_map,\n", " x=\"mean_average_precision\",\n", " y=\"-log10(p-value)\",\n", " c=\"below_corrected_p\",\n", " cmap=\"tab10\",\n", " s=10,\n", ")\n", + "plt.title(\"Phenotypic activity assesement\")\n", "plt.xlabel(\"mAP\")\n", "plt.ylabel(\"-log10(p-value)\")\n", "plt.axhline(-np.log10(0.05), color=\"black\", linestyle=\"--\")\n", "plt.text(\n", - " 0.5,\n", + " 0.65,\n", " 1.5,\n", - " f\"Phenotypically consistent = {100*consistent_ratio:.2f}%\",\n", + " f\"Phenotypically active = {100 * active_ratio:.2f}%\",\n", " va=\"center\",\n", " ha=\"left\",\n", ")\n", - "\n", "plt.show()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can list compounds that are phenotypically active and consistent.\n", - "\n", - "Note that in multi-label scenario, when each compound can have multiple targets, the same compound can have \"consistent\" response in respect to one target, but not another." 
- ] - }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 26, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Phenotypically consistent targets: DRD2, EGFR, HTR3A, PSMB1\n", - "Phenotypically consistent compounds: BRD-A69636825-003-04-7, BRD-K50691590-001-02-2, BRD-K60230970-001-10-0, BRD-K70330367-003-07-9, BRD-K70358946-001-15-7, BRD-K70401845-003-09-6, BRD-K70914287-300-02-8\n" - ] - } - ], + "outputs": [], "source": [ - "consistent_targets = target_maps.query(\"below_corrected_p\")[\"Metadata_target\"]\n", - "consistent_compounds = df_consistent[\n", - " df_consistent[\"Metadata_target\"].apply(\n", - " lambda x: any(t in x for t in consistent_targets)\n", - " )\n", - "][\"Metadata_broad_sample\"]\n", - "\n", - "print(f\"Phenotypically consistent targets: {consistent_targets.str.cat(sep=', ')}\")\n", - "print(f\"Phenotypically consistent compounds: {consistent_compounds.str.cat(sep=', ')}\")" + "activity_map.to_csv(\"data/activity_map.csv\", index=False)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "map_benchmark", + "display_name": "copairs", "language": "python", "name": "python3" }, @@ -3004,7 +1435,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.9.19" } }, "nbformat": 4, diff --git a/examples/phenotypic_consistency.ipynb b/examples/phenotypic_consistency.ipynb new file mode 100644 index 0000000..4c8dc94 --- /dev/null +++ b/examples/phenotypic_consistency.ipynb @@ -0,0 +1,1251 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# mAP for phenotypic consistency assessment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from copairs import map" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This example demonstrates how to use `copairs` to assess phenotypic consistency of perturbations that target the same gene against other perturbations.\n", + "\n", + "Phenotypic consistency is assessed by calculating mean average precision (mAP) for the retrieval of phenotypically active samples that share expected biological similarity (such as chemical mechanisms of action and gene-gene relationships) against other phenotypically active samples that are not biologically similar to the query sample.\n", + "\n", + "It aims to answer the question: “How distinctive is this group of perturbations from other phenotypically active samples that are not biologically similar to the query sample?”\n", + "\n", + "The resulting mAP score for a group of perturbations reflects the average extent to which members of this group are more similar to each other compared to other groups (see Figure 1F).\n", + "\n", + "Citation:\n", + "> Kalinin, A. A. et al. A versatile information retrieval framework for evaluating profile strength and similarity. bioRxiv, 2024-04, (2024)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data\n", + "\n", + "Assessing phenotypic consistency relies on data and results from the [Phenotypic activity](./phenotypic_activity.ipynb) example, so run that one first if you haven't." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/2016_04_01_a549_48hr_batch1_plateSQ00014812.csv\")\n", + "activity_map = pd.read_csv(\n", + " \"data/activity_map.csv\"\n", + ") # load mAP scores for phenotypic activity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Assessing phenotypic consistency of compounds grouped by targets\n", + "\n", + "First, we are going to filter out compounds that were not phenotypically active using mAP p-values from the previous section.\n", + "\n", + "Next, we will aggregate each compound’s replicate profiles into a \"consensus\" profile by taking the median of each feature to reduce profile noise and improve computational efficiency." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_plate_map_nameMetadata_broad_sampleMetadata_mg_per_mlMetadata_mmoles_per_literMetadata_solventMetadata_pert_idMetadata_pert_mfc_idMetadata_pert_wellMetadata_pert_id_vendorMetadata_cell_id...Nuclei_Texture_InverseDifferenceMoment_AGP_5_0Nuclei_Texture_InverseDifferenceMoment_DNA_20_0Nuclei_Texture_InverseDifferenceMoment_ER_5_0Nuclei_Texture_InverseDifferenceMoment_Mito_10_0Nuclei_Texture_InverseDifferenceMoment_Mito_5_0Nuclei_Texture_SumAverage_RNA_5_0Nuclei_Texture_SumEntropy_DNA_10_0Nuclei_Texture_SumEntropy_DNA_20_0Nuclei_Texture_SumEntropy_DNA_5_0Nuclei_Texture_Variance_RNA_10_0
6C-7161-01-LM6-022BRD-K74363950-004-01-05.65560010.000000DMSOBRD-K74363950BRD-K74363950-004-01-0A07NaNA549...-0.51038-0.764021.616400-0.49600-0.4813602.4211001.107901.138201.143200.329230
7C-7161-01-LM6-022BRD-K74363950-004-01-01.8852003.333300DMSOBRD-K74363950BRD-K74363950-004-01-0A08NaNA549...-0.23602-0.411290.3049600.478840.005852-0.7103300.41986-0.238880.54949-0.092826
8C-7161-01-LM6-022BRD-K74363950-004-01-00.6284001.111100DMSOBRD-K74363950BRD-K74363950-004-01-0A09NaNA549...-0.52939-0.547270.7225700.733990.2238500.0358420.333180.390640.42969-0.811390
9C-7161-01-LM6-022BRD-K74363950-004-01-00.2094700.370370DMSOBRD-K74363950BRD-K74363950-004-01-0A10NaNA549...-0.58515-0.415330.0448740.763740.062913-0.6568500.18149-0.109600.48699-0.345260
10C-7161-01-LM6-022BRD-K74363950-004-01-00.0698230.123460DMSOBRD-K74363950BRD-K74363950-004-01-0A11NaNA549...-0.52686-0.578230.5916100.851840.5603700.0391840.598640.441230.75783-0.018031
11C-7161-01-LM6-022BRD-K74363950-004-01-00.0232740.041152DMSOBRD-K74363950BRD-K74363950-004-01-0A12NaNA549...-0.48060-1.472200.8141500.794630.0892490.0722400.918280.396261.09120-0.243750
12C-7161-01-LM6-022BRD-K75958547-238-01-04.61540010.000000DMSOBRD-K75958547BRD-K75958547-238-01-0A13NaNA549...-5.89680-0.97404-5.025000-10.41400-6.0675007.6257003.318303.27410-2.122402.299300
\n", + "

7 rows × 519 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_plate_map_name Metadata_broad_sample Metadata_mg_per_ml \\\n", + "6 C-7161-01-LM6-022 BRD-K74363950-004-01-0 5.655600 \n", + "7 C-7161-01-LM6-022 BRD-K74363950-004-01-0 1.885200 \n", + "8 C-7161-01-LM6-022 BRD-K74363950-004-01-0 0.628400 \n", + "9 C-7161-01-LM6-022 BRD-K74363950-004-01-0 0.209470 \n", + "10 C-7161-01-LM6-022 BRD-K74363950-004-01-0 0.069823 \n", + "11 C-7161-01-LM6-022 BRD-K74363950-004-01-0 0.023274 \n", + "12 C-7161-01-LM6-022 BRD-K75958547-238-01-0 4.615400 \n", + "\n", + " Metadata_mmoles_per_liter Metadata_solvent Metadata_pert_id \\\n", + "6 10.000000 DMSO BRD-K74363950 \n", + "7 3.333300 DMSO BRD-K74363950 \n", + "8 1.111100 DMSO BRD-K74363950 \n", + "9 0.370370 DMSO BRD-K74363950 \n", + "10 0.123460 DMSO BRD-K74363950 \n", + "11 0.041152 DMSO BRD-K74363950 \n", + "12 10.000000 DMSO BRD-K75958547 \n", + "\n", + " Metadata_pert_mfc_id Metadata_pert_well Metadata_pert_id_vendor \\\n", + "6 BRD-K74363950-004-01-0 A07 NaN \n", + "7 BRD-K74363950-004-01-0 A08 NaN \n", + "8 BRD-K74363950-004-01-0 A09 NaN \n", + "9 BRD-K74363950-004-01-0 A10 NaN \n", + "10 BRD-K74363950-004-01-0 A11 NaN \n", + "11 BRD-K74363950-004-01-0 A12 NaN \n", + "12 BRD-K75958547-238-01-0 A13 NaN \n", + "\n", + " Metadata_cell_id ... Nuclei_Texture_InverseDifferenceMoment_AGP_5_0 \\\n", + "6 A549 ... -0.51038 \n", + "7 A549 ... -0.23602 \n", + "8 A549 ... -0.52939 \n", + "9 A549 ... -0.58515 \n", + "10 A549 ... -0.52686 \n", + "11 A549 ... -0.48060 \n", + "12 A549 ... 
-5.89680 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_DNA_20_0 \\\n", + "6 -0.76402 \n", + "7 -0.41129 \n", + "8 -0.54727 \n", + "9 -0.41533 \n", + "10 -0.57823 \n", + "11 -1.47220 \n", + "12 -0.97404 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_ER_5_0 \\\n", + "6 1.616400 \n", + "7 0.304960 \n", + "8 0.722570 \n", + "9 0.044874 \n", + "10 0.591610 \n", + "11 0.814150 \n", + "12 -5.025000 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_Mito_10_0 \\\n", + "6 -0.49600 \n", + "7 0.47884 \n", + "8 0.73399 \n", + "9 0.76374 \n", + "10 0.85184 \n", + "11 0.79463 \n", + "12 -10.41400 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_Mito_5_0 \\\n", + "6 -0.481360 \n", + "7 0.005852 \n", + "8 0.223850 \n", + "9 0.062913 \n", + "10 0.560370 \n", + "11 0.089249 \n", + "12 -6.067500 \n", + "\n", + " Nuclei_Texture_SumAverage_RNA_5_0 Nuclei_Texture_SumEntropy_DNA_10_0 \\\n", + "6 2.421100 1.10790 \n", + "7 -0.710330 0.41986 \n", + "8 0.035842 0.33318 \n", + "9 -0.656850 0.18149 \n", + "10 0.039184 0.59864 \n", + "11 0.072240 0.91828 \n", + "12 7.625700 3.31830 \n", + "\n", + " Nuclei_Texture_SumEntropy_DNA_20_0 Nuclei_Texture_SumEntropy_DNA_5_0 \\\n", + "6 1.13820 1.14320 \n", + "7 -0.23888 0.54949 \n", + "8 0.39064 0.42969 \n", + "9 -0.10960 0.48699 \n", + "10 0.44123 0.75783 \n", + "11 0.39626 1.09120 \n", + "12 3.27410 -2.12240 \n", + "\n", + " Nuclei_Texture_Variance_RNA_10_0 \n", + "6 0.329230 \n", + "7 -0.092826 \n", + "8 -0.811390 \n", + "9 -0.345260 \n", + "10 -0.018031 \n", + "11 -0.243750 \n", + "12 2.299300 \n", + "\n", + "[7 rows x 519 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# only keep active compounds, i.e. 
those with corrected p-value < 0.05\n", + "active_compounds = activity_map.query(\"below_corrected_p\")[\"Metadata_broad_sample\"]\n", + "df_active = df.query(\"Metadata_broad_sample in @active_compounds\")\n", + "df_active.head(7)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_broad_sampleMetadata_targetCells_AreaShape_EccentricityCells_AreaShape_ExtentCells_AreaShape_FormFactorCells_AreaShape_OrientationCells_AreaShape_SolidityCells_AreaShape_Zernike_0_0Cells_AreaShape_Zernike_1_1Cells_AreaShape_Zernike_2_0...Nuclei_Texture_InverseDifferenceMoment_AGP_5_0Nuclei_Texture_InverseDifferenceMoment_DNA_20_0Nuclei_Texture_InverseDifferenceMoment_ER_5_0Nuclei_Texture_InverseDifferenceMoment_Mito_10_0Nuclei_Texture_InverseDifferenceMoment_Mito_5_0Nuclei_Texture_SumAverage_RNA_5_0Nuclei_Texture_SumEntropy_DNA_10_0Nuclei_Texture_SumEntropy_DNA_20_0Nuclei_Texture_SumEntropy_DNA_5_0Nuclei_Texture_Variance_RNA_10_0
0BRD-A69636825-003-04-7[CACNA1C, CACNA1S, CACNA2D1, CACNG1, HTR3A, KC...-0.3263650.6516100.2112800.0924120.4569150.4865150.4355450.863160...0.1752000.557360-0.8594650.4090450.201909-1.003185-1.405850-1.495100-0.867225-0.066115
1BRD-A69815203-001-07-6[ABCB11, CAMLG, FPR1, PPIA, PPIF, PPP3CA, PPP3...2.487450-2.8727500.616635-0.451942-2.260100-3.3009000.316320-1.825400...-2.681800-0.197230-4.7173500.6441701.3241000.1030700.9860251.3462000.773450-2.749350
2BRD-A70858459-001-01-7[ESR1, ESR2, MAP1A, MAP2]-0.9202101.4615500.445630-0.3942351.5284501.116100-0.0549901.061270...0.2388750.3264750.0645630.1876460.200447-0.6956600.1002250.4018850.114583-0.245753
3BRD-A72309220-001-04-1[HTR1A, HTR1B, HTR1D, HTR1E, HTR1F, HTR2A, HTR...0.0454350.0997550.1036280.592620-0.3522000.202930-0.059855-0.353755...1.069575-0.475915-0.1740020.2179650.090715-0.1546950.165235-0.1601910.242195-0.126886
4BRD-A73368467-003-17-6[HRH1]-0.062074-0.3148200.526190-0.502485-0.444675-0.1912250.1450190.018870...0.527805-1.2042500.615420-0.1876450.3218801.0132350.7936750.6829251.0755000.844115
\n", + "

5 rows × 495 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_broad_sample Metadata_target \\\n", + "0 BRD-A69636825-003-04-7 [CACNA1C, CACNA1S, CACNA2D1, CACNG1, HTR3A, KC... \n", + "1 BRD-A69815203-001-07-6 [ABCB11, CAMLG, FPR1, PPIA, PPIF, PPP3CA, PPP3... \n", + "2 BRD-A70858459-001-01-7 [ESR1, ESR2, MAP1A, MAP2] \n", + "3 BRD-A72309220-001-04-1 [HTR1A, HTR1B, HTR1D, HTR1E, HTR1F, HTR2A, HTR... \n", + "4 BRD-A73368467-003-17-6 [HRH1] \n", + "\n", + " Cells_AreaShape_Eccentricity Cells_AreaShape_Extent \\\n", + "0 -0.326365 0.651610 \n", + "1 2.487450 -2.872750 \n", + "2 -0.920210 1.461550 \n", + "3 0.045435 0.099755 \n", + "4 -0.062074 -0.314820 \n", + "\n", + " Cells_AreaShape_FormFactor Cells_AreaShape_Orientation \\\n", + "0 0.211280 0.092412 \n", + "1 0.616635 -0.451942 \n", + "2 0.445630 -0.394235 \n", + "3 0.103628 0.592620 \n", + "4 0.526190 -0.502485 \n", + "\n", + " Cells_AreaShape_Solidity Cells_AreaShape_Zernike_0_0 \\\n", + "0 0.456915 0.486515 \n", + "1 -2.260100 -3.300900 \n", + "2 1.528450 1.116100 \n", + "3 -0.352200 0.202930 \n", + "4 -0.444675 -0.191225 \n", + "\n", + " Cells_AreaShape_Zernike_1_1 Cells_AreaShape_Zernike_2_0 ... \\\n", + "0 0.435545 0.863160 ... \n", + "1 0.316320 -1.825400 ... \n", + "2 -0.054990 1.061270 ... \n", + "3 -0.059855 -0.353755 ... \n", + "4 0.145019 0.018870 ... 
\n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_AGP_5_0 \\\n", + "0 0.175200 \n", + "1 -2.681800 \n", + "2 0.238875 \n", + "3 1.069575 \n", + "4 0.527805 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_DNA_20_0 \\\n", + "0 0.557360 \n", + "1 -0.197230 \n", + "2 0.326475 \n", + "3 -0.475915 \n", + "4 -1.204250 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_ER_5_0 \\\n", + "0 -0.859465 \n", + "1 -4.717350 \n", + "2 0.064563 \n", + "3 -0.174002 \n", + "4 0.615420 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_Mito_10_0 \\\n", + "0 0.409045 \n", + "1 0.644170 \n", + "2 0.187646 \n", + "3 0.217965 \n", + "4 -0.187645 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_Mito_5_0 \\\n", + "0 0.201909 \n", + "1 1.324100 \n", + "2 0.200447 \n", + "3 0.090715 \n", + "4 0.321880 \n", + "\n", + " Nuclei_Texture_SumAverage_RNA_5_0 Nuclei_Texture_SumEntropy_DNA_10_0 \\\n", + "0 -1.003185 -1.405850 \n", + "1 0.103070 0.986025 \n", + "2 -0.695660 0.100225 \n", + "3 -0.154695 0.165235 \n", + "4 1.013235 0.793675 \n", + "\n", + " Nuclei_Texture_SumEntropy_DNA_20_0 Nuclei_Texture_SumEntropy_DNA_5_0 \\\n", + "0 -1.495100 -0.867225 \n", + "1 1.346200 0.773450 \n", + "2 0.401885 0.114583 \n", + "3 -0.160191 0.242195 \n", + "4 0.682925 1.075500 \n", + "\n", + " Nuclei_Texture_Variance_RNA_10_0 \n", + "0 -0.066115 \n", + "1 -2.749350 \n", + "2 -0.245753 \n", + "3 -0.126886 \n", + "4 0.844115 \n", + "\n", + "[5 rows x 495 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# aggregate replicates by taking the median of each feature\n", + "feature_cols = [c for c in df_active.columns if not c.startswith(\"Metadata\")]\n", + "df_active = df_active.groupby(\n", + " [\"Metadata_broad_sample\", \"Metadata_target\"], as_index=False\n", + ")[feature_cols].median()\n", + "df_active[\"Metadata_target\"] = df_active[\"Metadata_target\"].str.split(\"|\")\n", + "df_active.head()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we again use metadata columns to define grouping of profiles. Here, we'd like to group those compounds that share a target and assess their similarity against compounds that do not have the same target:\n", + "\n", + "* Two compound profiles are a positive pair if they share the same target. To define that using metadata columns, positive pairs should share the same value in the metadata column that identifies targets (`Metadata_target`). We add this column to a list named `pos_sameby`.\n", + "\n", + "* In this case, profiles that form a positive pair do not need to be different in any of the metadata columns, so we keep `pos_diffby` empty. Although one could define them as being structurally different, for example.\n", + "\n", + "* Two profiles are a negative pair when they do not share a common target. That means they should be different in the metadata column that identifies targets (`Metadata_target`).\n", + "\n", + "* Profiles that form a negative pair do not need to be the same in any of the metadata columns, so we keep `neg_sameby` empty.\n", + "\n", + "We use `map.multilabel.average_precision` because each compound can have more than one target. If that's not the case, the standard `map.average_precision` should be used instead." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "72e631ce67b44d2f890735cc44c480d4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_broad_sampleaverage_precisionn_pos_pairsn_total_pairsMetadata_target
52BRD-A69636825-003-04-70.500000142HTR3A
32BRD-A72309220-001-04-10.406071442HTR1A
37BRD-A72309220-001-04-10.142857139HTR1B
39BRD-A72309220-001-04-10.142857139HTR1D
41BRD-A72309220-001-04-10.142857139HTR1E
..................
16BRD-K74363950-004-01-00.105128242CHRM3
19BRD-K74363950-004-01-00.105128242CHRM4
22BRD-K74363950-004-01-00.105128242CHRM5
28BRD-K76908866-001-07-60.500000142ERBB2
61BRD-K81258678-001-01-00.100000142RELA
\n", + "

64 rows × 5 columns

\n", + "" + ], + "text/plain": [ + " Metadata_broad_sample average_precision n_pos_pairs n_total_pairs \\\n", + "52 BRD-A69636825-003-04-7 0.500000 1 42 \n", + "32 BRD-A72309220-001-04-1 0.406071 4 42 \n", + "37 BRD-A72309220-001-04-1 0.142857 1 39 \n", + "39 BRD-A72309220-001-04-1 0.142857 1 39 \n", + "41 BRD-A72309220-001-04-1 0.142857 1 39 \n", + ".. ... ... ... ... \n", + "16 BRD-K74363950-004-01-0 0.105128 2 42 \n", + "19 BRD-K74363950-004-01-0 0.105128 2 42 \n", + "22 BRD-K74363950-004-01-0 0.105128 2 42 \n", + "28 BRD-K76908866-001-07-6 0.500000 1 42 \n", + "61 BRD-K81258678-001-01-0 0.100000 1 42 \n", + "\n", + " Metadata_target \n", + "52 HTR3A \n", + "32 HTR1A \n", + "37 HTR1B \n", + "39 HTR1D \n", + "41 HTR1E \n", + ".. ... \n", + "16 CHRM3 \n", + "19 CHRM4 \n", + "22 CHRM5 \n", + "28 ERBB2 \n", + "61 RELA \n", + "\n", + "[64 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# positive pairs are compounds that share a target\n", + "pos_sameby = [\"Metadata_target\"]\n", + "pos_diffby = []\n", + "\n", + "neg_sameby = []\n", + "# negative pairs are compounds that do not share a target\n", + "neg_diffby = [\"Metadata_target\"]\n", + "\n", + "metadata = df_active.filter(regex=\"^Metadata\")\n", + "profiles = df_active.filter(regex=\"^(?!Metadata)\").values\n", + "\n", + "target_aps = map.multilabel.average_precision(\n", + " metadata,\n", + " profiles,\n", + " pos_sameby=pos_sameby,\n", + " pos_diffby=pos_diffby,\n", + " neg_sameby=neg_sameby,\n", + " neg_diffby=neg_diffby,\n", + " multilabel_col=\"Metadata_target\",\n", + ")\n", + "target_aps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, we can compute mAP scores and p-values for each target group." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f03f87b87d1c45549ec70867a939d998", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/15 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_targetmean_average_precisionp_valuecorrected_p_valuebelow_pbelow_corrected_p-log10(p-value)
0ADRA1A0.2500000.1124140.186114FalseFalse0.730220
1ADRA2A0.2500000.1124140.186114FalseFalse0.730220
2AURKA0.6250000.0239760.103896TrueFalse0.983402
3BIRC20.0606620.3804920.471085FalseFalse0.326901
4CHRM10.0984200.4929380.492938FalseFalse0.307208
5CHRM20.0984200.4929380.492938FalseFalse0.307208
6CHRM30.0984200.4929380.492938FalseFalse0.307208
7CHRM40.0984200.4929380.492938FalseFalse0.307208
8CHRM50.0984200.4929380.492938FalseFalse0.307208
9DRD20.7500000.0006700.004355TrueTrue2.361012
\n", + "" + ], + "text/plain": [ + " Metadata_target mean_average_precision p_value corrected_p_value \\\n", + "0 ADRA1A 0.250000 0.112414 0.186114 \n", + "1 ADRA2A 0.250000 0.112414 0.186114 \n", + "2 AURKA 0.625000 0.023976 0.103896 \n", + "3 BIRC2 0.060662 0.380492 0.471085 \n", + "4 CHRM1 0.098420 0.492938 0.492938 \n", + "5 CHRM2 0.098420 0.492938 0.492938 \n", + "6 CHRM3 0.098420 0.492938 0.492938 \n", + "7 CHRM4 0.098420 0.492938 0.492938 \n", + "8 CHRM5 0.098420 0.492938 0.492938 \n", + "9 DRD2 0.750000 0.000670 0.004355 \n", + "\n", + " below_p below_corrected_p -log10(p-value) \n", + "0 False False 0.730220 \n", + "1 False False 0.730220 \n", + "2 True False 0.983402 \n", + "3 False False 0.326901 \n", + "4 False False 0.307208 \n", + "5 False False 0.307208 \n", + "6 False False 0.307208 \n", + "7 False False 0.307208 \n", + "8 False False 0.307208 \n", + "9 True True 2.361012 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_maps = map.mean_average_precision(\n", + " target_aps, pos_sameby, null_size=1000000, threshold=0.05, seed=0\n", + ")\n", + "target_maps[\"-log10(p-value)\"] = -target_maps[\"corrected_p_value\"].apply(np.log10)\n", + "target_maps.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similarly, we can plot the results, where groups of compounds targeting the same gene are called consistent if their corrected p-value < 0.05." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAioAAAGwCAYAAACHJU4LAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA440lEQVR4nO3dd3xUVf7/8fekzKSQhIRQEggBCVIFIqBAXJBeXMuqiKCIfoVdiqyIomJZWHQFZXet2ECB3y5FV8SCCCoSFEV6kCYCoSkdSYeQcn5/uJllTCEzKXOTvJ6Px308mHvPPfOZO8Pcd+6ce6/NGGMEAABgQT7eLgAAAKA4BBUAAGBZBBUAAGBZBBUAAGBZBBUAAGBZBBUAAGBZBBUAAGBZft4uoCzy8/N19OhRhYSEyGazebscAABQCsYYpaenKzo6Wj4+JR8zqdJB5ejRo4qJifF2GQAAwANHjhxRo0aNSmxTpYNKSEiIpF9faGhoqJerAQAApZGWlqaYmBjnfrwkVTqoFPzcExoaSlABAKCKKc2wDQbTAgAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAyyKoAAAAFwfPZeuGLXt12ZrvlfDdbq09m+61WggqAADAKSsvX7du3afNaZnKys/XgXPZGrYtWfuyznulHoIKAABw2p6epZ+yc5Rnfn2cLynXGH1xOs0r9RBUAACAk7+PrdA8I8mviPmVgaACAACc2tUKUoeQQPn+97GvpNp+vrq+bm2v1ENQAQAATn4+Nr3TvpmGRdVRu1qB6h8ZpmUdm6u+w9879XjlWQEAgGWF+ftpZssYb5chiSMqAADAwggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsiwTVGbMmCGbzaYJEyZ4uxQAAGARlggqGzdu1BtvvKF27dp5uxQAAGAhXg8qGRkZuuOOOzR79myFh4d7uxwAAGAhXg8q48aN03XXXac+ffpcsm12drbS0tJcJgAAUH35efPJFy9erC1btmjjxo2laj99+nT99a9/reCqAACAVXjtiMqRI0d0//33a8GCBQoICCjVOpMnT1ZqaqpzOnLkSAVXCQAAvMlmjDHeeOIPPvhAf/jDH+Tr6+ucl5eXJ5vNJh8fH2VnZ7ssK0paWprCwsKUmpqq0NDQii4ZAACUA3f231776ad3797avn27y7x77rlHLVu21COPPHLJkAIAAKo/rwWVkJAQtW3b1mVecHCw6tSpU2g+AACombx+1g8AAEBxvHrWz28lJiZ6uwQAAGAhefuCikpKVq6dKm+/vprHTp0SFlZWapbt67i4+PVv39/devWrSLqBAAANVCpj6gcPXpUI0eOVFRUlJ5++mmdO3dOHTp0UO/evdWoUSOtXr1affv2VevWrfXOO+9UZM0AAKCGKPURlfj4eI0YMUKbN29W69ati2xz7tw5ffDBB3rhhRd05MgRPfTQQyX2+dprr+m1117TwYMHJUlt2rTRX/7yFw0cOLD0rwAAAFRbNmOMKU3DM2fOqE6dOqXuuDTtP/74Y/n6+qp58+Yyxmj+/PmaOXOmtm7dqjZt
2lzyOdLS0hQWFqbU1FSFhoaWujYAAOA97uy/Sx1UKktERIRmzpype++9t9Cy7OxsZWdnOx+npaUpJiaGoAIAQBXiTlDx+Kyff/3rX0pISFB0dLQOHTokSXrhhRf04YcfetRfXl6eFi9erMzMTHXt2rXINtOnT1dYWJhziomJ8bR8AABQBXgUVF577TVNnDhRgwYNUkpKivLy8iRJtWvX1gsvvOBWX9u3b1etWrXkcDg0evRoLV26tNgxMJMnT1ZqaqpzOnLkiCflAwCAKsKjoPLyyy9r9uzZevzxx+Xr6+uc36lTJ23fvt2tvlq0aKGkpCStX79eY8aM0YgRI7Rr164i2zocDoWGhrpMAACg+nL7OiqSdODAAcXHxxea73A4lJmZ6VZfdrtdcXFxkqSOHTtq48aNevHFF/XGG294UhoAAKhGPDqi0rRpUyUlJRWav2LFCrVq1apMBeXn57sMmAUAADWXR0dUJk6cqHHjxun8+fMyxmjDhg1atGiRpk+frjlz5pS6n8mTJ2vgwIFq3Lix0tPTtXDhQiUmJmrlypWelAUAAKoZj4LKyJEjFRgYqCeeeEJZWVkaNmyYoqOj9eKLL+r2228vdT8nT57UXXfdpWPHjiksLEzt2rXTypUr1bdvX0/KAgAA1UyZr6OSlZWljIwM1atXr7xqKjUu+AYAQNXjzv7boyMqFwsKClJQUFBZuwEAACjEo6DStGlT2Wy2YpcnJyd7XBAAAEABj4LKhAkTXB7n5ORo69atWrFihSZNmlQedQEAAHgWVO6///4i58+aNUubNm0qU0EAAAAFPL7XT1EGDhyoJUuWlGeXAACgBivXoPLee+8pIiKiPLsEAAA1mEc//cTHx7sMpjXG6Pjx4zp16pReffXVcisOAADUbB4FlZtuusnlsY+Pj+rWratrr71WLVu2LI+6AAAAyn7BN2/igm8AAFQ9FXLBt7S0tFIXQGgAAADlodRBpXbt2iVe5E36dayKzWZTXl5emQsDAAAodVBZvXp1RdYBAABQSKmDSo8ePSqyDgAAgELKdFPCrKwsHT58WBcuXHCZ365duzIVBQAAIHkYVE6dOqV77rlHn376aZHLGaMCAADKg0dXpp0wYYJSUlK0fv16BQYGasWKFZo/f76aN2+ujz76qLxrBAAANZRHR1S+/PJLffjhh+rUqZN8fHwUGxurvn37KjQ0VNOnT9d1111X3nUCAIAayKMjKpmZmapXr54kKTw8XKdOnZIkXXHFFdqyZUv5VQcAAGo0j4JKixYttGfPHklS+/bt9cYbb+jnn3/W66+/rqioqHItEAAA1Fwe/fRz//3369ixY5KkKVOmaMCAAVqwYIHsdrvmzZtXnvUBAIAarFzu9ZOVlaUffvhBjRs3VmRkZHnUVSrc6wcAgKrHnf23Rz/9rF271uVxUFCQrrzyykoNKQAAoPrzKKj06tVLTZs21WOPPaZdu3aVd00AAACSPAwqR48e1YMPPqg1a9aobdu26tChg2bOnKmffvqpvOsDAAA1WJnHqBw4cEALFy7UokWL9MMPP6h79+768ssvy6u+EjFGBQCAqsed/Xe5DKbNy8vTp59+qieffFLff/99pV1Cn6ACAEDVU+GDaQt88803Gjt2rKKiojRs2DC1bdtWn3zySVm6BAAAcPLoOiqTJ0/W4sWLdfToUfXt21cvvviibrzxRgUFBZV3fQAAoAbzKKh89dVXmjRpkm677TZOSQYAABXGo6DyzTffOP+9aNEi3XDDDQoODi63ogAAAKQyjlGRpD/96U86ceJEedQCAADgosxBpRxOGgIAAChSmYMKAABARSlzUPn000/VsGHD8qgFAADARZmCysmTJ2WM0YYNG3Ty5MnyqgkAAECSh0ElPT1dw4cPV8OGDdWjRw/16NFDDRs21J133qnU1NTyrhEAANRQHgWVkSNHav369Vq2bJlSUlKUkpKiZcuWadOmTfrTn/5U3jUCAIAayqN7/QQHB2vl
ypW65pprXOZ//fXXGjBggDIzM8utwJJwrx8AAKqeCr/XT506dRQWFlZoflhYmMLDwz3pEgAAoBCPgsoTTzyhiRMn6vjx4855x48f16RJk/Tkk0+WW3EAAOtacyZND/9wWIuPnfF2KajGPPrpJz4+Xvv27VN2drYaN24sSTp8+LAcDoeaN2/u0nbLli3lU2kR+OkHALzjtqR9+upshvNxA7ufkhLaerEiVCXu7L89utfPTTfd5MlqAIBqYPWZNJeQIknHL+Tq/t2H9GKrWC9VherKo6AyZcqU8q4DAFBFfHwypcj5m1Ir50QK1CylHqPCPX0AAJJ0ZWhgkfMbBTgquRLUBKUOKm3atNHixYt14cKFEtvt3btXY8aM0YwZM8pcHADAeu5sWFf17K4H5P0kvdW2iVfqQfVW6sG0q1at0iOPPKLk5GT17dtXnTp1UnR0tAICAnT27Fnt2rVLa9eu1c6dO3XffffpscceK/IU5vLEYFoA8J7xuw5pc1qmGjrsmntFU9Xy8/V2Sagi3Nl/u33Wz9q1a/XOO+/o66+/1qFDh3Tu3DlFRkYqPj5e/fv31x133FFp11IhqAAAUPVUaFCxEoIKAABVT4VfmRYAAKAyuH168unTp/X2229r3bp1zivTNmjQQF27dtU999yjunXrlnuRAACgZnLriMrGjRt1+eWX66WXXlJYWJi6d++u7t27KywsTC+//LJatmypTZs2VVStAACghnFrjEqXLl3Uvn17vf7667LZbC7LjDEaPXq0vv/+e61bt67cCy0KY1QAAKh6KuwS+tu2bdO8efMKhRRJstlseuCBBxQfH+9etQAAAMVw66efBg0aaMOGDcUu37Bhg+rXr1/mogAAACQ3j6g89NBD+uMf/6jNmzerd+/ezlBy4sQJrVq1SrNnz9bf//73CikUAADUPG4FlXHjxikyMlLPP/+8Xn31VeXl5UmSfH191bFjR82bN0+33XZbhRQKAABqHo8v+JaTk6PTp09LkiIjI+Xv71+uhZUGg2kBAKh6Kmww7cX8/f0VFRXl6eoAAACXVK5Xpt2/f7969epVnl0CAIAarFyDSkZGhtasWVOeXQIAgBrMrZ9+XnrppRKX//zzz2UqBgAA4GJuBZUJEyYoKipKdru9yOUXLlwol6IAAAAkN4NKbGysnn322WJPQU5KSlLHjh3LpTAAAAC3xqh07NhRmzdvLna5zWaTh2c7AwAAFOLWEZVp06YpKyur2OWtW7fWgQMHylwUAACA5GZQad26dYnL/f39FRsbW6aCAAAACpTr6ckAAADlyaMr08bHx8tmsxWab7PZFBAQoLi4ON19993q2bNnif1Mnz5d77//vn744QcFBgaqW7duevbZZ9WiRQtPygIAANWMR0dUBgwYoOTkZAUHB6tnz57q2bOnatWqpf3796tz5846duyY+vTpow8//LDEftasWaNx48bpu+++0+eff66cnBz169dPmZmZHr0YAABQvXh0U8JRo0apcePGevLJJ13mP/300zp06JBmz56tKVOm6JNPPtGmTZtK3e+pU6dUr149rVmzRt27d79ke25KCABA1ePO/tujIyrvvvuuhg4dWmj+7bffrnfffVeSNHToUO3Zs8etflNTUyVJERERRS7Pzs5WWlqaywQAAKovj4JKQECAvv3220Lzv/32WwUEBEiS8vPznf8ujfz8fE2YMEEJCQlq27ZtkW2mT5+usLAw5xQTE+NJ+QAAoIrwaDDt+PHjNXr0aG3evFmdO3eWJG3cuFFz5szRY489JklauXKlOnToUOo+x40bpx07dmjt2rXFtpk8ebImTpzofJyWlkZYAQCgGvNojIokLViwQK+88orz550WLVpo/PjxGjZsmCTp3LlzzrOALuW+++7Thx9+qK+++kpNmzYtdQ2MUQEAoOpxZ//tcVApD8YYjR8/XkuXLlViYqKaN2/u1voEFQAAqh539t8e/fRTYPPmzdq9e7ckqU2bNoqP
j3dr/XHjxmnhwoX68MMPFRISouPHj0uSwsLCFBgYWJbSAABANeDREZWTJ0/q9ttvV2JiomrXri1JSklJUc+ePbV48WLVrVu3dE9exEXjJGnu3Lm6++67L7k+R1QAAKh6Kvz05PHjxys9PV07d+7UL7/8ol9++UU7duxQWlqa/vznP5e6H2NMkVNpQgoAAKj+PDqiEhYWpi+++MJ5xk+BDRs2qF+/fkpJSSmv+krEERUAAKqeCj+ikp+fL39//0Lz/f39lZ+f70mXAAAAhXgUVHr16qX7779fR48edc77+eef9cADD6h3797lVhwAAKjZPAoqr7zyitLS0tSkSRM1a9ZMzZo1U9OmTZWWlqaXX365vGsEAAA1lEenJ8fExGjLli364osv9MMPP0iSWrVqpT59+pRrcQAAoGbz6gXfyorBtAAAVD0VcsG3l156qdQFuHOKMgAAQHFKfUSltPfgsdlsSk5OLlNRpcURFQAAqp4KOaJy4MCBIuevXbtWnTp1KtXNBwEAANzh0Vk/Fxs0aJDLacoAAADlpcxBpQqPxQUAABZX5qACAABQUcocVN544w3Vr1+/PGoBAABw4dEF3y42bNiw8qgDAACgEH76AQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAQAAlkVQAWAJ8+bNU+3atb1dRoVKTEyUzWZTSkpKufR38OBB2Ww2JSUlVUj/3nDttddqwoQJ3i4DFkJQAVAp7r77btlsNtlsNtntdsXFxWnatGnKzc31dmmFNGnSRC+88EK599utWzcdO3ZMYWFh5d53dfH+++/rqaeeKlXbigw1NptNH3zwQYX0fbG//e1v6tatm4KCgooN6gX/by6eFi9eXGK/N9xwgxo3bqyAgABFRUVp+PDhOnr0qEublStXqkuXLgoJCVHdunV1yy236ODBg87lW7duVXx8vGrVqqXrr79ev/zyi3NZbm6uOnbsqA0bNnj82kuLoAKg0gwYMEDHjh3T3r179eCDD2rq1KmaOXOmt8uqNHa7XQ0aNJDNZvN2KZYVERGhkJAQb5dRaS5cuKDBgwdrzJgxJbabO3eujh075pxuuummEtv37NlT7777rvbs2aMlS5Zo//79uvXWW53LDxw4oBtvvFG9evVSUlKSVq5cqdOnT+vmm292thk5cqR69eqlLVu2KDU1Vc8884xz2T/+8Q8lJCToqquu8uyFu8NUYampqUaSSU1N9XYpAC5hxIgR5sYbb3SZ17dvX9OlSxdjjDFz5841YWFhZsWKFaZly5YmODjY9O/f3xw9etRlndmzZ5uWLVsah8NhWrRoYWbNmuVcduDAASPJLFmyxFx77bUmMDDQtGvXznz77bcufbz33numdevWxm63m9jYWPP3v//duaxHjx5GksuUkZFhQkJCzH/+8x+XfpYuXWqCgoJMWlqa87kXLVpkunbtahwOh2nTpo1JTEx0tl+9erWRZM6ePeuct3btWtOjRw8TGBhoateubfr162d++eUXY4wxn376qUlISDBhYWEmIiLCXHfddWbfvn2FXu/WrVsL9V+amouSl5dnnn32WdOsWTNjt9tNTEyMefrpp53Lv//+e9OzZ08TEBBgIiIizKhRo0x6erpzecH7PHPmTNOgQQMTERFhxo4day5cuOBsM2vWLBMXF2ccDoepV6+eueWWW1y2//3333/JtiNGjCj0Ph04cMAYY8z27dvNgAEDTHBwsKlXr5658847zalTp1yeY/z48WbSpEkmPDzc1K9f30yZMsW5PDY21qXf2NjYIrdVeSr4/BdFklm6dGmZ+v/www+NzWZzvg//+c9/jJ+fn8nLy3O2+eijj1zaBAYGmt27dxtjjHn11VfNoEGDjDHG7N+/3zRv3rzYz1BpuLP/JqgAqBRFBZUbbrjBXHnllcaYX7+o/f39TZ8+fczGjRvN5s2bTatW
rcywYcOc7f/973+bqKgos2TJEpOcnGyWLFliIiIizLx584wx/9txt2zZ0ixbtszs2bPH3HrrrSY2Ntbk5OQYY4zZtGmT8fHxMdOmTTN79uwxc+fONYGBgWbu3LnGGGPOnDljGjVqZKZNm2aOHTtmjh07ZowxZtSoUc4v6ovrv+uuu1yeu1GjRua9994zu3btMiNHjjQhISHm9OnTxpjCQWXr1q3G4XCYMWPGmKSkJLNjxw7z8ssvO3eq7733nlmyZInZu3ev2bp1q7n++uvNFVdc4dy5lBRUSlNzUR5++GETHh5u5s2bZ/bt22e+/vprM3v2bGOMMRkZGSYqKsrcfPPNZvv27WbVqlWmadOmZsSIES7vc2hoqBk9erTZvXu3+fjjj01QUJB58803jTHGbNy40fj6+pqFCxeagwcPmi1btpgXX3zRuf7FQaWktikpKaZr165m1KhRzvcpNzfXnD171tStW9dMnjzZ7N6922zZssX07dvX9OzZ0+U5QkNDzdSpU82PP/5o5s+fb2w2m/nss8+MMcacPHnSSDJz5841x44dMydPnix2e7Vu3doEBwcXOw0YMKDYdS92qaASHR1t6tSpYzp37mzeeustk5+fX6p+jfn1M33bbbeZhIQE57zk5GRjt9vNnDlzTG5urklJSTGDBw82ffv2dbbp0qWLeemll0xOTo655ZZbzKOPPmqM+fUPjLIGJ4IKAMu5OKjk5+ebzz//3DgcDvPQQw8ZY379opbkcsRg1qxZpn79+s7HzZo1MwsXLnTp96mnnjJdu3Y1xvxvxz1nzhzn8p07dxpJzr8Mhw0b5vJlbIwxkyZNMq1bt3Y+jo2NNc8//7xLm/Xr1xtfX1/nEZ4TJ04YPz8/5xGTgueeMWOGc52cnBzTqFEj8+yzzxpjCgeJoUOHuuw8LuXUqVNGktm+fbvLcxYXVC5V82+lpaUZh8PhDCa/9eabb5rw8HCTkZHhnPfJJ58YHx8fc/z4cWPMr+9zbGysyc3NdbYZPHiwGTJkiDHGmCVLlpjQ0NBi/xq/OKi407bAU089Zfr16+cy78iRI0aS2bNnj3O9a665xqVN586dzSOPPOJ8XNqjGAcPHjR79+4tdvrpp58u2YcxJQeVadOmmbVr15otW7aYGTNmGIfD4RLuivPwww+boKAgI8l06dLFGZgLJCYmmnr16hlfX18jyXTt2tXlaN+OHTtM9+7dTePGjc3QoUNNamqq+X//7/+ZG2+80fz000+mX79+plmzZubxxx8v1Wu8mDv7b8aoAKg0y5YtU61atRQQEKCBAwdqyJAhmjp1qnN5UFCQmjVr5nwcFRWlkydPSpIyMzO1f/9+3XvvvapVq5Zzevrpp7V//36X52nXrp1LH5Kc/ezevVsJCQku7RMSErR3717l5eUVW/tVV12lNm3aaP78+ZKkf//734qNjVX37t1d2nXt2tX5bz8/P3Xq1Em7d+8uss+kpCT17t272Ofcu3evhg4dqssuu0yhoaFq0qSJJOnw4cPFruNJzQV2796t7OzsYmvavXu32rdvr+DgYOe8hIQE5efna8+ePc55bdq0ka+vr/Pxxe9j3759FRsbq8suu0zDhw/XggULlJWVVeTzudO2wLZt27R69WqXz0jLli0lyeVzcvFn5Lc1uiM2NlZxcXHFTg0bNnS7z9968sknlZCQoPj4eD3yyCN6+OGHSzW2a9KkSdq6das+++wz+fr66q677pIxRpJ0/PhxjRo1SiNGjNDGjRu1Zs0a2e123Xrrrc42bdq00Zo1a3To0CEtXLhQOTk5mjJlil555RWNHz9e3bp107Zt2/T+++/r448/LvPrLA5BBUCl6dmzp5KSkrR3716dO3dO8+fPd9np+fv7u7S32WzOL82MjAxJ0uzZs5WUlOScduzYoe+++85lvYv7KRi4mp+fX+b6R44cqXnz5kn6dXDjPffcU6aBsYGBgSUuLzjTYvbs2Vq/fr3Wr18v6dcBmBVR
86XqKa2i3seC7R8SEqItW7Zo0aJFioqK0l/+8he1b9++yFOq3WlbICMjQ9dff73LZ6TgM3dxQCupRne0adPGJRT9dho4cKDbfV7K1VdfrZ9++knZ2dkltouMjNTll1+uvn37avHixVq+fLnz/8qsWbMUFham5557TvHx8erevbv+/e9/a9WqVc7P2W9NnDhREyZMUKNGjZSYmKjBgwcrODhY1113nRITE8v7ZToRVABUmuDgYMXFxalx48by8/Nza9369esrOjpaycnJhf5qbdq0aan7adWqlb755huXed98840uv/xy51EAu91e5NGVO++8U4cOHdJLL72kXbt2acSIEYXaXByacnNztXnzZrVq1arIWtq1a6dVq1YVuezMmTPas2ePnnjiCfXu3VutWrXS2bNnS/063am5QPPmzRUYGFhsTa1atdK2bduUmZnpnPfNN9/Ix8dHLVq0KHVNfn5+6tOnj5577jl9//33OnjwoL788ku32xb1Pl155ZXauXOnmjRpUuhzcnEovhR/f/8Sj7AVWL58eaFQdPE0Z86cUj9naSUlJSk8PFwOh6PU6xSEsIJwk5WVJR8f1whQ8PkvKrCtWrVKu3fv1n333SdJysvLU05OjiQpJyenVNvKU+59UwCAF/31r3/Vn//8Z4WFhWnAgAHKzs7Wpk2bdPbsWU2cOLFUfTz44IPq3LmznnrqKQ0ZMkTr1q3TK6+8oldffdXZpkmTJvrqq690++23y+FwKDIyUpIUHh6um2++WZMmTVK/fv3UqFGjQv3PmjVLzZs3V6tWrfT888/r7Nmz+r//+78ia5k8ebKuuOIKjR07VqNHj5bdbtfq1as1ePBgRUREqE6dOnrzzTcVFRWlw4cP69FHH3V7m5Wm5gIBAQHOnxbsdrsSEhJ06tQp7dy5U/fee6/uuOMOTZkyRSNGjNDUqVN16tQpjR8/XsOHD1f9+vVLVc+yZcuUnJys7t27Kzw8XMuXL1d+fn6RQedSbZs0aaL169fr4MGDqlWrliIiIjRu3DjNnj1bQ4cO1cMPP6yIiAjt27dPixcv1pw5c1x+kipJkyZNtGrVKiUkJMjhcCg8PLzIdrGxsaXqrziHDx/WL7/8osOHDysvL8958b64uDjVqlVLH3/8sU6cOKEuXbooICBAn3/+uZ555hk99NBDzj42bNigu+66S6tWrVLDhg21fv16bdy4Uddcc43Cw8O1f/9+Pfnkk2rWrJnzp8nrrrtOzz//vKZNm6ahQ4cqPT1djz32mGJjYxUfH+9S4/nz53Xfffdp0aJFznCTkJCgWbNmady4cVqyZIn++c9/lmk7lMjtETAWwmBaoOoo6qyfixU1mHDp0qXmt19TCxYsMB06dDB2u92Eh4eb7t27m/fff98YU3hwqTHGnD171kgyq1evds4rOD3Z39/fNG7c2MycOdPlOdatW2fatWtnHA5HoedftWqVkWTeffddl/kFz71w4UJz1VVXGbvdblq3bm2+/PJLZ5uiTk9OTEw03bp1Mw6Hw9SuXdv079/fufzzzz83rVq1Mg6Hw7Rr184kJia6DPK81GDaS9VclLy8PPP000+b2NhY5/Z55plnnMtLe3ryxe6//37To0cPY4wxX3/9tenRo4cJDw93nj7+zjvvONtePED2Um337NljunTpYgIDA11OT/7xxx/NH/7wB1O7dm0TGBhoWrZsaSZMmOA8U6aoQbg33nijy9lLH330kYmLizN+fn4VenpyUadZX/x5/fTTT02HDh1MrVq1THBwsGnfvr15/fXXXU4rLnjfC15/wXsUERFhHA6HadKkiRk9enShgb2LFi0y8fHxJjg42NStW9fccMMNzkHnF3v00UfNgw8+6DJv7969pnPnziY0NNSMGTPGpZ7ScGf/bTPmvz8AV0FpaWkKCwtTamqqQkNDvV0OgBrgX//6lx544AEdPXpUdrvdOf/gwYNq2rSptm7dqg4dOnivwCIUVzPgLe7sv/np
BwBKISsrS8eOHdOMGTP0pz/9qUrs8KtizcBvMZgWAErhueeeU8uWLdWgQQNNnjzZ2+WUSlWsGfgtfvoBAACVyp39N0dUAACAZRFUAACAZRFUAACAZRFUAACAZRFUAACAZVWL66hkZmYWeVlkX19fBQQEuLQrjo+Pj8sNudxpm5WVpeJOnrLZbAoKCvKo7blz50q8SdbF961wp+358+dLvC+DO22DgoKcNzjLzs5Wbm5uubQNDAx0Xqr5woULzntKlLVtQECA87PiTtucnJwSbwTncDic965xp21ubm6JNxaz2+3Om6e50zYvL0/nz58vtq2/v7/zmhrutM3Pz9e5c+fKpa2fn5/zXiXGmBLviutOW3f+3/MdUXRbviP4jqiM74hSc+uatxZTcAne4qZBgwa5tA8KCiq2bcHlnQtERkYW27ZTp04ubWNjY4tt27p1a5e2rVu3Lrbtby/T3KlTp2LbRkZGurTt0aNHsW2DgoJc2g4aNKjE7XaxW2+9tcS2GRkZzrbFXQq6YDp58qSz7dixY0tsW3ApaGOMeeihh0psu2PHDmfbKVOmlNh2w4YNzrbPPfdciW0vvuT6K6+8UmLbZcuWOdvOnTu3xLYXX8b83XffLbHt3LlznW2XLVtWYttXXnnF2bbgktrFTc8995yz7YYNG0psO2XKFGfbHTt2lNj2oYcecrYtuLx7cdPYsWOdbU+ePFli24svbZ6RkVFi21tvvdXlM1xSW74jfp34jvjfxHfEr1NFf0e4cwl9fvoBAACWVS0u+Hb06NEiLxjDYd2i23JYl8O6/PTjflu+Izxry3fEr/iOcG3rzgXfqkVQ4cq0AABUHVyZFgAAVAsEFQAAYFkEFQAAYFkEFQAAYFkEFQAAYFkEFaACVeGT6gDAEqrFJfQBqzmeel4T3tmqjQfPKjTAT48NaqXBnWK8XRYAVDkcUQHKWX6+0f/N36iNB88qL9/obFaOJr33vdbuPe3t0gCgyiGoAOXseNp57Tqaprz8//3s4+tj0+e7jnuxKgComggqQDlz+BX938rhX/gO3wCAkhFUgHJWp5ZDN3WIlu2/j31skr+vTUM6M0YFANzFYFqgAswc3F5NI2tpXfJp1Ql26L5ecWpWt5a3ywKAKoebEgIAgErFTQkBAEC1QFABAACWRVABAACWRVABAACW5dWg8tVXX+n6669XdHS0bDabPvjgA2+WAwAALMarQSUzM1Pt27fXrFmzvFkGAJQLY4zSz+coP7/KnkwJWI5Xr6MycOBADRw40JslAEC52HYkRaP/vVnHUs+rlsNPM265Qr9vF+3tsoAqr0pd8C07O1vZ2dnOx2lpaV6sBgB+lXouR3e9vUHp53MkSZnZufrzoq1qGhmsNtFhXq4OqNqq1GDa6dOnKywszDnFxHBJcgDet/PnVKWey1HBLz4FP/ys23/GazUB1UWVCiqTJ09Wamqqczpy5Ii3SwIA1QoofHDaGCnYUaUOWgOWVKX+FzkcDjkcDm+XAQAu2kaHqXvzSH2977QkycdmU1RYgK5rF+XlyoCqr0oFFQCwIh8fm968q5Nmf5WsXcfSFF07UGOvbabQAH9vlwZUeV4NKhkZGdq3b5/z8YEDB5SUlKSIiAg1btzYi5UBgHsC/H01vndzb5cBVDteDSqbNm1Sz549nY8nTpwoSRoxYoTmzZvnpaoAAIBVeDWoXHvttTKGCyMBAICiVamzfgAAQM1CUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJbFTQkBVIhDZzI16T/bnDfp+9sfrlDnJhHeLgtAFcMRFVSK0+lZmvbRTr2R+OtNKHPz8pV8KkM7fk7RZzuP68iZrDI/x5mMbO07ma7zOXll7gtlk3UhV7e/+Z02H05RRnae9p3M0J1z1uvg6UxvlwagiuGICircy6t+1D8+3+t8PH3FHjUOD9Dhs+dd2g26ooFevaOj2/0bYzRz5R69
mrhfkhQRbNdbIzopvnF42QqHx77/KVXHUv/3/uYb6UJevlbvOal7Ipt6sTIAVQ1HVFDhLg4pBX4bUiRp+fbjWrj+sNv9L99+3BlSJCkl64Lunb9J2bkcWfEWPx9b4ZmmmPkAUAKCCipUTk6OW+1X/3DC7efYfOisyw4w30i/ZF7QkV/Oud0Xyke7RrXVOipEvv99X3x9bAoL8lf/tg28XBmAqoagggrl7+/vVvt6oQFuP0edWnblG1NofkSw3e2+UD7sfj5aMLKLbo5vqJYNQtSrZT29P6ab6oW4//4CqNkYo4IKVz/UoRNp2ZdsF+jvq0cHtnS7/zuvjtXiDYf1c8o5+dhsys03uq9nHEHFy8KD7Zo5uL23ywBQxdmMKeJP0SoiLS1NYWFhSk1NVWhoqLfLQQlue/1bbTl8VjabTWN6XKaEuLr6fNcJfbv/tHJyjZrWDdZzt7ZT7SDPwkVqVo4WbDikMxkX1Ck2XAPaNpDNxngIALAid/bfBBUAAFCp3Nl/M0YFAABYFkEFAABYFkEFAABYFmf9VGHJpzL00H+2affxdDUMC9Tf/tBWV19Wx9tlAQBQbjiiUkVlZudq6OzvtO2nVJ27kKfk0xm66+0NOnSGe6kAAKoPgkoVte1Iik6kZSsv/9eTtgrupZK455SXKwMAoPwQVKoof78i3joj+fly7RAAQPVBUKmi2jeqrVZRIfL970XNfG02hQfb1b8N91IBAFQfDKatoux+Plo0qoumf/qDdvycqtiIID0ysKUiazm8XRoAAOWGoFKF1Q6y69lb2nm7DAAAKgw//QAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMsiqAAAAMvy83YBVjT7q/1K3HNKdUMC9Jfft1ZELXup1jPG6PNdJ/TD8XQ1rB2oGztEy8/X8yyYknVBHyYdVUZ2rro1q6P4xuEe9wUAQFVkM8YYbxfhqbS0NIWFhSk1NVWhoaHl0uc9czdo9Z5TzscOPx99/XBP1QsNuOS6Uz7cofnrDsnPx6bcfKNr4iI1757OHoWVU+nZuuGVtTqedl4+NpvyjdE/BrfXzVc2crsvAACsxJ39Nz/9XOTAqQyXkCJJ2bn5evT97y+57g/H0zR/3SFJUm7+r9lv7b7Tinv8UzV59BN1+OtKt2p5LXG/TqZnyxgpL9/IGOnxpTuUn19lcyUAAG4jqFxk/6mMIuefSM2+5LrHU8+XuDzlXK5+9+yXpa7lRNp55f/mYNe5nDxlXMgtdR8AAFR1BJWLXNk4osj57WPCLrnu5fVD5OdjK7HNkbPnSl1Lm4auh8J8bFJ07QCFOBhWBACoOQgqF4moZdejA1q6zGtWN1hP39T2kutG1w7UP25rX2JYKTnGuBp5zWXq3bK+83HtILveuLOTbDZ3egEAoGpjMG0RjpzJ0pofTyomIkg9WtRza91T6dk6cDpTt72xrtCyq5qE693R3UrdlzFGP57IUEZ2jlo0CFUtjqYAAKoBd/bfBJUKcjo9SwnPrlF2br4kqV3DMH00/hovVwUAgPe5s//mT/QKEhkSpD1PD/R2GQAAVGmMUQEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZFUAEAAJZVpW9KWHDj57S0NC9XAgAASqtgv12wHy9JlQ4q6enpkqSYmBgvVwIAANyVnp6usLCwEtvYTGnijEXl5+fr6NGjCgkJkc1m83Y51VZaWppiYmJ05MgRhYaGerucGoft711sf+9i+3tXRW1/Y4zS09MVHR0tH5+SR6FU6SMqPj4+atSokbfLqDFCQ0P5ovAitr93sf29i+3vXRWx/S91JKUAg2kBAIBlEVQAAIBlEVRwSQ6HQ1OmTJHD4fB2KTUS29+72P7exfb3Lits/yo9
mBYAAFRvHFEBAACWRVABAACWRVABAACWRVABAACWRVCBJGnWrFlq0qSJAgICdPXVV2vDhg3Ftp09e7Z+97vfKTw8XOHh4erTp0+J7XFp7mz/iy1evFg2m0033XRTxRZYjbm77VNSUjRu3DhFRUXJ4XDo8ssv1/Llyyup2urH3e3/wgsvqEWLFgoMDFRMTIweeOABnT9/vpKqrV6++uorXX/99YqOjpbNZtMHH3xwyXUSExN15ZVXyuFwKC4uTvPmzavwOmVQ4y1evNjY7Xbz9ttvm507d5pRo0aZ2rVrmxMnThTZftiwYWbWrFlm69atZvfu3ebuu+82YWFh5qeffqrkyqsHd7d/gQMHDpiGDRua3/3ud+bGG2+snGKrGXe3fXZ2tunUqZMZNGiQWbt2rTlw4IBJTEw0SUlJlVx59eDu9l+wYIFxOBxmwYIF5sCBA2blypUmKirKPPDAA5VcefWwfPly8/jjj5v333/fSDJLly4tsX1ycrIJCgoyEydONLt27TIvv/yy8fX1NStWrKjQOgkqMFdddZUZN26c83FeXp6Jjo4206dPL9X6ubm5JiQkxMyfP7+iSqzWPNn+ubm5plu3bmbOnDlmxIgRBBUPubvtX3vtNXPZZZeZCxcuVFaJ1Zq723/cuHGmV69eLvMmTpxoEhISKrTOmqA0QeXhhx82bdq0cZk3ZMgQ079//wqszBh++qnhLly4oM2bN6tPnz7OeT4+PurTp4/WrVtXqj6ysrKUk5OjiIiIiiqz2vJ0+0+bNk316tXTvffeWxllVkuebPuPPvpIXbt21bhx41S/fn21bdtWzzzzjPLy8iqr7GrDk+3frVs3bd682fnzUHJyspYvX65BgwZVSs013bp161zeL0nq379/qfcVnqrSNyVE2Z0+fVp5eXmqX7++y/z69evrhx9+KFUfjzzyiKKjowt9gHFpnmz/tWvX6q233lJSUlIlVFh9ebLtk5OT9eWXX+qOO+7Q8uXLtW/fPo0dO1Y5OTmaMmVKZZRdbXiy/YcNG6bTp0/rmmuukTFGubm5Gj16tB577LHKKLnGO378eJHvV1pams6dO6fAwMAKeV6OqKBMZsyYocWLF2vp0qUKCAjwdjnVXnp6uoYPH67Zs2crMjLS2+XUOPn5+apXr57efPNNdezYUUOGDNHjjz+u119/3dul1QiJiYl65pln9Oqrr2rLli16//339cknn+ipp57ydmmoQBxRqeEiIyPl6+urEydOuMw/ceKEGjRoUOK6f//73zVjxgx98cUXateuXUWWWW25u/3379+vgwcP6vrrr3fOy8/PlyT5+flpz549atasWcUWXU148tmPioqSv7+/fH19nfNatWql48eP68KFC7Lb7RVac3XiyfZ/8sknNXz4cI0cOVKSdMUVVygzM1N//OMf9fjjj8vHh7+9K1KDBg2KfL9CQ0Mr7GiKxBGVGs9ut6tjx45atWqVc15+fr5WrVqlrl27Frvec889p6eeekorVqxQp06dKqPUasnd7d+yZUtt375dSUlJzumGG25Qz549lZSUpJiYmMosv0rz5LOfkJCgffv2OcOhJP3444+KiooipLjJk+2flZVVKIwUhEbDbesqXNeuXV3eL0n6/PPPS9xXlIsKHaqLKmHx4sXG4XCYefPmmV27dpk//vGPpnbt2ub48ePGGGOGDx9uHn30UWf7GTNmGLvdbt577z1z7Ngx55Senu6tl1Clubv9f4uzfjzn7rY/fPiwCQkJMffdd5/Zs2ePWbZsmalXr555+umnvfUSqjR3t/+UKVNMSEiIWbRokUlOTjafffaZadasmbntttu89RKqtPT0dLN161azdetWI8n885//NFu3bjWHDh0yxhjz6KOPmuHDhzvbF5yePGnSJLN7924za9YsTk9G5Xn55ZdN48aNjd1uN1dddZX57rvvnMt69OhhRowY4XwcGxtrJBWapkyZUvmFVxPubP/fIqiUjbvb/ttvvzVX
X321cTgc5rLLLjN/+9vfTG5ubiVXXX24s/1zcnLM1KlTTbNmzUxAQICJiYkxY8eONWfPnq38wquB1atXF/ldXrDNR4wYYXr06FFonQ4dOhi73W4uu+wyM3fu3Aqv02YMx8sAAIA1MUYFAABYFkEFAABYFkEFAABYFkEFAABYFkEFAABYFkEFAABYFkEFAABYFkEFAABYFkEFAABYFkEFgKX1799fvr6+2rhxY6Fld999t2w2m2w2m+x2u+Li4jRt2jTl5uZ6oVIAFYGgAsCyDh8+rG+//Vb33Xef3n777SLbDBgwQMeOHdPevXv14IMPaurUqZo5c2YlVwqgohBUAFS4a6+9VuPHj9eECRMUHh6u+vXra/bs2crMzNQ999yjkJAQxcXF6dNPP3VZb+7cufr973+vMWPGaNGiRTp37lyhvh0Ohxo0aKDY2FiNGTNGffr00UcffVRZLw1ABSOoAKgU8+fPV2RkpDZs2KDx48drzJgxGjx4sLp166YtW7aoX79+Gj58uLKysiRJxhjNnTtXd955p1q2bKm4uDi99957l3yewMBAXbhwoaJfDoBKQlABUCnat2+vJ554Qs2bN9fkyZMVEBCgyMhIjRo1Ss2bN9df/vIXnTlzRt9//70k6YsvvlBWVpb69+8vSbrzzjv11ltvFdu/MUZffPGFVq5cqV69elXKawJQ8QgqACpFu3btnP/29fVVnTp1dMUVVzjn1a9fX5J08uRJSdLbb7+tIUOGyM/PT5I0dOhQffPNN9q/f79Lv8uWLVOtWrUUEBCggQMHasiQIZo6dWoFvxoAlYWgAqBS+Pv7uzy22Wwu82w2myQpPz9fv/zyi5YuXapXX31Vfn5+8vPzU8OGDZWbm1toUG3Pnj2VlJSkvXv36ty5c5o/f76Cg4Mr/gUBqBR+3i4AAH5rwYIFatSokT744AOX+Z999pn+8Y9/aNq0afL19ZUkBQcHKy4uzgtVAqgMBBUAlvPWW2/p1ltvVdu2bV3mx8TEaPLkyVqxYoWuu+46L1UHoDLx0w8AS9m/f7+2bdumW265pdCysLAw9e7du8RBtQCqF5sxxni7CAAAgKJwRAUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFjW/wdjHVoMVQbXvwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "consistent_ratio = target_maps.below_corrected_p.mean()\n", + "\n", + "plt.scatter(\n", + " data=target_maps,\n", + " x=\"mean_average_precision\",\n", + " y=\"-log10(p-value)\",\n", + " c=\"below_corrected_p\",\n", + " cmap=\"tab10\",\n", + " s=10,\n", + ")\n", + "plt.xlabel(\"mAP\")\n", + "plt.ylabel(\"-log10(p-value)\")\n", + "plt.axhline(-np.log10(0.05), color=\"black\", linestyle=\"--\")\n", + "plt.text(\n", + " 0.5,\n", + " 1.5,\n", + " f\"Phenotypically consistent = {100 * consistent_ratio:.2f}%\",\n", + " va=\"center\",\n", + " ha=\"left\",\n", + ")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can list compounds that are phenotypically active and consistent.\n", + "\n", + "Note that in multi-label scenario, when each compound can have multiple targets, the same compound can have \"consistent\" response in respect to one target, but not another." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotypically consistent targets: DRD2, EGFR, HTR3A, PSMB1\n", + "Phenotypically consistent compounds: BRD-A69636825-003-04-7, BRD-K50691590-001-02-2, BRD-K60230970-001-10-0, BRD-K70330367-003-07-9, BRD-K70358946-001-15-7, BRD-K70401845-003-09-6, BRD-K70914287-300-02-8\n" + ] + } + ], + "source": [ + "consistent_targets = target_maps.query(\"below_corrected_p\")[\"Metadata_target\"]\n", + "consistent_compounds = df_active[\n", + " df_active[\"Metadata_target\"].apply(\n", + " lambda x: any(t in x for t in consistent_targets)\n", + " )\n", + "][\"Metadata_broad_sample\"]\n", + "\n", + "print(f\"Phenotypically consistent targets: {consistent_targets.str.cat(sep=', ')}\")\n", + "print(f\"Phenotypically consistent compounds: {consistent_compounds.str.cat(sep=', ')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "copairs", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index c74d308..92728a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "copairs" -version = "0.4.3" +version = "0.4.4" description = "Find pairs and compute metrics between them" readme = "README.md" requires-python = ">=3.8" @@ -33,3 +33,9 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["src"] + +[tool.ruff.lint] +select = ["D"] + +[tool.ruff.lint.pydocstyle] +convention = "numpy" diff --git a/src/copairs/__init__.py b/src/copairs/__init__.py 
index ee93afd..9c27384 100644 --- a/src/copairs/__init__.py +++ b/src/copairs/__init__.py @@ -1,6 +1,4 @@ -""" -Package to create pairwise lists based on sameby and diffby criteria -""" +"""Package to create pairwise lists based on sameby and diffby criteria.""" from .matching import Matcher, MatcherMultilabel diff --git a/src/copairs/compute.py b/src/copairs/compute.py index 954bf01..c03c1b5 100644 --- a/src/copairs/compute.py +++ b/src/copairs/compute.py @@ -1,20 +1,45 @@ +"""Functions to compute distances and ranks using numpy operations.""" + import itertools import os from multiprocessing.pool import ThreadPool from pathlib import Path -from typing import Callable +from typing import Callable, Tuple, Union import numpy as np from tqdm.autonotebook import tqdm -def parallel_map(par_func, items): - """Execute par_func(i) for every i in items using ThreadPool and tqdm.""" +def parallel_map(par_func: Callable[[int], None], items: np.ndarray) -> None: + """Execute a function in parallel over a list of items. + + This function uses a thread pool to process items in parallel, with progress + tracking via `tqdm`. It is particularly useful for batch operations that benefit + from multithreading. + + Parameters + ---------- + par_func : Callable + A function to execute for each item. It should accept a single argument + (an item index or value). + items : np.ndarray + An array or list of items to process. 
+ """ + # Total number of items to process num_items = len(items) + + # Determine the number of threads to use, limited by CPU count pool_size = min(num_items, os.cpu_count()) + + # Calculate chunk size for dividing work among threads chunksize = num_items // pool_size + + # Use a thread pool to execute the function in parallel with ThreadPool(pool_size) as pool: + # Map the function to items with unordered execution for better efficiency tasks = pool.imap_unordered(par_func, items, chunksize=chunksize) + + # Display progress using tqdm for _ in tqdm(tasks, total=len(items), leave=False): pass @@ -22,18 +47,42 @@ def parallel_map(par_func, items): def batch_processing( pairwise_op: Callable[[np.ndarray, np.ndarray], np.ndarray], ): - """Decorator adding the batch_size param to run the function with - multithreading using a list of paired indices""" + """ + Add batch processing support to pairwise operations. + + This decorator wraps a pairwise operation to process data in batches, + enabling efficient computation and multithreading when working with large + datasets. + + Parameters + ---------- + pairwise_op : Callable + A function that computes pairwise operations (e.g., similarity or distance) + between two arrays of features. + + Returns + ------- + Callable + A wrapped function that processes pairwise operations in batches. 
+ + """ def batched_fn(feats: np.ndarray, pair_ix: np.ndarray, batch_size: int): + # Total number of pairs to process num_pairs = len(pair_ix) + + # Initialize an empty result array to store pairwise operation results result = np.empty(num_pairs, dtype=np.float32) def par_func(i): + # Extract the features for the current batch of pairs x_sample = feats[pair_ix[i : i + batch_size, 0]] y_sample = feats[pair_ix[i : i + batch_size, 1]] + + # Compute pairwise operations for the current batch result[i : i + len(x_sample)] = pairwise_op(x_sample, y_sample) + # Use multithreading to process the batches in parallel parallel_map(par_func, np.arange(0, num_pairs, batch_size)) return result @@ -42,52 +91,180 @@ def par_func(i): def pairwise_corr(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray: + """Compute the Pearson correlation coefficient for paired rows of two matrices. + + Parameters + ---------- + x_sample : np.ndarray + A 2D array where each row represents a profile + y_sample : np.ndarray + A 2D array of the same shape as `x_sample`. + + Returns + ------- + np.ndarray + A 1D array of Pearson correlation coefficients for each row pair in + `x_sample` and `y_sample`. """ - Compute pearson correlation between two matrices in a paired row-wise - fashion. `x_sample` and `y_sample` must be of the same shape. 
- """ + # Compute the mean for each row x_mean = x_sample.mean(axis=1, keepdims=True) y_mean = y_sample.mean(axis=1, keepdims=True) + # Center the rows by subtracting the mean x_center = x_sample - x_mean y_center = y_sample - y_mean + # Compute the numerator (dot product of centered vectors) numer = (x_center * y_center).sum(axis=1) + # Compute the denominator (product of vector magnitudes) denom = (x_center**2).sum(axis=1) * (y_center**2).sum(axis=1) denom = np.sqrt(denom) + # Calculate correlation coefficients corrs = numer / denom return corrs def pairwise_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray: + """Compute cosine similarity for paired rows of two matrices. + + Parameters + ---------- + x_sample : np.ndarray + A 2D array where each row represents a profile. + y_sample : np.ndarray + A 2D array of the same shape as `x_sample`. + + Returns + ------- + np.ndarray + A 1D array of cosine similarity scores for each row pair in `x_sample` and `y_sample`. + """ + # Normalize each row to unit vectors x_norm = x_sample / np.linalg.norm(x_sample, axis=1)[:, np.newaxis] y_norm = y_sample / np.linalg.norm(y_sample, axis=1)[:, np.newaxis] + + # Compute the dot product of normalized vectors c_sim = np.sum(x_norm * y_norm, axis=1) return c_sim def pairwise_abs_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray: + """Compute the absolute cosine similarity for paired rows of two matrices. + + Parameters + ---------- + x_sample : np.ndarray + A 2D array where each row represents a profile. + y_sample : np.ndarray + A 2D array of the same shape as `x_sample`. + + Returns + ------- + np.ndarray + Absolute values of cosine similarity scores. + """ return np.abs(pairwise_cosine(x_sample, y_sample)) def pairwise_euclidean(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray: + """ + Compute the inverse Euclidean distance for paired rows of two matrices. 
+ + Parameters + ---------- + x_sample : np.ndarray + A 2D array where each row represents a profile. + y_sample : np.ndarray + A 2D array of the same shape as `x_sample`. + + Returns + ------- + np.ndarray + A 1D array of inverse Euclidean distance scores (scaled to range 0-1). + """ + # Compute Euclidean distance and scale to a range of 0 to 1 e_dist = np.sqrt(np.sum((x_sample - y_sample) ** 2, axis=1)) return 1 / (1 + e_dist) def pairwise_manhattan(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray: + """Compute the inverse Manhattan distance for paired rows of two matrices. + + Parameters + ---------- + x_sample : np.ndarray + A 2D array where each row represents a profile. + y_sample : np.ndarray + A 2D array of the same shape as `x_sample`. + + Returns + ------- + np.ndarray + A 1D array of inverse Manhattan distance scores (scaled to range 0-1). + """ m_dist = np.sum(np.abs(x_sample - y_sample), axis=1) return 1 / (1 + m_dist) def pairwise_chebyshev(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray: + """Compute the inverse Chebyshev distance for paired rows of two matrices. + + Parameters + ---------- + x_sample : np.ndarray + A 2D array where each row represents a profile. + y_sample : np.ndarray + A 2D array of the same shape as `x_sample`. + + Returns + ------- + np.ndarray + A 1D array of inverse Chebyshev distance scores (scaled to range 0-1). + """ c_dist = np.max(np.abs(x_sample - y_sample), axis=1) return 1 / (1 + c_dist) -def get_distance_fn(distance): +def get_distance_fn(distance: Union[str, Callable]) -> Callable: + """Retrieve a distance metric function based on a string identifier or custom callable. + + This function provides flexibility in specifying the distance metric to be used + for pairwise similarity or dissimilarity computations. Users can choose from a + predefined set of metrics or provide a custom callable. 
+ + Parameters + ---------- + distance : str or callable + The name of the distance metric or a custom callable function. Supported + string identifiers for predefined metrics are: + - "cosine": Cosine similarity. + - "abs_cosine": Absolute cosine similarity. + - "correlation": Pearson correlation coefficient. + - "euclidean": Inverse Euclidean distance (scaled to range 0-1). + - "manhattan": Inverse Manhattan distance (scaled to range 0-1). + - "chebyshev": Inverse Chebyshev distance (scaled to range 0-1). + + If a callable is provided, it must accept the parameters associated with each + callable function. + + Returns + ------- + callable + A function implementing the specified distance metric. + + Raises + ------ + ValueError: + If the provided `distance` is not a recognized string identifier or a valid callable. + + Example: + ------- + >>> distance_fn = get_distance_fn("cosine") + >>> similarity_scores = distance_fn(x_sample, y_sample) + """ + # Dictionary of supported distance metrics distance_metrics = { "abs_cosine": pairwise_abs_cosine, "cosine": pairwise_cosine, @@ -97,6 +274,7 @@ def get_distance_fn(distance): "chebyshev": pairwise_chebyshev, } + # If a string is provided, look up the corresponding metric function if isinstance(distance, str): if distance not in distance_metrics: raise ValueError( @@ -104,137 +282,324 @@ def get_distance_fn(distance): ) distance_fn = distance_metrics[distance] elif callable(distance): + # If a callable is provided, use it directly distance_fn = distance else: + # Raise an error if neither a string nor a callable is provided raise ValueError("Distance must be either a string or a callable object.") + # Wrap the distance function for efficient batch processing return batch_processing(distance_fn) def random_binary_matrix(n, m, k, rng): - """Generate a random binary matrix of n*m with exactly k values in 1 per row. + """Generate the indices of k values in 1 per row in a random binary n*m matrix. + Args: n: Number of rows. 
m: Number of columns. k: Number of 1's per row. - Returns: - A: Random binary matrix of n*m with exactly k values in 1 per row. + Returns + ------- + np.ndarray + Sorted column indices, of shape `(n, k)`, of the `k` ones in each row. """ - matrix = np.zeros((n, m), dtype=int) - matrix[:, :k] = 1 - rng.permuted(matrix, axis=1, out=matrix) - return matrix + dtype = np.uint16 if m < 2**16 else np.uint32 + indices = np.tile(np.arange(m, dtype=dtype), (n, 1)) + rng.permuted(indices, axis=1, out=indices) + return np.sort(indices[:, :k], axis=1) def average_precision(rel_k) -> np.ndarray: - """Compute average precision based on binary list sorted by relevance""" - tp = np.cumsum(rel_k, axis=1) - num_pos = tp[:, -1] - k = np.arange(1, rel_k.shape[1] + 1) - pr_k = tp / k - ap = (pr_k * rel_k).sum(axis=1) / num_pos - return ap + """Compute average precision based on binary list indices.""" + num_pos = rel_k.shape[1] + pr_k = np.arange(1, num_pos + 1, dtype=np.float32) / (rel_k + 1) + ap_values = pr_k.sum(axis=1) / num_pos + return ap_values.astype(np.float32) + +def ap_contiguous( + rel_k_list: np.ndarray, counts: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: + """Compute Average Precision (AP) scores from relevance labels. -def ap_contiguous(rel_k_list, counts): - """Compute average precision from a list of contiguous values""" + This function calculates Average Precision (AP) scores for each profile based on + relevance labels and their associated counts. It also returns configurations + indicating the number of positive and total pairs for each profile. + + Parameters + ---------- + rel_k_list : np.ndarray + Array of relevance labels (1 for positive pairs, 0 for negative pairs), sorted + by descending similarity within profiles. + counts : np.ndarray + Array indicating how many times each profile appears in the rank list. + + Returns + ------- + ap_scores : np.ndarray + Array of Average Precision scores for each profile. 
+ null_confs : np.ndarray + Array of configurations, where each row corresponds to: + - Number of positive pairs (`num_pos`). + - Total number of pairs (`counts`). + """ + # Convert counts into cutoff indices to segment relevance labels cutoffs = to_cutoffs(counts) - num_pos = np.add.reduceat(rel_k_list, cutoffs) + num_pos = np.add.reduceat(rel_k_list, cutoffs, dtype=np.uint32) shift = np.empty_like(num_pos) shift[0], shift[1:] = 0, num_pos[:-1] + # Calculate cumulative true positives for each profile segment tp = rel_k_list.cumsum() - np.repeat(shift.cumsum(), counts) + + # Rank positions for each relevance label, adjusted by cutoff indices k = np.arange(1, len(rel_k_list) + 1) - np.repeat(cutoffs, counts) + # Compute precision at each rank (precision = TP / rank) pr_k = tp / k + + # Calculate average precision scores for each profile ap_scores = np.add.reduceat(pr_k * rel_k_list, cutoffs) / num_pos + + # Generate configurations (number of positive and total pairs) null_confs = np.stack([num_pos, counts], axis=1) + return ap_scores, null_confs -def random_ap(num_perm: int, num_pos: int, total: int, seed) -> np.ndarray: - """Compute multiple average_precision scores generated at random""" +def random_ap(num_perm: int, num_pos: int, total: int, seed: int): + """Generate random Average Precision (AP) scores to create a null distribution. + + This function computes multiple Average Precision (AP) scores based on randomly + generated binary relevance lists. It is useful for generating a null distribution + to assess the significance of observed AP scores. + + Parameters + ---------- + num_perm : int + Number of random permutations (i.e., how many random relevance lists to generate). + num_pos : int + Number of positive samples (1's) in each relevance list. + total : int + Total number of samples (columns) in each relevance list. + seed : int + Seed for the random number generator to ensure reproducibility. 
+ + Returns + ------- + np.ndarray + A 1D array containing the Average Precision scores for each randomly + generated relevance list. + """ + # Initialize the random number generator rng = np.random.default_rng(seed) + + # Generate a binary matrix with `num_perm` rows and `total` columns, + # where each row contains exactly `num_pos` ones distributed randomly rel_k = random_binary_matrix(num_perm, total, num_pos, rng) + + # Compute Average Precision (AP) scores for each row of the binary matrix null_dist = average_precision(rel_k) return null_dist -def null_dist_cached(num_pos, total, seed, null_size, cache_dir): +def null_dist_cached( + num_pos: int, total: int, seed: int, null_size: int, cache_dir: Path +) -> np.ndarray: + """Generate or retrieve a cached null distribution for a given configuration. + + This function calculates a null distribution for a specified number of positive + pairs (`num_pos`) and total pairs (`total`). It uses caching to store and + retrieve precomputed distributions, saving time and computational resources. + + Parameters + ---------- + num_pos : int + Number of positive pairs in the configuration. + total : int + Total number of pairs (positive + negative) in the configuration. + seed : int + Random seed for reproducibility. + null_size : int + Number of samples to generate in the null distribution. + cache_dir : Path + Directory to store or retrieve cached null distributions. + + Returns + ------- + np.ndarray + Null distribution for the specified configuration. 
+ """ + # Check if a seed is provided to enable caching if seed is not None: + # Define the cache file name based on the configuration cache_file = cache_dir / f"n{total}_k{num_pos}.npy" + + # If the cache file exists, load the null distribution from it if cache_file.is_file(): null_dist = np.load(cache_file) else: + # If the cache file doesn't exist, compute the null distribution null_dist = random_ap(null_size, num_pos, total, seed) + + # Save the computed distribution to the cache np.save(cache_file, null_dist) else: + # If no seed is provided, compute the null distribution without caching null_dist = random_ap(null_size, num_pos, total, seed) + + # Return the null distribution (loaded or computed) return null_dist -def get_null_dists(confs, null_size, seed): +def get_null_dists(confs: np.ndarray, null_size: int, seed: int) -> np.ndarray: + """Generate null distributions for each configuration of positive and total pairs. + + Parameters + ---------- + confs : np.ndarray + Array where each row contains the number of positive pairs (`num_pos`) + and total pairs (`total`) for a specific configuration. + null_size : int + Number of samples to generate in the null distribution. + seed : int + Random seed for reproducibility. + + Returns + ------- + np.ndarray + A 2D array where each row corresponds to a null distribution for a specific + configuration. 
+ """ + # Define the directory for caching null distributions cache_dir = Path.home() / ".copairs" / f"seed{seed}" / f"ns{null_size}" cache_dir.mkdir(parents=True, exist_ok=True) + + # Number of configurations and random seeds for each configuration num_confs = len(confs) rng = np.random.default_rng(seed) seeds = rng.integers(8096, size=num_confs) + # Initialize an array to store null distributions null_dists = np.empty([len(confs), null_size], dtype=np.float32) + # Function to generate null distributions for each configuration def par_func(i): num_pos, total = confs[i] null_dists[i] = null_dist_cached(num_pos, total, seeds[i], null_size, cache_dir) + # Parallelize the generation of null distributions parallel_map(par_func, np.arange(num_confs)) + return null_dists def p_values(ap_scores: np.ndarray, null_confs: np.ndarray, null_size: int, seed: int): - """Calculate p values for an array of ap_scores and null configurations. It uses the path - folder to cache null calculations. + """Calculate p-values for an array of Average Precision (AP) scores using a null distribution. Parameters ---------- ap_scores : np.ndarray - Ap scores for which to calculate p value. + Array of observed AP scores for which to calculate p-values. null_confs : np.ndarray - Number of average precisions calculated. It serves as an indicator of - how relevant is the resultant score. + Configuration array indicating the relevance or context of each AP score. Used + to generate corresponding null distributions. null_size : int + Number of samples to generate in the null distribution for each configuration. seed : int - Random initializing value. - - Examples - -------- - FIXME: Add docs. - + Seed for the random number generator to ensure reproducibility of the null + distribution. + Returns + ------- + np.ndarray + An array of p-values corresponding to the input AP scores. 
""" + # Identify unique configurations and their indices confs, rev_ix = np.unique(null_confs, axis=0, return_inverse=True) + + # Generate null distributions for each unique configuration null_dists = get_null_dists(confs, null_size, seed) + + # Sort null distributions for efficient p-value computation null_dists.sort(axis=1) + + # Initialize an array to store the p-values pvals = np.empty(len(ap_scores), dtype=np.float32) + + # Compute p-values for each AP score for i, (ap_score, ix) in enumerate(zip(ap_scores, rev_ix)): - # Reverse to get from hi to low + # Find the rank of the observed AP score in the sorted null distribution num = null_size - np.searchsorted(null_dists[ix], ap_score) + + # Calculate the p-value as the proportion of null scores >= observed score pvals[i] = (num + 1) / (null_size + 1) + return pvals def concat_ranges(start: np.ndarray, end: np.ndarray) -> np.ndarray: - """Create a 1-d array concatenating multiple ranges""" + """Create a 1D array by concatenating multiple integer ranges. + + This function generates a single concatenated array from multiple ranges defined + by the `start` and `end` arrays. Each range is inclusive of `start` and exclusive + of `end`. + + Parameters + ---------- + start : np.ndarray + A 1D array of start indices for the ranges. + end : np.ndarray + A 1D array of end indices for the ranges. Must have the same shape as `start`. + + Returns + ------- + np.ndarray + A 1D array containing the concatenated ranges. 
+ """ + # Generate individual ranges using `range` for each pair of start and end slices = map(range, start, end) + + # Flatten the ranges into a single iterable slices = itertools.chain.from_iterable(slices) + + # Calculate the total length of the concatenated ranges count = (end - start).sum() + + # Create a 1D array from the concatenated ranges mask = np.fromiter(slices, dtype=np.int32, count=count) + return mask -def to_cutoffs(counts: np.ndarray): - """Convert a list of counts into cutoff indices.""" +def to_cutoffs(counts: np.ndarray) -> np.ndarray: + """Convert counts into cumulative cutoff indices. + + This function generates a 1D array of indices that mark the start of each segment + in a cumulative list. The first index is always `0`, and subsequent indices + correspond to the cumulative sum of counts up to the previous entry. + + Parameters + ---------- + counts : np.ndarray + A 1D array of counts representing the size of each segment. + + Returns + ------- + np.ndarray + A 1D array of cutoff indices where each value indicates the starting index + for the corresponding segment. + """ + # Initialize an empty array for cutoff indices cutoffs = np.empty_like(counts) - cutoffs[0], cutoffs[1:] = 0, counts.cumsum()[:-1] + + # Set the first cutoff to 0 (start of the first segment) + cutoffs[0] = 0 + + # Compute subsequent cutoffs using cumulative sums, excluding the last element + cutoffs[1:] = counts.cumsum()[:-1] + return cutoffs diff --git a/src/copairs/map/__init__.py b/src/copairs/map/__init__.py index 0e1998c..f6c77c9 100644 --- a/src/copairs/map/__init__.py +++ b/src/copairs/map/__init__.py @@ -1,3 +1,5 @@ +"""Module to compute mAP-based metrics.""" + from . 
import multilabel from .average_precision import average_precision from .map import mean_average_precision diff --git a/src/copairs/map/average_precision.py b/src/copairs/map/average_precision.py index 10b481a..8c73a35 100644 --- a/src/copairs/map/average_precision.py +++ b/src/copairs/map/average_precision.py @@ -1,5 +1,8 @@ +"""Functions to compute average precision.""" + import itertools import logging +from typing import List import numpy as np import pandas as pd @@ -12,92 +15,269 @@ logger = logging.getLogger("copairs") -def build_rank_lists(pos_pairs, neg_pairs, pos_sims, neg_sims): +def build_rank_lists( + pos_pairs: np.ndarray, + neg_pairs: np.ndarray, + pos_sims: np.ndarray, + neg_sims: np.ndarray, +): + """Build rank lists for calculating average precision. + + This function processes positive and negative pairs along with their similarity scores + to construct rank lists and determine unique profile indices with their associated counts. + + Parameters + ---------- + pos_pairs : np.ndarray + Array of positive pair indices, where each pair is represented as a pair of integers. + + neg_pairs : np.ndarray + Array of negative pair indices, where each pair is represented as a pair of integers. + + pos_sims : np.ndarray + Array of similarity scores for positive pairs. + + neg_sims : np.ndarray + Array of similarity scores for negative pairs. + + Returns + ------- + paired_ix : np.ndarray + Unique indices of profiles that appear in the rank lists. + + rel_k_list : np.ndarray + Array of relevance labels (1 for positive pairs, 0 for negative pairs) sorted by + decreasing similarity within each profile. + + counts : np.ndarray + Array of counts indicating how many times each profile index appears in the rank lists. 
+ """ + # Combine relevance labels: 1 for positive pairs, 0 for negative pairs labels = np.concatenate( [ - np.ones(pos_pairs.size, dtype=np.int32), - np.zeros(neg_pairs.size, dtype=np.int32), + np.ones(pos_pairs.size, dtype=np.uint32), + np.zeros(neg_pairs.size, dtype=np.uint32), ] ) + + # Flatten positive and negative pair indices for ranking ix = np.concatenate([pos_pairs.ravel(), neg_pairs.ravel()]) + + # Expand similarity scores to match the flattened pair indices sim_all = np.concatenate([np.repeat(pos_sims, 2), np.repeat(neg_sims, 2)]) + + # Sort by similarity (descending) and then by index (lexicographical order) + # `1 - sim_all` ensures higher similarity values appear first, prioritizing + # pairs with stronger similarity scores for ranking. + # `ix` acts as a secondary criterion, ensuring consistent ordering of pairs + # with equal similarity scores by their indices (lexicographical order). ix_sort = np.lexsort([1 - sim_all, ix]) + + # Create the rank list of relevance labels sorted by similarity and index rel_k_list = labels[ix_sort] + + # Find unique profile indices and count their occurrences in the pairs paired_ix, counts = np.unique(ix, return_counts=True) - return paired_ix, rel_k_list, counts + + return paired_ix, rel_k_list, counts.astype(np.uint32) def average_precision( - meta, - feats, - pos_sameby, - pos_diffby, - neg_sameby, - neg_diffby, - batch_size=20000, - distance="cosine", + meta: pd.DataFrame, + feats: pd.DataFrame, + pos_sameby: List[str], + pos_diffby: List[str], + neg_sameby: List[str], + neg_diffby: List[str], + batch_size: int = 20000, + distance: str = "cosine", ) -> pd.DataFrame: + """Calculate average precision (AP) scores for pairs of profiles based on their similarity. + + This function identifies positive and negative pairs of profiles using metadata + rules, computes their similarity scores, and calculates average precision + scores for each profile. 
The results include the number of positive and total pairs + for each profile. + + Parameters + ---------- + meta : pd.DataFrame + Metadata of the profiles, including columns used for defining pairs. + This DataFrame should include the columns specified in `pos_sameby`, + `pos_diffby`, `neg_sameby`, and `neg_diffby`. + + feats : np.ndarray + Feature matrix representing the profiles, where rows correspond to profiles + and columns to features. + + pos_sameby : list + Metadata columns used to define positive pairs. Two profiles are considered a + positive pair if they belong to the same group that is not a control group. + For example, replicate profiles of the same compound are positive pairs and + should share the same value in a column identifying compounds. + + pos_diffby : list + Metadata columns used to differentiate positive pairs. Positive pairs do not need + to differ in any metadata columns, so this is typically left empty. However, + if necessary (e.g., to account for batch effects), you can specify columns + such as batch identifiers. + + neg_sameby : list + Metadata columns used to define negative pairs. Typically left empty, as profiles + forming a negative pair (e.g., a compound and a DMSO/control) do not need to + share any metadata values. This ensures comparisons are made without enforcing + unnecessary constraints. + + neg_diffby : list + Metadata columns used to differentiate negative pairs. Two profiles are considered + a negative pair if one belongs to a compound group and the other to a DMSO/ + control group. They must differ in specified metadata columns, such as those + identifying the compound and the treatment index, to ensure comparisons are + only made between compounds and DMSO controls (not between different compounds). + + batch_size : int + The batch size for similarity computations to optimize memory usage. + Default is 20000. + + distance : str + The distance metric used for computing similarities. Default is "cosine". 
+ + Returns + ------- + pd.DataFrame + A DataFrame containing the following columns: + - 'average_precision': The calculated average precision score for each profile. + - 'n_pos_pairs': The number of positive pairs for each profile. + - 'n_total_pairs': The total number of pairs for each profile. + - Additional metadata columns from the input. + + Raises + ------ + UnpairedException + If no positive or negative pairs are found in the dataset. + + Notes + ----- + - Positive Pair Rules: + * Positive pairs are defined by `pos_sameby` (profiles share these metadata values) + and optionally differentiated by `pos_diffby` (profiles must differ in these metadata values if specified). + - Negative Pair Rules: + * Negative pairs are defined by `neg_diffby` (profiles differ in these metadata values) + and optionally constrained by `neg_sameby` (profiles share these metadata values if specified). + """ + # Combine all metadata columns needed for pair definitions columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby) + + # Validate and filter metadata to ensure the required columns are present and usable meta, columns = evaluate_and_filter(meta, columns) validate_pipeline_input(meta, feats, columns) + + # Get the distance function for similarity calculations (e.g., cosine) distance_fn = compute.get_distance_fn(distance) - # Critical!, otherwise the indexing wont work + # Reset metadata index for consistent indexing meta = meta.reset_index(drop=True).copy() + + # Initialize the Matcher object to find pairs based on metadata rules logger.info("Indexing metadata...") matcher = Matcher(meta, columns, seed=0) + # Identify positive pairs based on `pos_sameby` and `pos_diffby` logger.info("Finding positive pairs...") pos_pairs = matcher.get_all_pairs(sameby=pos_sameby, diffby=pos_diffby) pos_total = sum(len(p) for p in pos_pairs.values()) if pos_total == 0: raise UnpairedException("Unable to find positive pairs.") + + # Convert positive pairs to a NumPy array for 
efficient computation pos_pairs = np.fromiter( itertools.chain.from_iterable(pos_pairs.values()), - dtype=np.dtype((np.int32, 2)), + dtype=np.dtype((np.uint32, 2)), count=pos_total, ) + # Identify negative pairs based on `neg_sameby` and `neg_diffby` logger.info("Finding negative pairs...") neg_pairs = matcher.get_all_pairs(sameby=neg_sameby, diffby=neg_diffby) neg_total = sum(len(p) for p in neg_pairs.values()) if neg_total == 0: raise UnpairedException("Unable to find negative pairs.") + + # Convert negative pairs to a NumPy array for efficient computation neg_pairs = np.fromiter( itertools.chain.from_iterable(neg_pairs.values()), - dtype=np.dtype((np.int32, 2)), + dtype=np.dtype((np.uint32, 2)), count=neg_total, ) + # Compute similarities for positive pairs logger.info("Computing positive similarities...") pos_sims = distance_fn(feats, pos_pairs, batch_size) + # Compute similarities for negative pairs logger.info("Computing negative similarities...") neg_sims = distance_fn(feats, neg_pairs, batch_size) + # Build rank lists for calculating average precision logger.info("Building rank lists...") paired_ix, rel_k_list, counts = build_rank_lists( pos_pairs, neg_pairs, pos_sims, neg_sims ) + # Compute average precision scores and associated configurations logger.info("Computing average precision...") ap_scores, null_confs = compute.ap_contiguous(rel_k_list, counts) + # Add AP scores and pair counts to the metadata DataFrame logger.info("Creating result DataFrame...") meta["n_pos_pairs"] = 0 meta["n_total_pairs"] = 0 meta.loc[paired_ix, "average_precision"] = ap_scores meta.loc[paired_ix, "n_pos_pairs"] = null_confs[:, 0] meta.loc[paired_ix, "n_total_pairs"] = null_confs[:, 1] + logger.info("Finished.") return meta -def p_values(dframe: pd.DataFrame, null_size: int, seed: int): - """Compute p-values""" +def p_values(dframe: pd.DataFrame, null_size: int, seed: int) -> np.ndarray: + """Compute p-values for average precision scores based on a null distribution. 
+ + This function calculates the p-values for each profile in the input DataFrame, + comparing their average precision scores (`average_precision`) against a null + distribution generated for their specific configurations (number of positive + and total pairs). Profiles with no positive pairs are excluded from the p-value calculation. + + Parameters + ---------- + dframe : pd.DataFrame + A DataFrame containing the following columns: + - `average_precision`: The AP scores for each profile. + - `n_pos_pairs`: Number of positive pairs for each profile. + - `n_total_pairs`: Total number of pairs (positive + negative) for each profile. + null_size : int + The number of samples to generate in the null distribution for significance testing. + seed : int + Random seed for reproducibility of the null distribution. + + Returns + ------- + np.ndarray + An array of p-values for each profile in the DataFrame. Profiles with no positive + pairs will have NaN as their p-value. + """ + # Create a mask to filter profiles with at least one positive pair mask = dframe["n_pos_pairs"] > 0 + + # Initialize the p-values array with NaN for all profiles pvals = np.full(len(dframe), np.nan, dtype=np.float32) + + # Extract the average precision scores and null configurations for valid profiles scores = dframe.loc[mask, "average_precision"].values null_confs = dframe.loc[mask, ["n_pos_pairs", "n_total_pairs"]].values + + # Compute p-values for profiles with valid configurations using the null distribution pvals[mask] = compute.p_values(scores, null_confs, null_size, seed) + + # Return the array of p-values, including NaN for invalid profiles return pvals diff --git a/src/copairs/map/filter.py b/src/copairs/map/filter.py index c2956da..7106763 100644 --- a/src/copairs/map/filter.py +++ b/src/copairs/map/filter.py @@ -1,3 +1,5 @@ +"""Functions to support query-like syntax when finding the matches.""" + import itertools import re from typing import List, Tuple @@ -6,17 +8,42 @@ import pandas as 
pd -def validate_pipeline_input(meta, feats, columns): +def validate_pipeline_input( + meta: pd.DataFrame, feats: np.ndarray, columns: List[str] +) -> None: + """Validate the metadata and features for consistency and completeness. + + Parameters + ---------- + meta : pd.DataFrame + The metadata DataFrame describing the profiles. + feats : np.ndarray + The feature matrix where rows correspond to profiles in the metadata. + columns : List[str] + List of column names in the metadata to validate for null values. + + Raises + ------ + ValueError: + - If any of the specified metadata columns contain null values. + - If the number of rows in the metadata and features are not equal. + - If the feature matrix contains null values. + """ + # Check for null values in the specified metadata columns if meta[columns].isna().any(axis=None): raise ValueError("metadata columns should not have null values.") + + # Check if the number of rows in metadata matches the feature matrix if len(meta) != len(feats): - raise ValueError("meta and feats have different number of rows") + raise ValueError("Metadata and features must have the same number of rows.") + + # Check for null values in the feature matrix if np.isnan(feats).any(): raise ValueError("features should not have null values.") def flatten_str_list(*args): - """create a single list with all the params given""" + """Create a single list with all the params given.""" columns = set() for col in args: if isinstance(col, str): @@ -29,50 +56,139 @@ def flatten_str_list(*args): return columns -def evaluate_and_filter(df, columns) -> Tuple[pd.DataFrame, List[str]]: - """Evaluate queries and filter the dataframe""" +def evaluate_and_filter( + df: pd.DataFrame, columns: List[str] +) -> Tuple[pd.DataFrame, List[str]]: + """Evaluate query filters and filter the metadata DataFrame based on specified columns. 
+ + This function processes column specifications, extracts any filter conditions, + applies these conditions to the metadata DataFrame, and returns the filtered metadata + along with the updated list of columns. + + Parameters + ---------- + df : pd.DataFrame + The metadata DataFrame containing information about profiles to be filtered. + columns : List[str] + A list of metadata column names. + + Returns + ------- + Tuple[pd.DataFrame, List[str]] + - The filtered metadata DataFrame. + - The updated list of columns after processing any filter specifications. + """ + # Extract query filters from the column specifications query_list, columns = extract_filters(columns, df.columns) + + # Apply the extracted filters to the metadata DataFrame df = apply_filters(df, query_list) + + # Return the filtered metadata DataFrame and the updated list of columns return df, columns -def extract_filters(columns, df_columns) -> Tuple[List[str], List[str]]: - """Extract and validate filters from columns""" +def extract_filters( + columns: List[str], df_columns: List[str] +) -> Tuple[List[str], List[str]]: + """Extract and validate query filters from selected metadata columns. + + Parameters + ---------- + columns : List[str] + A list of selected metadata column names or query expressions. Query expressions + should follow a valid syntax (e.g., "metadata_column > 5" or "metadata_column == 'value'"). + df_columns : List[str] + All available metadata column names to validate against. + + Returns + ------- + Tuple[List[str], List[str]] + - `queries_to_eval`: A list of valid query expressions to evaluate. + - `parsed_cols`: A list of valid metadata column names extracted from the input `columns`. + + Raises + ------ + ValueError: + - If a metadata column or query expression is invalid (e.g., references a non-existent column). + - If duplicate queries are found for the same metadata column. 
+ """ + # Initialize lists to store parsed metadata column names and query expressions parsed_cols = [] queries_to_eval = [] + # Iterate through each entry in the selected metadata columns for col in columns: if col in df_columns: + # If the entry is a valid metadata column name, add it to parsed_cols parsed_cols.append(col) continue + + # Use regex to extract metadata column names from query expressions column_names = re.findall(r"(\w+)\s*[=<>!]+", col) + # Validate the extracted metadata column names against all available metadata columns valid_column_names = [col for col in column_names if col in df_columns] if not valid_column_names: - raise ValueError(f"Invalid query or column name: {col}") + raise ValueError(f"Invalid query or metadata column name: {col}") + # Add valid query expressions and associated metadata columns queries_to_eval.append(col) parsed_cols.extend(valid_column_names) + # Check for duplicate metadata columns in the parsed list if len(parsed_cols) != len(set(parsed_cols)): raise ValueError(f"Duplicate queries for column: {col}") + # Return the queries to evaluate and the parsed metadata column names return queries_to_eval, parsed_cols -def apply_filters(df, query_list): - """Combine and apply filters to dataframe""" +def apply_filters(df: pd.DataFrame, query_list: List[str]) -> pd.DataFrame: + """Combine and apply query filters to a DataFrame. + + This function takes a list of query expressions and applies them to a DataFrame + to filter its rows. If no query expressions are provided, the original DataFrame + is returned unchanged. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to which the filters will be applied. + query_list : List[str] + A list of query expressions (e.g., "column_name > 5"). These expressions + should follow the syntax supported by `pd.DataFrame.query`. + + Returns + ------- + pd.DataFrame + The DataFrame filtered based on the provided query expressions. 
+ + Raises + ------ + ValueError: + - If the combined query results in an empty DataFrame. + - If the combined query expression is invalid. + """ + # If no queries are provided, return the original DataFrame unchanged if not query_list: return df + # Combine the query expressions into a single string using logical AND (&) combined_query = " & ".join(f"({query})" for query in query_list) + try: + # Apply the combined query to filter the DataFrame df_filtered = df.query(combined_query) + + # Raise an error if the filtered DataFrame is empty if df_filtered.empty: raise ValueError(f"No data matched the query: {combined_query}") except Exception as e: + # Handle any issues with the query expression and provide feedback raise ValueError( f"Invalid combined query expression: {combined_query}. Error: {e}" ) + # Return the filtered DataFrame return df_filtered diff --git a/src/copairs/map/map.py b/src/copairs/map/map.py index 2e354cc..66d4bbf 100644 --- a/src/copairs/map/map.py +++ b/src/copairs/map/map.py @@ -1,4 +1,7 @@ +"""Functions to compute mean average precision.""" + import logging +from typing import Optional import numpy as np import pandas as pd @@ -11,26 +14,70 @@ def mean_average_precision( - ap_scores: pd.DataFrame, sameby, null_size: int, threshold: float, seed: int + ap_scores: pd.DataFrame, + sameby, + null_size: int, + threshold: float, + seed: int, + max_workers: Optional[int] = None, ) -> pd.DataFrame: + """Calculate the Mean Average Precision (mAP) score and associated p-values. + + This function computes the Mean Average Precision (mAP) score by grouping profiles + based on the specified criteria (`sameby`). It calculates the significance of mAP + scores by comparing them to a null distribution and performs multiple testing + corrections. 
+ + Parameters + ---------- + ap_scores : pd.DataFrame + DataFrame containing individual Average Precision (AP) scores and pair statistics + (e.g., number of positive pairs `n_pos_pairs` and total pairs `n_total_pairs`). + sameby : list or str + Metadata column(s) used to group profiles for mAP calculation. + null_size : int + Number of samples in the null distribution for significance testing. + threshold : float + p-value threshold for identifying significant mAP scores. + seed : int + Random seed for reproducibility. + max_workers : int, optional + Number of workers used. Default defined by tqdm's `thread_map`. + + Returns + ------- + pd.DataFrame + DataFrame with the following columns: + - `mean_average_precision`: Mean AP score for each group. + - `p_value`: p-value comparing mAP to the null distribution. + - `corrected_p_value`: Adjusted p-value after multiple testing correction. + - `below_p`: Boolean indicating if the p-value is below the threshold. + - `below_corrected_p`: Boolean indicating if the corrected p-value is below the threshold. 
+ """ + # Filter out invalid or incomplete AP scores ap_scores = ap_scores.query("~average_precision.isna() and n_pos_pairs > 0") ap_scores = ap_scores.reset_index(drop=True).copy() logger.info("Computing null_dist...") + # Extract configurations for null distribution generation null_confs = ap_scores[["n_pos_pairs", "n_total_pairs"]].values null_confs, rev_ix = np.unique(null_confs, axis=0, return_inverse=True) + + # Generate null distributions for each unique configuration null_dists = compute.get_null_dists(null_confs, null_size, seed=seed) ap_scores["null_ix"] = rev_ix + # Function to calculate the p-value for a mAP score based on the null distribution def get_p_value(params): map_score, indices = params null_dist = null_dists[rev_ix[indices]].mean(axis=0) num = (null_dist > map_score).sum() - p_value = (num + 1) / (null_size + 1) + p_value = (num + 1) / (null_size + 1) # Add 1 for stability return p_value logger.info("Computing p-values...") + # Group by the specified metadata column(s) and calculate mean AP map_scores = ap_scores.groupby(sameby, observed=True).agg( { "average_precision": ["mean", lambda x: list(x.index)], @@ -38,14 +85,20 @@ def get_p_value(params): ) map_scores.columns = ["mean_average_precision", "indices"] + # Compute p-values for each group using the null distributions params = map_scores[["mean_average_precision", "indices"]] - map_scores["p_value"] = thread_map(get_p_value, params.values, leave=False) + map_scores["p_value"] = thread_map( + get_p_value, params.values, leave=False, max_workers=max_workers + ) + + # Perform multiple testing correction on p-values reject, pvals_corrected, alphacSidak, alphacBonf = multipletests( map_scores["p_value"], method="fdr_bh" ) map_scores["corrected_p_value"] = pvals_corrected + + # Mark scores below the p-value threshold map_scores["below_p"] = map_scores["p_value"] < threshold map_scores["below_corrected_p"] = map_scores["corrected_p_value"] < threshold - map_scores.drop(columns=["indices"], 
inplace=True) - map_scores.reset_index(inplace=True) + return map_scores diff --git a/src/copairs/map/multilabel.py b/src/copairs/map/multilabel.py index ff124a3..8f17a0a 100644 --- a/src/copairs/map/multilabel.py +++ b/src/copairs/map/multilabel.py @@ -1,3 +1,5 @@ +"""Functions to compute mAP with multilabel support.""" + import itertools import logging @@ -12,7 +14,7 @@ logger = logging.getLogger("copairs") -def create_neg_query_solver(neg_pairs, neg_sims): +def _create_neg_query_solver(neg_pairs, neg_sims): # Melting and sorting by ix. neg_cutoffs splits the contiguous array neg_ix = neg_pairs.ravel() neg_sims = np.repeat(neg_sims, 2) @@ -35,7 +37,7 @@ def negs_for(query: np.ndarray): return negs_for -def build_rank_lists_multi(pos_pairs, pos_sims, pos_counts, negs_for): +def _build_rank_lists_multi(pos_pairs, pos_sims, pos_counts, negs_for): ap_scores_list, null_confs_list, ix_list = [], [], [] start = 0 @@ -48,8 +50,8 @@ def build_rank_lists_multi(pos_pairs, pos_sims, pos_counts, negs_for): neg_ix = np.repeat(query, neg_counts) labels = np.concatenate( [ - np.ones(mpos_pairs.size, dtype=np.int32), - np.zeros(len(neg_sims), dtype=np.int32), + np.ones(mpos_pairs.size, dtype=np.uint32), + np.zeros(len(neg_sims), dtype=np.uint32), ] ) @@ -76,6 +78,13 @@ def average_precision( batch_size=20000, distance="cosine", ) -> pd.DataFrame: + """ + Compute average precision with multilabel support. + + See Also + -------- + copairs.map.average_precision : Average precision without multilabel support. 
+ """ columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby) meta, columns = evaluate_and_filter(meta, columns) validate_pipeline_input(meta, feats, columns) @@ -89,13 +98,13 @@ def average_precision( logger.info("Finding positive pairs...") pos_pairs = matcher.get_all_pairs(sameby=pos_sameby, diffby=pos_diffby) pos_keys = pos_pairs.keys() - pos_counts = np.fromiter(map(len, pos_pairs.values()), dtype=np.int32) + pos_counts = np.fromiter(map(len, pos_pairs.values()), dtype=np.uint32) pos_total = sum(pos_counts) if pos_total == 0: raise UnpairedException("Unable to find positive pairs.") pos_pairs = np.fromiter( itertools.chain.from_iterable(pos_pairs.values()), - dtype=np.dtype((np.int32, 2)), + dtype=np.dtype((np.uint32, 2)), count=pos_total, ) @@ -106,7 +115,7 @@ def average_precision( raise UnpairedException("Unable to find any negative pairs.") neg_pairs = np.fromiter( itertools.chain.from_iterable(neg_pairs.values()), - dtype=np.dtype((np.int32, 2)), + dtype=np.dtype((np.uint32, 2)), count=neg_total, ) @@ -120,8 +129,8 @@ def average_precision( neg_sims = distance_fn(feats, neg_pairs, batch_size) logger.info("Computing AP per label...") - negs_for = create_neg_query_solver(neg_pairs, neg_sims) - ap_scores_list, null_confs_list, ix_list = build_rank_lists_multi( + negs_for = _create_neg_query_solver(neg_pairs, neg_sims) + ap_scores_list, null_confs_list, ix_list = _build_rank_lists_multi( pos_pairs, pos_sims, pos_counts, negs_for ) diff --git a/src/copairs/matching.py b/src/copairs/matching.py index f840b1c..4095b56 100644 --- a/src/copairs/matching.py +++ b/src/copairs/matching.py @@ -1,6 +1,4 @@ -""" -Sample pairs with given column restrictions -""" +"""Sample pairs with given column restrictions.""" import itertools import logging @@ -18,13 +16,30 @@ ColumnDict = Dict[str, ColumnList] +def assign_reference_index( + df: pd.DataFrame, + condition: Union[str, pd.Index], + reference_col: str = "Metadata_Reference_Index", + default_value: int 
= -1, + inplace: bool = False, +): + """Assign reference index to a specified column based on a given condition.""" + if not inplace: + df = df.copy() + df[reference_col] = default_value + if isinstance(condition, str): + condition = df.query(condition).index + df.loc[condition, reference_col] = condition + return df if not inplace else None + + def reverse_index(col: pd.Series) -> pd.Series: - """Build a reverse_index for a given column in the DataFrame""" + """Build a reverse_index for a given column in the DataFrame.""" return pd.Series(col.groupby(col, observed=True).indices, name=col.name) def dict_to_dframe(dict_pairs, sameby: Union[str, list]): - """Convert the Matcher.get_all_pairs output to pd.DataFrame""" + """Convert the Matcher.get_all_pairs output to pd.DataFrame.""" if not dict_pairs: raise ValueError("dict_pairs empty") keys = np.array(list(dict_pairs.keys())) @@ -46,17 +61,14 @@ def dict_to_dframe(dict_pairs, sameby: Union[str, list]): class UnpairedException(Exception): - """Exception raised when a row can not be paired with any other row in the - data""" + """Exception raised when a row can not be paired with any other row in the data.""" class Matcher: - """Class to get pair of rows given contraints in the columns""" + """Class to get pair of rows given contraints in the columns.""" def __init__(self, dframe: pd.DataFrame, columns: ColumnList, seed: int): - """ - max_size: max number of rows to consider from the same value. - """ + """max_size: max number of rows to consider from the same value.""" rng = np.random.default_rng(seed) self.original_index = dframe.index dframe = dframe[columns].reset_index(drop=True).copy() @@ -85,9 +97,7 @@ def __init__(self, dframe: pd.DataFrame, columns: ColumnList, seed: int): self.rand_iter = iter([]) def _null_sample(self, diffby_all: ColumnList, diffby_any: ColumnList): - """ - Sample a pair from the frame. 
- """ + """Sample a pair from the frame.""" valid = set(self.frozen_valid) id1 = self.integers(0, len(valid) - 1) valid.remove(id1) @@ -101,7 +111,7 @@ def _null_sample(self, diffby_all: ColumnList, diffby_any: ColumnList): return id1, id2 def sample_null_pair(self, diffby: ColumnList, n_tries=5): - """Sample pairs from the data. It tries multiple times before raising an error""" + """Sample pairs from the data. It tries multiple times before raising an error.""" if isinstance(diffby, dict): diffby_all, diffby_any = diffby.get("all", []), diffby.get("any", []) if len(diffby_any) == 1: @@ -118,6 +128,7 @@ def sample_null_pair(self, diffby: ColumnList, n_tries=5): raise ValueError("Number of tries exhusted. Could not find a valid pair") def rand_next(self): + """Get next value from the precomputed value.""" try: value = next(self.rand_iter) except StopIteration: @@ -127,9 +138,11 @@ def rand_next(self): return value def integers(self, min_val, max_val): + """Get a random integer value between the specified range.""" return int(self.rand_next() * (max_val - min_val + 1) + min_val) def choice(self, items): + """Select a random item from the given list.""" min_val, max_val = 0, len(items) - 1 pos = self.integers(min_val, max_val) return items[pos] @@ -140,9 +153,7 @@ def get_all_pairs( diffby: Union[str, ColumnList, ColumnDict], original_index: bool = True, ): - """ - Get all pairs with given params - """ + """Get all pairs with given params.""" sameby, diffby = self._normalize_sameby_diffby(sameby, diffby) sameby, diffby = self._validate_inputs(sameby, diffby) @@ -168,9 +179,7 @@ def _get_original_index(self, pairs): } def _normalize_sameby_diffby(self, sameby, diffby): - """ - Convert sameby and diffby to a consistent format: {'all': [...], 'any': [...]} - """ + """Convert sameby and diffby to a consistent format: {'all': [...], 'any': [...]}.""" keys = ["all", "any"] result = [] @@ -294,8 +303,7 @@ def _get_all_pairs_single( return pairs def _only_diffby_all(self, 
diffby_all: ColumnList): - """Generate a dict with single NaN key containing all of the pairs - with different values in the column list""" + """Generate a dict with single NaN key containing all of the pairs with different values in the column list.""" diffby_all = sorted(diffby_all, key=self.col_order.get) # Cartesian product for one of the diffby columns @@ -311,8 +319,7 @@ def _only_diffby_all(self, diffby_all: ColumnList): return {None: list(map(tuple, pairs))} def _only_diffby_any(self, diffby: ColumnList): - """Generate a dict with single NaN key containing all of the pairs - with different values in any of specififed columns""" + """Generate a dict with single NaN key containing all of the pairs with different values in any of specififed columns.""" diffby = sorted(diffby, key=self.col_order.get) pairs = [] @@ -325,8 +332,7 @@ def _only_diffby_any(self, diffby: ColumnList): return {None: list(map(tuple, pairs))} def _only_diffby_all_any(self, diffby_all: ColumnList, diffby_any: ColumnList): - """Generate a dict with single NaN key containing all of the pairs - with different values in any of specififed columns""" + """Generate a dict with single NaN key containing all of the pairs with different values in any of specififed columns.""" diffby_all_pairs = np.asarray(self._only_diffby_all(diffby_all)[None]) diffby_all_any = self._filter_pairs_by_condition( diffby_all_pairs, diffby_any, condition="any_diff" @@ -337,12 +343,12 @@ def _filter_diffby( self, idx: int, diffby_all: ColumnList, diffby_any: ColumnList, valid: Set[int] ): """ - Remove from valid rows that have matches with idx in any of the diffby columns + Remove from valid rows that have matches with idx in any of the diffby columns. + :idx: index of the row to be compared :diffby: indices of columns that should have different values :valid: candidate rows to be evaluated - :returns: subset of valid after removing indices - + :returns: subset of valid after removing indices. 
""" row = self.values[idx] for col in diffby_all: @@ -388,6 +394,12 @@ def _filter_pairs_by_condition(self, pairs, columns, condition="all_same"): class MatcherMultilabel: + """ + Class to get pair of rows given contraints in the columns. + + Support one multilabel column. + """ + def __init__( self, dframe: pd.DataFrame, columns: ColumnList, multilabel_col: str, seed: int ): @@ -400,6 +412,7 @@ def __init__( self.matcher = Matcher(dframe, columns, seed) def get_all_pairs(self, sameby: Union[str, ColumnList], diffby: ColumnList): + """Get all pairs with given params.""" diffby_multi = self.multilabel_col in diffby if diffby_multi: # Multilabel in diffby must be 'ALL' instead of 'ANY' @@ -424,11 +437,13 @@ def get_all_pairs(self, sameby: Union[str, ColumnList], diffby: ColumnList): return pairs def sample_null_pair(self, diffby: ColumnList, n_tries=5): + """Sample pairs from the data. It tries multiple times before raising an error.""" null_pair = self.matcher.sample_null_pair(diffby, n_tries) id1, id2 = self.original_index[list(null_pair)].values return id1, id2 def get_null_pairs(self, diffby: ColumnList, size: int, n_tries=5): + """Sample multiple null pairs at the same time.""" null_pairs = [] for _ in tqdm(range(size)): null_pairs.append(self.matcher.sample_null_pair(diffby, n_tries)) @@ -438,7 +453,7 @@ def get_null_pairs(self, diffby: ColumnList, size: int, n_tries=5): return null_pairs def _only_diffby_multi(self): - """Special case when it is filter only by the diffby=multilabel_col""" + """Process special case when it is filter only by the diffby=multilabel_col.""" pairs = self.get_all_pairs(self.multilabel_col, []) pairs = itertools.chain.from_iterable(pairs.values()) pairs = set(map(frozenset, pairs)) diff --git a/src/copairs/plot.py b/src/copairs/plot.py index eea7010..903672f 100644 --- a/src/copairs/plot.py +++ b/src/copairs/plot.py @@ -1,3 +1,5 @@ +"""Functions to plot percent replicating.""" + from typing import Optional from plotly import 
graph_objects as go @@ -15,9 +17,7 @@ def plot( true_dist_title="True replicates", null_dist_title="Null distribution", ) -> go.Figure: - """ - Plot two distributions and threshold(s) line. - """ + """Plot two distributions and threshold(s) line.""" # fig = go.Figure() fig = make_subplots(specs=[[{"secondary_y": True}]]) diff --git a/src/copairs/replicating.py b/src/copairs/replicating.py index 3674f42..6fae6e4 100644 --- a/src/copairs/replicating.py +++ b/src/copairs/replicating.py @@ -1,4 +1,4 @@ -"""Class for getting Percent replicating metric""" +"""Class for getting Percent replicating metric.""" from typing import List, Literal @@ -29,15 +29,17 @@ def corr_between_non_replicates( ): """ Null distribution between random "replicates". - Parameters: - ------------ + + Parameters + ---------- df: pandas.DataFrame n_samples: int n_replicates: int diffby: list of columns that should be different use_rep: which data to use from .obsm property. `None` (default) uses `adata.X` - Returns: - -------- + + Returns + ------- list-like of correlation values, with a length of `n_samples` """ matcher = Matcher(meta, diffby, seed=0) @@ -49,13 +51,15 @@ def corr_between_non_replicates( def corr_from_pairs(X: np.ndarray, pairs: dict, sameby: List[str]): """ - Correlation from a list of named pairs. Generated by Matcher.get_all_pairs - Parameters: - ----------- + Correlation from a list of named pairs. Generated by Matcher.get_all_pairs. + + Parameters + ---------- X: Matrix containing samples in rows pairs: dictionary with list of index pairs. - Returns: - -------- + + Returns + ------- list-like of correlation values and median of number of replicates """ pair_ix = np.vstack(list(pairs.values())) @@ -92,15 +96,17 @@ def corr_between_replicates( X: np.ndarray, meta: pd.DataFrame, sameby: List[str], diffby: List[str] ): """ - Correlation between replicates - Parameters: - ----------- + Correlation between replicates. 
+ + Parameters + ---------- adata: ad.AnnData sameby: Feature name to group the data frame by diffby: Feature name to force different values use_rep: which data to use from .obsm property. `None` (default) uses `adata.X` - Returns: - -------- + + Returns + ------- list-like of correlation values and median of number of replicates """ matcher = Matcher(meta, sameby + diffby, seed=0) @@ -109,18 +115,18 @@ def corr_between_replicates( class CorrelationTestResult: - """Class representing the percent replicating score. It stores distributions""" + """Class representing the percent replicating score. It stores distributions.""" def __init__(self, corr_df: pd.DataFrame, null_dist: pd.Series): - """Initialize object""" + """Initialize object.""" self.corr_df = corr_df self.corr_dist = corr_df["median"] self.null_dist = null_dist def percent_score_left(self): - """ - Calculates the percent score using the 5th percentile threshold. - :return: proportion of correlation distribution beyond the threshold and the threshold + """Calculate the percent score using the 5th percentile threshold. + + :return: proportion of correlation distribution beyond the threshold and the threshold. """ perc_5 = np.nanpercentile(self.null_dist, 5) below_threshold = self.corr_dist.dropna() < perc_5 @@ -128,8 +134,9 @@ def percent_score_left(self): def percent_score_right(self): """ - Calculates the percent score using the 95th percentile threshold. - :return: proportion of correlation distribution beyond the threshold and the threshold + Calculate the percent score using the 95th percentile threshold. + + :return: proportion of correlation distribution beyond the threshold and the threshold. """ perc_95 = np.nanpercentile(self.null_dist, 95) above_threshold = self.corr_dist.dropna() > perc_95 @@ -137,8 +144,9 @@ def percent_score_right(self): def percent_score_both(self): """ - Calculates the percent score using the 5th and 95th percentile or thresholds. 
- :return: proportion of correlation distribution beyond the thresholds and the thresholds + Calculate the percent score using the 5th and 95th percentile or thresholds. + + :return: proportion of correlation distribution beyond the thresholds and the thresholds. """ perc_95 = np.nanpercentile(self.null_dist, 95) above_threshold = self.corr_dist.dropna() > perc_95 @@ -154,6 +162,7 @@ def percent_score_both(self): ) def percent_score(self, how: Literal["left", "right", "both"]): + """Calculate percent score given the `how` criteria.""" left_th, right_th = None, None if how == "right": percent_score, right_th = self.percent_score_right() @@ -167,9 +176,7 @@ def percent_score(self, how: Literal["left", "right", "both"]): return percent_score, left_th, right_th def wasserstein_distance(self): - """ - Compute the Wasserstein distance between null and corr distributions. - """ + """Compute the Wasserstein distance between null and corr distributions.""" from scipy.stats import wasserstein_distance return wasserstein_distance(self.null_dist.values, self.corr_dist.values) @@ -182,9 +189,7 @@ def correlation_test( diffby: List[str], n_samples: int = 1000, ) -> CorrelationTestResult: - """ - Generate Null and replicate distribution for replicate correlation analysis - """ + """Generate Null and replicate distribution for replicate correlation analysis.""" corr_df, median_num_repl = corr_between_replicates(X, meta, sameby, diffby) n_replicates = min(median_num_repl, 50) @@ -202,9 +207,7 @@ def correlation_test( def correlation_test_from_pairs( X: np.ndarray, pairs: dict, null_pairs: list, sameby: list ) -> CorrelationTestResult: - """ - Generate Null and replicate distribution for replicate correlation analysis - """ + """Generate Null and replicate distribution for replicate correlation analysis.""" corr_df, median_num_repl = corr_from_pairs(X, pairs, sameby) n_replicates = min(median_num_repl, 50) null_dist = corr_from_null_pairs(X, null_pairs, n_replicates) diff --git 
a/tests/test_map.py b/tests/test_map.py index b18ca9e..1c69a4c 100644 --- a/tests/test_map.py +++ b/tests/test_map.py @@ -14,20 +14,28 @@ SEED = 0 +def binary2indices(arr: np.ndarray) -> np.ndarray: + """Convert a binary matrix to a list of indices.""" + return np.nonzero(arr)[1].reshape(arr.shape[0], -1) + + def test_random_binary_matrix(): """Test the random binary matrix generation.""" rng = np.random.default_rng(SEED) + # Test with n=3, m=4, k=2 - A = compute.random_binary_matrix(3, 4, 2, rng) - assert A.shape == (3, 4) - assert np.all(np.sum(A, axis=1) == 2) - assert np.all((A >= 0) | (A <= 1)) + indices = compute.random_binary_matrix(3, 4, 2, rng) + assert indices.shape == (3, 2) + assert np.all(indices < 4) + assert np.all(indices >= 0) + assert np.unique(indices, axis=1).shape == indices.shape # Test with n=5, m=6, k=3 - B = compute.random_binary_matrix(5, 6, 3, rng) - assert B.shape == (5, 6) - assert np.all(np.sum(B, axis=1) == 3) - assert np.all((B == 0) | (B <= 1)) + indices = compute.random_binary_matrix(5, 6, 3, rng) + assert indices.shape == (5, 3) + assert np.all(indices < 6) + assert np.all(indices >= 0) + assert np.unique(indices, axis=1).shape == indices.shape def test_compute_ap(): @@ -50,7 +58,8 @@ def test_compute_ap(): .apply(lambda x: np.array(df.y_true[0])[x]) ) rel_k = np.stack(rel_k) - ap = compute.average_precision(rel_k) + + ap = compute.average_precision(binary2indices(rel_k)) ap_sklearn = df.apply( lambda x: average_precision_score(x["y_true"], x["y_pred"]), axis=1 diff --git a/tests/test_reference_index.py b/tests/test_reference_index.py new file mode 100644 index 0000000..7820f75 --- /dev/null +++ b/tests/test_reference_index.py @@ -0,0 +1,50 @@ +"""Tests for assign reference index helper function.""" + +import pytest +import numpy as np +import pandas as pd + +from copairs.map import average_precision +from copairs.matching import assign_reference_index +from tests.helpers import simulate_random_dframe + + 
+@pytest.mark.filterwarnings("ignore:invalid value encountered in divide") +def test_assign_reference_index(): + """Test ap values are not computed for ref samples.""" + SEED = 42 + length = 200 + vocab_size = {"p": 5, "w": 3, "l": 4} + n_feats = 5 + pos_sameby = ["l"] + pos_diffby = [] + neg_sameby = [] + neg_diffby = ["l"] + rng = np.random.default_rng(SEED) + meta = simulate_random_dframe(length, vocab_size, pos_sameby, pos_diffby, rng) + # p: Plate, w: Well, l: PerturbationID, t: PerturbationType (is control?) + meta.eval("t=(l=='l1')", inplace=True) + length = len(meta) + feats = rng.uniform(size=(length, n_feats)) + + ap = average_precision( + meta, feats, pos_sameby + ["t"], pos_diffby, neg_sameby, neg_diffby + ["t"] + ) + + ap_ri = average_precision( + assign_reference_index(meta, "l=='l1'"), + feats, + pos_sameby + ["Metadata_Reference_Index"], + pos_diffby, + neg_sameby, + neg_diffby + ["Metadata_Reference_Index"], + ) + + # Check no AP values were computed for the reference samples. + assert ap_ri.query("l=='l1'").average_precision.isna().all() + + # Check AP values for all other samples are equal + pd.testing.assert_frame_equal( + ap_ri.query("l!='l1'").drop(columns="Metadata_Reference_Index"), + ap.query("l!='l1'"), + )