From 590559ecb0c28e66408683aa351ea56970fd784e Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 11 Oct 2024 11:05:23 +0000
Subject: [PATCH 01/14] filter relations for edge card; cap mp workers based on
 cores

---
 src/kg_topology_toolbox/topology_toolbox.py | 36 ++++++++++++++-------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index c3d6f5b..20981a4 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -6,7 +6,7 @@
 """
 
 from functools import cache
-
+import multiprocessing as mp
 import numpy as np
 import pandas as pd
 from scipy.sparse import coo_array
@@ -271,7 +271,7 @@ def edge_cardinality(self) -> pd.DataFrame:
         return df_res
 
     def edge_degree_cardinality_summary(
-        self, aggregate_by_r: bool = False
+        self, filter_relations: list = [], aggregate_by_r: bool = False
     ) -> pd.DataFrame:
         """
         For each edge in the KG, compute the number of edges with the same head
@@ -285,6 +285,9 @@ def edge_degree_cardinality_summary(
         The output dataframe maintains the same indexing and ordering of triples
         as the original Knowledge Graph dataframe.
 
+        :param filter_relations:
+            Compute the output only for the edges with relation in this list
+            of relation IDs.
         :param aggregate_by_r:
             If True, return metrics aggregated by relation type
             (the output DataFrame will be indexed over relation IDs).
@@ -318,6 +321,8 @@ def edge_degree_cardinality_summary(
             ],
             axis=1,
         )
+        if len(filter_relations) > 0:
+            df_res = df_res[df_res.r.isin(filter_relations)]
         # compute number of parallel edges to avoid double-counting them
         # in total degree
         num_parallel = df_res.merge(
@@ -326,7 +331,9 @@ def edge_degree_cardinality_summary(
             how="left",
         )
         df_res["tot_degree"] = (
-            df_res.h_degree + df_res.t_degree - num_parallel.n_parallel
+            df_res.h_degree.values
+            + df_res.t_degree.values
+            - num_parallel.n_parallel.values
         )
         # when restricting to the relation type, there is only one edge
         # (the edge itself) that is double-counted
@@ -344,9 +351,10 @@ def edge_degree_cardinality_summary(
     def edge_pattern_summary(
         self,
         return_metapath_list: bool = False,
-        composition_chunk_size: int = 2**8,
-        composition_workers: int = 32,
+        filter_relations: list = [],
         aggregate_by_r: bool = False,
+        composition_chunk_size: int = 2**8,
+        composition_workers: int = min(32, mp.cpu_count() - 1 or 1),
     ) -> pd.DataFrame:
         """
         Analyse structural properties of each edge in the KG:
@@ -359,14 +367,18 @@ def edge_pattern_summary(
         :param return_metapath_list:
             If True, return the list of unique metapaths for all
             triangles supported over one edge. WARNING: very expensive for large graphs.
-        :param composition_chunk_size:
-            Size of column chunks of sparse adjacency matrix
-            to compute the triangle count.
-        :param composition_workers:
-            Number of workers to compute the triangle count.
+        :param filter_relations:
+            Compute the output only for the edges with relation in this list
+            of relation IDs.
         :param aggregate_by_r:
             If True, return metrics aggregated by relation type
             (the output DataFrame will be indexed over relation IDs).
+        :param composition_chunk_size:
+            Size of column chunks of sparse adjacency matrix
+            to compute the triangle count. Default: 2**8.
+        :param composition_workers:
+            Number of workers to compute the triangle count. By default, assigned based
+            on number of available threads (max: 32).
 
         :return:
             The results dataframe. Contains the following columns
@@ -407,6 +419,8 @@ def edge_pattern_summary(
             self.df.reset_index().merge(df_inv)["index"],
             "is_symmetric",
         ] = True
+        if len(filter_relations) > 0:
+            df_res = df_res[df_res.r.isin(filter_relations)]
         # loops are treated separately
         df_res["is_loop"] = df_res.h == df_res.t
         df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False
@@ -631,7 +645,7 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame
         returned dataframe.
 
         :param min_max_norm:
-            min-max normalization of edge weights. Defaults to False.
+            min-max normalization of edge weights. Default: False.
 
         :return:
             The results dataframe. Contains the following columns:

From 4612c0ee9de125efa4c9b3545d23c5a51c043f3a Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 11 Oct 2024 16:02:05 +0000
Subject: [PATCH 02/14] add relation filter to edge methods

---
 src/kg_topology_toolbox/topology_toolbox.py | 94 +++++++++++++++------
 tests/test_edge_topology_toolbox.py         | 20 +++++
 2 files changed, 86 insertions(+), 28 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index 20981a4..b1434f7 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -407,31 +407,64 @@ def edge_pattern_summary(
             - **metapath_list** (list): The list of unique metapaths "r1-r2"
               for the directed triangles.
         """
+
+        if len(filter_relations) > 0:
+            rel_df = self.df[self.df.r.isin(filter_relations)]
+            filter_heads = rel_df.h.unique()
+            filter_tails = rel_df.t.unique()
+            filter_entities = np.union1d(filter_heads, filter_tails)
+
+            inference_df = self.df[
+                np.logical_and(
+                    self.df.h.isin(filter_heads), self.df.t.isin(filter_tails)
+                )
+            ]
+            inverse_df = self.df[
+                np.logical_and(
+                    self.df.h.isin(filter_tails), self.df.t.isin(filter_heads)
+                )
+            ]
+            df_triangles = self.df[
+                np.logical_or(
+                    self.df.h.isin(filter_heads), self.df.t.isin(filter_tails)
+                )
+            ]
+            df_triangles_und = self.df[
+                np.logical_or(
+                    self.df.h.isin(filter_entities), self.df.t.isin(filter_entities)
+                )
+            ]
+            # discard loops as edges of a triangle
+            df_triangles = df_triangles[df_triangles.h != df_triangles.t]
+            df_triangles_und = df_triangles_und[
+                df_triangles_und.h != df_triangles_und.t
+            ]
+        else:
+            rel_df = inference_df = inverse_df = self.df
+            df_triangles = df_triangles_und = self.df[self.df.h != self.df.t]
+        df_res = df_res = pd.DataFrame(
+            {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False}
+        )
         # symmetry-asymmetry
         # edges with h/t switched
-        df_inv = self.df.reindex(columns=["t", "r", "h"]).rename(
+        df_inv = inverse_df.reindex(columns=["t", "r", "h"]).rename(
             columns={"t": "h", "r": "r", "h": "t"}
         )
-        df_res = pd.DataFrame(
-            {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False}
-        )
         df_res.loc[
-            self.df.reset_index().merge(df_inv)["index"],
+            df_res.reset_index().merge(df_inv)["index"],
             "is_symmetric",
         ] = True
-        if len(filter_relations) > 0:
-            df_res = df_res[df_res.r.isin(filter_relations)]
         # loops are treated separately
         df_res["is_loop"] = df_res.h == df_res.t
         df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False
 
+        df_res = df_res.reset_index()
+
         # inverse
         unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg(
             inverse_edge_types=("r", list),
         )
-        df_res = df_res.merge(
-            unique_inv_r_by_ht, left_on=["h", "t"], right_on=["h", "t"], how="left"
-        )
+        df_res = df_res.merge(unique_inv_r_by_ht, on=["h", "t"], how="left")
         df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply(
             lambda agg: agg if isinstance(agg, list) else []
         )
@@ -446,27 +479,32 @@ def edge_pattern_summary(
         df_res["has_inverse"] = df_res["n_inverse_relations"] > 0
 
         # inference
-        edges_between_ht = unique_inv_r_by_ht.reindex(
-            columns=["t", "h", "inverse_edge_types"]
-        ).rename(
-            columns={"t": "h", "h": "t", "inverse_edge_types": "inference_edge_types"}
-        )
-        df_res = df_res.merge(
-            edges_between_ht, left_on=["h", "t"], right_on=["h", "t"], how="left"
-        )
+        if len(filter_relations) > 0:
+            edges_between_ht = inference_df.groupby(["h", "t"], as_index=False).agg(
+                inference_edge_types=("r", list),
+            )
+        else:
+            edges_between_ht = unique_inv_r_by_ht.reindex(
+                columns=["t", "h", "inverse_edge_types"]
+            ).rename(
+                columns={
+                    "t": "h",
+                    "h": "t",
+                    "inverse_edge_types": "inference_edge_types",
+                }
+            )
+        df_res = df_res.merge(edges_between_ht, on=["h", "t"], how="left")
         # inference_edge_types always contains the edge itself, which we need to drop
         df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1
         df_res["has_inference"] = df_res["n_inference_relations"] > 0
 
         # composition & metapaths
-        # discard loops as edges of a triangle
-        df_wo_loops = self.df[self.df.h != self.df.t]
         if return_metapath_list:
             # 2-hop paths
-            df_bridges = df_wo_loops.merge(
-                df_wo_loops, left_on="t", right_on="h", how="inner"
+            df_bridges = df_triangles.merge(
+                df_triangles, left_on="t", right_on="h", how="inner"
             )
-            df_triangles = df_wo_loops.merge(
+            df_triangles = df_triangles.merge(
                 df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner"
             )
             df_triangles["metapath"] = (
@@ -479,8 +517,7 @@ def edge_pattern_summary(
             )
             df_res = df_res.merge(
                 grouped_triangles,
-                left_on=["h", "r", "t"],
-                right_on=["h", "r", "t"],
+                on=["h", "r", "t"],
                 how="left",
             )
             df_res["metapath_list"] = df_res["metapath_list"].apply(
@@ -489,7 +526,7 @@ def edge_pattern_summary(
             df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
         else:
             counts = composition_count(
-                df_wo_loops,
+                df_triangles,
                 chunk_size=composition_chunk_size,
                 workers=composition_workers,
                 directed=True,
@@ -504,7 +541,7 @@ def edge_pattern_summary(
         df_res["has_composition"] = df_res["n_triangles"] > 0
 
         counts = composition_count(
-            df_wo_loops,
+            df_triangles_und,
             chunk_size=composition_chunk_size,
             workers=composition_workers,
             directed=False,
@@ -519,7 +556,7 @@ def edge_pattern_summary(
         )
         df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0
 
-        df_res = df_res[
+        df_res = df_res.set_index("index")[
             [
                 "h",
                 "r",
@@ -539,6 +576,7 @@ def edge_pattern_summary(
             ]
             + (["metapath_list"] if return_metapath_list else [])
         ]
+        df_res.index.name = None
 
         return aggregate_by_relation(df_res) if aggregate_by_r else df_res
 
diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py
index eaba81a..d24977d 100644
--- a/tests/test_edge_topology_toolbox.py
+++ b/tests/test_edge_topology_toolbox.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2023 Graphcore Ltd. All rights reserved.
 
+from functools import partial
 import numpy as np
 import pandas as pd
 import pytest
@@ -85,3 +86,22 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None:
     assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0])
     if return_metapath_list:
         assert res["metapath_list"][2] == ["0-1", "1-1"]
+
+
+def test_filter_relations() -> None:
+    for rels in [[0], [1], [0, 1]]:
+        for method in [
+            kgtt.edge_degree_cardinality_summary,
+            partial(kgtt.edge_pattern_summary, return_metapath_list=True),
+        ]:
+            # compare outputs of standard method call and filtered call
+            res_all = method()
+            res_all = res_all[res_all.r.isin(rels)]
+            res_filtered = method(filter_relations=rels)
+            assert np.all(res_all.index.values == res_filtered.index.values)
+            for c in res_all.columns:
+                if c == "metapath_list":
+                    for a, b in zip(res_all[c].values, res_filtered[c].values):
+                        assert a == b
+                else:
+                    assert np.all(res_all[c].values == res_filtered[c].values)

From 586e816b7f3431e02c1b7d072a174089d711e3d8 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 11 Oct 2024 16:17:29 +0000
Subject: [PATCH 03/14] ci fix: sort imports, add list[int] hints, tidy docstrings

---
 src/kg_topology_toolbox/topology_toolbox.py | 21 ++++++++++-----------
 tests/test_edge_topology_toolbox.py         |  5 +++--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index b1434f7..9af3323 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -5,8 +5,9 @@
 Topology toolbox main functionalities
 """
 
-from functools import cache
 import multiprocessing as mp
+from functools import cache
+
 import numpy as np
 import pandas as pd
 from scipy.sparse import coo_array
@@ -271,7 +272,7 @@ def edge_cardinality(self) -> pd.DataFrame:
         return df_res
 
     def edge_degree_cardinality_summary(
-        self, filter_relations: list = [], aggregate_by_r: bool = False
+        self, filter_relations: list[int] = [], aggregate_by_r: bool = False
     ) -> pd.DataFrame:
         """
         For each edge in the KG, compute the number of edges with the same head
@@ -286,8 +287,8 @@ def edge_degree_cardinality_summary(
         as the original Knowledge Graph dataframe.
 
         :param filter_relations:
-            Compute the output only for the edges with relation in this list
-            of relation IDs.
+            If not empty, compute the output only for the edges with relation
+            in this list of relation IDs.
         :param aggregate_by_r:
             If True, return metrics aggregated by relation type
             (the output DataFrame will be indexed over relation IDs).
@@ -331,9 +332,7 @@ def edge_degree_cardinality_summary(
             how="left",
         )
         df_res["tot_degree"] = (
-            df_res.h_degree.values
-            + df_res.t_degree.values
-            - num_parallel.n_parallel.values
+            df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values
         )
         # when restricting to the relation type, there is only one edge
         # (the edge itself) that is double-counted
@@ -351,7 +350,7 @@ def edge_degree_cardinality_summary(
     def edge_pattern_summary(
         self,
         return_metapath_list: bool = False,
-        filter_relations: list = [],
+        filter_relations: list[int] = [],
         aggregate_by_r: bool = False,
         composition_chunk_size: int = 2**8,
         composition_workers: int = min(32, mp.cpu_count() - 1 or 1),
@@ -366,10 +365,10 @@ def edge_pattern_summary(
 
         :param return_metapath_list:
             If True, return the list of unique metapaths for all
-            triangles supported over one edge. WARNING: very expensive for large graphs.
+            triangles supported over each edge. WARNING: very expensive for large graphs.
         :param filter_relations:
-            Compute the output only for the edges with relation in this list
-            of relation IDs.
+            If not empty, compute the output only for the edges with relation
+            in this list of relation IDs.
         :param aggregate_by_r:
             If True, return metrics aggregated by relation type
             (the output DataFrame will be indexed over relation IDs).
diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py
index d24977d..6e0b6be 100644
--- a/tests/test_edge_topology_toolbox.py
+++ b/tests/test_edge_topology_toolbox.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2023 Graphcore Ltd. All rights reserved.
 
 from functools import partial
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -95,9 +96,9 @@ def test_filter_relations() -> None:
             partial(kgtt.edge_pattern_summary, return_metapath_list=True),
         ]:
             # compare outputs of standard method call and filtered call
-            res_all = method()
+            res_all = method()  # type: ignore
             res_all = res_all[res_all.r.isin(rels)]
-            res_filtered = method(filter_relations=rels)
+            res_filtered = method(filter_relations=rels)  # type: ignore
             assert np.all(res_all.index.values == res_filtered.index.values)
             for c in res_all.columns:
                 if c == "metapath_list":

From 432615fdb34df028593c1700a5d87e3876bef888 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Mon, 14 Oct 2024 11:30:07 +0000
Subject: [PATCH 04/14] metapath tweaks: hoist loop removal, restrict 2-hop join

---
 src/kg_topology_toolbox/topology_toolbox.py | 36 ++++++++++++---------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index 9af3323..eb97472 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -407,6 +407,8 @@ def edge_pattern_summary(
               for the directed triangles.
         """
 
+        # discard loops as edges of a triangle
+        df_wo_loops = self.df[self.df.h != self.df.t]
         if len(filter_relations) > 0:
             rel_df = self.df[self.df.r.isin(filter_relations)]
             filter_heads = rel_df.h.unique()
@@ -423,24 +425,24 @@ def edge_pattern_summary(
                     self.df.h.isin(filter_tails), self.df.t.isin(filter_heads)
                 )
             ]
-            df_triangles = self.df[
+            df_triangles_out = df_wo_loops[df_wo_loops.h.isin(filter_heads)]
+            df_triangles_in = df_wo_loops[df_wo_loops.t.isin(filter_tails)]
+            df_triangles = df_wo_loops[
                 np.logical_or(
-                    self.df.h.isin(filter_heads), self.df.t.isin(filter_tails)
+                    df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
                 )
             ]
-            df_triangles_und = self.df[
+            df_triangles_und = df_wo_loops[
                 np.logical_or(
-                    self.df.h.isin(filter_entities), self.df.t.isin(filter_entities)
+                    df_wo_loops.h.isin(filter_entities),
+                    df_wo_loops.t.isin(filter_entities),
                 )
             ]
-            # discard loops as edges of a triangle
-            df_triangles = df_triangles[df_triangles.h != df_triangles.t]
-            df_triangles_und = df_triangles_und[
-                df_triangles_und.h != df_triangles_und.t
-            ]
         else:
             rel_df = inference_df = inverse_df = self.df
-            df_triangles = df_triangles_und = self.df[self.df.h != self.df.t]
+            df_triangles = df_triangles_und = df_triangles_out = df_triangles_in = (
+                df_wo_loops
+            )
         df_res = df_res = pd.DataFrame(
             {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False}
         )
@@ -500,16 +502,18 @@ def edge_pattern_summary(
         # composition & metapaths
         if return_metapath_list:
             # 2-hop paths
-            df_bridges = df_triangles.merge(
-                df_triangles, left_on="t", right_on="h", how="inner"
+            df_bridges = df_triangles_out.merge(
+                df_triangles_in, left_on="t", right_on="h", how="inner"
             )
-            df_triangles = df_triangles.merge(
+            df_res_triangles = df_res[df_res.h != df_res.t].merge(
                 df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner"
             )
-            df_triangles["metapath"] = (
-                df_triangles["r_x"].astype(str) + "-" + df_triangles["r_y"].astype(str)
+            df_res_triangles["metapath"] = (
+                df_res_triangles["r_x"].astype(str)
+                + "-"
+                + df_res_triangles["r_y"].astype(str)
             )
-            grouped_triangles = df_triangles.groupby(
+            grouped_triangles = df_res_triangles.groupby(
                 ["h", "r", "t"], as_index=False
             ).agg(
                 n_triangles=("metapath", "count"), metapath_list=("metapath", "unique")

From 7c6d2578417daf060c653d7af0cbfea1771ef494 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Tue, 15 Oct 2024 17:44:18 +0000
Subject: [PATCH 05/14] refactor metapath counting with sparse matmuls

---
 src/kg_topology_toolbox/topology_toolbox.py |  82 ++++++++++++----
 src/kg_topology_toolbox/utils.py            | 101 ++++++++++++++++----
 2 files changed, 142 insertions(+), 41 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index eb97472..366bf6c 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -271,6 +271,56 @@ def edge_cardinality(self) -> pd.DataFrame:
             ).astype(str)
         return df_res
 
+    def edge_metapath_count(
+        self,
+        filter_relations: list[int] = [],
+        composition_chunk_size: int = 2**8,
+        composition_workers: int = min(32, mp.cpu_count() - 1 or 1),
+    ) -> pd.DataFrame:
+        """
+        For each edge in the KG, compute the number of triangles of different
+        metapaths (i.e., the unique tuples (r1, r2) of relation types
+        of the two additional edges of the triangle).
+
+        :param filter_relations:
+            If not empty, compute the output only for the edges with relation
+            in this list of relation IDs.
+        :param composition_chunk_size:
+            Size of column chunks of sparse adjacency matrix
+            to compute the triangle count. Default: 2**8.
+        :param composition_workers:
+            Number of workers to compute the triangle count. By default, assigned based
+            on number of available threads (max: 32).
+
+        :return:
+            The output dataframe has one row for each (h, t, r1, r2) such that
+            there exists at least one triangle of metapath (r1, r2) over (any) edge
+            connecting h, t.
+            The number of metapath triangles is given in the column **n_triangles**.
+        """
+        # discard loops as edges of a triangle
+        df_wo_loops = self.df[self.df.h != self.df.t]
+        if len(filter_relations) > 0:
+            rel_df = self.df[self.df.r.isin(filter_relations)]
+            filter_heads = rel_df.h.unique()
+            filter_tails = rel_df.t.unique()
+            df_triangles = df_wo_loops[
+                np.logical_or(
+                    df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
+                )
+            ]
+        else:
+            rel_df = self.df
+            df_triangles = df_wo_loops
+
+        return composition_count(
+            df_triangles,
+            chunk_size=composition_chunk_size,
+            workers=composition_workers,
+            metapaths=True,
+            directed=True,
+        )
+
     def edge_degree_cardinality_summary(
         self, filter_relations: list[int] = [], aggregate_by_r: bool = False
     ) -> pd.DataFrame:
@@ -425,8 +475,6 @@ def edge_pattern_summary(
                     self.df.h.isin(filter_tails), self.df.t.isin(filter_heads)
                 )
             ]
-            df_triangles_out = df_wo_loops[df_wo_loops.h.isin(filter_heads)]
-            df_triangles_in = df_wo_loops[df_wo_loops.t.isin(filter_tails)]
             df_triangles = df_wo_loops[
                 np.logical_or(
                     df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
@@ -440,9 +488,7 @@ def edge_pattern_summary(
             ]
         else:
             rel_df = inference_df = inverse_df = self.df
-            df_triangles = df_triangles_und = df_triangles_out = df_triangles_in = (
-                df_wo_loops
-            )
+            df_triangles = df_triangles_und = df_wo_loops
         df_res = df_res = pd.DataFrame(
             {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False}
         )
@@ -501,30 +547,24 @@ def edge_pattern_summary(
 
         # composition & metapaths
         if return_metapath_list:
-            # 2-hop paths
-            df_bridges = df_triangles_out.merge(
-                df_triangles_in, left_on="t", right_on="h", how="inner"
+            counts = self.edge_metapath_count(
+                filter_relations,
+                composition_chunk_size,
+                composition_workers,
             )
-            df_res_triangles = df_res[df_res.h != df_res.t].merge(
-                df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner"
+            counts["metapath"] = (
+                counts["r1"].astype(str) + "-" + counts["r2"].astype(str)
             )
-            df_res_triangles["metapath"] = (
-                df_res_triangles["r_x"].astype(str)
-                + "-"
-                + df_res_triangles["r_y"].astype(str)
-            )
-            grouped_triangles = df_res_triangles.groupby(
-                ["h", "r", "t"], as_index=False
-            ).agg(
-                n_triangles=("metapath", "count"), metapath_list=("metapath", "unique")
+            grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg(
+                n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list)
             )
             df_res = df_res.merge(
                 grouped_triangles,
-                on=["h", "r", "t"],
+                on=["h", "t"],
                 how="left",
             )
             df_res["metapath_list"] = df_res["metapath_list"].apply(
-                lambda agg: agg.tolist() if isinstance(agg, np.ndarray) else []
+                lambda agg: agg if isinstance(agg, list) else []
             )
             df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
         else:
diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py
index d3a3d55..363416a 100644
--- a/src/kg_topology_toolbox/utils.py
+++ b/src/kg_topology_toolbox/utils.py
@@ -188,22 +188,40 @@ def jaccard_similarity(
 
 
 def _composition_count_worker(
-    adj_csr: csr_array, adj_csc: csc_array, tail_shift: int = 0
+    adj_csr: csr_array, adj_csc: csc_array, adj_mask: csc_array, tail_shift: int = 0
 ) -> pd.DataFrame:
+    n_nodes = adj_csr.shape[1]
+    n_rels = adj_csr.shape[0] // n_nodes
     adj_2hop = adj_csr @ adj_csc
-    adj_composition = (adj_2hop.tocsc() * (adj_csc > 0)).tocoo()
-    df_composition = pd.DataFrame(
-        dict(
-            h=adj_composition.row,
-            t=adj_composition.col + tail_shift,
-            n_triangles=adj_composition.data,
+    adj_composition = (adj_2hop.tocsc() * (adj_mask > 0)).tocoo()
+    col_shift = adj_composition.col + tail_shift
+    if n_rels > 1:
+        df_composition = pd.DataFrame(
+            dict(
+                h=adj_composition.row // n_rels,
+                t=col_shift % n_nodes,
+                r1=adj_composition.row % n_rels,
+                r2=col_shift // n_nodes,
+                n_triangles=adj_composition.data,
+            )
+        )
+    else:
+        df_composition = pd.DataFrame(
+            dict(
+                h=adj_composition.row,
+                t=col_shift,
+                n_triangles=adj_composition.data,
+            )
         )
-    )
     return df_composition
 
 
 def composition_count(
-    df: pd.DataFrame, chunk_size: int, workers: int, directed: bool = True
+    df: pd.DataFrame,
+    chunk_size: int,
+    workers: int,
+    metapaths: bool = False,
+    directed: bool = True,
 ) -> pd.DataFrame:
     """A helper function to compute the composition count of a graph.
 
@@ -227,15 +245,48 @@ def composition_count(
     """
 
     n_nodes = df[["h", "t"]].max().max() + 1
-    adj = coo_array(
-        (np.ones(len(df)), (df.h, df.t)),
-        shape=[n_nodes, n_nodes],
-    ).astype(np.uint16)
-    if not directed:
-        adj = adj + adj.T
-    n_cols = adj.shape[1]
-    adj_csr = adj.tocsr()
-    adj_csc = adj.tocsc()
+    n_rels = df["r"].max() + 1
+    if metapaths:
+        adj_repeated = csc_array(
+            (
+                np.ones(n_rels * n_rels * len(df)),
+                (
+                    (n_rels * df.h.values[:, None] + np.arange(n_rels)).repeat(n_rels),
+                    np.tile(
+                        df.t.values[:, None] + n_nodes * np.arange(n_rels), n_rels
+                    ).flatten(),
+                ),
+            ),
+            shape=[n_nodes * n_rels, n_nodes * n_rels],
+        ).astype(np.uint16)
+        adj_csr = csr_array(
+            (np.ones(len(df)), (df.h * n_rels + df.r, df.t)),
+            shape=[n_nodes * n_rels, n_nodes],
+        ).astype(np.uint16)
+        adj_csc = csc_array(
+            (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)),
+            shape=[n_nodes, n_nodes * n_rels],
+        ).astype(np.uint16)
+        n_cols = adj_csc.shape[1]
+        adj_repeated_slices = {
+            i: adj_repeated[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)]
+            for i in range(int(np.ceil(n_cols / chunk_size)))
+        }
+        if not directed:
+            raise NotImplementedError(
+                "Metapath counting only implemented for directed triangles"
+            )
+    else:
+        adj = coo_array(
+            (np.ones(len(df)), (df.h, df.t)),
+            shape=[n_nodes, n_nodes],
+        ).astype(np.uint16)
+        if not directed:
+            adj = adj + adj.T
+        adj_csr = adj.tocsr()
+        adj_csc = adj.tocsc()
+        n_cols = adj_csc.shape[1]
+
     adj_csc_slices = {
         i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)]
         for i in range(int(np.ceil(n_cols / chunk_size)))
@@ -246,13 +297,23 @@ def composition_count(
             df_composition_list = pool.starmap(
                 _composition_count_worker,
                 (
-                    (adj_csr, adj_csc_slice, i * chunk_size)
+                    (
+                        adj_csr,
+                        adj_csc_slice,
+                        adj_repeated_slices[i] if metapaths else adj_csc_slice,
+                        i * chunk_size,
+                    )
                     for i, adj_csc_slice in adj_csc_slices.items()
                 ),
             )
     else:
         df_composition_list = [
-            _composition_count_worker(adj_csr, adj_csc_slice, i * chunk_size)
+            _composition_count_worker(
+                adj_csr,
+                adj_csc_slice,
+                adj_repeated_slices[i] if metapaths else adj_csc_slice,
+                i * chunk_size,
+            )
             for i, adj_csc_slice in adj_csc_slices.items()
         ]
 

From fadc967090439e5f57981ff8b7b499da0a52c2d2 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Tue, 15 Oct 2024 17:46:04 +0000
Subject: [PATCH 06/14] docstring update

---
 src/kg_topology_toolbox/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py
index 363416a..0160d1c 100644
--- a/src/kg_topology_toolbox/utils.py
+++ b/src/kg_topology_toolbox/utils.py
@@ -233,8 +233,11 @@ def composition_count(
         processed together.
     :param workers:
         Number of workers processing chunks concurrently
+    :param metapaths:
+        If True, the number of composition is computed separately for each
+        unique metapath.
     :param directed:
-        Boolean flag. If false, bidirectional edges are considered for
+        If False, bidirectional edges are considered for
         triangles by adding the adjacency matrix and its transposed. Default: True.
 
     :return:

From c49e88692749a809af23a926a6580d4e531718de Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Wed, 16 Oct 2024 09:01:40 +0000
Subject: [PATCH 07/14] use np.divmod

---
 src/kg_topology_toolbox/utils.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py
index 0160d1c..8194572 100644
--- a/src/kg_topology_toolbox/utils.py
+++ b/src/kg_topology_toolbox/utils.py
@@ -194,14 +194,15 @@ def _composition_count_worker(
     n_rels = adj_csr.shape[0] // n_nodes
     adj_2hop = adj_csr @ adj_csc
     adj_composition = (adj_2hop.tocsc() * (adj_mask > 0)).tocoo()
-    col_shift = adj_composition.col + tail_shift
     if n_rels > 1:
+        h, r1 = np.divmod(adj_composition.row, n_rels)
+        r2, t = np.divmod(adj_composition.col + tail_shift, n_nodes)
         df_composition = pd.DataFrame(
             dict(
-                h=adj_composition.row // n_rels,
-                t=col_shift % n_nodes,
-                r1=adj_composition.row % n_rels,
-                r2=col_shift // n_nodes,
+                h=h,
+                t=t,
+                r1=r1,
+                r2=r2,
                 n_triangles=adj_composition.data,
             )
         )
@@ -209,7 +210,7 @@ def _composition_count_worker(
         df_composition = pd.DataFrame(
             dict(
                 h=adj_composition.row,
-                t=col_shift,
+                t=adj_composition.col + tail_shift,
                 n_triangles=adj_composition.data,
             )
         )

From ea845abe2bc6643ee018e12a723889edebef9092 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Wed, 16 Oct 2024 09:10:59 +0000
Subject: [PATCH 08/14] avoid repeated work

---
 src/kg_topology_toolbox/topology_toolbox.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index 366bf6c..5def422 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -547,10 +547,12 @@ def edge_pattern_summary(
 
         # composition & metapaths
         if return_metapath_list:
-            counts = self.edge_metapath_count(
-                filter_relations,
-                composition_chunk_size,
-                composition_workers,
+            counts = composition_count(
+                df_triangles,
+                chunk_size=composition_chunk_size,
+                workers=composition_workers,
+                metapaths=True,
+                directed=True,
             )
             counts["metapath"] = (
                 counts["r1"].astype(str) + "-" + counts["r2"].astype(str)

From 05e3ee3ed6a26a387bc52389c4ba88986c30ed4f Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Wed, 16 Oct 2024 10:47:12 +0000
Subject: [PATCH 09/14] add metapath unit test

---
 src/kg_topology_toolbox/topology_toolbox.py | 17 ++++++++++-------
 tests/test_edge_topology_toolbox.py         | 21 ++++++++++++++++-----
 tests/test_node_topology_toolbox.py         |  5 +----
 tests/test_relation_topology_toolbox.py     |  5 +----
 4 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index 5def422..faec2a9 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -278,9 +278,9 @@ def edge_metapath_count(
         composition_workers: int = min(32, mp.cpu_count() - 1 or 1),
     ) -> pd.DataFrame:
         """
-        For each edge in the KG, compute the number of triangles of different
-        metapaths (i.e., the unique tuples (r1, r2) of relation types
-        of the two additional edges of the triangle).
+        For each edge in the KG, compute the number of triangles supported on it
+        distinguishing between different metapaths (i.e., the unique tuples (r1, r2)
+        of relation types of the two additional edges of the triangle).
 
         :param filter_relations:
             If not empty, compute the output only for the edges with relation
@@ -293,10 +293,11 @@ def edge_metapath_count(
             on number of available threads (max: 32).
 
         :return:
-            The output dataframe has one row for each (h, t, r1, r2) such that
-            there exists at least one triangle of metapath (r1, r2) over (any) edge
-            connecting h, t.
+            The output dataframe has one row for each (h, r, t, r1, r2) such that
+            there exists at least one triangle of metapath (r1, r2) over (h, r, t).
             The number of metapath triangles is given in the column **n_triangles**.
+            The column **index** provides the index of the edge (h, r, t) in the
+            original Knowledge Graph dataframe.
         """
         # discard loops as edges of a triangle
         df_wo_loops = self.df[self.df.h != self.df.t]
@@ -313,7 +314,7 @@ def edge_metapath_count(
             rel_df = self.df
             df_triangles = df_wo_loops
 
-        return composition_count(
+        counts = composition_count(
             df_triangles,
             chunk_size=composition_chunk_size,
             workers=composition_workers,
@@ -321,6 +322,8 @@ def edge_metapath_count(
             directed=True,
         )
 
+        return rel_df.reset_index().merge(counts, on=["h", "t"], how="inner")
+
     def edge_degree_cardinality_summary(
         self, filter_relations: list[int] = [], aggregate_by_r: bool = False
     ) -> pd.DataFrame:
diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py
index 6e0b6be..49bfa5e 100644
--- a/tests/test_edge_topology_toolbox.py
+++ b/tests/test_edge_topology_toolbox.py
@@ -22,12 +22,19 @@
 )
 
 
-@pytest.mark.parametrize("return_metapath_list", [True, False])
-def test_small_graph_metrics(return_metapath_list: bool) -> None:
-    # Define a small graph with all the features tested by
-    # the edge_topology_toolbox
+def test_edge_metapath_count() -> None:
+    res = kgtt.edge_metapath_count()
+    assert np.allclose(res["index"], [2, 2])
+    assert np.allclose(res["h"], [0, 0])
+    assert np.allclose(res["r"], [0, 0])
+    assert np.allclose(res["t"], [2, 2])
+    assert np.allclose(res["r1"], [0, 1])
+    assert np.allclose(res["r2"], [1, 1])
+    assert np.allclose(res["n_triangles"], [1, 1])
+
 
-    # entity degrees statistics
+def test_edge_degree_cardinality_summary() -> None:
+    # edge degrees statistics
     res = kgtt.edge_degree_cardinality_summary()
     assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2])
     assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3])
@@ -60,6 +67,9 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None:
         "M:M",
     ]
 
+
+@pytest.mark.parametrize("return_metapath_list", [True, False])
+def test_edge_pattern_summary(return_metapath_list: bool) -> None:
     # relation pattern symmetry
     res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list)
     assert np.allclose(
@@ -92,6 +102,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None:
 def test_filter_relations() -> None:
     for rels in [[0], [1], [0, 1]]:
         for method in [
+            kgtt.edge_metapath_count,
             kgtt.edge_degree_cardinality_summary,
             partial(kgtt.edge_pattern_summary, return_metapath_list=True),
         ]:
diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py
index 18d87ed..d002b41 100644
--- a/tests/test_node_topology_toolbox.py
+++ b/tests/test_node_topology_toolbox.py
@@ -19,10 +19,7 @@
 
 
 @pytest.mark.parametrize("return_relation_list", [True, False])
-def test_small_graph_metrics(return_relation_list: bool) -> None:
-    # Define a small graph with all the features tested by
-    # the node_topology_toolbox
-
+def test_node_degree_summary(return_relation_list: bool) -> None:
     # entity degrees statistics
     res = kgtt.node_degree_summary(return_relation_list=return_relation_list)
     assert np.allclose(res["h_degree"], [3, 1, 3])
diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py
index 3f0c05c..e527a5f 100644
--- a/tests/test_relation_topology_toolbox.py
+++ b/tests/test_relation_topology_toolbox.py
@@ -20,10 +20,7 @@
 kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T")
 
 
-def test_small_graph_metrics() -> None:
-    # Define a small graph on five nodes with all the features tested by
-    # the relation_topology_toolbox
-
+def test_aggregate_by_r() -> None:
     dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True)
     eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True)
 

From 5ec0f3e07d9ab8fddc4607d474af2b8455115b93 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 18 Oct 2024 17:21:56 +0000
Subject: [PATCH 10/14] reduce memory usage of metapath counting

---
 src/kg_topology_toolbox/utils.py    | 70 +++++++++++++++++------------
 tests/test_edge_topology_toolbox.py | 13 +++---
 2 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py
index 8194572..d2e798f 100644
--- a/src/kg_topology_toolbox/utils.py
+++ b/src/kg_topology_toolbox/utils.py
@@ -193,7 +193,7 @@ def _composition_count_worker(
     n_nodes = adj_csr.shape[1]
     n_rels = adj_csr.shape[0] // n_nodes
     adj_2hop = adj_csr @ adj_csc
-    adj_composition = (adj_2hop.tocsc() * (adj_mask > 0)).tocoo()
+    adj_composition = (adj_2hop.tocsc() * adj_mask).tocoo()
     if n_rels > 1:
         h, r1 = np.divmod(adj_composition.row, n_rels)
         r2, t = np.divmod(adj_composition.col + tail_shift, n_nodes)
@@ -250,19 +250,15 @@ def composition_count(
 
     n_nodes = df[["h", "t"]].max().max() + 1
     n_rels = df["r"].max() + 1
+    adj = coo_array(
+        (np.ones(len(df)), (df.h, df.t)),
+        shape=[n_nodes, n_nodes],
+    ).astype(np.uint16)
     if metapaths:
-        adj_repeated = csc_array(
-            (
-                np.ones(n_rels * n_rels * len(df)),
-                (
-                    (n_rels * df.h.values[:, None] + np.arange(n_rels)).repeat(n_rels),
-                    np.tile(
-                        df.t.values[:, None] + n_nodes * np.arange(n_rels), n_rels
-                    ).flatten(),
-                ),
-            ),
-            shape=[n_nodes * n_rels, n_nodes * n_rels],
-        ).astype(np.uint16)
+        if not directed:
+            raise NotImplementedError(
+                "Metapath counting only implemented for directed triangles"
+            )
         adj_csr = csr_array(
             (np.ones(len(df)), (df.h * n_rels + df.r, df.t)),
             shape=[n_nodes * n_rels, n_nodes],
@@ -271,26 +267,24 @@ def composition_count(
             (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)),
             shape=[n_nodes, n_nodes * n_rels],
         ).astype(np.uint16)
-        n_cols = adj_csc.shape[1]
-        adj_repeated_slices = {
-            i: adj_repeated[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)]
-            for i in range(int(np.ceil(n_cols / chunk_size)))
-        }
-        if not directed:
-            raise NotImplementedError(
-                "Metapath counting only implemented for directed triangles"
-            )
+        # boolean mask to filter results with only the edges in the KG
+        msk = csc_array(
+            (
+                [True] * (len(adj.data) * n_rels),
+                (
+                    (n_rels * adj.row + np.arange(n_rels)[:, None]).flatten(),
+                    np.tile(adj.col, n_rels),
+                ),
+            ),
+            shape=[n_nodes * n_rels, n_nodes],
+        )
     else:
-        adj = coo_array(
-            (np.ones(len(df)), (df.h, df.t)),
-            shape=[n_nodes, n_nodes],
-        ).astype(np.uint16)
         if not directed:
             adj = adj + adj.T
         adj_csr = adj.tocsr()
         adj_csc = adj.tocsc()
-        n_cols = adj_csc.shape[1]
 
+    n_cols = adj_csc.shape[1]
     adj_csc_slices = {
         i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)]
         for i in range(int(np.ceil(n_cols / chunk_size)))
@@ -304,7 +298,16 @@ def composition_count(
                     (
                         adj_csr,
                         adj_csc_slice,
-                        adj_repeated_slices[i] if metapaths else adj_csc_slice,
+                        (
+                            # relevant slice of mask (with wraparound)
+                            msk[
+                                :,
+                                (i * chunk_size + np.arange(adj_csc_slice.shape[1]))
+                                % msk.shape[1],
+                            ]
+                            if metapaths
+                            else adj_csc_slice > 0
+                        ),
                         i * chunk_size,
                     )
                     for i, adj_csc_slice in adj_csc_slices.items()
@@ -315,7 +318,16 @@ def composition_count(
             _composition_count_worker(
                 adj_csr,
                 adj_csc_slice,
-                adj_repeated_slices[i] if metapaths else adj_csc_slice,
+                (
+                    # relevant slice of mask (with wraparound)
+                    msk[
+                        :,
+                        (i * chunk_size + np.arange(adj_csc_slice.shape[1]))
+                        % msk.shape[1],
+                    ]
+                    if metapaths
+                    else adj_csc_slice > 0
+                ),
                 i * chunk_size,
             )
             for i, adj_csc_slice in adj_csc_slices.items()
diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py
index 49bfa5e..849bd5c 100644
--- a/tests/test_edge_topology_toolbox.py
+++ b/tests/test_edge_topology_toolbox.py
@@ -23,13 +23,14 @@
 
 
 def test_edge_metapath_count() -> None:
-    res = kgtt.edge_metapath_count()
+    res = kgtt.edge_metapath_count(composition_chunk_size=3)
     assert np.allclose(res["index"], [2, 2])
     assert np.allclose(res["h"], [0, 0])
     assert np.allclose(res["r"], [0, 0])
     assert np.allclose(res["t"], [2, 2])
-    assert np.allclose(res["r1"], [0, 1])
-    assert np.allclose(res["r2"], [1, 1])
+    assert set(zip(res["r1"].values.tolist(), res["r2"].values.tolist())) == set(
+        [(0, 1), (1, 1)]
+    )
     assert np.allclose(res["n_triangles"], [1, 1])
 
 
@@ -71,7 +72,9 @@ def test_edge_degree_cardinality_summary() -> None:
 @pytest.mark.parametrize("return_metapath_list", [True, False])
 def test_edge_pattern_summary(return_metapath_list: bool) -> None:
     # relation pattern symmetry
-    res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list)
+    res = kgtt.edge_pattern_summary(
+        return_metapath_list=return_metapath_list, composition_chunk_size=3
+    )
     assert np.allclose(
         res["is_loop"], [False, False, False, False, False, False, True, True]
     )
@@ -96,7 +99,7 @@ def test_edge_pattern_summary(return_metapath_list: bool) -> None:
     assert np.allclose(res["n_triangles"], [0, 0, 2, 0, 0, 0, 0, 0])
     assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0])
     if return_metapath_list:
-        assert res["metapath_list"][2] == ["0-1", "1-1"]
+        assert set(res["metapath_list"][2]) == set(["0-1", "1-1"])
 
 
 def test_filter_relations() -> None:

From ce6f369f12d3a9388f536feae62902a3f696e8f8 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 25 Oct 2024 14:19:04 +0000
Subject: [PATCH 11/14] improve docstrings

---
 src/kg_topology_toolbox/topology_toolbox.py | 34 +++++++++---
 src/kg_topology_toolbox/utils.py            | 57 ++++++++++++++++-----
 2 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index faec2a9..c503d44 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -258,8 +258,6 @@ def edge_cardinality(self) -> pd.DataFrame:
         # check if the values in the pair (h_degree, t_degree) are =1 or >1
         # to determine the edge cardinality
         for suffix in ["", "_same_rel"]:
-            # check if the values in the pair (h_degree, t_degree) are =1 or >1
-            # to determine the edge cardinality
             edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + (
                 tail_degree["t_degree" + suffix] == 1
             )
@@ -279,15 +277,16 @@ def edge_metapath_count(
     ) -> pd.DataFrame:
         """
         For each edge in the KG, compute the number of triangles supported on it
-        distinguishing between different metapaths (i.e., the unique tuples (r1, r2)
-        of relation types of the two additional edges of the triangle).
+        distinguishing between different metapaths (i.e., the unique ordered tuples
+        (r1, r2) of relation types of the two additional edges of the triangle).
 
         :param filter_relations:
             If not empty, compute the output only for the edges with relation
             in this list of relation IDs.
         :param composition_chunk_size:
             Size of column chunks of sparse adjacency matrix
-            to compute the triangle count. Default: 2**8.
+            to compute the triangle count. Reduce the parameter if running OOM.
+            Default: 2**8.
         :param composition_workers:
             Number of workers to compute the triangle count. By default, assigned based
             on number of available threads (max: 32).
@@ -303,8 +302,11 @@ def edge_metapath_count(
         df_wo_loops = self.df[self.df.h != self.df.t]
         if len(filter_relations) > 0:
             rel_df = self.df[self.df.r.isin(filter_relations)]
+            # unique heads and tails used by filtered edges
             filter_heads = rel_df.h.unique()
             filter_tails = rel_df.t.unique()
+            # the only relevant edges for triangles are the ones with head in the
+            # set of filtered heads, or tail in the set of filtered tails
             df_triangles = df_wo_loops[
                 np.logical_or(
                     df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
@@ -387,7 +389,7 @@ def edge_degree_cardinality_summary(
         df_res["tot_degree"] = (
             df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values
         )
-        # when restricting to the relation type, there is only one edge
+        # when restricting to the same relation type, there is only one edge
         # (the edge itself) that is double-counted
         df_res["tot_degree_same_rel"] = (
             df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1
@@ -427,7 +429,8 @@ def edge_pattern_summary(
             (the output DataFrame will be indexed over relation IDs).
         :param composition_chunk_size:
             Size of column chunks of sparse adjacency matrix
-            to compute the triangle count. Default: 2**8.
+            to compute the triangle count. Reduce the parameter if running OOM.
+            Default: 2**8.
         :param composition_workers:
             Number of workers to compute the triangle count. By default, assigned based
             on number of available threads (max: 32).
@@ -464,10 +467,11 @@ def edge_pattern_summary(
         df_wo_loops = self.df[self.df.h != self.df.t]
         if len(filter_relations) > 0:
             rel_df = self.df[self.df.r.isin(filter_relations)]
+            # unique heads and tails used by filtered edges
             filter_heads = rel_df.h.unique()
             filter_tails = rel_df.t.unique()
             filter_entities = np.union1d(filter_heads, filter_tails)
-
+            # restrict relevant edges to count inference/inverse patterns
             inference_df = self.df[
                 np.logical_and(
                     self.df.h.isin(filter_heads), self.df.t.isin(filter_tails)
@@ -478,11 +482,15 @@ def edge_pattern_summary(
                     self.df.h.isin(filter_tails), self.df.t.isin(filter_heads)
                 )
             ]
+            # the only relevant edges for triangles are the ones with head in the
+            # set of filtered heads, or tail in the set of filtered tails
             df_triangles = df_wo_loops[
                 np.logical_or(
                     df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
                 )
             ]
+            # for undirected triangles, heads and tails can be any of the
+            # filtered entities
             df_triangles_und = df_wo_loops[
                 np.logical_or(
                     df_wo_loops.h.isin(filter_entities),
@@ -557,9 +565,12 @@ def edge_pattern_summary(
                 metapaths=True,
                 directed=True,
             )
+            # turn (r1, r2) into "r1-r2" string for metapaths
             counts["metapath"] = (
                 counts["r1"].astype(str) + "-" + counts["r2"].astype(str)
             )
+            # count triangles (summing over all metapaths between two nodes)
+            # and list unique metapaths for each head and tail node pair
             grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg(
                 n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list)
             )
@@ -568,6 +579,7 @@ def edge_pattern_summary(
                 on=["h", "t"],
                 how="left",
             )
+            # if no triangles are present over an edge, set metapath list to []
             df_res["metapath_list"] = df_res["metapath_list"].apply(
                 lambda agg: agg if isinstance(agg, list) else []
             )
@@ -588,6 +600,7 @@ def edge_pattern_summary(
 
         df_res["has_composition"] = df_res["n_triangles"] > 0
 
+        # undirected composition
         counts = composition_count(
             df_triangles_und,
             chunk_size=composition_chunk_size,
@@ -658,6 +671,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame:
             - **jaccard_both** (float): Jaccard similarity between the full entity set
               of r1 and r2.
         """
+        # set of unique heads/tails/any for each relation
         ent_unique = self.df.groupby("r", as_index=False).agg(
             num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique")
         )
@@ -674,6 +688,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame:
             suffixes=["_r1", "_r2"],
             how="cross",
         )
+        # order doesn't matter
         df_res = df_res[df_res.r1 < df_res.r2]
 
         df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"]
@@ -748,15 +763,18 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame
         # normalize by global t frequency
         rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum")
 
+        # sparse matrix of (h,r) pair frequency
         E_h = coo_array(
             (hr_freqs.t, (hr_freqs.h, hr_freqs.r)),
             shape=[self.n_entity, self.n_rel],
         )
+        # sparse matrix of (t,r) pair frequency
         E_t = coo_array(
             (rt_freqs.h, (rt_freqs.t, rt_freqs.r)),
             shape=[self.n_entity, self.n_rel],
         )
 
+        # adjacency matrix of relation graph
         A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray()
         A[np.diag_indices_from(A)] = 0
 
diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py
index d2e798f..5bc7ac7 100644
--- a/src/kg_topology_toolbox/utils.py
+++ b/src/kg_topology_toolbox/utils.py
@@ -133,6 +133,7 @@ def aggregate_by_relation(edge_topology_df: pd.DataFrame) -> pd.DataFrame:
         elif col_dtype == object:
             if isinstance(edge_topology_df[col].iloc[0], str):
                 for label in np.unique(edge_topology_df[col]):
+                    # fraction of rows for each label
                     df_res[f"{col}_{label}_frac"] = (
                         edge_topology_df[edge_topology_df[col] == label]
                         .groupby("r")[col]
@@ -188,15 +189,34 @@ def jaccard_similarity(
 
 
 def _composition_count_worker(
-    adj_csr: csr_array, adj_csc: csc_array, adj_mask: csc_array, tail_shift: int = 0
+    adj_csr: csr_array,
+    adj_csc_slice: csc_array,
+    adj_mask_slice: csc_array,
+    slice_tail_shift: int,
 ) -> pd.DataFrame:
+    """
+    Masked sparse matmul to count triangles over graph edges.
+
+    :param adj_csr: shape (n_nodes * n_rels, n_nodes) if distinguishing between
+        metapaths, (n_nodes, n_nodes) otherwise
+    :param adj_csc_slice: shape (n_nodes, chunk_size)
+    :param adj_mask_slice: same shape as the product adj_csr @ adj_csc_slice
+    :param slice_tail_shift: column shift of the vertical slice
+
+    :return:
+        Pandas dataframe of triangle counts.
+    """
     n_nodes = adj_csr.shape[1]
     n_rels = adj_csr.shape[0] // n_nodes
-    adj_2hop = adj_csr @ adj_csc
-    adj_composition = (adj_2hop.tocsc() * adj_mask).tocoo()
+    # 2-hop count
+    adj_2hop = adj_csr @ adj_csc_slice
+    # mask out (h,t) pairs not connected by edges
+    adj_composition = (adj_2hop.tocsc() * adj_mask_slice).tocoo()
     if n_rels > 1:
+        # distinguish between metapaths
+        # unflatten results
         h, r1 = np.divmod(adj_composition.row, n_rels)
-        r2, t = np.divmod(adj_composition.col + tail_shift, n_nodes)
+        r2, t = np.divmod(adj_composition.col + slice_tail_shift, n_nodes)
         df_composition = pd.DataFrame(
             dict(
                 h=h,
@@ -207,10 +227,11 @@ def _composition_count_worker(
             )
         )
     else:
+        # don't distinguish between metapaths
         df_composition = pd.DataFrame(
             dict(
                 h=adj_composition.row,
-                t=adj_composition.col + tail_shift,
+                t=adj_composition.col + slice_tail_shift,
                 n_triangles=adj_composition.data,
             )
         )
@@ -224,7 +245,7 @@ def composition_count(
     metapaths: bool = False,
     directed: bool = True,
 ) -> pd.DataFrame:
-    """A helper function to compute the composition count of a graph.
+    """Compute composition count of a graph.
 
     :param df:
         A graph represented as a pd.DataFrame. Must contain the columns
@@ -235,30 +256,35 @@ def composition_count(
     :param workers:
         Number of workers processing chunks concurrently
     :param metapaths:
-        If True, the number of composition is computed separately for each
+        If True, the number of compositions is computed separately for each
         unique metapath.
     :param directed:
         If False, bidirectional edges are considered for
-        triangles by adding the adjacency matrix and its transposed. Default: True.
+        triangles, by adding the adjacency matrix and its transposed. Default: True.
 
     :return:
         The results dataframe. Contains the following columns:
         - **h** (int): Index of the head entity.
         - **t** (int): Index of the tail entity.
-        - **n_triangles** (int): Number of compositions for the (h, t) edge.
+        - **n_triangles** (int): Number of compositions for any edge between (h, t).
     """
 
     n_nodes = df[["h", "t"]].max().max() + 1
     n_rels = df["r"].max() + 1
+    # sparse graph adjacency matrix, counting number of edges between each pair of nodes
     adj = coo_array(
         (np.ones(len(df)), (df.h, df.t)),
         shape=[n_nodes, n_nodes],
     ).astype(np.uint16)
+
     if metapaths:
         if not directed:
             raise NotImplementedError(
                 "Metapath counting only implemented for directed triangles"
             )
+        # relation-aware adjacency matrix, flattened to 2D for sparse implementation
+        # (adj_csr @ adj_csc).reshape(n_nodes, n_rels, n_rels, n_nodes)[h,r1,r2,t] counts
+        # the number of 2-hop paths of metapath (r1, r2) between h and t
         adj_csr = csr_array(
             (np.ones(len(df)), (df.h * n_rels + df.r, df.t)),
             shape=[n_nodes * n_rels, n_nodes],
@@ -267,7 +293,8 @@ def composition_count(
             (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)),
             shape=[n_nodes, n_nodes * n_rels],
         ).astype(np.uint16)
-        # boolean mask to filter results with only the edges in the KG
+        # boolean mask to filter results, keep only triangles over (h,t) pairs connected
+        # by at least one edge (equivalent to flattened adj[:,None,None,:] > 0)
         msk = csc_array(
             (
                 [True] * (len(adj.data) * n_rels),
@@ -280,18 +307,22 @@ def composition_count(
         )
     else:
         if not directed:
+            # add inverse edges for undirected compositions
             adj = adj + adj.T
+        # (adj_csr @ adj_csc)[h,t] counts the number of 2-hop paths between h and t;
+        # the boolean mask here is simply adj_csc > 0
         adj_csr = adj.tocsr()
         adj_csc = adj.tocsc()
 
+    # to compute (adj_csr @ adj_csc) * msk, serialize over vertical slices of adj_csc
     n_cols = adj_csc.shape[1]
     adj_csc_slices = {
         i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)]
         for i in range(int(np.ceil(n_cols / chunk_size)))
     }
-
     if len(adj_csc_slices) > 1 and workers > 1:
         with Pool(workers) as pool:
+            # workers are assigned different adj_csc slices
             df_composition_list = pool.starmap(
                 _composition_count_worker,
                 (
@@ -299,7 +330,7 @@ def composition_count(
                         adj_csr,
                         adj_csc_slice,
                         (
-                            # relevant slice of mask (with wraparound)
+                            # relevant slice of boolean mask (with wraparound)
                             msk[
                                 :,
                                 (i * chunk_size + np.arange(adj_csc_slice.shape[1]))
@@ -319,7 +350,7 @@ def composition_count(
                 adj_csr,
                 adj_csc_slice,
                 (
-                    # relevant slice of mask (with wraparound)
+                    # relevant slice of boolean mask (with wraparound)
                     msk[
                         :,
                         (i * chunk_size + np.arange(adj_csc_slice.shape[1]))

From d6effae8a3583d70847fda079740a753cd09385c Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 25 Oct 2024 15:01:17 +0000
Subject: [PATCH 12/14] document new functionalities in the doc notebook

---
 docs/source/notebooks/ogb_biokg_demo.ipynb | 316 ++++++++++++++++++++-
 1 file changed, 304 insertions(+), 12 deletions(-)

diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb
index bb73448..dc13ff5 100644
--- a/docs/source/notebooks/ogb_biokg_demo.ipynb
+++ b/docs/source/notebooks/ogb_biokg_demo.ipynb
@@ -22,9 +22,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Found existing installation: kg-topology-toolbox 0.1.0\n",
-      "Uninstalling kg-topology-toolbox-0.1.0:\n",
-      "  Successfully uninstalled kg-topology-toolbox-0.1.0\n"
+      "Found existing installation: kg-topology-toolbox 1.0.0\n",
+      "Uninstalling kg-topology-toolbox-1.0.0:\n",
+      "  Successfully uninstalled kg-topology-toolbox-1.0.0\n"
      ]
     }
    ],
@@ -37,7 +37,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -63,7 +63,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -181,7 +181,7 @@
        "[5088434 rows x 3 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -209,14 +209,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/topology_toolbox.py:64: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n",
+      "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/utils.py:42: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n",
       "  warnings.warn(\n"
      ]
     }
@@ -232,13 +232,77 @@
     "Notice the warning raised by the constructor, which detects duplicated edges in the `biokg_df` DataFrame: to ensure optimal functionalities, duplicated edges should be removed before instantiating the `KGTopologyToolbox` class."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>h</th>\n",
+       "      <th>r</th>\n",
+       "      <th>t</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3854407</th>\n",
+       "      <td>1972</td>\n",
+       "      <td>45</td>\n",
+       "      <td>1972</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4000534</th>\n",
+       "      <td>1972</td>\n",
+       "      <td>45</td>\n",
+       "      <td>1972</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            h   r     t\n",
+       "3854407  1972  45  1972\n",
+       "4000534  1972  45  1972"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# find duplicated edges\n",
+    "biokg_df.loc[biokg_df.duplicated(keep=False)]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Node-level analysis\n",
     "\n",
-    "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned dataframe is indexed on the node ID.\n",
+    "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned DataFrame is indexed on the node ID.\n",
     "\n",
     "- `h_degree` is the number of edges coming out from the node;\n",
     "- `t_degree` is the number of edges going into the node;\n",
@@ -894,7 +958,7 @@
     "\n",
     "![image info](../images/edge_patterns.png)\n",
     "\n",
-    "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided (the unique metapaths `[r_1, r_2]` can also be listed by setting `return_metapath_list=True` when calling the method)."
+    "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided."
    ]
   },
   {
@@ -1210,6 +1274,15 @@
     "edge_eps"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we need to identify the different metapaths `[r_1, r_2]` that give triangles `(h,r_1,x) - (x,r_2,t)` over an edge `(h,r,t)`, we can do so by setting `return_metapath_list=True` in the call to `edge_pattern_summary`. In order to disaggregate the total number of triangles over an edge into separate counts for each existing metapath, the `edge_metapath_count` method should be used instead.\n",
+    "\n",
+    "We can now easily produce a global view of the distribution of topological properties."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 12,
@@ -1277,6 +1350,225 @@
     "plt.tight_layout()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Filtering relation types\n",
+    "\n",
+    "The edge-level methods presented in the previous section simultaneously compute statistics for all edges in the KG, and this can be expensive on larger graphs. Moreover, in many practical cases the user might be interested in looking only at the properties of edges of one or a few specific relation types.\n",
+    "\n",
+    "The methods `edge_degree_cardinality_summary`, `edge_pattern_summary` and `edge_metapath_count` can be passed a list of relation type IDs to restrict computations of their outputs to edges of those specific relation types."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index</th>\n",
+       "      <th>h</th>\n",
+       "      <th>r</th>\n",
+       "      <th>t</th>\n",
+       "      <th>r1</th>\n",
+       "      <th>r2</th>\n",
+       "      <th>n_triangles</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>334382</td>\n",
+       "      <td>732</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1225</td>\n",
+       "      <td>41</td>\n",
+       "      <td>2</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>334382</td>\n",
+       "      <td>732</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1225</td>\n",
+       "      <td>39</td>\n",
+       "      <td>2</td>\n",
+       "      <td>123</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>334382</td>\n",
+       "      <td>732</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1225</td>\n",
+       "      <td>38</td>\n",
+       "      <td>2</td>\n",
+       "      <td>200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>334382</td>\n",
+       "      <td>732</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1225</td>\n",
+       "      <td>37</td>\n",
+       "      <td>2</td>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>334382</td>\n",
+       "      <td>732</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1225</td>\n",
+       "      <td>36</td>\n",
+       "      <td>2</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>732149</th>\n",
+       "      <td>4953327</td>\n",
+       "      <td>1529</td>\n",
+       "      <td>24</td>\n",
+       "      <td>2492</td>\n",
+       "      <td>13</td>\n",
+       "      <td>41</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>732150</th>\n",
+       "      <td>4953327</td>\n",
+       "      <td>1529</td>\n",
+       "      <td>24</td>\n",
+       "      <td>2492</td>\n",
+       "      <td>11</td>\n",
+       "      <td>41</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>732151</th>\n",
+       "      <td>4953327</td>\n",
+       "      <td>1529</td>\n",
+       "      <td>24</td>\n",
+       "      <td>2492</td>\n",
+       "      <td>6</td>\n",
+       "      <td>41</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>732152</th>\n",
+       "      <td>4953327</td>\n",
+       "      <td>1529</td>\n",
+       "      <td>24</td>\n",
+       "      <td>2492</td>\n",
+       "      <td>4</td>\n",
+       "      <td>41</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>732153</th>\n",
+       "      <td>4953327</td>\n",
+       "      <td>1529</td>\n",
+       "      <td>24</td>\n",
+       "      <td>2492</td>\n",
+       "      <td>2</td>\n",
+       "      <td>41</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>732154 rows × 7 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          index     h   r     t  r1  r2  n_triangles\n",
+       "0        334382   732   7  1225  41   2           10\n",
+       "1        334382   732   7  1225  39   2          123\n",
+       "2        334382   732   7  1225  38   2          200\n",
+       "3        334382   732   7  1225  37   2           27\n",
+       "4        334382   732   7  1225  36   2            6\n",
+       "...         ...   ...  ..   ...  ..  ..          ...\n",
+       "732149  4953327  1529  24  2492  13  41            2\n",
+       "732150  4953327  1529  24  2492  11  41            2\n",
+       "732151  4953327  1529  24  2492   6  41            2\n",
+       "732152  4953327  1529  24  2492   4  41            1\n",
+       "732153  4953327  1529  24  2492   2  41            2\n",
+       "\n",
+       "[732154 rows x 7 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filtered_metapath_counts = kgtt.edge_metapath_count(filter_relations=[7, 24])\n",
+    "filtered_metapath_counts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The previous cell computes the number of triangles of each existing `(r1, r2)` metapath, but only over `(h,r,t)` edges of the two relation types with IDs 7 and 24 (the column `index` gives the index of the edge in the `biokg_df` DataFrame). This is the same as calling `kgtt.edge_metapath_count().query('r==7 or r==24')`, but the computation is much cheaper and faster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "r\n",
+       "24    413366\n",
+       "7     318788\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filtered_metapath_counts.r.value_counts()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2267,7 +2559,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv38",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -2281,7 +2573,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

From 2a71340a8f241d50f91175ea3350cfb0cb204e07 Mon Sep 17 00:00:00 2001
From: Daniel Justus <danielj@graphcore.ai>
Date: Fri, 8 Nov 2024 14:25:42 +0000
Subject: [PATCH 13/14] tidy up redundant code

---
 src/kg_topology_toolbox/topology_toolbox.py | 24 +++++++--------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index c503d44..0c1e73b 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -557,14 +557,14 @@ def edge_pattern_summary(
         df_res["has_inference"] = df_res["n_inference_relations"] > 0
 
         # composition & metapaths
+        counts = composition_count(
+            df_triangles,
+            chunk_size=composition_chunk_size,
+            workers=composition_workers,
+            metapaths=return_metapath_list,
+            directed=True,
+        )
         if return_metapath_list:
-            counts = composition_count(
-                df_triangles,
-                chunk_size=composition_chunk_size,
-                workers=composition_workers,
-                metapaths=True,
-                directed=True,
-            )
             # turn (r1, r2) into "r1-r2" string for metapaths
             counts["metapath"] = (
                 counts["r1"].astype(str) + "-" + counts["r2"].astype(str)
@@ -583,21 +583,13 @@ def edge_pattern_summary(
             df_res["metapath_list"] = df_res["metapath_list"].apply(
                 lambda agg: agg if isinstance(agg, list) else []
             )
-            df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
         else:
-            counts = composition_count(
-                df_triangles,
-                chunk_size=composition_chunk_size,
-                workers=composition_workers,
-                directed=True,
-            )
             df_res = df_res.merge(
                 counts,
                 on=["h", "t"],
                 how="left",
             )
-            df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
-
+        df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
         df_res["has_composition"] = df_res["n_triangles"] > 0
 
         # undirected composition

From 9862ce0e10857e814907da9d0aab2432915658f0 Mon Sep 17 00:00:00 2001
From: Alberto Cattaneo <albertoc@graphcore.ai>
Date: Fri, 8 Nov 2024 14:48:29 +0000
Subject: [PATCH 14/14] fix typo

---
 src/kg_topology_toolbox/topology_toolbox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index 0c1e73b..5ae642e 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -500,7 +500,7 @@ def edge_pattern_summary(
         else:
             rel_df = inference_df = inverse_df = self.df
             df_triangles = df_triangles_und = df_wo_loops
-        df_res = df_res = pd.DataFrame(
+        df_res = pd.DataFrame(
             {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False}
         )
         # symmetry-asymmetry