From 590559ecb0c28e66408683aa351ea56970fd784e Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 11 Oct 2024 11:05:23 +0000 Subject: [PATCH 01/14] filter relations for edge card; cap mp workers based on cores --- src/kg_topology_toolbox/topology_toolbox.py | 36 ++++++++++++++------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index c3d6f5b..20981a4 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -6,7 +6,7 @@ """ from functools import cache - +import multiprocessing as mp import numpy as np import pandas as pd from scipy.sparse import coo_array @@ -271,7 +271,7 @@ def edge_cardinality(self) -> pd.DataFrame: return df_res def edge_degree_cardinality_summary( - self, aggregate_by_r: bool = False + self, filter_relations: list = [], aggregate_by_r: bool = False ) -> pd.DataFrame: """ For each edge in the KG, compute the number of edges with the same head @@ -285,6 +285,9 @@ def edge_degree_cardinality_summary( The output dataframe maintains the same indexing and ordering of triples as the original Knowledge Graph dataframe. + :param filter_relations: + Compute the output only for the edges with relation in this list + of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). 
@@ -318,6 +321,8 @@ def edge_degree_cardinality_summary( ], axis=1, ) + if len(filter_relations) > 0: + df_res = df_res[df_res.r.isin(filter_relations)] # compute number of parallel edges to avoid double-counting them # in total degree num_parallel = df_res.merge( @@ -326,7 +331,9 @@ def edge_degree_cardinality_summary( how="left", ) df_res["tot_degree"] = ( - df_res.h_degree + df_res.t_degree - num_parallel.n_parallel + df_res.h_degree.values + + df_res.t_degree.values + - num_parallel.n_parallel.values ) # when restricting to the relation type, there is only one edge # (the edge itself) that is double-counted @@ -344,9 +351,10 @@ def edge_degree_cardinality_summary( def edge_pattern_summary( self, return_metapath_list: bool = False, - composition_chunk_size: int = 2**8, - composition_workers: int = 32, + filter_relations: list = [], aggregate_by_r: bool = False, + composition_chunk_size: int = 2**8, + composition_workers: int = min(32, mp.cpu_count() - 1 or 1), ) -> pd.DataFrame: """ Analyse structural properties of each edge in the KG: @@ -359,14 +367,18 @@ def edge_pattern_summary( :param return_metapath_list: If True, return the list of unique metapaths for all triangles supported over one edge. WARNING: very expensive for large graphs. - :param composition_chunk_size: - Size of column chunks of sparse adjacency matrix - to compute the triangle count. - :param composition_workers: - Number of workers to compute the triangle count. + :param filter_relations: + Compute the output only for the edges with relation in this list + of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix + to compute the triangle count. Default: 2**8. + :param composition_workers: + Number of workers to compute the triangle count. By default, assigned based + on number of available threads (max: 32). 
:return: The results dataframe. Contains the following columns @@ -407,6 +419,8 @@ def edge_pattern_summary( self.df.reset_index().merge(df_inv)["index"], "is_symmetric", ] = True + if len(filter_relations) > 0: + df_res = df_res[df_res.r.isin(filter_relations)] # loops are treated separately df_res["is_loop"] = df_res.h == df_res.t df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False @@ -631,7 +645,7 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame returned dataframe. :param min_max_norm: - min-max normalization of edge weights. Defaults to False. + min-max normalization of edge weights. Default: False. :return: The results dataframe. Contains the following columns: From 4612c0ee9de125efa4c9b3545d23c5a51c043f3a Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 11 Oct 2024 16:02:05 +0000 Subject: [PATCH 02/14] add relation filter to edge methods --- src/kg_topology_toolbox/topology_toolbox.py | 94 +++++++++++++++------ tests/test_edge_topology_toolbox.py | 20 +++++ 2 files changed, 86 insertions(+), 28 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 20981a4..b1434f7 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -407,31 +407,64 @@ def edge_pattern_summary( - **metapath_list** (list): The list of unique metapaths "r1-r2" for the directed triangles. 
""" + + if len(filter_relations) > 0: + rel_df = self.df[self.df.r.isin(filter_relations)] + filter_heads = rel_df.h.unique() + filter_tails = rel_df.t.unique() + filter_entities = np.union1d(filter_heads, filter_tails) + + inference_df = self.df[ + np.logical_and( + self.df.h.isin(filter_heads), self.df.t.isin(filter_tails) + ) + ] + inverse_df = self.df[ + np.logical_and( + self.df.h.isin(filter_tails), self.df.t.isin(filter_heads) + ) + ] + df_triangles = self.df[ + np.logical_or( + self.df.h.isin(filter_heads), self.df.t.isin(filter_tails) + ) + ] + df_triangles_und = self.df[ + np.logical_or( + self.df.h.isin(filter_entities), self.df.t.isin(filter_entities) + ) + ] + # discard loops as edges of a triangle + df_triangles = df_triangles[df_triangles.h != df_triangles.t] + df_triangles_und = df_triangles_und[ + df_triangles_und.h != df_triangles_und.t + ] + else: + rel_df = inference_df = inverse_df = self.df + df_triangles = df_triangles_und = self.df[self.df.h != self.df.t] + df_res = df_res = pd.DataFrame( + {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False} + ) # symmetry-asymmetry # edges with h/t switched - df_inv = self.df.reindex(columns=["t", "r", "h"]).rename( + df_inv = inverse_df.reindex(columns=["t", "r", "h"]).rename( columns={"t": "h", "r": "r", "h": "t"} ) - df_res = pd.DataFrame( - {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False} - ) df_res.loc[ - self.df.reset_index().merge(df_inv)["index"], + df_res.reset_index().merge(df_inv)["index"], "is_symmetric", ] = True - if len(filter_relations) > 0: - df_res = df_res[df_res.r.isin(filter_relations)] # loops are treated separately df_res["is_loop"] = df_res.h == df_res.t df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False + df_res = df_res.reset_index() + # inverse unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg( inverse_edge_types=("r", list), ) - df_res = df_res.merge( - unique_inv_r_by_ht, left_on=["h", "t"], right_on=["h", "t"], 
how="left" - ) + df_res = df_res.merge(unique_inv_r_by_ht, on=["h", "t"], how="left") df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply( lambda agg: agg if isinstance(agg, list) else [] ) @@ -446,27 +479,32 @@ def edge_pattern_summary( df_res["has_inverse"] = df_res["n_inverse_relations"] > 0 # inference - edges_between_ht = unique_inv_r_by_ht.reindex( - columns=["t", "h", "inverse_edge_types"] - ).rename( - columns={"t": "h", "h": "t", "inverse_edge_types": "inference_edge_types"} - ) - df_res = df_res.merge( - edges_between_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" - ) + if len(filter_relations) > 0: + edges_between_ht = inference_df.groupby(["h", "t"], as_index=False).agg( + inference_edge_types=("r", list), + ) + else: + edges_between_ht = unique_inv_r_by_ht.reindex( + columns=["t", "h", "inverse_edge_types"] + ).rename( + columns={ + "t": "h", + "h": "t", + "inverse_edge_types": "inference_edge_types", + } + ) + df_res = df_res.merge(edges_between_ht, on=["h", "t"], how="left") # inference_edge_types always contains the edge itself, which we need to drop df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1 df_res["has_inference"] = df_res["n_inference_relations"] > 0 # composition & metapaths - # discard loops as edges of a triangle - df_wo_loops = self.df[self.df.h != self.df.t] if return_metapath_list: # 2-hop paths - df_bridges = df_wo_loops.merge( - df_wo_loops, left_on="t", right_on="h", how="inner" + df_bridges = df_triangles.merge( + df_triangles, left_on="t", right_on="h", how="inner" ) - df_triangles = df_wo_loops.merge( + df_triangles = df_triangles.merge( df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner" ) df_triangles["metapath"] = ( @@ -479,8 +517,7 @@ def edge_pattern_summary( ) df_res = df_res.merge( grouped_triangles, - left_on=["h", "r", "t"], - right_on=["h", "r", "t"], + on=["h", "r", "t"], how="left", ) df_res["metapath_list"] = df_res["metapath_list"].apply( @@ -489,7 
+526,7 @@ def edge_pattern_summary( df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) else: counts = composition_count( - df_wo_loops, + df_triangles, chunk_size=composition_chunk_size, workers=composition_workers, directed=True, @@ -504,7 +541,7 @@ def edge_pattern_summary( df_res["has_composition"] = df_res["n_triangles"] > 0 counts = composition_count( - df_wo_loops, + df_triangles_und, chunk_size=composition_chunk_size, workers=composition_workers, directed=False, @@ -519,7 +556,7 @@ def edge_pattern_summary( ) df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0 - df_res = df_res[ + df_res = df_res.set_index("index")[ [ "h", "r", @@ -539,6 +576,7 @@ def edge_pattern_summary( ] + (["metapath_list"] if return_metapath_list else []) ] + df_res.index.name = None return aggregate_by_relation(df_res) if aggregate_by_r else df_res diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index eaba81a..d24977d 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -1,5 +1,6 @@ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
+from functools import partial import numpy as np import pandas as pd import pytest @@ -85,3 +86,22 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0]) if return_metapath_list: assert res["metapath_list"][2] == ["0-1", "1-1"] + + +def test_filter_relations() -> None: + for rels in [[0], [1], [0, 1]]: + for method in [ + kgtt.edge_degree_cardinality_summary, + partial(kgtt.edge_pattern_summary, return_metapath_list=True), + ]: + # compare outputs of standard method call and filtered call + res_all = method() + res_all = res_all[res_all.r.isin(rels)] + res_filtered = method(filter_relations=rels) + assert np.all(res_all.index.values == res_filtered.index.values) + for c in res_all.columns: + if c == "metapath_list": + for a, b in zip(res_all[c].values, res_filtered[c].values): + assert a == b + else: + assert np.all(res_all[c].values == res_filtered[c].values) From 586e816b7f3431e02c1b7d072a174089d711e3d8 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 11 Oct 2024 16:17:29 +0000 Subject: [PATCH 03/14] ci fix --- src/kg_topology_toolbox/topology_toolbox.py | 21 ++++++++++----------- tests/test_edge_topology_toolbox.py | 5 +++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index b1434f7..9af3323 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -5,8 +5,9 @@ Topology toolbox main functionalities """ -from functools import cache import multiprocessing as mp +from functools import cache + import numpy as np import pandas as pd from scipy.sparse import coo_array @@ -271,7 +272,7 @@ def edge_cardinality(self) -> pd.DataFrame: return df_res def edge_degree_cardinality_summary( - self, filter_relations: list = [], aggregate_by_r: bool = False + self, filter_relations: list[int] = [], aggregate_by_r: bool = 
False ) -> pd.DataFrame: """ For each edge in the KG, compute the number of edges with the same head @@ -286,8 +287,8 @@ def edge_degree_cardinality_summary( as the original Knowledge Graph dataframe. :param filter_relations: - Compute the output only for the edges with relation in this list - of relation IDs. + If not empty, compute the output only for the edges with relation + in this list of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). @@ -331,9 +332,7 @@ def edge_degree_cardinality_summary( how="left", ) df_res["tot_degree"] = ( - df_res.h_degree.values - + df_res.t_degree.values - - num_parallel.n_parallel.values + df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values ) # when restricting to the relation type, there is only one edge # (the edge itself) that is double-counted @@ -351,7 +350,7 @@ def edge_degree_cardinality_summary( def edge_pattern_summary( self, return_metapath_list: bool = False, - filter_relations: list = [], + filter_relations: list[int] = [], aggregate_by_r: bool = False, composition_chunk_size: int = 2**8, composition_workers: int = min(32, mp.cpu_count() - 1 or 1), @@ -366,10 +365,10 @@ def edge_pattern_summary( :param return_metapath_list: If True, return the list of unique metapaths for all - triangles supported over one edge. WARNING: very expensive for large graphs. + triangles supported over each edge. WARNING: very expensive for large graphs. :param filter_relations: - Compute the output only for the edges with relation in this list - of relation IDs. + If not empty, compute the output only for the edges with relation + in this list of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). 
diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index d24977d..6e0b6be 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -1,6 +1,7 @@ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. from functools import partial + import numpy as np import pandas as pd import pytest @@ -95,9 +96,9 @@ def test_filter_relations() -> None: partial(kgtt.edge_pattern_summary, return_metapath_list=True), ]: # compare outputs of standard method call and filtered call - res_all = method() + res_all = method() # type: ignore res_all = res_all[res_all.r.isin(rels)] - res_filtered = method(filter_relations=rels) + res_filtered = method(filter_relations=rels) # type: ignore assert np.all(res_all.index.values == res_filtered.index.values) for c in res_all.columns: if c == "metapath_list": From 432615fdb34df028593c1700a5d87e3876bef888 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Mon, 14 Oct 2024 11:30:07 +0000 Subject: [PATCH 04/14] metapath tweaks --- src/kg_topology_toolbox/topology_toolbox.py | 36 ++++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 9af3323..eb97472 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -407,6 +407,8 @@ def edge_pattern_summary( for the directed triangles. 
""" + # discard loops as edges of a triangle + df_wo_loops = self.df[self.df.h != self.df.t] if len(filter_relations) > 0: rel_df = self.df[self.df.r.isin(filter_relations)] filter_heads = rel_df.h.unique() @@ -423,24 +425,24 @@ def edge_pattern_summary( self.df.h.isin(filter_tails), self.df.t.isin(filter_heads) ) ] - df_triangles = self.df[ + df_triangles_out = df_wo_loops[df_wo_loops.h.isin(filter_heads)] + df_triangles_in = df_wo_loops[df_wo_loops.t.isin(filter_tails)] + df_triangles = df_wo_loops[ np.logical_or( - self.df.h.isin(filter_heads), self.df.t.isin(filter_tails) + df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) ) ] - df_triangles_und = self.df[ + df_triangles_und = df_wo_loops[ np.logical_or( - self.df.h.isin(filter_entities), self.df.t.isin(filter_entities) + df_wo_loops.h.isin(filter_entities), + df_wo_loops.t.isin(filter_entities), ) ] - # discard loops as edges of a triangle - df_triangles = df_triangles[df_triangles.h != df_triangles.t] - df_triangles_und = df_triangles_und[ - df_triangles_und.h != df_triangles_und.t - ] else: rel_df = inference_df = inverse_df = self.df - df_triangles = df_triangles_und = self.df[self.df.h != self.df.t] + df_triangles = df_triangles_und = df_triangles_out = df_triangles_in = ( + df_wo_loops + ) df_res = df_res = pd.DataFrame( {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False} ) @@ -500,16 +502,18 @@ def edge_pattern_summary( # composition & metapaths if return_metapath_list: # 2-hop paths - df_bridges = df_triangles.merge( - df_triangles, left_on="t", right_on="h", how="inner" + df_bridges = df_triangles_out.merge( + df_triangles_in, left_on="t", right_on="h", how="inner" ) - df_triangles = df_triangles.merge( + df_res_triangles = df_res[df_res.h != df_res.t].merge( df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner" ) - df_triangles["metapath"] = ( - df_triangles["r_x"].astype(str) + "-" + df_triangles["r_y"].astype(str) + df_res_triangles["metapath"] = ( + 
df_res_triangles["r_x"].astype(str) + + "-" + + df_res_triangles["r_y"].astype(str) ) - grouped_triangles = df_triangles.groupby( + grouped_triangles = df_res_triangles.groupby( ["h", "r", "t"], as_index=False ).agg( n_triangles=("metapath", "count"), metapath_list=("metapath", "unique") From 7c6d2578417daf060c653d7af0cbfea1771ef494 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Tue, 15 Oct 2024 17:44:18 +0000 Subject: [PATCH 05/14] refactor metapath counting with sparse matmuls --- src/kg_topology_toolbox/topology_toolbox.py | 82 ++++++++++++---- src/kg_topology_toolbox/utils.py | 101 ++++++++++++++++---- 2 files changed, 142 insertions(+), 41 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index eb97472..366bf6c 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -271,6 +271,56 @@ def edge_cardinality(self) -> pd.DataFrame: ).astype(str) return df_res + def edge_metapath_count( + self, + filter_relations: list[int] = [], + composition_chunk_size: int = 2**8, + composition_workers: int = min(32, mp.cpu_count() - 1 or 1), + ) -> pd.DataFrame: + """ + For each edge in the KG, compute the number of triangles of different + metapaths (i.e., the unique tuples (r1, r2) of relation types + of the two additional edges of the triangle). + + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix + to compute the triangle count. Default: 2**8. + :param composition_workers: + Number of workers to compute the triangle count. By default, assigned based + on number of available threads (max: 32). + + :return: + The output dataframe has one row for each (h, t, r1, r2) such that + there exists at least one triangle of metapath (r1, r2) over (any) edge + connecting h, t. 
+ The number of metapath triangles is given in the column **n_triangles**. + """ + # discard loops as edges of a triangle + df_wo_loops = self.df[self.df.h != self.df.t] + if len(filter_relations) > 0: + rel_df = self.df[self.df.r.isin(filter_relations)] + filter_heads = rel_df.h.unique() + filter_tails = rel_df.t.unique() + df_triangles = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) + ) + ] + else: + rel_df = self.df + df_triangles = df_wo_loops + + return composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=True, + directed=True, + ) + def edge_degree_cardinality_summary( self, filter_relations: list[int] = [], aggregate_by_r: bool = False ) -> pd.DataFrame: @@ -425,8 +475,6 @@ def edge_pattern_summary( self.df.h.isin(filter_tails), self.df.t.isin(filter_heads) ) ] - df_triangles_out = df_wo_loops[df_wo_loops.h.isin(filter_heads)] - df_triangles_in = df_wo_loops[df_wo_loops.t.isin(filter_tails)] df_triangles = df_wo_loops[ np.logical_or( df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) @@ -440,9 +488,7 @@ def edge_pattern_summary( ] else: rel_df = inference_df = inverse_df = self.df - df_triangles = df_triangles_und = df_triangles_out = df_triangles_in = ( - df_wo_loops - ) + df_triangles = df_triangles_und = df_wo_loops df_res = df_res = pd.DataFrame( {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False} ) @@ -501,30 +547,24 @@ def edge_pattern_summary( # composition & metapaths if return_metapath_list: - # 2-hop paths - df_bridges = df_triangles_out.merge( - df_triangles_in, left_on="t", right_on="h", how="inner" + counts = self.edge_metapath_count( + filter_relations, + composition_chunk_size, + composition_workers, ) - df_res_triangles = df_res[df_res.h != df_res.t].merge( - df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner" + counts["metapath"] = ( + counts["r1"].astype(str) + "-" + 
counts["r2"].astype(str) ) - df_res_triangles["metapath"] = ( - df_res_triangles["r_x"].astype(str) - + "-" - + df_res_triangles["r_y"].astype(str) - ) - grouped_triangles = df_res_triangles.groupby( - ["h", "r", "t"], as_index=False - ).agg( - n_triangles=("metapath", "count"), metapath_list=("metapath", "unique") + grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg( + n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list) ) df_res = df_res.merge( grouped_triangles, - on=["h", "r", "t"], + on=["h", "t"], how="left", ) df_res["metapath_list"] = df_res["metapath_list"].apply( - lambda agg: agg.tolist() if isinstance(agg, np.ndarray) else [] + lambda agg: agg if isinstance(agg, list) else [] ) df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) else: diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index d3a3d55..363416a 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -188,22 +188,40 @@ def jaccard_similarity( def _composition_count_worker( - adj_csr: csr_array, adj_csc: csc_array, tail_shift: int = 0 + adj_csr: csr_array, adj_csc: csc_array, adj_mask: csc_array, tail_shift: int = 0 ) -> pd.DataFrame: + n_nodes = adj_csr.shape[1] + n_rels = adj_csr.shape[0] // n_nodes adj_2hop = adj_csr @ adj_csc - adj_composition = (adj_2hop.tocsc() * (adj_csc > 0)).tocoo() - df_composition = pd.DataFrame( - dict( - h=adj_composition.row, - t=adj_composition.col + tail_shift, - n_triangles=adj_composition.data, + adj_composition = (adj_2hop.tocsc() * (adj_mask > 0)).tocoo() + col_shift = adj_composition.col + tail_shift + if n_rels > 1: + df_composition = pd.DataFrame( + dict( + h=adj_composition.row // n_rels, + t=col_shift % n_nodes, + r1=adj_composition.row % n_rels, + r2=col_shift // n_nodes, + n_triangles=adj_composition.data, + ) + ) + else: + df_composition = pd.DataFrame( + dict( + h=adj_composition.row, + t=col_shift, + n_triangles=adj_composition.data, + ) 
) - ) return df_composition def composition_count( - df: pd.DataFrame, chunk_size: int, workers: int, directed: bool = True + df: pd.DataFrame, + chunk_size: int, + workers: int, + metapaths: bool = False, + directed: bool = True, ) -> pd.DataFrame: """A helper function to compute the composition count of a graph. @@ -227,15 +245,48 @@ def composition_count( """ n_nodes = df[["h", "t"]].max().max() + 1 - adj = coo_array( - (np.ones(len(df)), (df.h, df.t)), - shape=[n_nodes, n_nodes], - ).astype(np.uint16) - if not directed: - adj = adj + adj.T - n_cols = adj.shape[1] - adj_csr = adj.tocsr() - adj_csc = adj.tocsc() + n_rels = df["r"].max() + 1 + if metapaths: + adj_repeated = csc_array( + ( + np.ones(n_rels * n_rels * len(df)), + ( + (n_rels * df.h.values[:, None] + np.arange(n_rels)).repeat(n_rels), + np.tile( + df.t.values[:, None] + n_nodes * np.arange(n_rels), n_rels + ).flatten(), + ), + ), + shape=[n_nodes * n_rels, n_nodes * n_rels], + ).astype(np.uint16) + adj_csr = csr_array( + (np.ones(len(df)), (df.h * n_rels + df.r, df.t)), + shape=[n_nodes * n_rels, n_nodes], + ).astype(np.uint16) + adj_csc = csc_array( + (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)), + shape=[n_nodes, n_nodes * n_rels], + ).astype(np.uint16) + n_cols = adj_csc.shape[1] + adj_repeated_slices = { + i: adj_repeated[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] + for i in range(int(np.ceil(n_cols / chunk_size))) + } + if not directed: + raise NotImplementedError( + "Metapath counting only implemented for directed triangles" + ) + else: + adj = coo_array( + (np.ones(len(df)), (df.h, df.t)), + shape=[n_nodes, n_nodes], + ).astype(np.uint16) + if not directed: + adj = adj + adj.T + adj_csr = adj.tocsr() + adj_csc = adj.tocsc() + n_cols = adj_csc.shape[1] + adj_csc_slices = { i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] for i in range(int(np.ceil(n_cols / chunk_size))) @@ -246,13 +297,23 @@ def composition_count( df_composition_list = pool.starmap( 
_composition_count_worker, ( - (adj_csr, adj_csc_slice, i * chunk_size) + ( + adj_csr, + adj_csc_slice, + adj_repeated_slices[i] if metapaths else adj_csc_slice, + i * chunk_size, + ) for i, adj_csc_slice in adj_csc_slices.items() ), ) else: df_composition_list = [ - _composition_count_worker(adj_csr, adj_csc_slice, i * chunk_size) + _composition_count_worker( + adj_csr, + adj_csc_slice, + adj_repeated_slices[i] if metapaths else adj_csc_slice, + i * chunk_size, + ) for i, adj_csc_slice in adj_csc_slices.items() ] From fadc967090439e5f57981ff8b7b499da0a52c2d2 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Tue, 15 Oct 2024 17:46:04 +0000 Subject: [PATCH 06/14] docstring update --- src/kg_topology_toolbox/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index 363416a..0160d1c 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -233,8 +233,11 @@ def composition_count( processed together. :param workers: Number of workers processing chunks concurrently + :param metapaths: + If True, the number of composition is computed separately for each + unique metapath. :param directed: - Boolean flag. If false, bidirectional edges are considered for + If False, bidirectional edges are considered for triangles by adding the adjacency matrix and its transposed. Default: True. 
:return: From c49e88692749a809af23a926a6580d4e531718de Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Wed, 16 Oct 2024 09:01:40 +0000 Subject: [PATCH 07/14] use np.divmod --- src/kg_topology_toolbox/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index 0160d1c..8194572 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -194,14 +194,15 @@ def _composition_count_worker( n_rels = adj_csr.shape[0] // n_nodes adj_2hop = adj_csr @ adj_csc adj_composition = (adj_2hop.tocsc() * (adj_mask > 0)).tocoo() - col_shift = adj_composition.col + tail_shift if n_rels > 1: + h, r1 = np.divmod(adj_composition.row, n_rels) + r2, t = np.divmod(adj_composition.col + tail_shift, n_nodes) df_composition = pd.DataFrame( dict( - h=adj_composition.row // n_rels, - t=col_shift % n_nodes, - r1=adj_composition.row % n_rels, - r2=col_shift // n_nodes, + h=h, + t=t, + r1=r1, + r2=r2, n_triangles=adj_composition.data, ) ) @@ -209,7 +210,7 @@ def _composition_count_worker( df_composition = pd.DataFrame( dict( h=adj_composition.row, - t=col_shift, + t=adj_composition.col + tail_shift, n_triangles=adj_composition.data, ) ) From ea845abe2bc6643ee018e12a723889edebef9092 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Wed, 16 Oct 2024 09:10:59 +0000 Subject: [PATCH 08/14] avoid repeated work --- src/kg_topology_toolbox/topology_toolbox.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 366bf6c..5def422 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -547,10 +547,12 @@ def edge_pattern_summary( # composition & metapaths if return_metapath_list: - counts = self.edge_metapath_count( - filter_relations, - composition_chunk_size, - composition_workers, + counts = 
composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=True, + directed=True, ) counts["metapath"] = ( counts["r1"].astype(str) + "-" + counts["r2"].astype(str) From 05e3ee3ed6a26a387bc52389c4ba88986c30ed4f Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Wed, 16 Oct 2024 10:47:12 +0000 Subject: [PATCH 09/14] add metapath unit test --- src/kg_topology_toolbox/topology_toolbox.py | 17 ++++++++++------- tests/test_edge_topology_toolbox.py | 21 ++++++++++++++++----- tests/test_node_topology_toolbox.py | 5 +---- tests/test_relation_topology_toolbox.py | 5 +---- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 5def422..faec2a9 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -278,9 +278,9 @@ def edge_metapath_count( composition_workers: int = min(32, mp.cpu_count() - 1 or 1), ) -> pd.DataFrame: """ - For each edge in the KG, compute the number of triangles of different - metapaths (i.e., the unique tuples (r1, r2) of relation types - of the two additional edges of the triangle). + For each edge in the KG, compute the number of triangles supported on it + distinguishing between different metapaths (i.e., the unique tuples (r1, r2) + of relation types of the two additional edges of the triangle). :param filter_relations: If not empty, compute the output only for the edges with relation @@ -293,10 +293,11 @@ def edge_metapath_count( on number of available threads (max: 32). :return: - The output dataframe has one row for each (h, t, r1, r2) such that - there exists at least one triangle of metapath (r1, r2) over (any) edge - connecting h, t. + The output dataframe has one row for each (h, r, t, r1, r2) such that + there exists at least one triangle of metapath (r1, r2) over (h, r, t). 
The number of metapath triangles is given in the column **n_triangles**. + The column **index** provides the index of the edge (h, r, t) in the + original Knowledge Graph dataframe. """ # discard loops as edges of a triangle df_wo_loops = self.df[self.df.h != self.df.t] @@ -313,7 +314,7 @@ def edge_metapath_count( rel_df = self.df df_triangles = df_wo_loops - return composition_count( + counts = composition_count( df_triangles, chunk_size=composition_chunk_size, workers=composition_workers, @@ -321,6 +322,8 @@ def edge_metapath_count( directed=True, ) + return rel_df.reset_index().merge(counts, on=["h", "t"], how="inner") + def edge_degree_cardinality_summary( self, filter_relations: list[int] = [], aggregate_by_r: bool = False ) -> pd.DataFrame: diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index 6e0b6be..49bfa5e 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -22,12 +22,19 @@ ) -@pytest.mark.parametrize("return_metapath_list", [True, False]) -def test_small_graph_metrics(return_metapath_list: bool) -> None: - # Define a small graph with all the features tested by - # the edge_topology_toolbox +def test_edge_metapath_count() -> None: + res = kgtt.edge_metapath_count() + assert np.allclose(res["index"], [2, 2]) + assert np.allclose(res["h"], [0, 0]) + assert np.allclose(res["r"], [0, 0]) + assert np.allclose(res["t"], [2, 2]) + assert np.allclose(res["r1"], [0, 1]) + assert np.allclose(res["r2"], [1, 1]) + assert np.allclose(res["n_triangles"], [1, 1]) + - # entity degrees statistics +def test_edge_degree_cardinality_summary() -> None: + # edge degrees statistics res = kgtt.edge_degree_cardinality_summary() assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2]) assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3]) @@ -60,6 +67,9 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: "M:M", ] + +@pytest.mark.parametrize("return_metapath_list", 
[True, False]) +def test_edge_pattern_summary(return_metapath_list: bool) -> None: # relation pattern symmetry res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list) assert np.allclose( @@ -92,6 +102,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: def test_filter_relations() -> None: for rels in [[0], [1], [0, 1]]: for method in [ + kgtt.edge_metapath_count, kgtt.edge_degree_cardinality_summary, partial(kgtt.edge_pattern_summary, return_metapath_list=True), ]: diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py index 18d87ed..d002b41 100644 --- a/tests/test_node_topology_toolbox.py +++ b/tests/test_node_topology_toolbox.py @@ -19,10 +19,7 @@ @pytest.mark.parametrize("return_relation_list", [True, False]) -def test_small_graph_metrics(return_relation_list: bool) -> None: - # Define a small graph with all the features tested by - # the node_topology_toolbox - +def test_node_degree_summary(return_relation_list: bool) -> None: # entity degrees statistics res = kgtt.node_degree_summary(return_relation_list=return_relation_list) assert np.allclose(res["h_degree"], [3, 1, 3]) diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py index 3f0c05c..e527a5f 100644 --- a/tests/test_relation_topology_toolbox.py +++ b/tests/test_relation_topology_toolbox.py @@ -20,10 +20,7 @@ kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T") -def test_small_graph_metrics() -> None: - # Define a small graph on five nodes with all the features tested by - # the relation_topology_toolbox - +def test_aggregate_by_r() -> None: dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True) eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True) From 5ec0f3e07d9ab8fddc4607d474af2b8455115b93 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 18 Oct 2024 17:21:56 +0000 Subject: [PATCH 10/14] reduce memory usage of metapath 
counting --- src/kg_topology_toolbox/utils.py | 70 +++++++++++++++++------------ tests/test_edge_topology_toolbox.py | 13 +++--- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index 8194572..d2e798f 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -193,7 +193,7 @@ def _composition_count_worker( n_nodes = adj_csr.shape[1] n_rels = adj_csr.shape[0] // n_nodes adj_2hop = adj_csr @ adj_csc - adj_composition = (adj_2hop.tocsc() * (adj_mask > 0)).tocoo() + adj_composition = (adj_2hop.tocsc() * adj_mask).tocoo() if n_rels > 1: h, r1 = np.divmod(adj_composition.row, n_rels) r2, t = np.divmod(adj_composition.col + tail_shift, n_nodes) @@ -250,19 +250,15 @@ def composition_count( n_nodes = df[["h", "t"]].max().max() + 1 n_rels = df["r"].max() + 1 + adj = coo_array( + (np.ones(len(df)), (df.h, df.t)), + shape=[n_nodes, n_nodes], + ).astype(np.uint16) if metapaths: - adj_repeated = csc_array( - ( - np.ones(n_rels * n_rels * len(df)), - ( - (n_rels * df.h.values[:, None] + np.arange(n_rels)).repeat(n_rels), - np.tile( - df.t.values[:, None] + n_nodes * np.arange(n_rels), n_rels - ).flatten(), - ), - ), - shape=[n_nodes * n_rels, n_nodes * n_rels], - ).astype(np.uint16) + if not directed: + raise NotImplementedError( + "Metapath counting only implemented for directed triangles" + ) adj_csr = csr_array( (np.ones(len(df)), (df.h * n_rels + df.r, df.t)), shape=[n_nodes * n_rels, n_nodes], @@ -271,26 +267,24 @@ def composition_count( (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)), shape=[n_nodes, n_nodes * n_rels], ).astype(np.uint16) - n_cols = adj_csc.shape[1] - adj_repeated_slices = { - i: adj_repeated[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] - for i in range(int(np.ceil(n_cols / chunk_size))) - } - if not directed: - raise NotImplementedError( - "Metapath counting only implemented for directed triangles" - ) + # boolean mask to filter 
results with only the edges in the KG + msk = csc_array( + ( + [True] * (len(adj.data) * n_rels), + ( + (n_rels * adj.row + np.arange(n_rels)[:, None]).flatten(), + np.tile(adj.col, n_rels), + ), + ), + shape=[n_nodes * n_rels, n_nodes], + ) else: - adj = coo_array( - (np.ones(len(df)), (df.h, df.t)), - shape=[n_nodes, n_nodes], - ).astype(np.uint16) if not directed: adj = adj + adj.T adj_csr = adj.tocsr() adj_csc = adj.tocsc() - n_cols = adj_csc.shape[1] + n_cols = adj_csc.shape[1] adj_csc_slices = { i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] for i in range(int(np.ceil(n_cols / chunk_size))) @@ -304,7 +298,16 @@ def composition_count( ( adj_csr, adj_csc_slice, - adj_repeated_slices[i] if metapaths else adj_csc_slice, + ( + # relevant slice of mask (with wraparound) + msk[ + :, + (i * chunk_size + np.arange(adj_csc_slice.shape[1])) + % msk.shape[1], + ] + if metapaths + else adj_csc_slice > 0 + ), i * chunk_size, ) for i, adj_csc_slice in adj_csc_slices.items() @@ -315,7 +318,16 @@ def composition_count( _composition_count_worker( adj_csr, adj_csc_slice, - adj_repeated_slices[i] if metapaths else adj_csc_slice, + ( + # relevant slice of mask (with wraparound) + msk[ + :, + (i * chunk_size + np.arange(adj_csc_slice.shape[1])) + % msk.shape[1], + ] + if metapaths + else adj_csc_slice > 0 + ), i * chunk_size, ) for i, adj_csc_slice in adj_csc_slices.items() diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index 49bfa5e..849bd5c 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -23,13 +23,14 @@ def test_edge_metapath_count() -> None: - res = kgtt.edge_metapath_count() + res = kgtt.edge_metapath_count(composition_chunk_size=3) assert np.allclose(res["index"], [2, 2]) assert np.allclose(res["h"], [0, 0]) assert np.allclose(res["r"], [0, 0]) assert np.allclose(res["t"], [2, 2]) - assert np.allclose(res["r1"], [0, 1]) - assert np.allclose(res["r2"], [1, 1]) + assert 
set(zip(res["r1"].values.tolist(), res["r2"].values.tolist())) == set( + [(0, 1), (1, 1)] + ) assert np.allclose(res["n_triangles"], [1, 1]) @@ -71,7 +72,9 @@ def test_edge_degree_cardinality_summary() -> None: @pytest.mark.parametrize("return_metapath_list", [True, False]) def test_edge_pattern_summary(return_metapath_list: bool) -> None: # relation pattern symmetry - res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list) + res = kgtt.edge_pattern_summary( + return_metapath_list=return_metapath_list, composition_chunk_size=3 + ) assert np.allclose( res["is_loop"], [False, False, False, False, False, False, True, True] ) @@ -96,7 +99,7 @@ def test_edge_pattern_summary(return_metapath_list: bool) -> None: assert np.allclose(res["n_triangles"], [0, 0, 2, 0, 0, 0, 0, 0]) assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0]) if return_metapath_list: - assert res["metapath_list"][2] == ["0-1", "1-1"] + assert set(res["metapath_list"][2]) == set(["0-1", "1-1"]) def test_filter_relations() -> None: From ce6f369f12d3a9388f536feae62902a3f696e8f8 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 25 Oct 2024 14:19:04 +0000 Subject: [PATCH 11/14] improve docstrings --- src/kg_topology_toolbox/topology_toolbox.py | 34 +++++++++--- src/kg_topology_toolbox/utils.py | 57 ++++++++++++++++----- 2 files changed, 70 insertions(+), 21 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index faec2a9..c503d44 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -258,8 +258,6 @@ def edge_cardinality(self) -> pd.DataFrame: # check if the values in the pair (h_degree, t_degree) are =1 or >1 # to determine the edge cardinality for suffix in ["", "_same_rel"]: - # check if the values in the pair (h_degree, t_degree) are =1 or >1 - # to determine the edge cardinality edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + 
( tail_degree["t_degree" + suffix] == 1 ) @@ -279,15 +277,16 @@ def edge_metapath_count( ) -> pd.DataFrame: """ For each edge in the KG, compute the number of triangles supported on it - distinguishing between different metapaths (i.e., the unique tuples (r1, r2) - of relation types of the two additional edges of the triangle). + distinguishing between different metapaths (i.e., the unique ordered tuples + (r1, r2) of relation types of the two additional edges of the triangle). :param filter_relations: If not empty, compute the output only for the edges with relation in this list of relation IDs. :param composition_chunk_size: Size of column chunks of sparse adjacency matrix - to compute the triangle count. Default: 2**8. + to compute the triangle count. Reduce the parameter if running OOM. + Default: 2**8. :param composition_workers: Number of workers to compute the triangle count. By default, assigned based on number of available threads (max: 32). @@ -303,8 +302,11 @@ def edge_metapath_count( df_wo_loops = self.df[self.df.h != self.df.t] if len(filter_relations) > 0: rel_df = self.df[self.df.r.isin(filter_relations)] + # unique heads and tails used by filtered edges filter_heads = rel_df.h.unique() filter_tails = rel_df.t.unique() + # the only relevant edges for triangles are the ones with head in the + # set of filtered heads, or tail in the set of filtered tails df_triangles = df_wo_loops[ np.logical_or( df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) @@ -387,7 +389,7 @@ def edge_degree_cardinality_summary( df_res["tot_degree"] = ( df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values ) - # when restricting to the relation type, there is only one edge + # when restricting to the same relation type, there is only one edge # (the edge itself) that is double-counted df_res["tot_degree_same_rel"] = ( df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1 @@ -427,7 +429,8 @@ def edge_pattern_summary( (the output DataFrame will be 
indexed over relation IDs). :param composition_chunk_size: Size of column chunks of sparse adjacency matrix - to compute the triangle count. Default: 2**8. + to compute the triangle count. Reduce the parameter if running OOM. + Default: 2**8. :param composition_workers: Number of workers to compute the triangle count. By default, assigned based on number of available threads (max: 32). @@ -464,10 +467,11 @@ def edge_pattern_summary( df_wo_loops = self.df[self.df.h != self.df.t] if len(filter_relations) > 0: rel_df = self.df[self.df.r.isin(filter_relations)] + # unique heads and tails used by filtered edges filter_heads = rel_df.h.unique() filter_tails = rel_df.t.unique() filter_entities = np.union1d(filter_heads, filter_tails) - + # restrict relevant edges to count inference/inverse patterns inference_df = self.df[ np.logical_and( self.df.h.isin(filter_heads), self.df.t.isin(filter_tails) @@ -478,11 +482,15 @@ def edge_pattern_summary( self.df.h.isin(filter_tails), self.df.t.isin(filter_heads) ) ] + # the only relevant edges for triangles are the ones with head in the + # set of filtered heads, or tail in the set of filtered tails df_triangles = df_wo_loops[ np.logical_or( df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) ) ] + # for undirected triangles, heads and tails can be any of the + # filtered entities df_triangles_und = df_wo_loops[ np.logical_or( df_wo_loops.h.isin(filter_entities), @@ -557,9 +565,12 @@ def edge_pattern_summary( metapaths=True, directed=True, ) + # turn (r1, r2) into "r1-r2" string for metapaths counts["metapath"] = ( counts["r1"].astype(str) + "-" + counts["r2"].astype(str) ) + # count triangles (summing over all metapaths between two nodes) + # and list unique metapaths for each head and tail node pair grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg( n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list) ) @@ -568,6 +579,7 @@ def edge_pattern_summary( on=["h", "t"], how="left", ) + # if 
no triangles are present over an edge, set metapath list to [] df_res["metapath_list"] = df_res["metapath_list"].apply( lambda agg: agg if isinstance(agg, list) else [] ) @@ -588,6 +600,7 @@ def edge_pattern_summary( df_res["has_composition"] = df_res["n_triangles"] > 0 + # undirected composition counts = composition_count( df_triangles_und, chunk_size=composition_chunk_size, workers=composition_workers, @@ -658,6 +671,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame: - **jaccard_both** (float): Jaccard similarity between the full entity set of r1 and r2. """ + # set of unique heads/tails/any for each relation ent_unique = self.df.groupby("r", as_index=False).agg( num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique") ) @@ -674,6 +688,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame: suffixes=["_r1", "_r2"], how="cross", ) + # order doesn't matter df_res = df_res[df_res.r1 < df_res.r2] df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"] @@ -748,15 +763,18 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame # normalize by global t frequency rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum") + # sparse matrix of (h,r) pair frequency E_h = coo_array( (hr_freqs.t, (hr_freqs.h, hr_freqs.r)), shape=[self.n_entity, self.n_rel], ) + # sparse matrix of (t,r) pair frequency E_t = coo_array( (rt_freqs.h, (rt_freqs.t, rt_freqs.r)), shape=[self.n_entity, self.n_rel], ) + # adjacency matrix of relation graph A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray() A[np.diag_indices_from(A)] = 0 diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index d2e798f..5bc7ac7 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -133,6 +133,7 @@ def aggregate_by_relation(edge_topology_df: pd.DataFrame) -> pd.DataFrame: elif col_dtype == object: if isinstance(edge_topology_df[col].iloc[0], str): for label in
np.unique(edge_topology_df[col]): + # fraction of rows for each label df_res[f"{col}_{label}_frac"] = ( edge_topology_df[edge_topology_df[col] == label] .groupby("r")[col] @@ -188,15 +189,34 @@ def jaccard_similarity( def _composition_count_worker( - adj_csr: csr_array, adj_csc: csc_array, adj_mask: csc_array, tail_shift: int = 0 + adj_csr: csr_array, + adj_csc_slice: csc_array, + adj_mask_slice: csc_array, + slice_tail_shift: int, ) -> pd.DataFrame: + """ + Masked sparse matmul to count triangles over graph edges. + + :param adj_csr: shape (n_nodes * n_rels, n_nodes) if distinguishing between + metapaths, (n_nodes, n_nodes) otherwise + :param adj_csc_slice: shape (n_nodes, chunk_size) + :param adj_mask_slice: shape (n_nodes, chunk_size) + :param slice_tail_shift: column shift of the vertical slice + + :return: + Pandas dataframe of triangle counts. + """ n_nodes = adj_csr.shape[1] n_rels = adj_csr.shape[0] // n_nodes - adj_2hop = adj_csr @ adj_csc - adj_composition = (adj_2hop.tocsc() * adj_mask).tocoo() + # 2-hop count + adj_2hop = adj_csr @ adj_csc_slice + # mask out (h,t) pairs not connected by edges + adj_composition = (adj_2hop.tocsc() * adj_mask_slice).tocoo() if n_rels > 1: + # distinguish between metapaths + # unflatten results h, r1 = np.divmod(adj_composition.row, n_rels) - r2, t = np.divmod(adj_composition.col + tail_shift, n_nodes) + r2, t = np.divmod(adj_composition.col + slice_tail_shift, n_nodes) df_composition = pd.DataFrame( dict( h=h, @@ -207,10 +227,11 @@ def _composition_count_worker( ) ) else: + # don't distinguish between metapaths df_composition = pd.DataFrame( dict( h=adj_composition.row, - t=adj_composition.col + tail_shift, + t=adj_composition.col + slice_tail_shift, n_triangles=adj_composition.data, ) ) @@ -224,7 +245,7 @@ def composition_count( metapaths: bool = False, directed: bool = True, ) -> pd.DataFrame: - """A helper function to compute the composition count of a graph. + """Compute composition count of a graph. 
:param df: A graph represented as a pd.DataFrame. Must contain the columns @@ -235,30 +256,35 @@ def composition_count( :param workers: Number of workers processing chunks concurrently :param metapaths: - If True, the number of composition is computed separately for each + If True, the number of compositions is computed separately for each unique metapath. :param directed: If False, bidirectional edges are considered for - triangles by adding the adjacency matrix and its transposed. Default: True. + triangles, by adding the adjacency matrix and its transposed. Default: True. :return: The results dataframe. Contains the following columns: - **h** (int): Index of the head entity. - **t** (int): Index of the tail entity. - - **n_triangles** (int): Number of compositions for the (h, t) edge. + - **n_triangles** (int): Number of compositions for any edge between (h, t). """ n_nodes = df[["h", "t"]].max().max() + 1 n_rels = df["r"].max() + 1 + # sparse graph adjacency matrix, counting number of edges between each pair of nodes adj = coo_array( (np.ones(len(df)), (df.h, df.t)), shape=[n_nodes, n_nodes], ).astype(np.uint16) + if metapaths: if not directed: raise NotImplementedError( "Metapath counting only implemented for directed triangles" ) + # relation-aware adjacency matrix, flattened to 2D for sparse implementation + # (adj_csr @ adj_csc).reshape(n_nodes, n_rels, n_rels, n_nodes)[h,r1,r2,t] counts + # the number of 2-hop paths of metapath (r1, r2) between h and t adj_csr = csr_array( (np.ones(len(df)), (df.h * n_rels + df.r, df.t)), shape=[n_nodes * n_rels, n_nodes], @@ -267,7 +293,8 @@ def composition_count( (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)), shape=[n_nodes, n_nodes * n_rels], ).astype(np.uint16) - # boolean mask to filter results with only the edges in the KG + # boolean mask to filter results, keep only triangles over (h,t) pairs connected + # by at least one edge (equivalent to flattened adj[:,None,None,:] > 0) msk = csc_array( ( [True] * 
(len(adj.data) * n_rels), @@ -280,18 +307,22 @@ def composition_count( ) else: if not directed: + # add inverse edges for undirected compositions adj = adj + adj.T + # (adj_csr @ adj_csc)[h,t] counts the number of 2-hop paths between h and t; + # the boolean mask here is simply adj_csc > 0 adj_csr = adj.tocsr() adj_csc = adj.tocsc() + # to compute (adj_csr @ adj_csc) * msk, serialize over vertical slices of adj_csc n_cols = adj_csc.shape[1] adj_csc_slices = { i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] for i in range(int(np.ceil(n_cols / chunk_size))) } - if len(adj_csc_slices) > 1 and workers > 1: with Pool(workers) as pool: + # workers are assigned different adj_csc slices df_composition_list = pool.starmap( _composition_count_worker, ( @@ -299,7 +330,7 @@ def composition_count( adj_csr, adj_csc_slice, ( - # relevant slice of mask (with wraparound) + # relevant slice of boolean mask (with wraparound) msk[ :, (i * chunk_size + np.arange(adj_csc_slice.shape[1])) @@ -319,7 +350,7 @@ def composition_count( adj_csr, adj_csc_slice, ( - # relevant slice of mask (with wraparound) + # relevant slice of boolean mask (with wraparound) msk[ :, (i * chunk_size + np.arange(adj_csc_slice.shape[1])) From d6effae8a3583d70847fda079740a753cd09385c Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 25 Oct 2024 15:01:17 +0000 Subject: [PATCH 12/14] document new functionalities in the doc notebook --- docs/source/notebooks/ogb_biokg_demo.ipynb | 316 ++++++++++++++++++++- 1 file changed, 304 insertions(+), 12 deletions(-) diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb index bb73448..dc13ff5 100644 --- a/docs/source/notebooks/ogb_biokg_demo.ipynb +++ b/docs/source/notebooks/ogb_biokg_demo.ipynb @@ -22,9 +22,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found existing installation: kg-topology-toolbox 0.1.0\n", - "Uninstalling kg-topology-toolbox-0.1.0:\n", - " Successfully uninstalled 
kg-topology-toolbox-0.1.0\n" + "Found existing installation: kg-topology-toolbox 1.0.0\n", + "Uninstalling kg-topology-toolbox-1.0.0:\n", + " Successfully uninstalled kg-topology-toolbox-1.0.0\n" ] } ], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "[5088434 rows x 3 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -209,14 +209,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/topology_toolbox.py:64: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", + "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/utils.py:42: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", " warnings.warn(\n" ] } @@ -232,13 +232,77 @@ "Notice the warning raised by the constructor, which detects duplicated edges in the `biokg_df` DataFrame: to ensure optimal functionalities, duplicated edges should be removed before instantiating the `KGTopologyToolbox` class." ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hrt
38544071972451972
40005341972451972
\n", + "
" + ], + "text/plain": [ + " h r t\n", + "3854407 1972 45 1972\n", + "4000534 1972 45 1972" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find duplicated edges\n", + "biokg_df.loc[biokg_df.duplicated(keep=False)]" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Node-level analysis\n", "\n", - "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned dataframe is indexed on the node ID.\n", + "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned DataFrame is indexed on the node ID.\n", "\n", "- `h_degree` is the number of edges coming out from the node;\n", "- `t_degree` is the number of edges going into the node;\n", @@ -894,7 +958,7 @@ "\n", "![image info](../images/edge_patterns.png)\n", "\n", - "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided (the unique metapaths `[r_1, r_2]` can also be listed by setting `return_metapath_list=True` when calling the method)." + "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided." ] }, { @@ -1210,6 +1274,15 @@ "edge_eps" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we need to identify the different metapaths `[r_1, r_2]` that give triangles `(h,r1,x) - (x,r2,t)` over an edge `(h,r,t)`, we can do so by setting `return_metapath_list=True` in the call of `edge_pattern_summary`. In order to disaggregate the total number of triangles over an edge into separate counts for each existing metapath, the `edge_metapath_count` method should be used instead. 
\n", + "\n", + "We can now easily produce a global view of the distribution of topological properties." + ] + }, { "cell_type": "code", "execution_count": 12, @@ -1277,6 +1350,225 @@ "plt.tight_layout()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering relation types\n", + "\n", + "The edge-level methods presented in the previous section simultaneously compute statistics for all edges in the KG, and this can be expensive on larger graphs. Moreover, in many practical cases the user might be interested in looking only at the properties of edges of one or few specific relation types.\n", + "\n", + "The methods `edge_degree_cardinality_summary`, `edge_pattern_summary` and `edge_metapath_count` can be passed a list of relation type IDs to restrict computations of their outputs to edges of those specific relation types." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexhrtr1r2n_triangles
03343827327122541210
133438273271225392123
233438273271225382200
33343827327122537227
4334382732712253626
........................
7321494953327152924249213412
7321504953327152924249211412
732151495332715292424926412
732152495332715292424924411
732153495332715292424922412
\n", + "

732154 rows × 7 columns

\n", +
" + ], + "text/plain": [ + " index h r t r1 r2 n_triangles\n", + "0 334382 732 7 1225 41 2 10\n", + "1 334382 732 7 1225 39 2 123\n", + "2 334382 732 7 1225 38 2 200\n", + "3 334382 732 7 1225 37 2 27\n", + "4 334382 732 7 1225 36 2 6\n", + "... ... ... .. ... .. .. ...\n", + "732149 4953327 1529 24 2492 13 41 2\n", + "732150 4953327 1529 24 2492 11 41 2\n", + "732151 4953327 1529 24 2492 6 41 2\n", + "732152 4953327 1529 24 2492 4 41 1\n", + "732153 4953327 1529 24 2492 2 41 2\n", + "\n", + "[732154 rows x 7 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_metapath_counts = kgtt.edge_metapath_count(filter_relations=[7, 24])\n", + "filtered_metapath_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The previous cell computes the number of triangles of each existing `(r1, r2)` metapath, but only over `(h,r,t)` edges of the two relation types with ID 7 and 24 (the column `index` gives the index of the edge in the `biokg_df` DataFrame). This is the same as calling `kgtt.edge_metapath_count().query('r==7 or r==24')`, but the computation is much cheaper and faster."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "r\n", + "24 413366\n", + "7 318788\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_metapath_counts.r.value_counts()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2267,7 +2559,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv38", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -2281,7 +2573,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.12" } }, "nbformat": 4, From 2a71340a8f241d50f91175ea3350cfb0cb204e07 Mon Sep 17 00:00:00 2001 From: Daniel Justus Date: Fri, 8 Nov 2024 14:25:42 +0000 Subject: [PATCH 13/14] tidy up redundant code --- src/kg_topology_toolbox/topology_toolbox.py | 24 +++++++-------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index c503d44..0c1e73b 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -557,14 +557,14 @@ def edge_pattern_summary( df_res["has_inference"] = df_res["n_inference_relations"] > 0 # composition & metapaths + counts = composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=return_metapath_list, + directed=True, + ) if return_metapath_list: - counts = composition_count( - df_triangles, - chunk_size=composition_chunk_size, - workers=composition_workers, - metapaths=True, - directed=True, - ) # turn (r1, r2) into "r1-r2" string for metapaths counts["metapath"] = ( counts["r1"].astype(str) + "-" + counts["r2"].astype(str) @@ -583,21 +583,13 @@ def edge_pattern_summary( df_res["metapath_list"] = df_res["metapath_list"].apply( lambda agg: agg if 
isinstance(agg, list) else [] ) - df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) else: - counts = composition_count( - df_triangles, - chunk_size=composition_chunk_size, - workers=composition_workers, - directed=True, - ) df_res = df_res.merge( counts, on=["h", "t"], how="left", ) - df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) - + df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) df_res["has_composition"] = df_res["n_triangles"] > 0 # undirected composition From 9862ce0e10857e814907da9d0aab2432915658f0 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Fri, 8 Nov 2024 14:48:29 +0000 Subject: [PATCH 14/14] fix typo --- src/kg_topology_toolbox/topology_toolbox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 0c1e73b..5ae642e 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -500,7 +500,7 @@ def edge_pattern_summary( else: rel_df = inference_df = inverse_df = self.df df_triangles = df_triangles_und = df_wo_loops - df_res = df_res = pd.DataFrame( + df_res = pd.DataFrame( {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False} ) # symmetry-asymmetry