Commit

stash
bdpedigo committed May 7, 2024
1 parent 39f7bd7 commit a6e66ac
Showing 4 changed files with 201 additions and 41 deletions.
3 changes: 2 additions & 1 deletion networkframe/__init__.py
@@ -3,7 +3,8 @@
from importlib.metadata import version

from .networkframe import LocIndexer, NetworkFrame, NodeGroupBy
from .algorithms import aggregate_over_graph

__all__ = ["NetworkFrame", "NodeGroupBy", "LocIndexer"]
__all__ = ["NetworkFrame", "NodeGroupBy", "LocIndexer", "aggregate_over_graph"]

__version__ = version("networkframe")
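
With this change, the helper is importable from the package top level (a quick
sanity check, assuming the package is installed at this commit):

    from networkframe import aggregate_over_graph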
139 changes: 139 additions & 0 deletions networkframe/algorithms.py
@@ -0,0 +1,139 @@
from typing import Literal, Union

import numpy as np
import pandas as pd
from scipy.sparse import csr_array
from tqdm.autonotebook import tqdm


def aggregate_over_graph(
mask: np.ndarray,
nodes: pd.DataFrame,
    aggregations: Union[str, list] = "mean",
    drop_self_in_neighborhood: bool = True,
    verbose: int = 0,
    engine: Literal["auto", "scipy", "pandas"] = "auto",
) -> pd.DataFrame:
    """
    Aggregate node features over neighborhoods defined by a boolean mask.

    ``mask[i, j]`` says whether node ``j`` is in node ``i``'s neighborhood; the
    result has one row per node and columns named f"{column}_neighbor_{agg}".
    """
    if isinstance(aggregations, str):
        aggregations = [aggregations]

    if verbose > 0:
        print("Aggregating over neighborhoods")

if engine == "auto":
# TODO might also want to do a check on sparsity of the graph here
if not all([isinstance(x, str) for x in aggregations]) or not all(
[x in ["mean", "sum", "std"] for x in aggregations]
):
engine = "pandas"
else:
engine = "scipy"

if engine == "pandas":
rows = []
for iloc in tqdm(range(len(nodes)), disable=not verbose):
# the selection here is pretty quick;
node = nodes.index[iloc]
select_nodes = nodes.loc[mask[iloc]]
if drop_self_in_neighborhood:
select_nodes = select_nodes.drop(index=node)

# the aggregation takes most of the time
agg_neighbor_features = select_nodes.agg(aggregations)

if isinstance(agg_neighbor_features, pd.Series):
agg_neighbor_features.index = agg_neighbor_features.index.map(
lambda x: f"{x}_neighbor_{aggregations[0]}"
)
elif isinstance(agg_neighbor_features, pd.DataFrame):
agg_neighbor_features = agg_neighbor_features.unstack()
agg_neighbor_features.index = agg_neighbor_features.index.map(
lambda x: f"{x[0]}_neighbor_{x[1]}"
)
agg_neighbor_features.name = node
rows.append(agg_neighbor_features)
neighborhood_features = pd.concat(rows, axis=1).T
elif engine == "scipy":
if not all([x in ["mean", "sum", "std"] for x in aggregations]):
raise ValueError(
"Currently only 'mean', 'sum', and 'std' are allowed in "
"`aggregations` "
"when using the 'scipy' engine."
)

        if drop_self_in_neighborhood:
            # copy first so that zeroing the diagonal does not mutate the
            # caller's array
            mask = mask.copy()
            mask[np.diag_indices_from(mask)] = False

# this is an adjacency matrix for whether nodes are in neighborhood
mask = csr_array(mask)

feature_mat = nodes.fillna(0).values

neighborhood_sum_mat = mask @ feature_mat

if "mean" in aggregations:
# this sums the number of notna values in the neighborhood for each
# feature
divisor_matrix = mask @ nodes.notna().astype(int)
divisor_matrix[divisor_matrix == 0] = 1

neighborhood_mean_matrix = neighborhood_sum_mat / divisor_matrix
neighborhood_mean_matrix = pd.DataFrame(
neighborhood_mean_matrix, index=nodes.index, columns=nodes.columns
)
neighborhood_mean_matrix.rename(
columns=lambda x: f"{x}_neighbor_mean", inplace=True
)

if "sum" in aggregations:
neighborhood_sum_matrix = pd.DataFrame(
neighborhood_sum_mat, index=nodes.index, columns=nodes.columns
)
neighborhood_sum_matrix.rename(
columns=lambda x: f"{x}_neighbor_sum", inplace=True
)

if "std" in aggregations:
# REF: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
# using "Computing shifted data" method

# supposedly, this subtraction helps with numerical stability
# I think it makes the large values closer to correct, but the small
# values worse (at least with many 0s)
# could play with details here
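            # concretely, for a neighborhood holding n non-NaN values x_1..x_n
            # and shift constant K, the computation below is:
            #   Var = (sum((x_i - K)^2) - (sum(x_i - K))^2 / n) / (n - 1)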
const = feature_mat.mean(axis=0)
inner_term = feature_mat - const[None, :]

# this is to deal with NaNs (which were previously set to 0)
inner_term[nodes.isna().values] = 0

# sum of squares of the shifted data
first_term = mask @ (inner_term**2)
# squared sum of the shifted data, divided by the number of non-NaNs
second_term = (mask @ inner_term) ** 2 / divisor_matrix

# this is a node by feature matrix of the variances for each feature
# in that node's neighborhood
new_divisor_matrix = divisor_matrix - 1
new_divisor_matrix[new_divisor_matrix == 0] = 1
variances = (first_term - second_term) / new_divisor_matrix
variances[variances < 0] = 0

neighborhood_std_matrix = np.sqrt(variances)
neighborhood_std_matrix = pd.DataFrame(
neighborhood_std_matrix, index=nodes.index, columns=nodes.columns
)
neighborhood_std_matrix.rename(
columns=lambda x: f"{x}_neighbor_std", inplace=True
)

neighborhood_feature_dfs = []
if "mean" in aggregations:
neighborhood_feature_dfs.append(neighborhood_mean_matrix)
if "sum" in aggregations:
neighborhood_feature_dfs.append(neighborhood_sum_matrix)
if "std" in aggregations:
neighborhood_feature_dfs.append(neighborhood_std_matrix)

neighborhood_features = pd.concat(neighborhood_feature_dfs, axis=1)
else:
raise ValueError(f"Unknown engine {engine}")

return neighborhood_features
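
For reference, a minimal usage sketch of the function above on a toy three-node
graph (the names and values are illustrative, not part of the commit):

    import numpy as np
    import pandas as pd

    from networkframe import aggregate_over_graph

    # row i marks which nodes are in node i's neighborhood (including itself)
    mask = np.array(
        [
            [True, True, False],
            [True, True, True],
            [False, True, True],
        ]
    )
    nodes = pd.DataFrame({"size": [1.0, 2.0, 3.0]}, index=["a", "b", "c"])

    # one row per node, with columns "size_neighbor_mean" and "size_neighbor_sum"
    features = aggregate_over_graph(mask, nodes, aggregations=["mean", "sum"])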
23 changes: 23 additions & 0 deletions networkframe/multilayer.py
@@ -0,0 +1,23 @@
from .networkframe import NetworkFrame


class MultilayerNetworkFrame:
    def __init__(self):
        # layer_key -> (nodes, edges) for each registered layer
        self._layers = {}
        # mappings that link node identities across layers
        self._layer_links = []

    def add_layer(self, nodes, edges, layer_key):
        self._layers[layer_key] = (nodes, edges)

    def add_layer_link(self, layer_mapping):
        self._layer_links.append(layer_mapping)

    def _get_layer_nodes(self, layer_key):
        return self._layers[layer_key][0]

    def _get_layer_edges(self, layer_key):
        return self._layers[layer_key][1]

    def layer(self, layer_key) -> NetworkFrame:
        layer_nodes = self._get_layer_nodes(layer_key)
        layer_edges = self._get_layer_edges(layer_key)
        return NetworkFrame(layer_nodes, layer_edges)
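
A hypothetical usage sketch of this in-progress API (the layer key and tables
are illustrative):

    import pandas as pd

    from networkframe.multilayer import MultilayerNetworkFrame

    nodes = pd.DataFrame(index=[0, 1])
    edges = pd.DataFrame({"source": [0], "target": [1]})

    mnf = MultilayerNetworkFrame()
    mnf.add_layer(nodes, edges, layer_key="anatomical")
    frame = mnf.layer("anatomical")  # a NetworkFrame for just that layer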
77 changes: 37 additions & 40 deletions networkframe/networkframe.py
Expand Up @@ -214,7 +214,7 @@ def reindex_nodes(self, index: pd.Index) -> Self:
:
A new NetworkFrame with the reindexed nodes.
"""
nodes = self.nodes.reindex(index=index, axis=0)
nodes = self.nodes.reindex(index=index)
edges = self.edges.query("(source in @nodes.index) & (target in @nodes.index)")
out: NetworkFrame = self._return(nodes=nodes, edges=edges, inplace=False) # type: ignore
return out # type: ignore
@@ -1219,19 +1219,24 @@ def k_hop_neighborhood(
select_indices
return self.query_nodes("index in @select_indices", local_dict=locals())

def k_hop_decomposition(
self, k: int, directed: bool = False, verbose=False
) -> pd.Series:
def k_hop_mask(self, k: int, directed: bool = False, verbose=False):
"""
Return the k-hop decomposition of the network.
This function returns a series of NetworkFrames, each representing the k-hop
neighborhood of a node.
        Return boolean masks for the k-hop neighborhoods of all nodes.

        Parameters
        ----------
        k
            The number of hops to consider.
        directed
            Whether to consider the network as directed for computing the reachable
            nodes.
        verbose
            Whether to print progress information.

        Returns
        -------
        :
            A boolean matrix in which entry ``(i, j)`` is True if node ``j`` is
            within k hops of node ``i``.
        """
if k < 0:
raise ValueError("k must be non-negative.")
@@ -1242,6 +1247,23 @@ def k_hop_decomposition(
# bi-directional edges
dists = dijkstra(sparse_adjacency, directed=directed, limit=k, unweighted=True)
mask = ~np.isinf(dists)
return mask
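
For reference, a short sketch of consuming this mask (assuming an existing
NetworkFrame called nf):

    mask = nf.k_hop_mask(k=2)
    # rows index the source node, columns the reachable nodes
    two_hop_neighbors_of_first_node = nf.nodes.index[mask[0]]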

def k_hop_decomposition(
self, k: int, directed: bool = False, verbose=False
) -> pd.Series:
"""
        Return the k-hop decomposition of the network.

        This function returns a series of NetworkFrames, each representing the
        k-hop neighborhood of a node.

        Parameters
        ----------
        k
            The number of hops to consider.
        directed
            Whether to consider the network as directed for computing the reachable
            nodes.
        verbose
            Whether to print progress information.

        Returns
        -------
        :
            A Series of NetworkFrames, indexed by node ID.
"""
mask = self.k_hop_mask(k, directed=directed, verbose=verbose)

out = {}
for iloc in tqdm(range(len(self.nodes)), disable=not verbose):
@@ -1299,8 +1321,7 @@ def k_hop_aggregation(
columns will be the aggregated features (with f'_neighbor_{agg}' appended
to each column name).
"""
if k < 0:
raise ValueError("k must be non-negative.")


if isinstance(aggregations, str):
aggregations = [aggregations]
@@ -1318,36 +1339,10 @@
if drop_non_numeric:
nodes = nodes.select_dtypes(include=[np.number])

sparse_adjacency = self.to_sparse_adjacency()

# TODO add a check for interaction of directed and whether the graph has any
# bi-directional edges
dists = dijkstra(sparse_adjacency, directed=directed, limit=k, unweighted=True)
mask = ~np.isinf(dists)

# def _agg_neighborhood(iloc):
# node = self.nodes.index[iloc]
# select_nodes = self.nodes.loc[mask[iloc]]
# if drop_self_in_neighborhood:
# select_nodes = select_nodes.drop(index=node)
# agg_neighbor_features = select_nodes.agg(aggregations)
# if isinstance(agg_neighbor_features, pd.Series):
# agg_neighbor_features.index = agg_neighbor_features.index.map(
# lambda x: f"{x}_neighbor_{aggregations[0]}"
# )
# elif isinstance(agg_neighbor_features, pd.DataFrame):
# agg_neighbor_features = agg_neighbor_features.unstack()
# agg_neighbor_features.index = agg_neighbor_features.index.map(
# lambda x: f"{x[0]}_neighbor_{x[1]}"
# )
# agg_neighbor_features.name = node
# return agg_neighbor_features

# with tqdm_joblib(total=len(self.nodes)) as progress_bar:
# rows = Parallel(n_jobs=n_jobs)(
# delayed(_agg_neighborhood)(iloc) for iloc in range(len(self.nodes))
# )
mask = self.k_hop_mask(k, directed=directed, verbose=verbose)

if verbose > 0:
print("Aggregating over neighborhoods")
if engine == "pandas":
rows = []
for iloc in tqdm(range(len(self.nodes)), disable=not verbose):
@@ -1433,7 +1428,9 @@ def k_hop_aggregation(

# this is a node by feature matrix of the variances for each feature
# in that node's neighborhood
variances = (first_term - second_term) / (divisor_matrix - 1)
new_divisor_matrix = divisor_matrix - 1
new_divisor_matrix[new_divisor_matrix == 0] = 1
variances = (first_term - second_term) / new_divisor_matrix
variances[variances < 0] = 0

neighborhood_std_matrix = np.sqrt(variances)
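
The "computing shifted data" variance used above can be sanity-checked in plain
numpy (a standalone sketch, independent of the commit):

    import numpy as np

    x = np.array([10.0, 12.0, 23.0, 23.0, 16.0])
    K = x.mean()  # any constant works; the mean aids numerical stability
    shifted = x - K
    n = len(x)
    var = (np.sum(shifted**2) - np.sum(shifted) ** 2 / n) / (n - 1)
    assert np.isclose(var, np.var(x, ddof=1))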
