-
Notifications
You must be signed in to change notification settings - Fork 25
refactor(summarize_networks): use col names directly #385
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
f5f7542
bc6aae8
6451a11
71486e5
2af38a0
6320b72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,12 +1,13 @@ | ||
| from collections import OrderedDict | ||
| from pathlib import Path | ||
| from statistics import median | ||
| from typing import Iterable | ||
| from typing import Any, Iterable | ||
|
|
||
| import networkx as nx | ||
| import pandas as pd | ||
|
|
||
|
|
||
| def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict], | ||
| def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict[str, dict]], | ||
| algo_with_params: list) -> pd.DataFrame: | ||
| """ | ||
| Generate a table that aggregates summary information about networks in file_paths, including which nodes are present | ||
|
|
@@ -33,34 +34,44 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
| nodes_by_col.append(set(node_table.loc[node_table[col] > 0, 'NODEID'])) | ||
|
|
||
| # Initialize list to store network summary data | ||
| nw_info = [] | ||
| nw_info: list[list[Any]] = [] | ||
|
|
||
| algo_with_params = sorted(algo_with_params) | ||
|
|
||
| # Iterate through each network file path | ||
| for index, file_path in enumerate(sorted(file_paths)): | ||
| with open(file_path, 'r') as f: | ||
| lines = f.readlines()[1:] # skip the header line | ||
| lines = f.readlines()[1:] # as this is an output edge file, we skip the header line | ||
|
|
||
| # directed or mixed graphs are parsed and summarized as an undirected graph | ||
| nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) | ||
|
|
||
| # Save the network name, number of nodes, number edges, and number of connected components | ||
| nw_name = str(file_path) | ||
| number_nodes = nw.number_of_nodes() | ||
| number_edges = nw.number_of_edges() | ||
| ncc = nx.number_connected_components(nw) | ||
| # Begin collecting data into a dictionary. | ||
| # (We use dictionaries over list to make ) | ||
tristan-f-r marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| data: OrderedDict[str, Any] = OrderedDict() | ||
|
|
||
| number_of_nodes = nw.number_of_nodes() | ||
| data.update([ | ||
| ('Name', str(file_path)), | ||
| ('Number of nodes', number_of_nodes), | ||
| ('Number of edges', nw.number_of_edges()), | ||
| ('Number of connected components', nx.number_connected_components(nw)), | ||
| ]) | ||
|
|
||
| # Save the max/median degree, average clustering coefficient, and density | ||
| if number_nodes == 0: | ||
| max_degree = 0 | ||
| median_degree = 0.0 | ||
| density = 0.0 | ||
| if number_of_nodes == 0: | ||
| data.update([ | ||
| ('Density', 0.0), | ||
| ('Max degree', 0), | ||
| ('Median degree', 0.0), | ||
| ]) | ||
| else: | ||
| degrees = [deg for _, deg in nw.degree()] | ||
| max_degree = max(degrees) | ||
| median_degree = median(degrees) | ||
| density = nx.density(nw) | ||
| data.update([ | ||
| ('Density', nx.density(nw)), | ||
| ('Max degree', max(degrees, default=0)), | ||
| ('Median degree', float(median(degrees or [0]))), # float to have output stay consistent | ||
ntalluri marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ]) | ||
|
|
||
| cc = list(nx.connected_components(nw)) | ||
| # Save the max diameter | ||
|
|
@@ -69,27 +80,26 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
| nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0 | ||
| for c in cc | ||
| ] | ||
| max_diameter = max(diameters, default=0) | ||
| data['Max diameter'] = max(diameters, default=0) | ||
|
|
||
| # Save the average path lengths | ||
| # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) | ||
| avg_path_lengths = [ | ||
| nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0 | ||
| for c in cc | ||
| ] | ||
|
|
||
| if len(avg_path_lengths) != 0: | ||
| avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) | ||
| data['Average path length'] = sum(avg_path_lengths) / len(avg_path_lengths) | ||
| else: | ||
| avg_path_len = 0.0 | ||
| data['Average path length'] = 0.0 | ||
|
|
||
| # Initialize list to store current network information | ||
| cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len] | ||
| current_network_info = list(data.values()) | ||
|
|
||
| # Iterate through each node property and save the intersection with the current network | ||
| for node_list in nodes_by_col: | ||
| num_nodes = len(set(nw).intersection(node_list)) | ||
| cur_nw_info.append(num_nodes) | ||
| current_network_info.append(num_nodes) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to make this a key value pair too using nodes_by_col_labs? |
||
|
|
||
| # String split to access algorithm and hashcode: <algorithm>-params-<params_hash> | ||
| parts = algo_with_params[index].split('-') | ||
|
|
@@ -98,24 +108,23 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg | |
|
|
||
| # Algorithm parameters have format { algo : { hashcode : { parameter combos } } } | ||
| param_combo = algo_params[algo][hashcode] | ||
| cur_nw_info.append(param_combo) | ||
| current_network_info.append(param_combo) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same with param_combo? (making it a key value pair) |
||
|
|
||
| # Save the current network information to the network summary list | ||
| nw_info.append(cur_nw_info) | ||
| nw_info.append(current_network_info) | ||
|
|
||
| # Prepare column names | ||
| col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length'] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need col_names? Could we get that directly from the keys now?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could. Should we? It's nice to have an explicit list of all of the columns - the column duplication may help here.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was under the assumption that part of this PR was to get rid of the col_names part, otherwise I don't fully understand why this is being changed to a dict.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh - it's just to remove the
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still don't get the point of this PR. Why do we not want to use cur_nw_info? You mentioned it was bad in the other PR, but there was no reasoning as to why it's bad.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll close this PR since I don't want to push this change through if it isn't straightforward, since it's a small refactor after all 👍 My reasoning for not using |
||
| col_names.extend(nodes_by_col_labs) | ||
| col_names.append('Parameter combination') | ||
|
|
||
| # Convert the network summary data to pandas dataframe | ||
| # Could refactor to create the dataframe line by line instead of storing data as lists and then converting | ||
| nw_info = pd.DataFrame( | ||
| nw_df = pd.DataFrame( | ||
| nw_info, | ||
| columns=col_names | ||
| ) | ||
|
|
||
| return nw_info | ||
| return nw_df | ||
|
|
||
|
|
||
| def degree(g): | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.