From 2fe3178427fd64380fe7ac3d744d3fe7943e31c6 Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Fri, 6 Jun 2025 19:04:21 +0000 Subject: [PATCH 1/7] changes to pyproject.toml --- pyproject.toml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8f433f3..8508214 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,19 @@ build-backend = "setuptools.build_meta" [project] name = "graph_split" -version = "0.3.1" +version = "0.3.2" description = "A package to split edges of graphs using different criteria compatible with machine learning model training." readme = "README.md" license = "GPL-3.0-only" authors = [ { name = "Nure Tasnina", email = "tasnina@vt.edu" } ] -requires-python = ">=3.9" +dependencies = [ + "pandas", + "numpy", + "sklearn" +] +requires-python = ">=3.7" [project.urls] Homepage = "https://github.com/Murali-group/graph-split" From 2a67faf6fe26969f6e113a791306cecca739b317 Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Fri, 6 Jun 2025 20:10:46 +0000 Subject: [PATCH 2/7] negative_samples consolidation --- graph_split/split_script.py | 46 +++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/graph_split/split_script.py b/graph_split/split_script.py index 8a774b2..6c5800d 100644 --- a/graph_split/split_script.py +++ b/graph_split/split_script.py @@ -356,38 +356,44 @@ def split_cv(df, split_type, n_folds, seed=None): verify_split(df, train_idx[i], test_idx[i], split_type) return train_idx, test_idx -def generate_negative_samples(df, graph_type='directed', anchor='source', seed=None): +def generate_negative_samples(edges, random_samples, duplicates=True, seed=None): ''' + For any positive edge **(a, b)**, create a negative edge **(a, c)** such that (a, c) was not present in the set of positive edges. Parameters: - graph_type: 'undirected', 'directed' - anchor: 'source', 'target', 'both' - - If graph_type=='directed' and anchor=='source': - - without edge_type: For any positive edge (a, b) create a negative edge (a, c) such that (a, c) was not present in the set of positive edges. + edges: Pandas DataFrame containing two columns where each row is the edge (a, b). + random_samples: Pandas series containing list of possible labels for c. None if c should be sampled from the set of b. + duplicates: True if the edge (a, c) should be able to appear more than once. + seed: Random generation seed for reproducibility of samples. + :return: DataFrame containing negative edges (a, c) ''' #works for directed, anchor based graph without edgetype. if df.shape[1]>2: - exit('Error: Not implemented for extra information on edges except for source and target. ') + raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.') + df = edges.rename({'source', 'target'}) + init_sample_space = [] + if (random_samples == None): + init_sample_space = set(df['target'].unique()) + else: + init_sample_space = set(random_samples.unique()) + + source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index() + source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x))) all_sampled_sources = [] all_sampled_targets = [] - - if (graph_type=='directed') and (anchor=='source'): - init_sample_space = set(df['target'].unique()) - - source_wise_targets = df.groupby('source').agg(target_list= ('target', lambda x:set(x)), count = ('target', 'size')).reset_index() - source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x))) - - for i, row in source_wise_targets.iterrows(): + for i, row in source_wise_targets.iterrows(): + sample_count = 0 + if (duplicates == False): sample_count = min(row['count'], len(row['target_list'])) all_sampled_targets.extend(list(random.Random(seed).sample(row['target_list'],sample_count))) - all_sampled_sources.extend([row['source']]*sample_count) - negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) + else: + sample_count = row['count'] + all_sampled_targets.extend(list(random.Random(seed).choices(row['target_list'], k=sample_count))) + all_sampled_sources.extend([row['source']]*sample_count) + negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) - return negative_df - else: - exit('Error: current code only works for graph_type= directed, anchor=source. ') + return negative_df From b155611f9fbb7ff772637f0f1cee8e2edc60abde Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Fri, 6 Jun 2025 20:45:46 +0000 Subject: [PATCH 3/7] fixes after testing to generate_negative_samples --- graph_split/split_script.py | 14 ++++++++------ pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/graph_split/split_script.py b/graph_split/split_script.py index 6c5800d..004c0b2 100644 --- a/graph_split/split_script.py +++ b/graph_split/split_script.py @@ -368,31 +368,33 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) ''' #works for directed, anchor based graph without edgetype. - if df.shape[1]>2: + if edges.shape[1]>2: raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.') - df = edges.rename({'source', 'target'}) + df = edges.set_axis(['source', 'target'], axis=1) init_sample_space = [] - if (random_samples == None): + if (random_samples is None): init_sample_space = set(df['target'].unique()) else: init_sample_space = set(random_samples.unique()) source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index() source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x))) + print(source_wise_targets) all_sampled_sources = [] all_sampled_targets = [] + randomChanger = 0 for i, row in source_wise_targets.iterrows(): sample_count = 0 if (duplicates == False): sample_count = min(row['count'], len(row['target_list'])) - all_sampled_targets.extend(list(random.Random(seed).sample(row['target_list'],sample_count))) + all_sampled_targets.extend(list(random.Random(seed + randomChanger).sample(row['target_list'],sample_count))) else: sample_count = row['count'] - all_sampled_targets.extend(list(random.Random(seed).choices(row['target_list'], k=sample_count))) + all_sampled_targets.extend(list(random.Random(seed + randomChanger).choices(row['target_list'], k=sample_count))) all_sampled_sources.extend([row['source']]*sample_count) + randomChanger += 1 negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) - return negative_df diff --git a/pyproject.toml b/pyproject.toml index 8508214..a0ac0ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ authors = [ dependencies = [ "pandas", "numpy", - "sklearn" + "scikit-learn" ] requires-python = ">=3.7" From fda0314cb8229ec1afed8fab1ff1c71f23ddcafc Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Tue, 10 Jun 2025 15:14:46 +0000 Subject: [PATCH 4/7] whoops --- graph_split/split_script.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graph_split/split_script.py b/graph_split/split_script.py index 004c0b2..3fed568 100644 --- a/graph_split/split_script.py +++ b/graph_split/split_script.py @@ -379,7 +379,6 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index() source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x))) - print(source_wise_targets) all_sampled_sources = [] all_sampled_targets = [] From 0c4fe8cfc10632241def0bd68364714d898d2e6f Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Thu, 19 Jun 2025 19:33:52 +0000 Subject: [PATCH 5/7] changes to random --- graph_split/split_script.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/graph_split/split_script.py b/graph_split/split_script.py index 3fed568..bbe6c6b 100644 --- a/graph_split/split_script.py +++ b/graph_split/split_script.py @@ -368,32 +368,31 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) ''' #works for directed, anchor based graph without edgetype. + random.seed(seed) if edges.shape[1]>2: raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.') - df = edges.set_axis(['source', 'target'], axis=1) + df = edges.set_axis(['source', 'target'], axis=1, inplace=False) init_sample_space = [] if (random_samples is None): init_sample_space = set(df['target'].unique()) else: init_sample_space = set(random_samples.unique()) - + source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index() source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x))) all_sampled_sources = [] all_sampled_targets = [] - randomChanger = 0 for i, row in source_wise_targets.iterrows(): sample_count = 0 if (duplicates == False): sample_count = min(row['count'], len(row['target_list'])) - all_sampled_targets.extend(list(random.Random(seed + randomChanger).sample(row['target_list'],sample_count))) + all_sampled_targets.extend(list(random.sample(row['target_list'],sample_count))) else: sample_count = row['count'] - all_sampled_targets.extend(list(random.Random(seed + randomChanger).choices(row['target_list'], k=sample_count))) + all_sampled_targets.extend(list(random.choices(row['target_list'], k=sample_count))) all_sampled_sources.extend([row['source']]*sample_count) - randomChanger += 1 - negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) + negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) return negative_df From 5cfae3ab1554d1a9614226db581e92a36dd90703 Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Mon, 23 Jun 2025 14:24:06 +0000 Subject: [PATCH 6/7] changes rng seed to randomstate --- graph_split/split_script.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graph_split/split_script.py b/graph_split/split_script.py index bbe6c6b..d10d5be 100644 --- a/graph_split/split_script.py +++ b/graph_split/split_script.py @@ -368,7 +368,7 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) ''' #works for directed, anchor based graph without edgetype. - random.seed(seed) + random = np.random.RandomState(seed) if edges.shape[1]>2: raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.') df = edges.set_axis(['source', 'target'], axis=1, inplace=False) @@ -387,10 +387,10 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) sample_count = 0 if (duplicates == False): sample_count = min(row['count'], len(row['target_list'])) - all_sampled_targets.extend(list(random.sample(row['target_list'],sample_count))) + all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=False)) else: sample_count = row['count'] - all_sampled_targets.extend(list(random.choices(row['target_list'], k=sample_count))) + all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=True)) all_sampled_sources.extend([row['source']]*sample_count) negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) return negative_df From 0832b54e39566e53f88a5fc64bbd6eda8224e1a9 Mon Sep 17 00:00:00 2001 From: yutuyt01 Date: Mon, 30 Jun 2025 15:10:11 +0000 Subject: [PATCH 7/7] change variable name for clarity (minor change) --- graph_split/split_script.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graph_split/split_script.py b/graph_split/split_script.py index d10d5be..81462c0 100644 --- a/graph_split/split_script.py +++ b/graph_split/split_script.py @@ -368,7 +368,7 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) ''' #works for directed, anchor based graph without edgetype. - random = np.random.RandomState(seed) + randomState = np.random.RandomState(seed) if edges.shape[1]>2: raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.') df = edges.set_axis(['source', 'target'], axis=1, inplace=False) @@ -387,10 +387,10 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None) sample_count = 0 if (duplicates == False): sample_count = min(row['count'], len(row['target_list'])) - all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=False)) + all_sampled_targets.extend(randomState.choice(row['target_list'], size=sample_count, replace=False)) else: sample_count = row['count'] - all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=True)) + all_sampled_targets.extend(randomState.choice(row['target_list'], size=sample_count, replace=True)) all_sampled_sources.extend([row['source']]*sample_count) negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) return negative_df