From 2fe3178427fd64380fe7ac3d744d3fe7943e31c6 Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Fri, 6 Jun 2025 19:04:21 +0000
Subject: [PATCH 1/7] changes to pyproject.toml

---
 pyproject.toml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8f433f3..8508214 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,14 +4,19 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "graph_split"
-version = "0.3.1"
+version = "0.3.2"
 description = "A package to split edges of graphs using different criteria compatible with machine learning model training."
 readme = "README.md"
 license = "GPL-3.0-only"
 authors = [
   { name = "Nure Tasnina", email = "tasnina@vt.edu" }
 ]
-requires-python = ">=3.9"
+dependencies = [
+  "pandas",
+  "numpy",
+  "sklearn"
+]
+requires-python = ">=3.7"
 
 [project.urls]
 Homepage = "https://github.com/Murali-group/graph-split"

From 2a67faf6fe26969f6e113a791306cecca739b317 Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Fri, 6 Jun 2025 20:10:46 +0000
Subject: [PATCH 2/7] negative_samples consolidation

---
 graph_split/split_script.py | 46 +++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/graph_split/split_script.py b/graph_split/split_script.py
index 8a774b2..6c5800d 100644
--- a/graph_split/split_script.py
+++ b/graph_split/split_script.py
@@ -356,38 +356,44 @@ def split_cv(df, split_type, n_folds, seed=None):
         verify_split(df, train_idx[i], test_idx[i], split_type)
     return train_idx, test_idx
 
-def generate_negative_samples(df, graph_type='directed', anchor='source', seed=None):
+def generate_negative_samples(edges, random_samples, duplicates=True, seed=None):
     '''
+    For any positive edge **(a, b)**, create a negative edge **(a, c)** such that (a, c) was not present in the set of positive edges.
     Parameters:
-        graph_type: 'undirected', 'directed'
-        anchor: 'source', 'target', 'both'
-
-    If graph_type=='directed' and anchor=='source':
-        - without edge_type: For any positive edge (a, b) create a negative edge (a, c) such that (a, c) was not present in the set of positive edges.
+        edges: Pandas DataFrame containing two columns where each row is the edge (a, b).
+        random_samples: Pandas Series of candidate labels for c, or None to draw c from the unique b values in edges.
+        duplicates: True to sample c with replacement, so the same edge (a, c) may appear more than once; False to sample without replacement.
+        seed: Random generation seed for reproducibility of samples.
+    :return: DataFrame containing negative edges (a, c)
     '''
 
     #works for directed, anchor based graph without edgetype.
     if df.shape[1]>2:
-        exit('Error: Not implemented for extra information on edges except for source and target. ')
+        raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.')
+    df = edges.rename({'source', 'target'})
+    init_sample_space = []
+    if (random_samples == None):
+        init_sample_space = set(df['target'].unique())
+    else:
+        init_sample_space = set(random_samples.unique())
+        
+    source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index()
+    source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x)))
 
     all_sampled_sources = []
     all_sampled_targets = []
-
-    if (graph_type=='directed') and (anchor=='source'):
-        init_sample_space = set(df['target'].unique())
-
-        source_wise_targets = df.groupby('source').agg(target_list= ('target', lambda x:set(x)), count = ('target', 'size')).reset_index()
-        source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x)))
-
-        for i, row in source_wise_targets.iterrows():
+    for i, row in source_wise_targets.iterrows():
+        sample_count = 0
+        if (duplicates == False):
             sample_count = min(row['count'], len(row['target_list']))
             all_sampled_targets.extend(list(random.Random(seed).sample(row['target_list'],sample_count)))
-            all_sampled_sources.extend([row['source']]*sample_count)
-        negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets})
+        else:
+            sample_count = row['count']
+            all_sampled_targets.extend(list(random.Random(seed).choices(row['target_list'], k=sample_count)))
+        all_sampled_sources.extend([row['source']]*sample_count)
+    negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets})
 
-        return negative_df
-    else:
-        exit('Error: current code only works for graph_type= directed, anchor=source. ')
+    return negative_df
 
 
 

From b155611f9fbb7ff772637f0f1cee8e2edc60abde Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Fri, 6 Jun 2025 20:45:46 +0000
Subject: [PATCH 3/7] fixes after testing to generate_negative_samples

---
 graph_split/split_script.py | 14 ++++++++------
 pyproject.toml              |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/graph_split/split_script.py b/graph_split/split_script.py
index 6c5800d..004c0b2 100644
--- a/graph_split/split_script.py
+++ b/graph_split/split_script.py
@@ -368,31 +368,33 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
     '''
 
     #works for directed, anchor based graph without edgetype.
-    if df.shape[1]>2:
+    if edges.shape[1]>2:
         raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.')
-    df = edges.rename({'source', 'target'})
+    df = edges.set_axis(['source', 'target'], axis=1)
     init_sample_space = []
-    if (random_samples == None):
+    if (random_samples is None):
         init_sample_space = set(df['target'].unique())
     else:
         init_sample_space = set(random_samples.unique())
         
     source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index()
     source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x)))
+    print(source_wise_targets)
 
     all_sampled_sources = []
     all_sampled_targets = []
+    randomChanger = 0
     for i, row in source_wise_targets.iterrows():
         sample_count = 0
         if (duplicates == False):
             sample_count = min(row['count'], len(row['target_list']))
-            all_sampled_targets.extend(list(random.Random(seed).sample(row['target_list'],sample_count)))
+            all_sampled_targets.extend(list(random.Random(seed + randomChanger).sample(row['target_list'],sample_count)))
         else:
             sample_count = row['count']
-            all_sampled_targets.extend(list(random.Random(seed).choices(row['target_list'], k=sample_count)))
+            all_sampled_targets.extend(list(random.Random(seed + randomChanger).choices(row['target_list'], k=sample_count)))
         all_sampled_sources.extend([row['source']]*sample_count)
+        randomChanger += 1
     negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets})
-
     return negative_df
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 8508214..a0ac0ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ authors = [
 dependencies = [
   "pandas",
   "numpy",
-  "sklearn"
+  "scikit-learn"
 ]
 requires-python = ">=3.7"
 

From fda0314cb8229ec1afed8fab1ff1c71f23ddcafc Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Tue, 10 Jun 2025 15:14:46 +0000
Subject: [PATCH 4/7] remove leftover debug print from generate_negative_samples

---
 graph_split/split_script.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/graph_split/split_script.py b/graph_split/split_script.py
index 004c0b2..3fed568 100644
--- a/graph_split/split_script.py
+++ b/graph_split/split_script.py
@@ -379,7 +379,6 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
         
     source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index()
     source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x)))
-    print(source_wise_targets)
 
     all_sampled_sources = []
     all_sampled_targets = []

From 0c4fe8cfc10632241def0bd68364714d898d2e6f Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Thu, 19 Jun 2025 19:33:52 +0000
Subject: [PATCH 5/7] seed the random module once instead of per-row Random(seed + offset)

---
 graph_split/split_script.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/graph_split/split_script.py b/graph_split/split_script.py
index 3fed568..bbe6c6b 100644
--- a/graph_split/split_script.py
+++ b/graph_split/split_script.py
@@ -368,32 +368,31 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
     '''
 
     #works for directed, anchor based graph without edgetype.
+    random.seed(seed)
     if edges.shape[1]>2:
         raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.')
-    df = edges.set_axis(['source', 'target'], axis=1)
+    df = edges.set_axis(['source', 'target'], axis=1, inplace=False)
     init_sample_space = []
     if (random_samples is None):
         init_sample_space = set(df['target'].unique())
     else:
         init_sample_space = set(random_samples.unique())
-        
+    
     source_wise_targets = df.groupby('source')['target'].agg([('target_list', lambda x:set(x)), ('count', 'size')]).reset_index()
     source_wise_targets['target_list'] = source_wise_targets['target_list'].apply(lambda x: sorted(init_sample_space.difference(x)))
 
     all_sampled_sources = []
     all_sampled_targets = []
-    randomChanger = 0
     for i, row in source_wise_targets.iterrows():
         sample_count = 0
         if (duplicates == False):
             sample_count = min(row['count'], len(row['target_list']))
-            all_sampled_targets.extend(list(random.Random(seed + randomChanger).sample(row['target_list'],sample_count)))
+            all_sampled_targets.extend(list(random.sample(row['target_list'],sample_count)))
         else:
             sample_count = row['count']
-            all_sampled_targets.extend(list(random.Random(seed + randomChanger).choices(row['target_list'], k=sample_count)))
+            all_sampled_targets.extend(list(random.choices(row['target_list'], k=sample_count)))
         all_sampled_sources.extend([row['source']]*sample_count)
-        randomChanger += 1
-    negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets})
+    negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) 
     return negative_df
 
 

From 5cfae3ab1554d1a9614226db581e92a36dd90703 Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Mon, 23 Jun 2025 14:24:06 +0000
Subject: [PATCH 6/7] changes rng seed to randomstate

---
 graph_split/split_script.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/graph_split/split_script.py b/graph_split/split_script.py
index bbe6c6b..d10d5be 100644
--- a/graph_split/split_script.py
+++ b/graph_split/split_script.py
@@ -368,7 +368,7 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
     '''
 
     #works for directed, anchor based graph without edgetype.
-    random.seed(seed)
+    random = np.random.RandomState(seed)
     if edges.shape[1]>2:
         raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.')
     df = edges.set_axis(['source', 'target'], axis=1, inplace=False)
@@ -387,10 +387,10 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
         sample_count = 0
         if (duplicates == False):
             sample_count = min(row['count'], len(row['target_list']))
-            all_sampled_targets.extend(list(random.sample(row['target_list'],sample_count)))
+            all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=False))
         else:
             sample_count = row['count']
-            all_sampled_targets.extend(list(random.choices(row['target_list'], k=sample_count)))
+            all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=True))
         all_sampled_sources.extend([row['source']]*sample_count)
     negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) 
     return negative_df

From 0832b54e39566e53f88a5fc64bbd6eda8224e1a9 Mon Sep 17 00:00:00 2001
From: yutuyt01 <timwilson@vt.edu>
Date: Mon, 30 Jun 2025 15:10:11 +0000
Subject: [PATCH 7/7] change variable name for clarity (minor change)

---
 graph_split/split_script.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/graph_split/split_script.py b/graph_split/split_script.py
index d10d5be..81462c0 100644
--- a/graph_split/split_script.py
+++ b/graph_split/split_script.py
@@ -368,7 +368,7 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
     '''
 
     #works for directed, anchor based graph without edgetype.
-    random = np.random.RandomState(seed)
+    randomState = np.random.RandomState(seed)
     if edges.shape[1]>2:
         raise ValueError('Too many columns! Ensure edges only contain columns for an edge (a, b) and no other information.')
     df = edges.set_axis(['source', 'target'], axis=1, inplace=False)
@@ -387,10 +387,10 @@ def generate_negative_samples(edges, random_samples, duplicates=True, seed=None)
         sample_count = 0
         if (duplicates == False):
             sample_count = min(row['count'], len(row['target_list']))
-            all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=False))
+            all_sampled_targets.extend(randomState.choice(row['target_list'], size=sample_count, replace=False))
         else:
             sample_count = row['count']
-            all_sampled_targets.extend(random.choice(row['target_list'], size=sample_count, replace=True))
+            all_sampled_targets.extend(randomState.choice(row['target_list'], size=sample_count, replace=True))
         all_sampled_sources.extend([row['source']]*sample_count)
     negative_df = pd.DataFrame({'source': all_sampled_sources, 'target': all_sampled_targets}) 
     return negative_df