Skip to content

Commit 7e3f917

Browse files
Updated filtering for identity (#1113)
* Update filtering script to include new patterns and args * Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * Update scripts/data/filtering_and_updates/filter_dataset_by_keywords.py * Refine filtering patterns and dataset processing Updated regex patterns for filtering AI model mentions and improved dataset filtering logic. * Update filter_dataset_by_keywords.py * Optimize dataset filtering and loading processes * Refactor dataset loading and filtering processes --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent bac9393 commit 7e3f917

File tree

1 file changed

+78
-73
lines changed

1 file changed

+78
-73
lines changed

scripts/data/filtering_and_updates/filter_dataset_by_keywords.py

Lines changed: 78 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -16,65 +16,100 @@
1616
Motivated by: realizing the SFT mix has lots of "I am DeepSeek" snippets.
1717
1818
Run with:
19-
python scripts/data/sft/filter_dataset_by_keywords.py --input-dataset allenai/tulu-3-sft-mixture --column messages
19+
python scripts/data/filtering_and_updates/filter_dataset_by_keywords.py --input-dataset allenai/tulu-3-sft-mixture --column messages
2020
"""
2121

22+
import os
23+
os.environ['HF_DATASETS_DISABLE_CACHING'] = '1'
24+
25+
from datasets import disable_caching
26+
disable_caching()
27+
28+
2229

2330
# Popular model providers
2431
PROVIDERS = [
25-
"OpenAI", "Open AI", "Claude", "Gemini", "Qwen", "DeepSeek", "Anthropic", "Meta AI", "Meta's",
32+
"OpenAI", "Open AI", "Claude", "Gemini", "Qwen", "DeepSeek", "Anthropic", "Meta AI", "Meta's", "ChatGPT",
2633
"Cohere", "Mistral AI", "Mistral's", "xAI", "Perplexity" # "Google AI", "Google's", "Microsoft", "HuggingFace", "Hugging Face"
2734
]
2835

36+
# Regex patterns for filtering (case-insensitive for common words, case-sensitive for company names)
2937
# Regex patterns for filtering (case-insensitive for common words, case-sensitive for company names)
3038
PATTERNS = [
3139
# Pattern: "I'm [model name], an AI assistant made by {provider}"
32-
r"(?i)i'?m\s+(" + "|".join(PROVIDERS) + r"),?\s+an?\s+ai\s+(?:assistant|model)[^.!?]*?(?:made|developed|created|trained)\s+by\s+(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
40+
# Kept full range, removed optional grouping that was too restrictive
41+
r"(?i)\bI'?m\s+(" + "|".join(PROVIDERS) + r"),?\s+an?\s+AI\s+(?:assistant|model)[^.!?]{0,100}(?:made|developed|created|trained)\s+by\s+(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
3342

3443
# Pattern: "[Model name] is an AI assistant developed by {provider}"
35-
r"(?i)(" + "|".join(PROVIDERS) + r")\s+is\s+an?\s+ai\s+(?:assistant|model)[^.!?]*?(?:developed|created|made|trained)\s+by\s+(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
44+
# Restored full pattern
45+
r"(?i)\b(" + "|".join(PROVIDERS) + r")\s+is\s+an?\s+AI\s+(?:assistant|model)[^.!?]{0,100}(?:developed|created|made|trained)\s+by\s+(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
3646

3747
# Pattern: "as a [AI model/assistant/chatbot] ... {provider}"
38-
r"(?i)as\s+a\s+(?:language\s+model|ai\s+model|assistant|chatbot|model)[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
48+
# Kept greedy to match more
49+
r"(?i)\bas\s+an?\s+(?:language\s+model|AI\s+model|assistant|chatbot|model)[^.!?]{0,100}\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
3950

4051
# Pattern: "as an AI developed by {provider}"
41-
r"(?i)as\s+an\s+ai\s+(?:developed|created|made|trained)\s+by\s+(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
52+
# Kept full range
53+
r"(?i)\bas\s+an\s+AI\s+(?:developed|created|made|trained)\s+by\s+(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
4254

4355
# Pattern: "I am [model type] ... {provider}"
44-
r"(?i)i\s+am\s+(?:a\s+)?(?:language\s+model|ai\s+model|assistant|chatbot|model)[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
56+
# Kept greedy for full matches
57+
r"(?i)\bI\s+am\s+(?:a\s+)?(?:language\s+model|AI\s+model|assistant|chatbot|model)[^.!?]{0,100}\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
4558

46-
# Pattern: "trained by ... {provider}" within one sentence
47-
r"(?i)trained\s+by\s+[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
59+
# Pattern: "I am called [provider]"
60+
r"(?i)\bI\s+am\s+called\s+\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
61+
62+
# Pattern: "I'm [provider]" or "I am [provider]"
63+
r"(?i)\b(?:I'?m|I\s+am)\s+\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
64+
65+
# Pattern: "trained by ... {provider}" within one sentence
66+
# Made middle section non-greedy but kept full ranges
67+
r"(?i)\btrained\s+by\s+[^.!?]{0,100}?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
4868

4969
# Pattern: "developed by ... {provider}" within one sentence
50-
r"(?i)developed\s+by\s+[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
70+
r"(?i)\bdeveloped\s+by\s+[^.!?]{0,100}?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
5171

5272
# Pattern: "created by ... {provider}" within one sentence
53-
r"(?i)created\s+by\s+[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
73+
r"(?i)\bcreated\s+by\s+[^.!?]{0,100}?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
5474

5575
# Pattern: "made by ... {provider}" within one sentence
56-
r"(?i)made\s+by\s+[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",
76+
r"(?i)\bmade\s+by\s+[^.!?]{0,100}?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]{0,100}[.!?]",
5777

5878
# Pattern: "against {provider}'s use-case policy" or similar policy references
59-
r"(?i)against\s+(" + "|".join(PROVIDERS) + r")(?:'s|'s)?\s+(?:use-case\s+)?(?:policy|policies|guidelines|terms)[^.!?]*?[.!?]",
79+
r"(?i)\bagainst\s+(" + "|".join(PROVIDERS) + r")(?:'s|’s)?\s+(?:use-case\s+)?(?:policy|policies|guidelines|terms)[^.!?]{0,100}[.!?]",
6080

6181
# Pattern: "{provider}'s policy" or "{provider}'s guidelines"
62-
r"(?i)\b(" + "|".join(PROVIDERS) + r")(?:'s|'s)\s+(?:policy|policies|guidelines|terms|use-case)[^.!?]*?[.!?]",
63-
]
82+
r"(?i)\b(" + "|".join(PROVIDERS) + r")(?:'s|’s)\s+(?:policy|policies|guidelines|terms|use-case)[^.!?]{0,100}[.!?]",
83+
84+
# Pattern: Any sentence containing "DeepSeek-R1" or "DeepSeek R1" (case-insensitive)
85+
# Less restrictive: bounded but allows more at the start
86+
# r"(?i)[^.!?]{0,250}?\bDeepSeek[\s-]?R1\b[^.!?]{0,100}[.!?]",
87+
r"(?i)[^.!?]{0,250}?\bDeepSeek\b[^.!?]{0,100}[.!?]",
6488

89+
# Pattern: Anything with the word "Qwen" (case-insensitive)
90+
# Less restrictive: bounded but allows more at the start
91+
r"(?i)[^.!?]{0,250}?\bQwen\b[^.!?]{0,100}[.!?]",
92+
93+
# Pattern: Any sentence containing "Alibaba Qwen" (case-insensitive) or Alibaba Cloud
94+
# Less restrictive: bounded but allows more at the start
95+
r"(?i)[^.!?]{0,250}?\bAlibaba\s+Qwen\b[^.!?]{0,100}[.!?]",
96+
r"(?i)[^.!?]{0,250}?\bAlibaba\s+Cloud\b[^.!?]{0,100}[.!?]",
97+
]
6598

66-
def should_be_filtered_by_advanced_patterns(example, verbose=False, filter_user_turns=False):
99+
def should_be_filtered_by_advanced_patterns(example, column="messages", verbose=False, filter_user_turns=False):
67100
"""Filter by more sophisticated patterns like 'as a ... OpenAI' or 'trained by ... Google'"""
68101

69-
for message in example["messages"]:
102+
for message in example[column]:
70103
# Skip user messages unless explicitly enabled
71104
if message["role"] == "user" and not filter_user_turns:
72105
continue
73106
if message["role"] != "assistant" and message["role"] != "user":
74107
continue
75108

76109
content = message["content"] # Keep original case
77-
110+
# empty content check
111+
if content is None:
112+
return True
78113
for pattern in PATTERNS:
79114
if re.search(pattern, content):
80115
if verbose:
@@ -87,9 +122,9 @@ def should_be_filtered_by_advanced_patterns(example, verbose=False, filter_user_
87122
return False
88123

89124

90-
def should_be_filtered_combined(example, verbose=False, filter_user_turns=False):
125+
def should_be_filtered_combined(example, column="messages", verbose=False, filter_user_turns=False):
91126
"""Combined filtering function"""
92-
return should_be_filtered_by_advanced_patterns(example, verbose, filter_user_turns)
127+
return should_be_filtered_by_advanced_patterns(example, column=column, verbose=verbose, filter_user_turns=filter_user_turns)
93128

94129
def load_dataset_from_parquet(dataset_name):
95130
"""Load dataset directly from parquet files."""
@@ -124,7 +159,9 @@ def main():
124159
parser.add_argument("--filter-user-turns", action="store_true",
125160
help="Also filter based on user messages (default: only filter assistant messages)")
126161
parser.add_argument("--output-entity", type=str, help="Output entity (org/user) for the filtered dataset. If not provided, uses the same entity as the input dataset.")
127-
162+
parser.add_argument("--column", type=str, default="messages",
163+
help="Column name containing the messages (default: messages)")
164+
128165
args = parser.parse_args()
129166

130167
input_dataset = args.input_dataset
@@ -161,66 +198,34 @@ def main():
161198
raise
162199

163200
print(f"Dataset loaded with {len(dataset)} examples")
164-
165-
# Keep track of filtered examples
166-
filtered_examples = []
167-
168-
# Filter function
169-
def filter_fn(example):
170-
should_filter = should_be_filtered_combined(example, verbose=True, filter_user_turns=filter_user_turns)
171-
if should_filter and len(filtered_examples) < 3:
172-
# Find which pattern matched and extract the matching text
173-
for message in example["messages"]:
174-
# Apply same filtering logic for finding matched text
175-
if message["role"] == "user" and not filter_user_turns:
176-
continue
177-
if message["role"] != "assistant" and message["role"] != "user":
178-
continue
179-
180-
content = message["content"] # Keep original case
181-
182-
for pattern in PATTERNS:
183-
match = re.search(pattern, content)
184-
if match:
185-
example["_matched_text"] = match.group(0)
186-
example["_matched_role"] = message["role"]
187-
break
188-
if "_matched_text" in example:
189-
break
190-
191-
filtered_examples.append(example)
192-
return not should_filter
193-
201+
194202
print("Filtering dataset...")
195-
filtered_dataset = dataset.filter(filter_fn)
203+
# First filter without debugging
204+
filtered_dataset = dataset.filter(
205+
lambda ex: not should_be_filtered_combined(ex, column=args.column, verbose=False, filter_user_turns=filter_user_turns),
206+
num_proc=open_instruct_utils.max_num_processes()
207+
)
196208
print(f"Filtered size: {len(filtered_dataset)}")
197209
print(f"Removed {len(dataset) - len(filtered_dataset)} examples")
198-
199-
# Show a few filtered examples
200-
if filtered_examples:
201-
print("\n--- Examples that were removed ---")
202-
for i, example in enumerate(filtered_examples):
203-
print("---------------------------------")
204-
print(f"\nExample {i+1}:")
205-
if "_matched_text" in example:
206-
role = example.get("_matched_role", "unknown")
207-
print(f" Matched text ({role}): '{example['_matched_text']}'")
208-
messages = example.get("messages", [])
209-
for msg in messages:
210-
if msg.get("role") == "user":
211-
content = msg.get("content", "")
212-
print(f" User: {content}")
213-
if msg.get("role") == "assistant":
214-
content = msg.get("content", "")
215-
print(f" Assistant: {content}")
210+
211+
# Then collect a few filtered examples in serial for inspection
212+
if len(dataset) > len(filtered_dataset):
213+
print("\nCollecting example filtered instances...")
214+
examples_found = 0
215+
print_within = min(1000, len(dataset))
216+
for example in dataset.select(range(print_within)):
217+
if should_be_filtered_combined(example, column=args.column, verbose=True, filter_user_turns=filter_user_turns):
218+
# Show the example
219+
examples_found += 1
220+
if examples_found >= 10:
216221
break
217-
print("--- End of examples ---\n")
222+
218223

219224
# Upload
220225
full_name = f"{output_dataset}"
221226
print(f"Uploading to: {full_name}")
222-
filtered_dataset.push_to_hub(full_name, private=True)
227+
filtered_dataset.push_to_hub(full_name, private=True, num_proc=open_instruct_utils.max_num_processes())
223228
print("Done!")
224229

225230
if __name__ == "__main__":
226-
main()
231+
main()

0 commit comments

Comments
 (0)