Skip to content
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions scripts/data/filtering_and_updates/filter_dataset_by_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
Motivated by: realizing the SFT mix has lots of "I am DeepSeek" snippets.

Run with:
python scripts/data/sft/filter_dataset_by_keywords.py --input-dataset allenai/tulu-3-sft-mixture --column messages
python scripts/data/filtering_and_updates/filter_dataset_by_keywords.py --input-dataset allenai/tulu-3-sft-mixture --column messages
"""


# Popular model providers whose names indicate a response was generated by (or
# claims to be) another lab's model. Each entry is interpolated into the regex
# alternations in PATTERNS below.
# NOTE: the trailing comma after "ChatGPT" is required — without it Python's
# implicit string concatenation fuses it with "Cohere" into a single
# "ChatGPTCohere" entry, and neither provider is ever matched.
PROVIDERS = [
    "OpenAI", "Open AI", "Claude", "Gemini", "Qwen", "DeepSeek", "Anthropic", "Meta AI", "Meta's", "ChatGPT",
    "Cohere", "Mistral AI", "Mistral's", "xAI", "Perplexity"  # "Google AI", "Google's", "Microsoft", "HuggingFace", "Hugging Face"
]

Expand All @@ -42,6 +42,12 @@
# Pattern: "I am [model type] ... {provider}"
r"(?i)i\s+am\s+(?:a\s+)?(?:language\s+model|ai\s+model|assistant|chatbot|model)[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",

# Pattern: "I am called [provider]"
r"(?i)i\s+am\s+called\s+\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",

# Pattern: "I'm [provider]" or "I am [provider]"
r"(?i)(?:i'?m|i\s+am)\s+\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",

# Pattern: "trained by ... {provider}" within one sentence
r"(?i)trained\s+by\s+[^.!?]*?\b(" + "|".join(PROVIDERS) + r")\b[^.!?]*?[.!?]",

Expand All @@ -59,21 +65,33 @@

# Pattern: "{provider}'s policy" or "{provider}'s guidelines"
r"(?i)\b(" + "|".join(PROVIDERS) + r")(?:'s|'s)\s+(?:policy|policies|guidelines|terms|use-case)[^.!?]*?[.!?]",

# Pattern: Any sentence containing "DeepSeek-R1" or "DeepSeek R1" (case-insensitive)
r"(?i)[^.!?]*\bDeepSeek[\s-]?R1\b[^.!?]*?[.!?]",

# Pattern: Anything with the word "Qwen" (case-insensitive)
r"(?i)[^.!?]*\bQwen\b[^.!?]*?[.!?]",

# Pattern: Any sentence containing "Alibaba Qwen" (case-insensitive) or Alibaba Cloud
r"(?i)[^.!?]*\bAlibaba\s+Qwen\b[^.!?]*?[.!?]",
r"(?i)[^.!?]*\bAlibaba\s+Cloud\b[^.!?]*?[.!?]",
]


def should_be_filtered_by_advanced_patterns(example, verbose=False, filter_user_turns=False):
def should_be_filtered_by_advanced_patterns(example, column="messages", verbose=False, filter_user_turns=False):
"""Filter by more sophisticated patterns like 'as a ... OpenAI' or 'trained by ... Google'"""

for message in example["messages"]:
for message in example[column]:
# Skip user messages unless explicitly enabled
if message["role"] == "user" and not filter_user_turns:
continue
if message["role"] != "assistant" and message["role"] != "user":
continue

content = message["content"] # Keep original case

# empty content check
if content is None:
return True
for pattern in PATTERNS:
if re.search(pattern, content):
if verbose:
Expand All @@ -86,9 +104,9 @@ def should_be_filtered_by_advanced_patterns(example, verbose=False, filter_user_
return False


def should_be_filtered_combined(example, column="messages", verbose=False, filter_user_turns=False):
    """Decide whether *example* should be dropped from the dataset.

    Currently delegates entirely to the advanced-pattern keyword filter; kept
    as a separate entry point so further filters can be combined here later.
    """
    return should_be_filtered_by_advanced_patterns(
        example,
        column=column,
        verbose=verbose,
        filter_user_turns=filter_user_turns,
    )

def load_dataset_from_parquet(dataset_name):
"""Load dataset directly from parquet files."""
Expand Down Expand Up @@ -123,7 +141,9 @@ def main():
parser.add_argument("--filter-user-turns", action="store_true",
help="Also filter based on user messages (default: only filter assistant messages)")
parser.add_argument("--output-entity", type=str, help="Output entity (org/user) for the filtered dataset. If not provided, uses the same entity as the input dataset.")

parser.add_argument("--column", type=str, default="messages",
help="Column name containing the messages (default: messages)")

args = parser.parse_args()

input_dataset = args.input_dataset
Expand Down Expand Up @@ -166,10 +186,10 @@ def main():

# Filter function
def filter_fn(example):
should_filter = should_be_filtered_combined(example, verbose=True, filter_user_turns=filter_user_turns)
should_filter = should_be_filtered_combined(example, column=args.column, verbose=True, filter_user_turns=filter_user_turns)
if should_filter and len(filtered_examples) < 3:
# Find which pattern matched and extract the matching text
for message in example["messages"]:
for message in example[args.column]:
# Apply same filtering logic for finding matched text
if message["role"] == "user" and not filter_user_turns:
continue
Expand All @@ -191,7 +211,7 @@ def filter_fn(example):
return not should_filter

print("Filtering dataset...")
filtered_dataset = dataset.filter(filter_fn)
filtered_dataset = dataset.filter(filter_fn, num_proc=32)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Debugging Section Fails on None Content

The filter_fn's debugging section, which collects filtered examples, calls re.search on content without a None check. This causes a TypeError when an example is filtered due to None content, even though the primary filtering logic correctly handles it.

Fix in Cursor Fix in Web

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this make those processes hang? Jobs are reaching roughly 90% completion and then progress grinds to a halt.

print(f"Filtered size: {len(filtered_dataset)}")
print(f"Removed {len(dataset) - len(filtered_dataset)} examples")

Expand All @@ -204,7 +224,7 @@ def filter_fn(example):
if "_matched_text" in example:
role = example.get("_matched_role", "unknown")
print(f" Matched text ({role}): '{example['_matched_text']}'")
messages = example.get("messages", [])
messages = example.get(args.column, [])
for msg in messages:
if msg.get("role") == "user":
content = msg.get("content", "")
Expand All @@ -222,4 +242,4 @@ def filter_fn(example):
print("Done!")

if __name__ == "__main__":
main()
main()