1616Motivated by: realizing the SFT mix has lots of "I am DeepSeek" snippets.
1717
1818Run with:
19- python scripts/data/sft/filter_dataset_by_keywords.py --input-dataset allenai/tulu-3-sft-mixture --column messages
19+ python scripts/data/filtering_and_updates/filter_dataset_by_keywords.py --input-dataset allenai/tulu-3-sft-mixture --column messages
2020"""
2121
22+ import os
23+ os .environ ['HF_DATASETS_DISABLE_CACHING' ] = '1'
24+
25+ from datasets import disable_caching
26+ disable_caching ()
27+
28+
2229
# Popular model providers whose self-identification snippets we want to strip
# from the SFT mix (e.g. "I am DeepSeek" style turns).
PROVIDERS = [
    "OpenAI", "Open AI", "Claude", "Gemini", "Qwen", "DeepSeek", "Anthropic", "Meta AI", "Meta's", "ChatGPT",
    "Cohere", "Mistral AI", "Mistral's", "xAI", "Perplexity",  # "Google AI", "Google's", "Microsoft", "HuggingFace", "Hugging Face"
]

# Alternation of every provider name, built ONCE instead of re-joining
# PROVIDERS inside each pattern string below.
_PROVIDER_RE = "|".join(PROVIDERS)

# Regex patterns for filtering provider self-identification / policy mentions.
# All quantifiers over [^.!?] are bounded ({0,100} / {0,250}) rather than
# unbounded *? so worst-case matching time stays predictable on long documents.
# NOTE(review): every pattern carries (?i), which makes the provider names
# match case-INsensitively as well — confirm that is intended before relying
# on "case-sensitive for company names".
PATTERNS = [
    # "I'm [provider], an AI assistant made by [provider] ..."
    r"(?i)\bI'?m\s+(" + _PROVIDER_RE + r"),?\s+an?\s+AI\s+(?:assistant|model)[^.!?]{0,100}(?:made|developed|created|trained)\s+by\s+(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "[provider] is an AI assistant developed by [provider] ..."
    r"(?i)\b(" + _PROVIDER_RE + r")\s+is\s+an?\s+AI\s+(?:assistant|model)[^.!?]{0,100}(?:developed|created|made|trained)\s+by\s+(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "as a/an [language model/AI model/assistant/chatbot] ... [provider]"
    r"(?i)\bas\s+an?\s+(?:language\s+model|AI\s+model|assistant|chatbot|model)[^.!?]{0,100}\b(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "as an AI developed/created/made/trained by [provider]"
    r"(?i)\bas\s+an\s+AI\s+(?:developed|created|made|trained)\s+by\s+(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "I am [a] [model type] ... [provider]"
    r"(?i)\bI\s+am\s+(?:a\s+)?(?:language\s+model|AI\s+model|assistant|chatbot|model)[^.!?]{0,100}\b(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "I am called [provider]"
    r"(?i)\bI\s+am\s+called\s+(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "I'm [provider]" or "I am [provider]"
    r"(?i)\b(?:I'?m|I\s+am)\s+(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "trained by ... [provider]" within one sentence (middle span non-greedy)
    r"(?i)\btrained\s+by\s+[^.!?]{0,100}?\b(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "developed by ... [provider]" within one sentence
    r"(?i)\bdeveloped\s+by\s+[^.!?]{0,100}?\b(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "created by ... [provider]" within one sentence
    r"(?i)\bcreated\s+by\s+[^.!?]{0,100}?\b(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "made by ... [provider]" within one sentence
    r"(?i)\bmade\s+by\s+[^.!?]{0,100}?\b(" + _PROVIDER_RE + r")\b[^.!?]{0,100}[.!?]",

    # "against [provider]'s use-case policy" or similar policy references.
    # Accepts both a straight (') and a curly (’) apostrophe — the original
    # had a degenerate (?:'s|'s) alternation, presumably a normalized ’.
    r"(?i)\bagainst\s+(" + _PROVIDER_RE + r")(?:'s|’s)?\s+(?:use-case\s+)?(?:policy|policies|guidelines|terms)[^.!?]{0,100}[.!?]",

    # "[provider]'s policy" / "[provider]'s guidelines"
    r"(?i)\b(" + _PROVIDER_RE + r")(?:'s|’s)\s+(?:policy|policies|guidelines|terms|use-case)[^.!?]{0,100}[.!?]",

    # Any sentence containing "DeepSeek" at all (bounded lead-in of 250 chars)
    # r"(?i)[^.!?]{0,250}?\bDeepSeek[\s-]?R1\b[^.!?]{0,100}[.!?]",
    r"(?i)[^.!?]{0,250}?\bDeepSeek\b[^.!?]{0,100}[.!?]",

    # Any sentence containing "Qwen" at all
    r"(?i)[^.!?]{0,250}?\bQwen\b[^.!?]{0,100}[.!?]",

    # "Alibaba Qwen" / "Alibaba Cloud" sentences. The first is subsumed by
    # the bare-Qwen rule above but kept for clarity and robustness.
    r"(?i)[^.!?]{0,250}?\bAlibaba\s+Qwen\b[^.!?]{0,100}[.!?]",
    r"(?i)[^.!?]{0,250}?\bAlibaba\s+Cloud\b[^.!?]{0,100}[.!?]",
]
6598
66- def should_be_filtered_by_advanced_patterns (example , verbose = False , filter_user_turns = False ):
99+ def should_be_filtered_by_advanced_patterns (example , column = "messages" , verbose = False , filter_user_turns = False ):
67100 """Filter by more sophisticated patterns like 'as a ... OpenAI' or 'trained by ... Google'"""
68101
69- for message in example ["messages" ]:
102+ for message in example [column ]:
70103 # Skip user messages unless explicitly enabled
71104 if message ["role" ] == "user" and not filter_user_turns :
72105 continue
73106 if message ["role" ] != "assistant" and message ["role" ] != "user" :
74107 continue
75108
76109 content = message ["content" ] # Keep original case
77-
110+ # empty content check
111+ if content is None :
112+ return True
78113 for pattern in PATTERNS :
79114 if re .search (pattern , content ):
80115 if verbose :
@@ -87,9 +122,9 @@ def should_be_filtered_by_advanced_patterns(example, verbose=False, filter_user_
87122 return False
88123
89124
def should_be_filtered_combined(example, column="messages", verbose=False, filter_user_turns=False):
    """Return True when *example* trips any of the advanced keyword patterns.

    Thin entry point kept separate so additional filter stages can be
    OR-ed in here later without touching callers.

    Args:
        example: one dataset row (dict) holding a list of chat messages.
        column: name of the column that contains the messages.
        verbose: forwarded to the pattern filter for debug printing.
        filter_user_turns: also inspect user messages, not just assistant ones.
    """
    return should_be_filtered_by_advanced_patterns(
        example,
        column=column,
        verbose=verbose,
        filter_user_turns=filter_user_turns,
    )
94129def load_dataset_from_parquet (dataset_name ):
95130 """Load dataset directly from parquet files."""
@@ -124,7 +159,9 @@ def main():
124159 parser .add_argument ("--filter-user-turns" , action = "store_true" ,
125160 help = "Also filter based on user messages (default: only filter assistant messages)" )
126161 parser .add_argument ("--output-entity" , type = str , help = "Output entity (org/user) for the filtered dataset. If not provided, uses the same entity as the input dataset." )
127-
162+ parser .add_argument ("--column" , type = str , default = "messages" ,
163+ help = "Column name containing the messages (default: messages)" )
164+
128165 args = parser .parse_args ()
129166
130167 input_dataset = args .input_dataset
@@ -161,66 +198,34 @@ def main():
161198 raise
162199
163200 print (f"Dataset loaded with { len (dataset )} examples" )
164-
165- # Keep track of filtered examples
166- filtered_examples = []
167-
168- # Filter function
169- def filter_fn (example ):
170- should_filter = should_be_filtered_combined (example , verbose = True , filter_user_turns = filter_user_turns )
171- if should_filter and len (filtered_examples ) < 3 :
172- # Find which pattern matched and extract the matching text
173- for message in example ["messages" ]:
174- # Apply same filtering logic for finding matched text
175- if message ["role" ] == "user" and not filter_user_turns :
176- continue
177- if message ["role" ] != "assistant" and message ["role" ] != "user" :
178- continue
179-
180- content = message ["content" ] # Keep original case
181-
182- for pattern in PATTERNS :
183- match = re .search (pattern , content )
184- if match :
185- example ["_matched_text" ] = match .group (0 )
186- example ["_matched_role" ] = message ["role" ]
187- break
188- if "_matched_text" in example :
189- break
190-
191- filtered_examples .append (example )
192- return not should_filter
193-
201+
194202 print ("Filtering dataset..." )
195- filtered_dataset = dataset .filter (filter_fn )
203+ # First filter without debugging
204+ filtered_dataset = dataset .filter (
205+ lambda ex : not should_be_filtered_combined (ex , column = args .column , verbose = False , filter_user_turns = filter_user_turns ),
206+ num_proc = open_instruct_utils .max_num_processes ()
207+ )
196208 print (f"Filtered size: { len (filtered_dataset )} " )
197209 print (f"Removed { len (dataset ) - len (filtered_dataset )} examples" )
198-
199- # Show a few filtered examples
200- if filtered_examples :
201- print ("\n --- Examples that were removed ---" )
202- for i , example in enumerate (filtered_examples ):
203- print ("---------------------------------" )
204- print (f"\n Example { i + 1 } :" )
205- if "_matched_text" in example :
206- role = example .get ("_matched_role" , "unknown" )
207- print (f" Matched text ({ role } ): '{ example ['_matched_text' ]} '" )
208- messages = example .get ("messages" , [])
209- for msg in messages :
210- if msg .get ("role" ) == "user" :
211- content = msg .get ("content" , "" )
212- print (f" User: { content } " )
213- if msg .get ("role" ) == "assistant" :
214- content = msg .get ("content" , "" )
215- print (f" Assistant: { content } " )
210+
211+ # Then collect a few filtered examples in serial for inspection
212+ if len (dataset ) > len (filtered_dataset ):
213+ print ("\n Collecting example filtered instances..." )
214+ examples_found = 0
215+ print_within = min (1000 , len (dataset ))
216+ for example in dataset .select (range (print_within )):
217+ if should_be_filtered_combined (example , column = args .column , verbose = True , filter_user_turns = filter_user_turns ):
218+ # Show the example
219+ examples_found += 1
220+ if examples_found >= 10 :
216221 break
217- print ( "--- End of examples --- \n " )
222+
218223
219224 # Upload
220225 full_name = f"{ output_dataset } "
221226 print (f"Uploading to: { full_name } " )
222- filtered_dataset .push_to_hub (full_name , private = True )
227+ filtered_dataset .push_to_hub (full_name , private = True , num_proc = open_instruct_utils . max_num_processes () )
223228 print ("Done!" )
224229
225230if __name__ == "__main__" :
226- main ()
231+ main ()
0 commit comments