Fix crashes in PreTrainedTokenizer and PreTokenizer with Gemma 2 2B (#111)

* Avoid force unwrapping

* Fix crashes in split(by captureRegex:)
DePasqualeOrg authored Aug 19, 2024
1 parent 1fdb4f4 commit c088078
Showing 4 changed files with 16 additions and 14 deletions.
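Note: three of the four changed files replace a force-unwrapped dictionary lookup with compactMap, so a token or ID that is missing from the lookup table no longer traps at runtime. A minimal sketch of the difference, using an invented stand-in vocabulary rather than code from this repository:

    // Stand-in vocabulary; illustrative only.
    let vocab: [String: Int] = ["hello": 0, "world": 1]
    let tokens = ["hello", "<unseen>"]

    // Old pattern: force unwrapping traps on any token missing from the map.
    // let ids = tokens.map { vocab[$0]! }   // fatal error on "<unseen>"

    // New pattern: nil lookups are dropped instead of crashing.
    let ids = tokens.compactMap { vocab[$0] }
    print(ids)  // [0]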
Sources/Tokenizers/BPETokenizer.swift (2 changes: 1 addition & 1 deletion)

@@ -93,7 +93,7 @@ class BPETokenizer: PreTrainedTokenizerModel {
         let RE = #"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"#
         let tokens = text.ranges(of: RE).map { String(text[$0]) }
         return tokens.map { (token) -> String in
-            return Array(token.utf8).map { byteEncoder[$0]! }.joined()
+            return Array(token.utf8).compactMap { byteEncoder[$0] }.joined()
         }
     }
Sources/Tokenizers/BertTokenizer.swift (4 changes: 2 additions & 2 deletions)

@@ -77,7 +77,7 @@ public class BertTokenizer {
                 """
             )
         }
-        return tokens.map { vocab[$0]! }
+        return tokens.compactMap { vocab[$0] }
     }

     /// Main entry point
@@ -91,7 +91,7 @@ public class BertTokenizer {

     /// Un-tokenization: get tokens from tokenIds
     func unTokenize(tokens: [Int]) -> [String] {
-        return tokens.map { ids_to_tokens[$0]! }
+        return tokens.compactMap { ids_to_tokens[$0] }
     }

     /// Un-tokenization:
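Note: the compactMap change trades a crash for silent omission: the token-to-ID lookup and unTokenize now skip entries missing from vocab or ids_to_tokens, so the returned array can be shorter than its input. Whether skipping beats throwing is a design question; this commit resolves it in favor of never crashing.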
Sources/Tokenizers/PreTokenizer.swift (22 changes: 12 additions & 10 deletions)

@@ -288,32 +288,34 @@ public extension String {
         let selfRange = NSRange(startIndex..<endIndex, in: self)
         let matches = captureRegex.matches(in: self, options: [], range: selfRange)

-        if matches.first == nil { return [self] }
+        if matches.isEmpty { return [self] }

         var result: [String] = []
         var start = startIndex
         for match in matches {
-            // Append prefix before matched separator
-            let prefixEnd = index(startIndex, offsetBy: match.range.lowerBound)
-            if start < prefixEnd {
-                result.append(String(self[start..<prefixEnd]))
+            // Safely move the prefix end to the start of the current match
+            let safePrefixEnd = index(startIndex, offsetBy: match.range.lowerBound, limitedBy: endIndex) ?? endIndex
+            if start < safePrefixEnd {
+                result.append(String(self[start..<safePrefixEnd]))
             }
-            start = index(startIndex, offsetBy: match.range.upperBound)
+
+            // Safely move the start index to the end of the current match
+            let matchEndIndex = index(startIndex, offsetBy: match.range.upperBound, limitedBy: endIndex) ?? endIndex
+            start = matchEndIndex

             // Append separator, supporting capture groups
             for r in (0..<match.numberOfRanges).reversed() {
                 let matchRange = match.range(at: r)
-                if let sepRange = Range(matchRange, in:self) {
+                if let sepRange = Range(matchRange, in: self) {
                     result.append(String(self[sepRange]))
                     break
                 }
             }
         }

         // Append remaining suffix
-        let beginningOfEnd = index(startIndex, offsetBy: matches.last!.range.upperBound)
-        if beginningOfEnd < endIndex {
-            result.append(String(self[beginningOfEnd...]))
+        if start < endIndex {
+            result.append(String(self[start...]))
         }

         return result
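Note on the split(by:) fix: NSRange offsets from NSRegularExpression count UTF-16 code units, while String's index(_:offsetBy:) advances by Character, so on text containing multi-unit characters such as emoji a match offset can exceed the string's Character count, and the unclamped index computation traps. A minimal sketch of the mismatch and the clamping idiom, using an invented sample string (separate from the commit itself):

    import Foundation

    let text = "a😀b"
    print(text.count)        // 3 Characters
    print(text.utf16.count)  // 4 UTF-16 code units

    let regex = try! NSRegularExpression(pattern: "b")
    let nsrange = NSRange(text.startIndex..<text.endIndex, in: text)
    if let match = regex.firstMatch(in: text, options: [], range: nsrange) {
        // match.range.upperBound is 4 (UTF-16 units); offsetting by 4
        // Characters would walk past endIndex and trap at runtime.
        let safeEnd = text.index(text.startIndex, offsetBy: match.range.upperBound,
                                 limitedBy: text.endIndex) ?? text.endIndex
        print(text[..<safeEnd])  // "a😀b"
    }

Clamping avoids the trap, though a Character-counted offset can still differ from the true match position; the offset-safe conversion is Range(match.range, in: self), which the separator loop in this same function already uses.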
Sources/Tokenizers/Tokenizer.swift (2 changes: 1 addition & 1 deletion)

@@ -272,7 +272,7 @@ public class PreTrainedTokenizer: Tokenizer {
     /// Decode
     public func decode(tokens: [Int]) -> String {
         // IDs to tokens
-        let tokenStrings = tokens.map { model.convertIdToToken($0)! }
+        let tokenStrings = tokens.compactMap { model.convertIdToToken($0) }
         let decoded = decodeTokens(tokenStrings)
         // At this point we should have a single String
         return cleanUp(text: decoded.joined(separator: ""))
