Support "fuse unknown" configuration (#117)
* Bring over the HF token environment variable from the preview branch

* Add tests for Gemma, including edge cases

Edge cases also added for other BPE tokenizers, but not for T5 yet.

* Sort added tokens by length (descending) to avoid early partial matches

Similar to huggingface/transformers.js@c305c38; a sketch of the idea follows this list.

* Store vocab as NSString to allow multiple tokens that share the same Unicode canonical representation; see the second sketch after this list.

* Remove comments

* Go back to making vocab dictionaries private

* Use ungated copy of Gemma tokenizer

* Use NSString in UnigramTokenizer

* Support fuse_unk configuration.

Tested on one of the T5 testEdgeCases().

* Remove duplicate function
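A minimal sketch of the length-descending sort mentioned above, assuming added tokens are matched by building a regex alternation; the token strings are illustrative:

import Foundation

// Hypothetical added tokens. Sorting longest-first means the alternation
// tries "<|endoftext|>" before its prefix "<|end", so a longer added token
// is no longer shadowed by an early partial match.
let addedTokens = ["<|end", "<s>", "<|endoftext|>"]
let sortedTokens = addedTokens.sorted { $0.count > $1.count }
let pattern = sortedTokens
    .map { NSRegularExpression.escapedPattern(for: $0) }
    .joined(separator: "|")
// The longest alternative now comes first in the pattern.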
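And a sketch of the NSString change: Swift's String compares by Unicode canonical equivalence, so two distinct code-unit sequences can collide as dictionary keys, while NSString compares UTF-16 code units literally and keeps them apart:

import Foundation

let precomposed = "\u{00E9}"   // "é" as a single code point
let decomposed = "e\u{0301}"   // "e" followed by a combining acute accent
print(precomposed == decomposed)   // true: String uses canonical equivalence

var stringVocab: [String: Int] = [:]
stringVocab[precomposed] = 0
stringVocab[decomposed] = 1
print(stringVocab.count)   // 1: the second token overwrote the first

var nsVocab: [NSString: Int] = [:]
nsVocab[precomposed as NSString] = 0
nsVocab[decomposed as NSString] = 1
print(nsVocab.count)   // 2: both tokens keep their own id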
pcuenca authored Aug 19, 2024
1 parent 4c8cf07 commit 1fdb4f4
Showing 5 changed files with 39 additions and 11 deletions.
4 changes: 4 additions & 0 deletions Sources/Tokenizers/BPETokenizer.swift
@@ -45,6 +45,8 @@ class BPETokenizer: PreTrainedTokenizerModel {
public let unknownToken: String?
public let unknownTokenId: Int?

public let fuseUnknownTokens: Bool

required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
guard let merges = tokenizerData.model?.merges?.value as? [String] else { fatalError("BPETokenizer requires merges") }
guard let vocab = tokenizerData.model?.vocab?.dictionary as? [NSString: Int] else {
@@ -75,6 +77,8 @@ class BPETokenizer: PreTrainedTokenizerModel {

bosToken = tokenizerConfig.bosToken?.stringValue
bosTokenId = bosToken == nil ? nil : tokensToIds[bosToken! as NSString]

fuseUnknownTokens = tokenizerConfig.fuseUnk?.boolValue ?? false
}

func convertTokenToId(_ token: String) -> Int? {
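For context, a hedged sketch of how an optional fuse_unk flag resolves with a ?? false default; the flat JSON fragment and the ConfigFragment type are illustrative stand-ins, not the library's Config wrapper:

import Foundation

struct ConfigFragment: Decodable {
    let fuseUnk: Bool?
    enum CodingKeys: String, CodingKey { case fuseUnk = "fuse_unk" }
}

let json = #"{ "fuse_unk": true }"#
let fragment = try! JSONDecoder().decode(ConfigFragment.self, from: Data(json.utf8))
// A missing key decodes to nil and falls back to false, as in the diff above.
let fuseUnknownTokens = fragment.fuseUnk ?? false
print(fuseUnknownTokens)   // true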
15 changes: 10 additions & 5 deletions Sources/Tokenizers/BertTokenizer.swift
@@ -23,11 +23,14 @@ public class BertTokenizer {
public var eosToken: String?
public var eosTokenId: Int?

public let fuseUnknownTokens: Bool

public init(vocab: [String: Int],
merges: [String]?,
tokenizeChineseChars: Bool = true,
bosToken: String? = nil,
eosToken: String? = nil
merges: [String]?,
tokenizeChineseChars: Bool = true,
bosToken: String? = nil,
eosToken: String? = nil,
fuseUnknownTokens: Bool = false
) {
self.vocab = vocab
self.ids_to_tokens = Utils.invert(vocab)
@@ -37,6 +40,7 @@ public class BertTokenizer {
self.bosTokenId = bosToken == nil ? nil : vocab[bosToken!]
self.eosToken = eosToken
self.eosTokenId = eosToken == nil ? nil : vocab[eosToken!]
self.fuseUnknownTokens = fuseUnknownTokens
}

public required convenience init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
@@ -47,7 +51,8 @@
let tokenizeChineseChars = tokenizerConfig.handleChineseChars?.boolValue ?? true
let eosToken = tokenizerConfig.eosToken?.stringValue
let bosToken = tokenizerConfig.bosToken?.stringValue
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken)
let fuseUnknown = tokenizerConfig.fuseUnk?.boolValue ?? false
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken, fuseUnknownTokens: fuseUnknown)
}


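A quick usage sketch of the extended initializer; the vocabulary is made up for illustration, and the remaining parameters keep their defaults:

let tokenizer = BertTokenizer(
    vocab: ["[UNK]": 0, "hello": 1, "world": 2],
    merges: nil,
    fuseUnknownTokens: true   // new parameter; omitting it defaults to false
)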
20 changes: 19 additions & 1 deletion Sources/Tokenizers/Tokenizer.swift
@@ -36,6 +36,8 @@ public protocol TokenizingModel {
var eosTokenId: Int? { get }
var unknownToken: String? { get }
var unknownTokenId: Int? { get }

var fuseUnknownTokens: Bool { get }
}

public extension TokenizingModel {
@@ -138,6 +140,7 @@ public class PreTrainedTokenizer: Tokenizer {
public var eosTokenId: Int? { model.eosTokenId }
public var unknownToken: String? { model.unknownToken }
public var unknownTokenId: Int? { model.unknownTokenId }
public var fuseUnknownTokens: Bool { model.fuseUnknownTokens }

private let addedTokens: Set<String>
private let specialTokens: [String: Int]
@@ -232,6 +235,21 @@ public class PreTrainedTokenizer: Tokenizer {
.replacingOccurrences(of: " 're", with: "'re")
}

func fuseUnknown(_ tokens: [String]) -> [String] {
guard fuseUnknownTokens else { return tokens }
let (fused, _) = tokens.reduce((fused: [String](), previousIsUnknown: false)) { result, token in
var (fused, previousIsUnknown) = result
let isUnknown = model.convertTokenToId(token) == model.unknownTokenId
if isUnknown {
if !previousIsUnknown { fused.append(token) }
} else {
fused.append(token)
}
return (fused, isUnknown)
}
return fused
}

public func tokenize(text: String) -> [String] {
// Take care of special tokens first
let sections: [String]
@@ -243,7 +261,7 @@
return sections.enumerated().map { section, x in
if addedTokens.contains(x) { return [x] }
return preTokenize(normalize(x), options: section == 0 ? [.firstSection] : []).flatMap { model($0) }
}.flatMap { $0 }
}.flatMap { fuseUnknown($0) }
}

/// Main entry point
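To see what the new fuseUnknown pass does end to end, here is a self-contained sketch of the same reduce, with the model lookup replaced by a plain dictionary; the vocabulary and token strings are illustrative:

let vocab: [String: Int] = ["hello": 0, "world": 1, "<unk>": 2]
let unknownTokenId = 2

func fuseUnknown(_ tokens: [String]) -> [String] {
    let (fused, _) = tokens.reduce((fused: [String](), previousIsUnknown: false)) { result, token in
        var (fused, previousIsUnknown) = result
        // Out-of-vocabulary tokens map to the unknown id, like convertTokenToId.
        let isUnknown = (vocab[token] ?? unknownTokenId) == unknownTokenId
        if isUnknown {
            if !previousIsUnknown { fused.append(token) }   // keep only the first of a run
        } else {
            fused.append(token)
        }
        return (fused, isUnknown)
    }
    return fused
}

print(fuseUnknown(["hello", "zzz", "qqq", "world", "zzz"]))
// ["hello", "zzz", "world", "zzz"] — consecutive unknowns collapse into one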
9 changes: 5 additions & 4 deletions Sources/Tokenizers/UnigramTokenizer.swift
@@ -29,7 +29,10 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
let bosTokenId: Int?
let eosToken: String?
let eosTokenId: Int?


// Hardcoded in Unigram tokenizers
let fuseUnknownTokens: Bool = true

private let trie: Trie<Character>

required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
@@ -71,10 +74,8 @@ class UnigramTokenizer: PreTrainedTokenizerModel {

trie = Trie()
trie.append(contentsOf: vocab.map { $0.token })

// TODO: set fuse_unk to true
}

func convertTokenToId(_ token: String) -> Int? {
return tokensToIds[token as NSString] ?? self.unknownTokenId
}
2 changes: 1 addition & 1 deletion Tests/TokenizersTests/Resources/tokenizer_tests.json

Large diffs are not rendered by default.
