Skip to content

Commit 1f2712d

Browse files
authored
chat: fix emoji corruption (#3443)
Signed-off-by: Jared Van Bortel <[email protected]>
1 parent f8f78c6 commit 1f2712d

File tree

6 files changed

+57
-53
lines changed

6 files changed

+57
-53
lines changed

gpt4all-chat/CHANGELOG.md

+1
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2121
- Code Interpreter: Fix console.log not accepting a single string after v3.7.0 ([#3426](https://github.com/nomic-ai/gpt4all/pull/3426))
2222
- Fix Phi 3.1 Mini 128K Instruct template (by [@ThiloteE](https://github.com/ThiloteE) in [#3412](https://github.com/nomic-ai/gpt4all/pull/3412))
2323
- Don't block the gui thread for reasoning ([#3435](https://github.com/nomic-ai/gpt4all/pull/3435))
24+
- Fix corruption of Unicode in output of reasoning models ([#3443](https://github.com/nomic-ai/gpt4all/pull/3443))
2425

2526
## [3.7.0] - 2025-01-21
2627

gpt4all-chat/src/chat.cpp

+1-1
Original file line number | Diff line number | Diff line change
@@ -254,7 +254,7 @@ void Chat::responseStopped(qint64 promptResponseMs)
254254
});
255255

256256
ToolCallParser parser;
257-
parser.update(possibleToolcall);
257+
parser.update(possibleToolcall.toUtf8());
258258
if (parser.state() == ToolEnums::ParseState::Complete && parser.startTag() != ToolCallConstants::ThinkTag)
259259
processToolCall(parser.toolCall());
260260
else

gpt4all-chat/src/chatllm.cpp

+5-7
Original file line number | Diff line number | Diff line change
@@ -938,13 +938,11 @@ auto ChatLLM::promptInternal(
938938
result.responseTokens++;
939939
m_timer->inc();
940940

941-
// FIXME: This is *not* necessarily fully formed utf data because it can be partial at this point
942-
// handle this like below where we have a QByteArray
943-
toolCallParser.update(QString::fromStdString(piece.data()));
941+
toolCallParser.update(piece.data());
944942

945943
// Split the response into two if needed and create chat items
946944
if (toolCallParser.numberOfBuffers() < 2 && toolCallParser.splitIfPossible()) {
947-
const QVector<QString> &parseBuffers = toolCallParser.buffers();
945+
const auto parseBuffers = toolCallParser.buffers();
948946
Q_ASSERT(parseBuffers.size() == 2);
949947
if (toolCallParser.startTag() == ToolCallConstants::ThinkTag)
950948
m_chatModel->splitThinking({parseBuffers.at(0), parseBuffers.at(1)});
@@ -955,7 +953,7 @@ auto ChatLLM::promptInternal(
955953
// Split the response into three if needed and create chat items
956954
if (toolCallParser.numberOfBuffers() < 3 && toolCallParser.startTag() == ToolCallConstants::ThinkTag
957955
&& toolCallParser.splitIfPossible()) {
958-
const QVector<QString> &parseBuffers = toolCallParser.buffers();
956+
const auto parseBuffers = toolCallParser.buffers();
959957
Q_ASSERT(parseBuffers.size() == 3);
960958
m_chatModel->endThinking({parseBuffers.at(1), parseBuffers.at(2)}, totalTime.elapsed());
961959
}
@@ -964,7 +962,7 @@ auto ChatLLM::promptInternal(
964962
auto respStr = QString::fromUtf8(result.response);
965963

966964
try {
967-
const QVector<QString> &parseBuffers = toolCallParser.buffers();
965+
const auto parseBuffers = toolCallParser.buffers();
968966
if (parseBuffers.size() > 1)
969967
m_chatModel->setResponseValue(parseBuffers.last());
970968
else
@@ -998,7 +996,7 @@ auto ChatLLM::promptInternal(
998996
m_timer->stop();
999997
qint64 elapsed = totalTime.elapsed();
1000998

1001-
const QVector<QString> &parseBuffers = toolCallParser.buffers();
999+
const auto parseBuffers = toolCallParser.buffers();
10021000
const bool shouldExecuteToolCall = toolCallParser.state() == ToolEnums::ParseState::Complete
10031001
&& toolCallParser.startTag() != ToolCallConstants::ThinkTag;
10041002

gpt4all-chat/src/chatmodel.h

+3-3
Original file line number | Diff line number | Diff line change
@@ -265,7 +265,7 @@ class ChatItem : public QObject
265265
if (type() == Type::Response) {
266266
// We parse if this contains any part of a partial toolcall
267267
ToolCallParser parser;
268-
parser.update(value);
268+
parser.update(value.toUtf8());
269269

270270
// If no tool call is detected, return the original value
271271
if (parser.startIndex() < 0)
@@ -292,7 +292,7 @@ class ChatItem : public QObject
292292
QString thinkContent(const QString &value) const
293293
{
294294
ToolCallParser parser;
295-
parser.update(value);
295+
parser.update(value.toUtf8());
296296

297297
// Extract the content
298298
QString content = parser.toolCall();
@@ -303,7 +303,7 @@ class ChatItem : public QObject
303303
QString toolCallContent(const QString &value) const
304304
{
305305
ToolCallParser parser;
306-
parser.update(value);
306+
parser.update(value.toUtf8());
307307

308308
// Extract the code
309309
QString code = parser.toolCall();

gpt4all-chat/src/toolcallparser.cpp

+29-25
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,10 @@
88

99
ToolCallParser::ToolCallParser()
1010
{
11-
m_possibleStartTags << ToolCallConstants::CodeInterpreterTag
12-
<< ToolCallConstants::ThinkTag;
13-
m_possibleEndTags << ToolCallConstants::CodeInterpreterEndTag
14-
<< ToolCallConstants::ThinkEndTag;
11+
m_possibleStartTags << ToolCallConstants::CodeInterpreterTag.toUtf8()
12+
<< ToolCallConstants::ThinkTag.toUtf8();
13+
m_possibleEndTags << ToolCallConstants::CodeInterpreterEndTag.toUtf8()
14+
<< ToolCallConstants::ThinkEndTag.toUtf8();
1515
reset();
1616
}
1717

@@ -22,7 +22,7 @@ void ToolCallParser::reset()
2222

2323
// These are global states maintained between update calls
2424
m_buffers.clear();
25-
m_buffers.append(QString());
25+
m_buffers << QByteArray();
2626
}
2727

2828
void ToolCallParser::resetSearchState()
@@ -40,48 +40,48 @@ void ToolCallParser::resetSearchState()
4040
m_endIndex = -1;
4141
}
4242

43-
bool ToolCallParser::isExpected(QChar c) const
43+
bool ToolCallParser::isExpected(char c) const
4444
{
4545
return m_expected.isEmpty() || m_expected.contains(c);
4646
}
4747

48-
void ToolCallParser::setExpected(const QStringList &tags)
48+
void ToolCallParser::setExpected(const QList<QByteArray> &tags)
4949
{
5050
m_expected.clear();
51-
for (const QString &tag : tags) {
51+
for (const auto &tag : tags) {
5252
Q_ASSERT(tag.size() > m_expectedIndex);
5353
m_expected << tag.at(m_expectedIndex);
5454
}
5555
}
5656

57-
QString ToolCallParser::startTag() const
57+
QByteArray ToolCallParser::startTag() const
5858
{
5959
if (m_currentTagIndex < 0)
60-
return QString();
60+
return {};
6161
return m_possibleStartTags.at(m_currentTagIndex);
6262
}
6363

64-
QString ToolCallParser::endTag() const
64+
QByteArray ToolCallParser::endTag() const
6565
{
6666
if (m_currentTagIndex < 0)
67-
return QString();
67+
return {};
6868
return m_possibleEndTags.at(m_currentTagIndex);
6969
}
7070

71-
QString &ToolCallParser::currentBuffer()
71+
QByteArray &ToolCallParser::currentBuffer()
7272
{
7373
return m_buffers.last();
7474
}
7575

7676
// This method is called with an arbitrary string and a current state. This method should take the
7777
// current state into account and then parse through the update character by character to arrive at
7878
// the new state.
79-
void ToolCallParser::update(const QString &update)
79+
void ToolCallParser::update(const QByteArray &update)
8080
{
8181
currentBuffer().append(update);
8282

8383
for (size_t i = currentBuffer().size() - update.size(); i < currentBuffer().size(); ++i) {
84-
const QChar c = currentBuffer()[i];
84+
const char c = currentBuffer()[i];
8585
const bool foundMatch = isExpected(c);
8686
if (!foundMatch) {
8787
resetSearchState();
@@ -100,7 +100,7 @@ void ToolCallParser::update(const QString &update)
100100
case ToolEnums::ParseState::InTagChoice:
101101
{
102102
for (int i = 0; i < m_possibleStartTags.size(); ++i) {
103-
const QString tag = m_possibleStartTags.at(i);
103+
const auto &tag = m_possibleStartTags.at(i);
104104
if (c == tag.at(1)) m_currentTagIndex = i;
105105
}
106106
if (m_currentTagIndex >= 0) {
@@ -115,7 +115,7 @@ void ToolCallParser::update(const QString &update)
115115
{
116116
m_startTagBuffer.append(c);
117117

118-
const QString startTag = this->startTag();
118+
const auto startTag = this->startTag();
119119
Q_ASSERT(!startTag.isEmpty());
120120
if (m_expectedIndex == startTag.size() - 1) {
121121
m_expectedIndex = 0;
@@ -131,7 +131,7 @@ void ToolCallParser::update(const QString &update)
131131
case ToolEnums::ParseState::Partial:
132132
{
133133
Q_ASSERT(m_currentTagIndex >= 0);
134-
const QString endTag = this->endTag();
134+
const auto endTag = this->endTag();
135135
Q_ASSERT(!endTag.isEmpty());
136136
m_toolCall.append(c);
137137
m_endTagBuffer.append(c);
@@ -159,26 +159,30 @@ bool ToolCallParser::splitIfPossible()
159159
// The first split happens when we're in a partial state
160160
if (m_buffers.size() < 2 && m_state == ToolEnums::ParseState::Partial) {
161161
Q_ASSERT(m_startIndex >= 0);
162-
const QString beforeToolCall = currentBuffer().left(m_startIndex);
163-
const QString toolCall = currentBuffer().mid(m_startIndex);
162+
const auto beforeToolCall = currentBuffer().left(m_startIndex);
163+
const auto toolCall = currentBuffer().mid (m_startIndex);
164164
m_buffers = { beforeToolCall, toolCall };
165165
return true;
166166
}
167167

168168
// The second split happens when we're in the complete state
169169
if (m_buffers.size() < 3 && m_state == ToolEnums::ParseState::Complete) {
170170
Q_ASSERT(m_endIndex >= 0);
171-
const QString beforeToolCall = m_buffers.first();
172-
const QString toolCall = currentBuffer().left(m_endIndex);
173-
const QString afterToolCall = currentBuffer().mid(m_endIndex);
171+
const auto &beforeToolCall = m_buffers.first();
172+
const auto toolCall = currentBuffer().left(m_endIndex);
173+
const auto afterToolCall = currentBuffer().mid (m_endIndex);
174174
m_buffers = { beforeToolCall, toolCall, afterToolCall };
175175
return true;
176176
}
177177

178178
return false;
179179
}
180180

181-
const QVector<QString> &ToolCallParser::buffers() const
181+
QStringList ToolCallParser::buffers() const
182182
{
183-
return m_buffers;
183+
QStringList result;
184+
result.reserve(m_buffers.size());
185+
for (const auto &buffer : m_buffers)
186+
result << QString::fromUtf8(buffer);
187+
return result;
184188
}

gpt4all-chat/src/toolcallparser.h

+18-17
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,10 @@
33

44
#include "tool.h"
55

6-
#include <QChar>
6+
#include <QByteArray>
7+
#include <QList>
78
#include <QString>
8-
#include <QPair>
9+
#include <QStringList>
910

1011
namespace ToolCallConstants
1112
{
@@ -25,34 +26,34 @@ class ToolCallParser
2526
public:
2627
ToolCallParser();
2728
void reset();
28-
void update(const QString &update);
29-
QString toolCall() const { return m_toolCall; }
29+
void update(const QByteArray &update);
30+
QString toolCall() const { return QString::fromUtf8(m_toolCall); }
3031
int startIndex() const { return m_startIndex; }
3132
ToolEnums::ParseState state() const { return m_state; }
32-
QString startTag() const;
33-
QString endTag() const;
33+
QByteArray startTag() const;
34+
QByteArray endTag() const;
3435

3536
bool splitIfPossible();
36-
const QVector<QString> &buffers() const;
37+
QStringList buffers() const;
3738
int numberOfBuffers() const { return m_buffers.size(); }
3839

3940
private:
40-
QString &currentBuffer();
41+
QByteArray &currentBuffer();
4142
void resetSearchState();
42-
bool isExpected(QChar c) const;
43-
void setExpected(const QStringList &tags);
43+
bool isExpected(char c) const;
44+
void setExpected(const QList<QByteArray> &tags);
4445

45-
QStringList m_possibleStartTags;
46-
QStringList m_possibleEndTags;
47-
QString m_startTagBuffer;
48-
QString m_endTagBuffer;
46+
QList<QByteArray> m_possibleStartTags;
47+
QList<QByteArray> m_possibleEndTags;
48+
QByteArray m_startTagBuffer;
49+
QByteArray m_endTagBuffer;
4950
int m_currentTagIndex;
5051

51-
QVector<QChar> m_expected;
52+
QList<char> m_expected;
5253
int m_expectedIndex;
5354
ToolEnums::ParseState m_state;
54-
QVector<QString> m_buffers;
55-
QString m_toolCall;
55+
QList<QByteArray> m_buffers;
56+
QByteArray m_toolCall;
5657
int m_startIndex;
5758
int m_endIndex;
5859
};

0 commit comments

Comments
 (0)