Skip to content

Commit 48ab849

Browse files
committed
feat(dragoman.go): enhance document markers for clarity in translation prompts
refactor(dragoman.go): streamline response processing with newline addition
feat(internal/chunks): introduce chunking utility for text segmentation
feat(internal/chunks/chunks_test.go): add tests for text chunking functionality
refactor(internal/cli/cli.go): implement text chunking in CLI with new option
feat(internal/cli/cli.go): add `SplitChunks` option for customizable text segmentation
feat(internal/markdown): remove chunks.go to streamline markdown processing

The removal of `chunks.go` from the internal markdown package suggests a strategic decision to streamline markdown processing capabilities, possibly due to redundancy or the adoption of a more efficient approach in handling markdown segmentation. This change could indicate a shift towards utilizing external libraries or a simplification of the codebase to improve maintainability and performance.
1 parent c4ed4ce commit 48ab849

File tree

5 files changed

+128
-99
lines changed

5 files changed

+128
-99
lines changed

dragoman.go

+20-5
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,9 @@ func (t *Translator) Translate(ctx context.Context, document string, opts ...Tra
130130

131131
prompt := heredoc.Docf(`
132132
Translate the following document %sto %s:
133-
---
133+
---<DOC_BEGIN>---
134134
%s
135-
---
135+
---<DOC_END>---
136136
137137
%s
138138
@@ -149,7 +149,10 @@ func (t *Translator) Translate(ctx context.Context, document string, opts ...Tra
149149
return "", err
150150
}
151151

152-
return trimDividers(response), nil
152+
response = trimDividers(response)
153+
response = addNewline(response)
154+
155+
return response, nil
153156
}
154157

155158
func trimDividers(text string) string {
@@ -160,13 +163,25 @@ func trimDividers(text string) string {
160163
return text
161164
}
162165

163-
if out[0] == "---" {
166+
if out[0] == "---<DOC_BEGIN>---" {
164167
out = out[1:]
165168
}
166169

167-
if len(out) > 0 && out[len(out)-1] == "---" {
170+
if len(out) > 0 && out[len(out)-1] == "---<DOC_END>---" {
168171
out = out[:len(out)-1]
169172
}
170173

171174
return strings.TrimSpace(strings.Join(out, "\n"))
172175
}
176+
177+
// addNewline returns text terminated by a single trailing "\n". An empty
// string is returned unchanged, as is text that already ends in a newline.
func addNewline(text string) string {
	if text == "" || strings.HasSuffix(text, "\n") {
		return text
	}

	return text + "\n"
}

internal/chunks/chunks.go

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package chunks
2+
3+
import (
4+
"strings"
5+
)
6+
7+
// Chunks splits source into segments. A new segment starts at every line that
// begins with one of splitPrefixes; lines before the first such line form their
// own leading segment.
//
// If splitPrefixes is empty, source is returned unmodified as the only segment.
// Otherwise each segment is trimmed of leading and trailing whitespace, and
// segments that are empty after trimming (for example blank lines preceding the
// first matching line) are omitted, so callers never receive an empty chunk
// from the splitting path. The result is nil when no non-empty segment exists.
func Chunks(source string, splitPrefixes []string) []string {
	if len(splitPrefixes) == 0 {
		return []string{source}
	}

	var (
		chunks  []string
		current []string
	)

	// flush emits the buffered lines as one trimmed chunk and resets the
	// buffer. Flushing an empty buffer is a no-op, which is why a matching
	// first line does not produce a split before it.
	flush := func() {
		if len(current) == 0 {
			return
		}

		// Whitespace-only segments carry nothing worth processing; skip them
		// instead of handing an empty chunk to the caller.
		if chunk := strings.TrimSpace(strings.Join(current, "\n")); chunk != "" {
			chunks = append(chunks, chunk)
		}

		// Reusing the backing array is safe: the emitted chunk is a freshly
		// joined string that does not alias the buffer.
		current = current[:0]
	}

	for _, line := range strings.Split(source, "\n") {
		for _, prefix := range splitPrefixes {
			if strings.HasPrefix(line, prefix) {
				flush()
				break
			}
		}

		current = append(current, line)
	}

	flush()

	return chunks
}

internal/markdown/chunks_test.go internal/chunks/chunks_test.go

+17-17
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
package markdown_test
1+
package chunks_test
22

33
import (
44
"strings"
55
"testing"
66

77
"github.com/MakeNowJust/heredoc/v2"
88
"github.com/google/go-cmp/cmp"
9-
"github.com/modernice/dragoman/internal/markdown"
9+
"github.com/modernice/dragoman/internal/chunks"
1010
)
1111

1212
func TestChunks(t *testing.T) {
@@ -37,22 +37,22 @@ func TestChunks(t *testing.T) {
3737
`))
3838

3939
tests := []struct {
40-
name string
41-
splitLevels []int
42-
expected []string
40+
name string
41+
splitPrefixes []string
42+
expected []string
4343
}{
4444
{
4545
name: "no levels",
4646
expected: []string{source},
4747
},
4848
{
49-
name: "heading #1",
50-
splitLevels: []int{1},
51-
expected: []string{source},
49+
name: "heading #1",
50+
splitPrefixes: []string{"# "},
51+
expected: []string{source},
5252
},
5353
{
54-
name: "heading #2",
55-
splitLevels: []int{2},
54+
name: "heading #2",
55+
splitPrefixes: []string{"## "},
5656
expected: []string{
5757
takeLines(source, 3),
5858
skipAndTakeLines(source, 4, 3),
@@ -62,16 +62,16 @@ func TestChunks(t *testing.T) {
6262
},
6363
},
6464
{
65-
name: "heading #3",
66-
splitLevels: []int{3},
65+
name: "heading #3",
66+
splitPrefixes: []string{"### "},
6767
expected: []string{
6868
takeLines(source, 11),
6969
skipAndTakeLines(source, 12, 11),
7070
},
7171
},
7272
{
73-
name: "heading #1 and #2",
74-
splitLevels: []int{1, 2},
73+
name: "heading #1 and #2",
74+
splitPrefixes: []string{"# ", "## "},
7575
expected: []string{
7676
takeLines(source, 3),
7777
skipAndTakeLines(source, 4, 3),
@@ -81,8 +81,8 @@ func TestChunks(t *testing.T) {
8181
},
8282
},
8383
{
84-
name: "heading #2 and #3",
85-
splitLevels: []int{2, 3},
84+
name: "heading #2 and #3",
85+
splitPrefixes: []string{"## ", "### "},
8686
expected: []string{
8787
takeLines(source, 3),
8888
skipAndTakeLines(source, 4, 3),
@@ -96,7 +96,7 @@ func TestChunks(t *testing.T) {
9696

9797
for _, tt := range tests {
9898
t.Run(tt.name, func(t *testing.T) {
99-
chunks := markdown.Chunks(source, tt.splitLevels)
99+
chunks := chunks.Chunks(source, tt.splitPrefixes)
100100

101101
if len(tt.expected) != len(chunks) {
102102
t.Fatalf("unexpected number of chunks. want %d; got %d", len(tt.expected), len(chunks))

internal/cli/cli.go

+43-18
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,26 @@ import (
1010
"io/fs"
1111
"os"
1212
"os/signal"
13+
"strings"
1314
"syscall"
1415
"time"
1516

1617
"github.com/alecthomas/kong"
1718
"github.com/modernice/dragoman"
19+
"github.com/modernice/dragoman/internal/chunks"
1820
"github.com/modernice/dragoman/openai"
1921
)
2022

21-
var options struct {
22-
SourcePath string `arg:"source" name:"source" optional:"" help:"Source file" type:"path" env:"DRAGOMAN_SOURCE"`
23-
SourceLang string `name:"from" short:"f" help:"Source language" env:"DRAGOMAN_SOURCE_LANG" default:"auto"`
24-
TargetLang string `name:"to" short:"t" help:"Target language" env:"DRAGOMAN_TARGET_LANG" default:"English"`
25-
Preserve []string `short:"p" help:"Preserve the specified terms/words" env:"DRAGOMAN_PRESERVE"`
26-
Rules []string `name:"rule" short:"r" help:"Additional rules for the prompt" env:"DRAGOMAN_RULES"`
27-
Out string `short:"o" help:"Output file" type:"path" env:"DRAGOMAN_OUT"`
28-
Update bool `short:"u" help:"Only translate missing fields in output file (requires JSON files)" env:"DRAGOMAN_UPDATE"`
29-
Dry bool `help:"Write the result to stdout" env:"DRAGOMAN_DRY_RUN"`
23+
type cliOptions struct {
24+
SourcePath string `arg:"source" name:"source" optional:"" help:"Source file" type:"path" env:"DRAGOMAN_SOURCE"`
25+
SourceLang string `name:"from" short:"f" help:"Source language" env:"DRAGOMAN_SOURCE_LANG" default:"auto"`
26+
TargetLang string `name:"to" short:"t" help:"Target language" env:"DRAGOMAN_TARGET_LANG" default:"English"`
27+
Preserve []string `short:"p" help:"Preserve the specified terms/words" env:"DRAGOMAN_PRESERVE"`
28+
Rules []string `name:"rule" short:"r" help:"Additional rules for the prompt" env:"DRAGOMAN_RULES"`
29+
Out string `short:"o" help:"Output file" type:"path" env:"DRAGOMAN_OUT"`
30+
Update bool `short:"u" help:"Only translate missing fields in output file (requires JSON files)" env:"DRAGOMAN_UPDATE"`
31+
Dry bool `help:"Write the result to stdout" env:"DRAGOMAN_DRY_RUN"`
32+
SplitChunks []string `name:"split-chunks" help:"Chunk source file at lines that start with one of the provided prefixes" env:"DRAGOMAN_SPLIT_CHUNKS"`
3033

3134
OpenAIKey string `name:"openai-key" help:"OpenAI API key" env:"OPENAI_KEY"`
3235
OpenAIModel string `name:"openai-model" help:"OpenAI model" env:"OPENAI_MODEL" default:"gpt-3.5-turbo"`
@@ -39,6 +42,8 @@ var options struct {
3942
Stream bool `short:"s" help:"Stream output to stdout"`
4043
}
4144

45+
var options cliOptions
46+
4247
// App coordinates the translation of structured text using AI language models.
4348
// It sets up a command-line interface with various options to specify source
4449
// and target languages, preserve certain terms, apply translation rules, and
@@ -158,15 +163,23 @@ func (app *App) Run() {
158163
options.SourceLang = ""
159164
}
160165

161-
result, err := translator.Translate(
162-
ctx,
163-
string(source),
164-
dragoman.Source(options.SourceLang),
165-
dragoman.Target(options.TargetLang),
166-
dragoman.Preserve(options.Preserve...),
167-
dragoman.Rules(options.Rules...),
168-
)
169-
app.kong.FatalIfErrorf(err)
166+
chunks := getChunks(string(source), options)
167+
168+
var results []string
169+
for _, chunk := range chunks {
170+
chunkResult, err := translator.Translate(
171+
ctx,
172+
chunk,
173+
dragoman.Source(options.SourceLang),
174+
dragoman.Target(options.TargetLang),
175+
dragoman.Preserve(options.Preserve...),
176+
dragoman.Rules(options.Rules...),
177+
)
178+
app.kong.FatalIfErrorf(err)
179+
results = append(results, chunkResult)
180+
}
181+
182+
result := strings.Join(results, "\n\n")
170183

171184
if options.Dry {
172185
fmt.Fprintf(os.Stdout, "%s\n", result)
@@ -262,3 +275,15 @@ func jsonMarshal(v any) ([]byte, error) {
262275
err := enc.Encode(v)
263276
return buf.Bytes(), err
264277
}
278+
279+
func getChunks(source string, opts cliOptions) []string {
280+
if len(opts.SplitChunks) == 0 {
281+
return []string{string(source)}
282+
}
283+
284+
if opts.Verbose {
285+
fmt.Fprintf(os.Stderr, "Splitting source file at lines with prefixes: %v\n", opts.SplitChunks)
286+
}
287+
288+
return chunks.Chunks(string(source), opts.SplitChunks)
289+
}

internal/markdown/chunks.go

-59
This file was deleted.

0 commit comments

Comments
 (0)