Skip to content

Commit a4758eb

Browse files
committed
refactor
1 parent 54c27a5 commit a4758eb

13 files changed

Lines changed: 1421 additions & 1044 deletions

File tree

.editorconfig

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# EditorConfig helps developers define and maintain consistent
2+
# coding styles between different editors and IDEs
3+
# editorconfig.org
4+
#
5+
# Some of these options are also respected by Prettier
6+
root = true
7+
8+
[*]
9+
indent_style = space
10+
indent_size = 2
11+
12+
end_of_line = lf
13+
charset = utf-8
14+
trim_trailing_whitespace = true
15+
insert_final_newline = true
16+
17+
# Markdown syntax specifies that trailing whitespaces can be meaningful,
18+
# so let’s not trim those. e.g. 2 trailing spaces = linebreak (<br />)
19+
# See https://daringfireball.net/projects/markdown/syntax#p
20+
[*.md]
21+
trim_trailing_whitespace = false

.github/workflows/ci.yml

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
pull_request:
6+
7+
concurrency:
8+
group: ${{ github.workflow }}-${{ github.ref }}
9+
cancel-in-progress: true
10+
11+
permissions:
12+
checks: write # needed for checks
13+
id-token: write # needed for npm publish with provenance
14+
contents: write # needed for github release
15+
pull-requests: write # needed for coverage comment
16+
17+
jobs:
18+
lint:
19+
name: Lint
20+
runs-on: ubuntu-latest
21+
steps:
22+
- uses: actions/checkout@v4
23+
24+
- name: Setup Biome
25+
uses: biomejs/setup-biome@v2
26+
27+
- name: Lint
28+
run: biome ci --no-errors-on-unmatched .
29+
30+
test:
31+
name: Test
32+
runs-on: ubuntu-latest
33+
timeout-minutes: 15
34+
steps:
35+
- uses: actions/checkout@v4
36+
37+
- name: Setup and Install
38+
uses: zirkelc/setup-and-install@v1
39+
with:
40+
node-version: 20
41+
42+
- name: Build
43+
run: pnpm build
44+
45+
- name: Test
46+
run: pnpm test
47+
48+
preview:
49+
name: Preview
50+
runs-on: ubuntu-latest
51+
steps:
52+
- uses: actions/checkout@v4
53+
54+
- run: corepack enable
55+
56+
- name: Setup and Install
57+
uses: zirkelc/setup-and-install@v1
58+
with:
59+
node-version: 20
60+
61+
- name: Build
62+
run: pnpm build
63+
64+
- name: Publish Preview
65+
run: npx pkg-pr-new publish --pnpm --packageManager=pnpm
66+
67+
release:
68+
name: Release
69+
runs-on: ubuntu-latest
70+
needs: [lint, test]
71+
if: github.ref == 'refs/heads/main'
72+
73+
steps:
74+
- uses: actions/checkout@v4
75+
76+
- name: Setup and Install
77+
uses: zirkelc/setup-and-install@v1
78+
with:
79+
node-version: 20
80+
81+
- name: Build
82+
run: pnpm build
83+
84+
- name: Publish
85+
uses: zirkelc/npm-publish@v1
86+
with:
87+
token: ${{ secrets.NPM_TOKEN }}
88+
dry-run: false
89+
provenance: true

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
A very opinionated markdown text splitter to create chunks worth embedding.
2+
3+
It is built around a few core ideas:
4+
5+
1. **Semantic length vs. character length**: the length of a piece of content should be measured in terms of how much information it contains (i.e. words), not how many raw characters it has. Embeddings store semantics, not characters; therefore, calculating the length of a chunk should ignore markdown formatting characters that may contribute to the semantics of the text but aren't actual information.
6+
2. **Markdown as tree**: markdown can be represented as a hierarchical tree, where each node (e.g. heading) potentially has child nodes (e.g. paragraphs, images, links). This relationship between headings and their content can be leveraged to create better chunks.
7+
3. **Atomic units**: the smallest unit for a chunk is a word. Splitting a word further into its characters does not make sense for embeddings.
8+
4. **Never break semantics**: certain semantic elements like links or images should never be split across chunks. Embedding a partial description (of an image or link) or a URL will not create a meaningful embedding.

biome.json

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"$schema": "https://biomejs.dev/schemas/2.1.3/schema.json",
3+
"vcs": {
4+
"enabled": false,
5+
"clientKind": "git",
6+
"useIgnoreFile": false
7+
},
8+
"files": {
9+
"ignoreUnknown": false
10+
},
11+
"formatter": {
12+
"enabled": true,
13+
"indentStyle": "space",
14+
"useEditorconfig": true
15+
},
16+
"linter": {
17+
"enabled": true,
18+
"rules": {
19+
"recommended": true
20+
}
21+
},
22+
"javascript": {
23+
"formatter": {
24+
"quoteStyle": "single",
25+
"semicolons": "always",
26+
"trailingCommas": "all",
27+
"arrowParentheses": "always"
28+
}
29+
},
30+
"assist": {
31+
"enabled": true,
32+
"actions": {
33+
"source": {
34+
"organizeImports": "on"
35+
}
36+
}
37+
}
38+
}

package.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,17 @@
55
"description": "",
66
"main": "index.js",
77
"scripts": {
8-
"test": "vitest"
8+
"test": "vitest",
9+
"lint": "biome check . --write --no-errors-on-unmatched"
910
},
1011
"keywords": [],
1112
"author": "",
1213
"license": "ISC",
1314
"devDependencies": {
15+
"@biomejs/biome": "2.1.3",
16+
"@total-typescript/tsconfig": "^1.0.4",
1417
"tsx": "^4.20.3",
1518
"typescript": "^5.8.3",
1619
"vitest": "^3.2.4"
1720
}
18-
}
21+
}

packages/text-splitter/src/debug-tsx.ts

Lines changed: 0 additions & 9 deletions
This file was deleted.

0 commit comments

Comments
 (0)