Skip to content
This repository was archived by the owner on Mar 28, 2025. It is now read-only.

Commit fedd5ca

Browse files
authored
Extraction graph from yaml (#34)
* extraction graph from yaml * version bump
1 parent 0b9f008 commit fedd5ca

File tree

6 files changed

+129
-56
lines changed

6 files changed

+129
-56
lines changed

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "getindexify",
3-
"version": "0.0.42",
3+
"version": "0.0.43",
44
"description": "This is the TypeScript client for interacting with the Indexify service.",
55
"main": "./dist/index.js",
66
"module": "./dist/index.mjs",

src/ExtractionGraph.ts

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { IExtractionPolicy } from "./types";
2+
import yaml from "yaml";
3+
4+
/**
 * Client-side description of an extraction graph: a named collection of
 * extraction policies, optionally carrying an id and a namespace.
 *
 * Instances can be built directly, from a plain object (`fromDict`), or from
 * a YAML document (`fromYaml`).
 */
class ExtractionGraph {
  id?: string;
  name: string;
  namespace?: string;
  extraction_policies: IExtractionPolicy[];

  /**
   * @param id - Optional identifier of the graph.
   * @param name - Name of the graph.
   * @param namespace - Optional namespace the graph belongs to.
   * @param extraction_policies - Policies that make up the graph.
   */
  constructor({
    id,
    name,
    namespace,
    extraction_policies,
  }: {
    id?: string;
    name: string;
    namespace?: string;
    extraction_policies: IExtractionPolicy[];
  }) {
    this.id = id;
    this.name = name;
    this.namespace = namespace;
    this.extraction_policies = extraction_policies;
  }

  /**
   * Builds a graph from a plain object.
   *
   * Only `id`, `name` and `extraction_policies` are read. A `namespace` key in
   * the input is ignored, so the resulting graph has no namespace. The input
   * object is never modified.
   *
   * @param json - Mapping holding the graph fields.
   * @returns A new graph; `extraction_policies` defaults to an empty array
   *   when the input does not provide one.
   * @throws TypeError if `json` is not a non-array object.
   */
  static fromDict(json: Record<string, any>): ExtractionGraph {
    // Reject null, scalars and arrays up front with a readable message
    // instead of failing later on a property lookup.
    if (json === null || typeof json !== "object" || Array.isArray(json)) {
      throw new TypeError(
        "ExtractionGraph.fromDict expects a mapping with 'name' and 'extraction_policies'"
      );
    }
    // `namespace` is deliberately not forwarded; it is skipped rather than
    // deleted so the caller's object stays untouched.
    return new ExtractionGraph({
      id: json.id,
      name: json.name,
      extraction_policies: json.extraction_policies ?? [],
    });
  }

  /**
   * Parses a YAML document and builds a graph from its top-level mapping.
   *
   * @param spec - YAML text describing the graph.
   * @returns The graph described by `spec`.
   * @throws TypeError if the document does not parse to a mapping
   *   (for example an empty document, a scalar, or a list).
   */
  static fromYaml(spec: string): ExtractionGraph {
    const parsed = yaml.parse(spec);
    return ExtractionGraph.fromDict(parsed);
  }

  /**
   * Converts the graph to a plain object, leaving out every field whose
   * value is `null` or `undefined`.
   *
   * @returns A new object holding only the populated fields.
   */
  toDict(): Record<string, any> {
    const filteredDict: Record<string, any> = {};
    for (const [key, value] of Object.entries(this)) {
      if (value !== null && value !== undefined) {
        filteredDict[key] = value;
      }
    }
    return filteredDict;
  }
}
53+
54+
export default ExtractionGraph;

src/client.ts

+27-26
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ import Extractor from "./extractor";
33
import {
44
IContentMetadata,
55
IExtractor,
6-
IExtractionGraph,
76
IIndex,
87
INamespace,
98
ITask,
@@ -20,20 +19,21 @@ import {
2019
} from "./types";
2120
import { v4 as uuidv4 } from "uuid";
2221
import CryptoJS from "crypto-js";
22+
import ExtractionGraph from "./ExtractionGraph";
2323

2424
const DEFAULT_SERVICE_URL = "http://localhost:8900"; // Set your default service URL
2525

2626
class IndexifyClient {
2727
public serviceUrl: string;
2828
private client: AxiosInstance;
2929
public namespace: string;
30-
public extractionGraphs: IExtractionGraph[];
30+
public extractionGraphs: ExtractionGraph[];
3131

3232
constructor(
3333
serviceUrl: string = DEFAULT_SERVICE_URL,
3434
namespace: string = "default",
3535
// optional mtls config
36-
extractionGraphs: IExtractionGraph[],
36+
extractionGraphs: ExtractionGraph[],
3737
httpsAgent?: any
3838
) {
3939
this.serviceUrl = serviceUrl;
@@ -62,13 +62,17 @@ class IndexifyClient {
6262
return new IndexifyClient(
6363
serviceUrl,
6464
namespace,
65-
response.data.namespace.extraction_graphs.map((graph: { extraction_policies: any[]; }) => ({
66-
...graph,
67-
extraction_policies: graph.extraction_policies.map((policy: { filters_eq: any; }) => ({
68-
...policy,
69-
labels_eq: policy.filters_eq, // Transform filters_eq to labels_eq
70-
}))
71-
})),
65+
response.data.namespace.extraction_graphs.map(
66+
(graph: { extraction_policies: any[] }) => ({
67+
...graph,
68+
extraction_policies: graph.extraction_policies.map(
69+
(policy: { filters_eq: any }) => ({
70+
...policy,
71+
labels_eq: policy.filters_eq, // Transform filters_eq to labels_eq
72+
})
73+
),
74+
})
75+
),
7276
IndexifyClient.getHttpsAgent({ mtlsConfig })
7377
);
7478
}
@@ -161,7 +165,7 @@ class IndexifyClient {
161165
mtlsConfig,
162166
}: {
163167
name: string;
164-
extractionGraphs?: IExtractionGraph[];
168+
extractionGraphs?: ExtractionGraph[];
165169
labels?: Record<string, string>;
166170
mtlsConfig?: IMtlsConfig;
167171
}) {
@@ -205,17 +209,14 @@ class IndexifyClient {
205209
}
206210

207211
async createExtractionGraph(
208-
name: string,
209-
extractionPolicies: IExtractionPolicy | IExtractionPolicy[]
212+
extractionGraph: ExtractionGraph
210213
): Promise<IAddExtractorGraphResponse> {
211-
const policiesArray = Array.isArray(extractionPolicies)
212-
? extractionPolicies
213-
: [extractionPolicies];
214-
215-
const resp = await this.client.post("extraction_graphs", {
216-
name,
217-
extraction_policies: policiesArray,
218-
});
214+
const data = {
215+
name: extractionGraph.name,
216+
extraction_policies: extractionGraph.extraction_policies,
217+
}
218+
console.log("create extraction graph", JSON.stringify(data.extraction_policies));
219+
const resp = await this.client.post("extraction_graphs", data);
219220

220221
// refresh this.extractionGraphs so it includes the newly created graph
221222
await this.getExtractionGraphs();
@@ -364,15 +365,15 @@ class IndexifyClient {
364365
Object.keys(labels).forEach((key) => {
365366
formData.append(key, labels[key]);
366367
});
367-
368+
368369
// Upload File
369370
const res = await this.client.post("upload_file", formData, {
370371
headers: {
371372
...formData.getHeaders(),
372373
},
373374
params,
374375
});
375-
return res.data.content_id
376+
return res.data.content_id;
376377
} else {
377378
// browser
378379
if (!isBlob(fileInput)) {
@@ -390,13 +391,13 @@ class IndexifyClient {
390391

391392
// Upload File
392393
const res = await this.client.post("/upload_file", formData, {
393-
params
394+
params,
394395
});
395-
return res.data.content_id
396+
return res.data.content_id;
396397
}
397398
}
398399

399-
async getExtractionGraphs(): Promise<IExtractionGraph[]> {
400+
async getExtractionGraphs(): Promise<ExtractionGraph[]> {
400401
const resp = await this.client.get("");
401402
const extractionGraphs = resp.data.namespace?.extraction_graphs ?? [];
402403
this.extractionGraphs = extractionGraphs;

src/index.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import ExtractionGraph from "./ExtractionGraph";
12
import IndexifyClient from "./client";
23
import Extractor from "./extractor";
34
import {
@@ -8,7 +9,6 @@ import {
89
IIndex,
910
IContentMetadata,
1011
IExtractedMetadata,
11-
IExtractionGraph,
1212
IExtractionPolicy,
1313
ISearchIndexResponse,
1414
ITask,
@@ -29,7 +29,7 @@ export {
2929
IIndex,
3030
IContentMetadata,
3131
IExtractedMetadata,
32-
IExtractionGraph,
32+
ExtractionGraph,
3333
IExtractionPolicy,
3434
ISearchIndexResponse,
3535
ITask,

src/types.ts

+4-9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import ExtractionGraph from "./ExtractionGraph";
2+
13
export interface INamespace {
24
name: string;
3-
extraction_graphs: IExtractionGraph[];
5+
extraction_graphs: ExtractionGraph[];
46
}
57

68
export interface IEmbeddingSchema {
@@ -64,21 +66,14 @@ export interface IExtractedMetadata {
6466
extractor_name: string;
6567
}
6668

67-
export interface IExtractionGraph {
68-
id: string;
69-
name: string;
70-
namespace: string;
71-
extraction_policies: IExtractionPolicy[];
72-
}
73-
7469
export interface IExtractionPolicy {
7570
id?: string;
7671
extractor: string;
7772
name: string;
7873
labels_eq?: string;
7974
input_params?: Record<string, string | number>;
8075
content_source?: string;
81-
graph_name: string;
76+
graph_name?: string;
8277
}
8378

8479
export interface ITaskContentMetadata {

tests/client.test.ts

+41-18
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { IndexifyClient } from "../src";
2+
import ExtractionGraph from "../src/ExtractionGraph";
23
import { IExtractionPolicy } from "../src/types";
34
import { isAxiosError } from "axios";
45

@@ -23,16 +24,15 @@ async function setupExtractionGraph(
2324
extractor: string
2425
): Promise<string[]> {
2526
const nanoid = generateNanoId(8);
26-
const extractionPolicy: IExtractionPolicy = {
27-
extractor,
28-
name: `extractor.${nanoid}`,
29-
graph_name: extractionGraphName
30-
};
31-
const resp = await client.createExtractionGraph(
32-
extractionGraphName,
33-
extractionPolicy
34-
);
35-
return resp.indexes
27+
28+
const graph = ExtractionGraph.fromYaml(`
29+
name: '${extractionGraphName}'
30+
extraction_policies:
31+
- extractor: '${extractor}'
32+
name: 'extractor.${nanoid}'
33+
`);
34+
const resp = await client.createExtractionGraph(graph);
35+
return resp.indexes;
3636
}
3737

3838
test("createClient", async () => {
@@ -119,8 +119,8 @@ test("searchIndex", async () => {
119119
extractionGraphName,
120120
"tensorlake/minilm-l6"
121121
);
122-
123-
expect(indexes.length).toBe(1)
122+
123+
expect(indexes.length).toBe(1);
124124
await client.addDocuments(extractionGraphName, [
125125
{ text: "This is a test1", labels: { source: "test" } },
126126
{ text: "This is a test2", labels: { source: "test" } },
@@ -198,7 +198,10 @@ test("getStructuredMetadata", async () => {
198198
"tensorlake/wikipedia"
199199
);
200200

201-
const contentId = await client.uploadFile(extractionGraphName, `${__dirname}/files/steph_curry.html`);
201+
const contentId = await client.uploadFile(
202+
extractionGraphName,
203+
`${__dirname}/files/steph_curry.html`
204+
);
202205
await new Promise((r) => setTimeout(r, 10000));
203206
const extractedMetadata = await client.getStructuredMetadata(contentId);
204207
expect(extractedMetadata.length).toBeGreaterThanOrEqual(1);
@@ -215,16 +218,18 @@ test("getSchemas", async () => {
215218
extractionGraphName,
216219
"tensorlake/wikipedia"
217220
);
218-
221+
219222
// upload html
220-
await client.uploadFile(extractionGraphName, `${__dirname}/files/steph_curry.html`);
223+
await client.uploadFile(
224+
extractionGraphName,
225+
`${__dirname}/files/steph_curry.html`
226+
);
221227
await new Promise((r) => setTimeout(r, 10000));
222228

223-
224229
const schemas = await client.getSchemas();
225230
expect(schemas.length).toBe(1);
226-
expect(schemas[0].extraction_graph_name).toBe(extractionGraphName)
227-
expect(Object.keys(schemas[0].columns).length).toBe(13)
231+
expect(schemas[0].extraction_graph_name).toBe(extractionGraphName);
232+
expect(Object.keys(schemas[0].columns).length).toBe(13);
228233
});
229234

230235
test("downloadContent", async () => {
@@ -321,6 +326,24 @@ test("extract", async () => {
321326
expect(content.features?.[0].data.title).toBe("Stephen Curry");
322327
});
323328

329+
test("extractionGraph from yaml", async () => {
330+
const graph = ExtractionGraph.fromYaml(`
331+
name: 'nbakb'
332+
extraction_policies:
333+
- extractor: 'tensorlake/chunk-extractor'
334+
name: 'chunker'
335+
input_params:
336+
chunk_size: 1000
337+
overlap: 100
338+
- extractor: 'tensorlake/minilm-l6'
339+
name: 'wikiembedding'
340+
content_source: 'chunker'
341+
`);
342+
expect(graph.extraction_policies.length).toBe(2);
343+
expect(graph.id).toBe(undefined);
344+
expect(graph.name).toBe("nbakb");
345+
});
346+
324347
test("generateHashFromString", async () => {
325348
const client = await IndexifyClient.createClient();
326349
expect(client.generateHashFromString("test")).toBe("9f86d081884c7d65");

0 commit comments

Comments
 (0)