Skip to content
This repository was archived by the owner on Mar 28, 2025. It is now read-only.

Commit e91f532

Browse files
authored
Add extract method (#26)
* add extract method, tests, and example * version bump * export IContent type
1 parent e0ee4ce commit e91f532

File tree

7 files changed

+4565
-2
lines changed

7 files changed

+4565
-2
lines changed

examples/extractContent.ts

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import { IndexifyClient } from "../src";
2+
const fs = require("fs");
3+
4+
// Extract wikipedia article directly from client
5+
/**
 * Example: send a locally stored Wikipedia HTML page straight to a running
 * extractor through `client.extract` and print a short preview of every piece
 * of content that comes back.
 */
(async () => {
  // Initialize client
  const client = await IndexifyClient.createClient();

  // Read html file
  const html = fs.readFileSync(
    __dirname + "/../tests/files/steph_curry.html",
    "utf8"
  );

  // Call extract method on running wikipedia extractor
  const response = await client.extract({
    name: "tensorlake/wikipedia",
    content: { bytes: html, content_type: "text/plain" },
  });

  // Output preview
  console.log(`Number of contents created ${response.content.length}`);
  response.content.forEach((c, i) => {
    console.log(`content ${i}`, c.content_type);
    if (c.content_type === "text/plain" && c.bytes.length) {
      // Decode through Buffer rather than String.fromCharCode(...c.bytes):
      // spreading a large byte array into call arguments can exceed the
      // engine's argument limit, and fromCharCode treats every byte as a
      // UTF-16 code unit, which garbles multi-byte UTF-8 text.
      const text = Buffer.from(c.bytes).toString("utf8");
      console.log("Preview:", text.slice(0, 100));
    }
    if (c.features?.length) {
      console.log(`Features:`, c.features);
    }
  });

  /*
  The output should look like the following:

  Number of contents created 29
  content 0 text/plain
  Preview: Curry is the son of Sonya and Dell Curry. He was born in Akron, Ohio, at Summa Akron City Hospital,
  Features: [
    {
      feature_type: 'metadata',
      name: 'metadata',
      data: { headline: 'Early life', title: 'Stephen Curry' }
    }
  ]
  content 1 text/plain
  Preview: Before Curry even played in his first game for the Wildcats, head coach Bob McKillop praised him at
  Features: [
    {
      feature_type: 'metadata',
      name: 'metadata',
      data: { headline: 'College career', title: 'Stephen Curry' }
    }
  ]
  */
})().catch((err: unknown) => {
  // Report the failure explicitly instead of leaving the promise unhandled.
  console.error("Extract example failed:", err);
  process.exitCode = 1;
});

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "getindexify",
3-
"version": "0.0.33",
3+
"version": "0.0.34",
44
"description": "This is the TypeScript client for interacting with the Indexify service.",
55
"main": "./dist/index.js",
66
"module": "./dist/index.mjs",

src/client.ts

+27
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ import {
1313
IBaseContentMetadata,
1414
ISchema,
1515
IMtlsConfig,
16+
IContent,
17+
IExtractResponse,
1618
} from "./types";
1719
import { Agent } from "https";
1820
import { randomUUID } from "crypto";
@@ -352,6 +354,31 @@ class IndexifyClient {
352354
return policies;
353355
}
354356

357+
/**
 * Sends one piece of content to a named extractor and returns the
 * extractor's response body.
 *
 * @param name - Extractor name forwarded in the request body.
 * @param input_params - Optional extractor parameters; passed through
 *   `JSON.stringify`, which evaluates to `undefined` when they are omitted.
 * @param content - Content to extract from. `features` defaults to `[]` and
 *   `labels` to `{}` when the caller leaves them out.
 * @returns The `data` field of the HTTP response.
 *
 * NOTE(review): the URL is built from `DEFAULT_SERVICE_URL`, not from any
 * per-instance setting — confirm this is intended for clients pointed at a
 * non-default service.
 */
async extract({
  name,
  input_params,
  content,
}: {
  name: string;
  input_params?: Record<string, string | number>;
  content: IContent;
}): Promise<IExtractResponse> {
  const { content_type, bytes, features = [], labels = {} } = content;
  const endpoint = `${DEFAULT_SERVICE_URL}/extractors/extract`;
  const payload = {
    name,
    content: { content_type, bytes, features, labels },
    input_params: JSON.stringify(input_params),
  };

  const { data } = await this.client.post(endpoint, payload);
  return data;
}
381+
355382
async ingestRemoteFile(
356383
url: string,
357384
mime_type: string,

src/index.ts

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
ITask,
1414
IDocument,
1515
ISchema,
16+
IContent
1617
} from "./types";
1718

1819
export {
@@ -30,4 +31,5 @@ export {
3031
ISearchIndexResponse,
3132
ITask,
3233
IDocument,
34+
IContent
3335
};

src/types.ts

+26-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ export interface IBaseContentMetadata {
4242
source: string;
4343
size: number;
4444
}
45-
4645
export interface IContentMetadata extends IBaseContentMetadata {
4746
content_url: string;
4847
}
@@ -83,6 +82,31 @@ export interface IDocument {
8382
id?: string;
8483
}
8584

85+
/**
 * A single feature attached to a piece of content or returned by an
 * extractor.
 */
export interface IFeature {
  /** Kind of feature; restricted to these three literal values. */
  feature_type: "embedding" | "metadata" | "unknown";
  /** Name of the feature. */
  name: string;
  /** Free-form payload keyed by string. */
  data: Record<string, any>;
}
90+
91+
/**
 * Content supplied by a caller, e.g. as the `content` argument of
 * `IndexifyClient.extract`.
 */
export interface IContent {
  /** Content type string of the payload, e.g. "text/plain". */
  content_type: string;
  /** Payload, given either as a string or as an array of numbers. */
  bytes: number[] | string;
  /** Optional features to attach to the content. */
  features?: IFeature[];
  /** Optional string-to-string labels. */
  labels?: { [key: string]: string };
}
97+
98+
/**
 * Content as it appears in an extract response. Unlike `IContent`, `bytes`
 * is always an array of numbers.
 */
export interface IContentResp {
  /** Content type string of the payload, e.g. "text/plain". */
  content_type: string;
  /** Payload as an array of numbers. */
  bytes: Array<number>;
  /** Features attached to this content, if any. */
  features?: IFeature[];
  /** String-to-string labels, if any. */
  labels?: { [key: string]: string };
}
104+
105+
/** Shape of the value returned by `IndexifyClient.extract`. */
export interface IExtractResponse {
  /** Features listed at the top level of the response. */
  features: Array<IFeature>;
  /** Content items listed in the response. */
  content: Array<IContentResp>;
}
109+
86110
export interface ISearchIndexResponse {
87111
content_id: string;
88112
text: string;
@@ -99,3 +123,4 @@ export interface IMtlsConfig {
99123
keyPath: string;
100124
caPath?: string; // Optional, only if using a custom CA
101125
}
126+

tests/client.test.ts

+33
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import { IndexifyClient } from "../src";
22
import { IExtractionPolicy } from "../src/types";
3+
import { isAxiosError } from "axios";
4+
const fs = require("fs");
35

46
jest.setTimeout(30000);
57

@@ -173,6 +175,37 @@ test("Ingest remote url", async () => {
173175
);
174176
});
175177

178+
// Exercises `client.extract` against two extractors: an embedding extractor
// that returns only features, and the wikipedia extractor that returns content.
// Registered with plain `test` (not `test.only`) so the rest of the suite in
// this file still runs.
test("Test Extract Method", async () => {
  // Test minilm feature extract
  const client = await IndexifyClient.createClient();
  const res = await client.extract({
    name: "tensorlake/minilm-l6",
    content: { bytes: "testing", content_type: "text/plain" },
  });

  expect(res.content.length).toBe(0);
  expect(res.features.length).toBe(1);

  // Test wiki content extraction
  const html = fs.readFileSync(__dirname + "/files/steph_curry.html", "utf8");
  const res2 = await client.extract({
    name: "tensorlake/wikipedia",
    content: { bytes: html, content_type: "text/plain" },
  });
  expect(res2.content.length).toBe(29);

  // Test ninth piece of content (index 8)
  const content = res2.content[8];
  // Decode via Buffer: spreading the byte array into String.fromCharCode can
  // exceed the argument limit on large content and mis-decodes UTF-8.
  expect(Buffer.from(content.bytes).toString("utf8")).toBe(
    "NCAA Davidson College NBA Golden State Warriors"
  );
  expect(content.features?.length).toBe(1);
  expect(content.features?.[0].feature_type).toBe("metadata");
  expect(content.content_type).toBe("text/plain");
  expect(content.features?.[0].data.headline).toBe("Records");
  expect(content.features?.[0].data.title).toBe("Stephen Curry");
});
208+
176209
// test.only("MTLS", async () => {
177210
// const fs = require("fs")
178211
// const https = require("https")

0 commit comments

Comments
 (0)