@@ -196,13 +196,13 @@ class IndexifyClient {
196
196
name : string ,
197
197
query : string ,
198
198
topK : number ,
199
- filters : string [ ] ,
199
+ filters ? : string [ ] ,
200
200
include_content : boolean = true
201
201
) : Promise < ISearchIndexResponse [ ] > {
202
202
const resp = await this . client . post ( `/indexes/${ name } /search` , {
203
203
query,
204
204
k : topK ,
205
- filters : filters ,
205
+ ... ( filters !== undefined && { filters } ) ,
206
206
include_content,
207
207
} ) ;
208
208
return resp . data [ "results" ] ;
@@ -224,88 +224,109 @@ class IndexifyClient {
224
224
}
225
225
226
226
/**
 * Fetches the content produced for one ingested item by a specific
 * extraction policy of a specific extraction graph.
 *
 * The method requests the content tree for `contentId` under the given graph
 * and policy, then keeps only the entries whose `extraction_graph_names`
 * include `graphName` and whose `source` equals `policyName`. Each kept entry
 * is converted with `baseContentToContentMetadata`.
 *
 * @param contentId - ID of the ingested content; also written to every
 *   returned entry as `ingested_content_id`.
 * @param graphName - Name of the extraction graph used in the request path
 *   and in the filter.
 * @param policyName - Name of the extraction policy used in the request path
 *   and in the filter.
 * @param blocking - When true, `waitForExtraction(contentId)` is awaited
 *   before the content tree is requested. Defaults to false.
 * @returns An object holding `contentList`. `total` stays in the declared
 *   type but is never populated by this method.
 */
async getExtractedContent({
  contentId,
  graphName,
  policyName,
  blocking = false,
}: {
  contentId: string;
  graphName: string;
  policyName: string;
  blocking?: boolean;
}): Promise<{ contentList: IContentMetadata[]; total?: number }> {
  if (blocking) {
    await this.waitForExtraction(contentId);
  }

  // The path must not contain whitespace between segments.
  const response = await this.client.get(
    `namespaces/${this.namespace}/extraction_graphs/${graphName}/extraction_policies/${policyName}/content/${contentId}`
  );

  const contentTree = response.data;
  const contentList: IContentMetadata[] = [];

  for (const item of contentTree.content_tree_metadata) {
    const belongsToGraph = item.extraction_graph_names.includes(graphName);
    const producedByPolicy = item.source === policyName;
    if (!belongsToGraph || !producedByPolicy) {
      continue;
    }

    // Copy field by field so the literal is checked against
    // IBaseContentMetadata; the root content id comes from the argument,
    // not from the tree entry.
    const baseContent: IBaseContentMetadata = {
      id: item.id,
      parent_id: item.parent_id,
      ingested_content_id: contentId,
      namespace: item.namespace,
      name: item.name,
      mime_type: item.mime_type,
      labels: item.labels,
      storage_url: item.storage_url,
      created_at: item.created_at,
      source: item.source,
      size: item.size,
      hash: item.hash,
      extraction_graph_names: item.extraction_graph_names,
    };

    contentList.push(this.baseContentToContentMetadata(baseContent));
  }

  return { contentList };
}
258
273
259
274
/**
 * Uploads one or more text documents to one or more extraction graphs.
 *
 * Every document is posted once per extraction graph as a multipart form
 * containing a `text/plain` file named `document.txt` and a `labels` field
 * with the JSON-encoded labels. A `mime_type: 'text/plain'` label is added
 * to each uploaded document.
 *
 * The caller's document objects are not modified: labels are copied before
 * `mime_type` is added, and a document without `labels` is treated as having
 * none.
 *
 * @param extractionGraphs - One extraction graph name or a list of names.
 * @param documents - A string, an `IDocument`, or an array mixing both.
 *   Strings become documents with empty labels.
 * @param docId - Used as the document `id` only when `documents` is a single
 *   string; ignored otherwise. NOTE(review): the `id` is not part of the
 *   form data sent here — confirm whether the server expects it.
 * @returns The `content_id` of every upload, ordered graph by graph and,
 *   within a graph, in document order.
 */
async addDocuments(
  extractionGraphs: string | string[],
  documents: IDocument | string | (IDocument | string)[],
  docId?: string
): Promise<string[]> {
  const extractionGraphsArray: string[] =
    typeof extractionGraphs === 'string' ? [extractionGraphs] : extractionGraphs;

  let normalizedDocuments: IDocument[];
  if (Array.isArray(documents)) {
    normalizedDocuments = documents.map((doc) =>
      typeof doc === 'string' ? { text: doc, labels: {}, id: undefined } : doc
    );
  } else if (typeof documents === 'string') {
    normalizedDocuments = [{ text: documents, labels: {}, id: docId }];
  } else {
    normalizedDocuments = [documents];
  }

  // Add mime_type to all documents, on a copy so the caller's objects (and
  // their labels) are left untouched. Spreading undefined labels is a no-op.
  const documentsArray: IDocument[] = normalizedDocuments.map((doc) => ({
    ...doc,
    labels: { ...doc.labels, mime_type: 'text/plain' },
  }));

  const contentIds: string[] = [];

  // Uploads are awaited one at a time so the returned ids keep a
  // deterministic graph-then-document order.
  for (const extractionGraph of extractionGraphsArray) {
    for (const doc of documentsArray) {
      const formData = new FormData();
      formData.append('file', new Blob([doc.text], { type: 'text/plain' }), 'document.txt');
      formData.append('labels', JSON.stringify(doc.labels));

      const response = await this.client.post(
        `namespaces/${this.namespace}/extraction_graphs/${extractionGraph}/extract`,
        formData,
        {
          headers: {
            'Content-Type': 'multipart/form-data',
          },
        }
      );

      contentIds.push(response.data.content_id);
    }
  }

  return contentIds;
}
310
331
311
332
async getContentMetadata ( id : string ) : Promise < IContentMetadata > {
@@ -318,11 +339,6 @@ class IndexifyClient {
318
339
return resp . data . metadata ;
319
340
}
320
341
321
- async getContentTree ( id : string ) : Promise < IContentMetadata [ ] > {
322
- const resp = await this . client . get ( `content/${ id } /content-tree` ) ;
323
- return resp . data . content_tree_metadata ;
324
- }
325
-
326
342
async downloadContent < T > ( id : string ) : Promise < T > {
327
343
try {
328
344
const response = await this . client . get ( `content/${ id } /download` ) ;
@@ -474,6 +490,29 @@ class IndexifyClient {
474
490
return resp . data ;
475
491
}
476
492
493
/**
 * Waits until extraction has finished for each given content ID.
 *
 * IDs are processed one after another; for each, a GET request is sent to
 * the `wait` endpoint of that content in the current namespace. Progress is
 * written to the console.
 *
 * @param contentIds - A single content ID or a list of content IDs.
 * @throws Rethrows any error raised by the request, and throws an `Error`
 *   when a response arrives with a status of 400 or above. The error is
 *   logged with its content ID before being rethrown, and the remaining IDs
 *   are not processed.
 */
async waitForExtraction(contentIds: string | string[]): Promise<void> {
  const ids = typeof contentIds === 'string' ? [contentIds] : contentIds;

  console.log("Waiting for extraction to complete for content id: ", ids.join(","));

  for (const contentId of ids) {
    try {
      const response = await this.client.get(
        `namespaces/${this.namespace}/content/${contentId}/wait`
      );

      // Check the status before reporting success, so a failing response is
      // never logged as a completed extraction.
      // NOTE(review): if the HTTP client already rejects on error statuses,
      // this branch is only a safety net — confirm the client configuration.
      if (response.status >= 400) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      console.log("Extraction completed for content id: ", contentId);
    } catch (error) {
      console.error(`Error waiting for extraction of content id ${contentId}:`, error);
      throw error;
    }
  }
}
515
+
477
516
async ingestRemoteFile (
478
517
url : string ,
479
518
mime_type : string ,
0 commit comments