Is it possible to remove unused images when splitting pages? #1661

seisiuneer · 2024-08-01T04:01:34Z

seisiuneer
Aug 1, 2024

I am working on splitting a PDF document (PDF of music scores generated with a music transcription tool I've built) into individual page ranges, using a common pattern I've seen recommended for doing this sort of thing with pdf-lib:

/**
 * Split a PDF into one new PDF per requested page range.
 *
 * @param {*} pdfBytes - Source PDF data; handed unchanged to PDFDocument.load.
 * @param {Array<{start: number, end: number}>} ranges - Page ranges to extract.
 *     `start` and `end` are the page indices given to copyPages, inclusive on
 *     both ends. A range whose start is greater than its end yields a PDF
 *     with no copied pages.
 * @returns {Promise<Array>} The saved bytes of each new PDF, in the same order
 *     as `ranges`.
 */
async function splitPDF(pdfBytes, ranges) {

    const { PDFDocument } = PDFLib;

    const originalPdf = await PDFDocument.load(pdfBytes);

    const splitPdfs = [];

    for (const range of ranges) {

      // Inclusive list of page indices for this range.
      const pageIndices = [];
      for (let i = range.start; i <= range.end; i++) {
        pageIndices.push(i);
      }

      const newPdf = await PDFDocument.create();

      // Copy the whole range in a single call instead of awaiting one call per
      // page. NOTE(review): a single call presumably also lets pdf-lib reuse
      // objects shared between the copied pages rather than duplicating them
      // per call - confirm against the pdf-lib version in use.
      const copiedPages = await newPdf.copyPages(originalPdf, pageIndices);

      for (const copiedPage of copiedPages) {
        newPdf.addPage(copiedPage);
      }

      splitPdfs.push(await newPdf.save());

    }

    return splitPdfs;

}

Unfortunately, every split file that gets written out still contains all of the images referenced in the original PDF, and I can see entries for them in the new document's context (its indirectObjects map). As a result, each split file is essentially the same size as the original complete PDF.

It looks like copyPages() doesn't filter out the unused images, it just copies the entire set of images referenced in the original PDF document you're copying from.

If I look at the actual operators using a PDF parser, I can see they only reference the images being used for the page range, but the resulting PDF files are all essentially the size of the original PDF file before the split.

I've seen a few posts about issues with file size using copyPages() to split the files, and I'm guessing this is the root cause.

Anyone have a workaround?

I'd far prefer to have this feature in my tool rather than recommending that users use Adobe Acrobat to both split and optimize the PDF files I export from my tool if they want to split them into individual page ranges.

seisiuneer · 2024-08-04T04:05:47Z

seisiuneer
Aug 4, 2024
Author

I was finally able to split the pages by range and delete the unused images before exporting the split pages. The code below returns the PDF bytes for a range of pages from an original PDF, with the unused images stripped.

Note that this works for me because I know exactly how the original PDF is created (with jsPDF) and I know its deterministic structure; it may not be a general solution. I figured that I needed three things: a way to get at the list of XObject images used in the document, a way to delete them, and a way to get at the raw command stream for each page, from which I could work out which images are actually used in the split pages and delete the rest.

splitPDF(originalPdf, range) takes in a PDFDocument and a {start:startpage, end:endpage} range and returns the bytes for the new split PDF document for saving or other processing. In my case, I just put them in a Blob and save the file (code not provided here).

// Count the entries of an XObject dictionary whose encoded name contains "/I".
// `dict` is expected to expose its entries through a Map-like `dict` property
// keyed by objects that carry an `encodedName` string.
function countImagesInPDF(dict) {

	let imageTotal = 0;

	for (const [entryName] of dict.dict.entries()) {

		if (entryName.encodedName.includes("/I")) {
			imageTotal += 1;
		}
	}

	return imageTotal;

}

// Return the first key of `dict` whose `encodedName` equals `value`
// (for example '/I3'), or undefined when no key matches.
const findKeyForValue = (value, dict) => {

	for (const [entryName] of dict.dict.entries()) {

		// Loose comparison kept deliberately so matching is unchanged.
		if (entryName.encodedName == value) {
			return entryName;
		}
	}

	return undefined;
};

// Parse the content stream(s) for this page and find the images it draws.
//
// Scans the page's content text for `/I<number> Do` operators and returns the
// numbers, in order of appearance (duplicates are kept).
//
// thePage - a page object exposing `node.Contents()`.
// Returns an array of integers; an empty array when the page has no contents.
//
// NOTE(review): the bytes in `contents` are decoded as text as-is. If the
// stream is stored compressed this will find nothing - confirm the streams
// produced by your PDF generator are uncompressed before relying on the result.
function getImagesInThisPage(thePage){

	const theContents = thePage.node.Contents();

	// A page without any content stream draws no images.
	if (!theContents) {
		return [];
	}

	// Contents is either a single stream (has `contents`) or an array-like
	// object of streams exposing size()/lookup(); normalise to a plain list.
	const streams = [];
	if (typeof theContents.size === 'function' && typeof theContents.lookup === 'function') {
		const nStreams = theContents.size();
		for (let i = 0; i < nStreams; ++i) {
			streams.push(theContents.lookup(i));
		}
	} else {
		streams.push(theContents);
	}

	const decoder = new TextDecoder('utf-8');

	// Join with a newline so tokens from adjacent streams never fuse together.
	const rawString = streams
		.map(stream => (stream && stream.contents) ? decoder.decode(stream.contents) : '')
		.join('\n');

	// Match the operator anywhere in the text, not only at the start of a line:
	// content streams are not required to put each operator on its own line.
	// Names that are not "/I" followed by digits are ignored rather than
	// being reported as NaN.
	const imagePattern = /\/I(\d+)\s+Do\b/g;

	const imageList = [];

	let match;
	while ((match = imagePattern.exec(rawString)) !== null) {
		imageList.push(Number.parseInt(match[1], 10));
	}

	return imageList;
}

/**
 * Copy a range of pages into a new PDF and delete the image objects that the
 * copied pages do not draw.
 *
 * For every page in the new document, each XObject entry named `/I<number>`
 * whose number is not reported by getImagesInThisPage() for that page is
 * removed from the new document's context.
 *
 * @param {*} originalPdf - Loaded source document (passed to copyPages).
 * @param {{start: number, end: number}} range - Page indices to copy,
 *     inclusive on both ends.
 * @returns {Promise<*>} The result of saving the new document.
 */
async function splitPDF(originalPdf, range) {

	const {
		PDFDocument,
		PDFName,
		PDFDict
	} = PDFLib;

	const newPdf = await PDFDocument.create();

	// NOTE(review): pages are copied with one copyPages call each, exactly as
	// before. The per-page pruning below presumably relies on every page
	// owning its own copy of the image objects - confirm before batching
	// these calls into one.
	for (let i = range.start; i <= range.end; i++) {

		const [copiedPage] = await newPdf.copyPages(originalPdf, [i]);

		newPdf.addPage(copiedPage);

	}

	for (const thisPage of newPdf.getPages()) {

		// Pages without resources or without an XObject dictionary have
		// nothing to prune; skip them instead of failing the whole split.
		const resources = thisPage.node.Resources();

		const xObjects = resources
			? resources.lookupMaybe(PDFName.of('XObject'), PDFDict)
			: undefined;

		if (!xObjects) {
			continue;
		}

		// Image numbers actually drawn by this page's command stream.
		const usedImages = new Set(getImagesInThisPage(thisPage));

		// Walk the dictionary itself rather than assuming the images are
		// numbered 0..count-1 without gaps.
		for (const [key, imageRef] of xObjects.entries()) {

			const match = /^\/I(\d+)$/.exec(key.encodedName);

			if (!match) {
				continue;
			}

			if (usedImages.has(Number.parseInt(match[1], 10))) {
				continue;
			}

			if (imageRef) {
				newPdf.context.delete(imageRef);
			}
		}
	}

	// Declared locally: the undeclared assignment used before leaked a global
	// (and throws in strict mode / ES modules).
	const newPdfBytes = await newPdf.save();

	return newPdfBytes;

}

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Is it possible to remove unused images when splitting pages? #1661

{{title}}

Replies: 1 comment

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Select a reply

Is it possible to remove unused images when splitting pages? #1661

seisiuneer Aug 1, 2024

Replies: 1 comment

seisiuneer Aug 4, 2024 Author

seisiuneer
Aug 1, 2024

seisiuneer
Aug 4, 2024
Author