-
Notifications
You must be signed in to change notification settings - Fork 6
Open
Description
I used AI to adjust this for my use case. I based it on https://apidocs.bridge.xyz/docs/getting-started
const { chromium, devices } = require('playwright');
const path = require('path');
const fs = require('fs');
/**
 * Crawls a documentation site and renders all of its chapters into one PDF.
 *
 * The spider first inspects the landing page to discover which selectors
 * identify the navigation, the main content and the header, then uses those
 * selectors to collect chapter links, fetch every chapter and assemble a
 * single printable document on the main page.
 */
class GitBookPDFSpider {
  /**
   * @param {object} options
   * @param {object} options.browser - Playwright browser; closed when run() finishes.
   * @param {object} options.page - Page used for the landing page and the final PDF.
   * @param {string} options.url - Landing page URL of the documentation site.
   * @param {string} options.bookName - Output path of the generated PDF.
   */
  constructor({ browser, page, url, bookName }) {
    this._browser = browser;
    this._mainPage = page;
    this._bookUrl = url;
    this._bookName = bookName;
    this._siteConfig = {}; // Populated by _analyzePageStructure()
  }

  /**
   * Launches a headless Chromium instance and builds a spider around it.
   *
   * @param {object} options
   * @param {string} options.url - Landing page URL of the documentation site.
   * @param {string} [options.bookName='Bridge.pdf'] - Output path of the PDF.
   * @returns {Promise<GitBookPDFSpider>}
   */
  static async create({ url, bookName = 'Bridge.pdf' }) {
    const browser = await chromium.launch({ headless: true });
    try {
      const page = await browser.newPage();
      return new GitBookPDFSpider({ browser, page, url, bookName });
    } catch (error) {
      // Do not leave the browser process running when no page could be opened.
      await browser.close();
      throw error;
    }
  }

  /**
   * Runs the whole pipeline and writes the PDF to `bookName`.
   *
   * Errors are logged rather than rethrown, and the browser is always closed.
   *
   * @returns {Promise<boolean>} `true` when the PDF was written, `false` when
   *   any step failed.
   */
  async run() {
    try {
      console.log(`Starting PDF generation for: ${this._bookUrl}`);

      // Step 1: Open main page and inspect structure
      await this._openMainPage();

      // Step 2: Analyze the page structure to determine correct selectors
      await this._analyzePageStructure();

      // Step 3: Get chapter links using detected selectors
      const chaptersMetaInfo = await this._getChaptersMetaInfo();
      console.log(`Found ${chaptersMetaInfo.length} chapters to process`);

      // Step 4: Fetch HTML content for all chapters
      const chaptersHTMLContent = await this._fetchAllChaptersHTMLContent(chaptersMetaInfo);
      console.log(`Successfully fetched content for ${chaptersHTMLContent.length} chapters`);

      // Step 5: Generate table of contents
      const tableOfContentsHTML = await this._generateTableOfContents(chaptersMetaInfo);

      // Step 6: Beautify the main page and prepare for PDF
      await this._beautifyMainPage();

      // Step 7: Combine all content into a single page
      await this._generateFullHTMLPage(chaptersHTMLContent, tableOfContentsHTML, chaptersMetaInfo);

      // Step 8: Generate the PDF
      console.log(`Generating PDF: ${this._bookName}`);
      await this._mainPage.pdf({
        path: this._bookName,
        format: 'A4',
        margin: {
          top: '50px',
          right: '50px',
          bottom: '50px',
          left: '50px',
        },
        printBackground: true,
      });
      console.log(`PDF successfully created: ${this._bookName}`);
      return true;
    } catch (error) {
      console.error('Error during execution:', error);
      return false;
    } finally {
      await this._browser.close();
    }
  }

  /**
   * Navigates the main page to the book URL and pauses so client-side
   * JavaScript has time to render the page.
   */
  _openMainPage = async () => {
    console.log('Opening main page:', this._bookUrl);
    await this._mainPage.goto(this._bookUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    // Wait a bit longer to ensure all JavaScript loads properly
    await this._mainPage.waitForTimeout(5000);
  };

  /**
   * Inspects the main page and stores the detected navigation, content and
   * header selectors in `this._siteConfig`.
   */
  _analyzePageStructure = async () => {
    console.log('Analyzing page structure to determine correct selectors...');

    // Everything inside evaluate() runs in the browser and must be
    // self-contained: it cannot see variables from this Node.js scope.
    const pageStructure = await this._mainPage.evaluate(() => {
      // Build a selector for an element, escaping ids/classes so that names
      // such as "md:flex" or ids starting with a digit stay valid CSS.
      const getSelector = (elem) => {
        if (!elem) return null;

        // Try common navigation elements
        if (['NAV', 'ASIDE', 'HEADER'].includes(elem.tagName)) {
          return elem.tagName.toLowerCase();
        }

        // Try by ID
        if (elem.id) return `#${CSS.escape(elem.id)}`;

        // Try by class (className is not a string on SVG elements)
        if (typeof elem.className === 'string') {
          const classes = elem.className.split(/\s+/).filter(Boolean);
          if (classes.length > 0) {
            return classes.map((name) => `.${CSS.escape(name)}`).join('');
          }
        }

        // Try by attribute
        for (const attr of ['role', 'data-testid', 'aria-label']) {
          if (elem.hasAttribute(attr)) {
            return `[${attr}="${CSS.escape(elem.getAttribute(attr))}"]`;
          }
        }

        // Fallback to a simple tag selector
        return elem.tagName.toLowerCase();
      };

      // Find navigation/sidebar element containing links
      const findNavElement = () => {
        // Common selectors for navigation/sidebar in documentation sites
        const possibleNavSelectors = [
          'nav', 'aside', '.sidebar', '.navigation', '.menu', '.toc',
          '[role="navigation"]', '[aria-label="Navigation"]', '[aria-label="Sidebar"]',
        ];
        for (const selector of possibleNavSelectors) {
          const navElem = document.querySelector(selector);
          const linkCount = navElem ? navElem.querySelectorAll('a').length : 0;
          if (linkCount > 3) {
            return { selector, linkCount, linkSelector: `${selector} a` };
          }
        }

        // Otherwise fall back to the element containing the most links
        let bestElement = null;
        let maxLinks = 0;
        for (const elem of document.querySelectorAll('*')) {
          const count = elem.querySelectorAll('a').length;
          if (count > maxLinks) {
            maxLinks = count;
            bestElement = elem;
          }
        }
        if (bestElement && maxLinks > 3) {
          const selector = getSelector(bestElement);
          return { selector, linkCount: maxLinks, linkSelector: `${selector} a` };
        }
        return null;
      };

      // Find main content element
      const findMainContentElement = () => {
        // Common selectors for main content in documentation sites
        const possibleContentSelectors = [
          'main', 'article', '.content', '.main-content', '.markdown-body', '.article',
          '[role="main"]', '[aria-label="Content"]',
        ];
        for (const selector of possibleContentSelectors) {
          const contentElem = document.querySelector(selector);
          if (contentElem && contentElem.textContent.trim().length > 200) {
            return selector;
          }
        }

        // Otherwise look for the largest text container
        let bestElement = null;
        let maxTextLength = 0;
        for (const elem of document.querySelectorAll('body *')) {
          // Skip elements that are likely navigation or headers
          if (['NAV', 'ASIDE', 'HEADER'].includes(elem.tagName)) continue;
          if (elem.querySelectorAll('a').length > 10) continue;
          const textLength = elem.textContent.trim().length;
          if (textLength > maxTextLength) {
            maxTextLength = textLength;
            bestElement = elem;
          }
        }
        if (bestElement && maxTextLength > 200) {
          return getSelector(bestElement);
        }
        return 'body'; // Fallback
      };

      // Find header element
      const findHeaderElement = () => {
        if (document.querySelector('header')) return 'header';
        const possibleHeaderSelectors = [
          '.header', '.app-header', '.navbar', '.nav-header', '.top-bar',
        ];
        for (const selector of possibleHeaderSelectors) {
          const elem = document.querySelector(selector);
          if (elem && elem.getBoundingClientRect().top < 100) {
            return selector;
          }
        }
        return null;
      };

      // Describe the links found inside the navigation element
      const analyzeLinks = (navSelector) =>
        Array.from(document.querySelectorAll(`${navSelector} a`))
          .map((link) => {
            // Copy the rect into a plain object so its values survive the
            // transfer back to Node.js.
            const { top, left, width, height } = link.getBoundingClientRect();
            return {
              href: link.href,
              text: link.textContent.trim(),
              isExternal: link.host !== window.location.host,
              rect: { top, left, width, height },
            };
          })
          .filter((info) =>
            info.text &&
            !info.isExternal &&
            info.text.length < 100 &&
            !info.text.includes('http'));

      const navElement = findNavElement();
      const linkInfoArr = navElement ? analyzeLinks(navElement.selector) : [];

      return {
        title: document.title,
        totalLinkCount: document.querySelectorAll('a').length,
        navSelector: navElement ? navElement.selector : null,
        navLinkSelector: navElement ? navElement.linkSelector : null,
        navLinkCount: navElement ? navElement.linkCount : 0,
        contentSelector: findMainContentElement(),
        headerSelector: findHeaderElement(),
        linkInfoSample: linkInfoArr.slice(0, 10), // Sample of first 10 links
      };
    });

    // Logged here rather than inside evaluate(): console.log in the page goes
    // to the browser console and would not show up in the terminal.
    console.log('Document title:', pageStructure.title);
    console.log('URLs on page:', pageStructure.totalLinkCount);
    console.log('Page analysis results:', JSON.stringify(pageStructure, null, 2));

    // Update the site configuration with the detected selectors
    this._siteConfig = {
      chapterLinksElmSelector: pageStructure.navLinkSelector || 'a[href]',
      bodySelector: 'body',
      bookContentSelector: pageStructure.contentSelector || 'main',
      headerSelector: pageStructure.headerSelector || 'header',
      sideBarSelector: pageStructure.navSelector || 'nav',
    };
    console.log('Using detected selectors:', JSON.stringify(this._siteConfig, null, 2));
  };

  /**
   * Collects the chapter links from the main page.
   *
   * Only links with short, non-URL text that point at the current hostname are
   * kept, and each URL is returned once even if it is linked repeatedly.
   *
   * @returns {Promise<Array<{url: string, title: string, id: string}>>}
   */
  _getChaptersMetaInfo = async () => {
    console.log('Getting chapter metadata using selector:', this._siteConfig.chapterLinksElmSelector);

    // First, check how many elements the selector matches
    const linkCount = await this._mainPage.evaluate(
      (selector) => document.querySelectorAll(selector).length,
      this._siteConfig.chapterLinksElmSelector,
    );
    console.log(`Found ${linkCount} potential chapter links`);

    if (linkCount === 0) {
      console.log('No links found. Analyzing document structure...');
      // Gather the diagnostics in the page, then print them from Node.js so
      // they are actually visible.
      const diagnostics = await this._mainPage.evaluate(() => {
        const allLinks = Array.from(document.querySelectorAll('a[href]'));
        return {
          bodyChildren: Array.from(document.body.children, (child) => ({
            tagName: child.tagName,
            id: child.id || 'none',
            className: (typeof child.className === 'string' && child.className) || 'none',
          })),
          totalLinks: allLinks.length,
          firstLinks: allLinks.slice(0, 10).map((link) => ({
            text: link.textContent.trim(),
            href: String(link.href),
          })),
        };
      });
      console.log(`Body has ${diagnostics.bodyChildren.length} direct children:`);
      diagnostics.bodyChildren.forEach((child, i) => {
        console.log(`Child ${i}: ${child.tagName} - ID: ${child.id} - Class: ${child.className}`);
      });
      console.log(`Page has ${diagnostics.totalLinks} links (first 10 shown):`);
      diagnostics.firstLinks.forEach((link, i) => {
        console.log(`Link ${i}: "${link.text}" - Href: ${link.href}`);
      });

      // Try a more general selector as fallback
      this._siteConfig.chapterLinksElmSelector = 'a[href]';
      console.log('Falling back to generic selector:', this._siteConfig.chapterLinksElmSelector);
    }

    // Now extract the chapter information
    return this._mainPage.evaluate(({ config }) => {
      let candidates = Array.from(document.querySelectorAll(config.chapterLinksElmSelector));
      const usedDocumentLinks = candidates.length === 0;
      if (usedDocumentLinks) {
        // Last attempt: every link the document knows about
        candidates = Array.from(document.links);
        if (candidates.length === 0) {
          throw new Error('No links found on the page at all');
        }
      }

      const seenUrls = new Set();
      const results = [];
      candidates.forEach((link, index) => {
        // SVG <a> elements expose a non-string href; skip them
        if (typeof link.href !== 'string' || !link.href) return;

        // Filter out likely non-chapter links
        const text = link.textContent.trim();
        if (!text || text.includes('http') || text === '↑' || text.length > 100) return;

        // Compare hostnames exactly: a substring test would also accept
        // external URLs that merely mention this host.
        if (link.hostname !== window.location.hostname) return;

        // The same page is often linked more than once; keep the first link
        if (seenUrls.has(link.href)) return;
        seenUrls.add(link.href);

        results.push({ url: link.href, title: text, id: `chapter_${index}` });
      });

      if (usedDocumentLinks && results.length === 0) {
        throw new Error('No suitable chapter links found on the page');
      }
      return results;
    }, { config: this._siteConfig });
  };

  /**
   * Fetches the content HTML of every chapter, one after another, in a
   * dedicated tab. Chapters that fail to load are skipped.
   *
   * @param {Array<{url: string, title: string, id: string}>} [chaptersMetaInfo=[]]
   * @returns {Promise<Array<{html: string, title: string, id: string}>>}
   */
  _fetchAllChaptersHTMLContent = async (chaptersMetaInfo = []) => {
    const chaptersContents = [];
    if (chaptersMetaInfo.length === 0) {
      return chaptersContents;
    }

    const newPage = await this._browser.newPage();
    try {
      for (const [i, chapter] of chaptersMetaInfo.entries()) {
        console.log(`Fetching content for chapter ${i + 1}/${chaptersMetaInfo.length}: "${chapter.title}"`);
        try {
          const contentHTML = await this._fetchChapterContent(newPage, chapter);
          if (contentHTML) {
            chaptersContents.push({
              html: contentHTML,
              title: chapter.title,
              id: chapter.id,
            });
          }
        } catch (error) {
          console.error(`Error fetching chapter "${chapter.title}":`, error);
        }
      }
    } finally {
      // Always release the tab, even if something unexpected is thrown
      await newPage.close();
    }
    return chaptersContents;
  };

  /**
   * Loads one chapter and returns the outer HTML of its content element.
   *
   * When the configured content selector does not match on the chapter page,
   * the largest text container is detected and used for this chapter only;
   * if that fails as well, the whole body is used.
   *
   * @param {object} page - Playwright page used to load the chapter.
   * @param {{url: string, title: string, id: string}} chapter
   * @returns {Promise<string|null>} Chapter HTML, or `null` when loading failed.
   */
  _fetchChapterContent = async (page, chapter) => {
    try {
      await page.goto(chapter.url, { timeout: 60000, waitUntil: 'domcontentloaded' });
      // Wait a bit longer to ensure JavaScript loads properly
      await page.waitForTimeout(2000);

      let contentSelector = this._siteConfig.bookContentSelector;
      const hasContentSelector = await page.evaluate(
        (selector) => Boolean(document.querySelector(selector)),
        contentSelector,
      );

      if (!hasContentSelector) {
        console.log(`Analyzing content for chapter ${chapter.id}...`);
        // The detected selector has to be returned from the page: objects
        // passed to evaluate() are copies, so assigning to them in the browser
        // never reaches this process.
        const detected = await page.evaluate(() => {
          let bestElement = null;
          let maxTextLength = 0;
          for (const elem of document.querySelectorAll('body *')) {
            // Skip tiny elements or likely navigation elements
            if (elem.offsetHeight < 100) continue;
            if (elem.tagName === 'NAV' || elem.tagName === 'ASIDE') continue;
            const textLength = elem.textContent.trim().length;
            if (textLength > maxTextLength) {
              maxTextLength = textLength;
              bestElement = elem;
            }
          }
          if (!bestElement) return null;

          const firstClass = typeof bestElement.className === 'string'
            ? bestElement.className.split(/\s+/).find(Boolean)
            : undefined;
          let selector = bestElement.tagName.toLowerCase();
          if (bestElement.id) {
            selector = `#${CSS.escape(bestElement.id)}`;
          } else if (firstClass) {
            selector = `.${CSS.escape(firstClass)}`;
          }
          return { selector, textLength: maxTextLength };
        });

        if (detected) {
          console.log(`Found content element for chapter ${chapter.id} with ${detected.textLength} characters`);
          contentSelector = detected.selector;
        }
      }

      const extracted = await page.evaluate(({ contentSelector, chapterId, chapterTitle }) => {
        // Fall back to the entire body when the selector matches nothing
        const matched = document.querySelector(contentSelector);
        const contentElement = matched || document.body;

        // Clone to avoid modifying the original page
        const contentClone = contentElement.cloneNode(true);

        // The chapter id is the anchor target used by the table of contents
        contentClone.id = chapterId;

        // Make sure the chapter starts with a heading on a new page
        let heading = contentClone.querySelector('h1');
        if (!heading) {
          heading = document.createElement('h1');
          heading.textContent = chapterTitle;
          contentClone.insertBefore(heading, contentClone.firstChild);
        }
        heading.style.pageBreakBefore = 'always';
        heading.style.paddingTop = '20px';

        return { html: contentClone.outerHTML, selectorMatched: Boolean(matched) };
      }, { contentSelector, chapterId: chapter.id, chapterTitle: chapter.title });

      if (!extracted.selectorMatched) {
        console.log(`Content selector "${contentSelector}" not found for chapter ${chapter.id}`);
      }
      return extracted.html;
    } catch (error) {
      console.error(`Error fetching chapter ${chapter.url}:`, error);
      return null;
    }
  };

  /**
   * Builds the table-of-contents markup, with one numbered anchor per chapter
   * pointing at the chapter id.
   *
   * @param {Array<{title: string, id: string}>} [chaptersMetaInfo=[]]
   * @returns {Promise<string>} Outer HTML of the table of contents.
   */
  _generateTableOfContents = async (chaptersMetaInfo = []) => {
    return this._mainPage.evaluate(({ chaptersMetaInfo }) => {
      const tocContainer = document.createElement('div');
      tocContainer.className = 'toc-container';
      tocContainer.style.paddingBottom = '60px';
      tocContainer.style.breakAfter = 'page';

      const tocTitle = document.createElement('h1');
      tocTitle.textContent = 'Table of Contents';
      tocTitle.style.textAlign = 'center';
      tocTitle.style.marginBottom = '30px';
      tocContainer.appendChild(tocTitle);

      const tocList = document.createElement('div');
      tocList.className = 'toc-list';
      tocList.style.fontSize = '14px';

      chaptersMetaInfo.forEach((chapter, index) => {
        const tocItem = document.createElement('div');
        tocItem.className = 'toc-item';
        tocItem.style.margin = '10px 0';

        const chapterLink = document.createElement('a');
        chapterLink.href = `#${chapter.id}`;
        chapterLink.textContent = `${index + 1}. ${chapter.title}`;
        chapterLink.style.textDecoration = 'none';
        chapterLink.style.color = '#333';

        tocItem.appendChild(chapterLink);
        tocList.appendChild(tocItem);
      });

      tocContainer.appendChild(tocList);
      return tocContainer.outerHTML;
    }, { chaptersMetaInfo });
  };

  /**
   * Replaces the main page body with an empty `#pdf-container` holding a
   * title page, ready to receive the table of contents and the chapters.
   */
  _beautifyMainPage = async () => {
    console.log('Beautifying main page for PDF output...');
    await this._mainPage.evaluate(() => {
      // Reset page to clean state
      document.body.innerHTML = '';
      Object.assign(document.body.style, {
        margin: '0',
        padding: '40px',
        fontFamily: 'Arial, sans-serif',
        fontSize: '14px',
        lineHeight: '1.5',
        color: '#333',
      });

      // Add container for our content
      const container = document.createElement('div');
      container.id = 'pdf-container';
      container.style.maxWidth = '800px';
      container.style.margin = '0 auto';
      document.body.appendChild(container);

      // Add title page
      const titlePage = document.createElement('div');
      Object.assign(titlePage.style, {
        height: '80vh',
        display: 'flex',
        flexDirection: 'column',
        justifyContent: 'center',
        alignItems: 'center',
        textAlign: 'center',
        breakAfter: 'page',
      });

      const mainTitle = document.createElement('h1');
      mainTitle.textContent = document.title || 'Bridge API Documentation';
      mainTitle.style.fontSize = '32px';
      mainTitle.style.marginBottom = '20px';

      const subtitle = document.createElement('div');
      subtitle.textContent = 'Generated on ' + new Date().toLocaleDateString();
      subtitle.style.fontSize = '16px';

      titlePage.appendChild(mainTitle);
      titlePage.appendChild(subtitle);
      container.appendChild(titlePage);
    });
  };

  /**
   * Appends the table of contents and all chapters to `#pdf-container` and
   * applies print-friendly styling to headings, images and code.
   *
   * @param {Array<{html: string}>} [chaptersContent=[]]
   * @param {string} [tableOfContentsHTML='']
   * @param {Array<object>} [chaptersMetaInfo=[]] - Accepted for compatibility;
   *   not needed to assemble the page.
   * @throws {Error} When there is no chapter content, or when the container
   *   created by _beautifyMainPage() is missing.
   */
  _generateFullHTMLPage = async (chaptersContent = [], tableOfContentsHTML = '', chaptersMetaInfo = []) => {
    if (!chaptersContent.length) {
      throw new Error('No chapter content found to generate PDF');
    }
    console.log('Assembling complete document...');

    await this._mainPage.evaluate(({ chaptersContent, tableOfContentsHTML }) => {
      const container = document.getElementById('pdf-container');
      if (!container) {
        throw new Error('PDF container not found; the main page has not been prepared');
      }

      // Add table of contents
      container.insertAdjacentHTML('beforeend', tableOfContentsHTML);

      // Add all chapter content
      for (const chapter of chaptersContent) {
        container.insertAdjacentHTML('beforeend', chapter.html);
      }

      // Start every top-level heading on a new page
      for (const heading of document.querySelectorAll('h1')) {
        heading.style.pageBreakBefore = 'always';
        heading.style.marginTop = '40px';
      }

      // Keep images inside the page width
      for (const img of document.querySelectorAll('img')) {
        img.style.maxWidth = '100%';
        img.style.height = 'auto';
        img.style.margin = '20px 0';
      }

      // Style <pre> elements (block code)
      for (const block of document.querySelectorAll('pre')) {
        Object.assign(block.style, {
          backgroundColor: '#f5f5f5',
          padding: '15px',
          borderRadius: '5px',
          fontFamily: 'monospace',
          overflowX: 'auto',
          whiteSpace: 'pre-wrap', // Keep wrapping for readability in PDF
        });

        // Ensure inner code doesn't get double background
        const innerCode = block.querySelector('code');
        if (innerCode) {
          innerCode.style.backgroundColor = 'transparent';
          innerCode.style.padding = '0';
          innerCode.style.borderRadius = '0';
        }
      }

      // Reset potentially problematic styles for inline <code> elements
      for (const code of document.querySelectorAll('p code, li code')) {
        Object.assign(code.style, {
          backgroundColor: 'transparent',
          padding: '1px 2px',
          borderRadius: '3px',
          fontFamily: 'monospace',
          whiteSpace: 'normal',
          overflowX: 'visible',
        });
      }
    }, {
      chaptersContent,
      tableOfContentsHTML,
    });
  };
}
// Example usage
GitBookPDFSpider.create({
  url: 'https://abc-api.com',
  bookName: 'API_Reference.pdf',
})
  // Return the run() promise so the chain waits for it instead of leaving it floating.
  .then((spider) => spider.run())
  .then((succeeded) => {
    // Honor an explicit failure result if run() provides one.
    if (succeeded === false) {
      process.exitCode = 1;
    }
  })
  .catch((error) => {
    // Covers failures such as the browser not launching.
    console.error('Failed to generate PDF:', error);
    process.exitCode = 1;
  });
lufengd3
Metadata
Metadata
Assignees
Labels
No labels