Skip to content

I've modified the script to work for me - try it out #2

@natefikru

Description

@natefikru

Used AI to adjust this for my use case. I based it on https://apidocs.bridge.xyz/docs/getting-started

const { chromium, devices } = require('playwright');
const path = require('path');
const fs = require('fs');

/**
 * Crawls a documentation site with Playwright and renders all of its chapters
 * into a single PDF.
 *
 * The spider first inspects the landing page to guess which selectors identify
 * the navigation, the main content and the header, then uses those selectors
 * to collect chapter links, fetch every chapter and assemble one printable
 * page (title page, table of contents, chapters).
 *
 * Note: every callback handed to `page.evaluate` runs inside the browser. Its
 * arguments are serialized copies, so anything the Node side needs must be
 * *returned* from the callback; mutating an argument has no effect here.
 */
class GitBookPDFSpider {
  /**
   * @param {object} options
   * @param {object} options.browser - Playwright browser; closed at the end of `run()`.
   * @param {object} options.page - Playwright page used for analysis and for the final PDF.
   * @param {string} options.url - Landing page of the documentation site.
   * @param {string} options.bookName - Output path of the generated PDF.
   */
  constructor({browser, page, url, bookName}) {
    this._browser = browser;
    this._mainPage = page;
    this._bookUrl = url;
    this._bookName = bookName;
    this._siteConfig = {}; // Will be populated after page inspection
  }

  /**
   * Launches a headless Chromium instance and builds a spider around it.
   *
   * @param {object} options
   * @param {string} options.url - Landing page of the documentation site.
   * @param {string} [options.bookName='Bridge.pdf'] - Output path of the PDF.
   * @returns {Promise<GitBookPDFSpider>}
   */
  static async create({url, bookName = 'Bridge.pdf'}) {
    const browser = await chromium.launch({ headless: true });

    try {
      const page = await browser.newPage();
      return new GitBookPDFSpider({browser, page, url, bookName});
    } catch (error) {
      // Do not leave a browser process behind when the page cannot be opened.
      await browser.close();
      throw error;
    }
  }

  /**
   * Runs the whole pipeline and writes the PDF to `bookName`.
   *
   * Errors are logged rather than rethrown; on failure `process.exitCode` is
   * set to 1 so the failure is still visible to the calling shell. The browser
   * is closed in every case.
   *
   * @returns {Promise<void>}
   */
  async run() {
    try {
      console.log(`Starting PDF generation for: ${this._bookUrl}`);

      // Step 1: Open main page and inspect structure
      await this._openMainPage();

      // Step 2: Analyze the page structure to determine correct selectors
      await this._analyzePageStructure();

      // Step 3: Get chapter links using detected selectors
      const chaptersMetaInfo = await this._getChaptersMetaInfo();
      console.log(`Found ${chaptersMetaInfo.length} chapters to process`);

      // Step 4: Fetch HTML content for all chapters
      const chaptersHTMLContent = await this._fetchAllChaptersHTMLContent(chaptersMetaInfo);
      console.log(`Successfully fetched content for ${chaptersHTMLContent.length} chapters`);

      // Step 5: Generate table of contents. Only chapters whose content was
      // actually fetched are listed, so every entry points at an existing anchor.
      const fetchedChaptersMetaInfo = chaptersHTMLContent.map(({ id, title }) => ({ id, title }));
      const tableOfContentsHTML = await this._generateTableOfContents(fetchedChaptersMetaInfo);

      // Step 6: Beautify the main page and prepare for PDF
      await this._beautifyMainPage();

      // Step 7: Combine all content into a single page
      await this._generateFullHTMLPage(chaptersHTMLContent, tableOfContentsHTML, fetchedChaptersMetaInfo);

      // Step 8: Generate the PDF
      console.log(`Generating PDF: ${this._bookName}`);
      await this._mainPage.pdf({
        path: this._bookName,
        format: 'A4',
        margin: {
          top: '50px',
          right: '50px',
          bottom: '50px',
          left: '50px'
        },
        printBackground: true
      });

      console.log(`PDF successfully created: ${this._bookName}`);
    } catch (error) {
      console.error('Error during execution:', error);
      // Signal the failure without rejecting, so existing callers keep working.
      process.exitCode = 1;
    } finally {
      await this._browser.close();
    }
  }

  /**
   * Navigates the main page to the book URL and gives client-side scripts a
   * fixed five seconds to render.
   */
  _openMainPage = async () => {
    console.log('Opening main page:', this._bookUrl);
    await this._mainPage.goto(this._bookUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    // Wait a bit longer to ensure all JavaScript loads properly
    await this._mainPage.waitForTimeout(5000);
  }

  /**
   * Inspects the main page inside the browser and stores the detected
   * selectors (chapter links, content, header, sidebar) in `this._siteConfig`.
   */
  _analyzePageStructure = async () => {
    console.log('Analyzing page structure to determine correct selectors...');

    const pageStructure = await this._mainPage.evaluate(() => {
      // Builds a selector for an element. Ids, class names and attribute values
      // are passed through CSS.escape so names such as "md:flex" or ":r1:"
      // still produce a valid selector.
      const getSelector = (elem) => {
        if (!elem) return null;

        // Try common navigation elements
        if (elem.tagName === 'NAV') return 'nav';
        if (elem.tagName === 'ASIDE') return 'aside';
        if (elem.tagName === 'HEADER') return 'header';

        // Try by ID
        if (elem.id) return `#${CSS.escape(elem.id)}`;

        // Try by class (SVG elements expose a non-string className and are skipped)
        if (typeof elem.className === 'string') {
          const classes = elem.className.split(/\s+/).filter(Boolean);
          if (classes.length > 0) {
            return classes.map((name) => `.${CSS.escape(name)}`).join('');
          }
        }

        // Try by attribute
        for (const attr of ['role', 'data-testid', 'aria-label']) {
          if (elem.hasAttribute(attr)) {
            return `[${attr}="${CSS.escape(elem.getAttribute(attr))}"]`;
          }
        }

        // Fallback to a simple tag selector
        return elem.tagName.toLowerCase();
      };

      // Find navigation/sidebar element containing links
      const findNavElement = () => {
        // Common selectors for navigation/sidebar in documentation sites
        const possibleNavSelectors = [
          'nav', 'aside', '.sidebar', '.navigation', '.menu', '.toc',
          '[role="navigation"]', '[aria-label="Navigation"]', '[aria-label="Sidebar"]'
        ];

        for (const selector of possibleNavSelectors) {
          const navElem = document.querySelector(selector);
          const linkCount = navElem ? navElem.querySelectorAll('a').length : 0;
          if (linkCount > 3) {
            return { selector, linkCount, linkSelector: `${selector} a` };
          }
        }

        // If no nav element found by common selectors, look for any element with lots of links
        let bestElement = null;
        let maxLinks = 0;

        for (const elem of document.querySelectorAll('*')) {
          const linkCount = elem.querySelectorAll('a').length;
          if (linkCount > maxLinks) {
            maxLinks = linkCount;
            bestElement = elem;
          }
        }

        if (bestElement && maxLinks > 3) {
          const selector = getSelector(bestElement);
          return { selector, linkCount: maxLinks, linkSelector: `${selector} a` };
        }

        return null;
      };

      // Find main content element
      const findMainContentElement = () => {
        // Common selectors for main content in documentation sites
        const possibleContentSelectors = [
          'main', 'article', '.content', '.main-content', '.markdown-body', '.article',
          '[role="main"]', '[aria-label="Content"]'
        ];

        for (const selector of possibleContentSelectors) {
          const contentElem = document.querySelector(selector);
          if (contentElem && contentElem.textContent.trim().length > 200) {
            return selector;
          }
        }

        // If no content element found by common selectors, look for large text containers
        let bestElement = null;
        let maxTextLength = 0;

        for (const elem of document.querySelectorAll('body *')) {
          // Skip elements that are likely navigation or headers
          if (elem.tagName === 'NAV' || elem.tagName === 'ASIDE' || elem.tagName === 'HEADER') continue;
          if (elem.querySelectorAll('a').length > 10) continue;

          const textLength = elem.textContent.trim().length;
          if (textLength > maxTextLength) {
            maxTextLength = textLength;
            bestElement = elem;
          }
        }

        if (bestElement && maxTextLength > 200) {
          return getSelector(bestElement);
        }

        return 'body'; // Fallback
      };

      // Find header element
      const findHeaderElement = () => {
        if (document.querySelector('header')) return 'header';

        const possibleHeaderSelectors = [
          '.header', '.app-header', '.navbar', '.nav-header', '.top-bar'
        ];

        for (const selector of possibleHeaderSelectors) {
          const elem = document.querySelector(selector);
          // Only accept candidates positioned near the top of the viewport.
          if (elem && elem.getBoundingClientRect().top < 100) {
            return selector;
          }
        }

        return null;
      };

      // Collect the internal, short, non-URL-looking links of the navigation
      const analyzeLinks = (navSelector) => {
        const links = document.querySelectorAll(`${navSelector} a`);
        return Array.from(links).map(link => ({
          href: link.href,
          text: link.textContent.trim(),
          isExternal: link.host !== window.location.host,
          rect: link.getBoundingClientRect()
        })).filter(info =>
          info.text &&
          !info.isExternal &&
          info.text.length < 100 &&
          !info.text.includes('http')
        );
      };

      // Find the results
      const navElement = findNavElement();
      const mainContentSelector = findMainContentElement();
      const headerSelector = findHeaderElement();

      const linkInfoArr = navElement ? analyzeLinks(navElement.selector) : [];

      // Log information about the page (this goes to the browser console)
      console.log('Document title:', document.title);
      console.log('URLs on page:', document.querySelectorAll('a').length);

      return {
        title: document.title,
        navSelector: navElement ? navElement.selector : null,
        navLinkSelector: navElement ? navElement.linkSelector : null,
        navLinkCount: navElement ? navElement.linkCount : 0,
        contentSelector: mainContentSelector,
        headerSelector: headerSelector,
        linkInfoSample: linkInfoArr.slice(0, 10), // Sample of first 10 links
      };
    });

    console.log('Page analysis results:', JSON.stringify(pageStructure, null, 2));

    // Update the site configuration with the detected selectors
    this._siteConfig = {
      chapterLinksElmSelector: pageStructure.navLinkSelector || 'a[href]',
      bodySelector: 'body',
      bookContentSelector: pageStructure.contentSelector || 'main',
      headerSelector: pageStructure.headerSelector || 'header',
      sideBarSelector: pageStructure.navSelector || 'nav',
    };

    console.log('Using detected selectors:', JSON.stringify(this._siteConfig, null, 2));
  }

  /**
   * Collects the chapter links from the main page.
   *
   * Links are kept only when they have short, non-URL text and point at the
   * same hostname as the main page. A page that is linked several times (also
   * via different `#fragment`s) is returned once, so it is not fetched and
   * printed repeatedly.
   *
   * @returns {Promise<Array<{url: string, title: string, id: string}>>}
   */
  _getChaptersMetaInfo = async () => {
    console.log('Getting chapter metadata using selector:', this._siteConfig.chapterLinksElmSelector);

    // First, let's check how many elements we find with the selector
    const linkCount = await this._mainPage.evaluate((selector) => {
      return document.querySelectorAll(selector).length;
    }, this._siteConfig.chapterLinksElmSelector);

    console.log(`Found ${linkCount} potential chapter links`);

    // If we still don't find links, dump the HTML structure to console for debugging
    if (linkCount === 0) {
      console.log('No links found. Analyzing document structure...');

      await this._mainPage.evaluate(() => {
        // Log the first level children of body
        const bodyChildren = document.body.children;
        console.log(`Body has ${bodyChildren.length} direct children:`);

        Array.from(bodyChildren).forEach((child, i) => {
          console.log(`Child ${i}: ${child.tagName} - ID: ${child.id || 'none'} - Class: ${child.className || 'none'}`);
        });

        // Log all links on the page
        const allLinks = document.querySelectorAll('a[href]');
        console.log(`Page has ${allLinks.length} links (first 10 shown):`);

        Array.from(allLinks).slice(0, 10).forEach((link, i) => {
          console.log(`Link ${i}: "${link.textContent.trim()}" - Href: ${link.href}`);
        });
      });

      // Try a more general selector as fallback
      this._siteConfig.chapterLinksElmSelector = 'a[href]';
      console.log('Falling back to generic selector:', this._siteConfig.chapterLinksElmSelector);
    }

    // Now extract the chapter information
    return this._mainPage.evaluate(({ config }) => {
      const currentHostname = window.location.hostname;
      const seenPageUrls = new Set();
      const results = [];

      // Appends the link to `results` when it looks like a chapter link.
      const addIfChapter = (link, index) => {
        // SVG <a> elements expose a non-string href; they cannot be navigated to here.
        if (typeof link.href !== 'string' || !link.href) return;

        // Filter out likely non-chapter links
        const text = link.textContent.trim();
        if (!text || text.includes('http') || text === '↑' || text.length > 100) return;

        // Only include links to internal pages (exact hostname match, not a substring test)
        if (link.hostname !== currentHostname) return;

        // The same page reached through another fragment is still the same chapter.
        const pageUrl = link.href.split('#')[0];
        if (seenPageUrls.has(pageUrl)) return;
        seenPageUrls.add(pageUrl);

        results.push({
          url: link.href,
          title: text,
          id: `chapter_${index}`
        });
      };

      const linksElm = document.querySelectorAll(config.chapterLinksElmSelector);

      if (linksElm.length === 0) {
        // Try one last attempt with document.links
        const allLinks = Array.from(document.links);
        if (allLinks.length === 0) {
          throw new Error(`No links found on the page at all`);
        }

        allLinks.forEach(addIfChapter);

        if (results.length === 0) {
          throw new Error(`No suitable chapter links found on the page`);
        }

        return results;
      }

      // Process links found with the selector
      linksElm.forEach(addIfChapter);

      return results;
    }, { config: this._siteConfig });
  }

  /**
   * Fetches the content of every chapter sequentially on one extra page.
   * Chapters that fail to load are logged and left out of the result.
   *
   * @param {Array<{url: string, title: string, id: string}>} chaptersMetaInfo
   * @returns {Promise<Array<{html: string, title: string, id: string}>>}
   */
  _fetchAllChaptersHTMLContent = async (chaptersMetaInfo = []) => {
    const chaptersContents = [];

    if (!chaptersMetaInfo.length) {
      return chaptersContents;
    }

    const newPage = await this._browser.newPage();

    try {
      for (let i = 0; i < chaptersMetaInfo.length; i++) {
        const chapter = chaptersMetaInfo[i];

        console.log(`Fetching content for chapter ${i+1}/${chaptersMetaInfo.length}: "${chapter.title}"`);

        try {
          const contentHTML = await this._fetchChapterContent(newPage, chapter);
          if (contentHTML) {
            chaptersContents.push({
              html: contentHTML,
              title: chapter.title,
              id: chapter.id
            });
          }
        } catch (error) {
          console.error(`Error fetching chapter "${chapter.title}":`, error);
        }
      }
    } finally {
      // Close the helper page even if something unexpected is thrown.
      await newPage.close();
    }

    return chaptersContents;
  }

  /**
   * Loads one chapter and returns the outer HTML of its content element.
   *
   * The configured content selector is tried first. When it matches nothing on
   * this chapter, the element with the most text is looked up in the page and
   * its selector is used for this chapter only (the shared configuration is
   * not modified). If nothing matches, the whole body is used.
   *
   * @param {object} page - Playwright page to navigate.
   * @param {{url: string, title: string, id: string}} chapter
   * @returns {Promise<string|null>} The chapter HTML, or `null` when loading failed.
   */
  _fetchChapterContent = async (page, chapter) => {
    try {
      await page.goto(chapter.url, { timeout: 60000, waitUntil: 'domcontentloaded' });
      // Wait a bit longer to ensure JavaScript loads properly
      await page.waitForTimeout(2000);

      let contentSelector = this._siteConfig.bookContentSelector;

      const hasContentSelector = await page.evaluate((selector) => {
        return !!document.querySelector(selector);
      }, contentSelector);

      if (!hasContentSelector) {
        // The selector has to be returned from the page: arguments are copies,
        // so assigning to them inside the browser would be lost.
        const detectedSelector = await page.evaluate(({ chapterId }) => {
          console.log(`Analyzing content for chapter ${chapterId}...`);

          // Find the element with the most text content
          let bestElement = null;
          let maxTextLength = 0;

          for (const elem of document.querySelectorAll('body *')) {
            // Skip tiny elements or likely navigation elements
            if (elem.offsetHeight < 100) continue;
            if (elem.tagName === 'NAV' || elem.tagName === 'ASIDE') continue;

            const textLength = elem.textContent.trim().length;
            if (textLength > maxTextLength) {
              maxTextLength = textLength;
              bestElement = elem;
            }
          }

          if (!bestElement) return null;

          console.log(`Found content element for chapter ${chapterId} with ${maxTextLength} characters`);

          if (bestElement.id) return `#${CSS.escape(bestElement.id)}`;

          // SVG elements expose a non-string className; fall through to the tag name.
          const firstClass = typeof bestElement.className === 'string'
            ? bestElement.className.split(/\s+/).find(Boolean)
            : undefined;
          if (firstClass) return `.${CSS.escape(firstClass)}`;

          return bestElement.tagName.toLowerCase();
        }, { chapterId: chapter.id });

        if (detectedSelector) {
          contentSelector = detectedSelector;
        }
      }

      // `return await` keeps a rejection of this call inside the try/catch.
      return await page.evaluate(({ contentSelector, chapterId, chapterTitle }) => {
        // Try to find the content using the selector
        let contentElement = document.querySelector(contentSelector);

        // If still not found, fall back to the entire body
        if (!contentElement) {
          console.log(`Content selector "${contentSelector}" not found for chapter ${chapterId}`);
          contentElement = document.body;
        }

        // Create a clone to avoid modifying the original page
        const contentClone = contentElement.cloneNode(true);

        // The chapter id is the anchor the table of contents links to
        contentClone.id = chapterId;

        // Add a chapter heading if it doesn't exist
        const existingHeading = contentClone.querySelector('h1');
        if (!existingHeading) {
          const heading = document.createElement('h1');
          heading.textContent = chapterTitle;
          heading.style.pageBreakBefore = 'always';
          heading.style.paddingTop = '20px';
          contentClone.insertBefore(heading, contentClone.firstChild);
        } else {
          existingHeading.style.pageBreakBefore = 'always';
          existingHeading.style.paddingTop = '20px';
        }

        return contentClone.outerHTML;
      }, { contentSelector, chapterId: chapter.id, chapterTitle: chapter.title });
    } catch (error) {
      console.error(`Error fetching chapter ${chapter.url}:`, error);
      return null;
    }
  }

  /**
   * Builds the table-of-contents markup: one numbered link per chapter,
   * pointing at `#<chapter.id>`.
   *
   * @param {Array<{id: string, title: string}>} chaptersMetaInfo
   * @returns {Promise<string>} Outer HTML of the table-of-contents container.
   */
  _generateTableOfContents = async (chaptersMetaInfo = []) => {
    return this._mainPage.evaluate(({ chaptersMetaInfo }) => {
      const tocContainer = document.createElement('div');
      tocContainer.className = 'toc-container';
      tocContainer.style.paddingBottom = '60px';
      tocContainer.style.breakAfter = 'page';

      const tocTitle = document.createElement('h1');
      tocTitle.textContent = 'Table of Contents';
      tocTitle.style.textAlign = 'center';
      tocTitle.style.marginBottom = '30px';
      tocContainer.appendChild(tocTitle);

      const tocList = document.createElement('div');
      tocList.className = 'toc-list';
      tocList.style.fontSize = '14px';

      chaptersMetaInfo.forEach((chapter, index) => {
        const tocItem = document.createElement('div');
        tocItem.className = 'toc-item';
        tocItem.style.margin = '10px 0';

        const chapterLink = document.createElement('a');
        chapterLink.href = `#${chapter.id}`;
        chapterLink.textContent = `${index + 1}. ${chapter.title}`;
        chapterLink.style.textDecoration = 'none';
        chapterLink.style.color = '#333';

        tocItem.appendChild(chapterLink);
        tocList.appendChild(tocItem);
      });

      tocContainer.appendChild(tocList);
      return tocContainer.outerHTML;
    }, { chaptersMetaInfo });
  }

  /**
   * Replaces the body of the main page with an empty `#pdf-container` that
   * holds a title page (document title and generation date).
   */
  _beautifyMainPage = async () => {
    console.log('Beautifying main page for PDF output...');

    await this._mainPage.evaluate(() => {
      // Reset page to clean state
      document.body.innerHTML = '';
      document.body.style.margin = '0';
      document.body.style.padding = '40px';
      document.body.style.fontFamily = 'Arial, sans-serif';
      document.body.style.fontSize = '14px';
      document.body.style.lineHeight = '1.5';
      document.body.style.color = '#333';

      // Add container for our content
      const container = document.createElement('div');
      container.id = 'pdf-container';
      container.style.maxWidth = '800px';
      container.style.margin = '0 auto';
      document.body.appendChild(container);

      // Add title page
      const titlePage = document.createElement('div');
      titlePage.style.height = '80vh';
      titlePage.style.display = 'flex';
      titlePage.style.flexDirection = 'column';
      titlePage.style.justifyContent = 'center';
      titlePage.style.alignItems = 'center';
      titlePage.style.textAlign = 'center';
      titlePage.style.breakAfter = 'page';

      const mainTitle = document.createElement('h1');
      mainTitle.textContent = document.title || 'Bridge API Documentation';
      mainTitle.style.fontSize = '32px';
      mainTitle.style.marginBottom = '20px';

      const subtitle = document.createElement('div');
      subtitle.textContent = 'Generated on ' + new Date().toLocaleDateString();
      subtitle.style.fontSize = '16px';

      titlePage.appendChild(mainTitle);
      titlePage.appendChild(subtitle);
      container.appendChild(titlePage);
    });
  }

  /**
   * Appends the table of contents and all chapters to `#pdf-container` and
   * applies print-oriented styling to headings, images and code.
   *
   * @param {Array<{html: string}>} chaptersContent
   * @param {string} tableOfContentsHTML
   * @param {Array<object>} chaptersMetaInfo - Accepted for compatibility; not used here.
   * @throws {Error} When `chaptersContent` is empty.
   */
  _generateFullHTMLPage = async (chaptersContent = [], tableOfContentsHTML = '', chaptersMetaInfo = []) => {
    if (!chaptersContent.length) {
      throw new Error('No chapter content found to generate PDF');
    }

    console.log('Assembling complete document...');

    await this._mainPage.evaluate(({ chaptersContent, tableOfContentsHTML }) => {
      const container = document.getElementById('pdf-container');
      if (!container) {
        throw new Error('PDF container not found; the main page was not prepared');
      }

      // Add table of contents
      container.insertAdjacentHTML('beforeend', tableOfContentsHTML);

      // Add all chapter content
      chaptersContent.forEach(chapter => {
        container.insertAdjacentHTML('beforeend', chapter.html);
      });

      // Add page breaks between chapters and style improvements
      document.querySelectorAll('h1').forEach(heading => {
        heading.style.pageBreakBefore = 'always';
        heading.style.marginTop = '40px';
      });

      // Improve image handling
      document.querySelectorAll('img').forEach(img => {
        img.style.maxWidth = '100%';
        img.style.height = 'auto';
        img.style.margin = '20px 0';
      });

      // Style <pre> elements (block code)
      document.querySelectorAll('pre').forEach(block => {
        block.style.backgroundColor = '#f5f5f5';
        block.style.padding = '15px';
        block.style.borderRadius = '5px';
        block.style.fontFamily = 'monospace';
        block.style.overflowX = 'auto';
        block.style.whiteSpace = 'pre-wrap'; // Keep wrapping for readability in PDF
        // Ensure inner code doesn't get double background
        const innerCode = block.querySelector('code');
        if (innerCode) {
          innerCode.style.backgroundColor = 'transparent';
          innerCode.style.padding = '0';
          innerCode.style.borderRadius = '0';
        }
      });

      // Reset potentially problematic styles for inline <code> elements
      document.querySelectorAll('p code, li code').forEach(code => {
        code.style.backgroundColor = 'transparent'; // Remove background
        code.style.padding = '1px 2px'; // Minimal padding
        code.style.borderRadius = '3px'; // Minimal rounding
        code.style.fontFamily = 'monospace'; // Keep font
        code.style.whiteSpace = 'normal'; // Allow normal wrapping
        code.style.overflowX = 'visible'; // Prevent potential overflow issues
      });
    }, {
      chaptersContent,
      tableOfContentsHTML
    });
  }
}

// Example usage: create the spider, run it, and report any failure.
// Returning the promise from run() keeps it in the chain, and the final catch
// handles a rejected create() (for example when the browser cannot be
// launched) instead of leaving an unhandled rejection.
GitBookPDFSpider.create({
  url: 'https://abc-api.com',
  bookName: 'API_Reference.pdf',
})
  .then((spider) => spider.run())
  .catch((error) => {
    console.error('Failed to generate PDF:', error);
    process.exitCode = 1;
  });

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions