From 478f161e4c17ce9a6603af206d2b7401c7b8dba0 Mon Sep 17 00:00:00 2001 From: Vatsal Sanjay Date: Tue, 18 Mar 2025 07:29:32 +0100 Subject: [PATCH 01/11] feat: Generate SEO metadata from search database This commit adds a new script `scripts/generate_seo_tags.rb` that generates SEO metadata (keywords and descriptions) for the website's pages based on the search database. The key changes are: - Loads the search database from `assets/js/search_db.json` - Extracts keywords from the page title, type, and tags - Generates descriptions from the page content - Updates the HTML files with the generated metadata (keywords and description) - Handles cases where no good description is available by generating one from the full content - Normalizes URLs to ensure consistent metadata generation This feature will improve the website's SEO by providing relevant metadata for search engines. --- _layouts/default.html | 32 +++++ robots.txt | 6 + scripts/build.sh | 4 + scripts/generate_seo_tags.rb | 232 +++++++++++++++++++++++++++++++++++ 4 files changed, 274 insertions(+) create mode 100644 robots.txt create mode 100644 scripts/generate_seo_tags.rb diff --git a/_layouts/default.html b/_layouts/default.html index 4673ad2..7ab8255 100644 --- a/_layouts/default.html +++ b/_layouts/default.html @@ -32,11 +32,19 @@ + + + + + + {% if page.tags %}{% endif %} @@ -49,6 +57,30 @@ + + + + {% if page.layout == 'research' %} + + {% endif %} diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..887fcaf --- /dev/null +++ b/robots.txt @@ -0,0 +1,6 @@ +User-agent: * +Allow: / +Sitemap: https://comphy-lab.org/sitemap.xml + +# Allow all robots complete access +Disallow: diff --git a/scripts/build.sh b/scripts/build.sh index 51bb150..493417b 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -35,4 +35,8 @@ cd .. echo "Generating pre-filtered research pages..." bundle exec ruby scripts/generate_filtered_research.rb +# Generate SEO metadata from search database +echo "Generating SEO metadata..." +bundle exec ruby scripts/generate_seo_tags.rb + echo "Build completed successfully!" diff --git a/scripts/generate_seo_tags.rb b/scripts/generate_seo_tags.rb new file mode 100644 index 0000000..d9afe39 --- /dev/null +++ b/scripts/generate_seo_tags.rb @@ -0,0 +1,232 @@ +#!/usr/bin/env ruby +require 'json' +require 'fileutils' +require 'nokogiri' +require 'set' +require 'yaml' + +# Get the project root directory (one level up from scripts) +ROOT_DIR = File.expand_path('..', __dir__) + +# Load search database +search_db_path = File.join(ROOT_DIR, 'assets', 'js', 'search_db.json') +unless File.exist?(search_db_path) + puts "Search database not found at #{search_db_path}" + puts "Run scripts/generate_search_db.rb first" + exit 1 +end + +search_db = JSON.parse(File.read(search_db_path)) +puts "Loaded search database with #{search_db.length} entries" + +# Initialize collections to store keywords and descriptions +keywords_by_url = {} +descriptions_by_url = {} +content_by_url = {} + +# Track URL normalizations +normalized_urls = {} + +# Helper method to normalize URLs +def normalize_url(url) + # Remove anchor + url = url.split('#').first + + # Ensure URL starts with a slash + url = "/#{url}" unless url.start_with?('/') + + # Add index.html to root URL + url = "/index.html" if url == "/" + + # Add .html extension to URLs without extension + unless url.include?('.') + url = "#{url.chomp('/')}/index.html" + end + + # Remove leading slash for file operations + url.sub(/^\//, '') +end + +# Process search database to generate metadata +search_db.each do |entry| + url = entry['url'].to_s + + # Skip external URLs + next if url.start_with?('http') + next if url.empty? + + # Normalize URL for internal files + normalized_url = normalize_url(url) + normalized_urls[url] = normalized_url + + # Initialize collections for this URL if not already present + keywords_by_url[normalized_url] ||= Set.new + descriptions_by_url[normalized_url] ||= Set.new + content_by_url[normalized_url] ||= [] + + # Extract keywords from title and content + title = entry['title'].to_s + content = entry['content'].to_s + + # Skip entries with little content + next if content.length < 50 + + # Generate keywords + # From title - split and keep words longer than 3 characters + title_keywords = title.downcase + .gsub(/[^\w\s]/, ' ') + .split(/\s+/) + .select { |w| w.length > 3 } + .map(&:strip) + .uniq + .take(5) + + # From type and tags + type_keywords = [entry['type'].to_s.gsub('_', ' ')].reject(&:empty?) + tag_keywords = (entry['tags'] || []).map(&:downcase) + + # Combine all keywords + all_keywords = (title_keywords + type_keywords + tag_keywords).uniq.take(8) + + # Add to keywords for this URL + keywords_by_url[normalized_url].merge(all_keywords) + + # Generate description + description = content.gsub(/\s+/, ' ').strip.split(/[.!?]/).first + + # Only use if description is between 50 and 160 characters + if description && description.length >= 50 && description.length <= 160 + descriptions_by_url[normalized_url].add(description) + end + + # Store content for generating better descriptions if needed + content_by_url[normalized_url] << content +end + +puts "Generated metadata for #{keywords_by_url.keys.length} unique URLs" + +# Generate better descriptions for URLs with no good descriptions +descriptions_by_url.each do |url, descriptions| + if descriptions.empty? && content_by_url[url] + # Join all content and create a description + all_content = content_by_url[url].join(' ').gsub(/\s+/, ' ').strip + + # Take first 140 characters + ellipsis + if all_content.length > 50 + description = all_content[0..140] + (all_content.length > 140 ? '...' : '') + descriptions.add(description) + end + end +end + +# Function to update HTML files with metadata +def update_html_with_metadata(file_path, keywords, description) + return unless File.exist?(file_path) + + begin + # Read file content + html_content = File.read(file_path) + + # Parse with Nokogiri + doc = Nokogiri::HTML(html_content) + + # Get existing head element + head = doc.at_css('head') + return unless head + + # Check for existing metadata + existing_keywords = doc.css('meta[name="keywords"]') + existing_description = doc.css('meta[name="description"]') + + # Update or add keywords meta tag + if !keywords.empty? + keywords_str = keywords.to_a.uniq.join(', ') + if existing_keywords.empty? + # Add new meta tag for keywords + head.add_child("") + else + # Update existing keywords + existing_keywords.first['content'] = keywords_str + end + end + + # Update or add description meta tag if we have a good one + if !description.empty? + desc_str = description.to_a.first + + # Add if no description meta or if the existing one is too short + if existing_description.empty? || existing_description.first['content'].to_s.length < 50 + if existing_description.empty? + # Add new meta tag for description + head.add_child("") + else + # Update existing description + existing_description.first['content'] = desc_str + end + end + end + + # Write updated content back to file + File.write(file_path, doc.to_html) + return true + rescue => e + puts "Error updating metadata for #{file_path}: #{e.message}" + return false + end +end + +# Process HTML files and update metadata +site_dir = File.join(ROOT_DIR, '_site') +files_updated = 0 + +# Process each URL +keywords_by_url.each do |url, keywords| + # Get description for this URL + descriptions = descriptions_by_url[url] || Set.new + + # Get file path + file_path = File.join(site_dir, url) + + # Update HTML file if it exists + if update_html_with_metadata(file_path, keywords, descriptions) + files_updated += 1 + puts "Updated metadata for #{url}" + end +end + +puts "Updated #{files_updated} HTML files with SEO metadata" + +# Generate sitemapindex.xml +def generate_sitemapindex(site_dir) + sitemap_path = File.join(site_dir, 'sitemap.xml') + sitemapindex_path = File.join(site_dir, 'sitemapindex.xml') + + if File.exist?(sitemap_path) + # Get last modification time + last_mod = File.mtime(sitemap_path).strftime('%Y-%m-%dT%H:%M:%S%:z') + + # Create sitemapindex content + sitemapindex_content = <<-XML + + + + https://comphy-lab.org/sitemap.xml + #{last_mod} + + + XML + + # Write to file + File.write(sitemapindex_path, sitemapindex_content) + puts "Generated sitemapindex.xml" + return true + else + puts "sitemap.xml not found, skipping sitemapindex generation" + return false + end +end + +# Generate sitemapindex +generate_sitemapindex(site_dir) + +puts "SEO enhancement completed successfully!" From 059e9d9b1ddb1ae3840730325deb67ca9833618f Mon Sep 17 00:00:00 2001 From: Vatsal Sanjay Date: Tue, 18 Mar 2025 07:35:36 +0100 Subject: [PATCH 02/11] docs: add build and development instructions, repository guidelines, and coding conventions This commit adds detailed instructions for building and developing the project, as well as guidelines for maintaining the repository and coding conventions to follow. The changes include: - Added a new section on "Build and Development Commands" with instructions for installing dependencies, building the site and search database, running the local server, fetching blog content, and generating the search database. - Added a new section on "Repository Guidelines" with instructions for updating the README, using the provided templates and CSS files, and keeping the documentation up-to-date. - Added a new section on "General" coding conventions, including indentation, DRY principles, commenting, and support for light and dark themes. - Added a new section on "HTML/Markdown" conventions, including the use of semantic HTML elements, BEM naming, and keeping content in Markdown format. - Added a new section on "CSS" conventions, including the use of CSS variables, responsive breakpoints, units, mobile-first approach, and dark theme implementation. - Added a new section on "JavaScript" conventions, including the use of ES6+ features and following best practices. These changes aim to provide clear and comprehensive guidelines for contributors to follow when working on the project, ensuring consistency and maintainability. --- _layouts/research.html | 19 + _layouts/teaching-course.html | 24 + _layouts/teaching.html | 24 + _layouts/team.html | 19 + assets/js/search_db.json | 2055 +++++++++++++++++++++++++++++++++ 5 files changed, 2141 insertions(+) diff --git a/_layouts/research.html b/_layouts/research.html index 75f2189..0a1c3a6 100644 --- a/_layouts/research.html +++ b/_layouts/research.html @@ -106,11 +106,19 @@ + + + + + + @@ -125,6 +133,17 @@ + + + diff --git a/_layouts/teaching.html b/_layouts/teaching.html index 76d03f9..5b1a9fe 100644 --- a/_layouts/teaching.html +++ b/_layouts/teaching.html @@ -33,11 +33,19 @@ + + + + + + @@ -52,6 +60,22 @@ + + + diff --git a/_layouts/team.html b/_layouts/team.html index 864df97..ca298ab 100644 --- a/_layouts/team.html +++ b/_layouts/team.html @@ -33,11 +33,19 @@ + + + + + + @@ -52,6 +60,17 @@ + + +