Commit 12016bf: add automatically add internal link feature
1 parent b05dffa

8 files changed: +256 -15

.gitignore (-2)

@@ -127,8 +127,6 @@ venv/
 ENV/
 env.bak/
 venv.bak/
-.pixi
-pixi.lock

 # Spyder project settings
 .spyderproject

README.md (+27 -1)

@@ -549,7 +549,8 @@ res = ea.sort_note_content('y6hROhWjNmHQ', 'zh_CN.UTF-8')

 ## (Advanced Usage) 🧹 Delete empty `new note`

-Sometimes I inadvertently create numerous "new notes" which remain undeleted within my note tree. These "new notes" clutter my
+Sometimes I inadvertently create numerous "new notes" which remain undeleted within my note tree. These "new notes"
+clutter my
 workspace, scattered across various locations. I made this bulk deletion of these empty "new notes." Additionally, it
 generates warning messages for "new notes" that contain content, maybe we should change the title for those notes.

@@ -581,6 +582,31 @@ ea.optimize_image_attachments_to_webp('H2q3901uFDCH')

 This action can save significant space if you have many clipped pages. Whoever invented `WebP` is a genius.

+## (Advanced Usage) Automatically add internal link
+
+Experimental feature. Back up your database before doing anything with this feature. It may completely destroy your notes.
+
+If you find something wrong after using this feature, please provide me with a minimal note sample to test and fix
+potential bugs.
+
+Add internal links in a note:
+
+```
+ea.auto_create_internal_link('HfAnsf8XiarY')
+```
+
+For multiple notes:
+
+```
+ea.auto_create_internal_link(target_notes=['gLmmsIM8yPqx', 'T4Ui3wNByO03'])
+```
+
+(Dangerous action, back up first) Add internal links to all text notes:
+
+```
+ea.auto_create_internal_link(process_all_notes=True)
+```
+
 ## 🛠️ Develop

 Install with pip egg link to make package change without reinstall.
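
The README examples above assume the `ea` client that is set up at the top of the README. A minimal end-to-end sketch (the server URL and token below are placeholders; a database backup is assumed first):

```
from trilium_py.client import ETAPI

# Placeholder connection details: substitute your own server and ETAPI token
server_url = 'http://localhost:8080'
token = 'YOUR_TOKEN'
ea = ETAPI(server_url, token)

# Rewrites matching note content in place, so back up the database first
ea.auto_create_internal_link('HfAnsf8XiarY')
```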

README_CN.md (+26)

@@ -508,6 +508,32 @@ ea.optimize_image_attachments_to_webp('H2q3901uFDCH')

 This operation can save a lot of space if you have many clipped pages. Whoever invented `WebP` is a genius.

+
+## (Advanced Usage) Automatically add internal link
+
+Experimental feature. Back up your database before using this feature. It may completely destroy your notes.
+
+If you find any problem after using this feature, please provide a minimal note sample so that potential bugs can be tested and fixed.
+
+Add internal links in a note:
+
+```
+ea.auto_create_internal_link('HfAnsf8XiarY')
+```
+
+For multiple notes:
+
+```
+ea.auto_create_internal_link(target_notes=['gLmmsIM8yPqx', 'T4Ui3wNByO03'])
+```
+
+(Dangerous action, back up first) Add internal links to all text notes:
+
+```
+ea.auto_create_internal_link(process_all_notes=True)
+```
+
+
 ## 🛠️ Develop

 Install with pip egg link so that package changes take effect without reinstalling.

requirements.txt (+1)

@@ -9,3 +9,4 @@ loguru
 minify-html
 pillow
 python-dateutil
+tqdm

setup.py (+1)

@@ -148,6 +148,7 @@
         'minify-html',
         'pillow',
         'python-dateutil',
+        'tqdm',
     ],
     # Optional
     # List additional groups of dependencies here (e.g. development

src/trilium_py/client.py (+110 -11)

@@ -16,12 +16,20 @@
 from bs4 import BeautifulSoup
 from loguru import logger
 from natsort import natsort
+from tqdm import tqdm

 from .utils.file_util import replace_extension
+from .utils.html_util import add_internal_links
 from .utils.markdown_math import reconstructMath, sanitizeInput
-from .utils.note_util import beautify_content, sort_note_by_headings
+from .utils.note_util import beautify_content, sort_note_by_headings, preprocess_note_title_list
 from .utils.param_util import clean_param, format_query_string
-from .utils.time_util import get_today, get_yesterday, synchronize_dates, format_date_to_etapi, get_local_timezone
+from .utils.time_util import (
+    get_today,
+    get_yesterday,
+    synchronize_dates,
+    format_date_to_etapi,
+    get_local_timezone,
+)
 from .utils.image_util import compress_image_bytes, get_extension_from_image_mime


@@ -373,9 +381,10 @@ def patch_note(
         }
         res = requests.patch(url, json=clean_param(params), headers=self.get_header())
         return res.json()
-    def handle_dates(self,
-                     dateCreated: Optional[datetime] = None,
-                     utcDateCreated: Optional[datetime] = None):
+
+    def handle_dates(
+        self, dateCreated: Optional[datetime] = None, utcDateCreated: Optional[datetime] = None
+    ):
         '''Ensure that both local and UTC times are defined, and have same time
         (adjusted for timezone)'''
         if not dateCreated and not utcDateCreated:
@@ -384,21 +393,21 @@ def handle_dates(self,
             print(f"dateCreated is not datetime object, is {type(dateCreated)}")
             raise TypeError("dateCreated must be a datetime object")
         if utcDateCreated and not isinstance(utcDateCreated, datetime):
-            print(f"utcdateCreated is not datetime object, is {type(utcDateCreated)}")
+            print(f"utcdateCreated is not datetime object, is {type(utcDateCreated)}")
             raise TypeError("utcDateCreated must be a datetime object")

         if dateCreated and dateCreated.tzinfo is None:
             tzinfo = get_local_timezone()
             dateCreated = dateCreated.replace(tzinfo=tzinfo)
             print(f"dateCreated.tzinfo was None. Changed to: {dateCreated.tzinfo}.")
-
+
         if utcDateCreated and utcDateCreated.tzinfo is None:
             utcDateCreated = utcDateCreated.replace(tzinfo=dateutil.tz.tzutc())
             print(f'utc date tzinfo was None, forced to UTC ({utcDateCreated})')

         # After ensuring TZ is set for one of the date types, synchronize them
         synchronized_dates = synchronize_dates(local_date=dateCreated, utc_date=utcDateCreated)
-
+
         return synchronized_dates

     # def synchronize_dates(self, local_date: Optional[datetime], utc_date: Optional[datetime]) -> tuple[Optional[datetime], Optional[datetime]]:
@@ -423,10 +432,10 @@ def handle_dates(self,
     #     print(f"\tlocal_date: {local_date}")
     #     print(f"\tutc_date : {utc_date}")
     #     return local_date, utc_date
-
+
     # def get_local_timezone(self=None):
     #     print("Getting local_timezone...")
-
+
     #     # this is short and sweet
     #     local_timezone = datetime.now().astimezone().tzinfo

@@ -446,7 +455,7 @@ def handle_dates(self,
     #     UTC : '2023-08-22 01:38:51.110Z'
     #     and exactly 3 decimal places for seconds.'''
     #     if kind == 'local':
-    #         date = date.strftime('%Y-%m-%d %H:%M:%S.%d3%z')
+    #         date = date.strftime('%Y-%m-%d %H:%M:%S.%d3%z')
     #     if kind == 'utc':
     #         date = date.astimezone(dateutil.tz.tzstr('Z'))  # use Zulu time
     #         date = date.strftime('%Y-%m-%d %H:%M:%S.%d3%Z')
@@ -1499,6 +1508,96 @@ def delete_empty_note(self, note_title=None, verbose=False):
         if verbose:
             logger.info(content)

+    def auto_create_internal_link(
+        self,
+        target_note_id=None,
+        target_notes=None,
+        process_all_notes=False,
+        skip_clipped_notes=True,
+        skip_day_notes=True,
+        verbose=True,
+    ):
+        """
+        Create internal link for notes
+        """
+
+        # Prepare note title and note id list
+        # Get all note titles and note ids
+        all_notes = self.search_note(search="note.title %= '.*'")
+        all_note_title_list = []
+        for x in all_notes['results']:
+            if x['isProtected']:
+                # Remove protected notes, they are not editable via ETAPI
+                continue
+            title = x['title']
+            note_id = x['noteId']
+            all_note_title_list.append([title, note_id])
+
+        # Process the note titles, handling duplicates and sorting
+        processed_note_title_list = preprocess_note_title_list(all_note_title_list)
+
+        # prepare target note id
+        if target_note_id:
+            target_notes = [
+                target_note_id,
+            ]
+        elif target_notes:
+            pass
+        elif process_all_notes:
+            # process all notes if not provided a note id list
+            target_notes = [x[1] for x in all_note_title_list]
+
+        # Add internal link
+
+        def get_child_note_title_note_id_list(note_id):
+            res = self.get_note(note_id)
+            result = []
+            for child_note_id in res['childNoteIds']:
+                x = self.get_note(child_note_id)
+                result.append([x['title'], x['noteId']])
+            return preprocess_note_title_list(result)
+
+        for note_id in tqdm(target_notes):
+
+            # only process text note here
+            current_note = self.get_note(note_id)
+
+            if verbose:
+                logger.info(f'current note id: {note_id} title: {current_note["title"]}')
+
+            if not current_note['type'] == 'text':
+                if verbose:
+                    logger.info('skip: not text note')
+                continue
+
+            if skip_clipped_notes and any(
+                [x['name'] == 'pageUrl' for x in current_note['attributes']]
+            ):
+                if verbose:
+                    logger.info('skip: clipped note')
+                continue
+
+            if skip_day_notes and any(
+                [x['name'] == 'dateNote' for x in current_note['attributes']]
+            ):
+                if verbose:
+                    logger.info('skip: day note')
+                continue
+
+            # add child note, we can handle sub notes with same name from different parent notes
+            processed_child_note_title_list = get_child_note_title_note_id_list(note_id)
+            tmp_list_for_current_note = processed_child_note_title_list + processed_note_title_list
+
+            content = self.get_note_content(note_id)
+            updated_content, replaced = add_internal_links(
+                content, tmp_list_for_current_note, current_note_id=note_id
+            )
+            # If content has changed, update the note
+            if replaced:
+                self.update_note_content(note_id, updated_content)
+                if verbose:
+                    logger.info(f"Added internal link to note {note_id}.")
+

 class ListTemplate(string.Template):
     """Encapsulate To Do List HTML details

src/trilium_py/utils/html_util.py (+63 -1)

@@ -1,7 +1,11 @@
 import locale
 import re
+import warnings

-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
+
+# Disable MarkupResemblesLocatorWarning globally
+warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)

 TAG_LEVELS = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}

@@ -73,3 +77,61 @@ def convert_to_tree(data):
     # Concatenate data in a depth-first search manner
     html_string = dfs_concat_data(tree)
     return html_string
+
+
+def add_internal_links(
+    html_content, keyword_note_id_list, current_note_id=None, exclude_headings=True
+):
+    """
+    Adds internal links to the HTML content by replacing keywords with anchor tags.
+
+    Args:
+        html_content (str): The HTML content to process.
+        keyword_note_id_list (list of tuples): List of (keyword, note_id).
+        exclude_headings (bool): Whether to exclude heading tags from processing.
+        current_note_id (str): The ID of the current note to prevent self-referencing.
+
+    Returns:
+        tuple: A tuple containing the updated HTML content and a boolean indicating if replacements were made.
+    """
+    # Use BeautifulSoup to parse the HTML content
+    soup = BeautifulSoup(html_content, "html.parser")
+    replaced = False  # Flag to check if any replacement happens
+
+    # Precompile the keywords and links into a dictionary, excluding self-referencing notes
+    keyword_to_link = {
+        keyword: f'<a class="reference-link" href="#root/{note_id}">{keyword}</a>'
+        for keyword, note_id in keyword_note_id_list
+        if note_id != current_note_id  # Exclude the current note's ID
+    }
+
+    # Create a regex pattern to match any keyword
+    if not keyword_to_link:
+        return str(soup), replaced  # No keywords to process
+
+    keyword_pattern = re.compile(
+        r'\b(' + '|'.join(re.escape(k) for k in keyword_to_link.keys()) + r')\b'
+    )
+
+    # Tags to exclude from replacement
+    exclude_tags = ['a']
+    if exclude_headings:
+        exclude_tags.extend(['h2', 'h3', 'h4', 'h5', 'h6'])
+
+    # Traverse all text nodes once
+    for text_node in soup.find_all(string=True):
+        # Skip nodes inside tags that shouldn't contain links
+        if text_node.parent.name in exclude_tags:
+            continue
+
+        # Replace keywords in the text
+        def replace_keyword(match):
+            replaced_keyword = match.group(0)
+            return keyword_to_link[replaced_keyword]
+
+        new_text = keyword_pattern.sub(replace_keyword, text_node)
+        if new_text != text_node:  # If the text has actually changed
+            text_node.replace_with(BeautifulSoup(new_text, "html.parser"))
+            replaced = True  # Mark that replacement has occurred
+
+    return str(soup), replaced
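
To illustrate the replacement behavior, a quick sketch with made-up note IDs: keywords are linked only inside plain text nodes, existing anchors (and optionally headings) are left untouched, and the current note never links to itself.

```
from trilium_py.utils.html_util import add_internal_links

html = '<p>See Python and <a href="#">Python</a> docs.</p>'
updated, replaced = add_internal_links(html, [('Python', 'abc123')], current_note_id='xyz789')
print(replaced)
# True
print(updated)
# <p>See <a class="reference-link" href="#root/abc123">Python</a> and <a href="#">Python</a> docs.</p>
```

One caveat of this design: the pattern requires a `\b` word boundary on both sides of a title, so a title embedded in unbroken CJK text will only be matched when it is set off by whitespace or punctuation.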

src/trilium_py/utils/note_util.py (+28)

@@ -110,3 +110,31 @@ def sort_note_by_headings(html_content, locale_str='zh_CN.UTF-8'):
     sorted_html_string = content_before_first_h + sorted_html

     return sorted_html_string
+
+
+def preprocess_note_title_list(data):
+    """
+    Optimized version of the function to preprocess the list of [title, note_id].
+    Cleans titles, removes duplicates and previous matching entries, and sorts by title length.
+    """
+
+    def clean_title(title):
+        return title.strip()
+
+    # Use an ordered dictionary to maintain insertion order while ensuring uniqueness
+    from collections import OrderedDict
+
+    cleaned_data = OrderedDict()
+
+    # Traverse the data and process each title
+    for title, note_id in data:
+        cleaned_title = clean_title(title)
+        if cleaned_title in cleaned_data:
+            # If the title already exists, remove it
+            del cleaned_data[cleaned_title]
+        else:
+            # Otherwise, add it to the dictionary
+            cleaned_data[cleaned_title] = note_id
+
+    # Convert the dictionary back to a list and sort by title length (descending)
+    return sorted(cleaned_data.items(), key=lambda x: len(x[0]), reverse=True)
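
A quick sketch of the helper's behavior with made-up note IDs: titles that collide after stripping are dropped entirely, so ambiguous names are never auto-linked, and the longest-first ordering ensures the longest title wins in the alternation regex that `add_internal_links` builds.

```
from trilium_py.utils.note_util import preprocess_note_title_list

data = [
    ['Python', 'id1'],
    ['  Python  ', 'id2'],  # same title after strip(): both entries are dropped
    ['Trilium Notes', 'id3'],
    ['Trilium', 'id4'],
]
print(preprocess_note_title_list(data))
# [('Trilium Notes', 'id3'), ('Trilium', 'id4')]
```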
