From 43a26dd4c50c55ec0a4551a984927d30c8c63b29 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 8 Sep 2023 22:41:50 +0800 Subject: [PATCH 1/3] =?UTF-8?q?openai-translator=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- openai-translator/ai_translator/book/book.py | 14 +-- .../ai_translator/book/content.py | 32 +++++-- openai-translator/ai_translator/book/page.py | 6 +- openai-translator/ai_translator/main.py | 33 ++++--- .../ai_translator/model/glm_model.py | 51 +++++------ .../ai_translator/model/model.py | 14 ++- .../ai_translator/model/openai_model.py | 28 ++++-- .../ai_translator/translator/exceptions.py | 7 +- .../ai_translator/translator/pdf_parser.py | 37 ++++++-- .../translator/pdf_translator.py | 27 +++++- .../ai_translator/translator/writer.py | 86 ++++++++++++------- .../ai_translator/utils/argument_parser.py | 17 +++- .../ai_translator/utils/config_loader.py | 7 ++ .../ai_translator/utils/logger.py | 42 +++++---- 14 files changed, 265 insertions(+), 136 deletions(-) diff --git a/openai-translator/ai_translator/book/book.py b/openai-translator/ai_translator/book/book.py index b079357b..757f5577 100644 --- a/openai-translator/ai_translator/book/book.py +++ b/openai-translator/ai_translator/book/book.py @@ -1,9 +1,9 @@ -from .page import Page +from .page import Page # 导入 Page 类 -class Book: - def __init__(self, pdf_file_path): - self.pdf_file_path = pdf_file_path - self.pages = [] +class Book: # 定义 Book 类 + def __init__(self, pdf_file_path): # 定义构造函数,传入 pdf 文件路径 + self.pdf_file_path = pdf_file_path # 将 pdf 文件路径赋值给实例变量 + self.pages = [] # 初始化页面列表 - def add_page(self, page: Page): - self.pages.append(page) \ No newline at end of file + def add_page(self, page: Page): # 定义添加页面方法,传入 Page 类型的参数 + self.pages.append(page) # 将页面添加到页面列表中 \ No newline at end of file diff --git a/openai-translator/ai_translator/book/content.py 
b/openai-translator/ai_translator/book/content.py index 623c7b2d..bc6628b7 100644 --- a/openai-translator/ai_translator/book/content.py +++ b/openai-translator/ai_translator/book/content.py @@ -1,26 +1,35 @@ +# 导入 pandas 库,用于数据处理 import pandas as pd +# 导入 Enum 和 auto,用于创建枚举类型 from enum import Enum, auto +# 导入 PIL 库中的 Image 类,用于图像处理 from PIL import Image as PILImage +# 导入自定义的 LOG 函数,用于日志记录 from utils import LOG +# 定义 ContentType 枚举类,包含 TEXT、TABLE 和 IMAGE 三种类型 class ContentType(Enum): TEXT = auto() TABLE = auto() IMAGE = auto() +# 定义 Content 类,包含 content_type、original、translation 和 status 四个属性 class Content: def __init__(self, content_type, original, translation=None): - self.content_type = content_type - self.original = original - self.translation = translation - self.status = False + self.content_type = content_type # 内容类型 + self.original = original # 原始内容 + self.translation = translation # 翻译后的内容 + self.status = False # 翻译状态,默认为 False + # 设置翻译后的内容和翻译状态 def set_translation(self, translation, status): + # 检查翻译类型是否正确 if not self.check_translation_type(translation): raise ValueError(f"Invalid translation type. 
Expected {self.content_type}, but got {type(translation)}") self.translation = translation self.status = status + # 检查翻译类型是否正确 def check_translation_type(self, translation): if self.content_type == ContentType.TEXT and isinstance(translation, str): return True @@ -30,27 +39,30 @@ def check_translation_type(self, translation): return True return False - +# 定义 TableContent 类,继承自 Content 类 class TableContent(Content): def __init__(self, data, translation=None): df = pd.DataFrame(data) - # Verify if the number of rows and columns in the data and DataFrame object match + # 验证提取的表格数据和 DataFrame 对象的行数和列数是否匹配 if len(data) != len(df) or len(data[0]) != len(df.columns): raise ValueError("The number of rows and columns in the extracted table data and DataFrame object do not match.") + # 调用父类的构造函数,设置内容类型为表格,内容为 DataFrame 对象 super().__init__(ContentType.TABLE, df) + # 设置翻译后的内容和翻译状态 def set_translation(self, translation, status): try: + # 如果翻译不是字符串类型,则抛出异常 if not isinstance(translation, str): raise ValueError(f"Invalid translation type. 
Expected str, but got {type(translation)}") LOG.debug(translation) - # Convert the string to a list of lists + # 将字符串转换为列表形式 table_data = [row.strip().split() for row in translation.strip().split('\n')] LOG.debug(table_data) - # Create a DataFrame from the table_data + # 从 table_data 创建 DataFrame translated_df = pd.DataFrame(table_data[1:], columns=table_data[0]) LOG.debug(translated_df) self.translation = translated_df @@ -60,18 +72,22 @@ def set_translation(self, translation, status): self.translation = None self.status = False + # 将 DataFrame 转换为字符串形式 def __str__(self): return self.original.to_string(header=False, index=False) + # 迭代表格中的每个元素 def iter_items(self, translated=False): target_df = self.translation if translated else self.original for row_idx, row in target_df.iterrows(): for col_idx, item in enumerate(row): yield (row_idx, col_idx, item) + # 更新表格中的元素 def update_item(self, row_idx, col_idx, new_value, translated=False): target_df = self.translation if translated else self.original target_df.at[row_idx, col_idx] = new_value + # 获取原始表格的字符串形式 def get_original_as_str(self): return self.original.to_string(header=False, index=False) \ No newline at end of file diff --git a/openai-translator/ai_translator/book/page.py b/openai-translator/ai_translator/book/page.py index df12e772..273fae97 100644 --- a/openai-translator/ai_translator/book/page.py +++ b/openai-translator/ai_translator/book/page.py @@ -1,8 +1,8 @@ -from .content import Content +from .content import Content # 导入 Content 类 class Page: def __init__(self): - self.contents = [] + self.contents = [] # 初始化 contents 列表为空 def add_content(self, content: Content): - self.contents.append(content) + self.contents.append(content) # 将 content 添加到 contents 列表中 \ No newline at end of file diff --git a/openai-translator/ai_translator/main.py b/openai-translator/ai_translator/main.py index 6b8e0c9b..813eec40 100644 --- a/openai-translator/ai_translator/main.py +++ b/openai-translator/ai_translator/main.py @@ 
-1,27 +1,26 @@ -import sys -import os +import sys # 导入 sys 模块,用于添加系统路径 +import os # 导入 os 模块,用于访问操作系统功能 -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.abspath(__file__))) # 将当前文件所在目录添加到系统路径中,以便导入其他模块 -from utils import ArgumentParser, ConfigLoader, LOG -from model import GLMModel, OpenAIModel -from translator import PDFTranslator +from utils import ArgumentParser, ConfigLoader, LOG # 导入自定义模块 utils 中的 ArgumentParser、ConfigLoader 和 LOG 类 +from model import GLMModel, OpenAIModel # 导入自定义模块 model 中的 GLMModel 和 OpenAIModel 类 +from translator import PDFTranslator # 导入自定义模块 translator 中的 PDFTranslator 类 if __name__ == "__main__": - argument_parser = ArgumentParser() - args = argument_parser.parse_arguments() - config_loader = ConfigLoader(args.config) + argument_parser = ArgumentParser() # 实例化 ArgumentParser 类 + args = argument_parser.parse_arguments() # 解析命令行参数 - config = config_loader.load_config() + config_loader = ConfigLoader(args.config) # 实例化 ConfigLoader 类,并传入配置文件路径 + config = config_loader.load_config() # 加载配置文件 - model_name = args.openai_model if args.openai_model else config['OpenAIModel']['model'] - api_key = args.openai_api_key if args.openai_api_key else config['OpenAIModel']['api_key'] - model = OpenAIModel(model=model_name, api_key=api_key) + model_name = args.openai_model if args.openai_model else config['OpenAIModel']['model'] # 获取 OpenAI 模型名称 + api_key = args.openai_api_key if args.openai_api_key else config['OpenAIModel']['api_key'] # 获取 OpenAI API 密钥 + model = OpenAIModel(model=model_name, api_key=api_key) # 实例化 OpenAIModel 类,并传入模型名称和 API 密钥 - - pdf_file_path = args.book if args.book else config['common']['book'] - file_format = args.file_format if args.file_format else config['common']['file_format'] + pdf_file_path = args.book if args.book else config['common']['book'] # 获取 PDF 文件路径 + file_format = args.file_format if args.file_format else config['common']['file_format'] # 获取文件格式 # 实例化 
PDFTranslator 类,并调用 translate_pdf() 方法 translator = PDFTranslator(model) - translator.translate_pdf(pdf_file_path, file_format) + translator.translate_pdf(pdf_file_path, file_format) \ No newline at end of file diff --git a/openai-translator/ai_translator/model/glm_model.py b/openai-translator/ai_translator/model/glm_model.py index 23f7e938..f27d2e22 100644 --- a/openai-translator/ai_translator/model/glm_model.py +++ b/openai-translator/ai_translator/model/glm_model.py @@ -1,30 +1,31 @@ -import requests -import simplejson +import requests # 导入requests库,用于发送HTTP请求 +import simplejson # 导入simplejson库,用于处理JSON格式数据 -from model import Model +from model import Model # 导入Model类,用于继承 -class GLMModel(Model): - def __init__(self, model_url: str, timeout: int): - self.model_url = model_url - self.timeout = timeout +class GLMModel(Model): # 定义GLMModel类,继承Model类 + def __init__(self, model_url: str, timeout: int): # 定义构造函数,传入模型URL和超时时间 + self.model_url = model_url # 将模型URL赋值给实例变量model_url + self.timeout = timeout # 将超时时间赋值给实例变量timeout - def make_request(self, prompt): - try: - payload = { - "prompt": prompt, - "history": [] + def make_request(self, prompt): # 定义make_request方法,传入prompt参数,返回翻译结果和是否成功的标志 + try: # 尝试执行以下代码 + payload = { # 定义payload字典,包含prompt和history两个键值对 + "prompt": prompt, # 将prompt参数赋值给payload字典的prompt键 + "history": [] # 将空列表赋值给payload字典的history键 } + # 发送POST请求,传入模型URL、payload字典和超时时间,将响应赋值给response变量 response = requests.post(self.model_url, json=payload, timeout=self.timeout) - response.raise_for_status() - response_dict = response.json() - translation = response_dict["response"] - return translation, True - except requests.exceptions.RequestException as e: - raise Exception(f"请求异常:{e}") - except requests.exceptions.Timeout as e: - raise Exception(f"请求超时:{e}") - except simplejson.errors.JSONDecodeError as e: - raise Exception("Error: response is not valid JSON format.") - except Exception as e: - raise Exception(f"发生了未知错误:{e}") - return "", False + 
response.raise_for_status() # 如果响应状态码不是200,抛出异常 + response_dict = response.json() # 将响应的JSON格式数据转换为Python字典,赋值给response_dict变量 + translation = response_dict["response"] # 从response_dict字典中获取response键对应的值,赋值给translation变量 + return translation, True # 返回translation和True + except requests.exceptions.RequestException as e: # 如果发生requests库的异常,将异常信息赋值给e变量 + raise Exception(f"请求异常:{e}") # 抛出异常,提示请求异常 + except requests.exceptions.Timeout as e: # 如果发生请求超时异常,将异常信息赋值给e变量 + raise Exception(f"请求超时:{e}") # 抛出异常,提示请求超时 + except simplejson.errors.JSONDecodeError as e: # 如果响应数据不是正确的JSON格式,将异常信息赋值给e变量 + raise Exception("Error: response is not valid JSON format.") # 抛出异常,提示响应数据不是正确的JSON格式 + except Exception as e: # 如果发生其他异常,将异常信息赋值给e变量 + raise Exception(f"发生了未知错误:{e}") # 抛出异常,提示发生了未知错误 + return "", False # 如果没有返回translation和True,返回空字符串和False \ No newline at end of file diff --git a/openai-translator/ai_translator/model/model.py b/openai-translator/ai_translator/model/model.py index 80f5db6e..a0278560 100644 --- a/openai-translator/ai_translator/model/model.py +++ b/openai-translator/ai_translator/model/model.py @@ -1,17 +1,29 @@ +# 引入 book 中的 ContentType 枚举类型 from book import ContentType +# 定义一个 Model 类 class Model: + # 定义一个方法,用于生成文本翻译的提示 def make_text_prompt(self, text: str, target_language: str) -> str: + # 返回一个字符串,表示将 text 翻译为 target_language return f"翻译为{target_language}:{text}" + # 定义一个方法,用于生成表格翻译的提示 def make_table_prompt(self, table: str, target_language: str) -> str: + # 返回一个字符串,表示将 table 翻译为 target_language,并保持原有的间距(空格,分隔符),以表格形式返回 return f"翻译为{target_language},保持间距(空格,分隔符),以表格形式返回:\n{table}" + # 定义一个方法,用于生成翻译提示 def translate_prompt(self, content, target_language: str) -> str: + # 判断内容类型,生成相应的翻译提示 if content.content_type == ContentType.TEXT: + # 如果内容类型为文本,则调用 make_text_prompt 方法生成翻译提示 return self.make_text_prompt(content.original, target_language) elif content.content_type == ContentType.TABLE: + # 如果内容类型为表格,则调用 make_table_prompt 方法生成翻译提示 return 
self.make_table_prompt(content.get_original_as_str(), target_language) + # 定义一个方法,用于生成请求 def make_request(self, prompt): - raise NotImplementedError("子类必须实现 make_request 方法") + # 抛出 NotImplementedError 异常,提示子类必须实现 make_request 方法 + raise NotImplementedError("子类必须实现 make_request 方法") \ No newline at end of file diff --git a/openai-translator/ai_translator/model/openai_model.py b/openai-translator/ai_translator/model/openai_model.py index 61dfff1f..ab1d778c 100644 --- a/openai-translator/ai_translator/model/openai_model.py +++ b/openai-translator/ai_translator/model/openai_model.py @@ -1,20 +1,24 @@ -import openai -import requests -import simplejson -import time +import openai # 导入 OpenAI API +import requests # 导入 requests 库,用于发送 HTTP 请求 +import simplejson # 导入 simplejson 库,用于处理 JSON 数据 +import time # 导入 time 库,用于处理时间 -from model import Model -from utils import LOG +from model import Model # # 导入Model类,用于继承 +from utils import LOG # 导入自定义的 LOG 函数,用于日志记录 +# 继承 Model 类,实现 OpenAIModel class OpenAIModel(Model): + # 初始化函数,传入模型名称和 API 密钥 def __init__(self, model: str, api_key: str): self.model = model openai.api_key = api_key + # 发送请求函数,传入请求的 prompt,返回翻译结果和是否成功的标志 def make_request(self, prompt): attempts = 0 while attempts < 3: try: + # 如果模型是 gpt-3.5-turbo,则使用 openai.ChatCompletion.create() 方法 if self.model == "gpt-3.5-turbo": response = openai.ChatCompletion.create( model=self.model, @@ -22,7 +26,9 @@ def make_request(self, prompt): {"role": "user", "content": prompt} ] ) + # 获取 response 中的翻译结果 translation = response.choices[0].message['content'].strip() + # 如果模型不是 gpt-3.5-turbo,则使用 openai.Completion.create() 方法 else: response = openai.Completion.create( model=self.model, @@ -30,22 +36,30 @@ def make_request(self, prompt): max_tokens=150, temperature=0 ) + # 获取 response 中的翻译结果 translation = response.choices[0].text.strip() return translation, True + # 捕获openai.error.RateLimitError异常 except openai.error.RateLimitError: attempts += 1 + # 如果尝试次数小于3,则等待60秒后重试 if attempts < 3: 
LOG.warning("Rate limit reached. Waiting for 60 seconds before retrying.") time.sleep(60) + # 如果尝试次数大于等于3,则抛出异常 else: raise Exception("Rate limit reached. Maximum attempts exceeded.") + # 如果请求异常,则抛出异常 except requests.exceptions.RequestException as e: raise Exception(f"请求异常:{e}") + # 如果请求超时,则抛出异常 except requests.exceptions.Timeout as e: raise Exception(f"请求超时:{e}") + # 如果 response 不是 JSON 格式,则抛出异常 except simplejson.errors.JSONDecodeError as e: raise Exception("Error: response is not valid JSON format.") + # 如果发生未知错误,则抛出异常 except Exception as e: raise Exception(f"发生了未知错误:{e}") - return "", False + return "", False \ No newline at end of file diff --git a/openai-translator/ai_translator/translator/exceptions.py b/openai-translator/ai_translator/translator/exceptions.py index 4f4c23c1..1c6c49bb 100644 --- a/openai-translator/ai_translator/translator/exceptions.py +++ b/openai-translator/ai_translator/translator/exceptions.py @@ -1,5 +1,10 @@ +# 定义一个名为PageOutOfRangeException的异常类 class PageOutOfRangeException(Exception): + # 定义构造函数,接收两个参数:book_pages和requested_pages def __init__(self, book_pages, requested_pages): + # 将book_pages参数赋值给self.book_pages属性 self.book_pages = book_pages + # 将requested_pages参数赋值给self.requested_pages属性 self.requested_pages = requested_pages - super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested.") + # 调用父类的构造函数,传入异常信息字符串 + super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested.") \ No newline at end of file diff --git a/openai-translator/ai_translator/translator/pdf_parser.py b/openai-translator/ai_translator/translator/pdf_parser.py index 6f2f9bc3..0d5de921 100644 --- a/openai-translator/ai_translator/translator/pdf_parser.py +++ b/openai-translator/ai_translator/translator/pdf_parser.py @@ -1,58 +1,77 @@ +# 导入 pdfplumber 库,用于解析 PDF 文件 import pdfplumber +# 导入 Optional 类型,用于指定可选参数的类型 from typing import Optional +# 导入 
Book、Page、Content、ContentType 和 TableContent 类,用于表示书籍、页面、内容、内容类型和表格内容 from book import Book, Page, Content, ContentType, TableContent +# 导入 PageOutOfRangeException 异常,用于表示页面超出范围的错误 from translator.exceptions import PageOutOfRangeException +# 导入 LOG 对象,用于记录日志 from utils import LOG - class PDFParser: def __init__(self): pass + # 解析 PDF 文件,返回 Book 对象 + # 第一个参数 pdf_file_path 用于指定要解析的 PDF 文件的路径 + # 第二个参数 pages 用于指定解析的页数,如果不指定,则解析所有页,如果指定了页数,但页数超出了 PDF 文件的总页数,则抛出异常 def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book: + # 创建一个Book对象,用于存储解析后的PDF内容 book = Book(pdf_file_path) + # 打开PDF文件 with pdfplumber.open(pdf_file_path) as pdf: + # 如果指定了页数,但页数超出了PDF文件的总页数,则抛出异常 if pages is not None and pages > len(pdf.pages): raise PageOutOfRangeException(len(pdf.pages), pages) + # 如果没有指定页数,则解析所有页 if pages is None: pages_to_parse = pdf.pages else: pages_to_parse = pdf.pages[:pages] + # 遍历每一页 for pdf_page in pages_to_parse: + # 创建一个Page对象,用于存储解析后的PDF页面内容 page = Page() - # Store the original text content + # 提取原始文本内容和表格数据 raw_text = pdf_page.extract_text() tables = pdf_page.extract_tables() - # Remove each cell's content from the original text + # 从原始文本中移除表格中的每个单元格的内容 for table_data in tables: for row in table_data: for cell in row: + # 如果单元格的内容不为空,则从原始文本中移除单元格的内容 raw_text = raw_text.replace(cell, "", 1) - # Handling text + # 处理文本内容 if raw_text: - # Remove empty lines and leading/trailing whitespaces + # 移除空行和前导/尾随空格 + # 1. 使用splitlines()方法将原始文本按行分割,返回一个列表 raw_text_lines = raw_text.splitlines() + # 2. 使用strip()方法移除每一行的前导/尾随空格 cleaned_raw_text_lines = [line.strip() for line in raw_text_lines if line.strip()] + # 3. 
使用join()方法将列表中的每一行连接起来,返回一个字符串 cleaned_raw_text = "\n".join(cleaned_raw_text_lines) + # 创建一个Content对象,用于存储文本内容 text_content = Content(content_type=ContentType.TEXT, original=cleaned_raw_text) page.add_content(text_content) LOG.debug(f"[raw_text]\n {cleaned_raw_text}") - - - # Handling tables + # 处理表格内容 if tables: + # 创建一个TableContent对象,用于存储表格内容 table = TableContent(tables) page.add_content(table) LOG.debug(f"[table]\n{table}") + # 将解析后的页面内容添加到Book对象中 book.add_page(page) - return book + # 返回解析后的Book对象 + return book \ No newline at end of file diff --git a/openai-translator/ai_translator/translator/pdf_translator.py b/openai-translator/ai_translator/translator/pdf_translator.py index ab0b40b4..aab82fd3 100644 --- a/openai-translator/ai_translator/translator/pdf_translator.py +++ b/openai-translator/ai_translator/translator/pdf_translator.py @@ -1,26 +1,49 @@ +# 导入 typing 模块中的 Optional 类型 from typing import Optional +# 导入 Model 类,用于翻译模型的加载和使用 from model import Model +# 导入 PDFParser 类,用于解析 PDF 文件 from translator.pdf_parser import PDFParser +# 导入 Writer 类,用于将翻译结果写入文件 from translator.writer import Writer +# 导入 LOG 函数,用于记录日志 from utils import LOG +# PDFTranslator 类 class PDFTranslator: + # 构造函数,接收一个 Model 类型的参数 def __init__(self, model: Model): + # 将传入的 Model 对象赋值给 self.model self.model = model + # 创建 PDFParser 对象并赋值给 self.pdf_parser self.pdf_parser = PDFParser() + # 创建 Writer 对象并赋值给 self.writer self.writer = Writer() + # 翻译 PDF 文件 + # 第一个参数 pdf_file_path 用于指定要翻译的 PDF 文件的路径 + # 第二个参数 file_format 用于指定翻译后的文件格式,默认为 PDF + # 第三个参数 target_language 用于指定目标语言,默认为中文 + # 第四个参数 output_file_path 用于指定翻译后的文件的保存路径,默认为 None,表示不保存 + # 第五个参数 pages 用于指定翻译的页数,默认为 None,表示翻译所有页 def translate_pdf(self, pdf_file_path: str, file_format: str = 'PDF', target_language: str = '中文', output_file_path: str = None, pages: Optional[int] = None): + # 解析 PDF 文件并返回一个 Book 对象,赋值给 self.book self.book = self.pdf_parser.parse_pdf(pdf_file_path, pages) + # 遍历每一页和每个内容块 for page_idx, page in 
enumerate(self.book.pages): for content_idx, content in enumerate(page.contents): + # 将内容块传入 Model 对象的 translate_prompt 方法中,返回翻译的提示语句 prompt = self.model.translate_prompt(content, target_language) + # 打印提示语句 LOG.debug(prompt) + # 将提示语句传入 Model 对象的 make_request 方法中,返回翻译结果和状态 translation, status = self.model.make_request(prompt) + # 打印翻译结果 LOG.info(translation) - # Update the content in self.book.pages directly + # 直接在 self.book.pages 中更新内容块的翻译结果和状态 self.book.pages[page_idx].contents[content_idx].set_translation(translation, status) - self.writer.save_translated_book(self.book, output_file_path, file_format) + # 将翻译后的 Book 对象保存到文件中 + self.writer.save_translated_book(self.book, output_file_path, file_format) \ No newline at end of file diff --git a/openai-translator/ai_translator/translator/writer.py b/openai-translator/ai_translator/translator/writer.py index 12b37e75..eb5f7db2 100644 --- a/openai-translator/ai_translator/translator/writer.py +++ b/openai-translator/ai_translator/translator/writer.py @@ -1,108 +1,128 @@ +# 导入 os 模块,用于访问操作系统功能 import os +# 导入 reportlab 库中的 colors、pagesizes、units 模块,用于设置 PDF 文件的颜色、页面大小、单位等 from reportlab.lib import colors, pagesizes, units +# 导入 reportlab 库中的 getSampleStyleSheet、ParagraphStyle 模块,用于设置 PDF 文件中的样式 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +# 导入 reportlab 库中的 pdfmetrics、TTFont 模块,用于设置 PDF 文件中的字体 from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont +# 导入 reportlab 库中的 SimpleDocTemplate、Paragraph、Spacer、Table、TableStyle、PageBreak 模块,用于创建 PDF 文件的各种元素 from reportlab.platypus import ( SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak ) - +# 导入自定义的 Book、ContentType 类,用于表示书籍和内容类型 from book import Book, ContentType +# 导入自定义的 LOG 函数,用于记录日志 from utils import LOG class Writer: def __init__(self): pass + # 保存翻译后的书籍 + # 第一个参数 book 是 Book 类的实例 + # 第二个参数 output_file_path 是输出文件的路径 + # 第三个参数 file_format 是输出文件的格式,支持 PDF 和 Markdown def save_translated_book(self, 
book: Book, output_file_path: str = None, file_format: str = "PDF"): + # 判断输出文件的格式,支持PDF和Markdown if file_format.lower() == "pdf": - self._save_translated_book_pdf(book, output_file_path) + self._save_translated_book_pdf(book, output_file_path) elif file_format.lower() == "markdown": self._save_translated_book_markdown(book, output_file_path) else: raise ValueError(f"Unsupported file format: {file_format}") def _save_translated_book_pdf(self, book: Book, output_file_path: str = None): + # 如果未指定输出路径,则使用默认路径 if output_file_path is None: output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.pdf') LOG.info(f"pdf_file_path: {book.pdf_file_path}") LOG.info(f"开始翻译: {output_file_path}") - - # Register Chinese font + + # 注册中文字体 font_path = "../fonts/simsun.ttc" # 请将此路径替换为您的字体文件路径 pdfmetrics.registerFont(TTFont("SimSun", font_path)) - - # Create a new ParagraphStyle with the SimSun font + + # 创建使用SimSun字体的ParagraphStyle simsun_style = ParagraphStyle('SimSun', fontName='SimSun', fontSize=12, leading=14) - - # Create a PDF document + + # 创建PDF文档 + # 页面大小为A4纸张大小 doc = SimpleDocTemplate(output_file_path, pagesize=pagesizes.letter) + # 设置页面的左、右、上、下边距 styles = getSampleStyleSheet() + # 创建一个空的列表,用于存储PDF中的元素 story = [] - - # Iterate over the pages and contents + + # 遍历页面和内容 for page in book.pages: for content in page.contents: + # 如果内容已翻译 if content.status: if content.content_type == ContentType.TEXT: - # Add translated text to the PDF + # 将翻译后的文本添加到PDF中 text = content.translation para = Paragraph(text, simsun_style) story.append(para) - + elif content.content_type == ContentType.TABLE: - # Add table to the PDF + # 将表格添加到PDF中 table = content.translation table_style = TableStyle([ - ('BACKGROUND', (0, 0), (-1, 0), colors.grey), - ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), - ('ALIGN', (0, 0), (-1, -1), 'CENTER'), - ('FONTNAME', (0, 0), (-1, 0), 'SimSun'), # 更改表头字体为 "SimSun" - ('FONTSIZE', (0, 0), (-1, 0), 14), - ('BOTTOMPADDING', (0, 0), (-1, 0), 12), - 
('BACKGROUND', (0, 1), (-1, -1), colors.beige), - ('FONTNAME', (0, 1), (-1, -1), 'SimSun'), # 更改表格中的字体为 "SimSun" - ('GRID', (0, 0), (-1, -1), 1, colors.black) + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), # 设置表头背景颜色 + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), # 设置表头文本颜色 + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), # 设置表格中的文本对齐方式 + ('FONTNAME', (0, 0), (-1, 0), 'SimSun'), # 设置表头字体 + ('FONTSIZE', (0, 0), (-1, 0), 14), # 设置表头字体大小 + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), # 设置表头下边距 + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), # 设置表格内容背景颜色 + ('FONTNAME', (0, 1), (-1, -1), 'SimSun'), # 设置表格内容字体 + ('GRID', (0, 0), (-1, -1), 1, colors.black) # 设置表格框线 ]) pdf_table = Table(table.values.tolist()) pdf_table.setStyle(table_style) story.append(pdf_table) - # Add a page break after each page except the last one + + # 在每页之后添加分页符,最后一页除外 if page != book.pages[-1]: story.append(PageBreak()) - - # Save the translated book as a new PDF file + + # 将翻译后的书保存为新的PDF文件 doc.build(story) LOG.info(f"翻译完成: {output_file_path}") def _save_translated_book_markdown(self, book: Book, output_file_path: str = None): + # 如果未指定输出路径,则使用默认路径 if output_file_path is None: output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.md') LOG.info(f"pdf_file_path: {book.pdf_file_path}") LOG.info(f"开始翻译: {output_file_path}") + # 打开文件,并设置编码为utf-8 with open(output_file_path, 'w', encoding='utf-8') as output_file: - # Iterate over the pages and contents + # 遍历页面和内容 for page in book.pages: for content in page.contents: + # 如果内容已翻译 if content.status: if content.content_type == ContentType.TEXT: - # Add translated text to the Markdown file + # 将翻译后的文本添加到Markdown文件中 text = content.translation output_file.write(text + '\n\n') - + elif content.content_type == ContentType.TABLE: - # Add table to the Markdown file + # 将表格添加到Markdown文件中 table = content.translation header = '| ' + ' | '.join(str(column) for column in table.columns) + ' |' + '\n' separator = '| ' + ' | '.join(['---'] * len(table.columns)) 
+ ' |' + '\n' - # body = '\n'.join(['| ' + ' | '.join(row) + ' |' for row in table.values.tolist()]) + '\n\n' + # body = '\n'.join(['| ' + ' | '.join(row) for row in table.values.tolist()]) + '\n\n' body = '\n'.join(['| ' + ' | '.join(str(cell) for cell in row) + ' |' for row in table.values.tolist()]) + '\n\n' output_file.write(header + separator + body) - - # Add a page break (horizontal rule) after each page except the last one + + # 在每页之后添加分隔符(水平线),最后一页除外 if page != book.pages[-1]: output_file.write('---\n\n') - + LOG.info(f"翻译完成: {output_file_path}") \ No newline at end of file diff --git a/openai-translator/ai_translator/utils/argument_parser.py b/openai-translator/ai_translator/utils/argument_parser.py index 95681dc1..a21bdb33 100644 --- a/openai-translator/ai_translator/utils/argument_parser.py +++ b/openai-translator/ai_translator/utils/argument_parser.py @@ -1,19 +1,34 @@ +# 导入argparse模块,用于解析命令行参数 import argparse +# 定义一个ArgumentParser类 class ArgumentParser: + # 初始化函数 def __init__(self): + # 创建一个ArgumentParser对象,设置description属性 self.parser = argparse.ArgumentParser(description='Translate English PDF book to Chinese.') + # 添加一个--config参数,类型为字符串,默认值为'config.yaml',帮助信息为'包含模型和API设置的配置文件。' self.parser.add_argument('--config', type=str, default='config.yaml', help='Configuration file with model and API settings.') + # 添加一个--model_type参数,类型为字符串,必填,可选值为'GLMModel'和'OpenAIModel',帮助信息为'要使用的翻译模型类型。可选值为"GLMModel"和"OpenAIModel"。' self.parser.add_argument('--model_type', type=str, required=True, choices=['GLMModel', 'OpenAIModel'], help='The type of translation model to use. 
Choose between "GLMModel" and "OpenAIModel".') + # 添加一个--glm_model_url参数,类型为字符串,帮助信息为'ChatGLM模型的URL。' self.parser.add_argument('--glm_model_url', type=str, help='The URL of the ChatGLM model URL.') + # 添加一个--timeout参数,类型为整数,帮助信息为'API请求的超时时间(秒)。' self.parser.add_argument('--timeout', type=int, help='Timeout for the API request in seconds.') + # 添加一个--openai_model参数,类型为字符串,帮助信息为'OpenAI模型的名称。如果model_type为"OpenAIModel",则必填。' self.parser.add_argument('--openai_model', type=str, help='The model name of OpenAI Model. Required if model_type is "OpenAIModel".') + # 添加一个--openai_api_key参数,类型为字符串,帮助信息为'OpenAIModel的API密钥。如果model_type为"OpenAIModel",则必填。' self.parser.add_argument('--openai_api_key', type=str, help='The API key for OpenAIModel. Required if model_type is "OpenAIModel".') + # 添加一个--book参数,类型为字符串,帮助信息为'要翻译的PDF文件。' self.parser.add_argument('--book', type=str, help='PDF file to translate.') + # 添加一个--file_format参数,类型为字符串,帮助信息为'翻译后的书籍文件格式。现在支持PDF和Markdown。' self.parser.add_argument('--file_format', type=str, help='The file format of translated book. 
Now supporting PDF and Markdown') + # 解析参数 def parse_arguments(self): + # 调用parse_args()方法解析参数 args = self.parser.parse_args() + # 如果model_type为'OpenAIModel',但是openai_model和openai_api_key都没有设置,则抛出错误 if args.model_type == 'OpenAIModel' and not args.openai_model and not args.openai_api_key: self.parser.error("--openai_model and --openai_api_key is required when using OpenAIModel") - return args + return args \ No newline at end of file diff --git a/openai-translator/ai_translator/utils/config_loader.py b/openai-translator/ai_translator/utils/config_loader.py index 0f006948..76eb2b4e 100644 --- a/openai-translator/ai_translator/utils/config_loader.py +++ b/openai-translator/ai_translator/utils/config_loader.py @@ -1,10 +1,17 @@ +# 导入yaml模块,用于读取和解析YAML文件 import yaml +# 定义 ConfigLoader 类 class ConfigLoader: + # 初始化方法,接收配置文件路径参数 def __init__(self, config_path): self.config_path = config_path + # 加载配置文件的方法 def load_config(self): + # 打开配置文件 with open(self.config_path, "r") as f: + # 使用 yaml 库加载配置文件内容 config = yaml.safe_load(f) + # 返回配置文件内容 return config \ No newline at end of file diff --git a/openai-translator/ai_translator/utils/logger.py b/openai-translator/ai_translator/utils/logger.py index a252b50e..05166990 100644 --- a/openai-translator/ai_translator/utils/logger.py +++ b/openai-translator/ai_translator/utils/logger.py @@ -1,32 +1,30 @@ -from loguru import logger -import os -import sys +from loguru import logger # 导入loguru库 +import os # 导入os库 +import sys # 导入sys库 -LOG_FILE = "translation.log" -ROTATION_TIME = "02:00" +LOG_FILE = "translation.log" # 定义日志文件名 +ROTATION_TIME = "02:00" # 定义日志轮换时间 class Logger: + # 日志类的构造函数,参数分别为日志名称、日志目录、是否开启debug模式 def __init__(self, name="translation", log_dir="logs", debug=False): - if not os.path.exists(log_dir): - os.makedirs(log_dir) - log_file_path = os.path.join(log_dir, LOG_FILE) + if not os.path.exists(log_dir): # 如果日志目录不存在 + os.makedirs(log_dir) # 创建日志目录 + log_file_path = os.path.join(log_dir, LOG_FILE) # 拼接日志文件路径 - # Remove 
default loguru handler - logger.remove() + logger.remove() # 移除logger的所有handler - # Add console handler with a specific log level - level = "DEBUG" if debug else "INFO" - logger.add(sys.stdout, level=level) - # Add file handler with a specific log level and timed rotation - logger.add(log_file_path, rotation=ROTATION_TIME, level="DEBUG") - self.logger = logger + level = "DEBUG" if debug else "INFO" # 根据debug参数设置日志级别 + logger.add(sys.stdout, level=level) # 添加标准输出handler + logger.add(log_file_path, rotation=ROTATION_TIME, level="DEBUG") # 添加文件输出handler + self.logger = logger # 将logger对象保存到实例变量中 -LOG = Logger(debug=True).logger +LOG = Logger(debug=True).logger # 创建Logger实例并获取logger对象 if __name__ == "__main__": - log = Logger().logger + log = Logger().logger # 创建Logger实例并获取logger对象 - log.debug("This is a debug message.") - log.info("This is an info message.") - log.warning("This is a warning message.") - log.error("This is an error message.") + log.debug("This is a debug message.") # 输出debug级别的日志 + log.info("This is an info message.") # 输出info级别的日志 + log.warning("This is a warning message.") # 输出warning级别的日志 + log.error("This is an error message.") # 输出error级别的日志 \ No newline at end of file From edf770c54dc7e5d534a39afbd2241fc33b9091fa Mon Sep 17 00:00:00 2001 From: yuqi Date: Sat, 9 Sep 2023 11:04:18 +0800 Subject: [PATCH 2/3] =?UTF-8?q?CLI=20=E5=8F=82=E6=95=B0=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=20api=5Fkey?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- langchain/openai-translator/ai_translator/flask_server.py | 3 +++ langchain/openai-translator/ai_translator/gradio_server.py | 3 +++ langchain/openai-translator/ai_translator/main.py | 3 +++ .../openai-translator/ai_translator/utils/argument_parser.py | 1 + 4 files changed, 10 insertions(+) diff --git a/langchain/openai-translator/ai_translator/flask_server.py b/langchain/openai-translator/ai_translator/flask_server.py index 7b5bed03..fcf1d161 100644 --- 
a/langchain/openai-translator/ai_translator/flask_server.py +++ b/langchain/openai-translator/ai_translator/flask_server.py @@ -56,6 +56,9 @@ def initialize_translator(): argument_parser = ArgumentParser() args = argument_parser.parse_arguments() + # 设置 OpenAI API Key + os.environ["OPENAI_API_KEY"] = args.api_key + # 初始化配置单例 config = TranslationConfig() config.initialize(args) diff --git a/langchain/openai-translator/ai_translator/gradio_server.py b/langchain/openai-translator/ai_translator/gradio_server.py index 8f7d8569..d148f906 100644 --- a/langchain/openai-translator/ai_translator/gradio_server.py +++ b/langchain/openai-translator/ai_translator/gradio_server.py @@ -39,6 +39,9 @@ def initialize_translator(): argument_parser = ArgumentParser() args = argument_parser.parse_arguments() + # 设置 OpenAI API Key + os.environ["OPENAI_API_KEY"] = args.api_key + # 初始化配置单例 config = TranslationConfig() config.initialize(args) diff --git a/langchain/openai-translator/ai_translator/main.py b/langchain/openai-translator/ai_translator/main.py index 33ae7282..1ac6b2bb 100644 --- a/langchain/openai-translator/ai_translator/main.py +++ b/langchain/openai-translator/ai_translator/main.py @@ -11,6 +11,9 @@ argument_parser = ArgumentParser() args = argument_parser.parse_arguments() + # 设置 OpenAI API Key + os.environ["OPENAI_API_KEY"] = args.api_key + # 初始化配置单例 config = TranslationConfig() config.initialize(args) diff --git a/langchain/openai-translator/ai_translator/utils/argument_parser.py b/langchain/openai-translator/ai_translator/utils/argument_parser.py index 57684d86..16e3470c 100644 --- a/langchain/openai-translator/ai_translator/utils/argument_parser.py +++ b/langchain/openai-translator/ai_translator/utils/argument_parser.py @@ -5,6 +5,7 @@ def __init__(self): self.parser = argparse.ArgumentParser(description='A translation tool that supports translations in any language pair.') self.parser.add_argument('--config_file', type=str, default='config.yaml', help='Configuration 
file with model and API settings.') self.parser.add_argument('--model_name', type=str, help='Name of the Large Language Model.') + self.parser.add_argument('--api_key', type=str, help='The API key for OpenAIModel.') self.parser.add_argument('--input_file', type=str, help='PDF file to translate.') self.parser.add_argument('--output_file_format', type=str, help='The file format of translated book. Now supporting PDF and Markdown') self.parser.add_argument('--source_language', type=str, help='The language of the original book to be translated.') From 88de34bb8691df0767df1e799967c4d104629834 Mon Sep 17 00:00:00 2001 From: yuqi Date: Sat, 9 Sep 2023 22:34:35 +0800 Subject: [PATCH 3/3] =?UTF-8?q?langchain=20=E4=B8=AD=E7=9A=84=20translator?= =?UTF-8?q?=20=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ai_translator/book/book.py | 12 +- .../ai_translator/book/content.py | 116 +++++++++--------- .../ai_translator/book/page.py | 5 +- .../ai_translator/flask_server.py | 32 +++-- .../ai_translator/gradio_server.py | 101 +++++++++------ .../openai-translator/ai_translator/main.py | 29 ++--- .../ai_translator/translator/exceptions.py | 7 +- .../ai_translator/translator/pdf_parser.py | 41 ++++--- .../translator/pdf_translator.py | 31 +++-- .../translator/translation_chain.py | 26 ++-- .../translator/translation_config.py | 14 ++- .../ai_translator/translator/writer.py | 98 +++++++++------ .../ai_translator/utils/argument_parser.py | 15 ++- .../ai_translator/utils/logger.py | 38 +++--- 14 files changed, 341 insertions(+), 224 deletions(-) diff --git a/langchain/openai-translator/ai_translator/book/book.py b/langchain/openai-translator/ai_translator/book/book.py index b079357b..4e2b97dd 100644 --- a/langchain/openai-translator/ai_translator/book/book.py +++ b/langchain/openai-translator/ai_translator/book/book.py @@ -1,9 +1,9 @@ -from .page import Page +from .page import Page # 导入 Page 类,用于添加页面 
class Book: - def __init__(self, pdf_file_path): - self.pdf_file_path = pdf_file_path - self.pages = [] + def __init__(self, pdf_file_path): # 初始化函数,传入 pdf 文件路径 + self.pdf_file_path = pdf_file_path # 将 pdf 文件路径存储到实例变量中 + self.pages = [] # 初始化一个空的页面列表 - def add_page(self, page: Page): - self.pages.append(page) \ No newline at end of file + def add_page(self, page: Page): # 添加页面的方法,传入一个 Page 类的实例 + self.pages.append(page) # 将传入的页面实例添加到页面列表中 \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/book/content.py b/langchain/openai-translator/ai_translator/book/content.py index 901c2a07..b47474c9 100644 --- a/langchain/openai-translator/ai_translator/book/content.py +++ b/langchain/openai-translator/ai_translator/book/content.py @@ -1,85 +1,85 @@ -import pandas as pd +import pandas as pd # 导入 pandas 库,用于处理表格数据 -from enum import Enum, auto -from PIL import Image as PILImage -from utils import LOG -from io import StringIO +from enum import Enum, auto # 导入枚举类和自动编号功能,用于定义 ContentType 枚举类 +from PIL import Image as PILImage # 导入 PIL 库中的 Image 类并重命名为 PILImage,用于处理图像数据 +from utils import LOG # 导入 utils 模块中的 LOG 对象,用于输出调试信息 +from io import StringIO # 导入 StringIO 类,用于将字符串转换为文件对象 -class ContentType(Enum): - TEXT = auto() - TABLE = auto() - IMAGE = auto() +class ContentType(Enum): # 定义 ContentType 枚举类 + TEXT = auto() # 文本类型 + TABLE = auto() # 表格类型 + IMAGE = auto() # 图像类型 -class Content: - def __init__(self, content_type, original, translation=None): - self.content_type = content_type - self.original = original - self.translation = translation - self.status = False +class Content: # 定义 Content 类 + def __init__(self, content_type, original, translation=None): # 初始化函数,传入内容类型、原始内容和翻译内容(可选) + self.content_type = content_type # 将内容类型存储到实例变量中 + self.original = original # 将原始内容存储到实例变量中 + self.translation = translation # 将翻译内容存储到实例变量中 + self.status = False # 初始化翻译状态为 False - def set_translation(self, translation, status): - if not 
self.check_translation_type(translation): - raise ValueError(f"Invalid translation type. Expected {self.content_type}, but got {type(translation)}") - self.translation = translation - self.status = status + def set_translation(self, translation, status): # 设置翻译内容和翻译状态的方法,传入翻译内容和翻译状态 + if not self.check_translation_type(translation): # 如果翻译内容类型不符合要求 + raise ValueError(f"Invalid translation type. Expected {self.content_type}, but got {type(translation)}") # 抛出 ValueError 异常 + self.translation = translation # 将翻译内容存储到实例变量中 + self.status = status # 将翻译状态存储到实例变量中 - def check_translation_type(self, translation): - if self.content_type == ContentType.TEXT and isinstance(translation, str): - return True - elif self.content_type == ContentType.TABLE and isinstance(translation, list): - return True - elif self.content_type == ContentType.IMAGE and isinstance(translation, PILImage.Image): - return True - return False + def check_translation_type(self, translation): # 检查翻译内容类型的方法,传入翻译内容 + if self.content_type == ContentType.TEXT and isinstance(translation, str): # 如果内容类型为 TEXT,且翻译内容为字符串类型 + return True # 返回 True + elif self.content_type == ContentType.TABLE and isinstance(translation, list): # 如果内容类型为 TABLE,且翻译内容为列表类型 + return True # 返回 True + elif self.content_type == ContentType.IMAGE and isinstance(translation, PILImage.Image): # 如果内容类型为 IMAGE,且翻译内容为 PIL 库中的 Image 类型 + return True # 返回 True + return False # 否则返回 False - def __str__(self): + def __str__(self): # 定义 __str__ 方法,返回原始内容 return self.original -class TableContent(Content): - def __init__(self, data, translation=None): - df = pd.DataFrame(data) +class TableContent(Content): # 定义 TableContent 类,继承自 Content 类 + def __init__(self, data, translation=None): # 初始化函数,传入表格数据和翻译内容(可选) + df = pd.DataFrame(data) # 将表格数据转换为 DataFrame 对象 - # Verify if the number of rows and columns in the data and DataFrame object match + # 验证提取的表格数据和 DataFrame 对象的行数和列数是否匹配 if len(data) != len(df) or len(data[0]) != len(df.columns): raise 
ValueError("The number of rows and columns in the extracted table data and DataFrame object do not match.") - super().__init__(ContentType.TABLE, df) + super().__init__(ContentType.TABLE, df) # 调用父类的初始化函数,传入内容类型和 DataFrame 对象 - def set_translation(self, translation, status): + def set_translation(self, translation, status): # 设置翻译内容和翻译状态的方法,传入翻译内容和翻译状态 try: - if not isinstance(translation, str): - raise ValueError(f"Invalid translation type. Expected str, but got {type(translation)}") + if not isinstance(translation, str): # 如果翻译内容不是字符串类型 + raise ValueError(f"Invalid translation type. Expected str, but got {type(translation)}") # 抛出 ValueError 异常 - LOG.debug(f"[translation]\n{translation}") - # Extract column names from the first set of brackets + LOG.debug(f"[translation]\n{translation}") # 输出调试信息 + # 从第一组方括号中提取列名 header = translation.split(']')[0][1:].split(', ') - # Extract data rows from the remaining brackets + # 从剩余的方括号中提取数据行 data_rows = translation.split('] ')[1:] - # Replace Chinese punctuation and split each row into a list of values + # 将数据行中的每一行转换为列表 data_rows = [row[1:-1].split(', ') for row in data_rows] - # Create a DataFrame using the extracted header and data + # 使用提取的列名和数据创建 DataFrame translated_df = pd.DataFrame(data_rows, columns=header) LOG.debug(f"[translated_df]\n{translated_df}") - self.translation = translated_df - self.status = status - except Exception as e: - LOG.error(f"An error occurred during table translation: {e}") - self.translation = None - self.status = False + self.translation = translated_df # 将翻译后的 DataFrame 存储到实例变量中 + self.status = status # 将翻译状态存储到实例变量中 + except Exception as e: # 捕获所有异常 + LOG.error(f"An error occurred during table translation: {e}") # 输出错误信息 + self.translation = None # 将翻译内容设置为 None + self.status = False # 将翻译状态设置为 False - def __str__(self): + def __str__(self): # 定义 __str__ 方法,返回原始内容的字符串表示(不包括表头和行号) return self.original.to_string(header=False, index=False) - def iter_items(self, translated=False): - 
target_df = self.translation if translated else self.original - for row_idx, row in target_df.iterrows(): - for col_idx, item in enumerate(row): - yield (row_idx, col_idx, item) + def iter_items(self, translated=False): # 定义迭代表格元素的方法,传入是否翻译的标志 + target_df = self.translation if translated else self.original # 根据是否翻译的标志选择要迭代的 DataFrame + for row_idx, row in target_df.iterrows(): # 遍历 DataFrame 的每一行 + for col_idx, item in enumerate(row): # 遍历每一行的每一列 + yield (row_idx, col_idx, item) # 返回行索引、列索引和元素值的元组 - def update_item(self, row_idx, col_idx, new_value, translated=False): - target_df = self.translation if translated else self.original - target_df.at[row_idx, col_idx] = new_value + def update_item(self, row_idx, col_idx, new_value, translated=False): # 定义更新表格元素的方法,传入行索引、列索引、新值和是否翻译的标志 + target_df = self.translation if translated else self.original # 根据是否翻译的标志选择要更新的 DataFrame + target_df.at[row_idx, col_idx] = new_value # 更新指定位置的元素值 - def get_original_as_str(self): + def get_original_as_str(self): # 定义获取原始内容的字符串表示的方法 return self.original.to_string(header=False, index=False) \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/book/page.py b/langchain/openai-translator/ai_translator/book/page.py index df12e772..7fa9fde8 100644 --- a/langchain/openai-translator/ai_translator/book/page.py +++ b/langchain/openai-translator/ai_translator/book/page.py @@ -1,8 +1,11 @@ +# 导入 Content 类,用于创建页面内容 from .content import Content class Page: def __init__(self): + # 初始化一个空列表,用于存储页面内容 self.contents = [] def add_content(self, content: Content): - self.contents.append(content) + # 将传入的内容添加到页面内容列表中 + self.contents.append(content) \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/flask_server.py b/langchain/openai-translator/ai_translator/flask_server.py index fcf1d161..b685574b 100644 --- a/langchain/openai-translator/ai_translator/flask_server.py +++ b/langchain/openai-translator/ai_translator/flask_server.py @@ -1,31 +1,37 @@ 
-import sys -import os +import sys # 导入sys模块,用于添加系统路径 +import os # 导入os模块,用于获取文件路径 -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.abspath(__file__))) # 将当前文件所在目录添加到系统路径中,以便导入其他模块 -from flask import Flask, request, send_file, jsonify -from translator import PDFTranslator, TranslationConfig -from utils import ArgumentParser, LOG +from flask import Flask, request, send_file, jsonify # 导入Flask框架中的Flask、request、send_file和jsonify模块,用于创建Flask应用实例、接收请求、发送文件和返回JSON数据 +from translator import PDFTranslator, TranslationConfig # 导入自定义的PDFTranslator和TranslationConfig类,用于翻译PDF文件 +from utils import ArgumentParser, LOG # 导入自定义的ArgumentParser和LOG函数,用于解析命令行参数和打印日志 -app = Flask(__name__) +app = Flask(__name__) # 创建Flask应用实例 -TEMP_FILE_DIR = "flask_temps/" +TEMP_FILE_DIR = "flask_temps/" # 定义临时文件目录 +# 使用 Flask 装饰器语法,将该函数绑定到 /translation 路径上,并指定请求方法为 POST。 @app.route('/translation', methods=['POST']) def translation(): try: + # 获取上传的文件 input_file = request.files['input_file'] + # 获取源语言和目标语言 source_language = request.form.get('source_language', 'English') target_language = request.form.get('target_language', 'Chinese') + # 打印上传文件的信息 LOG.debug(f"[input_file]\n{input_file}") LOG.debug(f"[input_file.filename]\n{input_file.filename}") + # 判断上传的文件是否存在 if input_file and input_file.filename: - # # 创建临时文件 + # 创建临时文件 input_file_path = TEMP_FILE_DIR+input_file.filename LOG.debug(f"[input_file_path]\n{input_file_path}") + # 保存上传的文件到临时文件夹 input_file.save(input_file_path) # 调用翻译函数 @@ -44,25 +50,25 @@ def translation(): # 返回翻译后的文件 return send_file(output_file_path, as_attachment=True) except Exception as e: + # 返回错误信息 response = { 'status': 'error', 'message': str(e) } return jsonify(response), 400 - def initialize_translator(): - # 解析命令行 + # 解析命令行参数 argument_parser = ArgumentParser() args = argument_parser.parse_arguments() # 设置 OpenAI API Key os.environ["OPENAI_API_KEY"] = args.api_key - # 初始化配置单例 + # 初始化配置 config = TranslationConfig() 
config.initialize(args) - # 实例化 PDFTranslator 类,并调用 translate_pdf() 方法 + # 初始化全局变量 Translator global Translator Translator = PDFTranslator(config.model_name) diff --git a/langchain/openai-translator/ai_translator/gradio_server.py b/langchain/openai-translator/ai_translator/gradio_server.py index d148f906..4ed1ae27 100644 --- a/langchain/openai-translator/ai_translator/gradio_server.py +++ b/langchain/openai-translator/ai_translator/gradio_server.py @@ -1,57 +1,78 @@ -import sys -import os -import gradio as gr +import sys # 系统库,用于添加系统路径 +import os # 操作系统库,用于处理文件路径 +import gradio as gr # Gradio库,用于构建用户界面 -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.abspath(__file__))) # 将当前文件所在目录添加到系统路径中,以便导入其他模块 -from utils import ArgumentParser, LOG -from translator import PDFTranslator, TranslationConfig +# 导入 utils 模块的 ArgumentParser和LOG类,用于解析命令行参数和打印日志 +from utils import ArgumentParser, LOG +# 导入 translator 模块的 PDFTranslator和TranslationConfig类,用于翻译PDF文件 +from translator import PDFTranslator, TranslationConfig +# 翻译函数,用于将PDF文件翻译为指定格式的文件 +# input_file: gradio.File类型,输入文件 +# source_language: str类型,源语言,默认为英语 +# target_language: str类型,目标语言,默认为中文 def translation(input_file, source_language, target_language): - LOG.debug(f"[翻译任务]\n源文件: {input_file.name}\n源语言: {source_language}\n目标语言: {target_language}") - output_file_path = Translator.translate_pdf( - input_file.name, source_language=source_language, target_language=target_language) + # 打印调试信息 + LOG.debug(f"[翻译任务]\n源文件: {input_file.name}\n源语言: {source_language}\n目标语言: {target_language}") + + # 调用翻译器的translate_pdf方法进行翻译 + # input_file.name: str类型,输入文件路径 + output_file_path = Translator.translate_pdf( + input_file.name, source_language=source_language, target_language=target_language) - return output_file_path + # 返回翻译后的文件路径 + return output_file_path +# 启动Gradio服务的函数 def launch_gradio(): - iface = gr.Interface( - fn=translation, - title="OpenAI-Translator v2.0(PDF 电子书翻译工具)", - 
inputs=[ - gr.File(label="上传PDF文件"), - gr.Textbox(label="源语言(默认:英文)", placeholder="English", value="English"), - gr.Textbox(label="目标语言(默认:中文)", placeholder="Chinese", value="Chinese") - ], - outputs=[ - gr.File(label="下载翻译文件") - ], - allow_flagging="never" - ) - - iface.launch(share=True, server_name="0.0.0.0") + # 构建界面 + # 点击提交按钮时,inputs 的值作为 fn 的参数传入 + # fn 的返回值作为 outputs 的值 + iface = gr.Interface( + fn=translation, + title="OpenAI-Translator v2.0(PDF电子书翻译工具)", + inputs=[ + gr.File(label="上传PDF文件"), + gr.Textbox(label="源语言(默认:英文)", placeholder="English", value="English"), + gr.Textbox(label="目标语言(默认:中文)", placeholder="Chinese", value="Chinese") + ], + outputs=[ + gr.File(label="下载翻译文件") + ], + allow_flagging="never" + ) + # 启动服务 + iface.launch(share=True, server_name="0.0.0.0") + +# 初始化翻译器 def initialize_translator(): - # 解析命令行 - argument_parser = ArgumentParser() - args = argument_parser.parse_arguments() - # 设置 OpenAI API Key - os.environ["OPENAI_API_KEY"] = args.api_key + # 解析命令行参数 + argument_parser = ArgumentParser() + args = argument_parser.parse_arguments() + + # 设置OpenAI API Key + os.environ["OPENAI_API_KEY"] = args.api_key - # 初始化配置单例 - config = TranslationConfig() - config.initialize(args) - # 实例化 PDFTranslator 类,并调用 translate_pdf() 方法 - global Translator - Translator = PDFTranslator(config.model_name) + # 初始化配置类 + config = TranslationConfig() + config.initialize(args) + # 实例化翻译器对象 + global Translator + Translator = PDFTranslator(config.model_name) +# 程序入口 if __name__ == "__main__": - # 初始化 translator - initialize_translator() - # 启动 Gradio 服务 - launch_gradio() + + # 初始化翻译器 + initialize_translator() + + # 启动Gradio服务 + launch_gradio() \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/main.py b/langchain/openai-translator/ai_translator/main.py index 1ac6b2bb..ee4e414f 100644 --- a/langchain/openai-translator/ai_translator/main.py +++ b/langchain/openai-translator/ai_translator/main.py @@ -1,23 +1,24 @@ -import sys -import 
os +import sys # 导入 sys 模块,用于添加系统路径 +import os # 导入 os 模块,用于获取文件路径 -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.abspath(__file__))) # 将当前文件所在目录添加到系统路径中,以便导入其他模块 -from utils import ArgumentParser, LOG -from translator import PDFTranslator, TranslationConfig +from utils import ArgumentParser, LOG # 导入 utils 模块中的 ArgumentParser 和 LOG 类,用于解析命令行参数和打印日志 +from translator import PDFTranslator, TranslationConfig # 导入 translator 模块中的 PDFTranslator 和 TranslationConfig 类,用于翻译 PDF 文件 if __name__ == "__main__": - # 解析命令行 - argument_parser = ArgumentParser() - args = argument_parser.parse_arguments() + # 解析命令行参数 + argument_parser = ArgumentParser() # 创建 ArgumentParser 实例 + args = argument_parser.parse_arguments() # 解析命令行参数 # 设置 OpenAI API Key - os.environ["OPENAI_API_KEY"] = args.api_key + os.environ["OPENAI_API_KEY"] = args.api_key - # 初始化配置单例 - config = TranslationConfig() - config.initialize(args) + # 初始化配置 + config = TranslationConfig() + config.initialize(args) - # 实例化 PDFTranslator 类,并调用 translate_pdf() 方法 + # 创建 PDFTranslator 实例 translator = PDFTranslator(config.model_name) - translator.translate_pdf(config.input_file, config.output_file_format, pages=None) + # 调用 translate_pdf() 方法进行 PDF 翻译 + translator.translate_pdf(config.input_file, config.output_file_format, pages=None) \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/translator/exceptions.py b/langchain/openai-translator/ai_translator/translator/exceptions.py index 4f4c23c1..d4787c20 100644 --- a/langchain/openai-translator/ai_translator/translator/exceptions.py +++ b/langchain/openai-translator/ai_translator/translator/exceptions.py @@ -1,5 +1,6 @@ +# 一个异常类,用于处理获取页数超出范围的情况 class PageOutOfRangeException(Exception): def __init__(self, book_pages, requested_pages): - self.book_pages = book_pages - self.requested_pages = requested_pages - super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} 
pages were requested.") + self.book_pages = book_pages # 书籍总页数 + self.requested_pages = requested_pages # 请求的页数 + super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested.") \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/translator/pdf_parser.py b/langchain/openai-translator/ai_translator/translator/pdf_parser.py index 6f2f9bc3..f9f8a091 100644 --- a/langchain/openai-translator/ai_translator/translator/pdf_parser.py +++ b/langchain/openai-translator/ai_translator/translator/pdf_parser.py @@ -1,58 +1,71 @@ -import pdfplumber -from typing import Optional -from book import Book, Page, Content, ContentType, TableContent -from translator.exceptions import PageOutOfRangeException -from utils import LOG - +import pdfplumber # 导入pdfplumber库,用于解析PDF文件 +from typing import Optional # 导入typing库中的Optional类型,用于指定可选参数 +from book import Book, Page, Content, ContentType, TableContent # 导入自定义的Book、Page、Content、ContentType和TableContent类 +from translator.exceptions import PageOutOfRangeException # 导入自定义的PageOutOfRangeException异常类 +from utils import LOG # 导入自定义的LOG函数,用于记录日志 +# 创建一个PDFParser类,用于解析PDF文件 class PDFParser: def __init__(self): pass + # 解析PDF文件,返回一个Book对象 + # pdf_file_path: PDF文件的路径 + # pages: 解析的页数,如果不指定,则解析所有页 def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book: + # 创建一个Book对象,用于存储解析后的PDF内容 book = Book(pdf_file_path) + # 打开PDF文件 with pdfplumber.open(pdf_file_path) as pdf: + # 如果指定了解析页数,但页数超出了PDF文件的总页数,则抛出异常 if pages is not None and pages > len(pdf.pages): raise PageOutOfRangeException(len(pdf.pages), pages) + # 如果没有指定解析页数,则解析所有页 if pages is None: pages_to_parse = pdf.pages else: pages_to_parse = pdf.pages[:pages] + # 遍历每一页 for pdf_page in pages_to_parse: + # 创建一个Page对象,用于存储当前页的内容 page = Page() - # Store the original text content + # 获取当前页的原始文本内容和表格内容 raw_text = pdf_page.extract_text() tables = pdf_page.extract_tables() - # Remove each cell's content from the 
original text + # 从原始文本中移除表格中的每个单元格的内容 for table_data in tables: for row in table_data: for cell in row: raw_text = raw_text.replace(cell, "", 1) - # Handling text + # 处理文本内容 if raw_text: - # Remove empty lines and leading/trailing whitespaces + # 将原始文本按行分割 raw_text_lines = raw_text.splitlines() + # 将每一行的前后空格移除 cleaned_raw_text_lines = [line.strip() for line in raw_text_lines if line.strip()] + # 将每一行的内容拼接起来 cleaned_raw_text = "\n".join(cleaned_raw_text_lines) + # 创建一个Content对象,用于存储文本内容 text_content = Content(content_type=ContentType.TEXT, original=cleaned_raw_text) page.add_content(text_content) LOG.debug(f"[raw_text]\n {cleaned_raw_text}") - - - # Handling tables + # 处理表格内容 if tables: + # 创建一个TableContent对象,用于存储表格内容 table = TableContent(tables) page.add_content(table) LOG.debug(f"[table]\n{table}") + # 将当前页的内容添加到Book对象中 book.add_page(page) - return book + # 返回解析后的Book对象 + return book \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/translator/pdf_translator.py b/langchain/openai-translator/ai_translator/translator/pdf_translator.py index 873dee77..46a19e6a 100644 --- a/langchain/openai-translator/ai_translator/translator/pdf_translator.py +++ b/langchain/openai-translator/ai_translator/translator/pdf_translator.py @@ -1,29 +1,42 @@ -from typing import Optional -from translator.pdf_parser import PDFParser -from translator.writer import Writer -from translator.translation_chain import TranslationChain -from utils import LOG +from typing import Optional # 引入 Optional 类型,表示可选参数 +from translator.pdf_parser import PDFParser # 引入 PDFParser 类,用于解析 PDF 文件 +from translator.writer import Writer # 引入 Writer 类,用于将翻译结果写入文件 +from translator.translation_chain import TranslationChain # 引入 TranslationChain 类,用于构建翻译链 +from utils import LOG # 引入 LOG 对象,用于记录日志 +# PDFTranslator 类,用于将 PDF 文件翻译为指定格式的文件 class PDFTranslator: + # 初始化 PDFTranslator 类,传入模型名称 def __init__(self, model_name: str): + # 初始化翻译链 self.translate_chain = TranslationChain(model_name) + 
# 初始化 PDF 解析器 self.pdf_parser = PDFParser() + # 初始化写入器 self.writer = Writer() + # 将 PDF 文件翻译为指定格式的文件 + # input_file: str 类型,输入文件路径 + # output_file_format: str 类型,输出文件格式,支持PDF和Markdown + # source_language: str 类型,源语言,默认为英语 + # target_language: str 类型,目标语言,默认为中文 + # pages: Optional[int] 类型,翻译的页数,默认为 None,表示翻译所有页 def translate_pdf(self, input_file: str, output_file_format: str = 'markdown', source_language: str = "English", target_language: str = 'Chinese', pages: Optional[int] = None): - + # 解析 PDF 文件 self.book = self.pdf_parser.parse_pdf(input_file, pages) + # 遍历所有页面内容 for page_idx, page in enumerate(self.book.pages): for content_idx, content in enumerate(page.contents): - # Translate content.original + # 翻译每个页面的每个内容 translation, status = self.translate_chain.run(content, source_language, target_language) - # Update the content in self.book.pages directly + # 保存翻译结果 self.book.pages[page_idx].contents[content_idx].set_translation(translation, status) - return self.writer.save_translated_book(self.book, output_file_format) + # 保存翻译后的内容为指定格式的文件 + return self.writer.save_translated_book(self.book, output_file_format) \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/translator/translation_chain.py b/langchain/openai-translator/ai_translator/translator/translation_chain.py index bdab61a6..e8be67d4 100644 --- a/langchain/openai-translator/ai_translator/translator/translation_chain.py +++ b/langchain/openai-translator/ai_translator/translator/translation_chain.py @@ -1,18 +1,23 @@ -from langchain.chat_models import ChatOpenAI -from langchain.chains import LLMChain +from langchain.chat_models import ChatOpenAI # 导入 ChatOpenAI 类,用于构造聊天模型 +from langchain.chains import LLMChain # 导入 LLMChain 类,用于构造 LLMChain 对象 -from langchain.prompts.chat import ( +from langchain.prompts.chat import ( # 导入聊天提示模板,用于构造 ChatPromptTemplate 对象 ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ) -from utils import LOG +from utils import LOG 
# 导入 LOG 工具,用于记录日志 +# 创建一个 TranslationChain 类,用于构造翻译任务的聊天模型 class TranslationChain: + # model_name: 聊天模型的名称,默认为 gpt-3.5-turbo + # verbose: 是否打印日志,默认为 True def __init__(self, model_name: str = "gpt-3.5-turbo", verbose: bool = True): - - # 翻译任务指令始终由 System 角色承担 + + # 使用 System 角色的提示模板,用于提示用户当前的翻译任务 + # source_language: 待翻译文本的源语言 + # target_language: 待翻译文本的目标语言 template = ( """You are a translation expert, proficient in various languages. \n Translates {source_language} to {target_language}.""" @@ -20,6 +25,7 @@ def __init__(self, model_name: str = "gpt-3.5-turbo", verbose: bool = True): system_message_prompt = SystemMessagePromptTemplate.from_template(template) # 待翻译文本由 Human 角色输入 + # text: 待翻译的文本 human_template = "{text}" human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) @@ -28,14 +34,20 @@ def __init__(self, model_name: str = "gpt-3.5-turbo", verbose: bool = True): [system_message_prompt, human_message_prompt] ) - # 为了翻译结果的稳定性,将 temperature 设置为 0 + # 根据 model_name 构造聊天模型,为了翻译结果的稳定性,将 temperature 设置为 0 chat = ChatOpenAI(model_name=model_name, temperature=0, verbose=verbose) + # 构造 LLMChain 对象 self.chain = LLMChain(llm=chat, prompt=chat_prompt_template, verbose=verbose) + # run 方法,用于运行翻译任务,返回翻译结果和是否成功的标志 + # text: 待翻译的文本 + # source_language: 待翻译文本的源语言 + # target_language: 待翻译文本的目标语言 def run(self, text: str, source_language: str, target_language: str) -> (str, bool): result = "" try: + # 运行 LLMChain 对象 result = self.chain.run({ "text": text, "source_language": source_language, diff --git a/langchain/openai-translator/ai_translator/translator/translation_config.py b/langchain/openai-translator/ai_translator/translator/translation_config.py index 783823ae..d6bfdad2 100644 --- a/langchain/openai-translator/ai_translator/translator/translation_config.py +++ b/langchain/openai-translator/ai_translator/translator/translation_config.py @@ -1,29 +1,35 @@ -import yaml +import yaml # 导入yaml模块,用于读取和解析YAML文件 +# 定义TranslationConfig类,用于存储翻译配置 class 
TranslationConfig: _instance = None def __new__(cls): + # 如果类的_instance属性为空,则创建一个新的实例 if cls._instance is None: cls._instance = super(TranslationConfig, cls).__new__(cls) + # 初始化_config属性为空 cls._instance._config = None return cls._instance + # 初始化_config属性,参数args是argparse.Namespace对象 def initialize(self, args): + # 从配置文件中读取配置 with open(args.config_file, "r") as f: config = yaml.safe_load(f) - # Use the argparse Namespace to update the configuration + # 使用argparse Namespace更新配置 overridden_values = { key: value for key, value in vars(args).items() if key in config and value is not None } config.update(overridden_values) - # Store the original config dictionary + # 存储配置 self._instance._config = config def __getattr__(self, name): - # Try to get attribute from _config + # 尝试从_config中获取属性 if self._instance._config and name in self._instance._config: return self._instance._config[name] + # 如果属性不存在,则抛出AttributeError raise AttributeError(f"'TranslationConfig' object has no attribute '{name}'") \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/translator/writer.py b/langchain/openai-translator/ai_translator/translator/writer.py index 90b51ed5..60b21dde 100644 --- a/langchain/openai-translator/ai_translator/translator/writer.py +++ b/langchain/openai-translator/ai_translator/translator/writer.py @@ -1,113 +1,135 @@ -import os -from reportlab.lib import colors, pagesizes, units -from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle -from reportlab.pdfbase import pdfmetrics -from reportlab.pdfbase.ttfonts import TTFont +import os # 导入操作系统模块,用于获取文件路径 +from reportlab.lib import colors, pagesizes, units # 导入 reportlab 库中的颜色、页面大小和单位模块,用于设置 PDF 页面 +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle # 导入 reportlab 库中的样式模块,用于设置 PDF 文本样式 +from reportlab.pdfbase import pdfmetrics # 导入 reportlab 库中的 pdfmetrics 模块,用于配置字体 +from reportlab.pdfbase.ttfonts import TTFont # 导入 reportlab 库中的 TTFont 模块,用于配置字体 +# 导入 reportlab 库中的 
SimpleDocTemplate、Paragraph、Spacer、Table、TableStyle 和 PageBreak 类,用于创建 PDF 文档 from reportlab.platypus import ( SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak ) -from book import Book, ContentType -from utils import LOG +from book import Book, ContentType # 导入自定义的 Book 和 ContentType 类,用于获取翻译后的内容 +from utils import LOG # 导入自定义的 LOG 函数,用于打印日志 +# Writer 类,用于将翻译后的内容保存为指定格式的文件 class Writer: def __init__(self): pass - - def save_translated_book(self, book: Book, ouput_file_format: str): - LOG.debug(ouput_file_format) - + + # 将翻译后的书籍内容保存为指定格式的文件 + # book: Book 类型,翻译后的书籍内容 + # book.pdf_file_path: str 类型,翻译前的PDF文件路径 + # ouput_file_format: str 类型,输出文件格式,支持PDF和Markdown + def save_translated_book(self, book: Book, ouput_file_format: str): + # 打印输出文件格式 + LOG.debug(ouput_file_format) + + # 根据输出文件格式,保存为PDF或Markdown if ouput_file_format.lower() == "pdf": output_file_path = self._save_translated_book_pdf(book) elif ouput_file_format.lower() == "markdown": output_file_path = self._save_translated_book_markdown(book) else: + # 不支持的文件格式,打印错误提示 LOG.error(f"不支持文件类型: {ouput_file_format}") return "" - LOG.info(f"翻译完成,文件保存至: {output_file_path}") + # 打印已保存文件的路径 + LOG.info(f"翻译完成,文件保存至: {output_file_path}") return output_file_path + # 将翻译内容保存为PDF文件 def _save_translated_book_pdf(self, book: Book, output_file_path: str = None): + + # 构建PDF输出文件的路径 + output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.pdf') - output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.pdf') - + # 打印PDF输出文件路径 LOG.info(f"开始导出: {output_file_path}") - # Register Chinese font + # 注册中文字体 font_path = "../fonts/simsun.ttc" # 请将此路径替换为您的字体文件路径 pdfmetrics.registerFont(TTFont("SimSun", font_path)) - # Create a new ParagraphStyle with the SimSun font + # 创建使用SimSun字体的Paragraph样式 simsun_style = ParagraphStyle('SimSun', fontName='SimSun', fontSize=12, leading=14) - - # Create a PDF document + + # 创建PDF文档对象,设置页面大小为A4 doc = SimpleDocTemplate(output_file_path, pagesize=pagesizes.letter) 
+ # 设置PDF文档的左、右、上、下边距 styles = getSampleStyleSheet() + # 创建存储所有页面内容的列表 story = [] - # Iterate over the pages and contents + # 遍历所有页面内容 for page in book.pages: for content in page.contents: + # 如果翻译成功 if content.status: if content.content_type == ContentType.TEXT: - # Add translated text to the PDF - text = content.translation + # 添加翻译后的文本内容 + text = content.translation para = Paragraph(text, simsun_style) story.append(para) elif content.content_type == ContentType.TABLE: - # Add table to the PDF + # 添加表格内容 table = content.translation table_style = TableStyle([ - ('BACKGROUND', (0, 0), (-1, 0), colors.grey), - ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), - ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), # 设置表头背景颜色为灰色 + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), # 设置表头文本颜色为白色 + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), # 设置表格中的文本对齐方式为居中 ('FONTNAME', (0, 0), (-1, 0), 'SimSun'), # 更改表头字体为 "SimSun" - ('FONTSIZE', (0, 0), (-1, 0), 14), - ('BOTTOMPADDING', (0, 0), (-1, 0), 12), - ('BACKGROUND', (0, 1), (-1, -1), colors.beige), - ('FONTNAME', (0, 1), (-1, -1), 'SimSun'), # 更改表格中的字体为 "SimSun" - ('GRID', (0, 0), (-1, -1), 1, colors.black) + ('FONTSIZE', (0, 0), (-1, 0), 14), # 更改表头字体大小为 14 + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), # 设置表头下边距为 12 + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), # 设置表格内容背景颜色为米色 + ('FONTNAME', (0, 1), (-1, -1), 'SimSun'), # 更改表格中的字体为 "SimSun" + ('GRID', (0, 0), (-1, -1), 1, colors.black) # 设置表格边框为黑色 ]) pdf_table = Table(table.values.tolist()) pdf_table.setStyle(table_style) story.append(pdf_table) - # Add a page break after each page except the last one + # 在每页之后插入分页符,最后一页除外 if page != book.pages[-1]: story.append(PageBreak()) - # Save the translated book as a new PDF file + # 生成并保存翻译后的PDF文件 doc.build(story) return output_file_path + # 将翻译内容保存为Markdown文件 def _save_translated_book_markdown(self, book: Book, output_file_path: str = None): - output_file_path = book.pdf_file_path.replace('.pdf', 
f'_translated.md') + # 构建Markdown输出文件的路径 + output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.md') + # 打印Markdown输出文件路径 LOG.info(f"开始导出: {output_file_path}") + # 打开Markdown输出文件 with open(output_file_path, 'w', encoding='utf-8') as output_file: - # Iterate over the pages and contents + # 遍历所有页面内容 for page in book.pages: for content in page.contents: + # 如果翻译成功 if content.status: if content.content_type == ContentType.TEXT: - # Add translated text to the Markdown file + # 添加翻译后的文本内容 text = content.translation output_file.write(text + '\n\n') elif content.content_type == ContentType.TABLE: - # Add table to the Markdown file + # 添加表格内容 table = content.translation + # 生成Markdown表格 header = '| ' + ' | '.join(str(column) for column in table.columns) + ' |' + '\n' separator = '| ' + ' | '.join(['---'] * len(table.columns)) + ' |' + '\n' - # body = '\n'.join(['| ' + ' | '.join(row) + ' |' for row in table.values.tolist()]) + '\n\n' + # body = '\n'.join(['| ' + ' | '.join(row) for row in table.values.tolist()]) + '\n\n' body = '\n'.join(['| ' + ' | '.join(str(cell) for cell in row) + ' |' for row in table.values.tolist()]) + '\n\n' output_file.write(header + separator + body) - # Add a page break (horizontal rule) after each page except the last one + # 在每页之后插入分隔线,最后一页除外 if page != book.pages[-1]: output_file.write('---\n\n') diff --git a/langchain/openai-translator/ai_translator/utils/argument_parser.py b/langchain/openai-translator/ai_translator/utils/argument_parser.py index 16e3470c..0523d633 100644 --- a/langchain/openai-translator/ai_translator/utils/argument_parser.py +++ b/langchain/openai-translator/ai_translator/utils/argument_parser.py @@ -1,16 +1,29 @@ +# 导入argparse模块,用于解析命令行参数 import argparse +# 定义一个ArgumentParser类 class ArgumentParser: + # 初始化函数,创建一个ArgumentParser对象 def __init__(self): + # 创建一个ArgumentParser对象,设置description属性 self.parser = argparse.ArgumentParser(description='A translation tool that supports translations in any language pair.') 
+ # 添加一个config_file参数,类型为字符串,默认值为'config.yaml',帮助信息为'Configuration file with model and API settings.' self.parser.add_argument('--config_file', type=str, default='config.yaml', help='Configuration file with model and API settings.') + # 添加一个model_name参数,类型为字符串,帮助信息为'Name of the Large Language Model.' self.parser.add_argument('--model_name', type=str, help='Name of the Large Language Model.') + # 添加一个api_key参数,类型为字符串,帮助信息为'The API key for OpenAIModel.' self.parser.add_argument('--api_key', type=str, help='The API key for OpenAIModel.') + # 添加一个input_file参数,类型为字符串,帮助信息为'PDF file to translate.' self.parser.add_argument('--input_file', type=str, help='PDF file to translate.') + # 添加一个output_file_format参数,类型为字符串,帮助信息为'The file format of translated book. Now supporting PDF and Markdown' self.parser.add_argument('--output_file_format', type=str, help='The file format of translated book. Now supporting PDF and Markdown') + # 添加一个source_language参数,类型为字符串,帮助信息为'The language of the original book to be translated.' self.parser.add_argument('--source_language', type=str, help='The language of the original book to be translated.') + # 添加一个target_language参数,类型为字符串,帮助信息为'The target language for translating the original book.' 
self.parser.add_argument('--target_language', type=str, help='The target language for translating the original book.') + # 解析参数的函数 def parse_arguments(self): + # 调用ArgumentParser对象的parse_args()方法解析参数,并返回解析结果 args = self.parser.parse_args() - return args + return args \ No newline at end of file diff --git a/langchain/openai-translator/ai_translator/utils/logger.py b/langchain/openai-translator/ai_translator/utils/logger.py index a252b50e..1dfa9934 100644 --- a/langchain/openai-translator/ai_translator/utils/logger.py +++ b/langchain/openai-translator/ai_translator/utils/logger.py @@ -1,32 +1,38 @@ -from loguru import logger -import os -import sys +from loguru import logger # 导入loguru模块中的logger对象,用于日志处理 +import os # 导入os模块,用于文件操作 +import sys # 导入sys模块,用于访问标准输出sys.stdout -LOG_FILE = "translation.log" -ROTATION_TIME = "02:00" +LOG_FILE = "translation.log" # 定义日志文件名 +ROTATION_TIME = "02:00" # 定义日志轮换时间 class Logger: + # 定义Logger类,用于日志处理,包括控制台和文件两种处理方式 + # 参数: + # name: 日志名称 + # log_dir: 日志目录 + # debug: 是否开启debug模式 def __init__(self, name="translation", log_dir="logs", debug=False): - if not os.path.exists(log_dir): + if not os.path.exists(log_dir): # 如果日志目录不存在,则创建 os.makedirs(log_dir) - log_file_path = os.path.join(log_dir, LOG_FILE) + log_file_path = os.path.join(log_dir, LOG_FILE) # 拼接日志文件路径 - # Remove default loguru handler + # 移除默认的loguru处理方式 logger.remove() - # Add console handler with a specific log level + # 添加控制台处理,设置特定的日志级别 level = "DEBUG" if debug else "INFO" logger.add(sys.stdout, level=level) - # Add file handler with a specific log level and timed rotation + + # 添加文件处理,设置特定的日志级别和定时轮换 logger.add(log_file_path, rotation=ROTATION_TIME, level="DEBUG") self.logger = logger -LOG = Logger(debug=True).logger +LOG = Logger(debug=True).logger # 创建Logger实例并获取logger对象 if __name__ == "__main__": - log = Logger().logger + log = Logger().logger # 创建Logger实例并获取logger对象 - log.debug("This is a debug message.") - log.info("This is an info message.") - log.warning("This is a warning 
message.") - log.error("This is an error message.") + log.debug("This is a debug message.") # 输出debug级别的日志信息 + log.info("This is an info message.") # 输出info级别的日志信息 + log.warning("This is a warning message.") # 输出warning级别的日志信息 + log.error("This is an error message.") # 输出error级别的日志信息 \ No newline at end of file